alexue4 committed on
Commit
8168262
1 Parent(s): 4312d76

End of training

Browse files
Files changed (4) hide show
  1. README.md +20 -25
  2. pytorch_model.bin +1 -1
  3. trainer_state.json +904 -954
  4. training_args.bin +1 -1
README.md CHANGED
@@ -15,9 +15,9 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model is a fine-tuned version of [alexue4/text-normalization-ru-new](https://huggingface.co/alexue4/text-normalization-ru-new) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.0405
19
  - Mean Distance: 0
20
- - Max Distance: 6
21
 
22
  ## Model description
23
 
@@ -43,32 +43,27 @@ The following hyperparameters were used during training:
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
- - num_epochs: 20
47
 
48
  ### Training results
49
 
50
- | Training Loss | Epoch | Step | Validation Loss | Max Distance | Mean Distance |
51
- |:-------------:|:-----:|:------:|:---------------:|:------------:|:-------------:|
52
- | 0.0052 | 1.0 | 14041 | 0.0272 | 9 | 0 |
53
- | 0.0045 | 2.0 | 28088 | 0.0327 | 0 | 9 |
54
- | 0.0043 | 3.0 | 42132 | 0.0317 | 0 | 6 |
55
- | 0.0042 | 4.0 | 56176 | 0.0316 | 0 | 6 |
56
- | 0.0035 | 5.0 | 70220 | 0.0357 | 0 | 6 |
57
- | 0.0032 | 6.0 | 84264 | 0.0365 | 0 | 6 |
58
- | 0.0027 | 7.0 | 98308 | 0.0403 | 0 | 6 |
59
- | 0.0027 | 8.0 | 112352 | 0.0398 | 0 | 6 |
60
- | 0.0023 | 9.0 | 126396 | 0.0404 | 0 | 6 |
61
- | 0.0023 | 10.0 | 140440 | 0.0385 | 0 | 6 |
62
- | 0.002 | 11.0 | 154484 | 0.0407 | 0 | 6 |
63
- | 0.0018 | 12.0 | 168528 | 0.0426 | 0 | 9 |
64
- | 0.0018 | 13.0 | 182572 | 0.0422 | 0 | 6 |
65
- | 0.0016 | 14.0 | 196616 | 0.0421 | 0 | 6 |
66
- | 0.0016 | 15.0 | 210660 | 0.0402 | 0 | 6 |
67
- | 0.0014 | 16.0 | 224704 | 0.0407 | 0 | 6 |
68
- | 0.0014 | 17.0 | 238748 | 0.0427 | 0 | 6 |
69
- | 0.0014 | 18.0 | 252792 | 0.0411 | 0 | 6 |
70
- | 0.0013 | 19.0 | 266836 | 0.0406 | 0 | 6 |
71
- | 0.0013 | 20.0 | 280880 | 0.0405 | 0 | 6 |
72
 
73
 
74
  ### Framework versions
 
15
 
16
  This model is a fine-tuned version of [alexue4/text-normalization-ru-new](https://huggingface.co/alexue4/text-normalization-ru-new) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.0366
19
  - Mean Distance: 0
20
+ - Max Distance: 8
21
 
22
  ## Model description
23
 
 
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
+ - num_epochs: 15
47
 
48
  ### Training results
49
 
50
+ | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
51
+ |:-------------:|:-----:|:------:|:---------------:|:-------------:|:------------:|
52
+ | 0.0052 | 1.0 | 22916 | 0.0271 | 0 | 9 |
53
+ | 0.0051 | 2.0 | 45832 | 0.0261 | 0 | 8 |
54
+ | 0.0043 | 3.0 | 68748 | 0.0313 | 0 | 8 |
55
+ | 0.0041 | 4.0 | 91664 | 0.0278 | 0 | 10 |
56
+ | 0.0037 | 5.0 | 114580 | 0.0280 | 0 | 8 |
57
+ | 0.0032 | 6.0 | 137496 | 0.0288 | 0 | 8 |
58
+ | 0.003 | 7.0 | 160412 | 0.0308 | 0 | 8 |
59
+ | 0.0025 | 8.0 | 183328 | 0.0305 | 0 | 8 |
60
+ | 0.0025 | 9.0 | 206244 | 0.0303 | 0 | 8 |
61
+ | 0.0023 | 10.0 | 229160 | 0.0341 | 0 | 8 |
62
+ | 0.0022 | 11.0 | 252076 | 0.0329 | 0 | 8 |
63
+ | 0.0019 | 12.0 | 274992 | 0.0336 | 0 | 8 |
64
+ | 0.002 | 13.0 | 297908 | 0.0358 | 0 | 8 |
65
+ | 0.0018 | 14.0 | 320824 | 0.0355 | 0 | 8 |
66
+ | 0.0019 | 15.0 | 343740 | 0.0366 | 0 | 8 |
 
 
 
 
 
67
 
68
 
69
  ### Framework versions
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5969f5d1763ec4c33ef6b8b477385b4628340069265990d31d83adc01ff3d90
3
  size 258643461
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f02dda551ed1f056d9fed08e40df3447ff8597cf88883f5f1ca2067d54133a61
3
  size 258643461
trainer_state.json CHANGED
@@ -1,1428 +1,1378 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 20.0,
5
  "eval_steps": 500,
6
- "global_step": 280880,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 3.5609999287800016e-09,
14
- "loss": 0.0011,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.1,
19
- "learning_rate": 5.003204899935903e-06,
20
- "loss": 0.0046,
21
- "step": 1405
22
  },
23
  {
24
- "epoch": 0.2,
25
- "learning_rate": 1.0006409799871805e-05,
26
- "loss": 0.005,
27
- "step": 2810
 
 
 
 
 
 
28
  },
29
  {
30
  "epoch": 0.3,
31
- "learning_rate": 1.5009614699807706e-05,
 
 
 
 
 
 
32
  "loss": 0.0052,
33
- "step": 4215
34
  },
35
  {
36
- "epoch": 0.4,
37
- "learning_rate": 2.001281959974361e-05,
38
- "loss": 0.005,
39
- "step": 5620
40
  },
41
  {
42
- "epoch": 0.5,
43
- "learning_rate": 2.5016024499679513e-05,
44
- "loss": 0.0048,
45
- "step": 7025
46
  },
47
  {
48
  "epoch": 0.6,
49
- "learning_rate": 3.001922939961541e-05,
50
- "loss": 0.0045,
51
- "step": 8430
52
  },
53
  {
54
- "epoch": 0.7,
55
- "learning_rate": 3.5022434299551316e-05,
56
- "loss": 0.0049,
57
- "step": 9835
58
  },
59
  {
60
- "epoch": 0.8,
61
- "learning_rate": 4.002563919948722e-05,
62
  "loss": 0.0051,
63
- "step": 11240
 
 
 
 
 
 
64
  },
65
  {
66
  "epoch": 0.9,
67
- "learning_rate": 4.502884409942312e-05,
 
 
 
 
 
 
68
  "loss": 0.0052,
69
- "step": 12645
70
  },
71
  {
72
  "epoch": 1.0,
73
- "eval_loss": 0.027173755690455437,
74
  "eval_max_distance": 9,
75
  "eval_mean_distance": 0,
76
- "eval_runtime": 13.945,
77
- "eval_samples_per_second": 17.928,
78
- "eval_steps_per_second": 1.219,
79
- "step": 14041
80
  },
81
  {
82
- "epoch": 1.0,
83
- "learning_rate": 5.0032048999359025e-05,
84
  "loss": 0.0045,
85
- "step": 14050
86
  },
87
  {
88
- "epoch": 1.1,
89
- "learning_rate": 5.5035253899294924e-05,
90
- "loss": 0.0047,
91
- "step": 15455
92
  },
93
  {
94
  "epoch": 1.2,
95
- "learning_rate": 6.003845879923082e-05,
96
- "loss": 0.0044,
97
- "step": 16860
98
  },
99
  {
100
- "epoch": 1.3,
101
- "learning_rate": 6.504166369916673e-05,
102
  "loss": 0.0045,
103
- "step": 18265
104
  },
105
  {
106
- "epoch": 1.4,
107
- "learning_rate": 7.002990600968385e-05,
108
- "loss": 0.0046,
109
- "step": 19670
 
 
 
 
 
 
110
  },
111
  {
112
  "epoch": 1.5,
113
- "learning_rate": 7.50320421532327e-05,
114
- "loss": 0.0043,
115
- "step": 21075
116
  },
117
  {
118
- "epoch": 1.6,
119
- "learning_rate": 8.003417829678155e-05,
120
- "loss": 0.0042,
121
- "step": 22480
122
  },
123
  {
124
- "epoch": 1.7,
125
- "learning_rate": 8.503631444033039e-05,
126
- "loss": 0.0046,
127
- "step": 23885
 
 
 
 
 
 
128
  },
129
  {
130
  "epoch": 1.8,
131
- "learning_rate": 9.003845058387924e-05,
132
- "loss": 0.0043,
133
- "step": 25290
134
  },
135
  {
136
- "epoch": 1.9,
137
- "learning_rate": 9.504058672742809e-05,
138
- "loss": 0.0045,
139
- "step": 26695
 
 
 
 
 
 
140
  },
141
  {
142
  "epoch": 2.0,
143
- "eval_loss": 0.03272660821676254,
144
- "eval_max_distance": 9,
145
  "eval_mean_distance": 0,
146
- "eval_runtime": 12.7866,
147
- "eval_samples_per_second": 19.552,
148
- "eval_steps_per_second": 1.33,
149
- "step": 28088
150
  },
151
  {
152
- "epoch": 2.0,
153
- "learning_rate": 9.99952530143359e-05,
154
- "loss": 0.0046,
155
- "step": 28100
156
  },
157
  {
158
  "epoch": 2.1,
159
- "learning_rate": 9.943946010949714e-05,
160
- "loss": 0.0042,
161
- "step": 29505
162
  },
163
  {
164
- "epoch": 2.2,
165
- "learning_rate": 9.888366720465838e-05,
166
- "loss": 0.0043,
167
- "step": 30910
168
  },
169
  {
170
- "epoch": 2.3,
171
- "learning_rate": 9.832787429981962e-05,
172
- "loss": 0.0044,
173
- "step": 32315
 
 
 
 
 
 
174
  },
175
  {
176
  "epoch": 2.4,
177
- "learning_rate": 9.777208139498087e-05,
178
- "loss": 0.0045,
179
- "step": 33720
180
  },
181
  {
182
- "epoch": 2.5,
183
- "learning_rate": 9.721628849014209e-05,
184
- "loss": 0.0041,
185
- "step": 35125
186
  },
187
  {
188
- "epoch": 2.6,
189
- "learning_rate": 9.666049558530333e-05,
190
- "loss": 0.0043,
191
- "step": 36530
 
 
 
 
 
 
192
  },
193
  {
194
  "epoch": 2.7,
195
- "learning_rate": 9.610470268046457e-05,
196
- "loss": 0.0046,
197
- "step": 37935
198
  },
199
  {
200
- "epoch": 2.8,
201
- "learning_rate": 9.554890977562582e-05,
202
- "loss": 0.0043,
203
- "step": 39340
204
  },
205
  {
206
- "epoch": 2.9,
207
- "learning_rate": 9.499311687078706e-05,
 
 
 
 
 
 
208
  "loss": 0.0043,
209
- "step": 40745
210
  },
211
  {
212
  "epoch": 3.0,
213
- "eval_loss": 0.03169206902384758,
214
- "eval_max_distance": 6,
215
  "eval_mean_distance": 0,
216
- "eval_runtime": 12.3553,
217
- "eval_samples_per_second": 20.234,
218
- "eval_steps_per_second": 1.376,
219
- "step": 42132
220
  },
221
  {
222
  "epoch": 3.0,
223
- "learning_rate": 9.44373239659483e-05,
224
- "loss": 0.0048,
225
- "step": 42150
226
  },
227
  {
228
- "epoch": 3.1,
229
- "learning_rate": 9.388153106110953e-05,
230
- "loss": 0.0034,
231
- "step": 43555
232
  },
233
  {
234
- "epoch": 3.2,
235
- "learning_rate": 9.332573815627077e-05,
236
  "loss": 0.0036,
237
- "step": 44960
 
 
 
 
 
 
238
  },
239
  {
240
  "epoch": 3.3,
241
- "learning_rate": 9.276994525143201e-05,
242
- "loss": 0.0037,
243
- "step": 46365
244
  },
245
  {
246
- "epoch": 3.4,
247
- "learning_rate": 9.221415234659325e-05,
248
- "loss": 0.004,
249
- "step": 47770
250
  },
251
  {
252
- "epoch": 3.5,
253
- "learning_rate": 9.16583594417545e-05,
254
- "loss": 0.0041,
255
- "step": 49175
 
 
 
 
 
 
256
  },
257
  {
258
  "epoch": 3.6,
259
- "learning_rate": 9.110256653691574e-05,
260
- "loss": 0.0039,
261
- "step": 50580
262
  },
263
  {
264
- "epoch": 3.7,
265
- "learning_rate": 9.054677363207696e-05,
266
- "loss": 0.0039,
267
- "step": 51985
 
 
 
 
 
 
268
  },
269
  {
270
- "epoch": 3.8,
271
- "learning_rate": 8.99909807272382e-05,
272
  "loss": 0.0039,
273
- "step": 53390
274
  },
275
  {
276
  "epoch": 3.9,
277
- "learning_rate": 8.943518782239945e-05,
278
- "loss": 0.0042,
279
- "step": 54795
 
 
 
 
 
 
280
  },
281
  {
282
  "epoch": 4.0,
283
- "eval_loss": 0.031554438173770905,
284
- "eval_max_distance": 6,
285
  "eval_mean_distance": 0,
286
- "eval_runtime": 12.1943,
287
- "eval_samples_per_second": 20.501,
288
- "eval_steps_per_second": 1.394,
289
- "step": 56176
290
  },
291
  {
292
- "epoch": 4.0,
293
- "learning_rate": 8.887939491756069e-05,
294
- "loss": 0.004,
295
- "step": 56200
296
  },
297
  {
298
- "epoch": 4.1,
299
- "learning_rate": 8.832360201272193e-05,
300
- "loss": 0.0034,
301
- "step": 57605
302
  },
303
  {
304
  "epoch": 4.2,
305
- "learning_rate": 8.776780910788317e-05,
306
- "loss": 0.0033,
307
- "step": 59010
308
  },
309
  {
310
- "epoch": 4.3,
311
- "learning_rate": 8.72120162030444e-05,
312
  "loss": 0.0034,
313
- "step": 60415
 
 
 
 
 
 
314
  },
315
  {
316
- "epoch": 4.4,
317
- "learning_rate": 8.665622329820564e-05,
318
  "loss": 0.0034,
319
- "step": 61820
320
  },
321
  {
322
  "epoch": 4.5,
323
- "learning_rate": 8.610043039336688e-05,
324
- "loss": 0.0032,
325
- "step": 63225
326
  },
327
  {
328
- "epoch": 4.6,
329
- "learning_rate": 8.554463748852812e-05,
330
  "loss": 0.0035,
331
- "step": 64630
332
  },
333
  {
334
- "epoch": 4.7,
335
- "learning_rate": 8.498884458368937e-05,
336
- "loss": 0.0035,
337
- "step": 66035
 
 
 
 
 
 
338
  },
339
  {
340
  "epoch": 4.8,
341
- "learning_rate": 8.443305167885061e-05,
342
- "loss": 0.0035,
343
- "step": 67440
344
  },
345
  {
346
- "epoch": 4.9,
347
- "learning_rate": 8.387725877401183e-05,
348
- "loss": 0.0035,
349
- "step": 68845
 
 
 
 
 
 
350
  },
351
  {
352
  "epoch": 5.0,
353
- "eval_loss": 0.03568544238805771,
354
- "eval_max_distance": 6,
355
  "eval_mean_distance": 0,
356
- "eval_runtime": 12.3273,
357
- "eval_samples_per_second": 20.28,
358
- "eval_steps_per_second": 1.379,
359
- "step": 70220
360
  },
361
  {
362
- "epoch": 5.0,
363
- "learning_rate": 8.332146586917307e-05,
364
- "loss": 0.0037,
365
- "step": 70250
366
  },
367
  {
368
  "epoch": 5.1,
369
- "learning_rate": 8.276567296433432e-05,
370
- "loss": 0.0028,
371
- "step": 71655
372
  },
373
  {
374
- "epoch": 5.2,
375
- "learning_rate": 8.220988005949556e-05,
376
- "loss": 0.0029,
377
- "step": 73060
378
  },
379
  {
380
- "epoch": 5.3,
381
- "learning_rate": 8.16540871546568e-05,
382
- "loss": 0.003,
383
- "step": 74465
 
 
 
 
 
 
384
  },
385
  {
386
  "epoch": 5.4,
387
- "learning_rate": 8.109829424981804e-05,
 
 
 
 
 
 
388
  "loss": 0.0031,
389
- "step": 75870
390
  },
391
  {
392
- "epoch": 5.5,
393
- "learning_rate": 8.054250134497927e-05,
394
  "loss": 0.0032,
395
- "step": 77275
396
  },
397
  {
398
- "epoch": 5.6,
399
- "learning_rate": 7.998670844014051e-05,
400
- "loss": 0.003,
401
- "step": 78680
402
  },
403
  {
404
  "epoch": 5.7,
405
- "learning_rate": 7.943091553530175e-05,
406
  "loss": 0.0032,
407
- "step": 80085
408
  },
409
  {
410
- "epoch": 5.8,
411
- "learning_rate": 7.8875122630463e-05,
412
  "loss": 0.0031,
413
- "step": 81490
414
  },
415
  {
416
- "epoch": 5.9,
417
- "learning_rate": 7.831932972562424e-05,
 
 
 
 
 
 
418
  "loss": 0.0032,
419
- "step": 82895
420
  },
421
  {
422
  "epoch": 6.0,
423
- "eval_loss": 0.03648597374558449,
424
- "eval_max_distance": 6,
425
  "eval_mean_distance": 0,
426
- "eval_runtime": 12.1121,
427
- "eval_samples_per_second": 20.641,
428
- "eval_steps_per_second": 1.404,
429
- "step": 84264
430
  },
431
  {
432
  "epoch": 6.0,
433
- "learning_rate": 7.776353682078548e-05,
434
- "loss": 0.0032,
435
- "step": 84300
436
  },
437
  {
438
- "epoch": 6.1,
439
- "learning_rate": 7.72077439159467e-05,
440
- "loss": 0.0027,
441
- "step": 85705
442
  },
443
  {
444
- "epoch": 6.2,
445
- "learning_rate": 7.665195101110795e-05,
446
  "loss": 0.0026,
447
- "step": 87110
 
 
 
 
 
 
448
  },
449
  {
450
  "epoch": 6.3,
451
- "learning_rate": 7.609615810626919e-05,
452
- "loss": 0.0028,
453
- "step": 88515
454
  },
455
  {
456
- "epoch": 6.4,
457
- "learning_rate": 7.554036520143043e-05,
458
  "loss": 0.0029,
459
- "step": 89920
460
  },
461
  {
462
- "epoch": 6.5,
463
- "learning_rate": 7.498457229659167e-05,
464
- "loss": 0.0026,
465
- "step": 91325
466
  },
467
  {
468
- "epoch": 6.6,
469
- "learning_rate": 7.442877939175292e-05,
470
  "loss": 0.0028,
471
- "step": 92730
472
  },
473
  {
474
- "epoch": 6.7,
475
- "learning_rate": 7.387298648691414e-05,
476
- "loss": 0.0031,
477
- "step": 94135
478
  },
479
  {
480
- "epoch": 6.8,
481
- "learning_rate": 7.331719358207538e-05,
482
- "loss": 0.0027,
483
- "step": 95540
 
 
 
 
 
 
 
 
 
 
 
 
484
  },
485
  {
486
  "epoch": 6.9,
487
- "learning_rate": 7.276140067723662e-05,
488
- "loss": 0.0027,
489
- "step": 96945
490
  },
491
  {
492
- "epoch": 7.0,
493
- "eval_loss": 0.04028007388114929,
494
- "eval_max_distance": 6,
495
- "eval_mean_distance": 0,
496
- "eval_runtime": 12.1827,
497
- "eval_samples_per_second": 20.521,
498
- "eval_steps_per_second": 1.395,
499
- "step": 98308
500
  },
501
  {
502
  "epoch": 7.0,
503
- "learning_rate": 7.220560777239787e-05,
504
- "loss": 0.0028,
505
- "step": 98350
 
 
 
 
506
  },
507
  {
508
- "epoch": 7.1,
509
- "learning_rate": 7.164981486755911e-05,
510
  "loss": 0.0027,
511
- "step": 99755
512
  },
513
  {
514
- "epoch": 7.2,
515
- "learning_rate": 7.109402196272035e-05,
516
  "loss": 0.0025,
517
- "step": 101160
 
 
 
 
 
 
518
  },
519
  {
520
- "epoch": 7.3,
521
- "learning_rate": 7.053822905788158e-05,
 
 
 
 
 
 
522
  "loss": 0.0025,
523
- "step": 102565
524
  },
525
  {
526
- "epoch": 7.4,
527
- "learning_rate": 6.998243615304282e-05,
528
- "loss": 0.0024,
529
- "step": 103970
530
  },
531
  {
532
  "epoch": 7.5,
533
- "learning_rate": 6.942664324820406e-05,
534
  "loss": 0.0026,
535
- "step": 105375
536
  },
537
  {
538
- "epoch": 7.6,
539
- "learning_rate": 6.88708503433653e-05,
540
  "loss": 0.0026,
541
- "step": 106780
542
  },
543
  {
544
- "epoch": 7.7,
545
- "learning_rate": 6.831505743852654e-05,
546
  "loss": 0.0026,
547
- "step": 108185
548
  },
549
  {
550
- "epoch": 7.8,
551
- "learning_rate": 6.775926453368779e-05,
552
  "loss": 0.0025,
553
- "step": 109590
554
  },
555
  {
556
- "epoch": 7.9,
557
- "learning_rate": 6.720347162884901e-05,
558
  "loss": 0.0027,
559
- "step": 110995
 
 
 
 
 
 
 
 
 
 
 
 
560
  },
561
  {
562
  "epoch": 8.0,
563
- "eval_loss": 0.03977961093187332,
564
- "eval_max_distance": 6,
565
  "eval_mean_distance": 0,
566
- "eval_runtime": 12.1864,
567
- "eval_samples_per_second": 20.515,
568
- "eval_steps_per_second": 1.395,
569
- "step": 112352
570
  },
571
  {
572
- "epoch": 8.0,
573
- "learning_rate": 6.664767872401025e-05,
574
- "loss": 0.0028,
575
- "step": 112400
576
  },
577
  {
578
  "epoch": 8.1,
579
- "learning_rate": 6.60918858191715e-05,
580
  "loss": 0.0023,
581
- "step": 113805
582
  },
583
  {
584
- "epoch": 8.2,
585
- "learning_rate": 6.553609291433274e-05,
586
- "loss": 0.0023,
587
- "step": 115210
588
  },
589
  {
590
- "epoch": 8.3,
591
- "learning_rate": 6.498030000949398e-05,
592
- "loss": 0.0022,
593
- "step": 116615
594
  },
595
  {
596
- "epoch": 8.4,
597
- "learning_rate": 6.442450710465522e-05,
598
- "loss": 0.0023,
599
- "step": 118020
600
  },
601
  {
602
- "epoch": 8.5,
603
- "learning_rate": 6.386871419981645e-05,
604
  "loss": 0.0024,
605
- "step": 119425
606
  },
607
  {
608
- "epoch": 8.6,
609
- "learning_rate": 6.331292129497769e-05,
610
  "loss": 0.0025,
611
- "step": 120830
612
  },
613
  {
614
- "epoch": 8.7,
615
- "learning_rate": 6.275712839013893e-05,
616
  "loss": 0.0024,
617
- "step": 122235
618
  },
619
  {
620
- "epoch": 8.8,
621
- "learning_rate": 6.220133548530017e-05,
622
- "loss": 0.0022,
623
- "step": 123640
624
  },
625
  {
626
- "epoch": 8.9,
627
- "learning_rate": 6.164554258046142e-05,
628
  "loss": 0.0023,
629
- "step": 125045
630
  },
631
  {
632
- "epoch": 9.0,
633
- "eval_loss": 0.040445487946271896,
634
- "eval_max_distance": 6,
635
- "eval_mean_distance": 0,
636
- "eval_runtime": 12.3201,
637
- "eval_samples_per_second": 20.292,
638
- "eval_steps_per_second": 1.38,
639
- "step": 126396
640
  },
641
  {
642
- "epoch": 9.0,
643
- "learning_rate": 6.108974967562266e-05,
644
  "loss": 0.0024,
645
- "step": 126450
646
  },
647
  {
648
- "epoch": 9.1,
649
- "learning_rate": 6.053395677078388e-05,
650
- "loss": 0.002,
651
- "step": 127855
652
  },
653
  {
654
- "epoch": 9.2,
655
- "learning_rate": 5.9978163865945126e-05,
656
- "loss": 0.0021,
657
- "step": 129260
 
 
 
 
658
  },
659
  {
660
- "epoch": 9.3,
661
- "learning_rate": 5.9422370961106364e-05,
662
- "loss": 0.0022,
663
- "step": 130665
664
  },
665
  {
666
- "epoch": 9.4,
667
- "learning_rate": 5.886657805626761e-05,
668
- "loss": 0.0023,
669
- "step": 132070
670
  },
671
  {
672
- "epoch": 9.5,
673
- "learning_rate": 5.831078515142885e-05,
674
  "loss": 0.0022,
675
- "step": 133475
676
- },
677
- {
678
- "epoch": 9.6,
679
- "learning_rate": 5.775499224659009e-05,
680
- "loss": 0.0021,
681
- "step": 134880
682
  },
683
  {
684
- "epoch": 9.7,
685
- "learning_rate": 5.719919934175132e-05,
686
- "loss": 0.0021,
687
- "step": 136285
688
  },
689
  {
690
- "epoch": 9.8,
691
- "learning_rate": 5.664340643691256e-05,
692
- "loss": 0.0021,
693
- "step": 137690
694
  },
695
  {
696
- "epoch": 9.9,
697
- "learning_rate": 5.60876135320738e-05,
698
  "loss": 0.0023,
699
- "step": 139095
700
  },
701
  {
702
- "epoch": 10.0,
703
- "eval_loss": 0.03852245956659317,
704
- "eval_max_distance": 6,
705
- "eval_mean_distance": 0,
706
- "eval_runtime": 12.131,
707
- "eval_samples_per_second": 20.608,
708
- "eval_steps_per_second": 1.401,
709
- "step": 140440
710
  },
711
  {
712
- "epoch": 10.0,
713
- "learning_rate": 5.5531820627235044e-05,
714
  "loss": 0.0023,
715
- "step": 140500
716
  },
717
  {
718
- "epoch": 10.1,
719
- "learning_rate": 5.497602772239628e-05,
720
- "loss": 0.0018,
721
- "step": 141905
722
  },
723
  {
724
- "epoch": 10.2,
725
- "learning_rate": 5.4420234817557526e-05,
726
- "loss": 0.0021,
727
- "step": 143310
728
  },
729
  {
730
- "epoch": 10.3,
731
- "learning_rate": 5.386444191271876e-05,
732
- "loss": 0.0021,
733
- "step": 144715
734
  },
735
  {
736
- "epoch": 10.4,
737
- "learning_rate": 5.3308649007879995e-05,
738
- "loss": 0.0021,
739
- "step": 146120
740
  },
741
  {
742
- "epoch": 10.5,
743
- "learning_rate": 5.275285610304124e-05,
744
- "loss": 0.0021,
745
- "step": 147525
746
  },
747
  {
748
- "epoch": 10.6,
749
- "learning_rate": 5.219706319820248e-05,
750
- "loss": 0.002,
751
- "step": 148930
752
  },
753
  {
754
- "epoch": 10.7,
755
- "learning_rate": 5.1641270293363716e-05,
756
- "loss": 0.002,
757
- "step": 150335
 
 
 
 
758
  },
759
  {
760
- "epoch": 10.8,
761
- "learning_rate": 5.108547738852496e-05,
762
- "loss": 0.0019,
763
- "step": 151740
764
  },
765
  {
766
- "epoch": 10.9,
767
- "learning_rate": 5.052968448368619e-05,
768
- "loss": 0.002,
769
- "step": 153145
770
  },
771
  {
772
- "epoch": 11.0,
773
- "eval_loss": 0.040671207010746,
774
- "eval_max_distance": 6,
775
- "eval_mean_distance": 0,
776
- "eval_runtime": 12.2718,
777
- "eval_samples_per_second": 20.372,
778
- "eval_steps_per_second": 1.385,
779
- "step": 154484
780
  },
781
  {
782
- "epoch": 11.0,
783
- "learning_rate": 4.997389157884743e-05,
784
- "loss": 0.0018,
785
- "step": 154550
786
  },
787
  {
788
- "epoch": 11.1,
789
- "learning_rate": 4.9418098674008675e-05,
790
- "loss": 0.002,
791
- "step": 155955
792
  },
793
  {
794
- "epoch": 11.2,
795
- "learning_rate": 4.886230576916991e-05,
796
- "loss": 0.0019,
797
- "step": 157360
798
  },
799
  {
800
- "epoch": 11.3,
801
- "learning_rate": 4.830651286433115e-05,
802
- "loss": 0.0018,
803
- "step": 158765
804
  },
805
  {
806
- "epoch": 11.4,
807
- "learning_rate": 4.775071995949239e-05,
808
- "loss": 0.0018,
809
- "step": 160170
810
  },
811
  {
812
- "epoch": 11.5,
813
- "learning_rate": 4.7194927054653634e-05,
814
- "loss": 0.002,
815
- "step": 161575
816
  },
817
  {
818
- "epoch": 11.6,
819
- "learning_rate": 4.6639134149814865e-05,
820
- "loss": 0.0019,
821
- "step": 162980
822
  },
823
  {
824
- "epoch": 11.7,
825
- "learning_rate": 4.608334124497611e-05,
826
- "loss": 0.0017,
827
- "step": 164385
828
  },
829
  {
830
- "epoch": 11.81,
831
- "learning_rate": 4.552754834013735e-05,
832
  "loss": 0.002,
833
- "step": 165790
834
  },
835
  {
836
- "epoch": 11.91,
837
- "learning_rate": 4.4971755435298586e-05,
838
- "loss": 0.0018,
839
- "step": 167195
840
  },
841
  {
842
- "epoch": 12.0,
843
- "eval_loss": 0.04258317872881889,
844
- "eval_max_distance": 9,
845
  "eval_mean_distance": 0,
846
- "eval_runtime": 12.8224,
847
- "eval_samples_per_second": 19.497,
848
- "eval_steps_per_second": 1.326,
849
- "step": 168528
850
- },
851
- {
852
- "epoch": 12.01,
853
- "learning_rate": 4.4415962530459824e-05,
854
- "loss": 0.0017,
855
- "step": 168600
856
- },
857
- {
858
- "epoch": 12.11,
859
- "learning_rate": 4.386016962562107e-05,
860
- "loss": 0.0016,
861
- "step": 170005
862
- },
863
- {
864
- "epoch": 12.21,
865
- "learning_rate": 4.33043767207823e-05,
866
- "loss": 0.0018,
867
- "step": 171410
868
  },
869
  {
870
- "epoch": 12.31,
871
- "learning_rate": 4.2748583815943544e-05,
872
- "loss": 0.0016,
873
- "step": 172815
874
- },
875
- {
876
- "epoch": 12.41,
877
- "learning_rate": 4.219279091110478e-05,
878
- "loss": 0.0018,
879
- "step": 174220
880
- },
881
- {
882
- "epoch": 12.51,
883
- "learning_rate": 4.163699800626602e-05,
884
- "loss": 0.0018,
885
- "step": 175625
886
- },
887
- {
888
- "epoch": 12.61,
889
- "learning_rate": 4.108120510142726e-05,
890
- "loss": 0.0018,
891
- "step": 177030
892
- },
893
- {
894
- "epoch": 12.71,
895
- "learning_rate": 4.05254121965885e-05,
896
- "loss": 0.0016,
897
- "step": 178435
898
  },
899
  {
900
- "epoch": 12.81,
901
- "learning_rate": 3.996961929174974e-05,
902
  "loss": 0.002,
903
- "step": 179840
904
  },
905
  {
906
- "epoch": 12.91,
907
- "learning_rate": 3.941382638691098e-05,
908
- "loss": 0.0018,
909
- "step": 181245
910
- },
911
- {
912
- "epoch": 13.0,
913
- "eval_loss": 0.042234089225530624,
914
- "eval_max_distance": 6,
915
- "eval_mean_distance": 0,
916
- "eval_runtime": 13.9619,
917
- "eval_samples_per_second": 17.906,
918
- "eval_steps_per_second": 1.218,
919
- "step": 182572
920
  },
921
  {
922
- "epoch": 13.01,
923
- "learning_rate": 3.885803348207222e-05,
924
- "loss": 0.0018,
925
- "step": 182650
926
  },
927
  {
928
- "epoch": 13.11,
929
- "learning_rate": 3.8302240577233455e-05,
930
- "loss": 0.0017,
931
- "step": 184055
932
  },
933
  {
934
- "epoch": 13.21,
935
- "learning_rate": 3.77464476723947e-05,
936
- "loss": 0.0017,
937
- "step": 185460
938
  },
939
  {
940
- "epoch": 13.31,
941
- "learning_rate": 3.719065476755594e-05,
942
- "loss": 0.0016,
943
- "step": 186865
944
  },
945
  {
946
- "epoch": 13.41,
947
- "learning_rate": 3.6634861862717176e-05,
948
- "loss": 0.0016,
949
- "step": 188270
950
  },
951
  {
952
- "epoch": 13.51,
953
- "learning_rate": 3.6079068957878414e-05,
954
- "loss": 0.0016,
955
- "step": 189675
956
  },
957
  {
958
- "epoch": 13.61,
959
- "learning_rate": 3.552327605303966e-05,
960
- "loss": 0.0017,
961
- "step": 191080
962
  },
963
  {
964
- "epoch": 13.71,
965
- "learning_rate": 3.4967483148200896e-05,
966
- "loss": 0.0017,
967
- "step": 192485
968
  },
969
  {
970
- "epoch": 13.81,
971
- "learning_rate": 3.4411690243362134e-05,
972
- "loss": 0.0015,
973
- "step": 193890
974
  },
975
  {
976
- "epoch": 13.91,
977
- "learning_rate": 3.385589733852337e-05,
978
- "loss": 0.0016,
979
- "step": 195295
980
  },
981
  {
982
- "epoch": 14.0,
983
- "eval_loss": 0.04207869619131088,
984
- "eval_max_distance": 6,
985
  "eval_mean_distance": 0,
986
- "eval_runtime": 12.3358,
987
- "eval_samples_per_second": 20.266,
988
- "eval_steps_per_second": 1.378,
989
- "step": 196616
990
- },
991
- {
992
- "epoch": 14.01,
993
- "learning_rate": 3.330010443368462e-05,
994
- "loss": 0.0016,
995
- "step": 196700
996
  },
997
  {
998
- "epoch": 14.11,
999
- "learning_rate": 3.274431152884585e-05,
1000
- "loss": 0.0016,
1001
- "step": 198105
1002
- },
1003
- {
1004
- "epoch": 14.21,
1005
- "learning_rate": 3.218851862400709e-05,
1006
- "loss": 0.0015,
1007
- "step": 199510
1008
- },
1009
- {
1010
- "epoch": 14.31,
1011
- "learning_rate": 3.163272571916833e-05,
1012
- "loss": 0.0016,
1013
- "step": 200915
1014
- },
1015
- {
1016
- "epoch": 14.41,
1017
- "learning_rate": 3.107693281432957e-05,
1018
- "loss": 0.0016,
1019
- "step": 202320
1020
- },
1021
- {
1022
- "epoch": 14.51,
1023
- "learning_rate": 3.052113990949081e-05,
1024
- "loss": 0.0016,
1025
- "step": 203725
1026
- },
1027
- {
1028
- "epoch": 14.61,
1029
- "learning_rate": 2.9965347004652052e-05,
1030
- "loss": 0.0016,
1031
- "step": 205130
1032
- },
1033
- {
1034
- "epoch": 14.71,
1035
- "learning_rate": 2.9409554099813286e-05,
1036
- "loss": 0.0015,
1037
- "step": 206535
1038
  },
1039
  {
1040
- "epoch": 14.81,
1041
- "learning_rate": 2.8853761194974528e-05,
1042
- "loss": 0.0016,
1043
- "step": 207940
1044
  },
1045
  {
1046
- "epoch": 14.91,
1047
- "learning_rate": 2.829796829013577e-05,
1048
- "loss": 0.0016,
1049
- "step": 209345
1050
  },
1051
  {
1052
- "epoch": 15.0,
1053
- "eval_loss": 0.04015611857175827,
1054
- "eval_max_distance": 6,
1055
- "eval_mean_distance": 0,
1056
- "eval_runtime": 12.1736,
1057
- "eval_samples_per_second": 20.536,
1058
- "eval_steps_per_second": 1.396,
1059
- "step": 210660
1060
  },
1061
  {
1062
- "epoch": 15.01,
1063
- "learning_rate": 2.7742175385297004e-05,
1064
- "loss": 0.0015,
1065
- "step": 210750
1066
  },
1067
  {
1068
- "epoch": 15.11,
1069
- "learning_rate": 2.7186382480458245e-05,
1070
- "loss": 0.0015,
1071
- "step": 212155
1072
  },
1073
  {
1074
- "epoch": 15.21,
1075
- "learning_rate": 2.6630589575619486e-05,
1076
- "loss": 0.0014,
1077
- "step": 213560
1078
  },
1079
  {
1080
- "epoch": 15.31,
1081
- "learning_rate": 2.607479667078072e-05,
1082
- "loss": 0.0016,
1083
- "step": 214965
1084
  },
1085
  {
1086
- "epoch": 15.41,
1087
- "learning_rate": 2.5519003765941962e-05,
1088
- "loss": 0.0014,
1089
- "step": 216370
1090
  },
1091
  {
1092
- "epoch": 15.51,
1093
- "learning_rate": 2.49632108611032e-05,
1094
- "loss": 0.0016,
1095
- "step": 217775
1096
  },
1097
  {
1098
- "epoch": 15.61,
1099
- "learning_rate": 2.440741795626444e-05,
1100
- "loss": 0.0014,
1101
- "step": 219180
1102
  },
1103
  {
1104
- "epoch": 15.71,
1105
- "learning_rate": 2.385162505142568e-05,
1106
- "loss": 0.0015,
1107
- "step": 220585
1108
  },
1109
  {
1110
- "epoch": 15.81,
1111
- "learning_rate": 2.3295832146586918e-05,
1112
- "loss": 0.0015,
1113
- "step": 221990
1114
  },
1115
  {
1116
- "epoch": 15.91,
1117
- "learning_rate": 2.2740039241748156e-05,
1118
- "loss": 0.0014,
1119
- "step": 223395
1120
  },
1121
  {
1122
- "epoch": 16.0,
1123
- "eval_loss": 0.04069029539823532,
1124
- "eval_max_distance": 6,
1125
  "eval_mean_distance": 0,
1126
- "eval_runtime": 12.2719,
1127
- "eval_samples_per_second": 20.372,
1128
- "eval_steps_per_second": 1.385,
1129
- "step": 224704
1130
- },
1131
- {
1132
- "epoch": 16.01,
1133
- "learning_rate": 2.2184246336909397e-05,
1134
- "loss": 0.0015,
1135
- "step": 224800
1136
- },
1137
- {
1138
- "epoch": 16.11,
1139
- "learning_rate": 2.1628453432070635e-05,
1140
- "loss": 0.0013,
1141
- "step": 226205
1142
- },
1143
- {
1144
- "epoch": 16.21,
1145
- "learning_rate": 2.1072660527231873e-05,
1146
- "loss": 0.0014,
1147
- "step": 227610
1148
- },
1149
- {
1150
- "epoch": 16.31,
1151
- "learning_rate": 2.0516867622393114e-05,
1152
- "loss": 0.0014,
1153
- "step": 229015
1154
- },
1155
- {
1156
- "epoch": 16.41,
1157
- "learning_rate": 1.9961074717554352e-05,
1158
- "loss": 0.0015,
1159
- "step": 230420
1160
  },
1161
  {
1162
- "epoch": 16.51,
1163
- "learning_rate": 1.9405281812715594e-05,
1164
- "loss": 0.0016,
1165
- "step": 231825
1166
- },
1167
- {
1168
- "epoch": 16.61,
1169
- "learning_rate": 1.8849488907876832e-05,
1170
- "loss": 0.0015,
1171
- "step": 233230
1172
- },
1173
- {
1174
- "epoch": 16.71,
1175
- "learning_rate": 1.8293696003038073e-05,
1176
- "loss": 0.0014,
1177
- "step": 234635
1178
- },
1179
- {
1180
- "epoch": 16.81,
1181
- "learning_rate": 1.773790309819931e-05,
1182
- "loss": 0.0013,
1183
- "step": 236040
1184
  },
1185
  {
1186
- "epoch": 16.91,
1187
- "learning_rate": 1.7182110193360553e-05,
1188
- "loss": 0.0014,
1189
- "step": 237445
1190
  },
1191
  {
1192
- "epoch": 17.0,
1193
- "eval_loss": 0.042702946811914444,
1194
- "eval_max_distance": 6,
1195
- "eval_mean_distance": 0,
1196
- "eval_runtime": 12.1336,
1197
- "eval_samples_per_second": 20.604,
1198
- "eval_steps_per_second": 1.401,
1199
- "step": 238748
1200
  },
1201
  {
1202
- "epoch": 17.01,
1203
- "learning_rate": 1.662631728852179e-05,
1204
- "loss": 0.0013,
1205
- "step": 238850
1206
  },
1207
  {
1208
- "epoch": 17.11,
1209
- "learning_rate": 1.607052438368303e-05,
1210
- "loss": 0.0014,
1211
- "step": 240255
1212
  },
1213
  {
1214
- "epoch": 17.21,
1215
- "learning_rate": 1.551473147884427e-05,
1216
- "loss": 0.0014,
1217
- "step": 241660
1218
  },
1219
  {
1220
- "epoch": 17.31,
1221
- "learning_rate": 1.4958938574005508e-05,
1222
- "loss": 0.0013,
1223
- "step": 243065
1224
  },
1225
  {
1226
- "epoch": 17.41,
1227
- "learning_rate": 1.4403145669166746e-05,
1228
- "loss": 0.0014,
1229
- "step": 244470
1230
  },
1231
  {
1232
- "epoch": 17.51,
1233
- "learning_rate": 1.3847352764327987e-05,
1234
- "loss": 0.0013,
1235
- "step": 245875
1236
  },
1237
  {
1238
- "epoch": 17.61,
1239
- "learning_rate": 1.3291559859489225e-05,
1240
- "loss": 0.0013,
1241
- "step": 247280
1242
  },
1243
  {
1244
- "epoch": 17.71,
1245
- "learning_rate": 1.2735766954650463e-05,
1246
- "loss": 0.0014,
1247
- "step": 248685
1248
  },
1249
  {
1250
- "epoch": 17.81,
1251
- "learning_rate": 1.2179974049811703e-05,
1252
- "loss": 0.0014,
1253
- "step": 250090
1254
  },
1255
  {
1256
- "epoch": 17.91,
1257
- "learning_rate": 1.1624181144972943e-05,
1258
- "loss": 0.0014,
1259
- "step": 251495
1260
  },
1261
  {
1262
- "epoch": 18.0,
1263
- "eval_loss": 0.04110053926706314,
1264
- "eval_max_distance": 6,
1265
  "eval_mean_distance": 0,
1266
- "eval_runtime": 12.0446,
1267
- "eval_samples_per_second": 20.756,
1268
- "eval_steps_per_second": 1.411,
1269
- "step": 252792
1270
- },
1271
- {
1272
- "epoch": 18.01,
1273
- "learning_rate": 1.1068388240134182e-05,
1274
- "loss": 0.0014,
1275
- "step": 252900
1276
- },
1277
- {
1278
- "epoch": 18.11,
1279
- "learning_rate": 1.051259533529542e-05,
1280
- "loss": 0.0014,
1281
- "step": 254305
1282
- },
1283
- {
1284
- "epoch": 18.21,
1285
- "learning_rate": 9.95680243045666e-06,
1286
- "loss": 0.0012,
1287
- "step": 255710
1288
  },
1289
  {
1290
- "epoch": 18.31,
1291
- "learning_rate": 9.4010095256179e-06,
1292
- "loss": 0.0012,
1293
- "step": 257115
1294
- },
1295
- {
1296
- "epoch": 18.41,
1297
- "learning_rate": 8.84521662077914e-06,
1298
- "loss": 0.0014,
1299
- "step": 258520
1300
- },
1301
- {
1302
- "epoch": 18.51,
1303
- "learning_rate": 8.289423715940379e-06,
1304
- "loss": 0.0012,
1305
- "step": 259925
1306
- },
1307
- {
1308
- "epoch": 18.61,
1309
- "learning_rate": 7.733630811101619e-06,
1310
- "loss": 0.0013,
1311
- "step": 261330
1312
- },
1313
- {
1314
- "epoch": 18.71,
1315
- "learning_rate": 7.1778379062628565e-06,
1316
- "loss": 0.0014,
1317
- "step": 262735
1318
  },
1319
  {
1320
- "epoch": 18.81,
1321
- "learning_rate": 6.622045001424096e-06,
1322
- "loss": 0.0012,
1323
- "step": 264140
1324
  },
1325
  {
1326
- "epoch": 18.91,
1327
- "learning_rate": 6.066252096585336e-06,
1328
- "loss": 0.0013,
1329
- "step": 265545
1330
  },
1331
  {
1332
- "epoch": 19.0,
1333
- "eval_loss": 0.040625352412462234,
1334
- "eval_max_distance": 6,
1335
- "eval_mean_distance": 0,
1336
- "eval_runtime": 12.0757,
1337
- "eval_samples_per_second": 20.703,
1338
- "eval_steps_per_second": 1.408,
1339
- "step": 266836
1340
  },
1341
  {
1342
- "epoch": 19.01,
1343
- "learning_rate": 5.510459191746575e-06,
1344
- "loss": 0.0013,
1345
- "step": 266950
1346
  },
1347
  {
1348
- "epoch": 19.11,
1349
- "learning_rate": 4.9546662869078136e-06,
1350
- "loss": 0.0014,
1351
- "step": 268355
1352
  },
1353
  {
1354
- "epoch": 19.21,
1355
- "learning_rate": 4.398873382069053e-06,
1356
- "loss": 0.0013,
1357
- "step": 269760
1358
  },
1359
  {
1360
- "epoch": 19.31,
1361
- "learning_rate": 3.843080477230292e-06,
1362
- "loss": 0.0013,
1363
- "step": 271165
1364
  },
1365
  {
1366
- "epoch": 19.41,
1367
- "learning_rate": 3.2872875723915313e-06,
1368
- "loss": 0.0013,
1369
- "step": 272570
1370
  },
1371
  {
1372
- "epoch": 19.51,
1373
- "learning_rate": 2.7314946675527706e-06,
1374
- "loss": 0.0012,
1375
- "step": 273975
1376
  },
1377
  {
1378
- "epoch": 19.61,
1379
- "learning_rate": 2.1757017627140103e-06,
1380
- "loss": 0.0012,
1381
- "step": 275380
1382
  },
1383
  {
1384
- "epoch": 19.71,
1385
- "learning_rate": 1.619908857875249e-06,
1386
- "loss": 0.0013,
1387
- "step": 276785
1388
  },
1389
  {
1390
- "epoch": 19.81,
1391
- "learning_rate": 1.0641159530364886e-06,
1392
- "loss": 0.0014,
1393
- "step": 278190
1394
  },
1395
  {
1396
- "epoch": 19.91,
1397
- "learning_rate": 5.083230481977278e-07,
1398
- "loss": 0.0013,
1399
- "step": 279595
 
 
 
 
1400
  },
1401
  {
1402
- "epoch": 20.0,
1403
- "eval_loss": 0.040508754551410675,
1404
- "eval_max_distance": 6,
1405
- "eval_mean_distance": 0,
1406
- "eval_runtime": 12.1336,
1407
- "eval_samples_per_second": 20.604,
1408
- "eval_steps_per_second": 1.401,
1409
- "step": 280880
1410
- },
1411
- {
1412
- "epoch": 20.0,
1413
- "step": 280880,
1414
- "total_flos": 7.269950341627085e+16,
1415
- "train_loss": 0.0021448437322240947,
1416
- "train_runtime": 20054.4611,
1417
- "train_samples_per_second": 210.084,
1418
- "train_steps_per_second": 14.006
1419
  }
1420
  ],
1421
- "logging_steps": 1405,
1422
- "max_steps": 280880,
1423
- "num_train_epochs": 20,
1424
- "save_steps": 2809,
1425
- "total_flos": 7.269950341627085e+16,
1426
  "trial_name": null,
1427
  "trial_params": null
1428
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 15.0,
5
  "eval_steps": 500,
6
+ "global_step": 343740,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 2.909175539652063e-09,
14
+ "loss": 0.0085,
15
  "step": 1
16
  },
17
  {
18
+ "epoch": 0.08,
19
+ "learning_rate": 5.000872752661896e-06,
20
+ "loss": 0.0066,
21
+ "step": 1719
22
  },
23
  {
24
+ "epoch": 0.15,
25
+ "learning_rate": 1.0001745505323792e-05,
26
+ "loss": 0.0061,
27
+ "step": 3438
28
+ },
29
+ {
30
+ "epoch": 0.23,
31
+ "learning_rate": 1.5002618257985687e-05,
32
+ "loss": 0.0059,
33
+ "step": 5157
34
  },
35
  {
36
  "epoch": 0.3,
37
+ "learning_rate": 2.0003491010647585e-05,
38
+ "loss": 0.0056,
39
+ "step": 6876
40
+ },
41
+ {
42
+ "epoch": 0.38,
43
+ "learning_rate": 2.500436376330948e-05,
44
  "loss": 0.0052,
45
+ "step": 8595
46
  },
47
  {
48
+ "epoch": 0.45,
49
+ "learning_rate": 3.0005236515971374e-05,
50
+ "loss": 0.0052,
51
+ "step": 10314
52
  },
53
  {
54
+ "epoch": 0.53,
55
+ "learning_rate": 3.500610926863327e-05,
56
+ "loss": 0.0049,
57
+ "step": 12033
58
  },
59
  {
60
  "epoch": 0.6,
61
+ "learning_rate": 4.000698202129517e-05,
62
+ "loss": 0.0049,
63
+ "step": 13752
64
  },
65
  {
66
+ "epoch": 0.68,
67
+ "learning_rate": 4.5007854773957064e-05,
68
+ "loss": 0.0052,
69
+ "step": 15471
70
  },
71
  {
72
+ "epoch": 0.75,
73
+ "learning_rate": 5.000872752661896e-05,
74
  "loss": 0.0051,
75
+ "step": 17190
76
+ },
77
+ {
78
+ "epoch": 0.83,
79
+ "learning_rate": 5.500960027928086e-05,
80
+ "loss": 0.0052,
81
+ "step": 18909
82
  },
83
  {
84
  "epoch": 0.9,
85
+ "learning_rate": 6.001047303194275e-05,
86
+ "loss": 0.0048,
87
+ "step": 20628
88
+ },
89
+ {
90
+ "epoch": 0.98,
91
+ "learning_rate": 6.501134578460465e-05,
92
  "loss": 0.0052,
93
+ "step": 22347
94
  },
95
  {
96
  "epoch": 1.0,
97
+ "eval_loss": 0.027106985449790955,
98
  "eval_max_distance": 9,
99
  "eval_mean_distance": 0,
100
+ "eval_runtime": 14.4629,
101
+ "eval_samples_per_second": 17.355,
102
+ "eval_steps_per_second": 1.175,
103
+ "step": 22916
104
  },
105
  {
106
+ "epoch": 1.05,
107
+ "learning_rate": 7.001221853726654e-05,
108
  "loss": 0.0045,
109
+ "step": 24066
110
  },
111
  {
112
+ "epoch": 1.13,
113
+ "learning_rate": 7.501309128992844e-05,
114
+ "loss": 0.0044,
115
+ "step": 25785
116
  },
117
  {
118
  "epoch": 1.2,
119
+ "learning_rate": 8.001396404259034e-05,
120
+ "loss": 0.0048,
121
+ "step": 27504
122
  },
123
  {
124
+ "epoch": 1.28,
125
+ "learning_rate": 8.501483679525223e-05,
126
  "loss": 0.0045,
127
+ "step": 29223
128
  },
129
  {
130
+ "epoch": 1.35,
131
+ "learning_rate": 9.001570954791413e-05,
132
+ "loss": 0.0044,
133
+ "step": 30942
134
+ },
135
+ {
136
+ "epoch": 1.43,
137
+ "learning_rate": 9.501658230057602e-05,
138
+ "loss": 0.0047,
139
+ "step": 32661
140
  },
141
  {
142
  "epoch": 1.5,
143
+ "learning_rate": 9.999806054964024e-05,
144
+ "loss": 0.005,
145
+ "step": 34380
146
  },
147
  {
148
+ "epoch": 1.58,
149
+ "learning_rate": 9.944240802156669e-05,
150
+ "loss": 0.0049,
151
+ "step": 36099
152
  },
153
  {
154
+ "epoch": 1.65,
155
+ "learning_rate": 9.888675549349314e-05,
156
+ "loss": 0.0053,
157
+ "step": 37818
158
+ },
159
+ {
160
+ "epoch": 1.73,
161
+ "learning_rate": 9.83311029654196e-05,
162
+ "loss": 0.0047,
163
+ "step": 39537
164
  },
165
  {
166
  "epoch": 1.8,
167
+ "learning_rate": 9.777545043734606e-05,
168
+ "loss": 0.005,
169
+ "step": 41256
170
  },
171
  {
172
+ "epoch": 1.88,
173
+ "learning_rate": 9.721979790927251e-05,
174
+ "loss": 0.0048,
175
+ "step": 42975
176
+ },
177
+ {
178
+ "epoch": 1.95,
179
+ "learning_rate": 9.666414538119898e-05,
180
+ "loss": 0.0051,
181
+ "step": 44694
182
  },
183
  {
184
  "epoch": 2.0,
185
+ "eval_loss": 0.026126669719815254,
186
+ "eval_max_distance": 8,
187
  "eval_mean_distance": 0,
188
+ "eval_runtime": 13.5883,
189
+ "eval_samples_per_second": 18.472,
190
+ "eval_steps_per_second": 1.251,
191
+ "step": 45832
192
  },
193
  {
194
+ "epoch": 2.03,
195
+ "learning_rate": 9.610849285312543e-05,
196
+ "loss": 0.0047,
197
+ "step": 46413
198
  },
199
  {
200
  "epoch": 2.1,
201
+ "learning_rate": 9.555284032505189e-05,
202
+ "loss": 0.004,
203
+ "step": 48132
204
  },
205
  {
206
+ "epoch": 2.18,
207
+ "learning_rate": 9.499718779697834e-05,
208
+ "loss": 0.004,
209
+ "step": 49851
210
  },
211
  {
212
+ "epoch": 2.25,
213
+ "learning_rate": 9.44415352689048e-05,
214
+ "loss": 0.0042,
215
+ "step": 51570
216
+ },
217
+ {
218
+ "epoch": 2.33,
219
+ "learning_rate": 9.388588274083125e-05,
220
+ "loss": 0.004,
221
+ "step": 53289
222
  },
223
  {
224
  "epoch": 2.4,
225
+ "learning_rate": 9.333023021275771e-05,
226
+ "loss": 0.0043,
227
+ "step": 55008
228
  },
229
  {
230
+ "epoch": 2.48,
231
+ "learning_rate": 9.277457768468416e-05,
232
+ "loss": 0.0042,
233
+ "step": 56727
234
  },
235
  {
236
+ "epoch": 2.55,
237
+ "learning_rate": 9.221892515661063e-05,
238
+ "loss": 0.004,
239
+ "step": 58446
240
+ },
241
+ {
242
+ "epoch": 2.63,
243
+ "learning_rate": 9.166327262853708e-05,
244
+ "loss": 0.0045,
245
+ "step": 60165
246
  },
247
  {
248
  "epoch": 2.7,
249
+ "learning_rate": 9.110762010046352e-05,
250
+ "loss": 0.0044,
251
+ "step": 61884
252
  },
253
  {
254
+ "epoch": 2.78,
255
+ "learning_rate": 9.055196757238999e-05,
256
+ "loss": 0.0044,
257
+ "step": 63603
258
  },
259
  {
260
+ "epoch": 2.85,
261
+ "learning_rate": 8.999631504431645e-05,
262
+ "loss": 0.0044,
263
+ "step": 65322
264
+ },
265
+ {
266
+ "epoch": 2.93,
267
+ "learning_rate": 8.94406625162429e-05,
268
  "loss": 0.0043,
269
+ "step": 67041
270
  },
271
  {
272
  "epoch": 3.0,
273
+ "eval_loss": 0.03130079433321953,
274
+ "eval_max_distance": 8,
275
  "eval_mean_distance": 0,
276
+ "eval_runtime": 13.4308,
277
+ "eval_samples_per_second": 18.688,
278
+ "eval_steps_per_second": 1.266,
279
+ "step": 68748
280
  },
281
  {
282
  "epoch": 3.0,
283
+ "learning_rate": 8.888500998816935e-05,
284
+ "loss": 0.0046,
285
+ "step": 68760
286
  },
287
  {
288
+ "epoch": 3.08,
289
+ "learning_rate": 8.832935746009581e-05,
290
+ "loss": 0.0036,
291
+ "step": 70479
292
  },
293
  {
294
+ "epoch": 3.15,
295
+ "learning_rate": 8.777370493202228e-05,
296
  "loss": 0.0036,
297
+ "step": 72198
298
+ },
299
+ {
300
+ "epoch": 3.23,
301
+ "learning_rate": 8.721805240394872e-05,
302
+ "loss": 0.0038,
303
+ "step": 73917
304
  },
305
  {
306
  "epoch": 3.3,
307
+ "learning_rate": 8.666239987587517e-05,
308
+ "loss": 0.0036,
309
+ "step": 75636
310
  },
311
  {
312
+ "epoch": 3.38,
313
+ "learning_rate": 8.610674734780164e-05,
314
+ "loss": 0.0038,
315
+ "step": 77355
316
  },
317
  {
318
+ "epoch": 3.45,
319
+ "learning_rate": 8.55510948197281e-05,
320
+ "loss": 0.0038,
321
+ "step": 79074
322
+ },
323
+ {
324
+ "epoch": 3.53,
325
+ "learning_rate": 8.499544229165455e-05,
326
+ "loss": 0.0038,
327
+ "step": 80793
328
  },
329
  {
330
  "epoch": 3.6,
331
+ "learning_rate": 8.4439789763581e-05,
332
+ "loss": 0.004,
333
+ "step": 82512
334
  },
335
  {
336
+ "epoch": 3.68,
337
+ "learning_rate": 8.388413723550746e-05,
338
+ "loss": 0.0037,
339
+ "step": 84231
340
+ },
341
+ {
342
+ "epoch": 3.75,
343
+ "learning_rate": 8.332848470743392e-05,
344
+ "loss": 0.0038,
345
+ "step": 85950
346
  },
347
  {
348
+ "epoch": 3.83,
349
+ "learning_rate": 8.277283217936037e-05,
350
  "loss": 0.0039,
351
+ "step": 87669
352
  },
353
  {
354
  "epoch": 3.9,
355
+ "learning_rate": 8.221717965128682e-05,
356
+ "loss": 0.0039,
357
+ "step": 89388
358
+ },
359
+ {
360
+ "epoch": 3.98,
361
+ "learning_rate": 8.166152712321329e-05,
362
+ "loss": 0.0041,
363
+ "step": 91107
364
  },
365
  {
366
  "epoch": 4.0,
367
+ "eval_loss": 0.02780107595026493,
368
+ "eval_max_distance": 10,
369
  "eval_mean_distance": 0,
370
+ "eval_runtime": 13.351,
371
+ "eval_samples_per_second": 18.8,
372
+ "eval_steps_per_second": 1.273,
373
+ "step": 91664
374
  },
375
  {
376
+ "epoch": 4.05,
377
+ "learning_rate": 8.110587459513974e-05,
378
+ "loss": 0.0037,
379
+ "step": 92826
380
  },
381
  {
382
+ "epoch": 4.13,
383
+ "learning_rate": 8.05502220670662e-05,
384
+ "loss": 0.0032,
385
+ "step": 94545
386
  },
387
  {
388
  "epoch": 4.2,
389
+ "learning_rate": 7.999456953899266e-05,
390
+ "loss": 0.0034,
391
+ "step": 96264
392
  },
393
  {
394
+ "epoch": 4.28,
395
+ "learning_rate": 7.943891701091911e-05,
396
  "loss": 0.0034,
397
+ "step": 97983
398
+ },
399
+ {
400
+ "epoch": 4.35,
401
+ "learning_rate": 7.888326448284556e-05,
402
+ "loss": 0.0035,
403
+ "step": 99702
404
  },
405
  {
406
+ "epoch": 4.43,
407
+ "learning_rate": 7.832761195477202e-05,
408
  "loss": 0.0034,
409
+ "step": 101421
410
  },
411
  {
412
  "epoch": 4.5,
413
+ "learning_rate": 7.777195942669849e-05,
414
+ "loss": 0.0036,
415
+ "step": 103140
416
  },
417
  {
418
+ "epoch": 4.58,
419
+ "learning_rate": 7.721630689862494e-05,
420
  "loss": 0.0035,
421
+ "step": 104859
422
  },
423
  {
424
+ "epoch": 4.65,
425
+ "learning_rate": 7.666065437055139e-05,
426
+ "loss": 0.0034,
427
+ "step": 106578
428
+ },
429
+ {
430
+ "epoch": 4.73,
431
+ "learning_rate": 7.610500184247783e-05,
432
+ "loss": 0.0034,
433
+ "step": 108297
434
  },
435
  {
436
  "epoch": 4.8,
437
+ "learning_rate": 7.554934931440431e-05,
438
+ "loss": 0.0036,
439
+ "step": 110016
440
  },
441
  {
442
+ "epoch": 4.88,
443
+ "learning_rate": 7.499369678633076e-05,
444
+ "loss": 0.0034,
445
+ "step": 111735
446
+ },
447
+ {
448
+ "epoch": 4.95,
449
+ "learning_rate": 7.443804425825721e-05,
450
+ "loss": 0.0037,
451
+ "step": 113454
452
  },
453
  {
454
  "epoch": 5.0,
455
+ "eval_loss": 0.028013188391923904,
456
+ "eval_max_distance": 8,
457
  "eval_mean_distance": 0,
458
+ "eval_runtime": 13.4584,
459
+ "eval_samples_per_second": 18.65,
460
+ "eval_steps_per_second": 1.263,
461
+ "step": 114580
462
  },
463
  {
464
+ "epoch": 5.03,
465
+ "learning_rate": 7.388239173018366e-05,
466
+ "loss": 0.0033,
467
+ "step": 115173
468
  },
469
  {
470
  "epoch": 5.1,
471
+ "learning_rate": 7.332673920211012e-05,
472
+ "loss": 0.0031,
473
+ "step": 116892
474
  },
475
  {
476
+ "epoch": 5.18,
477
+ "learning_rate": 7.277108667403659e-05,
478
+ "loss": 0.0031,
479
+ "step": 118611
480
  },
481
  {
482
+ "epoch": 5.25,
483
+ "learning_rate": 7.221543414596303e-05,
484
+ "loss": 0.0032,
485
+ "step": 120330
486
+ },
487
+ {
488
+ "epoch": 5.33,
489
+ "learning_rate": 7.16597816178895e-05,
490
+ "loss": 0.0031,
491
+ "step": 122049
492
  },
493
  {
494
  "epoch": 5.4,
495
+ "learning_rate": 7.110412908981595e-05,
496
+ "loss": 0.0032,
497
+ "step": 123768
498
+ },
499
+ {
500
+ "epoch": 5.48,
501
+ "learning_rate": 7.054847656174241e-05,
502
  "loss": 0.0031,
503
+ "step": 125487
504
  },
505
  {
506
+ "epoch": 5.55,
507
+ "learning_rate": 6.999282403366886e-05,
508
  "loss": 0.0032,
509
+ "step": 127206
510
  },
511
  {
512
+ "epoch": 5.63,
513
+ "learning_rate": 6.943717150559532e-05,
514
+ "loss": 0.0032,
515
+ "step": 128925
516
  },
517
  {
518
  "epoch": 5.7,
519
+ "learning_rate": 6.888151897752177e-05,
520
  "loss": 0.0032,
521
+ "step": 130644
522
  },
523
  {
524
+ "epoch": 5.78,
525
+ "learning_rate": 6.832586644944823e-05,
526
  "loss": 0.0031,
527
+ "step": 132363
528
  },
529
  {
530
+ "epoch": 5.85,
531
+ "learning_rate": 6.777021392137468e-05,
532
+ "loss": 0.0031,
533
+ "step": 134082
534
+ },
535
+ {
536
+ "epoch": 5.93,
537
+ "learning_rate": 6.721456139330115e-05,
538
  "loss": 0.0032,
539
+ "step": 135801
540
  },
541
  {
542
  "epoch": 6.0,
543
+ "eval_loss": 0.028835317119956017,
544
+ "eval_max_distance": 8,
545
  "eval_mean_distance": 0,
546
+ "eval_runtime": 13.4137,
547
+ "eval_samples_per_second": 18.712,
548
+ "eval_steps_per_second": 1.267,
549
+ "step": 137496
550
  },
551
  {
552
  "epoch": 6.0,
553
+ "learning_rate": 6.66589088652276e-05,
554
+ "loss": 0.0033,
555
+ "step": 137520
556
  },
557
  {
558
+ "epoch": 6.08,
559
+ "learning_rate": 6.610325633715405e-05,
560
+ "loss": 0.0028,
561
+ "step": 139239
562
  },
563
  {
564
+ "epoch": 6.15,
565
+ "learning_rate": 6.554760380908051e-05,
566
  "loss": 0.0026,
567
+ "step": 140958
568
+ },
569
+ {
570
+ "epoch": 6.23,
571
+ "learning_rate": 6.499195128100697e-05,
572
+ "loss": 0.0027,
573
+ "step": 142677
574
  },
575
  {
576
  "epoch": 6.3,
577
+ "learning_rate": 6.443629875293342e-05,
578
+ "loss": 0.0029,
579
+ "step": 144396
580
  },
581
  {
582
+ "epoch": 6.38,
583
+ "learning_rate": 6.388064622485987e-05,
584
  "loss": 0.0029,
585
+ "step": 146115
586
  },
587
  {
588
+ "epoch": 6.45,
589
+ "learning_rate": 6.332499369678633e-05,
590
+ "loss": 0.0029,
591
+ "step": 147834
592
  },
593
  {
594
+ "epoch": 6.53,
595
+ "learning_rate": 6.27693411687128e-05,
596
  "loss": 0.0028,
597
+ "step": 149553
598
  },
599
  {
600
+ "epoch": 6.6,
601
+ "learning_rate": 6.221368864063925e-05,
602
+ "loss": 0.0029,
603
+ "step": 151272
604
  },
605
  {
606
+ "epoch": 6.68,
607
+ "learning_rate": 6.16580361125657e-05,
608
+ "loss": 0.0029,
609
+ "step": 152991
610
+ },
611
+ {
612
+ "epoch": 6.75,
613
+ "learning_rate": 6.110238358449216e-05,
614
+ "loss": 0.0029,
615
+ "step": 154710
616
+ },
617
+ {
618
+ "epoch": 6.83,
619
+ "learning_rate": 6.0546731056418614e-05,
620
+ "loss": 0.0028,
621
+ "step": 156429
622
  },
623
  {
624
  "epoch": 6.9,
625
+ "learning_rate": 5.999107852834507e-05,
626
+ "loss": 0.0029,
627
+ "step": 158148
628
  },
629
  {
630
+ "epoch": 6.98,
631
+ "learning_rate": 5.943542600027152e-05,
632
+ "loss": 0.003,
633
+ "step": 159867
 
 
 
 
634
  },
635
  {
636
  "epoch": 7.0,
637
+ "eval_loss": 0.030847659334540367,
638
+ "eval_max_distance": 8,
639
+ "eval_mean_distance": 0,
640
+ "eval_runtime": 13.4895,
641
+ "eval_samples_per_second": 18.607,
642
+ "eval_steps_per_second": 1.26,
643
+ "step": 160412
644
  },
645
  {
646
+ "epoch": 7.05,
647
+ "learning_rate": 5.887977347219798e-05,
648
  "loss": 0.0027,
649
+ "step": 161586
650
  },
651
  {
652
+ "epoch": 7.13,
653
+ "learning_rate": 5.832412094412444e-05,
654
  "loss": 0.0025,
655
+ "step": 163305
656
+ },
657
+ {
658
+ "epoch": 7.2,
659
+ "learning_rate": 5.7768468416050895e-05,
660
+ "loss": 0.0026,
661
+ "step": 165024
662
  },
663
  {
664
+ "epoch": 7.28,
665
+ "learning_rate": 5.7212815887977344e-05,
666
+ "loss": 0.0027,
667
+ "step": 166743
668
+ },
669
+ {
670
+ "epoch": 7.35,
671
+ "learning_rate": 5.665716335990381e-05,
672
  "loss": 0.0025,
673
+ "step": 168462
674
  },
675
  {
676
+ "epoch": 7.43,
677
+ "learning_rate": 5.610151083183026e-05,
678
+ "loss": 0.0026,
679
+ "step": 170181
680
  },
681
  {
682
  "epoch": 7.5,
683
+ "learning_rate": 5.554585830375671e-05,
684
  "loss": 0.0026,
685
+ "step": 171900
686
  },
687
  {
688
+ "epoch": 7.58,
689
+ "learning_rate": 5.499020577568318e-05,
690
  "loss": 0.0026,
691
+ "step": 173619
692
  },
693
  {
694
+ "epoch": 7.65,
695
+ "learning_rate": 5.443455324760963e-05,
696
  "loss": 0.0026,
697
+ "step": 175338
698
  },
699
  {
700
+ "epoch": 7.73,
701
+ "learning_rate": 5.387890071953609e-05,
702
  "loss": 0.0025,
703
+ "step": 177057
704
  },
705
  {
706
+ "epoch": 7.8,
707
+ "learning_rate": 5.332324819146254e-05,
708
  "loss": 0.0027,
709
+ "step": 178776
710
+ },
711
+ {
712
+ "epoch": 7.88,
713
+ "learning_rate": 5.2767595663389e-05,
714
+ "loss": 0.0028,
715
+ "step": 180495
716
+ },
717
+ {
718
+ "epoch": 7.95,
719
+ "learning_rate": 5.2211943135315456e-05,
720
+ "loss": 0.0025,
721
+ "step": 182214
722
  },
723
  {
724
  "epoch": 8.0,
725
+ "eval_loss": 0.03048335202038288,
726
+ "eval_max_distance": 8,
727
  "eval_mean_distance": 0,
728
+ "eval_runtime": 13.5077,
729
+ "eval_samples_per_second": 18.582,
730
+ "eval_steps_per_second": 1.259,
731
+ "step": 183328
732
  },
733
  {
734
+ "epoch": 8.03,
735
+ "learning_rate": 5.1656290607241906e-05,
736
+ "loss": 0.0026,
737
+ "step": 183933
738
  },
739
  {
740
  "epoch": 8.1,
741
+ "learning_rate": 5.110063807916836e-05,
742
  "loss": 0.0023,
743
+ "step": 185652
744
  },
745
  {
746
+ "epoch": 8.18,
747
+ "learning_rate": 5.0544985551094825e-05,
748
+ "loss": 0.0024,
749
+ "step": 187371
750
  },
751
  {
752
+ "epoch": 8.25,
753
+ "learning_rate": 4.998933302302128e-05,
754
+ "loss": 0.0023,
755
+ "step": 189090
756
  },
757
  {
758
+ "epoch": 8.33,
759
+ "learning_rate": 4.943368049494773e-05,
760
+ "loss": 0.0024,
761
+ "step": 190809
762
  },
763
  {
764
+ "epoch": 8.4,
765
+ "learning_rate": 4.887802796687419e-05,
766
  "loss": 0.0024,
767
+ "step": 192528
768
  },
769
  {
770
+ "epoch": 8.48,
771
+ "learning_rate": 4.832237543880065e-05,
772
  "loss": 0.0025,
773
+ "step": 194247
774
  },
775
  {
776
+ "epoch": 8.55,
777
+ "learning_rate": 4.77667229107271e-05,
778
  "loss": 0.0024,
779
+ "step": 195966
780
  },
781
  {
782
+ "epoch": 8.63,
783
+ "learning_rate": 4.721107038265356e-05,
784
+ "loss": 0.0026,
785
+ "step": 197685
786
  },
787
  {
788
+ "epoch": 8.7,
789
+ "learning_rate": 4.665541785458001e-05,
790
  "loss": 0.0023,
791
+ "step": 199404
792
  },
793
  {
794
+ "epoch": 8.78,
795
+ "learning_rate": 4.6099765326506474e-05,
796
+ "loss": 0.0024,
797
+ "step": 201123
 
 
 
 
798
  },
799
  {
800
+ "epoch": 8.85,
801
+ "learning_rate": 4.5544112798432924e-05,
802
  "loss": 0.0024,
803
+ "step": 202842
804
  },
805
  {
806
+ "epoch": 8.93,
807
+ "learning_rate": 4.4988460270359386e-05,
808
+ "loss": 0.0025,
809
+ "step": 204561
810
  },
811
  {
812
+ "epoch": 9.0,
813
+ "eval_loss": 0.030335595831274986,
814
+ "eval_max_distance": 8,
815
+ "eval_mean_distance": 0,
816
+ "eval_runtime": 13.4109,
817
+ "eval_samples_per_second": 18.716,
818
+ "eval_steps_per_second": 1.268,
819
+ "step": 206244
820
  },
821
  {
822
+ "epoch": 9.0,
823
+ "learning_rate": 4.4432807742285836e-05,
824
+ "loss": 0.0026,
825
+ "step": 206280
826
  },
827
  {
828
+ "epoch": 9.08,
829
+ "learning_rate": 4.38771552142123e-05,
830
+ "loss": 0.0021,
831
+ "step": 207999
832
  },
833
  {
834
+ "epoch": 9.15,
835
+ "learning_rate": 4.332150268613875e-05,
836
  "loss": 0.0022,
837
+ "step": 209718
 
 
 
 
 
 
838
  },
839
  {
840
+ "epoch": 9.23,
841
+ "learning_rate": 4.2765850158065204e-05,
842
+ "loss": 0.0022,
843
+ "step": 211437
844
  },
845
  {
846
+ "epoch": 9.3,
847
+ "learning_rate": 4.221019762999166e-05,
848
+ "loss": 0.0023,
849
+ "step": 213156
850
  },
851
  {
852
+ "epoch": 9.38,
853
+ "learning_rate": 4.165454510191812e-05,
854
  "loss": 0.0023,
855
+ "step": 214875
856
  },
857
  {
858
+ "epoch": 9.45,
859
+ "learning_rate": 4.109889257384457e-05,
860
+ "loss": 0.0023,
861
+ "step": 216594
 
 
 
 
862
  },
863
  {
864
+ "epoch": 9.53,
865
+ "learning_rate": 4.054324004577103e-05,
866
  "loss": 0.0023,
867
+ "step": 218313
868
  },
869
  {
870
+ "epoch": 9.6,
871
+ "learning_rate": 3.998758751769749e-05,
872
+ "loss": 0.0024,
873
+ "step": 220032
874
  },
875
  {
876
+ "epoch": 9.68,
877
+ "learning_rate": 3.943193498962394e-05,
878
+ "loss": 0.0024,
879
+ "step": 221751
880
  },
881
  {
882
+ "epoch": 9.75,
883
+ "learning_rate": 3.88762824615504e-05,
884
+ "loss": 0.0022,
885
+ "step": 223470
886
  },
887
  {
888
+ "epoch": 9.83,
889
+ "learning_rate": 3.8320629933476854e-05,
890
+ "loss": 0.0023,
891
+ "step": 225189
892
  },
893
  {
894
+ "epoch": 9.9,
895
+ "learning_rate": 3.776497740540331e-05,
896
+ "loss": 0.0024,
897
+ "step": 226908
898
  },
899
  {
900
+ "epoch": 9.98,
901
+ "learning_rate": 3.7209324877329766e-05,
902
+ "loss": 0.0023,
903
+ "step": 228627
904
  },
905
  {
906
+ "epoch": 10.0,
907
+ "eval_loss": 0.034065987914800644,
908
+ "eval_max_distance": 8,
909
+ "eval_mean_distance": 0,
910
+ "eval_runtime": 13.4726,
911
+ "eval_samples_per_second": 18.63,
912
+ "eval_steps_per_second": 1.262,
913
+ "step": 229160
914
  },
915
  {
916
+ "epoch": 10.05,
917
+ "learning_rate": 3.665367234925622e-05,
918
+ "loss": 0.0021,
919
+ "step": 230346
920
  },
921
  {
922
+ "epoch": 10.13,
923
+ "learning_rate": 3.609801982118268e-05,
924
+ "loss": 0.0021,
925
+ "step": 232065
926
  },
927
  {
928
+ "epoch": 10.2,
929
+ "learning_rate": 3.5542367293109135e-05,
930
+ "loss": 0.0021,
931
+ "step": 233784
 
 
 
 
932
  },
933
  {
934
+ "epoch": 10.28,
935
+ "learning_rate": 3.498671476503559e-05,
936
+ "loss": 0.0022,
937
+ "step": 235503
938
  },
939
  {
940
+ "epoch": 10.35,
941
+ "learning_rate": 3.443106223696205e-05,
942
+ "loss": 0.0021,
943
+ "step": 237222
944
  },
945
  {
946
+ "epoch": 10.43,
947
+ "learning_rate": 3.38754097088885e-05,
948
+ "loss": 0.0022,
949
+ "step": 238941
950
  },
951
  {
952
+ "epoch": 10.5,
953
+ "learning_rate": 3.331975718081496e-05,
954
+ "loss": 0.0021,
955
+ "step": 240660
956
  },
957
  {
958
+ "epoch": 10.58,
959
+ "learning_rate": 3.276410465274141e-05,
960
+ "loss": 0.0021,
961
+ "step": 242379
962
  },
963
  {
964
+ "epoch": 10.65,
965
+ "learning_rate": 3.220845212466787e-05,
966
+ "loss": 0.0021,
967
+ "step": 244098
968
  },
969
  {
970
+ "epoch": 10.73,
971
+ "learning_rate": 3.165279959659433e-05,
972
+ "loss": 0.0022,
973
+ "step": 245817
974
  },
975
  {
976
+ "epoch": 10.8,
977
+ "learning_rate": 3.1097147068520784e-05,
978
+ "loss": 0.0022,
979
+ "step": 247536
980
  },
981
  {
982
+ "epoch": 10.88,
983
+ "learning_rate": 3.054149454044724e-05,
984
  "loss": 0.002,
985
+ "step": 249255
986
  },
987
  {
988
+ "epoch": 10.95,
989
+ "learning_rate": 2.9985842012373693e-05,
990
+ "loss": 0.0022,
991
+ "step": 250974
992
  },
993
  {
994
+ "epoch": 11.0,
995
+ "eval_loss": 0.03288768604397774,
996
+ "eval_max_distance": 8,
997
  "eval_mean_distance": 0,
998
+ "eval_runtime": 13.3832,
999
+ "eval_samples_per_second": 18.755,
1000
+ "eval_steps_per_second": 1.27,
1001
+ "step": 252076
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1002
  },
1003
  {
1004
+ "epoch": 11.03,
1005
+ "learning_rate": 2.9430189484300152e-05,
1006
+ "loss": 0.0022,
1007
+ "step": 252693
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1008
  },
1009
  {
1010
+ "epoch": 11.1,
1011
+ "learning_rate": 2.8874536956226605e-05,
1012
  "loss": 0.002,
1013
+ "step": 254412
1014
  },
1015
  {
1016
+ "epoch": 11.18,
1017
+ "learning_rate": 2.8318884428153065e-05,
1018
+ "loss": 0.002,
1019
+ "step": 256131
 
 
 
 
 
 
 
 
 
 
1020
  },
1021
  {
1022
+ "epoch": 11.25,
1023
+ "learning_rate": 2.7763231900079517e-05,
1024
+ "loss": 0.002,
1025
+ "step": 257850
1026
  },
1027
  {
1028
+ "epoch": 11.33,
1029
+ "learning_rate": 2.7207579372005977e-05,
1030
+ "loss": 0.0021,
1031
+ "step": 259569
1032
  },
1033
  {
1034
+ "epoch": 11.4,
1035
+ "learning_rate": 2.665192684393243e-05,
1036
+ "loss": 0.0019,
1037
+ "step": 261288
1038
  },
1039
  {
1040
+ "epoch": 11.48,
1041
+ "learning_rate": 2.6096274315858886e-05,
1042
+ "loss": 0.002,
1043
+ "step": 263007
1044
  },
1045
  {
1046
+ "epoch": 11.55,
1047
+ "learning_rate": 2.5540621787785342e-05,
1048
+ "loss": 0.002,
1049
+ "step": 264726
1050
  },
1051
  {
1052
+ "epoch": 11.63,
1053
+ "learning_rate": 2.4984969259711798e-05,
1054
+ "loss": 0.002,
1055
+ "step": 266445
1056
  },
1057
  {
1058
+ "epoch": 11.7,
1059
+ "learning_rate": 2.4429316731638254e-05,
1060
+ "loss": 0.0019,
1061
+ "step": 268164
1062
  },
1063
  {
1064
+ "epoch": 11.78,
1065
+ "learning_rate": 2.387366420356471e-05,
1066
+ "loss": 0.0021,
1067
+ "step": 269883
1068
  },
1069
  {
1070
+ "epoch": 11.85,
1071
+ "learning_rate": 2.3318011675491167e-05,
1072
+ "loss": 0.0021,
1073
+ "step": 271602
1074
  },
1075
  {
1076
+ "epoch": 11.93,
1077
+ "learning_rate": 2.2762359147417623e-05,
1078
+ "loss": 0.0019,
1079
+ "step": 273321
1080
  },
1081
  {
1082
+ "epoch": 12.0,
1083
+ "eval_loss": 0.03355114161968231,
1084
+ "eval_max_distance": 8,
1085
  "eval_mean_distance": 0,
1086
+ "eval_runtime": 13.4567,
1087
+ "eval_samples_per_second": 18.652,
1088
+ "eval_steps_per_second": 1.263,
1089
+ "step": 274992
 
 
 
 
 
 
1090
  },
1091
  {
1092
+ "epoch": 12.0,
1093
+ "learning_rate": 2.220670661934408e-05,
1094
+ "loss": 0.0021,
1095
+ "step": 275040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1096
  },
1097
  {
1098
+ "epoch": 12.08,
1099
+ "learning_rate": 2.1651054091270535e-05,
1100
+ "loss": 0.002,
1101
+ "step": 276759
1102
  },
1103
  {
1104
+ "epoch": 12.15,
1105
+ "learning_rate": 2.109540156319699e-05,
1106
+ "loss": 0.002,
1107
+ "step": 278478
1108
  },
1109
  {
1110
+ "epoch": 12.23,
1111
+ "learning_rate": 2.0539749035123444e-05,
1112
+ "loss": 0.0018,
1113
+ "step": 280197
 
 
 
 
1114
  },
1115
  {
1116
+ "epoch": 12.3,
1117
+ "learning_rate": 1.9984096507049904e-05,
1118
+ "loss": 0.002,
1119
+ "step": 281916
1120
  },
1121
  {
1122
+ "epoch": 12.38,
1123
+ "learning_rate": 1.942844397897636e-05,
1124
+ "loss": 0.0019,
1125
+ "step": 283635
1126
  },
1127
  {
1128
+ "epoch": 12.45,
1129
+ "learning_rate": 1.8872791450902816e-05,
1130
+ "loss": 0.0018,
1131
+ "step": 285354
1132
  },
1133
  {
1134
+ "epoch": 12.53,
1135
+ "learning_rate": 1.8317138922829272e-05,
1136
+ "loss": 0.0019,
1137
+ "step": 287073
1138
  },
1139
  {
1140
+ "epoch": 12.6,
1141
+ "learning_rate": 1.776148639475573e-05,
1142
+ "loss": 0.0018,
1143
+ "step": 288792
1144
  },
1145
  {
1146
+ "epoch": 12.68,
1147
+ "learning_rate": 1.7205833866682185e-05,
1148
+ "loss": 0.0019,
1149
+ "step": 290511
1150
  },
1151
  {
1152
+ "epoch": 12.75,
1153
+ "learning_rate": 1.665018133860864e-05,
1154
+ "loss": 0.0019,
1155
+ "step": 292230
1156
  },
1157
  {
1158
+ "epoch": 12.83,
1159
+ "learning_rate": 1.6094528810535094e-05,
1160
+ "loss": 0.0021,
1161
+ "step": 293949
1162
  },
1163
  {
1164
+ "epoch": 12.9,
1165
+ "learning_rate": 1.553887628246155e-05,
1166
+ "loss": 0.0019,
1167
+ "step": 295668
1168
  },
1169
  {
1170
+ "epoch": 12.98,
1171
+ "learning_rate": 1.4983223754388006e-05,
1172
+ "loss": 0.002,
1173
+ "step": 297387
1174
  },
1175
  {
1176
+ "epoch": 13.0,
1177
+ "eval_loss": 0.035788267850875854,
1178
+ "eval_max_distance": 8,
1179
  "eval_mean_distance": 0,
1180
+ "eval_runtime": 13.4958,
1181
+ "eval_samples_per_second": 18.598,
1182
+ "eval_steps_per_second": 1.26,
1183
+ "step": 297908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1184
  },
1185
  {
1186
+ "epoch": 13.05,
1187
+ "learning_rate": 1.4427571226314462e-05,
1188
+ "loss": 0.0019,
1189
+ "step": 299106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1190
  },
1191
  {
1192
+ "epoch": 13.13,
1193
+ "learning_rate": 1.3871918698240918e-05,
1194
+ "loss": 0.0018,
1195
+ "step": 300825
1196
  },
1197
  {
1198
+ "epoch": 13.2,
1199
+ "learning_rate": 1.3316266170167374e-05,
1200
+ "loss": 0.0018,
1201
+ "step": 302544
 
 
 
 
1202
  },
1203
  {
1204
+ "epoch": 13.28,
1205
+ "learning_rate": 1.276061364209383e-05,
1206
+ "loss": 0.0018,
1207
+ "step": 304263
1208
  },
1209
  {
1210
+ "epoch": 13.35,
1211
+ "learning_rate": 1.2204961114020287e-05,
1212
+ "loss": 0.0018,
1213
+ "step": 305982
1214
  },
1215
  {
1216
+ "epoch": 13.43,
1217
+ "learning_rate": 1.1649308585946743e-05,
1218
+ "loss": 0.0018,
1219
+ "step": 307701
1220
  },
1221
  {
1222
+ "epoch": 13.5,
1223
+ "learning_rate": 1.1093656057873199e-05,
1224
+ "loss": 0.0019,
1225
+ "step": 309420
1226
  },
1227
  {
1228
+ "epoch": 13.58,
1229
+ "learning_rate": 1.0538003529799655e-05,
1230
+ "loss": 0.0018,
1231
+ "step": 311139
1232
  },
1233
  {
1234
+ "epoch": 13.65,
1235
+ "learning_rate": 9.982351001726111e-06,
1236
+ "loss": 0.0017,
1237
+ "step": 312858
1238
  },
1239
  {
1240
+ "epoch": 13.73,
1241
+ "learning_rate": 9.426698473652567e-06,
1242
+ "loss": 0.0018,
1243
+ "step": 314577
1244
  },
1245
  {
1246
+ "epoch": 13.8,
1247
+ "learning_rate": 8.871045945579024e-06,
1248
+ "loss": 0.002,
1249
+ "step": 316296
1250
  },
1251
  {
1252
+ "epoch": 13.88,
1253
+ "learning_rate": 8.31539341750548e-06,
1254
+ "loss": 0.0018,
1255
+ "step": 318015
1256
  },
1257
  {
1258
+ "epoch": 13.95,
1259
+ "learning_rate": 7.759740889431934e-06,
1260
+ "loss": 0.0018,
1261
+ "step": 319734
1262
  },
1263
  {
1264
+ "epoch": 14.0,
1265
+ "eval_loss": 0.03550655022263527,
1266
+ "eval_max_distance": 8,
1267
  "eval_mean_distance": 0,
1268
+ "eval_runtime": 13.4713,
1269
+ "eval_samples_per_second": 18.632,
1270
+ "eval_steps_per_second": 1.262,
1271
+ "step": 320824
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1272
  },
1273
  {
1274
+ "epoch": 14.03,
1275
+ "learning_rate": 7.204088361358391e-06,
1276
+ "loss": 0.0017,
1277
+ "step": 321453
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1278
  },
1279
  {
1280
+ "epoch": 14.1,
1281
+ "learning_rate": 6.648435833284847e-06,
1282
+ "loss": 0.0018,
1283
+ "step": 323172
1284
  },
1285
  {
1286
+ "epoch": 14.18,
1287
+ "learning_rate": 6.092783305211304e-06,
1288
+ "loss": 0.0018,
1289
+ "step": 324891
1290
  },
1291
  {
1292
+ "epoch": 14.25,
1293
+ "learning_rate": 5.537130777137759e-06,
1294
+ "loss": 0.0017,
1295
+ "step": 326610
 
 
 
 
1296
  },
1297
  {
1298
+ "epoch": 14.33,
1299
+ "learning_rate": 4.981478249064216e-06,
1300
+ "loss": 0.0018,
1301
+ "step": 328329
1302
  },
1303
  {
1304
+ "epoch": 14.4,
1305
+ "learning_rate": 4.425825720990671e-06,
1306
+ "loss": 0.0018,
1307
+ "step": 330048
1308
  },
1309
  {
1310
+ "epoch": 14.48,
1311
+ "learning_rate": 3.8701731929171274e-06,
1312
+ "loss": 0.0018,
1313
+ "step": 331767
1314
  },
1315
  {
1316
+ "epoch": 14.55,
1317
+ "learning_rate": 3.3145206648435836e-06,
1318
+ "loss": 0.0019,
1319
+ "step": 333486
1320
  },
1321
  {
1322
+ "epoch": 14.63,
1323
+ "learning_rate": 2.7588681367700398e-06,
1324
+ "loss": 0.0017,
1325
+ "step": 335205
1326
  },
1327
  {
1328
+ "epoch": 14.7,
1329
+ "learning_rate": 2.2032156086964955e-06,
1330
+ "loss": 0.0018,
1331
+ "step": 336924
1332
  },
1333
  {
1334
+ "epoch": 14.78,
1335
+ "learning_rate": 1.6475630806229517e-06,
1336
+ "loss": 0.0017,
1337
+ "step": 338643
1338
  },
1339
  {
1340
+ "epoch": 14.85,
1341
+ "learning_rate": 1.0919105525494076e-06,
1342
+ "loss": 0.0018,
1343
+ "step": 340362
1344
  },
1345
  {
1346
+ "epoch": 14.93,
1347
+ "learning_rate": 5.362580244758636e-07,
1348
+ "loss": 0.0019,
1349
+ "step": 342081
1350
  },
1351
  {
1352
+ "epoch": 15.0,
1353
+ "eval_loss": 0.03661360964179039,
1354
+ "eval_max_distance": 8,
1355
+ "eval_mean_distance": 0,
1356
+ "eval_runtime": 13.3536,
1357
+ "eval_samples_per_second": 18.796,
1358
+ "eval_steps_per_second": 1.273,
1359
+ "step": 343740
1360
  },
1361
  {
1362
+ "epoch": 15.0,
1363
+ "step": 343740,
1364
+ "total_flos": 8.727792619277722e+16,
1365
+ "train_loss": 0.0029792904397642345,
1366
+ "train_runtime": 24306.5697,
1367
+ "train_samples_per_second": 212.119,
1368
+ "train_steps_per_second": 14.142
 
 
 
 
 
 
 
 
 
 
1369
  }
1370
  ],
1371
+ "logging_steps": 1719,
1372
+ "max_steps": 343740,
1373
+ "num_train_epochs": 15,
1374
+ "save_steps": 3438,
1375
+ "total_flos": 8.727792619277722e+16,
1376
  "trial_name": null,
1377
  "trial_params": null
1378
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:768220270743ed53ef0cb8a00d97f4eba8645c310d633cf26655e6848b4b3523
3
  size 4091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19cad5fdc011eae68aae9d7cd252dcf011f18199df3fd5c6b107c8e3cbed177f
3
  size 4091