alexue4 committed on
Commit
359fc6b
1 Parent(s): 6d4fe45

End of training

Browse files
Files changed (4) hide show
  1. README.md +19 -10
  2. pytorch_model.bin +1 -1
  3. trainer_state.json +655 -925
  4. training_args.bin +1 -1
README.md CHANGED
@@ -15,9 +15,9 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model is a fine-tuned version of [alexue4/text-normalization-ru-new](https://huggingface.co/alexue4/text-normalization-ru-new) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.0279
19
  - Mean Distance: 0
20
- - Max Distance: 8
21
 
22
  ## Model description
23
 
@@ -43,17 +43,26 @@ The following hyperparameters were used during training:
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
- - num_epochs: 5
47
 
48
  ### Training results
49
 
50
- | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
51
- |:-------------:|:-----:|:------:|:---------------:|:-------------:|:------------:|
52
- | 0.0024 | 1.0 | 22994 | 0.0264 | 0 | 8 |
53
- | 0.0022 | 2.0 | 45988 | 0.0259 | 0 | 8 |
54
- | 0.0019 | 3.0 | 68982 | 0.0292 | 0 | 8 |
55
- | 0.0016 | 4.0 | 91976 | 0.0281 | 0 | 8 |
56
- | 0.0016 | 5.0 | 114970 | 0.0279 | 0 | 8 |
 
 
 
 
 
 
 
 
 
57
 
58
 
59
  ### Framework versions
 
15
 
16
  This model is a fine-tuned version of [alexue4/text-normalization-ru-new](https://huggingface.co/alexue4/text-normalization-ru-new) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.0114
19
  - Mean Distance: 0
20
+ - Max Distance: 3
21
 
22
  ## Model description
23
 
 
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: linear
45
  - lr_scheduler_warmup_ratio: 0.1
46
+ - num_epochs: 14
47
 
48
  ### Training results
49
 
50
+ | Training Loss | Epoch | Step | Validation Loss | Max Distance | Mean Distance |
51
+ |:-------------:|:-----:|:------:|:---------------:|:------------:|:-------------:|
52
+ | 0.0017 | 1.0 | 23077 | 0.0089 | 1 | 0 |
53
+ | 0.0019 | 2.0 | 46154 | 0.0084 | 3 | 0 |
54
+ | 0.002 | 3.0 | 69231 | 0.0087 | 1 | 0 |
55
+ | 0.0021 | 4.0 | 92308 | 0.0120 | 4 | 0 |
56
+ | 0.0019 | 5.0 | 115385 | 0.0100 | 4 | 0 |
57
+ | 0.0018 | 6.0 | 138462 | 0.0111 | 3 | 0 |
58
+ | 0.0017 | 7.0 | 161539 | 0.0070 | 3 | 0 |
59
+ | 0.0017 | 8.0 | 184616 | 0.0142 | 4 | 0 |
60
+ | 0.0014 | 9.0 | 207693 | 0.0118 | 4 | 0 |
61
+ | 0.0014 | 10.0 | 230770 | 0.0115 | 3 | 0 |
62
+ | 0.0013 | 11.0 | 253847 | 0.0113 | 3 | 0 |
63
+ | 0.0012 | 12.0 | 276924 | 0.0120 | 3 | 0 |
64
+ | 0.0012 | 13.0 | 300001 | 0.0132 | 3 | 0 |
65
+ | 0.001 | 14.0 | 323078 | 0.0114 | 3 | 0 |
66
 
67
 
68
  ### Framework versions
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02a57c7242c971b094f9debfa498d92a23843f114d3e18a09c850885d9c933eb
3
  size 258643461
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cac54e664bb72f32d01abbce7b6a284b9b0a4e4e545e2fc90ad6ae63a03897d2
3
  size 258643461
trainer_state.json CHANGED
@@ -1,1278 +1,1008 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 114970,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 8.697921196833957e-09,
14
- "loss": 0.0,
15
  "step": 1
16
  },
17
- {
18
- "epoch": 0.03,
19
- "learning_rate": 5.001304688179525e-06,
20
- "loss": 0.0024,
21
- "step": 575
22
- },
23
- {
24
- "epoch": 0.05,
25
- "learning_rate": 1.000260937635905e-05,
26
- "loss": 0.0025,
27
- "step": 1150
28
- },
29
- {
30
- "epoch": 0.08,
31
- "learning_rate": 1.5003914064538576e-05,
32
- "loss": 0.0021,
33
- "step": 1725
34
- },
35
  {
36
  "epoch": 0.1,
37
- "learning_rate": 2.00052187527181e-05,
38
- "loss": 0.002,
39
- "step": 2300
40
- },
41
- {
42
- "epoch": 0.13,
43
- "learning_rate": 2.500652344089763e-05,
44
- "loss": 0.0024,
45
- "step": 2875
46
- },
47
- {
48
- "epoch": 0.15,
49
- "learning_rate": 3.0007828129077153e-05,
50
- "loss": 0.0021,
51
- "step": 3450
52
- },
53
- {
54
- "epoch": 0.18,
55
- "learning_rate": 3.500913281725668e-05,
56
- "loss": 0.0024,
57
- "step": 4025
58
  },
59
  {
60
  "epoch": 0.2,
61
- "learning_rate": 4.00104375054362e-05,
62
- "loss": 0.0021,
63
- "step": 4600
64
- },
65
- {
66
- "epoch": 0.23,
67
- "learning_rate": 4.501174219361573e-05,
68
- "loss": 0.0021,
69
- "step": 5175
70
- },
71
- {
72
- "epoch": 0.25,
73
- "learning_rate": 5.001304688179526e-05,
74
- "loss": 0.0021,
75
- "step": 5750
76
- },
77
- {
78
- "epoch": 0.28,
79
- "learning_rate": 5.501435156997478e-05,
80
- "loss": 0.0022,
81
- "step": 6325
82
  },
83
  {
84
  "epoch": 0.3,
85
- "learning_rate": 6.0015656258154306e-05,
86
- "loss": 0.002,
87
- "step": 6900
88
- },
89
- {
90
- "epoch": 0.33,
91
- "learning_rate": 6.501696094633383e-05,
92
- "loss": 0.0021,
93
- "step": 7475
94
- },
95
- {
96
- "epoch": 0.35,
97
- "learning_rate": 7.001826563451336e-05,
98
- "loss": 0.0018,
99
- "step": 8050
100
- },
101
- {
102
- "epoch": 0.38,
103
- "learning_rate": 7.501957032269288e-05,
104
- "loss": 0.0021,
105
- "step": 8625
106
  },
107
  {
108
  "epoch": 0.4,
109
- "learning_rate": 8.00208750108724e-05,
110
- "loss": 0.0019,
111
- "step": 9200
112
- },
113
- {
114
- "epoch": 0.43,
115
- "learning_rate": 8.502217969905193e-05,
116
- "loss": 0.0023,
117
- "step": 9775
118
- },
119
- {
120
- "epoch": 0.45,
121
- "learning_rate": 9.002348438723146e-05,
122
- "loss": 0.0021,
123
- "step": 10350
124
- },
125
- {
126
- "epoch": 0.48,
127
- "learning_rate": 9.502478907541099e-05,
128
- "loss": 0.0021,
129
- "step": 10925
130
  },
131
  {
132
  "epoch": 0.5,
133
- "learning_rate": 9.99971006929344e-05,
134
- "loss": 0.0022,
135
- "step": 11500
136
- },
137
- {
138
- "epoch": 0.53,
139
- "learning_rate": 9.944140017202556e-05,
140
- "loss": 0.0025,
141
- "step": 12075
142
- },
143
- {
144
- "epoch": 0.55,
145
- "learning_rate": 9.888569965111673e-05,
146
- "loss": 0.0022,
147
- "step": 12650
148
- },
149
- {
150
- "epoch": 0.58,
151
- "learning_rate": 9.832999913020789e-05,
152
- "loss": 0.0022,
153
- "step": 13225
154
  },
155
  {
156
  "epoch": 0.6,
157
- "learning_rate": 9.777429860929905e-05,
158
- "loss": 0.002,
159
- "step": 13800
160
- },
161
- {
162
- "epoch": 0.63,
163
- "learning_rate": 9.721859808839022e-05,
164
- "loss": 0.0021,
165
- "step": 14375
166
- },
167
- {
168
- "epoch": 0.65,
169
- "learning_rate": 9.666289756748138e-05,
170
- "loss": 0.0023,
171
- "step": 14950
172
- },
173
- {
174
- "epoch": 0.68,
175
- "learning_rate": 9.610719704657253e-05,
176
- "loss": 0.0023,
177
- "step": 15525
178
  },
179
  {
180
  "epoch": 0.7,
181
- "learning_rate": 9.55514965256637e-05,
182
- "loss": 0.0023,
183
- "step": 16100
184
- },
185
- {
186
- "epoch": 0.73,
187
- "learning_rate": 9.499579600475486e-05,
188
- "loss": 0.002,
189
- "step": 16675
190
- },
191
- {
192
- "epoch": 0.75,
193
- "learning_rate": 9.444009548384603e-05,
194
- "loss": 0.0025,
195
- "step": 17250
196
- },
197
- {
198
- "epoch": 0.78,
199
- "learning_rate": 9.388439496293719e-05,
200
- "loss": 0.0023,
201
- "step": 17825
202
  },
203
  {
204
  "epoch": 0.8,
205
- "learning_rate": 9.332869444202836e-05,
206
- "loss": 0.0026,
207
- "step": 18400
208
- },
209
- {
210
- "epoch": 0.83,
211
- "learning_rate": 9.277299392111952e-05,
212
- "loss": 0.0025,
213
- "step": 18975
214
- },
215
- {
216
- "epoch": 0.85,
217
- "learning_rate": 9.221729340021069e-05,
218
- "loss": 0.0027,
219
- "step": 19550
220
- },
221
- {
222
- "epoch": 0.88,
223
- "learning_rate": 9.166159287930185e-05,
224
- "loss": 0.0024,
225
- "step": 20125
226
  },
227
  {
228
  "epoch": 0.9,
229
- "learning_rate": 9.110589235839302e-05,
230
- "loss": 0.0022,
231
- "step": 20700
232
- },
233
- {
234
- "epoch": 0.93,
235
- "learning_rate": 9.055019183748418e-05,
236
- "loss": 0.0026,
237
- "step": 21275
238
- },
239
- {
240
- "epoch": 0.95,
241
- "learning_rate": 8.999449131657535e-05,
242
- "loss": 0.0025,
243
- "step": 21850
244
- },
245
- {
246
- "epoch": 0.98,
247
- "learning_rate": 8.94387907956665e-05,
248
- "loss": 0.0024,
249
- "step": 22425
250
  },
251
  {
252
  "epoch": 1.0,
253
- "eval_loss": 0.02637363225221634,
254
- "eval_max_distance": 8,
255
  "eval_mean_distance": 0,
256
- "eval_runtime": 23.0018,
257
- "eval_samples_per_second": 11.217,
258
- "eval_steps_per_second": 0.783,
259
- "step": 22994
260
  },
261
  {
262
  "epoch": 1.0,
263
- "learning_rate": 8.888309027475768e-05,
264
- "loss": 0.0026,
265
- "step": 23000
266
- },
267
- {
268
- "epoch": 1.03,
269
- "learning_rate": 8.832738975384883e-05,
270
- "loss": 0.0019,
271
- "step": 23575
272
- },
273
- {
274
- "epoch": 1.05,
275
- "learning_rate": 8.777168923294e-05,
276
- "loss": 0.002,
277
- "step": 24150
278
- },
279
- {
280
- "epoch": 1.08,
281
- "learning_rate": 8.721598871203116e-05,
282
- "loss": 0.002,
283
- "step": 24725
284
  },
285
  {
286
  "epoch": 1.1,
287
- "learning_rate": 8.666028819112233e-05,
288
- "loss": 0.0021,
289
- "step": 25300
290
- },
291
- {
292
- "epoch": 1.13,
293
- "learning_rate": 8.610458767021349e-05,
294
- "loss": 0.0022,
295
- "step": 25875
296
- },
297
- {
298
- "epoch": 1.15,
299
- "learning_rate": 8.554888714930466e-05,
300
- "loss": 0.0022,
301
- "step": 26450
302
- },
303
- {
304
- "epoch": 1.18,
305
- "learning_rate": 8.499318662839582e-05,
306
- "loss": 0.0023,
307
- "step": 27025
308
  },
309
  {
310
  "epoch": 1.2,
311
- "learning_rate": 8.443748610748699e-05,
312
- "loss": 0.0023,
313
- "step": 27600
314
- },
315
- {
316
- "epoch": 1.23,
317
- "learning_rate": 8.388178558657815e-05,
318
- "loss": 0.002,
319
- "step": 28175
320
- },
321
- {
322
- "epoch": 1.25,
323
- "learning_rate": 8.33260850656693e-05,
324
- "loss": 0.0022,
325
- "step": 28750
326
- },
327
- {
328
- "epoch": 1.28,
329
- "learning_rate": 8.277038454476048e-05,
330
- "loss": 0.0022,
331
- "step": 29325
332
  },
333
  {
334
  "epoch": 1.3,
335
- "learning_rate": 8.221468402385163e-05,
336
- "loss": 0.002,
337
- "step": 29900
338
- },
339
- {
340
- "epoch": 1.33,
341
- "learning_rate": 8.165898350294279e-05,
342
- "loss": 0.0023,
343
- "step": 30475
344
- },
345
- {
346
- "epoch": 1.35,
347
- "learning_rate": 8.110328298203396e-05,
348
- "loss": 0.0026,
349
- "step": 31050
350
- },
351
- {
352
- "epoch": 1.38,
353
- "learning_rate": 8.054758246112512e-05,
354
- "loss": 0.0022,
355
- "step": 31625
356
  },
357
  {
358
  "epoch": 1.4,
359
- "learning_rate": 7.999188194021629e-05,
360
- "loss": 0.0023,
361
- "step": 32200
362
- },
363
- {
364
- "epoch": 1.43,
365
- "learning_rate": 7.943618141930745e-05,
366
- "loss": 0.0023,
367
- "step": 32775
368
- },
369
- {
370
- "epoch": 1.45,
371
- "learning_rate": 7.888048089839862e-05,
372
- "loss": 0.0024,
373
- "step": 33350
374
- },
375
- {
376
- "epoch": 1.48,
377
- "learning_rate": 7.832478037748978e-05,
378
- "loss": 0.0024,
379
- "step": 33925
380
  },
381
  {
382
  "epoch": 1.5,
383
- "learning_rate": 7.776907985658095e-05,
384
- "loss": 0.0025,
385
- "step": 34500
386
- },
387
- {
388
- "epoch": 1.53,
389
- "learning_rate": 7.721337933567211e-05,
390
- "loss": 0.0023,
391
- "step": 35075
392
- },
393
- {
394
- "epoch": 1.55,
395
- "learning_rate": 7.665767881476328e-05,
396
- "loss": 0.0021,
397
- "step": 35650
398
- },
399
- {
400
- "epoch": 1.58,
401
- "learning_rate": 7.610197829385444e-05,
402
- "loss": 0.0022,
403
- "step": 36225
404
  },
405
  {
406
  "epoch": 1.6,
407
- "learning_rate": 7.554627777294561e-05,
408
- "loss": 0.002,
409
- "step": 36800
410
- },
411
- {
412
- "epoch": 1.63,
413
- "learning_rate": 7.499057725203676e-05,
414
- "loss": 0.0021,
415
- "step": 37375
416
  },
417
  {
418
- "epoch": 1.65,
419
- "learning_rate": 7.443487673112794e-05,
420
- "loss": 0.0024,
421
- "step": 37950
422
  },
423
  {
424
- "epoch": 1.68,
425
- "learning_rate": 7.38791762102191e-05,
426
- "loss": 0.0024,
427
- "step": 38525
428
  },
429
  {
430
- "epoch": 1.7,
431
- "learning_rate": 7.332347568931026e-05,
432
- "loss": 0.0022,
433
- "step": 39100
434
  },
435
  {
436
- "epoch": 1.73,
437
- "learning_rate": 7.276777516840142e-05,
438
- "loss": 0.0024,
439
- "step": 39675
 
 
 
 
440
  },
441
  {
442
- "epoch": 1.75,
443
- "learning_rate": 7.221207464749259e-05,
444
- "loss": 0.0023,
445
- "step": 40250
446
  },
447
  {
448
- "epoch": 1.78,
449
- "learning_rate": 7.165637412658375e-05,
450
- "loss": 0.0023,
451
- "step": 40825
452
  },
453
  {
454
- "epoch": 1.8,
455
- "learning_rate": 7.110067360567492e-05,
456
- "loss": 0.0022,
457
- "step": 41400
458
  },
459
  {
460
- "epoch": 1.83,
461
- "learning_rate": 7.054497308476608e-05,
462
- "loss": 0.0023,
463
- "step": 41975
464
  },
465
  {
466
- "epoch": 1.85,
467
- "learning_rate": 6.998927256385725e-05,
468
- "loss": 0.0023,
469
- "step": 42550
470
  },
471
  {
472
- "epoch": 1.88,
473
- "learning_rate": 6.943357204294841e-05,
474
- "loss": 0.0023,
475
- "step": 43125
476
  },
477
  {
478
- "epoch": 1.9,
479
- "learning_rate": 6.887787152203957e-05,
480
- "loss": 0.0024,
481
- "step": 43700
482
  },
483
  {
484
- "epoch": 1.93,
485
- "learning_rate": 6.832217100113074e-05,
486
- "loss": 0.0024,
487
- "step": 44275
488
  },
489
  {
490
- "epoch": 1.95,
491
- "learning_rate": 6.77664704802219e-05,
492
  "loss": 0.0022,
493
- "step": 44850
494
  },
495
  {
496
- "epoch": 1.98,
497
- "learning_rate": 6.721076995931305e-05,
498
- "loss": 0.0022,
499
- "step": 45425
500
  },
501
  {
502
- "epoch": 2.0,
503
- "eval_loss": 0.025934860110282898,
504
- "eval_max_distance": 8,
505
  "eval_mean_distance": 0,
506
- "eval_runtime": 21.6745,
507
- "eval_samples_per_second": 11.903,
508
- "eval_steps_per_second": 0.83,
509
- "step": 45988
510
  },
511
  {
512
- "epoch": 2.0,
513
- "learning_rate": 6.665506943840422e-05,
514
- "loss": 0.0024,
515
- "step": 46000
516
  },
517
  {
518
- "epoch": 2.03,
519
- "learning_rate": 6.609936891749538e-05,
520
- "loss": 0.0019,
521
- "step": 46575
522
  },
523
  {
524
- "epoch": 2.05,
525
- "learning_rate": 6.554366839658655e-05,
526
- "loss": 0.0021,
527
- "step": 47150
528
  },
529
  {
530
- "epoch": 2.08,
531
- "learning_rate": 6.498796787567771e-05,
532
  "loss": 0.0019,
533
- "step": 47725
534
  },
535
  {
536
- "epoch": 2.1,
537
- "learning_rate": 6.443226735476888e-05,
538
- "loss": 0.0021,
539
- "step": 48300
540
  },
541
  {
542
- "epoch": 2.13,
543
- "learning_rate": 6.387656683386004e-05,
544
  "loss": 0.0018,
545
- "step": 48875
546
  },
547
  {
548
- "epoch": 2.15,
549
- "learning_rate": 6.332086631295121e-05,
550
- "loss": 0.002,
551
- "step": 49450
552
- },
553
- {
554
- "epoch": 2.18,
555
- "learning_rate": 6.276516579204237e-05,
556
- "loss": 0.0021,
557
- "step": 50025
558
- },
559
- {
560
- "epoch": 2.2,
561
- "learning_rate": 6.220946527113354e-05,
562
  "loss": 0.002,
563
- "step": 50600
564
  },
565
  {
566
- "epoch": 2.23,
567
- "learning_rate": 6.16537647502247e-05,
568
  "loss": 0.0019,
569
- "step": 51175
570
  },
571
  {
572
- "epoch": 2.25,
573
- "learning_rate": 6.109806422931587e-05,
574
  "loss": 0.002,
575
- "step": 51750
576
- },
577
- {
578
- "epoch": 2.28,
579
- "learning_rate": 6.0542363708407024e-05,
580
- "loss": 0.0021,
581
- "step": 52325
582
  },
583
  {
584
- "epoch": 2.3,
585
- "learning_rate": 5.9986663187498195e-05,
586
  "loss": 0.0021,
587
- "step": 52900
588
  },
589
  {
590
- "epoch": 2.33,
591
- "learning_rate": 5.943096266658935e-05,
592
- "loss": 0.0019,
593
- "step": 53475
594
- },
595
- {
596
- "epoch": 2.35,
597
- "learning_rate": 5.8875262145680524e-05,
598
- "loss": 0.0018,
599
- "step": 54050
600
  },
601
  {
602
- "epoch": 2.38,
603
- "learning_rate": 5.831956162477168e-05,
604
  "loss": 0.0021,
605
- "step": 54625
606
  },
607
  {
608
- "epoch": 2.4,
609
- "learning_rate": 5.7763861103862846e-05,
610
- "loss": 0.0021,
611
- "step": 55200
612
  },
613
  {
614
- "epoch": 2.43,
615
- "learning_rate": 5.720816058295401e-05,
616
- "loss": 0.0019,
617
- "step": 55775
618
  },
619
  {
620
- "epoch": 2.45,
621
- "learning_rate": 5.6652460062045174e-05,
622
- "loss": 0.002,
623
- "step": 56350
624
  },
625
  {
626
- "epoch": 2.48,
627
- "learning_rate": 5.609675954113633e-05,
628
- "loss": 0.0022,
629
- "step": 56925
630
  },
631
  {
632
- "epoch": 2.5,
633
- "learning_rate": 5.55410590202275e-05,
634
- "loss": 0.0018,
635
- "step": 57500
636
  },
637
  {
638
- "epoch": 2.53,
639
- "learning_rate": 5.498535849931866e-05,
640
- "loss": 0.0023,
641
- "step": 58075
642
  },
643
  {
644
- "epoch": 2.55,
645
- "learning_rate": 5.442965797840983e-05,
646
- "loss": 0.0021,
647
- "step": 58650
648
  },
649
  {
650
- "epoch": 2.58,
651
- "learning_rate": 5.387395745750099e-05,
652
  "loss": 0.002,
653
- "step": 59225
654
  },
655
  {
656
- "epoch": 2.6,
657
- "learning_rate": 5.331825693659216e-05,
658
  "loss": 0.0019,
659
- "step": 59800
660
  },
661
  {
662
- "epoch": 2.63,
663
- "learning_rate": 5.276255641568332e-05,
664
- "loss": 0.0021,
665
- "step": 60375
 
 
 
 
666
  },
667
  {
668
- "epoch": 2.65,
669
- "learning_rate": 5.220685589477449e-05,
670
- "loss": 0.0022,
671
- "step": 60950
672
  },
673
  {
674
- "epoch": 2.68,
675
- "learning_rate": 5.1651155373865647e-05,
676
  "loss": 0.0017,
677
- "step": 61525
678
  },
679
  {
680
- "epoch": 2.7,
681
- "learning_rate": 5.109545485295682e-05,
682
- "loss": 0.0019,
683
- "step": 62100
684
  },
685
  {
686
- "epoch": 2.73,
687
- "learning_rate": 5.0539754332047975e-05,
688
- "loss": 0.002,
689
- "step": 62675
690
  },
691
  {
692
- "epoch": 2.75,
693
- "learning_rate": 4.998405381113914e-05,
694
- "loss": 0.0019,
695
- "step": 63250
696
  },
697
  {
698
- "epoch": 2.78,
699
- "learning_rate": 4.9428353290230304e-05,
700
- "loss": 0.0022,
701
- "step": 63825
702
  },
703
  {
704
- "epoch": 2.8,
705
- "learning_rate": 4.887265276932147e-05,
706
- "loss": 0.0021,
707
- "step": 64400
708
  },
709
  {
710
- "epoch": 2.83,
711
- "learning_rate": 4.831695224841263e-05,
712
- "loss": 0.0023,
713
- "step": 64975
714
  },
715
  {
716
- "epoch": 2.85,
717
- "learning_rate": 4.77612517275038e-05,
718
- "loss": 0.002,
719
- "step": 65550
720
  },
721
  {
722
- "epoch": 2.88,
723
- "learning_rate": 4.720555120659496e-05,
724
- "loss": 0.002,
725
- "step": 66125
726
  },
727
  {
728
- "epoch": 2.9,
729
- "learning_rate": 4.6649850685686126e-05,
730
- "loss": 0.0021,
731
- "step": 66700
 
 
 
 
732
  },
733
  {
734
- "epoch": 2.93,
735
- "learning_rate": 4.609415016477729e-05,
736
- "loss": 0.0021,
737
- "step": 67275
738
  },
739
  {
740
- "epoch": 2.95,
741
- "learning_rate": 4.5538449643868454e-05,
742
- "loss": 0.002,
743
- "step": 67850
744
  },
745
  {
746
- "epoch": 2.98,
747
- "learning_rate": 4.498274912295962e-05,
748
- "loss": 0.0019,
749
- "step": 68425
750
  },
751
  {
752
- "epoch": 3.0,
753
- "eval_loss": 0.029171258211135864,
754
- "eval_max_distance": 8,
755
- "eval_mean_distance": 0,
756
- "eval_runtime": 20.5415,
757
- "eval_samples_per_second": 12.56,
758
- "eval_steps_per_second": 0.876,
759
- "step": 68982
760
  },
761
  {
762
- "epoch": 3.0,
763
- "learning_rate": 4.442704860205078e-05,
764
- "loss": 0.0021,
765
- "step": 69000
766
  },
767
  {
768
- "epoch": 3.03,
769
- "learning_rate": 4.387134808114195e-05,
770
  "loss": 0.0017,
771
- "step": 69575
772
  },
773
  {
774
- "epoch": 3.05,
775
- "learning_rate": 4.3315647560233105e-05,
776
- "loss": 0.002,
777
- "step": 70150
778
  },
779
  {
780
- "epoch": 3.08,
781
- "learning_rate": 4.275994703932427e-05,
782
  "loss": 0.0016,
783
- "step": 70725
784
  },
785
  {
786
- "epoch": 3.1,
787
- "learning_rate": 4.2204246518415434e-05,
788
- "loss": 0.0018,
789
- "step": 71300
790
  },
791
  {
792
- "epoch": 3.13,
793
- "learning_rate": 4.16485459975066e-05,
794
- "loss": 0.0018,
795
- "step": 71875
796
  },
797
  {
798
- "epoch": 3.15,
799
- "learning_rate": 4.109284547659776e-05,
800
- "loss": 0.0018,
801
- "step": 72450
 
 
 
 
802
  },
803
  {
804
- "epoch": 3.18,
805
- "learning_rate": 4.0537144955688927e-05,
806
- "loss": 0.0018,
807
- "step": 73025
808
  },
809
  {
810
- "epoch": 3.2,
811
- "learning_rate": 3.998144443478009e-05,
812
- "loss": 0.002,
813
- "step": 73600
814
  },
815
  {
816
- "epoch": 3.23,
817
- "learning_rate": 3.9425743913871255e-05,
818
- "loss": 0.002,
819
- "step": 74175
820
  },
821
  {
822
- "epoch": 3.25,
823
- "learning_rate": 3.887004339296242e-05,
824
- "loss": 0.0019,
825
- "step": 74750
826
  },
827
  {
828
- "epoch": 3.28,
829
- "learning_rate": 3.8314342872053584e-05,
830
- "loss": 0.0018,
831
- "step": 75325
832
  },
833
  {
834
- "epoch": 3.3,
835
- "learning_rate": 3.775864235114475e-05,
836
- "loss": 0.0018,
837
- "step": 75900
838
  },
839
  {
840
- "epoch": 3.33,
841
- "learning_rate": 3.720294183023591e-05,
842
- "loss": 0.0017,
843
- "step": 76475
844
  },
845
  {
846
- "epoch": 3.35,
847
- "learning_rate": 3.664724130932708e-05,
848
- "loss": 0.0017,
849
- "step": 77050
850
  },
851
  {
852
- "epoch": 3.38,
853
- "learning_rate": 3.6091540788418234e-05,
854
- "loss": 0.0018,
855
- "step": 77625
856
  },
857
  {
858
- "epoch": 3.4,
859
- "learning_rate": 3.55358402675094e-05,
860
- "loss": 0.0018,
861
- "step": 78200
862
  },
863
  {
864
- "epoch": 3.43,
865
- "learning_rate": 3.498013974660056e-05,
866
- "loss": 0.0016,
867
- "step": 78775
 
 
 
 
868
  },
869
  {
870
- "epoch": 3.45,
871
- "learning_rate": 3.442443922569173e-05,
872
  "loss": 0.0016,
873
- "step": 79350
874
  },
875
  {
876
- "epoch": 3.48,
877
- "learning_rate": 3.386873870478289e-05,
878
- "loss": 0.0018,
879
- "step": 79925
880
  },
881
  {
882
- "epoch": 3.5,
883
- "learning_rate": 3.3313038183874056e-05,
884
- "loss": 0.0017,
885
- "step": 80500
886
  },
887
  {
888
- "epoch": 3.53,
889
- "learning_rate": 3.275733766296522e-05,
890
- "loss": 0.0017,
891
- "step": 81075
892
  },
893
  {
894
- "epoch": 3.55,
895
- "learning_rate": 3.2201637142056385e-05,
896
- "loss": 0.0016,
897
- "step": 81650
898
  },
899
  {
900
- "epoch": 3.58,
901
- "learning_rate": 3.164593662114755e-05,
902
- "loss": 0.002,
903
- "step": 82225
904
  },
905
  {
906
- "epoch": 3.6,
907
- "learning_rate": 3.1090236100238714e-05,
908
- "loss": 0.0018,
909
- "step": 82800
910
  },
911
  {
912
- "epoch": 3.63,
913
- "learning_rate": 3.053453557932988e-05,
914
- "loss": 0.0016,
915
- "step": 83375
916
  },
917
  {
918
- "epoch": 3.65,
919
- "learning_rate": 2.997883505842104e-05,
920
- "loss": 0.0017,
921
- "step": 83950
922
  },
923
  {
924
- "epoch": 3.68,
925
- "learning_rate": 2.9423134537512203e-05,
926
- "loss": 0.0019,
927
- "step": 84525
928
  },
929
  {
930
- "epoch": 3.7,
931
- "learning_rate": 2.8867434016603368e-05,
932
- "loss": 0.0018,
933
- "step": 85100
 
 
 
 
934
  },
935
  {
936
- "epoch": 3.73,
937
- "learning_rate": 2.8311733495694532e-05,
938
- "loss": 0.0017,
939
- "step": 85675
940
  },
941
  {
942
- "epoch": 3.75,
943
- "learning_rate": 2.7756032974785696e-05,
944
- "loss": 0.0017,
945
- "step": 86250
946
  },
947
  {
948
- "epoch": 3.78,
949
- "learning_rate": 2.720033245387686e-05,
950
- "loss": 0.0019,
951
- "step": 86825
952
  },
953
  {
954
- "epoch": 3.8,
955
- "learning_rate": 2.664463193296802e-05,
956
- "loss": 0.002,
957
- "step": 87400
958
  },
959
  {
960
- "epoch": 3.83,
961
- "learning_rate": 2.6088931412059186e-05,
962
- "loss": 0.0019,
963
- "step": 87975
964
  },
965
  {
966
- "epoch": 3.85,
967
- "learning_rate": 2.553323089115035e-05,
968
- "loss": 0.0017,
969
- "step": 88550
970
  },
971
  {
972
- "epoch": 3.88,
973
- "learning_rate": 2.4977530370241514e-05,
974
- "loss": 0.0018,
975
- "step": 89125
976
  },
977
  {
978
- "epoch": 3.9,
979
- "learning_rate": 2.442182984933268e-05,
980
- "loss": 0.0018,
981
- "step": 89700
982
  },
983
  {
984
- "epoch": 3.93,
985
- "learning_rate": 2.3866129328423843e-05,
986
- "loss": 0.0018,
987
- "step": 90275
988
  },
989
  {
990
- "epoch": 3.95,
991
- "learning_rate": 2.3310428807515004e-05,
992
- "loss": 0.0019,
993
- "step": 90850
994
  },
995
  {
996
- "epoch": 3.98,
997
- "learning_rate": 2.275472828660617e-05,
998
- "loss": 0.0016,
999
- "step": 91425
 
 
 
 
1000
  },
1001
  {
1002
- "epoch": 4.0,
1003
- "eval_loss": 0.02807791158556938,
1004
- "eval_max_distance": 8,
1005
- "eval_mean_distance": 0,
1006
- "eval_runtime": 20.9058,
1007
- "eval_samples_per_second": 12.341,
1008
- "eval_steps_per_second": 0.861,
1009
- "step": 91976
1010
  },
1011
  {
1012
- "epoch": 4.0,
1013
- "learning_rate": 2.2199027765697333e-05,
1014
- "loss": 0.0019,
1015
- "step": 92000
1016
  },
1017
  {
1018
- "epoch": 4.03,
1019
- "learning_rate": 2.1643327244788497e-05,
1020
- "loss": 0.0015,
1021
- "step": 92575
1022
  },
1023
  {
1024
- "epoch": 4.05,
1025
- "learning_rate": 2.108762672387966e-05,
1026
- "loss": 0.0016,
1027
- "step": 93150
1028
  },
1029
  {
1030
- "epoch": 4.08,
1031
- "learning_rate": 2.0531926202970826e-05,
1032
- "loss": 0.0015,
1033
- "step": 93725
1034
  },
1035
  {
1036
- "epoch": 4.1,
1037
- "learning_rate": 1.997622568206199e-05,
1038
- "loss": 0.0017,
1039
- "step": 94300
1040
  },
1041
  {
1042
- "epoch": 4.13,
1043
- "learning_rate": 1.942052516115315e-05,
1044
- "loss": 0.0016,
1045
- "step": 94875
1046
  },
1047
  {
1048
- "epoch": 4.15,
1049
- "learning_rate": 1.8864824640244315e-05,
1050
- "loss": 0.0017,
1051
- "step": 95450
1052
  },
1053
  {
1054
- "epoch": 4.18,
1055
- "learning_rate": 1.830912411933548e-05,
1056
- "loss": 0.0017,
1057
- "step": 96025
1058
  },
1059
  {
1060
- "epoch": 4.2,
1061
- "learning_rate": 1.7753423598426644e-05,
1062
- "loss": 0.0015,
1063
- "step": 96600
1064
  },
1065
  {
1066
- "epoch": 4.23,
1067
- "learning_rate": 1.719772307751781e-05,
1068
- "loss": 0.0018,
1069
- "step": 97175
 
 
 
 
1070
  },
1071
  {
1072
- "epoch": 4.25,
1073
- "learning_rate": 1.6642022556608973e-05,
1074
- "loss": 0.0018,
1075
- "step": 97750
1076
  },
1077
  {
1078
- "epoch": 4.28,
1079
- "learning_rate": 1.6086322035700134e-05,
1080
- "loss": 0.0014,
1081
- "step": 98325
1082
  },
1083
  {
1084
- "epoch": 4.3,
1085
- "learning_rate": 1.5530621514791298e-05,
1086
- "loss": 0.0016,
1087
- "step": 98900
1088
  },
1089
  {
1090
- "epoch": 4.33,
1091
- "learning_rate": 1.4974920993882462e-05,
1092
- "loss": 0.0018,
1093
- "step": 99475
1094
  },
1095
  {
1096
- "epoch": 4.35,
1097
- "learning_rate": 1.4419220472973627e-05,
1098
- "loss": 0.0015,
1099
- "step": 100050
1100
  },
1101
  {
1102
- "epoch": 4.38,
1103
- "learning_rate": 1.3863519952064791e-05,
1104
- "loss": 0.0019,
1105
- "step": 100625
1106
  },
1107
  {
1108
- "epoch": 4.4,
1109
- "learning_rate": 1.3307819431155954e-05,
1110
- "loss": 0.0016,
1111
- "step": 101200
1112
  },
1113
  {
1114
- "epoch": 4.43,
1115
- "learning_rate": 1.2752118910247118e-05,
1116
- "loss": 0.0015,
1117
- "step": 101775
1118
  },
1119
  {
1120
- "epoch": 4.45,
1121
- "learning_rate": 1.2196418389338282e-05,
1122
- "loss": 0.0017,
1123
- "step": 102350
1124
  },
1125
  {
1126
- "epoch": 4.48,
1127
- "learning_rate": 1.1640717868429447e-05,
1128
- "loss": 0.0016,
1129
- "step": 102925
1130
  },
1131
  {
1132
- "epoch": 4.5,
1133
- "learning_rate": 1.108501734752061e-05,
1134
- "loss": 0.0015,
1135
- "step": 103500
 
 
 
 
1136
  },
1137
  {
1138
- "epoch": 4.53,
1139
- "learning_rate": 1.0529316826611774e-05,
1140
- "loss": 0.0017,
1141
- "step": 104075
1142
  },
1143
  {
1144
- "epoch": 4.55,
1145
- "learning_rate": 9.973616305702938e-06,
1146
- "loss": 0.0016,
1147
- "step": 104650
1148
  },
1149
  {
1150
- "epoch": 4.58,
1151
- "learning_rate": 9.4179157847941e-06,
1152
- "loss": 0.0016,
1153
- "step": 105225
1154
  },
1155
  {
1156
- "epoch": 4.6,
1157
- "learning_rate": 8.862215263885265e-06,
1158
- "loss": 0.0017,
1159
- "step": 105800
1160
  },
1161
  {
1162
- "epoch": 4.63,
1163
- "learning_rate": 8.30651474297643e-06,
1164
- "loss": 0.0015,
1165
- "step": 106375
1166
  },
1167
  {
1168
- "epoch": 4.65,
1169
- "learning_rate": 7.750814222067592e-06,
1170
- "loss": 0.0014,
1171
- "step": 106950
1172
  },
1173
  {
1174
- "epoch": 4.68,
1175
- "learning_rate": 7.195113701158756e-06,
1176
- "loss": 0.0017,
1177
- "step": 107525
1178
  },
1179
  {
1180
- "epoch": 4.7,
1181
- "learning_rate": 6.639413180249921e-06,
1182
- "loss": 0.0017,
1183
- "step": 108100
1184
  },
1185
  {
1186
- "epoch": 4.73,
1187
- "learning_rate": 6.083712659341084e-06,
1188
- "loss": 0.0016,
1189
- "step": 108675
1190
  },
1191
  {
1192
- "epoch": 4.75,
1193
- "learning_rate": 5.528012138432248e-06,
1194
- "loss": 0.0017,
1195
- "step": 109250
1196
  },
1197
  {
1198
- "epoch": 4.78,
1199
- "learning_rate": 4.972311617523412e-06,
1200
- "loss": 0.0018,
1201
- "step": 109825
 
 
 
 
1202
  },
1203
  {
1204
- "epoch": 4.8,
1205
- "learning_rate": 4.4166110966145756e-06,
1206
- "loss": 0.0015,
1207
- "step": 110400
1208
  },
1209
  {
1210
- "epoch": 4.83,
1211
- "learning_rate": 3.86091057570574e-06,
1212
- "loss": 0.0017,
1213
- "step": 110975
1214
  },
1215
  {
1216
- "epoch": 4.85,
1217
- "learning_rate": 3.3052100547969034e-06,
1218
- "loss": 0.0014,
1219
- "step": 111550
1220
  },
1221
  {
1222
- "epoch": 4.88,
1223
- "learning_rate": 2.7495095338880677e-06,
1224
- "loss": 0.0018,
1225
- "step": 112125
1226
  },
1227
  {
1228
- "epoch": 4.9,
1229
- "learning_rate": 2.1938090129792312e-06,
1230
- "loss": 0.0017,
1231
- "step": 112700
1232
  },
1233
  {
1234
- "epoch": 4.93,
1235
- "learning_rate": 1.6381084920703951e-06,
1236
- "loss": 0.0018,
1237
- "step": 113275
1238
  },
1239
  {
1240
- "epoch": 4.95,
1241
- "learning_rate": 1.082407971161559e-06,
1242
- "loss": 0.0015,
1243
- "step": 113850
1244
  },
1245
  {
1246
- "epoch": 4.98,
1247
- "learning_rate": 5.26707450252723e-07,
1248
- "loss": 0.0016,
1249
- "step": 114425
1250
  },
1251
  {
1252
- "epoch": 5.0,
1253
- "eval_loss": 0.027882983908057213,
1254
- "eval_max_distance": 8,
1255
- "eval_mean_distance": 0,
1256
- "eval_runtime": 20.563,
1257
- "eval_samples_per_second": 12.547,
1258
- "eval_steps_per_second": 0.875,
1259
- "step": 114970
1260
  },
1261
  {
1262
- "epoch": 5.0,
1263
- "step": 114970,
1264
- "total_flos": 2.9139726999711744e+16,
1265
- "train_loss": 0.0019824859863435676,
1266
- "train_runtime": 8494.2801,
1267
- "train_samples_per_second": 203.021,
1268
- "train_steps_per_second": 13.535
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1269
  }
1270
  ],
1271
- "logging_steps": 575,
1272
- "max_steps": 114970,
1273
- "num_train_epochs": 5,
1274
- "save_steps": 1150,
1275
- "total_flos": 2.9139726999711744e+16,
1276
  "trial_name": null,
1277
  "trial_params": null
1278
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 14.0,
5
  "eval_steps": 500,
6
+ "global_step": 323078,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 2.1666594444685182e-09,
14
+ "loss": 0.0003,
15
  "step": 1
16
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  {
18
  "epoch": 0.1,
19
+ "learning_rate": 5.000649997833341e-06,
20
+ "loss": 0.0017,
21
+ "step": 2308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  },
23
  {
24
  "epoch": 0.2,
25
+ "learning_rate": 1.0001299995666681e-05,
26
+ "loss": 0.0017,
27
+ "step": 4616
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  },
29
  {
30
  "epoch": 0.3,
31
+ "learning_rate": 1.5001949993500023e-05,
32
+ "loss": 0.0016,
33
+ "step": 6924
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  },
35
  {
36
  "epoch": 0.4,
37
+ "learning_rate": 2.0002599991333363e-05,
38
+ "loss": 0.0016,
39
+ "step": 9232
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  },
41
  {
42
  "epoch": 0.5,
43
+ "learning_rate": 2.5003249989166704e-05,
44
+ "loss": 0.0016,
45
+ "step": 11540
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  },
47
  {
48
  "epoch": 0.6,
49
+ "learning_rate": 3.0003899987000046e-05,
50
+ "loss": 0.0015,
51
+ "step": 13848
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  },
53
  {
54
  "epoch": 0.7,
55
+ "learning_rate": 3.5004549984833384e-05,
56
+ "loss": 0.0016,
57
+ "step": 16156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  },
59
  {
60
  "epoch": 0.8,
61
+ "learning_rate": 4.0005199982666725e-05,
62
+ "loss": 0.0017,
63
+ "step": 18464
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  },
65
  {
66
  "epoch": 0.9,
67
+ "learning_rate": 4.500584998050007e-05,
68
+ "loss": 0.0017,
69
+ "step": 20772
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  },
71
  {
72
  "epoch": 1.0,
73
+ "eval_loss": 0.00893497932702303,
74
+ "eval_max_distance": 1,
75
  "eval_mean_distance": 0,
76
+ "eval_runtime": 18.8658,
77
+ "eval_samples_per_second": 13.676,
78
+ "eval_steps_per_second": 0.954,
79
+ "step": 23077
80
  },
81
  {
82
  "epoch": 1.0,
83
+ "learning_rate": 5.000649997833341e-05,
84
+ "loss": 0.0018,
85
+ "step": 23080
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  },
87
  {
88
  "epoch": 1.1,
89
+ "learning_rate": 5.500714997616675e-05,
90
+ "loss": 0.0017,
91
+ "step": 25388
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  },
93
  {
94
  "epoch": 1.2,
95
+ "learning_rate": 6.000779997400009e-05,
96
+ "loss": 0.0017,
97
+ "step": 27696
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  },
99
  {
100
  "epoch": 1.3,
101
+ "learning_rate": 6.500844997183343e-05,
102
+ "loss": 0.0017,
103
+ "step": 30004
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  },
105
  {
106
  "epoch": 1.4,
107
+ "learning_rate": 7.000909996966677e-05,
108
+ "loss": 0.0017,
109
+ "step": 32312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  },
111
  {
112
  "epoch": 1.5,
113
+ "learning_rate": 7.50097499675001e-05,
114
+ "loss": 0.0017,
115
+ "step": 34620
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  },
117
  {
118
  "epoch": 1.6,
119
+ "learning_rate": 8.001039996533345e-05,
120
+ "loss": 0.0018,
121
+ "step": 36928
 
 
 
 
 
 
122
  },
123
  {
124
+ "epoch": 1.7,
125
+ "learning_rate": 8.501104996316679e-05,
126
+ "loss": 0.0017,
127
+ "step": 39236
128
  },
129
  {
130
+ "epoch": 1.8,
131
+ "learning_rate": 9.001169996100013e-05,
132
+ "loss": 0.0018,
133
+ "step": 41544
134
  },
135
  {
136
+ "epoch": 1.9,
137
+ "learning_rate": 9.501234995883348e-05,
138
+ "loss": 0.0019,
139
+ "step": 43852
140
  },
141
  {
142
+ "epoch": 2.0,
143
+ "eval_loss": 0.008421082980930805,
144
+ "eval_max_distance": 3,
145
+ "eval_mean_distance": 0,
146
+ "eval_runtime": 18.622,
147
+ "eval_samples_per_second": 13.855,
148
+ "eval_steps_per_second": 0.967,
149
+ "step": 46154
150
  },
151
  {
152
+ "epoch": 2.0,
153
+ "learning_rate": 9.999855556037036e-05,
154
+ "loss": 0.0018,
155
+ "step": 46160
156
  },
157
  {
158
+ "epoch": 2.1,
159
+ "learning_rate": 9.944292778283332e-05,
160
+ "loss": 0.0019,
161
+ "step": 48468
162
  },
163
  {
164
+ "epoch": 2.2,
165
+ "learning_rate": 9.888730000529628e-05,
166
+ "loss": 0.0018,
167
+ "step": 50776
168
  },
169
  {
170
+ "epoch": 2.3,
171
+ "learning_rate": 9.833167222775925e-05,
172
+ "loss": 0.0019,
173
+ "step": 53084
174
  },
175
  {
176
+ "epoch": 2.4,
177
+ "learning_rate": 9.777604445022221e-05,
178
+ "loss": 0.0021,
179
+ "step": 55392
180
  },
181
  {
182
+ "epoch": 2.5,
183
+ "learning_rate": 9.722041667268516e-05,
184
+ "loss": 0.0021,
185
+ "step": 57700
186
  },
187
  {
188
+ "epoch": 2.6,
189
+ "learning_rate": 9.666478889514812e-05,
190
+ "loss": 0.0021,
191
+ "step": 60008
192
  },
193
  {
194
+ "epoch": 2.7,
195
+ "learning_rate": 9.610916111761109e-05,
196
+ "loss": 0.002,
197
+ "step": 62316
198
  },
199
  {
200
+ "epoch": 2.8,
201
+ "learning_rate": 9.555353334007405e-05,
202
  "loss": 0.0022,
203
+ "step": 64624
204
  },
205
  {
206
+ "epoch": 2.9,
207
+ "learning_rate": 9.499790556253701e-05,
208
+ "loss": 0.002,
209
+ "step": 66932
210
  },
211
  {
212
+ "epoch": 3.0,
213
+ "eval_loss": 0.008709550835192204,
214
+ "eval_max_distance": 1,
215
  "eval_mean_distance": 0,
216
+ "eval_runtime": 17.0366,
217
+ "eval_samples_per_second": 15.144,
218
+ "eval_steps_per_second": 1.057,
219
+ "step": 69231
220
  },
221
  {
222
+ "epoch": 3.0,
223
+ "learning_rate": 9.444227778499997e-05,
224
+ "loss": 0.0022,
225
+ "step": 69240
226
  },
227
  {
228
+ "epoch": 3.1,
229
+ "learning_rate": 9.388665000746294e-05,
230
+ "loss": 0.0018,
231
+ "step": 71548
232
  },
233
  {
234
+ "epoch": 3.2,
235
+ "learning_rate": 9.33310222299259e-05,
236
+ "loss": 0.0019,
237
+ "step": 73856
238
  },
239
  {
240
+ "epoch": 3.3,
241
+ "learning_rate": 9.277539445238886e-05,
242
  "loss": 0.0019,
243
+ "step": 76164
244
  },
245
  {
246
+ "epoch": 3.4,
247
+ "learning_rate": 9.221976667485182e-05,
248
+ "loss": 0.0019,
249
+ "step": 78472
250
  },
251
  {
252
+ "epoch": 3.5,
253
+ "learning_rate": 9.166413889731479e-05,
254
  "loss": 0.0018,
255
+ "step": 80780
256
  },
257
  {
258
+ "epoch": 3.6,
259
+ "learning_rate": 9.110851111977775e-05,
 
 
 
 
 
 
 
 
 
 
 
 
260
  "loss": 0.002,
261
+ "step": 83088
262
  },
263
  {
264
+ "epoch": 3.7,
265
+ "learning_rate": 9.055288334224071e-05,
266
  "loss": 0.0019,
267
+ "step": 85396
268
  },
269
  {
270
+ "epoch": 3.8,
271
+ "learning_rate": 8.999725556470368e-05,
272
  "loss": 0.002,
273
+ "step": 87704
 
 
 
 
 
 
274
  },
275
  {
276
+ "epoch": 3.9,
277
+ "learning_rate": 8.944162778716664e-05,
278
  "loss": 0.0021,
279
+ "step": 90012
280
  },
281
  {
282
+ "epoch": 4.0,
283
+ "eval_loss": 0.011967115104198456,
284
+ "eval_max_distance": 4,
285
+ "eval_mean_distance": 0,
286
+ "eval_runtime": 18.1135,
287
+ "eval_samples_per_second": 14.243,
288
+ "eval_steps_per_second": 0.994,
289
+ "step": 92308
 
 
290
  },
291
  {
292
+ "epoch": 4.0,
293
+ "learning_rate": 8.88860000096296e-05,
294
  "loss": 0.0021,
295
+ "step": 92320
296
  },
297
  {
298
+ "epoch": 4.1,
299
+ "learning_rate": 8.833037223209256e-05,
300
+ "loss": 0.0018,
301
+ "step": 94628
302
  },
303
  {
304
+ "epoch": 4.2,
305
+ "learning_rate": 8.777474445455553e-05,
306
+ "loss": 0.0018,
307
+ "step": 96936
308
  },
309
  {
310
+ "epoch": 4.3,
311
+ "learning_rate": 8.721911667701849e-05,
312
+ "loss": 0.0018,
313
+ "step": 99244
314
  },
315
  {
316
+ "epoch": 4.4,
317
+ "learning_rate": 8.666348889948145e-05,
318
+ "loss": 0.0018,
319
+ "step": 101552
320
  },
321
  {
322
+ "epoch": 4.5,
323
+ "learning_rate": 8.610786112194442e-05,
324
+ "loss": 0.0017,
325
+ "step": 103860
326
  },
327
  {
328
+ "epoch": 4.6,
329
+ "learning_rate": 8.555223334440738e-05,
330
+ "loss": 0.0019,
331
+ "step": 106168
332
  },
333
  {
334
+ "epoch": 4.7,
335
+ "learning_rate": 8.499660556687034e-05,
336
+ "loss": 0.0019,
337
+ "step": 108476
338
  },
339
  {
340
+ "epoch": 4.8,
341
+ "learning_rate": 8.44409777893333e-05,
342
  "loss": 0.002,
343
+ "step": 110784
344
  },
345
  {
346
+ "epoch": 4.9,
347
+ "learning_rate": 8.388535001179625e-05,
348
  "loss": 0.0019,
349
+ "step": 113092
350
  },
351
  {
352
+ "epoch": 5.0,
353
+ "eval_loss": 0.010002830997109413,
354
+ "eval_max_distance": 4,
355
+ "eval_mean_distance": 0,
356
+ "eval_runtime": 17.3823,
357
+ "eval_samples_per_second": 14.843,
358
+ "eval_steps_per_second": 1.036,
359
+ "step": 115385
360
  },
361
  {
362
+ "epoch": 5.0,
363
+ "learning_rate": 8.332972223425922e-05,
364
+ "loss": 0.0019,
365
+ "step": 115400
366
  },
367
  {
368
+ "epoch": 5.1,
369
+ "learning_rate": 8.277409445672218e-05,
370
  "loss": 0.0017,
371
+ "step": 117708
372
  },
373
  {
374
+ "epoch": 5.2,
375
+ "learning_rate": 8.221846667918514e-05,
376
+ "loss": 0.0017,
377
+ "step": 120016
378
  },
379
  {
380
+ "epoch": 5.3,
381
+ "learning_rate": 8.16628389016481e-05,
382
+ "loss": 0.0017,
383
+ "step": 122324
384
  },
385
  {
386
+ "epoch": 5.4,
387
+ "learning_rate": 8.110721112411107e-05,
388
+ "loss": 0.0017,
389
+ "step": 124632
390
  },
391
  {
392
+ "epoch": 5.5,
393
+ "learning_rate": 8.055158334657403e-05,
394
+ "loss": 0.0018,
395
+ "step": 126940
396
  },
397
  {
398
+ "epoch": 5.6,
399
+ "learning_rate": 7.999595556903699e-05,
400
+ "loss": 0.0017,
401
+ "step": 129248
402
  },
403
  {
404
+ "epoch": 5.7,
405
+ "learning_rate": 7.944032779149996e-05,
406
+ "loss": 0.0017,
407
+ "step": 131556
408
  },
409
  {
410
+ "epoch": 5.8,
411
+ "learning_rate": 7.888470001396292e-05,
412
+ "loss": 0.0017,
413
+ "step": 133864
414
  },
415
  {
416
+ "epoch": 5.9,
417
+ "learning_rate": 7.832907223642588e-05,
418
+ "loss": 0.0018,
419
+ "step": 136172
420
  },
421
  {
422
+ "epoch": 6.0,
423
+ "eval_loss": 0.011129369959235191,
424
+ "eval_max_distance": 3,
425
+ "eval_mean_distance": 0,
426
+ "eval_runtime": 17.0083,
427
+ "eval_samples_per_second": 15.169,
428
+ "eval_steps_per_second": 1.058,
429
+ "step": 138462
430
  },
431
  {
432
+ "epoch": 6.0,
433
+ "learning_rate": 7.777344445888884e-05,
434
+ "loss": 0.0018,
435
+ "step": 138480
436
  },
437
  {
438
+ "epoch": 6.1,
439
+ "learning_rate": 7.721781668135181e-05,
440
+ "loss": 0.0016,
441
+ "step": 140788
442
  },
443
  {
444
+ "epoch": 6.2,
445
+ "learning_rate": 7.666218890381477e-05,
446
+ "loss": 0.0016,
447
+ "step": 143096
448
  },
449
  {
450
+ "epoch": 6.3,
451
+ "learning_rate": 7.610656112627773e-05,
452
+ "loss": 0.0017,
453
+ "step": 145404
 
 
 
 
454
  },
455
  {
456
+ "epoch": 6.4,
457
+ "learning_rate": 7.55509333487407e-05,
458
+ "loss": 0.0017,
459
+ "step": 147712
460
  },
461
  {
462
+ "epoch": 6.5,
463
+ "learning_rate": 7.499530557120366e-05,
464
  "loss": 0.0017,
465
+ "step": 150020
466
  },
467
  {
468
+ "epoch": 6.6,
469
+ "learning_rate": 7.443967779366662e-05,
470
+ "loss": 0.0016,
471
+ "step": 152328
472
  },
473
  {
474
+ "epoch": 6.7,
475
+ "learning_rate": 7.388405001612958e-05,
476
  "loss": 0.0016,
477
+ "step": 154636
478
  },
479
  {
480
+ "epoch": 6.8,
481
+ "learning_rate": 7.332842223859255e-05,
482
+ "loss": 0.0017,
483
+ "step": 156944
484
  },
485
  {
486
+ "epoch": 6.9,
487
+ "learning_rate": 7.277279446105551e-05,
488
+ "loss": 0.0017,
489
+ "step": 159252
490
  },
491
  {
492
+ "epoch": 7.0,
493
+ "eval_loss": 0.007010257337242365,
494
+ "eval_max_distance": 3,
495
+ "eval_mean_distance": 0,
496
+ "eval_runtime": 16.7205,
497
+ "eval_samples_per_second": 15.43,
498
+ "eval_steps_per_second": 1.077,
499
+ "step": 161539
500
  },
501
  {
502
+ "epoch": 7.0,
503
+ "learning_rate": 7.221716668351847e-05,
504
+ "loss": 0.0017,
505
+ "step": 161560
506
  },
507
  {
508
+ "epoch": 7.1,
509
+ "learning_rate": 7.166153890598143e-05,
510
+ "loss": 0.0015,
511
+ "step": 163868
512
  },
513
  {
514
+ "epoch": 7.2,
515
+ "learning_rate": 7.11059111284444e-05,
516
+ "loss": 0.0015,
517
+ "step": 166176
518
  },
519
  {
520
+ "epoch": 7.3,
521
+ "learning_rate": 7.055028335090735e-05,
522
+ "loss": 0.0015,
523
+ "step": 168484
524
  },
525
  {
526
+ "epoch": 7.4,
527
+ "learning_rate": 6.999465557337031e-05,
528
+ "loss": 0.0015,
529
+ "step": 170792
530
  },
531
  {
532
+ "epoch": 7.5,
533
+ "learning_rate": 6.943902779583327e-05,
534
+ "loss": 0.0015,
535
+ "step": 173100
536
  },
537
  {
538
+ "epoch": 7.6,
539
+ "learning_rate": 6.888340001829624e-05,
540
+ "loss": 0.0016,
541
+ "step": 175408
542
  },
543
  {
544
+ "epoch": 7.7,
545
+ "learning_rate": 6.83277722407592e-05,
546
+ "loss": 0.0015,
547
+ "step": 177716
548
  },
549
  {
550
+ "epoch": 7.8,
551
+ "learning_rate": 6.777214446322216e-05,
552
+ "loss": 0.0016,
553
+ "step": 180024
554
  },
555
  {
556
+ "epoch": 7.9,
557
+ "learning_rate": 6.721651668568512e-05,
558
+ "loss": 0.0017,
559
+ "step": 182332
560
  },
561
  {
562
+ "epoch": 8.0,
563
+ "eval_loss": 0.01417616382241249,
564
+ "eval_max_distance": 4,
565
+ "eval_mean_distance": 0,
566
+ "eval_runtime": 17.4179,
567
+ "eval_samples_per_second": 14.812,
568
+ "eval_steps_per_second": 1.033,
569
+ "step": 184616
570
  },
571
  {
572
+ "epoch": 8.0,
573
+ "learning_rate": 6.666088890814809e-05,
574
  "loss": 0.0016,
575
+ "step": 184640
576
  },
577
  {
578
+ "epoch": 8.1,
579
+ "learning_rate": 6.610526113061105e-05,
580
+ "loss": 0.0014,
581
+ "step": 186948
582
  },
583
  {
584
+ "epoch": 8.2,
585
+ "learning_rate": 6.554963335307401e-05,
586
+ "loss": 0.0014,
587
+ "step": 189256
588
  },
589
  {
590
+ "epoch": 8.3,
591
+ "learning_rate": 6.499400557553698e-05,
592
+ "loss": 0.0014,
593
+ "step": 191564
594
  },
595
  {
596
+ "epoch": 8.4,
597
+ "learning_rate": 6.443837779799994e-05,
598
+ "loss": 0.0015,
599
+ "step": 193872
600
  },
601
  {
602
+ "epoch": 8.5,
603
+ "learning_rate": 6.38827500204629e-05,
604
+ "loss": 0.0014,
605
+ "step": 196180
606
  },
607
  {
608
+ "epoch": 8.6,
609
+ "learning_rate": 6.332712224292586e-05,
610
+ "loss": 0.0014,
611
+ "step": 198488
612
  },
613
  {
614
+ "epoch": 8.7,
615
+ "learning_rate": 6.277149446538883e-05,
616
+ "loss": 0.0015,
617
+ "step": 200796
618
  },
619
  {
620
+ "epoch": 8.8,
621
+ "learning_rate": 6.221586668785179e-05,
622
+ "loss": 0.0015,
623
+ "step": 203104
624
  },
625
  {
626
+ "epoch": 8.9,
627
+ "learning_rate": 6.166023891031475e-05,
628
+ "loss": 0.0014,
629
+ "step": 205412
630
  },
631
  {
632
+ "epoch": 9.0,
633
+ "eval_loss": 0.011828480288386345,
634
+ "eval_max_distance": 4,
635
+ "eval_mean_distance": 0,
636
+ "eval_runtime": 17.0049,
637
+ "eval_samples_per_second": 15.172,
638
+ "eval_steps_per_second": 1.059,
639
+ "step": 207693
640
  },
641
  {
642
+ "epoch": 9.0,
643
+ "learning_rate": 6.110461113277771e-05,
644
+ "loss": 0.0015,
645
+ "step": 207720
646
  },
647
  {
648
+ "epoch": 9.1,
649
+ "learning_rate": 6.054898335524067e-05,
650
+ "loss": 0.0014,
651
+ "step": 210028
652
  },
653
  {
654
+ "epoch": 9.2,
655
+ "learning_rate": 5.9993355577703634e-05,
656
+ "loss": 0.0013,
657
+ "step": 212336
658
  },
659
  {
660
+ "epoch": 9.3,
661
+ "learning_rate": 5.9437727800166596e-05,
662
+ "loss": 0.0014,
663
+ "step": 214644
664
  },
665
  {
666
+ "epoch": 9.4,
667
+ "learning_rate": 5.888210002262956e-05,
668
+ "loss": 0.0014,
669
+ "step": 216952
670
  },
671
  {
672
+ "epoch": 9.5,
673
+ "learning_rate": 5.832647224509252e-05,
674
+ "loss": 0.0013,
675
+ "step": 219260
676
  },
677
  {
678
+ "epoch": 9.6,
679
+ "learning_rate": 5.7770844467555485e-05,
680
+ "loss": 0.0013,
681
+ "step": 221568
682
  },
683
  {
684
+ "epoch": 9.7,
685
+ "learning_rate": 5.721521669001845e-05,
686
+ "loss": 0.0013,
687
+ "step": 223876
688
  },
689
  {
690
+ "epoch": 9.8,
691
+ "learning_rate": 5.665958891248141e-05,
692
+ "loss": 0.0014,
693
+ "step": 226184
694
  },
695
  {
696
+ "epoch": 9.9,
697
+ "learning_rate": 5.610396113494437e-05,
698
+ "loss": 0.0014,
699
+ "step": 228492
700
  },
701
  {
702
+ "epoch": 10.0,
703
+ "eval_loss": 0.011539922095835209,
704
+ "eval_max_distance": 3,
705
+ "eval_mean_distance": 0,
706
+ "eval_runtime": 17.0363,
707
+ "eval_samples_per_second": 15.144,
708
+ "eval_steps_per_second": 1.057,
709
+ "step": 230770
710
  },
711
  {
712
+ "epoch": 10.0,
713
+ "learning_rate": 5.5548333357407336e-05,
714
+ "loss": 0.0015,
715
+ "step": 230800
 
 
 
 
716
  },
717
  {
718
+ "epoch": 10.1,
719
+ "learning_rate": 5.49927055798703e-05,
720
+ "loss": 0.0013,
721
+ "step": 233108
722
  },
723
  {
724
+ "epoch": 10.2,
725
+ "learning_rate": 5.443707780233326e-05,
726
+ "loss": 0.0012,
727
+ "step": 235416
728
  },
729
  {
730
+ "epoch": 10.3,
731
+ "learning_rate": 5.388145002479622e-05,
732
+ "loss": 0.0013,
733
+ "step": 237724
734
  },
735
  {
736
+ "epoch": 10.4,
737
+ "learning_rate": 5.332582224725918e-05,
738
+ "loss": 0.0013,
739
+ "step": 240032
740
  },
741
  {
742
+ "epoch": 10.5,
743
+ "learning_rate": 5.277019446972214e-05,
744
+ "loss": 0.0014,
745
+ "step": 242340
746
  },
747
  {
748
+ "epoch": 10.6,
749
+ "learning_rate": 5.2214566692185106e-05,
750
+ "loss": 0.0013,
751
+ "step": 244648
752
  },
753
  {
754
+ "epoch": 10.7,
755
+ "learning_rate": 5.165893891464807e-05,
756
+ "loss": 0.0013,
757
+ "step": 246956
758
  },
759
  {
760
+ "epoch": 10.8,
761
+ "learning_rate": 5.110331113711103e-05,
762
+ "loss": 0.0013,
763
+ "step": 249264
764
  },
765
  {
766
+ "epoch": 10.9,
767
+ "learning_rate": 5.0547683359573994e-05,
768
+ "loss": 0.0013,
769
+ "step": 251572
770
  },
771
  {
772
+ "epoch": 11.0,
773
+ "eval_loss": 0.011254764162003994,
774
+ "eval_max_distance": 3,
775
+ "eval_mean_distance": 0,
776
+ "eval_runtime": 16.9212,
777
+ "eval_samples_per_second": 15.247,
778
+ "eval_steps_per_second": 1.064,
779
+ "step": 253847
780
  },
781
  {
782
+ "epoch": 11.0,
783
+ "learning_rate": 4.999205558203695e-05,
784
+ "loss": 0.0012,
785
+ "step": 253880
786
  },
787
  {
788
+ "epoch": 11.1,
789
+ "learning_rate": 4.943642780449991e-05,
790
+ "loss": 0.0012,
791
+ "step": 256188
792
  },
793
  {
794
+ "epoch": 11.2,
795
+ "learning_rate": 4.8880800026962876e-05,
796
+ "loss": 0.0012,
797
+ "step": 258496
798
  },
799
  {
800
+ "epoch": 11.3,
801
+ "learning_rate": 4.832517224942584e-05,
802
+ "loss": 0.0013,
803
+ "step": 260804
804
  },
805
  {
806
+ "epoch": 11.4,
807
+ "learning_rate": 4.77695444718888e-05,
808
+ "loss": 0.0012,
809
+ "step": 263112
810
  },
811
  {
812
+ "epoch": 11.5,
813
+ "learning_rate": 4.7213916694351764e-05,
814
+ "loss": 0.0012,
815
+ "step": 265420
816
  },
817
  {
818
+ "epoch": 11.6,
819
+ "learning_rate": 4.665828891681473e-05,
820
+ "loss": 0.0012,
821
+ "step": 267728
822
  },
823
  {
824
+ "epoch": 11.7,
825
+ "learning_rate": 4.610266113927768e-05,
826
+ "loss": 0.0013,
827
+ "step": 270036
828
  },
829
  {
830
+ "epoch": 11.8,
831
+ "learning_rate": 4.5547033361740646e-05,
832
+ "loss": 0.0012,
833
+ "step": 272344
834
  },
835
  {
836
+ "epoch": 11.9,
837
+ "learning_rate": 4.499140558420361e-05,
838
+ "loss": 0.0012,
839
+ "step": 274652
840
  },
841
  {
842
+ "epoch": 12.0,
843
+ "eval_loss": 0.012018387205898762,
844
+ "eval_max_distance": 3,
845
+ "eval_mean_distance": 0,
846
+ "eval_runtime": 16.9292,
847
+ "eval_samples_per_second": 15.24,
848
+ "eval_steps_per_second": 1.063,
849
+ "step": 276924
850
  },
851
  {
852
+ "epoch": 12.0,
853
+ "learning_rate": 4.443577780666657e-05,
854
+ "loss": 0.0013,
855
+ "step": 276960
856
  },
857
  {
858
+ "epoch": 12.1,
859
+ "learning_rate": 4.3880150029129535e-05,
860
+ "loss": 0.0012,
861
+ "step": 279268
862
  },
863
  {
864
+ "epoch": 12.2,
865
+ "learning_rate": 4.33245222515925e-05,
866
+ "loss": 0.0011,
867
+ "step": 281576
868
  },
869
  {
870
+ "epoch": 12.3,
871
+ "learning_rate": 4.276889447405546e-05,
872
+ "loss": 0.0012,
873
+ "step": 283884
874
  },
875
  {
876
+ "epoch": 12.4,
877
+ "learning_rate": 4.221326669651842e-05,
878
+ "loss": 0.0012,
879
+ "step": 286192
880
  },
881
  {
882
+ "epoch": 12.5,
883
+ "learning_rate": 4.1657638918981386e-05,
884
+ "loss": 0.0012,
885
+ "step": 288500
886
  },
887
  {
888
+ "epoch": 12.6,
889
+ "learning_rate": 4.110201114144435e-05,
890
+ "loss": 0.0011,
891
+ "step": 290808
892
  },
893
  {
894
+ "epoch": 12.7,
895
+ "learning_rate": 4.054638336390731e-05,
896
+ "loss": 0.0011,
897
+ "step": 293116
898
  },
899
  {
900
+ "epoch": 12.8,
901
+ "learning_rate": 3.9990755586370274e-05,
902
+ "loss": 0.0012,
903
+ "step": 295424
904
  },
905
  {
906
+ "epoch": 12.9,
907
+ "learning_rate": 3.943512780883323e-05,
908
+ "loss": 0.0012,
909
+ "step": 297732
910
  },
911
  {
912
+ "epoch": 13.0,
913
+ "eval_loss": 0.013248566538095474,
914
+ "eval_max_distance": 3,
915
+ "eval_mean_distance": 0,
916
+ "eval_runtime": 16.6859,
917
+ "eval_samples_per_second": 15.462,
918
+ "eval_steps_per_second": 1.079,
919
+ "step": 300001
920
  },
921
  {
922
+ "epoch": 13.0,
923
+ "learning_rate": 3.887950003129619e-05,
924
+ "loss": 0.0012,
925
+ "step": 300040
926
  },
927
  {
928
+ "epoch": 13.1,
929
+ "learning_rate": 3.8323872253759156e-05,
930
+ "loss": 0.0011,
931
+ "step": 302348
932
  },
933
  {
934
+ "epoch": 13.2,
935
+ "learning_rate": 3.776824447622212e-05,
936
+ "loss": 0.0011,
937
+ "step": 304656
938
  },
939
  {
940
+ "epoch": 13.3,
941
+ "learning_rate": 3.721261669868508e-05,
942
+ "loss": 0.0011,
943
+ "step": 306964
944
  },
945
  {
946
+ "epoch": 13.4,
947
+ "learning_rate": 3.6656988921148044e-05,
948
+ "loss": 0.0012,
949
+ "step": 309272
950
  },
951
  {
952
+ "epoch": 13.5,
953
+ "learning_rate": 3.954328163153008e-06,
954
+ "loss": 0.0012,
955
+ "step": 311580
956
  },
957
  {
958
+ "epoch": 13.6,
959
+ "learning_rate": 3.16057364927606e-06,
960
+ "loss": 0.0011,
961
+ "step": 313888
962
  },
963
  {
964
+ "epoch": 13.7,
965
+ "learning_rate": 2.3668191353991126e-06,
966
+ "loss": 0.0011,
967
+ "step": 316196
968
  },
969
  {
970
+ "epoch": 13.8,
971
+ "learning_rate": 1.5730646215221654e-06,
972
+ "loss": 0.001,
973
+ "step": 318504
 
 
 
 
974
  },
975
  {
976
+ "epoch": 13.9,
977
+ "learning_rate": 7.79310107645218e-07,
978
+ "loss": 0.001,
979
+ "step": 320812
980
+ },
981
+ {
982
+ "epoch": 14.0,
983
+ "eval_loss": 0.011406470090150833,
984
+ "eval_max_distance": 3,
985
+ "eval_mean_distance": 0,
986
+ "eval_runtime": 18.3924,
987
+ "eval_samples_per_second": 14.028,
988
+ "eval_steps_per_second": 0.979,
989
+ "step": 323078
990
+ },
991
+ {
992
+ "epoch": 14.0,
993
+ "step": 323078,
994
+ "total_flos": 8.183655700394803e+16,
995
+ "train_loss": 4.6522844528394624e-05,
996
+ "train_runtime": 1031.7354,
997
+ "train_samples_per_second": 4696.956,
998
+ "train_steps_per_second": 313.14
999
  }
1000
  ],
1001
+ "logging_steps": 2308,
1002
+ "max_steps": 323078,
1003
+ "num_train_epochs": 14,
1004
+ "save_steps": 4616,
1005
+ "total_flos": 8.183655700394803e+16,
1006
  "trial_name": null,
1007
  "trial_params": null
1008
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8318e24b1ced526ec88f5a701462bec50052cffbe6f8dcc3d2adf56c581c256
3
  size 4091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98e60278def126043d26fff9ae6c83d3b556a4273694a6e32f09bf128f657352
3
  size 4091