VJLasmanis commited on
Commit
1a3972c
·
1 Parent(s): c5ed8cd

Delete checkpoint data

Browse files
large-1/checkpoint-97002/config.json DELETED
@@ -1,32 +0,0 @@
1
- {
2
- "_name_or_path": "google/mt5-large",
3
- "architectures": [
4
- "MT5ForConditionalGeneration"
5
- ],
6
- "d_ff": 2816,
7
- "d_kv": 64,
8
- "d_model": 1024,
9
- "decoder_start_token_id": 0,
10
- "dense_act_fn": "gelu_new",
11
- "dropout_rate": 0.1,
12
- "eos_token_id": 1,
13
- "feed_forward_proj": "gated-gelu",
14
- "initializer_factor": 1.0,
15
- "is_encoder_decoder": true,
16
- "is_gated_act": true,
17
- "layer_norm_epsilon": 1e-06,
18
- "model_type": "mt5",
19
- "num_decoder_layers": 24,
20
- "num_heads": 16,
21
- "num_layers": 24,
22
- "output_past": true,
23
- "pad_token_id": 0,
24
- "relative_attention_max_distance": 128,
25
- "relative_attention_num_buckets": 32,
26
- "tie_word_embeddings": false,
27
- "tokenizer_class": "T5Tokenizer",
28
- "torch_dtype": "float32",
29
- "transformers_version": "4.29.1",
30
- "use_cache": true,
31
- "vocab_size": 250112
32
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
large-1/checkpoint-97002/generation_config.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "decoder_start_token_id": 0,
4
- "eos_token_id": 1,
5
- "pad_token_id": 0,
6
- "transformers_version": "4.29.1"
7
- }
 
 
 
 
 
 
 
 
large-1/checkpoint-97002/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:97bc9add4003432e4b73fbf5195365630a1e94653a8a6299f12887ab8f518f8f
3
- size 9836995421
 
 
 
 
large-1/checkpoint-97002/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bde965fc31b963bf99a38ce54b1ab687c2386e8b45f342fa07e82f61e2c2bf0d
3
- size 4918519065
 
 
 
 
large-1/checkpoint-97002/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6d2c3e848b2aa31b46401d9c34136b5d405f81068653c8c796ae9973715d12f
3
- size 14575
 
 
 
 
large-1/checkpoint-97002/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d549102eb8db911ba7a9c667c760cbf641146127e2cc87d0dbbf7e01a68f02d
3
- size 627
 
 
 
 
large-1/checkpoint-97002/trainer_state.json DELETED
@@ -1,1228 +0,0 @@
1
- {
2
- "best_metric": 0.004835214000195265,
3
- "best_model_checkpoint": "./results/large-1/checkpoint-64668",
4
- "epoch": 6.0,
5
- "global_step": 97002,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.03,
12
- "learning_rate": 1e-05,
13
- "loss": 29.3333,
14
- "step": 500
15
- },
16
- {
17
- "epoch": 0.06,
18
- "learning_rate": 2e-05,
19
- "loss": 12.5737,
20
- "step": 1000
21
- },
22
- {
23
- "epoch": 0.09,
24
- "learning_rate": 1.9968976856735126e-05,
25
- "loss": 2.3813,
26
- "step": 1500
27
- },
28
- {
29
- "epoch": 0.12,
30
- "learning_rate": 1.993795371347025e-05,
31
- "loss": 0.2316,
32
- "step": 2000
33
- },
34
- {
35
- "epoch": 0.15,
36
- "learning_rate": 1.9906930570205375e-05,
37
- "loss": 0.099,
38
- "step": 2500
39
- },
40
- {
41
- "epoch": 0.19,
42
- "learning_rate": 1.98759074269405e-05,
43
- "loss": 0.0608,
44
- "step": 3000
45
- },
46
- {
47
- "epoch": 0.22,
48
- "learning_rate": 1.9844884283675623e-05,
49
- "loss": 0.0565,
50
- "step": 3500
51
- },
52
- {
53
- "epoch": 0.25,
54
- "learning_rate": 1.9813861140410748e-05,
55
- "loss": 0.0435,
56
- "step": 4000
57
- },
58
- {
59
- "epoch": 0.28,
60
- "learning_rate": 1.9782837997145872e-05,
61
- "loss": 0.0354,
62
- "step": 4500
63
- },
64
- {
65
- "epoch": 0.31,
66
- "learning_rate": 1.9751814853880996e-05,
67
- "loss": 0.0339,
68
- "step": 5000
69
- },
70
- {
71
- "epoch": 0.34,
72
- "learning_rate": 1.972079171061612e-05,
73
- "loss": 0.0294,
74
- "step": 5500
75
- },
76
- {
77
- "epoch": 0.37,
78
- "learning_rate": 1.9689768567351245e-05,
79
- "loss": 0.035,
80
- "step": 6000
81
- },
82
- {
83
- "epoch": 0.4,
84
- "learning_rate": 1.965874542408637e-05,
85
- "loss": 0.0515,
86
- "step": 6500
87
- },
88
- {
89
- "epoch": 0.43,
90
- "learning_rate": 1.9627722280821494e-05,
91
- "loss": 0.0232,
92
- "step": 7000
93
- },
94
- {
95
- "epoch": 0.46,
96
- "learning_rate": 1.9596699137556618e-05,
97
- "loss": 0.0349,
98
- "step": 7500
99
- },
100
- {
101
- "epoch": 0.49,
102
- "learning_rate": 1.9565675994291742e-05,
103
- "loss": 0.0226,
104
- "step": 8000
105
- },
106
- {
107
- "epoch": 0.53,
108
- "learning_rate": 1.9534652851026867e-05,
109
- "loss": 0.0215,
110
- "step": 8500
111
- },
112
- {
113
- "epoch": 0.56,
114
- "learning_rate": 1.9503629707761994e-05,
115
- "loss": 0.0204,
116
- "step": 9000
117
- },
118
- {
119
- "epoch": 0.59,
120
- "learning_rate": 1.9472606564497115e-05,
121
- "loss": 0.0192,
122
- "step": 9500
123
- },
124
- {
125
- "epoch": 0.62,
126
- "learning_rate": 1.9441583421232243e-05,
127
- "loss": 0.0194,
128
- "step": 10000
129
- },
130
- {
131
- "epoch": 0.65,
132
- "learning_rate": 1.9410560277967364e-05,
133
- "loss": 0.0179,
134
- "step": 10500
135
- },
136
- {
137
- "epoch": 0.68,
138
- "learning_rate": 1.937953713470249e-05,
139
- "loss": 0.017,
140
- "step": 11000
141
- },
142
- {
143
- "epoch": 0.71,
144
- "learning_rate": 1.9348513991437613e-05,
145
- "loss": 0.0226,
146
- "step": 11500
147
- },
148
- {
149
- "epoch": 0.74,
150
- "learning_rate": 1.9317490848172737e-05,
151
- "loss": 0.016,
152
- "step": 12000
153
- },
154
- {
155
- "epoch": 0.77,
156
- "learning_rate": 1.928646770490786e-05,
157
- "loss": 0.0154,
158
- "step": 12500
159
- },
160
- {
161
- "epoch": 0.8,
162
- "learning_rate": 1.9255444561642986e-05,
163
- "loss": 0.0157,
164
- "step": 13000
165
- },
166
- {
167
- "epoch": 0.84,
168
- "learning_rate": 1.9224421418378113e-05,
169
- "loss": 0.0139,
170
- "step": 13500
171
- },
172
- {
173
- "epoch": 0.87,
174
- "learning_rate": 1.9193398275113234e-05,
175
- "loss": 0.0129,
176
- "step": 14000
177
- },
178
- {
179
- "epoch": 0.9,
180
- "learning_rate": 1.9162375131848362e-05,
181
- "loss": 0.0134,
182
- "step": 14500
183
- },
184
- {
185
- "epoch": 0.93,
186
- "learning_rate": 1.9131351988583483e-05,
187
- "loss": 0.0134,
188
- "step": 15000
189
- },
190
- {
191
- "epoch": 0.96,
192
- "learning_rate": 1.910032884531861e-05,
193
- "loss": 0.0133,
194
- "step": 15500
195
- },
196
- {
197
- "epoch": 0.99,
198
- "learning_rate": 1.9069305702053732e-05,
199
- "loss": 0.0133,
200
- "step": 16000
201
- },
202
- {
203
- "epoch": 1.0,
204
- "eval_loss": 0.007159320637583733,
205
- "eval_runtime": 63.5558,
206
- "eval_samples_per_second": 113.05,
207
- "eval_steps_per_second": 7.08,
208
- "step": 16167
209
- },
210
- {
211
- "epoch": 1.02,
212
- "learning_rate": 1.903828255878886e-05,
213
- "loss": 0.0118,
214
- "step": 16500
215
- },
216
- {
217
- "epoch": 1.05,
218
- "learning_rate": 1.900725941552398e-05,
219
- "loss": 0.0114,
220
- "step": 17000
221
- },
222
- {
223
- "epoch": 1.08,
224
- "learning_rate": 1.8976236272259108e-05,
225
- "loss": 0.0115,
226
- "step": 17500
227
- },
228
- {
229
- "epoch": 1.11,
230
- "learning_rate": 1.8945213128994233e-05,
231
- "loss": 0.0115,
232
- "step": 18000
233
- },
234
- {
235
- "epoch": 1.14,
236
- "learning_rate": 1.8914189985729357e-05,
237
- "loss": 0.0118,
238
- "step": 18500
239
- },
240
- {
241
- "epoch": 1.18,
242
- "learning_rate": 1.888316684246448e-05,
243
- "loss": 0.0119,
244
- "step": 19000
245
- },
246
- {
247
- "epoch": 1.21,
248
- "learning_rate": 1.8852143699199606e-05,
249
- "loss": 0.0117,
250
- "step": 19500
251
- },
252
- {
253
- "epoch": 1.24,
254
- "learning_rate": 1.882112055593473e-05,
255
- "loss": 0.0113,
256
- "step": 20000
257
- },
258
- {
259
- "epoch": 1.27,
260
- "learning_rate": 1.8790097412669854e-05,
261
- "loss": 0.0102,
262
- "step": 20500
263
- },
264
- {
265
- "epoch": 1.3,
266
- "learning_rate": 1.875907426940498e-05,
267
- "loss": 0.0109,
268
- "step": 21000
269
- },
270
- {
271
- "epoch": 1.33,
272
- "learning_rate": 1.87280511261401e-05,
273
- "loss": 0.0096,
274
- "step": 21500
275
- },
276
- {
277
- "epoch": 1.36,
278
- "learning_rate": 1.8697027982875227e-05,
279
- "loss": 0.0105,
280
- "step": 22000
281
- },
282
- {
283
- "epoch": 1.39,
284
- "learning_rate": 1.866600483961035e-05,
285
- "loss": 0.0093,
286
- "step": 22500
287
- },
288
- {
289
- "epoch": 1.42,
290
- "learning_rate": 1.8634981696345476e-05,
291
- "loss": 0.0096,
292
- "step": 23000
293
- },
294
- {
295
- "epoch": 1.45,
296
- "learning_rate": 1.86039585530806e-05,
297
- "loss": 0.0091,
298
- "step": 23500
299
- },
300
- {
301
- "epoch": 1.48,
302
- "learning_rate": 1.8572935409815725e-05,
303
- "loss": 0.0099,
304
- "step": 24000
305
- },
306
- {
307
- "epoch": 1.52,
308
- "learning_rate": 1.854191226655085e-05,
309
- "loss": 0.009,
310
- "step": 24500
311
- },
312
- {
313
- "epoch": 1.55,
314
- "learning_rate": 1.8510889123285973e-05,
315
- "loss": 0.0097,
316
- "step": 25000
317
- },
318
- {
319
- "epoch": 1.58,
320
- "learning_rate": 1.8479865980021098e-05,
321
- "loss": 0.0089,
322
- "step": 25500
323
- },
324
- {
325
- "epoch": 1.61,
326
- "learning_rate": 1.8448842836756222e-05,
327
- "loss": 0.0097,
328
- "step": 26000
329
- },
330
- {
331
- "epoch": 1.64,
332
- "learning_rate": 1.8417819693491346e-05,
333
- "loss": 0.0099,
334
- "step": 26500
335
- },
336
- {
337
- "epoch": 1.67,
338
- "learning_rate": 1.838679655022647e-05,
339
- "loss": 0.0092,
340
- "step": 27000
341
- },
342
- {
343
- "epoch": 1.7,
344
- "learning_rate": 1.8355773406961595e-05,
345
- "loss": 0.0094,
346
- "step": 27500
347
- },
348
- {
349
- "epoch": 1.73,
350
- "learning_rate": 1.832475026369672e-05,
351
- "loss": 0.0087,
352
- "step": 28000
353
- },
354
- {
355
- "epoch": 1.76,
356
- "learning_rate": 1.8293727120431844e-05,
357
- "loss": 0.0091,
358
- "step": 28500
359
- },
360
- {
361
- "epoch": 1.79,
362
- "learning_rate": 1.8262703977166968e-05,
363
- "loss": 0.0098,
364
- "step": 29000
365
- },
366
- {
367
- "epoch": 1.82,
368
- "learning_rate": 1.8231680833902092e-05,
369
- "loss": 0.0093,
370
- "step": 29500
371
- },
372
- {
373
- "epoch": 1.86,
374
- "learning_rate": 1.8200657690637217e-05,
375
- "loss": 0.0088,
376
- "step": 30000
377
- },
378
- {
379
- "epoch": 1.89,
380
- "learning_rate": 1.816963454737234e-05,
381
- "loss": 0.008,
382
- "step": 30500
383
- },
384
- {
385
- "epoch": 1.92,
386
- "learning_rate": 1.8138611404107465e-05,
387
- "loss": 0.0088,
388
- "step": 31000
389
- },
390
- {
391
- "epoch": 1.95,
392
- "learning_rate": 1.810758826084259e-05,
393
- "loss": 0.0084,
394
- "step": 31500
395
- },
396
- {
397
- "epoch": 1.98,
398
- "learning_rate": 1.8076565117577714e-05,
399
- "loss": 0.0087,
400
- "step": 32000
401
- },
402
- {
403
- "epoch": 2.0,
404
- "eval_loss": 0.005626102443784475,
405
- "eval_runtime": 63.5514,
406
- "eval_samples_per_second": 113.058,
407
- "eval_steps_per_second": 7.081,
408
- "step": 32334
409
- },
410
- {
411
- "epoch": 2.01,
412
- "learning_rate": 1.804554197431284e-05,
413
- "loss": 0.0081,
414
- "step": 32500
415
- },
416
- {
417
- "epoch": 2.04,
418
- "learning_rate": 1.8014518831047963e-05,
419
- "loss": 0.007,
420
- "step": 33000
421
- },
422
- {
423
- "epoch": 2.07,
424
- "learning_rate": 1.7983495687783087e-05,
425
- "loss": 0.0072,
426
- "step": 33500
427
- },
428
- {
429
- "epoch": 2.1,
430
- "learning_rate": 1.795247254451821e-05,
431
- "loss": 0.0069,
432
- "step": 34000
433
- },
434
- {
435
- "epoch": 2.13,
436
- "learning_rate": 1.7921449401253336e-05,
437
- "loss": 0.0072,
438
- "step": 34500
439
- },
440
- {
441
- "epoch": 2.16,
442
- "learning_rate": 1.789042625798846e-05,
443
- "loss": 0.0067,
444
- "step": 35000
445
- },
446
- {
447
- "epoch": 2.2,
448
- "learning_rate": 1.7859403114723584e-05,
449
- "loss": 0.007,
450
- "step": 35500
451
- },
452
- {
453
- "epoch": 2.23,
454
- "learning_rate": 1.782837997145871e-05,
455
- "loss": 0.0072,
456
- "step": 36000
457
- },
458
- {
459
- "epoch": 2.26,
460
- "learning_rate": 1.7797356828193833e-05,
461
- "loss": 0.0074,
462
- "step": 36500
463
- },
464
- {
465
- "epoch": 2.29,
466
- "learning_rate": 1.7766333684928957e-05,
467
- "loss": 0.0075,
468
- "step": 37000
469
- },
470
- {
471
- "epoch": 2.32,
472
- "learning_rate": 1.7735310541664085e-05,
473
- "loss": 0.0072,
474
- "step": 37500
475
- },
476
- {
477
- "epoch": 2.35,
478
- "learning_rate": 1.7704287398399206e-05,
479
- "loss": 0.0061,
480
- "step": 38000
481
- },
482
- {
483
- "epoch": 2.38,
484
- "learning_rate": 1.7673264255134334e-05,
485
- "loss": 0.0071,
486
- "step": 38500
487
- },
488
- {
489
- "epoch": 2.41,
490
- "learning_rate": 1.7642241111869455e-05,
491
- "loss": 0.0074,
492
- "step": 39000
493
- },
494
- {
495
- "epoch": 2.44,
496
- "learning_rate": 1.7611217968604582e-05,
497
- "loss": 0.0072,
498
- "step": 39500
499
- },
500
- {
501
- "epoch": 2.47,
502
- "learning_rate": 1.7580194825339703e-05,
503
- "loss": 0.0072,
504
- "step": 40000
505
- },
506
- {
507
- "epoch": 2.51,
508
- "learning_rate": 1.754917168207483e-05,
509
- "loss": 0.0075,
510
- "step": 40500
511
- },
512
- {
513
- "epoch": 2.54,
514
- "learning_rate": 1.7518148538809952e-05,
515
- "loss": 0.0069,
516
- "step": 41000
517
- },
518
- {
519
- "epoch": 2.57,
520
- "learning_rate": 1.7487125395545076e-05,
521
- "loss": 0.007,
522
- "step": 41500
523
- },
524
- {
525
- "epoch": 2.6,
526
- "learning_rate": 1.7456102252280204e-05,
527
- "loss": 0.0058,
528
- "step": 42000
529
- },
530
- {
531
- "epoch": 2.63,
532
- "learning_rate": 1.7425079109015325e-05,
533
- "loss": 0.0064,
534
- "step": 42500
535
- },
536
- {
537
- "epoch": 2.66,
538
- "learning_rate": 1.7394055965750453e-05,
539
- "loss": 0.0067,
540
- "step": 43000
541
- },
542
- {
543
- "epoch": 2.69,
544
- "learning_rate": 1.7363032822485574e-05,
545
- "loss": 0.0062,
546
- "step": 43500
547
- },
548
- {
549
- "epoch": 2.72,
550
- "learning_rate": 1.73320096792207e-05,
551
- "loss": 0.0066,
552
- "step": 44000
553
- },
554
- {
555
- "epoch": 2.75,
556
- "learning_rate": 1.7300986535955822e-05,
557
- "loss": 0.0066,
558
- "step": 44500
559
- },
560
- {
561
- "epoch": 2.78,
562
- "learning_rate": 1.726996339269095e-05,
563
- "loss": 0.0071,
564
- "step": 45000
565
- },
566
- {
567
- "epoch": 2.81,
568
- "learning_rate": 1.723894024942607e-05,
569
- "loss": 0.0065,
570
- "step": 45500
571
- },
572
- {
573
- "epoch": 2.85,
574
- "learning_rate": 1.72079171061612e-05,
575
- "loss": 0.0065,
576
- "step": 46000
577
- },
578
- {
579
- "epoch": 2.88,
580
- "learning_rate": 1.7176893962896323e-05,
581
- "loss": 0.0065,
582
- "step": 46500
583
- },
584
- {
585
- "epoch": 2.91,
586
- "learning_rate": 1.7145870819631448e-05,
587
- "loss": 0.0062,
588
- "step": 47000
589
- },
590
- {
591
- "epoch": 2.94,
592
- "learning_rate": 1.7114847676366572e-05,
593
- "loss": 0.0075,
594
- "step": 47500
595
- },
596
- {
597
- "epoch": 2.97,
598
- "learning_rate": 1.7083824533101696e-05,
599
- "loss": 0.0064,
600
- "step": 48000
601
- },
602
- {
603
- "epoch": 3.0,
604
- "learning_rate": 1.705280138983682e-05,
605
- "loss": 0.0063,
606
- "step": 48500
607
- },
608
- {
609
- "epoch": 3.0,
610
- "eval_loss": 0.005162048153579235,
611
- "eval_runtime": 63.5102,
612
- "eval_samples_per_second": 113.132,
613
- "eval_steps_per_second": 7.085,
614
- "step": 48501
615
- },
616
- {
617
- "epoch": 3.03,
618
- "learning_rate": 1.7021778246571945e-05,
619
- "loss": 0.0055,
620
- "step": 49000
621
- },
622
- {
623
- "epoch": 3.06,
624
- "learning_rate": 1.699075510330707e-05,
625
- "loss": 0.0055,
626
- "step": 49500
627
- },
628
- {
629
- "epoch": 3.09,
630
- "learning_rate": 1.6959731960042194e-05,
631
- "loss": 0.0059,
632
- "step": 50000
633
- },
634
- {
635
- "epoch": 3.12,
636
- "learning_rate": 1.6928708816777318e-05,
637
- "loss": 0.0052,
638
- "step": 50500
639
- },
640
- {
641
- "epoch": 3.15,
642
- "learning_rate": 1.6897685673512442e-05,
643
- "loss": 0.0053,
644
- "step": 51000
645
- },
646
- {
647
- "epoch": 3.19,
648
- "learning_rate": 1.6866662530247567e-05,
649
- "loss": 0.005,
650
- "step": 51500
651
- },
652
- {
653
- "epoch": 3.22,
654
- "learning_rate": 1.683563938698269e-05,
655
- "loss": 0.0056,
656
- "step": 52000
657
- },
658
- {
659
- "epoch": 3.25,
660
- "learning_rate": 1.6804616243717815e-05,
661
- "loss": 0.0051,
662
- "step": 52500
663
- },
664
- {
665
- "epoch": 3.28,
666
- "learning_rate": 1.677359310045294e-05,
667
- "loss": 0.0053,
668
- "step": 53000
669
- },
670
- {
671
- "epoch": 3.31,
672
- "learning_rate": 1.6742569957188064e-05,
673
- "loss": 0.0053,
674
- "step": 53500
675
- },
676
- {
677
- "epoch": 3.34,
678
- "learning_rate": 1.6711546813923188e-05,
679
- "loss": 0.0051,
680
- "step": 54000
681
- },
682
- {
683
- "epoch": 3.37,
684
- "learning_rate": 1.6680523670658313e-05,
685
- "loss": 0.0056,
686
- "step": 54500
687
- },
688
- {
689
- "epoch": 3.4,
690
- "learning_rate": 1.6649500527393437e-05,
691
- "loss": 0.0068,
692
- "step": 55000
693
- },
694
- {
695
- "epoch": 3.43,
696
- "learning_rate": 1.661847738412856e-05,
697
- "loss": 0.0054,
698
- "step": 55500
699
- },
700
- {
701
- "epoch": 3.46,
702
- "learning_rate": 1.6587454240863686e-05,
703
- "loss": 0.0055,
704
- "step": 56000
705
- },
706
- {
707
- "epoch": 3.49,
708
- "learning_rate": 1.655643109759881e-05,
709
- "loss": 0.0056,
710
- "step": 56500
711
- },
712
- {
713
- "epoch": 3.53,
714
- "learning_rate": 1.6525407954333934e-05,
715
- "loss": 0.0053,
716
- "step": 57000
717
- },
718
- {
719
- "epoch": 3.56,
720
- "learning_rate": 1.649438481106906e-05,
721
- "loss": 0.0051,
722
- "step": 57500
723
- },
724
- {
725
- "epoch": 3.59,
726
- "learning_rate": 1.6463361667804183e-05,
727
- "loss": 0.005,
728
- "step": 58000
729
- },
730
- {
731
- "epoch": 3.62,
732
- "learning_rate": 1.6432338524539307e-05,
733
- "loss": 0.0055,
734
- "step": 58500
735
- },
736
- {
737
- "epoch": 3.65,
738
- "learning_rate": 1.640131538127443e-05,
739
- "loss": 0.0048,
740
- "step": 59000
741
- },
742
- {
743
- "epoch": 3.68,
744
- "learning_rate": 1.6370292238009556e-05,
745
- "loss": 0.0052,
746
- "step": 59500
747
- },
748
- {
749
- "epoch": 3.71,
750
- "learning_rate": 1.633926909474468e-05,
751
- "loss": 0.0048,
752
- "step": 60000
753
- },
754
- {
755
- "epoch": 3.74,
756
- "learning_rate": 1.6308245951479805e-05,
757
- "loss": 0.0049,
758
- "step": 60500
759
- },
760
- {
761
- "epoch": 3.77,
762
- "learning_rate": 1.627722280821493e-05,
763
- "loss": 0.0054,
764
- "step": 61000
765
- },
766
- {
767
- "epoch": 3.8,
768
- "learning_rate": 1.6246199664950053e-05,
769
- "loss": 0.0052,
770
- "step": 61500
771
- },
772
- {
773
- "epoch": 3.83,
774
- "learning_rate": 1.6215176521685178e-05,
775
- "loss": 0.0053,
776
- "step": 62000
777
- },
778
- {
779
- "epoch": 3.87,
780
- "learning_rate": 1.6184153378420302e-05,
781
- "loss": 0.0052,
782
- "step": 62500
783
- },
784
- {
785
- "epoch": 3.9,
786
- "learning_rate": 1.6153130235155426e-05,
787
- "loss": 0.0053,
788
- "step": 63000
789
- },
790
- {
791
- "epoch": 3.93,
792
- "learning_rate": 1.612210709189055e-05,
793
- "loss": 0.0052,
794
- "step": 63500
795
- },
796
- {
797
- "epoch": 3.96,
798
- "learning_rate": 1.6091083948625675e-05,
799
- "loss": 0.0052,
800
- "step": 64000
801
- },
802
- {
803
- "epoch": 3.99,
804
- "learning_rate": 1.60600608053608e-05,
805
- "loss": 0.0054,
806
- "step": 64500
807
- },
808
- {
809
- "epoch": 4.0,
810
- "eval_loss": 0.004835214000195265,
811
- "eval_runtime": 63.5993,
812
- "eval_samples_per_second": 112.973,
813
- "eval_steps_per_second": 7.076,
814
- "step": 64668
815
- },
816
- {
817
- "epoch": 4.02,
818
- "learning_rate": 1.6029037662095924e-05,
819
- "loss": 0.0042,
820
- "step": 65000
821
- },
822
- {
823
- "epoch": 4.05,
824
- "learning_rate": 1.5998014518831048e-05,
825
- "loss": 0.0044,
826
- "step": 65500
827
- },
828
- {
829
- "epoch": 4.08,
830
- "learning_rate": 1.5966991375566176e-05,
831
- "loss": 0.0041,
832
- "step": 66000
833
- },
834
- {
835
- "epoch": 4.11,
836
- "learning_rate": 1.5935968232301297e-05,
837
- "loss": 0.0046,
838
- "step": 66500
839
- },
840
- {
841
- "epoch": 4.14,
842
- "learning_rate": 1.5904945089036424e-05,
843
- "loss": 0.004,
844
- "step": 67000
845
- },
846
- {
847
- "epoch": 4.18,
848
- "learning_rate": 1.5873921945771545e-05,
849
- "loss": 0.0044,
850
- "step": 67500
851
- },
852
- {
853
- "epoch": 4.21,
854
- "learning_rate": 1.5842898802506673e-05,
855
- "loss": 0.0047,
856
- "step": 68000
857
- },
858
- {
859
- "epoch": 4.24,
860
- "learning_rate": 1.5811875659241794e-05,
861
- "loss": 0.0043,
862
- "step": 68500
863
- },
864
- {
865
- "epoch": 4.27,
866
- "learning_rate": 1.5780852515976922e-05,
867
- "loss": 0.004,
868
- "step": 69000
869
- },
870
- {
871
- "epoch": 4.3,
872
- "learning_rate": 1.5749829372712043e-05,
873
- "loss": 0.0043,
874
- "step": 69500
875
- },
876
- {
877
- "epoch": 4.33,
878
- "learning_rate": 1.571880622944717e-05,
879
- "loss": 0.0048,
880
- "step": 70000
881
- },
882
- {
883
- "epoch": 4.36,
884
- "learning_rate": 1.5687783086182295e-05,
885
- "loss": 0.0044,
886
- "step": 70500
887
- },
888
- {
889
- "epoch": 4.39,
890
- "learning_rate": 1.5656759942917416e-05,
891
- "loss": 0.0042,
892
- "step": 71000
893
- },
894
- {
895
- "epoch": 4.42,
896
- "learning_rate": 1.5625736799652544e-05,
897
- "loss": 0.0041,
898
- "step": 71500
899
- },
900
- {
901
- "epoch": 4.45,
902
- "learning_rate": 1.5594713656387664e-05,
903
- "loss": 0.0045,
904
- "step": 72000
905
- },
906
- {
907
- "epoch": 4.48,
908
- "learning_rate": 1.5563690513122792e-05,
909
- "loss": 0.0041,
910
- "step": 72500
911
- },
912
- {
913
- "epoch": 4.52,
914
- "learning_rate": 1.5532667369857913e-05,
915
- "loss": 0.0044,
916
- "step": 73000
917
- },
918
- {
919
- "epoch": 4.55,
920
- "learning_rate": 1.550164422659304e-05,
921
- "loss": 0.0039,
922
- "step": 73500
923
- },
924
- {
925
- "epoch": 4.58,
926
- "learning_rate": 1.5470621083328162e-05,
927
- "loss": 0.0045,
928
- "step": 74000
929
- },
930
- {
931
- "epoch": 4.61,
932
- "learning_rate": 1.543959794006329e-05,
933
- "loss": 0.0039,
934
- "step": 74500
935
- },
936
- {
937
- "epoch": 4.64,
938
- "learning_rate": 1.5408574796798414e-05,
939
- "loss": 0.0044,
940
- "step": 75000
941
- },
942
- {
943
- "epoch": 4.67,
944
- "learning_rate": 1.5377551653533538e-05,
945
- "loss": 0.004,
946
- "step": 75500
947
- },
948
- {
949
- "epoch": 4.7,
950
- "learning_rate": 1.5346528510268663e-05,
951
- "loss": 0.0048,
952
- "step": 76000
953
- },
954
- {
955
- "epoch": 4.73,
956
- "learning_rate": 1.5315505367003787e-05,
957
- "loss": 0.004,
958
- "step": 76500
959
- },
960
- {
961
- "epoch": 4.76,
962
- "learning_rate": 1.528448222373891e-05,
963
- "loss": 0.0046,
964
- "step": 77000
965
- },
966
- {
967
- "epoch": 4.79,
968
- "learning_rate": 1.5253459080474036e-05,
969
- "loss": 0.0043,
970
- "step": 77500
971
- },
972
- {
973
- "epoch": 4.82,
974
- "learning_rate": 1.5222435937209158e-05,
975
- "loss": 0.0043,
976
- "step": 78000
977
- },
978
- {
979
- "epoch": 4.86,
980
- "learning_rate": 1.5191412793944284e-05,
981
- "loss": 0.0042,
982
- "step": 78500
983
- },
984
- {
985
- "epoch": 4.89,
986
- "learning_rate": 1.5160389650679409e-05,
987
- "loss": 0.0045,
988
- "step": 79000
989
- },
990
- {
991
- "epoch": 4.92,
992
- "learning_rate": 1.5129366507414533e-05,
993
- "loss": 0.004,
994
- "step": 79500
995
- },
996
- {
997
- "epoch": 4.95,
998
- "learning_rate": 1.5098343364149657e-05,
999
- "loss": 0.0037,
1000
- "step": 80000
1001
- },
1002
- {
1003
- "epoch": 4.98,
1004
- "learning_rate": 1.506732022088478e-05,
1005
- "loss": 0.0045,
1006
- "step": 80500
1007
- },
1008
- {
1009
- "epoch": 5.0,
1010
- "eval_loss": 0.004868941381573677,
1011
- "eval_runtime": 63.5068,
1012
- "eval_samples_per_second": 113.137,
1013
- "eval_steps_per_second": 7.086,
1014
- "step": 80835
1015
- },
1016
- {
1017
- "epoch": 5.01,
1018
- "learning_rate": 1.5036297077619906e-05,
1019
- "loss": 0.0038,
1020
- "step": 81000
1021
- },
1022
- {
1023
- "epoch": 5.04,
1024
- "learning_rate": 1.5005273934355029e-05,
1025
- "loss": 0.0035,
1026
- "step": 81500
1027
- },
1028
- {
1029
- "epoch": 5.07,
1030
- "learning_rate": 1.4974250791090155e-05,
1031
- "loss": 0.0035,
1032
- "step": 82000
1033
- },
1034
- {
1035
- "epoch": 5.1,
1036
- "learning_rate": 1.4943227647825277e-05,
1037
- "loss": 0.0037,
1038
- "step": 82500
1039
- },
1040
- {
1041
- "epoch": 5.13,
1042
- "learning_rate": 1.4912204504560403e-05,
1043
- "loss": 0.0031,
1044
- "step": 83000
1045
- },
1046
- {
1047
- "epoch": 5.16,
1048
- "learning_rate": 1.4881181361295528e-05,
1049
- "loss": 0.0035,
1050
- "step": 83500
1051
- },
1052
- {
1053
- "epoch": 5.2,
1054
- "learning_rate": 1.4850158218030652e-05,
1055
- "loss": 0.0037,
1056
- "step": 84000
1057
- },
1058
- {
1059
- "epoch": 5.23,
1060
- "learning_rate": 1.4819135074765776e-05,
1061
- "loss": 0.0033,
1062
- "step": 84500
1063
- },
1064
- {
1065
- "epoch": 5.26,
1066
- "learning_rate": 1.47881119315009e-05,
1067
- "loss": 0.0039,
1068
- "step": 85000
1069
- },
1070
- {
1071
- "epoch": 5.29,
1072
- "learning_rate": 1.4757088788236025e-05,
1073
- "loss": 0.0036,
1074
- "step": 85500
1075
- },
1076
- {
1077
- "epoch": 5.32,
1078
- "learning_rate": 1.4726065644971151e-05,
1079
- "loss": 0.0039,
1080
- "step": 86000
1081
- },
1082
- {
1083
- "epoch": 5.35,
1084
- "learning_rate": 1.4695042501706274e-05,
1085
- "loss": 0.0035,
1086
- "step": 86500
1087
- },
1088
- {
1089
- "epoch": 5.38,
1090
- "learning_rate": 1.46640193584414e-05,
1091
- "loss": 0.0034,
1092
- "step": 87000
1093
- },
1094
- {
1095
- "epoch": 5.41,
1096
- "learning_rate": 1.4632996215176522e-05,
1097
- "loss": 0.003,
1098
- "step": 87500
1099
- },
1100
- {
1101
- "epoch": 5.44,
1102
- "learning_rate": 1.4601973071911648e-05,
1103
- "loss": 0.0034,
1104
- "step": 88000
1105
- },
1106
- {
1107
- "epoch": 5.47,
1108
- "learning_rate": 1.4570949928646771e-05,
1109
- "loss": 0.0045,
1110
- "step": 88500
1111
- },
1112
- {
1113
- "epoch": 5.51,
1114
- "learning_rate": 1.4539926785381897e-05,
1115
- "loss": 0.0036,
1116
- "step": 89000
1117
- },
1118
- {
1119
- "epoch": 5.54,
1120
- "learning_rate": 1.450890364211702e-05,
1121
- "loss": 0.0036,
1122
- "step": 89500
1123
- },
1124
- {
1125
- "epoch": 5.57,
1126
- "learning_rate": 1.4477880498852146e-05,
1127
- "loss": 0.0037,
1128
- "step": 90000
1129
- },
1130
- {
1131
- "epoch": 5.6,
1132
- "learning_rate": 1.444685735558727e-05,
1133
- "loss": 0.0035,
1134
- "step": 90500
1135
- },
1136
- {
1137
- "epoch": 5.63,
1138
- "learning_rate": 1.4415834212322393e-05,
1139
- "loss": 0.0037,
1140
- "step": 91000
1141
- },
1142
- {
1143
- "epoch": 5.66,
1144
- "learning_rate": 1.4384811069057519e-05,
1145
- "loss": 0.0037,
1146
- "step": 91500
1147
- },
1148
- {
1149
- "epoch": 5.69,
1150
- "learning_rate": 1.4353787925792641e-05,
1151
- "loss": 0.0035,
1152
- "step": 92000
1153
- },
1154
- {
1155
- "epoch": 5.72,
1156
- "learning_rate": 1.4322764782527767e-05,
1157
- "loss": 0.0039,
1158
- "step": 92500
1159
- },
1160
- {
1161
- "epoch": 5.75,
1162
- "learning_rate": 1.429174163926289e-05,
1163
- "loss": 0.0038,
1164
- "step": 93000
1165
- },
1166
- {
1167
- "epoch": 5.78,
1168
- "learning_rate": 1.4260718495998016e-05,
1169
- "loss": 0.003,
1170
- "step": 93500
1171
- },
1172
- {
1173
- "epoch": 5.81,
1174
- "learning_rate": 1.4229695352733139e-05,
1175
- "loss": 0.0033,
1176
- "step": 94000
1177
- },
1178
- {
1179
- "epoch": 5.85,
1180
- "learning_rate": 1.4198672209468265e-05,
1181
- "loss": 0.004,
1182
- "step": 94500
1183
- },
1184
- {
1185
- "epoch": 5.88,
1186
- "learning_rate": 1.4167649066203389e-05,
1187
- "loss": 0.0031,
1188
- "step": 95000
1189
- },
1190
- {
1191
- "epoch": 5.91,
1192
- "learning_rate": 1.4136625922938513e-05,
1193
- "loss": 0.0034,
1194
- "step": 95500
1195
- },
1196
- {
1197
- "epoch": 5.94,
1198
- "learning_rate": 1.4105602779673638e-05,
1199
- "loss": 0.0032,
1200
- "step": 96000
1201
- },
1202
- {
1203
- "epoch": 5.97,
1204
- "learning_rate": 1.4074579636408762e-05,
1205
- "loss": 0.0036,
1206
- "step": 96500
1207
- },
1208
- {
1209
- "epoch": 6.0,
1210
- "learning_rate": 1.4043556493143886e-05,
1211
- "loss": 0.0035,
1212
- "step": 97000
1213
- },
1214
- {
1215
- "epoch": 6.0,
1216
- "eval_loss": 0.004906239919364452,
1217
- "eval_runtime": 63.3798,
1218
- "eval_samples_per_second": 113.364,
1219
- "eval_steps_per_second": 7.1,
1220
- "step": 97002
1221
- }
1222
- ],
1223
- "max_steps": 323340,
1224
- "num_train_epochs": 20,
1225
- "total_flos": 2.90069697134592e+17,
1226
- "trial_name": null,
1227
- "trial_params": null
1228
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
large-1/checkpoint-97002/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3edc56f0f0e1d7be1ce3265d43ba3e04b7d248bf3e6aeba8adfbe111cabeb09
3
- size 4027