antonpolishko commited on
Commit
365078b
·
verified ·
1 Parent(s): fc7c8ac

Training in progress, epoch 2, checkpoint

Browse files
last-checkpoint/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:005461edf1879aad10a0e74fa8063b899d4dc153c4c4ee2ee870ec6ab02d0724
3
  size 4903351912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3da40a5e2cd69e35db1c4bfd34661796a25ac71a510cbccbfb7b36b7caf9025a
3
  size 4903351912
last-checkpoint/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c242ad1a6b91bc1ab29f05b1aa380973a69a8aa79ef7deed8c29db48896f89f5
3
  size 4947570872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3d4d922ecb0fbc025a2f903f492c8baa33ea8cceb3dc068450e7e0b6518684e
3
  size 4947570872
last-checkpoint/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30f6cf7f219f0d8b2be6e68425eaf8b45e404706a2998972e580bfbce5114648
3
  size 4962221464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c5d33e285bcbf98218ba55bce7efbdd306b8046089737b99aa4fe5c5e7ebea5
3
  size 4962221464
last-checkpoint/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3de7dc324edd38a81977296cbbf841bca1256ee41679022983816e95b863604d
3
  size 3670322200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:261c82a71bbdefce5c6b82990717f18e2f9cb6c02b8e8a09188d4f81d1e757ea
3
  size 3670322200
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b3f2db12fa449ee60e14ed19e4869728b4fe4cc9bf5a3ba552cf15c1e8a0861
3
  size 36967230034
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca3ba1546a764910d034f0ce3e93370a84a16db802bc17eb73ebc48babbb424c
3
  size 36967230034
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74386f26f36ed67f56395205881e5db2d0c28ffcbeed50dd95b28771d2dac588
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41c88f9de084200454883a13c3717941ea3fd433e2f8735507fc30611f9c5501
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:965b00d4cb4710ebab57c8787b9925bb3f77b8eeba94a186ec4bc1c2f326ef3f
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5dc374b8b9a4c45c950f9d136feab85a767081fa59f0c7d68ed3a62060c4949
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c7c212fb779217f1edac0baf44f67b608eefc1e0e4e3f5a9dd7eb557032c1bc
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86e1effd626ce1e95dd68a0c8089fe19218f2b24dfe9e45ed2cab1c0ebc10ba1
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:799cc83f60dfc1c4243cfd6403592112414a2eec494e6832f10221c96ff62c20
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:586777c398770c3255d3a1f48c7fef44ea9d89117c627c9ea490e16bfd9a49ba
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f709d9064ea54be0c4294c2790b947f9f308ff4a7e82014074b3bee9d133837
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5713040e0c397c4c3f4a5dd5b82184ec7e9ce43fb4fc625e2c829d6f4609bb2d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 300,
6
- "global_step": 273,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -392,6 +392,399 @@
392
  "learning_rate": 1.5121454937319975e-06,
393
  "loss": 4.9708,
394
  "step": 270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
  }
396
  ],
397
  "logging_steps": 5,
@@ -411,7 +804,7 @@
411
  "attributes": {}
412
  }
413
  },
414
- "total_flos": 1.7871723276837847e+18,
415
  "train_batch_size": 8,
416
  "trial_name": null,
417
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
  "eval_steps": 300,
6
+ "global_step": 546,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
392
  "learning_rate": 1.5121454937319975e-06,
393
  "loss": 4.9708,
394
  "step": 270
395
+ },
396
+ {
397
+ "epoch": 1.0073260073260073,
398
+ "grad_norm": 3.9375,
399
+ "learning_rate": 1.4955587465513128e-06,
400
+ "loss": 4.9494,
401
+ "step": 275
402
+ },
403
+ {
404
+ "epoch": 1.0256410256410255,
405
+ "grad_norm": 3.703125,
406
+ "learning_rate": 1.4787892672278555e-06,
407
+ "loss": 4.9172,
408
+ "step": 280
409
+ },
410
+ {
411
+ "epoch": 1.043956043956044,
412
+ "grad_norm": 3.515625,
413
+ "learning_rate": 1.461843239333021e-06,
414
+ "loss": 4.9262,
415
+ "step": 285
416
+ },
417
+ {
418
+ "epoch": 1.0622710622710623,
419
+ "grad_norm": 3.75,
420
+ "learning_rate": 1.444726911538657e-06,
421
+ "loss": 4.9115,
422
+ "step": 290
423
+ },
424
+ {
425
+ "epoch": 1.0805860805860805,
426
+ "grad_norm": 3.671875,
427
+ "learning_rate": 1.4274465953129325e-06,
428
+ "loss": 4.8926,
429
+ "step": 295
430
+ },
431
+ {
432
+ "epoch": 1.098901098901099,
433
+ "grad_norm": 3.625,
434
+ "learning_rate": 1.4100086625930462e-06,
435
+ "loss": 4.8845,
436
+ "step": 300
437
+ },
438
+ {
439
+ "epoch": 1.098901098901099,
440
+ "eval_loss": 4.895308017730713,
441
+ "eval_runtime": 14.5245,
442
+ "eval_samples_per_second": 36.421,
443
+ "eval_steps_per_second": 1.17,
444
+ "step": 300
445
+ },
446
+ {
447
+ "epoch": 1.1172161172161172,
448
+ "grad_norm": 3.5,
449
+ "learning_rate": 1.3924195434356441e-06,
450
+ "loss": 4.8754,
451
+ "step": 305
452
+ },
453
+ {
454
+ "epoch": 1.1355311355311355,
455
+ "grad_norm": 3.484375,
456
+ "learning_rate": 1.3746857236458005e-06,
457
+ "loss": 4.8566,
458
+ "step": 310
459
+ },
460
+ {
461
+ "epoch": 1.1538461538461537,
462
+ "grad_norm": 3.875,
463
+ "learning_rate": 1.3568137423854457e-06,
464
+ "loss": 4.873,
465
+ "step": 315
466
+ },
467
+ {
468
+ "epoch": 1.1721611721611722,
469
+ "grad_norm": 3.515625,
470
+ "learning_rate": 1.3388101897621183e-06,
471
+ "loss": 4.8476,
472
+ "step": 320
473
+ },
474
+ {
475
+ "epoch": 1.1904761904761905,
476
+ "grad_norm": 4.0,
477
+ "learning_rate": 1.32068170439893e-06,
478
+ "loss": 4.8846,
479
+ "step": 325
480
+ },
481
+ {
482
+ "epoch": 1.2087912087912087,
483
+ "grad_norm": 3.453125,
484
+ "learning_rate": 1.3024349709866448e-06,
485
+ "loss": 4.864,
486
+ "step": 330
487
+ },
488
+ {
489
+ "epoch": 1.2271062271062272,
490
+ "grad_norm": 5.125,
491
+ "learning_rate": 1.2840767178187654e-06,
492
+ "loss": 4.8457,
493
+ "step": 335
494
+ },
495
+ {
496
+ "epoch": 1.2454212454212454,
497
+ "grad_norm": 3.828125,
498
+ "learning_rate": 1.265613714310548e-06,
499
+ "loss": 4.8469,
500
+ "step": 340
501
+ },
502
+ {
503
+ "epoch": 1.2637362637362637,
504
+ "grad_norm": 3.59375,
505
+ "learning_rate": 1.2470527685028482e-06,
506
+ "loss": 4.8406,
507
+ "step": 345
508
+ },
509
+ {
510
+ "epoch": 1.282051282051282,
511
+ "grad_norm": 3.609375,
512
+ "learning_rate": 1.228400724551728e-06,
513
+ "loss": 4.8545,
514
+ "step": 350
515
+ },
516
+ {
517
+ "epoch": 1.3003663003663004,
518
+ "grad_norm": 3.578125,
519
+ "learning_rate": 1.2096644602047445e-06,
520
+ "loss": 4.8236,
521
+ "step": 355
522
+ },
523
+ {
524
+ "epoch": 1.3186813186813187,
525
+ "grad_norm": 3.515625,
526
+ "learning_rate": 1.1908508842648505e-06,
527
+ "loss": 4.8126,
528
+ "step": 360
529
+ },
530
+ {
531
+ "epoch": 1.3369963369963371,
532
+ "grad_norm": 3.453125,
533
+ "learning_rate": 1.171966934042847e-06,
534
+ "loss": 4.8038,
535
+ "step": 365
536
+ },
537
+ {
538
+ "epoch": 1.3553113553113554,
539
+ "grad_norm": 3.484375,
540
+ "learning_rate": 1.1530195727993199e-06,
541
+ "loss": 4.8278,
542
+ "step": 370
543
+ },
544
+ {
545
+ "epoch": 1.3736263736263736,
546
+ "grad_norm": 3.53125,
547
+ "learning_rate": 1.1340157871770115e-06,
548
+ "loss": 4.8217,
549
+ "step": 375
550
+ },
551
+ {
552
+ "epoch": 1.3919413919413919,
553
+ "grad_norm": 3.6875,
554
+ "learning_rate": 1.1149625846245681e-06,
555
+ "loss": 4.7912,
556
+ "step": 380
557
+ },
558
+ {
559
+ "epoch": 1.4102564102564101,
560
+ "grad_norm": 3.515625,
561
+ "learning_rate": 1.095866990812615e-06,
562
+ "loss": 4.8004,
563
+ "step": 385
564
+ },
565
+ {
566
+ "epoch": 1.4285714285714286,
567
+ "grad_norm": 3.5625,
568
+ "learning_rate": 1.0767360470431157e-06,
569
+ "loss": 4.7896,
570
+ "step": 390
571
+ },
572
+ {
573
+ "epoch": 1.4468864468864469,
574
+ "grad_norm": 3.625,
575
+ "learning_rate": 1.0575768076529625e-06,
576
+ "loss": 4.7788,
577
+ "step": 395
578
+ },
579
+ {
580
+ "epoch": 1.4652014652014653,
581
+ "grad_norm": 3.640625,
582
+ "learning_rate": 1.0383963374127645e-06,
583
+ "loss": 4.8106,
584
+ "step": 400
585
+ },
586
+ {
587
+ "epoch": 1.4835164835164836,
588
+ "grad_norm": 3.71875,
589
+ "learning_rate": 1.0192017089217861e-06,
590
+ "loss": 4.7749,
591
+ "step": 405
592
+ },
593
+ {
594
+ "epoch": 1.5018315018315018,
595
+ "grad_norm": 3.484375,
596
+ "learning_rate": 1e-06,
597
+ "loss": 4.7864,
598
+ "step": 410
599
+ },
600
+ {
601
+ "epoch": 1.52014652014652,
602
+ "grad_norm": 3.59375,
603
+ "learning_rate": 9.80798291078214e-07,
604
+ "loss": 4.7869,
605
+ "step": 415
606
+ },
607
+ {
608
+ "epoch": 1.5384615384615383,
609
+ "grad_norm": 3.625,
610
+ "learning_rate": 9.616036625872356e-07,
611
+ "loss": 4.8314,
612
+ "step": 420
613
+ },
614
+ {
615
+ "epoch": 1.5567765567765568,
616
+ "grad_norm": 3.734375,
617
+ "learning_rate": 9.424231923470376e-07,
618
+ "loss": 4.7831,
619
+ "step": 425
620
+ },
621
+ {
622
+ "epoch": 1.575091575091575,
623
+ "grad_norm": 3.578125,
624
+ "learning_rate": 9.232639529568842e-07,
625
+ "loss": 4.7636,
626
+ "step": 430
627
+ },
628
+ {
629
+ "epoch": 1.5934065934065935,
630
+ "grad_norm": 3.71875,
631
+ "learning_rate": 9.041330091873851e-07,
632
+ "loss": 4.7465,
633
+ "step": 435
634
+ },
635
+ {
636
+ "epoch": 1.6117216117216118,
637
+ "grad_norm": 3.546875,
638
+ "learning_rate": 8.850374153754321e-07,
639
+ "loss": 4.7616,
640
+ "step": 440
641
+ },
642
+ {
643
+ "epoch": 1.63003663003663,
644
+ "grad_norm": 3.46875,
645
+ "learning_rate": 8.659842128229886e-07,
646
+ "loss": 4.7542,
647
+ "step": 445
648
+ },
649
+ {
650
+ "epoch": 1.6483516483516483,
651
+ "grad_norm": 3.5,
652
+ "learning_rate": 8.4698042720068e-07,
653
+ "loss": 4.745,
654
+ "step": 450
655
+ },
656
+ {
657
+ "epoch": 1.6666666666666665,
658
+ "grad_norm": 3.921875,
659
+ "learning_rate": 8.280330659571531e-07,
660
+ "loss": 4.738,
661
+ "step": 455
662
+ },
663
+ {
664
+ "epoch": 1.684981684981685,
665
+ "grad_norm": 3.75,
666
+ "learning_rate": 8.091491157351493e-07,
667
+ "loss": 4.7335,
668
+ "step": 460
669
+ },
670
+ {
671
+ "epoch": 1.7032967032967035,
672
+ "grad_norm": 3.546875,
673
+ "learning_rate": 7.903355397952556e-07,
674
+ "loss": 4.7792,
675
+ "step": 465
676
+ },
677
+ {
678
+ "epoch": 1.7216117216117217,
679
+ "grad_norm": 3.546875,
680
+ "learning_rate": 7.715992754482718e-07,
681
+ "loss": 4.7288,
682
+ "step": 470
683
+ },
684
+ {
685
+ "epoch": 1.73992673992674,
686
+ "grad_norm": 3.515625,
687
+ "learning_rate": 7.529472314971522e-07,
688
+ "loss": 4.7489,
689
+ "step": 475
690
+ },
691
+ {
692
+ "epoch": 1.7582417582417582,
693
+ "grad_norm": 3.484375,
694
+ "learning_rate": 7.34386285689452e-07,
695
+ "loss": 4.7163,
696
+ "step": 480
697
+ },
698
+ {
699
+ "epoch": 1.7765567765567765,
700
+ "grad_norm": 3.828125,
701
+ "learning_rate": 7.159232821812347e-07,
702
+ "loss": 4.7201,
703
+ "step": 485
704
+ },
705
+ {
706
+ "epoch": 1.7948717948717947,
707
+ "grad_norm": 3.5625,
708
+ "learning_rate": 6.975650290133554e-07,
709
+ "loss": 4.7334,
710
+ "step": 490
711
+ },
712
+ {
713
+ "epoch": 1.8131868131868132,
714
+ "grad_norm": 3.609375,
715
+ "learning_rate": 6.793182956010699e-07,
716
+ "loss": 4.7339,
717
+ "step": 495
718
+ },
719
+ {
720
+ "epoch": 1.8315018315018317,
721
+ "grad_norm": 3.484375,
722
+ "learning_rate": 6.611898102378818e-07,
723
+ "loss": 4.7384,
724
+ "step": 500
725
+ },
726
+ {
727
+ "epoch": 1.84981684981685,
728
+ "grad_norm": 3.546875,
729
+ "learning_rate": 6.431862576145544e-07,
730
+ "loss": 4.7402,
731
+ "step": 505
732
+ },
733
+ {
734
+ "epoch": 1.8681318681318682,
735
+ "grad_norm": 3.703125,
736
+ "learning_rate": 6.253142763541995e-07,
737
+ "loss": 4.723,
738
+ "step": 510
739
+ },
740
+ {
741
+ "epoch": 1.8864468864468864,
742
+ "grad_norm": 3.5,
743
+ "learning_rate": 6.075804565643561e-07,
744
+ "loss": 4.7457,
745
+ "step": 515
746
+ },
747
+ {
748
+ "epoch": 1.9047619047619047,
749
+ "grad_norm": 3.5625,
750
+ "learning_rate": 5.899913374069538e-07,
751
+ "loss": 4.712,
752
+ "step": 520
753
+ },
754
+ {
755
+ "epoch": 1.9230769230769231,
756
+ "grad_norm": 3.703125,
757
+ "learning_rate": 5.725534046870677e-07,
758
+ "loss": 4.7273,
759
+ "step": 525
760
+ },
761
+ {
762
+ "epoch": 1.9413919413919414,
763
+ "grad_norm": 3.5625,
764
+ "learning_rate": 5.552730884613428e-07,
765
+ "loss": 4.6981,
766
+ "step": 530
767
+ },
768
+ {
769
+ "epoch": 1.9597069597069599,
770
+ "grad_norm": 3.609375,
771
+ "learning_rate": 5.381567606669793e-07,
772
+ "loss": 4.7482,
773
+ "step": 535
774
+ },
775
+ {
776
+ "epoch": 1.978021978021978,
777
+ "grad_norm": 3.765625,
778
+ "learning_rate": 5.212107327721445e-07,
779
+ "loss": 4.718,
780
+ "step": 540
781
+ },
782
+ {
783
+ "epoch": 1.9963369963369964,
784
+ "grad_norm": 3.6875,
785
+ "learning_rate": 5.044412534486873e-07,
786
+ "loss": 4.7148,
787
+ "step": 545
788
  }
789
  ],
790
  "logging_steps": 5,
 
804
  "attributes": {}
805
  }
806
  },
807
+ "total_flos": 3.574344656441311e+18,
808
  "train_batch_size": 8,
809
  "trial_name": null,
810
  "trial_params": null