antonpolishko commited on
Commit
6933117
·
verified ·
1 Parent(s): 9276722

Training in progress, epoch 2, checkpoint

Browse files
last-checkpoint/model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f1f58242f921ff5cabef15e7b456ddab6ee5c2492aca332696d9923220d554e
3
  size 4949453792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35e7bd1801d0d4245d02fcc221540d160d9ca4436762abd40ab696436db37997
3
  size 4949453792
last-checkpoint/model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aae85a730f6054c602af36fcff51046f218fc5b35a76b78b3fce17eeaa541d82
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fac833f220a4de94d80f1c6128e85dbd0534b37e1778d74f64d089406b9a9cd
3
  size 4999819336
last-checkpoint/model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa43ebd02383e8d9c58aac2b25bfe0e394870e4024e4ba4da7f830fa19be0f07
3
  size 4546807800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5d6ebf26b088bbc6f219afaf49ff803ae69e0485761a48a6794d7aaaccc4eb7
3
  size 4546807800
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:082b179ff93c4a9aff94779ffadea379af9c2834f7256314cddc7667645a41dc
3
  size 28992348490
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84bf6c04da13948bebd4d30d4b13cd682886c3f4762b4f85e343d6b5fe8ad40e
3
  size 28992348490
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78d3f197f6c6558fa8056324f1563ab9e957255f5a1a959362aa4eed7a9545db
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06fea830cf5ad73ec00d500ea6fb952740ac936f18e93fa2d32abde1ea3ead92
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c1a9c65c2869356282cad6b4a0f7dff7f4dd68ab3d9d216c72b7d6cb524f860
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be561d1df19be227394d8ea607c54262a06c9bf880af0aa5e04a52596a2a6cb0
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:896febe768e17bae5022a95960c041f6425783774ec8859d99d3b149063b1bf9
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03f3e24417a59435f5a8450a4aeb0f09cc92734b5c3b45a0701b2c043c415c05
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eac482d57e966585467c8ef44dae2869bf7e5d92886f69c11ed7bccc34c07efe
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bea02744c29f30024590ab1629a0e7b7dabbf1e8476456c2e7c5ce46dc35c28
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1f27d227a20dc320ac283e0938fb2f6e5b475829a583f8c44d1a16a8c828307
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:041be966454b60c86af576fc1eb7f34189114689abff8f9622b947110f7334c8
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d05a7106aaeaec4b81704e3f4a998b5123cf9342a6733bd9fd2d578e99108c3b
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b85766f6596d15a810177d77dd259d9b50588cf100ec5f8ebff5fed881d57957
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b94120d8d88502ec8d8b623ec7550315caca003b44fcffbb5767ab0de91baefe
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8be75d04b1ebe614241b88fd010a5dda1b7bf703c00c6ebe310ca07975830fe7
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:332e4d901be380f740b5d8578f7b80ef1865c7fba83bc288c8a35852205cc668
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4699833a7ab4cb692996ef7567f934c0bac79d6a067963a873f89a38e412bd48
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:092f3dcf63385897e30ca4d02c4ae9c2eac8f7e2e0b5f3c908bbed7efb16cffd
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a23384b1a4df8f5cde36ecc73a9742d7fae5940c0f154d6cc580286c571d0dba
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 300,
6
- "global_step": 481,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -694,6 +694,694 @@
694
  "learning_rate": 1.5043989923528937e-06,
695
  "loss": 1.2183,
696
  "step": 480
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  }
698
  ],
699
  "logging_steps": 5,
@@ -713,7 +1401,7 @@
713
  "attributes": {}
714
  }
715
  },
716
- "total_flos": 2.690966406601638e+18,
717
  "train_batch_size": 8,
718
  "trial_name": null,
719
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
  "eval_steps": 300,
6
+ "global_step": 962,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
694
  "learning_rate": 1.5043989923528937e-06,
695
  "loss": 1.2183,
696
  "step": 480
697
+ },
698
+ {
699
+ "epoch": 1.0083160083160083,
700
+ "grad_norm": 3.453125,
701
+ "learning_rate": 1.4949567433771448e-06,
702
+ "loss": 1.1826,
703
+ "step": 485
704
+ },
705
+ {
706
+ "epoch": 1.0187110187110187,
707
+ "grad_norm": 3.609375,
708
+ "learning_rate": 1.4854556812111887e-06,
709
+ "loss": 1.1925,
710
+ "step": 490
711
+ },
712
+ {
713
+ "epoch": 1.0291060291060292,
714
+ "grad_norm": 3.5,
715
+ "learning_rate": 1.4758969348178766e-06,
716
+ "loss": 1.19,
717
+ "step": 495
718
+ },
719
+ {
720
+ "epoch": 1.0395010395010396,
721
+ "grad_norm": 3.53125,
722
+ "learning_rate": 1.4662816400143836e-06,
723
+ "loss": 1.1963,
724
+ "step": 500
725
+ },
726
+ {
727
+ "epoch": 1.04989604989605,
728
+ "grad_norm": 3.640625,
729
+ "learning_rate": 1.4566109393372433e-06,
730
+ "loss": 1.1872,
731
+ "step": 505
732
+ },
733
+ {
734
+ "epoch": 1.0602910602910602,
735
+ "grad_norm": 3.578125,
736
+ "learning_rate": 1.4468859819065882e-06,
737
+ "loss": 1.1833,
738
+ "step": 510
739
+ },
740
+ {
741
+ "epoch": 1.0706860706860706,
742
+ "grad_norm": 3.5625,
743
+ "learning_rate": 1.4371079232896044e-06,
744
+ "loss": 1.1815,
745
+ "step": 515
746
+ },
747
+ {
748
+ "epoch": 1.0810810810810811,
749
+ "grad_norm": 3.53125,
750
+ "learning_rate": 1.4272779253632212e-06,
751
+ "loss": 1.1855,
752
+ "step": 520
753
+ },
754
+ {
755
+ "epoch": 1.0914760914760915,
756
+ "grad_norm": 3.59375,
757
+ "learning_rate": 1.4173971561760518e-06,
758
+ "loss": 1.188,
759
+ "step": 525
760
+ },
761
+ {
762
+ "epoch": 1.1018711018711018,
763
+ "grad_norm": 3.609375,
764
+ "learning_rate": 1.4074667898096009e-06,
765
+ "loss": 1.1873,
766
+ "step": 530
767
+ },
768
+ {
769
+ "epoch": 1.1122661122661124,
770
+ "grad_norm": 3.53125,
771
+ "learning_rate": 1.397488006238752e-06,
772
+ "loss": 1.1945,
773
+ "step": 535
774
+ },
775
+ {
776
+ "epoch": 1.1226611226611227,
777
+ "grad_norm": 3.5,
778
+ "learning_rate": 1.387461991191559e-06,
779
+ "loss": 1.1856,
780
+ "step": 540
781
+ },
782
+ {
783
+ "epoch": 1.133056133056133,
784
+ "grad_norm": 3.609375,
785
+ "learning_rate": 1.3773899360083524e-06,
786
+ "loss": 1.1868,
787
+ "step": 545
788
+ },
789
+ {
790
+ "epoch": 1.1434511434511434,
791
+ "grad_norm": 3.609375,
792
+ "learning_rate": 1.3672730375001773e-06,
793
+ "loss": 1.1791,
794
+ "step": 550
795
+ },
796
+ {
797
+ "epoch": 1.1538461538461537,
798
+ "grad_norm": 3.65625,
799
+ "learning_rate": 1.357112497806582e-06,
800
+ "loss": 1.1969,
801
+ "step": 555
802
+ },
803
+ {
804
+ "epoch": 1.1642411642411643,
805
+ "grad_norm": 3.609375,
806
+ "learning_rate": 1.3469095242527764e-06,
807
+ "loss": 1.1828,
808
+ "step": 560
809
+ },
810
+ {
811
+ "epoch": 1.1746361746361746,
812
+ "grad_norm": 3.515625,
813
+ "learning_rate": 1.3366653292061682e-06,
814
+ "loss": 1.1803,
815
+ "step": 565
816
+ },
817
+ {
818
+ "epoch": 1.185031185031185,
819
+ "grad_norm": 3.53125,
820
+ "learning_rate": 1.3263811299323063e-06,
821
+ "loss": 1.1803,
822
+ "step": 570
823
+ },
824
+ {
825
+ "epoch": 1.1954261954261955,
826
+ "grad_norm": 3.6875,
827
+ "learning_rate": 1.3160581484502382e-06,
828
+ "loss": 1.1823,
829
+ "step": 575
830
+ },
831
+ {
832
+ "epoch": 1.2058212058212059,
833
+ "grad_norm": 3.71875,
834
+ "learning_rate": 1.3056976113873037e-06,
835
+ "loss": 1.1832,
836
+ "step": 580
837
+ },
838
+ {
839
+ "epoch": 1.2162162162162162,
840
+ "grad_norm": 3.625,
841
+ "learning_rate": 1.2953007498333807e-06,
842
+ "loss": 1.1841,
843
+ "step": 585
844
+ },
845
+ {
846
+ "epoch": 1.2266112266112266,
847
+ "grad_norm": 3.59375,
848
+ "learning_rate": 1.284868799194602e-06,
849
+ "loss": 1.1804,
850
+ "step": 590
851
+ },
852
+ {
853
+ "epoch": 1.237006237006237,
854
+ "grad_norm": 3.5,
855
+ "learning_rate": 1.2744029990465574e-06,
856
+ "loss": 1.1811,
857
+ "step": 595
858
+ },
859
+ {
860
+ "epoch": 1.2474012474012475,
861
+ "grad_norm": 3.5625,
862
+ "learning_rate": 1.2639045929870018e-06,
863
+ "loss": 1.1794,
864
+ "step": 600
865
+ },
866
+ {
867
+ "epoch": 1.2474012474012475,
868
+ "eval_loss": 1.2036519050598145,
869
+ "eval_runtime": 10.8624,
870
+ "eval_samples_per_second": 85.616,
871
+ "eval_steps_per_second": 2.762,
872
+ "step": 600
873
+ },
874
+ {
875
+ "epoch": 1.2577962577962578,
876
+ "grad_norm": 3.609375,
877
+ "learning_rate": 1.2533748284880842e-06,
878
+ "loss": 1.1905,
879
+ "step": 605
880
+ },
881
+ {
882
+ "epoch": 1.2681912681912682,
883
+ "grad_norm": 3.5625,
884
+ "learning_rate": 1.2428149567481184e-06,
885
+ "loss": 1.1836,
886
+ "step": 610
887
+ },
888
+ {
889
+ "epoch": 1.2785862785862787,
890
+ "grad_norm": 3.5625,
891
+ "learning_rate": 1.2322262325429063e-06,
892
+ "loss": 1.1823,
893
+ "step": 615
894
+ },
895
+ {
896
+ "epoch": 1.288981288981289,
897
+ "grad_norm": 3.5625,
898
+ "learning_rate": 1.2216099140766436e-06,
899
+ "loss": 1.1836,
900
+ "step": 620
901
+ },
902
+ {
903
+ "epoch": 1.2993762993762994,
904
+ "grad_norm": 3.71875,
905
+ "learning_rate": 1.2109672628324104e-06,
906
+ "loss": 1.1837,
907
+ "step": 625
908
+ },
909
+ {
910
+ "epoch": 1.3097713097713097,
911
+ "grad_norm": 3.578125,
912
+ "learning_rate": 1.2002995434222767e-06,
913
+ "loss": 1.1827,
914
+ "step": 630
915
+ },
916
+ {
917
+ "epoch": 1.32016632016632,
918
+ "grad_norm": 3.6875,
919
+ "learning_rate": 1.1896080234370355e-06,
920
+ "loss": 1.1803,
921
+ "step": 635
922
+ },
923
+ {
924
+ "epoch": 1.3305613305613306,
925
+ "grad_norm": 3.515625,
926
+ "learning_rate": 1.178893973295581e-06,
927
+ "loss": 1.1788,
928
+ "step": 640
929
+ },
930
+ {
931
+ "epoch": 1.340956340956341,
932
+ "grad_norm": 3.640625,
933
+ "learning_rate": 1.1681586660939504e-06,
934
+ "loss": 1.1918,
935
+ "step": 645
936
+ },
937
+ {
938
+ "epoch": 1.3513513513513513,
939
+ "grad_norm": 3.578125,
940
+ "learning_rate": 1.1574033774540505e-06,
941
+ "loss": 1.1796,
942
+ "step": 650
943
+ },
944
+ {
945
+ "epoch": 1.3617463617463619,
946
+ "grad_norm": 3.65625,
947
+ "learning_rate": 1.1466293853720795e-06,
948
+ "loss": 1.1837,
949
+ "step": 655
950
+ },
951
+ {
952
+ "epoch": 1.3721413721413722,
953
+ "grad_norm": 3.609375,
954
+ "learning_rate": 1.1358379700666703e-06,
955
+ "loss": 1.1776,
956
+ "step": 660
957
+ },
958
+ {
959
+ "epoch": 1.3825363825363826,
960
+ "grad_norm": 3.546875,
961
+ "learning_rate": 1.1250304138267701e-06,
962
+ "loss": 1.181,
963
+ "step": 665
964
+ },
965
+ {
966
+ "epoch": 1.392931392931393,
967
+ "grad_norm": 3.6875,
968
+ "learning_rate": 1.11420800085927e-06,
969
+ "loss": 1.1764,
970
+ "step": 670
971
+ },
972
+ {
973
+ "epoch": 1.4033264033264032,
974
+ "grad_norm": 3.65625,
975
+ "learning_rate": 1.1033720171364108e-06,
976
+ "loss": 1.1792,
977
+ "step": 675
978
+ },
979
+ {
980
+ "epoch": 1.4137214137214138,
981
+ "grad_norm": 3.546875,
982
+ "learning_rate": 1.092523750242977e-06,
983
+ "loss": 1.1784,
984
+ "step": 680
985
+ },
986
+ {
987
+ "epoch": 1.4241164241164241,
988
+ "grad_norm": 3.65625,
989
+ "learning_rate": 1.0816644892232997e-06,
990
+ "loss": 1.1855,
991
+ "step": 685
992
+ },
993
+ {
994
+ "epoch": 1.4345114345114345,
995
+ "grad_norm": 3.640625,
996
+ "learning_rate": 1.070795524428086e-06,
997
+ "loss": 1.1782,
998
+ "step": 690
999
+ },
1000
+ {
1001
+ "epoch": 1.444906444906445,
1002
+ "grad_norm": 3.5625,
1003
+ "learning_rate": 1.0599181473610938e-06,
1004
+ "loss": 1.1837,
1005
+ "step": 695
1006
+ },
1007
+ {
1008
+ "epoch": 1.4553014553014554,
1009
+ "grad_norm": 3.59375,
1010
+ "learning_rate": 1.049033650525668e-06,
1011
+ "loss": 1.1786,
1012
+ "step": 700
1013
+ },
1014
+ {
1015
+ "epoch": 1.4656964656964657,
1016
+ "grad_norm": 3.5625,
1017
+ "learning_rate": 1.0381433272711585e-06,
1018
+ "loss": 1.1747,
1019
+ "step": 705
1020
+ },
1021
+ {
1022
+ "epoch": 1.476091476091476,
1023
+ "grad_norm": 3.71875,
1024
+ "learning_rate": 1.0272484716392408e-06,
1025
+ "loss": 1.1854,
1026
+ "step": 710
1027
+ },
1028
+ {
1029
+ "epoch": 1.4864864864864864,
1030
+ "grad_norm": 3.59375,
1031
+ "learning_rate": 1.0163503782101484e-06,
1032
+ "loss": 1.1755,
1033
+ "step": 715
1034
+ },
1035
+ {
1036
+ "epoch": 1.496881496881497,
1037
+ "grad_norm": 3.578125,
1038
+ "learning_rate": 1.0054503419488454e-06,
1039
+ "loss": 1.1795,
1040
+ "step": 720
1041
+ },
1042
+ {
1043
+ "epoch": 1.5072765072765073,
1044
+ "grad_norm": 3.578125,
1045
+ "learning_rate": 9.945496580511543e-07,
1046
+ "loss": 1.1846,
1047
+ "step": 725
1048
+ },
1049
+ {
1050
+ "epoch": 1.5176715176715176,
1051
+ "grad_norm": 3.6875,
1052
+ "learning_rate": 9.836496217898518e-07,
1053
+ "loss": 1.1806,
1054
+ "step": 730
1055
+ },
1056
+ {
1057
+ "epoch": 1.5280665280665282,
1058
+ "grad_norm": 3.609375,
1059
+ "learning_rate": 9.72751528360759e-07,
1060
+ "loss": 1.1667,
1061
+ "step": 735
1062
+ },
1063
+ {
1064
+ "epoch": 1.5384615384615383,
1065
+ "grad_norm": 3.65625,
1066
+ "learning_rate": 9.618566727288414e-07,
1067
+ "loss": 1.185,
1068
+ "step": 740
1069
+ },
1070
+ {
1071
+ "epoch": 1.5488565488565489,
1072
+ "grad_norm": 3.703125,
1073
+ "learning_rate": 9.509663494743321e-07,
1074
+ "loss": 1.191,
1075
+ "step": 745
1076
+ },
1077
+ {
1078
+ "epoch": 1.5592515592515592,
1079
+ "grad_norm": 4.9375,
1080
+ "learning_rate": 9.400818526389062e-07,
1081
+ "loss": 1.1769,
1082
+ "step": 750
1083
+ },
1084
+ {
1085
+ "epoch": 1.5696465696465696,
1086
+ "grad_norm": 3.546875,
1087
+ "learning_rate": 9.292044755719138e-07,
1088
+ "loss": 1.1741,
1089
+ "step": 755
1090
+ },
1091
+ {
1092
+ "epoch": 1.5800415800415801,
1093
+ "grad_norm": 3.546875,
1094
+ "learning_rate": 9.183355107767003e-07,
1095
+ "loss": 1.1771,
1096
+ "step": 760
1097
+ },
1098
+ {
1099
+ "epoch": 1.5904365904365905,
1100
+ "grad_norm": 3.59375,
1101
+ "learning_rate": 9.07476249757023e-07,
1102
+ "loss": 1.1725,
1103
+ "step": 765
1104
+ },
1105
+ {
1106
+ "epoch": 1.6008316008316008,
1107
+ "grad_norm": 3.59375,
1108
+ "learning_rate": 8.966279828635894e-07,
1109
+ "loss": 1.1801,
1110
+ "step": 770
1111
+ },
1112
+ {
1113
+ "epoch": 1.6112266112266114,
1114
+ "grad_norm": 3.609375,
1115
+ "learning_rate": 8.8579199914073e-07,
1116
+ "loss": 1.1697,
1117
+ "step": 775
1118
+ },
1119
+ {
1120
+ "epoch": 1.6216216216216215,
1121
+ "grad_norm": 3.546875,
1122
+ "learning_rate": 8.749695861732299e-07,
1123
+ "loss": 1.175,
1124
+ "step": 780
1125
+ },
1126
+ {
1127
+ "epoch": 1.632016632016632,
1128
+ "grad_norm": 3.578125,
1129
+ "learning_rate": 8.641620299333295e-07,
1130
+ "loss": 1.1818,
1131
+ "step": 785
1132
+ },
1133
+ {
1134
+ "epoch": 1.6424116424116424,
1135
+ "grad_norm": 3.65625,
1136
+ "learning_rate": 8.533706146279207e-07,
1137
+ "loss": 1.1697,
1138
+ "step": 790
1139
+ },
1140
+ {
1141
+ "epoch": 1.6528066528066527,
1142
+ "grad_norm": 3.59375,
1143
+ "learning_rate": 8.425966225459493e-07,
1144
+ "loss": 1.1702,
1145
+ "step": 795
1146
+ },
1147
+ {
1148
+ "epoch": 1.6632016632016633,
1149
+ "grad_norm": 3.65625,
1150
+ "learning_rate": 8.318413339060495e-07,
1151
+ "loss": 1.1793,
1152
+ "step": 800
1153
+ },
1154
+ {
1155
+ "epoch": 1.6735966735966736,
1156
+ "grad_norm": 3.65625,
1157
+ "learning_rate": 8.21106026704419e-07,
1158
+ "loss": 1.1801,
1159
+ "step": 805
1160
+ },
1161
+ {
1162
+ "epoch": 1.683991683991684,
1163
+ "grad_norm": 3.59375,
1164
+ "learning_rate": 8.103919765629645e-07,
1165
+ "loss": 1.1797,
1166
+ "step": 810
1167
+ },
1168
+ {
1169
+ "epoch": 1.6943866943866945,
1170
+ "grad_norm": 3.5625,
1171
+ "learning_rate": 7.997004565777233e-07,
1172
+ "loss": 1.1659,
1173
+ "step": 815
1174
+ },
1175
+ {
1176
+ "epoch": 1.7047817047817047,
1177
+ "grad_norm": 3.671875,
1178
+ "learning_rate": 7.890327371675895e-07,
1179
+ "loss": 1.1743,
1180
+ "step": 820
1181
+ },
1182
+ {
1183
+ "epoch": 1.7151767151767152,
1184
+ "grad_norm": 3.671875,
1185
+ "learning_rate": 7.783900859233562e-07,
1186
+ "loss": 1.1849,
1187
+ "step": 825
1188
+ },
1189
+ {
1190
+ "epoch": 1.7255717255717256,
1191
+ "grad_norm": 3.625,
1192
+ "learning_rate": 7.677737674570936e-07,
1193
+ "loss": 1.1791,
1194
+ "step": 830
1195
+ },
1196
+ {
1197
+ "epoch": 1.735966735966736,
1198
+ "grad_norm": 3.578125,
1199
+ "learning_rate": 7.571850432518819e-07,
1200
+ "loss": 1.1789,
1201
+ "step": 835
1202
+ },
1203
+ {
1204
+ "epoch": 1.7463617463617465,
1205
+ "grad_norm": 3.5625,
1206
+ "learning_rate": 7.466251715119156e-07,
1207
+ "loss": 1.1773,
1208
+ "step": 840
1209
+ },
1210
+ {
1211
+ "epoch": 1.7567567567567568,
1212
+ "grad_norm": 3.609375,
1213
+ "learning_rate": 7.360954070129981e-07,
1214
+ "loss": 1.175,
1215
+ "step": 845
1216
+ },
1217
+ {
1218
+ "epoch": 1.7671517671517671,
1219
+ "grad_norm": 3.609375,
1220
+ "learning_rate": 7.255970009534425e-07,
1221
+ "loss": 1.1816,
1222
+ "step": 850
1223
+ },
1224
+ {
1225
+ "epoch": 1.7775467775467777,
1226
+ "grad_norm": 3.734375,
1227
+ "learning_rate": 7.151312008053979e-07,
1228
+ "loss": 1.1848,
1229
+ "step": 855
1230
+ },
1231
+ {
1232
+ "epoch": 1.7879417879417878,
1233
+ "grad_norm": 3.5625,
1234
+ "learning_rate": 7.046992501666195e-07,
1235
+ "loss": 1.175,
1236
+ "step": 860
1237
+ },
1238
+ {
1239
+ "epoch": 1.7983367983367984,
1240
+ "grad_norm": 3.625,
1241
+ "learning_rate": 6.943023886126965e-07,
1242
+ "loss": 1.1753,
1243
+ "step": 865
1244
+ },
1245
+ {
1246
+ "epoch": 1.8087318087318087,
1247
+ "grad_norm": 3.5625,
1248
+ "learning_rate": 6.839418515497618e-07,
1249
+ "loss": 1.1718,
1250
+ "step": 870
1251
+ },
1252
+ {
1253
+ "epoch": 1.819126819126819,
1254
+ "grad_norm": 3.59375,
1255
+ "learning_rate": 6.736188700676935e-07,
1256
+ "loss": 1.164,
1257
+ "step": 875
1258
+ },
1259
+ {
1260
+ "epoch": 1.8295218295218296,
1261
+ "grad_norm": 3.59375,
1262
+ "learning_rate": 6.633346707938319e-07,
1263
+ "loss": 1.1761,
1264
+ "step": 880
1265
+ },
1266
+ {
1267
+ "epoch": 1.83991683991684,
1268
+ "grad_norm": 3.671875,
1269
+ "learning_rate": 6.530904757472236e-07,
1270
+ "loss": 1.1869,
1271
+ "step": 885
1272
+ },
1273
+ {
1274
+ "epoch": 1.8503118503118503,
1275
+ "grad_norm": 3.59375,
1276
+ "learning_rate": 6.42887502193418e-07,
1277
+ "loss": 1.1836,
1278
+ "step": 890
1279
+ },
1280
+ {
1281
+ "epoch": 1.8607068607068609,
1282
+ "grad_norm": 3.578125,
1283
+ "learning_rate": 6.327269624998227e-07,
1284
+ "loss": 1.1699,
1285
+ "step": 895
1286
+ },
1287
+ {
1288
+ "epoch": 1.871101871101871,
1289
+ "grad_norm": 3.65625,
1290
+ "learning_rate": 6.226100639916474e-07,
1291
+ "loss": 1.1743,
1292
+ "step": 900
1293
+ },
1294
+ {
1295
+ "epoch": 1.871101871101871,
1296
+ "eval_loss": 1.1942965984344482,
1297
+ "eval_runtime": 10.786,
1298
+ "eval_samples_per_second": 86.223,
1299
+ "eval_steps_per_second": 2.781,
1300
+ "step": 900
1301
+ },
1302
+ {
1303
+ "epoch": 1.8814968814968815,
1304
+ "grad_norm": 3.53125,
1305
+ "learning_rate": 6.125380088084408e-07,
1306
+ "loss": 1.1797,
1307
+ "step": 905
1308
+ },
1309
+ {
1310
+ "epoch": 1.8918918918918919,
1311
+ "grad_norm": 3.71875,
1312
+ "learning_rate": 6.025119937612481e-07,
1313
+ "loss": 1.1758,
1314
+ "step": 910
1315
+ },
1316
+ {
1317
+ "epoch": 1.9022869022869022,
1318
+ "grad_norm": 3.65625,
1319
+ "learning_rate": 5.925332101903994e-07,
1320
+ "loss": 1.1783,
1321
+ "step": 915
1322
+ },
1323
+ {
1324
+ "epoch": 1.9126819126819128,
1325
+ "grad_norm": 3.71875,
1326
+ "learning_rate": 5.826028438239479e-07,
1327
+ "loss": 1.1763,
1328
+ "step": 920
1329
+ },
1330
+ {
1331
+ "epoch": 1.9230769230769231,
1332
+ "grad_norm": 3.546875,
1333
+ "learning_rate": 5.727220746367791e-07,
1334
+ "loss": 1.18,
1335
+ "step": 925
1336
+ },
1337
+ {
1338
+ "epoch": 1.9334719334719335,
1339
+ "grad_norm": 3.546875,
1340
+ "learning_rate": 5.628920767103957e-07,
1341
+ "loss": 1.1782,
1342
+ "step": 930
1343
+ },
1344
+ {
1345
+ "epoch": 1.943866943866944,
1346
+ "grad_norm": 3.65625,
1347
+ "learning_rate": 5.531140180934119e-07,
1348
+ "loss": 1.1772,
1349
+ "step": 935
1350
+ },
1351
+ {
1352
+ "epoch": 1.9542619542619541,
1353
+ "grad_norm": 3.546875,
1354
+ "learning_rate": 5.433890606627568e-07,
1355
+ "loss": 1.175,
1356
+ "step": 940
1357
+ },
1358
+ {
1359
+ "epoch": 1.9646569646569647,
1360
+ "grad_norm": 3.59375,
1361
+ "learning_rate": 5.337183599856164e-07,
1362
+ "loss": 1.1745,
1363
+ "step": 945
1364
+ },
1365
+ {
1366
+ "epoch": 1.975051975051975,
1367
+ "grad_norm": 3.5625,
1368
+ "learning_rate": 5.241030651821231e-07,
1369
+ "loss": 1.1662,
1370
+ "step": 950
1371
+ },
1372
+ {
1373
+ "epoch": 1.9854469854469854,
1374
+ "grad_norm": 3.609375,
1375
+ "learning_rate": 5.145443187888114e-07,
1376
+ "loss": 1.171,
1377
+ "step": 955
1378
+ },
1379
+ {
1380
+ "epoch": 1.995841995841996,
1381
+ "grad_norm": 3.640625,
1382
+ "learning_rate": 5.050432566228552e-07,
1383
+ "loss": 1.1831,
1384
+ "step": 960
1385
  }
1386
  ],
1387
  "logging_steps": 5,
 
1401
  "attributes": {}
1402
  }
1403
  },
1404
+ "total_flos": 5.381932813203276e+18,
1405
  "train_batch_size": 8,
1406
  "trial_name": null,
1407
  "trial_params": null