antonpolishko committed
Commit 7c471b2 · verified · 1 Parent(s): afccde0

Training in progress, epoch 3, checkpoint

last-checkpoint/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3da40a5e2cd69e35db1c4bfd34661796a25ac71a510cbccbfb7b36b7caf9025a
+oid sha256:5bc528830c6100da3c0dd9010b862efe600ee6d7d54f51d60902108db6c72960
 size 4903351912
last-checkpoint/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d3d4d922ecb0fbc025a2f903f492c8baa33ea8cceb3dc068450e7e0b6518684e
+oid sha256:40a843b0966455dd6d30dccf61fb993e15baeaeeaa7700f2b4813e16cccfbe4b
 size 4947570872
last-checkpoint/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2c5d33e285bcbf98218ba55bce7efbdd306b8046089737b99aa4fe5c5e7ebea5
+oid sha256:8d30cb4c06c8541c562268facba597713984c9595ed4c681a6756bd388ec6352
 size 4962221464
last-checkpoint/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:261c82a71bbdefce5c6b82990717f18e2f9cb6c02b8e8a09188d4f81d1e757ea
+oid sha256:4fc934496e8de7190d32b53a19a5f6c70a705b0f9fb5201734e3ec4eb86b9d93
 size 3670322200
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ca3ba1546a764910d034f0ce3e93370a84a16db802bc17eb73ebc48babbb424c
+oid sha256:e5f736309afd4efe1b8dcd7e05ccf5391826fff7c08d993abb1abf26e65aa1a6
 size 36967230034
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:74386f26f36ed67f56395205881e5db2d0c28ffcbeed50dd95b28771d2dac588
+oid sha256:06fea830cf5ad73ec00d500ea6fb952740ac936f18e93fa2d32abde1ea3ead92
 size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:41c88f9de084200454883a13c3717941ea3fd433e2f8735507fc30611f9c5501
+oid sha256:be561d1df19be227394d8ea607c54262a06c9bf880af0aa5e04a52596a2a6cb0
 size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:965b00d4cb4710ebab57c8787b9925bb3f77b8eeba94a186ec4bc1c2f326ef3f
+oid sha256:03f3e24417a59435f5a8450a4aeb0f09cc92734b5c3b45a0701b2c043c415c05
 size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d5dc374b8b9a4c45c950f9d136feab85a767081fa59f0c7d68ed3a62060c4949
+oid sha256:2bea02744c29f30024590ab1629a0e7b7dabbf1e8476456c2e7c5ce46dc35c28
 size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c7c212fb779217f1edac0baf44f67b608eefc1e0e4e3f5a9dd7eb557032c1bc
+oid sha256:041be966454b60c86af576fc1eb7f34189114689abff8f9622b947110f7334c8
 size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:86e1effd626ce1e95dd68a0c8089fe19218f2b24dfe9e45ed2cab1c0ebc10ba1
+oid sha256:b85766f6596d15a810177d77dd259d9b50588cf100ec5f8ebff5fed881d57957
 size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:799cc83f60dfc1c4243cfd6403592112414a2eec494e6832f10221c96ff62c20
+oid sha256:8be75d04b1ebe614241b88fd010a5dda1b7bf703c00c6ebe310ca07975830fe7
 size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:586777c398770c3255d3a1f48c7fef44ea9d89117c627c9ea490e16bfd9a49ba
+oid sha256:4699833a7ab4cb692996ef7567f934c0bac79d6a067963a873f89a38e412bd48
 size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5713040e0c397c4c3f4a5dd5b82184ec7e9ce43fb4fc625e2c829d6f4609bb2d
+oid sha256:2a4821369fa96c0ec5ed7b70094f8dd7c4588e39d171b9173542d45daf7d2dc0
 size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.0,
+  "epoch": 3.0,
   "eval_steps": 300,
-  "global_step": 546,
+  "global_step": 819,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -785,6 +785,392 @@
       "learning_rate": 5.044412534486873e-07,
       "loss": 4.7148,
       "step": 545
+    },
+    {
+      "epoch": 2.0146520146520146,
+      "grad_norm": 3.71875,
+      "learning_rate": 4.878545062680026e-07,
+      "loss": 4.6767,
+      "step": 550
+    },
+    {
+      "epoch": 2.032967032967033,
+      "grad_norm": 3.453125,
+      "learning_rate": 4.7145660742090575e-07,
+      "loss": 4.706,
+      "step": 555
+    },
+    {
+      "epoch": 2.051282051282051,
+      "grad_norm": 4.0,
+      "learning_rate": 4.5525360346234907e-07,
+      "loss": 4.7142,
+      "step": 560
+    },
+    {
+      "epoch": 2.06959706959707,
+      "grad_norm": 3.546875,
+      "learning_rate": 4.392514690818193e-07,
+      "loss": 4.7053,
+      "step": 565
+    },
+    {
+      "epoch": 2.087912087912088,
+      "grad_norm": 3.765625,
+      "learning_rate": 4.2345610490022996e-07,
+      "loss": 4.6978,
+      "step": 570
+    },
+    {
+      "epoch": 2.1062271062271063,
+      "grad_norm": 3.53125,
+      "learning_rate": 4.078733352941321e-07,
+      "loss": 4.7147,
+      "step": 575
+    },
+    {
+      "epoch": 2.1245421245421245,
+      "grad_norm": 3.546875,
+      "learning_rate": 3.925089062480339e-07,
+      "loss": 4.6861,
+      "step": 580
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 3.703125,
+      "learning_rate": 3.77368483235628e-07,
+      "loss": 4.725,
+      "step": 585
+    },
+    {
+      "epoch": 2.161172161172161,
+      "grad_norm": 3.453125,
+      "learning_rate": 3.6245764913070875e-07,
+      "loss": 4.7223,
+      "step": 590
+    },
+    {
+      "epoch": 2.1794871794871793,
+      "grad_norm": 3.578125,
+      "learning_rate": 3.477819021485447e-07,
+      "loss": 4.7254,
+      "step": 595
+    },
+    {
+      "epoch": 2.197802197802198,
+      "grad_norm": 3.578125,
+      "learning_rate": 3.333466538184674e-07,
+      "loss": 4.7074,
+      "step": 600
+    },
+    {
+      "epoch": 2.197802197802198,
+      "eval_loss": 4.718916893005371,
+      "eval_runtime": 14.5284,
+      "eval_samples_per_second": 36.412,
+      "eval_steps_per_second": 1.17,
+      "step": 600
+    },
+    {
+      "epoch": 2.2161172161172162,
+      "grad_norm": 3.546875,
+      "learning_rate": 3.1915722698842874e-07,
+      "loss": 4.705,
+      "step": 605
+    },
+    {
+      "epoch": 2.2344322344322345,
+      "grad_norm": 3.734375,
+      "learning_rate": 3.0521885386225344e-07,
+      "loss": 4.7118,
+      "step": 610
+    },
+    {
+      "epoch": 2.2527472527472527,
+      "grad_norm": 3.703125,
+      "learning_rate": 2.9153667407032066e-07,
+      "loss": 4.7285,
+      "step": 615
+    },
+    {
+      "epoch": 2.271062271062271,
+      "grad_norm": 3.59375,
+      "learning_rate": 2.7811573277437603e-07,
+      "loss": 4.681,
+      "step": 620
+    },
+    {
+      "epoch": 2.2893772893772892,
+      "grad_norm": 3.546875,
+      "learning_rate": 2.649609788071836e-07,
+      "loss": 4.707,
+      "step": 625
+    },
+    {
+      "epoch": 2.3076923076923075,
+      "grad_norm": 3.46875,
+      "learning_rate": 2.520772628476919e-07,
+      "loss": 4.7278,
+      "step": 630
+    },
+    {
+      "epoch": 2.326007326007326,
+      "grad_norm": 3.59375,
+      "learning_rate": 2.394693356323997e-07,
+      "loss": 4.7202,
+      "step": 635
+    },
+    {
+      "epoch": 2.3443223443223444,
+      "grad_norm": 3.609375,
+      "learning_rate": 2.2714184620356826e-07,
+      "loss": 4.706,
+      "step": 640
+    },
+    {
+      "epoch": 2.3626373626373627,
+      "grad_norm": 3.5,
+      "learning_rate": 2.150993401949376e-07,
+      "loss": 4.732,
+      "step": 645
+    },
+    {
+      "epoch": 2.380952380952381,
+      "grad_norm": 4.46875,
+      "learning_rate": 2.0334625815557026e-07,
+      "loss": 4.6981,
+      "step": 650
+    },
+    {
+      "epoch": 2.399267399267399,
+      "grad_norm": 3.5625,
+      "learning_rate": 1.9188693391244438e-07,
+      "loss": 4.7334,
+      "step": 655
+    },
+    {
+      "epoch": 2.4175824175824174,
+      "grad_norm": 3.609375,
+      "learning_rate": 1.8072559297240097e-07,
+      "loss": 4.7284,
+      "step": 660
+    },
+    {
+      "epoch": 2.435897435897436,
+      "grad_norm": 3.515625,
+      "learning_rate": 1.6986635096403212e-07,
+      "loss": 4.7189,
+      "step": 665
+    },
+    {
+      "epoch": 2.4542124542124544,
+      "grad_norm": 3.484375,
+      "learning_rate": 1.5931321212008465e-07,
+      "loss": 4.7007,
+      "step": 670
+    },
+    {
+      "epoch": 2.4725274725274726,
+      "grad_norm": 3.53125,
+      "learning_rate": 1.490700678009421e-07,
+      "loss": 4.6923,
+      "step": 675
+    },
+    {
+      "epoch": 2.490842490842491,
+      "grad_norm": 3.546875,
+      "learning_rate": 1.3914069505972482e-07,
+      "loss": 4.7265,
+      "step": 680
+    },
+    {
+      "epoch": 2.509157509157509,
+      "grad_norm": 3.6875,
+      "learning_rate": 1.2952875524954232e-07,
+      "loss": 4.7318,
+      "step": 685
+    },
+    {
+      "epoch": 2.5274725274725274,
+      "grad_norm": 3.546875,
+      "learning_rate": 1.2023779267340563e-07,
+      "loss": 4.7144,
+      "step": 690
+    },
+    {
+      "epoch": 2.5457875457875456,
+      "grad_norm": 3.609375,
+      "learning_rate": 1.112712332773038e-07,
+      "loss": 4.6979,
+      "step": 695
+    },
+    {
+      "epoch": 2.564102564102564,
+      "grad_norm": 3.5625,
+      "learning_rate": 1.026323833869206e-07,
+      "loss": 4.6933,
+      "step": 700
+    },
+    {
+      "epoch": 2.5824175824175826,
+      "grad_norm": 3.59375,
+      "learning_rate": 9.432442848846289e-08,
+      "loss": 4.731,
+      "step": 705
+    },
+    {
+      "epoch": 2.600732600732601,
+      "grad_norm": 3.53125,
+      "learning_rate": 8.63504320540438e-08,
+      "loss": 4.7246,
+      "step": 710
+    },
+    {
+      "epoch": 2.619047619047619,
+      "grad_norm": 3.53125,
+      "learning_rate": 7.871333441206052e-08,
+      "loss": 4.7311,
+      "step": 715
+    },
+    {
+      "epoch": 2.6373626373626373,
+      "grad_norm": 3.53125,
+      "learning_rate": 7.141595166297832e-08,
+      "loss": 4.7213,
+      "step": 720
+    },
+    {
+      "epoch": 2.6556776556776556,
+      "grad_norm": 3.484375,
+      "learning_rate": 6.446097464092248e-08,
+      "loss": 4.6983,
+      "step": 725
+    },
+    {
+      "epoch": 2.6739926739926743,
+      "grad_norm": 3.546875,
+      "learning_rate": 5.78509679214616e-08,
+      "loss": 4.6839,
+      "step": 730
+    },
+    {
+      "epoch": 2.6923076923076925,
+      "grad_norm": 3.5625,
+      "learning_rate": 5.1588368875946864e-08,
+      "loss": 4.7098,
+      "step": 735
+    },
+    {
+      "epoch": 2.7106227106227108,
+      "grad_norm": 3.65625,
+      "learning_rate": 4.567548677275601e-08,
+      "loss": 4.6954,
+      "step": 740
+    },
+    {
+      "epoch": 2.728937728937729,
+      "grad_norm": 3.578125,
+      "learning_rate": 4.0114501925775925e-08,
+      "loss": 4.7284,
+      "step": 745
+    },
+    {
+      "epoch": 2.7472527472527473,
+      "grad_norm": 3.765625,
+      "learning_rate": 3.490746489043317e-08,
+      "loss": 4.7099,
+      "step": 750
+    },
+    {
+      "epoch": 2.7655677655677655,
+      "grad_norm": 3.5625,
+      "learning_rate": 3.005629570757373e-08,
+      "loss": 4.7144,
+      "step": 755
+    },
+    {
+      "epoch": 2.7838827838827838,
+      "grad_norm": 3.625,
+      "learning_rate": 2.5562783195467675e-08,
+      "loss": 4.6949,
+      "step": 760
+    },
+    {
+      "epoch": 2.802197802197802,
+      "grad_norm": 3.515625,
+      "learning_rate": 2.1428584290201114e-08,
+      "loss": 4.7045,
+      "step": 765
+    },
+    {
+      "epoch": 2.8205128205128203,
+      "grad_norm": 3.703125,
+      "learning_rate": 1.7655223434698053e-08,
+      "loss": 4.688,
+      "step": 770
+    },
+    {
+      "epoch": 2.838827838827839,
+      "grad_norm": 3.578125,
+      "learning_rate": 1.4244092016597931e-08,
+      "loss": 4.7177,
+      "step": 775
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 3.59375,
+      "learning_rate": 1.11964478551958e-08,
+      "loss": 4.7073,
+      "step": 780
+    },
+    {
+      "epoch": 2.8754578754578755,
+      "grad_norm": 3.453125,
+      "learning_rate": 8.513414737635005e-09,
+      "loss": 4.7176,
+      "step": 785
+    },
+    {
+      "epoch": 2.8937728937728937,
+      "grad_norm": 3.515625,
+      "learning_rate": 6.1959820045215385e-09,
+      "loss": 4.6779,
+      "step": 790
+    },
+    {
+      "epoch": 2.912087912087912,
+      "grad_norm": 3.484375,
+      "learning_rate": 4.245004185115752e-09,
+      "loss": 4.7052,
+      "step": 795
+    },
+    {
+      "epoch": 2.9304029304029307,
+      "grad_norm": 3.484375,
+      "learning_rate": 2.661200682232745e-09,
+      "loss": 4.7179,
+      "step": 800
+    },
+    {
+      "epoch": 2.948717948717949,
+      "grad_norm": 3.640625,
+      "learning_rate": 1.4451555069708853e-09,
+      "loss": 4.6552,
+      "step": 805
+    },
+    {
+      "epoch": 2.967032967032967,
+      "grad_norm": 4.21875,
+      "learning_rate": 5.973170633631897e-10,
+      "loss": 4.6979,
+      "step": 810
+    },
+    {
+      "epoch": 2.9853479853479854,
+      "grad_norm": 3.578125,
+      "learning_rate": 1.1799798303335772e-10,
+      "loss": 4.7113,
+      "step": 815
     }
   ],
   "logging_steps": 5,
@@ -799,12 +1185,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 3.574344656441311e+18,
+  "total_flos": 5.361516984661967e+18,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null