pszemraj commited on
Commit
1cc5f67
·
verified ·
1 Parent(s): ca9edf0

Upload folder using huggingface_hub

Browse files
checkpoints/checkpoint-pt-45000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eafe802e64ecff5c2844ac0a6d5c9b245af3a3bd9fa0e7d5e28fd59c09ef0ba6
3
+ size 1202681712
checkpoints/checkpoint-pt-45000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:634ae87ad9ec14553a807f970f4e595e3fef7b62fd4afaddf671a76426ff94ed
3
+ size 14344
checkpoints/grad_l2_over_steps.png CHANGED
checkpoints/loss_over_steps.png CHANGED
checkpoints/lr_over_steps.png CHANGED
checkpoints/main.log CHANGED
@@ -941,3 +941,60 @@ Mixed precision type: bf16
941
  [2024-08-11 05:39:30,749][Main][INFO] - [train] Step 43400 out of 80000 | Loss --> 1.873 | Grad_l2 --> 0.306 | Weights_l2 --> 9100.324 | Lr --> 0.004 | Seconds_per_step --> 3.403 |
942
  [2024-08-11 05:42:19,147][Main][INFO] - [train] Step 43450 out of 80000 | Loss --> 1.870 | Grad_l2 --> 0.303 | Weights_l2 --> 9100.419 | Lr --> 0.004 | Seconds_per_step --> 3.368 |
943
  [2024-08-11 05:45:08,526][Main][INFO] - [train] Step 43500 out of 80000 | Loss --> 1.873 | Grad_l2 --> 0.303 | Weights_l2 --> 9100.487 | Lr --> 0.004 | Seconds_per_step --> 3.388 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
941
  [2024-08-11 05:39:30,749][Main][INFO] - [train] Step 43400 out of 80000 | Loss --> 1.873 | Grad_l2 --> 0.306 | Weights_l2 --> 9100.324 | Lr --> 0.004 | Seconds_per_step --> 3.403 |
942
  [2024-08-11 05:42:19,147][Main][INFO] - [train] Step 43450 out of 80000 | Loss --> 1.870 | Grad_l2 --> 0.303 | Weights_l2 --> 9100.419 | Lr --> 0.004 | Seconds_per_step --> 3.368 |
943
  [2024-08-11 05:45:08,526][Main][INFO] - [train] Step 43500 out of 80000 | Loss --> 1.873 | Grad_l2 --> 0.303 | Weights_l2 --> 9100.487 | Lr --> 0.004 | Seconds_per_step --> 3.388 |
944
+ [2024-08-11 05:47:57,721][Main][INFO] - [train] Step 43550 out of 80000 | Loss --> 1.876 | Grad_l2 --> 0.304 | Weights_l2 --> 9100.561 | Lr --> 0.004 | Seconds_per_step --> 3.384 |
945
+ [2024-08-11 05:50:44,763][Main][INFO] - [train] Step 43600 out of 80000 | Loss --> 1.879 | Grad_l2 --> 0.309 | Weights_l2 --> 9100.639 | Lr --> 0.004 | Seconds_per_step --> 3.341 |
946
+ [2024-08-11 05:53:32,976][Main][INFO] - [train] Step 43650 out of 80000 | Loss --> 1.876 | Grad_l2 --> 0.303 | Weights_l2 --> 9100.704 | Lr --> 0.004 | Seconds_per_step --> 3.364 |
947
+ [2024-08-11 05:56:21,776][Main][INFO] - [train] Step 43700 out of 80000 | Loss --> 1.874 | Grad_l2 --> 0.304 | Weights_l2 --> 9100.768 | Lr --> 0.004 | Seconds_per_step --> 3.376 |
948
+ [2024-08-11 05:59:08,951][Main][INFO] - [train] Step 43750 out of 80000 | Loss --> 1.862 | Grad_l2 --> 0.303 | Weights_l2 --> 9100.837 | Lr --> 0.004 | Seconds_per_step --> 3.343 |
949
+ [2024-08-11 06:01:57,552][Main][INFO] - [train] Step 43800 out of 80000 | Loss --> 1.872 | Grad_l2 --> 0.306 | Weights_l2 --> 9100.908 | Lr --> 0.004 | Seconds_per_step --> 3.372 |
950
+ [2024-08-11 06:04:47,762][Main][INFO] - [train] Step 43850 out of 80000 | Loss --> 1.873 | Grad_l2 --> 0.306 | Weights_l2 --> 9100.958 | Lr --> 0.004 | Seconds_per_step --> 3.404 |
951
+ [2024-08-11 06:07:36,787][Main][INFO] - [train] Step 43900 out of 80000 | Loss --> 1.878 | Grad_l2 --> 0.306 | Weights_l2 --> 9101.039 | Lr --> 0.004 | Seconds_per_step --> 3.380 |
952
+ [2024-08-11 06:10:25,689][Main][INFO] - [train] Step 43950 out of 80000 | Loss --> 1.871 | Grad_l2 --> 0.303 | Weights_l2 --> 9101.105 | Lr --> 0.004 | Seconds_per_step --> 3.378 |
953
+ [2024-08-11 06:13:14,016][Main][INFO] - [train] Step 44000 out of 80000 | Loss --> 1.871 | Grad_l2 --> 0.306 | Weights_l2 --> 9101.158 | Lr --> 0.004 | Seconds_per_step --> 3.367 |
954
+ [2024-08-11 06:16:02,351][Main][INFO] - [train] Step 44050 out of 80000 | Loss --> 1.859 | Grad_l2 --> 0.303 | Weights_l2 --> 9101.227 | Lr --> 0.004 | Seconds_per_step --> 3.367 |
955
+ [2024-08-11 06:18:51,302][Main][INFO] - [train] Step 44100 out of 80000 | Loss --> 1.864 | Grad_l2 --> 0.301 | Weights_l2 --> 9101.284 | Lr --> 0.004 | Seconds_per_step --> 3.379 |
956
+ [2024-08-11 06:21:40,055][Main][INFO] - [train] Step 44150 out of 80000 | Loss --> 1.857 | Grad_l2 --> 0.304 | Weights_l2 --> 9101.335 | Lr --> 0.004 | Seconds_per_step --> 3.375 |
957
+ [2024-08-11 06:24:28,191][Main][INFO] - [train] Step 44200 out of 80000 | Loss --> 1.858 | Grad_l2 --> 0.301 | Weights_l2 --> 9101.392 | Lr --> 0.004 | Seconds_per_step --> 3.363 |
958
+ [2024-08-11 06:27:17,162][Main][INFO] - [train] Step 44250 out of 80000 | Loss --> 1.861 | Grad_l2 --> 0.304 | Weights_l2 --> 9101.436 | Lr --> 0.004 | Seconds_per_step --> 3.379 |
959
+ [2024-08-11 06:30:06,913][Main][INFO] - [train] Step 44300 out of 80000 | Loss --> 1.862 | Grad_l2 --> 0.303 | Weights_l2 --> 9101.494 | Lr --> 0.004 | Seconds_per_step --> 3.395 |
960
+ [2024-08-11 06:32:55,550][Main][INFO] - [train] Step 44350 out of 80000 | Loss --> 1.855 | Grad_l2 --> 0.303 | Weights_l2 --> 9101.548 | Lr --> 0.004 | Seconds_per_step --> 3.373 |
961
+ [2024-08-11 06:35:45,051][Main][INFO] - [train] Step 44400 out of 80000 | Loss --> 1.858 | Grad_l2 --> 0.302 | Weights_l2 --> 9101.593 | Lr --> 0.004 | Seconds_per_step --> 3.390 |
962
+ [2024-08-11 06:38:35,493][Main][INFO] - [train] Step 44450 out of 80000 | Loss --> 1.856 | Grad_l2 --> 0.303 | Weights_l2 --> 9101.643 | Lr --> 0.004 | Seconds_per_step --> 3.409 |
963
+ [2024-08-11 06:41:25,254][Main][INFO] - [train] Step 44500 out of 80000 | Loss --> 1.861 | Grad_l2 --> 0.303 | Weights_l2 --> 9101.695 | Lr --> 0.004 | Seconds_per_step --> 3.395 |
964
+ [2024-08-11 06:44:14,450][Main][INFO] - [train] Step 44550 out of 80000 | Loss --> 1.861 | Grad_l2 --> 0.299 | Weights_l2 --> 9101.731 | Lr --> 0.004 | Seconds_per_step --> 3.384 |
965
+ [2024-08-11 06:47:02,935][Main][INFO] - [train] Step 44600 out of 80000 | Loss --> 1.856 | Grad_l2 --> 0.300 | Weights_l2 --> 9101.771 | Lr --> 0.004 | Seconds_per_step --> 3.370 |
966
+ [2024-08-11 06:49:52,282][Main][INFO] - [train] Step 44650 out of 80000 | Loss --> 1.862 | Grad_l2 --> 0.299 | Weights_l2 --> 9101.813 | Lr --> 0.004 | Seconds_per_step --> 3.387 |
967
+ [2024-08-11 06:52:40,849][Main][INFO] - [train] Step 44700 out of 80000 | Loss --> 1.854 | Grad_l2 --> 0.304 | Weights_l2 --> 9101.850 | Lr --> 0.004 | Seconds_per_step --> 3.371 |
968
+ [2024-08-11 06:55:29,381][Main][INFO] - [train] Step 44750 out of 80000 | Loss --> 1.862 | Grad_l2 --> 0.303 | Weights_l2 --> 9101.887 | Lr --> 0.004 | Seconds_per_step --> 3.371 |
969
+ [2024-08-11 06:58:18,119][Main][INFO] - [train] Step 44800 out of 80000 | Loss --> 1.857 | Grad_l2 --> 0.304 | Weights_l2 --> 9101.931 | Lr --> 0.004 | Seconds_per_step --> 3.375 |
970
+ [2024-08-11 07:01:08,131][Main][INFO] - [train] Step 44850 out of 80000 | Loss --> 1.860 | Grad_l2 --> 0.304 | Weights_l2 --> 9101.974 | Lr --> 0.004 | Seconds_per_step --> 3.400 |
971
+ [2024-08-11 07:03:56,588][Main][INFO] - [train] Step 44900 out of 80000 | Loss --> 1.867 | Grad_l2 --> 0.303 | Weights_l2 --> 9102.024 | Lr --> 0.004 | Seconds_per_step --> 3.369 |
972
+ [2024-08-11 07:06:45,519][Main][INFO] - [train] Step 44950 out of 80000 | Loss --> 1.866 | Grad_l2 --> 0.304 | Weights_l2 --> 9102.067 | Lr --> 0.004 | Seconds_per_step --> 3.379 |
973
+ [2024-08-11 07:09:34,423][Main][INFO] - [train] Step 45000 out of 80000 | Loss --> 1.862 | Grad_l2 --> 0.302 | Weights_l2 --> 9102.109 | Lr --> 0.004 | Seconds_per_step --> 3.378 |
974
+ [2024-08-11 07:09:34,423][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-45000
975
+ [2024-08-11 07:09:34,426][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
976
+ [2024-08-11 07:09:36,448][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-45000/model.safetensors
977
+ [2024-08-11 07:09:39,204][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-45000/optimizer.bin
978
+ [2024-08-11 07:09:39,204][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-45000/scheduler.bin
979
+ [2024-08-11 07:09:39,204][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-45000/sampler.bin
980
+ [2024-08-11 07:09:39,204][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-45000/sampler_1.bin
981
+ [2024-08-11 07:09:39,205][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-45000/random_states_0.pkl
982
+ [2024-08-11 07:12:28,043][Main][INFO] - [train] Step 45050 out of 80000 | Loss --> 1.864 | Grad_l2 --> 0.306 | Weights_l2 --> 9102.160 | Lr --> 0.004 | Seconds_per_step --> 3.472 |
983
+ [2024-08-11 07:15:17,961][Main][INFO] - [train] Step 45100 out of 80000 | Loss --> 1.855 | Grad_l2 --> 0.303 | Weights_l2 --> 9102.199 | Lr --> 0.004 | Seconds_per_step --> 3.398 |
984
+ [2024-08-11 07:18:07,676][Main][INFO] - [train] Step 45150 out of 80000 | Loss --> 1.872 | Grad_l2 --> 0.301 | Weights_l2 --> 9102.244 | Lr --> 0.004 | Seconds_per_step --> 3.394 |
985
+ [2024-08-11 07:20:56,273][Main][INFO] - [train] Step 45200 out of 80000 | Loss --> 1.853 | Grad_l2 --> 0.303 | Weights_l2 --> 9102.292 | Lr --> 0.004 | Seconds_per_step --> 3.372 |
986
+ [2024-08-11 07:23:45,427][Main][INFO] - [train] Step 45250 out of 80000 | Loss --> 1.864 | Grad_l2 --> 0.307 | Weights_l2 --> 9102.328 | Lr --> 0.004 | Seconds_per_step --> 3.383 |
987
+ [2024-08-11 07:26:35,053][Main][INFO] - [train] Step 45300 out of 80000 | Loss --> 1.863 | Grad_l2 --> 0.302 | Weights_l2 --> 9102.357 | Lr --> 0.004 | Seconds_per_step --> 3.393 |
988
+ [2024-08-11 07:29:23,673][Main][INFO] - [train] Step 45350 out of 80000 | Loss --> 1.865 | Grad_l2 --> 0.303 | Weights_l2 --> 9102.392 | Lr --> 0.004 | Seconds_per_step --> 3.372 |
989
+ [2024-08-11 07:32:13,388][Main][INFO] - [train] Step 45400 out of 80000 | Loss --> 1.862 | Grad_l2 --> 0.302 | Weights_l2 --> 9102.425 | Lr --> 0.004 | Seconds_per_step --> 3.394 |
990
+ [2024-08-11 07:35:02,099][Main][INFO] - [train] Step 45450 out of 80000 | Loss --> 1.860 | Grad_l2 --> 0.303 | Weights_l2 --> 9102.460 | Lr --> 0.004 | Seconds_per_step --> 3.374 |
991
+ [2024-08-11 07:37:52,499][Main][INFO] - [train] Step 45500 out of 80000 | Loss --> 1.860 | Grad_l2 --> 0.303 | Weights_l2 --> 9102.483 | Lr --> 0.004 | Seconds_per_step --> 3.408 |
992
+ [2024-08-11 07:40:42,341][Main][INFO] - [train] Step 45550 out of 80000 | Loss --> 1.858 | Grad_l2 --> 0.303 | Weights_l2 --> 9102.508 | Lr --> 0.004 | Seconds_per_step --> 3.397 |
993
+ [2024-08-11 07:43:32,253][Main][INFO] - [train] Step 45600 out of 80000 | Loss --> 1.855 | Grad_l2 --> 0.303 | Weights_l2 --> 9102.537 | Lr --> 0.004 | Seconds_per_step --> 3.398 |
994
+ [2024-08-11 07:46:21,818][Main][INFO] - [train] Step 45650 out of 80000 | Loss --> 1.862 | Grad_l2 --> 0.300 | Weights_l2 --> 9102.566 | Lr --> 0.004 | Seconds_per_step --> 3.391 |
995
+ [2024-08-11 07:49:11,294][Main][INFO] - [train] Step 45700 out of 80000 | Loss --> 1.863 | Grad_l2 --> 0.302 | Weights_l2 --> 9102.595 | Lr --> 0.004 | Seconds_per_step --> 3.390 |
996
+ [2024-08-11 07:51:59,774][Main][INFO] - [train] Step 45750 out of 80000 | Loss --> 1.850 | Grad_l2 --> 0.304 | Weights_l2 --> 9102.612 | Lr --> 0.004 | Seconds_per_step --> 3.370 |
997
+ [2024-08-11 07:54:48,582][Main][INFO] - [train] Step 45800 out of 80000 | Loss --> 1.861 | Grad_l2 --> 0.301 | Weights_l2 --> 9102.628 | Lr --> 0.004 | Seconds_per_step --> 3.376 |
998
+ [2024-08-11 07:57:37,200][Main][INFO] - [train] Step 45850 out of 80000 | Loss --> 1.856 | Grad_l2 --> 0.304 | Weights_l2 --> 9102.643 | Lr --> 0.004 | Seconds_per_step --> 3.372 |
999
+ [2024-08-11 08:00:26,936][Main][INFO] - [train] Step 45900 out of 80000 | Loss --> 1.857 | Grad_l2 --> 0.302 | Weights_l2 --> 9102.666 | Lr --> 0.004 | Seconds_per_step --> 3.395 |
1000
+ [2024-08-11 08:03:16,131][Main][INFO] - [train] Step 45950 out of 80000 | Loss --> 1.859 | Grad_l2 --> 0.305 | Weights_l2 --> 9102.691 | Lr --> 0.004 | Seconds_per_step --> 3.384 |
checkpoints/seconds_per_step_over_steps.png CHANGED
checkpoints/training_metrics.csv CHANGED
@@ -868,3 +868,52 @@ timestamp,step,loss,grad_l2,weights_l2,lr,seconds_per_step
868
  "2024-08-11 05:36:40,598",43350,1.884,0.304,9100.247,0.004,3.379
869
  "2024-08-11 05:39:30,749",43400,1.873,0.306,9100.324,0.004,3.403
870
  "2024-08-11 05:42:19,147",43450,1.87,0.303,9100.419,0.004,3.368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
868
  "2024-08-11 05:36:40,598",43350,1.884,0.304,9100.247,0.004,3.379
869
  "2024-08-11 05:39:30,749",43400,1.873,0.306,9100.324,0.004,3.403
870
  "2024-08-11 05:42:19,147",43450,1.87,0.303,9100.419,0.004,3.368
871
+ "2024-08-11 05:45:08,526",43500,1.873,0.303,9100.487,0.004,3.388
872
+ "2024-08-11 05:47:57,721",43550,1.876,0.304,9100.561,0.004,3.384
873
+ "2024-08-11 05:50:44,763",43600,1.879,0.309,9100.639,0.004,3.341
874
+ "2024-08-11 05:53:32,976",43650,1.876,0.303,9100.704,0.004,3.364
875
+ "2024-08-11 05:56:21,776",43700,1.874,0.304,9100.768,0.004,3.376
876
+ "2024-08-11 05:59:08,951",43750,1.862,0.303,9100.837,0.004,3.343
877
+ "2024-08-11 06:01:57,552",43800,1.872,0.306,9100.908,0.004,3.372
878
+ "2024-08-11 06:04:47,762",43850,1.873,0.306,9100.958,0.004,3.404
879
+ "2024-08-11 06:07:36,787",43900,1.878,0.306,9101.039,0.004,3.38
880
+ "2024-08-11 06:10:25,689",43950,1.871,0.303,9101.105,0.004,3.378
881
+ "2024-08-11 06:13:14,016",44000,1.871,0.306,9101.158,0.004,3.367
882
+ "2024-08-11 06:16:02,351",44050,1.859,0.303,9101.227,0.004,3.367
883
+ "2024-08-11 06:18:51,302",44100,1.864,0.301,9101.284,0.004,3.379
884
+ "2024-08-11 06:21:40,055",44150,1.857,0.304,9101.335,0.004,3.375
885
+ "2024-08-11 06:24:28,191",44200,1.858,0.301,9101.392,0.004,3.363
886
+ "2024-08-11 06:27:17,162",44250,1.861,0.304,9101.436,0.004,3.379
887
+ "2024-08-11 06:30:06,913",44300,1.862,0.303,9101.494,0.004,3.395
888
+ "2024-08-11 06:32:55,550",44350,1.855,0.303,9101.548,0.004,3.373
889
+ "2024-08-11 06:35:45,051",44400,1.858,0.302,9101.593,0.004,3.39
890
+ "2024-08-11 06:38:35,493",44450,1.856,0.303,9101.643,0.004,3.409
891
+ "2024-08-11 06:41:25,254",44500,1.861,0.303,9101.695,0.004,3.395
892
+ "2024-08-11 06:44:14,450",44550,1.861,0.299,9101.731,0.004,3.384
893
+ "2024-08-11 06:47:02,935",44600,1.856,0.3,9101.771,0.004,3.37
894
+ "2024-08-11 06:49:52,282",44650,1.862,0.299,9101.813,0.004,3.387
895
+ "2024-08-11 06:52:40,849",44700,1.854,0.304,9101.85,0.004,3.371
896
+ "2024-08-11 06:55:29,381",44750,1.862,0.303,9101.887,0.004,3.371
897
+ "2024-08-11 06:58:18,119",44800,1.857,0.304,9101.931,0.004,3.375
898
+ "2024-08-11 07:01:08,131",44850,1.86,0.304,9101.974,0.004,3.4
899
+ "2024-08-11 07:03:56,588",44900,1.867,0.303,9102.024,0.004,3.369
900
+ "2024-08-11 07:06:45,519",44950,1.866,0.304,9102.067,0.004,3.379
901
+ "2024-08-11 07:09:34,423",45000,1.862,0.302,9102.109,0.004,3.378
902
+ "2024-08-11 07:12:28,043",45050,1.864,0.306,9102.16,0.004,3.472
903
+ "2024-08-11 07:15:17,961",45100,1.855,0.303,9102.199,0.004,3.398
904
+ "2024-08-11 07:18:07,676",45150,1.872,0.301,9102.244,0.004,3.394
905
+ "2024-08-11 07:20:56,273",45200,1.853,0.303,9102.292,0.004,3.372
906
+ "2024-08-11 07:23:45,427",45250,1.864,0.307,9102.328,0.004,3.383
907
+ "2024-08-11 07:26:35,053",45300,1.863,0.302,9102.357,0.004,3.393
908
+ "2024-08-11 07:29:23,673",45350,1.865,0.303,9102.392,0.004,3.372
909
+ "2024-08-11 07:32:13,388",45400,1.862,0.302,9102.425,0.004,3.394
910
+ "2024-08-11 07:35:02,099",45450,1.86,0.303,9102.46,0.004,3.374
911
+ "2024-08-11 07:37:52,499",45500,1.86,0.303,9102.483,0.004,3.408
912
+ "2024-08-11 07:40:42,341",45550,1.858,0.303,9102.508,0.004,3.397
913
+ "2024-08-11 07:43:32,253",45600,1.855,0.303,9102.537,0.004,3.398
914
+ "2024-08-11 07:46:21,818",45650,1.862,0.3,9102.566,0.004,3.391
915
+ "2024-08-11 07:49:11,294",45700,1.863,0.302,9102.595,0.004,3.39
916
+ "2024-08-11 07:51:59,774",45750,1.85,0.304,9102.612,0.004,3.37
917
+ "2024-08-11 07:54:48,582",45800,1.861,0.301,9102.628,0.004,3.376
918
+ "2024-08-11 07:57:37,200",45850,1.856,0.304,9102.643,0.004,3.372
919
+ "2024-08-11 08:00:26,936",45900,1.857,0.302,9102.666,0.004,3.395
checkpoints/weights_l2_over_steps.png CHANGED