g4rg commited on
Commit
5237730
·
verified ·
1 Parent(s): dd4a223

Training in progress, step 132, checkpoint

Browse files
Files changed (28) hide show
  1. last-checkpoint/adapter_model.safetensors +1 -1
  2. last-checkpoint/global_step132/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  3. last-checkpoint/global_step132/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  4. last-checkpoint/global_step132/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  5. last-checkpoint/global_step132/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  6. last-checkpoint/global_step132/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  7. last-checkpoint/global_step132/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  8. last-checkpoint/global_step132/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  9. last-checkpoint/global_step132/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  10. last-checkpoint/global_step132/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
  11. last-checkpoint/global_step132/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
  12. last-checkpoint/global_step132/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
  13. last-checkpoint/global_step132/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
  14. last-checkpoint/global_step132/zero_pp_rank_4_mp_rank_00_model_states.pt +3 -0
  15. last-checkpoint/global_step132/zero_pp_rank_5_mp_rank_00_model_states.pt +3 -0
  16. last-checkpoint/global_step132/zero_pp_rank_6_mp_rank_00_model_states.pt +3 -0
  17. last-checkpoint/global_step132/zero_pp_rank_7_mp_rank_00_model_states.pt +3 -0
  18. last-checkpoint/latest +1 -1
  19. last-checkpoint/rng_state_0.pth +1 -1
  20. last-checkpoint/rng_state_1.pth +1 -1
  21. last-checkpoint/rng_state_2.pth +1 -1
  22. last-checkpoint/rng_state_3.pth +1 -1
  23. last-checkpoint/rng_state_4.pth +1 -1
  24. last-checkpoint/rng_state_5.pth +1 -1
  25. last-checkpoint/rng_state_6.pth +1 -1
  26. last-checkpoint/rng_state_7.pth +1 -1
  27. last-checkpoint/scheduler.pt +1 -1
  28. last-checkpoint/trainer_state.json +473 -3
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7cde4c0dc915fef419c1193ae86c0d6cad089c08b2c9fd319eb8d1cfc01feab3
3
  size 763470136
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33b1027b21df86a9ac1c25a185657bca1afb488a02b9101ce864a4e74d409fce
3
  size 763470136
last-checkpoint/global_step132/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:742ce79752a5d8f0f3bf416b20f26d0f43bf95ed67d8c4c7df176a96494ce4a7
3
+ size 289064656
last-checkpoint/global_step132/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7540fcf32349a07dccbf889ebf653db148b8f90e0077e9c2773d4a587b5e6f0d
3
+ size 289064656
last-checkpoint/global_step132/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d844d71c8fc5d221f10e43400cb57625275fb899e1239a152e204ad0fc6385a5
3
+ size 289064656
last-checkpoint/global_step132/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33ff85863a7ea2f39275c0d2d3407b1886a4b3986c7ab9ae40b2ceae0faedc6a
3
+ size 289064656
last-checkpoint/global_step132/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f31cc78687d87c75bfac1aee87d90e2fa493fdf962050eaf448508789290855
3
+ size 289064656
last-checkpoint/global_step132/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daeee2f1ff388a0c5d58446daca3360a54c25c03bcb2103024266444dff2c968
3
+ size 289064656
last-checkpoint/global_step132/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57f303a512fb5d564e9ee13087ea01b1c6bd22971339058543e7d100f646b6e9
3
+ size 289064656
last-checkpoint/global_step132/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59477ac9941f46dcd1e7675c7ac8aa0d21d1adaa0c8e8f0a11ec59f473ca0a6d
3
+ size 289064656
last-checkpoint/global_step132/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a6e15ec8a4fea8016bdc708db571fc4212d9dad165f5c6e02d9f04f48bd29f8
3
+ size 348711830
last-checkpoint/global_step132/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:183de6c87d63300d6bb1b236b4562d587ff5bd8ec5d12e6120bcdd17afb1a6c4
3
+ size 348711830
last-checkpoint/global_step132/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd5fd70732c9aa6fca7bec2c252b4802ec2a29c3426999950d15a4c4e66dc92a
3
+ size 348711830
last-checkpoint/global_step132/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4a2665c422731e5e52cb26adf3b9335c8736f3524fd6cfab44a1b8e43065b5f
3
+ size 348711830
last-checkpoint/global_step132/zero_pp_rank_4_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51ce77382eebe7ffca32bb93b82b1dfed357461d5a71957037e89bcb6836702a
3
+ size 348711830
last-checkpoint/global_step132/zero_pp_rank_5_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6492d02148e09765f05c4731dca56fff868571a0dbcda5a427e275c3f948ee31
3
+ size 348711830
last-checkpoint/global_step132/zero_pp_rank_6_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8af7082a1e598c068aecd4d85e10b3d4798ef0ede3019f3d78b244e87f354703
3
+ size 348711830
last-checkpoint/global_step132/zero_pp_rank_7_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5886fbf4415c2dfe8a7101e4dca20e4b432af43bece3613f79f94fbff34aed8d
3
+ size 348711830
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step66
 
1
+ global_step132
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f92646c5a2fa7121ebc27f21b41b150cf9055bfe20103354daf6932bc493b7c
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ca402835f1af6d48f2f47ac363c7097358373e395ec83d7eb3d57ddbb0a4b2d
3
  size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13e4bba7e58c6a0dcffa575bfc4f4a34515b83e8aaa1510b610980ff57eb0cce
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a82bbd72da921737fac229854d3f27169eee4db7ddeacdba4a7199bad357bf3c
3
  size 15920
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7b17e78ab4ecfdfe23d8a98330499e2780c1777f23428c0e1ae30c2c65dead5
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5972e7fa3b67599264ff7edaf7cea513fbe8d18030796597e4baae2d425cad3
3
  size 15920
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3cabf58e5a6c7207c39a270554cbc1a122d69f2acb3a6524c23884ae131c30b4
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b53ae0f3c148cf1921cf63943d12ae8efd4e59d00acc2c75a186e9cb04f50b9c
3
  size 15920
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7a2488f153c409c51dffbb327bcdf64f4d17302a658281b1f239ac084c1c80e
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84863aaa0987e7ec58181dc3d6c18d688fa5e191ef4d53a10df375a42bac5e2d
3
  size 15920
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1eae6522e11d0a1769e0fd4347da083b199a859689a217969c16dbc22713e5ec
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b191726b8c03f5e523ba50eb220f1728e82f11657f92ec30a0f367e31c0945
3
  size 15920
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d61936499fcf57780309d38cc47b82c866dcadab1dba74812948364a7175461a
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae89d13f767f2c751bc315568edaf6650d050eb04a09f1b3bd9d20f069ee5007
3
  size 15920
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d80b25072a00168e3d05af00108eeff37fe60f0755470a62f6ae6672dc4ad8e
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1147b95ab7e867550d2f4e0481eddb98045dd538809f0c554423f517ebb61468
3
  size 15920
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6530e0522c975674706d8073e33fa508580e6b794aaf4f3e6111389796f319e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dea28a2bfa00902c551f1f93e746f32ec9126cb389e7c8deda3380b1f2fec426
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.20245398773006135,
5
  "eval_steps": 66,
6
- "global_step": 66,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -485,6 +485,476 @@
485
  "eval_samples_per_second": 1.799,
486
  "eval_steps_per_second": 0.126,
487
  "step": 66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
488
  }
489
  ],
490
  "logging_steps": 1,
@@ -504,7 +974,7 @@
504
  "attributes": {}
505
  }
506
  },
507
- "total_flos": 72071698710528.0,
508
  "train_batch_size": 2,
509
  "trial_name": null,
510
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.4049079754601227,
5
  "eval_steps": 66,
6
+ "global_step": 132,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
485
  "eval_samples_per_second": 1.799,
486
  "eval_steps_per_second": 0.126,
487
  "step": 66
488
+ },
489
+ {
490
+ "epoch": 0.20552147239263804,
491
+ "grad_norm": 0.525841804476554,
492
+ "learning_rate": 9.486202423496679e-05,
493
+ "loss": 1.8319,
494
+ "step": 67
495
+ },
496
+ {
497
+ "epoch": 0.2085889570552147,
498
+ "grad_norm": 0.33648300397500824,
499
+ "learning_rate": 9.46455487428603e-05,
500
+ "loss": 1.889,
501
+ "step": 68
502
+ },
503
+ {
504
+ "epoch": 0.2116564417177914,
505
+ "grad_norm": 0.2982307248009996,
506
+ "learning_rate": 9.442489448996261e-05,
507
+ "loss": 1.9004,
508
+ "step": 69
509
+ },
510
+ {
511
+ "epoch": 0.2147239263803681,
512
+ "grad_norm": 1.3863327829569763,
513
+ "learning_rate": 9.42000847338996e-05,
514
+ "loss": 1.9529,
515
+ "step": 70
516
+ },
517
+ {
518
+ "epoch": 0.21779141104294478,
519
+ "grad_norm": 0.3507002144386185,
520
+ "learning_rate": 9.397114317029975e-05,
521
+ "loss": 1.9561,
522
+ "step": 71
523
+ },
524
+ {
525
+ "epoch": 0.22085889570552147,
526
+ "grad_norm": 0.26047398296778806,
527
+ "learning_rate": 9.373809393029654e-05,
528
+ "loss": 1.9666,
529
+ "step": 72
530
+ },
531
+ {
532
+ "epoch": 0.22392638036809817,
533
+ "grad_norm": 0.31142946623961487,
534
+ "learning_rate": 9.350096157798505e-05,
535
+ "loss": 1.9669,
536
+ "step": 73
537
+ },
538
+ {
539
+ "epoch": 0.22699386503067484,
540
+ "grad_norm": 0.6059103096641723,
541
+ "learning_rate": 9.325977110783264e-05,
542
+ "loss": 1.8732,
543
+ "step": 74
544
+ },
545
+ {
546
+ "epoch": 0.23006134969325154,
547
+ "grad_norm": 0.2988013721693877,
548
+ "learning_rate": 9.301454794204464e-05,
549
+ "loss": 1.9106,
550
+ "step": 75
551
+ },
552
+ {
553
+ "epoch": 0.2331288343558282,
554
+ "grad_norm": 0.3322046656491888,
555
+ "learning_rate": 9.276531792788471e-05,
556
+ "loss": 1.9082,
557
+ "step": 76
558
+ },
559
+ {
560
+ "epoch": 0.2361963190184049,
561
+ "grad_norm": 0.4251032871261752,
562
+ "learning_rate": 9.251210733495039e-05,
563
+ "loss": 1.873,
564
+ "step": 77
565
+ },
566
+ {
567
+ "epoch": 0.2392638036809816,
568
+ "grad_norm": 0.5316920231449993,
569
+ "learning_rate": 9.225494285240432e-05,
570
+ "loss": 1.9237,
571
+ "step": 78
572
+ },
573
+ {
574
+ "epoch": 0.24233128834355827,
575
+ "grad_norm": 0.3879744017362554,
576
+ "learning_rate": 9.199385158616103e-05,
577
+ "loss": 1.9097,
578
+ "step": 79
579
+ },
580
+ {
581
+ "epoch": 0.24539877300613497,
582
+ "grad_norm": 0.34345641723744996,
583
+ "learning_rate": 9.172886105602998e-05,
584
+ "loss": 1.8854,
585
+ "step": 80
586
+ },
587
+ {
588
+ "epoch": 0.24846625766871167,
589
+ "grad_norm": 0.28939057442749516,
590
+ "learning_rate": 9.145999919281481e-05,
591
+ "loss": 1.8964,
592
+ "step": 81
593
+ },
594
+ {
595
+ "epoch": 0.25153374233128833,
596
+ "grad_norm": 1.3304291601448779,
597
+ "learning_rate": 9.118729433536938e-05,
598
+ "loss": 1.9008,
599
+ "step": 82
600
+ },
601
+ {
602
+ "epoch": 0.254601226993865,
603
+ "grad_norm": 0.31217347045844684,
604
+ "learning_rate": 9.091077522761079e-05,
605
+ "loss": 1.9452,
606
+ "step": 83
607
+ },
608
+ {
609
+ "epoch": 0.25766871165644173,
610
+ "grad_norm": 0.437112787156602,
611
+ "learning_rate": 9.063047101548962e-05,
612
+ "loss": 1.8645,
613
+ "step": 84
614
+ },
615
+ {
616
+ "epoch": 0.2607361963190184,
617
+ "grad_norm": 0.29101868827151584,
618
+ "learning_rate": 9.034641124391795e-05,
619
+ "loss": 1.9555,
620
+ "step": 85
621
+ },
622
+ {
623
+ "epoch": 0.26380368098159507,
624
+ "grad_norm": 0.3581357829575129,
625
+ "learning_rate": 9.005862585365517e-05,
626
+ "loss": 1.8963,
627
+ "step": 86
628
+ },
629
+ {
630
+ "epoch": 0.2668711656441718,
631
+ "grad_norm": 0.2870730838141048,
632
+ "learning_rate": 8.976714517815216e-05,
633
+ "loss": 1.9004,
634
+ "step": 87
635
+ },
636
+ {
637
+ "epoch": 0.26993865030674846,
638
+ "grad_norm": 0.432917577879272,
639
+ "learning_rate": 8.947199994035401e-05,
640
+ "loss": 1.9512,
641
+ "step": 88
642
+ },
643
+ {
644
+ "epoch": 0.27300613496932513,
645
+ "grad_norm": 0.2818163590615669,
646
+ "learning_rate": 8.917322124946182e-05,
647
+ "loss": 1.951,
648
+ "step": 89
649
+ },
650
+ {
651
+ "epoch": 0.27607361963190186,
652
+ "grad_norm": 0.35253042451634276,
653
+ "learning_rate": 8.88708405976536e-05,
654
+ "loss": 1.8632,
655
+ "step": 90
656
+ },
657
+ {
658
+ "epoch": 0.2791411042944785,
659
+ "grad_norm": 0.2590173941857926,
660
+ "learning_rate": 8.856488985676495e-05,
661
+ "loss": 1.9345,
662
+ "step": 91
663
+ },
664
+ {
665
+ "epoch": 0.2822085889570552,
666
+ "grad_norm": 0.27658536342174034,
667
+ "learning_rate": 8.825540127492967e-05,
668
+ "loss": 1.9323,
669
+ "step": 92
670
+ },
671
+ {
672
+ "epoch": 0.2852760736196319,
673
+ "grad_norm": 0.4745120742354108,
674
+ "learning_rate": 8.794240747318066e-05,
675
+ "loss": 1.9018,
676
+ "step": 93
677
+ },
678
+ {
679
+ "epoch": 0.2883435582822086,
680
+ "grad_norm": 0.26070920298493305,
681
+ "learning_rate": 8.762594144201167e-05,
682
+ "loss": 1.9387,
683
+ "step": 94
684
+ },
685
+ {
686
+ "epoch": 0.29141104294478526,
687
+ "grad_norm": 0.5280391087971116,
688
+ "learning_rate": 8.73060365378999e-05,
689
+ "loss": 1.862,
690
+ "step": 95
691
+ },
692
+ {
693
+ "epoch": 0.294478527607362,
694
+ "grad_norm": 0.2507206580092369,
695
+ "learning_rate": 8.698272647979012e-05,
696
+ "loss": 1.9286,
697
+ "step": 96
698
+ },
699
+ {
700
+ "epoch": 0.29754601226993865,
701
+ "grad_norm": 0.26686171742356907,
702
+ "learning_rate": 8.665604534554075e-05,
703
+ "loss": 1.8256,
704
+ "step": 97
705
+ },
706
+ {
707
+ "epoch": 0.3006134969325153,
708
+ "grad_norm": 0.2528790515143118,
709
+ "learning_rate": 8.632602756833172e-05,
710
+ "loss": 1.9627,
711
+ "step": 98
712
+ },
713
+ {
714
+ "epoch": 0.30368098159509205,
715
+ "grad_norm": 0.3485782871675419,
716
+ "learning_rate": 8.599270793303524e-05,
717
+ "loss": 1.8465,
718
+ "step": 99
719
+ },
720
+ {
721
+ "epoch": 0.3067484662576687,
722
+ "grad_norm": 0.26793745211248754,
723
+ "learning_rate": 8.565612157254943e-05,
724
+ "loss": 1.8918,
725
+ "step": 100
726
+ },
727
+ {
728
+ "epoch": 0.3098159509202454,
729
+ "grad_norm": 0.25037629545985934,
730
+ "learning_rate": 8.531630396409507e-05,
731
+ "loss": 1.8663,
732
+ "step": 101
733
+ },
734
+ {
735
+ "epoch": 0.3128834355828221,
736
+ "grad_norm": 0.2592216678438039,
737
+ "learning_rate": 8.497329092547627e-05,
738
+ "loss": 1.9302,
739
+ "step": 102
740
+ },
741
+ {
742
+ "epoch": 0.3159509202453988,
743
+ "grad_norm": 0.26334854065125896,
744
+ "learning_rate": 8.46271186113051e-05,
745
+ "loss": 1.8775,
746
+ "step": 103
747
+ },
748
+ {
749
+ "epoch": 0.31901840490797545,
750
+ "grad_norm": 0.2626800828290798,
751
+ "learning_rate": 8.42778235091909e-05,
752
+ "loss": 1.9522,
753
+ "step": 104
754
+ },
755
+ {
756
+ "epoch": 0.3220858895705521,
757
+ "grad_norm": 0.24256073020090993,
758
+ "learning_rate": 8.392544243589427e-05,
759
+ "loss": 1.9295,
760
+ "step": 105
761
+ },
762
+ {
763
+ "epoch": 0.32515337423312884,
764
+ "grad_norm": 0.2484627790629833,
765
+ "learning_rate": 8.357001253344653e-05,
766
+ "loss": 1.9287,
767
+ "step": 106
768
+ },
769
+ {
770
+ "epoch": 0.3282208588957055,
771
+ "grad_norm": 0.31955912356468386,
772
+ "learning_rate": 8.32115712652348e-05,
773
+ "loss": 1.9886,
774
+ "step": 107
775
+ },
776
+ {
777
+ "epoch": 0.3312883435582822,
778
+ "grad_norm": 0.2434642052279205,
779
+ "learning_rate": 8.285015641205325e-05,
780
+ "loss": 1.9623,
781
+ "step": 108
782
+ },
783
+ {
784
+ "epoch": 0.3343558282208589,
785
+ "grad_norm": 0.28552157930226957,
786
+ "learning_rate": 8.248580606812096e-05,
787
+ "loss": 1.8705,
788
+ "step": 109
789
+ },
790
+ {
791
+ "epoch": 0.3374233128834356,
792
+ "grad_norm": 0.27716036272992295,
793
+ "learning_rate": 8.211855863706654e-05,
794
+ "loss": 1.8958,
795
+ "step": 110
796
+ },
797
+ {
798
+ "epoch": 0.34049079754601225,
799
+ "grad_norm": 0.40776621930987433,
800
+ "learning_rate": 8.174845282788041e-05,
801
+ "loss": 1.9219,
802
+ "step": 111
803
+ },
804
+ {
805
+ "epoch": 0.34355828220858897,
806
+ "grad_norm": 0.27546145956009194,
807
+ "learning_rate": 8.137552765083466e-05,
808
+ "loss": 1.8948,
809
+ "step": 112
810
+ },
811
+ {
812
+ "epoch": 0.34662576687116564,
813
+ "grad_norm": 0.2463745150403918,
814
+ "learning_rate": 8.09998224133713e-05,
815
+ "loss": 1.907,
816
+ "step": 113
817
+ },
818
+ {
819
+ "epoch": 0.3496932515337423,
820
+ "grad_norm": 0.2530717713867962,
821
+ "learning_rate": 8.062137671595911e-05,
822
+ "loss": 1.8945,
823
+ "step": 114
824
+ },
825
+ {
826
+ "epoch": 0.35276073619631904,
827
+ "grad_norm": 0.26804689577846247,
828
+ "learning_rate": 8.024023044791964e-05,
829
+ "loss": 1.8984,
830
+ "step": 115
831
+ },
832
+ {
833
+ "epoch": 0.3558282208588957,
834
+ "grad_norm": 0.2922869142073029,
835
+ "learning_rate": 7.985642378322276e-05,
836
+ "loss": 1.9499,
837
+ "step": 116
838
+ },
839
+ {
840
+ "epoch": 0.3588957055214724,
841
+ "grad_norm": 0.2302050850660013,
842
+ "learning_rate": 7.946999717625221e-05,
843
+ "loss": 1.9398,
844
+ "step": 117
845
+ },
846
+ {
847
+ "epoch": 0.3619631901840491,
848
+ "grad_norm": 0.4179152288704764,
849
+ "learning_rate": 7.908099135754152e-05,
850
+ "loss": 1.909,
851
+ "step": 118
852
+ },
853
+ {
854
+ "epoch": 0.36503067484662577,
855
+ "grad_norm": 0.2448034947982603,
856
+ "learning_rate": 7.868944732948101e-05,
857
+ "loss": 1.9202,
858
+ "step": 119
859
+ },
860
+ {
861
+ "epoch": 0.36809815950920244,
862
+ "grad_norm": 0.3642159637354568,
863
+ "learning_rate": 7.829540636199591e-05,
864
+ "loss": 1.9188,
865
+ "step": 120
866
+ },
867
+ {
868
+ "epoch": 0.37116564417177916,
869
+ "grad_norm": 0.2751031027135651,
870
+ "learning_rate": 7.789890998819643e-05,
871
+ "loss": 1.8903,
872
+ "step": 121
873
+ },
874
+ {
875
+ "epoch": 0.37423312883435583,
876
+ "grad_norm": 0.2519348027896112,
877
+ "learning_rate": 7.75e-05,
878
+ "loss": 1.9422,
879
+ "step": 122
880
+ },
881
+ {
882
+ "epoch": 0.3773006134969325,
883
+ "grad_norm": 0.2724753380540709,
884
+ "learning_rate": 7.709871844372639e-05,
885
+ "loss": 1.9314,
886
+ "step": 123
887
+ },
888
+ {
889
+ "epoch": 0.3803680981595092,
890
+ "grad_norm": 0.2831411354349516,
891
+ "learning_rate": 7.669510761566571e-05,
892
+ "loss": 1.8467,
893
+ "step": 124
894
+ },
895
+ {
896
+ "epoch": 0.3834355828220859,
897
+ "grad_norm": 0.34065192298819646,
898
+ "learning_rate": 7.628921005762047e-05,
899
+ "loss": 1.9109,
900
+ "step": 125
901
+ },
902
+ {
903
+ "epoch": 0.38650306748466257,
904
+ "grad_norm": 0.2744987049992245,
905
+ "learning_rate": 7.588106855242135e-05,
906
+ "loss": 1.8961,
907
+ "step": 126
908
+ },
909
+ {
910
+ "epoch": 0.3895705521472393,
911
+ "grad_norm": 0.24972903865472293,
912
+ "learning_rate": 7.547072611941795e-05,
913
+ "loss": 1.9183,
914
+ "step": 127
915
+ },
916
+ {
917
+ "epoch": 0.39263803680981596,
918
+ "grad_norm": 0.2717954573790397,
919
+ "learning_rate": 7.505822600994424e-05,
920
+ "loss": 1.9925,
921
+ "step": 128
922
+ },
923
+ {
924
+ "epoch": 0.39570552147239263,
925
+ "grad_norm": 0.2710599653280406,
926
+ "learning_rate": 7.46436117027598e-05,
927
+ "loss": 1.9588,
928
+ "step": 129
929
+ },
930
+ {
931
+ "epoch": 0.3987730061349693,
932
+ "grad_norm": 0.3038954677693998,
933
+ "learning_rate": 7.422692689946714e-05,
934
+ "loss": 1.9182,
935
+ "step": 130
936
+ },
937
+ {
938
+ "epoch": 0.401840490797546,
939
+ "grad_norm": 0.2587552748890865,
940
+ "learning_rate": 7.380821551990525e-05,
941
+ "loss": 1.9383,
942
+ "step": 131
943
+ },
944
+ {
945
+ "epoch": 0.4049079754601227,
946
+ "grad_norm": 0.25905002770576757,
947
+ "learning_rate": 7.338752169752042e-05,
948
+ "loss": 1.9514,
949
+ "step": 132
950
+ },
951
+ {
952
+ "epoch": 0.4049079754601227,
953
+ "eval_loss": 2.577134370803833,
954
+ "eval_runtime": 55.6924,
955
+ "eval_samples_per_second": 1.796,
956
+ "eval_steps_per_second": 0.126,
957
+ "step": 132
958
  }
959
  ],
960
  "logging_steps": 1,
 
974
  "attributes": {}
975
  }
976
  },
977
+ "total_flos": 144143397421056.0,
978
  "train_batch_size": 2,
979
  "trial_name": null,
980
  "trial_params": null