Romain-XV committed on
Commit 4644a6a · verified · 1 Parent(s): 96c0881

Training in progress, step 600, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1d35af2216bb44e6085bcbb7deb56cc0f230fb27bb1bb0c0c8c2b52dad52893c
+ oid sha256:ec52b6dac11e4d703e461b57dbd96f63fdad826c54925c92c4a9ed3940c7f925
  size 50358592
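adapter_model.safetensors is the weight file PEFT writes for a LoRA-style adapter, so this checkpoint would normally be attached on top of its base model. A heavily hedged sketch, assuming a causal-LM task; the base model is not named anywhere in this commit, so the identifier below is only a placeholder:

from transformers import AutoModelForCausalLM
from peft import PeftModel

# Placeholder identifier -- the base model is NOT specified in this commit.
base = AutoModelForCausalLM.from_pretrained("BASE_MODEL_ID")

# Attach the adapter weights stored in the checkpoint directory
# (assumes adapter_config.json sits alongside adapter_model.safetensors).
model = PeftModel.from_pretrained(base, "last-checkpoint")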
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9f6b727df5e3cf52d32a08cc3af718bb52f16b143a0fe10b3781327db01b8c6c
+ oid sha256:8f992e9bf2526250ac48863a69bc14c754ff3750575d47e4dfdac1bf35fa0d94
  size 25785082
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:032509b8d229993b63bf6eb5ce58592a4165e6eaa97f7e78cf084fd8dfd42e62
+ oid sha256:82f393434236f666cbb4f691051257e303661800a0f9787f5942457107e1ccaa
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:de9d2b2a7ed3e3373e7769cf999e7079547a9a97d1513882ecf425b351ddca4b
+ oid sha256:beb02bcc76a1d18125c30bb2c994848252fc0f0f039db286b9a31c46c82cab52
  size 1064
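Each of the four files above is stored through Git LFS, so the diff only touches the pointer: a sha256 oid plus a byte size. A minimal sketch (the local path is illustrative, not part of this commit) for confirming that a downloaded file matches the pointer recorded here:

import hashlib
import os

def matches_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    """Recompute sha256 and size of a local file and compare to an LFS pointer."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid and os.path.getsize(path) == expected_size

# Values taken from the adapter_model.safetensors pointer in this commit.
print(matches_lfs_pointer(
    "last-checkpoint/adapter_model.safetensors",
    "ec52b6dac11e4d703e461b57dbd96f63fdad826c54925c92c4a9ed3940c7f925",
    50358592,
))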
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": 2.0830085277557373,
  "best_model_checkpoint": "miner_id_24/checkpoint-400",
- "epoch": 0.05976393246675631,
+ "epoch": 0.07171671896010758,
  "eval_steps": 100,
- "global_step": 500,
+ "global_step": 600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -3555,6 +3555,714 @@
  "eval_samples_per_second": 40.76,
  "eval_steps_per_second": 10.19,
  "step": 500
3558
+ },
3559
+ {
3560
+ "epoch": 0.05988346033168983,
3561
+ "grad_norm": 6.650758743286133,
3562
+ "learning_rate": 0.0002484064591485923,
3563
+ "loss": 16.1606,
3564
+ "step": 501
3565
+ },
3566
+ {
3567
+ "epoch": 0.060002988196623336,
3568
+ "grad_norm": 6.113708972930908,
3569
+ "learning_rate": 0.0002483999754345083,
3570
+ "loss": 17.3134,
3571
+ "step": 502
3572
+ },
3573
+ {
3574
+ "epoch": 0.06012251606155685,
3575
+ "grad_norm": 6.330297946929932,
3576
+ "learning_rate": 0.0002483934786418565,
3577
+ "loss": 16.6002,
3578
+ "step": 503
3579
+ },
3580
+ {
3581
+ "epoch": 0.060242043926490366,
3582
+ "grad_norm": 6.722366809844971,
3583
+ "learning_rate": 0.0002483869687713254,
3584
+ "loss": 16.2453,
3585
+ "step": 504
3586
+ },
3587
+ {
3588
+ "epoch": 0.060361571791423874,
3589
+ "grad_norm": 6.331783771514893,
3590
+ "learning_rate": 0.000248380445823605,
3591
+ "loss": 17.3577,
3592
+ "step": 505
3593
+ },
3594
+ {
3595
+ "epoch": 0.06048109965635739,
3596
+ "grad_norm": 5.92828369140625,
3597
+ "learning_rate": 0.00024837390979938674,
3598
+ "loss": 16.2485,
3599
+ "step": 506
3600
+ },
3601
+ {
3602
+ "epoch": 0.0606006275212909,
3603
+ "grad_norm": 13.758573532104492,
3604
+ "learning_rate": 0.0002483673606993632,
3605
+ "loss": 16.9102,
3606
+ "step": 507
3607
+ },
3608
+ {
3609
+ "epoch": 0.06072015538622441,
3610
+ "grad_norm": 5.667209625244141,
3611
+ "learning_rate": 0.00024836079852422855,
3612
+ "loss": 15.9809,
3613
+ "step": 508
3614
+ },
3615
+ {
3616
+ "epoch": 0.060839683251157926,
3617
+ "grad_norm": 6.230560779571533,
3618
+ "learning_rate": 0.00024835422327467826,
3619
+ "loss": 16.8764,
3620
+ "step": 509
3621
+ },
3622
+ {
3623
+ "epoch": 0.06095921111609144,
3624
+ "grad_norm": 6.252189636230469,
3625
+ "learning_rate": 0.00024834763495140927,
3626
+ "loss": 16.3699,
3627
+ "step": 510
3628
+ },
3629
+ {
3630
+ "epoch": 0.06107873898102495,
3631
+ "grad_norm": 6.850605010986328,
3632
+ "learning_rate": 0.00024834103355511974,
3633
+ "loss": 18.5964,
3634
+ "step": 511
3635
+ },
3636
+ {
3637
+ "epoch": 0.061198266845958464,
3638
+ "grad_norm": 6.297305107116699,
3639
+ "learning_rate": 0.0002483344190865094,
3640
+ "loss": 16.974,
3641
+ "step": 512
3642
+ },
3643
+ {
3644
+ "epoch": 0.06131779471089198,
3645
+ "grad_norm": 6.262433052062988,
3646
+ "learning_rate": 0.00024832779154627927,
3647
+ "loss": 17.2464,
3648
+ "step": 513
3649
+ },
3650
+ {
3651
+ "epoch": 0.06143732257582549,
3652
+ "grad_norm": 6.210182189941406,
3653
+ "learning_rate": 0.00024832115093513177,
3654
+ "loss": 17.2708,
3655
+ "step": 514
3656
+ },
3657
+ {
3658
+ "epoch": 0.061556850440759,
3659
+ "grad_norm": 6.332021236419678,
3660
+ "learning_rate": 0.0002483144972537706,
3661
+ "loss": 17.5522,
3662
+ "step": 515
3663
+ },
3664
+ {
3665
+ "epoch": 0.06167637830569252,
3666
+ "grad_norm": 6.204654216766357,
3667
+ "learning_rate": 0.00024830783050290117,
3668
+ "loss": 17.1688,
3669
+ "step": 516
3670
+ },
3671
+ {
3672
+ "epoch": 0.061795906170626025,
3673
+ "grad_norm": 6.059911251068115,
3674
+ "learning_rate": 0.00024830115068322987,
3675
+ "loss": 16.7284,
3676
+ "step": 517
3677
+ },
3678
+ {
3679
+ "epoch": 0.06191543403555954,
3680
+ "grad_norm": 6.756778240203857,
3681
+ "learning_rate": 0.00024829445779546476,
3682
+ "loss": 17.4355,
3683
+ "step": 518
3684
+ },
3685
+ {
3686
+ "epoch": 0.062034961900493055,
3687
+ "grad_norm": 6.644315719604492,
3688
+ "learning_rate": 0.0002482877518403152,
3689
+ "loss": 17.4694,
3690
+ "step": 519
3691
+ },
3692
+ {
3693
+ "epoch": 0.06215448976542656,
3694
+ "grad_norm": 6.000011920928955,
3695
+ "learning_rate": 0.00024828103281849184,
3696
+ "loss": 16.1155,
3697
+ "step": 520
3698
+ },
3699
+ {
3700
+ "epoch": 0.06227401763036008,
3701
+ "grad_norm": 6.423489570617676,
3702
+ "learning_rate": 0.0002482743007307068,
3703
+ "loss": 17.8717,
3704
+ "step": 521
3705
+ },
3706
+ {
3707
+ "epoch": 0.06239354549529359,
3708
+ "grad_norm": 6.263865947723389,
3709
+ "learning_rate": 0.00024826755557767364,
3710
+ "loss": 16.6036,
3711
+ "step": 522
3712
+ },
3713
+ {
3714
+ "epoch": 0.0625130733602271,
3715
+ "grad_norm": 6.165550231933594,
3716
+ "learning_rate": 0.0002482607973601072,
3717
+ "loss": 16.4604,
3718
+ "step": 523
3719
+ },
3720
+ {
3721
+ "epoch": 0.06263260122516062,
3722
+ "grad_norm": 6.165558338165283,
3723
+ "learning_rate": 0.0002482540260787238,
3724
+ "loss": 17.6127,
3725
+ "step": 524
3726
+ },
3727
+ {
3728
+ "epoch": 0.06275212909009413,
3729
+ "grad_norm": 6.105308532714844,
3730
+ "learning_rate": 0.0002482472417342411,
3731
+ "loss": 16.3942,
3732
+ "step": 525
3733
+ },
3734
+ {
3735
+ "epoch": 0.06287165695502764,
3736
+ "grad_norm": 6.943295955657959,
3737
+ "learning_rate": 0.00024824044432737805,
3738
+ "loss": 16.9056,
3739
+ "step": 526
3740
+ },
3741
+ {
3742
+ "epoch": 0.06299118481996116,
3743
+ "grad_norm": 6.2281365394592285,
3744
+ "learning_rate": 0.00024823363385885515,
3745
+ "loss": 16.3102,
3746
+ "step": 527
3747
+ },
3748
+ {
3749
+ "epoch": 0.06311071268489467,
3750
+ "grad_norm": 6.561606407165527,
3751
+ "learning_rate": 0.0002482268103293942,
3752
+ "loss": 17.055,
3753
+ "step": 528
3754
+ },
3755
+ {
3756
+ "epoch": 0.06323024054982818,
3757
+ "grad_norm": 6.460758209228516,
3758
+ "learning_rate": 0.0002482199737397184,
3759
+ "loss": 15.1712,
3760
+ "step": 529
3761
+ },
3762
+ {
3763
+ "epoch": 0.0633497684147617,
3764
+ "grad_norm": 6.111406326293945,
3765
+ "learning_rate": 0.0002482131240905523,
3766
+ "loss": 15.0887,
3767
+ "step": 530
3768
+ },
3769
+ {
3770
+ "epoch": 0.0634692962796952,
3771
+ "grad_norm": 6.305261135101318,
3772
+ "learning_rate": 0.0002482062613826219,
3773
+ "loss": 16.2783,
3774
+ "step": 531
3775
+ },
3776
+ {
3777
+ "epoch": 0.06358882414462871,
3778
+ "grad_norm": 6.447376728057861,
3779
+ "learning_rate": 0.00024819938561665444,
3780
+ "loss": 15.3546,
3781
+ "step": 532
3782
+ },
3783
+ {
3784
+ "epoch": 0.06370835200956224,
3785
+ "grad_norm": 6.626522064208984,
3786
+ "learning_rate": 0.0002481924967933788,
3787
+ "loss": 16.1421,
3788
+ "step": 533
3789
+ },
3790
+ {
3791
+ "epoch": 0.06382787987449574,
3792
+ "grad_norm": 6.10511589050293,
3793
+ "learning_rate": 0.00024818559491352496,
3794
+ "loss": 17.1937,
3795
+ "step": 534
3796
+ },
3797
+ {
3798
+ "epoch": 0.06394740773942925,
3799
+ "grad_norm": 6.697993278503418,
3800
+ "learning_rate": 0.00024817867997782453,
3801
+ "loss": 17.1077,
3802
+ "step": 535
3803
+ },
3804
+ {
3805
+ "epoch": 0.06406693560436277,
3806
+ "grad_norm": 6.638538837432861,
3807
+ "learning_rate": 0.0002481717519870103,
3808
+ "loss": 16.2386,
3809
+ "step": 536
3810
+ },
3811
+ {
3812
+ "epoch": 0.06418646346929628,
3813
+ "grad_norm": 8.689882278442383,
3814
+ "learning_rate": 0.00024816481094181656,
3815
+ "loss": 15.8653,
3816
+ "step": 537
3817
+ },
3818
+ {
3819
+ "epoch": 0.06430599133422979,
3820
+ "grad_norm": 5.992562294006348,
3821
+ "learning_rate": 0.00024815785684297905,
3822
+ "loss": 16.4454,
3823
+ "step": 538
3824
+ },
3825
+ {
3826
+ "epoch": 0.06442551919916331,
3827
+ "grad_norm": 6.0935139656066895,
3828
+ "learning_rate": 0.0002481508896912346,
3829
+ "loss": 15.9363,
3830
+ "step": 539
3831
+ },
3832
+ {
3833
+ "epoch": 0.06454504706409682,
3834
+ "grad_norm": 6.106266021728516,
3835
+ "learning_rate": 0.00024814390948732187,
3836
+ "loss": 16.1408,
3837
+ "step": 540
3838
+ },
3839
+ {
3840
+ "epoch": 0.06466457492903033,
3841
+ "grad_norm": 5.9442830085754395,
3842
+ "learning_rate": 0.00024813691623198046,
3843
+ "loss": 16.451,
3844
+ "step": 541
3845
+ },
3846
+ {
3847
+ "epoch": 0.06478410279396385,
3848
+ "grad_norm": 6.229999542236328,
3849
+ "learning_rate": 0.0002481299099259517,
3850
+ "loss": 15.3861,
3851
+ "step": 542
3852
+ },
3853
+ {
3854
+ "epoch": 0.06490363065889736,
3855
+ "grad_norm": 6.283480167388916,
3856
+ "learning_rate": 0.000248122890569978,
3857
+ "loss": 16.3622,
3858
+ "step": 543
3859
+ },
3860
+ {
3861
+ "epoch": 0.06502315852383087,
3862
+ "grad_norm": 5.996384620666504,
3863
+ "learning_rate": 0.0002481158581648034,
3864
+ "loss": 15.9675,
3865
+ "step": 544
3866
+ },
3867
+ {
3868
+ "epoch": 0.06514268638876439,
3869
+ "grad_norm": 5.98213529586792,
3870
+ "learning_rate": 0.00024810881271117324,
3871
+ "loss": 15.637,
3872
+ "step": 545
3873
+ },
3874
+ {
3875
+ "epoch": 0.0652622142536979,
3876
+ "grad_norm": 6.469746112823486,
3877
+ "learning_rate": 0.0002481017542098342,
3878
+ "loss": 16.3619,
3879
+ "step": 546
3880
+ },
3881
+ {
3882
+ "epoch": 0.0653817421186314,
3883
+ "grad_norm": 6.57551383972168,
3884
+ "learning_rate": 0.0002480946826615344,
3885
+ "loss": 17.0118,
3886
+ "step": 547
3887
+ },
3888
+ {
3889
+ "epoch": 0.06550126998356492,
3890
+ "grad_norm": 6.5282182693481445,
3891
+ "learning_rate": 0.0002480875980670233,
3892
+ "loss": 18.4883,
3893
+ "step": 548
3894
+ },
3895
+ {
3896
+ "epoch": 0.06562079784849843,
3897
+ "grad_norm": 6.027569770812988,
3898
+ "learning_rate": 0.0002480805004270518,
3899
+ "loss": 15.2508,
3900
+ "step": 549
3901
+ },
3902
+ {
3903
+ "epoch": 0.06574032571343194,
3904
+ "grad_norm": 6.519893646240234,
3905
+ "learning_rate": 0.0002480733897423721,
3906
+ "loss": 17.2495,
3907
+ "step": 550
3908
+ },
3909
+ {
3910
+ "epoch": 0.06585985357836546,
3911
+ "grad_norm": 5.971823215484619,
3912
+ "learning_rate": 0.0002480662660137378,
3913
+ "loss": 15.5504,
3914
+ "step": 551
3915
+ },
3916
+ {
3917
+ "epoch": 0.06597938144329897,
3918
+ "grad_norm": 6.29182243347168,
3919
+ "learning_rate": 0.000248059129241904,
3920
+ "loss": 15.4693,
3921
+ "step": 552
3922
+ },
3923
+ {
3924
+ "epoch": 0.06609890930823248,
3925
+ "grad_norm": 6.490065574645996,
3926
+ "learning_rate": 0.000248051979427627,
3927
+ "loss": 15.931,
3928
+ "step": 553
3929
+ },
3930
+ {
3931
+ "epoch": 0.066218437173166,
3932
+ "grad_norm": 6.518825531005859,
3933
+ "learning_rate": 0.00024804481657166467,
3934
+ "loss": 16.3236,
3935
+ "step": 554
3936
+ },
3937
+ {
3938
+ "epoch": 0.06633796503809951,
3939
+ "grad_norm": 6.617817401885986,
3940
+ "learning_rate": 0.0002480376406747761,
3941
+ "loss": 15.8417,
3942
+ "step": 555
3943
+ },
3944
+ {
3945
+ "epoch": 0.06645749290303302,
3946
+ "grad_norm": 6.263620376586914,
3947
+ "learning_rate": 0.0002480304517377218,
3948
+ "loss": 16.4035,
3949
+ "step": 556
3950
+ },
3951
+ {
3952
+ "epoch": 0.06657702076796654,
3953
+ "grad_norm": 6.821305274963379,
3954
+ "learning_rate": 0.0002480232497612638,
3955
+ "loss": 16.9507,
3956
+ "step": 557
3957
+ },
3958
+ {
3959
+ "epoch": 0.06669654863290005,
3960
+ "grad_norm": 6.247030735015869,
3961
+ "learning_rate": 0.0002480160347461653,
3962
+ "loss": 16.5576,
3963
+ "step": 558
3964
+ },
3965
+ {
3966
+ "epoch": 0.06681607649783355,
3967
+ "grad_norm": 6.491495132446289,
3968
+ "learning_rate": 0.00024800880669319106,
3969
+ "loss": 18.7603,
3970
+ "step": 559
3971
+ },
3972
+ {
3973
+ "epoch": 0.06693560436276708,
3974
+ "grad_norm": 6.250596523284912,
3975
+ "learning_rate": 0.0002480015656031071,
3976
+ "loss": 15.7767,
3977
+ "step": 560
3978
+ },
3979
+ {
3980
+ "epoch": 0.06705513222770058,
3981
+ "grad_norm": 6.2819390296936035,
3982
+ "learning_rate": 0.0002479943114766808,
3983
+ "loss": 15.8,
3984
+ "step": 561
3985
+ },
3986
+ {
3987
+ "epoch": 0.06717466009263409,
3988
+ "grad_norm": 5.973819732666016,
3989
+ "learning_rate": 0.0002479870443146811,
3990
+ "loss": 15.6882,
3991
+ "step": 562
3992
+ },
3993
+ {
3994
+ "epoch": 0.06729418795756761,
3995
+ "grad_norm": 6.051755428314209,
3996
+ "learning_rate": 0.0002479797641178782,
3997
+ "loss": 16.4719,
3998
+ "step": 563
3999
+ },
4000
+ {
4001
+ "epoch": 0.06741371582250112,
4002
+ "grad_norm": 6.508502960205078,
4003
+ "learning_rate": 0.00024797247088704366,
4004
+ "loss": 16.9064,
4005
+ "step": 564
4006
+ },
4007
+ {
4008
+ "epoch": 0.06753324368743463,
4009
+ "grad_norm": 6.121264934539795,
4010
+ "learning_rate": 0.0002479651646229505,
4011
+ "loss": 16.2867,
4012
+ "step": 565
4013
+ },
4014
+ {
4015
+ "epoch": 0.06765277155236815,
4016
+ "grad_norm": 6.16292667388916,
4017
+ "learning_rate": 0.00024795784532637296,
4018
+ "loss": 15.8647,
4019
+ "step": 566
4020
+ },
4021
+ {
4022
+ "epoch": 0.06777229941730166,
4023
+ "grad_norm": 6.289575099945068,
4024
+ "learning_rate": 0.0002479505129980869,
4025
+ "loss": 16.5442,
4026
+ "step": 567
4027
+ },
4028
+ {
4029
+ "epoch": 0.06789182728223517,
4030
+ "grad_norm": 6.375883102416992,
4031
+ "learning_rate": 0.0002479431676388694,
4032
+ "loss": 16.143,
4033
+ "step": 568
4034
+ },
4035
+ {
4036
+ "epoch": 0.06801135514716869,
4037
+ "grad_norm": 6.734157562255859,
4038
+ "learning_rate": 0.00024793580924949897,
4039
+ "loss": 16.5816,
4040
+ "step": 569
4041
+ },
4042
+ {
4043
+ "epoch": 0.0681308830121022,
4044
+ "grad_norm": 5.916708469390869,
4045
+ "learning_rate": 0.00024792843783075544,
4046
+ "loss": 15.6337,
4047
+ "step": 570
4048
+ },
4049
+ {
4050
+ "epoch": 0.0682504108770357,
4051
+ "grad_norm": 5.987206935882568,
4052
+ "learning_rate": 0.00024792105338342015,
4053
+ "loss": 16.1654,
4054
+ "step": 571
4055
+ },
4056
+ {
4057
+ "epoch": 0.06836993874196923,
4058
+ "grad_norm": 7.18080997467041,
4059
+ "learning_rate": 0.00024791365590827566,
4060
+ "loss": 17.0761,
4061
+ "step": 572
4062
+ },
4063
+ {
4064
+ "epoch": 0.06848946660690274,
4065
+ "grad_norm": 5.998250961303711,
4066
+ "learning_rate": 0.000247906245406106,
4067
+ "loss": 15.8114,
4068
+ "step": 573
4069
+ },
4070
+ {
4071
+ "epoch": 0.06860899447183624,
4072
+ "grad_norm": 5.987746715545654,
4073
+ "learning_rate": 0.0002478988218776967,
4074
+ "loss": 16.9754,
4075
+ "step": 574
4076
+ },
4077
+ {
4078
+ "epoch": 0.06872852233676977,
4079
+ "grad_norm": 6.771132469177246,
4080
+ "learning_rate": 0.0002478913853238344,
4081
+ "loss": 16.317,
4082
+ "step": 575
4083
+ },
4084
+ {
4085
+ "epoch": 0.06884805020170327,
4086
+ "grad_norm": 6.411704063415527,
4087
+ "learning_rate": 0.00024788393574530726,
4088
+ "loss": 17.1471,
4089
+ "step": 576
4090
+ },
4091
+ {
4092
+ "epoch": 0.06896757806663678,
4093
+ "grad_norm": 6.628751754760742,
4094
+ "learning_rate": 0.0002478764731429049,
4095
+ "loss": 16.4329,
4096
+ "step": 577
4097
+ },
4098
+ {
4099
+ "epoch": 0.0690871059315703,
4100
+ "grad_norm": 7.8661651611328125,
4101
+ "learning_rate": 0.00024786899751741827,
4102
+ "loss": 15.7631,
4103
+ "step": 578
4104
+ },
4105
+ {
4106
+ "epoch": 0.06920663379650381,
4107
+ "grad_norm": 6.639915943145752,
4108
+ "learning_rate": 0.0002478615088696396,
4109
+ "loss": 16.3586,
4110
+ "step": 579
4111
+ },
4112
+ {
4113
+ "epoch": 0.06932616166143732,
4114
+ "grad_norm": 6.35560941696167,
4115
+ "learning_rate": 0.0002478540072003626,
4116
+ "loss": 17.8284,
4117
+ "step": 580
4118
+ },
4119
+ {
4120
+ "epoch": 0.06944568952637084,
4121
+ "grad_norm": 6.45347261428833,
4122
+ "learning_rate": 0.00024784649251038233,
4123
+ "loss": 16.4598,
4124
+ "step": 581
4125
+ },
4126
+ {
4127
+ "epoch": 0.06956521739130435,
4128
+ "grad_norm": 6.136906147003174,
4129
+ "learning_rate": 0.00024783896480049525,
4130
+ "loss": 16.1433,
4131
+ "step": 582
4132
+ },
4133
+ {
4134
+ "epoch": 0.06968474525623786,
4135
+ "grad_norm": 6.355082035064697,
4136
+ "learning_rate": 0.00024783142407149917,
4137
+ "loss": 17.7579,
4138
+ "step": 583
4139
+ },
4140
+ {
4141
+ "epoch": 0.06980427312117138,
4142
+ "grad_norm": 6.28627872467041,
4143
+ "learning_rate": 0.00024782387032419334,
4144
+ "loss": 16.5747,
4145
+ "step": 584
4146
+ },
4147
+ {
4148
+ "epoch": 0.06992380098610489,
4149
+ "grad_norm": 6.3861188888549805,
4150
+ "learning_rate": 0.0002478163035593783,
4151
+ "loss": 16.4684,
4152
+ "step": 585
4153
+ },
4154
+ {
4155
+ "epoch": 0.0700433288510384,
4156
+ "grad_norm": 8.057326316833496,
4157
+ "learning_rate": 0.00024780872377785603,
4158
+ "loss": 16.7268,
4159
+ "step": 586
4160
+ },
4161
+ {
4162
+ "epoch": 0.07016285671597192,
4163
+ "grad_norm": 6.8107805252075195,
4164
+ "learning_rate": 0.0002478011309804298,
4165
+ "loss": 14.9781,
4166
+ "step": 587
4167
+ },
4168
+ {
4169
+ "epoch": 0.07028238458090542,
4170
+ "grad_norm": 7.4561076164245605,
4171
+ "learning_rate": 0.0002477935251679045,
4172
+ "loss": 15.9972,
4173
+ "step": 588
4174
+ },
4175
+ {
4176
+ "epoch": 0.07040191244583893,
4177
+ "grad_norm": 6.438401222229004,
4178
+ "learning_rate": 0.0002477859063410861,
4179
+ "loss": 16.3148,
4180
+ "step": 589
4181
+ },
4182
+ {
4183
+ "epoch": 0.07052144031077245,
4184
+ "grad_norm": 6.0730299949646,
4185
+ "learning_rate": 0.0002477782745007821,
4186
+ "loss": 15.5541,
4187
+ "step": 590
4188
+ },
4189
+ {
4190
+ "epoch": 0.07064096817570596,
4191
+ "grad_norm": 6.609666347503662,
4192
+ "learning_rate": 0.00024777062964780137,
4193
+ "loss": 16.2029,
4194
+ "step": 591
4195
+ },
4196
+ {
4197
+ "epoch": 0.07076049604063947,
4198
+ "grad_norm": 6.8198652267456055,
4199
+ "learning_rate": 0.00024776297178295424,
4200
+ "loss": 17.0991,
4201
+ "step": 592
4202
+ },
4203
+ {
4204
+ "epoch": 0.07088002390557299,
4205
+ "grad_norm": 7.101873874664307,
4206
+ "learning_rate": 0.0002477553009070522,
4207
+ "loss": 15.2356,
4208
+ "step": 593
4209
+ },
4210
+ {
4211
+ "epoch": 0.0709995517705065,
4212
+ "grad_norm": 6.614659786224365,
4213
+ "learning_rate": 0.0002477476170209083,
4214
+ "loss": 16.8781,
4215
+ "step": 594
4216
+ },
4217
+ {
4218
+ "epoch": 0.07111907963544001,
4219
+ "grad_norm": 9.546778678894043,
4220
+ "learning_rate": 0.0002477399201253369,
4221
+ "loss": 18.754,
4222
+ "step": 595
4223
+ },
4224
+ {
4225
+ "epoch": 0.07123860750037353,
4226
+ "grad_norm": 5.819904327392578,
4227
+ "learning_rate": 0.0002477322102211538,
4228
+ "loss": 15.7819,
4229
+ "step": 596
4230
+ },
4231
+ {
4232
+ "epoch": 0.07135813536530704,
4233
+ "grad_norm": 6.660562515258789,
4234
+ "learning_rate": 0.00024772448730917614,
4235
+ "loss": 17.3348,
4236
+ "step": 597
4237
+ },
4238
+ {
4239
+ "epoch": 0.07147766323024055,
4240
+ "grad_norm": 6.330270290374756,
4241
+ "learning_rate": 0.0002477167513902224,
4242
+ "loss": 16.4988,
4243
+ "step": 598
4244
+ },
4245
+ {
4246
+ "epoch": 0.07159719109517407,
4247
+ "grad_norm": 5.982934951782227,
4248
+ "learning_rate": 0.00024770900246511246,
4249
+ "loss": 14.5084,
4250
+ "step": 599
4251
+ },
4252
+ {
4253
+ "epoch": 0.07171671896010758,
4254
+ "grad_norm": 6.04694128036499,
4255
+ "learning_rate": 0.0002477012405346676,
4256
+ "loss": 16.4352,
4257
+ "step": 600
4258
+ },
4259
+ {
4260
+ "epoch": 0.07171671896010758,
4261
+ "eval_loss": 2.0841848850250244,
4262
+ "eval_runtime": 122.6125,
4263
+ "eval_samples_per_second": 40.779,
4264
+ "eval_steps_per_second": 10.195,
4265
+ "step": 600
  }
  ],
  "logging_steps": 1,
@@ -3569,7 +4277,7 @@
  "early_stopping_threshold": 0.0
  },
  "attributes": {
- "early_stopping_patience_counter": 1
+ "early_stopping_patience_counter": 2
  }
  },
  "TrainerControl": {
@@ -3578,12 +4286,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
- "total_flos": 3.0963313026269184e+16,
+ "total_flos": 3.716990563988275e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null