Rodo-Sami commited on
Commit
3eb50c5
·
verified ·
1 Parent(s): 475cd4c

Training in progress, step 716, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:928d4a2af52f3844b56e9dadd1a91f4ee7548d341c392cf2345bbb82ccbe8a96
3
  size 59933632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a0be10fbdab56a6574767c96c6cb6d55e2096cfe6ce02be376170cb6591b455
3
  size 59933632
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53b563da5cf75dbd60884ab6adc778d596afeafdd6615f8734f35622d20ec7a7
3
  size 31823460
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34352f87b92d2f74d7d9b5a0472a2c3eddc2fa9e8a4413356e6f5508c4a7188c
3
  size 31823460
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0200747f4c46bcf39b723dc2c97f56a975a54e4bccfe8ccb9f10a4e5bd32d87
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:075e01a41ae15ae31e4905292e60381e146a6e6459533dfec8259d0b29a913d9
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba943f1464f5ccea09b9016cac6eb4a9557ad70c719c5ced24b3066b948201b3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b5f024e7c5c8c41cb2db928207237f59d43ea79815d962e48cc1e5a09368626
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.75,
5
  "eval_steps": 179,
6
- "global_step": 537,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3790,6 +3790,1267 @@
3790
  "eval_samples_per_second": 27.362,
3791
  "eval_steps_per_second": 13.704,
3792
  "step": 537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3793
  }
3794
  ],
3795
  "logging_steps": 1,
@@ -3804,12 +5065,12 @@
3804
  "should_evaluate": false,
3805
  "should_log": false,
3806
  "should_save": true,
3807
- "should_training_stop": false
3808
  },
3809
  "attributes": {}
3810
  }
3811
  },
3812
- "total_flos": 1.472683054328709e+17,
3813
  "train_batch_size": 2,
3814
  "trial_name": null,
3815
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 179,
6
+ "global_step": 716,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3790
  "eval_samples_per_second": 27.362,
3791
  "eval_steps_per_second": 13.704,
3792
  "step": 537
3793
+ },
3794
+ {
3795
+ "epoch": 0.7513966480446927,
3796
+ "grad_norm": 0.5159032940864563,
3797
+ "learning_rate": 2.9762871191687313e-05,
3798
+ "loss": 0.0998,
3799
+ "step": 538
3800
+ },
3801
+ {
3802
+ "epoch": 0.7527932960893855,
3803
+ "grad_norm": 0.5139814615249634,
3804
+ "learning_rate": 2.9446822653615614e-05,
3805
+ "loss": 0.106,
3806
+ "step": 539
3807
+ },
3808
+ {
3809
+ "epoch": 0.7541899441340782,
3810
+ "grad_norm": 0.6283978819847107,
3811
+ "learning_rate": 2.913217114705975e-05,
3812
+ "loss": 0.122,
3813
+ "step": 540
3814
+ },
3815
+ {
3816
+ "epoch": 0.755586592178771,
3817
+ "grad_norm": 0.7149221897125244,
3818
+ "learning_rate": 2.8818922902470135e-05,
3819
+ "loss": 0.1113,
3820
+ "step": 541
3821
+ },
3822
+ {
3823
+ "epoch": 0.7569832402234636,
3824
+ "grad_norm": 0.6622272729873657,
3825
+ "learning_rate": 2.850708412251103e-05,
3826
+ "loss": 0.1471,
3827
+ "step": 542
3828
+ },
3829
+ {
3830
+ "epoch": 0.7583798882681564,
3831
+ "grad_norm": 0.4957078993320465,
3832
+ "learning_rate": 2.819666098193764e-05,
3833
+ "loss": 0.1187,
3834
+ "step": 543
3835
+ },
3836
+ {
3837
+ "epoch": 0.7597765363128491,
3838
+ "grad_norm": 0.6351447105407715,
3839
+ "learning_rate": 2.7887659627474017e-05,
3840
+ "loss": 0.1239,
3841
+ "step": 544
3842
+ },
3843
+ {
3844
+ "epoch": 0.7611731843575419,
3845
+ "grad_norm": 0.7053154706954956,
3846
+ "learning_rate": 2.758008617769129e-05,
3847
+ "loss": 0.1463,
3848
+ "step": 545
3849
+ },
3850
+ {
3851
+ "epoch": 0.7625698324022346,
3852
+ "grad_norm": 0.6546825766563416,
3853
+ "learning_rate": 2.7273946722886366e-05,
3854
+ "loss": 0.1338,
3855
+ "step": 546
3856
+ },
3857
+ {
3858
+ "epoch": 0.7639664804469274,
3859
+ "grad_norm": 0.3492923378944397,
3860
+ "learning_rate": 2.6969247324961555e-05,
3861
+ "loss": 0.0717,
3862
+ "step": 547
3863
+ },
3864
+ {
3865
+ "epoch": 0.7653631284916201,
3866
+ "grad_norm": 0.8152084946632385,
3867
+ "learning_rate": 2.6665994017304407e-05,
3868
+ "loss": 0.1597,
3869
+ "step": 548
3870
+ },
3871
+ {
3872
+ "epoch": 0.7667597765363129,
3873
+ "grad_norm": 1.4545857906341553,
3874
+ "learning_rate": 2.636419280466831e-05,
3875
+ "loss": 0.1784,
3876
+ "step": 549
3877
+ },
3878
+ {
3879
+ "epoch": 0.7681564245810056,
3880
+ "grad_norm": 0.9998311400413513,
3881
+ "learning_rate": 2.6063849663053475e-05,
3882
+ "loss": 0.1589,
3883
+ "step": 550
3884
+ },
3885
+ {
3886
+ "epoch": 0.7695530726256983,
3887
+ "grad_norm": 0.2502938210964203,
3888
+ "learning_rate": 2.5764970539588674e-05,
3889
+ "loss": 0.0333,
3890
+ "step": 551
3891
+ },
3892
+ {
3893
+ "epoch": 0.770949720670391,
3894
+ "grad_norm": 0.3657321631908417,
3895
+ "learning_rate": 2.5467561352413648e-05,
3896
+ "loss": 0.0664,
3897
+ "step": 552
3898
+ },
3899
+ {
3900
+ "epoch": 0.7723463687150838,
3901
+ "grad_norm": 0.36316537857055664,
3902
+ "learning_rate": 2.5171627990561564e-05,
3903
+ "loss": 0.1044,
3904
+ "step": 553
3905
+ },
3906
+ {
3907
+ "epoch": 0.7737430167597765,
3908
+ "grad_norm": 0.3267860412597656,
3909
+ "learning_rate": 2.4877176313842753e-05,
3910
+ "loss": 0.0955,
3911
+ "step": 554
3912
+ },
3913
+ {
3914
+ "epoch": 0.7751396648044693,
3915
+ "grad_norm": 0.5272498726844788,
3916
+ "learning_rate": 2.4584212152728403e-05,
3917
+ "loss": 0.1501,
3918
+ "step": 555
3919
+ },
3920
+ {
3921
+ "epoch": 0.776536312849162,
3922
+ "grad_norm": 0.4217236340045929,
3923
+ "learning_rate": 2.4292741308235345e-05,
3924
+ "loss": 0.1161,
3925
+ "step": 556
3926
+ },
3927
+ {
3928
+ "epoch": 0.7779329608938548,
3929
+ "grad_norm": 0.29607003927230835,
3930
+ "learning_rate": 2.4002769551811033e-05,
3931
+ "loss": 0.0847,
3932
+ "step": 557
3933
+ },
3934
+ {
3935
+ "epoch": 0.7793296089385475,
3936
+ "grad_norm": 0.4450053870677948,
3937
+ "learning_rate": 2.3714302625219243e-05,
3938
+ "loss": 0.1128,
3939
+ "step": 558
3940
+ },
3941
+ {
3942
+ "epoch": 0.7807262569832403,
3943
+ "grad_norm": 0.8715819120407104,
3944
+ "learning_rate": 2.3427346240426617e-05,
3945
+ "loss": 0.1642,
3946
+ "step": 559
3947
+ },
3948
+ {
3949
+ "epoch": 0.7821229050279329,
3950
+ "grad_norm": 0.3655119836330414,
3951
+ "learning_rate": 2.3141906079489183e-05,
3952
+ "loss": 0.1071,
3953
+ "step": 560
3954
+ },
3955
+ {
3956
+ "epoch": 0.7835195530726257,
3957
+ "grad_norm": 0.3583777844905853,
3958
+ "learning_rate": 2.2857987794440205e-05,
3959
+ "loss": 0.0747,
3960
+ "step": 561
3961
+ },
3962
+ {
3963
+ "epoch": 0.7849162011173184,
3964
+ "grad_norm": 0.34693679213523865,
3965
+ "learning_rate": 2.2575597007177984e-05,
3966
+ "loss": 0.0968,
3967
+ "step": 562
3968
+ },
3969
+ {
3970
+ "epoch": 0.7863128491620112,
3971
+ "grad_norm": 0.29578697681427,
3972
+ "learning_rate": 2.229473930935475e-05,
3973
+ "loss": 0.1034,
3974
+ "step": 563
3975
+ },
3976
+ {
3977
+ "epoch": 0.7877094972067039,
3978
+ "grad_norm": 0.45241570472717285,
3979
+ "learning_rate": 2.2015420262265863e-05,
3980
+ "loss": 0.1307,
3981
+ "step": 564
3982
+ },
3983
+ {
3984
+ "epoch": 0.7891061452513967,
3985
+ "grad_norm": 0.6317943334579468,
3986
+ "learning_rate": 2.173764539673957e-05,
3987
+ "loss": 0.1464,
3988
+ "step": 565
3989
+ },
3990
+ {
3991
+ "epoch": 0.7905027932960894,
3992
+ "grad_norm": 0.25777265429496765,
3993
+ "learning_rate": 2.1461420213027772e-05,
3994
+ "loss": 0.0866,
3995
+ "step": 566
3996
+ },
3997
+ {
3998
+ "epoch": 0.7918994413407822,
3999
+ "grad_norm": 0.38986483216285706,
4000
+ "learning_rate": 2.118675018069679e-05,
4001
+ "loss": 0.0812,
4002
+ "step": 567
4003
+ },
4004
+ {
4005
+ "epoch": 0.7932960893854749,
4006
+ "grad_norm": 0.3201923370361328,
4007
+ "learning_rate": 2.0913640738519335e-05,
4008
+ "loss": 0.1062,
4009
+ "step": 568
4010
+ },
4011
+ {
4012
+ "epoch": 0.7946927374301676,
4013
+ "grad_norm": 0.34531423449516296,
4014
+ "learning_rate": 2.0642097294366557e-05,
4015
+ "loss": 0.0983,
4016
+ "step": 569
4017
+ },
4018
+ {
4019
+ "epoch": 0.7960893854748603,
4020
+ "grad_norm": 0.33722755312919617,
4021
+ "learning_rate": 2.0372125225101234e-05,
4022
+ "loss": 0.1119,
4023
+ "step": 570
4024
+ },
4025
+ {
4026
+ "epoch": 0.797486033519553,
4027
+ "grad_norm": 0.3482882082462311,
4028
+ "learning_rate": 2.0103729876471145e-05,
4029
+ "loss": 0.102,
4030
+ "step": 571
4031
+ },
4032
+ {
4033
+ "epoch": 0.7988826815642458,
4034
+ "grad_norm": 0.38442498445510864,
4035
+ "learning_rate": 1.983691656300314e-05,
4036
+ "loss": 0.1045,
4037
+ "step": 572
4038
+ },
4039
+ {
4040
+ "epoch": 0.8002793296089385,
4041
+ "grad_norm": 0.32199081778526306,
4042
+ "learning_rate": 1.957169056789814e-05,
4043
+ "loss": 0.1116,
4044
+ "step": 573
4045
+ },
4046
+ {
4047
+ "epoch": 0.8016759776536313,
4048
+ "grad_norm": 0.7939237952232361,
4049
+ "learning_rate": 1.930805714292634e-05,
4050
+ "loss": 0.1522,
4051
+ "step": 574
4052
+ },
4053
+ {
4054
+ "epoch": 0.803072625698324,
4055
+ "grad_norm": 0.32797643542289734,
4056
+ "learning_rate": 1.9046021508323243e-05,
4057
+ "loss": 0.0768,
4058
+ "step": 575
4059
+ },
4060
+ {
4061
+ "epoch": 0.8044692737430168,
4062
+ "grad_norm": 0.35652273893356323,
4063
+ "learning_rate": 1.8785588852686376e-05,
4064
+ "loss": 0.1023,
4065
+ "step": 576
4066
+ },
4067
+ {
4068
+ "epoch": 0.8058659217877095,
4069
+ "grad_norm": 0.424141526222229,
4070
+ "learning_rate": 1.8526764332872447e-05,
4071
+ "loss": 0.1488,
4072
+ "step": 577
4073
+ },
4074
+ {
4075
+ "epoch": 0.8072625698324022,
4076
+ "grad_norm": 0.35929641127586365,
4077
+ "learning_rate": 1.8269553073895375e-05,
4078
+ "loss": 0.1021,
4079
+ "step": 578
4080
+ },
4081
+ {
4082
+ "epoch": 0.8086592178770949,
4083
+ "grad_norm": 0.4345034062862396,
4084
+ "learning_rate": 1.801396016882456e-05,
4085
+ "loss": 0.1036,
4086
+ "step": 579
4087
+ },
4088
+ {
4089
+ "epoch": 0.8100558659217877,
4090
+ "grad_norm": 0.34810173511505127,
4091
+ "learning_rate": 1.7759990678684335e-05,
4092
+ "loss": 0.0864,
4093
+ "step": 580
4094
+ },
4095
+ {
4096
+ "epoch": 0.8114525139664804,
4097
+ "grad_norm": 0.40283820033073425,
4098
+ "learning_rate": 1.7507649632353574e-05,
4099
+ "loss": 0.1067,
4100
+ "step": 581
4101
+ },
4102
+ {
4103
+ "epoch": 0.8128491620111732,
4104
+ "grad_norm": 0.38133350014686584,
4105
+ "learning_rate": 1.7256942026466072e-05,
4106
+ "loss": 0.0923,
4107
+ "step": 582
4108
+ },
4109
+ {
4110
+ "epoch": 0.8142458100558659,
4111
+ "grad_norm": 0.359263151884079,
4112
+ "learning_rate": 1.70078728253118e-05,
4113
+ "loss": 0.102,
4114
+ "step": 583
4115
+ },
4116
+ {
4117
+ "epoch": 0.8156424581005587,
4118
+ "grad_norm": 0.43518075346946716,
4119
+ "learning_rate": 1.6760446960738364e-05,
4120
+ "loss": 0.1261,
4121
+ "step": 584
4122
+ },
4123
+ {
4124
+ "epoch": 0.8170391061452514,
4125
+ "grad_norm": 0.6720304489135742,
4126
+ "learning_rate": 1.6514669332053634e-05,
4127
+ "loss": 0.1277,
4128
+ "step": 585
4129
+ },
4130
+ {
4131
+ "epoch": 0.8184357541899442,
4132
+ "grad_norm": 0.45705312490463257,
4133
+ "learning_rate": 1.6270544805928424e-05,
4134
+ "loss": 0.0853,
4135
+ "step": 586
4136
+ },
4137
+ {
4138
+ "epoch": 0.8198324022346368,
4139
+ "grad_norm": 0.5168368220329285,
4140
+ "learning_rate": 1.6028078216300336e-05,
4141
+ "loss": 0.1083,
4142
+ "step": 587
4143
+ },
4144
+ {
4145
+ "epoch": 0.8212290502793296,
4146
+ "grad_norm": 0.6351551413536072,
4147
+ "learning_rate": 1.5787274364278004e-05,
4148
+ "loss": 0.1065,
4149
+ "step": 588
4150
+ },
4151
+ {
4152
+ "epoch": 0.8226256983240223,
4153
+ "grad_norm": 0.7593076229095459,
4154
+ "learning_rate": 1.5548138018045964e-05,
4155
+ "loss": 0.1282,
4156
+ "step": 589
4157
+ },
4158
+ {
4159
+ "epoch": 0.8240223463687151,
4160
+ "grad_norm": 0.6074228286743164,
4161
+ "learning_rate": 1.5310673912770312e-05,
4162
+ "loss": 0.1313,
4163
+ "step": 590
4164
+ },
4165
+ {
4166
+ "epoch": 0.8254189944134078,
4167
+ "grad_norm": 0.4425796866416931,
4168
+ "learning_rate": 1.5074886750504846e-05,
4169
+ "loss": 0.0936,
4170
+ "step": 591
4171
+ },
4172
+ {
4173
+ "epoch": 0.8268156424581006,
4174
+ "grad_norm": 0.8903464674949646,
4175
+ "learning_rate": 1.4840781200098152e-05,
4176
+ "loss": 0.1461,
4177
+ "step": 592
4178
+ },
4179
+ {
4180
+ "epoch": 0.8282122905027933,
4181
+ "grad_norm": 0.4715719223022461,
4182
+ "learning_rate": 1.4608361897100908e-05,
4183
+ "loss": 0.1171,
4184
+ "step": 593
4185
+ },
4186
+ {
4187
+ "epoch": 0.8296089385474861,
4188
+ "grad_norm": 0.5678585171699524,
4189
+ "learning_rate": 1.4377633443674233e-05,
4190
+ "loss": 0.0987,
4191
+ "step": 594
4192
+ },
4193
+ {
4194
+ "epoch": 0.8310055865921788,
4195
+ "grad_norm": 0.6416317820549011,
4196
+ "learning_rate": 1.4148600408498592e-05,
4197
+ "loss": 0.1381,
4198
+ "step": 595
4199
+ },
4200
+ {
4201
+ "epoch": 0.8324022346368715,
4202
+ "grad_norm": 0.5329287648200989,
4203
+ "learning_rate": 1.392126732668323e-05,
4204
+ "loss": 0.1136,
4205
+ "step": 596
4206
+ },
4207
+ {
4208
+ "epoch": 0.8337988826815642,
4209
+ "grad_norm": 0.6972792744636536,
4210
+ "learning_rate": 1.3695638699676494e-05,
4211
+ "loss": 0.1039,
4212
+ "step": 597
4213
+ },
4214
+ {
4215
+ "epoch": 0.835195530726257,
4216
+ "grad_norm": 0.7396121025085449,
4217
+ "learning_rate": 1.3471718995176507e-05,
4218
+ "loss": 0.1172,
4219
+ "step": 598
4220
+ },
4221
+ {
4222
+ "epoch": 0.8365921787709497,
4223
+ "grad_norm": 0.5911096930503845,
4224
+ "learning_rate": 1.3249512647042917e-05,
4225
+ "loss": 0.1097,
4226
+ "step": 599
4227
+ },
4228
+ {
4229
+ "epoch": 0.8379888268156425,
4230
+ "grad_norm": 0.8591598868370056,
4231
+ "learning_rate": 1.3029024055209015e-05,
4232
+ "loss": 0.1149,
4233
+ "step": 600
4234
+ },
4235
+ {
4236
+ "epoch": 0.8393854748603352,
4237
+ "grad_norm": 0.2031138688325882,
4238
+ "learning_rate": 1.281025758559451e-05,
4239
+ "loss": 0.0357,
4240
+ "step": 601
4241
+ },
4242
+ {
4243
+ "epoch": 0.840782122905028,
4244
+ "grad_norm": 0.41612380743026733,
4245
+ "learning_rate": 1.2593217570019267e-05,
4246
+ "loss": 0.088,
4247
+ "step": 602
4248
+ },
4249
+ {
4250
+ "epoch": 0.8421787709497207,
4251
+ "grad_norm": 0.544515073299408,
4252
+ "learning_rate": 1.2377908306117391e-05,
4253
+ "loss": 0.1182,
4254
+ "step": 603
4255
+ },
4256
+ {
4257
+ "epoch": 0.8435754189944135,
4258
+ "grad_norm": 0.39283642172813416,
4259
+ "learning_rate": 1.2164334057252203e-05,
4260
+ "loss": 0.1298,
4261
+ "step": 604
4262
+ },
4263
+ {
4264
+ "epoch": 0.8449720670391061,
4265
+ "grad_norm": 0.29914194345474243,
4266
+ "learning_rate": 1.1952499052431753e-05,
4267
+ "loss": 0.1081,
4268
+ "step": 605
4269
+ },
4270
+ {
4271
+ "epoch": 0.8463687150837989,
4272
+ "grad_norm": 0.32783523201942444,
4273
+ "learning_rate": 1.174240748622516e-05,
4274
+ "loss": 0.0814,
4275
+ "step": 606
4276
+ },
4277
+ {
4278
+ "epoch": 0.8477653631284916,
4279
+ "grad_norm": 0.3241506516933441,
4280
+ "learning_rate": 1.1534063518679516e-05,
4281
+ "loss": 0.1111,
4282
+ "step": 607
4283
+ },
4284
+ {
4285
+ "epoch": 0.8491620111731844,
4286
+ "grad_norm": 0.41864722967147827,
4287
+ "learning_rate": 1.1327471275237456e-05,
4288
+ "loss": 0.1169,
4289
+ "step": 608
4290
+ },
4291
+ {
4292
+ "epoch": 0.8505586592178771,
4293
+ "grad_norm": 0.31106308102607727,
4294
+ "learning_rate": 1.11226348466556e-05,
4295
+ "loss": 0.0782,
4296
+ "step": 609
4297
+ },
4298
+ {
4299
+ "epoch": 0.8519553072625698,
4300
+ "grad_norm": 0.3926340341567993,
4301
+ "learning_rate": 1.0919558288923426e-05,
4302
+ "loss": 0.1118,
4303
+ "step": 610
4304
+ },
4305
+ {
4306
+ "epoch": 0.8533519553072626,
4307
+ "grad_norm": 0.3361690640449524,
4308
+ "learning_rate": 1.0718245623183066e-05,
4309
+ "loss": 0.1096,
4310
+ "step": 611
4311
+ },
4312
+ {
4313
+ "epoch": 0.8547486033519553,
4314
+ "grad_norm": 0.3871351480484009,
4315
+ "learning_rate": 1.0518700835649553e-05,
4316
+ "loss": 0.1092,
4317
+ "step": 612
4318
+ },
4319
+ {
4320
+ "epoch": 0.8561452513966481,
4321
+ "grad_norm": 0.24205343425273895,
4322
+ "learning_rate": 1.0320927877531971e-05,
4323
+ "loss": 0.0836,
4324
+ "step": 613
4325
+ },
4326
+ {
4327
+ "epoch": 0.8575418994413407,
4328
+ "grad_norm": 0.31774112582206726,
4329
+ "learning_rate": 1.0124930664955301e-05,
4330
+ "loss": 0.098,
4331
+ "step": 614
4332
+ },
4333
+ {
4334
+ "epoch": 0.8589385474860335,
4335
+ "grad_norm": 0.34544655680656433,
4336
+ "learning_rate": 9.930713078882659e-06,
4337
+ "loss": 0.1347,
4338
+ "step": 615
4339
+ },
4340
+ {
4341
+ "epoch": 0.8603351955307262,
4342
+ "grad_norm": 0.24583694338798523,
4343
+ "learning_rate": 9.73827896503865e-06,
4344
+ "loss": 0.0752,
4345
+ "step": 616
4346
+ },
4347
+ {
4348
+ "epoch": 0.861731843575419,
4349
+ "grad_norm": 0.3795362114906311,
4350
+ "learning_rate": 9.54763213383314e-06,
4351
+ "loss": 0.1214,
4352
+ "step": 617
4353
+ },
4354
+ {
4355
+ "epoch": 0.8631284916201117,
4356
+ "grad_norm": 0.3082280158996582,
4357
+ "learning_rate": 9.358776360285759e-06,
4358
+ "loss": 0.1001,
4359
+ "step": 618
4360
+ },
4361
+ {
4362
+ "epoch": 0.8645251396648045,
4363
+ "grad_norm": 0.4244152307510376,
4364
+ "learning_rate": 9.171715383951251e-06,
4365
+ "loss": 0.1184,
4366
+ "step": 619
4367
+ },
4368
+ {
4369
+ "epoch": 0.8659217877094972,
4370
+ "grad_norm": 0.315733402967453,
4371
+ "learning_rate": 8.986452908845322e-06,
4372
+ "loss": 0.0912,
4373
+ "step": 620
4374
+ },
4375
+ {
4376
+ "epoch": 0.86731843575419,
4377
+ "grad_norm": 0.2573086619377136,
4378
+ "learning_rate": 8.80299260337144e-06,
4379
+ "loss": 0.0754,
4380
+ "step": 621
4381
+ },
4382
+ {
4383
+ "epoch": 0.8687150837988827,
4384
+ "grad_norm": 0.3076595067977905,
4385
+ "learning_rate": 8.621338100247988e-06,
4386
+ "loss": 0.122,
4387
+ "step": 622
4388
+ },
4389
+ {
4390
+ "epoch": 0.8701117318435754,
4391
+ "grad_norm": 0.3168458044528961,
4392
+ "learning_rate": 8.441492996436573e-06,
4393
+ "loss": 0.0862,
4394
+ "step": 623
4395
+ },
4396
+ {
4397
+ "epoch": 0.8715083798882681,
4398
+ "grad_norm": 0.292837917804718,
4399
+ "learning_rate": 8.26346085307057e-06,
4400
+ "loss": 0.122,
4401
+ "step": 624
4402
+ },
4403
+ {
4404
+ "epoch": 0.8729050279329609,
4405
+ "grad_norm": 0.3604884445667267,
4406
+ "learning_rate": 8.087245195384774e-06,
4407
+ "loss": 0.1143,
4408
+ "step": 625
4409
+ },
4410
+ {
4411
+ "epoch": 0.8743016759776536,
4412
+ "grad_norm": 0.3604539632797241,
4413
+ "learning_rate": 7.91284951264557e-06,
4414
+ "loss": 0.1162,
4415
+ "step": 626
4416
+ },
4417
+ {
4418
+ "epoch": 0.8756983240223464,
4419
+ "grad_norm": 0.33363595604896545,
4420
+ "learning_rate": 7.740277258081696e-06,
4421
+ "loss": 0.0852,
4422
+ "step": 627
4423
+ },
4424
+ {
4425
+ "epoch": 0.8770949720670391,
4426
+ "grad_norm": 0.6355361342430115,
4427
+ "learning_rate": 7.569531848816147e-06,
4428
+ "loss": 0.1154,
4429
+ "step": 628
4430
+ },
4431
+ {
4432
+ "epoch": 0.8784916201117319,
4433
+ "grad_norm": 0.3323967754840851,
4434
+ "learning_rate": 7.400616665798199e-06,
4435
+ "loss": 0.1003,
4436
+ "step": 629
4437
+ },
4438
+ {
4439
+ "epoch": 0.8798882681564246,
4440
+ "grad_norm": 0.516983151435852,
4441
+ "learning_rate": 7.233535053736706e-06,
4442
+ "loss": 0.0863,
4443
+ "step": 630
4444
+ },
4445
+ {
4446
+ "epoch": 0.8812849162011173,
4447
+ "grad_norm": 0.45355257391929626,
4448
+ "learning_rate": 7.068290321033688e-06,
4449
+ "loss": 0.1229,
4450
+ "step": 631
4451
+ },
4452
+ {
4453
+ "epoch": 0.88268156424581,
4454
+ "grad_norm": 0.37939712405204773,
4455
+ "learning_rate": 6.90488573971898e-06,
4456
+ "loss": 0.1081,
4457
+ "step": 632
4458
+ },
4459
+ {
4460
+ "epoch": 0.8840782122905028,
4461
+ "grad_norm": 0.45642518997192383,
4462
+ "learning_rate": 6.743324545385354e-06,
4463
+ "loss": 0.091,
4464
+ "step": 633
4465
+ },
4466
+ {
4467
+ "epoch": 0.8854748603351955,
4468
+ "grad_norm": 0.365570068359375,
4469
+ "learning_rate": 6.583609937124435e-06,
4470
+ "loss": 0.1067,
4471
+ "step": 634
4472
+ },
4473
+ {
4474
+ "epoch": 0.8868715083798883,
4475
+ "grad_norm": 0.38732656836509705,
4476
+ "learning_rate": 6.425745077463408e-06,
4477
+ "loss": 0.105,
4478
+ "step": 635
4479
+ },
4480
+ {
4481
+ "epoch": 0.888268156424581,
4482
+ "grad_norm": 2.0591869354248047,
4483
+ "learning_rate": 6.269733092302399e-06,
4484
+ "loss": 0.1407,
4485
+ "step": 636
4486
+ },
4487
+ {
4488
+ "epoch": 0.8896648044692738,
4489
+ "grad_norm": 0.48740479350090027,
4490
+ "learning_rate": 6.115577070852507e-06,
4491
+ "loss": 0.1058,
4492
+ "step": 637
4493
+ },
4494
+ {
4495
+ "epoch": 0.8910614525139665,
4496
+ "grad_norm": 3.680542469024658,
4497
+ "learning_rate": 5.963280065574694e-06,
4498
+ "loss": 0.1259,
4499
+ "step": 638
4500
+ },
4501
+ {
4502
+ "epoch": 0.8924581005586593,
4503
+ "grad_norm": 0.5749529004096985,
4504
+ "learning_rate": 5.8128450921193674e-06,
4505
+ "loss": 0.1333,
4506
+ "step": 639
4507
+ },
4508
+ {
4509
+ "epoch": 0.8938547486033519,
4510
+ "grad_norm": 0.3572239875793457,
4511
+ "learning_rate": 5.664275129266605e-06,
4512
+ "loss": 0.104,
4513
+ "step": 640
4514
+ },
4515
+ {
4516
+ "epoch": 0.8952513966480447,
4517
+ "grad_norm": 0.49053096771240234,
4518
+ "learning_rate": 5.51757311886717e-06,
4519
+ "loss": 0.1293,
4520
+ "step": 641
4521
+ },
4522
+ {
4523
+ "epoch": 0.8966480446927374,
4524
+ "grad_norm": 0.8410274386405945,
4525
+ "learning_rate": 5.372741965784323e-06,
4526
+ "loss": 0.1495,
4527
+ "step": 642
4528
+ },
4529
+ {
4530
+ "epoch": 0.8980446927374302,
4531
+ "grad_norm": 0.436309278011322,
4532
+ "learning_rate": 5.2297845378362795e-06,
4533
+ "loss": 0.1046,
4534
+ "step": 643
4535
+ },
4536
+ {
4537
+ "epoch": 0.8994413407821229,
4538
+ "grad_norm": 0.6086000800132751,
4539
+ "learning_rate": 5.088703665739336e-06,
4540
+ "loss": 0.1314,
4541
+ "step": 644
4542
+ },
4543
+ {
4544
+ "epoch": 0.9008379888268156,
4545
+ "grad_norm": 0.5451857447624207,
4546
+ "learning_rate": 4.949502143051976e-06,
4547
+ "loss": 0.1268,
4548
+ "step": 645
4549
+ },
4550
+ {
4551
+ "epoch": 0.9022346368715084,
4552
+ "grad_norm": 0.55224609375,
4553
+ "learning_rate": 4.812182726119397e-06,
4554
+ "loss": 0.074,
4555
+ "step": 646
4556
+ },
4557
+ {
4558
+ "epoch": 0.9036312849162011,
4559
+ "grad_norm": 0.5905503630638123,
4560
+ "learning_rate": 4.676748134019105e-06,
4561
+ "loss": 0.1371,
4562
+ "step": 647
4563
+ },
4564
+ {
4565
+ "epoch": 0.9050279329608939,
4566
+ "grad_norm": 0.3724439740180969,
4567
+ "learning_rate": 4.543201048506851e-06,
4568
+ "loss": 0.0814,
4569
+ "step": 648
4570
+ },
4571
+ {
4572
+ "epoch": 0.9064245810055865,
4573
+ "grad_norm": 0.5553332567214966,
4574
+ "learning_rate": 4.41154411396375e-06,
4575
+ "loss": 0.1375,
4576
+ "step": 649
4577
+ },
4578
+ {
4579
+ "epoch": 0.9078212290502793,
4580
+ "grad_norm": 0.5004715919494629,
4581
+ "learning_rate": 4.2817799373437994e-06,
4582
+ "loss": 0.0615,
4583
+ "step": 650
4584
+ },
4585
+ {
4586
+ "epoch": 0.909217877094972,
4587
+ "grad_norm": 0.22518612444400787,
4588
+ "learning_rate": 4.153911088122231e-06,
4589
+ "loss": 0.0299,
4590
+ "step": 651
4591
+ },
4592
+ {
4593
+ "epoch": 0.9106145251396648,
4594
+ "grad_norm": 0.3304058313369751,
4595
+ "learning_rate": 4.027940098244753e-06,
4596
+ "loss": 0.0849,
4597
+ "step": 652
4598
+ },
4599
+ {
4600
+ "epoch": 0.9120111731843575,
4601
+ "grad_norm": 0.413650780916214,
4602
+ "learning_rate": 3.90386946207727e-06,
4603
+ "loss": 0.1193,
4604
+ "step": 653
4605
+ },
4606
+ {
4607
+ "epoch": 0.9134078212290503,
4608
+ "grad_norm": 0.3433137834072113,
4609
+ "learning_rate": 3.7817016363566493e-06,
4610
+ "loss": 0.101,
4611
+ "step": 654
4612
+ },
4613
+ {
4614
+ "epoch": 0.914804469273743,
4615
+ "grad_norm": 0.30257925391197205,
4616
+ "learning_rate": 3.6614390401419453e-06,
4617
+ "loss": 0.1137,
4618
+ "step": 655
4619
+ },
4620
+ {
4621
+ "epoch": 0.9162011173184358,
4622
+ "grad_norm": 0.23929886519908905,
4623
+ "learning_rate": 3.54308405476651e-06,
4624
+ "loss": 0.1059,
4625
+ "step": 656
4626
+ },
4627
+ {
4628
+ "epoch": 0.9175977653631285,
4629
+ "grad_norm": 0.33015841245651245,
4630
+ "learning_rate": 3.4266390237909676e-06,
4631
+ "loss": 0.1267,
4632
+ "step": 657
4633
+ },
4634
+ {
4635
+ "epoch": 0.9189944134078212,
4636
+ "grad_norm": 0.3994545638561249,
4637
+ "learning_rate": 3.312106252956626e-06,
4638
+ "loss": 0.1451,
4639
+ "step": 658
4640
+ },
4641
+ {
4642
+ "epoch": 0.9203910614525139,
4643
+ "grad_norm": 0.27575787901878357,
4644
+ "learning_rate": 3.1994880101399726e-06,
4645
+ "loss": 0.088,
4646
+ "step": 659
4647
+ },
4648
+ {
4649
+ "epoch": 0.9217877094972067,
4650
+ "grad_norm": 0.37436044216156006,
4651
+ "learning_rate": 3.0887865253076632e-06,
4652
+ "loss": 0.1151,
4653
+ "step": 660
4654
+ },
4655
+ {
4656
+ "epoch": 0.9231843575418994,
4657
+ "grad_norm": 0.27707546949386597,
4658
+ "learning_rate": 2.9800039904724463e-06,
4659
+ "loss": 0.0835,
4660
+ "step": 661
4661
+ },
4662
+ {
4663
+ "epoch": 0.9245810055865922,
4664
+ "grad_norm": 0.4059697687625885,
4665
+ "learning_rate": 2.873142559649722e-06,
4666
+ "loss": 0.1027,
4667
+ "step": 662
4668
+ },
4669
+ {
4670
+ "epoch": 0.9259776536312849,
4671
+ "grad_norm": 0.22227145731449127,
4672
+ "learning_rate": 2.7682043488148513e-06,
4673
+ "loss": 0.1016,
4674
+ "step": 663
4675
+ },
4676
+ {
4677
+ "epoch": 0.9273743016759777,
4678
+ "grad_norm": 0.38108816742897034,
4679
+ "learning_rate": 2.6651914358613252e-06,
4680
+ "loss": 0.1187,
4681
+ "step": 664
4682
+ },
4683
+ {
4684
+ "epoch": 0.9287709497206704,
4685
+ "grad_norm": 0.29107677936553955,
4686
+ "learning_rate": 2.564105860559607e-06,
4687
+ "loss": 0.0908,
4688
+ "step": 665
4689
+ },
4690
+ {
4691
+ "epoch": 0.9301675977653632,
4692
+ "grad_norm": 0.3450527787208557,
4693
+ "learning_rate": 2.464949624516688e-06,
4694
+ "loss": 0.1126,
4695
+ "step": 666
4696
+ },
4697
+ {
4698
+ "epoch": 0.9315642458100558,
4699
+ "grad_norm": 0.3469128906726837,
4700
+ "learning_rate": 2.3677246911365304e-06,
4701
+ "loss": 0.1213,
4702
+ "step": 667
4703
+ },
4704
+ {
4705
+ "epoch": 0.9329608938547486,
4706
+ "grad_norm": 0.4438486099243164,
4707
+ "learning_rate": 2.272432985581119e-06,
4708
+ "loss": 0.1135,
4709
+ "step": 668
4710
+ },
4711
+ {
4712
+ "epoch": 0.9343575418994413,
4713
+ "grad_norm": 0.28915104269981384,
4714
+ "learning_rate": 2.1790763947324046e-06,
4715
+ "loss": 0.1028,
4716
+ "step": 669
4717
+ },
4718
+ {
4719
+ "epoch": 0.9357541899441341,
4720
+ "grad_norm": 0.25452756881713867,
4721
+ "learning_rate": 2.0876567671548773e-06,
4722
+ "loss": 0.1019,
4723
+ "step": 670
4724
+ },
4725
+ {
4726
+ "epoch": 0.9371508379888268,
4727
+ "grad_norm": 0.3689977526664734,
4728
+ "learning_rate": 1.9981759130590305e-06,
4729
+ "loss": 0.0743,
4730
+ "step": 671
4731
+ },
4732
+ {
4733
+ "epoch": 0.9385474860335196,
4734
+ "grad_norm": 0.27568790316581726,
4735
+ "learning_rate": 1.910635604265465e-06,
4736
+ "loss": 0.0833,
4737
+ "step": 672
4738
+ },
4739
+ {
4740
+ "epoch": 0.9399441340782123,
4741
+ "grad_norm": 0.3405630588531494,
4742
+ "learning_rate": 1.82503757416983e-06,
4743
+ "loss": 0.092,
4744
+ "step": 673
4745
+ },
4746
+ {
4747
+ "epoch": 0.9413407821229051,
4748
+ "grad_norm": 0.28969278931617737,
4749
+ "learning_rate": 1.7413835177084835e-06,
4750
+ "loss": 0.084,
4751
+ "step": 674
4752
+ },
4753
+ {
4754
+ "epoch": 0.9427374301675978,
4755
+ "grad_norm": 0.38578617572784424,
4756
+ "learning_rate": 1.6596750913249304e-06,
4757
+ "loss": 0.108,
4758
+ "step": 675
4759
+ },
4760
+ {
4761
+ "epoch": 0.9441340782122905,
4762
+ "grad_norm": 0.3050103187561035,
4763
+ "learning_rate": 1.5799139129370588e-06,
4764
+ "loss": 0.0972,
4765
+ "step": 676
4766
+ },
4767
+ {
4768
+ "epoch": 0.9455307262569832,
4769
+ "grad_norm": 0.3887476325035095,
4770
+ "learning_rate": 1.502101561905067e-06,
4771
+ "loss": 0.0775,
4772
+ "step": 677
4773
+ },
4774
+ {
4775
+ "epoch": 0.946927374301676,
4776
+ "grad_norm": 0.39781996607780457,
4777
+ "learning_rate": 1.4262395790001881e-06,
4778
+ "loss": 0.0953,
4779
+ "step": 678
4780
+ },
4781
+ {
4782
+ "epoch": 0.9483240223463687,
4783
+ "grad_norm": 0.2917478382587433,
4784
+ "learning_rate": 1.3523294663742025e-06,
4785
+ "loss": 0.096,
4786
+ "step": 679
4787
+ },
4788
+ {
4789
+ "epoch": 0.9497206703910615,
4790
+ "grad_norm": 0.36107248067855835,
4791
+ "learning_rate": 1.2803726875296963e-06,
4792
+ "loss": 0.1049,
4793
+ "step": 680
4794
+ },
4795
+ {
4796
+ "epoch": 0.9511173184357542,
4797
+ "grad_norm": 0.49688655138015747,
4798
+ "learning_rate": 1.2103706672910497e-06,
4799
+ "loss": 0.1086,
4800
+ "step": 681
4801
+ },
4802
+ {
4803
+ "epoch": 0.952513966480447,
4804
+ "grad_norm": 0.39250099658966064,
4805
+ "learning_rate": 1.142324791776239e-06,
4806
+ "loss": 0.0881,
4807
+ "step": 682
4808
+ },
4809
+ {
4810
+ "epoch": 0.9539106145251397,
4811
+ "grad_norm": 0.41053643822669983,
4812
+ "learning_rate": 1.0762364083694464e-06,
4813
+ "loss": 0.098,
4814
+ "step": 683
4815
+ },
4816
+ {
4817
+ "epoch": 0.9553072625698324,
4818
+ "grad_norm": 0.34778672456741333,
4819
+ "learning_rate": 1.01210682569427e-06,
4820
+ "loss": 0.1198,
4821
+ "step": 684
4822
+ },
4823
+ {
4824
+ "epoch": 0.9567039106145251,
4825
+ "grad_norm": 0.47372210025787354,
4826
+ "learning_rate": 9.499373135879008e-07,
4827
+ "loss": 0.1078,
4828
+ "step": 685
4829
+ },
4830
+ {
4831
+ "epoch": 0.9581005586592178,
4832
+ "grad_norm": 0.5244646668434143,
4833
+ "learning_rate": 8.897291030759314e-07,
4834
+ "loss": 0.1052,
4835
+ "step": 686
4836
+ },
4837
+ {
4838
+ "epoch": 0.9594972067039106,
4839
+ "grad_norm": 0.4699241816997528,
4840
+ "learning_rate": 8.314833863480198e-07,
4841
+ "loss": 0.1193,
4842
+ "step": 687
4843
+ },
4844
+ {
4845
+ "epoch": 0.9608938547486033,
4846
+ "grad_norm": 0.33639252185821533,
4847
+ "learning_rate": 7.752013167342531e-07,
4848
+ "loss": 0.0922,
4849
+ "step": 688
4850
+ },
4851
+ {
4852
+ "epoch": 0.9622905027932961,
4853
+ "grad_norm": 0.46195724606513977,
4854
+ "learning_rate": 7.208840086822988e-07,
4855
+ "loss": 0.1021,
4856
+ "step": 689
4857
+ },
4858
+ {
4859
+ "epoch": 0.9636871508379888,
4860
+ "grad_norm": 0.4881128668785095,
4861
+ "learning_rate": 6.68532537735389e-07,
4862
+ "loss": 0.1044,
4863
+ "step": 690
4864
+ },
4865
+ {
4866
+ "epoch": 0.9650837988826816,
4867
+ "grad_norm": 0.5272928476333618,
4868
+ "learning_rate": 6.181479405109602e-07,
4869
+ "loss": 0.1259,
4870
+ "step": 691
4871
+ },
4872
+ {
4873
+ "epoch": 0.9664804469273743,
4874
+ "grad_norm": 0.6659584641456604,
4875
+ "learning_rate": 5.697312146801915e-07,
4876
+ "loss": 0.1545,
4877
+ "step": 692
4878
+ },
4879
+ {
4880
+ "epoch": 0.9678770949720671,
4881
+ "grad_norm": 0.49742087721824646,
4882
+ "learning_rate": 5.232833189481645e-07,
4883
+ "loss": 0.0908,
4884
+ "step": 693
4885
+ },
4886
+ {
4887
+ "epoch": 0.9692737430167597,
4888
+ "grad_norm": 0.593102753162384,
4889
+ "learning_rate": 4.788051730349907e-07,
4890
+ "loss": 0.155,
4891
+ "step": 694
4892
+ },
4893
+ {
4894
+ "epoch": 0.9706703910614525,
4895
+ "grad_norm": 0.9965668320655823,
4896
+ "learning_rate": 4.3629765765751396e-07,
4897
+ "loss": 0.168,
4898
+ "step": 695
4899
+ },
4900
+ {
4901
+ "epoch": 0.9720670391061452,
4902
+ "grad_norm": 0.4818421006202698,
4903
+ "learning_rate": 3.9576161451186923e-07,
4904
+ "loss": 0.1031,
4905
+ "step": 696
4906
+ },
4907
+ {
4908
+ "epoch": 0.973463687150838,
4909
+ "grad_norm": 0.6472490429878235,
4910
+ "learning_rate": 3.571978462568959e-07,
4911
+ "loss": 0.1141,
4912
+ "step": 697
4913
+ },
4914
+ {
4915
+ "epoch": 0.9748603351955307,
4916
+ "grad_norm": 0.4191659688949585,
4917
+ "learning_rate": 3.2060711649817277e-07,
4918
+ "loss": 0.0783,
4919
+ "step": 698
4920
+ },
4921
+ {
4922
+ "epoch": 0.9762569832402235,
4923
+ "grad_norm": 0.5093485713005066,
4924
+ "learning_rate": 2.8599014977289675e-07,
4925
+ "loss": 0.0947,
4926
+ "step": 699
4927
+ },
4928
+ {
4929
+ "epoch": 0.9776536312849162,
4930
+ "grad_norm": 1.8124933242797852,
4931
+ "learning_rate": 2.5334763153559424e-07,
4932
+ "loss": 0.1765,
4933
+ "step": 700
4934
+ },
4935
+ {
4936
+ "epoch": 0.979050279329609,
4937
+ "grad_norm": 0.30719417333602905,
4938
+ "learning_rate": 2.2268020814447676e-07,
4939
+ "loss": 0.0636,
4940
+ "step": 701
4941
+ },
4942
+ {
4943
+ "epoch": 0.9804469273743017,
4944
+ "grad_norm": 1.8330652713775635,
4945
+ "learning_rate": 1.939884868487174e-07,
4946
+ "loss": 0.1424,
4947
+ "step": 702
4948
+ },
4949
+ {
4950
+ "epoch": 0.9818435754189944,
4951
+ "grad_norm": 0.6687912344932556,
4952
+ "learning_rate": 1.6727303577633858e-07,
4953
+ "loss": 0.1007,
4954
+ "step": 703
4955
+ },
4956
+ {
4957
+ "epoch": 0.9832402234636871,
4958
+ "grad_norm": 0.29703205823898315,
4959
+ "learning_rate": 1.4253438392304307e-07,
4960
+ "loss": 0.0762,
4961
+ "step": 704
4962
+ },
4963
+ {
4964
+ "epoch": 0.9846368715083799,
4965
+ "grad_norm": 0.30622580647468567,
4966
+ "learning_rate": 1.197730211417114e-07,
4967
+ "loss": 0.1054,
4968
+ "step": 705
4969
+ },
4970
+ {
4971
+ "epoch": 0.9860335195530726,
4972
+ "grad_norm": 0.2646753489971161,
4973
+ "learning_rate": 9.898939813267616e-08,
4974
+ "loss": 0.0877,
4975
+ "step": 706
4976
+ },
4977
+ {
4978
+ "epoch": 0.9874301675977654,
4979
+ "grad_norm": 0.36262211203575134,
4980
+ "learning_rate": 8.018392643484029e-08,
4981
+ "loss": 0.0927,
4982
+ "step": 707
4983
+ },
4984
+ {
4985
+ "epoch": 0.9888268156424581,
4986
+ "grad_norm": 0.24466976523399353,
4987
+ "learning_rate": 6.335697841747257e-08,
4988
+ "loss": 0.084,
4989
+ "step": 708
4990
+ },
4991
+ {
4992
+ "epoch": 0.9902234636871509,
4993
+ "grad_norm": 0.4240047335624695,
4994
+ "learning_rate": 4.850888727290226e-08,
4995
+ "loss": 0.106,
4996
+ "step": 709
4997
+ },
4998
+ {
4999
+ "epoch": 0.9916201117318436,
5000
+ "grad_norm": 0.47176429629325867,
5001
+ "learning_rate": 3.563994700988005e-08,
5002
+ "loss": 0.1121,
5003
+ "step": 710
5004
+ },
5005
+ {
5006
+ "epoch": 0.9930167597765364,
5007
+ "grad_norm": 0.35240668058395386,
5008
+ "learning_rate": 2.4750412447749337e-08,
5009
+ "loss": 0.0961,
5010
+ "step": 711
5011
+ },
5012
+ {
5013
+ "epoch": 0.994413407821229,
5014
+ "grad_norm": 0.5831834673881531,
5015
+ "learning_rate": 1.5840499211439152e-08,
5016
+ "loss": 0.1229,
5017
+ "step": 712
5018
+ },
5019
+ {
5020
+ "epoch": 0.9958100558659218,
5021
+ "grad_norm": 0.5493488907814026,
5022
+ "learning_rate": 8.910383727156468e-09,
5023
+ "loss": 0.1237,
5024
+ "step": 713
5025
+ },
5026
+ {
5027
+ "epoch": 0.9972067039106145,
5028
+ "grad_norm": 0.4525875747203827,
5029
+ "learning_rate": 3.960203218911218e-09,
5030
+ "loss": 0.1178,
5031
+ "step": 714
5032
+ },
5033
+ {
5034
+ "epoch": 0.9986033519553073,
5035
+ "grad_norm": 0.7450011968612671,
5036
+ "learning_rate": 9.9005570577404e-10,
5037
+ "loss": 0.1197,
5038
+ "step": 715
5039
+ },
5040
+ {
5041
+ "epoch": 1.0,
5042
+ "grad_norm": 0.7382752299308777,
5043
+ "learning_rate": 0.0,
5044
+ "loss": 0.1041,
5045
+ "step": 716
5046
+ },
5047
+ {
5048
+ "epoch": 1.0,
5049
+ "eval_loss": 0.10793300718069077,
5050
+ "eval_runtime": 22.0207,
5051
+ "eval_samples_per_second": 27.383,
5052
+ "eval_steps_per_second": 13.714,
5053
+ "step": 716
5054
  }
5055
  ],
5056
  "logging_steps": 1,
 
5065
  "should_evaluate": false,
5066
  "should_log": false,
5067
  "should_save": true,
5068
+ "should_training_stop": true
5069
  },
5070
  "attributes": {}
5071
  }
5072
  },
5073
+ "total_flos": 1.963577405771612e+17,
5074
  "train_batch_size": 2,
5075
  "trial_name": null,
5076
  "trial_params": null