text-normalization-ru-new / trainer_state.json
alexue4's picture
End of training
359fc6b
raw
history blame
21.5 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 14.0,
"eval_steps": 500,
"global_step": 323078,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 2.1666594444685182e-09,
"loss": 0.0003,
"step": 1
},
{
"epoch": 0.1,
"learning_rate": 5.000649997833341e-06,
"loss": 0.0017,
"step": 2308
},
{
"epoch": 0.2,
"learning_rate": 1.0001299995666681e-05,
"loss": 0.0017,
"step": 4616
},
{
"epoch": 0.3,
"learning_rate": 1.5001949993500023e-05,
"loss": 0.0016,
"step": 6924
},
{
"epoch": 0.4,
"learning_rate": 2.0002599991333363e-05,
"loss": 0.0016,
"step": 9232
},
{
"epoch": 0.5,
"learning_rate": 2.5003249989166704e-05,
"loss": 0.0016,
"step": 11540
},
{
"epoch": 0.6,
"learning_rate": 3.0003899987000046e-05,
"loss": 0.0015,
"step": 13848
},
{
"epoch": 0.7,
"learning_rate": 3.5004549984833384e-05,
"loss": 0.0016,
"step": 16156
},
{
"epoch": 0.8,
"learning_rate": 4.0005199982666725e-05,
"loss": 0.0017,
"step": 18464
},
{
"epoch": 0.9,
"learning_rate": 4.500584998050007e-05,
"loss": 0.0017,
"step": 20772
},
{
"epoch": 1.0,
"eval_loss": 0.00893497932702303,
"eval_max_distance": 1,
"eval_mean_distance": 0,
"eval_runtime": 18.8658,
"eval_samples_per_second": 13.676,
"eval_steps_per_second": 0.954,
"step": 23077
},
{
"epoch": 1.0,
"learning_rate": 5.000649997833341e-05,
"loss": 0.0018,
"step": 23080
},
{
"epoch": 1.1,
"learning_rate": 5.500714997616675e-05,
"loss": 0.0017,
"step": 25388
},
{
"epoch": 1.2,
"learning_rate": 6.000779997400009e-05,
"loss": 0.0017,
"step": 27696
},
{
"epoch": 1.3,
"learning_rate": 6.500844997183343e-05,
"loss": 0.0017,
"step": 30004
},
{
"epoch": 1.4,
"learning_rate": 7.000909996966677e-05,
"loss": 0.0017,
"step": 32312
},
{
"epoch": 1.5,
"learning_rate": 7.50097499675001e-05,
"loss": 0.0017,
"step": 34620
},
{
"epoch": 1.6,
"learning_rate": 8.001039996533345e-05,
"loss": 0.0018,
"step": 36928
},
{
"epoch": 1.7,
"learning_rate": 8.501104996316679e-05,
"loss": 0.0017,
"step": 39236
},
{
"epoch": 1.8,
"learning_rate": 9.001169996100013e-05,
"loss": 0.0018,
"step": 41544
},
{
"epoch": 1.9,
"learning_rate": 9.501234995883348e-05,
"loss": 0.0019,
"step": 43852
},
{
"epoch": 2.0,
"eval_loss": 0.008421082980930805,
"eval_max_distance": 3,
"eval_mean_distance": 0,
"eval_runtime": 18.622,
"eval_samples_per_second": 13.855,
"eval_steps_per_second": 0.967,
"step": 46154
},
{
"epoch": 2.0,
"learning_rate": 9.999855556037036e-05,
"loss": 0.0018,
"step": 46160
},
{
"epoch": 2.1,
"learning_rate": 9.944292778283332e-05,
"loss": 0.0019,
"step": 48468
},
{
"epoch": 2.2,
"learning_rate": 9.888730000529628e-05,
"loss": 0.0018,
"step": 50776
},
{
"epoch": 2.3,
"learning_rate": 9.833167222775925e-05,
"loss": 0.0019,
"step": 53084
},
{
"epoch": 2.4,
"learning_rate": 9.777604445022221e-05,
"loss": 0.0021,
"step": 55392
},
{
"epoch": 2.5,
"learning_rate": 9.722041667268516e-05,
"loss": 0.0021,
"step": 57700
},
{
"epoch": 2.6,
"learning_rate": 9.666478889514812e-05,
"loss": 0.0021,
"step": 60008
},
{
"epoch": 2.7,
"learning_rate": 9.610916111761109e-05,
"loss": 0.002,
"step": 62316
},
{
"epoch": 2.8,
"learning_rate": 9.555353334007405e-05,
"loss": 0.0022,
"step": 64624
},
{
"epoch": 2.9,
"learning_rate": 9.499790556253701e-05,
"loss": 0.002,
"step": 66932
},
{
"epoch": 3.0,
"eval_loss": 0.008709550835192204,
"eval_max_distance": 1,
"eval_mean_distance": 0,
"eval_runtime": 17.0366,
"eval_samples_per_second": 15.144,
"eval_steps_per_second": 1.057,
"step": 69231
},
{
"epoch": 3.0,
"learning_rate": 9.444227778499997e-05,
"loss": 0.0022,
"step": 69240
},
{
"epoch": 3.1,
"learning_rate": 9.388665000746294e-05,
"loss": 0.0018,
"step": 71548
},
{
"epoch": 3.2,
"learning_rate": 9.33310222299259e-05,
"loss": 0.0019,
"step": 73856
},
{
"epoch": 3.3,
"learning_rate": 9.277539445238886e-05,
"loss": 0.0019,
"step": 76164
},
{
"epoch": 3.4,
"learning_rate": 9.221976667485182e-05,
"loss": 0.0019,
"step": 78472
},
{
"epoch": 3.5,
"learning_rate": 9.166413889731479e-05,
"loss": 0.0018,
"step": 80780
},
{
"epoch": 3.6,
"learning_rate": 9.110851111977775e-05,
"loss": 0.002,
"step": 83088
},
{
"epoch": 3.7,
"learning_rate": 9.055288334224071e-05,
"loss": 0.0019,
"step": 85396
},
{
"epoch": 3.8,
"learning_rate": 8.999725556470368e-05,
"loss": 0.002,
"step": 87704
},
{
"epoch": 3.9,
"learning_rate": 8.944162778716664e-05,
"loss": 0.0021,
"step": 90012
},
{
"epoch": 4.0,
"eval_loss": 0.011967115104198456,
"eval_max_distance": 4,
"eval_mean_distance": 0,
"eval_runtime": 18.1135,
"eval_samples_per_second": 14.243,
"eval_steps_per_second": 0.994,
"step": 92308
},
{
"epoch": 4.0,
"learning_rate": 8.88860000096296e-05,
"loss": 0.0021,
"step": 92320
},
{
"epoch": 4.1,
"learning_rate": 8.833037223209256e-05,
"loss": 0.0018,
"step": 94628
},
{
"epoch": 4.2,
"learning_rate": 8.777474445455553e-05,
"loss": 0.0018,
"step": 96936
},
{
"epoch": 4.3,
"learning_rate": 8.721911667701849e-05,
"loss": 0.0018,
"step": 99244
},
{
"epoch": 4.4,
"learning_rate": 8.666348889948145e-05,
"loss": 0.0018,
"step": 101552
},
{
"epoch": 4.5,
"learning_rate": 8.610786112194442e-05,
"loss": 0.0017,
"step": 103860
},
{
"epoch": 4.6,
"learning_rate": 8.555223334440738e-05,
"loss": 0.0019,
"step": 106168
},
{
"epoch": 4.7,
"learning_rate": 8.499660556687034e-05,
"loss": 0.0019,
"step": 108476
},
{
"epoch": 4.8,
"learning_rate": 8.44409777893333e-05,
"loss": 0.002,
"step": 110784
},
{
"epoch": 4.9,
"learning_rate": 8.388535001179625e-05,
"loss": 0.0019,
"step": 113092
},
{
"epoch": 5.0,
"eval_loss": 0.010002830997109413,
"eval_max_distance": 4,
"eval_mean_distance": 0,
"eval_runtime": 17.3823,
"eval_samples_per_second": 14.843,
"eval_steps_per_second": 1.036,
"step": 115385
},
{
"epoch": 5.0,
"learning_rate": 8.332972223425922e-05,
"loss": 0.0019,
"step": 115400
},
{
"epoch": 5.1,
"learning_rate": 8.277409445672218e-05,
"loss": 0.0017,
"step": 117708
},
{
"epoch": 5.2,
"learning_rate": 8.221846667918514e-05,
"loss": 0.0017,
"step": 120016
},
{
"epoch": 5.3,
"learning_rate": 8.16628389016481e-05,
"loss": 0.0017,
"step": 122324
},
{
"epoch": 5.4,
"learning_rate": 8.110721112411107e-05,
"loss": 0.0017,
"step": 124632
},
{
"epoch": 5.5,
"learning_rate": 8.055158334657403e-05,
"loss": 0.0018,
"step": 126940
},
{
"epoch": 5.6,
"learning_rate": 7.999595556903699e-05,
"loss": 0.0017,
"step": 129248
},
{
"epoch": 5.7,
"learning_rate": 7.944032779149996e-05,
"loss": 0.0017,
"step": 131556
},
{
"epoch": 5.8,
"learning_rate": 7.888470001396292e-05,
"loss": 0.0017,
"step": 133864
},
{
"epoch": 5.9,
"learning_rate": 7.832907223642588e-05,
"loss": 0.0018,
"step": 136172
},
{
"epoch": 6.0,
"eval_loss": 0.011129369959235191,
"eval_max_distance": 3,
"eval_mean_distance": 0,
"eval_runtime": 17.0083,
"eval_samples_per_second": 15.169,
"eval_steps_per_second": 1.058,
"step": 138462
},
{
"epoch": 6.0,
"learning_rate": 7.777344445888884e-05,
"loss": 0.0018,
"step": 138480
},
{
"epoch": 6.1,
"learning_rate": 7.721781668135181e-05,
"loss": 0.0016,
"step": 140788
},
{
"epoch": 6.2,
"learning_rate": 7.666218890381477e-05,
"loss": 0.0016,
"step": 143096
},
{
"epoch": 6.3,
"learning_rate": 7.610656112627773e-05,
"loss": 0.0017,
"step": 145404
},
{
"epoch": 6.4,
"learning_rate": 7.55509333487407e-05,
"loss": 0.0017,
"step": 147712
},
{
"epoch": 6.5,
"learning_rate": 7.499530557120366e-05,
"loss": 0.0017,
"step": 150020
},
{
"epoch": 6.6,
"learning_rate": 7.443967779366662e-05,
"loss": 0.0016,
"step": 152328
},
{
"epoch": 6.7,
"learning_rate": 7.388405001612958e-05,
"loss": 0.0016,
"step": 154636
},
{
"epoch": 6.8,
"learning_rate": 7.332842223859255e-05,
"loss": 0.0017,
"step": 156944
},
{
"epoch": 6.9,
"learning_rate": 7.277279446105551e-05,
"loss": 0.0017,
"step": 159252
},
{
"epoch": 7.0,
"eval_loss": 0.007010257337242365,
"eval_max_distance": 3,
"eval_mean_distance": 0,
"eval_runtime": 16.7205,
"eval_samples_per_second": 15.43,
"eval_steps_per_second": 1.077,
"step": 161539
},
{
"epoch": 7.0,
"learning_rate": 7.221716668351847e-05,
"loss": 0.0017,
"step": 161560
},
{
"epoch": 7.1,
"learning_rate": 7.166153890598143e-05,
"loss": 0.0015,
"step": 163868
},
{
"epoch": 7.2,
"learning_rate": 7.11059111284444e-05,
"loss": 0.0015,
"step": 166176
},
{
"epoch": 7.3,
"learning_rate": 7.055028335090735e-05,
"loss": 0.0015,
"step": 168484
},
{
"epoch": 7.4,
"learning_rate": 6.999465557337031e-05,
"loss": 0.0015,
"step": 170792
},
{
"epoch": 7.5,
"learning_rate": 6.943902779583327e-05,
"loss": 0.0015,
"step": 173100
},
{
"epoch": 7.6,
"learning_rate": 6.888340001829624e-05,
"loss": 0.0016,
"step": 175408
},
{
"epoch": 7.7,
"learning_rate": 6.83277722407592e-05,
"loss": 0.0015,
"step": 177716
},
{
"epoch": 7.8,
"learning_rate": 6.777214446322216e-05,
"loss": 0.0016,
"step": 180024
},
{
"epoch": 7.9,
"learning_rate": 6.721651668568512e-05,
"loss": 0.0017,
"step": 182332
},
{
"epoch": 8.0,
"eval_loss": 0.01417616382241249,
"eval_max_distance": 4,
"eval_mean_distance": 0,
"eval_runtime": 17.4179,
"eval_samples_per_second": 14.812,
"eval_steps_per_second": 1.033,
"step": 184616
},
{
"epoch": 8.0,
"learning_rate": 6.666088890814809e-05,
"loss": 0.0016,
"step": 184640
},
{
"epoch": 8.1,
"learning_rate": 6.610526113061105e-05,
"loss": 0.0014,
"step": 186948
},
{
"epoch": 8.2,
"learning_rate": 6.554963335307401e-05,
"loss": 0.0014,
"step": 189256
},
{
"epoch": 8.3,
"learning_rate": 6.499400557553698e-05,
"loss": 0.0014,
"step": 191564
},
{
"epoch": 8.4,
"learning_rate": 6.443837779799994e-05,
"loss": 0.0015,
"step": 193872
},
{
"epoch": 8.5,
"learning_rate": 6.38827500204629e-05,
"loss": 0.0014,
"step": 196180
},
{
"epoch": 8.6,
"learning_rate": 6.332712224292586e-05,
"loss": 0.0014,
"step": 198488
},
{
"epoch": 8.7,
"learning_rate": 6.277149446538883e-05,
"loss": 0.0015,
"step": 200796
},
{
"epoch": 8.8,
"learning_rate": 6.221586668785179e-05,
"loss": 0.0015,
"step": 203104
},
{
"epoch": 8.9,
"learning_rate": 6.166023891031475e-05,
"loss": 0.0014,
"step": 205412
},
{
"epoch": 9.0,
"eval_loss": 0.011828480288386345,
"eval_max_distance": 4,
"eval_mean_distance": 0,
"eval_runtime": 17.0049,
"eval_samples_per_second": 15.172,
"eval_steps_per_second": 1.059,
"step": 207693
},
{
"epoch": 9.0,
"learning_rate": 6.110461113277771e-05,
"loss": 0.0015,
"step": 207720
},
{
"epoch": 9.1,
"learning_rate": 6.054898335524067e-05,
"loss": 0.0014,
"step": 210028
},
{
"epoch": 9.2,
"learning_rate": 5.9993355577703634e-05,
"loss": 0.0013,
"step": 212336
},
{
"epoch": 9.3,
"learning_rate": 5.9437727800166596e-05,
"loss": 0.0014,
"step": 214644
},
{
"epoch": 9.4,
"learning_rate": 5.888210002262956e-05,
"loss": 0.0014,
"step": 216952
},
{
"epoch": 9.5,
"learning_rate": 5.832647224509252e-05,
"loss": 0.0013,
"step": 219260
},
{
"epoch": 9.6,
"learning_rate": 5.7770844467555485e-05,
"loss": 0.0013,
"step": 221568
},
{
"epoch": 9.7,
"learning_rate": 5.721521669001845e-05,
"loss": 0.0013,
"step": 223876
},
{
"epoch": 9.8,
"learning_rate": 5.665958891248141e-05,
"loss": 0.0014,
"step": 226184
},
{
"epoch": 9.9,
"learning_rate": 5.610396113494437e-05,
"loss": 0.0014,
"step": 228492
},
{
"epoch": 10.0,
"eval_loss": 0.011539922095835209,
"eval_max_distance": 3,
"eval_mean_distance": 0,
"eval_runtime": 17.0363,
"eval_samples_per_second": 15.144,
"eval_steps_per_second": 1.057,
"step": 230770
},
{
"epoch": 10.0,
"learning_rate": 5.5548333357407336e-05,
"loss": 0.0015,
"step": 230800
},
{
"epoch": 10.1,
"learning_rate": 5.49927055798703e-05,
"loss": 0.0013,
"step": 233108
},
{
"epoch": 10.2,
"learning_rate": 5.443707780233326e-05,
"loss": 0.0012,
"step": 235416
},
{
"epoch": 10.3,
"learning_rate": 5.388145002479622e-05,
"loss": 0.0013,
"step": 237724
},
{
"epoch": 10.4,
"learning_rate": 5.332582224725918e-05,
"loss": 0.0013,
"step": 240032
},
{
"epoch": 10.5,
"learning_rate": 5.277019446972214e-05,
"loss": 0.0014,
"step": 242340
},
{
"epoch": 10.6,
"learning_rate": 5.2214566692185106e-05,
"loss": 0.0013,
"step": 244648
},
{
"epoch": 10.7,
"learning_rate": 5.165893891464807e-05,
"loss": 0.0013,
"step": 246956
},
{
"epoch": 10.8,
"learning_rate": 5.110331113711103e-05,
"loss": 0.0013,
"step": 249264
},
{
"epoch": 10.9,
"learning_rate": 5.0547683359573994e-05,
"loss": 0.0013,
"step": 251572
},
{
"epoch": 11.0,
"eval_loss": 0.011254764162003994,
"eval_max_distance": 3,
"eval_mean_distance": 0,
"eval_runtime": 16.9212,
"eval_samples_per_second": 15.247,
"eval_steps_per_second": 1.064,
"step": 253847
},
{
"epoch": 11.0,
"learning_rate": 4.999205558203695e-05,
"loss": 0.0012,
"step": 253880
},
{
"epoch": 11.1,
"learning_rate": 4.943642780449991e-05,
"loss": 0.0012,
"step": 256188
},
{
"epoch": 11.2,
"learning_rate": 4.8880800026962876e-05,
"loss": 0.0012,
"step": 258496
},
{
"epoch": 11.3,
"learning_rate": 4.832517224942584e-05,
"loss": 0.0013,
"step": 260804
},
{
"epoch": 11.4,
"learning_rate": 4.77695444718888e-05,
"loss": 0.0012,
"step": 263112
},
{
"epoch": 11.5,
"learning_rate": 4.7213916694351764e-05,
"loss": 0.0012,
"step": 265420
},
{
"epoch": 11.6,
"learning_rate": 4.665828891681473e-05,
"loss": 0.0012,
"step": 267728
},
{
"epoch": 11.7,
"learning_rate": 4.610266113927768e-05,
"loss": 0.0013,
"step": 270036
},
{
"epoch": 11.8,
"learning_rate": 4.5547033361740646e-05,
"loss": 0.0012,
"step": 272344
},
{
"epoch": 11.9,
"learning_rate": 4.499140558420361e-05,
"loss": 0.0012,
"step": 274652
},
{
"epoch": 12.0,
"eval_loss": 0.012018387205898762,
"eval_max_distance": 3,
"eval_mean_distance": 0,
"eval_runtime": 16.9292,
"eval_samples_per_second": 15.24,
"eval_steps_per_second": 1.063,
"step": 276924
},
{
"epoch": 12.0,
"learning_rate": 4.443577780666657e-05,
"loss": 0.0013,
"step": 276960
},
{
"epoch": 12.1,
"learning_rate": 4.3880150029129535e-05,
"loss": 0.0012,
"step": 279268
},
{
"epoch": 12.2,
"learning_rate": 4.33245222515925e-05,
"loss": 0.0011,
"step": 281576
},
{
"epoch": 12.3,
"learning_rate": 4.276889447405546e-05,
"loss": 0.0012,
"step": 283884
},
{
"epoch": 12.4,
"learning_rate": 4.221326669651842e-05,
"loss": 0.0012,
"step": 286192
},
{
"epoch": 12.5,
"learning_rate": 4.1657638918981386e-05,
"loss": 0.0012,
"step": 288500
},
{
"epoch": 12.6,
"learning_rate": 4.110201114144435e-05,
"loss": 0.0011,
"step": 290808
},
{
"epoch": 12.7,
"learning_rate": 4.054638336390731e-05,
"loss": 0.0011,
"step": 293116
},
{
"epoch": 12.8,
"learning_rate": 3.9990755586370274e-05,
"loss": 0.0012,
"step": 295424
},
{
"epoch": 12.9,
"learning_rate": 3.943512780883323e-05,
"loss": 0.0012,
"step": 297732
},
{
"epoch": 13.0,
"eval_loss": 0.013248566538095474,
"eval_max_distance": 3,
"eval_mean_distance": 0,
"eval_runtime": 16.6859,
"eval_samples_per_second": 15.462,
"eval_steps_per_second": 1.079,
"step": 300001
},
{
"epoch": 13.0,
"learning_rate": 3.887950003129619e-05,
"loss": 0.0012,
"step": 300040
},
{
"epoch": 13.1,
"learning_rate": 3.8323872253759156e-05,
"loss": 0.0011,
"step": 302348
},
{
"epoch": 13.2,
"learning_rate": 3.776824447622212e-05,
"loss": 0.0011,
"step": 304656
},
{
"epoch": 13.3,
"learning_rate": 3.721261669868508e-05,
"loss": 0.0011,
"step": 306964
},
{
"epoch": 13.4,
"learning_rate": 3.6656988921148044e-05,
"loss": 0.0012,
"step": 309272
},
{
"epoch": 13.5,
"learning_rate": 3.954328163153008e-06,
"loss": 0.0012,
"step": 311580
},
{
"epoch": 13.6,
"learning_rate": 3.16057364927606e-06,
"loss": 0.0011,
"step": 313888
},
{
"epoch": 13.7,
"learning_rate": 2.3668191353991126e-06,
"loss": 0.0011,
"step": 316196
},
{
"epoch": 13.8,
"learning_rate": 1.5730646215221654e-06,
"loss": 0.001,
"step": 318504
},
{
"epoch": 13.9,
"learning_rate": 7.79310107645218e-07,
"loss": 0.001,
"step": 320812
},
{
"epoch": 14.0,
"eval_loss": 0.011406470090150833,
"eval_max_distance": 3,
"eval_mean_distance": 0,
"eval_runtime": 18.3924,
"eval_samples_per_second": 14.028,
"eval_steps_per_second": 0.979,
"step": 323078
},
{
"epoch": 14.0,
"step": 323078,
"total_flos": 8.183655700394803e+16,
"train_loss": 4.6522844528394624e-05,
"train_runtime": 1031.7354,
"train_samples_per_second": 4696.956,
"train_steps_per_second": 313.14
}
],
"logging_steps": 2308,
"max_steps": 323078,
"num_train_epochs": 14,
"save_steps": 4616,
"total_flos": 8.183655700394803e+16,
"trial_name": null,
"trial_params": null
}