move-object-left-right-refined / trainer_state.json
LegrandFrederic's picture
Upload trainer_state.json with huggingface_hub
1e9583e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 3800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02631578947368421,
"grad_norm": 9.638947486877441,
"learning_rate": 5.263157894736842e-06,
"loss": 1.4471,
"step": 10
},
{
"epoch": 0.05263157894736842,
"grad_norm": 9.498916625976562,
"learning_rate": 1.0526315789473684e-05,
"loss": 1.2214,
"step": 20
},
{
"epoch": 0.07894736842105263,
"grad_norm": 2.470106601715088,
"learning_rate": 1.5789473684210526e-05,
"loss": 0.6744,
"step": 30
},
{
"epoch": 0.10526315789473684,
"grad_norm": 2.844433546066284,
"learning_rate": 2.105263157894737e-05,
"loss": 0.4362,
"step": 40
},
{
"epoch": 0.13157894736842105,
"grad_norm": 1.874556303024292,
"learning_rate": 2.6315789473684212e-05,
"loss": 0.3136,
"step": 50
},
{
"epoch": 0.15789473684210525,
"grad_norm": 1.6943750381469727,
"learning_rate": 3.157894736842105e-05,
"loss": 0.2589,
"step": 60
},
{
"epoch": 0.18421052631578946,
"grad_norm": 2.1084232330322266,
"learning_rate": 3.6842105263157895e-05,
"loss": 0.2282,
"step": 70
},
{
"epoch": 0.21052631578947367,
"grad_norm": 1.509325623512268,
"learning_rate": 4.210526315789474e-05,
"loss": 0.1952,
"step": 80
},
{
"epoch": 0.23684210526315788,
"grad_norm": 2.0447444915771484,
"learning_rate": 4.736842105263158e-05,
"loss": 0.1842,
"step": 90
},
{
"epoch": 0.2631578947368421,
"grad_norm": 1.149940013885498,
"learning_rate": 5.2631578947368424e-05,
"loss": 0.1915,
"step": 100
},
{
"epoch": 0.2894736842105263,
"grad_norm": 1.1014671325683594,
"learning_rate": 5.789473684210527e-05,
"loss": 0.1688,
"step": 110
},
{
"epoch": 0.3157894736842105,
"grad_norm": 1.9964191913604736,
"learning_rate": 6.31578947368421e-05,
"loss": 0.1494,
"step": 120
},
{
"epoch": 0.34210526315789475,
"grad_norm": 0.7053777575492859,
"learning_rate": 6.842105263157895e-05,
"loss": 0.1418,
"step": 130
},
{
"epoch": 0.3684210526315789,
"grad_norm": 1.077986240386963,
"learning_rate": 7.368421052631579e-05,
"loss": 0.1442,
"step": 140
},
{
"epoch": 0.39473684210526316,
"grad_norm": 1.095324993133545,
"learning_rate": 7.894736842105263e-05,
"loss": 0.1332,
"step": 150
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.9210699200630188,
"learning_rate": 8.421052631578948e-05,
"loss": 0.1322,
"step": 160
},
{
"epoch": 0.4473684210526316,
"grad_norm": 1.3079745769500732,
"learning_rate": 8.947368421052632e-05,
"loss": 0.1204,
"step": 170
},
{
"epoch": 0.47368421052631576,
"grad_norm": 1.196655035018921,
"learning_rate": 9.473684210526316e-05,
"loss": 0.1227,
"step": 180
},
{
"epoch": 0.5,
"grad_norm": 0.8218312859535217,
"learning_rate": 0.0001,
"loss": 0.1215,
"step": 190
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.9372950196266174,
"learning_rate": 9.999810668616086e-05,
"loss": 0.114,
"step": 200
},
{
"epoch": 0.5526315789473685,
"grad_norm": 1.0510334968566895,
"learning_rate": 9.999242688802886e-05,
"loss": 0.1101,
"step": 210
},
{
"epoch": 0.5789473684210527,
"grad_norm": 0.7145567536354065,
"learning_rate": 9.998296103574967e-05,
"loss": 0.0964,
"step": 220
},
{
"epoch": 0.6052631578947368,
"grad_norm": 0.7895988821983337,
"learning_rate": 9.996970984619641e-05,
"loss": 0.0948,
"step": 230
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.8074854612350464,
"learning_rate": 9.995267432291555e-05,
"loss": 0.1025,
"step": 240
},
{
"epoch": 0.6578947368421053,
"grad_norm": 0.9896809458732605,
"learning_rate": 9.993185575605073e-05,
"loss": 0.0954,
"step": 250
},
{
"epoch": 0.6842105263157895,
"grad_norm": 1.2382564544677734,
"learning_rate": 9.990725572224521e-05,
"loss": 0.0965,
"step": 260
},
{
"epoch": 0.7105263157894737,
"grad_norm": 0.6677968502044678,
"learning_rate": 9.987887608452235e-05,
"loss": 0.104,
"step": 270
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.8802872896194458,
"learning_rate": 9.984671899214457e-05,
"loss": 0.0936,
"step": 280
},
{
"epoch": 0.7631578947368421,
"grad_norm": 0.5323192477226257,
"learning_rate": 9.981078688045062e-05,
"loss": 0.0937,
"step": 290
},
{
"epoch": 0.7894736842105263,
"grad_norm": 0.43582749366760254,
"learning_rate": 9.977108247067108e-05,
"loss": 0.0913,
"step": 300
},
{
"epoch": 0.8157894736842105,
"grad_norm": 0.5718427896499634,
"learning_rate": 9.972760876972226e-05,
"loss": 0.0913,
"step": 310
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.5905811190605164,
"learning_rate": 9.968036906997855e-05,
"loss": 0.091,
"step": 320
},
{
"epoch": 0.868421052631579,
"grad_norm": 0.5551950931549072,
"learning_rate": 9.962936694902307e-05,
"loss": 0.0827,
"step": 330
},
{
"epoch": 0.8947368421052632,
"grad_norm": 0.37185293436050415,
"learning_rate": 9.957460626937664e-05,
"loss": 0.0768,
"step": 340
},
{
"epoch": 0.9210526315789473,
"grad_norm": 0.7125674486160278,
"learning_rate": 9.951609117820538e-05,
"loss": 0.0878,
"step": 350
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.518326461315155,
"learning_rate": 9.945382610700657e-05,
"loss": 0.0841,
"step": 360
},
{
"epoch": 0.9736842105263158,
"grad_norm": 0.40797701478004456,
"learning_rate": 9.938781577127306e-05,
"loss": 0.0899,
"step": 370
},
{
"epoch": 1.0,
"grad_norm": 0.8398942351341248,
"learning_rate": 9.931806517013612e-05,
"loss": 0.082,
"step": 380
},
{
"epoch": 1.0263157894736843,
"grad_norm": 0.5522546768188477,
"learning_rate": 9.92445795859869e-05,
"loss": 0.0833,
"step": 390
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.7629371285438538,
"learning_rate": 9.916736458407632e-05,
"loss": 0.0782,
"step": 400
},
{
"epoch": 1.0789473684210527,
"grad_norm": 0.6491877436637878,
"learning_rate": 9.908642601209366e-05,
"loss": 0.0795,
"step": 410
},
{
"epoch": 1.1052631578947367,
"grad_norm": 0.5657724142074585,
"learning_rate": 9.900176999972366e-05,
"loss": 0.0763,
"step": 420
},
{
"epoch": 1.131578947368421,
"grad_norm": 0.9348977208137512,
"learning_rate": 9.89134029581823e-05,
"loss": 0.0789,
"step": 430
},
{
"epoch": 1.1578947368421053,
"grad_norm": 0.9060670733451843,
"learning_rate": 9.88213315797313e-05,
"loss": 0.0897,
"step": 440
},
{
"epoch": 1.1842105263157894,
"grad_norm": 0.5024181008338928,
"learning_rate": 9.872556283717125e-05,
"loss": 0.0781,
"step": 450
},
{
"epoch": 1.2105263157894737,
"grad_norm": 0.6639284491539001,
"learning_rate": 9.86261039833136e-05,
"loss": 0.0871,
"step": 460
},
{
"epoch": 1.236842105263158,
"grad_norm": 0.572541356086731,
"learning_rate": 9.852296255043129e-05,
"loss": 0.0741,
"step": 470
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.8119587898254395,
"learning_rate": 9.841614634968843e-05,
"loss": 0.0732,
"step": 480
},
{
"epoch": 1.2894736842105263,
"grad_norm": 0.45451977849006653,
"learning_rate": 9.830566347054868e-05,
"loss": 0.0734,
"step": 490
},
{
"epoch": 1.3157894736842106,
"grad_norm": 0.48096764087677,
"learning_rate": 9.819152228016257e-05,
"loss": 0.0729,
"step": 500
},
{
"epoch": 1.3421052631578947,
"grad_norm": 0.4565185606479645,
"learning_rate": 9.807373142273395e-05,
"loss": 0.0673,
"step": 510
},
{
"epoch": 1.368421052631579,
"grad_norm": 0.7846236824989319,
"learning_rate": 9.795229981886521e-05,
"loss": 0.0687,
"step": 520
},
{
"epoch": 1.3947368421052633,
"grad_norm": 0.46232283115386963,
"learning_rate": 9.782723666488181e-05,
"loss": 0.0718,
"step": 530
},
{
"epoch": 1.4210526315789473,
"grad_norm": 0.5114668607711792,
"learning_rate": 9.769855143213575e-05,
"loss": 0.0739,
"step": 540
},
{
"epoch": 1.4473684210526316,
"grad_norm": 0.7048826217651367,
"learning_rate": 9.756625386628832e-05,
"loss": 0.066,
"step": 550
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.609493613243103,
"learning_rate": 9.743035398657201e-05,
"loss": 0.0744,
"step": 560
},
{
"epoch": 1.5,
"grad_norm": 0.6096906065940857,
"learning_rate": 9.729086208503174e-05,
"loss": 0.0678,
"step": 570
},
{
"epoch": 1.526315789473684,
"grad_norm": 0.8587449193000793,
"learning_rate": 9.714778872574541e-05,
"loss": 0.0656,
"step": 580
},
{
"epoch": 1.5526315789473686,
"grad_norm": 0.576169490814209,
"learning_rate": 9.700114474402387e-05,
"loss": 0.0715,
"step": 590
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.6562088131904602,
"learning_rate": 9.685094124559034e-05,
"loss": 0.0694,
"step": 600
},
{
"epoch": 1.6052631578947367,
"grad_norm": 0.3795994818210602,
"learning_rate": 9.669718960573927e-05,
"loss": 0.0643,
"step": 610
},
{
"epoch": 1.631578947368421,
"grad_norm": 0.5144829154014587,
"learning_rate": 9.653990146847499e-05,
"loss": 0.0713,
"step": 620
},
{
"epoch": 1.6578947368421053,
"grad_norm": 0.34501057863235474,
"learning_rate": 9.637908874562978e-05,
"loss": 0.0655,
"step": 630
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.3585371673107147,
"learning_rate": 9.621476361596177e-05,
"loss": 0.0634,
"step": 640
},
{
"epoch": 1.7105263157894737,
"grad_norm": 0.4583841562271118,
"learning_rate": 9.604693852423268e-05,
"loss": 0.0603,
"step": 650
},
{
"epoch": 1.736842105263158,
"grad_norm": 0.372306764125824,
"learning_rate": 9.58756261802652e-05,
"loss": 0.0606,
"step": 660
},
{
"epoch": 1.763157894736842,
"grad_norm": 0.49868544936180115,
"learning_rate": 9.570083955798065e-05,
"loss": 0.0651,
"step": 670
},
{
"epoch": 1.7894736842105263,
"grad_norm": 0.531895101070404,
"learning_rate": 9.552259189441626e-05,
"loss": 0.0708,
"step": 680
},
{
"epoch": 1.8157894736842106,
"grad_norm": 0.5189070701599121,
"learning_rate": 9.534089668872274e-05,
"loss": 0.0688,
"step": 690
},
{
"epoch": 1.8421052631578947,
"grad_norm": 0.36183297634124756,
"learning_rate": 9.515576770114199e-05,
"loss": 0.0684,
"step": 700
},
{
"epoch": 1.868421052631579,
"grad_norm": 0.2646963894367218,
"learning_rate": 9.496721895196497e-05,
"loss": 0.0604,
"step": 710
},
{
"epoch": 1.8947368421052633,
"grad_norm": 0.4395381510257721,
"learning_rate": 9.477526472046995e-05,
"loss": 0.0523,
"step": 720
},
{
"epoch": 1.9210526315789473,
"grad_norm": 0.5579097867012024,
"learning_rate": 9.457991954384105e-05,
"loss": 0.0612,
"step": 730
},
{
"epoch": 1.9473684210526314,
"grad_norm": 0.4455315172672272,
"learning_rate": 9.438119821606727e-05,
"loss": 0.0625,
"step": 740
},
{
"epoch": 1.973684210526316,
"grad_norm": 0.5463271737098694,
"learning_rate": 9.417911578682229e-05,
"loss": 0.0605,
"step": 750
},
{
"epoch": 2.0,
"grad_norm": 0.7862467765808105,
"learning_rate": 9.397368756032445e-05,
"loss": 0.0606,
"step": 760
},
{
"epoch": 2.026315789473684,
"grad_norm": 0.48962733149528503,
"learning_rate": 9.376492909417795e-05,
"loss": 0.0565,
"step": 770
},
{
"epoch": 2.0526315789473686,
"grad_norm": 0.37852731347084045,
"learning_rate": 9.35528561981945e-05,
"loss": 0.0622,
"step": 780
},
{
"epoch": 2.0789473684210527,
"grad_norm": 0.37320488691329956,
"learning_rate": 9.333748493319603e-05,
"loss": 0.0602,
"step": 790
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.5312328338623047,
"learning_rate": 9.311883160979844e-05,
"loss": 0.0522,
"step": 800
},
{
"epoch": 2.1315789473684212,
"grad_norm": 0.5505270957946777,
"learning_rate": 9.289691278717623e-05,
"loss": 0.0544,
"step": 810
},
{
"epoch": 2.1578947368421053,
"grad_norm": 0.6213037371635437,
"learning_rate": 9.267174527180853e-05,
"loss": 0.0647,
"step": 820
},
{
"epoch": 2.1842105263157894,
"grad_norm": 0.5001884698867798,
"learning_rate": 9.244334611620629e-05,
"loss": 0.0573,
"step": 830
},
{
"epoch": 2.2105263157894735,
"grad_norm": 0.41479626297950745,
"learning_rate": 9.221173261762073e-05,
"loss": 0.0576,
"step": 840
},
{
"epoch": 2.236842105263158,
"grad_norm": 0.4636107385158539,
"learning_rate": 9.197692231673361e-05,
"loss": 0.0607,
"step": 850
},
{
"epoch": 2.263157894736842,
"grad_norm": 0.4859972298145294,
"learning_rate": 9.173893299632856e-05,
"loss": 0.0561,
"step": 860
},
{
"epoch": 2.2894736842105265,
"grad_norm": 0.3786041736602783,
"learning_rate": 9.149778267994457e-05,
"loss": 0.0594,
"step": 870
},
{
"epoch": 2.3157894736842106,
"grad_norm": 0.5478907823562622,
"learning_rate": 9.12534896305109e-05,
"loss": 0.0547,
"step": 880
},
{
"epoch": 2.3421052631578947,
"grad_norm": 0.3971192240715027,
"learning_rate": 9.100607234896397e-05,
"loss": 0.058,
"step": 890
},
{
"epoch": 2.3684210526315788,
"grad_norm": 0.4420956075191498,
"learning_rate": 9.075554957284633e-05,
"loss": 0.0576,
"step": 900
},
{
"epoch": 2.3947368421052633,
"grad_norm": 0.5636497139930725,
"learning_rate": 9.050194027488754e-05,
"loss": 0.0513,
"step": 910
},
{
"epoch": 2.4210526315789473,
"grad_norm": 0.6197888255119324,
"learning_rate": 9.024526366156732e-05,
"loss": 0.0547,
"step": 920
},
{
"epoch": 2.4473684210526314,
"grad_norm": 0.5714887976646423,
"learning_rate": 8.998553917166108e-05,
"loss": 0.0605,
"step": 930
},
{
"epoch": 2.473684210526316,
"grad_norm": 0.3763100206851959,
"learning_rate": 8.972278647476764e-05,
"loss": 0.053,
"step": 940
},
{
"epoch": 2.5,
"grad_norm": 0.4168098270893097,
"learning_rate": 8.945702546981969e-05,
"loss": 0.0587,
"step": 950
},
{
"epoch": 2.526315789473684,
"grad_norm": 0.3482176959514618,
"learning_rate": 8.918827628357677e-05,
"loss": 0.053,
"step": 960
},
{
"epoch": 2.5526315789473686,
"grad_norm": 0.4938332736492157,
"learning_rate": 8.891655926910103e-05,
"loss": 0.06,
"step": 970
},
{
"epoch": 2.5789473684210527,
"grad_norm": 0.3451533317565918,
"learning_rate": 8.864189500421582e-05,
"loss": 0.0612,
"step": 980
},
{
"epoch": 2.6052631578947367,
"grad_norm": 0.4287240505218506,
"learning_rate": 8.836430428994732e-05,
"loss": 0.054,
"step": 990
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.6272952556610107,
"learning_rate": 8.808380814894912e-05,
"loss": 0.0555,
"step": 1000
},
{
"epoch": 2.6578947368421053,
"grad_norm": 0.46518048644065857,
"learning_rate": 8.780042782391028e-05,
"loss": 0.0608,
"step": 1010
},
{
"epoch": 2.6842105263157894,
"grad_norm": 0.7546030282974243,
"learning_rate": 8.751418477594645e-05,
"loss": 0.0566,
"step": 1020
},
{
"epoch": 2.7105263157894735,
"grad_norm": 0.6570419073104858,
"learning_rate": 8.722510068297454e-05,
"loss": 0.0565,
"step": 1030
},
{
"epoch": 2.736842105263158,
"grad_norm": 0.5381316542625427,
"learning_rate": 8.693319743807116e-05,
"loss": 0.0534,
"step": 1040
},
{
"epoch": 2.763157894736842,
"grad_norm": 0.566612958908081,
"learning_rate": 8.663849714781442e-05,
"loss": 0.05,
"step": 1050
},
{
"epoch": 2.7894736842105265,
"grad_norm": 0.49792104959487915,
"learning_rate": 8.634102213060984e-05,
"loss": 0.0501,
"step": 1060
},
{
"epoch": 2.8157894736842106,
"grad_norm": 0.39989131689071655,
"learning_rate": 8.60407949150001e-05,
"loss": 0.0541,
"step": 1070
},
{
"epoch": 2.8421052631578947,
"grad_norm": 0.3729485273361206,
"learning_rate": 8.573783823795889e-05,
"loss": 0.0568,
"step": 1080
},
{
"epoch": 2.8684210526315788,
"grad_norm": 0.3304407000541687,
"learning_rate": 8.543217504316896e-05,
"loss": 0.0508,
"step": 1090
},
{
"epoch": 2.8947368421052633,
"grad_norm": 0.3478791415691376,
"learning_rate": 8.512382847928461e-05,
"loss": 0.0544,
"step": 1100
},
{
"epoch": 2.9210526315789473,
"grad_norm": 0.35698530077934265,
"learning_rate": 8.48128218981785e-05,
"loss": 0.0522,
"step": 1110
},
{
"epoch": 2.9473684210526314,
"grad_norm": 0.37133410573005676,
"learning_rate": 8.44991788531732e-05,
"loss": 0.0488,
"step": 1120
},
{
"epoch": 2.973684210526316,
"grad_norm": 0.31634142994880676,
"learning_rate": 8.418292309725738e-05,
"loss": 0.0551,
"step": 1130
},
{
"epoch": 3.0,
"grad_norm": 0.5727193355560303,
"learning_rate": 8.386407858128706e-05,
"loss": 0.0497,
"step": 1140
},
{
"epoch": 3.026315789473684,
"grad_norm": 0.3365287482738495,
"learning_rate": 8.35426694521716e-05,
"loss": 0.048,
"step": 1150
},
{
"epoch": 3.0526315789473686,
"grad_norm": 0.26241013407707214,
"learning_rate": 8.321872005104509e-05,
"loss": 0.0517,
"step": 1160
},
{
"epoch": 3.0789473684210527,
"grad_norm": 0.39349165558815,
"learning_rate": 8.289225491142292e-05,
"loss": 0.0559,
"step": 1170
},
{
"epoch": 3.1052631578947367,
"grad_norm": 0.4374188482761383,
"learning_rate": 8.256329875734375e-05,
"loss": 0.0544,
"step": 1180
},
{
"epoch": 3.1315789473684212,
"grad_norm": 0.3512132465839386,
"learning_rate": 8.223187650149712e-05,
"loss": 0.0477,
"step": 1190
},
{
"epoch": 3.1578947368421053,
"grad_norm": 0.3832947313785553,
"learning_rate": 8.189801324333681e-05,
"loss": 0.0453,
"step": 1200
},
{
"epoch": 3.1842105263157894,
"grad_norm": 0.27593138813972473,
"learning_rate": 8.156173426717988e-05,
"loss": 0.0529,
"step": 1210
},
{
"epoch": 3.2105263157894735,
"grad_norm": 0.3951173424720764,
"learning_rate": 8.122306504029194e-05,
"loss": 0.0524,
"step": 1220
},
{
"epoch": 3.236842105263158,
"grad_norm": 0.3123633563518524,
"learning_rate": 8.08820312109583e-05,
"loss": 0.0477,
"step": 1230
},
{
"epoch": 3.263157894736842,
"grad_norm": 0.4703565835952759,
"learning_rate": 8.053865860654175e-05,
"loss": 0.0473,
"step": 1240
},
{
"epoch": 3.2894736842105265,
"grad_norm": 0.3525274991989136,
"learning_rate": 8.019297323152642e-05,
"loss": 0.057,
"step": 1250
},
{
"epoch": 3.3157894736842106,
"grad_norm": 0.352622926235199,
"learning_rate": 7.984500126554853e-05,
"loss": 0.0487,
"step": 1260
},
{
"epoch": 3.3421052631578947,
"grad_norm": 0.5331799387931824,
"learning_rate": 7.94947690614136e-05,
"loss": 0.0519,
"step": 1270
},
{
"epoch": 3.3684210526315788,
"grad_norm": 0.5408539772033691,
"learning_rate": 7.914230314310079e-05,
"loss": 0.0467,
"step": 1280
},
{
"epoch": 3.3947368421052633,
"grad_norm": 0.42730775475502014,
"learning_rate": 7.878763020375415e-05,
"loss": 0.0486,
"step": 1290
},
{
"epoch": 3.4210526315789473,
"grad_norm": 0.3785395920276642,
"learning_rate": 7.843077710366105e-05,
"loss": 0.0518,
"step": 1300
},
{
"epoch": 3.4473684210526314,
"grad_norm": 0.3049032986164093,
"learning_rate": 7.807177086821802e-05,
"loss": 0.0433,
"step": 1310
},
{
"epoch": 3.473684210526316,
"grad_norm": 0.3494362533092499,
"learning_rate": 7.771063868588399e-05,
"loss": 0.0453,
"step": 1320
},
{
"epoch": 3.5,
"grad_norm": 0.4008173942565918,
"learning_rate": 7.734740790612136e-05,
"loss": 0.0544,
"step": 1330
},
{
"epoch": 3.526315789473684,
"grad_norm": 0.5177074074745178,
"learning_rate": 7.698210603732454e-05,
"loss": 0.0433,
"step": 1340
},
{
"epoch": 3.5526315789473686,
"grad_norm": 0.42569538950920105,
"learning_rate": 7.661476074473695e-05,
"loss": 0.0428,
"step": 1350
},
{
"epoch": 3.5789473684210527,
"grad_norm": 0.36168932914733887,
"learning_rate": 7.624539984835557e-05,
"loss": 0.0466,
"step": 1360
},
{
"epoch": 3.6052631578947367,
"grad_norm": 0.4134399890899658,
"learning_rate": 7.587405132082433e-05,
"loss": 0.0468,
"step": 1370
},
{
"epoch": 3.6315789473684212,
"grad_norm": 0.2959583103656769,
"learning_rate": 7.550074328531545e-05,
"loss": 0.0477,
"step": 1380
},
{
"epoch": 3.6578947368421053,
"grad_norm": 0.4948980510234833,
"learning_rate": 7.512550401339971e-05,
"loss": 0.0453,
"step": 1390
},
{
"epoch": 3.6842105263157894,
"grad_norm": 0.5422468185424805,
"learning_rate": 7.47483619229054e-05,
"loss": 0.049,
"step": 1400
},
{
"epoch": 3.7105263157894735,
"grad_norm": 0.4651060402393341,
"learning_rate": 7.436934557576612e-05,
"loss": 0.0486,
"step": 1410
},
{
"epoch": 3.736842105263158,
"grad_norm": 0.37345531582832336,
"learning_rate": 7.39884836758576e-05,
"loss": 0.0428,
"step": 1420
},
{
"epoch": 3.763157894736842,
"grad_norm": 0.4845902621746063,
"learning_rate": 7.360580506682414e-05,
"loss": 0.0435,
"step": 1430
},
{
"epoch": 3.7894736842105265,
"grad_norm": 0.38114190101623535,
"learning_rate": 7.322133872989398e-05,
"loss": 0.0384,
"step": 1440
},
{
"epoch": 3.8157894736842106,
"grad_norm": 0.4120785892009735,
"learning_rate": 7.283511378168458e-05,
"loss": 0.0422,
"step": 1450
},
{
"epoch": 3.8421052631578947,
"grad_norm": 0.4169304370880127,
"learning_rate": 7.244715947199749e-05,
"loss": 0.0413,
"step": 1460
},
{
"epoch": 3.8684210526315788,
"grad_norm": 0.3038704991340637,
"learning_rate": 7.20575051816033e-05,
"loss": 0.0445,
"step": 1470
},
{
"epoch": 3.8947368421052633,
"grad_norm": 0.3277107775211334,
"learning_rate": 7.16661804200164e-05,
"loss": 0.0443,
"step": 1480
},
{
"epoch": 3.9210526315789473,
"grad_norm": 0.3675183057785034,
"learning_rate": 7.127321482326026e-05,
"loss": 0.046,
"step": 1490
},
{
"epoch": 3.9473684210526314,
"grad_norm": 0.3455309569835663,
"learning_rate": 7.087863815162298e-05,
"loss": 0.0424,
"step": 1500
},
{
"epoch": 3.973684210526316,
"grad_norm": 0.49531644582748413,
"learning_rate": 7.04824802874035e-05,
"loss": 0.0457,
"step": 1510
},
{
"epoch": 4.0,
"grad_norm": 0.6025938391685486,
"learning_rate": 7.008477123264848e-05,
"loss": 0.0497,
"step": 1520
},
{
"epoch": 4.026315789473684,
"grad_norm": 0.2605507969856262,
"learning_rate": 6.96855411068802e-05,
"loss": 0.047,
"step": 1530
},
{
"epoch": 4.052631578947368,
"grad_norm": 0.3557581603527069,
"learning_rate": 6.928482014481558e-05,
"loss": 0.0434,
"step": 1540
},
{
"epoch": 4.078947368421052,
"grad_norm": 0.6790323257446289,
"learning_rate": 6.888263869407631e-05,
"loss": 0.0431,
"step": 1550
},
{
"epoch": 4.105263157894737,
"grad_norm": 0.5850968360900879,
"learning_rate": 6.847902721289068e-05,
"loss": 0.0444,
"step": 1560
},
{
"epoch": 4.131578947368421,
"grad_norm": 0.22265614569187164,
"learning_rate": 6.807401626778679e-05,
"loss": 0.0463,
"step": 1570
},
{
"epoch": 4.157894736842105,
"grad_norm": 0.3996434509754181,
"learning_rate": 6.766763653127773e-05,
"loss": 0.0391,
"step": 1580
},
{
"epoch": 4.184210526315789,
"grad_norm": 0.4038737118244171,
"learning_rate": 6.725991877953868e-05,
"loss": 0.0432,
"step": 1590
},
{
"epoch": 4.2105263157894735,
"grad_norm": 0.5624067187309265,
"learning_rate": 6.685089389007612e-05,
"loss": 0.0437,
"step": 1600
},
{
"epoch": 4.2368421052631575,
"grad_norm": 0.32445138692855835,
"learning_rate": 6.644059283938938e-05,
"loss": 0.0439,
"step": 1610
},
{
"epoch": 4.2631578947368425,
"grad_norm": 0.385633647441864,
"learning_rate": 6.602904670062476e-05,
"loss": 0.0362,
"step": 1620
},
{
"epoch": 4.2894736842105265,
"grad_norm": 0.4192872643470764,
"learning_rate": 6.561628664122226e-05,
"loss": 0.0439,
"step": 1630
},
{
"epoch": 4.315789473684211,
"grad_norm": 0.46780234575271606,
"learning_rate": 6.520234392055522e-05,
"loss": 0.0447,
"step": 1640
},
{
"epoch": 4.342105263157895,
"grad_norm": 0.30417683720588684,
"learning_rate": 6.478724988756285e-05,
"loss": 0.0413,
"step": 1650
},
{
"epoch": 4.368421052631579,
"grad_norm": 0.3585522472858429,
"learning_rate": 6.437103597837631e-05,
"loss": 0.0451,
"step": 1660
},
{
"epoch": 4.394736842105263,
"grad_norm": 0.2543241083621979,
"learning_rate": 6.39537337139377e-05,
"loss": 0.0454,
"step": 1670
},
{
"epoch": 4.421052631578947,
"grad_norm": 0.41101086139678955,
"learning_rate": 6.353537469761315e-05,
"loss": 0.0468,
"step": 1680
},
{
"epoch": 4.447368421052632,
"grad_norm": 0.2859801650047302,
"learning_rate": 6.311599061279932e-05,
"loss": 0.0462,
"step": 1690
},
{
"epoch": 4.473684210526316,
"grad_norm": 0.2935991585254669,
"learning_rate": 6.269561322052378e-05,
"loss": 0.0457,
"step": 1700
},
{
"epoch": 4.5,
"grad_norm": 0.311879426240921,
"learning_rate": 6.227427435703997e-05,
"loss": 0.0419,
"step": 1710
},
{
"epoch": 4.526315789473684,
"grad_norm": 0.4428638517856598,
"learning_rate": 6.185200593141593e-05,
"loss": 0.0412,
"step": 1720
},
{
"epoch": 4.552631578947368,
"grad_norm": 0.44332432746887207,
"learning_rate": 6.142883992311781e-05,
"loss": 0.0391,
"step": 1730
},
{
"epoch": 4.578947368421053,
"grad_norm": 0.31303486227989197,
"learning_rate": 6.100480837958802e-05,
"loss": 0.0478,
"step": 1740
},
{
"epoch": 4.605263157894737,
"grad_norm": 0.4248807430267334,
"learning_rate": 6.057994341381813e-05,
"loss": 0.0419,
"step": 1750
},
{
"epoch": 4.631578947368421,
"grad_norm": 0.38378843665122986,
"learning_rate": 6.015427720191693e-05,
"loss": 0.0388,
"step": 1760
},
{
"epoch": 4.657894736842105,
"grad_norm": 0.232202410697937,
"learning_rate": 5.9727841980673604e-05,
"loss": 0.0491,
"step": 1770
},
{
"epoch": 4.684210526315789,
"grad_norm": 0.3846134841442108,
"learning_rate": 5.93006700451164e-05,
"loss": 0.0442,
"step": 1780
},
{
"epoch": 4.7105263157894735,
"grad_norm": 0.24118691682815552,
"learning_rate": 5.887279374606679e-05,
"loss": 0.0379,
"step": 1790
},
{
"epoch": 4.7368421052631575,
"grad_norm": 0.3538924753665924,
"learning_rate": 5.844424548768952e-05,
"loss": 0.0371,
"step": 1800
},
{
"epoch": 4.7631578947368425,
"grad_norm": 0.2391858547925949,
"learning_rate": 5.8015057725038534e-05,
"loss": 0.0393,
"step": 1810
},
{
"epoch": 4.7894736842105265,
"grad_norm": 0.43974366784095764,
"learning_rate": 5.7585262961599054e-05,
"loss": 0.0408,
"step": 1820
},
{
"epoch": 4.815789473684211,
"grad_norm": 0.3488527834415436,
"learning_rate": 5.7154893746826014e-05,
"loss": 0.0419,
"step": 1830
},
{
"epoch": 4.842105263157895,
"grad_norm": 0.5002457499504089,
"learning_rate": 5.672398267367902e-05,
"loss": 0.0403,
"step": 1840
},
{
"epoch": 4.868421052631579,
"grad_norm": 0.31320708990097046,
"learning_rate": 5.6292562376154037e-05,
"loss": 0.0423,
"step": 1850
},
{
"epoch": 4.894736842105263,
"grad_norm": 0.26685863733291626,
"learning_rate": 5.586066552681179e-05,
"loss": 0.0377,
"step": 1860
},
{
"epoch": 4.921052631578947,
"grad_norm": 0.3947713077068329,
"learning_rate": 5.542832483430363e-05,
"loss": 0.0401,
"step": 1870
},
{
"epoch": 4.947368421052632,
"grad_norm": 0.20824529230594635,
"learning_rate": 5.499557304089419e-05,
"loss": 0.0382,
"step": 1880
},
{
"epoch": 4.973684210526316,
"grad_norm": 0.25169357657432556,
"learning_rate": 5.4562442919981816e-05,
"loss": 0.0339,
"step": 1890
},
{
"epoch": 5.0,
"grad_norm": 0.4444282054901123,
"learning_rate": 5.4128967273616625e-05,
"loss": 0.0443,
"step": 1900
},
{
"epoch": 5.026315789473684,
"grad_norm": 0.4260099530220032,
"learning_rate": 5.3695178930016196e-05,
"loss": 0.0364,
"step": 1910
},
{
"epoch": 5.052631578947368,
"grad_norm": 0.33936482667922974,
"learning_rate": 5.3261110741079525e-05,
"loss": 0.0367,
"step": 1920
},
{
"epoch": 5.078947368421052,
"grad_norm": 0.5758324861526489,
"learning_rate": 5.2826795579898956e-05,
"loss": 0.0383,
"step": 1930
},
{
"epoch": 5.105263157894737,
"grad_norm": 0.4980020821094513,
"learning_rate": 5.2392266338270736e-05,
"loss": 0.0422,
"step": 1940
},
{
"epoch": 5.131578947368421,
"grad_norm": 0.4701398015022278,
"learning_rate": 5.195755592420387e-05,
"loss": 0.0372,
"step": 1950
},
{
"epoch": 5.157894736842105,
"grad_norm": 0.3783451020717621,
"learning_rate": 5.1522697259428146e-05,
"loss": 0.0371,
"step": 1960
},
{
"epoch": 5.184210526315789,
"grad_norm": 0.41548261046409607,
"learning_rate": 5.1087723276900646e-05,
"loss": 0.0415,
"step": 1970
},
{
"epoch": 5.2105263157894735,
"grad_norm": 0.3551521897315979,
"learning_rate": 5.065266691831181e-05,
"loss": 0.0344,
"step": 1980
},
{
"epoch": 5.2368421052631575,
"grad_norm": 0.5270403623580933,
"learning_rate": 5.021756113159062e-05,
"loss": 0.0355,
"step": 1990
},
{
"epoch": 5.2631578947368425,
"grad_norm": 0.29675352573394775,
"learning_rate": 4.978243886840939e-05,
"loss": 0.0424,
"step": 2000
},
{
"epoch": 5.2894736842105265,
"grad_norm": 0.3667472302913666,
"learning_rate": 4.934733308168821e-05,
"loss": 0.0449,
"step": 2010
},
{
"epoch": 5.315789473684211,
"grad_norm": 0.34768638014793396,
"learning_rate": 4.891227672309935e-05,
"loss": 0.041,
"step": 2020
},
{
"epoch": 5.342105263157895,
"grad_norm": 0.38154706358909607,
"learning_rate": 4.8477302740571866e-05,
"loss": 0.0382,
"step": 2030
},
{
"epoch": 5.368421052631579,
"grad_norm": 0.16855105757713318,
"learning_rate": 4.804244407579613e-05,
"loss": 0.0376,
"step": 2040
},
{
"epoch": 5.394736842105263,
"grad_norm": 0.3421989679336548,
"learning_rate": 4.760773366172929e-05,
"loss": 0.0381,
"step": 2050
},
{
"epoch": 5.421052631578947,
"grad_norm": 0.3012019991874695,
"learning_rate": 4.717320442010105e-05,
"loss": 0.0353,
"step": 2060
},
{
"epoch": 5.447368421052632,
"grad_norm": 0.2907329797744751,
"learning_rate": 4.673888925892048e-05,
"loss": 0.0372,
"step": 2070
},
{
"epoch": 5.473684210526316,
"grad_norm": 0.3655814826488495,
"learning_rate": 4.630482106998381e-05,
"loss": 0.0354,
"step": 2080
},
{
"epoch": 5.5,
"grad_norm": 0.3697777986526489,
"learning_rate": 4.5871032726383386e-05,
"loss": 0.0426,
"step": 2090
},
{
"epoch": 5.526315789473684,
"grad_norm": 0.22509945929050446,
"learning_rate": 4.5437557080018175e-05,
"loss": 0.0358,
"step": 2100
},
{
"epoch": 5.552631578947368,
"grad_norm": 0.30516788363456726,
"learning_rate": 4.500442695910582e-05,
"loss": 0.0411,
"step": 2110
},
{
"epoch": 5.578947368421053,
"grad_norm": 0.3428977429866791,
"learning_rate": 4.457167516569637e-05,
"loss": 0.032,
"step": 2120
},
{
"epoch": 5.605263157894737,
"grad_norm": 0.2875311076641083,
"learning_rate": 4.413933447318821e-05,
"loss": 0.0339,
"step": 2130
},
{
"epoch": 5.631578947368421,
"grad_norm": 0.3058890700340271,
"learning_rate": 4.3707437623845995e-05,
"loss": 0.035,
"step": 2140
},
{
"epoch": 5.657894736842105,
"grad_norm": 0.317531019449234,
"learning_rate": 4.3276017326320985e-05,
"loss": 0.0324,
"step": 2150
},
{
"epoch": 5.684210526315789,
"grad_norm": 0.3118530809879303,
"learning_rate": 4.2845106253174e-05,
"loss": 0.0345,
"step": 2160
},
{
"epoch": 5.7105263157894735,
"grad_norm": 0.2496698647737503,
"learning_rate": 4.2414737038400964e-05,
"loss": 0.0373,
"step": 2170
},
{
"epoch": 5.7368421052631575,
"grad_norm": 0.26971638202667236,
"learning_rate": 4.198494227496148e-05,
"loss": 0.0368,
"step": 2180
},
{
"epoch": 5.7631578947368425,
"grad_norm": 0.43503808975219727,
"learning_rate": 4.155575451231048e-05,
"loss": 0.0348,
"step": 2190
},
{
"epoch": 5.7894736842105265,
"grad_norm": 0.26233163475990295,
"learning_rate": 4.112720625393322e-05,
"loss": 0.0354,
"step": 2200
},
{
"epoch": 5.815789473684211,
"grad_norm": 0.40549346804618835,
"learning_rate": 4.069932995488361e-05,
"loss": 0.0385,
"step": 2210
},
{
"epoch": 5.842105263157895,
"grad_norm": 0.3487369120121002,
"learning_rate": 4.0272158019326414e-05,
"loss": 0.0365,
"step": 2220
},
{
"epoch": 5.868421052631579,
"grad_norm": 0.35258445143699646,
"learning_rate": 3.9845722798083066e-05,
"loss": 0.035,
"step": 2230
},
{
"epoch": 5.894736842105263,
"grad_norm": 0.3888448476791382,
"learning_rate": 3.942005658618188e-05,
"loss": 0.0357,
"step": 2240
},
{
"epoch": 5.921052631578947,
"grad_norm": 0.41619524359703064,
"learning_rate": 3.8995191620412e-05,
"loss": 0.0352,
"step": 2250
},
{
"epoch": 5.947368421052632,
"grad_norm": 0.5103023052215576,
"learning_rate": 3.8571160076882204e-05,
"loss": 0.0331,
"step": 2260
},
{
"epoch": 5.973684210526316,
"grad_norm": 0.458965927362442,
"learning_rate": 3.8147994068584087e-05,
"loss": 0.0301,
"step": 2270
},
{
"epoch": 6.0,
"grad_norm": 0.32402315735816956,
"learning_rate": 3.772572564296005e-05,
"loss": 0.0351,
"step": 2280
},
{
"epoch": 6.026315789473684,
"grad_norm": 0.2968176603317261,
"learning_rate": 3.730438677947624e-05,
"loss": 0.0413,
"step": 2290
},
{
"epoch": 6.052631578947368,
"grad_norm": 0.3646889626979828,
"learning_rate": 3.6884009387200714e-05,
"loss": 0.0357,
"step": 2300
},
{
"epoch": 6.078947368421052,
"grad_norm": 0.26003819704055786,
"learning_rate": 3.646462530238684e-05,
"loss": 0.0334,
"step": 2310
},
{
"epoch": 6.105263157894737,
"grad_norm": 0.28209343552589417,
"learning_rate": 3.60462662860623e-05,
"loss": 0.0317,
"step": 2320
},
{
"epoch": 6.131578947368421,
"grad_norm": 0.4188937544822693,
"learning_rate": 3.56289640216237e-05,
"loss": 0.0354,
"step": 2330
},
{
"epoch": 6.157894736842105,
"grad_norm": 0.44584834575653076,
"learning_rate": 3.521275011243715e-05,
"loss": 0.0395,
"step": 2340
},
{
"epoch": 6.184210526315789,
"grad_norm": 0.44341957569122314,
"learning_rate": 3.4797656079444806e-05,
"loss": 0.033,
"step": 2350
},
{
"epoch": 6.2105263157894735,
"grad_norm": 0.3678707182407379,
"learning_rate": 3.4383713358777735e-05,
"loss": 0.0344,
"step": 2360
},
{
"epoch": 6.2368421052631575,
"grad_norm": 0.30562394857406616,
"learning_rate": 3.397095329937526e-05,
"loss": 0.0339,
"step": 2370
},
{
"epoch": 6.2631578947368425,
"grad_norm": 0.4624996781349182,
"learning_rate": 3.355940716061064e-05,
"loss": 0.0341,
"step": 2380
},
{
"epoch": 6.2894736842105265,
"grad_norm": 0.20757588744163513,
"learning_rate": 3.31491061099239e-05,
"loss": 0.0296,
"step": 2390
},
{
"epoch": 6.315789473684211,
"grad_norm": 0.31677284836769104,
"learning_rate": 3.274008122046132e-05,
"loss": 0.0338,
"step": 2400
},
{
"epoch": 6.342105263157895,
"grad_norm": 0.24551235139369965,
"learning_rate": 3.233236346872227e-05,
"loss": 0.0381,
"step": 2410
},
{
"epoch": 6.368421052631579,
"grad_norm": 0.250813364982605,
"learning_rate": 3.192598373221322e-05,
"loss": 0.0333,
"step": 2420
},
{
"epoch": 6.394736842105263,
"grad_norm": 0.5114396810531616,
"learning_rate": 3.152097278710933e-05,
"loss": 0.0396,
"step": 2430
},
{
"epoch": 6.421052631578947,
"grad_norm": 0.4336398243904114,
"learning_rate": 3.1117361305923684e-05,
"loss": 0.0361,
"step": 2440
},
{
"epoch": 6.447368421052632,
"grad_norm": 0.3308594226837158,
"learning_rate": 3.071517985518442e-05,
"loss": 0.0314,
"step": 2450
},
{
"epoch": 6.473684210526316,
"grad_norm": 0.4118829667568207,
"learning_rate": 3.0314458893119808e-05,
"loss": 0.0337,
"step": 2460
},
{
"epoch": 6.5,
"grad_norm": 0.19590795040130615,
"learning_rate": 2.991522876735154e-05,
"loss": 0.0359,
"step": 2470
},
{
"epoch": 6.526315789473684,
"grad_norm": 0.39492878317832947,
"learning_rate": 2.9517519712596498e-05,
"loss": 0.0304,
"step": 2480
},
{
"epoch": 6.552631578947368,
"grad_norm": 0.2706056833267212,
"learning_rate": 2.9121361848377014e-05,
"loss": 0.0301,
"step": 2490
},
{
"epoch": 6.578947368421053,
"grad_norm": 0.2509630024433136,
"learning_rate": 2.872678517673975e-05,
"loss": 0.0313,
"step": 2500
},
{
"epoch": 6.605263157894737,
"grad_norm": 0.37828516960144043,
"learning_rate": 2.8333819579983623e-05,
"loss": 0.0323,
"step": 2510
},
{
"epoch": 6.631578947368421,
"grad_norm": 0.3628530204296112,
"learning_rate": 2.794249481839669e-05,
"loss": 0.0292,
"step": 2520
},
{
"epoch": 6.657894736842105,
"grad_norm": 0.35174816846847534,
"learning_rate": 2.7552840528002498e-05,
"loss": 0.0326,
"step": 2530
},
{
"epoch": 6.684210526315789,
"grad_norm": 0.3942660391330719,
"learning_rate": 2.7164886218315444e-05,
"loss": 0.0314,
"step": 2540
},
{
"epoch": 6.7105263157894735,
"grad_norm": 0.2891586124897003,
"learning_rate": 2.6778661270106025e-05,
"loss": 0.0346,
"step": 2550
},
{
"epoch": 6.7368421052631575,
"grad_norm": 0.3602871000766754,
"learning_rate": 2.6394194933175875e-05,
"loss": 0.0338,
"step": 2560
},
{
"epoch": 6.7631578947368425,
"grad_norm": 0.3285452127456665,
"learning_rate": 2.601151632414241e-05,
"loss": 0.0301,
"step": 2570
},
{
"epoch": 6.7894736842105265,
"grad_norm": 0.3526369035243988,
"learning_rate": 2.5630654424233903e-05,
"loss": 0.0331,
"step": 2580
},
{
"epoch": 6.815789473684211,
"grad_norm": 0.2297569215297699,
"learning_rate": 2.5251638077094602e-05,
"loss": 0.0311,
"step": 2590
},
{
"epoch": 6.842105263157895,
"grad_norm": 0.24023893475532532,
"learning_rate": 2.4874495986600294e-05,
"loss": 0.0299,
"step": 2600
},
{
"epoch": 6.868421052631579,
"grad_norm": 0.3183501660823822,
"learning_rate": 2.4499256714684565e-05,
"loss": 0.0329,
"step": 2610
},
{
"epoch": 6.894736842105263,
"grad_norm": 0.335347056388855,
"learning_rate": 2.4125948679175686e-05,
"loss": 0.0302,
"step": 2620
},
{
"epoch": 6.921052631578947,
"grad_norm": 0.31279057264328003,
"learning_rate": 2.3754600151644445e-05,
"loss": 0.0317,
"step": 2630
},
{
"epoch": 6.947368421052632,
"grad_norm": 0.25722017884254456,
"learning_rate": 2.3385239255263077e-05,
"loss": 0.034,
"step": 2640
},
{
"epoch": 6.973684210526316,
"grad_norm": 0.4903739392757416,
"learning_rate": 2.3017893962675458e-05,
"loss": 0.0272,
"step": 2650
},
{
"epoch": 7.0,
"grad_norm": 0.31394830346107483,
"learning_rate": 2.2652592093878666e-05,
"loss": 0.0343,
"step": 2660
},
{
"epoch": 7.026315789473684,
"grad_norm": 0.24519610404968262,
"learning_rate": 2.228936131411601e-05,
"loss": 0.0283,
"step": 2670
},
{
"epoch": 7.052631578947368,
"grad_norm": 0.3303487002849579,
"learning_rate": 2.1928229131782007e-05,
"loss": 0.0311,
"step": 2680
},
{
"epoch": 7.078947368421052,
"grad_norm": 0.18910463154315948,
"learning_rate": 2.1569222896338966e-05,
"loss": 0.0272,
"step": 2690
},
{
"epoch": 7.105263157894737,
"grad_norm": 0.2918621003627777,
"learning_rate": 2.1212369796245864e-05,
"loss": 0.0298,
"step": 2700
},
{
"epoch": 7.131578947368421,
"grad_norm": 0.42113614082336426,
"learning_rate": 2.0857696856899232e-05,
"loss": 0.0295,
"step": 2710
},
{
"epoch": 7.157894736842105,
"grad_norm": 0.3034079670906067,
"learning_rate": 2.0505230938586418e-05,
"loss": 0.0235,
"step": 2720
},
{
"epoch": 7.184210526315789,
"grad_norm": 0.4173114001750946,
"learning_rate": 2.0154998734451474e-05,
"loss": 0.0302,
"step": 2730
},
{
"epoch": 7.2105263157894735,
"grad_norm": 0.3530351519584656,
"learning_rate": 1.980702676847358e-05,
"loss": 0.0322,
"step": 2740
},
{
"epoch": 7.2368421052631575,
"grad_norm": 0.29570046067237854,
"learning_rate": 1.9461341393458254e-05,
"loss": 0.0272,
"step": 2750
},
{
"epoch": 7.2631578947368425,
"grad_norm": 0.2424498051404953,
"learning_rate": 1.9117968789041712e-05,
"loss": 0.028,
"step": 2760
},
{
"epoch": 7.2894736842105265,
"grad_norm": 0.3934868574142456,
"learning_rate": 1.877693495970809e-05,
"loss": 0.0272,
"step": 2770
},
{
"epoch": 7.315789473684211,
"grad_norm": 0.21242979168891907,
"learning_rate": 1.8438265732820126e-05,
"loss": 0.0327,
"step": 2780
},
{
"epoch": 7.342105263157895,
"grad_norm": 0.37688693404197693,
"learning_rate": 1.8101986756663197e-05,
"loss": 0.0351,
"step": 2790
},
{
"epoch": 7.368421052631579,
"grad_norm": 0.22811609506607056,
"learning_rate": 1.776812349850289e-05,
"loss": 0.0276,
"step": 2800
},
{
"epoch": 7.394736842105263,
"grad_norm": 0.22656874358654022,
"learning_rate": 1.7436701242656272e-05,
"loss": 0.0287,
"step": 2810
},
{
"epoch": 7.421052631578947,
"grad_norm": 0.35913366079330444,
"learning_rate": 1.7107745088577087e-05,
"loss": 0.0309,
"step": 2820
},
{
"epoch": 7.447368421052632,
"grad_norm": 0.23046566545963287,
"learning_rate": 1.678127994895492e-05,
"loss": 0.0242,
"step": 2830
},
{
"epoch": 7.473684210526316,
"grad_norm": 0.5616809725761414,
"learning_rate": 1.6457330547828402e-05,
"loss": 0.0292,
"step": 2840
},
{
"epoch": 7.5,
"grad_norm": 0.25659796595573425,
"learning_rate": 1.6135921418712956e-05,
"loss": 0.0265,
"step": 2850
},
{
"epoch": 7.526315789473684,
"grad_norm": 0.32121288776397705,
"learning_rate": 1.5817076902742622e-05,
"loss": 0.0283,
"step": 2860
},
{
"epoch": 7.552631578947368,
"grad_norm": 0.4093388319015503,
"learning_rate": 1.5500821146826805e-05,
"loss": 0.0338,
"step": 2870
},
{
"epoch": 7.578947368421053,
"grad_norm": 0.4066798686981201,
"learning_rate": 1.5187178101821503e-05,
"loss": 0.0354,
"step": 2880
},
{
"epoch": 7.605263157894737,
"grad_norm": 0.2548167109489441,
"learning_rate": 1.4876171520715399e-05,
"loss": 0.0289,
"step": 2890
},
{
"epoch": 7.631578947368421,
"grad_norm": 0.375201940536499,
"learning_rate": 1.4567824956831043e-05,
"loss": 0.0308,
"step": 2900
},
{
"epoch": 7.657894736842105,
"grad_norm": 0.2906716763973236,
"learning_rate": 1.4262161762041121e-05,
"loss": 0.03,
"step": 2910
},
{
"epoch": 7.684210526315789,
"grad_norm": 0.25092822313308716,
"learning_rate": 1.3959205084999911e-05,
"loss": 0.0262,
"step": 2920
},
{
"epoch": 7.7105263157894735,
"grad_norm": 0.27973148226737976,
"learning_rate": 1.3658977869390166e-05,
"loss": 0.0259,
"step": 2930
},
{
"epoch": 7.7368421052631575,
"grad_norm": 0.24641232192516327,
"learning_rate": 1.336150285218558e-05,
"loss": 0.0288,
"step": 2940
},
{
"epoch": 7.7631578947368425,
"grad_norm": 0.17812731862068176,
"learning_rate": 1.3066802561928854e-05,
"loss": 0.0302,
"step": 2950
},
{
"epoch": 7.7894736842105265,
"grad_norm": 0.2735963761806488,
"learning_rate": 1.2774899317025468e-05,
"loss": 0.0281,
"step": 2960
},
{
"epoch": 7.815789473684211,
"grad_norm": 0.2304324209690094,
"learning_rate": 1.2485815224053582e-05,
"loss": 0.0286,
"step": 2970
},
{
"epoch": 7.842105263157895,
"grad_norm": 0.28357961773872375,
"learning_rate": 1.2199572176089741e-05,
"loss": 0.0228,
"step": 2980
},
{
"epoch": 7.868421052631579,
"grad_norm": 0.20956674218177795,
"learning_rate": 1.1916191851050873e-05,
"loss": 0.0276,
"step": 2990
},
{
"epoch": 7.894736842105263,
"grad_norm": 0.25499334931373596,
"learning_rate": 1.163569571005269e-05,
"loss": 0.0263,
"step": 3000
},
{
"epoch": 7.921052631578947,
"grad_norm": 0.406367689371109,
"learning_rate": 1.1358104995784186e-05,
"loss": 0.03,
"step": 3010
},
{
"epoch": 7.947368421052632,
"grad_norm": 0.49673014879226685,
"learning_rate": 1.1083440730898974e-05,
"loss": 0.029,
"step": 3020
},
{
"epoch": 7.973684210526316,
"grad_norm": 0.21329355239868164,
"learning_rate": 1.0811723716423233e-05,
"loss": 0.0254,
"step": 3030
},
{
"epoch": 8.0,
"grad_norm": 0.38303622603416443,
"learning_rate": 1.0542974530180327e-05,
"loss": 0.031,
"step": 3040
},
{
"epoch": 8.026315789473685,
"grad_norm": 0.30326029658317566,
"learning_rate": 1.027721352523237e-05,
"loss": 0.031,
"step": 3050
},
{
"epoch": 8.052631578947368,
"grad_norm": 0.18923179805278778,
"learning_rate": 1.0014460828338928e-05,
"loss": 0.0239,
"step": 3060
},
{
"epoch": 8.078947368421053,
"grad_norm": 0.2916359603404999,
"learning_rate": 9.75473633843268e-06,
"loss": 0.0288,
"step": 3070
},
{
"epoch": 8.105263157894736,
"grad_norm": 0.39194855093955994,
"learning_rate": 9.498059725112467e-06,
"loss": 0.0255,
"step": 3080
},
{
"epoch": 8.131578947368421,
"grad_norm": 0.23311467468738556,
"learning_rate": 9.244450427153683e-06,
"loss": 0.0271,
"step": 3090
},
{
"epoch": 8.157894736842104,
"grad_norm": 0.25806355476379395,
"learning_rate": 8.99392765103605e-06,
"loss": 0.0253,
"step": 3100
},
{
"epoch": 8.18421052631579,
"grad_norm": 0.19991613924503326,
"learning_rate": 8.746510369489103e-06,
"loss": 0.0306,
"step": 3110
},
{
"epoch": 8.210526315789474,
"grad_norm": 0.26584678888320923,
"learning_rate": 8.502217320055427e-06,
"loss": 0.0295,
"step": 3120
},
{
"epoch": 8.236842105263158,
"grad_norm": 0.28366735577583313,
"learning_rate": 8.261067003671447e-06,
"loss": 0.0267,
"step": 3130
},
{
"epoch": 8.263157894736842,
"grad_norm": 0.2124335616827011,
"learning_rate": 8.0230776832664e-06,
"loss": 0.027,
"step": 3140
},
{
"epoch": 8.289473684210526,
"grad_norm": 0.6940687894821167,
"learning_rate": 7.78826738237926e-06,
"loss": 0.0239,
"step": 3150
},
{
"epoch": 8.31578947368421,
"grad_norm": 0.3029802739620209,
"learning_rate": 7.556653883793724e-06,
"loss": 0.0293,
"step": 3160
},
{
"epoch": 8.342105263157896,
"grad_norm": 0.33494675159454346,
"learning_rate": 7.328254728191464e-06,
"loss": 0.0322,
"step": 3170
},
{
"epoch": 8.368421052631579,
"grad_norm": 0.5959463715553284,
"learning_rate": 7.103087212823778e-06,
"loss": 0.0282,
"step": 3180
},
{
"epoch": 8.394736842105264,
"grad_norm": 0.3299950957298279,
"learning_rate": 6.881168390201581e-06,
"loss": 0.0289,
"step": 3190
},
{
"epoch": 8.421052631578947,
"grad_norm": 0.28407934308052063,
"learning_rate": 6.66251506680397e-06,
"loss": 0.0277,
"step": 3200
},
{
"epoch": 8.447368421052632,
"grad_norm": 0.24567903578281403,
"learning_rate": 6.447143801805516e-06,
"loss": 0.0293,
"step": 3210
},
{
"epoch": 8.473684210526315,
"grad_norm": 0.20823001861572266,
"learning_rate": 6.23507090582206e-06,
"loss": 0.0257,
"step": 3220
},
{
"epoch": 8.5,
"grad_norm": 0.29089733958244324,
"learning_rate": 6.026312439675552e-06,
"loss": 0.0268,
"step": 3230
},
{
"epoch": 8.526315789473685,
"grad_norm": 0.3374447524547577,
"learning_rate": 5.820884213177713e-06,
"loss": 0.0276,
"step": 3240
},
{
"epoch": 8.552631578947368,
"grad_norm": 0.32109740376472473,
"learning_rate": 5.618801783932725e-06,
"loss": 0.0302,
"step": 3250
},
{
"epoch": 8.578947368421053,
"grad_norm": 0.25355905294418335,
"learning_rate": 5.420080456158971e-06,
"loss": 0.0263,
"step": 3260
},
{
"epoch": 8.605263157894736,
"grad_norm": 0.6211085319519043,
"learning_rate": 5.224735279530063e-06,
"loss": 0.0284,
"step": 3270
},
{
"epoch": 8.631578947368421,
"grad_norm": 0.23430858552455902,
"learning_rate": 5.032781048035034e-06,
"loss": 0.0264,
"step": 3280
},
{
"epoch": 8.657894736842106,
"grad_norm": 0.24149766564369202,
"learning_rate": 4.84423229885802e-06,
"loss": 0.027,
"step": 3290
},
{
"epoch": 8.68421052631579,
"grad_norm": 0.3725610077381134,
"learning_rate": 4.659103311277274e-06,
"loss": 0.0253,
"step": 3300
},
{
"epoch": 8.710526315789474,
"grad_norm": 0.2504403293132782,
"learning_rate": 4.477408105583741e-06,
"loss": 0.0276,
"step": 3310
},
{
"epoch": 8.736842105263158,
"grad_norm": 0.21490493416786194,
"learning_rate": 4.29916044201934e-06,
"loss": 0.0272,
"step": 3320
},
{
"epoch": 8.763157894736842,
"grad_norm": 0.30955877900123596,
"learning_rate": 4.124373819734795e-06,
"loss": 0.0308,
"step": 3330
},
{
"epoch": 8.789473684210526,
"grad_norm": 0.210093155503273,
"learning_rate": 3.953061475767339e-06,
"loss": 0.0264,
"step": 3340
},
{
"epoch": 8.81578947368421,
"grad_norm": 0.29672837257385254,
"learning_rate": 3.785236384038232e-06,
"loss": 0.0295,
"step": 3350
},
{
"epoch": 8.842105263157894,
"grad_norm": 0.5261228680610657,
"learning_rate": 3.620911254370224e-06,
"loss": 0.0249,
"step": 3360
},
{
"epoch": 8.868421052631579,
"grad_norm": 0.22544153034687042,
"learning_rate": 3.460098531525019e-06,
"loss": 0.028,
"step": 3370
},
{
"epoch": 8.894736842105264,
"grad_norm": 0.24784667789936066,
"learning_rate": 3.302810394260736e-06,
"loss": 0.0252,
"step": 3380
},
{
"epoch": 8.921052631578947,
"grad_norm": 0.21960538625717163,
"learning_rate": 3.1490587544096782e-06,
"loss": 0.0255,
"step": 3390
},
{
"epoch": 8.947368421052632,
"grad_norm": 0.4938012361526489,
"learning_rate": 2.9988552559761294e-06,
"loss": 0.0261,
"step": 3400
},
{
"epoch": 8.973684210526315,
"grad_norm": 0.2430790662765503,
"learning_rate": 2.85221127425459e-06,
"loss": 0.0283,
"step": 3410
},
{
"epoch": 9.0,
"grad_norm": 0.3423117995262146,
"learning_rate": 2.7091379149682685e-06,
"loss": 0.0262,
"step": 3420
},
{
"epoch": 9.026315789473685,
"grad_norm": 0.1678830087184906,
"learning_rate": 2.5696460134279955e-06,
"loss": 0.0226,
"step": 3430
},
{
"epoch": 9.052631578947368,
"grad_norm": 0.1864800751209259,
"learning_rate": 2.4337461337116894e-06,
"loss": 0.028,
"step": 3440
},
{
"epoch": 9.078947368421053,
"grad_norm": 0.3498280942440033,
"learning_rate": 2.3014485678642563e-06,
"loss": 0.0251,
"step": 3450
},
{
"epoch": 9.105263157894736,
"grad_norm": 0.2959420680999756,
"learning_rate": 2.1727633351182e-06,
"loss": 0.0267,
"step": 3460
},
{
"epoch": 9.131578947368421,
"grad_norm": 0.46708524227142334,
"learning_rate": 2.0477001811347985e-06,
"loss": 0.0271,
"step": 3470
},
{
"epoch": 9.157894736842104,
"grad_norm": 0.21950027346611023,
"learning_rate": 1.9262685772660606e-06,
"loss": 0.0241,
"step": 3480
},
{
"epoch": 9.18421052631579,
"grad_norm": 0.19924266636371613,
"learning_rate": 1.8084777198374315e-06,
"loss": 0.0276,
"step": 3490
},
{
"epoch": 9.210526315789474,
"grad_norm": 0.29325613379478455,
"learning_rate": 1.6943365294513236e-06,
"loss": 0.0278,
"step": 3500
},
{
"epoch": 9.236842105263158,
"grad_norm": 0.19263319671154022,
"learning_rate": 1.5838536503115675e-06,
"loss": 0.0275,
"step": 3510
},
{
"epoch": 9.263157894736842,
"grad_norm": 0.3293076753616333,
"learning_rate": 1.4770374495687134e-06,
"loss": 0.0232,
"step": 3520
},
{
"epoch": 9.289473684210526,
"grad_norm": 0.23276633024215698,
"learning_rate": 1.3738960166864101e-06,
"loss": 0.0245,
"step": 3530
},
{
"epoch": 9.31578947368421,
"grad_norm": 0.3246747553348541,
"learning_rate": 1.274437162828751e-06,
"loss": 0.0258,
"step": 3540
},
{
"epoch": 9.342105263157896,
"grad_norm": 0.3524140417575836,
"learning_rate": 1.1786684202687026e-06,
"loss": 0.023,
"step": 3550
},
{
"epoch": 9.368421052631579,
"grad_norm": 0.27717113494873047,
"learning_rate": 1.0865970418177051e-06,
"loss": 0.0292,
"step": 3560
},
{
"epoch": 9.394736842105264,
"grad_norm": 0.29303857684135437,
"learning_rate": 9.98230000276351e-07,
"loss": 0.0239,
"step": 3570
},
{
"epoch": 9.421052631578947,
"grad_norm": 0.5621191263198853,
"learning_rate": 9.135739879063465e-07,
"loss": 0.0242,
"step": 3580
},
{
"epoch": 9.447368421052632,
"grad_norm": 0.41081079840660095,
"learning_rate": 8.326354159236882e-07,
"loss": 0.0232,
"step": 3590
},
{
"epoch": 9.473684210526315,
"grad_norm": 0.48332345485687256,
"learning_rate": 7.554204140131138e-07,
"loss": 0.026,
"step": 3600
},
{
"epoch": 9.5,
"grad_norm": 0.36336687207221985,
"learning_rate": 6.819348298638839e-07,
"loss": 0.0226,
"step": 3610
},
{
"epoch": 9.526315789473685,
"grad_norm": 0.4939121901988983,
"learning_rate": 6.121842287269419e-07,
"loss": 0.0274,
"step": 3620
},
{
"epoch": 9.552631578947368,
"grad_norm": 0.60195392370224,
"learning_rate": 5.46173892993429e-07,
"loss": 0.0229,
"step": 3630
},
{
"epoch": 9.578947368421053,
"grad_norm": 0.3372774124145508,
"learning_rate": 4.839088217946208e-07,
"loss": 0.0236,
"step": 3640
},
{
"epoch": 9.605263157894736,
"grad_norm": 0.2751666307449341,
"learning_rate": 4.253937306233691e-07,
"loss": 0.0267,
"step": 3650
},
{
"epoch": 9.631578947368421,
"grad_norm": 0.2811025381088257,
"learning_rate": 3.706330509769429e-07,
"loss": 0.0269,
"step": 3660
},
{
"epoch": 9.657894736842106,
"grad_norm": 0.27488309144973755,
"learning_rate": 3.1963093002145285e-07,
"loss": 0.0294,
"step": 3670
},
{
"epoch": 9.68421052631579,
"grad_norm": 0.29794642329216003,
"learning_rate": 2.7239123027775204e-07,
"loss": 0.0226,
"step": 3680
},
{
"epoch": 9.710526315789474,
"grad_norm": 0.49477118253707886,
"learning_rate": 2.289175293289314e-07,
"loss": 0.0263,
"step": 3690
},
{
"epoch": 9.736842105263158,
"grad_norm": 0.44544729590415955,
"learning_rate": 1.8921311954937516e-07,
"loss": 0.024,
"step": 3700
},
{
"epoch": 9.763157894736842,
"grad_norm": 0.380344033241272,
"learning_rate": 1.5328100785542697e-07,
"loss": 0.0243,
"step": 3710
},
{
"epoch": 9.789473684210526,
"grad_norm": 0.2532555162906647,
"learning_rate": 1.211239154776611e-07,
"loss": 0.0256,
"step": 3720
},
{
"epoch": 9.81578947368421,
"grad_norm": 0.3260039985179901,
"learning_rate": 9.27442777547971e-08,
"loss": 0.0253,
"step": 3730
},
{
"epoch": 9.842105263157894,
"grad_norm": 0.2649020254611969,
"learning_rate": 6.814424394926966e-08,
"loss": 0.0298,
"step": 3740
},
{
"epoch": 9.868421052631579,
"grad_norm": 0.257893443107605,
"learning_rate": 4.732567708445878e-08,
"loss": 0.0223,
"step": 3750
},
{
"epoch": 9.894736842105264,
"grad_norm": 0.20920208096504211,
"learning_rate": 3.029015380359157e-08,
"loss": 0.0249,
"step": 3760
},
{
"epoch": 9.921052631578947,
"grad_norm": 0.19053097069263458,
"learning_rate": 1.7038964250343238e-08,
"loss": 0.029,
"step": 3770
},
{
"epoch": 9.947368421052632,
"grad_norm": 0.3256742060184479,
"learning_rate": 7.573111971148627e-09,
"loss": 0.0254,
"step": 3780
},
{
"epoch": 9.973684210526315,
"grad_norm": 0.3680499792098999,
"learning_rate": 1.8933138391574732e-09,
"loss": 0.025,
"step": 3790
},
{
"epoch": 10.0,
"grad_norm": 0.5628379583358765,
"learning_rate": 0.0,
"loss": 0.0263,
"step": 3800
},
{
"epoch": 10.0,
"step": 3800,
"total_flos": 4.0064924863782144e+17,
"train_loss": 0.05822706136264299,
"train_runtime": 3981.0494,
"train_samples_per_second": 46.716,
"train_steps_per_second": 0.955
}
],
"logging_steps": 10,
"max_steps": 3800,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.0064924863782144e+17,
"train_batch_size": 49,
"trial_name": null,
"trial_params": null
}