{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6020469596628537,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012040939193257074,
"grad_norm": 2.0694425106048584,
"learning_rate": 1.2033694344163658e-08,
"loss": 0.6897,
"step": 10
},
{
"epoch": 0.002408187838651415,
"grad_norm": 2.151496171951294,
"learning_rate": 2.4067388688327316e-08,
"loss": 0.6787,
"step": 20
},
{
"epoch": 0.003612281757977122,
"grad_norm": 2.640268564224243,
"learning_rate": 3.610108303249097e-08,
"loss": 0.6639,
"step": 30
},
{
"epoch": 0.00481637567730283,
"grad_norm": 2.6572210788726807,
"learning_rate": 4.813477737665463e-08,
"loss": 0.7152,
"step": 40
},
{
"epoch": 0.006020469596628537,
"grad_norm": 1.7933714389801025,
"learning_rate": 6.016847172081829e-08,
"loss": 0.6503,
"step": 50
},
{
"epoch": 0.007224563515954244,
"grad_norm": 2.3688879013061523,
"learning_rate": 7.220216606498194e-08,
"loss": 0.6827,
"step": 60
},
{
"epoch": 0.008428657435279952,
"grad_norm": 2.220139265060425,
"learning_rate": 8.42358604091456e-08,
"loss": 0.6443,
"step": 70
},
{
"epoch": 0.00963275135460566,
"grad_norm": 2.4725093841552734,
"learning_rate": 9.626955475330927e-08,
"loss": 0.6681,
"step": 80
},
{
"epoch": 0.010836845273931367,
"grad_norm": 1.4149224758148193,
"learning_rate": 1.0830324909747292e-07,
"loss": 0.5592,
"step": 90
},
{
"epoch": 0.012040939193257074,
"grad_norm": 0.9355699419975281,
"learning_rate": 1.2033694344163658e-07,
"loss": 0.5802,
"step": 100
},
{
"epoch": 0.013245033112582781,
"grad_norm": 1.0211461782455444,
"learning_rate": 1.3237063778580024e-07,
"loss": 0.5589,
"step": 110
},
{
"epoch": 0.014449127031908489,
"grad_norm": 1.0006492137908936,
"learning_rate": 1.4440433212996388e-07,
"loss": 0.5421,
"step": 120
},
{
"epoch": 0.015653220951234198,
"grad_norm": 0.8444674015045166,
"learning_rate": 1.5643802647412754e-07,
"loss": 0.5079,
"step": 130
},
{
"epoch": 0.016857314870559904,
"grad_norm": 0.7920398712158203,
"learning_rate": 1.684717208182912e-07,
"loss": 0.4898,
"step": 140
},
{
"epoch": 0.018061408789885613,
"grad_norm": 0.6817948818206787,
"learning_rate": 1.8050541516245487e-07,
"loss": 0.4645,
"step": 150
},
{
"epoch": 0.01926550270921132,
"grad_norm": 0.9353106021881104,
"learning_rate": 1.9253910950661853e-07,
"loss": 0.485,
"step": 160
},
{
"epoch": 0.020469596628537028,
"grad_norm": 0.6695616841316223,
"learning_rate": 2.045728038507822e-07,
"loss": 0.4647,
"step": 170
},
{
"epoch": 0.021673690547862733,
"grad_norm": 0.6993837952613831,
"learning_rate": 2.1660649819494583e-07,
"loss": 0.4378,
"step": 180
},
{
"epoch": 0.022877784467188442,
"grad_norm": 0.7333642244338989,
"learning_rate": 2.286401925391095e-07,
"loss": 0.4288,
"step": 190
},
{
"epoch": 0.024081878386514148,
"grad_norm": 0.707914412021637,
"learning_rate": 2.4067388688327316e-07,
"loss": 0.4601,
"step": 200
},
{
"epoch": 0.025285972305839857,
"grad_norm": 0.7626605033874512,
"learning_rate": 2.527075812274368e-07,
"loss": 0.4454,
"step": 210
},
{
"epoch": 0.026490066225165563,
"grad_norm": 1.2267224788665771,
"learning_rate": 2.647412755716005e-07,
"loss": 0.4398,
"step": 220
},
{
"epoch": 0.027694160144491272,
"grad_norm": 0.7376552224159241,
"learning_rate": 2.767749699157641e-07,
"loss": 0.4275,
"step": 230
},
{
"epoch": 0.028898254063816978,
"grad_norm": 0.7109339237213135,
"learning_rate": 2.8880866425992776e-07,
"loss": 0.3996,
"step": 240
},
{
"epoch": 0.030102347983142687,
"grad_norm": 0.6406791806221008,
"learning_rate": 3.008423586040915e-07,
"loss": 0.4337,
"step": 250
},
{
"epoch": 0.031306441902468396,
"grad_norm": 0.6780328154563904,
"learning_rate": 3.128760529482551e-07,
"loss": 0.4296,
"step": 260
},
{
"epoch": 0.0325105358217941,
"grad_norm": 0.5574681162834167,
"learning_rate": 3.2490974729241875e-07,
"loss": 0.4123,
"step": 270
},
{
"epoch": 0.03371462974111981,
"grad_norm": 0.6190093755722046,
"learning_rate": 3.369434416365824e-07,
"loss": 0.3959,
"step": 280
},
{
"epoch": 0.034918723660445516,
"grad_norm": 0.6488677859306335,
"learning_rate": 3.4897713598074607e-07,
"loss": 0.3883,
"step": 290
},
{
"epoch": 0.036122817579771226,
"grad_norm": 0.6014848351478577,
"learning_rate": 3.6101083032490974e-07,
"loss": 0.4222,
"step": 300
},
{
"epoch": 0.03732691149909693,
"grad_norm": 0.5347362160682678,
"learning_rate": 3.730445246690734e-07,
"loss": 0.3929,
"step": 310
},
{
"epoch": 0.03853100541842264,
"grad_norm": 1.4445090293884277,
"learning_rate": 3.8507821901323706e-07,
"loss": 0.3798,
"step": 320
},
{
"epoch": 0.039735099337748346,
"grad_norm": 0.6319730877876282,
"learning_rate": 3.9711191335740067e-07,
"loss": 0.386,
"step": 330
},
{
"epoch": 0.040939193257074055,
"grad_norm": 0.9257851243019104,
"learning_rate": 4.091456077015644e-07,
"loss": 0.393,
"step": 340
},
{
"epoch": 0.04214328717639976,
"grad_norm": 0.5936801433563232,
"learning_rate": 4.2117930204572805e-07,
"loss": 0.3912,
"step": 350
},
{
"epoch": 0.04334738109572547,
"grad_norm": 0.686888575553894,
"learning_rate": 4.3321299638989166e-07,
"loss": 0.4015,
"step": 360
},
{
"epoch": 0.044551475015051176,
"grad_norm": 0.5986278653144836,
"learning_rate": 4.452466907340554e-07,
"loss": 0.3622,
"step": 370
},
{
"epoch": 0.045755568934376885,
"grad_norm": 0.5603286623954773,
"learning_rate": 4.57280385078219e-07,
"loss": 0.3774,
"step": 380
},
{
"epoch": 0.04695966285370259,
"grad_norm": 1.2507776021957397,
"learning_rate": 4.6931407942238265e-07,
"loss": 0.3681,
"step": 390
},
{
"epoch": 0.048163756773028296,
"grad_norm": 0.5886845588684082,
"learning_rate": 4.813477737665463e-07,
"loss": 0.371,
"step": 400
},
{
"epoch": 0.049367850692354005,
"grad_norm": 0.5690301656723022,
"learning_rate": 4.9338146811071e-07,
"loss": 0.3454,
"step": 410
},
{
"epoch": 0.050571944611679714,
"grad_norm": 0.6363804340362549,
"learning_rate": 5.054151624548736e-07,
"loss": 0.3477,
"step": 420
},
{
"epoch": 0.05177603853100542,
"grad_norm": 0.49289166927337646,
"learning_rate": 5.174488567990373e-07,
"loss": 0.352,
"step": 430
},
{
"epoch": 0.052980132450331126,
"grad_norm": 0.5901724696159363,
"learning_rate": 5.29482551143201e-07,
"loss": 0.3514,
"step": 440
},
{
"epoch": 0.054184226369656835,
"grad_norm": 0.6019484996795654,
"learning_rate": 5.415162454873646e-07,
"loss": 0.3713,
"step": 450
},
{
"epoch": 0.055388320288982544,
"grad_norm": 0.5057175755500793,
"learning_rate": 5.535499398315282e-07,
"loss": 0.3346,
"step": 460
},
{
"epoch": 0.056592414208308246,
"grad_norm": 0.4834252893924713,
"learning_rate": 5.655836341756919e-07,
"loss": 0.3638,
"step": 470
},
{
"epoch": 0.057796508127633955,
"grad_norm": 0.6098750233650208,
"learning_rate": 5.776173285198555e-07,
"loss": 0.3622,
"step": 480
},
{
"epoch": 0.059000602046959665,
"grad_norm": 0.6201721429824829,
"learning_rate": 5.896510228640193e-07,
"loss": 0.3329,
"step": 490
},
{
"epoch": 0.060204695966285374,
"grad_norm": 0.7006021738052368,
"learning_rate": 6.01684717208183e-07,
"loss": 0.3487,
"step": 500
},
{
"epoch": 0.061408789885611076,
"grad_norm": 0.708990216255188,
"learning_rate": 6.137184115523465e-07,
"loss": 0.3448,
"step": 510
},
{
"epoch": 0.06261288380493679,
"grad_norm": 0.7767229676246643,
"learning_rate": 6.257521058965102e-07,
"loss": 0.3751,
"step": 520
},
{
"epoch": 0.0638169777242625,
"grad_norm": 0.6051218509674072,
"learning_rate": 6.377858002406738e-07,
"loss": 0.3502,
"step": 530
},
{
"epoch": 0.0650210716435882,
"grad_norm": 0.7111226916313171,
"learning_rate": 6.498194945848375e-07,
"loss": 0.3625,
"step": 540
},
{
"epoch": 0.06622516556291391,
"grad_norm": 0.7441733479499817,
"learning_rate": 6.618531889290013e-07,
"loss": 0.3269,
"step": 550
},
{
"epoch": 0.06742925948223961,
"grad_norm": 0.6909326910972595,
"learning_rate": 6.738868832731648e-07,
"loss": 0.3302,
"step": 560
},
{
"epoch": 0.06863335340156532,
"grad_norm": 0.7504749298095703,
"learning_rate": 6.859205776173285e-07,
"loss": 0.3425,
"step": 570
},
{
"epoch": 0.06983744732089103,
"grad_norm": 0.5878099799156189,
"learning_rate": 6.979542719614921e-07,
"loss": 0.3504,
"step": 580
},
{
"epoch": 0.07104154124021674,
"grad_norm": 0.5515761971473694,
"learning_rate": 7.099879663056558e-07,
"loss": 0.3409,
"step": 590
},
{
"epoch": 0.07224563515954245,
"grad_norm": 0.57797771692276,
"learning_rate": 7.220216606498195e-07,
"loss": 0.3416,
"step": 600
},
{
"epoch": 0.07344972907886815,
"grad_norm": 0.4524708390235901,
"learning_rate": 7.34055354993983e-07,
"loss": 0.3581,
"step": 610
},
{
"epoch": 0.07465382299819386,
"grad_norm": 0.718927800655365,
"learning_rate": 7.460890493381468e-07,
"loss": 0.3609,
"step": 620
},
{
"epoch": 0.07585791691751957,
"grad_norm": 0.5666077733039856,
"learning_rate": 7.581227436823105e-07,
"loss": 0.335,
"step": 630
},
{
"epoch": 0.07706201083684527,
"grad_norm": 0.5896601676940918,
"learning_rate": 7.701564380264741e-07,
"loss": 0.3274,
"step": 640
},
{
"epoch": 0.07826610475617098,
"grad_norm": 0.6044319868087769,
"learning_rate": 7.821901323706378e-07,
"loss": 0.3407,
"step": 650
},
{
"epoch": 0.07947019867549669,
"grad_norm": 0.6831541061401367,
"learning_rate": 7.942238267148013e-07,
"loss": 0.3333,
"step": 660
},
{
"epoch": 0.0806742925948224,
"grad_norm": 0.7124572396278381,
"learning_rate": 8.06257521058965e-07,
"loss": 0.3326,
"step": 670
},
{
"epoch": 0.08187838651414811,
"grad_norm": 0.732711136341095,
"learning_rate": 8.182912154031288e-07,
"loss": 0.3487,
"step": 680
},
{
"epoch": 0.08308248043347381,
"grad_norm": 0.7555579543113708,
"learning_rate": 8.303249097472924e-07,
"loss": 0.3218,
"step": 690
},
{
"epoch": 0.08428657435279951,
"grad_norm": 0.7618419528007507,
"learning_rate": 8.423586040914561e-07,
"loss": 0.3231,
"step": 700
},
{
"epoch": 0.08549066827212523,
"grad_norm": 0.7383216023445129,
"learning_rate": 8.543922984356197e-07,
"loss": 0.3218,
"step": 710
},
{
"epoch": 0.08669476219145093,
"grad_norm": 0.5902182459831238,
"learning_rate": 8.664259927797833e-07,
"loss": 0.3367,
"step": 720
},
{
"epoch": 0.08789885611077664,
"grad_norm": 0.6107906103134155,
"learning_rate": 8.78459687123947e-07,
"loss": 0.3331,
"step": 730
},
{
"epoch": 0.08910295003010235,
"grad_norm": 0.7179387211799622,
"learning_rate": 8.904933814681108e-07,
"loss": 0.3347,
"step": 740
},
{
"epoch": 0.09030704394942805,
"grad_norm": 0.8263080716133118,
"learning_rate": 9.025270758122743e-07,
"loss": 0.3247,
"step": 750
},
{
"epoch": 0.09151113786875377,
"grad_norm": 0.8549688458442688,
"learning_rate": 9.14560770156438e-07,
"loss": 0.3239,
"step": 760
},
{
"epoch": 0.09271523178807947,
"grad_norm": 0.6674267053604126,
"learning_rate": 9.265944645006016e-07,
"loss": 0.333,
"step": 770
},
{
"epoch": 0.09391932570740517,
"grad_norm": 0.5892189741134644,
"learning_rate": 9.386281588447653e-07,
"loss": 0.322,
"step": 780
},
{
"epoch": 0.09512341962673089,
"grad_norm": 0.7087513208389282,
"learning_rate": 9.50661853188929e-07,
"loss": 0.327,
"step": 790
},
{
"epoch": 0.09632751354605659,
"grad_norm": 0.6016402840614319,
"learning_rate": 9.626955475330926e-07,
"loss": 0.3255,
"step": 800
},
{
"epoch": 0.0975316074653823,
"grad_norm": 0.5783524513244629,
"learning_rate": 9.747292418772562e-07,
"loss": 0.3128,
"step": 810
},
{
"epoch": 0.09873570138470801,
"grad_norm": 0.6049711108207703,
"learning_rate": 9.8676293622142e-07,
"loss": 0.3257,
"step": 820
},
{
"epoch": 0.09993979530403371,
"grad_norm": 0.6259274482727051,
"learning_rate": 9.987966305655835e-07,
"loss": 0.3318,
"step": 830
},
{
"epoch": 0.10114388922335943,
"grad_norm": 0.5331777930259705,
"learning_rate": 9.999964221834556e-07,
"loss": 0.3133,
"step": 840
},
{
"epoch": 0.10234798314268513,
"grad_norm": 0.5190764665603638,
"learning_rate": 9.999840544882987e-07,
"loss": 0.3349,
"step": 850
},
{
"epoch": 0.10355207706201083,
"grad_norm": 0.5867928862571716,
"learning_rate": 9.99962852962418e-07,
"loss": 0.3252,
"step": 860
},
{
"epoch": 0.10475617098133655,
"grad_norm": 0.7667666673660278,
"learning_rate": 9.999328179804064e-07,
"loss": 0.3269,
"step": 870
},
{
"epoch": 0.10596026490066225,
"grad_norm": 0.5684708952903748,
"learning_rate": 9.998939500729291e-07,
"loss": 0.3204,
"step": 880
},
{
"epoch": 0.10716435881998795,
"grad_norm": 0.5369793772697449,
"learning_rate": 9.99846249926713e-07,
"loss": 0.2997,
"step": 890
},
{
"epoch": 0.10836845273931367,
"grad_norm": 0.5773791074752808,
"learning_rate": 9.997897183845347e-07,
"loss": 0.3147,
"step": 900
},
{
"epoch": 0.10957254665863937,
"grad_norm": 0.571826159954071,
"learning_rate": 9.997243564452064e-07,
"loss": 0.32,
"step": 910
},
{
"epoch": 0.11077664057796509,
"grad_norm": 0.420244961977005,
"learning_rate": 9.996501652635578e-07,
"loss": 0.3141,
"step": 920
},
{
"epoch": 0.11198073449729079,
"grad_norm": 0.5253920555114746,
"learning_rate": 9.99567146150415e-07,
"loss": 0.3201,
"step": 930
},
{
"epoch": 0.11318482841661649,
"grad_norm": 0.49279969930648804,
"learning_rate": 9.994753005725785e-07,
"loss": 0.3076,
"step": 940
},
{
"epoch": 0.11438892233594221,
"grad_norm": 0.6114805936813354,
"learning_rate": 9.993746301527965e-07,
"loss": 0.3209,
"step": 950
},
{
"epoch": 0.11559301625526791,
"grad_norm": 1.6514418125152588,
"learning_rate": 9.99265136669737e-07,
"loss": 0.319,
"step": 960
},
{
"epoch": 0.11679711017459361,
"grad_norm": 0.6415925621986389,
"learning_rate": 9.99146822057955e-07,
"loss": 0.3268,
"step": 970
},
{
"epoch": 0.11800120409391933,
"grad_norm": 0.5680079460144043,
"learning_rate": 9.990196884078599e-07,
"loss": 0.3139,
"step": 980
},
{
"epoch": 0.11920529801324503,
"grad_norm": 0.715497612953186,
"learning_rate": 9.988837379656778e-07,
"loss": 0.3143,
"step": 990
},
{
"epoch": 0.12040939193257075,
"grad_norm": 0.6379466652870178,
"learning_rate": 9.987389731334112e-07,
"loss": 0.3037,
"step": 1000
},
{
"epoch": 0.12161348585189645,
"grad_norm": 0.5227240920066833,
"learning_rate": 9.985853964687985e-07,
"loss": 0.3202,
"step": 1010
},
{
"epoch": 0.12281757977122215,
"grad_norm": 0.5148226022720337,
"learning_rate": 9.984230106852658e-07,
"loss": 0.3089,
"step": 1020
},
{
"epoch": 0.12402167369054787,
"grad_norm": 0.8337252140045166,
"learning_rate": 9.982518186518824e-07,
"loss": 0.3093,
"step": 1030
},
{
"epoch": 0.12522576760987358,
"grad_norm": 0.5874176621437073,
"learning_rate": 9.980718233933072e-07,
"loss": 0.3257,
"step": 1040
},
{
"epoch": 0.12642986152919927,
"grad_norm": 0.6203235983848572,
"learning_rate": 9.978830280897373e-07,
"loss": 0.3094,
"step": 1050
},
{
"epoch": 0.127633955448525,
"grad_norm": 0.7386701107025146,
"learning_rate": 9.976854360768501e-07,
"loss": 0.3283,
"step": 1060
},
{
"epoch": 0.1288380493678507,
"grad_norm": 0.7480394244194031,
"learning_rate": 9.97479050845746e-07,
"loss": 0.322,
"step": 1070
},
{
"epoch": 0.1300421432871764,
"grad_norm": 0.6779530048370361,
"learning_rate": 9.97263876042886e-07,
"loss": 0.3263,
"step": 1080
},
{
"epoch": 0.1312462372065021,
"grad_norm": 1.0457607507705688,
"learning_rate": 9.970399154700262e-07,
"loss": 0.324,
"step": 1090
},
{
"epoch": 0.13245033112582782,
"grad_norm": 0.4574492871761322,
"learning_rate": 9.96807173084153e-07,
"loss": 0.3033,
"step": 1100
},
{
"epoch": 0.1336544250451535,
"grad_norm": 0.4800940454006195,
"learning_rate": 9.965656529974108e-07,
"loss": 0.3076,
"step": 1110
},
{
"epoch": 0.13485851896447923,
"grad_norm": 0.5336936116218567,
"learning_rate": 9.96315359477031e-07,
"loss": 0.3029,
"step": 1120
},
{
"epoch": 0.13606261288380495,
"grad_norm": 0.9403670430183411,
"learning_rate": 9.960562969452559e-07,
"loss": 0.3019,
"step": 1130
},
{
"epoch": 0.13726670680313063,
"grad_norm": 0.6152085661888123,
"learning_rate": 9.957884699792604e-07,
"loss": 0.3051,
"step": 1140
},
{
"epoch": 0.13847080072245635,
"grad_norm": 0.7313536405563354,
"learning_rate": 9.955118833110716e-07,
"loss": 0.3137,
"step": 1150
},
{
"epoch": 0.13967489464178207,
"grad_norm": 0.47397103905677795,
"learning_rate": 9.95226541827485e-07,
"loss": 0.3214,
"step": 1160
},
{
"epoch": 0.14087898856110775,
"grad_norm": 0.4812333881855011,
"learning_rate": 9.949324505699782e-07,
"loss": 0.3164,
"step": 1170
},
{
"epoch": 0.14208308248043347,
"grad_norm": 0.6729305386543274,
"learning_rate": 9.946296147346215e-07,
"loss": 0.2946,
"step": 1180
},
{
"epoch": 0.1432871763997592,
"grad_norm": 0.6568790078163147,
"learning_rate": 9.943180396719867e-07,
"loss": 0.2929,
"step": 1190
},
{
"epoch": 0.1444912703190849,
"grad_norm": 0.5633556842803955,
"learning_rate": 9.939977308870518e-07,
"loss": 0.3073,
"step": 1200
},
{
"epoch": 0.1456953642384106,
"grad_norm": 1.1128957271575928,
"learning_rate": 9.936686940391048e-07,
"loss": 0.3264,
"step": 1210
},
{
"epoch": 0.1468994581577363,
"grad_norm": 0.5192599892616272,
"learning_rate": 9.933309349416428e-07,
"loss": 0.3064,
"step": 1220
},
{
"epoch": 0.14810355207706202,
"grad_norm": 0.49194392561912537,
"learning_rate": 9.92984459562269e-07,
"loss": 0.302,
"step": 1230
},
{
"epoch": 0.1493076459963877,
"grad_norm": 0.5606468915939331,
"learning_rate": 9.926292740225888e-07,
"loss": 0.3037,
"step": 1240
},
{
"epoch": 0.15051173991571343,
"grad_norm": 0.544266939163208,
"learning_rate": 9.922653845981e-07,
"loss": 0.3025,
"step": 1250
},
{
"epoch": 0.15171583383503914,
"grad_norm": 1.0137197971343994,
"learning_rate": 9.918927977180826e-07,
"loss": 0.2998,
"step": 1260
},
{
"epoch": 0.15291992775436483,
"grad_norm": 0.4881134629249573,
"learning_rate": 9.91511519965486e-07,
"loss": 0.2975,
"step": 1270
},
{
"epoch": 0.15412402167369055,
"grad_norm": 0.4854426383972168,
"learning_rate": 9.911215580768106e-07,
"loss": 0.3109,
"step": 1280
},
{
"epoch": 0.15532811559301626,
"grad_norm": 0.5056730508804321,
"learning_rate": 9.90722918941991e-07,
"loss": 0.3121,
"step": 1290
},
{
"epoch": 0.15653220951234195,
"grad_norm": 0.5286668539047241,
"learning_rate": 9.903156096042734e-07,
"loss": 0.2982,
"step": 1300
},
{
"epoch": 0.15773630343166767,
"grad_norm": 0.5490984916687012,
"learning_rate": 9.898996372600903e-07,
"loss": 0.3115,
"step": 1310
},
{
"epoch": 0.15894039735099338,
"grad_norm": 0.614521861076355,
"learning_rate": 9.894750092589349e-07,
"loss": 0.2985,
"step": 1320
},
{
"epoch": 0.16014449127031907,
"grad_norm": 0.5678403973579407,
"learning_rate": 9.8904173310323e-07,
"loss": 0.3046,
"step": 1330
},
{
"epoch": 0.1613485851896448,
"grad_norm": 0.5179656147956848,
"learning_rate": 9.885998164481966e-07,
"loss": 0.3053,
"step": 1340
},
{
"epoch": 0.1625526791089705,
"grad_norm": 0.526849091053009,
"learning_rate": 9.881492671017172e-07,
"loss": 0.3143,
"step": 1350
},
{
"epoch": 0.16375677302829622,
"grad_norm": 0.5683344006538391,
"learning_rate": 9.876900930241991e-07,
"loss": 0.3031,
"step": 1360
},
{
"epoch": 0.1649608669476219,
"grad_norm": 0.5243839621543884,
"learning_rate": 9.872223023284333e-07,
"loss": 0.312,
"step": 1370
},
{
"epoch": 0.16616496086694763,
"grad_norm": 0.5260365605354309,
"learning_rate": 9.867459032794508e-07,
"loss": 0.3037,
"step": 1380
},
{
"epoch": 0.16736905478627334,
"grad_norm": 0.4755154252052307,
"learning_rate": 9.86260904294377e-07,
"loss": 0.2916,
"step": 1390
},
{
"epoch": 0.16857314870559903,
"grad_norm": 0.5555715560913086,
"learning_rate": 9.857673139422833e-07,
"loss": 0.3135,
"step": 1400
},
{
"epoch": 0.16977724262492475,
"grad_norm": 0.5810279250144958,
"learning_rate": 9.85265140944035e-07,
"loss": 0.3104,
"step": 1410
},
{
"epoch": 0.17098133654425046,
"grad_norm": 0.48022618889808655,
"learning_rate": 9.847543941721379e-07,
"loss": 0.3022,
"step": 1420
},
{
"epoch": 0.17218543046357615,
"grad_norm": 0.5191965103149414,
"learning_rate": 9.842350826505802e-07,
"loss": 0.3018,
"step": 1430
},
{
"epoch": 0.17338952438290187,
"grad_norm": 1.2972302436828613,
"learning_rate": 9.837072155546753e-07,
"loss": 0.3026,
"step": 1440
},
{
"epoch": 0.17459361830222758,
"grad_norm": 0.47315987944602966,
"learning_rate": 9.831708022108972e-07,
"loss": 0.311,
"step": 1450
},
{
"epoch": 0.17579771222155327,
"grad_norm": 0.5953189134597778,
"learning_rate": 9.826258520967177e-07,
"loss": 0.3071,
"step": 1460
},
{
"epoch": 0.177001806140879,
"grad_norm": 0.5407562851905823,
"learning_rate": 9.820723748404382e-07,
"loss": 0.31,
"step": 1470
},
{
"epoch": 0.1782059000602047,
"grad_norm": 0.5249618291854858,
"learning_rate": 9.815103802210193e-07,
"loss": 0.2898,
"step": 1480
},
{
"epoch": 0.1794099939795304,
"grad_norm": 0.5347439646720886,
"learning_rate": 9.80939878167908e-07,
"loss": 0.2944,
"step": 1490
},
{
"epoch": 0.1806140878988561,
"grad_norm": 0.49509304761886597,
"learning_rate": 9.80360878760863e-07,
"loss": 0.3073,
"step": 1500
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.5182557106018066,
"learning_rate": 9.79773392229776e-07,
"loss": 0.3092,
"step": 1510
},
{
"epoch": 0.18302227573750754,
"grad_norm": 0.5343918204307556,
"learning_rate": 9.79177428954492e-07,
"loss": 0.3058,
"step": 1520
},
{
"epoch": 0.18422636965683323,
"grad_norm": 0.42448320984840393,
"learning_rate": 9.785729994646228e-07,
"loss": 0.2966,
"step": 1530
},
{
"epoch": 0.18543046357615894,
"grad_norm": 0.514305055141449,
"learning_rate": 9.779601144393655e-07,
"loss": 0.3063,
"step": 1540
},
{
"epoch": 0.18663455749548466,
"grad_norm": 0.559808075428009,
"learning_rate": 9.773387847073102e-07,
"loss": 0.3103,
"step": 1550
},
{
"epoch": 0.18783865141481035,
"grad_norm": 0.5099034905433655,
"learning_rate": 9.767090212462506e-07,
"loss": 0.3045,
"step": 1560
},
{
"epoch": 0.18904274533413606,
"grad_norm": 0.5309582352638245,
"learning_rate": 9.76070835182989e-07,
"loss": 0.3198,
"step": 1570
},
{
"epoch": 0.19024683925346178,
"grad_norm": 0.5174340605735779,
"learning_rate": 9.754242377931402e-07,
"loss": 0.3019,
"step": 1580
},
{
"epoch": 0.19145093317278747,
"grad_norm": 0.47818174958229065,
"learning_rate": 9.747692405009327e-07,
"loss": 0.2885,
"step": 1590
},
{
"epoch": 0.19265502709211318,
"grad_norm": 0.4435511529445648,
"learning_rate": 9.741058548790055e-07,
"loss": 0.2716,
"step": 1600
},
{
"epoch": 0.1938591210114389,
"grad_norm": 0.47226864099502563,
"learning_rate": 9.734340926482052e-07,
"loss": 0.2911,
"step": 1610
},
{
"epoch": 0.1950632149307646,
"grad_norm": 0.4990203082561493,
"learning_rate": 9.72753965677378e-07,
"loss": 0.3119,
"step": 1620
},
{
"epoch": 0.1962673088500903,
"grad_norm": 0.6255252957344055,
"learning_rate": 9.7206548598316e-07,
"loss": 0.2902,
"step": 1630
},
{
"epoch": 0.19747140276941602,
"grad_norm": 0.5827116370201111,
"learning_rate": 9.713686657297655e-07,
"loss": 0.3079,
"step": 1640
},
{
"epoch": 0.1986754966887417,
"grad_norm": 0.5475650429725647,
"learning_rate": 9.706635172287715e-07,
"loss": 0.3095,
"step": 1650
},
{
"epoch": 0.19987959060806743,
"grad_norm": 0.674460768699646,
"learning_rate": 9.699500529389001e-07,
"loss": 0.2953,
"step": 1660
},
{
"epoch": 0.20108368452739314,
"grad_norm": 0.5000407695770264,
"learning_rate": 9.692282854657989e-07,
"loss": 0.3055,
"step": 1670
},
{
"epoch": 0.20228777844671886,
"grad_norm": 0.5063086748123169,
"learning_rate": 9.684982275618178e-07,
"loss": 0.2952,
"step": 1680
},
{
"epoch": 0.20349187236604455,
"grad_norm": 0.6266674399375916,
"learning_rate": 9.677598921257842e-07,
"loss": 0.3028,
"step": 1690
},
{
"epoch": 0.20469596628537026,
"grad_norm": 1.3428351879119873,
"learning_rate": 9.67013292202775e-07,
"loss": 0.3165,
"step": 1700
},
{
"epoch": 0.20590006020469598,
"grad_norm": 0.6307231187820435,
"learning_rate": 9.66258440983885e-07,
"loss": 0.3112,
"step": 1710
},
{
"epoch": 0.20710415412402167,
"grad_norm": 0.5176913738250732,
"learning_rate": 9.654953518059953e-07,
"loss": 0.3042,
"step": 1720
},
{
"epoch": 0.20830824804334738,
"grad_norm": 0.4618211090564728,
"learning_rate": 9.647240381515376e-07,
"loss": 0.3107,
"step": 1730
},
{
"epoch": 0.2095123419626731,
"grad_norm": 0.4354129135608673,
"learning_rate": 9.639445136482546e-07,
"loss": 0.2932,
"step": 1740
},
{
"epoch": 0.2107164358819988,
"grad_norm": 0.6150096654891968,
"learning_rate": 9.631567920689607e-07,
"loss": 0.2898,
"step": 1750
},
{
"epoch": 0.2119205298013245,
"grad_norm": 0.4629852771759033,
"learning_rate": 9.623608873312979e-07,
"loss": 0.2969,
"step": 1760
},
{
"epoch": 0.21312462372065022,
"grad_norm": 0.4912186563014984,
"learning_rate": 9.615568134974902e-07,
"loss": 0.3037,
"step": 1770
},
{
"epoch": 0.2143287176399759,
"grad_norm": 0.5452593564987183,
"learning_rate": 9.607445847740946e-07,
"loss": 0.3011,
"step": 1780
},
{
"epoch": 0.21553281155930162,
"grad_norm": 0.5524305701255798,
"learning_rate": 9.599242155117514e-07,
"loss": 0.3056,
"step": 1790
},
{
"epoch": 0.21673690547862734,
"grad_norm": 0.4734737277030945,
"learning_rate": 9.590957202049288e-07,
"loss": 0.2937,
"step": 1800
},
{
"epoch": 0.21794099939795303,
"grad_norm": 0.5050627589225769,
"learning_rate": 9.582591134916683e-07,
"loss": 0.2964,
"step": 1810
},
{
"epoch": 0.21914509331727874,
"grad_norm": 0.5784972310066223,
"learning_rate": 9.574144101533258e-07,
"loss": 0.3126,
"step": 1820
},
{
"epoch": 0.22034918723660446,
"grad_norm": 0.67679762840271,
"learning_rate": 9.565616251143093e-07,
"loss": 0.2997,
"step": 1830
},
{
"epoch": 0.22155328115593018,
"grad_norm": 0.730844259262085,
"learning_rate": 9.55700773441817e-07,
"loss": 0.2992,
"step": 1840
},
{
"epoch": 0.22275737507525586,
"grad_norm": 0.511701226234436,
"learning_rate": 9.5483187034557e-07,
"loss": 0.2843,
"step": 1850
},
{
"epoch": 0.22396146899458158,
"grad_norm": 0.49653661251068115,
"learning_rate": 9.539549311775434e-07,
"loss": 0.3003,
"step": 1860
},
{
"epoch": 0.2251655629139073,
"grad_norm": 0.479397714138031,
"learning_rate": 9.530699714316955e-07,
"loss": 0.3007,
"step": 1870
},
{
"epoch": 0.22636965683323299,
"grad_norm": 0.5917854905128479,
"learning_rate": 9.521770067436944e-07,
"loss": 0.2818,
"step": 1880
},
{
"epoch": 0.2275737507525587,
"grad_norm": 0.4750485420227051,
"learning_rate": 9.512760528906409e-07,
"loss": 0.3107,
"step": 1890
},
{
"epoch": 0.22877784467188442,
"grad_norm": 0.5081465244293213,
"learning_rate": 9.503671257907905e-07,
"loss": 0.3003,
"step": 1900
},
{
"epoch": 0.2299819385912101,
"grad_norm": 0.7816819548606873,
"learning_rate": 9.494502415032714e-07,
"loss": 0.2898,
"step": 1910
},
{
"epoch": 0.23118603251053582,
"grad_norm": 0.600690484046936,
"learning_rate": 9.485254162278013e-07,
"loss": 0.2975,
"step": 1920
},
{
"epoch": 0.23239012642986154,
"grad_norm": 0.6016291379928589,
"learning_rate": 9.475926663044016e-07,
"loss": 0.2895,
"step": 1930
},
{
"epoch": 0.23359422034918723,
"grad_norm": 0.5959491729736328,
"learning_rate": 9.466520082131074e-07,
"loss": 0.293,
"step": 1940
},
{
"epoch": 0.23479831426851294,
"grad_norm": 0.5337576270103455,
"learning_rate": 9.457034585736776e-07,
"loss": 0.2954,
"step": 1950
},
{
"epoch": 0.23600240818783866,
"grad_norm": 0.5701966881752014,
"learning_rate": 9.447470341453003e-07,
"loss": 0.3016,
"step": 1960
},
{
"epoch": 0.23720650210716435,
"grad_norm": 0.48122677206993103,
"learning_rate": 9.437827518262976e-07,
"loss": 0.2834,
"step": 1970
},
{
"epoch": 0.23841059602649006,
"grad_norm": 0.6107509732246399,
"learning_rate": 9.428106286538263e-07,
"loss": 0.2865,
"step": 1980
},
{
"epoch": 0.23961468994581578,
"grad_norm": 0.4537561237812042,
"learning_rate": 9.418306818035773e-07,
"loss": 0.2981,
"step": 1990
},
{
"epoch": 0.2408187838651415,
"grad_norm": 0.6205712556838989,
"learning_rate": 9.408429285894721e-07,
"loss": 0.3099,
"step": 2000
},
{
"epoch": 0.24202287778446718,
"grad_norm": 0.4940670132637024,
"learning_rate": 9.398473864633564e-07,
"loss": 0.2942,
"step": 2010
},
{
"epoch": 0.2432269717037929,
"grad_norm": 0.45464888215065,
"learning_rate": 9.388440730146923e-07,
"loss": 0.2875,
"step": 2020
},
{
"epoch": 0.24443106562311862,
"grad_norm": 0.4339371919631958,
"learning_rate": 9.378330059702479e-07,
"loss": 0.284,
"step": 2030
},
{
"epoch": 0.2456351595424443,
"grad_norm": 0.6798887848854065,
"learning_rate": 9.368142031937826e-07,
"loss": 0.3079,
"step": 2040
},
{
"epoch": 0.24683925346177002,
"grad_norm": 0.504805326461792,
"learning_rate": 9.357876826857334e-07,
"loss": 0.2942,
"step": 2050
},
{
"epoch": 0.24804334738109574,
"grad_norm": 1.0256134271621704,
"learning_rate": 9.347534625828955e-07,
"loss": 0.2958,
"step": 2060
},
{
"epoch": 0.24924744130042142,
"grad_norm": 0.7034043073654175,
"learning_rate": 9.337115611581019e-07,
"loss": 0.2977,
"step": 2070
},
{
"epoch": 0.25045153521974717,
"grad_norm": 0.6767880916595459,
"learning_rate": 9.326619968199016e-07,
"loss": 0.2843,
"step": 2080
},
{
"epoch": 0.25165562913907286,
"grad_norm": 0.5257042050361633,
"learning_rate": 9.316047881122334e-07,
"loss": 0.2869,
"step": 2090
},
{
"epoch": 0.25285972305839854,
"grad_norm": 0.5919986963272095,
"learning_rate": 9.305399537140983e-07,
"loss": 0.3009,
"step": 2100
},
{
"epoch": 0.2540638169777243,
"grad_norm": 0.5936114192008972,
"learning_rate": 9.294675124392302e-07,
"loss": 0.2863,
"step": 2110
},
{
"epoch": 0.25526791089705,
"grad_norm": 1.1754176616668701,
"learning_rate": 9.283874832357625e-07,
"loss": 0.2808,
"step": 2120
},
{
"epoch": 0.25647200481637566,
"grad_norm": 0.6144666075706482,
"learning_rate": 9.272998851858943e-07,
"loss": 0.2854,
"step": 2130
},
{
"epoch": 0.2576760987357014,
"grad_norm": 0.47984328866004944,
"learning_rate": 9.262047375055524e-07,
"loss": 0.2978,
"step": 2140
},
{
"epoch": 0.2588801926550271,
"grad_norm": 0.6158226728439331,
"learning_rate": 9.251020595440524e-07,
"loss": 0.3072,
"step": 2150
},
{
"epoch": 0.2600842865743528,
"grad_norm": 0.6357386708259583,
"learning_rate": 9.239918707837564e-07,
"loss": 0.2927,
"step": 2160
},
{
"epoch": 0.26128838049367853,
"grad_norm": 0.6893799901008606,
"learning_rate": 9.228741908397293e-07,
"loss": 0.2988,
"step": 2170
},
{
"epoch": 0.2624924744130042,
"grad_norm": 0.5763195157051086,
"learning_rate": 9.217490394593914e-07,
"loss": 0.3049,
"step": 2180
},
{
"epoch": 0.2636965683323299,
"grad_norm": 0.5649781823158264,
"learning_rate": 9.206164365221706e-07,
"loss": 0.3083,
"step": 2190
},
{
"epoch": 0.26490066225165565,
"grad_norm": 0.4519605040550232,
"learning_rate": 9.194764020391506e-07,
"loss": 0.274,
"step": 2200
},
{
"epoch": 0.26610475617098134,
"grad_norm": 0.5203403830528259,
"learning_rate": 9.183289561527164e-07,
"loss": 0.2823,
"step": 2210
},
{
"epoch": 0.267308850090307,
"grad_norm": 0.525934100151062,
"learning_rate": 9.171741191362005e-07,
"loss": 0.2928,
"step": 2220
},
{
"epoch": 0.26851294400963277,
"grad_norm": 0.5151864290237427,
"learning_rate": 9.160119113935227e-07,
"loss": 0.2914,
"step": 2230
},
{
"epoch": 0.26971703792895846,
"grad_norm": 0.663339376449585,
"learning_rate": 9.14842353458831e-07,
"loss": 0.301,
"step": 2240
},
{
"epoch": 0.27092113184828415,
"grad_norm": 0.5526972413063049,
"learning_rate": 9.136654659961381e-07,
"loss": 0.2931,
"step": 2250
},
{
"epoch": 0.2721252257676099,
"grad_norm": 0.6518740057945251,
"learning_rate": 9.12481269798956e-07,
"loss": 0.2772,
"step": 2260
},
{
"epoch": 0.2733293196869356,
"grad_norm": 0.5191295742988586,
"learning_rate": 9.112897857899298e-07,
"loss": 0.2933,
"step": 2270
},
{
"epoch": 0.27453341360626127,
"grad_norm": 1.087936282157898,
"learning_rate": 9.100910350204669e-07,
"loss": 0.2956,
"step": 2280
},
{
"epoch": 0.275737507525587,
"grad_norm": 0.5870952010154724,
"learning_rate": 9.088850386703653e-07,
"loss": 0.2857,
"step": 2290
},
{
"epoch": 0.2769416014449127,
"grad_norm": 0.5123207569122314,
"learning_rate": 9.076718180474399e-07,
"loss": 0.3005,
"step": 2300
},
{
"epoch": 0.2781456953642384,
"grad_norm": 0.47658002376556396,
"learning_rate": 9.064513945871457e-07,
"loss": 0.2889,
"step": 2310
},
{
"epoch": 0.27934978928356413,
"grad_norm": 0.564738929271698,
"learning_rate": 9.052237898521984e-07,
"loss": 0.2929,
"step": 2320
},
{
"epoch": 0.2805538832028898,
"grad_norm": 0.47116583585739136,
"learning_rate": 9.03989025532195e-07,
"loss": 0.2942,
"step": 2330
},
{
"epoch": 0.2817579771222155,
"grad_norm": 0.5838178396224976,
"learning_rate": 9.027471234432292e-07,
"loss": 0.2883,
"step": 2340
},
{
"epoch": 0.28296207104154125,
"grad_norm": 0.48679229617118835,
"learning_rate": 9.014981055275059e-07,
"loss": 0.29,
"step": 2350
},
{
"epoch": 0.28416616496086694,
"grad_norm": 0.5863898992538452,
"learning_rate": 9.00241993852955e-07,
"loss": 0.2871,
"step": 2360
},
{
"epoch": 0.28537025888019263,
"grad_norm": 0.5949921607971191,
"learning_rate": 8.989788106128402e-07,
"loss": 0.2927,
"step": 2370
},
{
"epoch": 0.2865743527995184,
"grad_norm": 0.42538484930992126,
"learning_rate": 8.977085781253668e-07,
"loss": 0.2825,
"step": 2380
},
{
"epoch": 0.28777844671884406,
"grad_norm": 0.5678000450134277,
"learning_rate": 8.964313188332881e-07,
"loss": 0.294,
"step": 2390
},
{
"epoch": 0.2889825406381698,
"grad_norm": 0.5283777713775635,
"learning_rate": 8.951470553035086e-07,
"loss": 0.286,
"step": 2400
},
{
"epoch": 0.2901866345574955,
"grad_norm": 0.8639681935310364,
"learning_rate": 8.938558102266851e-07,
"loss": 0.2971,
"step": 2410
},
{
"epoch": 0.2913907284768212,
"grad_norm": 0.5353107452392578,
"learning_rate": 8.925576064168261e-07,
"loss": 0.3038,
"step": 2420
},
{
"epoch": 0.2925948223961469,
"grad_norm": 0.5691916346549988,
"learning_rate": 8.912524668108885e-07,
"loss": 0.2901,
"step": 2430
},
{
"epoch": 0.2937989163154726,
"grad_norm": 0.5999578833580017,
"learning_rate": 8.899404144683724e-07,
"loss": 0.2864,
"step": 2440
},
{
"epoch": 0.2950030102347983,
"grad_norm": 0.6660271883010864,
"learning_rate": 8.886214725709136e-07,
"loss": 0.2866,
"step": 2450
},
{
"epoch": 0.29620710415412405,
"grad_norm": 0.5501262545585632,
"learning_rate": 8.872956644218742e-07,
"loss": 0.2909,
"step": 2460
},
{
"epoch": 0.29741119807344973,
"grad_norm": 0.44489532709121704,
"learning_rate": 8.859630134459308e-07,
"loss": 0.2869,
"step": 2470
},
{
"epoch": 0.2986152919927754,
"grad_norm": 0.619097113609314,
"learning_rate": 8.846235431886604e-07,
"loss": 0.2782,
"step": 2480
},
{
"epoch": 0.29981938591210117,
"grad_norm": 0.49712878465652466,
"learning_rate": 8.832772773161251e-07,
"loss": 0.2848,
"step": 2490
},
{
"epoch": 0.30102347983142685,
"grad_norm": 0.46963346004486084,
"learning_rate": 8.819242396144529e-07,
"loss": 0.2915,
"step": 2500
},
{
"epoch": 0.30222757375075254,
"grad_norm": 0.5881354212760925,
"learning_rate": 8.805644539894181e-07,
"loss": 0.2969,
"step": 2510
},
{
"epoch": 0.3034316676700783,
"grad_norm": 0.5345028042793274,
"learning_rate": 8.791979444660193e-07,
"loss": 0.2985,
"step": 2520
},
{
"epoch": 0.304635761589404,
"grad_norm": 0.5038124322891235,
"learning_rate": 8.778247351880536e-07,
"loss": 0.2931,
"step": 2530
},
{
"epoch": 0.30583985550872966,
"grad_norm": 0.6723479628562927,
"learning_rate": 8.764448504176919e-07,
"loss": 0.2885,
"step": 2540
},
{
"epoch": 0.3070439494280554,
"grad_norm": 0.474516361951828,
"learning_rate": 8.750583145350483e-07,
"loss": 0.2906,
"step": 2550
},
{
"epoch": 0.3082480433473811,
"grad_norm": 0.509379506111145,
"learning_rate": 8.736651520377507e-07,
"loss": 0.2874,
"step": 2560
},
{
"epoch": 0.3094521372667068,
"grad_norm": 0.9317507743835449,
"learning_rate": 8.722653875405075e-07,
"loss": 0.2891,
"step": 2570
},
{
"epoch": 0.3106562311860325,
"grad_norm": 0.4634588360786438,
"learning_rate": 8.708590457746727e-07,
"loss": 0.284,
"step": 2580
},
{
"epoch": 0.3118603251053582,
"grad_norm": 0.4674171209335327,
"learning_rate": 8.694461515878088e-07,
"loss": 0.2851,
"step": 2590
},
{
"epoch": 0.3130644190246839,
"grad_norm": 0.4606451988220215,
"learning_rate": 8.68026729943248e-07,
"loss": 0.282,
"step": 2600
},
{
"epoch": 0.31426851294400965,
"grad_norm": 0.5793256163597107,
"learning_rate": 8.666008059196513e-07,
"loss": 0.2852,
"step": 2610
},
{
"epoch": 0.31547260686333534,
"grad_norm": 0.742026686668396,
"learning_rate": 8.65168404710565e-07,
"loss": 0.2909,
"step": 2620
},
{
"epoch": 0.316676700782661,
"grad_norm": 0.469868928194046,
"learning_rate": 8.637295516239757e-07,
"loss": 0.2784,
"step": 2630
},
{
"epoch": 0.31788079470198677,
"grad_norm": 0.6895257234573364,
"learning_rate": 8.622842720818635e-07,
"loss": 0.2849,
"step": 2640
},
{
"epoch": 0.31908488862131246,
"grad_norm": 0.6843047142028809,
"learning_rate": 8.608325916197524e-07,
"loss": 0.2969,
"step": 2650
},
{
"epoch": 0.32028898254063815,
"grad_norm": 2.822052240371704,
"learning_rate": 8.593745358862592e-07,
"loss": 0.2954,
"step": 2660
},
{
"epoch": 0.3214930764599639,
"grad_norm": 0.5745678544044495,
"learning_rate": 8.579101306426406e-07,
"loss": 0.3005,
"step": 2670
},
{
"epoch": 0.3226971703792896,
"grad_norm": 0.4625186026096344,
"learning_rate": 8.564394017623378e-07,
"loss": 0.2889,
"step": 2680
},
{
"epoch": 0.32390126429861527,
"grad_norm": 0.5813141465187073,
"learning_rate": 8.549623752305192e-07,
"loss": 0.2926,
"step": 2690
},
{
"epoch": 0.325105358217941,
"grad_norm": 0.49706658720970154,
"learning_rate": 8.534790771436222e-07,
"loss": 0.2884,
"step": 2700
},
{
"epoch": 0.3263094521372667,
"grad_norm": 0.5477120280265808,
"learning_rate": 8.519895337088907e-07,
"loss": 0.2922,
"step": 2710
},
{
"epoch": 0.32751354605659244,
"grad_norm": 1.157457709312439,
"learning_rate": 8.504937712439131e-07,
"loss": 0.2699,
"step": 2720
},
{
"epoch": 0.32871763997591813,
"grad_norm": 0.5263344049453735,
"learning_rate": 8.48991816176157e-07,
"loss": 0.2888,
"step": 2730
},
{
"epoch": 0.3299217338952438,
"grad_norm": 0.764481782913208,
"learning_rate": 8.474836950425026e-07,
"loss": 0.292,
"step": 2740
},
{
"epoch": 0.33112582781456956,
"grad_norm": 0.5704035758972168,
"learning_rate": 8.459694344887731e-07,
"loss": 0.2928,
"step": 2750
},
{
"epoch": 0.33232992173389525,
"grad_norm": 0.46473219990730286,
"learning_rate": 8.444490612692645e-07,
"loss": 0.2816,
"step": 2760
},
{
"epoch": 0.33353401565322094,
"grad_norm": 0.5250662565231323,
"learning_rate": 8.429226022462728e-07,
"loss": 0.2881,
"step": 2770
},
{
"epoch": 0.3347381095725467,
"grad_norm": 0.6085227727890015,
"learning_rate": 8.413900843896193e-07,
"loss": 0.3122,
"step": 2780
},
{
"epoch": 0.33594220349187237,
"grad_norm": 0.7203246355056763,
"learning_rate": 8.398515347761745e-07,
"loss": 0.2911,
"step": 2790
},
{
"epoch": 0.33714629741119806,
"grad_norm": 0.5305497050285339,
"learning_rate": 8.383069805893784e-07,
"loss": 0.2888,
"step": 2800
},
{
"epoch": 0.3383503913305238,
"grad_norm": 0.5452449917793274,
"learning_rate": 8.367564491187622e-07,
"loss": 0.2866,
"step": 2810
},
{
"epoch": 0.3395544852498495,
"grad_norm": 0.4815659523010254,
"learning_rate": 8.351999677594645e-07,
"loss": 0.2863,
"step": 2820
},
{
"epoch": 0.3407585791691752,
"grad_norm": 0.5499128103256226,
"learning_rate": 8.336375640117481e-07,
"loss": 0.2865,
"step": 2830
},
{
"epoch": 0.3419626730885009,
"grad_norm": 0.559804379940033,
"learning_rate": 8.320692654805136e-07,
"loss": 0.2833,
"step": 2840
},
{
"epoch": 0.3431667670078266,
"grad_norm": 0.5070551633834839,
"learning_rate": 8.304950998748124e-07,
"loss": 0.2969,
"step": 2850
},
{
"epoch": 0.3443708609271523,
"grad_norm": 0.5566725730895996,
"learning_rate": 8.289150950073564e-07,
"loss": 0.2814,
"step": 2860
},
{
"epoch": 0.34557495484647804,
"grad_norm": 0.5421969890594482,
"learning_rate": 8.273292787940268e-07,
"loss": 0.2805,
"step": 2870
},
{
"epoch": 0.34677904876580373,
"grad_norm": 0.49686506390571594,
"learning_rate": 8.257376792533813e-07,
"loss": 0.2872,
"step": 2880
},
{
"epoch": 0.3479831426851294,
"grad_norm": 0.4665164649486542,
"learning_rate": 8.241403245061584e-07,
"loss": 0.2816,
"step": 2890
},
{
"epoch": 0.34918723660445516,
"grad_norm": 0.4437556266784668,
"learning_rate": 8.225372427747813e-07,
"loss": 0.286,
"step": 2900
},
{
"epoch": 0.35039133052378085,
"grad_norm": 0.5280335545539856,
"learning_rate": 8.209284623828583e-07,
"loss": 0.2895,
"step": 2910
},
{
"epoch": 0.35159542444310654,
"grad_norm": 0.5298367142677307,
"learning_rate": 8.193140117546832e-07,
"loss": 0.282,
"step": 2920
},
{
"epoch": 0.3527995183624323,
"grad_norm": 0.7123149633407593,
"learning_rate": 8.176939194147329e-07,
"loss": 0.2841,
"step": 2930
},
{
"epoch": 0.354003612281758,
"grad_norm": 0.6565315127372742,
"learning_rate": 8.160682139871632e-07,
"loss": 0.2793,
"step": 2940
},
{
"epoch": 0.35520770620108366,
"grad_norm": 0.7005172967910767,
"learning_rate": 8.144369241953032e-07,
"loss": 0.2854,
"step": 2950
},
{
"epoch": 0.3564118001204094,
"grad_norm": 0.7468757033348083,
"learning_rate": 8.128000788611478e-07,
"loss": 0.2992,
"step": 2960
},
{
"epoch": 0.3576158940397351,
"grad_norm": 0.5055456161499023,
"learning_rate": 8.111577069048487e-07,
"loss": 0.2979,
"step": 2970
},
{
"epoch": 0.3588199879590608,
"grad_norm": 0.576806366443634,
"learning_rate": 8.095098373442027e-07,
"loss": 0.2915,
"step": 2980
},
{
"epoch": 0.3600240818783865,
"grad_norm": 0.5598990321159363,
"learning_rate": 8.078564992941401e-07,
"loss": 0.2741,
"step": 2990
},
{
"epoch": 0.3612281757977122,
"grad_norm": 0.5614596009254456,
"learning_rate": 8.061977219662092e-07,
"loss": 0.2913,
"step": 3000
},
{
"epoch": 0.3624322697170379,
"grad_norm": 0.37974095344543457,
"learning_rate": 8.045335346680611e-07,
"loss": 0.2787,
"step": 3010
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.6439441442489624,
"learning_rate": 8.028639668029309e-07,
"loss": 0.2868,
"step": 3020
},
{
"epoch": 0.36484045755568933,
"grad_norm": 0.46323299407958984,
"learning_rate": 8.011890478691196e-07,
"loss": 0.2831,
"step": 3030
},
{
"epoch": 0.3660445514750151,
"grad_norm": 0.4963575005531311,
"learning_rate": 7.995088074594713e-07,
"loss": 0.2782,
"step": 3040
},
{
"epoch": 0.36724864539434077,
"grad_norm": 0.6179429888725281,
"learning_rate": 7.978232752608516e-07,
"loss": 0.2703,
"step": 3050
},
{
"epoch": 0.36845273931366646,
"grad_norm": 0.5127160549163818,
"learning_rate": 7.961324810536223e-07,
"loss": 0.3007,
"step": 3060
},
{
"epoch": 0.3696568332329922,
"grad_norm": 0.45177775621414185,
"learning_rate": 7.94436454711116e-07,
"loss": 0.288,
"step": 3070
},
{
"epoch": 0.3708609271523179,
"grad_norm": 0.47144508361816406,
"learning_rate": 7.927352261991074e-07,
"loss": 0.2901,
"step": 3080
},
{
"epoch": 0.3720650210716436,
"grad_norm": 0.5511527061462402,
"learning_rate": 7.910288255752844e-07,
"loss": 0.2754,
"step": 3090
},
{
"epoch": 0.3732691149909693,
"grad_norm": 0.5164305567741394,
"learning_rate": 7.893172829887171e-07,
"loss": 0.2847,
"step": 3100
},
{
"epoch": 0.374473208910295,
"grad_norm": 0.5629504919052124,
"learning_rate": 7.876006286793251e-07,
"loss": 0.2953,
"step": 3110
},
{
"epoch": 0.3756773028296207,
"grad_norm": 0.513200044631958,
"learning_rate": 7.858788929773422e-07,
"loss": 0.2702,
"step": 3120
},
{
"epoch": 0.37688139674894644,
"grad_norm": 0.504371166229248,
"learning_rate": 7.841521063027825e-07,
"loss": 0.2873,
"step": 3130
},
{
"epoch": 0.37808549066827213,
"grad_norm": 0.613593578338623,
"learning_rate": 7.824202991649013e-07,
"loss": 0.27,
"step": 3140
},
{
"epoch": 0.3792895845875978,
"grad_norm": 0.7345304489135742,
"learning_rate": 7.806835021616564e-07,
"loss": 0.2895,
"step": 3150
},
{
"epoch": 0.38049367850692356,
"grad_norm": 0.48514464497566223,
"learning_rate": 7.789417459791681e-07,
"loss": 0.2809,
"step": 3160
},
{
"epoch": 0.38169777242624925,
"grad_norm": 0.4638960063457489,
"learning_rate": 7.77195061391176e-07,
"loss": 0.2839,
"step": 3170
},
{
"epoch": 0.38290186634557494,
"grad_norm": 0.5008341073989868,
"learning_rate": 7.754434792584968e-07,
"loss": 0.2701,
"step": 3180
},
{
"epoch": 0.3841059602649007,
"grad_norm": 0.5258957743644714,
"learning_rate": 7.73687030528477e-07,
"loss": 0.2709,
"step": 3190
},
{
"epoch": 0.38531005418422637,
"grad_norm": 0.5781968832015991,
"learning_rate": 7.719257462344481e-07,
"loss": 0.2994,
"step": 3200
},
{
"epoch": 0.38651414810355206,
"grad_norm": 0.5485130548477173,
"learning_rate": 7.701596574951771e-07,
"loss": 0.3001,
"step": 3210
},
{
"epoch": 0.3877182420228778,
"grad_norm": 0.4708418846130371,
"learning_rate": 7.683887955143169e-07,
"loss": 0.2736,
"step": 3220
},
{
"epoch": 0.3889223359422035,
"grad_norm": 0.5321612358093262,
"learning_rate": 7.666131915798556e-07,
"loss": 0.2892,
"step": 3230
},
{
"epoch": 0.3901264298615292,
"grad_norm": 0.524898111820221,
"learning_rate": 7.648328770635623e-07,
"loss": 0.2897,
"step": 3240
},
{
"epoch": 0.3913305237808549,
"grad_norm": 0.4973953664302826,
"learning_rate": 7.630478834204351e-07,
"loss": 0.2804,
"step": 3250
},
{
"epoch": 0.3925346177001806,
"grad_norm": 0.5439997315406799,
"learning_rate": 7.612582421881423e-07,
"loss": 0.2824,
"step": 3260
},
{
"epoch": 0.3937387116195063,
"grad_norm": 0.5040695667266846,
"learning_rate": 7.594639849864681e-07,
"loss": 0.2806,
"step": 3270
},
{
"epoch": 0.39494280553883204,
"grad_norm": 0.57867830991745,
"learning_rate": 7.576651435167523e-07,
"loss": 0.2788,
"step": 3280
},
{
"epoch": 0.39614689945815773,
"grad_norm": 0.43785402178764343,
"learning_rate": 7.558617495613304e-07,
"loss": 0.272,
"step": 3290
},
{
"epoch": 0.3973509933774834,
"grad_norm": 0.6042655110359192,
"learning_rate": 7.540538349829725e-07,
"loss": 0.2918,
"step": 3300
},
{
"epoch": 0.39855508729680916,
"grad_norm": 0.6529451012611389,
"learning_rate": 7.522414317243198e-07,
"loss": 0.2882,
"step": 3310
},
{
"epoch": 0.39975918121613485,
"grad_norm": 0.5043284296989441,
"learning_rate": 7.50424571807321e-07,
"loss": 0.2859,
"step": 3320
},
{
"epoch": 0.40096327513546054,
"grad_norm": 0.44874584674835205,
"learning_rate": 7.486032873326656e-07,
"loss": 0.2912,
"step": 3330
},
{
"epoch": 0.4021673690547863,
"grad_norm": 0.515211284160614,
"learning_rate": 7.467776104792171e-07,
"loss": 0.2747,
"step": 3340
},
{
"epoch": 0.40337146297411197,
"grad_norm": 0.5425666570663452,
"learning_rate": 7.449475735034453e-07,
"loss": 0.2964,
"step": 3350
},
{
"epoch": 0.4045755568934377,
"grad_norm": 0.5557084083557129,
"learning_rate": 7.431132087388546e-07,
"loss": 0.2809,
"step": 3360
},
{
"epoch": 0.4057796508127634,
"grad_norm": 0.4438600540161133,
"learning_rate": 7.412745485954144e-07,
"loss": 0.269,
"step": 3370
},
{
"epoch": 0.4069837447320891,
"grad_norm": 0.586608350276947,
"learning_rate": 7.394316255589854e-07,
"loss": 0.2848,
"step": 3380
},
{
"epoch": 0.40818783865141484,
"grad_norm": 0.6429834961891174,
"learning_rate": 7.375844721907466e-07,
"loss": 0.2917,
"step": 3390
},
{
"epoch": 0.4093919325707405,
"grad_norm": 0.5150188207626343,
"learning_rate": 7.35733121126619e-07,
"loss": 0.2772,
"step": 3400
},
{
"epoch": 0.4105960264900662,
"grad_norm": 0.5537393093109131,
"learning_rate": 7.338776050766896e-07,
"loss": 0.2819,
"step": 3410
},
{
"epoch": 0.41180012040939196,
"grad_norm": 0.4834784269332886,
"learning_rate": 7.320179568246333e-07,
"loss": 0.2851,
"step": 3420
},
{
"epoch": 0.41300421432871764,
"grad_norm": 0.6806831955909729,
"learning_rate": 7.301542092271337e-07,
"loss": 0.2841,
"step": 3430
},
{
"epoch": 0.41420830824804333,
"grad_norm": 0.5081019997596741,
"learning_rate": 7.282863952133022e-07,
"loss": 0.2763,
"step": 3440
},
{
"epoch": 0.4154124021673691,
"grad_norm": 0.5681424140930176,
"learning_rate": 7.264145477840974e-07,
"loss": 0.2719,
"step": 3450
},
{
"epoch": 0.41661649608669477,
"grad_norm": 0.6257504820823669,
"learning_rate": 7.245387000117404e-07,
"loss": 0.2813,
"step": 3460
},
{
"epoch": 0.41782059000602045,
"grad_norm": 0.5195356607437134,
"learning_rate": 7.226588850391317e-07,
"loss": 0.2761,
"step": 3470
},
{
"epoch": 0.4190246839253462,
"grad_norm": 0.5490323305130005,
"learning_rate": 7.207751360792647e-07,
"loss": 0.291,
"step": 3480
},
{
"epoch": 0.4202287778446719,
"grad_norm": 0.6458017230033875,
"learning_rate": 7.188874864146397e-07,
"loss": 0.2919,
"step": 3490
},
{
"epoch": 0.4214328717639976,
"grad_norm": 0.5081551671028137,
"learning_rate": 7.16995969396676e-07,
"loss": 0.2762,
"step": 3500
},
{
"epoch": 0.4226369656833233,
"grad_norm": 0.6496263742446899,
"learning_rate": 7.151006184451212e-07,
"loss": 0.2766,
"step": 3510
},
{
"epoch": 0.423841059602649,
"grad_norm": 0.6383594870567322,
"learning_rate": 7.132014670474625e-07,
"loss": 0.2829,
"step": 3520
},
{
"epoch": 0.4250451535219747,
"grad_norm": 0.6374247074127197,
"learning_rate": 7.112985487583333e-07,
"loss": 0.2776,
"step": 3530
},
{
"epoch": 0.42624924744130044,
"grad_norm": 0.48250874876976013,
"learning_rate": 7.093918971989229e-07,
"loss": 0.2794,
"step": 3540
},
{
"epoch": 0.4274533413606261,
"grad_norm": 0.5055521726608276,
"learning_rate": 7.07481546056379e-07,
"loss": 0.2818,
"step": 3550
},
{
"epoch": 0.4286574352799518,
"grad_norm": 0.558320164680481,
"learning_rate": 7.055675290832157e-07,
"loss": 0.29,
"step": 3560
},
{
"epoch": 0.42986152919927756,
"grad_norm": 0.54196697473526,
"learning_rate": 7.036498800967153e-07,
"loss": 0.2819,
"step": 3570
},
{
"epoch": 0.43106562311860325,
"grad_norm": 0.5442371368408203,
"learning_rate": 7.017286329783314e-07,
"loss": 0.3044,
"step": 3580
},
{
"epoch": 0.43226971703792894,
"grad_norm": 0.531579315662384,
"learning_rate": 6.9980382167309e-07,
"loss": 0.2875,
"step": 3590
},
{
"epoch": 0.4334738109572547,
"grad_norm": 0.6069034934043884,
"learning_rate": 6.978754801889902e-07,
"loss": 0.2915,
"step": 3600
},
{
"epoch": 0.43467790487658037,
"grad_norm": 0.5376235246658325,
"learning_rate": 6.959436425964033e-07,
"loss": 0.2768,
"step": 3610
},
{
"epoch": 0.43588199879590606,
"grad_norm": 0.5438763499259949,
"learning_rate": 6.9400834302747e-07,
"loss": 0.2911,
"step": 3620
},
{
"epoch": 0.4370860927152318,
"grad_norm": 0.4325105547904968,
"learning_rate": 6.920696156754985e-07,
"loss": 0.269,
"step": 3630
},
{
"epoch": 0.4382901866345575,
"grad_norm": 0.5107905864715576,
"learning_rate": 6.901274947943597e-07,
"loss": 0.2754,
"step": 3640
},
{
"epoch": 0.4394942805538832,
"grad_norm": 0.5302306413650513,
"learning_rate": 6.881820146978822e-07,
"loss": 0.2835,
"step": 3650
},
{
"epoch": 0.4406983744732089,
"grad_norm": 0.5489309430122375,
"learning_rate": 6.862332097592457e-07,
"loss": 0.2746,
"step": 3660
},
{
"epoch": 0.4419024683925346,
"grad_norm": 0.4515032172203064,
"learning_rate": 6.842811144103743e-07,
"loss": 0.2829,
"step": 3670
},
{
"epoch": 0.44310656231186035,
"grad_norm": 0.5359588861465454,
"learning_rate": 6.823257631413275e-07,
"loss": 0.2826,
"step": 3680
},
{
"epoch": 0.44431065623118604,
"grad_norm": 0.49561506509780884,
"learning_rate": 6.803671904996916e-07,
"loss": 0.2946,
"step": 3690
},
{
"epoch": 0.44551475015051173,
"grad_norm": 0.43841075897216797,
"learning_rate": 6.784054310899683e-07,
"loss": 0.2802,
"step": 3700
},
{
"epoch": 0.4467188440698375,
"grad_norm": 0.7528261542320251,
"learning_rate": 6.764405195729639e-07,
"loss": 0.2829,
"step": 3710
},
{
"epoch": 0.44792293798916316,
"grad_norm": 1.1440777778625488,
"learning_rate": 6.744724906651774e-07,
"loss": 0.2665,
"step": 3720
},
{
"epoch": 0.44912703190848885,
"grad_norm": 0.5153807997703552,
"learning_rate": 6.72501379138186e-07,
"loss": 0.2754,
"step": 3730
},
{
"epoch": 0.4503311258278146,
"grad_norm": 0.582036554813385,
"learning_rate": 6.705272198180312e-07,
"loss": 0.2818,
"step": 3740
},
{
"epoch": 0.4515352197471403,
"grad_norm": 0.7196856737136841,
"learning_rate": 6.685500475846044e-07,
"loss": 0.2744,
"step": 3750
},
{
"epoch": 0.45273931366646597,
"grad_norm": 1.0595272779464722,
"learning_rate": 6.665698973710288e-07,
"loss": 0.2602,
"step": 3760
},
{
"epoch": 0.4539434075857917,
"grad_norm": 0.4910378158092499,
"learning_rate": 6.645868041630439e-07,
"loss": 0.2887,
"step": 3770
},
{
"epoch": 0.4551475015051174,
"grad_norm": 0.4395122230052948,
"learning_rate": 6.626008029983867e-07,
"loss": 0.2771,
"step": 3780
},
{
"epoch": 0.4563515954244431,
"grad_norm": 0.5630185008049011,
"learning_rate": 6.606119289661721e-07,
"loss": 0.2976,
"step": 3790
},
{
"epoch": 0.45755568934376883,
"grad_norm": 0.6062456965446472,
"learning_rate": 6.58620217206274e-07,
"loss": 0.2707,
"step": 3800
},
{
"epoch": 0.4587597832630945,
"grad_norm": 0.6882142424583435,
"learning_rate": 6.566257029087039e-07,
"loss": 0.2732,
"step": 3810
},
{
"epoch": 0.4599638771824202,
"grad_norm": 0.4631926417350769,
"learning_rate": 6.546284213129885e-07,
"loss": 0.2794,
"step": 3820
},
{
"epoch": 0.46116797110174595,
"grad_norm": 0.4465793967247009,
"learning_rate": 6.526284077075488e-07,
"loss": 0.2809,
"step": 3830
},
{
"epoch": 0.46237206502107164,
"grad_norm": 0.5073222517967224,
"learning_rate": 6.506256974290747e-07,
"loss": 0.2908,
"step": 3840
},
{
"epoch": 0.46357615894039733,
"grad_norm": 0.5717306137084961,
"learning_rate": 6.486203258619016e-07,
"loss": 0.282,
"step": 3850
},
{
"epoch": 0.4647802528597231,
"grad_norm": 0.5614638924598694,
"learning_rate": 6.466123284373858e-07,
"loss": 0.2764,
"step": 3860
},
{
"epoch": 0.46598434677904876,
"grad_norm": 0.626006007194519,
"learning_rate": 6.446017406332772e-07,
"loss": 0.277,
"step": 3870
},
{
"epoch": 0.46718844069837445,
"grad_norm": 0.47509709000587463,
"learning_rate": 6.425885979730933e-07,
"loss": 0.2828,
"step": 3880
},
{
"epoch": 0.4683925346177002,
"grad_norm": 0.5545176267623901,
"learning_rate": 6.405729360254914e-07,
"loss": 0.2893,
"step": 3890
},
{
"epoch": 0.4695966285370259,
"grad_norm": 0.4888879060745239,
"learning_rate": 6.3855479040364e-07,
"loss": 0.2811,
"step": 3900
},
{
"epoch": 0.4708007224563516,
"grad_norm": 0.44063079357147217,
"learning_rate": 6.365341967645902e-07,
"loss": 0.2782,
"step": 3910
},
{
"epoch": 0.4720048163756773,
"grad_norm": 0.5356207489967346,
"learning_rate": 6.345111908086444e-07,
"loss": 0.2658,
"step": 3920
},
{
"epoch": 0.473208910295003,
"grad_norm": 0.5134460926055908,
"learning_rate": 6.324858082787275e-07,
"loss": 0.2782,
"step": 3930
},
{
"epoch": 0.4744130042143287,
"grad_norm": 0.5685980916023254,
"learning_rate": 6.304580849597527e-07,
"loss": 0.2704,
"step": 3940
},
{
"epoch": 0.47561709813365444,
"grad_norm": 0.8610411286354065,
"learning_rate": 6.284280566779923e-07,
"loss": 0.29,
"step": 3950
},
{
"epoch": 0.4768211920529801,
"grad_norm": 0.5496920943260193,
"learning_rate": 6.263957593004421e-07,
"loss": 0.2704,
"step": 3960
},
{
"epoch": 0.4780252859723058,
"grad_norm": 0.4593532383441925,
"learning_rate": 6.243612287341896e-07,
"loss": 0.2806,
"step": 3970
},
{
"epoch": 0.47922937989163156,
"grad_norm": 0.5178139805793762,
"learning_rate": 6.223245009257783e-07,
"loss": 0.2683,
"step": 3980
},
{
"epoch": 0.48043347381095725,
"grad_norm": 0.6350088119506836,
"learning_rate": 6.20285611860573e-07,
"loss": 0.2796,
"step": 3990
},
{
"epoch": 0.481637567730283,
"grad_norm": 0.4848230183124542,
"learning_rate": 6.182445975621246e-07,
"loss": 0.2727,
"step": 4000
},
{
"epoch": 0.4828416616496087,
"grad_norm": 0.6039783358573914,
"learning_rate": 6.162014940915323e-07,
"loss": 0.295,
"step": 4010
},
{
"epoch": 0.48404575556893437,
"grad_norm": 0.5623034834861755,
"learning_rate": 6.141563375468082e-07,
"loss": 0.2843,
"step": 4020
},
{
"epoch": 0.4852498494882601,
"grad_norm": 0.5298231244087219,
"learning_rate": 6.12109164062238e-07,
"loss": 0.2685,
"step": 4030
},
{
"epoch": 0.4864539434075858,
"grad_norm": 0.49439486861228943,
"learning_rate": 6.100600098077431e-07,
"loss": 0.2588,
"step": 4040
},
{
"epoch": 0.4876580373269115,
"grad_norm": 0.4667768180370331,
"learning_rate": 6.080089109882418e-07,
"loss": 0.275,
"step": 4050
},
{
"epoch": 0.48886213124623723,
"grad_norm": 0.5490863919258118,
"learning_rate": 6.059559038430094e-07,
"loss": 0.2837,
"step": 4060
},
{
"epoch": 0.4900662251655629,
"grad_norm": 0.467192143201828,
"learning_rate": 6.039010246450376e-07,
"loss": 0.2733,
"step": 4070
},
{
"epoch": 0.4912703190848886,
"grad_norm": 0.49663642048835754,
"learning_rate": 6.018443097003945e-07,
"loss": 0.2738,
"step": 4080
},
{
"epoch": 0.49247441300421435,
"grad_norm": 0.501777708530426,
"learning_rate": 5.997857953475823e-07,
"loss": 0.2743,
"step": 4090
},
{
"epoch": 0.49367850692354004,
"grad_norm": 0.5064652562141418,
"learning_rate": 5.977255179568955e-07,
"loss": 0.2748,
"step": 4100
},
{
"epoch": 0.4948826008428657,
"grad_norm": 0.6248656511306763,
"learning_rate": 5.956635139297783e-07,
"loss": 0.2765,
"step": 4110
},
{
"epoch": 0.49608669476219147,
"grad_norm": 0.45688706636428833,
"learning_rate": 5.935998196981817e-07,
"loss": 0.271,
"step": 4120
},
{
"epoch": 0.49729078868151716,
"grad_norm": 0.7225250601768494,
"learning_rate": 5.915344717239197e-07,
"loss": 0.2853,
"step": 4130
},
{
"epoch": 0.49849488260084285,
"grad_norm": 0.5863081812858582,
"learning_rate": 5.894675064980246e-07,
"loss": 0.2685,
"step": 4140
},
{
"epoch": 0.4996989765201686,
"grad_norm": 0.5770187973976135,
"learning_rate": 5.87398960540103e-07,
"loss": 0.2774,
"step": 4150
},
{
"epoch": 0.5009030704394943,
"grad_norm": 0.41943806409835815,
"learning_rate": 5.8532887039769e-07,
"loss": 0.2622,
"step": 4160
},
{
"epoch": 0.50210716435882,
"grad_norm": 0.6374907493591309,
"learning_rate": 5.832572726456039e-07,
"loss": 0.2858,
"step": 4170
},
{
"epoch": 0.5033112582781457,
"grad_norm": 0.5210843086242676,
"learning_rate": 5.811842038852996e-07,
"loss": 0.2706,
"step": 4180
},
{
"epoch": 0.5045153521974715,
"grad_norm": 0.596387505531311,
"learning_rate": 5.791097007442222e-07,
"loss": 0.2823,
"step": 4190
},
{
"epoch": 0.5057194461167971,
"grad_norm": 0.6676878929138184,
"learning_rate": 5.7703379987516e-07,
"loss": 0.2848,
"step": 4200
},
{
"epoch": 0.5069235400361228,
"grad_norm": 0.6097555160522461,
"learning_rate": 5.749565379555961e-07,
"loss": 0.2766,
"step": 4210
},
{
"epoch": 0.5081276339554486,
"grad_norm": 0.6043739318847656,
"learning_rate": 5.728779516870615e-07,
"loss": 0.2885,
"step": 4220
},
{
"epoch": 0.5093317278747742,
"grad_norm": 0.5565124750137329,
"learning_rate": 5.707980777944859e-07,
"loss": 0.2643,
"step": 4230
},
{
"epoch": 0.5105358217941,
"grad_norm": 0.49649959802627563,
"learning_rate": 5.687169530255487e-07,
"loss": 0.2672,
"step": 4240
},
{
"epoch": 0.5117399157134257,
"grad_norm": 0.49968451261520386,
"learning_rate": 5.666346141500307e-07,
"loss": 0.2754,
"step": 4250
},
{
"epoch": 0.5129440096327513,
"grad_norm": 0.4982677698135376,
"learning_rate": 5.645510979591634e-07,
"loss": 0.2785,
"step": 4260
},
{
"epoch": 0.5141481035520771,
"grad_norm": 0.904083251953125,
"learning_rate": 5.624664412649797e-07,
"loss": 0.2833,
"step": 4270
},
{
"epoch": 0.5153521974714028,
"grad_norm": 0.5038682222366333,
"learning_rate": 5.603806808996625e-07,
"loss": 0.2746,
"step": 4280
},
{
"epoch": 0.5165562913907285,
"grad_norm": 0.7115175724029541,
"learning_rate": 5.58293853714895e-07,
"loss": 0.2712,
"step": 4290
},
{
"epoch": 0.5177603853100542,
"grad_norm": 0.5522176027297974,
"learning_rate": 5.562059965812097e-07,
"loss": 0.2869,
"step": 4300
},
{
"epoch": 0.5189644792293799,
"grad_norm": 0.6081178784370422,
"learning_rate": 5.541171463873357e-07,
"loss": 0.2751,
"step": 4310
},
{
"epoch": 0.5201685731487056,
"grad_norm": 0.5689599514007568,
"learning_rate": 5.52027340039548e-07,
"loss": 0.2875,
"step": 4320
},
{
"epoch": 0.5213726670680313,
"grad_norm": 0.43370601534843445,
"learning_rate": 5.499366144610153e-07,
"loss": 0.2673,
"step": 4330
},
{
"epoch": 0.5225767609873571,
"grad_norm": 0.5115625262260437,
"learning_rate": 5.478450065911473e-07,
"loss": 0.2791,
"step": 4340
},
{
"epoch": 0.5237808549066827,
"grad_norm": 0.518798291683197,
"learning_rate": 5.45752553384942e-07,
"loss": 0.277,
"step": 4350
},
{
"epoch": 0.5249849488260084,
"grad_norm": 0.5628324151039124,
"learning_rate": 5.436592918123337e-07,
"loss": 0.2884,
"step": 4360
},
{
"epoch": 0.5261890427453342,
"grad_norm": 0.47458890080451965,
"learning_rate": 5.415652588575385e-07,
"loss": 0.27,
"step": 4370
},
{
"epoch": 0.5273931366646598,
"grad_norm": 0.6163709759712219,
"learning_rate": 5.394704915184014e-07,
"loss": 0.2643,
"step": 4380
},
{
"epoch": 0.5285972305839856,
"grad_norm": 0.44985631108283997,
"learning_rate": 5.373750268057431e-07,
"loss": 0.2774,
"step": 4390
},
{
"epoch": 0.5298013245033113,
"grad_norm": 0.47572416067123413,
"learning_rate": 5.352789017427052e-07,
"loss": 0.278,
"step": 4400
},
{
"epoch": 0.5310054184226369,
"grad_norm": 0.5311432480812073,
"learning_rate": 5.33182153364097e-07,
"loss": 0.283,
"step": 4410
},
{
"epoch": 0.5322095123419627,
"grad_norm": 0.5810163617134094,
"learning_rate": 5.310848187157403e-07,
"loss": 0.257,
"step": 4420
},
{
"epoch": 0.5334136062612884,
"grad_norm": 0.8989514708518982,
"learning_rate": 5.289869348538153e-07,
"loss": 0.2846,
"step": 4430
},
{
"epoch": 0.534617700180614,
"grad_norm": 0.4534051716327667,
"learning_rate": 5.26888538844206e-07,
"loss": 0.2836,
"step": 4440
},
{
"epoch": 0.5358217940999398,
"grad_norm": 0.4670819938182831,
"learning_rate": 5.247896677618452e-07,
"loss": 0.2614,
"step": 4450
},
{
"epoch": 0.5370258880192655,
"grad_norm": 0.5935913324356079,
"learning_rate": 5.226903586900587e-07,
"loss": 0.2826,
"step": 4460
},
{
"epoch": 0.5382299819385912,
"grad_norm": 0.45839351415634155,
"learning_rate": 5.205906487199119e-07,
"loss": 0.2514,
"step": 4470
},
{
"epoch": 0.5394340758579169,
"grad_norm": 0.4929831624031067,
"learning_rate": 5.184905749495525e-07,
"loss": 0.2815,
"step": 4480
},
{
"epoch": 0.5406381697772427,
"grad_norm": 0.529437780380249,
"learning_rate": 5.163901744835564e-07,
"loss": 0.2744,
"step": 4490
},
{
"epoch": 0.5418422636965683,
"grad_norm": 0.44370970129966736,
"learning_rate": 5.14289484432271e-07,
"loss": 0.2837,
"step": 4500
},
{
"epoch": 0.543046357615894,
"grad_norm": 0.46680358052253723,
"learning_rate": 5.121885419111611e-07,
"loss": 0.2833,
"step": 4510
},
{
"epoch": 0.5442504515352198,
"grad_norm": 0.5581067204475403,
"learning_rate": 5.100873840401513e-07,
"loss": 0.2846,
"step": 4520
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.4683559238910675,
"learning_rate": 5.079860479429718e-07,
"loss": 0.2666,
"step": 4530
},
{
"epoch": 0.5466586393738712,
"grad_norm": 0.464067280292511,
"learning_rate": 5.058845707465009e-07,
"loss": 0.2693,
"step": 4540
},
{
"epoch": 0.5478627332931969,
"grad_norm": 0.5715063214302063,
"learning_rate": 5.037829895801106e-07,
"loss": 0.2746,
"step": 4550
},
{
"epoch": 0.5490668272125225,
"grad_norm": 0.585356593132019,
"learning_rate": 5.016813415750097e-07,
"loss": 0.281,
"step": 4560
},
{
"epoch": 0.5502709211318483,
"grad_norm": 0.4893047511577606,
"learning_rate": 4.995796638635875e-07,
"loss": 0.2799,
"step": 4570
},
{
"epoch": 0.551475015051174,
"grad_norm": 1.0689632892608643,
"learning_rate": 4.974779935787589e-07,
"loss": 0.2574,
"step": 4580
},
{
"epoch": 0.5526791089704997,
"grad_norm": 0.6054455637931824,
"learning_rate": 4.953763678533068e-07,
"loss": 0.2635,
"step": 4590
},
{
"epoch": 0.5538832028898254,
"grad_norm": 0.46325477957725525,
"learning_rate": 4.932748238192273e-07,
"loss": 0.2769,
"step": 4600
},
{
"epoch": 0.5550872968091511,
"grad_norm": 0.5770764350891113,
"learning_rate": 4.911733986070735e-07,
"loss": 0.2671,
"step": 4610
},
{
"epoch": 0.5562913907284768,
"grad_norm": 0.5715611577033997,
"learning_rate": 4.890721293452979e-07,
"loss": 0.2917,
"step": 4620
},
{
"epoch": 0.5574954846478025,
"grad_norm": 0.5384266972541809,
"learning_rate": 4.869710531595988e-07,
"loss": 0.2771,
"step": 4630
},
{
"epoch": 0.5586995785671283,
"grad_norm": 0.4611688256263733,
"learning_rate": 4.848702071722629e-07,
"loss": 0.2828,
"step": 4640
},
{
"epoch": 0.5599036724864539,
"grad_norm": 0.6118834018707275,
"learning_rate": 4.827696285015094e-07,
"loss": 0.2832,
"step": 4650
},
{
"epoch": 0.5611077664057796,
"grad_norm": 0.5026919841766357,
"learning_rate": 4.806693542608348e-07,
"loss": 0.2735,
"step": 4660
},
{
"epoch": 0.5623118603251054,
"grad_norm": 0.548273503780365,
"learning_rate": 4.785694215583566e-07,
"loss": 0.2742,
"step": 4670
},
{
"epoch": 0.563515954244431,
"grad_norm": 0.6186013221740723,
"learning_rate": 4.764698674961581e-07,
"loss": 0.2784,
"step": 4680
},
{
"epoch": 0.5647200481637568,
"grad_norm": 0.45300328731536865,
"learning_rate": 4.743707291696329e-07,
"loss": 0.2786,
"step": 4690
},
{
"epoch": 0.5659241420830825,
"grad_norm": 0.49064886569976807,
"learning_rate": 4.7227204366682873e-07,
"loss": 0.2747,
"step": 4700
},
{
"epoch": 0.5671282360024081,
"grad_norm": 0.5186241865158081,
"learning_rate": 4.7017384806779336e-07,
"loss": 0.2788,
"step": 4710
},
{
"epoch": 0.5683323299217339,
"grad_norm": 0.5284368395805359,
"learning_rate": 4.6807617944391843e-07,
"loss": 0.264,
"step": 4720
},
{
"epoch": 0.5695364238410596,
"grad_norm": 0.5770208239555359,
"learning_rate": 4.6597907485728477e-07,
"loss": 0.2759,
"step": 4730
},
{
"epoch": 0.5707405177603853,
"grad_norm": 0.5039085149765015,
"learning_rate": 4.6388257136000807e-07,
"loss": 0.2807,
"step": 4740
},
{
"epoch": 0.571944611679711,
"grad_norm": 1.2547776699066162,
"learning_rate": 4.617867059935838e-07,
"loss": 0.2651,
"step": 4750
},
{
"epoch": 0.5731487055990367,
"grad_norm": 0.5457895398139954,
"learning_rate": 4.5969151578823224e-07,
"loss": 0.27,
"step": 4760
},
{
"epoch": 0.5743527995183624,
"grad_norm": 0.4974658787250519,
"learning_rate": 4.5759703776224555e-07,
"loss": 0.2794,
"step": 4770
},
{
"epoch": 0.5755568934376881,
"grad_norm": 0.5161871314048767,
"learning_rate": 4.555033089213321e-07,
"loss": 0.2816,
"step": 4780
},
{
"epoch": 0.5767609873570139,
"grad_norm": 0.43015995621681213,
"learning_rate": 4.534103662579642e-07,
"loss": 0.267,
"step": 4790
},
{
"epoch": 0.5779650812763396,
"grad_norm": 0.4864785969257355,
"learning_rate": 4.5131824675072364e-07,
"loss": 0.2793,
"step": 4800
},
{
"epoch": 0.5791691751956652,
"grad_norm": 0.6006112694740295,
"learning_rate": 4.492269873636482e-07,
"loss": 0.2689,
"step": 4810
},
{
"epoch": 0.580373269114991,
"grad_norm": 0.4434204399585724,
"learning_rate": 4.4713662504557927e-07,
"loss": 0.2876,
"step": 4820
},
{
"epoch": 0.5815773630343167,
"grad_norm": 0.565077543258667,
"learning_rate": 4.450471967295083e-07,
"loss": 0.2658,
"step": 4830
},
{
"epoch": 0.5827814569536424,
"grad_norm": 0.5381281971931458,
"learning_rate": 4.429587393319246e-07,
"loss": 0.2715,
"step": 4840
},
{
"epoch": 0.5839855508729681,
"grad_norm": 0.49021026492118835,
"learning_rate": 4.408712897521633e-07,
"loss": 0.2688,
"step": 4850
},
{
"epoch": 0.5851896447922939,
"grad_norm": 0.5293102264404297,
"learning_rate": 4.3878488487175323e-07,
"loss": 0.2604,
"step": 4860
},
{
"epoch": 0.5863937387116195,
"grad_norm": 0.6353856921195984,
"learning_rate": 4.3669956155376476e-07,
"loss": 0.2586,
"step": 4870
},
{
"epoch": 0.5875978326309452,
"grad_norm": 0.5306446552276611,
"learning_rate": 4.3461535664215923e-07,
"loss": 0.2624,
"step": 4880
},
{
"epoch": 0.588801926550271,
"grad_norm": 0.5957462191581726,
"learning_rate": 4.325323069611383e-07,
"loss": 0.2731,
"step": 4890
},
{
"epoch": 0.5900060204695966,
"grad_norm": 0.6803829073905945,
"learning_rate": 4.3045044931449156e-07,
"loss": 0.2779,
"step": 4900
},
{
"epoch": 0.5912101143889223,
"grad_norm": 0.5501326322555542,
"learning_rate": 4.2836982048494854e-07,
"loss": 0.2675,
"step": 4910
},
{
"epoch": 0.5924142083082481,
"grad_norm": 0.49481987953186035,
"learning_rate": 4.262904572335272e-07,
"loss": 0.2725,
"step": 4920
},
{
"epoch": 0.5936183022275737,
"grad_norm": 0.5254814028739929,
"learning_rate": 4.242123962988851e-07,
"loss": 0.2804,
"step": 4930
},
{
"epoch": 0.5948223961468995,
"grad_norm": 0.5598310232162476,
"learning_rate": 4.2213567439667037e-07,
"loss": 0.2703,
"step": 4940
},
{
"epoch": 0.5960264900662252,
"grad_norm": 0.5715354681015015,
"learning_rate": 4.200603282188724e-07,
"loss": 0.2799,
"step": 4950
},
{
"epoch": 0.5972305839855508,
"grad_norm": 0.6474336981773376,
"learning_rate": 4.179863944331743e-07,
"loss": 0.2799,
"step": 4960
},
{
"epoch": 0.5984346779048766,
"grad_norm": 0.47116249799728394,
"learning_rate": 4.15913909682305e-07,
"loss": 0.2751,
"step": 4970
},
{
"epoch": 0.5996387718242023,
"grad_norm": 0.5750442147254944,
"learning_rate": 4.138429105833906e-07,
"loss": 0.2719,
"step": 4980
},
{
"epoch": 0.600842865743528,
"grad_norm": 0.5243822932243347,
"learning_rate": 4.1177343372730923e-07,
"loss": 0.2709,
"step": 4990
},
{
"epoch": 0.6020469596628537,
"grad_norm": 0.5334904789924622,
"learning_rate": 4.097055156780437e-07,
"loss": 0.272,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 8305,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1967389652549632.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}