groot-test-1 / checkpoint-7000 /trainer_state.json
Ofiroz91's picture
Add files using upload-large-folder tool
b43e596 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.6397282736003747,
"eval_steps": 500,
"global_step": 7000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0023424689622862497,
"grad_norm": 4.449338436126709,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.3236,
"step": 10
},
{
"epoch": 0.004684937924572499,
"grad_norm": 3.442420721054077,
"learning_rate": 4.000000000000001e-06,
"loss": 1.1552,
"step": 20
},
{
"epoch": 0.007027406886858749,
"grad_norm": 2.458024263381958,
"learning_rate": 6e-06,
"loss": 0.9371,
"step": 30
},
{
"epoch": 0.009369875849144999,
"grad_norm": 2.4206013679504395,
"learning_rate": 8.000000000000001e-06,
"loss": 1.1333,
"step": 40
},
{
"epoch": 0.011712344811431248,
"grad_norm": 4.484491348266602,
"learning_rate": 1e-05,
"loss": 0.7988,
"step": 50
},
{
"epoch": 0.014054813773717497,
"grad_norm": 7.087528228759766,
"learning_rate": 1.2e-05,
"loss": 0.8714,
"step": 60
},
{
"epoch": 0.016397282736003747,
"grad_norm": 2.9479169845581055,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.5826,
"step": 70
},
{
"epoch": 0.018739751698289998,
"grad_norm": 2.2344982624053955,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.457,
"step": 80
},
{
"epoch": 0.02108222066057625,
"grad_norm": 1.1311728954315186,
"learning_rate": 1.8e-05,
"loss": 0.3638,
"step": 90
},
{
"epoch": 0.023424689622862496,
"grad_norm": 5.992610931396484,
"learning_rate": 2e-05,
"loss": 0.7519,
"step": 100
},
{
"epoch": 0.025767158585148747,
"grad_norm": 1.328804612159729,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.369,
"step": 110
},
{
"epoch": 0.028109627547434995,
"grad_norm": 2.6690480709075928,
"learning_rate": 2.4e-05,
"loss": 0.2072,
"step": 120
},
{
"epoch": 0.030452096509721246,
"grad_norm": 1.2436017990112305,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.2564,
"step": 130
},
{
"epoch": 0.03279456547200749,
"grad_norm": 2.130502939224243,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.1741,
"step": 140
},
{
"epoch": 0.035137034434293744,
"grad_norm": 1.1833769083023071,
"learning_rate": 3e-05,
"loss": 0.1982,
"step": 150
},
{
"epoch": 0.037479503396579995,
"grad_norm": 0.887791633605957,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.2346,
"step": 160
},
{
"epoch": 0.039821972358866246,
"grad_norm": 2.4128785133361816,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.1967,
"step": 170
},
{
"epoch": 0.0421644413211525,
"grad_norm": 1.2833918333053589,
"learning_rate": 3.6e-05,
"loss": 0.1552,
"step": 180
},
{
"epoch": 0.04450691028343874,
"grad_norm": 1.459666132926941,
"learning_rate": 3.8e-05,
"loss": 0.2237,
"step": 190
},
{
"epoch": 0.04684937924572499,
"grad_norm": 1.7674411535263062,
"learning_rate": 4e-05,
"loss": 0.1619,
"step": 200
},
{
"epoch": 0.049191848208011243,
"grad_norm": 1.2941542863845825,
"learning_rate": 4.2e-05,
"loss": 0.184,
"step": 210
},
{
"epoch": 0.051534317170297494,
"grad_norm": 1.7022488117218018,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.1501,
"step": 220
},
{
"epoch": 0.053876786132583745,
"grad_norm": 0.8502867221832275,
"learning_rate": 4.600000000000001e-05,
"loss": 0.2449,
"step": 230
},
{
"epoch": 0.05621925509486999,
"grad_norm": 2.1729302406311035,
"learning_rate": 4.8e-05,
"loss": 0.141,
"step": 240
},
{
"epoch": 0.05856172405715624,
"grad_norm": 1.9990278482437134,
"learning_rate": 5e-05,
"loss": 0.1569,
"step": 250
},
{
"epoch": 0.06090419301944249,
"grad_norm": 1.0973132848739624,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.1574,
"step": 260
},
{
"epoch": 0.06324666198172874,
"grad_norm": 1.5121344327926636,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.1309,
"step": 270
},
{
"epoch": 0.06558913094401499,
"grad_norm": 1.0041357278823853,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.2048,
"step": 280
},
{
"epoch": 0.06793159990630124,
"grad_norm": 1.9920216798782349,
"learning_rate": 5.8e-05,
"loss": 0.1425,
"step": 290
},
{
"epoch": 0.07027406886858749,
"grad_norm": 0.6136835217475891,
"learning_rate": 6e-05,
"loss": 0.1236,
"step": 300
},
{
"epoch": 0.07261653783087374,
"grad_norm": 1.2063113451004028,
"learning_rate": 6.2e-05,
"loss": 0.1342,
"step": 310
},
{
"epoch": 0.07495900679315999,
"grad_norm": 0.7644496560096741,
"learning_rate": 6.400000000000001e-05,
"loss": 0.1205,
"step": 320
},
{
"epoch": 0.07730147575544624,
"grad_norm": 0.973790168762207,
"learning_rate": 6.6e-05,
"loss": 0.1551,
"step": 330
},
{
"epoch": 0.07964394471773249,
"grad_norm": 1.9004161357879639,
"learning_rate": 6.800000000000001e-05,
"loss": 0.1395,
"step": 340
},
{
"epoch": 0.08198641368001874,
"grad_norm": 0.8575976490974426,
"learning_rate": 7e-05,
"loss": 0.1081,
"step": 350
},
{
"epoch": 0.084328882642305,
"grad_norm": 1.3740334510803223,
"learning_rate": 7.2e-05,
"loss": 0.18,
"step": 360
},
{
"epoch": 0.08667135160459125,
"grad_norm": 0.7421107888221741,
"learning_rate": 7.4e-05,
"loss": 0.1496,
"step": 370
},
{
"epoch": 0.08901382056687748,
"grad_norm": 1.4952155351638794,
"learning_rate": 7.6e-05,
"loss": 0.1491,
"step": 380
},
{
"epoch": 0.09135628952916373,
"grad_norm": 1.0072972774505615,
"learning_rate": 7.800000000000001e-05,
"loss": 0.1282,
"step": 390
},
{
"epoch": 0.09369875849144998,
"grad_norm": 1.719224452972412,
"learning_rate": 8e-05,
"loss": 0.1779,
"step": 400
},
{
"epoch": 0.09604122745373624,
"grad_norm": 1.4302623271942139,
"learning_rate": 8.2e-05,
"loss": 0.1145,
"step": 410
},
{
"epoch": 0.09838369641602249,
"grad_norm": 0.6622968316078186,
"learning_rate": 8.4e-05,
"loss": 0.1159,
"step": 420
},
{
"epoch": 0.10072616537830874,
"grad_norm": 1.0967049598693848,
"learning_rate": 8.6e-05,
"loss": 0.1659,
"step": 430
},
{
"epoch": 0.10306863434059499,
"grad_norm": 1.1332488059997559,
"learning_rate": 8.800000000000001e-05,
"loss": 0.1292,
"step": 440
},
{
"epoch": 0.10541110330288124,
"grad_norm": 1.308289647102356,
"learning_rate": 9e-05,
"loss": 0.1202,
"step": 450
},
{
"epoch": 0.10775357226516749,
"grad_norm": 0.5696719884872437,
"learning_rate": 9.200000000000001e-05,
"loss": 0.1118,
"step": 460
},
{
"epoch": 0.11009604122745374,
"grad_norm": 0.9922944903373718,
"learning_rate": 9.4e-05,
"loss": 0.1644,
"step": 470
},
{
"epoch": 0.11243851018973998,
"grad_norm": 1.5004724264144897,
"learning_rate": 9.6e-05,
"loss": 0.2011,
"step": 480
},
{
"epoch": 0.11478097915202623,
"grad_norm": 0.9503705501556396,
"learning_rate": 9.8e-05,
"loss": 0.1038,
"step": 490
},
{
"epoch": 0.11712344811431248,
"grad_norm": 1.421077013015747,
"learning_rate": 0.0001,
"loss": 0.0944,
"step": 500
},
{
"epoch": 0.11946591707659873,
"grad_norm": 0.8938995599746704,
"learning_rate": 9.999972660400536e-05,
"loss": 0.1216,
"step": 510
},
{
"epoch": 0.12180838603888498,
"grad_norm": 0.46683940291404724,
"learning_rate": 9.999890641901125e-05,
"loss": 0.1278,
"step": 520
},
{
"epoch": 0.12415085500117123,
"grad_norm": 0.8092114925384521,
"learning_rate": 9.999753945398704e-05,
"loss": 0.0794,
"step": 530
},
{
"epoch": 0.12649332396345747,
"grad_norm": 0.27710163593292236,
"learning_rate": 9.99956257238817e-05,
"loss": 0.1266,
"step": 540
},
{
"epoch": 0.12883579292574374,
"grad_norm": 0.81737220287323,
"learning_rate": 9.999316524962345e-05,
"loss": 0.1108,
"step": 550
},
{
"epoch": 0.13117826188802997,
"grad_norm": 0.6735175848007202,
"learning_rate": 9.999015805811965e-05,
"loss": 0.0854,
"step": 560
},
{
"epoch": 0.13352073085031624,
"grad_norm": 0.2487485110759735,
"learning_rate": 9.998660418225645e-05,
"loss": 0.1045,
"step": 570
},
{
"epoch": 0.13586319981260248,
"grad_norm": 0.3255215287208557,
"learning_rate": 9.998250366089848e-05,
"loss": 0.0948,
"step": 580
},
{
"epoch": 0.13820566877488874,
"grad_norm": 0.7749798893928528,
"learning_rate": 9.997785653888835e-05,
"loss": 0.0775,
"step": 590
},
{
"epoch": 0.14054813773717498,
"grad_norm": 1.220957636833191,
"learning_rate": 9.997266286704631e-05,
"loss": 0.1201,
"step": 600
},
{
"epoch": 0.14289060669946124,
"grad_norm": 0.8066214919090271,
"learning_rate": 9.996692270216947e-05,
"loss": 0.0815,
"step": 610
},
{
"epoch": 0.14523307566174748,
"grad_norm": 0.6408377885818481,
"learning_rate": 9.996063610703137e-05,
"loss": 0.1,
"step": 620
},
{
"epoch": 0.14757554462403374,
"grad_norm": 0.8596289753913879,
"learning_rate": 9.995380315038119e-05,
"loss": 0.1008,
"step": 630
},
{
"epoch": 0.14991801358631998,
"grad_norm": 0.972243070602417,
"learning_rate": 9.994642390694308e-05,
"loss": 0.1075,
"step": 640
},
{
"epoch": 0.15226048254860622,
"grad_norm": 0.5220253467559814,
"learning_rate": 9.993849845741524e-05,
"loss": 0.1021,
"step": 650
},
{
"epoch": 0.15460295151089248,
"grad_norm": 0.5453582406044006,
"learning_rate": 9.993002688846913e-05,
"loss": 0.0963,
"step": 660
},
{
"epoch": 0.15694542047317872,
"grad_norm": 0.24789837002754211,
"learning_rate": 9.992100929274846e-05,
"loss": 0.0712,
"step": 670
},
{
"epoch": 0.15928788943546499,
"grad_norm": 0.31857672333717346,
"learning_rate": 9.991144576886823e-05,
"loss": 0.0859,
"step": 680
},
{
"epoch": 0.16163035839775122,
"grad_norm": 0.7285981178283691,
"learning_rate": 9.990133642141359e-05,
"loss": 0.1274,
"step": 690
},
{
"epoch": 0.1639728273600375,
"grad_norm": 1.0549755096435547,
"learning_rate": 9.989068136093873e-05,
"loss": 0.1187,
"step": 700
},
{
"epoch": 0.16631529632232372,
"grad_norm": 0.204506054520607,
"learning_rate": 9.987948070396571e-05,
"loss": 0.1005,
"step": 710
},
{
"epoch": 0.16865776528461,
"grad_norm": 0.4295964241027832,
"learning_rate": 9.986773457298311e-05,
"loss": 0.0937,
"step": 720
},
{
"epoch": 0.17100023424689623,
"grad_norm": 1.0681158304214478,
"learning_rate": 9.985544309644475e-05,
"loss": 0.0855,
"step": 730
},
{
"epoch": 0.1733427032091825,
"grad_norm": 0.667492151260376,
"learning_rate": 9.984260640876821e-05,
"loss": 0.1096,
"step": 740
},
{
"epoch": 0.17568517217146873,
"grad_norm": 0.6995371580123901,
"learning_rate": 9.98292246503335e-05,
"loss": 0.108,
"step": 750
},
{
"epoch": 0.17802764113375497,
"grad_norm": 0.9727945923805237,
"learning_rate": 9.981529796748134e-05,
"loss": 0.1155,
"step": 760
},
{
"epoch": 0.18037011009604123,
"grad_norm": 0.3702404201030731,
"learning_rate": 9.980082651251175e-05,
"loss": 0.0846,
"step": 770
},
{
"epoch": 0.18271257905832747,
"grad_norm": 0.3169856667518616,
"learning_rate": 9.97858104436822e-05,
"loss": 0.0917,
"step": 780
},
{
"epoch": 0.18505504802061373,
"grad_norm": 0.6973789930343628,
"learning_rate": 9.977024992520602e-05,
"loss": 0.0785,
"step": 790
},
{
"epoch": 0.18739751698289997,
"grad_norm": 0.5686987042427063,
"learning_rate": 9.975414512725057e-05,
"loss": 0.1015,
"step": 800
},
{
"epoch": 0.18973998594518623,
"grad_norm": 0.6190043687820435,
"learning_rate": 9.973749622593534e-05,
"loss": 0.0753,
"step": 810
},
{
"epoch": 0.19208245490747247,
"grad_norm": 0.3807699382305145,
"learning_rate": 9.972030340333001e-05,
"loss": 0.0734,
"step": 820
},
{
"epoch": 0.19442492386975874,
"grad_norm": 0.45342546701431274,
"learning_rate": 9.970256684745258e-05,
"loss": 0.1012,
"step": 830
},
{
"epoch": 0.19676739283204497,
"grad_norm": 0.2780962586402893,
"learning_rate": 9.968428675226714e-05,
"loss": 0.0757,
"step": 840
},
{
"epoch": 0.19910986179433124,
"grad_norm": 0.20734530687332153,
"learning_rate": 9.966546331768191e-05,
"loss": 0.0751,
"step": 850
},
{
"epoch": 0.20145233075661748,
"grad_norm": 0.3406268358230591,
"learning_rate": 9.964609674954696e-05,
"loss": 0.0937,
"step": 860
},
{
"epoch": 0.2037947997189037,
"grad_norm": 0.33824971318244934,
"learning_rate": 9.962618725965196e-05,
"loss": 0.0913,
"step": 870
},
{
"epoch": 0.20613726868118998,
"grad_norm": 0.5773669481277466,
"learning_rate": 9.96057350657239e-05,
"loss": 0.0834,
"step": 880
},
{
"epoch": 0.20847973764347622,
"grad_norm": 0.5624499917030334,
"learning_rate": 9.95847403914247e-05,
"loss": 0.1001,
"step": 890
},
{
"epoch": 0.21082220660576248,
"grad_norm": 0.5361132025718689,
"learning_rate": 9.956320346634876e-05,
"loss": 0.1233,
"step": 900
},
{
"epoch": 0.21316467556804872,
"grad_norm": 0.4824270009994507,
"learning_rate": 9.954112452602045e-05,
"loss": 0.0882,
"step": 910
},
{
"epoch": 0.21550714453033498,
"grad_norm": 0.6482338905334473,
"learning_rate": 9.95185038118915e-05,
"loss": 0.0647,
"step": 920
},
{
"epoch": 0.21784961349262122,
"grad_norm": 0.2783452868461609,
"learning_rate": 9.949534157133844e-05,
"loss": 0.0917,
"step": 930
},
{
"epoch": 0.22019208245490748,
"grad_norm": 0.4593198597431183,
"learning_rate": 9.94716380576598e-05,
"loss": 0.068,
"step": 940
},
{
"epoch": 0.22253455141719372,
"grad_norm": 0.7751959562301636,
"learning_rate": 9.944739353007344e-05,
"loss": 0.1032,
"step": 950
},
{
"epoch": 0.22487702037947996,
"grad_norm": 0.3963168263435364,
"learning_rate": 9.942260825371358e-05,
"loss": 0.0942,
"step": 960
},
{
"epoch": 0.22721948934176622,
"grad_norm": 0.40413302183151245,
"learning_rate": 9.939728249962807e-05,
"loss": 0.0736,
"step": 970
},
{
"epoch": 0.22956195830405246,
"grad_norm": 0.3862430155277252,
"learning_rate": 9.937141654477528e-05,
"loss": 0.0726,
"step": 980
},
{
"epoch": 0.23190442726633873,
"grad_norm": 0.5864925384521484,
"learning_rate": 9.934501067202117e-05,
"loss": 0.0872,
"step": 990
},
{
"epoch": 0.23424689622862496,
"grad_norm": 0.31625375151634216,
"learning_rate": 9.931806517013612e-05,
"loss": 0.0708,
"step": 1000
},
{
"epoch": 0.23658936519091123,
"grad_norm": 0.5403046011924744,
"learning_rate": 9.929058033379181e-05,
"loss": 0.073,
"step": 1010
},
{
"epoch": 0.23893183415319746,
"grad_norm": 0.4366021156311035,
"learning_rate": 9.926255646355804e-05,
"loss": 0.0643,
"step": 1020
},
{
"epoch": 0.24127430311548373,
"grad_norm": 0.500108540058136,
"learning_rate": 9.923399386589933e-05,
"loss": 0.0437,
"step": 1030
},
{
"epoch": 0.24361677207776997,
"grad_norm": 0.8096440434455872,
"learning_rate": 9.92048928531717e-05,
"loss": 0.0555,
"step": 1040
},
{
"epoch": 0.24595924104005623,
"grad_norm": 0.6826971173286438,
"learning_rate": 9.917525374361912e-05,
"loss": 0.0704,
"step": 1050
},
{
"epoch": 0.24830171000234247,
"grad_norm": 0.27831944823265076,
"learning_rate": 9.914507686137019e-05,
"loss": 0.0659,
"step": 1060
},
{
"epoch": 0.2506441789646287,
"grad_norm": 0.35980355739593506,
"learning_rate": 9.911436253643445e-05,
"loss": 0.0652,
"step": 1070
},
{
"epoch": 0.25298664792691494,
"grad_norm": 0.7075427174568176,
"learning_rate": 9.90831111046988e-05,
"loss": 0.0933,
"step": 1080
},
{
"epoch": 0.25532911688920124,
"grad_norm": 0.33446595072746277,
"learning_rate": 9.905132290792394e-05,
"loss": 0.0594,
"step": 1090
},
{
"epoch": 0.2576715858514875,
"grad_norm": 0.21890777349472046,
"learning_rate": 9.901899829374047e-05,
"loss": 0.0636,
"step": 1100
},
{
"epoch": 0.2600140548137737,
"grad_norm": 0.19606763124465942,
"learning_rate": 9.89861376156452e-05,
"loss": 0.0573,
"step": 1110
},
{
"epoch": 0.26235652377605995,
"grad_norm": 0.40309399366378784,
"learning_rate": 9.895274123299723e-05,
"loss": 0.0711,
"step": 1120
},
{
"epoch": 0.26469899273834624,
"grad_norm": 0.15657459199428558,
"learning_rate": 9.891880951101407e-05,
"loss": 0.0596,
"step": 1130
},
{
"epoch": 0.2670414617006325,
"grad_norm": 0.5244103670120239,
"learning_rate": 9.888434282076758e-05,
"loss": 0.0624,
"step": 1140
},
{
"epoch": 0.2693839306629187,
"grad_norm": 0.6240133047103882,
"learning_rate": 9.884934153917997e-05,
"loss": 0.1013,
"step": 1150
},
{
"epoch": 0.27172639962520495,
"grad_norm": 0.2892966568470001,
"learning_rate": 9.881380604901964e-05,
"loss": 0.0886,
"step": 1160
},
{
"epoch": 0.27406886858749124,
"grad_norm": 0.11301174759864807,
"learning_rate": 9.877773673889701e-05,
"loss": 0.0967,
"step": 1170
},
{
"epoch": 0.2764113375497775,
"grad_norm": 0.6525554060935974,
"learning_rate": 9.87411340032603e-05,
"loss": 0.0857,
"step": 1180
},
{
"epoch": 0.2787538065120637,
"grad_norm": 0.27176904678344727,
"learning_rate": 9.870399824239117e-05,
"loss": 0.0556,
"step": 1190
},
{
"epoch": 0.28109627547434995,
"grad_norm": 0.4166867136955261,
"learning_rate": 9.86663298624003e-05,
"loss": 0.0684,
"step": 1200
},
{
"epoch": 0.2834387444366362,
"grad_norm": 0.19580566883087158,
"learning_rate": 9.862812927522309e-05,
"loss": 0.0882,
"step": 1210
},
{
"epoch": 0.2857812133989225,
"grad_norm": 0.44604888558387756,
"learning_rate": 9.858939689861506e-05,
"loss": 0.0883,
"step": 1220
},
{
"epoch": 0.2881236823612087,
"grad_norm": 0.49636200070381165,
"learning_rate": 9.855013315614725e-05,
"loss": 0.0912,
"step": 1230
},
{
"epoch": 0.29046615132349496,
"grad_norm": 0.1988007128238678,
"learning_rate": 9.851033847720166e-05,
"loss": 0.0719,
"step": 1240
},
{
"epoch": 0.2928086202857812,
"grad_norm": 0.30095556378364563,
"learning_rate": 9.847001329696653e-05,
"loss": 0.078,
"step": 1250
},
{
"epoch": 0.2951510892480675,
"grad_norm": 0.34190279245376587,
"learning_rate": 9.842915805643155e-05,
"loss": 0.0442,
"step": 1260
},
{
"epoch": 0.2974935582103537,
"grad_norm": 0.25464609265327454,
"learning_rate": 9.838777320238312e-05,
"loss": 0.0583,
"step": 1270
},
{
"epoch": 0.29983602717263996,
"grad_norm": 0.07694657146930695,
"learning_rate": 9.834585918739936e-05,
"loss": 0.0359,
"step": 1280
},
{
"epoch": 0.3021784961349262,
"grad_norm": 0.19848985970020294,
"learning_rate": 9.830341646984521e-05,
"loss": 0.0812,
"step": 1290
},
{
"epoch": 0.30452096509721244,
"grad_norm": 0.27825915813446045,
"learning_rate": 9.826044551386744e-05,
"loss": 0.0496,
"step": 1300
},
{
"epoch": 0.30686343405949873,
"grad_norm": 0.3718523681163788,
"learning_rate": 9.821694678938953e-05,
"loss": 0.0671,
"step": 1310
},
{
"epoch": 0.30920590302178497,
"grad_norm": 0.5311722159385681,
"learning_rate": 9.817292077210659e-05,
"loss": 0.0739,
"step": 1320
},
{
"epoch": 0.3115483719840712,
"grad_norm": 0.41185882687568665,
"learning_rate": 9.812836794348004e-05,
"loss": 0.0665,
"step": 1330
},
{
"epoch": 0.31389084094635744,
"grad_norm": 0.2839798629283905,
"learning_rate": 9.808328879073251e-05,
"loss": 0.0495,
"step": 1340
},
{
"epoch": 0.31623330990864373,
"grad_norm": 0.5456023812294006,
"learning_rate": 9.803768380684242e-05,
"loss": 0.0538,
"step": 1350
},
{
"epoch": 0.31857577887092997,
"grad_norm": 1.1303348541259766,
"learning_rate": 9.799155349053851e-05,
"loss": 0.0948,
"step": 1360
},
{
"epoch": 0.3209182478332162,
"grad_norm": 0.3756462633609772,
"learning_rate": 9.794489834629455e-05,
"loss": 0.0405,
"step": 1370
},
{
"epoch": 0.32326071679550245,
"grad_norm": 0.45304539799690247,
"learning_rate": 9.789771888432375e-05,
"loss": 0.0518,
"step": 1380
},
{
"epoch": 0.3256031857577887,
"grad_norm": 0.42578068375587463,
"learning_rate": 9.785001562057309e-05,
"loss": 0.0694,
"step": 1390
},
{
"epoch": 0.327945654720075,
"grad_norm": 0.5314955711364746,
"learning_rate": 9.780178907671789e-05,
"loss": 0.0656,
"step": 1400
},
{
"epoch": 0.3302881236823612,
"grad_norm": 0.445273220539093,
"learning_rate": 9.775303978015585e-05,
"loss": 0.0467,
"step": 1410
},
{
"epoch": 0.33263059264464745,
"grad_norm": 0.45427191257476807,
"learning_rate": 9.77037682640015e-05,
"loss": 0.071,
"step": 1420
},
{
"epoch": 0.3349730616069337,
"grad_norm": 1.1310575008392334,
"learning_rate": 9.765397506708023e-05,
"loss": 0.0783,
"step": 1430
},
{
"epoch": 0.33731553056922,
"grad_norm": 0.37553080916404724,
"learning_rate": 9.760366073392246e-05,
"loss": 0.0595,
"step": 1440
},
{
"epoch": 0.3396579995315062,
"grad_norm": 0.456626296043396,
"learning_rate": 9.755282581475769e-05,
"loss": 0.0684,
"step": 1450
},
{
"epoch": 0.34200046849379245,
"grad_norm": 0.23000092804431915,
"learning_rate": 9.750147086550844e-05,
"loss": 0.0663,
"step": 1460
},
{
"epoch": 0.3443429374560787,
"grad_norm": 0.8536004424095154,
"learning_rate": 9.744959644778422e-05,
"loss": 0.0615,
"step": 1470
},
{
"epoch": 0.346685406418365,
"grad_norm": 0.2810976803302765,
"learning_rate": 9.739720312887535e-05,
"loss": 0.0499,
"step": 1480
},
{
"epoch": 0.3490278753806512,
"grad_norm": 0.5517282485961914,
"learning_rate": 9.734429148174675e-05,
"loss": 0.0623,
"step": 1490
},
{
"epoch": 0.35137034434293746,
"grad_norm": 0.5391654372215271,
"learning_rate": 9.729086208503174e-05,
"loss": 0.0701,
"step": 1500
},
{
"epoch": 0.3537128133052237,
"grad_norm": 0.2104485183954239,
"learning_rate": 9.723691552302562e-05,
"loss": 0.0624,
"step": 1510
},
{
"epoch": 0.35605528226750993,
"grad_norm": 0.6778100728988647,
"learning_rate": 9.718245238567939e-05,
"loss": 0.0735,
"step": 1520
},
{
"epoch": 0.3583977512297962,
"grad_norm": 0.5578711628913879,
"learning_rate": 9.712747326859315e-05,
"loss": 0.0649,
"step": 1530
},
{
"epoch": 0.36074022019208246,
"grad_norm": 0.19399204850196838,
"learning_rate": 9.707197877300974e-05,
"loss": 0.0696,
"step": 1540
},
{
"epoch": 0.3630826891543687,
"grad_norm": 0.36409327387809753,
"learning_rate": 9.701596950580806e-05,
"loss": 0.0764,
"step": 1550
},
{
"epoch": 0.36542515811665494,
"grad_norm": 0.3991371691226959,
"learning_rate": 9.695944607949649e-05,
"loss": 0.053,
"step": 1560
},
{
"epoch": 0.36776762707894123,
"grad_norm": 0.24415276944637299,
"learning_rate": 9.690240911220618e-05,
"loss": 0.0359,
"step": 1570
},
{
"epoch": 0.37011009604122747,
"grad_norm": 0.2075069695711136,
"learning_rate": 9.684485922768422e-05,
"loss": 0.0663,
"step": 1580
},
{
"epoch": 0.3724525650035137,
"grad_norm": 0.6543785333633423,
"learning_rate": 9.6786797055287e-05,
"loss": 0.0494,
"step": 1590
},
{
"epoch": 0.37479503396579994,
"grad_norm": 0.5545148253440857,
"learning_rate": 9.672822322997305e-05,
"loss": 0.0922,
"step": 1600
},
{
"epoch": 0.3771375029280862,
"grad_norm": 0.3024766743183136,
"learning_rate": 9.66691383922964e-05,
"loss": 0.0458,
"step": 1610
},
{
"epoch": 0.37947997189037247,
"grad_norm": 0.18543019890785217,
"learning_rate": 9.660954318839933e-05,
"loss": 0.0814,
"step": 1620
},
{
"epoch": 0.3818224408526587,
"grad_norm": 0.6047130823135376,
"learning_rate": 9.654943827000548e-05,
"loss": 0.0749,
"step": 1630
},
{
"epoch": 0.38416490981494494,
"grad_norm": 0.5619345307350159,
"learning_rate": 9.648882429441257e-05,
"loss": 0.0647,
"step": 1640
},
{
"epoch": 0.3865073787772312,
"grad_norm": 0.3835267126560211,
"learning_rate": 9.642770192448536e-05,
"loss": 0.0526,
"step": 1650
},
{
"epoch": 0.3888498477395175,
"grad_norm": 0.2994864583015442,
"learning_rate": 9.636607182864827e-05,
"loss": 0.0451,
"step": 1660
},
{
"epoch": 0.3911923167018037,
"grad_norm": 0.5770288705825806,
"learning_rate": 9.630393468087818e-05,
"loss": 0.0716,
"step": 1670
},
{
"epoch": 0.39353478566408995,
"grad_norm": 0.3165629506111145,
"learning_rate": 9.624129116069694e-05,
"loss": 0.0468,
"step": 1680
},
{
"epoch": 0.3958772546263762,
"grad_norm": 0.11682554334402084,
"learning_rate": 9.617814195316411e-05,
"loss": 0.0669,
"step": 1690
},
{
"epoch": 0.3982197235886625,
"grad_norm": 0.4979915916919708,
"learning_rate": 9.611448774886924e-05,
"loss": 0.0553,
"step": 1700
},
{
"epoch": 0.4005621925509487,
"grad_norm": 0.14603012800216675,
"learning_rate": 9.605032924392457e-05,
"loss": 0.0597,
"step": 1710
},
{
"epoch": 0.40290466151323495,
"grad_norm": 0.3345795273780823,
"learning_rate": 9.598566713995718e-05,
"loss": 0.049,
"step": 1720
},
{
"epoch": 0.4052471304755212,
"grad_norm": 0.4213583171367645,
"learning_rate": 9.59205021441015e-05,
"loss": 0.0659,
"step": 1730
},
{
"epoch": 0.4075895994378074,
"grad_norm": 0.1514274775981903,
"learning_rate": 9.58548349689915e-05,
"loss": 0.0803,
"step": 1740
},
{
"epoch": 0.4099320684000937,
"grad_norm": 1.1298153400421143,
"learning_rate": 9.578866633275288e-05,
"loss": 0.0574,
"step": 1750
},
{
"epoch": 0.41227453736237996,
"grad_norm": 0.2879124581813812,
"learning_rate": 9.572199695899522e-05,
"loss": 0.0618,
"step": 1760
},
{
"epoch": 0.4146170063246662,
"grad_norm": 0.21584849059581757,
"learning_rate": 9.565482757680415e-05,
"loss": 0.069,
"step": 1770
},
{
"epoch": 0.41695947528695243,
"grad_norm": 0.27666664123535156,
"learning_rate": 9.558715892073323e-05,
"loss": 0.0619,
"step": 1780
},
{
"epoch": 0.4193019442492387,
"grad_norm": 0.36067232489585876,
"learning_rate": 9.551899173079607e-05,
"loss": 0.0512,
"step": 1790
},
{
"epoch": 0.42164441321152496,
"grad_norm": 0.21706882119178772,
"learning_rate": 9.545032675245813e-05,
"loss": 0.0399,
"step": 1800
},
{
"epoch": 0.4239868821738112,
"grad_norm": 0.2502746880054474,
"learning_rate": 9.538116473662861e-05,
"loss": 0.067,
"step": 1810
},
{
"epoch": 0.42632935113609743,
"grad_norm": 0.19951611757278442,
"learning_rate": 9.531150643965223e-05,
"loss": 0.0572,
"step": 1820
},
{
"epoch": 0.42867182009838367,
"grad_norm": 0.5946075916290283,
"learning_rate": 9.524135262330098e-05,
"loss": 0.0556,
"step": 1830
},
{
"epoch": 0.43101428906066996,
"grad_norm": 0.20143412053585052,
"learning_rate": 9.517070405476575e-05,
"loss": 0.0556,
"step": 1840
},
{
"epoch": 0.4333567580229562,
"grad_norm": 0.30480778217315674,
"learning_rate": 9.509956150664796e-05,
"loss": 0.0721,
"step": 1850
},
{
"epoch": 0.43569922698524244,
"grad_norm": 0.289962500333786,
"learning_rate": 9.502792575695112e-05,
"loss": 0.0349,
"step": 1860
},
{
"epoch": 0.4380416959475287,
"grad_norm": 0.23470467329025269,
"learning_rate": 9.49557975890723e-05,
"loss": 0.0508,
"step": 1870
},
{
"epoch": 0.44038416490981497,
"grad_norm": 0.5040431022644043,
"learning_rate": 9.488317779179361e-05,
"loss": 0.0576,
"step": 1880
},
{
"epoch": 0.4427266338721012,
"grad_norm": 0.4373694360256195,
"learning_rate": 9.481006715927351e-05,
"loss": 0.0526,
"step": 1890
},
{
"epoch": 0.44506910283438744,
"grad_norm": 0.41776043176651,
"learning_rate": 9.473646649103818e-05,
"loss": 0.0417,
"step": 1900
},
{
"epoch": 0.4474115717966737,
"grad_norm": 0.5410218238830566,
"learning_rate": 9.46623765919727e-05,
"loss": 0.0737,
"step": 1910
},
{
"epoch": 0.4497540407589599,
"grad_norm": 0.4274581968784332,
"learning_rate": 9.458779827231237e-05,
"loss": 0.0715,
"step": 1920
},
{
"epoch": 0.4520965097212462,
"grad_norm": 0.31722667813301086,
"learning_rate": 9.451273234763371e-05,
"loss": 0.0672,
"step": 1930
},
{
"epoch": 0.45443897868353245,
"grad_norm": 0.221653014421463,
"learning_rate": 9.443717963884569e-05,
"loss": 0.0631,
"step": 1940
},
{
"epoch": 0.4567814476458187,
"grad_norm": 0.2043227255344391,
"learning_rate": 9.43611409721806e-05,
"loss": 0.0436,
"step": 1950
},
{
"epoch": 0.4591239166081049,
"grad_norm": 0.1967364400625229,
"learning_rate": 9.428461717918511e-05,
"loss": 0.0601,
"step": 1960
},
{
"epoch": 0.4614663855703912,
"grad_norm": 0.23282958567142487,
"learning_rate": 9.420760909671118e-05,
"loss": 0.0441,
"step": 1970
},
{
"epoch": 0.46380885453267745,
"grad_norm": 0.6064874529838562,
"learning_rate": 9.413011756690685e-05,
"loss": 0.0691,
"step": 1980
},
{
"epoch": 0.4661513234949637,
"grad_norm": 0.29970476031303406,
"learning_rate": 9.405214343720707e-05,
"loss": 0.0362,
"step": 1990
},
{
"epoch": 0.4684937924572499,
"grad_norm": 0.3310692310333252,
"learning_rate": 9.397368756032445e-05,
"loss": 0.045,
"step": 2000
},
{
"epoch": 0.4708362614195362,
"grad_norm": 0.34072744846343994,
"learning_rate": 9.389475079423988e-05,
"loss": 0.0646,
"step": 2010
},
{
"epoch": 0.47317873038182245,
"grad_norm": 0.09513302892446518,
"learning_rate": 9.381533400219318e-05,
"loss": 0.0543,
"step": 2020
},
{
"epoch": 0.4755211993441087,
"grad_norm": 0.19264456629753113,
"learning_rate": 9.373543805267368e-05,
"loss": 0.0682,
"step": 2030
},
{
"epoch": 0.47786366830639493,
"grad_norm": 0.3914099633693695,
"learning_rate": 9.365506381941066e-05,
"loss": 0.0455,
"step": 2040
},
{
"epoch": 0.48020613726868117,
"grad_norm": 0.4226783514022827,
"learning_rate": 9.357421218136386e-05,
"loss": 0.0689,
"step": 2050
},
{
"epoch": 0.48254860623096746,
"grad_norm": 0.41455796360969543,
"learning_rate": 9.349288402271388e-05,
"loss": 0.0596,
"step": 2060
},
{
"epoch": 0.4848910751932537,
"grad_norm": 0.2510756254196167,
"learning_rate": 9.341108023285238e-05,
"loss": 0.0341,
"step": 2070
},
{
"epoch": 0.48723354415553993,
"grad_norm": 0.40096133947372437,
"learning_rate": 9.332880170637252e-05,
"loss": 0.0813,
"step": 2080
},
{
"epoch": 0.48957601311782617,
"grad_norm": 0.6878464221954346,
"learning_rate": 9.32460493430591e-05,
"loss": 0.044,
"step": 2090
},
{
"epoch": 0.49191848208011246,
"grad_norm": 0.3416203558444977,
"learning_rate": 9.316282404787871e-05,
"loss": 0.0686,
"step": 2100
},
{
"epoch": 0.4942609510423987,
"grad_norm": 0.12535825371742249,
"learning_rate": 9.30791267309698e-05,
"loss": 0.0354,
"step": 2110
},
{
"epoch": 0.49660342000468494,
"grad_norm": 0.19023941457271576,
"learning_rate": 9.299495830763286e-05,
"loss": 0.0376,
"step": 2120
},
{
"epoch": 0.4989458889669712,
"grad_norm": 0.3778730034828186,
"learning_rate": 9.291031969832026e-05,
"loss": 0.0518,
"step": 2130
},
{
"epoch": 0.5012883579292574,
"grad_norm": 0.256195068359375,
"learning_rate": 9.282521182862629e-05,
"loss": 0.0571,
"step": 2140
},
{
"epoch": 0.5036308268915437,
"grad_norm": 0.19933399558067322,
"learning_rate": 9.273963562927695e-05,
"loss": 0.0271,
"step": 2150
},
{
"epoch": 0.5059732958538299,
"grad_norm": 0.06613205373287201,
"learning_rate": 9.265359203611987e-05,
"loss": 0.0334,
"step": 2160
},
{
"epoch": 0.5083157648161162,
"grad_norm": 0.21248801052570343,
"learning_rate": 9.256708199011401e-05,
"loss": 0.0746,
"step": 2170
},
{
"epoch": 0.5106582337784025,
"grad_norm": 0.3601578176021576,
"learning_rate": 9.248010643731935e-05,
"loss": 0.076,
"step": 2180
},
{
"epoch": 0.5130007027406887,
"grad_norm": 0.0984947606921196,
"learning_rate": 9.239266632888659e-05,
"loss": 0.0892,
"step": 2190
},
{
"epoch": 0.515343171702975,
"grad_norm": 0.13032953441143036,
"learning_rate": 9.230476262104677e-05,
"loss": 0.039,
"step": 2200
},
{
"epoch": 0.5176856406652612,
"grad_norm": 0.48068541288375854,
"learning_rate": 9.221639627510076e-05,
"loss": 0.0585,
"step": 2210
},
{
"epoch": 0.5200281096275474,
"grad_norm": 0.42812222242355347,
"learning_rate": 9.212756825740873e-05,
"loss": 0.0929,
"step": 2220
},
{
"epoch": 0.5223705785898337,
"grad_norm": 0.3526000380516052,
"learning_rate": 9.20382795393797e-05,
"loss": 0.0657,
"step": 2230
},
{
"epoch": 0.5247130475521199,
"grad_norm": 0.14142726361751556,
"learning_rate": 9.194853109746074e-05,
"loss": 0.0571,
"step": 2240
},
{
"epoch": 0.5270555165144062,
"grad_norm": 0.10022013634443283,
"learning_rate": 9.185832391312644e-05,
"loss": 0.0362,
"step": 2250
},
{
"epoch": 0.5293979854766925,
"grad_norm": 0.18126869201660156,
"learning_rate": 9.176765897286813e-05,
"loss": 0.0616,
"step": 2260
},
{
"epoch": 0.5317404544389787,
"grad_norm": 0.22198501229286194,
"learning_rate": 9.167653726818305e-05,
"loss": 0.0227,
"step": 2270
},
{
"epoch": 0.534082923401265,
"grad_norm": 0.07468587905168533,
"learning_rate": 9.158495979556358e-05,
"loss": 0.045,
"step": 2280
},
{
"epoch": 0.5364253923635511,
"grad_norm": 0.1882839947938919,
"learning_rate": 9.14929275564863e-05,
"loss": 0.0569,
"step": 2290
},
{
"epoch": 0.5387678613258374,
"grad_norm": 0.1339283585548401,
"learning_rate": 9.140044155740101e-05,
"loss": 0.0692,
"step": 2300
},
{
"epoch": 0.5411103302881237,
"grad_norm": 0.19089505076408386,
"learning_rate": 9.130750280971978e-05,
"loss": 0.0638,
"step": 2310
},
{
"epoch": 0.5434527992504099,
"grad_norm": 0.131087064743042,
"learning_rate": 9.121411232980588e-05,
"loss": 0.0656,
"step": 2320
},
{
"epoch": 0.5457952682126962,
"grad_norm": 0.24333599209785461,
"learning_rate": 9.112027113896262e-05,
"loss": 0.0617,
"step": 2330
},
{
"epoch": 0.5481377371749825,
"grad_norm": 0.4338069260120392,
"learning_rate": 9.102598026342222e-05,
"loss": 0.0384,
"step": 2340
},
{
"epoch": 0.5504802061372687,
"grad_norm": 0.3546713888645172,
"learning_rate": 9.093124073433463e-05,
"loss": 0.0594,
"step": 2350
},
{
"epoch": 0.552822675099555,
"grad_norm": 0.1043967604637146,
"learning_rate": 9.083605358775612e-05,
"loss": 0.0482,
"step": 2360
},
{
"epoch": 0.5551651440618411,
"grad_norm": 0.16685545444488525,
"learning_rate": 9.074041986463808e-05,
"loss": 0.0439,
"step": 2370
},
{
"epoch": 0.5575076130241274,
"grad_norm": 0.15651892125606537,
"learning_rate": 9.064434061081562e-05,
"loss": 0.0542,
"step": 2380
},
{
"epoch": 0.5598500819864137,
"grad_norm": 0.33224546909332275,
"learning_rate": 9.0547816876996e-05,
"loss": 0.0772,
"step": 2390
},
{
"epoch": 0.5621925509486999,
"grad_norm": 0.3219659626483917,
"learning_rate": 9.045084971874738e-05,
"loss": 0.0347,
"step": 2400
},
{
"epoch": 0.5645350199109862,
"grad_norm": 0.3930731415748596,
"learning_rate": 9.035344019648702e-05,
"loss": 0.0386,
"step": 2410
},
{
"epoch": 0.5668774888732724,
"grad_norm": 0.13527953624725342,
"learning_rate": 9.025558937546988e-05,
"loss": 0.0479,
"step": 2420
},
{
"epoch": 0.5692199578355587,
"grad_norm": 0.1432938128709793,
"learning_rate": 9.015729832577681e-05,
"loss": 0.0319,
"step": 2430
},
{
"epoch": 0.571562426797845,
"grad_norm": 0.25687897205352783,
"learning_rate": 9.005856812230304e-05,
"loss": 0.0387,
"step": 2440
},
{
"epoch": 0.5739048957601312,
"grad_norm": 0.31300991773605347,
"learning_rate": 8.995939984474624e-05,
"loss": 0.0574,
"step": 2450
},
{
"epoch": 0.5762473647224174,
"grad_norm": 0.25793933868408203,
"learning_rate": 8.98597945775948e-05,
"loss": 0.0415,
"step": 2460
},
{
"epoch": 0.5785898336847036,
"grad_norm": 0.13978935778141022,
"learning_rate": 8.975975341011596e-05,
"loss": 0.0366,
"step": 2470
},
{
"epoch": 0.5809323026469899,
"grad_norm": 0.20552988350391388,
"learning_rate": 8.965927743634391e-05,
"loss": 0.0519,
"step": 2480
},
{
"epoch": 0.5832747716092762,
"grad_norm": 0.0843147486448288,
"learning_rate": 8.955836775506776e-05,
"loss": 0.0434,
"step": 2490
},
{
"epoch": 0.5856172405715624,
"grad_norm": 0.519131600856781,
"learning_rate": 8.945702546981969e-05,
"loss": 0.044,
"step": 2500
},
{
"epoch": 0.5879597095338487,
"grad_norm": 0.20150704681873322,
"learning_rate": 8.935525168886262e-05,
"loss": 0.0486,
"step": 2510
},
{
"epoch": 0.590302178496135,
"grad_norm": 0.6557456851005554,
"learning_rate": 8.92530475251784e-05,
"loss": 0.0444,
"step": 2520
},
{
"epoch": 0.5926446474584212,
"grad_norm": 0.48158717155456543,
"learning_rate": 8.91504140964553e-05,
"loss": 0.0512,
"step": 2530
},
{
"epoch": 0.5949871164207075,
"grad_norm": 0.3636298179626465,
"learning_rate": 8.90473525250761e-05,
"loss": 0.052,
"step": 2540
},
{
"epoch": 0.5973295853829936,
"grad_norm": 0.1767117828130722,
"learning_rate": 8.894386393810563e-05,
"loss": 0.0534,
"step": 2550
},
{
"epoch": 0.5996720543452799,
"grad_norm": 0.30989664793014526,
"learning_rate": 8.883994946727849e-05,
"loss": 0.0765,
"step": 2560
},
{
"epoch": 0.6020145233075662,
"grad_norm": 0.28089532256126404,
"learning_rate": 8.873561024898668e-05,
"loss": 0.0424,
"step": 2570
},
{
"epoch": 0.6043569922698524,
"grad_norm": 0.5266916751861572,
"learning_rate": 8.863084742426719e-05,
"loss": 0.0364,
"step": 2580
},
{
"epoch": 0.6066994612321387,
"grad_norm": 0.5653497576713562,
"learning_rate": 8.852566213878947e-05,
"loss": 0.0604,
"step": 2590
},
{
"epoch": 0.6090419301944249,
"grad_norm": 0.34995973110198975,
"learning_rate": 8.842005554284296e-05,
"loss": 0.0386,
"step": 2600
},
{
"epoch": 0.6113843991567112,
"grad_norm": 0.42935842275619507,
"learning_rate": 8.831402879132446e-05,
"loss": 0.0595,
"step": 2610
},
{
"epoch": 0.6137268681189975,
"grad_norm": 0.19672085344791412,
"learning_rate": 8.820758304372557e-05,
"loss": 0.0426,
"step": 2620
},
{
"epoch": 0.6160693370812836,
"grad_norm": 0.17344583570957184,
"learning_rate": 8.810071946411989e-05,
"loss": 0.0979,
"step": 2630
},
{
"epoch": 0.6184118060435699,
"grad_norm": 0.19755525887012482,
"learning_rate": 8.799343922115044e-05,
"loss": 0.0322,
"step": 2640
},
{
"epoch": 0.6207542750058562,
"grad_norm": 0.33817166090011597,
"learning_rate": 8.788574348801675e-05,
"loss": 0.0375,
"step": 2650
},
{
"epoch": 0.6230967439681424,
"grad_norm": 0.44614845514297485,
"learning_rate": 8.77776334424621e-05,
"loss": 0.054,
"step": 2660
},
{
"epoch": 0.6254392129304287,
"grad_norm": 0.4128440022468567,
"learning_rate": 8.766911026676064e-05,
"loss": 0.0422,
"step": 2670
},
{
"epoch": 0.6277816818927149,
"grad_norm": 0.22449485957622528,
"learning_rate": 8.756017514770443e-05,
"loss": 0.037,
"step": 2680
},
{
"epoch": 0.6301241508550012,
"grad_norm": 0.2689172029495239,
"learning_rate": 8.745082927659047e-05,
"loss": 0.0353,
"step": 2690
},
{
"epoch": 0.6324666198172875,
"grad_norm": 0.05075841769576073,
"learning_rate": 8.73410738492077e-05,
"loss": 0.0333,
"step": 2700
},
{
"epoch": 0.6348090887795736,
"grad_norm": 0.1499403417110443,
"learning_rate": 8.723091006582389e-05,
"loss": 0.0559,
"step": 2710
},
{
"epoch": 0.6371515577418599,
"grad_norm": 0.36928892135620117,
"learning_rate": 8.71203391311725e-05,
"loss": 0.0763,
"step": 2720
},
{
"epoch": 0.6394940267041461,
"grad_norm": 0.5727768540382385,
"learning_rate": 8.700936225443959e-05,
"loss": 0.0527,
"step": 2730
},
{
"epoch": 0.6418364956664324,
"grad_norm": 0.30735543370246887,
"learning_rate": 8.689798064925049e-05,
"loss": 0.0585,
"step": 2740
},
{
"epoch": 0.6441789646287187,
"grad_norm": 0.3882769048213959,
"learning_rate": 8.678619553365659e-05,
"loss": 0.0491,
"step": 2750
},
{
"epoch": 0.6465214335910049,
"grad_norm": 0.365843802690506,
"learning_rate": 8.6674008130122e-05,
"loss": 0.0397,
"step": 2760
},
{
"epoch": 0.6488639025532912,
"grad_norm": 0.21451324224472046,
"learning_rate": 8.656141966551019e-05,
"loss": 0.0365,
"step": 2770
},
{
"epoch": 0.6512063715155774,
"grad_norm": 0.1609046310186386,
"learning_rate": 8.644843137107059e-05,
"loss": 0.039,
"step": 2780
},
{
"epoch": 0.6535488404778637,
"grad_norm": 0.7074998021125793,
"learning_rate": 8.633504448242505e-05,
"loss": 0.0591,
"step": 2790
},
{
"epoch": 0.65589130944015,
"grad_norm": 0.21024738252162933,
"learning_rate": 8.622126023955446e-05,
"loss": 0.0488,
"step": 2800
},
{
"epoch": 0.6582337784024361,
"grad_norm": 0.3021513819694519,
"learning_rate": 8.610707988678503e-05,
"loss": 0.04,
"step": 2810
},
{
"epoch": 0.6605762473647224,
"grad_norm": 0.19868189096450806,
"learning_rate": 8.599250467277483e-05,
"loss": 0.0319,
"step": 2820
},
{
"epoch": 0.6629187163270087,
"grad_norm": 0.15607990324497223,
"learning_rate": 8.587753585050004e-05,
"loss": 0.036,
"step": 2830
},
{
"epoch": 0.6652611852892949,
"grad_norm": 0.3136105239391327,
"learning_rate": 8.576217467724128e-05,
"loss": 0.0752,
"step": 2840
},
{
"epoch": 0.6676036542515812,
"grad_norm": 0.21903324127197266,
"learning_rate": 8.564642241456986e-05,
"loss": 0.0416,
"step": 2850
},
{
"epoch": 0.6699461232138674,
"grad_norm": 0.5193045735359192,
"learning_rate": 8.553028032833397e-05,
"loss": 0.0386,
"step": 2860
},
{
"epoch": 0.6722885921761537,
"grad_norm": 0.5539060235023499,
"learning_rate": 8.541374968864487e-05,
"loss": 0.0439,
"step": 2870
},
{
"epoch": 0.67463106113844,
"grad_norm": 0.2819710969924927,
"learning_rate": 8.529683176986295e-05,
"loss": 0.0541,
"step": 2880
},
{
"epoch": 0.6769735301007261,
"grad_norm": 0.1039167121052742,
"learning_rate": 8.517952785058385e-05,
"loss": 0.039,
"step": 2890
},
{
"epoch": 0.6793159990630124,
"grad_norm": 0.062352605164051056,
"learning_rate": 8.506183921362443e-05,
"loss": 0.0401,
"step": 2900
},
{
"epoch": 0.6816584680252986,
"grad_norm": 0.5535932183265686,
"learning_rate": 8.494376714600878e-05,
"loss": 0.0505,
"step": 2910
},
{
"epoch": 0.6840009369875849,
"grad_norm": 0.37601238489151,
"learning_rate": 8.482531293895412e-05,
"loss": 0.0391,
"step": 2920
},
{
"epoch": 0.6863434059498712,
"grad_norm": 0.06856988370418549,
"learning_rate": 8.470647788785665e-05,
"loss": 0.0389,
"step": 2930
},
{
"epoch": 0.6886858749121574,
"grad_norm": 0.5693712830543518,
"learning_rate": 8.458726329227747e-05,
"loss": 0.0495,
"step": 2940
},
{
"epoch": 0.6910283438744437,
"grad_norm": 0.14418154954910278,
"learning_rate": 8.44676704559283e-05,
"loss": 0.0405,
"step": 2950
},
{
"epoch": 0.69337081283673,
"grad_norm": 0.11880888044834137,
"learning_rate": 8.434770068665723e-05,
"loss": 0.0362,
"step": 2960
},
{
"epoch": 0.6957132817990161,
"grad_norm": 0.6350199580192566,
"learning_rate": 8.422735529643444e-05,
"loss": 0.0607,
"step": 2970
},
{
"epoch": 0.6980557507613024,
"grad_norm": 0.19949962198734283,
"learning_rate": 8.410663560133784e-05,
"loss": 0.0346,
"step": 2980
},
{
"epoch": 0.7003982197235886,
"grad_norm": 0.19905024766921997,
"learning_rate": 8.398554292153866e-05,
"loss": 0.0455,
"step": 2990
},
{
"epoch": 0.7027406886858749,
"grad_norm": 0.12724433839321136,
"learning_rate": 8.386407858128706e-05,
"loss": 0.0312,
"step": 3000
},
{
"epoch": 0.7050831576481612,
"grad_norm": 0.6818522214889526,
"learning_rate": 8.37422439088976e-05,
"loss": 0.0477,
"step": 3010
},
{
"epoch": 0.7074256266104474,
"grad_norm": 0.14397919178009033,
"learning_rate": 8.362004023673474e-05,
"loss": 0.054,
"step": 3020
},
{
"epoch": 0.7097680955727337,
"grad_norm": 0.1597958207130432,
"learning_rate": 8.349746890119826e-05,
"loss": 0.0475,
"step": 3030
},
{
"epoch": 0.7121105645350199,
"grad_norm": 0.2985258102416992,
"learning_rate": 8.337453124270863e-05,
"loss": 0.0276,
"step": 3040
},
{
"epoch": 0.7144530334973062,
"grad_norm": 0.17043350636959076,
"learning_rate": 8.32512286056924e-05,
"loss": 0.0337,
"step": 3050
},
{
"epoch": 0.7167955024595924,
"grad_norm": 0.390009343624115,
"learning_rate": 8.31275623385675e-05,
"loss": 0.0277,
"step": 3060
},
{
"epoch": 0.7191379714218786,
"grad_norm": 0.20475880801677704,
"learning_rate": 8.300353379372834e-05,
"loss": 0.0691,
"step": 3070
},
{
"epoch": 0.7214804403841649,
"grad_norm": 0.11685507744550705,
"learning_rate": 8.287914432753123e-05,
"loss": 0.0411,
"step": 3080
},
{
"epoch": 0.7238229093464511,
"grad_norm": 0.531944990158081,
"learning_rate": 8.275439530027948e-05,
"loss": 0.0511,
"step": 3090
},
{
"epoch": 0.7261653783087374,
"grad_norm": 0.05079588294029236,
"learning_rate": 8.262928807620843e-05,
"loss": 0.0664,
"step": 3100
},
{
"epoch": 0.7285078472710237,
"grad_norm": 0.3010249435901642,
"learning_rate": 8.250382402347065e-05,
"loss": 0.0565,
"step": 3110
},
{
"epoch": 0.7308503162333099,
"grad_norm": 0.2115558385848999,
"learning_rate": 8.237800451412095e-05,
"loss": 0.0615,
"step": 3120
},
{
"epoch": 0.7331927851955962,
"grad_norm": 0.3865530490875244,
"learning_rate": 8.225183092410128e-05,
"loss": 0.0349,
"step": 3130
},
{
"epoch": 0.7355352541578825,
"grad_norm": 0.07815901935100555,
"learning_rate": 8.212530463322583e-05,
"loss": 0.036,
"step": 3140
},
{
"epoch": 0.7378777231201686,
"grad_norm": 0.11009709537029266,
"learning_rate": 8.199842702516583e-05,
"loss": 0.0386,
"step": 3150
},
{
"epoch": 0.7402201920824549,
"grad_norm": 0.12392786890268326,
"learning_rate": 8.18711994874345e-05,
"loss": 0.0396,
"step": 3160
},
{
"epoch": 0.7425626610447411,
"grad_norm": 0.16354168951511383,
"learning_rate": 8.174362341137177e-05,
"loss": 0.0446,
"step": 3170
},
{
"epoch": 0.7449051300070274,
"grad_norm": 0.2223191112279892,
"learning_rate": 8.161570019212921e-05,
"loss": 0.0326,
"step": 3180
},
{
"epoch": 0.7472475989693137,
"grad_norm": 0.176427960395813,
"learning_rate": 8.148743122865463e-05,
"loss": 0.0235,
"step": 3190
},
{
"epoch": 0.7495900679315999,
"grad_norm": 0.19706971943378448,
"learning_rate": 8.135881792367686e-05,
"loss": 0.0417,
"step": 3200
},
{
"epoch": 0.7519325368938862,
"grad_norm": 0.08818463236093521,
"learning_rate": 8.12298616836904e-05,
"loss": 0.0463,
"step": 3210
},
{
"epoch": 0.7542750058561724,
"grad_norm": 0.08389343321323395,
"learning_rate": 8.110056391894005e-05,
"loss": 0.0259,
"step": 3220
},
{
"epoch": 0.7566174748184586,
"grad_norm": 0.13730217516422272,
"learning_rate": 8.097092604340542e-05,
"loss": 0.0394,
"step": 3230
},
{
"epoch": 0.7589599437807449,
"grad_norm": 0.48324722051620483,
"learning_rate": 8.084094947478556e-05,
"loss": 0.0488,
"step": 3240
},
{
"epoch": 0.7613024127430311,
"grad_norm": 0.15898984670639038,
"learning_rate": 8.07106356344834e-05,
"loss": 0.0402,
"step": 3250
},
{
"epoch": 0.7636448817053174,
"grad_norm": 0.19997884333133698,
"learning_rate": 8.057998594759022e-05,
"loss": 0.0406,
"step": 3260
},
{
"epoch": 0.7659873506676037,
"grad_norm": 0.06215028837323189,
"learning_rate": 8.044900184287007e-05,
"loss": 0.0577,
"step": 3270
},
{
"epoch": 0.7683298196298899,
"grad_norm": 0.28326717019081116,
"learning_rate": 8.031768475274413e-05,
"loss": 0.057,
"step": 3280
},
{
"epoch": 0.7706722885921762,
"grad_norm": 0.29579654335975647,
"learning_rate": 8.018603611327504e-05,
"loss": 0.0563,
"step": 3290
},
{
"epoch": 0.7730147575544624,
"grad_norm": 0.5313428044319153,
"learning_rate": 8.005405736415126e-05,
"loss": 0.0748,
"step": 3300
},
{
"epoch": 0.7753572265167487,
"grad_norm": 0.45142146944999695,
"learning_rate": 7.992174994867123e-05,
"loss": 0.0344,
"step": 3310
},
{
"epoch": 0.777699695479035,
"grad_norm": 0.22848837077617645,
"learning_rate": 7.978911531372765e-05,
"loss": 0.0367,
"step": 3320
},
{
"epoch": 0.7800421644413211,
"grad_norm": 0.07316577434539795,
"learning_rate": 7.965615490979163e-05,
"loss": 0.0332,
"step": 3330
},
{
"epoch": 0.7823846334036074,
"grad_norm": 0.08522647619247437,
"learning_rate": 7.952287019089685e-05,
"loss": 0.0313,
"step": 3340
},
{
"epoch": 0.7847271023658936,
"grad_norm": 0.2560670077800751,
"learning_rate": 7.938926261462366e-05,
"loss": 0.0753,
"step": 3350
},
{
"epoch": 0.7870695713281799,
"grad_norm": 0.2529207468032837,
"learning_rate": 7.925533364208309e-05,
"loss": 0.0584,
"step": 3360
},
{
"epoch": 0.7894120402904662,
"grad_norm": 0.20108440518379211,
"learning_rate": 7.912108473790092e-05,
"loss": 0.0443,
"step": 3370
},
{
"epoch": 0.7917545092527524,
"grad_norm": 0.09312764555215836,
"learning_rate": 7.898651737020166e-05,
"loss": 0.0529,
"step": 3380
},
{
"epoch": 0.7940969782150387,
"grad_norm": 0.08973310142755508,
"learning_rate": 7.88516330105925e-05,
"loss": 0.0313,
"step": 3390
},
{
"epoch": 0.796439447177325,
"grad_norm": 0.2917576730251312,
"learning_rate": 7.871643313414718e-05,
"loss": 0.0699,
"step": 3400
},
{
"epoch": 0.7987819161396111,
"grad_norm": 0.3426614999771118,
"learning_rate": 7.858091921938988e-05,
"loss": 0.0554,
"step": 3410
},
{
"epoch": 0.8011243851018974,
"grad_norm": 0.10231604427099228,
"learning_rate": 7.844509274827907e-05,
"loss": 0.0469,
"step": 3420
},
{
"epoch": 0.8034668540641836,
"grad_norm": 0.36295169591903687,
"learning_rate": 7.830895520619128e-05,
"loss": 0.0489,
"step": 3430
},
{
"epoch": 0.8058093230264699,
"grad_norm": 0.23017369210720062,
"learning_rate": 7.817250808190483e-05,
"loss": 0.0407,
"step": 3440
},
{
"epoch": 0.8081517919887562,
"grad_norm": 0.2438231259584427,
"learning_rate": 7.803575286758364e-05,
"loss": 0.0542,
"step": 3450
},
{
"epoch": 0.8104942609510424,
"grad_norm": 0.28502318263053894,
"learning_rate": 7.789869105876083e-05,
"loss": 0.0433,
"step": 3460
},
{
"epoch": 0.8128367299133287,
"grad_norm": 0.7063993215560913,
"learning_rate": 7.776132415432234e-05,
"loss": 0.0687,
"step": 3470
},
{
"epoch": 0.8151791988756149,
"grad_norm": 0.3574845492839813,
"learning_rate": 7.762365365649067e-05,
"loss": 0.0283,
"step": 3480
},
{
"epoch": 0.8175216678379011,
"grad_norm": 0.1527651846408844,
"learning_rate": 7.748568107080832e-05,
"loss": 0.0502,
"step": 3490
},
{
"epoch": 0.8198641368001874,
"grad_norm": 0.20111270248889923,
"learning_rate": 7.734740790612136e-05,
"loss": 0.0526,
"step": 3500
},
{
"epoch": 0.8222066057624736,
"grad_norm": 0.5221764445304871,
"learning_rate": 7.720883567456298e-05,
"loss": 0.0385,
"step": 3510
},
{
"epoch": 0.8245490747247599,
"grad_norm": 0.11450177431106567,
"learning_rate": 7.70699658915369e-05,
"loss": 0.0495,
"step": 3520
},
{
"epoch": 0.8268915436870461,
"grad_norm": 0.2669161558151245,
"learning_rate": 7.693080007570084e-05,
"loss": 0.0419,
"step": 3530
},
{
"epoch": 0.8292340126493324,
"grad_norm": 0.4859974682331085,
"learning_rate": 7.679133974894983e-05,
"loss": 0.0454,
"step": 3540
},
{
"epoch": 0.8315764816116187,
"grad_norm": 0.13351887464523315,
"learning_rate": 7.66515864363997e-05,
"loss": 0.0401,
"step": 3550
},
{
"epoch": 0.8339189505739049,
"grad_norm": 0.3376217484474182,
"learning_rate": 7.651154166637025e-05,
"loss": 0.0372,
"step": 3560
},
{
"epoch": 0.8362614195361912,
"grad_norm": 0.4906126856803894,
"learning_rate": 7.637120697036866e-05,
"loss": 0.0444,
"step": 3570
},
{
"epoch": 0.8386038884984774,
"grad_norm": 0.1525869518518448,
"learning_rate": 7.623058388307269e-05,
"loss": 0.0411,
"step": 3580
},
{
"epoch": 0.8409463574607636,
"grad_norm": 0.10655678063631058,
"learning_rate": 7.608967394231387e-05,
"loss": 0.0322,
"step": 3590
},
{
"epoch": 0.8432888264230499,
"grad_norm": 0.6658011674880981,
"learning_rate": 7.594847868906076e-05,
"loss": 0.0736,
"step": 3600
},
{
"epoch": 0.8456312953853361,
"grad_norm": 0.2985578775405884,
"learning_rate": 7.580699966740201e-05,
"loss": 0.0296,
"step": 3610
},
{
"epoch": 0.8479737643476224,
"grad_norm": 0.08989045768976212,
"learning_rate": 7.566523842452958e-05,
"loss": 0.0412,
"step": 3620
},
{
"epoch": 0.8503162333099087,
"grad_norm": 0.37455546855926514,
"learning_rate": 7.552319651072164e-05,
"loss": 0.0473,
"step": 3630
},
{
"epoch": 0.8526587022721949,
"grad_norm": 0.19339019060134888,
"learning_rate": 7.538087547932585e-05,
"loss": 0.0475,
"step": 3640
},
{
"epoch": 0.8550011712344812,
"grad_norm": 0.22095589339733124,
"learning_rate": 7.52382768867422e-05,
"loss": 0.0287,
"step": 3650
},
{
"epoch": 0.8573436401967673,
"grad_norm": 0.39905375242233276,
"learning_rate": 7.509540229240601e-05,
"loss": 0.0418,
"step": 3660
},
{
"epoch": 0.8596861091590536,
"grad_norm": 0.1556907296180725,
"learning_rate": 7.495225325877103e-05,
"loss": 0.0462,
"step": 3670
},
{
"epoch": 0.8620285781213399,
"grad_norm": 0.43170592188835144,
"learning_rate": 7.480883135129211e-05,
"loss": 0.0453,
"step": 3680
},
{
"epoch": 0.8643710470836261,
"grad_norm": 0.09220433235168457,
"learning_rate": 7.466513813840825e-05,
"loss": 0.0414,
"step": 3690
},
{
"epoch": 0.8667135160459124,
"grad_norm": 0.09303878992795944,
"learning_rate": 7.452117519152542e-05,
"loss": 0.0412,
"step": 3700
},
{
"epoch": 0.8690559850081987,
"grad_norm": 0.456315279006958,
"learning_rate": 7.437694408499933e-05,
"loss": 0.0429,
"step": 3710
},
{
"epoch": 0.8713984539704849,
"grad_norm": 0.0672278180718422,
"learning_rate": 7.423244639611826e-05,
"loss": 0.0492,
"step": 3720
},
{
"epoch": 0.8737409229327712,
"grad_norm": 0.11052095890045166,
"learning_rate": 7.408768370508576e-05,
"loss": 0.0404,
"step": 3730
},
{
"epoch": 0.8760833918950574,
"grad_norm": 0.20042133331298828,
"learning_rate": 7.394265759500348e-05,
"loss": 0.0597,
"step": 3740
},
{
"epoch": 0.8784258608573436,
"grad_norm": 0.3536411225795746,
"learning_rate": 7.379736965185368e-05,
"loss": 0.0212,
"step": 3750
},
{
"epoch": 0.8807683298196299,
"grad_norm": 0.28125354647636414,
"learning_rate": 7.365182146448205e-05,
"loss": 0.052,
"step": 3760
},
{
"epoch": 0.8831107987819161,
"grad_norm": 0.12258744984865189,
"learning_rate": 7.350601462458024e-05,
"loss": 0.02,
"step": 3770
},
{
"epoch": 0.8854532677442024,
"grad_norm": 0.5056569576263428,
"learning_rate": 7.335995072666848e-05,
"loss": 0.035,
"step": 3780
},
{
"epoch": 0.8877957367064886,
"grad_norm": 0.2552855610847473,
"learning_rate": 7.32136313680782e-05,
"loss": 0.0421,
"step": 3790
},
{
"epoch": 0.8901382056687749,
"grad_norm": 0.05761013180017471,
"learning_rate": 7.30670581489344e-05,
"loss": 0.0414,
"step": 3800
},
{
"epoch": 0.8924806746310612,
"grad_norm": 0.9745859503746033,
"learning_rate": 7.292023267213835e-05,
"loss": 0.0725,
"step": 3810
},
{
"epoch": 0.8948231435933474,
"grad_norm": 0.2608197033405304,
"learning_rate": 7.277315654334997e-05,
"loss": 0.0405,
"step": 3820
},
{
"epoch": 0.8971656125556337,
"grad_norm": 0.3153429329395294,
"learning_rate": 7.262583137097018e-05,
"loss": 0.0407,
"step": 3830
},
{
"epoch": 0.8995080815179198,
"grad_norm": 0.5415343642234802,
"learning_rate": 7.247825876612353e-05,
"loss": 0.0389,
"step": 3840
},
{
"epoch": 0.9018505504802061,
"grad_norm": 0.4772924482822418,
"learning_rate": 7.233044034264034e-05,
"loss": 0.055,
"step": 3850
},
{
"epoch": 0.9041930194424924,
"grad_norm": 0.41308316588401794,
"learning_rate": 7.218237771703921e-05,
"loss": 0.0578,
"step": 3860
},
{
"epoch": 0.9065354884047786,
"grad_norm": 0.0859963595867157,
"learning_rate": 7.203407250850928e-05,
"loss": 0.0328,
"step": 3870
},
{
"epoch": 0.9088779573670649,
"grad_norm": 0.4168371856212616,
"learning_rate": 7.188552633889259e-05,
"loss": 0.0493,
"step": 3880
},
{
"epoch": 0.9112204263293512,
"grad_norm": 0.42193326354026794,
"learning_rate": 7.173674083266624e-05,
"loss": 0.052,
"step": 3890
},
{
"epoch": 0.9135628952916374,
"grad_norm": 0.11540161073207855,
"learning_rate": 7.158771761692464e-05,
"loss": 0.0616,
"step": 3900
},
{
"epoch": 0.9159053642539237,
"grad_norm": 0.1789163500070572,
"learning_rate": 7.143845832136188e-05,
"loss": 0.0315,
"step": 3910
},
{
"epoch": 0.9182478332162098,
"grad_norm": 0.2873396873474121,
"learning_rate": 7.128896457825364e-05,
"loss": 0.0577,
"step": 3920
},
{
"epoch": 0.9205903021784961,
"grad_norm": 0.035885997116565704,
"learning_rate": 7.113923802243957e-05,
"loss": 0.0462,
"step": 3930
},
{
"epoch": 0.9229327711407824,
"grad_norm": 0.380929172039032,
"learning_rate": 7.09892802913053e-05,
"loss": 0.0285,
"step": 3940
},
{
"epoch": 0.9252752401030686,
"grad_norm": 0.21406327188014984,
"learning_rate": 7.083909302476453e-05,
"loss": 0.0255,
"step": 3950
},
{
"epoch": 0.9276177090653549,
"grad_norm": 0.04998482018709183,
"learning_rate": 7.068867786524116e-05,
"loss": 0.0285,
"step": 3960
},
{
"epoch": 0.9299601780276411,
"grad_norm": 0.19604696333408356,
"learning_rate": 7.053803645765128e-05,
"loss": 0.0345,
"step": 3970
},
{
"epoch": 0.9323026469899274,
"grad_norm": 0.6424615979194641,
"learning_rate": 7.038717044938519e-05,
"loss": 0.0411,
"step": 3980
},
{
"epoch": 0.9346451159522137,
"grad_norm": 0.0754154697060585,
"learning_rate": 7.023608149028937e-05,
"loss": 0.0243,
"step": 3990
},
{
"epoch": 0.9369875849144998,
"grad_norm": 0.26757097244262695,
"learning_rate": 7.008477123264848e-05,
"loss": 0.0414,
"step": 4000
},
{
"epoch": 0.9393300538767861,
"grad_norm": 0.14239585399627686,
"learning_rate": 6.993324133116726e-05,
"loss": 0.0259,
"step": 4010
},
{
"epoch": 0.9416725228390724,
"grad_norm": 0.12988215684890747,
"learning_rate": 6.978149344295242e-05,
"loss": 0.0279,
"step": 4020
},
{
"epoch": 0.9440149918013586,
"grad_norm": 0.3678188920021057,
"learning_rate": 6.962952922749457e-05,
"loss": 0.0353,
"step": 4030
},
{
"epoch": 0.9463574607636449,
"grad_norm": 0.6559092402458191,
"learning_rate": 6.947735034665002e-05,
"loss": 0.0558,
"step": 4040
},
{
"epoch": 0.9486999297259311,
"grad_norm": 0.607363760471344,
"learning_rate": 6.932495846462261e-05,
"loss": 0.0459,
"step": 4050
},
{
"epoch": 0.9510423986882174,
"grad_norm": 0.22406215965747833,
"learning_rate": 6.917235524794558e-05,
"loss": 0.0412,
"step": 4060
},
{
"epoch": 0.9533848676505037,
"grad_norm": 0.2519318461418152,
"learning_rate": 6.901954236546323e-05,
"loss": 0.0355,
"step": 4070
},
{
"epoch": 0.9557273366127899,
"grad_norm": 0.40484338998794556,
"learning_rate": 6.886652148831279e-05,
"loss": 0.0446,
"step": 4080
},
{
"epoch": 0.9580698055750761,
"grad_norm": 0.36861318349838257,
"learning_rate": 6.871329428990602e-05,
"loss": 0.0324,
"step": 4090
},
{
"epoch": 0.9604122745373623,
"grad_norm": 0.15483994781970978,
"learning_rate": 6.855986244591104e-05,
"loss": 0.0265,
"step": 4100
},
{
"epoch": 0.9627547434996486,
"grad_norm": 0.12822240591049194,
"learning_rate": 6.840622763423391e-05,
"loss": 0.0251,
"step": 4110
},
{
"epoch": 0.9650972124619349,
"grad_norm": 0.2436823546886444,
"learning_rate": 6.825239153500029e-05,
"loss": 0.0354,
"step": 4120
},
{
"epoch": 0.9674396814242211,
"grad_norm": 0.11992768943309784,
"learning_rate": 6.809835583053715e-05,
"loss": 0.0355,
"step": 4130
},
{
"epoch": 0.9697821503865074,
"grad_norm": 0.05282627418637276,
"learning_rate": 6.794412220535426e-05,
"loss": 0.0325,
"step": 4140
},
{
"epoch": 0.9721246193487937,
"grad_norm": 0.1702210009098053,
"learning_rate": 6.778969234612584e-05,
"loss": 0.0421,
"step": 4150
},
{
"epoch": 0.9744670883110799,
"grad_norm": 0.30918455123901367,
"learning_rate": 6.763506794167208e-05,
"loss": 0.0306,
"step": 4160
},
{
"epoch": 0.9768095572733662,
"grad_norm": 0.18471957743167877,
"learning_rate": 6.748025068294067e-05,
"loss": 0.026,
"step": 4170
},
{
"epoch": 0.9791520262356523,
"grad_norm": 0.2867111265659332,
"learning_rate": 6.732524226298841e-05,
"loss": 0.0368,
"step": 4180
},
{
"epoch": 0.9814944951979386,
"grad_norm": 0.5615723729133606,
"learning_rate": 6.71700443769625e-05,
"loss": 0.0374,
"step": 4190
},
{
"epoch": 0.9838369641602249,
"grad_norm": 0.06628378480672836,
"learning_rate": 6.701465872208216e-05,
"loss": 0.0432,
"step": 4200
},
{
"epoch": 0.9861794331225111,
"grad_norm": 0.24212607741355896,
"learning_rate": 6.685908699762002e-05,
"loss": 0.0446,
"step": 4210
},
{
"epoch": 0.9885219020847974,
"grad_norm": 0.1411833018064499,
"learning_rate": 6.670333090488356e-05,
"loss": 0.0281,
"step": 4220
},
{
"epoch": 0.9908643710470836,
"grad_norm": 0.4957182705402374,
"learning_rate": 6.654739214719641e-05,
"loss": 0.0385,
"step": 4230
},
{
"epoch": 0.9932068400093699,
"grad_norm": 0.2773032486438751,
"learning_rate": 6.639127242987988e-05,
"loss": 0.0351,
"step": 4240
},
{
"epoch": 0.9955493089716562,
"grad_norm": 0.6347845196723938,
"learning_rate": 6.623497346023418e-05,
"loss": 0.0519,
"step": 4250
},
{
"epoch": 0.9978917779339423,
"grad_norm": 0.39392927289009094,
"learning_rate": 6.607849694751977e-05,
"loss": 0.0415,
"step": 4260
},
{
"epoch": 1.0002342468962286,
"grad_norm": 0.12185105681419373,
"learning_rate": 6.592184460293877e-05,
"loss": 0.0413,
"step": 4270
},
{
"epoch": 1.0025767158585148,
"grad_norm": 0.4016129970550537,
"learning_rate": 6.576501813961609e-05,
"loss": 0.0473,
"step": 4280
},
{
"epoch": 1.0049191848208012,
"grad_norm": 0.10202305018901825,
"learning_rate": 6.56080192725808e-05,
"loss": 0.0476,
"step": 4290
},
{
"epoch": 1.0072616537830874,
"grad_norm": 0.08643211424350739,
"learning_rate": 6.545084971874738e-05,
"loss": 0.0363,
"step": 4300
},
{
"epoch": 1.0096041227453736,
"grad_norm": 0.4279628396034241,
"learning_rate": 6.529351119689688e-05,
"loss": 0.0343,
"step": 4310
},
{
"epoch": 1.0119465917076598,
"grad_norm": 0.0435931533575058,
"learning_rate": 6.513600542765817e-05,
"loss": 0.0363,
"step": 4320
},
{
"epoch": 1.0142890606699462,
"grad_norm": 0.11314094811677933,
"learning_rate": 6.497833413348909e-05,
"loss": 0.0409,
"step": 4330
},
{
"epoch": 1.0166315296322324,
"grad_norm": 0.049418941140174866,
"learning_rate": 6.48204990386577e-05,
"loss": 0.027,
"step": 4340
},
{
"epoch": 1.0189739985945185,
"grad_norm": 0.0937579795718193,
"learning_rate": 6.466250186922325e-05,
"loss": 0.0386,
"step": 4350
},
{
"epoch": 1.021316467556805,
"grad_norm": 0.17256158590316772,
"learning_rate": 6.450434435301751e-05,
"loss": 0.0283,
"step": 4360
},
{
"epoch": 1.0236589365190911,
"grad_norm": 0.41623151302337646,
"learning_rate": 6.43460282196257e-05,
"loss": 0.0309,
"step": 4370
},
{
"epoch": 1.0260014054813773,
"grad_norm": 0.25574249029159546,
"learning_rate": 6.418755520036775e-05,
"loss": 0.017,
"step": 4380
},
{
"epoch": 1.0283438744436637,
"grad_norm": 0.12465788424015045,
"learning_rate": 6.402892702827916e-05,
"loss": 0.028,
"step": 4390
},
{
"epoch": 1.03068634340595,
"grad_norm": 0.2367735058069229,
"learning_rate": 6.387014543809223e-05,
"loss": 0.0288,
"step": 4400
},
{
"epoch": 1.033028812368236,
"grad_norm": 0.15218676626682281,
"learning_rate": 6.371121216621698e-05,
"loss": 0.0414,
"step": 4410
},
{
"epoch": 1.0353712813305225,
"grad_norm": 0.09345823526382446,
"learning_rate": 6.355212895072223e-05,
"loss": 0.0348,
"step": 4420
},
{
"epoch": 1.0377137502928087,
"grad_norm": 0.25038620829582214,
"learning_rate": 6.339289753131649e-05,
"loss": 0.0472,
"step": 4430
},
{
"epoch": 1.0400562192550948,
"grad_norm": 0.5955792665481567,
"learning_rate": 6.323351964932908e-05,
"loss": 0.0612,
"step": 4440
},
{
"epoch": 1.042398688217381,
"grad_norm": 0.10471931844949722,
"learning_rate": 6.307399704769099e-05,
"loss": 0.0319,
"step": 4450
},
{
"epoch": 1.0447411571796674,
"grad_norm": 0.3728072941303253,
"learning_rate": 6.291433147091583e-05,
"loss": 0.0346,
"step": 4460
},
{
"epoch": 1.0470836261419536,
"grad_norm": 0.13940206170082092,
"learning_rate": 6.275452466508077e-05,
"loss": 0.0315,
"step": 4470
},
{
"epoch": 1.0494260951042398,
"grad_norm": 0.24892286956310272,
"learning_rate": 6.259457837780742e-05,
"loss": 0.0271,
"step": 4480
},
{
"epoch": 1.0517685640665262,
"grad_norm": 0.09227164089679718,
"learning_rate": 6.243449435824276e-05,
"loss": 0.035,
"step": 4490
},
{
"epoch": 1.0541110330288124,
"grad_norm": 0.4062785804271698,
"learning_rate": 6.227427435703997e-05,
"loss": 0.0381,
"step": 4500
},
{
"epoch": 1.0564535019910986,
"grad_norm": 0.10490421950817108,
"learning_rate": 6.211392012633932e-05,
"loss": 0.0424,
"step": 4510
},
{
"epoch": 1.058795970953385,
"grad_norm": 0.08822830021381378,
"learning_rate": 6.195343341974899e-05,
"loss": 0.0484,
"step": 4520
},
{
"epoch": 1.0611384399156711,
"grad_norm": 0.22914232313632965,
"learning_rate": 6.179281599232591e-05,
"loss": 0.0388,
"step": 4530
},
{
"epoch": 1.0634809088779573,
"grad_norm": 0.6712221503257751,
"learning_rate": 6.163206960055651e-05,
"loss": 0.0853,
"step": 4540
},
{
"epoch": 1.0658233778402435,
"grad_norm": 0.2438327521085739,
"learning_rate": 6.147119600233758e-05,
"loss": 0.0177,
"step": 4550
},
{
"epoch": 1.06816584680253,
"grad_norm": 0.45352616906166077,
"learning_rate": 6.131019695695702e-05,
"loss": 0.0798,
"step": 4560
},
{
"epoch": 1.070508315764816,
"grad_norm": 0.17237244546413422,
"learning_rate": 6.11490742250746e-05,
"loss": 0.037,
"step": 4570
},
{
"epoch": 1.0728507847271023,
"grad_norm": 0.7011030316352844,
"learning_rate": 6.0987829568702656e-05,
"loss": 0.0549,
"step": 4580
},
{
"epoch": 1.0751932536893887,
"grad_norm": 0.14807315170764923,
"learning_rate": 6.0826464751186994e-05,
"loss": 0.0483,
"step": 4590
},
{
"epoch": 1.0775357226516749,
"grad_norm": 0.42932969331741333,
"learning_rate": 6.066498153718735e-05,
"loss": 0.0388,
"step": 4600
},
{
"epoch": 1.079878191613961,
"grad_norm": 0.13377119600772858,
"learning_rate": 6.05033816926583e-05,
"loss": 0.0464,
"step": 4610
},
{
"epoch": 1.0822206605762474,
"grad_norm": 0.13043726980686188,
"learning_rate": 6.034166698482984e-05,
"loss": 0.0234,
"step": 4620
},
{
"epoch": 1.0845631295385336,
"grad_norm": 0.23946554958820343,
"learning_rate": 6.017983918218812e-05,
"loss": 0.0415,
"step": 4630
},
{
"epoch": 1.0869055985008198,
"grad_norm": 0.11139467358589172,
"learning_rate": 6.001790005445607e-05,
"loss": 0.0397,
"step": 4640
},
{
"epoch": 1.0892480674631062,
"grad_norm": 0.1447746455669403,
"learning_rate": 5.985585137257401e-05,
"loss": 0.0293,
"step": 4650
},
{
"epoch": 1.0915905364253924,
"grad_norm": 0.45925086736679077,
"learning_rate": 5.969369490868042e-05,
"loss": 0.03,
"step": 4660
},
{
"epoch": 1.0939330053876786,
"grad_norm": 0.2177567183971405,
"learning_rate": 5.953143243609235e-05,
"loss": 0.042,
"step": 4670
},
{
"epoch": 1.096275474349965,
"grad_norm": 0.20075875520706177,
"learning_rate": 5.9369065729286245e-05,
"loss": 0.0384,
"step": 4680
},
{
"epoch": 1.0986179433122512,
"grad_norm": 0.16894571483135223,
"learning_rate": 5.9206596563878357e-05,
"loss": 0.0308,
"step": 4690
},
{
"epoch": 1.1009604122745373,
"grad_norm": 0.09761305898427963,
"learning_rate": 5.90440267166055e-05,
"loss": 0.0244,
"step": 4700
},
{
"epoch": 1.1033028812368235,
"grad_norm": 0.04163440316915512,
"learning_rate": 5.888135796530544e-05,
"loss": 0.0191,
"step": 4710
},
{
"epoch": 1.10564535019911,
"grad_norm": 0.27570199966430664,
"learning_rate": 5.871859208889759e-05,
"loss": 0.0222,
"step": 4720
},
{
"epoch": 1.107987819161396,
"grad_norm": 0.2948501706123352,
"learning_rate": 5.85557308673635e-05,
"loss": 0.0442,
"step": 4730
},
{
"epoch": 1.1103302881236823,
"grad_norm": 0.26524093747138977,
"learning_rate": 5.8392776081727385e-05,
"loss": 0.0347,
"step": 4740
},
{
"epoch": 1.1126727570859687,
"grad_norm": 0.26801493763923645,
"learning_rate": 5.8229729514036705e-05,
"loss": 0.0299,
"step": 4750
},
{
"epoch": 1.1150152260482549,
"grad_norm": 0.0498003289103508,
"learning_rate": 5.8066592947342555e-05,
"loss": 0.0289,
"step": 4760
},
{
"epoch": 1.117357695010541,
"grad_norm": 0.2827109694480896,
"learning_rate": 5.7903368165680327e-05,
"loss": 0.0328,
"step": 4770
},
{
"epoch": 1.1197001639728275,
"grad_norm": 0.18607333302497864,
"learning_rate": 5.7740056954050084e-05,
"loss": 0.0277,
"step": 4780
},
{
"epoch": 1.1220426329351136,
"grad_norm": 0.10899386554956436,
"learning_rate": 5.757666109839702e-05,
"loss": 0.0397,
"step": 4790
},
{
"epoch": 1.1243851018973998,
"grad_norm": 0.9352733492851257,
"learning_rate": 5.74131823855921e-05,
"loss": 0.0801,
"step": 4800
},
{
"epoch": 1.126727570859686,
"grad_norm": 0.15164723992347717,
"learning_rate": 5.72496226034123e-05,
"loss": 0.0572,
"step": 4810
},
{
"epoch": 1.1290700398219724,
"grad_norm": 0.06457802653312683,
"learning_rate": 5.7085983540521216e-05,
"loss": 0.041,
"step": 4820
},
{
"epoch": 1.1314125087842586,
"grad_norm": 0.13067546486854553,
"learning_rate": 5.692226698644938e-05,
"loss": 0.0345,
"step": 4830
},
{
"epoch": 1.1337549777465448,
"grad_norm": 0.4330101013183594,
"learning_rate": 5.675847473157485e-05,
"loss": 0.0436,
"step": 4840
},
{
"epoch": 1.1360974467088312,
"grad_norm": 0.41848742961883545,
"learning_rate": 5.6594608567103456e-05,
"loss": 0.0216,
"step": 4850
},
{
"epoch": 1.1384399156711174,
"grad_norm": 0.13505397737026215,
"learning_rate": 5.6430670285049314e-05,
"loss": 0.0305,
"step": 4860
},
{
"epoch": 1.1407823846334035,
"grad_norm": 0.4569176435470581,
"learning_rate": 5.6266661678215216e-05,
"loss": 0.0324,
"step": 4870
},
{
"epoch": 1.14312485359569,
"grad_norm": 0.4705914556980133,
"learning_rate": 5.6102584540173006e-05,
"loss": 0.0478,
"step": 4880
},
{
"epoch": 1.1454673225579761,
"grad_norm": 0.276143342256546,
"learning_rate": 5.5938440665244006e-05,
"loss": 0.0578,
"step": 4890
},
{
"epoch": 1.1478097915202623,
"grad_norm": 0.3393331468105316,
"learning_rate": 5.577423184847932e-05,
"loss": 0.0507,
"step": 4900
},
{
"epoch": 1.1501522604825487,
"grad_norm": 0.18119889497756958,
"learning_rate": 5.560995988564023e-05,
"loss": 0.0197,
"step": 4910
},
{
"epoch": 1.1524947294448349,
"grad_norm": 0.0739196389913559,
"learning_rate": 5.544562657317863e-05,
"loss": 0.0297,
"step": 4920
},
{
"epoch": 1.154837198407121,
"grad_norm": 0.22677703201770782,
"learning_rate": 5.52812337082173e-05,
"loss": 0.0407,
"step": 4930
},
{
"epoch": 1.1571796673694075,
"grad_norm": 0.054532766342163086,
"learning_rate": 5.511678308853026e-05,
"loss": 0.0448,
"step": 4940
},
{
"epoch": 1.1595221363316937,
"grad_norm": 0.45871463418006897,
"learning_rate": 5.495227651252315e-05,
"loss": 0.0316,
"step": 4950
},
{
"epoch": 1.1618646052939798,
"grad_norm": 0.09669110924005508,
"learning_rate": 5.478771577921351e-05,
"loss": 0.0404,
"step": 4960
},
{
"epoch": 1.164207074256266,
"grad_norm": 0.1810620278120041,
"learning_rate": 5.462310268821118e-05,
"loss": 0.0233,
"step": 4970
},
{
"epoch": 1.1665495432185524,
"grad_norm": 0.10690245032310486,
"learning_rate": 5.445843903969854e-05,
"loss": 0.033,
"step": 4980
},
{
"epoch": 1.1688920121808386,
"grad_norm": 0.3685993552207947,
"learning_rate": 5.4293726634410855e-05,
"loss": 0.0204,
"step": 4990
},
{
"epoch": 1.1712344811431248,
"grad_norm": 0.17481215298175812,
"learning_rate": 5.4128967273616625e-05,
"loss": 0.0269,
"step": 5000
},
{
"epoch": 1.1735769501054112,
"grad_norm": 0.6450178027153015,
"learning_rate": 5.396416275909779e-05,
"loss": 0.052,
"step": 5010
},
{
"epoch": 1.1759194190676974,
"grad_norm": 0.0964297205209732,
"learning_rate": 5.379931489313016e-05,
"loss": 0.0299,
"step": 5020
},
{
"epoch": 1.1782618880299836,
"grad_norm": 0.06013895943760872,
"learning_rate": 5.363442547846356e-05,
"loss": 0.0334,
"step": 5030
},
{
"epoch": 1.1806043569922697,
"grad_norm": 0.032787106931209564,
"learning_rate": 5.3469496318302204e-05,
"loss": 0.0506,
"step": 5040
},
{
"epoch": 1.1829468259545561,
"grad_norm": 0.3833360970020294,
"learning_rate": 5.330452921628497e-05,
"loss": 0.0331,
"step": 5050
},
{
"epoch": 1.1852892949168423,
"grad_norm": 0.08078952878713608,
"learning_rate": 5.313952597646568e-05,
"loss": 0.0171,
"step": 5060
},
{
"epoch": 1.1876317638791285,
"grad_norm": 0.09187212586402893,
"learning_rate": 5.297448840329329e-05,
"loss": 0.0195,
"step": 5070
},
{
"epoch": 1.189974232841415,
"grad_norm": 0.2530211806297302,
"learning_rate": 5.280941830159227e-05,
"loss": 0.0219,
"step": 5080
},
{
"epoch": 1.192316701803701,
"grad_norm": 0.059026945382356644,
"learning_rate": 5.264431747654284e-05,
"loss": 0.0362,
"step": 5090
},
{
"epoch": 1.1946591707659873,
"grad_norm": 0.04210277274250984,
"learning_rate": 5.247918773366112e-05,
"loss": 0.0314,
"step": 5100
},
{
"epoch": 1.1970016397282737,
"grad_norm": 0.4919138550758362,
"learning_rate": 5.231403087877955e-05,
"loss": 0.0335,
"step": 5110
},
{
"epoch": 1.1993441086905599,
"grad_norm": 0.06546583771705627,
"learning_rate": 5.214884871802703e-05,
"loss": 0.0223,
"step": 5120
},
{
"epoch": 1.201686577652846,
"grad_norm": 0.08152215927839279,
"learning_rate": 5.198364305780922e-05,
"loss": 0.0316,
"step": 5130
},
{
"epoch": 1.2040290466151324,
"grad_norm": 0.2411283552646637,
"learning_rate": 5.1818415704788725e-05,
"loss": 0.0669,
"step": 5140
},
{
"epoch": 1.2063715155774186,
"grad_norm": 0.49666517972946167,
"learning_rate": 5.165316846586541e-05,
"loss": 0.041,
"step": 5150
},
{
"epoch": 1.2087139845397048,
"grad_norm": 0.08363020420074463,
"learning_rate": 5.148790314815663e-05,
"loss": 0.0209,
"step": 5160
},
{
"epoch": 1.2110564535019912,
"grad_norm": 0.04317115619778633,
"learning_rate": 5.132262155897739e-05,
"loss": 0.0367,
"step": 5170
},
{
"epoch": 1.2133989224642774,
"grad_norm": 0.1066800057888031,
"learning_rate": 5.1157325505820694e-05,
"loss": 0.0399,
"step": 5180
},
{
"epoch": 1.2157413914265636,
"grad_norm": 0.17649437487125397,
"learning_rate": 5.0992016796337686e-05,
"loss": 0.0236,
"step": 5190
},
{
"epoch": 1.21808386038885,
"grad_norm": 0.14966139197349548,
"learning_rate": 5.0826697238317935e-05,
"loss": 0.0195,
"step": 5200
},
{
"epoch": 1.2204263293511362,
"grad_norm": 0.03593892604112625,
"learning_rate": 5.066136863966963e-05,
"loss": 0.0202,
"step": 5210
},
{
"epoch": 1.2227687983134223,
"grad_norm": 0.46276217699050903,
"learning_rate": 5.0496032808399815e-05,
"loss": 0.0464,
"step": 5220
},
{
"epoch": 1.2251112672757085,
"grad_norm": 0.21946477890014648,
"learning_rate": 5.033069155259471e-05,
"loss": 0.0301,
"step": 5230
},
{
"epoch": 1.227453736237995,
"grad_norm": 0.08784784376621246,
"learning_rate": 5.016534668039976e-05,
"loss": 0.0316,
"step": 5240
},
{
"epoch": 1.229796205200281,
"grad_norm": 0.1410629153251648,
"learning_rate": 5e-05,
"loss": 0.0263,
"step": 5250
},
{
"epoch": 1.2321386741625673,
"grad_norm": 0.07868409156799316,
"learning_rate": 4.9834653319600246e-05,
"loss": 0.0213,
"step": 5260
},
{
"epoch": 1.2344811431248537,
"grad_norm": 0.215213343501091,
"learning_rate": 4.96693084474053e-05,
"loss": 0.0457,
"step": 5270
},
{
"epoch": 1.2368236120871399,
"grad_norm": 0.16864515841007233,
"learning_rate": 4.950396719160018e-05,
"loss": 0.0336,
"step": 5280
},
{
"epoch": 1.239166081049426,
"grad_norm": 0.0474487841129303,
"learning_rate": 4.93386313603304e-05,
"loss": 0.0227,
"step": 5290
},
{
"epoch": 1.2415085500117122,
"grad_norm": 0.5898747444152832,
"learning_rate": 4.917330276168208e-05,
"loss": 0.0165,
"step": 5300
},
{
"epoch": 1.2438510189739986,
"grad_norm": 0.4065062403678894,
"learning_rate": 4.9007983203662326e-05,
"loss": 0.0271,
"step": 5310
},
{
"epoch": 1.2461934879362848,
"grad_norm": 0.19243858754634857,
"learning_rate": 4.884267449417931e-05,
"loss": 0.0222,
"step": 5320
},
{
"epoch": 1.248535956898571,
"grad_norm": 0.14905819296836853,
"learning_rate": 4.867737844102261e-05,
"loss": 0.0183,
"step": 5330
},
{
"epoch": 1.2508784258608574,
"grad_norm": 0.28917795419692993,
"learning_rate": 4.851209685184338e-05,
"loss": 0.0194,
"step": 5340
},
{
"epoch": 1.2532208948231436,
"grad_norm": 0.3423207104206085,
"learning_rate": 4.834683153413459e-05,
"loss": 0.0281,
"step": 5350
},
{
"epoch": 1.2555633637854298,
"grad_norm": 0.04684186726808548,
"learning_rate": 4.818158429521129e-05,
"loss": 0.0266,
"step": 5360
},
{
"epoch": 1.2579058327477162,
"grad_norm": 0.27714163064956665,
"learning_rate": 4.801635694219079e-05,
"loss": 0.0468,
"step": 5370
},
{
"epoch": 1.2602483017100023,
"grad_norm": 0.1844978630542755,
"learning_rate": 4.785115128197298e-05,
"loss": 0.0392,
"step": 5380
},
{
"epoch": 1.2625907706722885,
"grad_norm": 0.36138930916786194,
"learning_rate": 4.7685969121220456e-05,
"loss": 0.029,
"step": 5390
},
{
"epoch": 1.264933239634575,
"grad_norm": 0.3211914896965027,
"learning_rate": 4.7520812266338885e-05,
"loss": 0.0611,
"step": 5400
},
{
"epoch": 1.2672757085968611,
"grad_norm": 0.5163668990135193,
"learning_rate": 4.735568252345718e-05,
"loss": 0.0481,
"step": 5410
},
{
"epoch": 1.2696181775591473,
"grad_norm": 0.5117266178131104,
"learning_rate": 4.7190581698407725e-05,
"loss": 0.0326,
"step": 5420
},
{
"epoch": 1.2719606465214337,
"grad_norm": 0.24475805461406708,
"learning_rate": 4.702551159670672e-05,
"loss": 0.0229,
"step": 5430
},
{
"epoch": 1.2743031154837199,
"grad_norm": 0.07154544442892075,
"learning_rate": 4.6860474023534335e-05,
"loss": 0.042,
"step": 5440
},
{
"epoch": 1.276645584446006,
"grad_norm": 0.28115877509117126,
"learning_rate": 4.669547078371504e-05,
"loss": 0.0249,
"step": 5450
},
{
"epoch": 1.2789880534082925,
"grad_norm": 0.22904540598392487,
"learning_rate": 4.65305036816978e-05,
"loss": 0.0339,
"step": 5460
},
{
"epoch": 1.2813305223705787,
"grad_norm": 0.11327308416366577,
"learning_rate": 4.6365574521536445e-05,
"loss": 0.0175,
"step": 5470
},
{
"epoch": 1.2836729913328648,
"grad_norm": 0.1697210669517517,
"learning_rate": 4.620068510686985e-05,
"loss": 0.0362,
"step": 5480
},
{
"epoch": 1.286015460295151,
"grad_norm": 0.08553613722324371,
"learning_rate": 4.60358372409022e-05,
"loss": 0.0159,
"step": 5490
},
{
"epoch": 1.2883579292574374,
"grad_norm": 0.07890176773071289,
"learning_rate": 4.5871032726383386e-05,
"loss": 0.0288,
"step": 5500
},
{
"epoch": 1.2907003982197236,
"grad_norm": 0.33075398206710815,
"learning_rate": 4.570627336558915e-05,
"loss": 0.0262,
"step": 5510
},
{
"epoch": 1.2930428671820098,
"grad_norm": 0.09929897636175156,
"learning_rate": 4.554156096030149e-05,
"loss": 0.0231,
"step": 5520
},
{
"epoch": 1.295385336144296,
"grad_norm": 0.1128670945763588,
"learning_rate": 4.537689731178883e-05,
"loss": 0.0201,
"step": 5530
},
{
"epoch": 1.2977278051065824,
"grad_norm": 0.05418454855680466,
"learning_rate": 4.5212284220786494e-05,
"loss": 0.0404,
"step": 5540
},
{
"epoch": 1.3000702740688685,
"grad_norm": 0.1747845560312271,
"learning_rate": 4.504772348747687e-05,
"loss": 0.0324,
"step": 5550
},
{
"epoch": 1.3024127430311547,
"grad_norm": 0.6264855265617371,
"learning_rate": 4.488321691146975e-05,
"loss": 0.0607,
"step": 5560
},
{
"epoch": 1.3047552119934411,
"grad_norm": 0.20012634992599487,
"learning_rate": 4.471876629178273e-05,
"loss": 0.0336,
"step": 5570
},
{
"epoch": 1.3070976809557273,
"grad_norm": 0.07151951640844345,
"learning_rate": 4.4554373426821374e-05,
"loss": 0.0199,
"step": 5580
},
{
"epoch": 1.3094401499180135,
"grad_norm": 0.09090318530797958,
"learning_rate": 4.439004011435979e-05,
"loss": 0.0263,
"step": 5590
},
{
"epoch": 1.3117826188803,
"grad_norm": 0.09504502266645432,
"learning_rate": 4.4225768151520694e-05,
"loss": 0.038,
"step": 5600
},
{
"epoch": 1.314125087842586,
"grad_norm": 0.19809271395206451,
"learning_rate": 4.406155933475599e-05,
"loss": 0.0376,
"step": 5610
},
{
"epoch": 1.3164675568048723,
"grad_norm": 0.2558313012123108,
"learning_rate": 4.3897415459827e-05,
"loss": 0.043,
"step": 5620
},
{
"epoch": 1.3188100257671587,
"grad_norm": 0.08637325465679169,
"learning_rate": 4.373333832178478e-05,
"loss": 0.0341,
"step": 5630
},
{
"epoch": 1.3211524947294448,
"grad_norm": 0.06880134344100952,
"learning_rate": 4.3569329714950704e-05,
"loss": 0.0229,
"step": 5640
},
{
"epoch": 1.323494963691731,
"grad_norm": 0.16358880698680878,
"learning_rate": 4.3405391432896555e-05,
"loss": 0.0387,
"step": 5650
},
{
"epoch": 1.3258374326540174,
"grad_norm": 0.05642487108707428,
"learning_rate": 4.324152526842517e-05,
"loss": 0.0291,
"step": 5660
},
{
"epoch": 1.3281799016163036,
"grad_norm": 0.13398276269435883,
"learning_rate": 4.307773301355062e-05,
"loss": 0.0449,
"step": 5670
},
{
"epoch": 1.3305223705785898,
"grad_norm": 0.41730400919914246,
"learning_rate": 4.291401645947879e-05,
"loss": 0.0336,
"step": 5680
},
{
"epoch": 1.3328648395408762,
"grad_norm": 0.1082252785563469,
"learning_rate": 4.275037739658771e-05,
"loss": 0.0159,
"step": 5690
},
{
"epoch": 1.3352073085031624,
"grad_norm": 0.5044443607330322,
"learning_rate": 4.2586817614407895e-05,
"loss": 0.026,
"step": 5700
},
{
"epoch": 1.3375497774654486,
"grad_norm": 0.19207948446273804,
"learning_rate": 4.2423338901602985e-05,
"loss": 0.0205,
"step": 5710
},
{
"epoch": 1.339892246427735,
"grad_norm": 0.14319565892219543,
"learning_rate": 4.2259943045949934e-05,
"loss": 0.0174,
"step": 5720
},
{
"epoch": 1.3422347153900211,
"grad_norm": 0.0638875886797905,
"learning_rate": 4.209663183431969e-05,
"loss": 0.0272,
"step": 5730
},
{
"epoch": 1.3445771843523073,
"grad_norm": 0.5683619379997253,
"learning_rate": 4.1933407052657456e-05,
"loss": 0.0265,
"step": 5740
},
{
"epoch": 1.3469196533145935,
"grad_norm": 0.1282253861427307,
"learning_rate": 4.17702704859633e-05,
"loss": 0.0285,
"step": 5750
},
{
"epoch": 1.3492621222768797,
"grad_norm": 0.2435198575258255,
"learning_rate": 4.160722391827262e-05,
"loss": 0.0348,
"step": 5760
},
{
"epoch": 1.351604591239166,
"grad_norm": 0.10652618855237961,
"learning_rate": 4.14442691326365e-05,
"loss": 0.0409,
"step": 5770
},
{
"epoch": 1.3539470602014523,
"grad_norm": 0.06271979957818985,
"learning_rate": 4.1281407911102425e-05,
"loss": 0.0559,
"step": 5780
},
{
"epoch": 1.3562895291637385,
"grad_norm": 0.05037263408303261,
"learning_rate": 4.111864203469457e-05,
"loss": 0.0263,
"step": 5790
},
{
"epoch": 1.3586319981260249,
"grad_norm": 0.2569263279438019,
"learning_rate": 4.095597328339452e-05,
"loss": 0.0259,
"step": 5800
},
{
"epoch": 1.360974467088311,
"grad_norm": 0.39117732644081116,
"learning_rate": 4.079340343612165e-05,
"loss": 0.0413,
"step": 5810
},
{
"epoch": 1.3633169360505972,
"grad_norm": 0.0529431588947773,
"learning_rate": 4.063093427071376e-05,
"loss": 0.0615,
"step": 5820
},
{
"epoch": 1.3656594050128836,
"grad_norm": 0.18688374757766724,
"learning_rate": 4.046856756390767e-05,
"loss": 0.0184,
"step": 5830
},
{
"epoch": 1.3680018739751698,
"grad_norm": 0.08132046461105347,
"learning_rate": 4.0306305091319595e-05,
"loss": 0.0203,
"step": 5840
},
{
"epoch": 1.370344342937456,
"grad_norm": 0.2862519323825836,
"learning_rate": 4.0144148627425993e-05,
"loss": 0.0497,
"step": 5850
},
{
"epoch": 1.3726868118997424,
"grad_norm": 0.12356792390346527,
"learning_rate": 3.9982099945543945e-05,
"loss": 0.0202,
"step": 5860
},
{
"epoch": 1.3750292808620286,
"grad_norm": 0.43368279933929443,
"learning_rate": 3.982016081781189e-05,
"loss": 0.0305,
"step": 5870
},
{
"epoch": 1.3773717498243148,
"grad_norm": 0.03974668309092522,
"learning_rate": 3.965833301517017e-05,
"loss": 0.0262,
"step": 5880
},
{
"epoch": 1.3797142187866012,
"grad_norm": 0.16461171209812164,
"learning_rate": 3.949661830734172e-05,
"loss": 0.0375,
"step": 5890
},
{
"epoch": 1.3820566877488873,
"grad_norm": 0.06088129058480263,
"learning_rate": 3.933501846281267e-05,
"loss": 0.0192,
"step": 5900
},
{
"epoch": 1.3843991567111735,
"grad_norm": 0.2690442204475403,
"learning_rate": 3.917353524881302e-05,
"loss": 0.0159,
"step": 5910
},
{
"epoch": 1.38674162567346,
"grad_norm": 0.09126674383878708,
"learning_rate": 3.901217043129735e-05,
"loss": 0.0334,
"step": 5920
},
{
"epoch": 1.3890840946357461,
"grad_norm": 0.11212047934532166,
"learning_rate": 3.8850925774925425e-05,
"loss": 0.0208,
"step": 5930
},
{
"epoch": 1.3914265635980323,
"grad_norm": 0.03019798919558525,
"learning_rate": 3.8689803043043e-05,
"loss": 0.0218,
"step": 5940
},
{
"epoch": 1.3937690325603187,
"grad_norm": 0.11158014088869095,
"learning_rate": 3.852880399766243e-05,
"loss": 0.0327,
"step": 5950
},
{
"epoch": 1.3961115015226049,
"grad_norm": 0.1466631442308426,
"learning_rate": 3.836793039944349e-05,
"loss": 0.0316,
"step": 5960
},
{
"epoch": 1.398453970484891,
"grad_norm": 0.04972492530941963,
"learning_rate": 3.820718400767409e-05,
"loss": 0.0204,
"step": 5970
},
{
"epoch": 1.4007964394471772,
"grad_norm": 0.18622121214866638,
"learning_rate": 3.8046566580251e-05,
"loss": 0.0446,
"step": 5980
},
{
"epoch": 1.4031389084094636,
"grad_norm": 0.4047488868236542,
"learning_rate": 3.788607987366069e-05,
"loss": 0.0422,
"step": 5990
},
{
"epoch": 1.4054813773717498,
"grad_norm": 0.043907005339860916,
"learning_rate": 3.772572564296005e-05,
"loss": 0.0279,
"step": 6000
},
{
"epoch": 1.407823846334036,
"grad_norm": 0.2679661214351654,
"learning_rate": 3.756550564175727e-05,
"loss": 0.0209,
"step": 6010
},
{
"epoch": 1.4101663152963222,
"grad_norm": 0.0252488162368536,
"learning_rate": 3.74054216221926e-05,
"loss": 0.0187,
"step": 6020
},
{
"epoch": 1.4125087842586086,
"grad_norm": 0.03220526501536369,
"learning_rate": 3.7245475334919246e-05,
"loss": 0.0235,
"step": 6030
},
{
"epoch": 1.4148512532208948,
"grad_norm": 0.132725328207016,
"learning_rate": 3.7085668529084184e-05,
"loss": 0.0231,
"step": 6040
},
{
"epoch": 1.417193722183181,
"grad_norm": 0.17545637488365173,
"learning_rate": 3.6926002952309016e-05,
"loss": 0.0259,
"step": 6050
},
{
"epoch": 1.4195361911454674,
"grad_norm": 0.197429358959198,
"learning_rate": 3.676648035067093e-05,
"loss": 0.0274,
"step": 6060
},
{
"epoch": 1.4218786601077535,
"grad_norm": 0.06819231063127518,
"learning_rate": 3.6607102468683526e-05,
"loss": 0.0355,
"step": 6070
},
{
"epoch": 1.4242211290700397,
"grad_norm": 0.16003242135047913,
"learning_rate": 3.6447871049277796e-05,
"loss": 0.0376,
"step": 6080
},
{
"epoch": 1.4265635980323261,
"grad_norm": 0.13673585653305054,
"learning_rate": 3.628878783378302e-05,
"loss": 0.0213,
"step": 6090
},
{
"epoch": 1.4289060669946123,
"grad_norm": 0.15434902906417847,
"learning_rate": 3.612985456190778e-05,
"loss": 0.0126,
"step": 6100
},
{
"epoch": 1.4312485359568985,
"grad_norm": 0.17395956814289093,
"learning_rate": 3.597107297172084e-05,
"loss": 0.084,
"step": 6110
},
{
"epoch": 1.433591004919185,
"grad_norm": 0.04844974726438522,
"learning_rate": 3.581244479963225e-05,
"loss": 0.0219,
"step": 6120
},
{
"epoch": 1.435933473881471,
"grad_norm": 0.04163607209920883,
"learning_rate": 3.5653971780374295e-05,
"loss": 0.0279,
"step": 6130
},
{
"epoch": 1.4382759428437573,
"grad_norm": 0.11247994005680084,
"learning_rate": 3.5495655646982505e-05,
"loss": 0.0388,
"step": 6140
},
{
"epoch": 1.4406184118060437,
"grad_norm": 0.10106071829795837,
"learning_rate": 3.533749813077677e-05,
"loss": 0.0197,
"step": 6150
},
{
"epoch": 1.4429608807683298,
"grad_norm": 0.3352503776550293,
"learning_rate": 3.517950096134232e-05,
"loss": 0.0306,
"step": 6160
},
{
"epoch": 1.445303349730616,
"grad_norm": 0.23961161077022552,
"learning_rate": 3.5021665866510925e-05,
"loss": 0.0361,
"step": 6170
},
{
"epoch": 1.4476458186929024,
"grad_norm": 0.27124881744384766,
"learning_rate": 3.4863994572341843e-05,
"loss": 0.0215,
"step": 6180
},
{
"epoch": 1.4499882876551886,
"grad_norm": 0.19891873002052307,
"learning_rate": 3.470648880310313e-05,
"loss": 0.0356,
"step": 6190
},
{
"epoch": 1.4523307566174748,
"grad_norm": 0.2036479115486145,
"learning_rate": 3.4549150281252636e-05,
"loss": 0.024,
"step": 6200
},
{
"epoch": 1.4546732255797612,
"grad_norm": 0.20012417435646057,
"learning_rate": 3.439198072741921e-05,
"loss": 0.0601,
"step": 6210
},
{
"epoch": 1.4570156945420474,
"grad_norm": 0.09231871366500854,
"learning_rate": 3.423498186038393e-05,
"loss": 0.0264,
"step": 6220
},
{
"epoch": 1.4593581635043336,
"grad_norm": 0.08506989479064941,
"learning_rate": 3.407815539706124e-05,
"loss": 0.0326,
"step": 6230
},
{
"epoch": 1.4617006324666197,
"grad_norm": 0.07338278740644455,
"learning_rate": 3.392150305248024e-05,
"loss": 0.0261,
"step": 6240
},
{
"epoch": 1.4640431014289061,
"grad_norm": 0.04994959384202957,
"learning_rate": 3.3765026539765834e-05,
"loss": 0.0155,
"step": 6250
},
{
"epoch": 1.4663855703911923,
"grad_norm": 0.14995336532592773,
"learning_rate": 3.360872757012011e-05,
"loss": 0.0215,
"step": 6260
},
{
"epoch": 1.4687280393534785,
"grad_norm": 0.08906183391809464,
"learning_rate": 3.3452607852803584e-05,
"loss": 0.0165,
"step": 6270
},
{
"epoch": 1.4710705083157647,
"grad_norm": 0.07266787439584732,
"learning_rate": 3.329666909511645e-05,
"loss": 0.0368,
"step": 6280
},
{
"epoch": 1.473412977278051,
"grad_norm": 0.5040260553359985,
"learning_rate": 3.3140913002379995e-05,
"loss": 0.0261,
"step": 6290
},
{
"epoch": 1.4757554462403373,
"grad_norm": 0.1433238685131073,
"learning_rate": 3.298534127791785e-05,
"loss": 0.0175,
"step": 6300
},
{
"epoch": 1.4780979152026235,
"grad_norm": 0.3917306363582611,
"learning_rate": 3.282995562303754e-05,
"loss": 0.0235,
"step": 6310
},
{
"epoch": 1.4804403841649099,
"grad_norm": 0.07920818775892258,
"learning_rate": 3.267475773701161e-05,
"loss": 0.0428,
"step": 6320
},
{
"epoch": 1.482782853127196,
"grad_norm": 0.07408647239208221,
"learning_rate": 3.251974931705933e-05,
"loss": 0.0255,
"step": 6330
},
{
"epoch": 1.4851253220894822,
"grad_norm": 0.0957607626914978,
"learning_rate": 3.236493205832795e-05,
"loss": 0.0279,
"step": 6340
},
{
"epoch": 1.4874677910517686,
"grad_norm": 0.372249037027359,
"learning_rate": 3.221030765387417e-05,
"loss": 0.0302,
"step": 6350
},
{
"epoch": 1.4898102600140548,
"grad_norm": 0.20557020604610443,
"learning_rate": 3.205587779464576e-05,
"loss": 0.0374,
"step": 6360
},
{
"epoch": 1.492152728976341,
"grad_norm": 0.2854403257369995,
"learning_rate": 3.190164416946285e-05,
"loss": 0.0234,
"step": 6370
},
{
"epoch": 1.4944951979386274,
"grad_norm": 0.023650668561458588,
"learning_rate": 3.1747608464999725e-05,
"loss": 0.028,
"step": 6380
},
{
"epoch": 1.4968376669009136,
"grad_norm": 0.3256511390209198,
"learning_rate": 3.1593772365766105e-05,
"loss": 0.0349,
"step": 6390
},
{
"epoch": 1.4991801358631998,
"grad_norm": 0.10362248122692108,
"learning_rate": 3.144013755408895e-05,
"loss": 0.0181,
"step": 6400
},
{
"epoch": 1.5015226048254862,
"grad_norm": 0.22891394793987274,
"learning_rate": 3.128670571009399e-05,
"loss": 0.0139,
"step": 6410
},
{
"epoch": 1.5038650737877723,
"grad_norm": 0.3262953460216522,
"learning_rate": 3.113347851168721e-05,
"loss": 0.0276,
"step": 6420
},
{
"epoch": 1.5062075427500585,
"grad_norm": 0.04172496870160103,
"learning_rate": 3.098045763453678e-05,
"loss": 0.0151,
"step": 6430
},
{
"epoch": 1.508550011712345,
"grad_norm": 0.10430093109607697,
"learning_rate": 3.082764475205442e-05,
"loss": 0.0151,
"step": 6440
},
{
"epoch": 1.510892480674631,
"grad_norm": 0.061427149921655655,
"learning_rate": 3.0675041535377405e-05,
"loss": 0.0257,
"step": 6450
},
{
"epoch": 1.5132349496369173,
"grad_norm": 0.3583897054195404,
"learning_rate": 3.052264965335e-05,
"loss": 0.0228,
"step": 6460
},
{
"epoch": 1.5155774185992037,
"grad_norm": 0.3000676929950714,
"learning_rate": 3.0370470772505433e-05,
"loss": 0.0319,
"step": 6470
},
{
"epoch": 1.5179198875614897,
"grad_norm": 0.054317738860845566,
"learning_rate": 3.0218506557047598e-05,
"loss": 0.0258,
"step": 6480
},
{
"epoch": 1.520262356523776,
"grad_norm": 0.554436981678009,
"learning_rate": 3.006675866883275e-05,
"loss": 0.0423,
"step": 6490
},
{
"epoch": 1.5226048254860625,
"grad_norm": 0.047464508563280106,
"learning_rate": 2.991522876735154e-05,
"loss": 0.0181,
"step": 6500
},
{
"epoch": 1.5249472944483484,
"grad_norm": 0.0966537818312645,
"learning_rate": 2.976391850971065e-05,
"loss": 0.059,
"step": 6510
},
{
"epoch": 1.5272897634106348,
"grad_norm": 0.2305019199848175,
"learning_rate": 2.9612829550614836e-05,
"loss": 0.0525,
"step": 6520
},
{
"epoch": 1.529632232372921,
"grad_norm": 0.0933275818824768,
"learning_rate": 2.9461963542348737e-05,
"loss": 0.0181,
"step": 6530
},
{
"epoch": 1.5319747013352072,
"grad_norm": 0.06837865710258484,
"learning_rate": 2.931132213475884e-05,
"loss": 0.0305,
"step": 6540
},
{
"epoch": 1.5343171702974936,
"grad_norm": 0.3509468138217926,
"learning_rate": 2.916090697523549e-05,
"loss": 0.0246,
"step": 6550
},
{
"epoch": 1.5366596392597798,
"grad_norm": 0.09399297833442688,
"learning_rate": 2.9010719708694722e-05,
"loss": 0.0571,
"step": 6560
},
{
"epoch": 1.539002108222066,
"grad_norm": 0.04773678630590439,
"learning_rate": 2.8860761977560436e-05,
"loss": 0.0219,
"step": 6570
},
{
"epoch": 1.5413445771843524,
"grad_norm": 0.5464432835578918,
"learning_rate": 2.8711035421746367e-05,
"loss": 0.033,
"step": 6580
},
{
"epoch": 1.5436870461466385,
"grad_norm": 0.03716734051704407,
"learning_rate": 2.8561541678638142e-05,
"loss": 0.0238,
"step": 6590
},
{
"epoch": 1.5460295151089247,
"grad_norm": 0.07261854410171509,
"learning_rate": 2.8412282383075363e-05,
"loss": 0.0355,
"step": 6600
},
{
"epoch": 1.5483719840712111,
"grad_norm": 0.3284207880496979,
"learning_rate": 2.8263259167333777e-05,
"loss": 0.0179,
"step": 6610
},
{
"epoch": 1.5507144530334973,
"grad_norm": 0.16059207916259766,
"learning_rate": 2.811447366110741e-05,
"loss": 0.0221,
"step": 6620
},
{
"epoch": 1.5530569219957835,
"grad_norm": 0.1849690079689026,
"learning_rate": 2.7965927491490705e-05,
"loss": 0.0287,
"step": 6630
},
{
"epoch": 1.55539939095807,
"grad_norm": 0.39172595739364624,
"learning_rate": 2.7817622282960815e-05,
"loss": 0.0475,
"step": 6640
},
{
"epoch": 1.557741859920356,
"grad_norm": 0.30010920763015747,
"learning_rate": 2.766955965735968e-05,
"loss": 0.0312,
"step": 6650
},
{
"epoch": 1.5600843288826423,
"grad_norm": 0.4200305640697479,
"learning_rate": 2.7521741233876496e-05,
"loss": 0.0276,
"step": 6660
},
{
"epoch": 1.5624267978449287,
"grad_norm": 0.06515451520681381,
"learning_rate": 2.7374168629029813e-05,
"loss": 0.0165,
"step": 6670
},
{
"epoch": 1.5647692668072148,
"grad_norm": 0.19618399441242218,
"learning_rate": 2.7226843456650037e-05,
"loss": 0.019,
"step": 6680
},
{
"epoch": 1.567111735769501,
"grad_norm": 0.4492703378200531,
"learning_rate": 2.707976732786166e-05,
"loss": 0.0244,
"step": 6690
},
{
"epoch": 1.5694542047317874,
"grad_norm": 0.18303832411766052,
"learning_rate": 2.693294185106562e-05,
"loss": 0.0172,
"step": 6700
},
{
"epoch": 1.5717966736940734,
"grad_norm": 0.10762883722782135,
"learning_rate": 2.6786368631921836e-05,
"loss": 0.0286,
"step": 6710
},
{
"epoch": 1.5741391426563598,
"grad_norm": 0.1200929656624794,
"learning_rate": 2.6640049273331515e-05,
"loss": 0.0264,
"step": 6720
},
{
"epoch": 1.5764816116186462,
"grad_norm": 0.025387238711118698,
"learning_rate": 2.6493985375419778e-05,
"loss": 0.0172,
"step": 6730
},
{
"epoch": 1.5788240805809322,
"grad_norm": 0.2033502608537674,
"learning_rate": 2.6348178535517966e-05,
"loss": 0.0206,
"step": 6740
},
{
"epoch": 1.5811665495432186,
"grad_norm": 0.1873401701450348,
"learning_rate": 2.6202630348146324e-05,
"loss": 0.0228,
"step": 6750
},
{
"epoch": 1.583509018505505,
"grad_norm": 0.3058501183986664,
"learning_rate": 2.6057342404996522e-05,
"loss": 0.0178,
"step": 6760
},
{
"epoch": 1.585851487467791,
"grad_norm": 0.07519169896841049,
"learning_rate": 2.591231629491423e-05,
"loss": 0.0242,
"step": 6770
},
{
"epoch": 1.5881939564300773,
"grad_norm": 0.11511756479740143,
"learning_rate": 2.5767553603881767e-05,
"loss": 0.0146,
"step": 6780
},
{
"epoch": 1.5905364253923635,
"grad_norm": 0.3080747425556183,
"learning_rate": 2.562305591500069e-05,
"loss": 0.0226,
"step": 6790
},
{
"epoch": 1.5928788943546497,
"grad_norm": 0.04100322350859642,
"learning_rate": 2.547882480847461e-05,
"loss": 0.0201,
"step": 6800
},
{
"epoch": 1.595221363316936,
"grad_norm": 0.04346079006791115,
"learning_rate": 2.5334861861591753e-05,
"loss": 0.0132,
"step": 6810
},
{
"epoch": 1.5975638322792223,
"grad_norm": 0.37990304827690125,
"learning_rate": 2.5191168648707887e-05,
"loss": 0.0281,
"step": 6820
},
{
"epoch": 1.5999063012415085,
"grad_norm": 0.4856362044811249,
"learning_rate": 2.5047746741228978e-05,
"loss": 0.0314,
"step": 6830
},
{
"epoch": 1.6022487702037949,
"grad_norm": 0.10676129907369614,
"learning_rate": 2.490459770759398e-05,
"loss": 0.0527,
"step": 6840
},
{
"epoch": 1.604591239166081,
"grad_norm": 0.05450962483882904,
"learning_rate": 2.476172311325783e-05,
"loss": 0.0316,
"step": 6850
},
{
"epoch": 1.6069337081283672,
"grad_norm": 0.04609961807727814,
"learning_rate": 2.4619124520674146e-05,
"loss": 0.0272,
"step": 6860
},
{
"epoch": 1.6092761770906536,
"grad_norm": 0.03132042661309242,
"learning_rate": 2.447680348927837e-05,
"loss": 0.0249,
"step": 6870
},
{
"epoch": 1.6116186460529398,
"grad_norm": 0.11953801661729813,
"learning_rate": 2.433476157547044e-05,
"loss": 0.0178,
"step": 6880
},
{
"epoch": 1.613961115015226,
"grad_norm": 0.1899009346961975,
"learning_rate": 2.419300033259798e-05,
"loss": 0.0445,
"step": 6890
},
{
"epoch": 1.6163035839775124,
"grad_norm": 0.04766709730029106,
"learning_rate": 2.405152131093926e-05,
"loss": 0.0144,
"step": 6900
},
{
"epoch": 1.6186460529397986,
"grad_norm": 0.40684422850608826,
"learning_rate": 2.3910326057686127e-05,
"loss": 0.018,
"step": 6910
},
{
"epoch": 1.6209885219020848,
"grad_norm": 0.06992173194885254,
"learning_rate": 2.3769416116927335e-05,
"loss": 0.0183,
"step": 6920
},
{
"epoch": 1.6233309908643712,
"grad_norm": 0.050338905304670334,
"learning_rate": 2.362879302963135e-05,
"loss": 0.0237,
"step": 6930
},
{
"epoch": 1.6256734598266573,
"grad_norm": 0.19553512334823608,
"learning_rate": 2.3488458333629777e-05,
"loss": 0.0339,
"step": 6940
},
{
"epoch": 1.6280159287889435,
"grad_norm": 0.15470145642757416,
"learning_rate": 2.3348413563600325e-05,
"loss": 0.0094,
"step": 6950
},
{
"epoch": 1.63035839775123,
"grad_norm": 0.23403486609458923,
"learning_rate": 2.3208660251050158e-05,
"loss": 0.026,
"step": 6960
},
{
"epoch": 1.6327008667135159,
"grad_norm": 0.13263070583343506,
"learning_rate": 2.3069199924299174e-05,
"loss": 0.0197,
"step": 6970
},
{
"epoch": 1.6350433356758023,
"grad_norm": 0.4499634802341461,
"learning_rate": 2.29300341084631e-05,
"loss": 0.0183,
"step": 6980
},
{
"epoch": 1.6373858046380887,
"grad_norm": 0.020672103390097618,
"learning_rate": 2.279116432543705e-05,
"loss": 0.019,
"step": 6990
},
{
"epoch": 1.6397282736003747,
"grad_norm": 0.08733947575092316,
"learning_rate": 2.2652592093878666e-05,
"loss": 0.0317,
"step": 7000
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.841140693728e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}