so100_battery-ovrr0igj39 / trainer_state.json
00ri's picture
Upload trainer_state.json with huggingface_hub
7de24ed verified
raw
history blame
133 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.998681608437706,
"eval_steps": 500,
"global_step": 7583,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0065919578114700065,
"grad_norm": 11.249381065368652,
"learning_rate": 5.263157894736842e-06,
"loss": 1.324,
"step": 10
},
{
"epoch": 0.013183915622940013,
"grad_norm": 8.759961128234863,
"learning_rate": 1.0526315789473684e-05,
"loss": 1.1761,
"step": 20
},
{
"epoch": 0.01977587343441002,
"grad_norm": 2.233778953552246,
"learning_rate": 1.5789473684210526e-05,
"loss": 0.6049,
"step": 30
},
{
"epoch": 0.026367831245880026,
"grad_norm": 3.505425453186035,
"learning_rate": 2.105263157894737e-05,
"loss": 0.3419,
"step": 40
},
{
"epoch": 0.03295978905735003,
"grad_norm": 1.7327282428741455,
"learning_rate": 2.6315789473684212e-05,
"loss": 0.2418,
"step": 50
},
{
"epoch": 0.03955174686882004,
"grad_norm": 1.4897282123565674,
"learning_rate": 3.157894736842105e-05,
"loss": 0.2017,
"step": 60
},
{
"epoch": 0.04614370468029005,
"grad_norm": 0.8921314477920532,
"learning_rate": 3.6842105263157895e-05,
"loss": 0.1668,
"step": 70
},
{
"epoch": 0.05273566249176005,
"grad_norm": 1.3826392889022827,
"learning_rate": 4.210526315789474e-05,
"loss": 0.1532,
"step": 80
},
{
"epoch": 0.05932762030323006,
"grad_norm": 1.489062786102295,
"learning_rate": 4.736842105263158e-05,
"loss": 0.1084,
"step": 90
},
{
"epoch": 0.06591957811470006,
"grad_norm": 1.280565619468689,
"learning_rate": 5.2631578947368424e-05,
"loss": 0.1128,
"step": 100
},
{
"epoch": 0.07251153592617007,
"grad_norm": 1.2948462963104248,
"learning_rate": 5.789473684210527e-05,
"loss": 0.1044,
"step": 110
},
{
"epoch": 0.07910349373764008,
"grad_norm": 1.5762895345687866,
"learning_rate": 6.31578947368421e-05,
"loss": 0.1034,
"step": 120
},
{
"epoch": 0.08569545154911008,
"grad_norm": 1.0561785697937012,
"learning_rate": 6.842105263157895e-05,
"loss": 0.0798,
"step": 130
},
{
"epoch": 0.0922874093605801,
"grad_norm": 0.9102309346199036,
"learning_rate": 7.368421052631579e-05,
"loss": 0.0752,
"step": 140
},
{
"epoch": 0.09887936717205009,
"grad_norm": 1.4243663549423218,
"learning_rate": 7.894736842105263e-05,
"loss": 0.0863,
"step": 150
},
{
"epoch": 0.1054713249835201,
"grad_norm": 0.7150789499282837,
"learning_rate": 8.421052631578948e-05,
"loss": 0.0778,
"step": 160
},
{
"epoch": 0.11206328279499012,
"grad_norm": 0.9231832027435303,
"learning_rate": 8.947368421052632e-05,
"loss": 0.0796,
"step": 170
},
{
"epoch": 0.11865524060646011,
"grad_norm": 0.5305670499801636,
"learning_rate": 9.473684210526316e-05,
"loss": 0.0733,
"step": 180
},
{
"epoch": 0.12524719841793014,
"grad_norm": 1.0431275367736816,
"learning_rate": 0.0001,
"loss": 0.0713,
"step": 190
},
{
"epoch": 0.13183915622940012,
"grad_norm": 1.0667047500610352,
"learning_rate": 0.00010526315789473685,
"loss": 0.0738,
"step": 200
},
{
"epoch": 0.13843111404087013,
"grad_norm": 0.9431530833244324,
"learning_rate": 0.0001105263157894737,
"loss": 0.0695,
"step": 210
},
{
"epoch": 0.14502307185234015,
"grad_norm": 1.231911063194275,
"learning_rate": 0.00011578947368421053,
"loss": 0.0707,
"step": 220
},
{
"epoch": 0.15161502966381016,
"grad_norm": 0.5772905945777893,
"learning_rate": 0.00012105263157894738,
"loss": 0.0642,
"step": 230
},
{
"epoch": 0.15820698747528017,
"grad_norm": 0.6241514086723328,
"learning_rate": 0.0001263157894736842,
"loss": 0.0621,
"step": 240
},
{
"epoch": 0.16479894528675015,
"grad_norm": 0.7449037432670593,
"learning_rate": 0.00013157894736842108,
"loss": 0.0639,
"step": 250
},
{
"epoch": 0.17139090309822017,
"grad_norm": 0.9040747880935669,
"learning_rate": 0.0001368421052631579,
"loss": 0.0595,
"step": 260
},
{
"epoch": 0.17798286090969018,
"grad_norm": 0.6246598958969116,
"learning_rate": 0.00014210526315789474,
"loss": 0.0612,
"step": 270
},
{
"epoch": 0.1845748187211602,
"grad_norm": 0.6300843358039856,
"learning_rate": 0.00014736842105263158,
"loss": 0.0574,
"step": 280
},
{
"epoch": 0.1911667765326302,
"grad_norm": 0.7051455974578857,
"learning_rate": 0.00015263157894736845,
"loss": 0.0489,
"step": 290
},
{
"epoch": 0.19775873434410018,
"grad_norm": 0.8903814554214478,
"learning_rate": 0.00015789473684210527,
"loss": 0.0588,
"step": 300
},
{
"epoch": 0.2043506921555702,
"grad_norm": 0.8815051317214966,
"learning_rate": 0.0001631578947368421,
"loss": 0.0605,
"step": 310
},
{
"epoch": 0.2109426499670402,
"grad_norm": 0.7266796231269836,
"learning_rate": 0.00016842105263157895,
"loss": 0.0555,
"step": 320
},
{
"epoch": 0.21753460777851022,
"grad_norm": 1.033163070678711,
"learning_rate": 0.0001736842105263158,
"loss": 0.056,
"step": 330
},
{
"epoch": 0.22412656558998023,
"grad_norm": 1.339528203010559,
"learning_rate": 0.00017894736842105264,
"loss": 0.0513,
"step": 340
},
{
"epoch": 0.23071852340145024,
"grad_norm": 1.1713142395019531,
"learning_rate": 0.00018421052631578948,
"loss": 0.0604,
"step": 350
},
{
"epoch": 0.23731048121292023,
"grad_norm": 0.7305978536605835,
"learning_rate": 0.00018947368421052632,
"loss": 0.061,
"step": 360
},
{
"epoch": 0.24390243902439024,
"grad_norm": 0.6867638826370239,
"learning_rate": 0.00019473684210526317,
"loss": 0.0446,
"step": 370
},
{
"epoch": 0.2504943968358603,
"grad_norm": 0.480622798204422,
"learning_rate": 0.0002,
"loss": 0.0507,
"step": 380
},
{
"epoch": 0.25708635464733026,
"grad_norm": 0.6892393827438354,
"learning_rate": 0.00019999904886484996,
"loss": 0.0562,
"step": 390
},
{
"epoch": 0.26367831245880025,
"grad_norm": 0.8014799952507019,
"learning_rate": 0.00019999619547749294,
"loss": 0.0407,
"step": 400
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.8931164741516113,
"learning_rate": 0.0001999914398922081,
"loss": 0.0488,
"step": 410
},
{
"epoch": 0.27686222808174027,
"grad_norm": 0.5557290315628052,
"learning_rate": 0.00019998478219945958,
"loss": 0.0533,
"step": 420
},
{
"epoch": 0.2834541858932103,
"grad_norm": 0.9810464978218079,
"learning_rate": 0.00019997622252589464,
"loss": 0.052,
"step": 430
},
{
"epoch": 0.2900461437046803,
"grad_norm": 0.6797704696655273,
"learning_rate": 0.00019996576103434137,
"loss": 0.0514,
"step": 440
},
{
"epoch": 0.2966381015161503,
"grad_norm": 1.141650915145874,
"learning_rate": 0.0001999533979238057,
"loss": 0.0489,
"step": 450
},
{
"epoch": 0.3032300593276203,
"grad_norm": 0.6689559817314148,
"learning_rate": 0.00019993913342946734,
"loss": 0.0441,
"step": 460
},
{
"epoch": 0.3098220171390903,
"grad_norm": 0.524917721748352,
"learning_rate": 0.0001999229678226756,
"loss": 0.0457,
"step": 470
},
{
"epoch": 0.31641397495056034,
"grad_norm": 0.7408258318901062,
"learning_rate": 0.00019990490141094392,
"loss": 0.0428,
"step": 480
},
{
"epoch": 0.3230059327620303,
"grad_norm": 0.5927634835243225,
"learning_rate": 0.0001998849345379444,
"loss": 0.0431,
"step": 490
},
{
"epoch": 0.3295978905735003,
"grad_norm": 0.4574936628341675,
"learning_rate": 0.00019986306758350083,
"loss": 0.038,
"step": 500
},
{
"epoch": 0.33618984838497035,
"grad_norm": 0.6031479835510254,
"learning_rate": 0.00019983930096358188,
"loss": 0.0442,
"step": 510
},
{
"epoch": 0.34278180619644033,
"grad_norm": 0.4019775688648224,
"learning_rate": 0.00019981363513029283,
"loss": 0.0336,
"step": 520
},
{
"epoch": 0.34937376400791037,
"grad_norm": 0.6691102981567383,
"learning_rate": 0.00019978607057186725,
"loss": 0.0387,
"step": 530
},
{
"epoch": 0.35596572181938035,
"grad_norm": 0.39324843883514404,
"learning_rate": 0.00019975660781265753,
"loss": 0.0449,
"step": 540
},
{
"epoch": 0.36255767963085034,
"grad_norm": 0.5069633722305298,
"learning_rate": 0.00019972524741312497,
"loss": 0.0319,
"step": 550
},
{
"epoch": 0.3691496374423204,
"grad_norm": 0.5699636936187744,
"learning_rate": 0.00019969198996982917,
"loss": 0.0402,
"step": 560
},
{
"epoch": 0.37574159525379036,
"grad_norm": 1.0686895847320557,
"learning_rate": 0.00019965683611541655,
"loss": 0.0542,
"step": 570
},
{
"epoch": 0.3823335530652604,
"grad_norm": 0.4853604733943939,
"learning_rate": 0.00019961978651860854,
"loss": 0.0476,
"step": 580
},
{
"epoch": 0.3889255108767304,
"grad_norm": 0.8250619173049927,
"learning_rate": 0.0001995808418841885,
"loss": 0.034,
"step": 590
},
{
"epoch": 0.39551746868820037,
"grad_norm": 0.6085853576660156,
"learning_rate": 0.00019954000295298871,
"loss": 0.0389,
"step": 600
},
{
"epoch": 0.4021094264996704,
"grad_norm": 3.688549041748047,
"learning_rate": 0.000199497270501876,
"loss": 0.0511,
"step": 610
},
{
"epoch": 0.4087013843111404,
"grad_norm": 1.5635132789611816,
"learning_rate": 0.00019945264534373714,
"loss": 0.1116,
"step": 620
},
{
"epoch": 0.41529334212261043,
"grad_norm": 0.7884135246276855,
"learning_rate": 0.00019940612832746322,
"loss": 0.0737,
"step": 630
},
{
"epoch": 0.4218852999340804,
"grad_norm": 0.9017935395240784,
"learning_rate": 0.0001993577203379336,
"loss": 0.0789,
"step": 640
},
{
"epoch": 0.42847725774555045,
"grad_norm": 0.8649272918701172,
"learning_rate": 0.00019930742229599914,
"loss": 0.0728,
"step": 650
},
{
"epoch": 0.43506921555702044,
"grad_norm": 0.772191047668457,
"learning_rate": 0.00019925523515846455,
"loss": 0.0697,
"step": 660
},
{
"epoch": 0.4416611733684904,
"grad_norm": 0.5265079140663147,
"learning_rate": 0.00019920115991807022,
"loss": 0.0622,
"step": 670
},
{
"epoch": 0.44825313117996046,
"grad_norm": 0.8318515419960022,
"learning_rate": 0.0001991451976034734,
"loss": 0.0786,
"step": 680
},
{
"epoch": 0.45484508899143045,
"grad_norm": 0.7197186946868896,
"learning_rate": 0.0001990873492792286,
"loss": 0.059,
"step": 690
},
{
"epoch": 0.4614370468029005,
"grad_norm": 0.9418641328811646,
"learning_rate": 0.00019902761604576725,
"loss": 0.078,
"step": 700
},
{
"epoch": 0.46802900461437047,
"grad_norm": 0.7985256314277649,
"learning_rate": 0.00019896599903937697,
"loss": 0.0834,
"step": 710
},
{
"epoch": 0.47462096242584045,
"grad_norm": 0.6049144268035889,
"learning_rate": 0.00019890249943217976,
"loss": 0.0656,
"step": 720
},
{
"epoch": 0.4812129202373105,
"grad_norm": 0.6395105719566345,
"learning_rate": 0.0001988371184321098,
"loss": 0.0764,
"step": 730
},
{
"epoch": 0.4878048780487805,
"grad_norm": 0.58722984790802,
"learning_rate": 0.00019876985728289038,
"loss": 0.0588,
"step": 740
},
{
"epoch": 0.4943968358602505,
"grad_norm": 0.4679464101791382,
"learning_rate": 0.00019870071726401043,
"loss": 0.0638,
"step": 750
},
{
"epoch": 0.5009887936717206,
"grad_norm": 0.509775698184967,
"learning_rate": 0.00019862969969069996,
"loss": 0.0602,
"step": 760
},
{
"epoch": 0.5075807514831905,
"grad_norm": 0.8126184344291687,
"learning_rate": 0.00019855680591390518,
"loss": 0.069,
"step": 770
},
{
"epoch": 0.5141727092946605,
"grad_norm": 0.7676377892494202,
"learning_rate": 0.00019848203732026275,
"loss": 0.0704,
"step": 780
},
{
"epoch": 0.5207646671061306,
"grad_norm": 1.0301965475082397,
"learning_rate": 0.00019840539533207344,
"loss": 0.0666,
"step": 790
},
{
"epoch": 0.5273566249176005,
"grad_norm": 0.6810826063156128,
"learning_rate": 0.000198326881407275,
"loss": 0.0698,
"step": 800
},
{
"epoch": 0.5339485827290705,
"grad_norm": 0.4939572513103485,
"learning_rate": 0.00019824649703941455,
"loss": 0.0548,
"step": 810
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.6614457964897156,
"learning_rate": 0.00019816424375762001,
"loss": 0.0748,
"step": 820
},
{
"epoch": 0.5471324983520105,
"grad_norm": 0.7715848088264465,
"learning_rate": 0.00019808012312657114,
"loss": 0.0653,
"step": 830
},
{
"epoch": 0.5537244561634805,
"grad_norm": 0.5254570245742798,
"learning_rate": 0.00019799413674646973,
"loss": 0.0537,
"step": 840
},
{
"epoch": 0.5603164139749506,
"grad_norm": 0.7626491785049438,
"learning_rate": 0.0001979062862530091,
"loss": 0.0599,
"step": 850
},
{
"epoch": 0.5669083717864206,
"grad_norm": 0.6767850518226624,
"learning_rate": 0.00019781657331734316,
"loss": 0.0644,
"step": 860
},
{
"epoch": 0.5735003295978905,
"grad_norm": 0.4016531705856323,
"learning_rate": 0.0001977249996460544,
"loss": 0.0543,
"step": 870
},
{
"epoch": 0.5800922874093606,
"grad_norm": 1.0104889869689941,
"learning_rate": 0.0001976315669811216,
"loss": 0.0681,
"step": 880
},
{
"epoch": 0.5866842452208306,
"grad_norm": 0.7674484252929688,
"learning_rate": 0.00019753627709988658,
"loss": 0.0562,
"step": 890
},
{
"epoch": 0.5932762030323006,
"grad_norm": 1.2781016826629639,
"learning_rate": 0.00019743913181502048,
"loss": 0.0602,
"step": 900
},
{
"epoch": 0.5998681608437706,
"grad_norm": 0.5540818572044373,
"learning_rate": 0.00019734013297448914,
"loss": 0.0631,
"step": 910
},
{
"epoch": 0.6064601186552406,
"grad_norm": 0.7823266386985779,
"learning_rate": 0.00019723928246151814,
"loss": 0.0637,
"step": 920
},
{
"epoch": 0.6130520764667106,
"grad_norm": 0.6756680607795715,
"learning_rate": 0.00019713658219455685,
"loss": 0.0684,
"step": 930
},
{
"epoch": 0.6196440342781806,
"grad_norm": 0.8224459290504456,
"learning_rate": 0.0001970320341272419,
"loss": 0.0512,
"step": 940
},
{
"epoch": 0.6262359920896506,
"grad_norm": 0.8429596424102783,
"learning_rate": 0.00019692564024836016,
"loss": 0.0516,
"step": 950
},
{
"epoch": 0.6328279499011207,
"grad_norm": 0.7025866508483887,
"learning_rate": 0.0001968174025818108,
"loss": 0.0667,
"step": 960
},
{
"epoch": 0.6394199077125906,
"grad_norm": 0.624162495136261,
"learning_rate": 0.00019670732318656677,
"loss": 0.0575,
"step": 970
},
{
"epoch": 0.6460118655240606,
"grad_norm": 0.5887486338615417,
"learning_rate": 0.00019659540415663571,
"loss": 0.0488,
"step": 980
},
{
"epoch": 0.6526038233355307,
"grad_norm": 0.45346468687057495,
"learning_rate": 0.00019648164762102013,
"loss": 0.0483,
"step": 990
},
{
"epoch": 0.6591957811470006,
"grad_norm": 0.6038155555725098,
"learning_rate": 0.0001963660557436768,
"loss": 0.054,
"step": 1000
},
{
"epoch": 0.6657877389584707,
"grad_norm": 0.5043258666992188,
"learning_rate": 0.00019624863072347564,
"loss": 0.0631,
"step": 1010
},
{
"epoch": 0.6723796967699407,
"grad_norm": 0.6452742218971252,
"learning_rate": 0.000196129374794158,
"loss": 0.0551,
"step": 1020
},
{
"epoch": 0.6789716545814107,
"grad_norm": 0.6438404321670532,
"learning_rate": 0.0001960082902242939,
"loss": 0.0501,
"step": 1030
},
{
"epoch": 0.6855636123928807,
"grad_norm": 0.8768063187599182,
"learning_rate": 0.00019588537931723927,
"loss": 0.0516,
"step": 1040
},
{
"epoch": 0.6921555702043507,
"grad_norm": 0.767848014831543,
"learning_rate": 0.00019576064441109172,
"loss": 0.0501,
"step": 1050
},
{
"epoch": 0.6987475280158207,
"grad_norm": 0.6131387948989868,
"learning_rate": 0.00019563408787864634,
"loss": 0.0595,
"step": 1060
},
{
"epoch": 0.7053394858272907,
"grad_norm": 0.4806978404521942,
"learning_rate": 0.00019550571212735048,
"loss": 0.0475,
"step": 1070
},
{
"epoch": 0.7119314436387607,
"grad_norm": 0.4950248897075653,
"learning_rate": 0.00019537551959925787,
"loss": 0.048,
"step": 1080
},
{
"epoch": 0.7185234014502307,
"grad_norm": 0.5537814497947693,
"learning_rate": 0.0001952435127709824,
"loss": 0.046,
"step": 1090
},
{
"epoch": 0.7251153592617007,
"grad_norm": 0.4151875078678131,
"learning_rate": 0.00019510969415365063,
"loss": 0.0429,
"step": 1100
},
{
"epoch": 0.7317073170731707,
"grad_norm": 0.42159780859947205,
"learning_rate": 0.0001949740662928545,
"loss": 0.0434,
"step": 1110
},
{
"epoch": 0.7382992748846408,
"grad_norm": 0.454226016998291,
"learning_rate": 0.00019483663176860248,
"loss": 0.0421,
"step": 1120
},
{
"epoch": 0.7448912326961108,
"grad_norm": 0.37481585144996643,
"learning_rate": 0.00019469739319527064,
"loss": 0.043,
"step": 1130
},
{
"epoch": 0.7514831905075807,
"grad_norm": 0.6487095952033997,
"learning_rate": 0.00019455635322155313,
"loss": 0.0433,
"step": 1140
},
{
"epoch": 0.7580751483190508,
"grad_norm": 0.44085580110549927,
"learning_rate": 0.00019441351453041138,
"loss": 0.0492,
"step": 1150
},
{
"epoch": 0.7646671061305208,
"grad_norm": 0.49984055757522583,
"learning_rate": 0.00019426887983902343,
"loss": 0.0431,
"step": 1160
},
{
"epoch": 0.7712590639419907,
"grad_norm": 0.5114363431930542,
"learning_rate": 0.00019412245189873203,
"loss": 0.0448,
"step": 1170
},
{
"epoch": 0.7778510217534608,
"grad_norm": 0.5482351779937744,
"learning_rate": 0.00019397423349499246,
"loss": 0.0481,
"step": 1180
},
{
"epoch": 0.7844429795649308,
"grad_norm": 0.7064313888549805,
"learning_rate": 0.00019382422744731933,
"loss": 0.0476,
"step": 1190
},
{
"epoch": 0.7910349373764007,
"grad_norm": 0.5201088190078735,
"learning_rate": 0.0001936724366092332,
"loss": 0.0596,
"step": 1200
},
{
"epoch": 0.7976268951878708,
"grad_norm": 0.794978678226471,
"learning_rate": 0.000193518863868206,
"loss": 0.0484,
"step": 1210
},
{
"epoch": 0.8042188529993408,
"grad_norm": 0.5086749196052551,
"learning_rate": 0.00019336351214560647,
"loss": 0.0482,
"step": 1220
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.5501623749732971,
"learning_rate": 0.00019320638439664426,
"loss": 0.0417,
"step": 1230
},
{
"epoch": 0.8174027686222808,
"grad_norm": 0.4340960383415222,
"learning_rate": 0.0001930474836103138,
"loss": 0.0406,
"step": 1240
},
{
"epoch": 0.8239947264337508,
"grad_norm": 0.5098422169685364,
"learning_rate": 0.00019288681280933768,
"loss": 0.0485,
"step": 1250
},
{
"epoch": 0.8305866842452209,
"grad_norm": 0.4968768358230591,
"learning_rate": 0.00019272437505010877,
"loss": 0.0412,
"step": 1260
},
{
"epoch": 0.8371786420566908,
"grad_norm": 0.46997663378715515,
"learning_rate": 0.00019256017342263228,
"loss": 0.0388,
"step": 1270
},
{
"epoch": 0.8437705998681608,
"grad_norm": 0.5510318279266357,
"learning_rate": 0.00019239421105046706,
"loss": 0.056,
"step": 1280
},
{
"epoch": 0.8503625576796309,
"grad_norm": 0.47607627511024475,
"learning_rate": 0.000192226491090666,
"loss": 0.0462,
"step": 1290
},
{
"epoch": 0.8569545154911009,
"grad_norm": 0.4591579735279083,
"learning_rate": 0.00019205701673371606,
"loss": 0.0456,
"step": 1300
},
{
"epoch": 0.8635464733025708,
"grad_norm": 0.45051664113998413,
"learning_rate": 0.00019188579120347766,
"loss": 0.0402,
"step": 1310
},
{
"epoch": 0.8701384311140409,
"grad_norm": 0.3680923283100128,
"learning_rate": 0.00019171281775712316,
"loss": 0.0378,
"step": 1320
},
{
"epoch": 0.8767303889255109,
"grad_norm": 0.4515272080898285,
"learning_rate": 0.00019153809968507505,
"loss": 0.0439,
"step": 1330
},
{
"epoch": 0.8833223467369808,
"grad_norm": 0.5114394426345825,
"learning_rate": 0.00019136164031094337,
"loss": 0.0522,
"step": 1340
},
{
"epoch": 0.8899143045484509,
"grad_norm": 0.6060967445373535,
"learning_rate": 0.00019118344299146235,
"loss": 0.04,
"step": 1350
},
{
"epoch": 0.8965062623599209,
"grad_norm": 0.7507016658782959,
"learning_rate": 0.00019100351111642666,
"loss": 0.0557,
"step": 1360
},
{
"epoch": 0.9030982201713909,
"grad_norm": 0.4493657648563385,
"learning_rate": 0.00019082184810862698,
"loss": 0.0424,
"step": 1370
},
{
"epoch": 0.9096901779828609,
"grad_norm": 0.5429974794387817,
"learning_rate": 0.00019063845742378467,
"loss": 0.0441,
"step": 1380
},
{
"epoch": 0.9162821357943309,
"grad_norm": 0.43085166811943054,
"learning_rate": 0.00019045334255048634,
"loss": 0.046,
"step": 1390
},
{
"epoch": 0.922874093605801,
"grad_norm": 0.41755935549736023,
"learning_rate": 0.0001902665070101172,
"loss": 0.0461,
"step": 1400
},
{
"epoch": 0.9294660514172709,
"grad_norm": 0.44052428007125854,
"learning_rate": 0.00019007795435679428,
"loss": 0.052,
"step": 1410
},
{
"epoch": 0.9360580092287409,
"grad_norm": 0.4310389757156372,
"learning_rate": 0.00018988768817729864,
"loss": 0.0442,
"step": 1420
},
{
"epoch": 0.942649967040211,
"grad_norm": 0.3892590403556824,
"learning_rate": 0.0001896957120910074,
"loss": 0.0416,
"step": 1430
},
{
"epoch": 0.9492419248516809,
"grad_norm": 0.7788804769515991,
"learning_rate": 0.00018950202974982454,
"loss": 0.0339,
"step": 1440
},
{
"epoch": 0.955833882663151,
"grad_norm": 0.5524693727493286,
"learning_rate": 0.00018930664483811173,
"loss": 0.045,
"step": 1450
},
{
"epoch": 0.962425840474621,
"grad_norm": 0.41249391436576843,
"learning_rate": 0.00018910956107261816,
"loss": 0.0381,
"step": 1460
},
{
"epoch": 0.9690177982860909,
"grad_norm": 0.3245869576931,
"learning_rate": 0.00018891078220240973,
"loss": 0.0277,
"step": 1470
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.28615134954452515,
"learning_rate": 0.0001887103120087979,
"loss": 0.0365,
"step": 1480
},
{
"epoch": 0.982201713909031,
"grad_norm": 0.32258233428001404,
"learning_rate": 0.00018850815430526758,
"loss": 0.0339,
"step": 1490
},
{
"epoch": 0.988793671720501,
"grad_norm": 0.4749410152435303,
"learning_rate": 0.00018830431293740473,
"loss": 0.0414,
"step": 1500
},
{
"epoch": 0.995385629531971,
"grad_norm": 0.44143855571746826,
"learning_rate": 0.00018809879178282313,
"loss": 0.0288,
"step": 1510
},
{
"epoch": 1.0019775873434411,
"grad_norm": 0.4565713107585907,
"learning_rate": 0.00018789159475109067,
"loss": 0.0343,
"step": 1520
},
{
"epoch": 1.008569545154911,
"grad_norm": 0.5609179735183716,
"learning_rate": 0.000187682725783655,
"loss": 0.0423,
"step": 1530
},
{
"epoch": 1.015161502966381,
"grad_norm": 0.4169975221157074,
"learning_rate": 0.00018747218885376842,
"loss": 0.0341,
"step": 1540
},
{
"epoch": 1.0217534607778511,
"grad_norm": 0.44291096925735474,
"learning_rate": 0.0001872599879664124,
"loss": 0.0435,
"step": 1550
},
{
"epoch": 1.028345418589321,
"grad_norm": 0.31878435611724854,
"learning_rate": 0.00018704612715822144,
"loss": 0.0402,
"step": 1560
},
{
"epoch": 1.034937376400791,
"grad_norm": 0.4876072406768799,
"learning_rate": 0.0001868306104974061,
"loss": 0.0298,
"step": 1570
},
{
"epoch": 1.0415293342122611,
"grad_norm": 0.4452480375766754,
"learning_rate": 0.0001866134420836759,
"loss": 0.042,
"step": 1580
},
{
"epoch": 1.048121292023731,
"grad_norm": 0.5295068025588989,
"learning_rate": 0.00018639462604816103,
"loss": 0.0408,
"step": 1590
},
{
"epoch": 1.054713249835201,
"grad_norm": 0.349461168050766,
"learning_rate": 0.00018617416655333395,
"loss": 0.037,
"step": 1600
},
{
"epoch": 1.0613052076466711,
"grad_norm": 0.39832666516304016,
"learning_rate": 0.00018595206779293015,
"loss": 0.0406,
"step": 1610
},
{
"epoch": 1.067897165458141,
"grad_norm": 0.5740079283714294,
"learning_rate": 0.00018572833399186836,
"loss": 0.0411,
"step": 1620
},
{
"epoch": 1.074489123269611,
"grad_norm": 0.20162849128246307,
"learning_rate": 0.00018550296940617034,
"loss": 0.0333,
"step": 1630
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.40781688690185547,
"learning_rate": 0.00018527597832287954,
"loss": 0.036,
"step": 1640
},
{
"epoch": 1.087673038892551,
"grad_norm": 0.2796386182308197,
"learning_rate": 0.00018504736505997997,
"loss": 0.0313,
"step": 1650
},
{
"epoch": 1.094264996704021,
"grad_norm": 0.6502156853675842,
"learning_rate": 0.00018481713396631383,
"loss": 0.0428,
"step": 1660
},
{
"epoch": 1.1008569545154911,
"grad_norm": 0.3565762937068939,
"learning_rate": 0.00018458528942149886,
"loss": 0.0363,
"step": 1670
},
{
"epoch": 1.107448912326961,
"grad_norm": 0.2560652792453766,
"learning_rate": 0.00018435183583584498,
"loss": 0.0404,
"step": 1680
},
{
"epoch": 1.1140408701384312,
"grad_norm": 0.4972442388534546,
"learning_rate": 0.00018411677765027036,
"loss": 0.053,
"step": 1690
},
{
"epoch": 1.1206328279499012,
"grad_norm": 0.36633139848709106,
"learning_rate": 0.0001838801193362171,
"loss": 0.0363,
"step": 1700
},
{
"epoch": 1.127224785761371,
"grad_norm": 0.4480843245983124,
"learning_rate": 0.000183641865395566,
"loss": 0.031,
"step": 1710
},
{
"epoch": 1.133816743572841,
"grad_norm": 0.42788198590278625,
"learning_rate": 0.00018340202036055102,
"loss": 0.0408,
"step": 1720
},
{
"epoch": 1.1404087013843112,
"grad_norm": 0.3363877534866333,
"learning_rate": 0.00018316058879367303,
"loss": 0.0431,
"step": 1730
},
{
"epoch": 1.147000659195781,
"grad_norm": 0.48484691977500916,
"learning_rate": 0.000182917575287613,
"loss": 0.0497,
"step": 1740
},
{
"epoch": 1.1535926170072512,
"grad_norm": 0.4944576025009155,
"learning_rate": 0.00018267298446514473,
"loss": 0.0381,
"step": 1750
},
{
"epoch": 1.1601845748187212,
"grad_norm": 0.31334227323532104,
"learning_rate": 0.00018242682097904673,
"loss": 0.0374,
"step": 1760
},
{
"epoch": 1.166776532630191,
"grad_norm": 0.4245593845844269,
"learning_rate": 0.00018217908951201394,
"loss": 0.0384,
"step": 1770
},
{
"epoch": 1.1733684904416612,
"grad_norm": 0.3156047463417053,
"learning_rate": 0.00018192979477656845,
"loss": 0.0375,
"step": 1780
},
{
"epoch": 1.1799604482531312,
"grad_norm": 0.38936617970466614,
"learning_rate": 0.00018167894151497,
"loss": 0.0383,
"step": 1790
},
{
"epoch": 1.186552406064601,
"grad_norm": 0.39287203550338745,
"learning_rate": 0.00018142653449912564,
"loss": 0.0384,
"step": 1800
},
{
"epoch": 1.1931443638760713,
"grad_norm": 0.4132576882839203,
"learning_rate": 0.0001811725785304991,
"loss": 0.0333,
"step": 1810
},
{
"epoch": 1.1997363216875412,
"grad_norm": 0.42320823669433594,
"learning_rate": 0.00018091707844001935,
"loss": 0.0282,
"step": 1820
},
{
"epoch": 1.2063282794990111,
"grad_norm": 0.4071812927722931,
"learning_rate": 0.00018066003908798873,
"loss": 0.0315,
"step": 1830
},
{
"epoch": 1.2129202373104813,
"grad_norm": 0.40392544865608215,
"learning_rate": 0.0001804014653639904,
"loss": 0.0331,
"step": 1840
},
{
"epoch": 1.2195121951219512,
"grad_norm": 0.4608232080936432,
"learning_rate": 0.00018014136218679567,
"loss": 0.0327,
"step": 1850
},
{
"epoch": 1.2261041529334213,
"grad_norm": 0.5048249959945679,
"learning_rate": 0.00017987973450426994,
"loss": 0.0334,
"step": 1860
},
{
"epoch": 1.2326961107448913,
"grad_norm": 0.5134670734405518,
"learning_rate": 0.0001796165872932789,
"loss": 0.0361,
"step": 1870
},
{
"epoch": 1.2392880685563612,
"grad_norm": 0.339224249124527,
"learning_rate": 0.00017935192555959385,
"loss": 0.0336,
"step": 1880
},
{
"epoch": 1.2458800263678311,
"grad_norm": 0.5917630195617676,
"learning_rate": 0.0001790857543377963,
"loss": 0.0447,
"step": 1890
},
{
"epoch": 1.2524719841793013,
"grad_norm": 0.641945481300354,
"learning_rate": 0.00017881807869118234,
"loss": 0.0546,
"step": 1900
},
{
"epoch": 1.2590639419907712,
"grad_norm": 0.4399726986885071,
"learning_rate": 0.00017854890371166637,
"loss": 0.0358,
"step": 1910
},
{
"epoch": 1.2656558998022414,
"grad_norm": 0.32603511214256287,
"learning_rate": 0.00017827823451968398,
"loss": 0.0342,
"step": 1920
},
{
"epoch": 1.2722478576137113,
"grad_norm": 0.659220814704895,
"learning_rate": 0.0001780060762640949,
"loss": 0.039,
"step": 1930
},
{
"epoch": 1.2788398154251812,
"grad_norm": 0.4240771234035492,
"learning_rate": 0.00017773243412208474,
"loss": 0.035,
"step": 1940
},
{
"epoch": 1.2854317732366514,
"grad_norm": 0.4172196090221405,
"learning_rate": 0.0001774573132990667,
"loss": 0.0379,
"step": 1950
},
{
"epoch": 1.2920237310481213,
"grad_norm": 0.42398178577423096,
"learning_rate": 0.00017718071902858256,
"loss": 0.0373,
"step": 1960
},
{
"epoch": 1.2986156888595912,
"grad_norm": 0.5154095888137817,
"learning_rate": 0.00017690265657220288,
"loss": 0.0403,
"step": 1970
},
{
"epoch": 1.3052076466710614,
"grad_norm": 0.396801233291626,
"learning_rate": 0.00017662313121942727,
"loss": 0.0391,
"step": 1980
},
{
"epoch": 1.3117996044825313,
"grad_norm": 0.4826532006263733,
"learning_rate": 0.00017634214828758342,
"loss": 0.0297,
"step": 1990
},
{
"epoch": 1.3183915622940012,
"grad_norm": 0.508990466594696,
"learning_rate": 0.00017605971312172622,
"loss": 0.0378,
"step": 2000
},
{
"epoch": 1.3249835201054714,
"grad_norm": 0.3308925926685333,
"learning_rate": 0.000175775831094536,
"loss": 0.0379,
"step": 2010
},
{
"epoch": 1.3315754779169413,
"grad_norm": 0.4720020294189453,
"learning_rate": 0.00017549050760621614,
"loss": 0.0392,
"step": 2020
},
{
"epoch": 1.3381674357284115,
"grad_norm": 0.6246912479400635,
"learning_rate": 0.00017520374808439076,
"loss": 0.0363,
"step": 2030
},
{
"epoch": 1.3447593935398814,
"grad_norm": 0.33079174160957336,
"learning_rate": 0.00017491555798400095,
"loss": 0.0316,
"step": 2040
},
{
"epoch": 1.3513513513513513,
"grad_norm": 0.2520120143890381,
"learning_rate": 0.00017462594278720145,
"loss": 0.0325,
"step": 2050
},
{
"epoch": 1.3579433091628212,
"grad_norm": 0.23862145841121674,
"learning_rate": 0.00017433490800325614,
"loss": 0.0351,
"step": 2060
},
{
"epoch": 1.3645352669742914,
"grad_norm": 0.3477911353111267,
"learning_rate": 0.00017404245916843324,
"loss": 0.0389,
"step": 2070
},
{
"epoch": 1.3711272247857613,
"grad_norm": 0.5003520846366882,
"learning_rate": 0.00017374860184590015,
"loss": 0.0368,
"step": 2080
},
{
"epoch": 1.3777191825972315,
"grad_norm": 0.3755623698234558,
"learning_rate": 0.00017345334162561734,
"loss": 0.0341,
"step": 2090
},
{
"epoch": 1.3843111404087014,
"grad_norm": 0.5258712768554688,
"learning_rate": 0.00017315668412423238,
"loss": 0.0334,
"step": 2100
},
{
"epoch": 1.3909030982201713,
"grad_norm": 0.567348062992096,
"learning_rate": 0.0001728586349849728,
"loss": 0.0366,
"step": 2110
},
{
"epoch": 1.3974950560316415,
"grad_norm": 0.4541948139667511,
"learning_rate": 0.00017255919987753878,
"loss": 0.0503,
"step": 2120
},
{
"epoch": 1.4040870138431114,
"grad_norm": 0.44722017645835876,
"learning_rate": 0.0001722583844979955,
"loss": 0.0433,
"step": 2130
},
{
"epoch": 1.4106789716545813,
"grad_norm": 0.25077545642852783,
"learning_rate": 0.0001719561945686646,
"loss": 0.0345,
"step": 2140
},
{
"epoch": 1.4172709294660515,
"grad_norm": 0.3619667887687683,
"learning_rate": 0.00017165263583801535,
"loss": 0.0325,
"step": 2150
},
{
"epoch": 1.4238628872775214,
"grad_norm": 0.6268120408058167,
"learning_rate": 0.0001713477140805553,
"loss": 0.0364,
"step": 2160
},
{
"epoch": 1.4304548450889913,
"grad_norm": 0.5806043148040771,
"learning_rate": 0.0001710414350967204,
"loss": 0.037,
"step": 2170
},
{
"epoch": 1.4370468029004615,
"grad_norm": 0.3783499002456665,
"learning_rate": 0.00017073380471276496,
"loss": 0.0318,
"step": 2180
},
{
"epoch": 1.4436387607119314,
"grad_norm": 0.45143669843673706,
"learning_rate": 0.0001704248287806503,
"loss": 0.0344,
"step": 2190
},
{
"epoch": 1.4502307185234016,
"grad_norm": 0.3384231626987457,
"learning_rate": 0.00017011451317793384,
"loss": 0.0306,
"step": 2200
},
{
"epoch": 1.4568226763348715,
"grad_norm": 0.45972728729248047,
"learning_rate": 0.00016980286380765714,
"loss": 0.0394,
"step": 2210
},
{
"epoch": 1.4634146341463414,
"grad_norm": 0.31935372948646545,
"learning_rate": 0.0001694898865982336,
"loss": 0.0327,
"step": 2220
},
{
"epoch": 1.4700065919578114,
"grad_norm": 0.3758127689361572,
"learning_rate": 0.0001691755875033357,
"loss": 0.0376,
"step": 2230
},
{
"epoch": 1.4765985497692815,
"grad_norm": 0.7778825759887695,
"learning_rate": 0.00016885997250178184,
"loss": 0.0346,
"step": 2240
},
{
"epoch": 1.4831905075807514,
"grad_norm": 0.7735721468925476,
"learning_rate": 0.00016854304759742237,
"loss": 0.038,
"step": 2250
},
{
"epoch": 1.4897824653922216,
"grad_norm": 0.6678999662399292,
"learning_rate": 0.00016822481881902568,
"loss": 0.0488,
"step": 2260
},
{
"epoch": 1.4963744232036915,
"grad_norm": 0.5145410895347595,
"learning_rate": 0.00016790529222016328,
"loss": 0.0423,
"step": 2270
},
{
"epoch": 1.5029663810151614,
"grad_norm": 1.2216230630874634,
"learning_rate": 0.00016758447387909474,
"loss": 0.0435,
"step": 2280
},
{
"epoch": 1.5095583388266314,
"grad_norm": 0.46562644839286804,
"learning_rate": 0.00016726236989865213,
"loss": 0.0329,
"step": 2290
},
{
"epoch": 1.5161502966381015,
"grad_norm": 0.552429735660553,
"learning_rate": 0.00016693898640612382,
"loss": 0.041,
"step": 2300
},
{
"epoch": 1.5227422544495717,
"grad_norm": 0.4718281328678131,
"learning_rate": 0.00016661432955313789,
"loss": 0.0317,
"step": 2310
},
{
"epoch": 1.5293342122610416,
"grad_norm": 0.5447438955307007,
"learning_rate": 0.00016628840551554522,
"loss": 0.0365,
"step": 2320
},
{
"epoch": 1.5359261700725115,
"grad_norm": 0.5384830236434937,
"learning_rate": 0.00016596122049330206,
"loss": 0.0365,
"step": 2330
},
{
"epoch": 1.5425181278839815,
"grad_norm": 0.48313167691230774,
"learning_rate": 0.0001656327807103518,
"loss": 0.0381,
"step": 2340
},
{
"epoch": 1.5491100856954514,
"grad_norm": 0.4898654520511627,
"learning_rate": 0.000165303092414507,
"loss": 0.0343,
"step": 2350
},
{
"epoch": 1.5557020435069215,
"grad_norm": 0.47862598299980164,
"learning_rate": 0.00016497216187733016,
"loss": 0.0333,
"step": 2360
},
{
"epoch": 1.5622940013183917,
"grad_norm": 0.4709709584712982,
"learning_rate": 0.00016463999539401454,
"loss": 0.0351,
"step": 2370
},
{
"epoch": 1.5688859591298616,
"grad_norm": 0.5032598972320557,
"learning_rate": 0.00016430659928326458,
"loss": 0.0306,
"step": 2380
},
{
"epoch": 1.5754779169413315,
"grad_norm": 0.9953115582466125,
"learning_rate": 0.00016397197988717542,
"loss": 0.0388,
"step": 2390
},
{
"epoch": 1.5820698747528015,
"grad_norm": 0.5729079246520996,
"learning_rate": 0.00016363614357111245,
"loss": 0.0336,
"step": 2400
},
{
"epoch": 1.5886618325642716,
"grad_norm": 0.8332236409187317,
"learning_rate": 0.0001632990967235902,
"loss": 0.0414,
"step": 2410
},
{
"epoch": 1.5952537903757416,
"grad_norm": 1.0546754598617554,
"learning_rate": 0.00016296084575615077,
"loss": 0.0383,
"step": 2420
},
{
"epoch": 1.6018457481872117,
"grad_norm": 0.546684205532074,
"learning_rate": 0.0001626213971032418,
"loss": 0.0382,
"step": 2430
},
{
"epoch": 1.6084377059986816,
"grad_norm": 0.6224532723426819,
"learning_rate": 0.00016228075722209422,
"loss": 0.0379,
"step": 2440
},
{
"epoch": 1.6150296638101516,
"grad_norm": 0.39089900255203247,
"learning_rate": 0.00016193893259259934,
"loss": 0.0364,
"step": 2450
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.5209794044494629,
"learning_rate": 0.00016159592971718548,
"loss": 0.0329,
"step": 2460
},
{
"epoch": 1.6282135794330916,
"grad_norm": 0.45939525961875916,
"learning_rate": 0.0001612517551206946,
"loss": 0.0316,
"step": 2470
},
{
"epoch": 1.6348055372445618,
"grad_norm": 0.4331035614013672,
"learning_rate": 0.00016090641535025774,
"loss": 0.0424,
"step": 2480
},
{
"epoch": 1.6413974950560317,
"grad_norm": 0.447710782289505,
"learning_rate": 0.0001605599169751708,
"loss": 0.0387,
"step": 2490
},
{
"epoch": 1.6479894528675016,
"grad_norm": 0.4073365330696106,
"learning_rate": 0.00016021226658676947,
"loss": 0.0404,
"step": 2500
},
{
"epoch": 1.6545814106789716,
"grad_norm": 0.36032500863075256,
"learning_rate": 0.00015986347079830382,
"loss": 0.0311,
"step": 2510
},
{
"epoch": 1.6611733684904415,
"grad_norm": 0.23349802196025848,
"learning_rate": 0.00015951353624481257,
"loss": 0.0248,
"step": 2520
},
{
"epoch": 1.6677653263019117,
"grad_norm": 0.3381997048854828,
"learning_rate": 0.0001591624695829968,
"loss": 0.0316,
"step": 2530
},
{
"epoch": 1.6743572841133818,
"grad_norm": 0.39666473865509033,
"learning_rate": 0.0001588102774910933,
"loss": 0.0399,
"step": 2540
},
{
"epoch": 1.6809492419248517,
"grad_norm": 0.38981807231903076,
"learning_rate": 0.00015845696666874772,
"loss": 0.0325,
"step": 2550
},
{
"epoch": 1.6875411997363217,
"grad_norm": 0.614475667476654,
"learning_rate": 0.00015810254383688682,
"loss": 0.0386,
"step": 2560
},
{
"epoch": 1.6941331575477916,
"grad_norm": 0.6012241244316101,
"learning_rate": 0.0001577470157375909,
"loss": 0.0426,
"step": 2570
},
{
"epoch": 1.7007251153592617,
"grad_norm": 0.8984513878822327,
"learning_rate": 0.00015739038913396546,
"loss": 0.0385,
"step": 2580
},
{
"epoch": 1.7073170731707317,
"grad_norm": 0.5758917331695557,
"learning_rate": 0.00015703267081001237,
"loss": 0.0327,
"step": 2590
},
{
"epoch": 1.7139090309822018,
"grad_norm": 0.39728182554244995,
"learning_rate": 0.00015667386757050106,
"loss": 0.0359,
"step": 2600
},
{
"epoch": 1.7205009887936717,
"grad_norm": 0.44694146513938904,
"learning_rate": 0.00015631398624083907,
"loss": 0.032,
"step": 2610
},
{
"epoch": 1.7270929466051417,
"grad_norm": 0.5872260332107544,
"learning_rate": 0.000155953033666942,
"loss": 0.0307,
"step": 2620
},
{
"epoch": 1.7336849044166116,
"grad_norm": 0.5661513209342957,
"learning_rate": 0.00015559101671510349,
"loss": 0.0326,
"step": 2630
},
{
"epoch": 1.7402768622280818,
"grad_norm": 0.3842809796333313,
"learning_rate": 0.00015522794227186443,
"loss": 0.0326,
"step": 2640
},
{
"epoch": 1.746868820039552,
"grad_norm": 0.24816927313804626,
"learning_rate": 0.00015486381724388222,
"loss": 0.0251,
"step": 2650
},
{
"epoch": 1.7534607778510218,
"grad_norm": 0.2353767305612564,
"learning_rate": 0.00015449864855779903,
"loss": 0.0272,
"step": 2660
},
{
"epoch": 1.7600527356624918,
"grad_norm": 0.25328564643859863,
"learning_rate": 0.00015413244316011038,
"loss": 0.0338,
"step": 2670
},
{
"epoch": 1.7666446934739617,
"grad_norm": 0.37852951884269714,
"learning_rate": 0.0001537652080170328,
"loss": 0.0308,
"step": 2680
},
{
"epoch": 1.7732366512854316,
"grad_norm": 0.294085294008255,
"learning_rate": 0.00015339695011437127,
"loss": 0.0236,
"step": 2690
},
{
"epoch": 1.7798286090969018,
"grad_norm": 0.3499051034450531,
"learning_rate": 0.00015302767645738655,
"loss": 0.0305,
"step": 2700
},
{
"epoch": 1.786420566908372,
"grad_norm": 0.4269741177558899,
"learning_rate": 0.00015265739407066176,
"loss": 0.0279,
"step": 2710
},
{
"epoch": 1.7930125247198418,
"grad_norm": 0.3368455767631531,
"learning_rate": 0.00015228610999796875,
"loss": 0.0306,
"step": 2720
},
{
"epoch": 1.7996044825313118,
"grad_norm": 0.36064472794532776,
"learning_rate": 0.00015191383130213417,
"loss": 0.0281,
"step": 2730
},
{
"epoch": 1.8061964403427817,
"grad_norm": 0.42101433873176575,
"learning_rate": 0.00015154056506490505,
"loss": 0.0299,
"step": 2740
},
{
"epoch": 1.8127883981542519,
"grad_norm": 0.3719172179698944,
"learning_rate": 0.0001511663183868142,
"loss": 0.0323,
"step": 2750
},
{
"epoch": 1.8193803559657218,
"grad_norm": 0.3902226984500885,
"learning_rate": 0.00015079109838704504,
"loss": 0.0327,
"step": 2760
},
{
"epoch": 1.825972313777192,
"grad_norm": 0.36405107378959656,
"learning_rate": 0.00015041491220329616,
"loss": 0.0278,
"step": 2770
},
{
"epoch": 1.8325642715886619,
"grad_norm": 0.31391507387161255,
"learning_rate": 0.0001500377669916456,
"loss": 0.0325,
"step": 2780
},
{
"epoch": 1.8391562294001318,
"grad_norm": 0.4089469611644745,
"learning_rate": 0.0001496596699264147,
"loss": 0.0253,
"step": 2790
},
{
"epoch": 1.8457481872116017,
"grad_norm": 0.5822712779045105,
"learning_rate": 0.00014928062820003166,
"loss": 0.0337,
"step": 2800
},
{
"epoch": 1.8523401450230719,
"grad_norm": 0.5532752275466919,
"learning_rate": 0.00014890064902289466,
"loss": 0.0316,
"step": 2810
},
{
"epoch": 1.858932102834542,
"grad_norm": 0.39222195744514465,
"learning_rate": 0.0001485197396232348,
"loss": 0.0304,
"step": 2820
},
{
"epoch": 1.865524060646012,
"grad_norm": 0.3746655285358429,
"learning_rate": 0.00014813790724697832,
"loss": 0.0361,
"step": 2830
},
{
"epoch": 1.8721160184574819,
"grad_norm": 0.5020349621772766,
"learning_rate": 0.0001477551591576092,
"loss": 0.0351,
"step": 2840
},
{
"epoch": 1.8787079762689518,
"grad_norm": 0.40259358286857605,
"learning_rate": 0.00014737150263603063,
"loss": 0.027,
"step": 2850
},
{
"epoch": 1.8852999340804217,
"grad_norm": 0.6693785190582275,
"learning_rate": 0.00014698694498042675,
"loss": 0.0345,
"step": 2860
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.6384851932525635,
"learning_rate": 0.00014660149350612353,
"loss": 0.0315,
"step": 2870
},
{
"epoch": 1.898483849703362,
"grad_norm": 0.5224544405937195,
"learning_rate": 0.00014621515554544997,
"loss": 0.0259,
"step": 2880
},
{
"epoch": 1.905075807514832,
"grad_norm": 0.5825631022453308,
"learning_rate": 0.0001458279384475983,
"loss": 0.0415,
"step": 2890
},
{
"epoch": 1.911667765326302,
"grad_norm": 0.36511966586112976,
"learning_rate": 0.0001454398495784844,
"loss": 0.033,
"step": 2900
},
{
"epoch": 1.9182597231377718,
"grad_norm": 0.4093778431415558,
"learning_rate": 0.00014505089632060753,
"loss": 0.0309,
"step": 2910
},
{
"epoch": 1.924851680949242,
"grad_norm": 0.4290638566017151,
"learning_rate": 0.00014466108607291003,
"loss": 0.0309,
"step": 2920
},
{
"epoch": 1.931443638760712,
"grad_norm": 0.6213640570640564,
"learning_rate": 0.00014427042625063646,
"loss": 0.0358,
"step": 2930
},
{
"epoch": 1.938035596572182,
"grad_norm": 0.6244672536849976,
"learning_rate": 0.00014387892428519258,
"loss": 0.0387,
"step": 2940
},
{
"epoch": 1.944627554383652,
"grad_norm": 0.380691796541214,
"learning_rate": 0.000143486587624004,
"loss": 0.0464,
"step": 2950
},
{
"epoch": 1.951219512195122,
"grad_norm": 0.4133692979812622,
"learning_rate": 0.00014309342373037455,
"loss": 0.0329,
"step": 2960
},
{
"epoch": 1.9578114700065918,
"grad_norm": 0.4502374529838562,
"learning_rate": 0.00014269944008334418,
"loss": 0.0334,
"step": 2970
},
{
"epoch": 1.964403427818062,
"grad_norm": 0.5235921740531921,
"learning_rate": 0.00014230464417754675,
"loss": 0.033,
"step": 2980
},
{
"epoch": 1.9709953856295321,
"grad_norm": 0.5345565676689148,
"learning_rate": 0.00014190904352306757,
"loss": 0.0371,
"step": 2990
},
{
"epoch": 1.977587343441002,
"grad_norm": 0.34067875146865845,
"learning_rate": 0.0001415126456453004,
"loss": 0.0408,
"step": 3000
},
{
"epoch": 1.984179301252472,
"grad_norm": 0.36922353506088257,
"learning_rate": 0.00014111545808480434,
"loss": 0.0315,
"step": 3010
},
{
"epoch": 1.990771259063942,
"grad_norm": 0.36315643787384033,
"learning_rate": 0.0001407174883971604,
"loss": 0.0311,
"step": 3020
},
{
"epoch": 1.9973632168754119,
"grad_norm": 0.35053545236587524,
"learning_rate": 0.0001403187441528277,
"loss": 0.0367,
"step": 3030
},
{
"epoch": 2.0039551746868822,
"grad_norm": 0.5017916560173035,
"learning_rate": 0.00013991923293699956,
"loss": 0.0353,
"step": 3040
},
{
"epoch": 2.010547132498352,
"grad_norm": 0.3657391667366028,
"learning_rate": 0.00013951896234945925,
"loss": 0.0404,
"step": 3050
},
{
"epoch": 2.017139090309822,
"grad_norm": 0.5382429957389832,
"learning_rate": 0.00013911794000443528,
"loss": 0.0346,
"step": 3060
},
{
"epoch": 2.023731048121292,
"grad_norm": 0.5115209221839905,
"learning_rate": 0.0001387161735304566,
"loss": 0.0288,
"step": 3070
},
{
"epoch": 2.030323005932762,
"grad_norm": 0.5078955888748169,
"learning_rate": 0.00013831367057020748,
"loss": 0.0323,
"step": 3080
},
{
"epoch": 2.036914963744232,
"grad_norm": 0.4034331440925598,
"learning_rate": 0.00013791043878038224,
"loss": 0.0397,
"step": 3090
},
{
"epoch": 2.0435069215557022,
"grad_norm": 0.23669302463531494,
"learning_rate": 0.0001375064858315394,
"loss": 0.0314,
"step": 3100
},
{
"epoch": 2.050098879367172,
"grad_norm": 0.3059588074684143,
"learning_rate": 0.000137101819407956,
"loss": 0.0276,
"step": 3110
},
{
"epoch": 2.056690837178642,
"grad_norm": 0.5819403529167175,
"learning_rate": 0.00013669644720748118,
"loss": 0.0285,
"step": 3120
},
{
"epoch": 2.063282794990112,
"grad_norm": 0.6815973520278931,
"learning_rate": 0.00013629037694138995,
"loss": 0.0329,
"step": 3130
},
{
"epoch": 2.069874752801582,
"grad_norm": 0.28361934423446655,
"learning_rate": 0.0001358836163342364,
"loss": 0.0271,
"step": 3140
},
{
"epoch": 2.076466710613052,
"grad_norm": 0.2907734513282776,
"learning_rate": 0.00013547617312370663,
"loss": 0.0309,
"step": 3150
},
{
"epoch": 2.0830586684245223,
"grad_norm": 0.5272607207298279,
"learning_rate": 0.00013506805506047198,
"loss": 0.0308,
"step": 3160
},
{
"epoch": 2.089650626235992,
"grad_norm": 0.23821255564689636,
"learning_rate": 0.00013465926990804107,
"loss": 0.0341,
"step": 3170
},
{
"epoch": 2.096242584047462,
"grad_norm": 0.5370649099349976,
"learning_rate": 0.00013424982544261248,
"loss": 0.0316,
"step": 3180
},
{
"epoch": 2.102834541858932,
"grad_norm": 0.3361760675907135,
"learning_rate": 0.00013383972945292665,
"loss": 0.0248,
"step": 3190
},
{
"epoch": 2.109426499670402,
"grad_norm": 0.48819541931152344,
"learning_rate": 0.00013342898974011774,
"loss": 0.0347,
"step": 3200
},
{
"epoch": 2.1160184574818723,
"grad_norm": 0.24430608749389648,
"learning_rate": 0.00013301761411756543,
"loss": 0.0269,
"step": 3210
},
{
"epoch": 2.1226104152933423,
"grad_norm": 0.4588664770126343,
"learning_rate": 0.00013260561041074598,
"loss": 0.0276,
"step": 3220
},
{
"epoch": 2.129202373104812,
"grad_norm": 0.5559895634651184,
"learning_rate": 0.0001321929864570835,
"loss": 0.0257,
"step": 3230
},
{
"epoch": 2.135794330916282,
"grad_norm": 0.547458827495575,
"learning_rate": 0.00013177975010580085,
"loss": 0.0223,
"step": 3240
},
{
"epoch": 2.142386288727752,
"grad_norm": 0.3017808198928833,
"learning_rate": 0.00013136590921777053,
"loss": 0.031,
"step": 3250
},
{
"epoch": 2.148978246539222,
"grad_norm": 0.44043952226638794,
"learning_rate": 0.00013095147166536486,
"loss": 0.0276,
"step": 3260
},
{
"epoch": 2.1555702043506924,
"grad_norm": 0.4227822422981262,
"learning_rate": 0.0001305364453323062,
"loss": 0.0296,
"step": 3270
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.4026118516921997,
"learning_rate": 0.0001301208381135173,
"loss": 0.0301,
"step": 3280
},
{
"epoch": 2.168754119973632,
"grad_norm": 0.5354869961738586,
"learning_rate": 0.0001297046579149708,
"loss": 0.0286,
"step": 3290
},
{
"epoch": 2.175346077785102,
"grad_norm": 0.42211246490478516,
"learning_rate": 0.00012928791265353902,
"loss": 0.0336,
"step": 3300
},
{
"epoch": 2.181938035596572,
"grad_norm": 0.3645992577075958,
"learning_rate": 0.00012887061025684333,
"loss": 0.0242,
"step": 3310
},
{
"epoch": 2.188529993408042,
"grad_norm": 0.2105298638343811,
"learning_rate": 0.00012845275866310324,
"loss": 0.0228,
"step": 3320
},
{
"epoch": 2.1951219512195124,
"grad_norm": 0.25215044617652893,
"learning_rate": 0.00012803436582098558,
"loss": 0.0243,
"step": 3330
},
{
"epoch": 2.2017139090309823,
"grad_norm": 0.4196263253688812,
"learning_rate": 0.00012761543968945306,
"loss": 0.0282,
"step": 3340
},
{
"epoch": 2.2083058668424522,
"grad_norm": 0.1937485933303833,
"learning_rate": 0.00012719598823761308,
"loss": 0.0278,
"step": 3350
},
{
"epoch": 2.214897824653922,
"grad_norm": 0.5221042037010193,
"learning_rate": 0.00012677601944456604,
"loss": 0.0311,
"step": 3360
},
{
"epoch": 2.221489782465392,
"grad_norm": 0.2941031754016876,
"learning_rate": 0.0001263555412992535,
"loss": 0.0303,
"step": 3370
},
{
"epoch": 2.2280817402768625,
"grad_norm": 0.31689217686653137,
"learning_rate": 0.00012593456180030646,
"loss": 0.0252,
"step": 3380
},
{
"epoch": 2.2346736980883324,
"grad_norm": 0.42106205224990845,
"learning_rate": 0.0001255130889558928,
"loss": 0.0249,
"step": 3390
},
{
"epoch": 2.2412656558998023,
"grad_norm": 0.576701283454895,
"learning_rate": 0.0001250911307835653,
"loss": 0.0303,
"step": 3400
},
{
"epoch": 2.2478576137112722,
"grad_norm": 0.49954476952552795,
"learning_rate": 0.00012466869531010895,
"loss": 0.0323,
"step": 3410
},
{
"epoch": 2.254449571522742,
"grad_norm": 0.4963241517543793,
"learning_rate": 0.0001242457905713883,
"loss": 0.0316,
"step": 3420
},
{
"epoch": 2.261041529334212,
"grad_norm": 0.23066122829914093,
"learning_rate": 0.00012382242461219452,
"loss": 0.0226,
"step": 3430
},
{
"epoch": 2.267633487145682,
"grad_norm": 0.540354311466217,
"learning_rate": 0.00012339860548609262,
"loss": 0.0365,
"step": 3440
},
{
"epoch": 2.2742254449571524,
"grad_norm": 0.48116335272789,
"learning_rate": 0.0001229743412552679,
"loss": 0.0268,
"step": 3450
},
{
"epoch": 2.2808174027686223,
"grad_norm": 0.4430583417415619,
"learning_rate": 0.00012254963999037285,
"loss": 0.0263,
"step": 3460
},
{
"epoch": 2.2874093605800923,
"grad_norm": 0.42470598220825195,
"learning_rate": 0.0001221245097703735,
"loss": 0.0354,
"step": 3470
},
{
"epoch": 2.294001318391562,
"grad_norm": 0.31455087661743164,
"learning_rate": 0.00012169895868239574,
"loss": 0.0241,
"step": 3480
},
{
"epoch": 2.300593276203032,
"grad_norm": 0.3215204179286957,
"learning_rate": 0.00012127299482157149,
"loss": 0.0332,
"step": 3490
},
{
"epoch": 2.3071852340145025,
"grad_norm": 0.3963293135166168,
"learning_rate": 0.00012084662629088481,
"loss": 0.025,
"step": 3500
},
{
"epoch": 2.3137771918259724,
"grad_norm": 0.4304813742637634,
"learning_rate": 0.00012041986120101764,
"loss": 0.0354,
"step": 3510
},
{
"epoch": 2.3203691496374423,
"grad_norm": 0.3873739242553711,
"learning_rate": 0.00011999270767019553,
"loss": 0.0277,
"step": 3520
},
{
"epoch": 2.3269611074489123,
"grad_norm": 0.4315703809261322,
"learning_rate": 0.00011956517382403321,
"loss": 0.0301,
"step": 3530
},
{
"epoch": 2.333553065260382,
"grad_norm": 0.4416598081588745,
"learning_rate": 0.00011913726779538008,
"loss": 0.0283,
"step": 3540
},
{
"epoch": 2.3401450230718526,
"grad_norm": 0.3677782416343689,
"learning_rate": 0.0001187089977241654,
"loss": 0.0355,
"step": 3550
},
{
"epoch": 2.3467369808833225,
"grad_norm": 0.4988672733306885,
"learning_rate": 0.00011828037175724356,
"loss": 0.0314,
"step": 3560
},
{
"epoch": 2.3533289386947924,
"grad_norm": 0.4604177474975586,
"learning_rate": 0.00011785139804823906,
"loss": 0.0337,
"step": 3570
},
{
"epoch": 2.3599208965062624,
"grad_norm": 0.3596359193325043,
"learning_rate": 0.00011742208475739133,
"loss": 0.0295,
"step": 3580
},
{
"epoch": 2.3665128543177323,
"grad_norm": 0.16485251486301422,
"learning_rate": 0.0001169924400513996,
"loss": 0.0275,
"step": 3590
},
{
"epoch": 2.373104812129202,
"grad_norm": 0.3272377550601959,
"learning_rate": 0.00011656247210326748,
"loss": 0.0305,
"step": 3600
},
{
"epoch": 2.379696769940672,
"grad_norm": 0.32883545756340027,
"learning_rate": 0.0001161321890921476,
"loss": 0.0314,
"step": 3610
},
{
"epoch": 2.3862887277521425,
"grad_norm": 0.49502697587013245,
"learning_rate": 0.00011570159920318584,
"loss": 0.0323,
"step": 3620
},
{
"epoch": 2.3928806855636124,
"grad_norm": 0.3317064344882965,
"learning_rate": 0.00011527071062736583,
"loss": 0.0284,
"step": 3630
},
{
"epoch": 2.3994726433750824,
"grad_norm": 0.29318150877952576,
"learning_rate": 0.00011483953156135292,
"loss": 0.0226,
"step": 3640
},
{
"epoch": 2.4060646011865523,
"grad_norm": 0.48932701349258423,
"learning_rate": 0.00011440807020733843,
"loss": 0.0287,
"step": 3650
},
{
"epoch": 2.4126565589980222,
"grad_norm": 0.358005166053772,
"learning_rate": 0.00011397633477288359,
"loss": 0.0235,
"step": 3660
},
{
"epoch": 2.4192485168094926,
"grad_norm": 0.3554854691028595,
"learning_rate": 0.00011354433347076331,
"loss": 0.0269,
"step": 3670
},
{
"epoch": 2.4258404746209625,
"grad_norm": 0.3954286277294159,
"learning_rate": 0.00011311207451881008,
"loss": 0.0264,
"step": 3680
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.3300182819366455,
"learning_rate": 0.00011267956613975752,
"loss": 0.0291,
"step": 3690
},
{
"epoch": 2.4390243902439024,
"grad_norm": 0.22343868017196655,
"learning_rate": 0.00011224681656108411,
"loss": 0.0251,
"step": 3700
},
{
"epoch": 2.4456163480553723,
"grad_norm": 0.3663915991783142,
"learning_rate": 0.00011181383401485656,
"loss": 0.0295,
"step": 3710
},
{
"epoch": 2.4522083058668427,
"grad_norm": 0.39715585112571716,
"learning_rate": 0.00011138062673757325,
"loss": 0.0299,
"step": 3720
},
{
"epoch": 2.4588002636783126,
"grad_norm": 0.3747979402542114,
"learning_rate": 0.00011094720297000753,
"loss": 0.0295,
"step": 3730
},
{
"epoch": 2.4653922214897825,
"grad_norm": 0.2834596037864685,
"learning_rate": 0.00011051357095705101,
"loss": 0.0284,
"step": 3740
},
{
"epoch": 2.4719841793012525,
"grad_norm": 0.3044513165950775,
"learning_rate": 0.0001100797389475567,
"loss": 0.0272,
"step": 3750
},
{
"epoch": 2.4785761371127224,
"grad_norm": 0.39235764741897583,
"learning_rate": 0.00010964571519418207,
"loss": 0.024,
"step": 3760
},
{
"epoch": 2.4851680949241923,
"grad_norm": 0.31392836570739746,
"learning_rate": 0.00010921150795323207,
"loss": 0.0229,
"step": 3770
},
{
"epoch": 2.4917600527356623,
"grad_norm": 0.3227923512458801,
"learning_rate": 0.00010877712548450207,
"loss": 0.0235,
"step": 3780
},
{
"epoch": 2.4983520105471326,
"grad_norm": 0.35434576869010925,
"learning_rate": 0.00010834257605112079,
"loss": 0.0265,
"step": 3790
},
{
"epoch": 2.5049439683586026,
"grad_norm": 0.3610621988773346,
"learning_rate": 0.00010790786791939301,
"loss": 0.0286,
"step": 3800
},
{
"epoch": 2.5115359261700725,
"grad_norm": 0.26061367988586426,
"learning_rate": 0.00010747300935864243,
"loss": 0.0302,
"step": 3810
},
{
"epoch": 2.5181278839815424,
"grad_norm": 0.3455495536327362,
"learning_rate": 0.00010703800864105429,
"loss": 0.0283,
"step": 3820
},
{
"epoch": 2.5247198417930123,
"grad_norm": 0.5354321002960205,
"learning_rate": 0.00010660287404151807,
"loss": 0.0279,
"step": 3830
},
{
"epoch": 2.5313117996044827,
"grad_norm": 0.23394666612148285,
"learning_rate": 0.00010616761383747,
"loss": 0.0318,
"step": 3840
},
{
"epoch": 2.5379037574159526,
"grad_norm": 0.3995780348777771,
"learning_rate": 0.00010573223630873565,
"loss": 0.0265,
"step": 3850
},
{
"epoch": 2.5444957152274226,
"grad_norm": 0.4800235331058502,
"learning_rate": 0.00010529674973737252,
"loss": 0.0281,
"step": 3860
},
{
"epoch": 2.5510876730388925,
"grad_norm": 0.2611030042171478,
"learning_rate": 0.00010486116240751223,
"loss": 0.0297,
"step": 3870
},
{
"epoch": 2.5576796308503624,
"grad_norm": 0.3945279121398926,
"learning_rate": 0.0001044254826052032,
"loss": 0.025,
"step": 3880
},
{
"epoch": 2.564271588661833,
"grad_norm": 0.5326240658760071,
"learning_rate": 0.00010398971861825297,
"loss": 0.0264,
"step": 3890
},
{
"epoch": 2.5708635464733027,
"grad_norm": 0.3610016703605652,
"learning_rate": 0.00010355387873607036,
"loss": 0.0259,
"step": 3900
},
{
"epoch": 2.5774555042847727,
"grad_norm": 0.3786564767360687,
"learning_rate": 0.0001031179712495081,
"loss": 0.0253,
"step": 3910
},
{
"epoch": 2.5840474620962426,
"grad_norm": 0.5698022246360779,
"learning_rate": 0.0001026820044507048,
"loss": 0.021,
"step": 3920
},
{
"epoch": 2.5906394199077125,
"grad_norm": 0.4795434772968292,
"learning_rate": 0.00010224598663292737,
"loss": 0.0267,
"step": 3930
},
{
"epoch": 2.5972313777191824,
"grad_norm": 0.4011961817741394,
"learning_rate": 0.00010180992609041325,
"loss": 0.035,
"step": 3940
},
{
"epoch": 2.6038233355306524,
"grad_norm": 0.5173267126083374,
"learning_rate": 0.00010137383111821266,
"loss": 0.0298,
"step": 3950
},
{
"epoch": 2.6104152933421227,
"grad_norm": 0.47045668959617615,
"learning_rate": 0.00010093771001203076,
"loss": 0.0296,
"step": 3960
},
{
"epoch": 2.6170072511535927,
"grad_norm": 0.5313148498535156,
"learning_rate": 0.0001005015710680698,
"loss": 0.026,
"step": 3970
},
{
"epoch": 2.6235992089650626,
"grad_norm": 0.40992313623428345,
"learning_rate": 0.00010006542258287139,
"loss": 0.0213,
"step": 3980
},
{
"epoch": 2.6301911667765325,
"grad_norm": 0.2713076174259186,
"learning_rate": 9.96292728531586e-05,
"loss": 0.0238,
"step": 3990
},
{
"epoch": 2.6367831245880025,
"grad_norm": 0.41798898577690125,
"learning_rate": 9.919313017567822e-05,
"loss": 0.0269,
"step": 4000
},
{
"epoch": 2.643375082399473,
"grad_norm": 0.26005855202674866,
"learning_rate": 9.875700284704286e-05,
"loss": 0.0262,
"step": 4010
},
{
"epoch": 2.6499670402109428,
"grad_norm": 0.24366049468517303,
"learning_rate": 9.83208991635732e-05,
"loss": 0.0234,
"step": 4020
},
{
"epoch": 2.6565589980224127,
"grad_norm": 0.424334317445755,
"learning_rate": 9.788482742114003e-05,
"loss": 0.0296,
"step": 4030
},
{
"epoch": 2.6631509558338826,
"grad_norm": 0.3093094229698181,
"learning_rate": 9.744879591500662e-05,
"loss": 0.0282,
"step": 4040
},
{
"epoch": 2.6697429136453525,
"grad_norm": 0.42985987663269043,
"learning_rate": 9.701281293967083e-05,
"loss": 0.031,
"step": 4050
},
{
"epoch": 2.676334871456823,
"grad_norm": 0.3328607380390167,
"learning_rate": 9.657688678870728e-05,
"loss": 0.0318,
"step": 4060
},
{
"epoch": 2.682926829268293,
"grad_norm": 0.35078462958335876,
"learning_rate": 9.614102575460973e-05,
"loss": 0.0268,
"step": 4070
},
{
"epoch": 2.6895187870797628,
"grad_norm": 0.4191462993621826,
"learning_rate": 9.57052381286331e-05,
"loss": 0.03,
"step": 4080
},
{
"epoch": 2.6961107448912327,
"grad_norm": 0.4283992648124695,
"learning_rate": 9.526953220063603e-05,
"loss": 0.0235,
"step": 4090
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.35658934712409973,
"learning_rate": 9.483391625892293e-05,
"loss": 0.0243,
"step": 4100
},
{
"epoch": 2.7092946605141726,
"grad_norm": 0.2613814175128937,
"learning_rate": 9.439839859008653e-05,
"loss": 0.0232,
"step": 4110
},
{
"epoch": 2.7158866183256425,
"grad_norm": 0.24698810279369354,
"learning_rate": 9.396298747885013e-05,
"loss": 0.0232,
"step": 4120
},
{
"epoch": 2.722478576137113,
"grad_norm": 0.25733861327171326,
"learning_rate": 9.352769120790988e-05,
"loss": 0.0231,
"step": 4130
},
{
"epoch": 2.729070533948583,
"grad_norm": 0.288001149892807,
"learning_rate": 9.309251805777754e-05,
"loss": 0.0247,
"step": 4140
},
{
"epoch": 2.7356624917600527,
"grad_norm": 0.47979527711868286,
"learning_rate": 9.265747630662265e-05,
"loss": 0.0315,
"step": 4150
},
{
"epoch": 2.7422544495715226,
"grad_norm": 0.5932050943374634,
"learning_rate": 9.22225742301153e-05,
"loss": 0.0252,
"step": 4160
},
{
"epoch": 2.7488464073829926,
"grad_norm": 0.3525910973548889,
"learning_rate": 9.178782010126844e-05,
"loss": 0.0249,
"step": 4170
},
{
"epoch": 2.755438365194463,
"grad_norm": 0.27204054594039917,
"learning_rate": 9.135322219028079e-05,
"loss": 0.025,
"step": 4180
},
{
"epoch": 2.762030323005933,
"grad_norm": 0.3478144407272339,
"learning_rate": 9.091878876437933e-05,
"loss": 0.0216,
"step": 4190
},
{
"epoch": 2.768622280817403,
"grad_norm": 0.29393240809440613,
"learning_rate": 9.04845280876621e-05,
"loss": 0.0214,
"step": 4200
},
{
"epoch": 2.7752142386288727,
"grad_norm": 0.21876759827136993,
"learning_rate": 9.005044842094101e-05,
"loss": 0.0245,
"step": 4210
},
{
"epoch": 2.7818061964403427,
"grad_norm": 0.423742413520813,
"learning_rate": 8.961655802158456e-05,
"loss": 0.0241,
"step": 4220
},
{
"epoch": 2.788398154251813,
"grad_norm": 0.38848140835762024,
"learning_rate": 8.918286514336099e-05,
"loss": 0.0238,
"step": 4230
},
{
"epoch": 2.794990112063283,
"grad_norm": 0.28686466813087463,
"learning_rate": 8.874937803628115e-05,
"loss": 0.022,
"step": 4240
},
{
"epoch": 2.801582069874753,
"grad_norm": 0.3457236588001251,
"learning_rate": 8.831610494644148e-05,
"loss": 0.0345,
"step": 4250
},
{
"epoch": 2.808174027686223,
"grad_norm": 0.339136004447937,
"learning_rate": 8.788305411586736e-05,
"loss": 0.0194,
"step": 4260
},
{
"epoch": 2.8147659854976927,
"grad_norm": 0.3297877907752991,
"learning_rate": 8.745023378235602e-05,
"loss": 0.0199,
"step": 4270
},
{
"epoch": 2.8213579433091627,
"grad_norm": 0.39552271366119385,
"learning_rate": 8.701765217932022e-05,
"loss": 0.0266,
"step": 4280
},
{
"epoch": 2.8279499011206326,
"grad_norm": 0.40580829977989197,
"learning_rate": 8.658531753563122e-05,
"loss": 0.0367,
"step": 4290
},
{
"epoch": 2.834541858932103,
"grad_norm": 0.3342481553554535,
"learning_rate": 8.615323807546258e-05,
"loss": 0.0223,
"step": 4300
},
{
"epoch": 2.841133816743573,
"grad_norm": 0.25729164481163025,
"learning_rate": 8.572142201813363e-05,
"loss": 0.023,
"step": 4310
},
{
"epoch": 2.847725774555043,
"grad_norm": 0.3168254792690277,
"learning_rate": 8.528987757795286e-05,
"loss": 0.0237,
"step": 4320
},
{
"epoch": 2.8543177323665128,
"grad_norm": 0.4179421365261078,
"learning_rate": 8.485861296406207e-05,
"loss": 0.0268,
"step": 4330
},
{
"epoch": 2.8609096901779827,
"grad_norm": 0.46458080410957336,
"learning_rate": 8.442763638027985e-05,
"loss": 0.0216,
"step": 4340
},
{
"epoch": 2.867501647989453,
"grad_norm": 0.35828524827957153,
"learning_rate": 8.399695602494581e-05,
"loss": 0.0204,
"step": 4350
},
{
"epoch": 2.874093605800923,
"grad_norm": 0.34387773275375366,
"learning_rate": 8.356658009076441e-05,
"loss": 0.0239,
"step": 4360
},
{
"epoch": 2.880685563612393,
"grad_norm": 0.3083021342754364,
"learning_rate": 8.313651676464923e-05,
"loss": 0.0228,
"step": 4370
},
{
"epoch": 2.887277521423863,
"grad_norm": 0.2175825834274292,
"learning_rate": 8.270677422756725e-05,
"loss": 0.0201,
"step": 4380
},
{
"epoch": 2.8938694792353328,
"grad_norm": 0.2774793803691864,
"learning_rate": 8.227736065438302e-05,
"loss": 0.0234,
"step": 4390
},
{
"epoch": 2.900461437046803,
"grad_norm": 0.2598700523376465,
"learning_rate": 8.184828421370348e-05,
"loss": 0.0241,
"step": 4400
},
{
"epoch": 2.9070533948582726,
"grad_norm": 0.3586549460887909,
"learning_rate": 8.141955306772229e-05,
"loss": 0.0162,
"step": 4410
},
{
"epoch": 2.913645352669743,
"grad_norm": 0.26286324858665466,
"learning_rate": 8.099117537206477e-05,
"loss": 0.0212,
"step": 4420
},
{
"epoch": 2.920237310481213,
"grad_norm": 0.4125373661518097,
"learning_rate": 8.05631592756325e-05,
"loss": 0.0202,
"step": 4430
},
{
"epoch": 2.926829268292683,
"grad_norm": 0.29703447222709656,
"learning_rate": 8.013551292044859e-05,
"loss": 0.0213,
"step": 4440
},
{
"epoch": 2.933421226104153,
"grad_norm": 0.3580416738986969,
"learning_rate": 7.97082444415027e-05,
"loss": 0.0226,
"step": 4450
},
{
"epoch": 2.9400131839156227,
"grad_norm": 0.4119264781475067,
"learning_rate": 7.928136196659614e-05,
"loss": 0.0242,
"step": 4460
},
{
"epoch": 2.946605141727093,
"grad_norm": 0.5699878931045532,
"learning_rate": 7.885487361618754e-05,
"loss": 0.0262,
"step": 4470
},
{
"epoch": 2.953197099538563,
"grad_norm": 0.4126439094543457,
"learning_rate": 7.842878750323801e-05,
"loss": 0.021,
"step": 4480
},
{
"epoch": 2.959789057350033,
"grad_norm": 0.42604967951774597,
"learning_rate": 7.800311173305718e-05,
"loss": 0.0219,
"step": 4490
},
{
"epoch": 2.966381015161503,
"grad_norm": 0.19208472967147827,
"learning_rate": 7.757785440314882e-05,
"loss": 0.0284,
"step": 4500
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.43162015080451965,
"learning_rate": 7.715302360305678e-05,
"loss": 0.0192,
"step": 4510
},
{
"epoch": 2.979564930784443,
"grad_norm": 0.7263951301574707,
"learning_rate": 7.672862741421126e-05,
"loss": 0.0299,
"step": 4520
},
{
"epoch": 2.986156888595913,
"grad_norm": 0.3890402615070343,
"learning_rate": 7.63046739097748e-05,
"loss": 0.0222,
"step": 4530
},
{
"epoch": 2.992748846407383,
"grad_norm": 0.25311848521232605,
"learning_rate": 7.588117115448911e-05,
"loss": 0.0208,
"step": 4540
},
{
"epoch": 2.999340804218853,
"grad_norm": 0.33752700686454773,
"learning_rate": 7.545812720452127e-05,
"loss": 0.0263,
"step": 4550
},
{
"epoch": 3.005932762030323,
"grad_norm": 0.2610788345336914,
"learning_rate": 7.50355501073107e-05,
"loss": 0.0246,
"step": 4560
},
{
"epoch": 3.012524719841793,
"grad_norm": 0.32036837935447693,
"learning_rate": 7.461344790141607e-05,
"loss": 0.0283,
"step": 4570
},
{
"epoch": 3.019116677653263,
"grad_norm": 0.4340413212776184,
"learning_rate": 7.419182861636218e-05,
"loss": 0.0293,
"step": 4580
},
{
"epoch": 3.025708635464733,
"grad_norm": 0.39858514070510864,
"learning_rate": 7.377070027248756e-05,
"loss": 0.0186,
"step": 4590
},
{
"epoch": 3.032300593276203,
"grad_norm": 0.26919031143188477,
"learning_rate": 7.335007088079156e-05,
"loss": 0.0208,
"step": 4600
},
{
"epoch": 3.038892551087673,
"grad_norm": 0.4067997634410858,
"learning_rate": 7.292994844278223e-05,
"loss": 0.0261,
"step": 4610
},
{
"epoch": 3.045484508899143,
"grad_norm": 0.4950489103794098,
"learning_rate": 7.251034095032388e-05,
"loss": 0.0292,
"step": 4620
},
{
"epoch": 3.052076466710613,
"grad_norm": 0.2269221693277359,
"learning_rate": 7.20912563854852e-05,
"loss": 0.0175,
"step": 4630
},
{
"epoch": 3.058668424522083,
"grad_norm": 0.32157209515571594,
"learning_rate": 7.167270272038747e-05,
"loss": 0.0187,
"step": 4640
},
{
"epoch": 3.065260382333553,
"grad_norm": 0.2660551369190216,
"learning_rate": 7.12546879170527e-05,
"loss": 0.023,
"step": 4650
},
{
"epoch": 3.071852340145023,
"grad_norm": 0.29758307337760925,
"learning_rate": 7.08372199272524e-05,
"loss": 0.0291,
"step": 4660
},
{
"epoch": 3.078444297956493,
"grad_norm": 0.32291552424430847,
"learning_rate": 7.042030669235606e-05,
"loss": 0.0334,
"step": 4670
},
{
"epoch": 3.085036255767963,
"grad_norm": 0.481623113155365,
"learning_rate": 7.000395614318038e-05,
"loss": 0.0192,
"step": 4680
},
{
"epoch": 3.0916282135794333,
"grad_norm": 0.36292940378189087,
"learning_rate": 6.958817619983822e-05,
"loss": 0.0279,
"step": 4690
},
{
"epoch": 3.098220171390903,
"grad_norm": 0.34903573989868164,
"learning_rate": 6.917297477158792e-05,
"loss": 0.0219,
"step": 4700
},
{
"epoch": 3.104812129202373,
"grad_norm": 0.290768563747406,
"learning_rate": 6.875835975668298e-05,
"loss": 0.0245,
"step": 4710
},
{
"epoch": 3.111404087013843,
"grad_norm": 0.4250969886779785,
"learning_rate": 6.834433904222162e-05,
"loss": 0.0239,
"step": 4720
},
{
"epoch": 3.117996044825313,
"grad_norm": 0.31465357542037964,
"learning_rate": 6.793092050399698e-05,
"loss": 0.0227,
"step": 4730
},
{
"epoch": 3.124588002636783,
"grad_norm": 0.46385765075683594,
"learning_rate": 6.75181120063471e-05,
"loss": 0.0271,
"step": 4740
},
{
"epoch": 3.1311799604482533,
"grad_norm": 0.37862929701805115,
"learning_rate": 6.710592140200542e-05,
"loss": 0.0227,
"step": 4750
},
{
"epoch": 3.1377719182597232,
"grad_norm": 0.49200916290283203,
"learning_rate": 6.669435653195146e-05,
"loss": 0.0201,
"step": 4760
},
{
"epoch": 3.144363876071193,
"grad_norm": 0.4198756217956543,
"learning_rate": 6.628342522526143e-05,
"loss": 0.0216,
"step": 4770
},
{
"epoch": 3.150955833882663,
"grad_norm": 0.5533847212791443,
"learning_rate": 6.587313529895957e-05,
"loss": 0.034,
"step": 4780
},
{
"epoch": 3.157547791694133,
"grad_norm": 0.37719669938087463,
"learning_rate": 6.546349455786926e-05,
"loss": 0.0282,
"step": 4790
},
{
"epoch": 3.164139749505603,
"grad_norm": 0.6606992483139038,
"learning_rate": 6.505451079446467e-05,
"loss": 0.0217,
"step": 4800
},
{
"epoch": 3.1707317073170733,
"grad_norm": 0.20845943689346313,
"learning_rate": 6.464619178872247e-05,
"loss": 0.023,
"step": 4810
},
{
"epoch": 3.1773236651285433,
"grad_norm": 0.23495689034461975,
"learning_rate": 6.42385453079738e-05,
"loss": 0.0256,
"step": 4820
},
{
"epoch": 3.183915622940013,
"grad_norm": 0.1919371336698532,
"learning_rate": 6.38315791067567e-05,
"loss": 0.019,
"step": 4830
},
{
"epoch": 3.190507580751483,
"grad_norm": 0.3485127091407776,
"learning_rate": 6.342530092666821e-05,
"loss": 0.0205,
"step": 4840
},
{
"epoch": 3.197099538562953,
"grad_norm": 0.2419605702161789,
"learning_rate": 6.301971849621757e-05,
"loss": 0.0197,
"step": 4850
},
{
"epoch": 3.2036914963744234,
"grad_norm": 0.23359638452529907,
"learning_rate": 6.261483953067886e-05,
"loss": 0.0215,
"step": 4860
},
{
"epoch": 3.2102834541858933,
"grad_norm": 0.4236893355846405,
"learning_rate": 6.221067173194442e-05,
"loss": 0.0259,
"step": 4870
},
{
"epoch": 3.2168754119973633,
"grad_norm": 0.35271692276000977,
"learning_rate": 6.180722278837825e-05,
"loss": 0.0229,
"step": 4880
},
{
"epoch": 3.223467369808833,
"grad_norm": 0.5368591547012329,
"learning_rate": 6.140450037466974e-05,
"loss": 0.0227,
"step": 4890
},
{
"epoch": 3.230059327620303,
"grad_norm": 0.3813161849975586,
"learning_rate": 6.1002512151687796e-05,
"loss": 0.0175,
"step": 4900
},
{
"epoch": 3.236651285431773,
"grad_norm": 0.40781912207603455,
"learning_rate": 6.060126576633497e-05,
"loss": 0.0278,
"step": 4910
},
{
"epoch": 3.2432432432432434,
"grad_norm": 0.3028331398963928,
"learning_rate": 6.0200768851402133e-05,
"loss": 0.0212,
"step": 4920
},
{
"epoch": 3.2498352010547134,
"grad_norm": 0.20801442861557007,
"learning_rate": 5.980102902542306e-05,
"loss": 0.0244,
"step": 4930
},
{
"epoch": 3.2564271588661833,
"grad_norm": 0.3236633241176605,
"learning_rate": 5.9402053892529794e-05,
"loss": 0.023,
"step": 4940
},
{
"epoch": 3.263019116677653,
"grad_norm": 0.3075791895389557,
"learning_rate": 5.9003851042307804e-05,
"loss": 0.0193,
"step": 4950
},
{
"epoch": 3.269611074489123,
"grad_norm": 0.33486539125442505,
"learning_rate": 5.86064280496516e-05,
"loss": 0.0212,
"step": 4960
},
{
"epoch": 3.276203032300593,
"grad_norm": 0.4018231928348541,
"learning_rate": 5.8209792474620815e-05,
"loss": 0.0215,
"step": 4970
},
{
"epoch": 3.2827949901120634,
"grad_norm": 0.35829004645347595,
"learning_rate": 5.78139518622961e-05,
"loss": 0.0228,
"step": 4980
},
{
"epoch": 3.2893869479235334,
"grad_norm": 0.2682739496231079,
"learning_rate": 5.741891374263593e-05,
"loss": 0.0255,
"step": 4990
},
{
"epoch": 3.2959789057350033,
"grad_norm": 0.3929627537727356,
"learning_rate": 5.702468563033306e-05,
"loss": 0.0228,
"step": 5000
},
{
"epoch": 3.3025708635464732,
"grad_norm": 0.2807949483394623,
"learning_rate": 5.663127502467184e-05,
"loss": 0.0207,
"step": 5010
},
{
"epoch": 3.309162821357943,
"grad_norm": 0.33235079050064087,
"learning_rate": 5.6238689409385346e-05,
"loss": 0.0243,
"step": 5020
},
{
"epoch": 3.3157547791694135,
"grad_norm": 0.28995218873023987,
"learning_rate": 5.5846936252513174e-05,
"loss": 0.017,
"step": 5030
},
{
"epoch": 3.3223467369808835,
"grad_norm": 0.2601809799671173,
"learning_rate": 5.54560230062593e-05,
"loss": 0.0166,
"step": 5040
},
{
"epoch": 3.3289386947923534,
"grad_norm": 0.3650406301021576,
"learning_rate": 5.5065957106850204e-05,
"loss": 0.021,
"step": 5050
},
{
"epoch": 3.3355306526038233,
"grad_norm": 0.48497456312179565,
"learning_rate": 5.4676745974393764e-05,
"loss": 0.0173,
"step": 5060
},
{
"epoch": 3.3421226104152932,
"grad_norm": 0.3954178988933563,
"learning_rate": 5.4288397012737646e-05,
"loss": 0.02,
"step": 5070
},
{
"epoch": 3.348714568226763,
"grad_norm": 0.21555176377296448,
"learning_rate": 5.390091760932887e-05,
"loss": 0.0208,
"step": 5080
},
{
"epoch": 3.3553065260382335,
"grad_norm": 0.4477789103984833,
"learning_rate": 5.3514315135073076e-05,
"loss": 0.023,
"step": 5090
},
{
"epoch": 3.3618984838497035,
"grad_norm": 0.4595910906791687,
"learning_rate": 5.3128596944194234e-05,
"loss": 0.027,
"step": 5100
},
{
"epoch": 3.3684904416611734,
"grad_norm": 0.3426424264907837,
"learning_rate": 5.274377037409497e-05,
"loss": 0.0224,
"step": 5110
},
{
"epoch": 3.3750823994726433,
"grad_norm": 0.2647363841533661,
"learning_rate": 5.235984274521684e-05,
"loss": 0.0238,
"step": 5120
},
{
"epoch": 3.3816743572841133,
"grad_norm": 0.21992464363574982,
"learning_rate": 5.197682136090107e-05,
"loss": 0.0163,
"step": 5130
},
{
"epoch": 3.388266315095583,
"grad_norm": 0.6907774209976196,
"learning_rate": 5.159471350724978e-05,
"loss": 0.0223,
"step": 5140
},
{
"epoch": 3.3948582729070536,
"grad_norm": 0.44378501176834106,
"learning_rate": 5.121352645298708e-05,
"loss": 0.0245,
"step": 5150
},
{
"epoch": 3.4014502307185235,
"grad_norm": 0.25844740867614746,
"learning_rate": 5.083326744932117e-05,
"loss": 0.0211,
"step": 5160
},
{
"epoch": 3.4080421885299934,
"grad_norm": 0.3211382031440735,
"learning_rate": 5.0453943729806094e-05,
"loss": 0.0207,
"step": 5170
},
{
"epoch": 3.4146341463414633,
"grad_norm": 0.25202128291130066,
"learning_rate": 5.007556251020434e-05,
"loss": 0.0215,
"step": 5180
},
{
"epoch": 3.4212261041529333,
"grad_norm": 0.3003428876399994,
"learning_rate": 4.9698130988349424e-05,
"loss": 0.0207,
"step": 5190
},
{
"epoch": 3.4278180619644036,
"grad_norm": 0.32026761770248413,
"learning_rate": 4.9321656344009115e-05,
"loss": 0.0196,
"step": 5200
},
{
"epoch": 3.4344100197758736,
"grad_norm": 0.26623809337615967,
"learning_rate": 4.894614573874877e-05,
"loss": 0.0219,
"step": 5210
},
{
"epoch": 3.4410019775873435,
"grad_norm": 0.35238540172576904,
"learning_rate": 4.857160631579509e-05,
"loss": 0.0152,
"step": 5220
},
{
"epoch": 3.4475939353988134,
"grad_norm": 0.3443749248981476,
"learning_rate": 4.819804519990033e-05,
"loss": 0.0232,
"step": 5230
},
{
"epoch": 3.4541858932102834,
"grad_norm": 0.35800328850746155,
"learning_rate": 4.782546949720658e-05,
"loss": 0.0217,
"step": 5240
},
{
"epoch": 3.4607778510217533,
"grad_norm": 0.37850216031074524,
"learning_rate": 4.745388629511084e-05,
"loss": 0.0167,
"step": 5250
},
{
"epoch": 3.4673698088332237,
"grad_norm": 0.24581514298915863,
"learning_rate": 4.708330266212993e-05,
"loss": 0.0179,
"step": 5260
},
{
"epoch": 3.4739617666446936,
"grad_norm": 0.16642197966575623,
"learning_rate": 4.671372564776629e-05,
"loss": 0.0169,
"step": 5270
},
{
"epoch": 3.4805537244561635,
"grad_norm": 0.32910865545272827,
"learning_rate": 4.634516228237372e-05,
"loss": 0.019,
"step": 5280
},
{
"epoch": 3.4871456822676334,
"grad_norm": 0.21662920713424683,
"learning_rate": 4.59776195770236e-05,
"loss": 0.0162,
"step": 5290
},
{
"epoch": 3.4937376400791034,
"grad_norm": 0.3485572934150696,
"learning_rate": 4.561110452337171e-05,
"loss": 0.0217,
"step": 5300
},
{
"epoch": 3.5003295978905733,
"grad_norm": 0.20581798255443573,
"learning_rate": 4.5245624093525e-05,
"loss": 0.0296,
"step": 5310
},
{
"epoch": 3.5069215557020437,
"grad_norm": 0.35009968280792236,
"learning_rate": 4.488118523990915e-05,
"loss": 0.0208,
"step": 5320
},
{
"epoch": 3.5135135135135136,
"grad_norm": 0.39382439851760864,
"learning_rate": 4.451779489513628e-05,
"loss": 0.0217,
"step": 5330
},
{
"epoch": 3.5201054713249835,
"grad_norm": 0.348563551902771,
"learning_rate": 4.415545997187296e-05,
"loss": 0.0165,
"step": 5340
},
{
"epoch": 3.5266974291364535,
"grad_norm": 0.494354784488678,
"learning_rate": 4.379418736270886e-05,
"loss": 0.0232,
"step": 5350
},
{
"epoch": 3.5332893869479234,
"grad_norm": 0.1578008085489273,
"learning_rate": 4.343398394002547e-05,
"loss": 0.0226,
"step": 5360
},
{
"epoch": 3.5398813447593938,
"grad_norm": 0.3410768210887909,
"learning_rate": 4.307485655586557e-05,
"loss": 0.0219,
"step": 5370
},
{
"epoch": 3.5464733025708637,
"grad_norm": 0.20960773527622223,
"learning_rate": 4.271681204180268e-05,
"loss": 0.0209,
"step": 5380
},
{
"epoch": 3.5530652603823336,
"grad_norm": 0.22281195223331451,
"learning_rate": 4.2359857208811284e-05,
"loss": 0.0233,
"step": 5390
},
{
"epoch": 3.5596572181938035,
"grad_norm": 0.3393511474132538,
"learning_rate": 4.2003998847137174e-05,
"loss": 0.0209,
"step": 5400
},
{
"epoch": 3.5662491760052735,
"grad_norm": 0.6712432503700256,
"learning_rate": 4.164924372616821e-05,
"loss": 0.0249,
"step": 5410
},
{
"epoch": 3.572841133816744,
"grad_norm": 0.18807201087474823,
"learning_rate": 4.129559859430573e-05,
"loss": 0.024,
"step": 5420
},
{
"epoch": 3.5794330916282133,
"grad_norm": 0.4251366853713989,
"learning_rate": 4.094307017883606e-05,
"loss": 0.0174,
"step": 5430
},
{
"epoch": 3.5860250494396837,
"grad_norm": 0.2247576266527176,
"learning_rate": 4.0591665185802576e-05,
"loss": 0.0214,
"step": 5440
},
{
"epoch": 3.5926170072511536,
"grad_norm": 0.643822968006134,
"learning_rate": 4.0241390299878e-05,
"loss": 0.0222,
"step": 5450
},
{
"epoch": 3.5992089650626236,
"grad_norm": 0.37506723403930664,
"learning_rate": 3.989225218423753e-05,
"loss": 0.0147,
"step": 5460
},
{
"epoch": 3.6058009228740935,
"grad_norm": 0.3052820861339569,
"learning_rate": 3.954425748043186e-05,
"loss": 0.0191,
"step": 5470
},
{
"epoch": 3.6123928806855634,
"grad_norm": 0.3424012362957001,
"learning_rate": 3.9197412808260805e-05,
"loss": 0.0214,
"step": 5480
},
{
"epoch": 3.618984838497034,
"grad_norm": 0.24967588484287262,
"learning_rate": 3.885172476564765e-05,
"loss": 0.0157,
"step": 5490
},
{
"epoch": 3.6255767963085037,
"grad_norm": 0.2771139442920685,
"learning_rate": 3.850719992851326e-05,
"loss": 0.0198,
"step": 5500
},
{
"epoch": 3.6321687541199736,
"grad_norm": 0.3275032043457031,
"learning_rate": 3.8163844850651346e-05,
"loss": 0.0204,
"step": 5510
},
{
"epoch": 3.6387607119314436,
"grad_norm": 0.3696538507938385,
"learning_rate": 3.7821666063603566e-05,
"loss": 0.0172,
"step": 5520
},
{
"epoch": 3.6453526697429135,
"grad_norm": 0.43786558508872986,
"learning_rate": 3.748067007653536e-05,
"loss": 0.0199,
"step": 5530
},
{
"epoch": 3.651944627554384,
"grad_norm": 0.15298739075660706,
"learning_rate": 3.714086337611217e-05,
"loss": 0.0118,
"step": 5540
},
{
"epoch": 3.658536585365854,
"grad_norm": 0.2643417716026306,
"learning_rate": 3.680225242637583e-05,
"loss": 0.0217,
"step": 5550
},
{
"epoch": 3.6651285431773237,
"grad_norm": 0.29987242817878723,
"learning_rate": 3.646484366862197e-05,
"loss": 0.0218,
"step": 5560
},
{
"epoch": 3.6717205009887937,
"grad_norm": 0.2553282678127289,
"learning_rate": 3.6128643521277096e-05,
"loss": 0.0192,
"step": 5570
},
{
"epoch": 3.6783124588002636,
"grad_norm": 0.24411100149154663,
"learning_rate": 3.57936583797768e-05,
"loss": 0.0156,
"step": 5580
},
{
"epoch": 3.684904416611734,
"grad_norm": 0.2638270854949951,
"learning_rate": 3.5459894616443954e-05,
"loss": 0.0188,
"step": 5590
},
{
"epoch": 3.6914963744232034,
"grad_norm": 0.19742664694786072,
"learning_rate": 3.5127358580367463e-05,
"loss": 0.021,
"step": 5600
},
{
"epoch": 3.698088332234674,
"grad_norm": 0.3131982386112213,
"learning_rate": 3.479605659728159e-05,
"loss": 0.0176,
"step": 5610
},
{
"epoch": 3.7046802900461437,
"grad_norm": 0.24199941754341125,
"learning_rate": 3.446599496944557e-05,
"loss": 0.0178,
"step": 5620
},
{
"epoch": 3.7112722478576137,
"grad_norm": 0.18790839612483978,
"learning_rate": 3.413717997552376e-05,
"loss": 0.012,
"step": 5630
},
{
"epoch": 3.7178642056690836,
"grad_norm": 0.4031229317188263,
"learning_rate": 3.380961787046605e-05,
"loss": 0.022,
"step": 5640
},
{
"epoch": 3.7244561634805535,
"grad_norm": 0.3094145357608795,
"learning_rate": 3.348331488538913e-05,
"loss": 0.0207,
"step": 5650
},
{
"epoch": 3.731048121292024,
"grad_norm": 0.31893035769462585,
"learning_rate": 3.315827722745779e-05,
"loss": 0.0195,
"step": 5660
},
{
"epoch": 3.737640079103494,
"grad_norm": 0.2687014639377594,
"learning_rate": 3.28345110797668e-05,
"loss": 0.0152,
"step": 5670
},
{
"epoch": 3.7442320369149638,
"grad_norm": 0.3952026963233948,
"learning_rate": 3.2512022601223515e-05,
"loss": 0.0247,
"step": 5680
},
{
"epoch": 3.7508239947264337,
"grad_norm": 0.25332149863243103,
"learning_rate": 3.21908179264304e-05,
"loss": 0.0142,
"step": 5690
},
{
"epoch": 3.7574159525379036,
"grad_norm": 0.4335060119628906,
"learning_rate": 3.187090316556861e-05,
"loss": 0.0202,
"step": 5700
},
{
"epoch": 3.764007910349374,
"grad_norm": 0.25930336117744446,
"learning_rate": 3.155228440428164e-05,
"loss": 0.0208,
"step": 5710
},
{
"epoch": 3.770599868160844,
"grad_norm": 0.6695492267608643,
"learning_rate": 3.123496770355956e-05,
"loss": 0.0153,
"step": 5720
},
{
"epoch": 3.777191825972314,
"grad_norm": 0.3357510566711426,
"learning_rate": 3.091895909962375e-05,
"loss": 0.021,
"step": 5730
},
{
"epoch": 3.7837837837837838,
"grad_norm": 0.4220266342163086,
"learning_rate": 3.060426460381195e-05,
"loss": 0.0155,
"step": 5740
},
{
"epoch": 3.7903757415952537,
"grad_norm": 0.2396579086780548,
"learning_rate": 3.0290890202464182e-05,
"loss": 0.017,
"step": 5750
},
{
"epoch": 3.796967699406724,
"grad_norm": 0.4336076080799103,
"learning_rate": 2.9978841856808525e-05,
"loss": 0.0193,
"step": 5760
},
{
"epoch": 3.8035596572181936,
"grad_norm": 0.4535181224346161,
"learning_rate": 2.966812550284803e-05,
"loss": 0.0151,
"step": 5770
},
{
"epoch": 3.810151615029664,
"grad_norm": 0.2847338020801544,
"learning_rate": 2.9358747051247637e-05,
"loss": 0.0164,
"step": 5780
},
{
"epoch": 3.816743572841134,
"grad_norm": 0.33757925033569336,
"learning_rate": 2.905071238722169e-05,
"loss": 0.0173,
"step": 5790
},
{
"epoch": 3.823335530652604,
"grad_norm": 0.21222251653671265,
"learning_rate": 2.8744027370422167e-05,
"loss": 0.0186,
"step": 5800
},
{
"epoch": 3.8299274884640737,
"grad_norm": 0.8053876757621765,
"learning_rate": 2.843869783482701e-05,
"loss": 0.0189,
"step": 5810
},
{
"epoch": 3.8365194462755436,
"grad_norm": 0.2711152732372284,
"learning_rate": 2.8134729588629303e-05,
"loss": 0.0281,
"step": 5820
},
{
"epoch": 3.843111404087014,
"grad_norm": 0.24810029566287994,
"learning_rate": 2.7832128414126735e-05,
"loss": 0.0169,
"step": 5830
},
{
"epoch": 3.849703361898484,
"grad_norm": 0.3628500998020172,
"learning_rate": 2.7530900067611577e-05,
"loss": 0.0138,
"step": 5840
},
{
"epoch": 3.856295319709954,
"grad_norm": 0.1820344775915146,
"learning_rate": 2.7231050279261217e-05,
"loss": 0.0201,
"step": 5850
},
{
"epoch": 3.862887277521424,
"grad_norm": 0.5230331420898438,
"learning_rate": 2.6932584753029068e-05,
"loss": 0.0162,
"step": 5860
},
{
"epoch": 3.8694792353328937,
"grad_norm": 0.27183738350868225,
"learning_rate": 2.6635509166536243e-05,
"loss": 0.0173,
"step": 5870
},
{
"epoch": 3.876071193144364,
"grad_norm": 0.19195932149887085,
"learning_rate": 2.633982917096335e-05,
"loss": 0.0207,
"step": 5880
},
{
"epoch": 3.882663150955834,
"grad_norm": 0.42282554507255554,
"learning_rate": 2.6045550390943185e-05,
"loss": 0.0159,
"step": 5890
},
{
"epoch": 3.889255108767304,
"grad_norm": 0.2981650233268738,
"learning_rate": 2.5752678424453514e-05,
"loss": 0.0173,
"step": 5900
},
{
"epoch": 3.895847066578774,
"grad_norm": 0.32203352451324463,
"learning_rate": 2.5461218842710798e-05,
"loss": 0.021,
"step": 5910
},
{
"epoch": 3.902439024390244,
"grad_norm": 0.2388588786125183,
"learning_rate": 2.517117719006411e-05,
"loss": 0.0219,
"step": 5920
},
{
"epoch": 3.9090309822017137,
"grad_norm": 0.40328285098075867,
"learning_rate": 2.488255898388966e-05,
"loss": 0.0169,
"step": 5930
},
{
"epoch": 3.9156229400131837,
"grad_norm": 0.14190708100795746,
"learning_rate": 2.4595369714485895e-05,
"loss": 0.0167,
"step": 5940
},
{
"epoch": 3.922214897824654,
"grad_norm": 0.418643593788147,
"learning_rate": 2.430961484496893e-05,
"loss": 0.0187,
"step": 5950
},
{
"epoch": 3.928806855636124,
"grad_norm": 0.2280479073524475,
"learning_rate": 2.4025299811168843e-05,
"loss": 0.0151,
"step": 5960
},
{
"epoch": 3.935398813447594,
"grad_norm": 0.5002431869506836,
"learning_rate": 2.3742430021526018e-05,
"loss": 0.019,
"step": 5970
},
{
"epoch": 3.941990771259064,
"grad_norm": 0.22551734745502472,
"learning_rate": 2.3461010856988473e-05,
"loss": 0.013,
"step": 5980
},
{
"epoch": 3.9485827290705338,
"grad_norm": 0.3069497048854828,
"learning_rate": 2.318104767090944e-05,
"loss": 0.018,
"step": 5990
},
{
"epoch": 3.955174686882004,
"grad_norm": 0.36286690831184387,
"learning_rate": 2.2902545788945396e-05,
"loss": 0.024,
"step": 6000
},
{
"epoch": 3.961766644693474,
"grad_norm": 0.2421414703130722,
"learning_rate": 2.2625510508954952e-05,
"loss": 0.0212,
"step": 6010
},
{
"epoch": 3.968358602504944,
"grad_norm": 0.23019398748874664,
"learning_rate": 2.234994710089795e-05,
"loss": 0.0188,
"step": 6020
},
{
"epoch": 3.974950560316414,
"grad_norm": 0.2802564203739166,
"learning_rate": 2.207586080673528e-05,
"loss": 0.0192,
"step": 6030
},
{
"epoch": 3.981542518127884,
"grad_norm": 0.2667250633239746,
"learning_rate": 2.1803256840329134e-05,
"loss": 0.0213,
"step": 6040
},
{
"epoch": 3.988134475939354,
"grad_norm": 0.4056625962257385,
"learning_rate": 2.1532140387343735e-05,
"loss": 0.0169,
"step": 6050
},
{
"epoch": 3.994726433750824,
"grad_norm": 0.1790419965982437,
"learning_rate": 2.126251660514691e-05,
"loss": 0.0185,
"step": 6060
},
{
"epoch": 4.001318391562294,
"grad_norm": 0.2861385941505432,
"learning_rate": 2.0994390622711734e-05,
"loss": 0.0191,
"step": 6070
},
{
"epoch": 4.0079103493737644,
"grad_norm": 0.20970335602760315,
"learning_rate": 2.0727767540519193e-05,
"loss": 0.0171,
"step": 6080
},
{
"epoch": 4.014502307185234,
"grad_norm": 0.2126467227935791,
"learning_rate": 2.046265243046094e-05,
"loss": 0.0175,
"step": 6090
},
{
"epoch": 4.021094264996704,
"grad_norm": 0.4862785339355469,
"learning_rate": 2.0199050335743007e-05,
"loss": 0.0212,
"step": 6100
},
{
"epoch": 4.027686222808174,
"grad_norm": 0.36454570293426514,
"learning_rate": 1.9936966270789738e-05,
"loss": 0.0159,
"step": 6110
},
{
"epoch": 4.034278180619644,
"grad_norm": 0.1897134780883789,
"learning_rate": 1.9676405221148475e-05,
"loss": 0.0172,
"step": 6120
},
{
"epoch": 4.040870138431114,
"grad_norm": 0.2542422115802765,
"learning_rate": 1.9417372143394697e-05,
"loss": 0.0251,
"step": 6130
},
{
"epoch": 4.047462096242584,
"grad_norm": 0.20512335002422333,
"learning_rate": 1.9159871965037657e-05,
"loss": 0.0172,
"step": 6140
},
{
"epoch": 4.054054054054054,
"grad_norm": 0.21565409004688263,
"learning_rate": 1.8903909584426826e-05,
"loss": 0.018,
"step": 6150
},
{
"epoch": 4.060646011865524,
"grad_norm": 0.3546988368034363,
"learning_rate": 1.86494898706585e-05,
"loss": 0.0169,
"step": 6160
},
{
"epoch": 4.067237969676994,
"grad_norm": 0.5294975638389587,
"learning_rate": 1.8396617663483363e-05,
"loss": 0.0159,
"step": 6170
},
{
"epoch": 4.073829927488464,
"grad_norm": 0.2470693439245224,
"learning_rate": 1.814529777321432e-05,
"loss": 0.0211,
"step": 6180
},
{
"epoch": 4.080421885299934,
"grad_norm": 0.4331272542476654,
"learning_rate": 1.7895534980634954e-05,
"loss": 0.0176,
"step": 6190
},
{
"epoch": 4.0870138431114045,
"grad_norm": 0.3057391941547394,
"learning_rate": 1.764733403690875e-05,
"loss": 0.0203,
"step": 6200
},
{
"epoch": 4.093605800922874,
"grad_norm": 0.11541125923395157,
"learning_rate": 1.740069966348846e-05,
"loss": 0.0193,
"step": 6210
},
{
"epoch": 4.100197758734344,
"grad_norm": 0.28473731875419617,
"learning_rate": 1.71556365520266e-05,
"loss": 0.0196,
"step": 6220
},
{
"epoch": 4.106789716545814,
"grad_norm": 0.14990141987800598,
"learning_rate": 1.6912149364285958e-05,
"loss": 0.0147,
"step": 6230
},
{
"epoch": 4.113381674357284,
"grad_norm": 0.33358579874038696,
"learning_rate": 1.667024273205092e-05,
"loss": 0.02,
"step": 6240
},
{
"epoch": 4.119973632168755,
"grad_norm": 0.2164691537618637,
"learning_rate": 1.6429921257039592e-05,
"loss": 0.0171,
"step": 6250
},
{
"epoch": 4.126565589980224,
"grad_norm": 0.29503509402275085,
"learning_rate": 1.619118951081594e-05,
"loss": 0.0156,
"step": 6260
},
{
"epoch": 4.133157547791694,
"grad_norm": 0.29893797636032104,
"learning_rate": 1.5954052034703125e-05,
"loss": 0.016,
"step": 6270
},
{
"epoch": 4.139749505603164,
"grad_norm": 0.3970952033996582,
"learning_rate": 1.5718513339696883e-05,
"loss": 0.0191,
"step": 6280
},
{
"epoch": 4.146341463414634,
"grad_norm": 0.2718060612678528,
"learning_rate": 1.548457790637987e-05,
"loss": 0.014,
"step": 6290
},
{
"epoch": 4.152933421226104,
"grad_norm": 0.3720945119857788,
"learning_rate": 1.525225018483638e-05,
"loss": 0.0168,
"step": 6300
},
{
"epoch": 4.159525379037574,
"grad_norm": 0.21513940393924713,
"learning_rate": 1.5021534594567621e-05,
"loss": 0.0159,
"step": 6310
},
{
"epoch": 4.1661173368490445,
"grad_norm": 0.30618909001350403,
"learning_rate": 1.4792435524407755e-05,
"loss": 0.0151,
"step": 6320
},
{
"epoch": 4.172709294660514,
"grad_norm": 0.409757524728775,
"learning_rate": 1.4564957332440365e-05,
"loss": 0.0177,
"step": 6330
},
{
"epoch": 4.179301252471984,
"grad_norm": 0.2687203884124756,
"learning_rate": 1.4339104345915554e-05,
"loss": 0.0202,
"step": 6340
},
{
"epoch": 4.185893210283454,
"grad_norm": 0.25398269295692444,
"learning_rate": 1.4114880861167557e-05,
"loss": 0.0189,
"step": 6350
},
{
"epoch": 4.192485168094924,
"grad_norm": 0.2254013866186142,
"learning_rate": 1.3892291143533154e-05,
"loss": 0.0144,
"step": 6360
},
{
"epoch": 4.199077125906395,
"grad_norm": 0.32205384969711304,
"learning_rate": 1.3671339427270458e-05,
"loss": 0.0161,
"step": 6370
},
{
"epoch": 4.205669083717864,
"grad_norm": 0.3406763970851898,
"learning_rate": 1.3452029915478304e-05,
"loss": 0.02,
"step": 6380
},
{
"epoch": 4.2122610415293344,
"grad_norm": 0.31815874576568604,
"learning_rate": 1.3234366780016438e-05,
"loss": 0.0185,
"step": 6390
},
{
"epoch": 4.218852999340804,
"grad_norm": 0.1224733293056488,
"learning_rate": 1.3018354161425994e-05,
"loss": 0.0181,
"step": 6400
},
{
"epoch": 4.225444957152274,
"grad_norm": 0.42326441407203674,
"learning_rate": 1.2803996168850896e-05,
"loss": 0.016,
"step": 6410
},
{
"epoch": 4.232036914963745,
"grad_norm": 0.2917204797267914,
"learning_rate": 1.2591296879959557e-05,
"loss": 0.0146,
"step": 6420
},
{
"epoch": 4.238628872775214,
"grad_norm": 0.27973493933677673,
"learning_rate": 1.238026034086739e-05,
"loss": 0.0167,
"step": 6430
},
{
"epoch": 4.2452208305866845,
"grad_norm": 0.13871712982654572,
"learning_rate": 1.2170890566059811e-05,
"loss": 0.0161,
"step": 6440
},
{
"epoch": 4.251812788398154,
"grad_norm": 0.2724437713623047,
"learning_rate": 1.1963191538315833e-05,
"loss": 0.0188,
"step": 6450
},
{
"epoch": 4.258404746209624,
"grad_norm": 0.24582289159297943,
"learning_rate": 1.1757167208632414e-05,
"loss": 0.0142,
"step": 6460
},
{
"epoch": 4.264996704021094,
"grad_norm": 0.6128583550453186,
"learning_rate": 1.1552821496149135e-05,
"loss": 0.015,
"step": 6470
},
{
"epoch": 4.271588661832564,
"grad_norm": 0.38243502378463745,
"learning_rate": 1.135015828807382e-05,
"loss": 0.0135,
"step": 6480
},
{
"epoch": 4.278180619644035,
"grad_norm": 0.22540901601314545,
"learning_rate": 1.1149181439608514e-05,
"loss": 0.0156,
"step": 6490
},
{
"epoch": 4.284772577455504,
"grad_norm": 0.4100974500179291,
"learning_rate": 1.0949894773876079e-05,
"loss": 0.0156,
"step": 6500
},
{
"epoch": 4.2913645352669745,
"grad_norm": 0.1929452121257782,
"learning_rate": 1.0752302081847565e-05,
"loss": 0.0184,
"step": 6510
},
{
"epoch": 4.297956493078444,
"grad_norm": 0.27612316608428955,
"learning_rate": 1.0556407122270096e-05,
"loss": 0.0192,
"step": 6520
},
{
"epoch": 4.304548450889914,
"grad_norm": 0.20837433636188507,
"learning_rate": 1.0362213621595307e-05,
"loss": 0.0135,
"step": 6530
},
{
"epoch": 4.311140408701385,
"grad_norm": 0.38383790850639343,
"learning_rate": 1.016972527390846e-05,
"loss": 0.0186,
"step": 6540
},
{
"epoch": 4.317732366512854,
"grad_norm": 0.3808279037475586,
"learning_rate": 9.978945740858226e-06,
"loss": 0.0172,
"step": 6550
},
{
"epoch": 4.324324324324325,
"grad_norm": 0.12612776458263397,
"learning_rate": 9.789878651587036e-06,
"loss": 0.0131,
"step": 6560
},
{
"epoch": 4.330916282135794,
"grad_norm": 0.47806084156036377,
"learning_rate": 9.602527602661949e-06,
"loss": 0.0175,
"step": 6570
},
{
"epoch": 4.337508239947264,
"grad_norm": 0.5602189302444458,
"learning_rate": 9.416896158006328e-06,
"loss": 0.0161,
"step": 6580
},
{
"epoch": 4.344100197758735,
"grad_norm": 0.5258492231369019,
"learning_rate": 9.232987848832009e-06,
"loss": 0.0151,
"step": 6590
},
{
"epoch": 4.350692155570204,
"grad_norm": 0.18115440011024475,
"learning_rate": 9.050806173572134e-06,
"loss": 0.0115,
"step": 6600
},
{
"epoch": 4.357284113381675,
"grad_norm": 0.2673959732055664,
"learning_rate": 8.870354597814622e-06,
"loss": 0.013,
"step": 6610
},
{
"epoch": 4.363876071193144,
"grad_norm": 0.4614759385585785,
"learning_rate": 8.691636554236182e-06,
"loss": 0.0179,
"step": 6620
},
{
"epoch": 4.3704680290046145,
"grad_norm": 0.31257471442222595,
"learning_rate": 8.514655442537122e-06,
"loss": 0.0152,
"step": 6630
},
{
"epoch": 4.377059986816084,
"grad_norm": 0.1402910202741623,
"learning_rate": 8.339414629376507e-06,
"loss": 0.0155,
"step": 6640
},
{
"epoch": 4.383651944627554,
"grad_norm": 0.19149114191532135,
"learning_rate": 8.165917448308324e-06,
"loss": 0.0132,
"step": 6650
},
{
"epoch": 4.390243902439025,
"grad_norm": 0.31132665276527405,
"learning_rate": 7.994167199717894e-06,
"loss": 0.0159,
"step": 6660
},
{
"epoch": 4.396835860250494,
"grad_norm": 0.30715203285217285,
"learning_rate": 7.824167150759188e-06,
"loss": 0.022,
"step": 6670
},
{
"epoch": 4.403427818061965,
"grad_norm": 0.23801127076148987,
"learning_rate": 7.655920535292682e-06,
"loss": 0.0123,
"step": 6680
},
{
"epoch": 4.410019775873434,
"grad_norm": 0.3437555730342865,
"learning_rate": 7.4894305538237285e-06,
"loss": 0.0154,
"step": 6690
},
{
"epoch": 4.4166117336849045,
"grad_norm": 0.23300838470458984,
"learning_rate": 7.324700373441828e-06,
"loss": 0.0188,
"step": 6700
},
{
"epoch": 4.423203691496375,
"grad_norm": 0.2827889621257782,
"learning_rate": 7.161733127760228e-06,
"loss": 0.0151,
"step": 6710
},
{
"epoch": 4.429795649307844,
"grad_norm": 0.2165522575378418,
"learning_rate": 7.000531916856512e-06,
"loss": 0.0145,
"step": 6720
},
{
"epoch": 4.436387607119315,
"grad_norm": 0.3993603587150574,
"learning_rate": 6.841099807213392e-06,
"loss": 0.024,
"step": 6730
},
{
"epoch": 4.442979564930784,
"grad_norm": 0.21347716450691223,
"learning_rate": 6.683439831660554e-06,
"loss": 0.0254,
"step": 6740
},
{
"epoch": 4.4495715227422545,
"grad_norm": 0.4783138036727905,
"learning_rate": 6.527554989316897e-06,
"loss": 0.0141,
"step": 6750
},
{
"epoch": 4.456163480553725,
"grad_norm": 0.2551850378513336,
"learning_rate": 6.373448245533464e-06,
"loss": 0.0203,
"step": 6760
},
{
"epoch": 4.462755438365194,
"grad_norm": 0.22933778166770935,
"learning_rate": 6.221122531837076e-06,
"loss": 0.0193,
"step": 6770
},
{
"epoch": 4.469347396176665,
"grad_norm": 0.1832355260848999,
"learning_rate": 6.070580745874544e-06,
"loss": 0.0134,
"step": 6780
},
{
"epoch": 4.475939353988134,
"grad_norm": 0.3792283535003662,
"learning_rate": 5.921825751357557e-06,
"loss": 0.0159,
"step": 6790
},
{
"epoch": 4.482531311799605,
"grad_norm": 0.18225885927677155,
"learning_rate": 5.7748603780081735e-06,
"loss": 0.0217,
"step": 6800
},
{
"epoch": 4.489123269611074,
"grad_norm": 0.49436914920806885,
"learning_rate": 5.62968742150507e-06,
"loss": 0.0158,
"step": 6810
},
{
"epoch": 4.4957152274225445,
"grad_norm": 0.2793099582195282,
"learning_rate": 5.4863096434302655e-06,
"loss": 0.016,
"step": 6820
},
{
"epoch": 4.502307185234015,
"grad_norm": 0.2998494505882263,
"learning_rate": 5.344729771216661e-06,
"loss": 0.0174,
"step": 6830
},
{
"epoch": 4.508899143045484,
"grad_norm": 0.45131003856658936,
"learning_rate": 5.204950498096117e-06,
"loss": 0.0196,
"step": 6840
},
{
"epoch": 4.515491100856955,
"grad_norm": 0.37397655844688416,
"learning_rate": 5.066974483048215e-06,
"loss": 0.0158,
"step": 6850
},
{
"epoch": 4.522083058668424,
"grad_norm": 0.5381725430488586,
"learning_rate": 4.930804350749729e-06,
"loss": 0.016,
"step": 6860
},
{
"epoch": 4.528675016479895,
"grad_norm": 0.2811379134654999,
"learning_rate": 4.796442691524638e-06,
"loss": 0.013,
"step": 6870
},
{
"epoch": 4.535266974291364,
"grad_norm": 0.205452561378479,
"learning_rate": 4.663892061294872e-06,
"loss": 0.0165,
"step": 6880
},
{
"epoch": 4.541858932102834,
"grad_norm": 0.2746995687484741,
"learning_rate": 4.5331549815317174e-06,
"loss": 0.0227,
"step": 6890
},
{
"epoch": 4.548450889914305,
"grad_norm": 0.30904215574264526,
"learning_rate": 4.404233939207791e-06,
"loss": 0.0153,
"step": 6900
},
{
"epoch": 4.555042847725774,
"grad_norm": 0.42725998163223267,
"learning_rate": 4.2771313867498e-06,
"loss": 0.0192,
"step": 6910
},
{
"epoch": 4.561634805537245,
"grad_norm": 0.18472789227962494,
"learning_rate": 4.151849741991864e-06,
"loss": 0.025,
"step": 6920
},
{
"epoch": 4.568226763348715,
"grad_norm": 0.3807401955127716,
"learning_rate": 4.0283913881294935e-06,
"loss": 0.0181,
"step": 6930
},
{
"epoch": 4.5748187211601845,
"grad_norm": 0.17289142310619354,
"learning_rate": 3.906758673674293e-06,
"loss": 0.0148,
"step": 6940
},
{
"epoch": 4.581410678971655,
"grad_norm": 0.32773271203041077,
"learning_rate": 3.7869539124092525e-06,
"loss": 0.0173,
"step": 6950
},
{
"epoch": 4.588002636783124,
"grad_norm": 0.2213710993528366,
"learning_rate": 3.6689793833447837e-06,
"loss": 0.0137,
"step": 6960
},
{
"epoch": 4.594594594594595,
"grad_norm": 0.17836393415927887,
"learning_rate": 3.552837330675296e-06,
"loss": 0.0184,
"step": 6970
},
{
"epoch": 4.601186552406064,
"grad_norm": 0.2593984603881836,
"learning_rate": 3.43852996373657e-06,
"loss": 0.0138,
"step": 6980
},
{
"epoch": 4.607778510217535,
"grad_norm": 0.2913285195827484,
"learning_rate": 3.3260594569636928e-06,
"loss": 0.0212,
"step": 6990
},
{
"epoch": 4.614370468029005,
"grad_norm": 0.18963216245174408,
"learning_rate": 3.215427949849714e-06,
"loss": 0.0155,
"step": 7000
},
{
"epoch": 4.6209624258404745,
"grad_norm": 0.30186694860458374,
"learning_rate": 3.1066375469049337e-06,
"loss": 0.0185,
"step": 7010
},
{
"epoch": 4.627554383651945,
"grad_norm": 0.3594430685043335,
"learning_rate": 2.9996903176168765e-06,
"loss": 0.0157,
"step": 7020
},
{
"epoch": 4.634146341463414,
"grad_norm": 0.407387912273407,
"learning_rate": 2.8945882964109496e-06,
"loss": 0.0155,
"step": 7030
},
{
"epoch": 4.640738299274885,
"grad_norm": 0.1670001596212387,
"learning_rate": 2.7913334826116357e-06,
"loss": 0.0156,
"step": 7040
},
{
"epoch": 4.647330257086354,
"grad_norm": 0.3461068272590637,
"learning_rate": 2.689927840404638e-06,
"loss": 0.0155,
"step": 7050
},
{
"epoch": 4.6539222148978245,
"grad_norm": 0.1870720386505127,
"learning_rate": 2.590373298799342e-06,
"loss": 0.0137,
"step": 7060
},
{
"epoch": 4.660514172709295,
"grad_norm": 0.5297737717628479,
"learning_rate": 2.492671751592235e-06,
"loss": 0.021,
"step": 7070
},
{
"epoch": 4.667106130520764,
"grad_norm": 0.41437268257141113,
"learning_rate": 2.3968250573308424e-06,
"loss": 0.0166,
"step": 7080
},
{
"epoch": 4.673698088332235,
"grad_norm": 0.2162405252456665,
"learning_rate": 2.302835039278339e-06,
"loss": 0.0163,
"step": 7090
},
{
"epoch": 4.680290046143705,
"grad_norm": 0.3162844479084015,
"learning_rate": 2.2107034853789288e-06,
"loss": 0.0184,
"step": 7100
},
{
"epoch": 4.686882003955175,
"grad_norm": 0.23974072933197021,
"learning_rate": 2.1204321482238164e-06,
"loss": 0.0187,
"step": 7110
},
{
"epoch": 4.693473961766645,
"grad_norm": 0.24216875433921814,
"learning_rate": 2.0320227450178254e-06,
"loss": 0.0145,
"step": 7120
},
{
"epoch": 4.7000659195781145,
"grad_norm": 0.3286508023738861,
"learning_rate": 1.945476957546788e-06,
"loss": 0.0189,
"step": 7130
},
{
"epoch": 4.706657877389585,
"grad_norm": 0.22018277645111084,
"learning_rate": 1.860796432145495e-06,
"loss": 0.0164,
"step": 7140
},
{
"epoch": 4.713249835201054,
"grad_norm": 0.18138107657432556,
"learning_rate": 1.7779827796664538e-06,
"loss": 0.0173,
"step": 7150
},
{
"epoch": 4.719841793012525,
"grad_norm": 0.3609893321990967,
"learning_rate": 1.6970375754491562e-06,
"loss": 0.0291,
"step": 7160
},
{
"epoch": 4.726433750823995,
"grad_norm": 0.31565043330192566,
"learning_rate": 1.6179623592901926e-06,
"loss": 0.014,
"step": 7170
},
{
"epoch": 4.733025708635465,
"grad_norm": 0.27240124344825745,
"learning_rate": 1.5407586354139193e-06,
"loss": 0.0167,
"step": 7180
},
{
"epoch": 4.739617666446935,
"grad_norm": 0.3199063837528229,
"learning_rate": 1.4654278724438364e-06,
"loss": 0.0164,
"step": 7190
},
{
"epoch": 4.746209624258404,
"grad_norm": 0.23247933387756348,
"learning_rate": 1.3919715033746893e-06,
"loss": 0.0195,
"step": 7200
},
{
"epoch": 4.752801582069875,
"grad_norm": 0.26770317554473877,
"learning_rate": 1.3203909255451452e-06,
"loss": 0.0125,
"step": 7210
},
{
"epoch": 4.759393539881344,
"grad_norm": 0.2076646387577057,
"learning_rate": 1.2506875006113027e-06,
"loss": 0.0162,
"step": 7220
},
{
"epoch": 4.765985497692815,
"grad_norm": 0.1567927598953247,
"learning_rate": 1.1828625545207027e-06,
"loss": 0.0142,
"step": 7230
},
{
"epoch": 4.772577455504285,
"grad_norm": 0.3224427402019501,
"learning_rate": 1.1169173774871478e-06,
"loss": 0.0161,
"step": 7240
},
{
"epoch": 4.7791694133157545,
"grad_norm": 0.5948562622070312,
"learning_rate": 1.0528532239661547e-06,
"loss": 0.0164,
"step": 7250
},
{
"epoch": 4.785761371127225,
"grad_norm": 0.30895039439201355,
"learning_rate": 9.906713126310974e-07,
"loss": 0.0171,
"step": 7260
},
{
"epoch": 4.792353328938695,
"grad_norm": 0.14259961247444153,
"learning_rate": 9.303728263500011e-07,
"loss": 0.0194,
"step": 7270
},
{
"epoch": 4.798945286750165,
"grad_norm": 0.15019071102142334,
"learning_rate": 8.719589121630622e-07,
"loss": 0.0163,
"step": 7280
},
{
"epoch": 4.805537244561635,
"grad_norm": 0.2892571687698364,
"learning_rate": 8.154306812608315e-07,
"loss": 0.0173,
"step": 7290
},
{
"epoch": 4.812129202373105,
"grad_norm": 0.2563762962818146,
"learning_rate": 7.607892089630308e-07,
"loss": 0.0168,
"step": 7300
},
{
"epoch": 4.818721160184575,
"grad_norm": 0.2222357541322708,
"learning_rate": 7.080355346981815e-07,
"loss": 0.014,
"step": 7310
},
{
"epoch": 4.8253131179960445,
"grad_norm": 0.22898784279823303,
"learning_rate": 6.571706619837526e-07,
"loss": 0.0135,
"step": 7320
},
{
"epoch": 4.831905075807515,
"grad_norm": 0.23187340795993805,
"learning_rate": 6.081955584071097e-07,
"loss": 0.0142,
"step": 7330
},
{
"epoch": 4.838497033618985,
"grad_norm": 0.3049458861351013,
"learning_rate": 5.61111155607108e-07,
"loss": 0.0199,
"step": 7340
},
{
"epoch": 4.845088991430455,
"grad_norm": 0.17564386129379272,
"learning_rate": 5.159183492563613e-07,
"loss": 0.0151,
"step": 7350
},
{
"epoch": 4.851680949241925,
"grad_norm": 0.3510572016239166,
"learning_rate": 4.7261799904420035e-07,
"loss": 0.0164,
"step": 7360
},
{
"epoch": 4.8582729070533945,
"grad_norm": 0.31466346979141235,
"learning_rate": 4.3121092866031945e-07,
"loss": 0.0176,
"step": 7370
},
{
"epoch": 4.864864864864865,
"grad_norm": 0.2005147635936737,
"learning_rate": 3.91697925779122e-07,
"loss": 0.0168,
"step": 7380
},
{
"epoch": 4.871456822676334,
"grad_norm": 0.1678527295589447,
"learning_rate": 3.5407974204473284e-07,
"loss": 0.0175,
"step": 7390
},
{
"epoch": 4.878048780487805,
"grad_norm": 0.21754373610019684,
"learning_rate": 3.1835709305668703e-07,
"loss": 0.0127,
"step": 7400
},
{
"epoch": 4.884640738299275,
"grad_norm": 0.21587257087230682,
"learning_rate": 2.84530658356319e-07,
"loss": 0.017,
"step": 7410
},
{
"epoch": 4.891232696110745,
"grad_norm": 0.31447526812553406,
"learning_rate": 2.526010814138613e-07,
"loss": 0.0217,
"step": 7420
},
{
"epoch": 4.897824653922215,
"grad_norm": 0.30843478441238403,
"learning_rate": 2.2256896961616592e-07,
"loss": 0.0181,
"step": 7430
},
{
"epoch": 4.904416611733685,
"grad_norm": 0.29951369762420654,
"learning_rate": 1.9443489425517992e-07,
"loss": 0.0152,
"step": 7440
},
{
"epoch": 4.911008569545155,
"grad_norm": 0.4117021858692169,
"learning_rate": 1.6819939051706535e-07,
"loss": 0.0127,
"step": 7450
},
{
"epoch": 4.917600527356625,
"grad_norm": 0.11666778475046158,
"learning_rate": 1.438629574720074e-07,
"loss": 0.0144,
"step": 7460
},
{
"epoch": 4.924192485168095,
"grad_norm": 0.3991844356060028,
"learning_rate": 1.2142605806474417e-07,
"loss": 0.0162,
"step": 7470
},
{
"epoch": 4.930784442979565,
"grad_norm": 0.09675031900405884,
"learning_rate": 1.0088911910576259e-07,
"loss": 0.0223,
"step": 7480
},
{
"epoch": 4.937376400791035,
"grad_norm": 0.3356577157974243,
"learning_rate": 8.225253126314947e-08,
"loss": 0.0168,
"step": 7490
},
{
"epoch": 4.943968358602505,
"grad_norm": 0.27056625485420227,
"learning_rate": 6.551664905517508e-08,
"loss": 0.0166,
"step": 7500
},
{
"epoch": 4.950560316413975,
"grad_norm": 0.24081185460090637,
"learning_rate": 5.068179084355418e-08,
"loss": 0.0164,
"step": 7510
},
{
"epoch": 4.957152274225445,
"grad_norm": 0.3618698716163635,
"learning_rate": 3.774823882738421e-08,
"loss": 0.0176,
"step": 7520
},
{
"epoch": 4.963744232036915,
"grad_norm": 0.20548762381076813,
"learning_rate": 2.6716239037805068e-08,
"loss": 0.0183,
"step": 7530
},
{
"epoch": 4.970336189848385,
"grad_norm": 0.24806766211986542,
"learning_rate": 1.7586001333258495e-08,
"loss": 0.0156,
"step": 7540
},
{
"epoch": 4.976928147659855,
"grad_norm": 0.3018137216567993,
"learning_rate": 1.0357699395535658e-08,
"loss": 0.0196,
"step": 7550
},
{
"epoch": 4.9835201054713245,
"grad_norm": 0.24933604896068573,
"learning_rate": 5.031470726490906e-09,
"loss": 0.015,
"step": 7560
},
{
"epoch": 4.990112063282795,
"grad_norm": 0.23485144972801208,
"learning_rate": 1.6074166453883266e-09,
"loss": 0.0103,
"step": 7570
},
{
"epoch": 4.996704021094265,
"grad_norm": 0.4469901919364929,
"learning_rate": 8.560228699217021e-11,
"loss": 0.0147,
"step": 7580
},
{
"epoch": 4.998681608437706,
"step": 7583,
"total_flos": 2.658519488376864e+17,
"train_loss": 0.03622536294503214,
"train_runtime": 3445.8975,
"train_samples_per_second": 35.209,
"train_steps_per_second": 2.201
}
],
"logging_steps": 10,
"max_steps": 7583,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.658519488376864e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}