{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.4056355472983433,
"eval_steps": 500,
"global_step": 11000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003096454559529339,
"grad_norm": 7.865213871002197,
"learning_rate": 6.193868070610096e-07,
"loss": 10.5439,
"step": 10
},
{
"epoch": 0.006192909119058678,
"grad_norm": 5.684272289276123,
"learning_rate": 1.2387736141220192e-06,
"loss": 10.2888,
"step": 20
},
{
"epoch": 0.009289363678588018,
"grad_norm": 4.032341003417969,
"learning_rate": 1.8581604211830287e-06,
"loss": 9.9454,
"step": 30
},
{
"epoch": 0.012385818238117356,
"grad_norm": 3.232361316680908,
"learning_rate": 2.4775472282440385e-06,
"loss": 9.6908,
"step": 40
},
{
"epoch": 0.015482272797646695,
"grad_norm": 2.7629575729370117,
"learning_rate": 3.096934035305048e-06,
"loss": 9.491,
"step": 50
},
{
"epoch": 0.018578727357176035,
"grad_norm": 2.439429998397827,
"learning_rate": 3.7163208423660575e-06,
"loss": 9.3421,
"step": 60
},
{
"epoch": 0.021675181916705373,
"grad_norm": 2.311237335205078,
"learning_rate": 4.335707649427067e-06,
"loss": 9.2172,
"step": 70
},
{
"epoch": 0.02477163647623471,
"grad_norm": 2.1415603160858154,
"learning_rate": 4.955094456488077e-06,
"loss": 9.1165,
"step": 80
},
{
"epoch": 0.02786809103576405,
"grad_norm": 2.0442802906036377,
"learning_rate": 5.574481263549087e-06,
"loss": 9.0171,
"step": 90
},
{
"epoch": 0.03096454559529339,
"grad_norm": 2.0417075157165527,
"learning_rate": 6.193868070610096e-06,
"loss": 8.9188,
"step": 100
},
{
"epoch": 0.034061000154822725,
"grad_norm": 1.906326413154602,
"learning_rate": 6.813254877671105e-06,
"loss": 8.817,
"step": 110
},
{
"epoch": 0.03715745471435207,
"grad_norm": 1.876010537147522,
"learning_rate": 7.432641684732115e-06,
"loss": 8.7205,
"step": 120
},
{
"epoch": 0.04025390927388141,
"grad_norm": 1.7335777282714844,
"learning_rate": 8.052028491793125e-06,
"loss": 8.6376,
"step": 130
},
{
"epoch": 0.043350363833410746,
"grad_norm": 1.6829620599746704,
"learning_rate": 8.671415298854134e-06,
"loss": 8.5273,
"step": 140
},
{
"epoch": 0.046446818392940084,
"grad_norm": 1.6329585313796997,
"learning_rate": 9.290802105915144e-06,
"loss": 8.4292,
"step": 150
},
{
"epoch": 0.04954327295246942,
"grad_norm": 1.62351655960083,
"learning_rate": 9.910188912976154e-06,
"loss": 8.3279,
"step": 160
},
{
"epoch": 0.05263972751199876,
"grad_norm": 1.5334705114364624,
"learning_rate": 1.0529575720037164e-05,
"loss": 8.2018,
"step": 170
},
{
"epoch": 0.0557361820715281,
"grad_norm": 1.5242592096328735,
"learning_rate": 1.1148962527098173e-05,
"loss": 8.1007,
"step": 180
},
{
"epoch": 0.058832636631057436,
"grad_norm": 1.5945011377334595,
"learning_rate": 1.1768349334159183e-05,
"loss": 7.972,
"step": 190
},
{
"epoch": 0.06192909119058678,
"grad_norm": 1.3093743324279785,
"learning_rate": 1.2387736141220193e-05,
"loss": 7.8736,
"step": 200
},
{
"epoch": 0.06502554575011611,
"grad_norm": 1.3056074380874634,
"learning_rate": 1.30071229482812e-05,
"loss": 7.7617,
"step": 210
},
{
"epoch": 0.06812200030964545,
"grad_norm": 1.2901231050491333,
"learning_rate": 1.362650975534221e-05,
"loss": 7.6573,
"step": 220
},
{
"epoch": 0.07121845486917479,
"grad_norm": 1.0811238288879395,
"learning_rate": 1.424589656240322e-05,
"loss": 7.5707,
"step": 230
},
{
"epoch": 0.07431490942870414,
"grad_norm": 0.9134311676025391,
"learning_rate": 1.486528336946423e-05,
"loss": 7.4959,
"step": 240
},
{
"epoch": 0.07741136398823348,
"grad_norm": 0.9673048257827759,
"learning_rate": 1.548467017652524e-05,
"loss": 7.4314,
"step": 250
},
{
"epoch": 0.08050781854776282,
"grad_norm": 1.0383951663970947,
"learning_rate": 1.610405698358625e-05,
"loss": 7.3523,
"step": 260
},
{
"epoch": 0.08360427310729215,
"grad_norm": 1.0910584926605225,
"learning_rate": 1.6723443790647262e-05,
"loss": 7.3133,
"step": 270
},
{
"epoch": 0.08670072766682149,
"grad_norm": 0.804308295249939,
"learning_rate": 1.734283059770827e-05,
"loss": 7.2522,
"step": 280
},
{
"epoch": 0.08979718222635083,
"grad_norm": 0.9341151714324951,
"learning_rate": 1.796221740476928e-05,
"loss": 7.2261,
"step": 290
},
{
"epoch": 0.09289363678588017,
"grad_norm": 0.8165347576141357,
"learning_rate": 1.8581604211830288e-05,
"loss": 7.2088,
"step": 300
},
{
"epoch": 0.0959900913454095,
"grad_norm": 0.6941328644752502,
"learning_rate": 1.9200991018891298e-05,
"loss": 7.1554,
"step": 310
},
{
"epoch": 0.09908654590493884,
"grad_norm": 0.7364155650138855,
"learning_rate": 1.9820377825952308e-05,
"loss": 7.1313,
"step": 320
},
{
"epoch": 0.10218300046446818,
"grad_norm": 1.3144842386245728,
"learning_rate": 2.0439764633013317e-05,
"loss": 7.1198,
"step": 330
},
{
"epoch": 0.10527945502399752,
"grad_norm": 0.703687846660614,
"learning_rate": 2.1059151440074327e-05,
"loss": 7.0936,
"step": 340
},
{
"epoch": 0.10837590958352686,
"grad_norm": 0.7936609387397766,
"learning_rate": 2.1678538247135337e-05,
"loss": 7.0966,
"step": 350
},
{
"epoch": 0.1114723641430562,
"grad_norm": 0.9979026317596436,
"learning_rate": 2.2297925054196347e-05,
"loss": 7.0917,
"step": 360
},
{
"epoch": 0.11456881870258553,
"grad_norm": 0.8398326635360718,
"learning_rate": 2.2917311861257356e-05,
"loss": 7.0791,
"step": 370
},
{
"epoch": 0.11766527326211487,
"grad_norm": 0.7220719456672668,
"learning_rate": 2.3536698668318366e-05,
"loss": 7.057,
"step": 380
},
{
"epoch": 0.12076172782164422,
"grad_norm": 0.8845738172531128,
"learning_rate": 2.4156085475379376e-05,
"loss": 7.0476,
"step": 390
},
{
"epoch": 0.12385818238117356,
"grad_norm": 0.8084824085235596,
"learning_rate": 2.4775472282440385e-05,
"loss": 7.0369,
"step": 400
},
{
"epoch": 0.1269546369407029,
"grad_norm": 0.7229199409484863,
"learning_rate": 2.5394859089501395e-05,
"loss": 7.0193,
"step": 410
},
{
"epoch": 0.13005109150023222,
"grad_norm": 0.7450975179672241,
"learning_rate": 2.60142458965624e-05,
"loss": 7.0136,
"step": 420
},
{
"epoch": 0.13314754605976156,
"grad_norm": 1.1810022592544556,
"learning_rate": 2.6633632703623415e-05,
"loss": 7.0257,
"step": 430
},
{
"epoch": 0.1362440006192909,
"grad_norm": 0.724097728729248,
"learning_rate": 2.725301951068442e-05,
"loss": 7.0076,
"step": 440
},
{
"epoch": 0.13934045517882024,
"grad_norm": 0.8406842350959778,
"learning_rate": 2.7872406317745434e-05,
"loss": 6.9976,
"step": 450
},
{
"epoch": 0.14243690973834958,
"grad_norm": 0.8269332647323608,
"learning_rate": 2.849179312480644e-05,
"loss": 6.9812,
"step": 460
},
{
"epoch": 0.14553336429787894,
"grad_norm": 0.7661322355270386,
"learning_rate": 2.9111179931867453e-05,
"loss": 7.0072,
"step": 470
},
{
"epoch": 0.14862981885740828,
"grad_norm": 0.6673895120620728,
"learning_rate": 2.973056673892846e-05,
"loss": 6.9775,
"step": 480
},
{
"epoch": 0.15172627341693762,
"grad_norm": 1.1476161479949951,
"learning_rate": 3.0349953545989473e-05,
"loss": 6.9496,
"step": 490
},
{
"epoch": 0.15482272797646696,
"grad_norm": 1.0809210538864136,
"learning_rate": 3.096934035305048e-05,
"loss": 6.9578,
"step": 500
},
{
"epoch": 0.1579191825359963,
"grad_norm": 0.8364447951316833,
"learning_rate": 3.158872716011149e-05,
"loss": 6.9371,
"step": 510
},
{
"epoch": 0.16101563709552563,
"grad_norm": 0.9381659030914307,
"learning_rate": 3.22081139671725e-05,
"loss": 6.9373,
"step": 520
},
{
"epoch": 0.16411209165505497,
"grad_norm": 0.8810213804244995,
"learning_rate": 3.2827500774233505e-05,
"loss": 6.9463,
"step": 530
},
{
"epoch": 0.1672085462145843,
"grad_norm": 0.8275142908096313,
"learning_rate": 3.3446887581294525e-05,
"loss": 6.932,
"step": 540
},
{
"epoch": 0.17030500077411365,
"grad_norm": 0.6804556846618652,
"learning_rate": 3.406627438835553e-05,
"loss": 6.9181,
"step": 550
},
{
"epoch": 0.17340145533364298,
"grad_norm": 0.7559427618980408,
"learning_rate": 3.468566119541654e-05,
"loss": 6.9202,
"step": 560
},
{
"epoch": 0.17649790989317232,
"grad_norm": 0.6762346029281616,
"learning_rate": 3.5305048002477544e-05,
"loss": 6.9081,
"step": 570
},
{
"epoch": 0.17959436445270166,
"grad_norm": 0.6671234369277954,
"learning_rate": 3.592443480953856e-05,
"loss": 6.9216,
"step": 580
},
{
"epoch": 0.182690819012231,
"grad_norm": 0.9335949420928955,
"learning_rate": 3.654382161659957e-05,
"loss": 6.9034,
"step": 590
},
{
"epoch": 0.18578727357176034,
"grad_norm": 0.9805537462234497,
"learning_rate": 3.7163208423660576e-05,
"loss": 6.895,
"step": 600
},
{
"epoch": 0.18888372813128967,
"grad_norm": 0.8761160969734192,
"learning_rate": 3.778259523072158e-05,
"loss": 6.9029,
"step": 610
},
{
"epoch": 0.191980182690819,
"grad_norm": 0.8361015915870667,
"learning_rate": 3.8401982037782596e-05,
"loss": 6.8819,
"step": 620
},
{
"epoch": 0.19507663725034835,
"grad_norm": 0.6740533709526062,
"learning_rate": 3.902136884484361e-05,
"loss": 6.8882,
"step": 630
},
{
"epoch": 0.1981730918098777,
"grad_norm": 0.8334875702857971,
"learning_rate": 3.9640755651904615e-05,
"loss": 6.8917,
"step": 640
},
{
"epoch": 0.20126954636940703,
"grad_norm": 0.7946698665618896,
"learning_rate": 4.026014245896562e-05,
"loss": 6.8712,
"step": 650
},
{
"epoch": 0.20436600092893636,
"grad_norm": 1.1773180961608887,
"learning_rate": 4.0879529266026635e-05,
"loss": 6.8963,
"step": 660
},
{
"epoch": 0.2074624554884657,
"grad_norm": 0.6932355165481567,
"learning_rate": 4.149891607308765e-05,
"loss": 6.8718,
"step": 670
},
{
"epoch": 0.21055891004799504,
"grad_norm": 0.8239333629608154,
"learning_rate": 4.2118302880148654e-05,
"loss": 6.8549,
"step": 680
},
{
"epoch": 0.21365536460752438,
"grad_norm": 0.8844727873802185,
"learning_rate": 4.273768968720966e-05,
"loss": 6.8687,
"step": 690
},
{
"epoch": 0.21675181916705372,
"grad_norm": 0.8168037533760071,
"learning_rate": 4.3357076494270674e-05,
"loss": 6.8457,
"step": 700
},
{
"epoch": 0.21984827372658305,
"grad_norm": 0.7363680601119995,
"learning_rate": 4.397646330133168e-05,
"loss": 6.8538,
"step": 710
},
{
"epoch": 0.2229447282861124,
"grad_norm": 0.9639245867729187,
"learning_rate": 4.459585010839269e-05,
"loss": 6.855,
"step": 720
},
{
"epoch": 0.22604118284564173,
"grad_norm": 0.7763282656669617,
"learning_rate": 4.52152369154537e-05,
"loss": 6.8254,
"step": 730
},
{
"epoch": 0.22913763740517107,
"grad_norm": 1.482752799987793,
"learning_rate": 4.583462372251471e-05,
"loss": 6.8331,
"step": 740
},
{
"epoch": 0.2322340919647004,
"grad_norm": 0.8456624150276184,
"learning_rate": 4.645401052957572e-05,
"loss": 6.8513,
"step": 750
},
{
"epoch": 0.23533054652422974,
"grad_norm": 0.9166210889816284,
"learning_rate": 4.707339733663673e-05,
"loss": 6.8402,
"step": 760
},
{
"epoch": 0.23842700108375908,
"grad_norm": 0.8375464677810669,
"learning_rate": 4.769278414369774e-05,
"loss": 6.8337,
"step": 770
},
{
"epoch": 0.24152345564328845,
"grad_norm": 1.267236590385437,
"learning_rate": 4.831217095075875e-05,
"loss": 6.8193,
"step": 780
},
{
"epoch": 0.2446199102028178,
"grad_norm": 0.6456039547920227,
"learning_rate": 4.893155775781976e-05,
"loss": 6.7969,
"step": 790
},
{
"epoch": 0.24771636476234712,
"grad_norm": 0.8981896638870239,
"learning_rate": 4.955094456488077e-05,
"loss": 6.8133,
"step": 800
},
{
"epoch": 0.25081281932187643,
"grad_norm": 1.120186686515808,
"learning_rate": 5.017033137194178e-05,
"loss": 6.8056,
"step": 810
},
{
"epoch": 0.2539092738814058,
"grad_norm": 2.292698621749878,
"learning_rate": 5.078971817900279e-05,
"loss": 6.8308,
"step": 820
},
{
"epoch": 0.2570057284409351,
"grad_norm": 0.7018686532974243,
"learning_rate": 5.1409104986063797e-05,
"loss": 6.8154,
"step": 830
},
{
"epoch": 0.26010218300046445,
"grad_norm": 0.8676766753196716,
"learning_rate": 5.20284917931248e-05,
"loss": 6.8134,
"step": 840
},
{
"epoch": 0.2631986375599938,
"grad_norm": 1.0170965194702148,
"learning_rate": 5.2647878600185816e-05,
"loss": 6.8014,
"step": 850
},
{
"epoch": 0.2662950921195231,
"grad_norm": 1.100301742553711,
"learning_rate": 5.326726540724683e-05,
"loss": 6.7759,
"step": 860
},
{
"epoch": 0.26939154667905246,
"grad_norm": 0.9783535003662109,
"learning_rate": 5.3886652214307835e-05,
"loss": 6.7684,
"step": 870
},
{
"epoch": 0.2724880012385818,
"grad_norm": 1.2189717292785645,
"learning_rate": 5.450603902136884e-05,
"loss": 6.7609,
"step": 880
},
{
"epoch": 0.27558445579811114,
"grad_norm": 0.9612496495246887,
"learning_rate": 5.5125425828429855e-05,
"loss": 6.7852,
"step": 890
},
{
"epoch": 0.2786809103576405,
"grad_norm": 1.201369047164917,
"learning_rate": 5.574481263549087e-05,
"loss": 6.7685,
"step": 900
},
{
"epoch": 0.2817773649171698,
"grad_norm": 1.0445016622543335,
"learning_rate": 5.6364199442551874e-05,
"loss": 6.7863,
"step": 910
},
{
"epoch": 0.28487381947669915,
"grad_norm": 0.9389632940292358,
"learning_rate": 5.698358624961288e-05,
"loss": 6.7803,
"step": 920
},
{
"epoch": 0.2879702740362285,
"grad_norm": 1.522533655166626,
"learning_rate": 5.7602973056673894e-05,
"loss": 6.7642,
"step": 930
},
{
"epoch": 0.2910667285957579,
"grad_norm": 0.5819054841995239,
"learning_rate": 5.822235986373491e-05,
"loss": 6.772,
"step": 940
},
{
"epoch": 0.2941631831552872,
"grad_norm": 0.5492868423461914,
"learning_rate": 5.884174667079591e-05,
"loss": 6.7712,
"step": 950
},
{
"epoch": 0.29725963771481656,
"grad_norm": 0.9563374519348145,
"learning_rate": 5.946113347785692e-05,
"loss": 6.7602,
"step": 960
},
{
"epoch": 0.3003560922743459,
"grad_norm": 1.8112778663635254,
"learning_rate": 6.0080520284917926e-05,
"loss": 6.774,
"step": 970
},
{
"epoch": 0.30345254683387524,
"grad_norm": 1.9124343395233154,
"learning_rate": 6.0699907091978946e-05,
"loss": 6.7692,
"step": 980
},
{
"epoch": 0.3065490013934046,
"grad_norm": 1.0520577430725098,
"learning_rate": 6.131929389903995e-05,
"loss": 6.7624,
"step": 990
},
{
"epoch": 0.3096454559529339,
"grad_norm": 0.9971650242805481,
"learning_rate": 6.193868070610096e-05,
"loss": 6.7597,
"step": 1000
},
{
"epoch": 0.31274191051246325,
"grad_norm": 0.7130516171455383,
"learning_rate": 6.255806751316196e-05,
"loss": 6.7548,
"step": 1010
},
{
"epoch": 0.3158383650719926,
"grad_norm": 0.8867819309234619,
"learning_rate": 6.317745432022298e-05,
"loss": 6.7416,
"step": 1020
},
{
"epoch": 0.3189348196315219,
"grad_norm": 2.448023557662964,
"learning_rate": 6.379684112728398e-05,
"loss": 6.7675,
"step": 1030
},
{
"epoch": 0.32203127419105126,
"grad_norm": 2.0288820266723633,
"learning_rate": 6.4416227934345e-05,
"loss": 6.7555,
"step": 1040
},
{
"epoch": 0.3251277287505806,
"grad_norm": 0.645900309085846,
"learning_rate": 6.503561474140602e-05,
"loss": 6.7557,
"step": 1050
},
{
"epoch": 0.32822418331010994,
"grad_norm": 0.7342972159385681,
"learning_rate": 6.565500154846701e-05,
"loss": 6.7452,
"step": 1060
},
{
"epoch": 0.3313206378696393,
"grad_norm": 1.523195743560791,
"learning_rate": 6.627438835552803e-05,
"loss": 6.7476,
"step": 1070
},
{
"epoch": 0.3344170924291686,
"grad_norm": 1.812499761581421,
"learning_rate": 6.689377516258905e-05,
"loss": 6.7432,
"step": 1080
},
{
"epoch": 0.33751354698869795,
"grad_norm": 0.8007811307907104,
"learning_rate": 6.751316196965004e-05,
"loss": 6.7387,
"step": 1090
},
{
"epoch": 0.3406100015482273,
"grad_norm": 1.449756145477295,
"learning_rate": 6.813254877671106e-05,
"loss": 6.7323,
"step": 1100
},
{
"epoch": 0.34370645610775663,
"grad_norm": 1.145936369895935,
"learning_rate": 6.875193558377207e-05,
"loss": 6.7396,
"step": 1110
},
{
"epoch": 0.34680291066728597,
"grad_norm": 1.155754804611206,
"learning_rate": 6.937132239083308e-05,
"loss": 6.7288,
"step": 1120
},
{
"epoch": 0.3498993652268153,
"grad_norm": 1.3261879682540894,
"learning_rate": 6.99907091978941e-05,
"loss": 6.717,
"step": 1130
},
{
"epoch": 0.35299581978634464,
"grad_norm": 2.5398218631744385,
"learning_rate": 7.061009600495509e-05,
"loss": 6.7055,
"step": 1140
},
{
"epoch": 0.356092274345874,
"grad_norm": 0.6757873892784119,
"learning_rate": 7.122948281201611e-05,
"loss": 6.7242,
"step": 1150
},
{
"epoch": 0.3591887289054033,
"grad_norm": 0.8870462775230408,
"learning_rate": 7.184886961907711e-05,
"loss": 6.7241,
"step": 1160
},
{
"epoch": 0.36228518346493266,
"grad_norm": 2.03185772895813,
"learning_rate": 7.246825642613812e-05,
"loss": 6.7364,
"step": 1170
},
{
"epoch": 0.365381638024462,
"grad_norm": 1.013759970664978,
"learning_rate": 7.308764323319914e-05,
"loss": 6.7151,
"step": 1180
},
{
"epoch": 0.36847809258399133,
"grad_norm": 1.6533416509628296,
"learning_rate": 7.370703004026015e-05,
"loss": 6.7207,
"step": 1190
},
{
"epoch": 0.3715745471435207,
"grad_norm": 1.0296862125396729,
"learning_rate": 7.432641684732115e-05,
"loss": 6.7154,
"step": 1200
},
{
"epoch": 0.37467100170305,
"grad_norm": 0.7925991415977478,
"learning_rate": 7.494580365438217e-05,
"loss": 6.7036,
"step": 1210
},
{
"epoch": 0.37776745626257935,
"grad_norm": 1.123253345489502,
"learning_rate": 7.556519046144317e-05,
"loss": 6.6981,
"step": 1220
},
{
"epoch": 0.3808639108221087,
"grad_norm": 1.2927206754684448,
"learning_rate": 7.618457726850419e-05,
"loss": 6.7105,
"step": 1230
},
{
"epoch": 0.383960365381638,
"grad_norm": 1.2877053022384644,
"learning_rate": 7.680396407556519e-05,
"loss": 6.7046,
"step": 1240
},
{
"epoch": 0.38705681994116736,
"grad_norm": 1.5025876760482788,
"learning_rate": 7.74233508826262e-05,
"loss": 6.7097,
"step": 1250
},
{
"epoch": 0.3901532745006967,
"grad_norm": 1.8476455211639404,
"learning_rate": 7.804273768968722e-05,
"loss": 6.7091,
"step": 1260
},
{
"epoch": 0.39324972906022604,
"grad_norm": 1.1083704233169556,
"learning_rate": 7.866212449674822e-05,
"loss": 6.7281,
"step": 1270
},
{
"epoch": 0.3963461836197554,
"grad_norm": 1.9753637313842773,
"learning_rate": 7.928151130380923e-05,
"loss": 6.6795,
"step": 1280
},
{
"epoch": 0.3994426381792847,
"grad_norm": 0.6769999265670776,
"learning_rate": 7.990089811087024e-05,
"loss": 6.7043,
"step": 1290
},
{
"epoch": 0.40253909273881405,
"grad_norm": 1.1025127172470093,
"learning_rate": 8.052028491793124e-05,
"loss": 6.673,
"step": 1300
},
{
"epoch": 0.4056355472983434,
"grad_norm": 1.132672667503357,
"learning_rate": 8.113967172499226e-05,
"loss": 6.6962,
"step": 1310
},
{
"epoch": 0.40873200185787273,
"grad_norm": 3.0605337619781494,
"learning_rate": 8.175905853205327e-05,
"loss": 6.693,
"step": 1320
},
{
"epoch": 0.41182845641740207,
"grad_norm": 1.0931648015975952,
"learning_rate": 8.237844533911428e-05,
"loss": 6.6865,
"step": 1330
},
{
"epoch": 0.4149249109769314,
"grad_norm": 1.2315603494644165,
"learning_rate": 8.29978321461753e-05,
"loss": 6.6753,
"step": 1340
},
{
"epoch": 0.41802136553646074,
"grad_norm": 1.4472100734710693,
"learning_rate": 8.36172189532363e-05,
"loss": 6.6882,
"step": 1350
},
{
"epoch": 0.4211178200959901,
"grad_norm": 1.6784274578094482,
"learning_rate": 8.423660576029731e-05,
"loss": 6.6776,
"step": 1360
},
{
"epoch": 0.4242142746555194,
"grad_norm": 2.4951741695404053,
"learning_rate": 8.485599256735831e-05,
"loss": 6.6813,
"step": 1370
},
{
"epoch": 0.42731072921504876,
"grad_norm": 2.3850290775299072,
"learning_rate": 8.547537937441932e-05,
"loss": 6.6729,
"step": 1380
},
{
"epoch": 0.4304071837745781,
"grad_norm": 0.8592017889022827,
"learning_rate": 8.609476618148034e-05,
"loss": 6.681,
"step": 1390
},
{
"epoch": 0.43350363833410743,
"grad_norm": 1.178676962852478,
"learning_rate": 8.671415298854135e-05,
"loss": 6.6717,
"step": 1400
},
{
"epoch": 0.43660009289363677,
"grad_norm": 1.6043617725372314,
"learning_rate": 8.733353979560235e-05,
"loss": 6.6647,
"step": 1410
},
{
"epoch": 0.4396965474531661,
"grad_norm": 0.872035562992096,
"learning_rate": 8.795292660266336e-05,
"loss": 6.666,
"step": 1420
},
{
"epoch": 0.44279300201269545,
"grad_norm": 1.1680723428726196,
"learning_rate": 8.857231340972438e-05,
"loss": 6.6622,
"step": 1430
},
{
"epoch": 0.4458894565722248,
"grad_norm": 0.8795621991157532,
"learning_rate": 8.919170021678539e-05,
"loss": 6.64,
"step": 1440
},
{
"epoch": 0.4489859111317541,
"grad_norm": 1.5785902738571167,
"learning_rate": 8.981108702384639e-05,
"loss": 6.6613,
"step": 1450
},
{
"epoch": 0.45208236569128346,
"grad_norm": 1.319611668586731,
"learning_rate": 9.04304738309074e-05,
"loss": 6.6573,
"step": 1460
},
{
"epoch": 0.4551788202508128,
"grad_norm": 1.0796053409576416,
"learning_rate": 9.104986063796842e-05,
"loss": 6.6614,
"step": 1470
},
{
"epoch": 0.45827527481034214,
"grad_norm": 1.2139097452163696,
"learning_rate": 9.166924744502942e-05,
"loss": 6.6461,
"step": 1480
},
{
"epoch": 0.4613717293698715,
"grad_norm": 1.3955761194229126,
"learning_rate": 9.228863425209043e-05,
"loss": 6.6611,
"step": 1490
},
{
"epoch": 0.4644681839294008,
"grad_norm": 1.5178614854812622,
"learning_rate": 9.290802105915144e-05,
"loss": 6.6615,
"step": 1500
},
{
"epoch": 0.46756463848893015,
"grad_norm": 1.3112921714782715,
"learning_rate": 9.352740786621246e-05,
"loss": 6.6644,
"step": 1510
},
{
"epoch": 0.4706610930484595,
"grad_norm": 1.5961909294128418,
"learning_rate": 9.414679467327346e-05,
"loss": 6.673,
"step": 1520
},
{
"epoch": 0.4737575476079888,
"grad_norm": 1.0166618824005127,
"learning_rate": 9.476618148033447e-05,
"loss": 6.647,
"step": 1530
},
{
"epoch": 0.47685400216751817,
"grad_norm": 1.2850325107574463,
"learning_rate": 9.538556828739548e-05,
"loss": 6.6536,
"step": 1540
},
{
"epoch": 0.4799504567270475,
"grad_norm": 1.1776533126831055,
"learning_rate": 9.600495509445648e-05,
"loss": 6.6446,
"step": 1550
},
{
"epoch": 0.4830469112865769,
"grad_norm": 1.783477544784546,
"learning_rate": 9.66243419015175e-05,
"loss": 6.6353,
"step": 1560
},
{
"epoch": 0.48614336584610623,
"grad_norm": 1.7229933738708496,
"learning_rate": 9.724372870857851e-05,
"loss": 6.6363,
"step": 1570
},
{
"epoch": 0.4892398204056356,
"grad_norm": 0.9246505498886108,
"learning_rate": 9.786311551563952e-05,
"loss": 6.6616,
"step": 1580
},
{
"epoch": 0.4923362749651649,
"grad_norm": 1.7007242441177368,
"learning_rate": 9.848250232270054e-05,
"loss": 6.6608,
"step": 1590
},
{
"epoch": 0.49543272952469425,
"grad_norm": 1.145609974861145,
"learning_rate": 9.910188912976154e-05,
"loss": 6.6282,
"step": 1600
},
{
"epoch": 0.4985291840842236,
"grad_norm": 1.1772605180740356,
"learning_rate": 9.972127593682255e-05,
"loss": 6.6463,
"step": 1610
},
{
"epoch": 0.5016256386437529,
"grad_norm": 0.8392823338508606,
"learning_rate": 0.00010034066274388355,
"loss": 6.6387,
"step": 1620
},
{
"epoch": 0.5047220932032822,
"grad_norm": 1.2767823934555054,
"learning_rate": 0.00010096004955094456,
"loss": 6.6571,
"step": 1630
},
{
"epoch": 0.5078185477628115,
"grad_norm": 2.3833205699920654,
"learning_rate": 0.00010157943635800558,
"loss": 6.6407,
"step": 1640
},
{
"epoch": 0.5109150023223409,
"grad_norm": 1.3098053932189941,
"learning_rate": 0.00010219882316506659,
"loss": 6.6357,
"step": 1650
},
{
"epoch": 0.5140114568818702,
"grad_norm": 1.2075214385986328,
"learning_rate": 0.00010281820997212759,
"loss": 6.6368,
"step": 1660
},
{
"epoch": 0.5171079114413996,
"grad_norm": 1.251852035522461,
"learning_rate": 0.00010343759677918861,
"loss": 6.6255,
"step": 1670
},
{
"epoch": 0.5202043660009289,
"grad_norm": 1.3142434358596802,
"learning_rate": 0.0001040569835862496,
"loss": 6.6218,
"step": 1680
},
{
"epoch": 0.5233008205604582,
"grad_norm": 2.4521663188934326,
"learning_rate": 0.00010467637039331063,
"loss": 6.6264,
"step": 1690
},
{
"epoch": 0.5263972751199876,
"grad_norm": 1.0846492052078247,
"learning_rate": 0.00010529575720037163,
"loss": 6.6515,
"step": 1700
},
{
"epoch": 0.5294937296795169,
"grad_norm": 1.6620179414749146,
"learning_rate": 0.00010591514400743264,
"loss": 6.6323,
"step": 1710
},
{
"epoch": 0.5325901842390462,
"grad_norm": 0.8557600975036621,
"learning_rate": 0.00010653453081449366,
"loss": 6.6437,
"step": 1720
},
{
"epoch": 0.5356866387985756,
"grad_norm": 0.6991952061653137,
"learning_rate": 0.00010715391762155466,
"loss": 6.6211,
"step": 1730
},
{
"epoch": 0.5387830933581049,
"grad_norm": 1.5852376222610474,
"learning_rate": 0.00010777330442861567,
"loss": 6.5952,
"step": 1740
},
{
"epoch": 0.5418795479176343,
"grad_norm": 1.642796516418457,
"learning_rate": 0.00010839269123567669,
"loss": 6.6295,
"step": 1750
},
{
"epoch": 0.5449760024771636,
"grad_norm": 1.2764023542404175,
"learning_rate": 0.00010901207804273768,
"loss": 6.6294,
"step": 1760
},
{
"epoch": 0.5480724570366929,
"grad_norm": 1.6868603229522705,
"learning_rate": 0.0001096314648497987,
"loss": 6.6292,
"step": 1770
},
{
"epoch": 0.5511689115962223,
"grad_norm": 1.3303276300430298,
"learning_rate": 0.00011025085165685971,
"loss": 6.5998,
"step": 1780
},
{
"epoch": 0.5542653661557516,
"grad_norm": 1.396274447441101,
"learning_rate": 0.00011087023846392072,
"loss": 6.6223,
"step": 1790
},
{
"epoch": 0.557361820715281,
"grad_norm": 0.879639744758606,
"learning_rate": 0.00011148962527098174,
"loss": 6.612,
"step": 1800
},
{
"epoch": 0.5604582752748103,
"grad_norm": 1.1366828680038452,
"learning_rate": 0.00011210901207804273,
"loss": 6.6122,
"step": 1810
},
{
"epoch": 0.5635547298343396,
"grad_norm": 1.480747938156128,
"learning_rate": 0.00011272839888510375,
"loss": 6.6094,
"step": 1820
},
{
"epoch": 0.566651184393869,
"grad_norm": 1.1296987533569336,
"learning_rate": 0.00011334778569216477,
"loss": 6.6194,
"step": 1830
},
{
"epoch": 0.5697476389533983,
"grad_norm": 0.9196439385414124,
"learning_rate": 0.00011396717249922576,
"loss": 6.5923,
"step": 1840
},
{
"epoch": 0.5728440935129276,
"grad_norm": 1.3304774761199951,
"learning_rate": 0.00011458655930628678,
"loss": 6.6129,
"step": 1850
},
{
"epoch": 0.575940548072457,
"grad_norm": 1.071112871170044,
"learning_rate": 0.00011520594611334779,
"loss": 6.6095,
"step": 1860
},
{
"epoch": 0.5790370026319864,
"grad_norm": 1.1381322145462036,
"learning_rate": 0.0001158253329204088,
"loss": 6.5962,
"step": 1870
},
{
"epoch": 0.5821334571915158,
"grad_norm": 2.608501672744751,
"learning_rate": 0.00011644471972746981,
"loss": 6.6024,
"step": 1880
},
{
"epoch": 0.5852299117510451,
"grad_norm": 1.4727625846862793,
"learning_rate": 0.0001170641065345308,
"loss": 6.5914,
"step": 1890
},
{
"epoch": 0.5883263663105744,
"grad_norm": 1.192298173904419,
"learning_rate": 0.00011768349334159183,
"loss": 6.6072,
"step": 1900
},
{
"epoch": 0.5914228208701038,
"grad_norm": 0.9773418307304382,
"learning_rate": 0.00011830288014865285,
"loss": 6.5805,
"step": 1910
},
{
"epoch": 0.5945192754296331,
"grad_norm": 1.096369743347168,
"learning_rate": 0.00011892226695571384,
"loss": 6.6052,
"step": 1920
},
{
"epoch": 0.5976157299891625,
"grad_norm": 1.2275642156600952,
"learning_rate": 0.00011954165376277486,
"loss": 6.594,
"step": 1930
},
{
"epoch": 0.6007121845486918,
"grad_norm": 1.3209136724472046,
"learning_rate": 0.00012016104056983585,
"loss": 6.6078,
"step": 1940
},
{
"epoch": 0.6038086391082211,
"grad_norm": 1.3680113554000854,
"learning_rate": 0.00012078042737689687,
"loss": 6.5793,
"step": 1950
},
{
"epoch": 0.6069050936677505,
"grad_norm": 1.2960150241851807,
"learning_rate": 0.00012139981418395789,
"loss": 6.5969,
"step": 1960
},
{
"epoch": 0.6100015482272798,
"grad_norm": 0.8884462118148804,
"learning_rate": 0.00012201920099101888,
"loss": 6.5862,
"step": 1970
},
{
"epoch": 0.6130980027868091,
"grad_norm": 0.9539084434509277,
"learning_rate": 0.0001226385877980799,
"loss": 6.5797,
"step": 1980
},
{
"epoch": 0.6161944573463385,
"grad_norm": 1.023714303970337,
"learning_rate": 0.0001232579746051409,
"loss": 6.5896,
"step": 1990
},
{
"epoch": 0.6192909119058678,
"grad_norm": 1.0426772832870483,
"learning_rate": 0.00012387736141220192,
"loss": 6.6121,
"step": 2000
},
{
"epoch": 0.6223873664653972,
"grad_norm": 1.4499601125717163,
"learning_rate": 0.00012449674821926292,
"loss": 6.6107,
"step": 2010
},
{
"epoch": 0.6254838210249265,
"grad_norm": 1.2633146047592163,
"learning_rate": 0.00012511613502632393,
"loss": 6.5991,
"step": 2020
},
{
"epoch": 0.6285802755844558,
"grad_norm": 0.845995306968689,
"learning_rate": 0.00012573552183338496,
"loss": 6.5722,
"step": 2030
},
{
"epoch": 0.6316767301439852,
"grad_norm": 1.2431766986846924,
"learning_rate": 0.00012635490864044597,
"loss": 6.5958,
"step": 2040
},
{
"epoch": 0.6347731847035145,
"grad_norm": 0.9436641335487366,
"learning_rate": 0.00012697429544750698,
"loss": 6.5901,
"step": 2050
},
{
"epoch": 0.6378696392630439,
"grad_norm": 1.334149718284607,
"learning_rate": 0.00012759368225456795,
"loss": 6.5938,
"step": 2060
},
{
"epoch": 0.6409660938225732,
"grad_norm": 0.9270686507225037,
"learning_rate": 0.000128213069061629,
"loss": 6.5767,
"step": 2070
},
{
"epoch": 0.6440625483821025,
"grad_norm": 1.3940073251724243,
"learning_rate": 0.00012883245586869,
"loss": 6.5834,
"step": 2080
},
{
"epoch": 0.6471590029416319,
"grad_norm": 1.163221001625061,
"learning_rate": 0.000129451842675751,
"loss": 6.5784,
"step": 2090
},
{
"epoch": 0.6502554575011612,
"grad_norm": 0.9691527485847473,
"learning_rate": 0.00013007122948281203,
"loss": 6.5823,
"step": 2100
},
{
"epoch": 0.6533519120606905,
"grad_norm": 0.7050260305404663,
"learning_rate": 0.000130690616289873,
"loss": 6.5847,
"step": 2110
},
{
"epoch": 0.6564483666202199,
"grad_norm": 1.2201118469238281,
"learning_rate": 0.00013131000309693402,
"loss": 6.5952,
"step": 2120
},
{
"epoch": 0.6595448211797492,
"grad_norm": 1.3519176244735718,
"learning_rate": 0.00013192938990399505,
"loss": 6.5822,
"step": 2130
},
{
"epoch": 0.6626412757392786,
"grad_norm": 1.0712783336639404,
"learning_rate": 0.00013254877671105606,
"loss": 6.5677,
"step": 2140
},
{
"epoch": 0.6657377302988079,
"grad_norm": 1.0584081411361694,
"learning_rate": 0.00013316816351811707,
"loss": 6.5859,
"step": 2150
},
{
"epoch": 0.6688341848583372,
"grad_norm": 0.8563801050186157,
"learning_rate": 0.0001337875503251781,
"loss": 6.5902,
"step": 2160
},
{
"epoch": 0.6719306394178666,
"grad_norm": 0.8715903162956238,
"learning_rate": 0.00013440693713223908,
"loss": 6.5875,
"step": 2170
},
{
"epoch": 0.6750270939773959,
"grad_norm": 1.3086822032928467,
"learning_rate": 0.00013502632393930008,
"loss": 6.5905,
"step": 2180
},
{
"epoch": 0.6781235485369252,
"grad_norm": 0.8140910267829895,
"learning_rate": 0.00013564571074636112,
"loss": 6.558,
"step": 2190
},
{
"epoch": 0.6812200030964546,
"grad_norm": 0.8857564330101013,
"learning_rate": 0.00013626509755342212,
"loss": 6.5713,
"step": 2200
},
{
"epoch": 0.6843164576559839,
"grad_norm": 1.4854942560195923,
"learning_rate": 0.00013688448436048313,
"loss": 6.5836,
"step": 2210
},
{
"epoch": 0.6874129122155133,
"grad_norm": 1.4530035257339478,
"learning_rate": 0.00013750387116754414,
"loss": 6.5756,
"step": 2220
},
{
"epoch": 0.6905093667750426,
"grad_norm": 0.8865880370140076,
"learning_rate": 0.00013812325797460514,
"loss": 6.5887,
"step": 2230
},
{
"epoch": 0.6936058213345719,
"grad_norm": 0.8601120710372925,
"learning_rate": 0.00013874264478166615,
"loss": 6.5701,
"step": 2240
},
{
"epoch": 0.6967022758941013,
"grad_norm": 0.8077085614204407,
"learning_rate": 0.00013936203158872716,
"loss": 6.5734,
"step": 2250
},
{
"epoch": 0.6997987304536306,
"grad_norm": 0.7860495448112488,
"learning_rate": 0.0001399814183957882,
"loss": 6.5609,
"step": 2260
},
{
"epoch": 0.70289518501316,
"grad_norm": 1.4957787990570068,
"learning_rate": 0.00014060080520284917,
"loss": 6.5588,
"step": 2270
},
{
"epoch": 0.7059916395726893,
"grad_norm": 1.2393313646316528,
"learning_rate": 0.00014122019200991018,
"loss": 6.5752,
"step": 2280
},
{
"epoch": 0.7090880941322186,
"grad_norm": 0.8842589855194092,
"learning_rate": 0.0001418395788169712,
"loss": 6.5574,
"step": 2290
},
{
"epoch": 0.712184548691748,
"grad_norm": 0.7826055884361267,
"learning_rate": 0.00014245896562403222,
"loss": 6.5612,
"step": 2300
},
{
"epoch": 0.7152810032512773,
"grad_norm": 0.9402616024017334,
"learning_rate": 0.00014307835243109322,
"loss": 6.5596,
"step": 2310
},
{
"epoch": 0.7183774578108066,
"grad_norm": 1.274904727935791,
"learning_rate": 0.00014369773923815423,
"loss": 6.5796,
"step": 2320
},
{
"epoch": 0.721473912370336,
"grad_norm": 1.112528681755066,
"learning_rate": 0.00014431712604521523,
"loss": 6.5563,
"step": 2330
},
{
"epoch": 0.7245703669298653,
"grad_norm": 0.8044337630271912,
"learning_rate": 0.00014493651285227624,
"loss": 6.547,
"step": 2340
},
{
"epoch": 0.7276668214893947,
"grad_norm": 1.0962836742401123,
"learning_rate": 0.00014555589965933727,
"loss": 6.5543,
"step": 2350
},
{
"epoch": 0.730763276048924,
"grad_norm": 1.0332891941070557,
"learning_rate": 0.00014617528646639828,
"loss": 6.5486,
"step": 2360
},
{
"epoch": 0.7338597306084533,
"grad_norm": 0.9583357572555542,
"learning_rate": 0.00014679467327345926,
"loss": 6.5602,
"step": 2370
},
{
"epoch": 0.7369561851679827,
"grad_norm": 1.0913727283477783,
"learning_rate": 0.0001474140600805203,
"loss": 6.5468,
"step": 2380
},
{
"epoch": 0.740052639727512,
"grad_norm": 1.192328929901123,
"learning_rate": 0.0001480334468875813,
"loss": 6.5476,
"step": 2390
},
{
"epoch": 0.7431490942870413,
"grad_norm": 1.3153208494186401,
"learning_rate": 0.0001486528336946423,
"loss": 6.5502,
"step": 2400
},
{
"epoch": 0.7462455488465707,
"grad_norm": 1.0659363269805908,
"learning_rate": 0.0001492722205017033,
"loss": 6.5458,
"step": 2410
},
{
"epoch": 0.7493420034061,
"grad_norm": 0.6409627199172974,
"learning_rate": 0.00014989160730876435,
"loss": 6.5615,
"step": 2420
},
{
"epoch": 0.7524384579656294,
"grad_norm": 1.534621238708496,
"learning_rate": 0.00015051099411582532,
"loss": 6.5413,
"step": 2430
},
{
"epoch": 0.7555349125251587,
"grad_norm": 0.8091804385185242,
"learning_rate": 0.00015113038092288633,
"loss": 6.5558,
"step": 2440
},
{
"epoch": 0.758631367084688,
"grad_norm": 1.1276757717132568,
"learning_rate": 0.00015174976772994736,
"loss": 6.5495,
"step": 2450
},
{
"epoch": 0.7617278216442174,
"grad_norm": 1.1171313524246216,
"learning_rate": 0.00015236915453700837,
"loss": 6.5202,
"step": 2460
},
{
"epoch": 0.7648242762037467,
"grad_norm": 0.8118519186973572,
"learning_rate": 0.00015298854134406938,
"loss": 6.5484,
"step": 2470
},
{
"epoch": 0.767920730763276,
"grad_norm": 0.835800290107727,
"learning_rate": 0.00015360792815113038,
"loss": 6.5512,
"step": 2480
},
{
"epoch": 0.7710171853228054,
"grad_norm": 1.2488937377929688,
"learning_rate": 0.0001542273149581914,
"loss": 6.5434,
"step": 2490
},
{
"epoch": 0.7741136398823347,
"grad_norm": 1.0001873970031738,
"learning_rate": 0.0001548467017652524,
"loss": 6.5562,
"step": 2500
},
{
"epoch": 0.7772100944418641,
"grad_norm": 1.329168438911438,
"learning_rate": 0.00015546608857231343,
"loss": 6.5432,
"step": 2510
},
{
"epoch": 0.7803065490013934,
"grad_norm": 1.0739688873291016,
"learning_rate": 0.00015608547537937444,
"loss": 6.5359,
"step": 2520
},
{
"epoch": 0.7834030035609227,
"grad_norm": 1.1103359460830688,
"learning_rate": 0.00015670486218643541,
"loss": 6.5514,
"step": 2530
},
{
"epoch": 0.7864994581204521,
"grad_norm": 0.7088027596473694,
"learning_rate": 0.00015732424899349645,
"loss": 6.5415,
"step": 2540
},
{
"epoch": 0.7895959126799814,
"grad_norm": 1.141654133796692,
"learning_rate": 0.00015794363580055745,
"loss": 6.5505,
"step": 2550
},
{
"epoch": 0.7926923672395108,
"grad_norm": 0.9900869727134705,
"learning_rate": 0.00015856302260761846,
"loss": 6.5505,
"step": 2560
},
{
"epoch": 0.7957888217990401,
"grad_norm": 0.9820410013198853,
"learning_rate": 0.00015918240941467947,
"loss": 6.5306,
"step": 2570
},
{
"epoch": 0.7988852763585694,
"grad_norm": 1.1498329639434814,
"learning_rate": 0.00015980179622174047,
"loss": 6.5308,
"step": 2580
},
{
"epoch": 0.8019817309180988,
"grad_norm": 1.5919135808944702,
"learning_rate": 0.00016042118302880148,
"loss": 6.5458,
"step": 2590
},
{
"epoch": 0.8050781854776281,
"grad_norm": 1.7433273792266846,
"learning_rate": 0.00016104056983586249,
"loss": 6.5307,
"step": 2600
},
{
"epoch": 0.8081746400371574,
"grad_norm": 1.2043076753616333,
"learning_rate": 0.00016165995664292352,
"loss": 6.5347,
"step": 2610
},
{
"epoch": 0.8112710945966868,
"grad_norm": 1.2197911739349365,
"learning_rate": 0.00016227934344998453,
"loss": 6.5282,
"step": 2620
},
{
"epoch": 0.8143675491562161,
"grad_norm": 0.8074585199356079,
"learning_rate": 0.0001628987302570455,
"loss": 6.542,
"step": 2630
},
{
"epoch": 0.8174640037157455,
"grad_norm": 1.1308220624923706,
"learning_rate": 0.00016351811706410654,
"loss": 6.547,
"step": 2640
},
{
"epoch": 0.8205604582752748,
"grad_norm": 0.989686131477356,
"learning_rate": 0.00016413750387116755,
"loss": 6.5418,
"step": 2650
},
{
"epoch": 0.8236569128348041,
"grad_norm": 0.9891242980957031,
"learning_rate": 0.00016475689067822855,
"loss": 6.5211,
"step": 2660
},
{
"epoch": 0.8267533673943335,
"grad_norm": 0.9230055212974548,
"learning_rate": 0.00016537627748528958,
"loss": 6.5371,
"step": 2670
},
{
"epoch": 0.8298498219538628,
"grad_norm": 0.9501631259918213,
"learning_rate": 0.0001659956642923506,
"loss": 6.5325,
"step": 2680
},
{
"epoch": 0.8329462765133921,
"grad_norm": 0.9849696755409241,
"learning_rate": 0.00016661505109941157,
"loss": 6.5395,
"step": 2690
},
{
"epoch": 0.8360427310729215,
"grad_norm": 1.2444876432418823,
"learning_rate": 0.0001672344379064726,
"loss": 6.5156,
"step": 2700
},
{
"epoch": 0.8391391856324508,
"grad_norm": 0.7472972869873047,
"learning_rate": 0.0001678538247135336,
"loss": 6.5228,
"step": 2710
},
{
"epoch": 0.8422356401919802,
"grad_norm": 0.8915477991104126,
"learning_rate": 0.00016847321152059462,
"loss": 6.5242,
"step": 2720
},
{
"epoch": 0.8453320947515095,
"grad_norm": 1.0017406940460205,
"learning_rate": 0.00016909259832765562,
"loss": 6.5449,
"step": 2730
},
{
"epoch": 0.8484285493110388,
"grad_norm": 1.0427559614181519,
"learning_rate": 0.00016971198513471663,
"loss": 6.5257,
"step": 2740
},
{
"epoch": 0.8515250038705682,
"grad_norm": 0.8571954965591431,
"learning_rate": 0.00017033137194177764,
"loss": 6.5203,
"step": 2750
},
{
"epoch": 0.8546214584300975,
"grad_norm": 1.0811147689819336,
"learning_rate": 0.00017095075874883864,
"loss": 6.5196,
"step": 2760
},
{
"epoch": 0.8577179129896269,
"grad_norm": 0.9217764735221863,
"learning_rate": 0.00017157014555589968,
"loss": 6.5189,
"step": 2770
},
{
"epoch": 0.8608143675491562,
"grad_norm": 0.9920642375946045,
"learning_rate": 0.00017218953236296068,
"loss": 6.5191,
"step": 2780
},
{
"epoch": 0.8639108221086855,
"grad_norm": 1.0834949016571045,
"learning_rate": 0.00017280891917002166,
"loss": 6.5227,
"step": 2790
},
{
"epoch": 0.8670072766682149,
"grad_norm": 0.916513204574585,
"learning_rate": 0.0001734283059770827,
"loss": 6.5144,
"step": 2800
},
{
"epoch": 0.8701037312277442,
"grad_norm": 1.2615902423858643,
"learning_rate": 0.0001740476927841437,
"loss": 6.506,
"step": 2810
},
{
"epoch": 0.8732001857872735,
"grad_norm": 0.8685635924339294,
"learning_rate": 0.0001746670795912047,
"loss": 6.5142,
"step": 2820
},
{
"epoch": 0.8762966403468029,
"grad_norm": 0.8606330156326294,
"learning_rate": 0.00017528646639826574,
"loss": 6.5019,
"step": 2830
},
{
"epoch": 0.8793930949063322,
"grad_norm": 0.7754759192466736,
"learning_rate": 0.00017590585320532672,
"loss": 6.5119,
"step": 2840
},
{
"epoch": 0.8824895494658616,
"grad_norm": 0.8332505226135254,
"learning_rate": 0.00017652524001238773,
"loss": 6.5283,
"step": 2850
},
{
"epoch": 0.8855860040253909,
"grad_norm": 1.1799520254135132,
"learning_rate": 0.00017714462681944876,
"loss": 6.5043,
"step": 2860
},
{
"epoch": 0.8886824585849202,
"grad_norm": 0.9492645859718323,
"learning_rate": 0.00017776401362650977,
"loss": 6.5106,
"step": 2870
},
{
"epoch": 0.8917789131444496,
"grad_norm": 0.7921923995018005,
"learning_rate": 0.00017838340043357077,
"loss": 6.5065,
"step": 2880
},
{
"epoch": 0.8948753677039789,
"grad_norm": 0.6766930818557739,
"learning_rate": 0.00017900278724063175,
"loss": 6.5239,
"step": 2890
},
{
"epoch": 0.8979718222635082,
"grad_norm": 0.7052696347236633,
"learning_rate": 0.00017962217404769278,
"loss": 6.5378,
"step": 2900
},
{
"epoch": 0.9010682768230376,
"grad_norm": 0.973673403263092,
"learning_rate": 0.0001802415608547538,
"loss": 6.5099,
"step": 2910
},
{
"epoch": 0.9041647313825669,
"grad_norm": 0.8590471744537354,
"learning_rate": 0.0001808609476618148,
"loss": 6.522,
"step": 2920
},
{
"epoch": 0.9072611859420963,
"grad_norm": 0.9478482604026794,
"learning_rate": 0.00018148033446887583,
"loss": 6.5148,
"step": 2930
},
{
"epoch": 0.9103576405016256,
"grad_norm": 0.991057813167572,
"learning_rate": 0.00018209972127593684,
"loss": 6.4991,
"step": 2940
},
{
"epoch": 0.9134540950611549,
"grad_norm": 0.8526809811592102,
"learning_rate": 0.00018271910808299782,
"loss": 6.5164,
"step": 2950
},
{
"epoch": 0.9165505496206843,
"grad_norm": 0.6919571161270142,
"learning_rate": 0.00018333849489005885,
"loss": 6.5214,
"step": 2960
},
{
"epoch": 0.9196470041802136,
"grad_norm": 0.657346248626709,
"learning_rate": 0.00018395788169711986,
"loss": 6.5013,
"step": 2970
},
{
"epoch": 0.922743458739743,
"grad_norm": 0.8530818223953247,
"learning_rate": 0.00018457726850418086,
"loss": 6.5145,
"step": 2980
},
{
"epoch": 0.9258399132992723,
"grad_norm": 0.8030965328216553,
"learning_rate": 0.0001851966553112419,
"loss": 6.513,
"step": 2990
},
{
"epoch": 0.9289363678588016,
"grad_norm": 0.8161980509757996,
"learning_rate": 0.00018581604211830288,
"loss": 6.5074,
"step": 3000
},
{
"epoch": 0.932032822418331,
"grad_norm": 0.9112780094146729,
"learning_rate": 0.00018643542892536388,
"loss": 6.4961,
"step": 3010
},
{
"epoch": 0.9351292769778603,
"grad_norm": 0.8977142572402954,
"learning_rate": 0.00018705481573242491,
"loss": 6.4973,
"step": 3020
},
{
"epoch": 0.9382257315373896,
"grad_norm": 1.0232683420181274,
"learning_rate": 0.00018767420253948592,
"loss": 6.48,
"step": 3030
},
{
"epoch": 0.941322186096919,
"grad_norm": 0.8228316307067871,
"learning_rate": 0.00018829358934654693,
"loss": 6.4842,
"step": 3040
},
{
"epoch": 0.9444186406564483,
"grad_norm": 0.724467396736145,
"learning_rate": 0.0001889129761536079,
"loss": 6.4781,
"step": 3050
},
{
"epoch": 0.9475150952159777,
"grad_norm": 0.9022755026817322,
"learning_rate": 0.00018953236296066894,
"loss": 6.4799,
"step": 3060
},
{
"epoch": 0.950611549775507,
"grad_norm": 1.0211142301559448,
"learning_rate": 0.00019015174976772995,
"loss": 6.4719,
"step": 3070
},
{
"epoch": 0.9537080043350363,
"grad_norm": 0.7571627497673035,
"learning_rate": 0.00019077113657479095,
"loss": 6.4685,
"step": 3080
},
{
"epoch": 0.9568044588945657,
"grad_norm": 0.797822117805481,
"learning_rate": 0.00019139052338185199,
"loss": 6.4502,
"step": 3090
},
{
"epoch": 0.959900913454095,
"grad_norm": 1.1731350421905518,
"learning_rate": 0.00019200991018891297,
"loss": 6.4534,
"step": 3100
},
{
"epoch": 0.9629973680136245,
"grad_norm": 0.7823401689529419,
"learning_rate": 0.00019262929699597397,
"loss": 6.4611,
"step": 3110
},
{
"epoch": 0.9660938225731538,
"grad_norm": 1.2475049495697021,
"learning_rate": 0.000193248683803035,
"loss": 6.4389,
"step": 3120
},
{
"epoch": 0.9691902771326831,
"grad_norm": 0.9524723887443542,
"learning_rate": 0.000193868070610096,
"loss": 6.4435,
"step": 3130
},
{
"epoch": 0.9722867316922125,
"grad_norm": 0.9494399428367615,
"learning_rate": 0.00019448745741715702,
"loss": 6.4332,
"step": 3140
},
{
"epoch": 0.9753831862517418,
"grad_norm": 1.0070710182189941,
"learning_rate": 0.00019510684422421805,
"loss": 6.4529,
"step": 3150
},
{
"epoch": 0.9784796408112711,
"grad_norm": 1.180368185043335,
"learning_rate": 0.00019572623103127903,
"loss": 6.4369,
"step": 3160
},
{
"epoch": 0.9815760953708005,
"grad_norm": 1.0592350959777832,
"learning_rate": 0.00019634561783834004,
"loss": 6.4402,
"step": 3170
},
{
"epoch": 0.9846725499303298,
"grad_norm": 0.8868720531463623,
"learning_rate": 0.00019696500464540107,
"loss": 6.4406,
"step": 3180
},
{
"epoch": 0.9877690044898592,
"grad_norm": 0.8809700608253479,
"learning_rate": 0.00019758439145246208,
"loss": 6.4125,
"step": 3190
},
{
"epoch": 0.9908654590493885,
"grad_norm": 0.8676486611366272,
"learning_rate": 0.00019820377825952308,
"loss": 6.4281,
"step": 3200
},
{
"epoch": 0.9939619136089178,
"grad_norm": 0.6752346754074097,
"learning_rate": 0.0001988231650665841,
"loss": 6.4041,
"step": 3210
},
{
"epoch": 0.9970583681684472,
"grad_norm": 0.9568197131156921,
"learning_rate": 0.0001994425518736451,
"loss": 6.3981,
"step": 3220
},
{
"epoch": 1.0,
"grad_norm": 0.5673872828483582,
"learning_rate": 0.0002000619386807061,
"loss": 6.071,
"step": 3230
},
{
"epoch": 1.0030964545595293,
"grad_norm": 0.8268159627914429,
"learning_rate": 0.0002006813254877671,
"loss": 6.3988,
"step": 3240
},
{
"epoch": 1.0061929091190587,
"grad_norm": 0.7635223269462585,
"learning_rate": 0.00020130071229482814,
"loss": 6.3831,
"step": 3250
},
{
"epoch": 1.009289363678588,
"grad_norm": 0.7615036368370056,
"learning_rate": 0.00020192009910188912,
"loss": 6.3832,
"step": 3260
},
{
"epoch": 1.0123858182381174,
"grad_norm": 0.7862409353256226,
"learning_rate": 0.00020253948590895013,
"loss": 6.3929,
"step": 3270
},
{
"epoch": 1.0154822727976467,
"grad_norm": 1.113342046737671,
"learning_rate": 0.00020315887271601116,
"loss": 6.3917,
"step": 3280
},
{
"epoch": 1.018578727357176,
"grad_norm": 0.8702403903007507,
"learning_rate": 0.00020377825952307217,
"loss": 6.3783,
"step": 3290
},
{
"epoch": 1.0216751819167054,
"grad_norm": 0.8440068960189819,
"learning_rate": 0.00020439764633013317,
"loss": 6.3777,
"step": 3300
},
{
"epoch": 1.0247716364762347,
"grad_norm": 1.1612240076065063,
"learning_rate": 0.00020501703313719418,
"loss": 6.3617,
"step": 3310
},
{
"epoch": 1.027868091035764,
"grad_norm": 0.8664381504058838,
"learning_rate": 0.00020563641994425519,
"loss": 6.3766,
"step": 3320
},
{
"epoch": 1.0309645455952934,
"grad_norm": 1.2137264013290405,
"learning_rate": 0.0002062558067513162,
"loss": 6.3724,
"step": 3330
},
{
"epoch": 1.0340610001548227,
"grad_norm": 1.2266614437103271,
"learning_rate": 0.00020687519355837723,
"loss": 6.3313,
"step": 3340
},
{
"epoch": 1.037157454714352,
"grad_norm": 0.8568953275680542,
"learning_rate": 0.00020749458036543823,
"loss": 6.3583,
"step": 3350
},
{
"epoch": 1.0402539092738814,
"grad_norm": 0.874577522277832,
"learning_rate": 0.0002081139671724992,
"loss": 6.3511,
"step": 3360
},
{
"epoch": 1.0433503638334107,
"grad_norm": 1.1219960451126099,
"learning_rate": 0.00020873335397956024,
"loss": 6.323,
"step": 3370
},
{
"epoch": 1.04644681839294,
"grad_norm": 1.1575599908828735,
"learning_rate": 0.00020935274078662125,
"loss": 6.3426,
"step": 3380
},
{
"epoch": 1.0495432729524694,
"grad_norm": 0.7617483139038086,
"learning_rate": 0.00020997212759368226,
"loss": 6.3243,
"step": 3390
},
{
"epoch": 1.0526397275119987,
"grad_norm": 1.019921064376831,
"learning_rate": 0.00021059151440074326,
"loss": 6.303,
"step": 3400
},
{
"epoch": 1.055736182071528,
"grad_norm": 1.034369945526123,
"learning_rate": 0.0002112109012078043,
"loss": 6.3092,
"step": 3410
},
{
"epoch": 1.0588326366310574,
"grad_norm": 1.1426433324813843,
"learning_rate": 0.00021183028801486528,
"loss": 6.3023,
"step": 3420
},
{
"epoch": 1.0619290911905868,
"grad_norm": 0.9942957162857056,
"learning_rate": 0.00021244967482192628,
"loss": 6.3104,
"step": 3430
},
{
"epoch": 1.065025545750116,
"grad_norm": 1.0719786882400513,
"learning_rate": 0.00021306906162898732,
"loss": 6.2973,
"step": 3440
},
{
"epoch": 1.0681220003096454,
"grad_norm": 1.0518437623977661,
"learning_rate": 0.00021368844843604832,
"loss": 6.2967,
"step": 3450
},
{
"epoch": 1.0712184548691748,
"grad_norm": 1.2732771635055542,
"learning_rate": 0.00021430783524310933,
"loss": 6.2746,
"step": 3460
},
{
"epoch": 1.0743149094287041,
"grad_norm": 1.5430597066879272,
"learning_rate": 0.00021492722205017034,
"loss": 6.2815,
"step": 3470
},
{
"epoch": 1.0774113639882335,
"grad_norm": 0.8930633068084717,
"learning_rate": 0.00021554660885723134,
"loss": 6.251,
"step": 3480
},
{
"epoch": 1.0805078185477628,
"grad_norm": 1.1095397472381592,
"learning_rate": 0.00021616599566429235,
"loss": 6.238,
"step": 3490
},
{
"epoch": 1.0836042731072921,
"grad_norm": 1.1570417881011963,
"learning_rate": 0.00021678538247135338,
"loss": 6.2169,
"step": 3500
},
{
"epoch": 1.0867007276668215,
"grad_norm": 1.2682262659072876,
"learning_rate": 0.0002174047692784144,
"loss": 6.2256,
"step": 3510
},
{
"epoch": 1.0897971822263508,
"grad_norm": 1.2010442018508911,
"learning_rate": 0.00021802415608547537,
"loss": 6.2034,
"step": 3520
},
{
"epoch": 1.0928936367858801,
"grad_norm": 1.3368873596191406,
"learning_rate": 0.0002186435428925364,
"loss": 6.1673,
"step": 3530
},
{
"epoch": 1.0959900913454095,
"grad_norm": 1.1895204782485962,
"learning_rate": 0.0002192629296995974,
"loss": 6.1546,
"step": 3540
},
{
"epoch": 1.0990865459049388,
"grad_norm": 1.1519889831542969,
"learning_rate": 0.0002198823165066584,
"loss": 6.1365,
"step": 3550
},
{
"epoch": 1.1021830004644682,
"grad_norm": 1.3705570697784424,
"learning_rate": 0.00022050170331371942,
"loss": 6.07,
"step": 3560
},
{
"epoch": 1.1052794550239975,
"grad_norm": 1.6378076076507568,
"learning_rate": 0.00022112109012078043,
"loss": 6.0432,
"step": 3570
},
{
"epoch": 1.1083759095835268,
"grad_norm": 1.5780587196350098,
"learning_rate": 0.00022174047692784143,
"loss": 6.0201,
"step": 3580
},
{
"epoch": 1.1114723641430562,
"grad_norm": 1.2604175806045532,
"learning_rate": 0.00022235986373490244,
"loss": 5.9781,
"step": 3590
},
{
"epoch": 1.1145688187025855,
"grad_norm": 1.4099502563476562,
"learning_rate": 0.00022297925054196347,
"loss": 5.9298,
"step": 3600
},
{
"epoch": 1.1176652732621148,
"grad_norm": 1.233045220375061,
"learning_rate": 0.00022359863734902448,
"loss": 5.8924,
"step": 3610
},
{
"epoch": 1.1207617278216442,
"grad_norm": 1.4948160648345947,
"learning_rate": 0.00022421802415608546,
"loss": 5.8785,
"step": 3620
},
{
"epoch": 1.1238581823811735,
"grad_norm": 1.7853126525878906,
"learning_rate": 0.0002248374109631465,
"loss": 5.8134,
"step": 3630
},
{
"epoch": 1.1269546369407029,
"grad_norm": 2.1024398803710938,
"learning_rate": 0.0002254567977702075,
"loss": 5.7864,
"step": 3640
},
{
"epoch": 1.1300510915002322,
"grad_norm": 1.6895965337753296,
"learning_rate": 0.0002260761845772685,
"loss": 5.7182,
"step": 3650
},
{
"epoch": 1.1331475460597615,
"grad_norm": 1.7023606300354004,
"learning_rate": 0.00022669557138432954,
"loss": 5.6528,
"step": 3660
},
{
"epoch": 1.1362440006192909,
"grad_norm": 1.2533527612686157,
"learning_rate": 0.00022731495819139054,
"loss": 5.6221,
"step": 3670
},
{
"epoch": 1.1393404551788202,
"grad_norm": 1.621505618095398,
"learning_rate": 0.00022793434499845152,
"loss": 5.5583,
"step": 3680
},
{
"epoch": 1.1424369097383495,
"grad_norm": 1.3869857788085938,
"learning_rate": 0.00022855373180551256,
"loss": 5.5145,
"step": 3690
},
{
"epoch": 1.1455333642978789,
"grad_norm": 1.542646884918213,
"learning_rate": 0.00022917311861257356,
"loss": 5.4559,
"step": 3700
},
{
"epoch": 1.1486298188574082,
"grad_norm": 1.4515721797943115,
"learning_rate": 0.00022979250541963457,
"loss": 5.4007,
"step": 3710
},
{
"epoch": 1.1517262734169376,
"grad_norm": 1.7579517364501953,
"learning_rate": 0.00023041189222669557,
"loss": 5.3976,
"step": 3720
},
{
"epoch": 1.154822727976467,
"grad_norm": 1.2533565759658813,
"learning_rate": 0.00023103127903375658,
"loss": 5.3172,
"step": 3730
},
{
"epoch": 1.1579191825359962,
"grad_norm": 1.494162917137146,
"learning_rate": 0.0002316506658408176,
"loss": 5.2509,
"step": 3740
},
{
"epoch": 1.1610156370955256,
"grad_norm": 1.595115065574646,
"learning_rate": 0.0002322700526478786,
"loss": 5.1784,
"step": 3750
},
{
"epoch": 1.164112091655055,
"grad_norm": 1.55663001537323,
"learning_rate": 0.00023288943945493963,
"loss": 5.0949,
"step": 3760
},
{
"epoch": 1.1672085462145843,
"grad_norm": 1.374272346496582,
"learning_rate": 0.00023350882626200063,
"loss": 5.0331,
"step": 3770
},
{
"epoch": 1.1703050007741136,
"grad_norm": 1.3195029497146606,
"learning_rate": 0.0002341282130690616,
"loss": 4.9576,
"step": 3780
},
{
"epoch": 1.173401455333643,
"grad_norm": 1.161839485168457,
"learning_rate": 0.00023474759987612265,
"loss": 4.9166,
"step": 3790
},
{
"epoch": 1.1764979098931723,
"grad_norm": 1.2902604341506958,
"learning_rate": 0.00023536698668318365,
"loss": 4.8334,
"step": 3800
},
{
"epoch": 1.1795943644527016,
"grad_norm": 1.0339348316192627,
"learning_rate": 0.00023598637349024466,
"loss": 4.7735,
"step": 3810
},
{
"epoch": 1.182690819012231,
"grad_norm": 1.1560925245285034,
"learning_rate": 0.0002366057602973057,
"loss": 4.7389,
"step": 3820
},
{
"epoch": 1.1857872735717603,
"grad_norm": 1.0810256004333496,
"learning_rate": 0.00023722514710436667,
"loss": 4.673,
"step": 3830
},
{
"epoch": 1.1888837281312896,
"grad_norm": 1.187358021736145,
"learning_rate": 0.00023784453391142768,
"loss": 4.6703,
"step": 3840
},
{
"epoch": 1.191980182690819,
"grad_norm": 1.2153098583221436,
"learning_rate": 0.0002384639207184887,
"loss": 4.5977,
"step": 3850
},
{
"epoch": 1.1950766372503483,
"grad_norm": 1.3098320960998535,
"learning_rate": 0.00023908330752554972,
"loss": 4.5396,
"step": 3860
},
{
"epoch": 1.1981730918098776,
"grad_norm": 1.3841015100479126,
"learning_rate": 0.00023970269433261072,
"loss": 4.5191,
"step": 3870
},
{
"epoch": 1.201269546369407,
"grad_norm": 1.0185471773147583,
"learning_rate": 0.0002403220811396717,
"loss": 4.4745,
"step": 3880
},
{
"epoch": 1.2043660009289363,
"grad_norm": 1.0954643487930298,
"learning_rate": 0.00024094146794673274,
"loss": 4.4384,
"step": 3890
},
{
"epoch": 1.2074624554884656,
"grad_norm": 1.0373002290725708,
"learning_rate": 0.00024156085475379374,
"loss": 4.4049,
"step": 3900
},
{
"epoch": 1.210558910047995,
"grad_norm": 1.0706144571304321,
"learning_rate": 0.00024218024156085475,
"loss": 4.3815,
"step": 3910
},
{
"epoch": 1.2136553646075243,
"grad_norm": 1.1758544445037842,
"learning_rate": 0.00024279962836791578,
"loss": 4.3619,
"step": 3920
},
{
"epoch": 1.2167518191670537,
"grad_norm": 1.1079212427139282,
"learning_rate": 0.0002434190151749768,
"loss": 4.337,
"step": 3930
},
{
"epoch": 1.219848273726583,
"grad_norm": 1.1753212213516235,
"learning_rate": 0.00024403840198203777,
"loss": 4.3046,
"step": 3940
},
{
"epoch": 1.2229447282861123,
"grad_norm": 1.1949397325515747,
"learning_rate": 0.00024465778878909883,
"loss": 4.3101,
"step": 3950
},
{
"epoch": 1.2260411828456417,
"grad_norm": 1.0809822082519531,
"learning_rate": 0.0002452771755961598,
"loss": 4.2624,
"step": 3960
},
{
"epoch": 1.229137637405171,
"grad_norm": 1.113866925239563,
"learning_rate": 0.0002458965624032208,
"loss": 4.2679,
"step": 3970
},
{
"epoch": 1.2322340919647004,
"grad_norm": 1.1212016344070435,
"learning_rate": 0.0002465159492102818,
"loss": 4.2071,
"step": 3980
},
{
"epoch": 1.2353305465242297,
"grad_norm": 1.1517590284347534,
"learning_rate": 0.00024713533601734285,
"loss": 4.1929,
"step": 3990
},
{
"epoch": 1.238427001083759,
"grad_norm": 0.9486988186836243,
"learning_rate": 0.00024775472282440383,
"loss": 4.1934,
"step": 4000
},
{
"epoch": 1.2415234556432884,
"grad_norm": 1.0706721544265747,
"learning_rate": 0.00024837410963146487,
"loss": 4.1776,
"step": 4010
},
{
"epoch": 1.2446199102028177,
"grad_norm": 1.1148719787597656,
"learning_rate": 0.00024899349643852585,
"loss": 4.1265,
"step": 4020
},
{
"epoch": 1.247716364762347,
"grad_norm": 1.1065315008163452,
"learning_rate": 0.0002496128832455869,
"loss": 4.1034,
"step": 4030
},
{
"epoch": 1.2508128193218764,
"grad_norm": 1.158066987991333,
"learning_rate": 0.00025023227005264786,
"loss": 4.0863,
"step": 4040
},
{
"epoch": 1.2539092738814057,
"grad_norm": 1.1560614109039307,
"learning_rate": 0.0002508516568597089,
"loss": 4.0778,
"step": 4050
},
{
"epoch": 1.257005728440935,
"grad_norm": 0.98968905210495,
"learning_rate": 0.0002514710436667699,
"loss": 4.051,
"step": 4060
},
{
"epoch": 1.2601021830004644,
"grad_norm": 1.1713204383850098,
"learning_rate": 0.0002520904304738309,
"loss": 4.0181,
"step": 4070
},
{
"epoch": 1.2631986375599937,
"grad_norm": 1.1065443754196167,
"learning_rate": 0.00025270981728089194,
"loss": 4.0239,
"step": 4080
},
{
"epoch": 1.266295092119523,
"grad_norm": 1.043097972869873,
"learning_rate": 0.00025332920408795297,
"loss": 4.0208,
"step": 4090
},
{
"epoch": 1.2693915466790524,
"grad_norm": 1.024276614189148,
"learning_rate": 0.00025394859089501395,
"loss": 3.9804,
"step": 4100
},
{
"epoch": 1.2724880012385817,
"grad_norm": 1.1613043546676636,
"learning_rate": 0.000254567977702075,
"loss": 3.9819,
"step": 4110
},
{
"epoch": 1.275584455798111,
"grad_norm": 1.0510482788085938,
"learning_rate": 0.0002551873645091359,
"loss": 3.9696,
"step": 4120
},
{
"epoch": 1.2786809103576404,
"grad_norm": 0.9902080297470093,
"learning_rate": 0.00025580675131619694,
"loss": 3.9233,
"step": 4130
},
{
"epoch": 1.2817773649171698,
"grad_norm": 1.165866732597351,
"learning_rate": 0.000256426138123258,
"loss": 3.9079,
"step": 4140
},
{
"epoch": 1.284873819476699,
"grad_norm": 1.0561455488204956,
"learning_rate": 0.00025704552493031896,
"loss": 3.9072,
"step": 4150
},
{
"epoch": 1.2879702740362284,
"grad_norm": 0.989741325378418,
"learning_rate": 0.00025766491173738,
"loss": 3.9018,
"step": 4160
},
{
"epoch": 1.2910667285957578,
"grad_norm": 1.099219799041748,
"learning_rate": 0.000258284298544441,
"loss": 3.868,
"step": 4170
},
{
"epoch": 1.2941631831552871,
"grad_norm": 1.1154602766036987,
"learning_rate": 0.000258903685351502,
"loss": 3.8644,
"step": 4180
},
{
"epoch": 1.2972596377148164,
"grad_norm": 1.0872890949249268,
"learning_rate": 0.00025952307215856304,
"loss": 3.8587,
"step": 4190
},
{
"epoch": 1.3003560922743458,
"grad_norm": 1.0499584674835205,
"learning_rate": 0.00026014245896562407,
"loss": 3.8235,
"step": 4200
},
{
"epoch": 1.3034525468338751,
"grad_norm": 1.030174732208252,
"learning_rate": 0.00026076184577268505,
"loss": 3.8302,
"step": 4210
},
{
"epoch": 1.3065490013934045,
"grad_norm": 1.0867342948913574,
"learning_rate": 0.000261381232579746,
"loss": 3.8341,
"step": 4220
},
{
"epoch": 1.3096454559529338,
"grad_norm": 1.0520577430725098,
"learning_rate": 0.00026200061938680706,
"loss": 3.8018,
"step": 4230
},
{
"epoch": 1.3127419105124631,
"grad_norm": 1.0809017419815063,
"learning_rate": 0.00026262000619386804,
"loss": 3.7748,
"step": 4240
},
{
"epoch": 1.3158383650719925,
"grad_norm": 1.1091547012329102,
"learning_rate": 0.0002632393930009291,
"loss": 3.7732,
"step": 4250
},
{
"epoch": 1.3189348196315218,
"grad_norm": 1.0448859930038452,
"learning_rate": 0.0002638587798079901,
"loss": 3.74,
"step": 4260
},
{
"epoch": 1.3220312741910512,
"grad_norm": 1.0798423290252686,
"learning_rate": 0.0002644781666150511,
"loss": 3.7374,
"step": 4270
},
{
"epoch": 1.3251277287505805,
"grad_norm": 0.9496048092842102,
"learning_rate": 0.0002650975534221121,
"loss": 3.7422,
"step": 4280
},
{
"epoch": 1.3282241833101098,
"grad_norm": 0.9731584787368774,
"learning_rate": 0.00026571694022917315,
"loss": 3.6992,
"step": 4290
},
{
"epoch": 1.3313206378696392,
"grad_norm": 0.9330194592475891,
"learning_rate": 0.00026633632703623413,
"loss": 3.6868,
"step": 4300
},
{
"epoch": 1.3344170924291685,
"grad_norm": 1.0531985759735107,
"learning_rate": 0.00026695571384329517,
"loss": 3.6958,
"step": 4310
},
{
"epoch": 1.3375135469886978,
"grad_norm": 0.9694075584411621,
"learning_rate": 0.0002675751006503562,
"loss": 3.7137,
"step": 4320
},
{
"epoch": 1.3406100015482272,
"grad_norm": 0.9474936723709106,
"learning_rate": 0.0002681944874574171,
"loss": 3.6889,
"step": 4330
},
{
"epoch": 1.3437064561077565,
"grad_norm": 0.9624688029289246,
"learning_rate": 0.00026881387426447816,
"loss": 3.6531,
"step": 4340
},
{
"epoch": 1.3468029106672859,
"grad_norm": 0.9767426252365112,
"learning_rate": 0.0002694332610715392,
"loss": 3.6596,
"step": 4350
},
{
"epoch": 1.3498993652268152,
"grad_norm": 0.9959364533424377,
"learning_rate": 0.00027005264787860017,
"loss": 3.6434,
"step": 4360
},
{
"epoch": 1.3529958197863445,
"grad_norm": 1.0519224405288696,
"learning_rate": 0.0002706720346856612,
"loss": 3.5982,
"step": 4370
},
{
"epoch": 1.3560922743458739,
"grad_norm": 0.9964626431465149,
"learning_rate": 0.00027129142149272224,
"loss": 3.6145,
"step": 4380
},
{
"epoch": 1.3591887289054032,
"grad_norm": 1.0506435632705688,
"learning_rate": 0.0002719108082997832,
"loss": 3.5859,
"step": 4390
},
{
"epoch": 1.3622851834649325,
"grad_norm": 1.0846556425094604,
"learning_rate": 0.00027253019510684425,
"loss": 3.5981,
"step": 4400
},
{
"epoch": 1.3653816380244619,
"grad_norm": 1.0251847505569458,
"learning_rate": 0.0002731495819139053,
"loss": 3.5731,
"step": 4410
},
{
"epoch": 1.3684780925839912,
"grad_norm": 1.0184073448181152,
"learning_rate": 0.00027376896872096626,
"loss": 3.5665,
"step": 4420
},
{
"epoch": 1.3715745471435206,
"grad_norm": 0.9859119057655334,
"learning_rate": 0.00027438835552802724,
"loss": 3.5401,
"step": 4430
},
{
"epoch": 1.37467100170305,
"grad_norm": 0.9708986878395081,
"learning_rate": 0.0002750077423350883,
"loss": 3.5392,
"step": 4440
},
{
"epoch": 1.3777674562625792,
"grad_norm": 1.0786579847335815,
"learning_rate": 0.00027562712914214925,
"loss": 3.5553,
"step": 4450
},
{
"epoch": 1.3808639108221086,
"grad_norm": 1.011117696762085,
"learning_rate": 0.0002762465159492103,
"loss": 3.5251,
"step": 4460
},
{
"epoch": 1.383960365381638,
"grad_norm": 0.9319019317626953,
"learning_rate": 0.00027686590275627127,
"loss": 3.5408,
"step": 4470
},
{
"epoch": 1.3870568199411673,
"grad_norm": 1.0703030824661255,
"learning_rate": 0.0002774852895633323,
"loss": 3.5147,
"step": 4480
},
{
"epoch": 1.3901532745006966,
"grad_norm": 0.9363672733306885,
"learning_rate": 0.00027810467637039333,
"loss": 3.5054,
"step": 4490
},
{
"epoch": 1.393249729060226,
"grad_norm": 1.0434913635253906,
"learning_rate": 0.0002787240631774543,
"loss": 3.498,
"step": 4500
},
{
"epoch": 1.3963461836197553,
"grad_norm": 1.1381675004959106,
"learning_rate": 0.00027934344998451535,
"loss": 3.5045,
"step": 4510
},
{
"epoch": 1.3994426381792846,
"grad_norm": 0.9770002365112305,
"learning_rate": 0.0002799628367915764,
"loss": 3.5115,
"step": 4520
},
{
"epoch": 1.402539092738814,
"grad_norm": 0.9267017245292664,
"learning_rate": 0.0002805822235986373,
"loss": 3.4452,
"step": 4530
},
{
"epoch": 1.4056355472983433,
"grad_norm": 1.0910615921020508,
"learning_rate": 0.00028120161040569834,
"loss": 3.4792,
"step": 4540
},
{
"epoch": 1.4087320018578726,
"grad_norm": 1.0374314785003662,
"learning_rate": 0.00028182099721275937,
"loss": 3.4497,
"step": 4550
},
{
"epoch": 1.411828456417402,
"grad_norm": 1.1077336072921753,
"learning_rate": 0.00028244038401982035,
"loss": 3.4836,
"step": 4560
},
{
"epoch": 1.4149249109769313,
"grad_norm": 0.9700469374656677,
"learning_rate": 0.0002830597708268814,
"loss": 3.4539,
"step": 4570
},
{
"epoch": 1.4180213655364606,
"grad_norm": 1.0011495351791382,
"learning_rate": 0.0002836791576339424,
"loss": 3.4192,
"step": 4580
},
{
"epoch": 1.42111782009599,
"grad_norm": 1.0449153184890747,
"learning_rate": 0.0002842985444410034,
"loss": 3.4279,
"step": 4590
},
{
"epoch": 1.4242142746555193,
"grad_norm": 1.0163695812225342,
"learning_rate": 0.00028491793124806443,
"loss": 3.4375,
"step": 4600
},
{
"epoch": 1.4273107292150486,
"grad_norm": 0.9043591618537903,
"learning_rate": 0.00028553731805512546,
"loss": 3.41,
"step": 4610
},
{
"epoch": 1.430407183774578,
"grad_norm": 1.0529117584228516,
"learning_rate": 0.00028615670486218644,
"loss": 3.4181,
"step": 4620
},
{
"epoch": 1.4335036383341073,
"grad_norm": 0.9313072562217712,
"learning_rate": 0.0002867760916692475,
"loss": 3.381,
"step": 4630
},
{
"epoch": 1.4366000928936367,
"grad_norm": 1.0091314315795898,
"learning_rate": 0.00028739547847630846,
"loss": 3.4084,
"step": 4640
},
{
"epoch": 1.439696547453166,
"grad_norm": 1.023206114768982,
"learning_rate": 0.00028801486528336943,
"loss": 3.3933,
"step": 4650
},
{
"epoch": 1.4427930020126953,
"grad_norm": 0.9428771734237671,
"learning_rate": 0.00028863425209043047,
"loss": 3.3793,
"step": 4660
},
{
"epoch": 1.4458894565722247,
"grad_norm": 0.9487484097480774,
"learning_rate": 0.0002892536388974915,
"loss": 3.3703,
"step": 4670
},
{
"epoch": 1.448985911131754,
"grad_norm": 1.0242682695388794,
"learning_rate": 0.0002898730257045525,
"loss": 3.3808,
"step": 4680
},
{
"epoch": 1.4520823656912833,
"grad_norm": 0.963318407535553,
"learning_rate": 0.0002904924125116135,
"loss": 3.3756,
"step": 4690
},
{
"epoch": 1.4551788202508127,
"grad_norm": 0.9051762223243713,
"learning_rate": 0.00029111179931867455,
"loss": 3.3356,
"step": 4700
},
{
"epoch": 1.458275274810342,
"grad_norm": 0.9930270910263062,
"learning_rate": 0.0002917311861257355,
"loss": 3.3601,
"step": 4710
},
{
"epoch": 1.4613717293698714,
"grad_norm": 1.077131748199463,
"learning_rate": 0.00029235057293279656,
"loss": 3.3308,
"step": 4720
},
{
"epoch": 1.4644681839294007,
"grad_norm": 0.881527304649353,
"learning_rate": 0.0002929699597398576,
"loss": 3.328,
"step": 4730
},
{
"epoch": 1.46756463848893,
"grad_norm": 1.0115300416946411,
"learning_rate": 0.0002935893465469185,
"loss": 3.3233,
"step": 4740
},
{
"epoch": 1.4706610930484594,
"grad_norm": 1.0688494443893433,
"learning_rate": 0.00029420873335397955,
"loss": 3.3381,
"step": 4750
},
{
"epoch": 1.4737575476079887,
"grad_norm": 1.0195506811141968,
"learning_rate": 0.0002948281201610406,
"loss": 3.3058,
"step": 4760
},
{
"epoch": 1.476854002167518,
"grad_norm": 0.9502407312393188,
"learning_rate": 0.00029544750696810156,
"loss": 3.3174,
"step": 4770
},
{
"epoch": 1.4799504567270474,
"grad_norm": 1.0097241401672363,
"learning_rate": 0.0002960668937751626,
"loss": 3.3102,
"step": 4780
},
{
"epoch": 1.483046911286577,
"grad_norm": 0.9834030866622925,
"learning_rate": 0.0002966862805822236,
"loss": 3.3135,
"step": 4790
},
{
"epoch": 1.4861433658461063,
"grad_norm": 1.014854907989502,
"learning_rate": 0.0002973056673892846,
"loss": 3.2915,
"step": 4800
},
{
"epoch": 1.4892398204056356,
"grad_norm": 0.944720983505249,
"learning_rate": 0.00029792505419634564,
"loss": 3.2783,
"step": 4810
},
{
"epoch": 1.492336274965165,
"grad_norm": 1.012688159942627,
"learning_rate": 0.0002985444410034066,
"loss": 3.2931,
"step": 4820
},
{
"epoch": 1.4954327295246943,
"grad_norm": 0.9100663065910339,
"learning_rate": 0.00029916382781046766,
"loss": 3.2785,
"step": 4830
},
{
"epoch": 1.4985291840842236,
"grad_norm": 0.8774744272232056,
"learning_rate": 0.0002997832146175287,
"loss": 3.2777,
"step": 4840
},
{
"epoch": 1.5016256386437528,
"grad_norm": 0.9623695611953735,
"learning_rate": 0.0003004026014245896,
"loss": 3.2671,
"step": 4850
},
{
"epoch": 1.504722093203282,
"grad_norm": 1.0606322288513184,
"learning_rate": 0.00030102198823165065,
"loss": 3.2483,
"step": 4860
},
{
"epoch": 1.5078185477628114,
"grad_norm": 1.0098302364349365,
"learning_rate": 0.0003016413750387117,
"loss": 3.2355,
"step": 4870
},
{
"epoch": 1.5109150023223408,
"grad_norm": 0.8991314172744751,
"learning_rate": 0.00030226076184577266,
"loss": 3.239,
"step": 4880
},
{
"epoch": 1.51401145688187,
"grad_norm": 0.9911772012710571,
"learning_rate": 0.0003028801486528337,
"loss": 3.2569,
"step": 4890
},
{
"epoch": 1.5171079114413994,
"grad_norm": 0.9949657320976257,
"learning_rate": 0.00030349953545989473,
"loss": 3.2441,
"step": 4900
},
{
"epoch": 1.5202043660009288,
"grad_norm": 0.9273360371589661,
"learning_rate": 0.0003041189222669557,
"loss": 3.2385,
"step": 4910
},
{
"epoch": 1.5233008205604581,
"grad_norm": 0.94888836145401,
"learning_rate": 0.00030473830907401674,
"loss": 3.2728,
"step": 4920
},
{
"epoch": 1.5263972751199875,
"grad_norm": 0.9299125075340271,
"learning_rate": 0.0003053576958810778,
"loss": 3.2272,
"step": 4930
},
{
"epoch": 1.5294937296795168,
"grad_norm": 0.8870009183883667,
"learning_rate": 0.00030597708268813875,
"loss": 3.2218,
"step": 4940
},
{
"epoch": 1.5325901842390461,
"grad_norm": 1.0036243200302124,
"learning_rate": 0.00030659646949519973,
"loss": 3.2008,
"step": 4950
},
{
"epoch": 1.5356866387985755,
"grad_norm": 0.9473212957382202,
"learning_rate": 0.00030721585630226077,
"loss": 3.2295,
"step": 4960
},
{
"epoch": 1.5387830933581048,
"grad_norm": 0.8856829404830933,
"learning_rate": 0.00030783524310932175,
"loss": 3.2126,
"step": 4970
},
{
"epoch": 1.5418795479176342,
"grad_norm": 0.997509777545929,
"learning_rate": 0.0003084546299163828,
"loss": 3.212,
"step": 4980
},
{
"epoch": 1.5449760024771635,
"grad_norm": 0.9016265273094177,
"learning_rate": 0.0003090740167234438,
"loss": 3.208,
"step": 4990
},
{
"epoch": 1.5480724570366928,
"grad_norm": 0.8731397390365601,
"learning_rate": 0.0003096934035305048,
"loss": 3.2158,
"step": 5000
},
{
"epoch": 1.5511689115962222,
"grad_norm": 0.9676650166511536,
"learning_rate": 0.0003103127903375658,
"loss": 3.2032,
"step": 5010
},
{
"epoch": 1.5542653661557515,
"grad_norm": 0.9783886075019836,
"learning_rate": 0.00031093217714462686,
"loss": 3.2114,
"step": 5020
},
{
"epoch": 1.5573618207152808,
"grad_norm": 1.0224086046218872,
"learning_rate": 0.00031155156395168784,
"loss": 3.1828,
"step": 5030
},
{
"epoch": 1.5604582752748102,
"grad_norm": 0.9322043061256409,
"learning_rate": 0.00031217095075874887,
"loss": 3.1851,
"step": 5040
},
{
"epoch": 1.5635547298343395,
"grad_norm": 0.9294213056564331,
"learning_rate": 0.0003127903375658099,
"loss": 3.189,
"step": 5050
},
{
"epoch": 1.5666511843938689,
"grad_norm": 0.9628444910049438,
"learning_rate": 0.00031340972437287083,
"loss": 3.1524,
"step": 5060
},
{
"epoch": 1.5697476389533982,
"grad_norm": 0.9377193450927734,
"learning_rate": 0.00031402911117993186,
"loss": 3.1688,
"step": 5070
},
{
"epoch": 1.5728440935129275,
"grad_norm": 0.8622744083404541,
"learning_rate": 0.0003146484979869929,
"loss": 3.1374,
"step": 5080
},
{
"epoch": 1.5759405480724569,
"grad_norm": 0.9315075874328613,
"learning_rate": 0.0003152678847940539,
"loss": 3.1657,
"step": 5090
},
{
"epoch": 1.5790370026319864,
"grad_norm": 0.9984999895095825,
"learning_rate": 0.0003158872716011149,
"loss": 3.1494,
"step": 5100
},
{
"epoch": 1.5821334571915158,
"grad_norm": 0.9476169943809509,
"learning_rate": 0.0003165066584081759,
"loss": 3.1262,
"step": 5110
},
{
"epoch": 1.585229911751045,
"grad_norm": 0.8942754864692688,
"learning_rate": 0.0003171260452152369,
"loss": 3.1546,
"step": 5120
},
{
"epoch": 1.5883263663105744,
"grad_norm": 0.9009295701980591,
"learning_rate": 0.00031774543202229796,
"loss": 3.1516,
"step": 5130
},
{
"epoch": 1.5914228208701038,
"grad_norm": 1.010343074798584,
"learning_rate": 0.00031836481882935893,
"loss": 3.1448,
"step": 5140
},
{
"epoch": 1.5945192754296331,
"grad_norm": 0.9292970299720764,
"learning_rate": 0.00031898420563641997,
"loss": 3.123,
"step": 5150
},
{
"epoch": 1.5976157299891625,
"grad_norm": 0.9574374556541443,
"learning_rate": 0.00031960359244348095,
"loss": 3.1358,
"step": 5160
},
{
"epoch": 1.6007121845486918,
"grad_norm": 0.9073388576507568,
"learning_rate": 0.0003202229792505419,
"loss": 3.1352,
"step": 5170
},
{
"epoch": 1.6038086391082211,
"grad_norm": 0.9928716421127319,
"learning_rate": 0.00032084236605760296,
"loss": 3.1226,
"step": 5180
},
{
"epoch": 1.6069050936677505,
"grad_norm": 0.9886534810066223,
"learning_rate": 0.000321461752864664,
"loss": 3.131,
"step": 5190
},
{
"epoch": 1.6100015482272798,
"grad_norm": 0.9734316468238831,
"learning_rate": 0.00032208113967172497,
"loss": 3.1341,
"step": 5200
},
{
"epoch": 1.6130980027868091,
"grad_norm": 0.9681540131568909,
"learning_rate": 0.000322700526478786,
"loss": 3.0973,
"step": 5210
},
{
"epoch": 1.6161944573463385,
"grad_norm": 0.9452388286590576,
"learning_rate": 0.00032331991328584704,
"loss": 3.1082,
"step": 5220
},
{
"epoch": 1.6192909119058678,
"grad_norm": 0.9055010080337524,
"learning_rate": 0.000323939300092908,
"loss": 3.0891,
"step": 5230
},
{
"epoch": 1.6223873664653972,
"grad_norm": 0.9603378772735596,
"learning_rate": 0.00032455868689996905,
"loss": 3.0998,
"step": 5240
},
{
"epoch": 1.6254838210249265,
"grad_norm": 0.8925791382789612,
"learning_rate": 0.0003251780737070301,
"loss": 3.1165,
"step": 5250
},
{
"epoch": 1.6285802755844558,
"grad_norm": 0.928421139717102,
"learning_rate": 0.000325797460514091,
"loss": 3.1087,
"step": 5260
},
{
"epoch": 1.6316767301439852,
"grad_norm": 0.9481196403503418,
"learning_rate": 0.00032641684732115204,
"loss": 3.0916,
"step": 5270
},
{
"epoch": 1.6347731847035145,
"grad_norm": 0.9044370055198669,
"learning_rate": 0.0003270362341282131,
"loss": 3.1,
"step": 5280
},
{
"epoch": 1.6378696392630439,
"grad_norm": 0.9636628031730652,
"learning_rate": 0.00032765562093527406,
"loss": 3.114,
"step": 5290
},
{
"epoch": 1.6409660938225732,
"grad_norm": 0.9585344195365906,
"learning_rate": 0.0003282750077423351,
"loss": 3.086,
"step": 5300
},
{
"epoch": 1.6440625483821025,
"grad_norm": 0.9368054866790771,
"learning_rate": 0.0003288943945493961,
"loss": 3.0763,
"step": 5310
},
{
"epoch": 1.6471590029416319,
"grad_norm": 0.951101541519165,
"learning_rate": 0.0003295137813564571,
"loss": 3.0746,
"step": 5320
},
{
"epoch": 1.6502554575011612,
"grad_norm": 0.9043335318565369,
"learning_rate": 0.00033013316816351814,
"loss": 3.0665,
"step": 5330
},
{
"epoch": 1.6533519120606905,
"grad_norm": 0.8929763436317444,
"learning_rate": 0.00033075255497057917,
"loss": 3.0644,
"step": 5340
},
{
"epoch": 1.6564483666202199,
"grad_norm": 0.9089614152908325,
"learning_rate": 0.00033137194177764015,
"loss": 3.0661,
"step": 5350
},
{
"epoch": 1.6595448211797492,
"grad_norm": 0.9606667757034302,
"learning_rate": 0.0003319913285847012,
"loss": 3.0578,
"step": 5360
},
{
"epoch": 1.6626412757392786,
"grad_norm": 0.8867613673210144,
"learning_rate": 0.00033261071539176216,
"loss": 3.0707,
"step": 5370
},
{
"epoch": 1.665737730298808,
"grad_norm": 0.9263885617256165,
"learning_rate": 0.00033323010219882314,
"loss": 3.0579,
"step": 5380
},
{
"epoch": 1.6688341848583372,
"grad_norm": 0.8380886316299438,
"learning_rate": 0.0003338494890058842,
"loss": 3.0628,
"step": 5390
},
{
"epoch": 1.6719306394178666,
"grad_norm": 0.9296733140945435,
"learning_rate": 0.0003344688758129452,
"loss": 3.0374,
"step": 5400
},
{
"epoch": 1.675027093977396,
"grad_norm": 0.9482071995735168,
"learning_rate": 0.0003350882626200062,
"loss": 3.0611,
"step": 5410
},
{
"epoch": 1.6781235485369252,
"grad_norm": 0.934635579586029,
"learning_rate": 0.0003357076494270672,
"loss": 3.0465,
"step": 5420
},
{
"epoch": 1.6812200030964546,
"grad_norm": 0.9624560475349426,
"learning_rate": 0.00033632703623412825,
"loss": 3.0622,
"step": 5430
},
{
"epoch": 1.684316457655984,
"grad_norm": 0.952055037021637,
"learning_rate": 0.00033694642304118923,
"loss": 3.0483,
"step": 5440
},
{
"epoch": 1.6874129122155133,
"grad_norm": 0.8703885674476624,
"learning_rate": 0.00033756580984825027,
"loss": 3.0506,
"step": 5450
},
{
"epoch": 1.6905093667750426,
"grad_norm": 0.9054002165794373,
"learning_rate": 0.00033818519665531125,
"loss": 3.045,
"step": 5460
},
{
"epoch": 1.693605821334572,
"grad_norm": 0.9501616954803467,
"learning_rate": 0.0003388045834623722,
"loss": 3.0327,
"step": 5470
},
{
"epoch": 1.6967022758941013,
"grad_norm": 0.880946934223175,
"learning_rate": 0.00033942397026943326,
"loss": 3.0414,
"step": 5480
},
{
"epoch": 1.6997987304536306,
"grad_norm": 0.9799813032150269,
"learning_rate": 0.00034004335707649424,
"loss": 3.0485,
"step": 5490
},
{
"epoch": 1.70289518501316,
"grad_norm": 0.9278644323348999,
"learning_rate": 0.00034066274388355527,
"loss": 3.0334,
"step": 5500
},
{
"epoch": 1.7059916395726893,
"grad_norm": 0.8921311497688293,
"learning_rate": 0.0003412821306906163,
"loss": 3.0283,
"step": 5510
},
{
"epoch": 1.7090880941322186,
"grad_norm": 0.8926926851272583,
"learning_rate": 0.0003419015174976773,
"loss": 3.0294,
"step": 5520
},
{
"epoch": 1.712184548691748,
"grad_norm": 0.9130481481552124,
"learning_rate": 0.0003425209043047383,
"loss": 3.007,
"step": 5530
},
{
"epoch": 1.7152810032512773,
"grad_norm": 0.9094374775886536,
"learning_rate": 0.00034314029111179935,
"loss": 3.0183,
"step": 5540
},
{
"epoch": 1.7183774578108066,
"grad_norm": 0.8862912058830261,
"learning_rate": 0.00034375967791886033,
"loss": 2.9898,
"step": 5550
},
{
"epoch": 1.721473912370336,
"grad_norm": 0.9140844941139221,
"learning_rate": 0.00034437906472592136,
"loss": 3.0172,
"step": 5560
},
{
"epoch": 1.7245703669298653,
"grad_norm": 0.976078450679779,
"learning_rate": 0.0003449984515329824,
"loss": 3.0161,
"step": 5570
},
{
"epoch": 1.7276668214893947,
"grad_norm": 0.9176059365272522,
"learning_rate": 0.0003456178383400433,
"loss": 2.9931,
"step": 5580
},
{
"epoch": 1.730763276048924,
"grad_norm": 0.9895356297492981,
"learning_rate": 0.00034623722514710436,
"loss": 3.0026,
"step": 5590
},
{
"epoch": 1.7338597306084533,
"grad_norm": 0.9021176099777222,
"learning_rate": 0.0003468566119541654,
"loss": 2.9841,
"step": 5600
},
{
"epoch": 1.7369561851679827,
"grad_norm": 1.0290924310684204,
"learning_rate": 0.00034747599876122637,
"loss": 3.0205,
"step": 5610
},
{
"epoch": 1.740052639727512,
"grad_norm": 0.9842997193336487,
"learning_rate": 0.0003480953855682874,
"loss": 2.9983,
"step": 5620
},
{
"epoch": 1.7431490942870413,
"grad_norm": 1.004170536994934,
"learning_rate": 0.00034871477237534843,
"loss": 2.9929,
"step": 5630
},
{
"epoch": 1.7462455488465707,
"grad_norm": 0.8903537392616272,
"learning_rate": 0.0003493341591824094,
"loss": 2.9928,
"step": 5640
},
{
"epoch": 1.7493420034061,
"grad_norm": 0.9463049173355103,
"learning_rate": 0.00034995354598947045,
"loss": 2.9975,
"step": 5650
},
{
"epoch": 1.7524384579656294,
"grad_norm": 0.879135251045227,
"learning_rate": 0.0003505729327965315,
"loss": 2.9767,
"step": 5660
},
{
"epoch": 1.7555349125251587,
"grad_norm": 0.9398852586746216,
"learning_rate": 0.00035119231960359246,
"loss": 2.9813,
"step": 5670
},
{
"epoch": 1.758631367084688,
"grad_norm": 0.9972649216651917,
"learning_rate": 0.00035181170641065344,
"loss": 2.964,
"step": 5680
},
{
"epoch": 1.7617278216442174,
"grad_norm": 0.9139822721481323,
"learning_rate": 0.00035243109321771447,
"loss": 2.9906,
"step": 5690
},
{
"epoch": 1.7648242762037467,
"grad_norm": 0.8910505771636963,
"learning_rate": 0.00035305048002477545,
"loss": 2.9749,
"step": 5700
},
{
"epoch": 1.767920730763276,
"grad_norm": 1.1436492204666138,
"learning_rate": 0.0003536698668318365,
"loss": 2.9727,
"step": 5710
},
{
"epoch": 1.7710171853228054,
"grad_norm": 0.9300575852394104,
"learning_rate": 0.0003542892536388975,
"loss": 3.0028,
"step": 5720
},
{
"epoch": 1.7741136398823347,
"grad_norm": 0.8461237549781799,
"learning_rate": 0.0003549086404459585,
"loss": 2.9749,
"step": 5730
},
{
"epoch": 1.777210094441864,
"grad_norm": 0.882404088973999,
"learning_rate": 0.00035552802725301953,
"loss": 2.9568,
"step": 5740
},
{
"epoch": 1.7803065490013934,
"grad_norm": 0.8937315344810486,
"learning_rate": 0.00035614741406008056,
"loss": 2.9807,
"step": 5750
},
{
"epoch": 1.7834030035609227,
"grad_norm": 0.8935524225234985,
"learning_rate": 0.00035676680086714154,
"loss": 2.982,
"step": 5760
},
{
"epoch": 1.786499458120452,
"grad_norm": 0.9033128023147583,
"learning_rate": 0.0003573861876742026,
"loss": 2.9634,
"step": 5770
},
{
"epoch": 1.7895959126799814,
"grad_norm": 0.9767388701438904,
"learning_rate": 0.0003580055744812635,
"loss": 2.9613,
"step": 5780
},
{
"epoch": 1.7926923672395108,
"grad_norm": 1.0344420671463013,
"learning_rate": 0.00035862496128832454,
"loss": 2.9319,
"step": 5790
},
{
"epoch": 1.79578882179904,
"grad_norm": 0.87823486328125,
"learning_rate": 0.00035924434809538557,
"loss": 2.96,
"step": 5800
},
{
"epoch": 1.7988852763585694,
"grad_norm": 0.9067280888557434,
"learning_rate": 0.00035986373490244655,
"loss": 2.9322,
"step": 5810
},
{
"epoch": 1.8019817309180988,
"grad_norm": 0.8616409301757812,
"learning_rate": 0.0003604831217095076,
"loss": 2.9611,
"step": 5820
},
{
"epoch": 1.805078185477628,
"grad_norm": 0.8421568274497986,
"learning_rate": 0.0003611025085165686,
"loss": 2.9366,
"step": 5830
},
{
"epoch": 1.8081746400371574,
"grad_norm": 0.8576173782348633,
"learning_rate": 0.0003617218953236296,
"loss": 2.9423,
"step": 5840
},
{
"epoch": 1.8112710945966868,
"grad_norm": 0.8986689448356628,
"learning_rate": 0.00036234128213069063,
"loss": 2.9376,
"step": 5850
},
{
"epoch": 1.8143675491562161,
"grad_norm": 0.9134368300437927,
"learning_rate": 0.00036296066893775166,
"loss": 2.9262,
"step": 5860
},
{
"epoch": 1.8174640037157455,
"grad_norm": 0.9681121110916138,
"learning_rate": 0.00036358005574481264,
"loss": 2.9341,
"step": 5870
},
{
"epoch": 1.8205604582752748,
"grad_norm": 1.0286924839019775,
"learning_rate": 0.0003641994425518737,
"loss": 2.9306,
"step": 5880
},
{
"epoch": 1.8236569128348041,
"grad_norm": 0.9352772831916809,
"learning_rate": 0.00036481882935893465,
"loss": 2.948,
"step": 5890
},
{
"epoch": 1.8267533673943335,
"grad_norm": 1.0539007186889648,
"learning_rate": 0.00036543821616599563,
"loss": 2.9523,
"step": 5900
},
{
"epoch": 1.8298498219538628,
"grad_norm": 0.8661713600158691,
"learning_rate": 0.00036605760297305667,
"loss": 2.9269,
"step": 5910
},
{
"epoch": 1.8329462765133921,
"grad_norm": 0.9120956659317017,
"learning_rate": 0.0003666769897801177,
"loss": 2.9302,
"step": 5920
},
{
"epoch": 1.8360427310729215,
"grad_norm": 0.9333845376968384,
"learning_rate": 0.0003672963765871787,
"loss": 2.9247,
"step": 5930
},
{
"epoch": 1.8391391856324508,
"grad_norm": 0.864277720451355,
"learning_rate": 0.0003679157633942397,
"loss": 2.9269,
"step": 5940
},
{
"epoch": 1.8422356401919802,
"grad_norm": 0.954741358757019,
"learning_rate": 0.00036853515020130075,
"loss": 2.9348,
"step": 5950
},
{
"epoch": 1.8453320947515095,
"grad_norm": 0.8879597187042236,
"learning_rate": 0.0003691545370083617,
"loss": 2.9259,
"step": 5960
},
{
"epoch": 1.8484285493110388,
"grad_norm": 0.8487861752510071,
"learning_rate": 0.00036977392381542276,
"loss": 2.9189,
"step": 5970
},
{
"epoch": 1.8515250038705682,
"grad_norm": 0.9464482069015503,
"learning_rate": 0.0003703933106224838,
"loss": 2.9119,
"step": 5980
},
{
"epoch": 1.8546214584300975,
"grad_norm": 0.8773711919784546,
"learning_rate": 0.0003710126974295447,
"loss": 2.9222,
"step": 5990
},
{
"epoch": 1.8577179129896269,
"grad_norm": 0.8919110894203186,
"learning_rate": 0.00037163208423660575,
"loss": 2.9056,
"step": 6000
},
{
"epoch": 1.8608143675491562,
"grad_norm": 0.9436878561973572,
"learning_rate": 0.0003722514710436668,
"loss": 2.9095,
"step": 6010
},
{
"epoch": 1.8639108221086855,
"grad_norm": 0.9595790505409241,
"learning_rate": 0.00037287085785072776,
"loss": 2.9047,
"step": 6020
},
{
"epoch": 1.8670072766682149,
"grad_norm": 0.8692799806594849,
"learning_rate": 0.0003734902446577888,
"loss": 2.905,
"step": 6030
},
{
"epoch": 1.8701037312277442,
"grad_norm": 0.9274528622627258,
"learning_rate": 0.00037410963146484983,
"loss": 2.9251,
"step": 6040
},
{
"epoch": 1.8732001857872735,
"grad_norm": 0.8798776268959045,
"learning_rate": 0.0003747290182719108,
"loss": 2.9113,
"step": 6050
},
{
"epoch": 1.8762966403468029,
"grad_norm": 0.8613748550415039,
"learning_rate": 0.00037534840507897184,
"loss": 2.9077,
"step": 6060
},
{
"epoch": 1.8793930949063322,
"grad_norm": 0.8926125764846802,
"learning_rate": 0.0003759677918860329,
"loss": 2.9029,
"step": 6070
},
{
"epoch": 1.8824895494658616,
"grad_norm": 0.9414944052696228,
"learning_rate": 0.00037658717869309386,
"loss": 2.8968,
"step": 6080
},
{
"epoch": 1.885586004025391,
"grad_norm": 0.8922074437141418,
"learning_rate": 0.0003772065655001549,
"loss": 2.8992,
"step": 6090
},
{
"epoch": 1.8886824585849202,
"grad_norm": 0.9254492521286011,
"learning_rate": 0.0003778259523072158,
"loss": 2.912,
"step": 6100
},
{
"epoch": 1.8917789131444496,
"grad_norm": 0.8882949948310852,
"learning_rate": 0.00037844533911427685,
"loss": 2.8972,
"step": 6110
},
{
"epoch": 1.894875367703979,
"grad_norm": 0.874482274055481,
"learning_rate": 0.0003790647259213379,
"loss": 2.8848,
"step": 6120
},
{
"epoch": 1.8979718222635082,
"grad_norm": 0.8989077210426331,
"learning_rate": 0.00037968411272839886,
"loss": 2.8934,
"step": 6130
},
{
"epoch": 1.9010682768230376,
"grad_norm": 0.9361928105354309,
"learning_rate": 0.0003803034995354599,
"loss": 2.8697,
"step": 6140
},
{
"epoch": 1.904164731382567,
"grad_norm": 0.8788303732872009,
"learning_rate": 0.0003809228863425209,
"loss": 2.8989,
"step": 6150
},
{
"epoch": 1.9072611859420963,
"grad_norm": 0.8196372985839844,
"learning_rate": 0.0003815422731495819,
"loss": 2.8913,
"step": 6160
},
{
"epoch": 1.9103576405016256,
"grad_norm": 0.8973246216773987,
"learning_rate": 0.00038216165995664294,
"loss": 2.8941,
"step": 6170
},
{
"epoch": 1.913454095061155,
"grad_norm": 0.951608419418335,
"learning_rate": 0.00038278104676370397,
"loss": 2.8941,
"step": 6180
},
{
"epoch": 1.9165505496206843,
"grad_norm": 0.87721186876297,
"learning_rate": 0.00038340043357076495,
"loss": 2.9039,
"step": 6190
},
{
"epoch": 1.9196470041802136,
"grad_norm": 0.8995383381843567,
"learning_rate": 0.00038401982037782593,
"loss": 2.8978,
"step": 6200
},
{
"epoch": 1.922743458739743,
"grad_norm": 0.9441946148872375,
"learning_rate": 0.00038463920718488696,
"loss": 2.8774,
"step": 6210
},
{
"epoch": 1.9258399132992723,
"grad_norm": 0.8960248231887817,
"learning_rate": 0.00038525859399194794,
"loss": 2.8908,
"step": 6220
},
{
"epoch": 1.9289363678588016,
"grad_norm": 0.9116747975349426,
"learning_rate": 0.000385877980799009,
"loss": 2.8639,
"step": 6230
},
{
"epoch": 1.932032822418331,
"grad_norm": 0.8798891305923462,
"learning_rate": 0.00038649736760607,
"loss": 2.86,
"step": 6240
},
{
"epoch": 1.9351292769778603,
"grad_norm": 0.8671932816505432,
"learning_rate": 0.000387116754413131,
"loss": 2.871,
"step": 6250
},
{
"epoch": 1.9382257315373896,
"grad_norm": 0.9382427930831909,
"learning_rate": 0.000387736141220192,
"loss": 2.8508,
"step": 6260
},
{
"epoch": 1.941322186096919,
"grad_norm": 0.9341138005256653,
"learning_rate": 0.00038835552802725306,
"loss": 2.8717,
"step": 6270
},
{
"epoch": 1.9444186406564483,
"grad_norm": 0.9240859150886536,
"learning_rate": 0.00038897491483431404,
"loss": 2.8802,
"step": 6280
},
{
"epoch": 1.9475150952159777,
"grad_norm": 0.9910873174667358,
"learning_rate": 0.00038959430164137507,
"loss": 2.8709,
"step": 6290
},
{
"epoch": 1.950611549775507,
"grad_norm": 0.9003307223320007,
"learning_rate": 0.0003902136884484361,
"loss": 2.8732,
"step": 6300
},
{
"epoch": 1.9537080043350363,
"grad_norm": 0.904257595539093,
"learning_rate": 0.00039083307525549703,
"loss": 2.8876,
"step": 6310
},
{
"epoch": 1.9568044588945657,
"grad_norm": 0.978615403175354,
"learning_rate": 0.00039145246206255806,
"loss": 2.8684,
"step": 6320
},
{
"epoch": 1.959900913454095,
"grad_norm": 0.8782775402069092,
"learning_rate": 0.0003920718488696191,
"loss": 2.8677,
"step": 6330
},
{
"epoch": 1.9629973680136246,
"grad_norm": 0.9640995860099792,
"learning_rate": 0.0003926912356766801,
"loss": 2.8568,
"step": 6340
},
{
"epoch": 1.966093822573154,
"grad_norm": 0.8807209134101868,
"learning_rate": 0.0003933106224837411,
"loss": 2.8618,
"step": 6350
},
{
"epoch": 1.9691902771326832,
"grad_norm": 0.8921664357185364,
"learning_rate": 0.00039393000929080214,
"loss": 2.8788,
"step": 6360
},
{
"epoch": 1.9722867316922126,
"grad_norm": 0.9727539420127869,
"learning_rate": 0.0003945493960978631,
"loss": 2.8512,
"step": 6370
},
{
"epoch": 1.975383186251742,
"grad_norm": 0.8913626670837402,
"learning_rate": 0.00039516878290492415,
"loss": 2.8604,
"step": 6380
},
{
"epoch": 1.9784796408112713,
"grad_norm": 0.8825446963310242,
"learning_rate": 0.0003957881697119852,
"loss": 2.8448,
"step": 6390
},
{
"epoch": 1.9815760953708006,
"grad_norm": 0.916666567325592,
"learning_rate": 0.00039640755651904617,
"loss": 2.8625,
"step": 6400
},
{
"epoch": 1.98467254993033,
"grad_norm": 1.0008190870285034,
"learning_rate": 0.00039702694332610715,
"loss": 2.8631,
"step": 6410
},
{
"epoch": 1.9877690044898593,
"grad_norm": 0.8584704399108887,
"learning_rate": 0.0003976463301331682,
"loss": 2.8701,
"step": 6420
},
{
"epoch": 1.9908654590493886,
"grad_norm": 0.9079132676124573,
"learning_rate": 0.00039826571694022916,
"loss": 2.8453,
"step": 6430
},
{
"epoch": 1.993961913608918,
"grad_norm": 0.8909833431243896,
"learning_rate": 0.0003988851037472902,
"loss": 2.8315,
"step": 6440
},
{
"epoch": 1.9970583681684473,
"grad_norm": 0.9206358194351196,
"learning_rate": 0.00039950449055435117,
"loss": 2.8694,
"step": 6450
},
{
"epoch": 2.0,
"grad_norm": 0.6666725277900696,
"learning_rate": 0.0004001238773614122,
"loss": 2.7051,
"step": 6460
},
{
"epoch": 2.0030964545595293,
"grad_norm": 0.8826514482498169,
"learning_rate": 0.00040074326416847324,
"loss": 2.8328,
"step": 6470
},
{
"epoch": 2.0061929091190587,
"grad_norm": 0.922680139541626,
"learning_rate": 0.0004013626509755342,
"loss": 2.852,
"step": 6480
},
{
"epoch": 2.009289363678588,
"grad_norm": 0.9056729674339294,
"learning_rate": 0.00040198203778259525,
"loss": 2.8423,
"step": 6490
},
{
"epoch": 2.0123858182381174,
"grad_norm": 0.866322934627533,
"learning_rate": 0.0004026014245896563,
"loss": 2.8412,
"step": 6500
},
{
"epoch": 2.0154822727976467,
"grad_norm": 0.9588058590888977,
"learning_rate": 0.0004032208113967172,
"loss": 2.8526,
"step": 6510
},
{
"epoch": 2.018578727357176,
"grad_norm": 0.9247243404388428,
"learning_rate": 0.00040384019820377824,
"loss": 2.8271,
"step": 6520
},
{
"epoch": 2.0216751819167054,
"grad_norm": 0.8787789940834045,
"learning_rate": 0.0004044595850108393,
"loss": 2.8043,
"step": 6530
},
{
"epoch": 2.0247716364762347,
"grad_norm": 0.8963256478309631,
"learning_rate": 0.00040507897181790025,
"loss": 2.8162,
"step": 6540
},
{
"epoch": 2.027868091035764,
"grad_norm": 0.9025070071220398,
"learning_rate": 0.0004056983586249613,
"loss": 2.8226,
"step": 6550
},
{
"epoch": 2.0309645455952934,
"grad_norm": 0.8822202086448669,
"learning_rate": 0.0004063177454320223,
"loss": 2.8284,
"step": 6560
},
{
"epoch": 2.0340610001548227,
"grad_norm": 0.9176104068756104,
"learning_rate": 0.0004069371322390833,
"loss": 2.8379,
"step": 6570
},
{
"epoch": 2.037157454714352,
"grad_norm": 0.9508628845214844,
"learning_rate": 0.00040755651904614433,
"loss": 2.8113,
"step": 6580
},
{
"epoch": 2.0402539092738814,
"grad_norm": 0.9238744378089905,
"learning_rate": 0.00040817590585320537,
"loss": 2.8221,
"step": 6590
},
{
"epoch": 2.0433503638334107,
"grad_norm": 0.8854493498802185,
"learning_rate": 0.00040879529266026635,
"loss": 2.8139,
"step": 6600
},
{
"epoch": 2.04644681839294,
"grad_norm": 0.8652548789978027,
"learning_rate": 0.0004094146794673274,
"loss": 2.8153,
"step": 6610
},
{
"epoch": 2.0495432729524694,
"grad_norm": 0.8663405179977417,
"learning_rate": 0.00041003406627438836,
"loss": 2.8098,
"step": 6620
},
{
"epoch": 2.0526397275119987,
"grad_norm": 0.8482099175453186,
"learning_rate": 0.00041065345308144934,
"loss": 2.8102,
"step": 6630
},
{
"epoch": 2.055736182071528,
"grad_norm": 0.895483672618866,
"learning_rate": 0.00041127283988851037,
"loss": 2.8014,
"step": 6640
},
{
"epoch": 2.0588326366310574,
"grad_norm": 0.8933889865875244,
"learning_rate": 0.0004118922266955714,
"loss": 2.8008,
"step": 6650
},
{
"epoch": 2.0619290911905868,
"grad_norm": 0.87566739320755,
"learning_rate": 0.0004125116135026324,
"loss": 2.8055,
"step": 6660
},
{
"epoch": 2.065025545750116,
"grad_norm": 0.9240240454673767,
"learning_rate": 0.0004131310003096934,
"loss": 2.8249,
"step": 6670
},
{
"epoch": 2.0681220003096454,
"grad_norm": 0.9362452626228333,
"learning_rate": 0.00041375038711675445,
"loss": 2.8128,
"step": 6680
},
{
"epoch": 2.0712184548691748,
"grad_norm": 0.859845757484436,
"learning_rate": 0.00041436977392381543,
"loss": 2.7887,
"step": 6690
},
{
"epoch": 2.074314909428704,
"grad_norm": 0.9458219408988953,
"learning_rate": 0.00041498916073087646,
"loss": 2.8087,
"step": 6700
},
{
"epoch": 2.0774113639882335,
"grad_norm": 0.9015805125236511,
"learning_rate": 0.0004156085475379375,
"loss": 2.8197,
"step": 6710
},
{
"epoch": 2.080507818547763,
"grad_norm": 0.8841304779052734,
"learning_rate": 0.0004162279343449984,
"loss": 2.793,
"step": 6720
},
{
"epoch": 2.083604273107292,
"grad_norm": 0.9217279553413391,
"learning_rate": 0.00041684732115205946,
"loss": 2.8279,
"step": 6730
},
{
"epoch": 2.0867007276668215,
"grad_norm": 0.9141611456871033,
"learning_rate": 0.0004174667079591205,
"loss": 2.7922,
"step": 6740
},
{
"epoch": 2.089797182226351,
"grad_norm": 0.8566716313362122,
"learning_rate": 0.00041808609476618147,
"loss": 2.8088,
"step": 6750
},
{
"epoch": 2.09289363678588,
"grad_norm": 0.9103225469589233,
"learning_rate": 0.0004187054815732425,
"loss": 2.8134,
"step": 6760
},
{
"epoch": 2.0959900913454095,
"grad_norm": 0.8901599049568176,
"learning_rate": 0.0004193248683803035,
"loss": 2.8114,
"step": 6770
},
{
"epoch": 2.099086545904939,
"grad_norm": 0.9474543333053589,
"learning_rate": 0.0004199442551873645,
"loss": 2.7907,
"step": 6780
},
{
"epoch": 2.102183000464468,
"grad_norm": 0.8805556297302246,
"learning_rate": 0.00042056364199442555,
"loss": 2.8023,
"step": 6790
},
{
"epoch": 2.1052794550239975,
"grad_norm": 0.9209165573120117,
"learning_rate": 0.00042118302880148653,
"loss": 2.8247,
"step": 6800
},
{
"epoch": 2.108375909583527,
"grad_norm": 0.9121336340904236,
"learning_rate": 0.00042180241560854756,
"loss": 2.7983,
"step": 6810
},
{
"epoch": 2.111472364143056,
"grad_norm": 0.883575439453125,
"learning_rate": 0.0004224218024156086,
"loss": 2.7973,
"step": 6820
},
{
"epoch": 2.1145688187025855,
"grad_norm": 0.8569662570953369,
"learning_rate": 0.0004230411892226695,
"loss": 2.807,
"step": 6830
},
{
"epoch": 2.117665273262115,
"grad_norm": 0.8648683428764343,
"learning_rate": 0.00042366057602973055,
"loss": 2.7953,
"step": 6840
},
{
"epoch": 2.120761727821644,
"grad_norm": 1.0288830995559692,
"learning_rate": 0.0004242799628367916,
"loss": 2.7934,
"step": 6850
},
{
"epoch": 2.1238581823811735,
"grad_norm": 0.9366074800491333,
"learning_rate": 0.00042489934964385257,
"loss": 2.8014,
"step": 6860
},
{
"epoch": 2.126954636940703,
"grad_norm": 0.9614273905754089,
"learning_rate": 0.0004255187364509136,
"loss": 2.7822,
"step": 6870
},
{
"epoch": 2.130051091500232,
"grad_norm": 0.8939881324768066,
"learning_rate": 0.00042613812325797463,
"loss": 2.8195,
"step": 6880
},
{
"epoch": 2.1331475460597615,
"grad_norm": 0.9166781902313232,
"learning_rate": 0.0004267575100650356,
"loss": 2.7889,
"step": 6890
},
{
"epoch": 2.136244000619291,
"grad_norm": 0.8826269507408142,
"learning_rate": 0.00042737689687209665,
"loss": 2.8041,
"step": 6900
},
{
"epoch": 2.13934045517882,
"grad_norm": 0.9127874970436096,
"learning_rate": 0.0004279962836791577,
"loss": 2.7986,
"step": 6910
},
{
"epoch": 2.1424369097383495,
"grad_norm": 0.9072954654693604,
"learning_rate": 0.00042861567048621866,
"loss": 2.8031,
"step": 6920
},
{
"epoch": 2.145533364297879,
"grad_norm": 0.8833560943603516,
"learning_rate": 0.00042923505729327964,
"loss": 2.7911,
"step": 6930
},
{
"epoch": 2.1486298188574082,
"grad_norm": 0.861221194267273,
"learning_rate": 0.00042985444410034067,
"loss": 2.8073,
"step": 6940
},
{
"epoch": 2.1517262734169376,
"grad_norm": 0.9040530323982239,
"learning_rate": 0.00043047383090740165,
"loss": 2.7849,
"step": 6950
},
{
"epoch": 2.154822727976467,
"grad_norm": 0.9143641591072083,
"learning_rate": 0.0004310932177144627,
"loss": 2.7896,
"step": 6960
},
{
"epoch": 2.1579191825359962,
"grad_norm": 0.8545592427253723,
"learning_rate": 0.0004317126045215237,
"loss": 2.7971,
"step": 6970
},
{
"epoch": 2.1610156370955256,
"grad_norm": 0.9303133487701416,
"learning_rate": 0.0004323319913285847,
"loss": 2.7784,
"step": 6980
},
{
"epoch": 2.164112091655055,
"grad_norm": 0.9570648074150085,
"learning_rate": 0.00043295137813564573,
"loss": 2.7977,
"step": 6990
},
{
"epoch": 2.1672085462145843,
"grad_norm": 0.906696081161499,
"learning_rate": 0.00043357076494270676,
"loss": 2.7947,
"step": 7000
},
{
"epoch": 2.1703050007741136,
"grad_norm": 0.8919961452484131,
"learning_rate": 0.00043419015174976774,
"loss": 2.7926,
"step": 7010
},
{
"epoch": 2.173401455333643,
"grad_norm": 0.8740367889404297,
"learning_rate": 0.0004348095385568288,
"loss": 2.7747,
"step": 7020
},
{
"epoch": 2.1764979098931723,
"grad_norm": 0.8785775899887085,
"learning_rate": 0.0004354289253638898,
"loss": 2.791,
"step": 7030
},
{
"epoch": 2.1795943644527016,
"grad_norm": 0.9824354648590088,
"learning_rate": 0.00043604831217095073,
"loss": 2.7756,
"step": 7040
},
{
"epoch": 2.182690819012231,
"grad_norm": 0.9581257104873657,
"learning_rate": 0.00043666769897801177,
"loss": 2.7893,
"step": 7050
},
{
"epoch": 2.1857872735717603,
"grad_norm": 0.9003785252571106,
"learning_rate": 0.0004372870857850728,
"loss": 2.7857,
"step": 7060
},
{
"epoch": 2.1888837281312896,
"grad_norm": 0.9463407397270203,
"learning_rate": 0.0004379064725921338,
"loss": 2.7608,
"step": 7070
},
{
"epoch": 2.191980182690819,
"grad_norm": 0.9050635695457458,
"learning_rate": 0.0004385258593991948,
"loss": 2.7703,
"step": 7080
},
{
"epoch": 2.1950766372503483,
"grad_norm": 0.8689008951187134,
"learning_rate": 0.0004391452462062558,
"loss": 2.7742,
"step": 7090
},
{
"epoch": 2.1981730918098776,
"grad_norm": 0.8723441958427429,
"learning_rate": 0.0004397646330133168,
"loss": 2.7694,
"step": 7100
},
{
"epoch": 2.201269546369407,
"grad_norm": 0.8924479484558105,
"learning_rate": 0.00044038401982037786,
"loss": 2.7906,
"step": 7110
},
{
"epoch": 2.2043660009289363,
"grad_norm": 0.919276773929596,
"learning_rate": 0.00044100340662743884,
"loss": 2.7872,
"step": 7120
},
{
"epoch": 2.2074624554884656,
"grad_norm": 0.901465654373169,
"learning_rate": 0.00044162279343449987,
"loss": 2.7465,
"step": 7130
},
{
"epoch": 2.210558910047995,
"grad_norm": 0.8734842538833618,
"learning_rate": 0.00044224218024156085,
"loss": 2.7662,
"step": 7140
},
{
"epoch": 2.2136553646075243,
"grad_norm": 0.9729484915733337,
"learning_rate": 0.00044286156704862183,
"loss": 2.7681,
"step": 7150
},
{
"epoch": 2.2167518191670537,
"grad_norm": 0.8634438514709473,
"learning_rate": 0.00044348095385568286,
"loss": 2.7694,
"step": 7160
},
{
"epoch": 2.219848273726583,
"grad_norm": 0.8623734712600708,
"learning_rate": 0.0004441003406627439,
"loss": 2.7775,
"step": 7170
},
{
"epoch": 2.2229447282861123,
"grad_norm": 0.9596241116523743,
"learning_rate": 0.0004447197274698049,
"loss": 2.7916,
"step": 7180
},
{
"epoch": 2.2260411828456417,
"grad_norm": 0.8765792846679688,
"learning_rate": 0.0004453391142768659,
"loss": 2.7529,
"step": 7190
},
{
"epoch": 2.229137637405171,
"grad_norm": 0.887290894985199,
"learning_rate": 0.00044595850108392694,
"loss": 2.7697,
"step": 7200
},
{
"epoch": 2.2322340919647004,
"grad_norm": 0.842238187789917,
"learning_rate": 0.0004465778878909879,
"loss": 2.7521,
"step": 7210
},
{
"epoch": 2.2353305465242297,
"grad_norm": 0.9190672039985657,
"learning_rate": 0.00044719727469804896,
"loss": 2.7611,
"step": 7220
},
{
"epoch": 2.238427001083759,
"grad_norm": 0.8801867365837097,
"learning_rate": 0.00044781666150511,
"loss": 2.7656,
"step": 7230
},
{
"epoch": 2.2415234556432884,
"grad_norm": 0.9014734029769897,
"learning_rate": 0.0004484360483121709,
"loss": 2.7855,
"step": 7240
},
{
"epoch": 2.2446199102028177,
"grad_norm": 0.8749867081642151,
"learning_rate": 0.00044905543511923195,
"loss": 2.7683,
"step": 7250
},
{
"epoch": 2.247716364762347,
"grad_norm": 0.8823255896568298,
"learning_rate": 0.000449674821926293,
"loss": 2.7468,
"step": 7260
},
{
"epoch": 2.2508128193218764,
"grad_norm": 1.020506739616394,
"learning_rate": 0.00045029420873335396,
"loss": 2.7633,
"step": 7270
},
{
"epoch": 2.2539092738814057,
"grad_norm": 0.9416619539260864,
"learning_rate": 0.000450913595540415,
"loss": 2.7598,
"step": 7280
},
{
"epoch": 2.257005728440935,
"grad_norm": 0.8934683203697205,
"learning_rate": 0.00045153298234747603,
"loss": 2.767,
"step": 7290
},
{
"epoch": 2.2601021830004644,
"grad_norm": 0.9301040768623352,
"learning_rate": 0.000452152369154537,
"loss": 2.768,
"step": 7300
},
{
"epoch": 2.2631986375599937,
"grad_norm": 0.9030665159225464,
"learning_rate": 0.00045277175596159804,
"loss": 2.7468,
"step": 7310
},
{
"epoch": 2.266295092119523,
"grad_norm": 0.8950912952423096,
"learning_rate": 0.0004533911427686591,
"loss": 2.7583,
"step": 7320
},
{
"epoch": 2.2693915466790524,
"grad_norm": 0.9231360554695129,
"learning_rate": 0.00045401052957572005,
"loss": 2.768,
"step": 7330
},
{
"epoch": 2.2724880012385817,
"grad_norm": 0.9247618317604065,
"learning_rate": 0.0004546299163827811,
"loss": 2.7679,
"step": 7340
},
{
"epoch": 2.275584455798111,
"grad_norm": 0.8417907953262329,
"learning_rate": 0.00045524930318984207,
"loss": 2.7641,
"step": 7350
},
{
"epoch": 2.2786809103576404,
"grad_norm": 0.881175696849823,
"learning_rate": 0.00045586868999690305,
"loss": 2.7377,
"step": 7360
},
{
"epoch": 2.2817773649171698,
"grad_norm": 0.9351217746734619,
"learning_rate": 0.0004564880768039641,
"loss": 2.7521,
"step": 7370
},
{
"epoch": 2.284873819476699,
"grad_norm": 0.8650684952735901,
"learning_rate": 0.0004571074636110251,
"loss": 2.7675,
"step": 7380
},
{
"epoch": 2.2879702740362284,
"grad_norm": 0.922113299369812,
"learning_rate": 0.0004577268504180861,
"loss": 2.7401,
"step": 7390
},
{
"epoch": 2.2910667285957578,
"grad_norm": 0.8902767896652222,
"learning_rate": 0.0004583462372251471,
"loss": 2.7772,
"step": 7400
},
{
"epoch": 2.294163183155287,
"grad_norm": 0.8764835596084595,
"learning_rate": 0.00045896562403220816,
"loss": 2.7526,
"step": 7410
},
{
"epoch": 2.2972596377148164,
"grad_norm": 0.8847823739051819,
"learning_rate": 0.00045958501083926914,
"loss": 2.7504,
"step": 7420
},
{
"epoch": 2.300356092274346,
"grad_norm": 0.8462940454483032,
"learning_rate": 0.00046020439764633017,
"loss": 2.7209,
"step": 7430
},
{
"epoch": 2.303452546833875,
"grad_norm": 0.8645547032356262,
"learning_rate": 0.00046082378445339115,
"loss": 2.7464,
"step": 7440
},
{
"epoch": 2.3065490013934045,
"grad_norm": 0.8842138051986694,
"learning_rate": 0.00046144317126045213,
"loss": 2.7566,
"step": 7450
},
{
"epoch": 2.309645455952934,
"grad_norm": 0.8625742197036743,
"learning_rate": 0.00046206255806751316,
"loss": 2.753,
"step": 7460
},
{
"epoch": 2.312741910512463,
"grad_norm": 0.922121524810791,
"learning_rate": 0.00046268194487457414,
"loss": 2.75,
"step": 7470
},
{
"epoch": 2.3158383650719925,
"grad_norm": 0.8739849925041199,
"learning_rate": 0.0004633013316816352,
"loss": 2.7513,
"step": 7480
},
{
"epoch": 2.318934819631522,
"grad_norm": 0.8614432215690613,
"learning_rate": 0.0004639207184886962,
"loss": 2.75,
"step": 7490
},
{
"epoch": 2.322031274191051,
"grad_norm": 0.8714541792869568,
"learning_rate": 0.0004645401052957572,
"loss": 2.7297,
"step": 7500
},
{
"epoch": 2.3251277287505805,
"grad_norm": 0.9732015132904053,
"learning_rate": 0.0004651594921028182,
"loss": 2.7529,
"step": 7510
},
{
"epoch": 2.32822418331011,
"grad_norm": 0.9061838388442993,
"learning_rate": 0.00046577887890987925,
"loss": 2.7541,
"step": 7520
},
{
"epoch": 2.331320637869639,
"grad_norm": 1.0056427717208862,
"learning_rate": 0.00046639826571694023,
"loss": 2.7381,
"step": 7530
},
{
"epoch": 2.3344170924291685,
"grad_norm": 0.9382318258285522,
"learning_rate": 0.00046701765252400127,
"loss": 2.758,
"step": 7540
},
{
"epoch": 2.337513546988698,
"grad_norm": 0.9322879314422607,
"learning_rate": 0.0004676370393310623,
"loss": 2.7196,
"step": 7550
},
{
"epoch": 2.340610001548227,
"grad_norm": 0.8709734678268433,
"learning_rate": 0.0004682564261381232,
"loss": 2.7259,
"step": 7560
},
{
"epoch": 2.3437064561077565,
"grad_norm": 0.8605784177780151,
"learning_rate": 0.00046887581294518426,
"loss": 2.7116,
"step": 7570
},
{
"epoch": 2.346802910667286,
"grad_norm": 0.8777926564216614,
"learning_rate": 0.0004694951997522453,
"loss": 2.7389,
"step": 7580
},
{
"epoch": 2.349899365226815,
"grad_norm": 0.9535753130912781,
"learning_rate": 0.00047011458655930627,
"loss": 2.7402,
"step": 7590
},
{
"epoch": 2.3529958197863445,
"grad_norm": 0.8377962708473206,
"learning_rate": 0.0004707339733663673,
"loss": 2.7672,
"step": 7600
},
{
"epoch": 2.356092274345874,
"grad_norm": 0.9221674799919128,
"learning_rate": 0.00047135336017342834,
"loss": 2.7341,
"step": 7610
},
{
"epoch": 2.359188728905403,
"grad_norm": 0.9175540804862976,
"learning_rate": 0.0004719727469804893,
"loss": 2.7332,
"step": 7620
},
{
"epoch": 2.3622851834649325,
"grad_norm": 0.896039605140686,
"learning_rate": 0.00047259213378755035,
"loss": 2.7587,
"step": 7630
},
{
"epoch": 2.365381638024462,
"grad_norm": 0.8460658192634583,
"learning_rate": 0.0004732115205946114,
"loss": 2.7378,
"step": 7640
},
{
"epoch": 2.3684780925839912,
"grad_norm": 0.9001418352127075,
"learning_rate": 0.00047383090740167236,
"loss": 2.7374,
"step": 7650
},
{
"epoch": 2.3715745471435206,
"grad_norm": 0.9807076454162598,
"learning_rate": 0.00047445029420873334,
"loss": 2.723,
"step": 7660
},
{
"epoch": 2.37467100170305,
"grad_norm": 0.8731216192245483,
"learning_rate": 0.0004750696810157944,
"loss": 2.7112,
"step": 7670
},
{
"epoch": 2.3777674562625792,
"grad_norm": 0.8750482201576233,
"learning_rate": 0.00047568906782285536,
"loss": 2.7016,
"step": 7680
},
{
"epoch": 2.3808639108221086,
"grad_norm": 0.8985123634338379,
"learning_rate": 0.0004763084546299164,
"loss": 2.7462,
"step": 7690
},
{
"epoch": 2.383960365381638,
"grad_norm": 0.8914074301719666,
"learning_rate": 0.0004769278414369774,
"loss": 2.7253,
"step": 7700
},
{
"epoch": 2.3870568199411673,
"grad_norm": 0.8856596350669861,
"learning_rate": 0.0004775472282440384,
"loss": 2.7438,
"step": 7710
},
{
"epoch": 2.3901532745006966,
"grad_norm": 0.9476223587989807,
"learning_rate": 0.00047816661505109944,
"loss": 2.7208,
"step": 7720
},
{
"epoch": 2.393249729060226,
"grad_norm": 0.8765897750854492,
"learning_rate": 0.00047878600185816047,
"loss": 2.7302,
"step": 7730
},
{
"epoch": 2.3963461836197553,
"grad_norm": 0.9087428450584412,
"learning_rate": 0.00047940538866522145,
"loss": 2.7225,
"step": 7740
},
{
"epoch": 2.3994426381792846,
"grad_norm": 0.9276483058929443,
"learning_rate": 0.0004800247754722825,
"loss": 2.7297,
"step": 7750
},
{
"epoch": 2.402539092738814,
"grad_norm": 0.8988469243049622,
"learning_rate": 0.0004806441622793434,
"loss": 2.7167,
"step": 7760
},
{
"epoch": 2.4056355472983433,
"grad_norm": 0.865112841129303,
"learning_rate": 0.00048126354908640444,
"loss": 2.7187,
"step": 7770
},
{
"epoch": 2.4087320018578726,
"grad_norm": 0.8832447528839111,
"learning_rate": 0.0004818829358934655,
"loss": 2.7123,
"step": 7780
},
{
"epoch": 2.411828456417402,
"grad_norm": 0.8970694541931152,
"learning_rate": 0.00048250232270052645,
"loss": 2.7255,
"step": 7790
},
{
"epoch": 2.4149249109769313,
"grad_norm": 0.8232760429382324,
"learning_rate": 0.0004831217095075875,
"loss": 2.7315,
"step": 7800
},
{
"epoch": 2.4180213655364606,
"grad_norm": 0.9075847268104553,
"learning_rate": 0.0004837410963146485,
"loss": 2.7098,
"step": 7810
},
{
"epoch": 2.42111782009599,
"grad_norm": 0.871097981929779,
"learning_rate": 0.0004843604831217095,
"loss": 2.7172,
"step": 7820
},
{
"epoch": 2.4242142746555193,
"grad_norm": 0.8684946894645691,
"learning_rate": 0.00048497986992877053,
"loss": 2.7031,
"step": 7830
},
{
"epoch": 2.4273107292150486,
"grad_norm": 0.9100140929222107,
"learning_rate": 0.00048559925673583157,
"loss": 2.7175,
"step": 7840
},
{
"epoch": 2.430407183774578,
"grad_norm": 0.8607642650604248,
"learning_rate": 0.00048621864354289254,
"loss": 2.7149,
"step": 7850
},
{
"epoch": 2.4335036383341073,
"grad_norm": 0.865871012210846,
"learning_rate": 0.0004868380303499536,
"loss": 2.7139,
"step": 7860
},
{
"epoch": 2.4366000928936367,
"grad_norm": 0.9190123677253723,
"learning_rate": 0.00048745741715701456,
"loss": 2.7167,
"step": 7870
},
{
"epoch": 2.439696547453166,
"grad_norm": 0.8954902291297913,
"learning_rate": 0.00048807680396407554,
"loss": 2.7041,
"step": 7880
},
{
"epoch": 2.4427930020126953,
"grad_norm": 0.9070473313331604,
"learning_rate": 0.0004886961907711366,
"loss": 2.712,
"step": 7890
},
{
"epoch": 2.4458894565722247,
"grad_norm": 1.2090919017791748,
"learning_rate": 0.0004893155775781977,
"loss": 2.7241,
"step": 7900
},
{
"epoch": 2.448985911131754,
"grad_norm": 0.8956063985824585,
"learning_rate": 0.0004899349643852586,
"loss": 2.7089,
"step": 7910
},
{
"epoch": 2.4520823656912833,
"grad_norm": 0.8796259164810181,
"learning_rate": 0.0004905543511923196,
"loss": 2.6996,
"step": 7920
},
{
"epoch": 2.4551788202508127,
"grad_norm": 0.8752288222312927,
"learning_rate": 0.0004911737379993806,
"loss": 2.7141,
"step": 7930
},
{
"epoch": 2.458275274810342,
"grad_norm": 0.8404427170753479,
"learning_rate": 0.0004917931248064416,
"loss": 2.7086,
"step": 7940
},
{
"epoch": 2.4613717293698714,
"grad_norm": 0.8801198601722717,
"learning_rate": 0.0004924125116135027,
"loss": 2.716,
"step": 7950
},
{
"epoch": 2.4644681839294007,
"grad_norm": 0.8937883377075195,
"learning_rate": 0.0004930318984205636,
"loss": 2.6963,
"step": 7960
},
{
"epoch": 2.46756463848893,
"grad_norm": 0.8348713517189026,
"learning_rate": 0.0004936512852276246,
"loss": 2.7158,
"step": 7970
},
{
"epoch": 2.4706610930484594,
"grad_norm": 0.9168616533279419,
"learning_rate": 0.0004942706720346857,
"loss": 2.7212,
"step": 7980
},
{
"epoch": 2.4737575476079887,
"grad_norm": 0.8765811324119568,
"learning_rate": 0.0004948900588417467,
"loss": 2.7037,
"step": 7990
},
{
"epoch": 2.476854002167518,
"grad_norm": 0.9563819766044617,
"learning_rate": 0.0004955094456488077,
"loss": 2.7076,
"step": 8000
},
{
"epoch": 2.4799504567270474,
"grad_norm": 0.9105591177940369,
"learning_rate": 0.0004961288324558688,
"loss": 2.704,
"step": 8010
},
{
"epoch": 2.4830469112865767,
"grad_norm": 0.8907128572463989,
"learning_rate": 0.0004967482192629297,
"loss": 2.711,
"step": 8020
},
{
"epoch": 2.486143365846106,
"grad_norm": 0.9110057353973389,
"learning_rate": 0.0004973676060699907,
"loss": 2.715,
"step": 8030
},
{
"epoch": 2.4892398204056354,
"grad_norm": 0.8938244581222534,
"learning_rate": 0.0004979869928770517,
"loss": 2.7236,
"step": 8040
},
{
"epoch": 2.4923362749651647,
"grad_norm": 0.8680298328399658,
"learning_rate": 0.0004986063796841128,
"loss": 2.7141,
"step": 8050
},
{
"epoch": 2.495432729524694,
"grad_norm": 1.2556971311569214,
"learning_rate": 0.0004992257664911738,
"loss": 2.7182,
"step": 8060
},
{
"epoch": 2.4985291840842234,
"grad_norm": 0.8885079026222229,
"learning_rate": 0.0004998451532982347,
"loss": 2.7178,
"step": 8070
},
{
"epoch": 2.5016256386437528,
"grad_norm": 0.8683394193649292,
"learning_rate": 0.0005004645401052957,
"loss": 2.713,
"step": 8080
},
{
"epoch": 2.504722093203282,
"grad_norm": 0.8895092010498047,
"learning_rate": 0.0005010839269123568,
"loss": 2.7244,
"step": 8090
},
{
"epoch": 2.5078185477628114,
"grad_norm": 0.9000723958015442,
"learning_rate": 0.0005017033137194178,
"loss": 2.7018,
"step": 8100
},
{
"epoch": 2.5109150023223408,
"grad_norm": 0.8466011881828308,
"learning_rate": 0.0005023227005264788,
"loss": 2.7252,
"step": 8110
},
{
"epoch": 2.51401145688187,
"grad_norm": 0.8740931749343872,
"learning_rate": 0.0005029420873335399,
"loss": 2.7023,
"step": 8120
},
{
"epoch": 2.5171079114413994,
"grad_norm": 0.9173566102981567,
"learning_rate": 0.0005035614741406008,
"loss": 2.7158,
"step": 8130
},
{
"epoch": 2.520204366000929,
"grad_norm": 0.9136703610420227,
"learning_rate": 0.0005041808609476618,
"loss": 2.7081,
"step": 8140
},
{
"epoch": 2.523300820560458,
"grad_norm": 0.9001860022544861,
"learning_rate": 0.0005048002477547229,
"loss": 2.6879,
"step": 8150
},
{
"epoch": 2.5263972751199875,
"grad_norm": 0.8756097555160522,
"learning_rate": 0.0005054196345617839,
"loss": 2.714,
"step": 8160
},
{
"epoch": 2.529493729679517,
"grad_norm": 0.8774548768997192,
"learning_rate": 0.0005060390213688449,
"loss": 2.6751,
"step": 8170
},
{
"epoch": 2.532590184239046,
"grad_norm": 0.8764857649803162,
"learning_rate": 0.0005066584081759059,
"loss": 2.7045,
"step": 8180
},
{
"epoch": 2.5356866387985755,
"grad_norm": 0.8589802980422974,
"learning_rate": 0.0005072777949829669,
"loss": 2.7001,
"step": 8190
},
{
"epoch": 2.538783093358105,
"grad_norm": 0.8591241836547852,
"learning_rate": 0.0005078971817900279,
"loss": 2.6838,
"step": 8200
},
{
"epoch": 2.541879547917634,
"grad_norm": 0.8960736989974976,
"learning_rate": 0.000508516568597089,
"loss": 2.6847,
"step": 8210
},
{
"epoch": 2.5449760024771635,
"grad_norm": 0.8818134069442749,
"learning_rate": 0.00050913595540415,
"loss": 2.6907,
"step": 8220
},
{
"epoch": 2.548072457036693,
"grad_norm": 0.8439919948577881,
"learning_rate": 0.0005097553422112108,
"loss": 2.6649,
"step": 8230
},
{
"epoch": 2.551168911596222,
"grad_norm": 0.953252911567688,
"learning_rate": 0.0005103747290182718,
"loss": 2.7113,
"step": 8240
},
{
"epoch": 2.5542653661557515,
"grad_norm": 0.8814793825149536,
"learning_rate": 0.0005109941158253329,
"loss": 2.6971,
"step": 8250
},
{
"epoch": 2.557361820715281,
"grad_norm": 0.8562922477722168,
"learning_rate": 0.0005116135026323939,
"loss": 2.6816,
"step": 8260
},
{
"epoch": 2.56045827527481,
"grad_norm": 0.9286318421363831,
"learning_rate": 0.0005122328894394549,
"loss": 2.6976,
"step": 8270
},
{
"epoch": 2.5635547298343395,
"grad_norm": 0.8571282029151917,
"learning_rate": 0.000512852276246516,
"loss": 2.6931,
"step": 8280
},
{
"epoch": 2.566651184393869,
"grad_norm": 0.8638617396354675,
"learning_rate": 0.0005134716630535769,
"loss": 2.7112,
"step": 8290
},
{
"epoch": 2.569747638953398,
"grad_norm": 0.8954980969429016,
"learning_rate": 0.0005140910498606379,
"loss": 2.6775,
"step": 8300
},
{
"epoch": 2.5728440935129275,
"grad_norm": 0.8603184223175049,
"learning_rate": 0.000514710436667699,
"loss": 2.6962,
"step": 8310
},
{
"epoch": 2.575940548072457,
"grad_norm": 0.8614330887794495,
"learning_rate": 0.00051532982347476,
"loss": 2.7119,
"step": 8320
},
{
"epoch": 2.5790370026319867,
"grad_norm": 0.853256106376648,
"learning_rate": 0.000515949210281821,
"loss": 2.6701,
"step": 8330
},
{
"epoch": 2.5821334571915155,
"grad_norm": 0.9329004883766174,
"learning_rate": 0.000516568597088882,
"loss": 2.7029,
"step": 8340
},
{
"epoch": 2.5852299117510453,
"grad_norm": 0.8642740249633789,
"learning_rate": 0.000517187983895943,
"loss": 2.6927,
"step": 8350
},
{
"epoch": 2.5883263663105742,
"grad_norm": 0.8851795196533203,
"learning_rate": 0.000517807370703004,
"loss": 2.6801,
"step": 8360
},
{
"epoch": 2.591422820870104,
"grad_norm": 0.8649539947509766,
"learning_rate": 0.0005184267575100651,
"loss": 2.671,
"step": 8370
},
{
"epoch": 2.594519275429633,
"grad_norm": 0.8715213537216187,
"learning_rate": 0.0005190461443171261,
"loss": 2.69,
"step": 8380
},
{
"epoch": 2.5976157299891627,
"grad_norm": 0.8469790816307068,
"learning_rate": 0.000519665531124187,
"loss": 2.6773,
"step": 8390
},
{
"epoch": 2.6007121845486916,
"grad_norm": 0.8525969982147217,
"learning_rate": 0.0005202849179312481,
"loss": 2.6728,
"step": 8400
},
{
"epoch": 2.6038086391082214,
"grad_norm": 0.8539503812789917,
"learning_rate": 0.0005209043047383091,
"loss": 2.6869,
"step": 8410
},
{
"epoch": 2.6069050936677503,
"grad_norm": 0.877877414226532,
"learning_rate": 0.0005215236915453701,
"loss": 2.6924,
"step": 8420
},
{
"epoch": 2.61000154822728,
"grad_norm": 0.9159960150718689,
"learning_rate": 0.0005221430783524312,
"loss": 2.6827,
"step": 8430
},
{
"epoch": 2.613098002786809,
"grad_norm": 0.9159612059593201,
"learning_rate": 0.000522762465159492,
"loss": 2.6715,
"step": 8440
},
{
"epoch": 2.6161944573463387,
"grad_norm": 0.8842989802360535,
"learning_rate": 0.000523381851966553,
"loss": 2.6781,
"step": 8450
},
{
"epoch": 2.6192909119058676,
"grad_norm": 0.981275737285614,
"learning_rate": 0.0005240012387736141,
"loss": 2.6975,
"step": 8460
},
{
"epoch": 2.6223873664653974,
"grad_norm": 0.8604749441146851,
"learning_rate": 0.0005246206255806751,
"loss": 2.6785,
"step": 8470
},
{
"epoch": 2.6254838210249263,
"grad_norm": 0.880984902381897,
"learning_rate": 0.0005252400123877361,
"loss": 2.6743,
"step": 8480
},
{
"epoch": 2.628580275584456,
"grad_norm": 0.9086693525314331,
"learning_rate": 0.0005258593991947972,
"loss": 2.6827,
"step": 8490
},
{
"epoch": 2.631676730143985,
"grad_norm": 0.9209759831428528,
"learning_rate": 0.0005264787860018581,
"loss": 2.6969,
"step": 8500
},
{
"epoch": 2.6347731847035147,
"grad_norm": 1.1329649686813354,
"learning_rate": 0.0005270981728089191,
"loss": 2.6682,
"step": 8510
},
{
"epoch": 2.6378696392630436,
"grad_norm": 0.904861569404602,
"learning_rate": 0.0005277175596159802,
"loss": 2.6765,
"step": 8520
},
{
"epoch": 2.6409660938225734,
"grad_norm": 0.9609228372573853,
"learning_rate": 0.0005283369464230412,
"loss": 2.6777,
"step": 8530
},
{
"epoch": 2.6440625483821023,
"grad_norm": 0.84135901927948,
"learning_rate": 0.0005289563332301022,
"loss": 2.6963,
"step": 8540
},
{
"epoch": 2.647159002941632,
"grad_norm": 0.9496148228645325,
"learning_rate": 0.0005295757200371633,
"loss": 2.6755,
"step": 8550
},
{
"epoch": 2.650255457501161,
"grad_norm": 0.9461915493011475,
"learning_rate": 0.0005301951068442242,
"loss": 2.6947,
"step": 8560
},
{
"epoch": 2.6533519120606908,
"grad_norm": 0.8542360067367554,
"learning_rate": 0.0005308144936512852,
"loss": 2.6722,
"step": 8570
},
{
"epoch": 2.6564483666202197,
"grad_norm": 0.9559420347213745,
"learning_rate": 0.0005314338804583463,
"loss": 2.6781,
"step": 8580
},
{
"epoch": 2.6595448211797494,
"grad_norm": 0.9376833438873291,
"learning_rate": 0.0005320532672654073,
"loss": 2.7124,
"step": 8590
},
{
"epoch": 2.6626412757392783,
"grad_norm": 0.8750305771827698,
"learning_rate": 0.0005326726540724683,
"loss": 2.695,
"step": 8600
},
{
"epoch": 2.665737730298808,
"grad_norm": 0.8628771305084229,
"learning_rate": 0.0005332920408795294,
"loss": 2.6874,
"step": 8610
},
{
"epoch": 2.668834184858337,
"grad_norm": 0.91616290807724,
"learning_rate": 0.0005339114276865903,
"loss": 2.6733,
"step": 8620
},
{
"epoch": 2.671930639417867,
"grad_norm": 0.8734931349754333,
"learning_rate": 0.0005345308144936513,
"loss": 2.6805,
"step": 8630
},
{
"epoch": 2.6750270939773957,
"grad_norm": 0.8667175769805908,
"learning_rate": 0.0005351502013007124,
"loss": 2.6863,
"step": 8640
},
{
"epoch": 2.6781235485369255,
"grad_norm": 0.8947048783302307,
"learning_rate": 0.0005357695881077733,
"loss": 2.6906,
"step": 8650
},
{
"epoch": 2.6812200030964544,
"grad_norm": 0.9095123410224915,
"learning_rate": 0.0005363889749148342,
"loss": 2.6741,
"step": 8660
},
{
"epoch": 2.684316457655984,
"grad_norm": 0.8678126335144043,
"learning_rate": 0.0005370083617218953,
"loss": 2.6532,
"step": 8670
},
{
"epoch": 2.687412912215513,
"grad_norm": 0.8941618800163269,
"learning_rate": 0.0005376277485289563,
"loss": 2.6733,
"step": 8680
},
{
"epoch": 2.690509366775043,
"grad_norm": 0.9127388596534729,
"learning_rate": 0.0005382471353360173,
"loss": 2.6864,
"step": 8690
},
{
"epoch": 2.6936058213345717,
"grad_norm": 0.8542888760566711,
"learning_rate": 0.0005388665221430784,
"loss": 2.6839,
"step": 8700
},
{
"epoch": 2.6967022758941015,
"grad_norm": 0.8937285542488098,
"learning_rate": 0.0005394859089501394,
"loss": 2.6911,
"step": 8710
},
{
"epoch": 2.6997987304536304,
"grad_norm": 0.9001040458679199,
"learning_rate": 0.0005401052957572003,
"loss": 2.6785,
"step": 8720
},
{
"epoch": 2.70289518501316,
"grad_norm": 0.9357818365097046,
"learning_rate": 0.0005407246825642614,
"loss": 2.6959,
"step": 8730
},
{
"epoch": 2.705991639572689,
"grad_norm": 0.9065813422203064,
"learning_rate": 0.0005413440693713224,
"loss": 2.6838,
"step": 8740
},
{
"epoch": 2.709088094132219,
"grad_norm": 0.8821165561676025,
"learning_rate": 0.0005419634561783834,
"loss": 2.6618,
"step": 8750
},
{
"epoch": 2.7121845486917477,
"grad_norm": 0.8667876720428467,
"learning_rate": 0.0005425828429854445,
"loss": 2.6849,
"step": 8760
},
{
"epoch": 2.7152810032512775,
"grad_norm": 0.8643457889556885,
"learning_rate": 0.0005432022297925055,
"loss": 2.6629,
"step": 8770
},
{
"epoch": 2.7183774578108064,
"grad_norm": 0.8841952681541443,
"learning_rate": 0.0005438216165995664,
"loss": 2.6605,
"step": 8780
},
{
"epoch": 2.721473912370336,
"grad_norm": 0.9219385385513306,
"learning_rate": 0.0005444410034066275,
"loss": 2.6594,
"step": 8790
},
{
"epoch": 2.724570366929865,
"grad_norm": 0.9676291942596436,
"learning_rate": 0.0005450603902136885,
"loss": 2.6796,
"step": 8800
},
{
"epoch": 2.727666821489395,
"grad_norm": 0.9405499696731567,
"learning_rate": 0.0005456797770207495,
"loss": 2.6928,
"step": 8810
},
{
"epoch": 2.7307632760489238,
"grad_norm": 0.9420516490936279,
"learning_rate": 0.0005462991638278106,
"loss": 2.6699,
"step": 8820
},
{
"epoch": 2.7338597306084536,
"grad_norm": 0.9792620539665222,
"learning_rate": 0.0005469185506348715,
"loss": 2.6666,
"step": 8830
},
{
"epoch": 2.7369561851679824,
"grad_norm": 0.9726955890655518,
"learning_rate": 0.0005475379374419325,
"loss": 2.645,
"step": 8840
},
{
"epoch": 2.7400526397275122,
"grad_norm": 1.020033359527588,
"learning_rate": 0.0005481573242489936,
"loss": 2.6614,
"step": 8850
},
{
"epoch": 2.743149094287041,
"grad_norm": 1.0454789400100708,
"learning_rate": 0.0005487767110560545,
"loss": 2.6565,
"step": 8860
},
{
"epoch": 2.746245548846571,
"grad_norm": 0.8889420628547668,
"learning_rate": 0.0005493960978631155,
"loss": 2.6916,
"step": 8870
},
{
"epoch": 2.7493420034061,
"grad_norm": 0.9025602340698242,
"learning_rate": 0.0005500154846701765,
"loss": 2.6999,
"step": 8880
},
{
"epoch": 2.7524384579656296,
"grad_norm": 0.8665561079978943,
"learning_rate": 0.0005506348714772375,
"loss": 2.6385,
"step": 8890
},
{
"epoch": 2.7555349125251585,
"grad_norm": 0.9031399488449097,
"learning_rate": 0.0005512542582842985,
"loss": 2.6505,
"step": 8900
},
{
"epoch": 2.7586313670846883,
"grad_norm": 0.9555135369300842,
"learning_rate": 0.0005518736450913595,
"loss": 2.6816,
"step": 8910
},
{
"epoch": 2.761727821644217,
"grad_norm": 0.9307361245155334,
"learning_rate": 0.0005524930318984206,
"loss": 2.6715,
"step": 8920
},
{
"epoch": 2.764824276203747,
"grad_norm": 0.9591286778450012,
"learning_rate": 0.0005531124187054816,
"loss": 2.6822,
"step": 8930
},
{
"epoch": 2.767920730763276,
"grad_norm": 0.9070897698402405,
"learning_rate": 0.0005537318055125425,
"loss": 2.6702,
"step": 8940
},
{
"epoch": 2.7710171853228056,
"grad_norm": 0.9256467819213867,
"learning_rate": 0.0005543511923196036,
"loss": 2.6555,
"step": 8950
},
{
"epoch": 2.7741136398823345,
"grad_norm": 0.993756115436554,
"learning_rate": 0.0005549705791266646,
"loss": 2.6547,
"step": 8960
},
{
"epoch": 2.7772100944418643,
"grad_norm": 0.9043955206871033,
"learning_rate": 0.0005555899659337256,
"loss": 2.6905,
"step": 8970
},
{
"epoch": 2.780306549001393,
"grad_norm": 0.9000112414360046,
"learning_rate": 0.0005562093527407867,
"loss": 2.6534,
"step": 8980
},
{
"epoch": 2.783403003560923,
"grad_norm": 0.9210097789764404,
"learning_rate": 0.0005568287395478476,
"loss": 2.6717,
"step": 8990
},
{
"epoch": 2.786499458120452,
"grad_norm": 0.8958888053894043,
"learning_rate": 0.0005574481263549086,
"loss": 2.6856,
"step": 9000
},
{
"epoch": 2.7895959126799816,
"grad_norm": 1.0156104564666748,
"learning_rate": 0.0005580675131619697,
"loss": 2.6794,
"step": 9010
},
{
"epoch": 2.7926923672395105,
"grad_norm": 0.9581423997879028,
"learning_rate": 0.0005586868999690307,
"loss": 2.6576,
"step": 9020
},
{
"epoch": 2.7957888217990403,
"grad_norm": 0.9721694588661194,
"learning_rate": 0.0005593062867760917,
"loss": 2.6569,
"step": 9030
},
{
"epoch": 2.798885276358569,
"grad_norm": 0.9453576803207397,
"learning_rate": 0.0005599256735831528,
"loss": 2.6664,
"step": 9040
},
{
"epoch": 2.801981730918099,
"grad_norm": 0.9473662972450256,
"learning_rate": 0.0005605450603902137,
"loss": 2.6604,
"step": 9050
},
{
"epoch": 2.805078185477628,
"grad_norm": 0.9190026521682739,
"learning_rate": 0.0005611644471972746,
"loss": 2.6632,
"step": 9060
},
{
"epoch": 2.8081746400371577,
"grad_norm": 0.9677988886833191,
"learning_rate": 0.0005617838340043357,
"loss": 2.6574,
"step": 9070
},
{
"epoch": 2.8112710945966866,
"grad_norm": 0.9148370623588562,
"learning_rate": 0.0005624032208113967,
"loss": 2.6801,
"step": 9080
},
{
"epoch": 2.8143675491562163,
"grad_norm": 0.908485472202301,
"learning_rate": 0.0005630226076184577,
"loss": 2.6614,
"step": 9090
},
{
"epoch": 2.8174640037157452,
"grad_norm": 0.9479948878288269,
"learning_rate": 0.0005636419944255187,
"loss": 2.6426,
"step": 9100
},
{
"epoch": 2.820560458275275,
"grad_norm": 1.0165117979049683,
"learning_rate": 0.0005642613812325797,
"loss": 2.6608,
"step": 9110
},
{
"epoch": 2.823656912834804,
"grad_norm": 0.870343029499054,
"learning_rate": 0.0005648807680396407,
"loss": 2.6729,
"step": 9120
},
{
"epoch": 2.8267533673943337,
"grad_norm": 0.9335671067237854,
"learning_rate": 0.0005655001548467018,
"loss": 2.6655,
"step": 9130
},
{
"epoch": 2.8298498219538626,
"grad_norm": 0.9250266551971436,
"learning_rate": 0.0005661195416537628,
"loss": 2.6793,
"step": 9140
},
{
"epoch": 2.8329462765133924,
"grad_norm": 0.8737602233886719,
"learning_rate": 0.0005667389284608237,
"loss": 2.6552,
"step": 9150
},
{
"epoch": 2.8360427310729213,
"grad_norm": 0.9168223142623901,
"learning_rate": 0.0005673583152678848,
"loss": 2.6725,
"step": 9160
},
{
"epoch": 2.839139185632451,
"grad_norm": 0.9240823984146118,
"learning_rate": 0.0005679777020749458,
"loss": 2.642,
"step": 9170
},
{
"epoch": 2.84223564019198,
"grad_norm": 0.9061072468757629,
"learning_rate": 0.0005685970888820068,
"loss": 2.6746,
"step": 9180
},
{
"epoch": 2.8453320947515097,
"grad_norm": 0.8670341968536377,
"learning_rate": 0.0005692164756890679,
"loss": 2.6693,
"step": 9190
},
{
"epoch": 2.8484285493110386,
"grad_norm": 0.9250338673591614,
"learning_rate": 0.0005698358624961289,
"loss": 2.6755,
"step": 9200
},
{
"epoch": 2.8515250038705684,
"grad_norm": 0.9369593262672424,
"learning_rate": 0.0005704552493031898,
"loss": 2.6794,
"step": 9210
},
{
"epoch": 2.8546214584300973,
"grad_norm": 0.9392365217208862,
"learning_rate": 0.0005710746361102509,
"loss": 2.6644,
"step": 9220
},
{
"epoch": 2.857717912989627,
"grad_norm": 0.9542964696884155,
"learning_rate": 0.0005716940229173119,
"loss": 2.6785,
"step": 9230
},
{
"epoch": 2.860814367549156,
"grad_norm": 0.9194208979606628,
"learning_rate": 0.0005723134097243729,
"loss": 2.6743,
"step": 9240
},
{
"epoch": 2.8639108221086857,
"grad_norm": 0.9285315275192261,
"learning_rate": 0.000572932796531434,
"loss": 2.6589,
"step": 9250
},
{
"epoch": 2.8670072766682146,
"grad_norm": 0.9268024563789368,
"learning_rate": 0.000573552183338495,
"loss": 2.6552,
"step": 9260
},
{
"epoch": 2.8701037312277444,
"grad_norm": 0.904656171798706,
"learning_rate": 0.0005741715701455558,
"loss": 2.6657,
"step": 9270
},
{
"epoch": 2.8732001857872733,
"grad_norm": 0.9420167207717896,
"learning_rate": 0.0005747909569526169,
"loss": 2.6572,
"step": 9280
},
{
"epoch": 2.876296640346803,
"grad_norm": 0.9118287563323975,
"learning_rate": 0.0005754103437596779,
"loss": 2.6629,
"step": 9290
},
{
"epoch": 2.879393094906332,
"grad_norm": 0.940430223941803,
"learning_rate": 0.0005760297305667389,
"loss": 2.6518,
"step": 9300
},
{
"epoch": 2.8824895494658618,
"grad_norm": 1.3163542747497559,
"learning_rate": 0.0005766491173738,
"loss": 2.652,
"step": 9310
},
{
"epoch": 2.8855860040253907,
"grad_norm": 0.9466584324836731,
"learning_rate": 0.0005772685041808609,
"loss": 2.6809,
"step": 9320
},
{
"epoch": 2.8886824585849205,
"grad_norm": 2.4098305702209473,
"learning_rate": 0.0005778878909879219,
"loss": 2.6616,
"step": 9330
},
{
"epoch": 2.8917789131444493,
"grad_norm": 1.0643264055252075,
"learning_rate": 0.000578507277794983,
"loss": 2.6719,
"step": 9340
},
{
"epoch": 2.894875367703979,
"grad_norm": 0.9846721887588501,
"learning_rate": 0.000579126664602044,
"loss": 2.6706,
"step": 9350
},
{
"epoch": 2.897971822263508,
"grad_norm": 0.9832435250282288,
"learning_rate": 0.000579746051409105,
"loss": 2.6725,
"step": 9360
},
{
"epoch": 2.901068276823038,
"grad_norm": 0.8981136083602905,
"learning_rate": 0.000580365438216166,
"loss": 2.6475,
"step": 9370
},
{
"epoch": 2.9041647313825667,
"grad_norm": 0.8961195349693298,
"learning_rate": 0.000580984825023227,
"loss": 2.6705,
"step": 9380
},
{
"epoch": 2.9072611859420965,
"grad_norm": 1.0543441772460938,
"learning_rate": 0.000581604211830288,
"loss": 2.6666,
"step": 9390
},
{
"epoch": 2.9103576405016254,
"grad_norm": 0.9041043519973755,
"learning_rate": 0.0005822235986373491,
"loss": 2.6608,
"step": 9400
},
{
"epoch": 2.913454095061155,
"grad_norm": 0.9475833773612976,
"learning_rate": 0.0005828429854444101,
"loss": 2.6453,
"step": 9410
},
{
"epoch": 2.916550549620684,
"grad_norm": 0.9282538890838623,
"learning_rate": 0.000583462372251471,
"loss": 2.6531,
"step": 9420
},
{
"epoch": 2.919647004180214,
"grad_norm": 0.936406672000885,
"learning_rate": 0.0005840817590585321,
"loss": 2.6594,
"step": 9430
},
{
"epoch": 2.9227434587397427,
"grad_norm": 0.9766597747802734,
"learning_rate": 0.0005847011458655931,
"loss": 2.6698,
"step": 9440
},
{
"epoch": 2.9258399132992725,
"grad_norm": 0.9606243968009949,
"learning_rate": 0.0005853205326726541,
"loss": 2.6819,
"step": 9450
},
{
"epoch": 2.9289363678588014,
"grad_norm": 0.9478334784507751,
"learning_rate": 0.0005859399194797152,
"loss": 2.6589,
"step": 9460
},
{
"epoch": 2.932032822418331,
"grad_norm": 0.9398000836372375,
"learning_rate": 0.0005865593062867762,
"loss": 2.6414,
"step": 9470
},
{
"epoch": 2.93512927697786,
"grad_norm": 0.8788222074508667,
"learning_rate": 0.000587178693093837,
"loss": 2.6327,
"step": 9480
},
{
"epoch": 2.93822573153739,
"grad_norm": 0.945261538028717,
"learning_rate": 0.0005877980799008981,
"loss": 2.6642,
"step": 9490
},
{
"epoch": 2.9413221860969188,
"grad_norm": 0.9153859615325928,
"learning_rate": 0.0005884174667079591,
"loss": 2.6528,
"step": 9500
},
{
"epoch": 2.9444186406564485,
"grad_norm": 1.6933245658874512,
"learning_rate": 0.0005890368535150201,
"loss": 2.6594,
"step": 9510
},
{
"epoch": 2.9475150952159774,
"grad_norm": 1.0047813653945923,
"learning_rate": 0.0005896562403220812,
"loss": 2.6867,
"step": 9520
},
{
"epoch": 2.950611549775507,
"grad_norm": 1.006410002708435,
"learning_rate": 0.0005902756271291422,
"loss": 2.6551,
"step": 9530
},
{
"epoch": 2.953708004335036,
"grad_norm": 0.987974226474762,
"learning_rate": 0.0005908950139362031,
"loss": 2.6563,
"step": 9540
},
{
"epoch": 2.956804458894566,
"grad_norm": 0.9611511826515198,
"learning_rate": 0.0005915144007432642,
"loss": 2.6677,
"step": 9550
},
{
"epoch": 2.959900913454095,
"grad_norm": 0.9569249153137207,
"learning_rate": 0.0005921337875503252,
"loss": 2.6368,
"step": 9560
},
{
"epoch": 2.9629973680136246,
"grad_norm": 0.909783124923706,
"learning_rate": 0.0005927531743573862,
"loss": 2.6353,
"step": 9570
},
{
"epoch": 2.966093822573154,
"grad_norm": 0.9167472720146179,
"learning_rate": 0.0005933725611644472,
"loss": 2.6469,
"step": 9580
},
{
"epoch": 2.9691902771326832,
"grad_norm": 0.9903345108032227,
"learning_rate": 0.0005939919479715082,
"loss": 2.6567,
"step": 9590
},
{
"epoch": 2.9722867316922126,
"grad_norm": 0.9372828006744385,
"learning_rate": 0.0005946113347785692,
"loss": 2.6597,
"step": 9600
},
{
"epoch": 2.975383186251742,
"grad_norm": 1.0080912113189697,
"learning_rate": 0.0005952307215856302,
"loss": 2.6425,
"step": 9610
},
{
"epoch": 2.9784796408112713,
"grad_norm": 0.9167620539665222,
"learning_rate": 0.0005958501083926913,
"loss": 2.666,
"step": 9620
},
{
"epoch": 2.9815760953708006,
"grad_norm": 0.9428613781929016,
"learning_rate": 0.0005964694951997523,
"loss": 2.6486,
"step": 9630
},
{
"epoch": 2.98467254993033,
"grad_norm": 1.0144000053405762,
"learning_rate": 0.0005970888820068132,
"loss": 2.6382,
"step": 9640
},
{
"epoch": 2.9877690044898593,
"grad_norm": 0.8944305777549744,
"learning_rate": 0.0005977082688138743,
"loss": 2.6406,
"step": 9650
},
{
"epoch": 2.9908654590493886,
"grad_norm": 0.9113066792488098,
"learning_rate": 0.0005983276556209353,
"loss": 2.6514,
"step": 9660
},
{
"epoch": 2.993961913608918,
"grad_norm": 0.9131670594215393,
"learning_rate": 0.0005989470424279963,
"loss": 2.6314,
"step": 9670
},
{
"epoch": 2.9970583681684473,
"grad_norm": 0.9719523787498474,
"learning_rate": 0.0005995664292350574,
"loss": 2.6578,
"step": 9680
},
{
"epoch": 3.0,
"grad_norm": 0.8123937845230103,
"learning_rate": 0.0006001858160421183,
"loss": 2.5215,
"step": 9690
},
{
"epoch": 3.0030964545595293,
"grad_norm": 1.055759310722351,
"learning_rate": 0.0006008052028491792,
"loss": 2.6481,
"step": 9700
},
{
"epoch": 3.0061929091190587,
"grad_norm": 0.9894253611564636,
"learning_rate": 0.0006014245896562403,
"loss": 2.6389,
"step": 9710
},
{
"epoch": 3.009289363678588,
"grad_norm": 0.9278469085693359,
"learning_rate": 0.0006020439764633013,
"loss": 2.6382,
"step": 9720
},
{
"epoch": 3.0123858182381174,
"grad_norm": 0.9690927267074585,
"learning_rate": 0.0006026633632703623,
"loss": 2.6225,
"step": 9730
},
{
"epoch": 3.0154822727976467,
"grad_norm": 0.8948525190353394,
"learning_rate": 0.0006032827500774234,
"loss": 2.6266,
"step": 9740
},
{
"epoch": 3.018578727357176,
"grad_norm": 0.9562525749206543,
"learning_rate": 0.0006039021368844843,
"loss": 2.6251,
"step": 9750
},
{
"epoch": 3.0216751819167054,
"grad_norm": 0.9463378190994263,
"learning_rate": 0.0006045215236915453,
"loss": 2.6405,
"step": 9760
},
{
"epoch": 3.0247716364762347,
"grad_norm": 0.9799174070358276,
"learning_rate": 0.0006051409104986064,
"loss": 2.6381,
"step": 9770
},
{
"epoch": 3.027868091035764,
"grad_norm": 0.9874619841575623,
"learning_rate": 0.0006057602973056674,
"loss": 2.6143,
"step": 9780
},
{
"epoch": 3.0309645455952934,
"grad_norm": 1.083337426185608,
"learning_rate": 0.0006063796841127284,
"loss": 2.6153,
"step": 9790
},
{
"epoch": 3.0340610001548227,
"grad_norm": 0.9509608745574951,
"learning_rate": 0.0006069990709197895,
"loss": 2.6379,
"step": 9800
},
{
"epoch": 3.037157454714352,
"grad_norm": 0.9036940336227417,
"learning_rate": 0.0006076184577268504,
"loss": 2.6415,
"step": 9810
},
{
"epoch": 3.0402539092738814,
"grad_norm": 0.9959449768066406,
"learning_rate": 0.0006082378445339114,
"loss": 2.6394,
"step": 9820
},
{
"epoch": 3.0433503638334107,
"grad_norm": 0.9509766101837158,
"learning_rate": 0.0006088572313409725,
"loss": 2.6287,
"step": 9830
},
{
"epoch": 3.04644681839294,
"grad_norm": 0.9667684435844421,
"learning_rate": 0.0006094766181480335,
"loss": 2.6518,
"step": 9840
},
{
"epoch": 3.0495432729524694,
"grad_norm": 0.8897145986557007,
"learning_rate": 0.0006100960049550945,
"loss": 2.6333,
"step": 9850
},
{
"epoch": 3.0526397275119987,
"grad_norm": 1.0284274816513062,
"learning_rate": 0.0006107153917621555,
"loss": 2.6348,
"step": 9860
},
{
"epoch": 3.055736182071528,
"grad_norm": 0.9442754983901978,
"learning_rate": 0.0006113347785692165,
"loss": 2.6345,
"step": 9870
},
{
"epoch": 3.0588326366310574,
"grad_norm": 0.9227479696273804,
"learning_rate": 0.0006119541653762775,
"loss": 2.6346,
"step": 9880
},
{
"epoch": 3.0619290911905868,
"grad_norm": 0.9678612351417542,
"learning_rate": 0.0006125735521833386,
"loss": 2.6267,
"step": 9890
},
{
"epoch": 3.065025545750116,
"grad_norm": 0.9622678160667419,
"learning_rate": 0.0006131929389903995,
"loss": 2.6205,
"step": 9900
},
{
"epoch": 3.0681220003096454,
"grad_norm": 0.9785904288291931,
"learning_rate": 0.0006138123257974604,
"loss": 2.6316,
"step": 9910
},
{
"epoch": 3.0712184548691748,
"grad_norm": 0.9019646644592285,
"learning_rate": 0.0006144317126045215,
"loss": 2.6322,
"step": 9920
},
{
"epoch": 3.074314909428704,
"grad_norm": 0.9511599540710449,
"learning_rate": 0.0006150510994115825,
"loss": 2.6404,
"step": 9930
},
{
"epoch": 3.0774113639882335,
"grad_norm": 1.1197845935821533,
"learning_rate": 0.0006156704862186435,
"loss": 2.6334,
"step": 9940
},
{
"epoch": 3.080507818547763,
"grad_norm": 1.0321228504180908,
"learning_rate": 0.0006162898730257046,
"loss": 2.6578,
"step": 9950
},
{
"epoch": 3.083604273107292,
"grad_norm": 0.933640718460083,
"learning_rate": 0.0006169092598327656,
"loss": 2.6498,
"step": 9960
},
{
"epoch": 3.0867007276668215,
"grad_norm": 0.9308697581291199,
"learning_rate": 0.0006175286466398265,
"loss": 2.6403,
"step": 9970
},
{
"epoch": 3.089797182226351,
"grad_norm": 1.0035881996154785,
"learning_rate": 0.0006181480334468876,
"loss": 2.6369,
"step": 9980
},
{
"epoch": 3.09289363678588,
"grad_norm": 0.9733856916427612,
"learning_rate": 0.0006187674202539486,
"loss": 2.6434,
"step": 9990
},
{
"epoch": 3.0959900913454095,
"grad_norm": 0.9512896537780762,
"learning_rate": 0.0006193868070610096,
"loss": 2.6433,
"step": 10000
},
{
"epoch": 3.099086545904939,
"grad_norm": 1.1366065740585327,
"learning_rate": 0.0006200061938680707,
"loss": 2.6278,
"step": 10010
},
{
"epoch": 3.102183000464468,
"grad_norm": 1.0089902877807617,
"learning_rate": 0.0006206255806751317,
"loss": 2.6198,
"step": 10020
},
{
"epoch": 3.1052794550239975,
"grad_norm": 0.9710060358047485,
"learning_rate": 0.0006212449674821926,
"loss": 2.6299,
"step": 10030
},
{
"epoch": 3.108375909583527,
"grad_norm": 1.0112597942352295,
"learning_rate": 0.0006218643542892537,
"loss": 2.629,
"step": 10040
},
{
"epoch": 3.111472364143056,
"grad_norm": 0.8979578614234924,
"learning_rate": 0.0006224837410963147,
"loss": 2.6306,
"step": 10050
},
{
"epoch": 3.1145688187025855,
"grad_norm": 0.985578715801239,
"learning_rate": 0.0006231031279033757,
"loss": 2.6214,
"step": 10060
},
{
"epoch": 3.117665273262115,
"grad_norm": 1.0180467367172241,
"learning_rate": 0.0006237225147104368,
"loss": 2.6698,
"step": 10070
},
{
"epoch": 3.120761727821644,
"grad_norm": 0.9561509490013123,
"learning_rate": 0.0006243419015174977,
"loss": 2.6295,
"step": 10080
},
{
"epoch": 3.1238581823811735,
"grad_norm": 0.9035720229148865,
"learning_rate": 0.0006249612883245587,
"loss": 2.6356,
"step": 10090
},
{
"epoch": 3.126954636940703,
"grad_norm": 0.9758944511413574,
"learning_rate": 0.0006255806751316198,
"loss": 2.6373,
"step": 10100
},
{
"epoch": 3.130051091500232,
"grad_norm": 0.9201127290725708,
"learning_rate": 0.0006262000619386807,
"loss": 2.6354,
"step": 10110
},
{
"epoch": 3.1331475460597615,
"grad_norm": 0.9586511850357056,
"learning_rate": 0.0006268194487457417,
"loss": 2.6286,
"step": 10120
},
{
"epoch": 3.136244000619291,
"grad_norm": 1.3197758197784424,
"learning_rate": 0.0006274388355528027,
"loss": 2.6503,
"step": 10130
},
{
"epoch": 3.13934045517882,
"grad_norm": 1.4489221572875977,
"learning_rate": 0.0006280582223598637,
"loss": 2.667,
"step": 10140
},
{
"epoch": 3.1424369097383495,
"grad_norm": 1.1435356140136719,
"learning_rate": 0.0006286776091669247,
"loss": 2.6803,
"step": 10150
},
{
"epoch": 3.145533364297879,
"grad_norm": 5.218364238739014,
"learning_rate": 0.0006292969959739858,
"loss": 2.7482,
"step": 10160
},
{
"epoch": 3.1486298188574082,
"grad_norm": 1.0673755407333374,
"learning_rate": 0.0006299163827810468,
"loss": 2.6814,
"step": 10170
},
{
"epoch": 3.1517262734169376,
"grad_norm": 0.9964536428451538,
"learning_rate": 0.0006305357695881078,
"loss": 2.6468,
"step": 10180
},
{
"epoch": 3.154822727976467,
"grad_norm": 1.0818805694580078,
"learning_rate": 0.0006311551563951688,
"loss": 2.6687,
"step": 10190
},
{
"epoch": 3.1579191825359962,
"grad_norm": 1.0229182243347168,
"learning_rate": 0.0006317745432022298,
"loss": 2.632,
"step": 10200
},
{
"epoch": 3.1610156370955256,
"grad_norm": 0.9602491855621338,
"learning_rate": 0.0006323939300092908,
"loss": 2.6209,
"step": 10210
},
{
"epoch": 3.164112091655055,
"grad_norm": 1.0441064834594727,
"learning_rate": 0.0006330133168163518,
"loss": 2.6421,
"step": 10220
},
{
"epoch": 3.1672085462145843,
"grad_norm": 19.606216430664062,
"learning_rate": 0.0006336327036234129,
"loss": 2.6372,
"step": 10230
},
{
"epoch": 3.1703050007741136,
"grad_norm": 1.115622878074646,
"learning_rate": 0.0006342520904304738,
"loss": 2.6775,
"step": 10240
},
{
"epoch": 3.173401455333643,
"grad_norm": 1.1430797576904297,
"learning_rate": 0.0006348714772375348,
"loss": 2.6415,
"step": 10250
},
{
"epoch": 3.1764979098931723,
"grad_norm": 7.035722255706787,
"learning_rate": 0.0006354908640445959,
"loss": 2.6995,
"step": 10260
},
{
"epoch": 3.1795943644527016,
"grad_norm": 1.2375656366348267,
"learning_rate": 0.0006361102508516569,
"loss": 2.7278,
"step": 10270
},
{
"epoch": 3.182690819012231,
"grad_norm": 1.0868054628372192,
"learning_rate": 0.0006367296376587179,
"loss": 2.6475,
"step": 10280
},
{
"epoch": 3.1857872735717603,
"grad_norm": 1.0047295093536377,
"learning_rate": 0.000637349024465779,
"loss": 2.6195,
"step": 10290
},
{
"epoch": 3.1888837281312896,
"grad_norm": 0.9876299500465393,
"learning_rate": 0.0006379684112728399,
"loss": 2.6392,
"step": 10300
},
{
"epoch": 3.191980182690819,
"grad_norm": 1.021812081336975,
"learning_rate": 0.0006385877980799008,
"loss": 2.6468,
"step": 10310
},
{
"epoch": 3.1950766372503483,
"grad_norm": 0.954329788684845,
"learning_rate": 0.0006392071848869619,
"loss": 2.6368,
"step": 10320
},
{
"epoch": 3.1981730918098776,
"grad_norm": 0.9458587169647217,
"learning_rate": 0.0006398265716940229,
"loss": 2.6368,
"step": 10330
},
{
"epoch": 3.201269546369407,
"grad_norm": 1.0526219606399536,
"learning_rate": 0.0006404459585010839,
"loss": 2.6389,
"step": 10340
},
{
"epoch": 3.2043660009289363,
"grad_norm": 1.1330630779266357,
"learning_rate": 0.0006410653453081449,
"loss": 2.6666,
"step": 10350
},
{
"epoch": 3.2074624554884656,
"grad_norm": 1.0522410869598389,
"learning_rate": 0.0006416847321152059,
"loss": 2.6405,
"step": 10360
},
{
"epoch": 3.210558910047995,
"grad_norm": 0.973717451095581,
"learning_rate": 0.0006423041189222669,
"loss": 2.6198,
"step": 10370
},
{
"epoch": 3.2136553646075243,
"grad_norm": 0.9188945889472961,
"learning_rate": 0.000642923505729328,
"loss": 2.6478,
"step": 10380
},
{
"epoch": 3.2167518191670537,
"grad_norm": 0.9480977654457092,
"learning_rate": 0.000643542892536389,
"loss": 2.635,
"step": 10390
},
{
"epoch": 3.219848273726583,
"grad_norm": 0.9224624633789062,
"learning_rate": 0.0006441622793434499,
"loss": 2.6362,
"step": 10400
},
{
"epoch": 3.2229447282861123,
"grad_norm": 0.9448727965354919,
"learning_rate": 0.000644781666150511,
"loss": 2.6215,
"step": 10410
},
{
"epoch": 3.2260411828456417,
"grad_norm": 0.9381209015846252,
"learning_rate": 0.000645401052957572,
"loss": 2.6305,
"step": 10420
},
{
"epoch": 3.229137637405171,
"grad_norm": 1.0034310817718506,
"learning_rate": 0.000646020439764633,
"loss": 2.6344,
"step": 10430
},
{
"epoch": 3.2322340919647004,
"grad_norm": 0.9512182474136353,
"learning_rate": 0.0006466398265716941,
"loss": 2.6397,
"step": 10440
},
{
"epoch": 3.2353305465242297,
"grad_norm": 0.9563096761703491,
"learning_rate": 0.0006472592133787551,
"loss": 2.6354,
"step": 10450
},
{
"epoch": 3.238427001083759,
"grad_norm": 1.0199220180511475,
"learning_rate": 0.000647878600185816,
"loss": 2.6379,
"step": 10460
},
{
"epoch": 3.2415234556432884,
"grad_norm": 0.9473974108695984,
"learning_rate": 0.0006484979869928771,
"loss": 2.6262,
"step": 10470
},
{
"epoch": 3.2446199102028177,
"grad_norm": 0.9869408011436462,
"learning_rate": 0.0006491173737999381,
"loss": 2.6495,
"step": 10480
},
{
"epoch": 3.247716364762347,
"grad_norm": 0.9925758242607117,
"learning_rate": 0.0006497367606069991,
"loss": 2.631,
"step": 10490
},
{
"epoch": 3.2508128193218764,
"grad_norm": 1.048644781112671,
"learning_rate": 0.0006503561474140602,
"loss": 2.6145,
"step": 10500
},
{
"epoch": 3.2539092738814057,
"grad_norm": 0.9119939804077148,
"learning_rate": 0.0006509755342211212,
"loss": 2.6249,
"step": 10510
},
{
"epoch": 3.257005728440935,
"grad_norm": 1.0462340116500854,
"learning_rate": 0.000651594921028182,
"loss": 2.6343,
"step": 10520
},
{
"epoch": 3.2601021830004644,
"grad_norm": 0.9970148801803589,
"learning_rate": 0.0006522143078352431,
"loss": 2.6353,
"step": 10530
},
{
"epoch": 3.2631986375599937,
"grad_norm": 0.9585279822349548,
"learning_rate": 0.0006528336946423041,
"loss": 2.6223,
"step": 10540
},
{
"epoch": 3.266295092119523,
"grad_norm": 1.0489411354064941,
"learning_rate": 0.0006534530814493651,
"loss": 2.6246,
"step": 10550
},
{
"epoch": 3.2693915466790524,
"grad_norm": 0.9942703247070312,
"learning_rate": 0.0006540724682564262,
"loss": 2.6163,
"step": 10560
},
{
"epoch": 3.2724880012385817,
"grad_norm": 1.0939925909042358,
"learning_rate": 0.0006546918550634871,
"loss": 2.6485,
"step": 10570
},
{
"epoch": 3.275584455798111,
"grad_norm": 0.9639611840248108,
"learning_rate": 0.0006553112418705481,
"loss": 2.6369,
"step": 10580
},
{
"epoch": 3.2786809103576404,
"grad_norm": 1.003915786743164,
"learning_rate": 0.0006559306286776092,
"loss": 2.6324,
"step": 10590
},
{
"epoch": 3.2817773649171698,
"grad_norm": 1.07323157787323,
"learning_rate": 0.0006565500154846702,
"loss": 2.6239,
"step": 10600
},
{
"epoch": 3.284873819476699,
"grad_norm": 0.9782385230064392,
"learning_rate": 0.0006571694022917312,
"loss": 2.6276,
"step": 10610
},
{
"epoch": 3.2879702740362284,
"grad_norm": 0.9947441816329956,
"learning_rate": 0.0006577887890987922,
"loss": 2.6258,
"step": 10620
},
{
"epoch": 3.2910667285957578,
"grad_norm": 0.9401261806488037,
"learning_rate": 0.0006584081759058532,
"loss": 2.6054,
"step": 10630
},
{
"epoch": 3.294163183155287,
"grad_norm": 0.9426921606063843,
"learning_rate": 0.0006590275627129142,
"loss": 2.634,
"step": 10640
},
{
"epoch": 3.2972596377148164,
"grad_norm": 0.9457327127456665,
"learning_rate": 0.0006596469495199753,
"loss": 2.6319,
"step": 10650
},
{
"epoch": 3.300356092274346,
"grad_norm": 1.1993708610534668,
"learning_rate": 0.0006602663363270363,
"loss": 2.6365,
"step": 10660
},
{
"epoch": 3.303452546833875,
"grad_norm": 0.9889876842498779,
"learning_rate": 0.0006608857231340973,
"loss": 2.6503,
"step": 10670
},
{
"epoch": 3.3065490013934045,
"grad_norm": 0.9788354635238647,
"learning_rate": 0.0006615051099411583,
"loss": 2.641,
"step": 10680
},
{
"epoch": 3.309645455952934,
"grad_norm": 0.9262669682502747,
"learning_rate": 0.0006621244967482193,
"loss": 2.6473,
"step": 10690
},
{
"epoch": 3.312741910512463,
"grad_norm": 0.9675087332725525,
"learning_rate": 0.0006627438835552803,
"loss": 2.6425,
"step": 10700
},
{
"epoch": 3.3158383650719925,
"grad_norm": 0.9308109879493713,
"learning_rate": 0.0006633632703623414,
"loss": 2.6425,
"step": 10710
},
{
"epoch": 3.318934819631522,
"grad_norm": 0.9837930202484131,
"learning_rate": 0.0006639826571694024,
"loss": 2.6309,
"step": 10720
},
{
"epoch": 3.322031274191051,
"grad_norm": 0.9883390069007874,
"learning_rate": 0.0006646020439764632,
"loss": 2.5976,
"step": 10730
},
{
"epoch": 3.3251277287505805,
"grad_norm": 0.9393827319145203,
"learning_rate": 0.0006652214307835243,
"loss": 2.6229,
"step": 10740
},
{
"epoch": 3.32822418331011,
"grad_norm": 0.9329293370246887,
"learning_rate": 0.0006658408175905853,
"loss": 2.6102,
"step": 10750
},
{
"epoch": 3.331320637869639,
"grad_norm": 0.8954689502716064,
"learning_rate": 0.0006664602043976463,
"loss": 2.6578,
"step": 10760
},
{
"epoch": 3.3344170924291685,
"grad_norm": 0.92784583568573,
"learning_rate": 0.0006670795912047074,
"loss": 2.6127,
"step": 10770
},
{
"epoch": 3.337513546988698,
"grad_norm": 0.9678082466125488,
"learning_rate": 0.0006676989780117683,
"loss": 2.6097,
"step": 10780
},
{
"epoch": 3.340610001548227,
"grad_norm": 0.9594787955284119,
"learning_rate": 0.0006683183648188293,
"loss": 2.6068,
"step": 10790
},
{
"epoch": 3.3437064561077565,
"grad_norm": 0.9914245009422302,
"learning_rate": 0.0006689377516258904,
"loss": 2.6173,
"step": 10800
},
{
"epoch": 3.346802910667286,
"grad_norm": 0.9944581985473633,
"learning_rate": 0.0006695571384329514,
"loss": 2.6229,
"step": 10810
},
{
"epoch": 3.349899365226815,
"grad_norm": 1.0383622646331787,
"learning_rate": 0.0006701765252400124,
"loss": 2.6341,
"step": 10820
},
{
"epoch": 3.3529958197863445,
"grad_norm": 1.034728765487671,
"learning_rate": 0.0006707959120470735,
"loss": 2.637,
"step": 10830
},
{
"epoch": 3.356092274345874,
"grad_norm": 1.0271577835083008,
"learning_rate": 0.0006714152988541344,
"loss": 2.6387,
"step": 10840
},
{
"epoch": 3.359188728905403,
"grad_norm": 0.9783453345298767,
"learning_rate": 0.0006720346856611954,
"loss": 2.5978,
"step": 10850
},
{
"epoch": 3.3622851834649325,
"grad_norm": 0.9792416095733643,
"learning_rate": 0.0006726540724682565,
"loss": 2.6349,
"step": 10860
},
{
"epoch": 3.365381638024462,
"grad_norm": 0.9500912427902222,
"learning_rate": 0.0006732734592753175,
"loss": 2.6087,
"step": 10870
},
{
"epoch": 3.3684780925839912,
"grad_norm": 0.9641538262367249,
"learning_rate": 0.0006738928460823785,
"loss": 2.613,
"step": 10880
},
{
"epoch": 3.3715745471435206,
"grad_norm": 0.957671582698822,
"learning_rate": 0.0006745122328894394,
"loss": 2.6215,
"step": 10890
},
{
"epoch": 3.37467100170305,
"grad_norm": 1.0475072860717773,
"learning_rate": 0.0006751316196965005,
"loss": 2.6092,
"step": 10900
},
{
"epoch": 3.3777674562625792,
"grad_norm": 0.96811842918396,
"learning_rate": 0.0006757510065035615,
"loss": 2.6134,
"step": 10910
},
{
"epoch": 3.3808639108221086,
"grad_norm": 1.0156564712524414,
"learning_rate": 0.0006763703933106225,
"loss": 2.6112,
"step": 10920
},
{
"epoch": 3.383960365381638,
"grad_norm": 1.0434483289718628,
"learning_rate": 0.0006769897801176836,
"loss": 2.6183,
"step": 10930
},
{
"epoch": 3.3870568199411673,
"grad_norm": 0.9763379096984863,
"learning_rate": 0.0006776091669247445,
"loss": 2.6063,
"step": 10940
},
{
"epoch": 3.3901532745006966,
"grad_norm": 1.0185160636901855,
"learning_rate": 0.0006782285537318054,
"loss": 2.6491,
"step": 10950
},
{
"epoch": 3.393249729060226,
"grad_norm": 0.9660173058509827,
"learning_rate": 0.0006788479405388665,
"loss": 2.6164,
"step": 10960
},
{
"epoch": 3.3963461836197553,
"grad_norm": 0.9487093091011047,
"learning_rate": 0.0006794673273459275,
"loss": 2.614,
"step": 10970
},
{
"epoch": 3.3994426381792846,
"grad_norm": 0.9912219643592834,
"learning_rate": 0.0006800867141529885,
"loss": 2.6369,
"step": 10980
},
{
"epoch": 3.402539092738814,
"grad_norm": 0.9763176441192627,
"learning_rate": 0.0006807061009600496,
"loss": 2.6003,
"step": 10990
},
{
"epoch": 3.4056355472983433,
"grad_norm": 1.007444977760315,
"learning_rate": 0.0006813254877671105,
"loss": 2.6294,
"step": 11000
}
],
"logging_steps": 10,
"max_steps": 161450,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3128782178793882e+18,
"train_batch_size": 24,
"trial_name": null,
"trial_params": null
}