{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.4056355472983433, "eval_steps": 500, "global_step": 11000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003096454559529339, "grad_norm": 7.865213871002197, "learning_rate": 6.193868070610096e-07, "loss": 10.5439, "step": 10 }, { "epoch": 0.006192909119058678, "grad_norm": 5.684272289276123, "learning_rate": 1.2387736141220192e-06, "loss": 10.2888, "step": 20 }, { "epoch": 0.009289363678588018, "grad_norm": 4.032341003417969, "learning_rate": 1.8581604211830287e-06, "loss": 9.9454, "step": 30 }, { "epoch": 0.012385818238117356, "grad_norm": 3.232361316680908, "learning_rate": 2.4775472282440385e-06, "loss": 9.6908, "step": 40 }, { "epoch": 0.015482272797646695, "grad_norm": 2.7629575729370117, "learning_rate": 3.096934035305048e-06, "loss": 9.491, "step": 50 }, { "epoch": 0.018578727357176035, "grad_norm": 2.439429998397827, "learning_rate": 3.7163208423660575e-06, "loss": 9.3421, "step": 60 }, { "epoch": 0.021675181916705373, "grad_norm": 2.311237335205078, "learning_rate": 4.335707649427067e-06, "loss": 9.2172, "step": 70 }, { "epoch": 0.02477163647623471, "grad_norm": 2.1415603160858154, "learning_rate": 4.955094456488077e-06, "loss": 9.1165, "step": 80 }, { "epoch": 0.02786809103576405, "grad_norm": 2.0442802906036377, "learning_rate": 5.574481263549087e-06, "loss": 9.0171, "step": 90 }, { "epoch": 0.03096454559529339, "grad_norm": 2.0417075157165527, "learning_rate": 6.193868070610096e-06, "loss": 8.9188, "step": 100 }, { "epoch": 0.034061000154822725, "grad_norm": 1.906326413154602, "learning_rate": 6.813254877671105e-06, "loss": 8.817, "step": 110 }, { "epoch": 0.03715745471435207, "grad_norm": 1.876010537147522, "learning_rate": 7.432641684732115e-06, "loss": 8.7205, "step": 120 }, { "epoch": 0.04025390927388141, "grad_norm": 1.7335777282714844, "learning_rate": 8.052028491793125e-06, "loss": 8.6376, "step": 130 }, { "epoch": 0.043350363833410746, "grad_norm": 1.6829620599746704, "learning_rate": 8.671415298854134e-06, "loss": 8.5273, "step": 140 }, { "epoch": 0.046446818392940084, "grad_norm": 1.6329585313796997, "learning_rate": 9.290802105915144e-06, "loss": 8.4292, "step": 150 }, { "epoch": 0.04954327295246942, "grad_norm": 1.62351655960083, "learning_rate": 9.910188912976154e-06, "loss": 8.3279, "step": 160 }, { "epoch": 0.05263972751199876, "grad_norm": 1.5334705114364624, "learning_rate": 1.0529575720037164e-05, "loss": 8.2018, "step": 170 }, { "epoch": 0.0557361820715281, "grad_norm": 1.5242592096328735, "learning_rate": 1.1148962527098173e-05, "loss": 8.1007, "step": 180 }, { "epoch": 0.058832636631057436, "grad_norm": 1.5945011377334595, "learning_rate": 1.1768349334159183e-05, "loss": 7.972, "step": 190 }, { "epoch": 0.06192909119058678, "grad_norm": 1.3093743324279785, "learning_rate": 1.2387736141220193e-05, "loss": 7.8736, "step": 200 }, { "epoch": 0.06502554575011611, "grad_norm": 1.3056074380874634, "learning_rate": 1.30071229482812e-05, "loss": 7.7617, "step": 210 }, { "epoch": 0.06812200030964545, "grad_norm": 1.2901231050491333, "learning_rate": 1.362650975534221e-05, "loss": 7.6573, "step": 220 }, { "epoch": 0.07121845486917479, "grad_norm": 1.0811238288879395, "learning_rate": 1.424589656240322e-05, "loss": 7.5707, "step": 230 }, { "epoch": 0.07431490942870414, "grad_norm": 0.9134311676025391, "learning_rate": 1.486528336946423e-05, "loss": 7.4959, "step": 240 }, { "epoch": 0.07741136398823348, "grad_norm": 0.9673048257827759, "learning_rate": 1.548467017652524e-05, "loss": 7.4314, "step": 250 }, { "epoch": 0.08050781854776282, "grad_norm": 1.0383951663970947, "learning_rate": 1.610405698358625e-05, "loss": 7.3523, "step": 260 }, { "epoch": 0.08360427310729215, "grad_norm": 1.0910584926605225, "learning_rate": 1.6723443790647262e-05, "loss": 7.3133, "step": 270 }, { "epoch": 0.08670072766682149, "grad_norm": 0.804308295249939, "learning_rate": 1.734283059770827e-05, "loss": 7.2522, "step": 280 }, { "epoch": 0.08979718222635083, "grad_norm": 0.9341151714324951, "learning_rate": 1.796221740476928e-05, "loss": 7.2261, "step": 290 }, { "epoch": 0.09289363678588017, "grad_norm": 0.8165347576141357, "learning_rate": 1.8581604211830288e-05, "loss": 7.2088, "step": 300 }, { "epoch": 0.0959900913454095, "grad_norm": 0.6941328644752502, "learning_rate": 1.9200991018891298e-05, "loss": 7.1554, "step": 310 }, { "epoch": 0.09908654590493884, "grad_norm": 0.7364155650138855, "learning_rate": 1.9820377825952308e-05, "loss": 7.1313, "step": 320 }, { "epoch": 0.10218300046446818, "grad_norm": 1.3144842386245728, "learning_rate": 2.0439764633013317e-05, "loss": 7.1198, "step": 330 }, { "epoch": 0.10527945502399752, "grad_norm": 0.703687846660614, "learning_rate": 2.1059151440074327e-05, "loss": 7.0936, "step": 340 }, { "epoch": 0.10837590958352686, "grad_norm": 0.7936609387397766, "learning_rate": 2.1678538247135337e-05, "loss": 7.0966, "step": 350 }, { "epoch": 0.1114723641430562, "grad_norm": 0.9979026317596436, "learning_rate": 2.2297925054196347e-05, "loss": 7.0917, "step": 360 }, { "epoch": 0.11456881870258553, "grad_norm": 0.8398326635360718, "learning_rate": 2.2917311861257356e-05, "loss": 7.0791, "step": 370 }, { "epoch": 0.11766527326211487, "grad_norm": 0.7220719456672668, "learning_rate": 2.3536698668318366e-05, "loss": 7.057, "step": 380 }, { "epoch": 0.12076172782164422, "grad_norm": 0.8845738172531128, "learning_rate": 2.4156085475379376e-05, "loss": 7.0476, "step": 390 }, { "epoch": 0.12385818238117356, "grad_norm": 0.8084824085235596, "learning_rate": 2.4775472282440385e-05, "loss": 7.0369, "step": 400 }, { "epoch": 0.1269546369407029, "grad_norm": 0.7229199409484863, "learning_rate": 2.5394859089501395e-05, "loss": 7.0193, "step": 410 }, { "epoch": 0.13005109150023222, "grad_norm": 0.7450975179672241, "learning_rate": 2.60142458965624e-05, "loss": 7.0136, "step": 420 }, { "epoch": 0.13314754605976156, "grad_norm": 1.1810022592544556, "learning_rate": 2.6633632703623415e-05, "loss": 7.0257, "step": 430 }, { "epoch": 0.1362440006192909, "grad_norm": 0.724097728729248, "learning_rate": 2.725301951068442e-05, "loss": 7.0076, "step": 440 }, { "epoch": 0.13934045517882024, "grad_norm": 0.8406842350959778, "learning_rate": 2.7872406317745434e-05, "loss": 6.9976, "step": 450 }, { "epoch": 0.14243690973834958, "grad_norm": 0.8269332647323608, "learning_rate": 2.849179312480644e-05, "loss": 6.9812, "step": 460 }, { "epoch": 0.14553336429787894, "grad_norm": 0.7661322355270386, "learning_rate": 2.9111179931867453e-05, "loss": 7.0072, "step": 470 }, { "epoch": 0.14862981885740828, "grad_norm": 0.6673895120620728, "learning_rate": 2.973056673892846e-05, "loss": 6.9775, "step": 480 }, { "epoch": 0.15172627341693762, "grad_norm": 1.1476161479949951, "learning_rate": 3.0349953545989473e-05, "loss": 6.9496, "step": 490 }, { "epoch": 0.15482272797646696, "grad_norm": 1.0809210538864136, "learning_rate": 3.096934035305048e-05, "loss": 6.9578, "step": 500 }, { "epoch": 0.1579191825359963, "grad_norm": 0.8364447951316833, "learning_rate": 3.158872716011149e-05, "loss": 6.9371, "step": 510 }, { "epoch": 0.16101563709552563, "grad_norm": 0.9381659030914307, "learning_rate": 3.22081139671725e-05, "loss": 6.9373, "step": 520 }, { "epoch": 0.16411209165505497, "grad_norm": 0.8810213804244995, "learning_rate": 3.2827500774233505e-05, "loss": 6.9463, "step": 530 }, { "epoch": 0.1672085462145843, "grad_norm": 0.8275142908096313, "learning_rate": 3.3446887581294525e-05, "loss": 6.932, "step": 540 }, { "epoch": 0.17030500077411365, "grad_norm": 0.6804556846618652, "learning_rate": 3.406627438835553e-05, "loss": 6.9181, "step": 550 }, { "epoch": 0.17340145533364298, "grad_norm": 0.7559427618980408, "learning_rate": 3.468566119541654e-05, "loss": 6.9202, "step": 560 }, { "epoch": 0.17649790989317232, "grad_norm": 0.6762346029281616, "learning_rate": 3.5305048002477544e-05, "loss": 6.9081, "step": 570 }, { "epoch": 0.17959436445270166, "grad_norm": 0.6671234369277954, "learning_rate": 3.592443480953856e-05, "loss": 6.9216, "step": 580 }, { "epoch": 0.182690819012231, "grad_norm": 0.9335949420928955, "learning_rate": 3.654382161659957e-05, "loss": 6.9034, "step": 590 }, { "epoch": 0.18578727357176034, "grad_norm": 0.9805537462234497, "learning_rate": 3.7163208423660576e-05, "loss": 6.895, "step": 600 }, { "epoch": 0.18888372813128967, "grad_norm": 0.8761160969734192, "learning_rate": 3.778259523072158e-05, "loss": 6.9029, "step": 610 }, { "epoch": 0.191980182690819, "grad_norm": 0.8361015915870667, "learning_rate": 3.8401982037782596e-05, "loss": 6.8819, "step": 620 }, { "epoch": 0.19507663725034835, "grad_norm": 0.6740533709526062, "learning_rate": 3.902136884484361e-05, "loss": 6.8882, "step": 630 }, { "epoch": 0.1981730918098777, "grad_norm": 0.8334875702857971, "learning_rate": 3.9640755651904615e-05, "loss": 6.8917, "step": 640 }, { "epoch": 0.20126954636940703, "grad_norm": 0.7946698665618896, "learning_rate": 4.026014245896562e-05, "loss": 6.8712, "step": 650 }, { "epoch": 0.20436600092893636, "grad_norm": 1.1773180961608887, "learning_rate": 4.0879529266026635e-05, "loss": 6.8963, "step": 660 }, { "epoch": 0.2074624554884657, "grad_norm": 0.6932355165481567, "learning_rate": 4.149891607308765e-05, "loss": 6.8718, "step": 670 }, { "epoch": 0.21055891004799504, "grad_norm": 0.8239333629608154, "learning_rate": 4.2118302880148654e-05, "loss": 6.8549, "step": 680 }, { "epoch": 0.21365536460752438, "grad_norm": 0.8844727873802185, "learning_rate": 4.273768968720966e-05, "loss": 6.8687, "step": 690 }, { "epoch": 0.21675181916705372, "grad_norm": 0.8168037533760071, "learning_rate": 4.3357076494270674e-05, "loss": 6.8457, "step": 700 }, { "epoch": 0.21984827372658305, "grad_norm": 0.7363680601119995, "learning_rate": 4.397646330133168e-05, "loss": 6.8538, "step": 710 }, { "epoch": 0.2229447282861124, "grad_norm": 0.9639245867729187, "learning_rate": 4.459585010839269e-05, "loss": 6.855, "step": 720 }, { "epoch": 0.22604118284564173, "grad_norm": 0.7763282656669617, "learning_rate": 4.52152369154537e-05, "loss": 6.8254, "step": 730 }, { "epoch": 0.22913763740517107, "grad_norm": 1.482752799987793, "learning_rate": 4.583462372251471e-05, "loss": 6.8331, "step": 740 }, { "epoch": 0.2322340919647004, "grad_norm": 0.8456624150276184, "learning_rate": 4.645401052957572e-05, "loss": 6.8513, "step": 750 }, { "epoch": 0.23533054652422974, "grad_norm": 0.9166210889816284, "learning_rate": 4.707339733663673e-05, "loss": 6.8402, "step": 760 }, { "epoch": 0.23842700108375908, "grad_norm": 0.8375464677810669, "learning_rate": 4.769278414369774e-05, "loss": 6.8337, "step": 770 }, { "epoch": 0.24152345564328845, "grad_norm": 1.267236590385437, "learning_rate": 4.831217095075875e-05, "loss": 6.8193, "step": 780 }, { "epoch": 0.2446199102028178, "grad_norm": 0.6456039547920227, "learning_rate": 4.893155775781976e-05, "loss": 6.7969, "step": 790 }, { "epoch": 0.24771636476234712, "grad_norm": 0.8981896638870239, "learning_rate": 4.955094456488077e-05, "loss": 6.8133, "step": 800 }, { "epoch": 0.25081281932187643, "grad_norm": 1.120186686515808, "learning_rate": 5.017033137194178e-05, "loss": 6.8056, "step": 810 }, { "epoch": 0.2539092738814058, "grad_norm": 2.292698621749878, "learning_rate": 5.078971817900279e-05, "loss": 6.8308, "step": 820 }, { "epoch": 0.2570057284409351, "grad_norm": 0.7018686532974243, "learning_rate": 5.1409104986063797e-05, "loss": 6.8154, "step": 830 }, { "epoch": 0.26010218300046445, "grad_norm": 0.8676766753196716, "learning_rate": 5.20284917931248e-05, "loss": 6.8134, "step": 840 }, { "epoch": 0.2631986375599938, "grad_norm": 1.0170965194702148, "learning_rate": 5.2647878600185816e-05, "loss": 6.8014, "step": 850 }, { "epoch": 0.2662950921195231, "grad_norm": 1.100301742553711, "learning_rate": 5.326726540724683e-05, "loss": 6.7759, "step": 860 }, { "epoch": 0.26939154667905246, "grad_norm": 0.9783535003662109, "learning_rate": 5.3886652214307835e-05, "loss": 6.7684, "step": 870 }, { "epoch": 0.2724880012385818, "grad_norm": 1.2189717292785645, "learning_rate": 5.450603902136884e-05, "loss": 6.7609, "step": 880 }, { "epoch": 0.27558445579811114, "grad_norm": 0.9612496495246887, "learning_rate": 5.5125425828429855e-05, "loss": 6.7852, "step": 890 }, { "epoch": 0.2786809103576405, "grad_norm": 1.201369047164917, "learning_rate": 5.574481263549087e-05, "loss": 6.7685, "step": 900 }, { "epoch": 0.2817773649171698, "grad_norm": 1.0445016622543335, "learning_rate": 5.6364199442551874e-05, "loss": 6.7863, "step": 910 }, { "epoch": 0.28487381947669915, "grad_norm": 0.9389632940292358, "learning_rate": 5.698358624961288e-05, "loss": 6.7803, "step": 920 }, { "epoch": 0.2879702740362285, "grad_norm": 1.522533655166626, "learning_rate": 5.7602973056673894e-05, "loss": 6.7642, "step": 930 }, { "epoch": 0.2910667285957579, "grad_norm": 0.5819054841995239, "learning_rate": 5.822235986373491e-05, "loss": 6.772, "step": 940 }, { "epoch": 0.2941631831552872, "grad_norm": 0.5492868423461914, "learning_rate": 5.884174667079591e-05, "loss": 6.7712, "step": 950 }, { "epoch": 0.29725963771481656, "grad_norm": 0.9563374519348145, "learning_rate": 5.946113347785692e-05, "loss": 6.7602, "step": 960 }, { "epoch": 0.3003560922743459, "grad_norm": 1.8112778663635254, "learning_rate": 6.0080520284917926e-05, "loss": 6.774, "step": 970 }, { "epoch": 0.30345254683387524, "grad_norm": 1.9124343395233154, "learning_rate": 6.0699907091978946e-05, "loss": 6.7692, "step": 980 }, { "epoch": 0.3065490013934046, "grad_norm": 1.0520577430725098, "learning_rate": 6.131929389903995e-05, "loss": 6.7624, "step": 990 }, { "epoch": 0.3096454559529339, "grad_norm": 0.9971650242805481, "learning_rate": 6.193868070610096e-05, "loss": 6.7597, "step": 1000 }, { "epoch": 0.31274191051246325, "grad_norm": 0.7130516171455383, "learning_rate": 6.255806751316196e-05, "loss": 6.7548, "step": 1010 }, { "epoch": 0.3158383650719926, "grad_norm": 0.8867819309234619, "learning_rate": 6.317745432022298e-05, "loss": 6.7416, "step": 1020 }, { "epoch": 0.3189348196315219, "grad_norm": 2.448023557662964, "learning_rate": 6.379684112728398e-05, "loss": 6.7675, "step": 1030 }, { "epoch": 0.32203127419105126, "grad_norm": 2.0288820266723633, "learning_rate": 6.4416227934345e-05, "loss": 6.7555, "step": 1040 }, { "epoch": 0.3251277287505806, "grad_norm": 0.645900309085846, "learning_rate": 6.503561474140602e-05, "loss": 6.7557, "step": 1050 }, { "epoch": 0.32822418331010994, "grad_norm": 0.7342972159385681, "learning_rate": 6.565500154846701e-05, "loss": 6.7452, "step": 1060 }, { "epoch": 0.3313206378696393, "grad_norm": 1.523195743560791, "learning_rate": 6.627438835552803e-05, "loss": 6.7476, "step": 1070 }, { "epoch": 0.3344170924291686, "grad_norm": 1.812499761581421, "learning_rate": 6.689377516258905e-05, "loss": 6.7432, "step": 1080 }, { "epoch": 0.33751354698869795, "grad_norm": 0.8007811307907104, "learning_rate": 6.751316196965004e-05, "loss": 6.7387, "step": 1090 }, { "epoch": 0.3406100015482273, "grad_norm": 1.449756145477295, "learning_rate": 6.813254877671106e-05, "loss": 6.7323, "step": 1100 }, { "epoch": 0.34370645610775663, "grad_norm": 1.145936369895935, "learning_rate": 6.875193558377207e-05, "loss": 6.7396, "step": 1110 }, { "epoch": 0.34680291066728597, "grad_norm": 1.155754804611206, "learning_rate": 6.937132239083308e-05, "loss": 6.7288, "step": 1120 }, { "epoch": 0.3498993652268153, "grad_norm": 1.3261879682540894, "learning_rate": 6.99907091978941e-05, "loss": 6.717, "step": 1130 }, { "epoch": 0.35299581978634464, "grad_norm": 2.5398218631744385, "learning_rate": 7.061009600495509e-05, "loss": 6.7055, "step": 1140 }, { "epoch": 0.356092274345874, "grad_norm": 0.6757873892784119, "learning_rate": 7.122948281201611e-05, "loss": 6.7242, "step": 1150 }, { "epoch": 0.3591887289054033, "grad_norm": 0.8870462775230408, "learning_rate": 7.184886961907711e-05, "loss": 6.7241, "step": 1160 }, { "epoch": 0.36228518346493266, "grad_norm": 2.03185772895813, "learning_rate": 7.246825642613812e-05, "loss": 6.7364, "step": 1170 }, { "epoch": 0.365381638024462, "grad_norm": 1.013759970664978, "learning_rate": 7.308764323319914e-05, "loss": 6.7151, "step": 1180 }, { "epoch": 0.36847809258399133, "grad_norm": 1.6533416509628296, "learning_rate": 7.370703004026015e-05, "loss": 6.7207, "step": 1190 }, { "epoch": 0.3715745471435207, "grad_norm": 1.0296862125396729, "learning_rate": 7.432641684732115e-05, "loss": 6.7154, "step": 1200 }, { "epoch": 0.37467100170305, "grad_norm": 0.7925991415977478, "learning_rate": 7.494580365438217e-05, "loss": 6.7036, "step": 1210 }, { "epoch": 0.37776745626257935, "grad_norm": 1.123253345489502, "learning_rate": 7.556519046144317e-05, "loss": 6.6981, "step": 1220 }, { "epoch": 0.3808639108221087, "grad_norm": 1.2927206754684448, "learning_rate": 7.618457726850419e-05, "loss": 6.7105, "step": 1230 }, { "epoch": 0.383960365381638, "grad_norm": 1.2877053022384644, "learning_rate": 7.680396407556519e-05, "loss": 6.7046, "step": 1240 }, { "epoch": 0.38705681994116736, "grad_norm": 1.5025876760482788, "learning_rate": 7.74233508826262e-05, "loss": 6.7097, "step": 1250 }, { "epoch": 0.3901532745006967, "grad_norm": 1.8476455211639404, "learning_rate": 7.804273768968722e-05, "loss": 6.7091, "step": 1260 }, { "epoch": 0.39324972906022604, "grad_norm": 1.1083704233169556, "learning_rate": 7.866212449674822e-05, "loss": 6.7281, "step": 1270 }, { "epoch": 0.3963461836197554, "grad_norm": 1.9753637313842773, "learning_rate": 7.928151130380923e-05, "loss": 6.6795, "step": 1280 }, { "epoch": 0.3994426381792847, "grad_norm": 0.6769999265670776, "learning_rate": 7.990089811087024e-05, "loss": 6.7043, "step": 1290 }, { "epoch": 0.40253909273881405, "grad_norm": 1.1025127172470093, "learning_rate": 8.052028491793124e-05, "loss": 6.673, "step": 1300 }, { "epoch": 0.4056355472983434, "grad_norm": 1.132672667503357, "learning_rate": 8.113967172499226e-05, "loss": 6.6962, "step": 1310 }, { "epoch": 0.40873200185787273, "grad_norm": 3.0605337619781494, "learning_rate": 8.175905853205327e-05, "loss": 6.693, "step": 1320 }, { "epoch": 0.41182845641740207, "grad_norm": 1.0931648015975952, "learning_rate": 8.237844533911428e-05, "loss": 6.6865, "step": 1330 }, { "epoch": 0.4149249109769314, "grad_norm": 1.2315603494644165, "learning_rate": 8.29978321461753e-05, "loss": 6.6753, "step": 1340 }, { "epoch": 0.41802136553646074, "grad_norm": 1.4472100734710693, "learning_rate": 8.36172189532363e-05, "loss": 6.6882, "step": 1350 }, { "epoch": 0.4211178200959901, "grad_norm": 1.6784274578094482, "learning_rate": 8.423660576029731e-05, "loss": 6.6776, "step": 1360 }, { "epoch": 0.4242142746555194, "grad_norm": 2.4951741695404053, "learning_rate": 8.485599256735831e-05, "loss": 6.6813, "step": 1370 }, { "epoch": 0.42731072921504876, "grad_norm": 2.3850290775299072, "learning_rate": 8.547537937441932e-05, "loss": 6.6729, "step": 1380 }, { "epoch": 0.4304071837745781, "grad_norm": 0.8592017889022827, "learning_rate": 8.609476618148034e-05, "loss": 6.681, "step": 1390 }, { "epoch": 0.43350363833410743, "grad_norm": 1.178676962852478, "learning_rate": 8.671415298854135e-05, "loss": 6.6717, "step": 1400 }, { "epoch": 0.43660009289363677, "grad_norm": 1.6043617725372314, "learning_rate": 8.733353979560235e-05, "loss": 6.6647, "step": 1410 }, { "epoch": 0.4396965474531661, "grad_norm": 0.872035562992096, "learning_rate": 8.795292660266336e-05, "loss": 6.666, "step": 1420 }, { "epoch": 0.44279300201269545, "grad_norm": 1.1680723428726196, "learning_rate": 8.857231340972438e-05, "loss": 6.6622, "step": 1430 }, { "epoch": 0.4458894565722248, "grad_norm": 0.8795621991157532, "learning_rate": 8.919170021678539e-05, "loss": 6.64, "step": 1440 }, { "epoch": 0.4489859111317541, "grad_norm": 1.5785902738571167, "learning_rate": 8.981108702384639e-05, "loss": 6.6613, "step": 1450 }, { "epoch": 0.45208236569128346, "grad_norm": 1.319611668586731, "learning_rate": 9.04304738309074e-05, "loss": 6.6573, "step": 1460 }, { "epoch": 0.4551788202508128, "grad_norm": 1.0796053409576416, "learning_rate": 9.104986063796842e-05, "loss": 6.6614, "step": 1470 }, { "epoch": 0.45827527481034214, "grad_norm": 1.2139097452163696, "learning_rate": 9.166924744502942e-05, "loss": 6.6461, "step": 1480 }, { "epoch": 0.4613717293698715, "grad_norm": 1.3955761194229126, "learning_rate": 9.228863425209043e-05, "loss": 6.6611, "step": 1490 }, { "epoch": 0.4644681839294008, "grad_norm": 1.5178614854812622, "learning_rate": 9.290802105915144e-05, "loss": 6.6615, "step": 1500 }, { "epoch": 0.46756463848893015, "grad_norm": 1.3112921714782715, "learning_rate": 9.352740786621246e-05, "loss": 6.6644, "step": 1510 }, { "epoch": 0.4706610930484595, "grad_norm": 1.5961909294128418, "learning_rate": 9.414679467327346e-05, "loss": 6.673, "step": 1520 }, { "epoch": 0.4737575476079888, "grad_norm": 1.0166618824005127, "learning_rate": 9.476618148033447e-05, "loss": 6.647, "step": 1530 }, { "epoch": 0.47685400216751817, "grad_norm": 1.2850325107574463, "learning_rate": 9.538556828739548e-05, "loss": 6.6536, "step": 1540 }, { "epoch": 0.4799504567270475, "grad_norm": 1.1776533126831055, "learning_rate": 9.600495509445648e-05, "loss": 6.6446, "step": 1550 }, { "epoch": 0.4830469112865769, "grad_norm": 1.783477544784546, "learning_rate": 9.66243419015175e-05, "loss": 6.6353, "step": 1560 }, { "epoch": 0.48614336584610623, "grad_norm": 1.7229933738708496, "learning_rate": 9.724372870857851e-05, "loss": 6.6363, "step": 1570 }, { "epoch": 0.4892398204056356, "grad_norm": 0.9246505498886108, "learning_rate": 9.786311551563952e-05, "loss": 6.6616, "step": 1580 }, { "epoch": 0.4923362749651649, "grad_norm": 1.7007242441177368, "learning_rate": 9.848250232270054e-05, "loss": 6.6608, "step": 1590 }, { "epoch": 0.49543272952469425, "grad_norm": 1.145609974861145, "learning_rate": 9.910188912976154e-05, "loss": 6.6282, "step": 1600 }, { "epoch": 0.4985291840842236, "grad_norm": 1.1772605180740356, "learning_rate": 9.972127593682255e-05, "loss": 6.6463, "step": 1610 }, { "epoch": 0.5016256386437529, "grad_norm": 0.8392823338508606, "learning_rate": 0.00010034066274388355, "loss": 6.6387, "step": 1620 }, { "epoch": 0.5047220932032822, "grad_norm": 1.2767823934555054, "learning_rate": 0.00010096004955094456, "loss": 6.6571, "step": 1630 }, { "epoch": 0.5078185477628115, "grad_norm": 2.3833205699920654, "learning_rate": 0.00010157943635800558, "loss": 6.6407, "step": 1640 }, { "epoch": 0.5109150023223409, "grad_norm": 1.3098053932189941, "learning_rate": 0.00010219882316506659, "loss": 6.6357, "step": 1650 }, { "epoch": 0.5140114568818702, "grad_norm": 1.2075214385986328, "learning_rate": 0.00010281820997212759, "loss": 6.6368, "step": 1660 }, { "epoch": 0.5171079114413996, "grad_norm": 1.251852035522461, "learning_rate": 0.00010343759677918861, "loss": 6.6255, "step": 1670 }, { "epoch": 0.5202043660009289, "grad_norm": 1.3142434358596802, "learning_rate": 0.0001040569835862496, "loss": 6.6218, "step": 1680 }, { "epoch": 0.5233008205604582, "grad_norm": 2.4521663188934326, "learning_rate": 0.00010467637039331063, "loss": 6.6264, "step": 1690 }, { "epoch": 0.5263972751199876, "grad_norm": 1.0846492052078247, "learning_rate": 0.00010529575720037163, "loss": 6.6515, "step": 1700 }, { "epoch": 0.5294937296795169, "grad_norm": 1.6620179414749146, "learning_rate": 0.00010591514400743264, "loss": 6.6323, "step": 1710 }, { "epoch": 0.5325901842390462, "grad_norm": 0.8557600975036621, "learning_rate": 0.00010653453081449366, "loss": 6.6437, "step": 1720 }, { "epoch": 0.5356866387985756, "grad_norm": 0.6991952061653137, "learning_rate": 0.00010715391762155466, "loss": 6.6211, "step": 1730 }, { "epoch": 0.5387830933581049, "grad_norm": 1.5852376222610474, "learning_rate": 0.00010777330442861567, "loss": 6.5952, "step": 1740 }, { "epoch": 0.5418795479176343, "grad_norm": 1.642796516418457, "learning_rate": 0.00010839269123567669, "loss": 6.6295, "step": 1750 }, { "epoch": 0.5449760024771636, "grad_norm": 1.2764023542404175, "learning_rate": 0.00010901207804273768, "loss": 6.6294, "step": 1760 }, { "epoch": 0.5480724570366929, "grad_norm": 1.6868603229522705, "learning_rate": 0.0001096314648497987, "loss": 6.6292, "step": 1770 }, { "epoch": 0.5511689115962223, "grad_norm": 1.3303276300430298, "learning_rate": 0.00011025085165685971, "loss": 6.5998, "step": 1780 }, { "epoch": 0.5542653661557516, "grad_norm": 1.396274447441101, "learning_rate": 0.00011087023846392072, "loss": 6.6223, "step": 1790 }, { "epoch": 0.557361820715281, "grad_norm": 0.879639744758606, "learning_rate": 0.00011148962527098174, "loss": 6.612, "step": 1800 }, { "epoch": 0.5604582752748103, "grad_norm": 1.1366828680038452, "learning_rate": 0.00011210901207804273, "loss": 6.6122, "step": 1810 }, { "epoch": 0.5635547298343396, "grad_norm": 1.480747938156128, "learning_rate": 0.00011272839888510375, "loss": 6.6094, "step": 1820 }, { "epoch": 0.566651184393869, "grad_norm": 1.1296987533569336, "learning_rate": 0.00011334778569216477, "loss": 6.6194, "step": 1830 }, { "epoch": 0.5697476389533983, "grad_norm": 0.9196439385414124, "learning_rate": 0.00011396717249922576, "loss": 6.5923, "step": 1840 }, { "epoch": 0.5728440935129276, "grad_norm": 1.3304774761199951, "learning_rate": 0.00011458655930628678, "loss": 6.6129, "step": 1850 }, { "epoch": 0.575940548072457, "grad_norm": 1.071112871170044, "learning_rate": 0.00011520594611334779, "loss": 6.6095, "step": 1860 }, { "epoch": 0.5790370026319864, "grad_norm": 1.1381322145462036, "learning_rate": 0.0001158253329204088, "loss": 6.5962, "step": 1870 }, { "epoch": 0.5821334571915158, "grad_norm": 2.608501672744751, "learning_rate": 0.00011644471972746981, "loss": 6.6024, "step": 1880 }, { "epoch": 0.5852299117510451, "grad_norm": 1.4727625846862793, "learning_rate": 0.0001170641065345308, "loss": 6.5914, "step": 1890 }, { "epoch": 0.5883263663105744, "grad_norm": 1.192298173904419, "learning_rate": 0.00011768349334159183, "loss": 6.6072, "step": 1900 }, { "epoch": 0.5914228208701038, "grad_norm": 0.9773418307304382, "learning_rate": 0.00011830288014865285, "loss": 6.5805, "step": 1910 }, { "epoch": 0.5945192754296331, "grad_norm": 1.096369743347168, "learning_rate": 0.00011892226695571384, "loss": 6.6052, "step": 1920 }, { "epoch": 0.5976157299891625, "grad_norm": 1.2275642156600952, "learning_rate": 0.00011954165376277486, "loss": 6.594, "step": 1930 }, { "epoch": 0.6007121845486918, "grad_norm": 1.3209136724472046, "learning_rate": 0.00012016104056983585, "loss": 6.6078, "step": 1940 }, { "epoch": 0.6038086391082211, "grad_norm": 1.3680113554000854, "learning_rate": 0.00012078042737689687, "loss": 6.5793, "step": 1950 }, { "epoch": 0.6069050936677505, "grad_norm": 1.2960150241851807, "learning_rate": 0.00012139981418395789, "loss": 6.5969, "step": 1960 }, { "epoch": 0.6100015482272798, "grad_norm": 0.8884462118148804, "learning_rate": 0.00012201920099101888, "loss": 6.5862, "step": 1970 }, { "epoch": 0.6130980027868091, "grad_norm": 0.9539084434509277, "learning_rate": 0.0001226385877980799, "loss": 6.5797, "step": 1980 }, { "epoch": 0.6161944573463385, "grad_norm": 1.023714303970337, "learning_rate": 0.0001232579746051409, "loss": 6.5896, "step": 1990 }, { "epoch": 0.6192909119058678, "grad_norm": 1.0426772832870483, "learning_rate": 0.00012387736141220192, "loss": 6.6121, "step": 2000 }, { "epoch": 0.6223873664653972, "grad_norm": 1.4499601125717163, "learning_rate": 0.00012449674821926292, "loss": 6.6107, "step": 2010 }, { "epoch": 0.6254838210249265, "grad_norm": 1.2633146047592163, "learning_rate": 0.00012511613502632393, "loss": 6.5991, "step": 2020 }, { "epoch": 0.6285802755844558, "grad_norm": 0.845995306968689, "learning_rate": 0.00012573552183338496, "loss": 6.5722, "step": 2030 }, { "epoch": 0.6316767301439852, "grad_norm": 1.2431766986846924, "learning_rate": 0.00012635490864044597, "loss": 6.5958, "step": 2040 }, { "epoch": 0.6347731847035145, "grad_norm": 0.9436641335487366, "learning_rate": 0.00012697429544750698, "loss": 6.5901, "step": 2050 }, { "epoch": 0.6378696392630439, "grad_norm": 1.334149718284607, "learning_rate": 0.00012759368225456795, "loss": 6.5938, "step": 2060 }, { "epoch": 0.6409660938225732, "grad_norm": 0.9270686507225037, "learning_rate": 0.000128213069061629, "loss": 6.5767, "step": 2070 }, { "epoch": 0.6440625483821025, "grad_norm": 1.3940073251724243, "learning_rate": 0.00012883245586869, "loss": 6.5834, "step": 2080 }, { "epoch": 0.6471590029416319, "grad_norm": 1.163221001625061, "learning_rate": 0.000129451842675751, "loss": 6.5784, "step": 2090 }, { "epoch": 0.6502554575011612, "grad_norm": 0.9691527485847473, "learning_rate": 0.00013007122948281203, "loss": 6.5823, "step": 2100 }, { "epoch": 0.6533519120606905, "grad_norm": 0.7050260305404663, "learning_rate": 0.000130690616289873, "loss": 6.5847, "step": 2110 }, { "epoch": 0.6564483666202199, "grad_norm": 1.2201118469238281, "learning_rate": 0.00013131000309693402, "loss": 6.5952, "step": 2120 }, { "epoch": 0.6595448211797492, "grad_norm": 1.3519176244735718, "learning_rate": 0.00013192938990399505, "loss": 6.5822, "step": 2130 }, { "epoch": 0.6626412757392786, "grad_norm": 1.0712783336639404, "learning_rate": 0.00013254877671105606, "loss": 6.5677, "step": 2140 }, { "epoch": 0.6657377302988079, "grad_norm": 1.0584081411361694, "learning_rate": 0.00013316816351811707, "loss": 6.5859, "step": 2150 }, { "epoch": 0.6688341848583372, "grad_norm": 0.8563801050186157, "learning_rate": 0.0001337875503251781, "loss": 6.5902, "step": 2160 }, { "epoch": 0.6719306394178666, "grad_norm": 0.8715903162956238, "learning_rate": 0.00013440693713223908, "loss": 6.5875, "step": 2170 }, { "epoch": 0.6750270939773959, "grad_norm": 1.3086822032928467, "learning_rate": 0.00013502632393930008, "loss": 6.5905, "step": 2180 }, { "epoch": 0.6781235485369252, "grad_norm": 0.8140910267829895, "learning_rate": 0.00013564571074636112, "loss": 6.558, "step": 2190 }, { "epoch": 0.6812200030964546, "grad_norm": 0.8857564330101013, "learning_rate": 0.00013626509755342212, "loss": 6.5713, "step": 2200 }, { "epoch": 0.6843164576559839, "grad_norm": 1.4854942560195923, "learning_rate": 0.00013688448436048313, "loss": 6.5836, "step": 2210 }, { "epoch": 0.6874129122155133, "grad_norm": 1.4530035257339478, "learning_rate": 0.00013750387116754414, "loss": 6.5756, "step": 2220 }, { "epoch": 0.6905093667750426, "grad_norm": 0.8865880370140076, "learning_rate": 0.00013812325797460514, "loss": 6.5887, "step": 2230 }, { "epoch": 0.6936058213345719, "grad_norm": 0.8601120710372925, "learning_rate": 0.00013874264478166615, "loss": 6.5701, "step": 2240 }, { "epoch": 0.6967022758941013, "grad_norm": 0.8077085614204407, "learning_rate": 0.00013936203158872716, "loss": 6.5734, "step": 2250 }, { "epoch": 0.6997987304536306, "grad_norm": 0.7860495448112488, "learning_rate": 0.0001399814183957882, "loss": 6.5609, "step": 2260 }, { "epoch": 0.70289518501316, "grad_norm": 1.4957787990570068, "learning_rate": 0.00014060080520284917, "loss": 6.5588, "step": 2270 }, { "epoch": 0.7059916395726893, "grad_norm": 1.2393313646316528, "learning_rate": 0.00014122019200991018, "loss": 6.5752, "step": 2280 }, { "epoch": 0.7090880941322186, "grad_norm": 0.8842589855194092, "learning_rate": 0.0001418395788169712, "loss": 6.5574, "step": 2290 }, { "epoch": 0.712184548691748, "grad_norm": 0.7826055884361267, "learning_rate": 0.00014245896562403222, "loss": 6.5612, "step": 2300 }, { "epoch": 0.7152810032512773, "grad_norm": 0.9402616024017334, "learning_rate": 0.00014307835243109322, "loss": 6.5596, "step": 2310 }, { "epoch": 0.7183774578108066, "grad_norm": 1.274904727935791, "learning_rate": 0.00014369773923815423, "loss": 6.5796, "step": 2320 }, { "epoch": 0.721473912370336, "grad_norm": 1.112528681755066, "learning_rate": 0.00014431712604521523, "loss": 6.5563, "step": 2330 }, { "epoch": 0.7245703669298653, "grad_norm": 0.8044337630271912, "learning_rate": 0.00014493651285227624, "loss": 6.547, "step": 2340 }, { "epoch": 0.7276668214893947, "grad_norm": 1.0962836742401123, "learning_rate": 0.00014555589965933727, "loss": 6.5543, "step": 2350 }, { "epoch": 0.730763276048924, "grad_norm": 1.0332891941070557, "learning_rate": 0.00014617528646639828, "loss": 6.5486, "step": 2360 }, { "epoch": 0.7338597306084533, "grad_norm": 0.9583357572555542, "learning_rate": 0.00014679467327345926, "loss": 6.5602, "step": 2370 }, { "epoch": 0.7369561851679827, "grad_norm": 1.0913727283477783, "learning_rate": 0.0001474140600805203, "loss": 6.5468, "step": 2380 }, { "epoch": 0.740052639727512, "grad_norm": 1.192328929901123, "learning_rate": 0.0001480334468875813, "loss": 6.5476, "step": 2390 }, { "epoch": 0.7431490942870413, "grad_norm": 1.3153208494186401, "learning_rate": 0.0001486528336946423, "loss": 6.5502, "step": 2400 }, { "epoch": 0.7462455488465707, "grad_norm": 1.0659363269805908, "learning_rate": 0.0001492722205017033, "loss": 6.5458, "step": 2410 }, { "epoch": 0.7493420034061, "grad_norm": 0.6409627199172974, "learning_rate": 0.00014989160730876435, "loss": 6.5615, "step": 2420 }, { "epoch": 0.7524384579656294, "grad_norm": 1.534621238708496, "learning_rate": 0.00015051099411582532, "loss": 6.5413, "step": 2430 }, { "epoch": 0.7555349125251587, "grad_norm": 0.8091804385185242, "learning_rate": 0.00015113038092288633, "loss": 6.5558, "step": 2440 }, { "epoch": 0.758631367084688, "grad_norm": 1.1276757717132568, "learning_rate": 0.00015174976772994736, "loss": 6.5495, "step": 2450 }, { "epoch": 0.7617278216442174, "grad_norm": 1.1171313524246216, "learning_rate": 0.00015236915453700837, "loss": 6.5202, "step": 2460 }, { "epoch": 0.7648242762037467, "grad_norm": 0.8118519186973572, "learning_rate": 0.00015298854134406938, "loss": 6.5484, "step": 2470 }, { "epoch": 0.767920730763276, "grad_norm": 0.835800290107727, "learning_rate": 0.00015360792815113038, "loss": 6.5512, "step": 2480 }, { "epoch": 0.7710171853228054, "grad_norm": 1.2488937377929688, "learning_rate": 0.0001542273149581914, "loss": 6.5434, "step": 2490 }, { "epoch": 0.7741136398823347, "grad_norm": 1.0001873970031738, "learning_rate": 0.0001548467017652524, "loss": 6.5562, "step": 2500 }, { "epoch": 0.7772100944418641, "grad_norm": 1.329168438911438, "learning_rate": 0.00015546608857231343, "loss": 6.5432, "step": 2510 }, { "epoch": 0.7803065490013934, "grad_norm": 1.0739688873291016, "learning_rate": 0.00015608547537937444, "loss": 6.5359, "step": 2520 }, { "epoch": 0.7834030035609227, "grad_norm": 1.1103359460830688, "learning_rate": 0.00015670486218643541, "loss": 6.5514, "step": 2530 }, { "epoch": 0.7864994581204521, "grad_norm": 0.7088027596473694, "learning_rate": 0.00015732424899349645, "loss": 6.5415, "step": 2540 }, { "epoch": 0.7895959126799814, "grad_norm": 1.141654133796692, "learning_rate": 0.00015794363580055745, "loss": 6.5505, "step": 2550 }, { "epoch": 0.7926923672395108, "grad_norm": 0.9900869727134705, "learning_rate": 0.00015856302260761846, "loss": 6.5505, "step": 2560 }, { "epoch": 0.7957888217990401, "grad_norm": 0.9820410013198853, "learning_rate": 0.00015918240941467947, "loss": 6.5306, "step": 2570 }, { "epoch": 0.7988852763585694, "grad_norm": 1.1498329639434814, "learning_rate": 0.00015980179622174047, "loss": 6.5308, "step": 2580 }, { "epoch": 0.8019817309180988, "grad_norm": 1.5919135808944702, "learning_rate": 0.00016042118302880148, "loss": 6.5458, "step": 2590 }, { "epoch": 0.8050781854776281, "grad_norm": 1.7433273792266846, "learning_rate": 0.00016104056983586249, "loss": 6.5307, "step": 2600 }, { "epoch": 0.8081746400371574, "grad_norm": 1.2043076753616333, "learning_rate": 0.00016165995664292352, "loss": 6.5347, "step": 2610 }, { "epoch": 0.8112710945966868, "grad_norm": 1.2197911739349365, "learning_rate": 0.00016227934344998453, "loss": 6.5282, "step": 2620 }, { "epoch": 0.8143675491562161, "grad_norm": 0.8074585199356079, "learning_rate": 0.0001628987302570455, "loss": 6.542, "step": 2630 }, { "epoch": 0.8174640037157455, "grad_norm": 1.1308220624923706, "learning_rate": 0.00016351811706410654, "loss": 6.547, "step": 2640 }, { "epoch": 0.8205604582752748, "grad_norm": 0.989686131477356, "learning_rate": 0.00016413750387116755, "loss": 6.5418, "step": 2650 }, { "epoch": 0.8236569128348041, "grad_norm": 0.9891242980957031, "learning_rate": 0.00016475689067822855, "loss": 6.5211, "step": 2660 }, { "epoch": 0.8267533673943335, "grad_norm": 0.9230055212974548, "learning_rate": 0.00016537627748528958, "loss": 6.5371, "step": 2670 }, { "epoch": 0.8298498219538628, "grad_norm": 0.9501631259918213, "learning_rate": 0.0001659956642923506, "loss": 6.5325, "step": 2680 }, { "epoch": 0.8329462765133921, "grad_norm": 0.9849696755409241, "learning_rate": 0.00016661505109941157, "loss": 6.5395, "step": 2690 }, { "epoch": 0.8360427310729215, "grad_norm": 1.2444876432418823, "learning_rate": 0.0001672344379064726, "loss": 6.5156, "step": 2700 }, { "epoch": 0.8391391856324508, "grad_norm": 0.7472972869873047, "learning_rate": 0.0001678538247135336, "loss": 6.5228, "step": 2710 }, { "epoch": 0.8422356401919802, "grad_norm": 0.8915477991104126, "learning_rate": 0.00016847321152059462, "loss": 6.5242, "step": 2720 }, { "epoch": 0.8453320947515095, "grad_norm": 1.0017406940460205, "learning_rate": 0.00016909259832765562, "loss": 6.5449, "step": 2730 }, { "epoch": 0.8484285493110388, "grad_norm": 1.0427559614181519, "learning_rate": 0.00016971198513471663, "loss": 6.5257, "step": 2740 }, { "epoch": 0.8515250038705682, "grad_norm": 0.8571954965591431, "learning_rate": 0.00017033137194177764, "loss": 6.5203, "step": 2750 }, { "epoch": 0.8546214584300975, "grad_norm": 1.0811147689819336, "learning_rate": 0.00017095075874883864, "loss": 6.5196, "step": 2760 }, { "epoch": 0.8577179129896269, "grad_norm": 0.9217764735221863, "learning_rate": 0.00017157014555589968, "loss": 6.5189, "step": 2770 }, { "epoch": 0.8608143675491562, "grad_norm": 0.9920642375946045, "learning_rate": 0.00017218953236296068, "loss": 6.5191, "step": 2780 }, { "epoch": 0.8639108221086855, "grad_norm": 1.0834949016571045, "learning_rate": 0.00017280891917002166, "loss": 6.5227, "step": 2790 }, { "epoch": 0.8670072766682149, "grad_norm": 0.916513204574585, "learning_rate": 0.0001734283059770827, "loss": 6.5144, "step": 2800 }, { "epoch": 0.8701037312277442, "grad_norm": 1.2615902423858643, "learning_rate": 0.0001740476927841437, "loss": 6.506, "step": 2810 }, { "epoch": 0.8732001857872735, "grad_norm": 0.8685635924339294, "learning_rate": 0.0001746670795912047, "loss": 6.5142, "step": 2820 }, { "epoch": 0.8762966403468029, "grad_norm": 0.8606330156326294, "learning_rate": 0.00017528646639826574, "loss": 6.5019, "step": 2830 }, { "epoch": 0.8793930949063322, "grad_norm": 0.7754759192466736, "learning_rate": 0.00017590585320532672, "loss": 6.5119, "step": 2840 }, { "epoch": 0.8824895494658616, "grad_norm": 0.8332505226135254, "learning_rate": 0.00017652524001238773, "loss": 6.5283, "step": 2850 }, { "epoch": 0.8855860040253909, "grad_norm": 1.1799520254135132, "learning_rate": 0.00017714462681944876, "loss": 6.5043, "step": 2860 }, { "epoch": 0.8886824585849202, "grad_norm": 0.9492645859718323, "learning_rate": 0.00017776401362650977, "loss": 6.5106, "step": 2870 }, { "epoch": 0.8917789131444496, "grad_norm": 0.7921923995018005, "learning_rate": 0.00017838340043357077, "loss": 6.5065, "step": 2880 }, { "epoch": 0.8948753677039789, "grad_norm": 0.6766930818557739, "learning_rate": 0.00017900278724063175, "loss": 6.5239, "step": 2890 }, { "epoch": 0.8979718222635082, "grad_norm": 0.7052696347236633, "learning_rate": 0.00017962217404769278, "loss": 6.5378, "step": 2900 }, { "epoch": 0.9010682768230376, "grad_norm": 0.973673403263092, "learning_rate": 0.0001802415608547538, "loss": 6.5099, "step": 2910 }, { "epoch": 0.9041647313825669, "grad_norm": 0.8590471744537354, "learning_rate": 0.0001808609476618148, "loss": 6.522, "step": 2920 }, { "epoch": 0.9072611859420963, "grad_norm": 0.9478482604026794, "learning_rate": 0.00018148033446887583, "loss": 6.5148, "step": 2930 }, { "epoch": 0.9103576405016256, "grad_norm": 0.991057813167572, "learning_rate": 0.00018209972127593684, "loss": 6.4991, "step": 2940 }, { "epoch": 0.9134540950611549, "grad_norm": 0.8526809811592102, "learning_rate": 0.00018271910808299782, "loss": 6.5164, "step": 2950 }, { "epoch": 0.9165505496206843, "grad_norm": 0.6919571161270142, "learning_rate": 0.00018333849489005885, "loss": 6.5214, "step": 2960 }, { "epoch": 0.9196470041802136, "grad_norm": 0.657346248626709, "learning_rate": 0.00018395788169711986, "loss": 6.5013, "step": 2970 }, { "epoch": 0.922743458739743, "grad_norm": 0.8530818223953247, "learning_rate": 0.00018457726850418086, "loss": 6.5145, "step": 2980 }, { "epoch": 0.9258399132992723, "grad_norm": 0.8030965328216553, "learning_rate": 0.0001851966553112419, "loss": 6.513, "step": 2990 }, { "epoch": 0.9289363678588016, "grad_norm": 0.8161980509757996, "learning_rate": 0.00018581604211830288, "loss": 6.5074, "step": 3000 }, { "epoch": 0.932032822418331, "grad_norm": 0.9112780094146729, "learning_rate": 0.00018643542892536388, "loss": 6.4961, "step": 3010 }, { "epoch": 0.9351292769778603, "grad_norm": 0.8977142572402954, "learning_rate": 0.00018705481573242491, "loss": 6.4973, "step": 3020 }, { "epoch": 0.9382257315373896, "grad_norm": 1.0232683420181274, "learning_rate": 0.00018767420253948592, "loss": 6.48, "step": 3030 }, { "epoch": 0.941322186096919, "grad_norm": 0.8228316307067871, "learning_rate": 0.00018829358934654693, "loss": 6.4842, "step": 3040 }, { "epoch": 0.9444186406564483, "grad_norm": 0.724467396736145, "learning_rate": 0.0001889129761536079, "loss": 6.4781, "step": 3050 }, { "epoch": 0.9475150952159777, "grad_norm": 0.9022755026817322, "learning_rate": 0.00018953236296066894, "loss": 6.4799, "step": 3060 }, { "epoch": 0.950611549775507, "grad_norm": 1.0211142301559448, "learning_rate": 0.00019015174976772995, "loss": 6.4719, "step": 3070 }, { "epoch": 0.9537080043350363, "grad_norm": 0.7571627497673035, "learning_rate": 0.00019077113657479095, "loss": 6.4685, "step": 3080 }, { "epoch": 0.9568044588945657, "grad_norm": 0.797822117805481, "learning_rate": 0.00019139052338185199, "loss": 6.4502, "step": 3090 }, { "epoch": 0.959900913454095, "grad_norm": 1.1731350421905518, "learning_rate": 0.00019200991018891297, "loss": 6.4534, "step": 3100 }, { "epoch": 0.9629973680136245, "grad_norm": 0.7823401689529419, "learning_rate": 0.00019262929699597397, "loss": 6.4611, "step": 3110 }, { "epoch": 0.9660938225731538, "grad_norm": 1.2475049495697021, "learning_rate": 0.000193248683803035, "loss": 6.4389, "step": 3120 }, { "epoch": 0.9691902771326831, "grad_norm": 0.9524723887443542, "learning_rate": 0.000193868070610096, "loss": 6.4435, "step": 3130 }, { "epoch": 0.9722867316922125, "grad_norm": 0.9494399428367615, "learning_rate": 0.00019448745741715702, "loss": 6.4332, "step": 3140 }, { "epoch": 0.9753831862517418, "grad_norm": 1.0070710182189941, "learning_rate": 0.00019510684422421805, "loss": 6.4529, "step": 3150 }, { "epoch": 0.9784796408112711, "grad_norm": 1.180368185043335, "learning_rate": 0.00019572623103127903, "loss": 6.4369, "step": 3160 }, { "epoch": 0.9815760953708005, "grad_norm": 1.0592350959777832, "learning_rate": 0.00019634561783834004, "loss": 6.4402, "step": 3170 }, { "epoch": 0.9846725499303298, "grad_norm": 0.8868720531463623, "learning_rate": 0.00019696500464540107, "loss": 6.4406, "step": 3180 }, { "epoch": 0.9877690044898592, "grad_norm": 0.8809700608253479, "learning_rate": 0.00019758439145246208, "loss": 6.4125, "step": 3190 }, { "epoch": 0.9908654590493885, "grad_norm": 0.8676486611366272, "learning_rate": 0.00019820377825952308, "loss": 6.4281, "step": 3200 }, { "epoch": 0.9939619136089178, "grad_norm": 0.6752346754074097, "learning_rate": 0.0001988231650665841, "loss": 6.4041, "step": 3210 }, { "epoch": 0.9970583681684472, "grad_norm": 0.9568197131156921, "learning_rate": 0.0001994425518736451, "loss": 6.3981, "step": 3220 }, { "epoch": 1.0, "grad_norm": 0.5673872828483582, "learning_rate": 0.0002000619386807061, "loss": 6.071, "step": 3230 }, { "epoch": 1.0030964545595293, "grad_norm": 0.8268159627914429, "learning_rate": 0.0002006813254877671, "loss": 6.3988, "step": 3240 }, { "epoch": 1.0061929091190587, "grad_norm": 0.7635223269462585, "learning_rate": 0.00020130071229482814, "loss": 6.3831, "step": 3250 }, { "epoch": 1.009289363678588, "grad_norm": 0.7615036368370056, "learning_rate": 0.00020192009910188912, "loss": 6.3832, "step": 3260 }, { "epoch": 1.0123858182381174, "grad_norm": 0.7862409353256226, "learning_rate": 0.00020253948590895013, "loss": 6.3929, "step": 3270 }, { "epoch": 1.0154822727976467, "grad_norm": 1.113342046737671, "learning_rate": 0.00020315887271601116, "loss": 6.3917, "step": 3280 }, { "epoch": 1.018578727357176, "grad_norm": 0.8702403903007507, "learning_rate": 0.00020377825952307217, "loss": 6.3783, "step": 3290 }, { "epoch": 1.0216751819167054, "grad_norm": 0.8440068960189819, "learning_rate": 0.00020439764633013317, "loss": 6.3777, "step": 3300 }, { "epoch": 1.0247716364762347, "grad_norm": 1.1612240076065063, "learning_rate": 0.00020501703313719418, "loss": 6.3617, "step": 3310 }, { "epoch": 1.027868091035764, "grad_norm": 0.8664381504058838, "learning_rate": 0.00020563641994425519, "loss": 6.3766, "step": 3320 }, { "epoch": 1.0309645455952934, "grad_norm": 1.2137264013290405, "learning_rate": 0.0002062558067513162, "loss": 6.3724, "step": 3330 }, { "epoch": 1.0340610001548227, "grad_norm": 1.2266614437103271, "learning_rate": 0.00020687519355837723, "loss": 6.3313, "step": 3340 }, { "epoch": 1.037157454714352, "grad_norm": 0.8568953275680542, "learning_rate": 0.00020749458036543823, "loss": 6.3583, "step": 3350 }, { "epoch": 1.0402539092738814, "grad_norm": 0.874577522277832, "learning_rate": 0.0002081139671724992, "loss": 6.3511, "step": 3360 }, { "epoch": 1.0433503638334107, "grad_norm": 1.1219960451126099, "learning_rate": 0.00020873335397956024, "loss": 6.323, "step": 3370 }, { "epoch": 1.04644681839294, "grad_norm": 1.1575599908828735, "learning_rate": 0.00020935274078662125, "loss": 6.3426, "step": 3380 }, { "epoch": 1.0495432729524694, "grad_norm": 0.7617483139038086, "learning_rate": 0.00020997212759368226, "loss": 6.3243, "step": 3390 }, { "epoch": 1.0526397275119987, "grad_norm": 1.019921064376831, "learning_rate": 0.00021059151440074326, "loss": 6.303, "step": 3400 }, { "epoch": 1.055736182071528, "grad_norm": 1.034369945526123, "learning_rate": 0.0002112109012078043, "loss": 6.3092, "step": 3410 }, { "epoch": 1.0588326366310574, "grad_norm": 1.1426433324813843, "learning_rate": 0.00021183028801486528, "loss": 6.3023, "step": 3420 }, { "epoch": 1.0619290911905868, "grad_norm": 0.9942957162857056, "learning_rate": 0.00021244967482192628, "loss": 6.3104, "step": 3430 }, { "epoch": 1.065025545750116, "grad_norm": 1.0719786882400513, "learning_rate": 0.00021306906162898732, "loss": 6.2973, "step": 3440 }, { "epoch": 1.0681220003096454, "grad_norm": 1.0518437623977661, "learning_rate": 0.00021368844843604832, "loss": 6.2967, "step": 3450 }, { "epoch": 1.0712184548691748, "grad_norm": 1.2732771635055542, "learning_rate": 0.00021430783524310933, "loss": 6.2746, "step": 3460 }, { "epoch": 1.0743149094287041, "grad_norm": 1.5430597066879272, "learning_rate": 0.00021492722205017034, "loss": 6.2815, "step": 3470 }, { "epoch": 1.0774113639882335, "grad_norm": 0.8930633068084717, "learning_rate": 0.00021554660885723134, "loss": 6.251, "step": 3480 }, { "epoch": 1.0805078185477628, "grad_norm": 1.1095397472381592, "learning_rate": 0.00021616599566429235, "loss": 6.238, "step": 3490 }, { "epoch": 1.0836042731072921, "grad_norm": 1.1570417881011963, "learning_rate": 0.00021678538247135338, "loss": 6.2169, "step": 3500 }, { "epoch": 1.0867007276668215, "grad_norm": 1.2682262659072876, "learning_rate": 0.0002174047692784144, "loss": 6.2256, "step": 3510 }, { "epoch": 1.0897971822263508, "grad_norm": 1.2010442018508911, "learning_rate": 0.00021802415608547537, "loss": 6.2034, "step": 3520 }, { "epoch": 1.0928936367858801, "grad_norm": 1.3368873596191406, "learning_rate": 0.0002186435428925364, "loss": 6.1673, "step": 3530 }, { "epoch": 1.0959900913454095, "grad_norm": 1.1895204782485962, "learning_rate": 0.0002192629296995974, "loss": 6.1546, "step": 3540 }, { "epoch": 1.0990865459049388, "grad_norm": 1.1519889831542969, "learning_rate": 0.0002198823165066584, "loss": 6.1365, "step": 3550 }, { "epoch": 1.1021830004644682, "grad_norm": 1.3705570697784424, "learning_rate": 0.00022050170331371942, "loss": 6.07, "step": 3560 }, { "epoch": 1.1052794550239975, "grad_norm": 1.6378076076507568, "learning_rate": 0.00022112109012078043, "loss": 6.0432, "step": 3570 }, { "epoch": 1.1083759095835268, "grad_norm": 1.5780587196350098, "learning_rate": 0.00022174047692784143, "loss": 6.0201, "step": 3580 }, { "epoch": 1.1114723641430562, "grad_norm": 1.2604175806045532, "learning_rate": 0.00022235986373490244, "loss": 5.9781, "step": 3590 }, { "epoch": 1.1145688187025855, "grad_norm": 1.4099502563476562, "learning_rate": 0.00022297925054196347, "loss": 5.9298, "step": 3600 }, { "epoch": 1.1176652732621148, "grad_norm": 1.233045220375061, "learning_rate": 0.00022359863734902448, "loss": 5.8924, "step": 3610 }, { "epoch": 1.1207617278216442, "grad_norm": 1.4948160648345947, "learning_rate": 0.00022421802415608546, "loss": 5.8785, "step": 3620 }, { "epoch": 1.1238581823811735, "grad_norm": 1.7853126525878906, "learning_rate": 0.0002248374109631465, "loss": 5.8134, "step": 3630 }, { "epoch": 1.1269546369407029, "grad_norm": 2.1024398803710938, "learning_rate": 0.0002254567977702075, "loss": 5.7864, "step": 3640 }, { "epoch": 1.1300510915002322, "grad_norm": 1.6895965337753296, "learning_rate": 0.0002260761845772685, "loss": 5.7182, "step": 3650 }, { "epoch": 1.1331475460597615, "grad_norm": 1.7023606300354004, "learning_rate": 0.00022669557138432954, "loss": 5.6528, "step": 3660 }, { "epoch": 1.1362440006192909, "grad_norm": 1.2533527612686157, "learning_rate": 0.00022731495819139054, "loss": 5.6221, "step": 3670 }, { "epoch": 1.1393404551788202, "grad_norm": 1.621505618095398, "learning_rate": 0.00022793434499845152, "loss": 5.5583, "step": 3680 }, { "epoch": 1.1424369097383495, "grad_norm": 1.3869857788085938, "learning_rate": 0.00022855373180551256, "loss": 5.5145, "step": 3690 }, { "epoch": 1.1455333642978789, "grad_norm": 1.542646884918213, "learning_rate": 0.00022917311861257356, "loss": 5.4559, "step": 3700 }, { "epoch": 1.1486298188574082, "grad_norm": 1.4515721797943115, "learning_rate": 0.00022979250541963457, "loss": 5.4007, "step": 3710 }, { "epoch": 1.1517262734169376, "grad_norm": 1.7579517364501953, "learning_rate": 0.00023041189222669557, "loss": 5.3976, "step": 3720 }, { "epoch": 1.154822727976467, "grad_norm": 1.2533565759658813, "learning_rate": 0.00023103127903375658, "loss": 5.3172, "step": 3730 }, { "epoch": 1.1579191825359962, "grad_norm": 1.494162917137146, "learning_rate": 0.0002316506658408176, "loss": 5.2509, "step": 3740 }, { "epoch": 1.1610156370955256, "grad_norm": 1.595115065574646, "learning_rate": 0.0002322700526478786, "loss": 5.1784, "step": 3750 }, { "epoch": 1.164112091655055, "grad_norm": 1.55663001537323, "learning_rate": 0.00023288943945493963, "loss": 5.0949, "step": 3760 }, { "epoch": 1.1672085462145843, "grad_norm": 1.374272346496582, "learning_rate": 0.00023350882626200063, "loss": 5.0331, "step": 3770 }, { "epoch": 1.1703050007741136, "grad_norm": 1.3195029497146606, "learning_rate": 0.0002341282130690616, "loss": 4.9576, "step": 3780 }, { "epoch": 1.173401455333643, "grad_norm": 1.161839485168457, "learning_rate": 0.00023474759987612265, "loss": 4.9166, "step": 3790 }, { "epoch": 1.1764979098931723, "grad_norm": 1.2902604341506958, "learning_rate": 0.00023536698668318365, "loss": 4.8334, "step": 3800 }, { "epoch": 1.1795943644527016, "grad_norm": 1.0339348316192627, "learning_rate": 0.00023598637349024466, "loss": 4.7735, "step": 3810 }, { "epoch": 1.182690819012231, "grad_norm": 1.1560925245285034, "learning_rate": 0.0002366057602973057, "loss": 4.7389, "step": 3820 }, { "epoch": 1.1857872735717603, "grad_norm": 1.0810256004333496, "learning_rate": 0.00023722514710436667, "loss": 4.673, "step": 3830 }, { "epoch": 1.1888837281312896, "grad_norm": 1.187358021736145, "learning_rate": 0.00023784453391142768, "loss": 4.6703, "step": 3840 }, { "epoch": 1.191980182690819, "grad_norm": 1.2153098583221436, "learning_rate": 0.0002384639207184887, "loss": 4.5977, "step": 3850 }, { "epoch": 1.1950766372503483, "grad_norm": 1.3098320960998535, "learning_rate": 0.00023908330752554972, "loss": 4.5396, "step": 3860 }, { "epoch": 1.1981730918098776, "grad_norm": 1.3841015100479126, "learning_rate": 0.00023970269433261072, "loss": 4.5191, "step": 3870 }, { "epoch": 1.201269546369407, "grad_norm": 1.0185471773147583, "learning_rate": 0.0002403220811396717, "loss": 4.4745, "step": 3880 }, { "epoch": 1.2043660009289363, "grad_norm": 1.0954643487930298, "learning_rate": 0.00024094146794673274, "loss": 4.4384, "step": 3890 }, { "epoch": 1.2074624554884656, "grad_norm": 1.0373002290725708, "learning_rate": 0.00024156085475379374, "loss": 4.4049, "step": 3900 }, { "epoch": 1.210558910047995, "grad_norm": 1.0706144571304321, "learning_rate": 0.00024218024156085475, "loss": 4.3815, "step": 3910 }, { "epoch": 1.2136553646075243, "grad_norm": 1.1758544445037842, "learning_rate": 0.00024279962836791578, "loss": 4.3619, "step": 3920 }, { "epoch": 1.2167518191670537, "grad_norm": 1.1079212427139282, "learning_rate": 0.0002434190151749768, "loss": 4.337, "step": 3930 }, { "epoch": 1.219848273726583, "grad_norm": 1.1753212213516235, "learning_rate": 0.00024403840198203777, "loss": 4.3046, "step": 3940 }, { "epoch": 1.2229447282861123, "grad_norm": 1.1949397325515747, "learning_rate": 0.00024465778878909883, "loss": 4.3101, "step": 3950 }, { "epoch": 1.2260411828456417, "grad_norm": 1.0809822082519531, "learning_rate": 0.0002452771755961598, "loss": 4.2624, "step": 3960 }, { "epoch": 1.229137637405171, "grad_norm": 1.113866925239563, "learning_rate": 0.0002458965624032208, "loss": 4.2679, "step": 3970 }, { "epoch": 1.2322340919647004, "grad_norm": 1.1212016344070435, "learning_rate": 0.0002465159492102818, "loss": 4.2071, "step": 3980 }, { "epoch": 1.2353305465242297, "grad_norm": 1.1517590284347534, "learning_rate": 0.00024713533601734285, "loss": 4.1929, "step": 3990 }, { "epoch": 1.238427001083759, "grad_norm": 0.9486988186836243, "learning_rate": 0.00024775472282440383, "loss": 4.1934, "step": 4000 }, { "epoch": 1.2415234556432884, "grad_norm": 1.0706721544265747, "learning_rate": 0.00024837410963146487, "loss": 4.1776, "step": 4010 }, { "epoch": 1.2446199102028177, "grad_norm": 1.1148719787597656, "learning_rate": 0.00024899349643852585, "loss": 4.1265, "step": 4020 }, { "epoch": 1.247716364762347, "grad_norm": 1.1065315008163452, "learning_rate": 0.0002496128832455869, "loss": 4.1034, "step": 4030 }, { "epoch": 1.2508128193218764, "grad_norm": 1.158066987991333, "learning_rate": 0.00025023227005264786, "loss": 4.0863, "step": 4040 }, { "epoch": 1.2539092738814057, "grad_norm": 1.1560614109039307, "learning_rate": 0.0002508516568597089, "loss": 4.0778, "step": 4050 }, { "epoch": 1.257005728440935, "grad_norm": 0.98968905210495, "learning_rate": 0.0002514710436667699, "loss": 4.051, "step": 4060 }, { "epoch": 1.2601021830004644, "grad_norm": 1.1713204383850098, "learning_rate": 0.0002520904304738309, "loss": 4.0181, "step": 4070 }, { "epoch": 1.2631986375599937, "grad_norm": 1.1065443754196167, "learning_rate": 0.00025270981728089194, "loss": 4.0239, "step": 4080 }, { "epoch": 1.266295092119523, "grad_norm": 1.043097972869873, "learning_rate": 0.00025332920408795297, "loss": 4.0208, "step": 4090 }, { "epoch": 1.2693915466790524, "grad_norm": 1.024276614189148, "learning_rate": 0.00025394859089501395, "loss": 3.9804, "step": 4100 }, { "epoch": 1.2724880012385817, "grad_norm": 1.1613043546676636, "learning_rate": 0.000254567977702075, "loss": 3.9819, "step": 4110 }, { "epoch": 1.275584455798111, "grad_norm": 1.0510482788085938, "learning_rate": 0.0002551873645091359, "loss": 3.9696, "step": 4120 }, { "epoch": 1.2786809103576404, "grad_norm": 0.9902080297470093, "learning_rate": 0.00025580675131619694, "loss": 3.9233, "step": 4130 }, { "epoch": 1.2817773649171698, "grad_norm": 1.165866732597351, "learning_rate": 0.000256426138123258, "loss": 3.9079, "step": 4140 }, { "epoch": 1.284873819476699, "grad_norm": 1.0561455488204956, "learning_rate": 0.00025704552493031896, "loss": 3.9072, "step": 4150 }, { "epoch": 1.2879702740362284, "grad_norm": 0.989741325378418, "learning_rate": 0.00025766491173738, "loss": 3.9018, "step": 4160 }, { "epoch": 1.2910667285957578, "grad_norm": 1.099219799041748, "learning_rate": 0.000258284298544441, "loss": 3.868, "step": 4170 }, { "epoch": 1.2941631831552871, "grad_norm": 1.1154602766036987, "learning_rate": 0.000258903685351502, "loss": 3.8644, "step": 4180 }, { "epoch": 1.2972596377148164, "grad_norm": 1.0872890949249268, "learning_rate": 0.00025952307215856304, "loss": 3.8587, "step": 4190 }, { "epoch": 1.3003560922743458, "grad_norm": 1.0499584674835205, "learning_rate": 0.00026014245896562407, "loss": 3.8235, "step": 4200 }, { "epoch": 1.3034525468338751, "grad_norm": 1.030174732208252, "learning_rate": 0.00026076184577268505, "loss": 3.8302, "step": 4210 }, { "epoch": 1.3065490013934045, "grad_norm": 1.0867342948913574, "learning_rate": 0.000261381232579746, "loss": 3.8341, "step": 4220 }, { "epoch": 1.3096454559529338, "grad_norm": 1.0520577430725098, "learning_rate": 0.00026200061938680706, "loss": 3.8018, "step": 4230 }, { "epoch": 1.3127419105124631, "grad_norm": 1.0809017419815063, "learning_rate": 0.00026262000619386804, "loss": 3.7748, "step": 4240 }, { "epoch": 1.3158383650719925, "grad_norm": 1.1091547012329102, "learning_rate": 0.0002632393930009291, "loss": 3.7732, "step": 4250 }, { "epoch": 1.3189348196315218, "grad_norm": 1.0448859930038452, "learning_rate": 0.0002638587798079901, "loss": 3.74, "step": 4260 }, { "epoch": 1.3220312741910512, "grad_norm": 1.0798423290252686, "learning_rate": 0.0002644781666150511, "loss": 3.7374, "step": 4270 }, { "epoch": 1.3251277287505805, "grad_norm": 0.9496048092842102, "learning_rate": 0.0002650975534221121, "loss": 3.7422, "step": 4280 }, { "epoch": 1.3282241833101098, "grad_norm": 0.9731584787368774, "learning_rate": 0.00026571694022917315, "loss": 3.6992, "step": 4290 }, { "epoch": 1.3313206378696392, "grad_norm": 0.9330194592475891, "learning_rate": 0.00026633632703623413, "loss": 3.6868, "step": 4300 }, { "epoch": 1.3344170924291685, "grad_norm": 1.0531985759735107, "learning_rate": 0.00026695571384329517, "loss": 3.6958, "step": 4310 }, { "epoch": 1.3375135469886978, "grad_norm": 0.9694075584411621, "learning_rate": 0.0002675751006503562, "loss": 3.7137, "step": 4320 }, { "epoch": 1.3406100015482272, "grad_norm": 0.9474936723709106, "learning_rate": 0.0002681944874574171, "loss": 3.6889, "step": 4330 }, { "epoch": 1.3437064561077565, "grad_norm": 0.9624688029289246, "learning_rate": 0.00026881387426447816, "loss": 3.6531, "step": 4340 }, { "epoch": 1.3468029106672859, "grad_norm": 0.9767426252365112, "learning_rate": 0.0002694332610715392, "loss": 3.6596, "step": 4350 }, { "epoch": 1.3498993652268152, "grad_norm": 0.9959364533424377, "learning_rate": 0.00027005264787860017, "loss": 3.6434, "step": 4360 }, { "epoch": 1.3529958197863445, "grad_norm": 1.0519224405288696, "learning_rate": 0.0002706720346856612, "loss": 3.5982, "step": 4370 }, { "epoch": 1.3560922743458739, "grad_norm": 0.9964626431465149, "learning_rate": 0.00027129142149272224, "loss": 3.6145, "step": 4380 }, { "epoch": 1.3591887289054032, "grad_norm": 1.0506435632705688, "learning_rate": 0.0002719108082997832, "loss": 3.5859, "step": 4390 }, { "epoch": 1.3622851834649325, "grad_norm": 1.0846556425094604, "learning_rate": 0.00027253019510684425, "loss": 3.5981, "step": 4400 }, { "epoch": 1.3653816380244619, "grad_norm": 1.0251847505569458, "learning_rate": 0.0002731495819139053, "loss": 3.5731, "step": 4410 }, { "epoch": 1.3684780925839912, "grad_norm": 1.0184073448181152, "learning_rate": 0.00027376896872096626, "loss": 3.5665, "step": 4420 }, { "epoch": 1.3715745471435206, "grad_norm": 0.9859119057655334, "learning_rate": 0.00027438835552802724, "loss": 3.5401, "step": 4430 }, { "epoch": 1.37467100170305, "grad_norm": 0.9708986878395081, "learning_rate": 0.0002750077423350883, "loss": 3.5392, "step": 4440 }, { "epoch": 1.3777674562625792, "grad_norm": 1.0786579847335815, "learning_rate": 0.00027562712914214925, "loss": 3.5553, "step": 4450 }, { "epoch": 1.3808639108221086, "grad_norm": 1.011117696762085, "learning_rate": 0.0002762465159492103, "loss": 3.5251, "step": 4460 }, { "epoch": 1.383960365381638, "grad_norm": 0.9319019317626953, "learning_rate": 0.00027686590275627127, "loss": 3.5408, "step": 4470 }, { "epoch": 1.3870568199411673, "grad_norm": 1.0703030824661255, "learning_rate": 0.0002774852895633323, "loss": 3.5147, "step": 4480 }, { "epoch": 1.3901532745006966, "grad_norm": 0.9363672733306885, "learning_rate": 0.00027810467637039333, "loss": 3.5054, "step": 4490 }, { "epoch": 1.393249729060226, "grad_norm": 1.0434913635253906, "learning_rate": 0.0002787240631774543, "loss": 3.498, "step": 4500 }, { "epoch": 1.3963461836197553, "grad_norm": 1.1381675004959106, "learning_rate": 0.00027934344998451535, "loss": 3.5045, "step": 4510 }, { "epoch": 1.3994426381792846, "grad_norm": 0.9770002365112305, "learning_rate": 0.0002799628367915764, "loss": 3.5115, "step": 4520 }, { "epoch": 1.402539092738814, "grad_norm": 0.9267017245292664, "learning_rate": 0.0002805822235986373, "loss": 3.4452, "step": 4530 }, { "epoch": 1.4056355472983433, "grad_norm": 1.0910615921020508, "learning_rate": 0.00028120161040569834, "loss": 3.4792, "step": 4540 }, { "epoch": 1.4087320018578726, "grad_norm": 1.0374314785003662, "learning_rate": 0.00028182099721275937, "loss": 3.4497, "step": 4550 }, { "epoch": 1.411828456417402, "grad_norm": 1.1077336072921753, "learning_rate": 0.00028244038401982035, "loss": 3.4836, "step": 4560 }, { "epoch": 1.4149249109769313, "grad_norm": 0.9700469374656677, "learning_rate": 0.0002830597708268814, "loss": 3.4539, "step": 4570 }, { "epoch": 1.4180213655364606, "grad_norm": 1.0011495351791382, "learning_rate": 0.0002836791576339424, "loss": 3.4192, "step": 4580 }, { "epoch": 1.42111782009599, "grad_norm": 1.0449153184890747, "learning_rate": 0.0002842985444410034, "loss": 3.4279, "step": 4590 }, { "epoch": 1.4242142746555193, "grad_norm": 1.0163695812225342, "learning_rate": 0.00028491793124806443, "loss": 3.4375, "step": 4600 }, { "epoch": 1.4273107292150486, "grad_norm": 0.9043591618537903, "learning_rate": 0.00028553731805512546, "loss": 3.41, "step": 4610 }, { "epoch": 1.430407183774578, "grad_norm": 1.0529117584228516, "learning_rate": 0.00028615670486218644, "loss": 3.4181, "step": 4620 }, { "epoch": 1.4335036383341073, "grad_norm": 0.9313072562217712, "learning_rate": 0.0002867760916692475, "loss": 3.381, "step": 4630 }, { "epoch": 1.4366000928936367, "grad_norm": 1.0091314315795898, "learning_rate": 0.00028739547847630846, "loss": 3.4084, "step": 4640 }, { "epoch": 1.439696547453166, "grad_norm": 1.023206114768982, "learning_rate": 0.00028801486528336943, "loss": 3.3933, "step": 4650 }, { "epoch": 1.4427930020126953, "grad_norm": 0.9428771734237671, "learning_rate": 0.00028863425209043047, "loss": 3.3793, "step": 4660 }, { "epoch": 1.4458894565722247, "grad_norm": 0.9487484097480774, "learning_rate": 0.0002892536388974915, "loss": 3.3703, "step": 4670 }, { "epoch": 1.448985911131754, "grad_norm": 1.0242682695388794, "learning_rate": 0.0002898730257045525, "loss": 3.3808, "step": 4680 }, { "epoch": 1.4520823656912833, "grad_norm": 0.963318407535553, "learning_rate": 0.0002904924125116135, "loss": 3.3756, "step": 4690 }, { "epoch": 1.4551788202508127, "grad_norm": 0.9051762223243713, "learning_rate": 0.00029111179931867455, "loss": 3.3356, "step": 4700 }, { "epoch": 1.458275274810342, "grad_norm": 0.9930270910263062, "learning_rate": 0.0002917311861257355, "loss": 3.3601, "step": 4710 }, { "epoch": 1.4613717293698714, "grad_norm": 1.077131748199463, "learning_rate": 0.00029235057293279656, "loss": 3.3308, "step": 4720 }, { "epoch": 1.4644681839294007, "grad_norm": 0.881527304649353, "learning_rate": 0.0002929699597398576, "loss": 3.328, "step": 4730 }, { "epoch": 1.46756463848893, "grad_norm": 1.0115300416946411, "learning_rate": 0.0002935893465469185, "loss": 3.3233, "step": 4740 }, { "epoch": 1.4706610930484594, "grad_norm": 1.0688494443893433, "learning_rate": 0.00029420873335397955, "loss": 3.3381, "step": 4750 }, { "epoch": 1.4737575476079887, "grad_norm": 1.0195506811141968, "learning_rate": 0.0002948281201610406, "loss": 3.3058, "step": 4760 }, { "epoch": 1.476854002167518, "grad_norm": 0.9502407312393188, "learning_rate": 0.00029544750696810156, "loss": 3.3174, "step": 4770 }, { "epoch": 1.4799504567270474, "grad_norm": 1.0097241401672363, "learning_rate": 0.0002960668937751626, "loss": 3.3102, "step": 4780 }, { "epoch": 1.483046911286577, "grad_norm": 0.9834030866622925, "learning_rate": 0.0002966862805822236, "loss": 3.3135, "step": 4790 }, { "epoch": 1.4861433658461063, "grad_norm": 1.014854907989502, "learning_rate": 0.0002973056673892846, "loss": 3.2915, "step": 4800 }, { "epoch": 1.4892398204056356, "grad_norm": 0.944720983505249, "learning_rate": 0.00029792505419634564, "loss": 3.2783, "step": 4810 }, { "epoch": 1.492336274965165, "grad_norm": 1.012688159942627, "learning_rate": 0.0002985444410034066, "loss": 3.2931, "step": 4820 }, { "epoch": 1.4954327295246943, "grad_norm": 0.9100663065910339, "learning_rate": 0.00029916382781046766, "loss": 3.2785, "step": 4830 }, { "epoch": 1.4985291840842236, "grad_norm": 0.8774744272232056, "learning_rate": 0.0002997832146175287, "loss": 3.2777, "step": 4840 }, { "epoch": 1.5016256386437528, "grad_norm": 0.9623695611953735, "learning_rate": 0.0003004026014245896, "loss": 3.2671, "step": 4850 }, { "epoch": 1.504722093203282, "grad_norm": 1.0606322288513184, "learning_rate": 0.00030102198823165065, "loss": 3.2483, "step": 4860 }, { "epoch": 1.5078185477628114, "grad_norm": 1.0098302364349365, "learning_rate": 0.0003016413750387117, "loss": 3.2355, "step": 4870 }, { "epoch": 1.5109150023223408, "grad_norm": 0.8991314172744751, "learning_rate": 0.00030226076184577266, "loss": 3.239, "step": 4880 }, { "epoch": 1.51401145688187, "grad_norm": 0.9911772012710571, "learning_rate": 0.0003028801486528337, "loss": 3.2569, "step": 4890 }, { "epoch": 1.5171079114413994, "grad_norm": 0.9949657320976257, "learning_rate": 0.00030349953545989473, "loss": 3.2441, "step": 4900 }, { "epoch": 1.5202043660009288, "grad_norm": 0.9273360371589661, "learning_rate": 0.0003041189222669557, "loss": 3.2385, "step": 4910 }, { "epoch": 1.5233008205604581, "grad_norm": 0.94888836145401, "learning_rate": 0.00030473830907401674, "loss": 3.2728, "step": 4920 }, { "epoch": 1.5263972751199875, "grad_norm": 0.9299125075340271, "learning_rate": 0.0003053576958810778, "loss": 3.2272, "step": 4930 }, { "epoch": 1.5294937296795168, "grad_norm": 0.8870009183883667, "learning_rate": 0.00030597708268813875, "loss": 3.2218, "step": 4940 }, { "epoch": 1.5325901842390461, "grad_norm": 1.0036243200302124, "learning_rate": 0.00030659646949519973, "loss": 3.2008, "step": 4950 }, { "epoch": 1.5356866387985755, "grad_norm": 0.9473212957382202, "learning_rate": 0.00030721585630226077, "loss": 3.2295, "step": 4960 }, { "epoch": 1.5387830933581048, "grad_norm": 0.8856829404830933, "learning_rate": 0.00030783524310932175, "loss": 3.2126, "step": 4970 }, { "epoch": 1.5418795479176342, "grad_norm": 0.997509777545929, "learning_rate": 0.0003084546299163828, "loss": 3.212, "step": 4980 }, { "epoch": 1.5449760024771635, "grad_norm": 0.9016265273094177, "learning_rate": 0.0003090740167234438, "loss": 3.208, "step": 4990 }, { "epoch": 1.5480724570366928, "grad_norm": 0.8731397390365601, "learning_rate": 0.0003096934035305048, "loss": 3.2158, "step": 5000 }, { "epoch": 1.5511689115962222, "grad_norm": 0.9676650166511536, "learning_rate": 0.0003103127903375658, "loss": 3.2032, "step": 5010 }, { "epoch": 1.5542653661557515, "grad_norm": 0.9783886075019836, "learning_rate": 0.00031093217714462686, "loss": 3.2114, "step": 5020 }, { "epoch": 1.5573618207152808, "grad_norm": 1.0224086046218872, "learning_rate": 0.00031155156395168784, "loss": 3.1828, "step": 5030 }, { "epoch": 1.5604582752748102, "grad_norm": 0.9322043061256409, "learning_rate": 0.00031217095075874887, "loss": 3.1851, "step": 5040 }, { "epoch": 1.5635547298343395, "grad_norm": 0.9294213056564331, "learning_rate": 0.0003127903375658099, "loss": 3.189, "step": 5050 }, { "epoch": 1.5666511843938689, "grad_norm": 0.9628444910049438, "learning_rate": 0.00031340972437287083, "loss": 3.1524, "step": 5060 }, { "epoch": 1.5697476389533982, "grad_norm": 0.9377193450927734, "learning_rate": 0.00031402911117993186, "loss": 3.1688, "step": 5070 }, { "epoch": 1.5728440935129275, "grad_norm": 0.8622744083404541, "learning_rate": 0.0003146484979869929, "loss": 3.1374, "step": 5080 }, { "epoch": 1.5759405480724569, "grad_norm": 0.9315075874328613, "learning_rate": 0.0003152678847940539, "loss": 3.1657, "step": 5090 }, { "epoch": 1.5790370026319864, "grad_norm": 0.9984999895095825, "learning_rate": 0.0003158872716011149, "loss": 3.1494, "step": 5100 }, { "epoch": 1.5821334571915158, "grad_norm": 0.9476169943809509, "learning_rate": 0.0003165066584081759, "loss": 3.1262, "step": 5110 }, { "epoch": 1.585229911751045, "grad_norm": 0.8942754864692688, "learning_rate": 0.0003171260452152369, "loss": 3.1546, "step": 5120 }, { "epoch": 1.5883263663105744, "grad_norm": 0.9009295701980591, "learning_rate": 0.00031774543202229796, "loss": 3.1516, "step": 5130 }, { "epoch": 1.5914228208701038, "grad_norm": 1.010343074798584, "learning_rate": 0.00031836481882935893, "loss": 3.1448, "step": 5140 }, { "epoch": 1.5945192754296331, "grad_norm": 0.9292970299720764, "learning_rate": 0.00031898420563641997, "loss": 3.123, "step": 5150 }, { "epoch": 1.5976157299891625, "grad_norm": 0.9574374556541443, "learning_rate": 0.00031960359244348095, "loss": 3.1358, "step": 5160 }, { "epoch": 1.6007121845486918, "grad_norm": 0.9073388576507568, "learning_rate": 0.0003202229792505419, "loss": 3.1352, "step": 5170 }, { "epoch": 1.6038086391082211, "grad_norm": 0.9928716421127319, "learning_rate": 0.00032084236605760296, "loss": 3.1226, "step": 5180 }, { "epoch": 1.6069050936677505, "grad_norm": 0.9886534810066223, "learning_rate": 0.000321461752864664, "loss": 3.131, "step": 5190 }, { "epoch": 1.6100015482272798, "grad_norm": 0.9734316468238831, "learning_rate": 0.00032208113967172497, "loss": 3.1341, "step": 5200 }, { "epoch": 1.6130980027868091, "grad_norm": 0.9681540131568909, "learning_rate": 0.000322700526478786, "loss": 3.0973, "step": 5210 }, { "epoch": 1.6161944573463385, "grad_norm": 0.9452388286590576, "learning_rate": 0.00032331991328584704, "loss": 3.1082, "step": 5220 }, { "epoch": 1.6192909119058678, "grad_norm": 0.9055010080337524, "learning_rate": 0.000323939300092908, "loss": 3.0891, "step": 5230 }, { "epoch": 1.6223873664653972, "grad_norm": 0.9603378772735596, "learning_rate": 0.00032455868689996905, "loss": 3.0998, "step": 5240 }, { "epoch": 1.6254838210249265, "grad_norm": 0.8925791382789612, "learning_rate": 0.0003251780737070301, "loss": 3.1165, "step": 5250 }, { "epoch": 1.6285802755844558, "grad_norm": 0.928421139717102, "learning_rate": 0.000325797460514091, "loss": 3.1087, "step": 5260 }, { "epoch": 1.6316767301439852, "grad_norm": 0.9481196403503418, "learning_rate": 0.00032641684732115204, "loss": 3.0916, "step": 5270 }, { "epoch": 1.6347731847035145, "grad_norm": 0.9044370055198669, "learning_rate": 0.0003270362341282131, "loss": 3.1, "step": 5280 }, { "epoch": 1.6378696392630439, "grad_norm": 0.9636628031730652, "learning_rate": 0.00032765562093527406, "loss": 3.114, "step": 5290 }, { "epoch": 1.6409660938225732, "grad_norm": 0.9585344195365906, "learning_rate": 0.0003282750077423351, "loss": 3.086, "step": 5300 }, { "epoch": 1.6440625483821025, "grad_norm": 0.9368054866790771, "learning_rate": 0.0003288943945493961, "loss": 3.0763, "step": 5310 }, { "epoch": 1.6471590029416319, "grad_norm": 0.951101541519165, "learning_rate": 0.0003295137813564571, "loss": 3.0746, "step": 5320 }, { "epoch": 1.6502554575011612, "grad_norm": 0.9043335318565369, "learning_rate": 0.00033013316816351814, "loss": 3.0665, "step": 5330 }, { "epoch": 1.6533519120606905, "grad_norm": 0.8929763436317444, "learning_rate": 0.00033075255497057917, "loss": 3.0644, "step": 5340 }, { "epoch": 1.6564483666202199, "grad_norm": 0.9089614152908325, "learning_rate": 0.00033137194177764015, "loss": 3.0661, "step": 5350 }, { "epoch": 1.6595448211797492, "grad_norm": 0.9606667757034302, "learning_rate": 0.0003319913285847012, "loss": 3.0578, "step": 5360 }, { "epoch": 1.6626412757392786, "grad_norm": 0.8867613673210144, "learning_rate": 0.00033261071539176216, "loss": 3.0707, "step": 5370 }, { "epoch": 1.665737730298808, "grad_norm": 0.9263885617256165, "learning_rate": 0.00033323010219882314, "loss": 3.0579, "step": 5380 }, { "epoch": 1.6688341848583372, "grad_norm": 0.8380886316299438, "learning_rate": 0.0003338494890058842, "loss": 3.0628, "step": 5390 }, { "epoch": 1.6719306394178666, "grad_norm": 0.9296733140945435, "learning_rate": 0.0003344688758129452, "loss": 3.0374, "step": 5400 }, { "epoch": 1.675027093977396, "grad_norm": 0.9482071995735168, "learning_rate": 0.0003350882626200062, "loss": 3.0611, "step": 5410 }, { "epoch": 1.6781235485369252, "grad_norm": 0.934635579586029, "learning_rate": 0.0003357076494270672, "loss": 3.0465, "step": 5420 }, { "epoch": 1.6812200030964546, "grad_norm": 0.9624560475349426, "learning_rate": 0.00033632703623412825, "loss": 3.0622, "step": 5430 }, { "epoch": 1.684316457655984, "grad_norm": 0.952055037021637, "learning_rate": 0.00033694642304118923, "loss": 3.0483, "step": 5440 }, { "epoch": 1.6874129122155133, "grad_norm": 0.8703885674476624, "learning_rate": 0.00033756580984825027, "loss": 3.0506, "step": 5450 }, { "epoch": 1.6905093667750426, "grad_norm": 0.9054002165794373, "learning_rate": 0.00033818519665531125, "loss": 3.045, "step": 5460 }, { "epoch": 1.693605821334572, "grad_norm": 0.9501616954803467, "learning_rate": 0.0003388045834623722, "loss": 3.0327, "step": 5470 }, { "epoch": 1.6967022758941013, "grad_norm": 0.880946934223175, "learning_rate": 0.00033942397026943326, "loss": 3.0414, "step": 5480 }, { "epoch": 1.6997987304536306, "grad_norm": 0.9799813032150269, "learning_rate": 0.00034004335707649424, "loss": 3.0485, "step": 5490 }, { "epoch": 1.70289518501316, "grad_norm": 0.9278644323348999, "learning_rate": 0.00034066274388355527, "loss": 3.0334, "step": 5500 }, { "epoch": 1.7059916395726893, "grad_norm": 0.8921311497688293, "learning_rate": 0.0003412821306906163, "loss": 3.0283, "step": 5510 }, { "epoch": 1.7090880941322186, "grad_norm": 0.8926926851272583, "learning_rate": 0.0003419015174976773, "loss": 3.0294, "step": 5520 }, { "epoch": 1.712184548691748, "grad_norm": 0.9130481481552124, "learning_rate": 0.0003425209043047383, "loss": 3.007, "step": 5530 }, { "epoch": 1.7152810032512773, "grad_norm": 0.9094374775886536, "learning_rate": 0.00034314029111179935, "loss": 3.0183, "step": 5540 }, { "epoch": 1.7183774578108066, "grad_norm": 0.8862912058830261, "learning_rate": 0.00034375967791886033, "loss": 2.9898, "step": 5550 }, { "epoch": 1.721473912370336, "grad_norm": 0.9140844941139221, "learning_rate": 0.00034437906472592136, "loss": 3.0172, "step": 5560 }, { "epoch": 1.7245703669298653, "grad_norm": 0.976078450679779, "learning_rate": 0.0003449984515329824, "loss": 3.0161, "step": 5570 }, { "epoch": 1.7276668214893947, "grad_norm": 0.9176059365272522, "learning_rate": 0.0003456178383400433, "loss": 2.9931, "step": 5580 }, { "epoch": 1.730763276048924, "grad_norm": 0.9895356297492981, "learning_rate": 0.00034623722514710436, "loss": 3.0026, "step": 5590 }, { "epoch": 1.7338597306084533, "grad_norm": 0.9021176099777222, "learning_rate": 0.0003468566119541654, "loss": 2.9841, "step": 5600 }, { "epoch": 1.7369561851679827, "grad_norm": 1.0290924310684204, "learning_rate": 0.00034747599876122637, "loss": 3.0205, "step": 5610 }, { "epoch": 1.740052639727512, "grad_norm": 0.9842997193336487, "learning_rate": 0.0003480953855682874, "loss": 2.9983, "step": 5620 }, { "epoch": 1.7431490942870413, "grad_norm": 1.004170536994934, "learning_rate": 0.00034871477237534843, "loss": 2.9929, "step": 5630 }, { "epoch": 1.7462455488465707, "grad_norm": 0.8903537392616272, "learning_rate": 0.0003493341591824094, "loss": 2.9928, "step": 5640 }, { "epoch": 1.7493420034061, "grad_norm": 0.9463049173355103, "learning_rate": 0.00034995354598947045, "loss": 2.9975, "step": 5650 }, { "epoch": 1.7524384579656294, "grad_norm": 0.879135251045227, "learning_rate": 0.0003505729327965315, "loss": 2.9767, "step": 5660 }, { "epoch": 1.7555349125251587, "grad_norm": 0.9398852586746216, "learning_rate": 0.00035119231960359246, "loss": 2.9813, "step": 5670 }, { "epoch": 1.758631367084688, "grad_norm": 0.9972649216651917, "learning_rate": 0.00035181170641065344, "loss": 2.964, "step": 5680 }, { "epoch": 1.7617278216442174, "grad_norm": 0.9139822721481323, "learning_rate": 0.00035243109321771447, "loss": 2.9906, "step": 5690 }, { "epoch": 1.7648242762037467, "grad_norm": 0.8910505771636963, "learning_rate": 0.00035305048002477545, "loss": 2.9749, "step": 5700 }, { "epoch": 1.767920730763276, "grad_norm": 1.1436492204666138, "learning_rate": 0.0003536698668318365, "loss": 2.9727, "step": 5710 }, { "epoch": 1.7710171853228054, "grad_norm": 0.9300575852394104, "learning_rate": 0.0003542892536388975, "loss": 3.0028, "step": 5720 }, { "epoch": 1.7741136398823347, "grad_norm": 0.8461237549781799, "learning_rate": 0.0003549086404459585, "loss": 2.9749, "step": 5730 }, { "epoch": 1.777210094441864, "grad_norm": 0.882404088973999, "learning_rate": 0.00035552802725301953, "loss": 2.9568, "step": 5740 }, { "epoch": 1.7803065490013934, "grad_norm": 0.8937315344810486, "learning_rate": 0.00035614741406008056, "loss": 2.9807, "step": 5750 }, { "epoch": 1.7834030035609227, "grad_norm": 0.8935524225234985, "learning_rate": 0.00035676680086714154, "loss": 2.982, "step": 5760 }, { "epoch": 1.786499458120452, "grad_norm": 0.9033128023147583, "learning_rate": 0.0003573861876742026, "loss": 2.9634, "step": 5770 }, { "epoch": 1.7895959126799814, "grad_norm": 0.9767388701438904, "learning_rate": 0.0003580055744812635, "loss": 2.9613, "step": 5780 }, { "epoch": 1.7926923672395108, "grad_norm": 1.0344420671463013, "learning_rate": 0.00035862496128832454, "loss": 2.9319, "step": 5790 }, { "epoch": 1.79578882179904, "grad_norm": 0.87823486328125, "learning_rate": 0.00035924434809538557, "loss": 2.96, "step": 5800 }, { "epoch": 1.7988852763585694, "grad_norm": 0.9067280888557434, "learning_rate": 0.00035986373490244655, "loss": 2.9322, "step": 5810 }, { "epoch": 1.8019817309180988, "grad_norm": 0.8616409301757812, "learning_rate": 0.0003604831217095076, "loss": 2.9611, "step": 5820 }, { "epoch": 1.805078185477628, "grad_norm": 0.8421568274497986, "learning_rate": 0.0003611025085165686, "loss": 2.9366, "step": 5830 }, { "epoch": 1.8081746400371574, "grad_norm": 0.8576173782348633, "learning_rate": 0.0003617218953236296, "loss": 2.9423, "step": 5840 }, { "epoch": 1.8112710945966868, "grad_norm": 0.8986689448356628, "learning_rate": 0.00036234128213069063, "loss": 2.9376, "step": 5850 }, { "epoch": 1.8143675491562161, "grad_norm": 0.9134368300437927, "learning_rate": 0.00036296066893775166, "loss": 2.9262, "step": 5860 }, { "epoch": 1.8174640037157455, "grad_norm": 0.9681121110916138, "learning_rate": 0.00036358005574481264, "loss": 2.9341, "step": 5870 }, { "epoch": 1.8205604582752748, "grad_norm": 1.0286924839019775, "learning_rate": 0.0003641994425518737, "loss": 2.9306, "step": 5880 }, { "epoch": 1.8236569128348041, "grad_norm": 0.9352772831916809, "learning_rate": 0.00036481882935893465, "loss": 2.948, "step": 5890 }, { "epoch": 1.8267533673943335, "grad_norm": 1.0539007186889648, "learning_rate": 0.00036543821616599563, "loss": 2.9523, "step": 5900 }, { "epoch": 1.8298498219538628, "grad_norm": 0.8661713600158691, "learning_rate": 0.00036605760297305667, "loss": 2.9269, "step": 5910 }, { "epoch": 1.8329462765133921, "grad_norm": 0.9120956659317017, "learning_rate": 0.0003666769897801177, "loss": 2.9302, "step": 5920 }, { "epoch": 1.8360427310729215, "grad_norm": 0.9333845376968384, "learning_rate": 0.0003672963765871787, "loss": 2.9247, "step": 5930 }, { "epoch": 1.8391391856324508, "grad_norm": 0.864277720451355, "learning_rate": 0.0003679157633942397, "loss": 2.9269, "step": 5940 }, { "epoch": 1.8422356401919802, "grad_norm": 0.954741358757019, "learning_rate": 0.00036853515020130075, "loss": 2.9348, "step": 5950 }, { "epoch": 1.8453320947515095, "grad_norm": 0.8879597187042236, "learning_rate": 0.0003691545370083617, "loss": 2.9259, "step": 5960 }, { "epoch": 1.8484285493110388, "grad_norm": 0.8487861752510071, "learning_rate": 0.00036977392381542276, "loss": 2.9189, "step": 5970 }, { "epoch": 1.8515250038705682, "grad_norm": 0.9464482069015503, "learning_rate": 0.0003703933106224838, "loss": 2.9119, "step": 5980 }, { "epoch": 1.8546214584300975, "grad_norm": 0.8773711919784546, "learning_rate": 0.0003710126974295447, "loss": 2.9222, "step": 5990 }, { "epoch": 1.8577179129896269, "grad_norm": 0.8919110894203186, "learning_rate": 0.00037163208423660575, "loss": 2.9056, "step": 6000 }, { "epoch": 1.8608143675491562, "grad_norm": 0.9436878561973572, "learning_rate": 0.0003722514710436668, "loss": 2.9095, "step": 6010 }, { "epoch": 1.8639108221086855, "grad_norm": 0.9595790505409241, "learning_rate": 0.00037287085785072776, "loss": 2.9047, "step": 6020 }, { "epoch": 1.8670072766682149, "grad_norm": 0.8692799806594849, "learning_rate": 0.0003734902446577888, "loss": 2.905, "step": 6030 }, { "epoch": 1.8701037312277442, "grad_norm": 0.9274528622627258, "learning_rate": 0.00037410963146484983, "loss": 2.9251, "step": 6040 }, { "epoch": 1.8732001857872735, "grad_norm": 0.8798776268959045, "learning_rate": 0.0003747290182719108, "loss": 2.9113, "step": 6050 }, { "epoch": 1.8762966403468029, "grad_norm": 0.8613748550415039, "learning_rate": 0.00037534840507897184, "loss": 2.9077, "step": 6060 }, { "epoch": 1.8793930949063322, "grad_norm": 0.8926125764846802, "learning_rate": 0.0003759677918860329, "loss": 2.9029, "step": 6070 }, { "epoch": 1.8824895494658616, "grad_norm": 0.9414944052696228, "learning_rate": 0.00037658717869309386, "loss": 2.8968, "step": 6080 }, { "epoch": 1.885586004025391, "grad_norm": 0.8922074437141418, "learning_rate": 0.0003772065655001549, "loss": 2.8992, "step": 6090 }, { "epoch": 1.8886824585849202, "grad_norm": 0.9254492521286011, "learning_rate": 0.0003778259523072158, "loss": 2.912, "step": 6100 }, { "epoch": 1.8917789131444496, "grad_norm": 0.8882949948310852, "learning_rate": 0.00037844533911427685, "loss": 2.8972, "step": 6110 }, { "epoch": 1.894875367703979, "grad_norm": 0.874482274055481, "learning_rate": 0.0003790647259213379, "loss": 2.8848, "step": 6120 }, { "epoch": 1.8979718222635082, "grad_norm": 0.8989077210426331, "learning_rate": 0.00037968411272839886, "loss": 2.8934, "step": 6130 }, { "epoch": 1.9010682768230376, "grad_norm": 0.9361928105354309, "learning_rate": 0.0003803034995354599, "loss": 2.8697, "step": 6140 }, { "epoch": 1.904164731382567, "grad_norm": 0.8788303732872009, "learning_rate": 0.0003809228863425209, "loss": 2.8989, "step": 6150 }, { "epoch": 1.9072611859420963, "grad_norm": 0.8196372985839844, "learning_rate": 0.0003815422731495819, "loss": 2.8913, "step": 6160 }, { "epoch": 1.9103576405016256, "grad_norm": 0.8973246216773987, "learning_rate": 0.00038216165995664294, "loss": 2.8941, "step": 6170 }, { "epoch": 1.913454095061155, "grad_norm": 0.951608419418335, "learning_rate": 0.00038278104676370397, "loss": 2.8941, "step": 6180 }, { "epoch": 1.9165505496206843, "grad_norm": 0.87721186876297, "learning_rate": 0.00038340043357076495, "loss": 2.9039, "step": 6190 }, { "epoch": 1.9196470041802136, "grad_norm": 0.8995383381843567, "learning_rate": 0.00038401982037782593, "loss": 2.8978, "step": 6200 }, { "epoch": 1.922743458739743, "grad_norm": 0.9441946148872375, "learning_rate": 0.00038463920718488696, "loss": 2.8774, "step": 6210 }, { "epoch": 1.9258399132992723, "grad_norm": 0.8960248231887817, "learning_rate": 0.00038525859399194794, "loss": 2.8908, "step": 6220 }, { "epoch": 1.9289363678588016, "grad_norm": 0.9116747975349426, "learning_rate": 0.000385877980799009, "loss": 2.8639, "step": 6230 }, { "epoch": 1.932032822418331, "grad_norm": 0.8798891305923462, "learning_rate": 0.00038649736760607, "loss": 2.86, "step": 6240 }, { "epoch": 1.9351292769778603, "grad_norm": 0.8671932816505432, "learning_rate": 0.000387116754413131, "loss": 2.871, "step": 6250 }, { "epoch": 1.9382257315373896, "grad_norm": 0.9382427930831909, "learning_rate": 0.000387736141220192, "loss": 2.8508, "step": 6260 }, { "epoch": 1.941322186096919, "grad_norm": 0.9341138005256653, "learning_rate": 0.00038835552802725306, "loss": 2.8717, "step": 6270 }, { "epoch": 1.9444186406564483, "grad_norm": 0.9240859150886536, "learning_rate": 0.00038897491483431404, "loss": 2.8802, "step": 6280 }, { "epoch": 1.9475150952159777, "grad_norm": 0.9910873174667358, "learning_rate": 0.00038959430164137507, "loss": 2.8709, "step": 6290 }, { "epoch": 1.950611549775507, "grad_norm": 0.9003307223320007, "learning_rate": 0.0003902136884484361, "loss": 2.8732, "step": 6300 }, { "epoch": 1.9537080043350363, "grad_norm": 0.904257595539093, "learning_rate": 0.00039083307525549703, "loss": 2.8876, "step": 6310 }, { "epoch": 1.9568044588945657, "grad_norm": 0.978615403175354, "learning_rate": 0.00039145246206255806, "loss": 2.8684, "step": 6320 }, { "epoch": 1.959900913454095, "grad_norm": 0.8782775402069092, "learning_rate": 0.0003920718488696191, "loss": 2.8677, "step": 6330 }, { "epoch": 1.9629973680136246, "grad_norm": 0.9640995860099792, "learning_rate": 0.0003926912356766801, "loss": 2.8568, "step": 6340 }, { "epoch": 1.966093822573154, "grad_norm": 0.8807209134101868, "learning_rate": 0.0003933106224837411, "loss": 2.8618, "step": 6350 }, { "epoch": 1.9691902771326832, "grad_norm": 0.8921664357185364, "learning_rate": 0.00039393000929080214, "loss": 2.8788, "step": 6360 }, { "epoch": 1.9722867316922126, "grad_norm": 0.9727539420127869, "learning_rate": 0.0003945493960978631, "loss": 2.8512, "step": 6370 }, { "epoch": 1.975383186251742, "grad_norm": 0.8913626670837402, "learning_rate": 0.00039516878290492415, "loss": 2.8604, "step": 6380 }, { "epoch": 1.9784796408112713, "grad_norm": 0.8825446963310242, "learning_rate": 0.0003957881697119852, "loss": 2.8448, "step": 6390 }, { "epoch": 1.9815760953708006, "grad_norm": 0.916666567325592, "learning_rate": 0.00039640755651904617, "loss": 2.8625, "step": 6400 }, { "epoch": 1.98467254993033, "grad_norm": 1.0008190870285034, "learning_rate": 0.00039702694332610715, "loss": 2.8631, "step": 6410 }, { "epoch": 1.9877690044898593, "grad_norm": 0.8584704399108887, "learning_rate": 0.0003976463301331682, "loss": 2.8701, "step": 6420 }, { "epoch": 1.9908654590493886, "grad_norm": 0.9079132676124573, "learning_rate": 0.00039826571694022916, "loss": 2.8453, "step": 6430 }, { "epoch": 1.993961913608918, "grad_norm": 0.8909833431243896, "learning_rate": 0.0003988851037472902, "loss": 2.8315, "step": 6440 }, { "epoch": 1.9970583681684473, "grad_norm": 0.9206358194351196, "learning_rate": 0.00039950449055435117, "loss": 2.8694, "step": 6450 }, { "epoch": 2.0, "grad_norm": 0.6666725277900696, "learning_rate": 0.0004001238773614122, "loss": 2.7051, "step": 6460 }, { "epoch": 2.0030964545595293, "grad_norm": 0.8826514482498169, "learning_rate": 0.00040074326416847324, "loss": 2.8328, "step": 6470 }, { "epoch": 2.0061929091190587, "grad_norm": 0.922680139541626, "learning_rate": 0.0004013626509755342, "loss": 2.852, "step": 6480 }, { "epoch": 2.009289363678588, "grad_norm": 0.9056729674339294, "learning_rate": 0.00040198203778259525, "loss": 2.8423, "step": 6490 }, { "epoch": 2.0123858182381174, "grad_norm": 0.866322934627533, "learning_rate": 0.0004026014245896563, "loss": 2.8412, "step": 6500 }, { "epoch": 2.0154822727976467, "grad_norm": 0.9588058590888977, "learning_rate": 0.0004032208113967172, "loss": 2.8526, "step": 6510 }, { "epoch": 2.018578727357176, "grad_norm": 0.9247243404388428, "learning_rate": 0.00040384019820377824, "loss": 2.8271, "step": 6520 }, { "epoch": 2.0216751819167054, "grad_norm": 0.8787789940834045, "learning_rate": 0.0004044595850108393, "loss": 2.8043, "step": 6530 }, { "epoch": 2.0247716364762347, "grad_norm": 0.8963256478309631, "learning_rate": 0.00040507897181790025, "loss": 2.8162, "step": 6540 }, { "epoch": 2.027868091035764, "grad_norm": 0.9025070071220398, "learning_rate": 0.0004056983586249613, "loss": 2.8226, "step": 6550 }, { "epoch": 2.0309645455952934, "grad_norm": 0.8822202086448669, "learning_rate": 0.0004063177454320223, "loss": 2.8284, "step": 6560 }, { "epoch": 2.0340610001548227, "grad_norm": 0.9176104068756104, "learning_rate": 0.0004069371322390833, "loss": 2.8379, "step": 6570 }, { "epoch": 2.037157454714352, "grad_norm": 0.9508628845214844, "learning_rate": 0.00040755651904614433, "loss": 2.8113, "step": 6580 }, { "epoch": 2.0402539092738814, "grad_norm": 0.9238744378089905, "learning_rate": 0.00040817590585320537, "loss": 2.8221, "step": 6590 }, { "epoch": 2.0433503638334107, "grad_norm": 0.8854493498802185, "learning_rate": 0.00040879529266026635, "loss": 2.8139, "step": 6600 }, { "epoch": 2.04644681839294, "grad_norm": 0.8652548789978027, "learning_rate": 0.0004094146794673274, "loss": 2.8153, "step": 6610 }, { "epoch": 2.0495432729524694, "grad_norm": 0.8663405179977417, "learning_rate": 0.00041003406627438836, "loss": 2.8098, "step": 6620 }, { "epoch": 2.0526397275119987, "grad_norm": 0.8482099175453186, "learning_rate": 0.00041065345308144934, "loss": 2.8102, "step": 6630 }, { "epoch": 2.055736182071528, "grad_norm": 0.895483672618866, "learning_rate": 0.00041127283988851037, "loss": 2.8014, "step": 6640 }, { "epoch": 2.0588326366310574, "grad_norm": 0.8933889865875244, "learning_rate": 0.0004118922266955714, "loss": 2.8008, "step": 6650 }, { "epoch": 2.0619290911905868, "grad_norm": 0.87566739320755, "learning_rate": 0.0004125116135026324, "loss": 2.8055, "step": 6660 }, { "epoch": 2.065025545750116, "grad_norm": 0.9240240454673767, "learning_rate": 0.0004131310003096934, "loss": 2.8249, "step": 6670 }, { "epoch": 2.0681220003096454, "grad_norm": 0.9362452626228333, "learning_rate": 0.00041375038711675445, "loss": 2.8128, "step": 6680 }, { "epoch": 2.0712184548691748, "grad_norm": 0.859845757484436, "learning_rate": 0.00041436977392381543, "loss": 2.7887, "step": 6690 }, { "epoch": 2.074314909428704, "grad_norm": 0.9458219408988953, "learning_rate": 0.00041498916073087646, "loss": 2.8087, "step": 6700 }, { "epoch": 2.0774113639882335, "grad_norm": 0.9015805125236511, "learning_rate": 0.0004156085475379375, "loss": 2.8197, "step": 6710 }, { "epoch": 2.080507818547763, "grad_norm": 0.8841304779052734, "learning_rate": 0.0004162279343449984, "loss": 2.793, "step": 6720 }, { "epoch": 2.083604273107292, "grad_norm": 0.9217279553413391, "learning_rate": 0.00041684732115205946, "loss": 2.8279, "step": 6730 }, { "epoch": 2.0867007276668215, "grad_norm": 0.9141611456871033, "learning_rate": 0.0004174667079591205, "loss": 2.7922, "step": 6740 }, { "epoch": 2.089797182226351, "grad_norm": 0.8566716313362122, "learning_rate": 0.00041808609476618147, "loss": 2.8088, "step": 6750 }, { "epoch": 2.09289363678588, "grad_norm": 0.9103225469589233, "learning_rate": 0.0004187054815732425, "loss": 2.8134, "step": 6760 }, { "epoch": 2.0959900913454095, "grad_norm": 0.8901599049568176, "learning_rate": 0.0004193248683803035, "loss": 2.8114, "step": 6770 }, { "epoch": 2.099086545904939, "grad_norm": 0.9474543333053589, "learning_rate": 0.0004199442551873645, "loss": 2.7907, "step": 6780 }, { "epoch": 2.102183000464468, "grad_norm": 0.8805556297302246, "learning_rate": 0.00042056364199442555, "loss": 2.8023, "step": 6790 }, { "epoch": 2.1052794550239975, "grad_norm": 0.9209165573120117, "learning_rate": 0.00042118302880148653, "loss": 2.8247, "step": 6800 }, { "epoch": 2.108375909583527, "grad_norm": 0.9121336340904236, "learning_rate": 0.00042180241560854756, "loss": 2.7983, "step": 6810 }, { "epoch": 2.111472364143056, "grad_norm": 0.883575439453125, "learning_rate": 0.0004224218024156086, "loss": 2.7973, "step": 6820 }, { "epoch": 2.1145688187025855, "grad_norm": 0.8569662570953369, "learning_rate": 0.0004230411892226695, "loss": 2.807, "step": 6830 }, { "epoch": 2.117665273262115, "grad_norm": 0.8648683428764343, "learning_rate": 0.00042366057602973055, "loss": 2.7953, "step": 6840 }, { "epoch": 2.120761727821644, "grad_norm": 1.0288830995559692, "learning_rate": 0.0004242799628367916, "loss": 2.7934, "step": 6850 }, { "epoch": 2.1238581823811735, "grad_norm": 0.9366074800491333, "learning_rate": 0.00042489934964385257, "loss": 2.8014, "step": 6860 }, { "epoch": 2.126954636940703, "grad_norm": 0.9614273905754089, "learning_rate": 0.0004255187364509136, "loss": 2.7822, "step": 6870 }, { "epoch": 2.130051091500232, "grad_norm": 0.8939881324768066, "learning_rate": 0.00042613812325797463, "loss": 2.8195, "step": 6880 }, { "epoch": 2.1331475460597615, "grad_norm": 0.9166781902313232, "learning_rate": 0.0004267575100650356, "loss": 2.7889, "step": 6890 }, { "epoch": 2.136244000619291, "grad_norm": 0.8826269507408142, "learning_rate": 0.00042737689687209665, "loss": 2.8041, "step": 6900 }, { "epoch": 2.13934045517882, "grad_norm": 0.9127874970436096, "learning_rate": 0.0004279962836791577, "loss": 2.7986, "step": 6910 }, { "epoch": 2.1424369097383495, "grad_norm": 0.9072954654693604, "learning_rate": 0.00042861567048621866, "loss": 2.8031, "step": 6920 }, { "epoch": 2.145533364297879, "grad_norm": 0.8833560943603516, "learning_rate": 0.00042923505729327964, "loss": 2.7911, "step": 6930 }, { "epoch": 2.1486298188574082, "grad_norm": 0.861221194267273, "learning_rate": 0.00042985444410034067, "loss": 2.8073, "step": 6940 }, { "epoch": 2.1517262734169376, "grad_norm": 0.9040530323982239, "learning_rate": 0.00043047383090740165, "loss": 2.7849, "step": 6950 }, { "epoch": 2.154822727976467, "grad_norm": 0.9143641591072083, "learning_rate": 0.0004310932177144627, "loss": 2.7896, "step": 6960 }, { "epoch": 2.1579191825359962, "grad_norm": 0.8545592427253723, "learning_rate": 0.0004317126045215237, "loss": 2.7971, "step": 6970 }, { "epoch": 2.1610156370955256, "grad_norm": 0.9303133487701416, "learning_rate": 0.0004323319913285847, "loss": 2.7784, "step": 6980 }, { "epoch": 2.164112091655055, "grad_norm": 0.9570648074150085, "learning_rate": 0.00043295137813564573, "loss": 2.7977, "step": 6990 }, { "epoch": 2.1672085462145843, "grad_norm": 0.906696081161499, "learning_rate": 0.00043357076494270676, "loss": 2.7947, "step": 7000 }, { "epoch": 2.1703050007741136, "grad_norm": 0.8919961452484131, "learning_rate": 0.00043419015174976774, "loss": 2.7926, "step": 7010 }, { "epoch": 2.173401455333643, "grad_norm": 0.8740367889404297, "learning_rate": 0.0004348095385568288, "loss": 2.7747, "step": 7020 }, { "epoch": 2.1764979098931723, "grad_norm": 0.8785775899887085, "learning_rate": 0.0004354289253638898, "loss": 2.791, "step": 7030 }, { "epoch": 2.1795943644527016, "grad_norm": 0.9824354648590088, "learning_rate": 0.00043604831217095073, "loss": 2.7756, "step": 7040 }, { "epoch": 2.182690819012231, "grad_norm": 0.9581257104873657, "learning_rate": 0.00043666769897801177, "loss": 2.7893, "step": 7050 }, { "epoch": 2.1857872735717603, "grad_norm": 0.9003785252571106, "learning_rate": 0.0004372870857850728, "loss": 2.7857, "step": 7060 }, { "epoch": 2.1888837281312896, "grad_norm": 0.9463407397270203, "learning_rate": 0.0004379064725921338, "loss": 2.7608, "step": 7070 }, { "epoch": 2.191980182690819, "grad_norm": 0.9050635695457458, "learning_rate": 0.0004385258593991948, "loss": 2.7703, "step": 7080 }, { "epoch": 2.1950766372503483, "grad_norm": 0.8689008951187134, "learning_rate": 0.0004391452462062558, "loss": 2.7742, "step": 7090 }, { "epoch": 2.1981730918098776, "grad_norm": 0.8723441958427429, "learning_rate": 0.0004397646330133168, "loss": 2.7694, "step": 7100 }, { "epoch": 2.201269546369407, "grad_norm": 0.8924479484558105, "learning_rate": 0.00044038401982037786, "loss": 2.7906, "step": 7110 }, { "epoch": 2.2043660009289363, "grad_norm": 0.919276773929596, "learning_rate": 0.00044100340662743884, "loss": 2.7872, "step": 7120 }, { "epoch": 2.2074624554884656, "grad_norm": 0.901465654373169, "learning_rate": 0.00044162279343449987, "loss": 2.7465, "step": 7130 }, { "epoch": 2.210558910047995, "grad_norm": 0.8734842538833618, "learning_rate": 0.00044224218024156085, "loss": 2.7662, "step": 7140 }, { "epoch": 2.2136553646075243, "grad_norm": 0.9729484915733337, "learning_rate": 0.00044286156704862183, "loss": 2.7681, "step": 7150 }, { "epoch": 2.2167518191670537, "grad_norm": 0.8634438514709473, "learning_rate": 0.00044348095385568286, "loss": 2.7694, "step": 7160 }, { "epoch": 2.219848273726583, "grad_norm": 0.8623734712600708, "learning_rate": 0.0004441003406627439, "loss": 2.7775, "step": 7170 }, { "epoch": 2.2229447282861123, "grad_norm": 0.9596241116523743, "learning_rate": 0.0004447197274698049, "loss": 2.7916, "step": 7180 }, { "epoch": 2.2260411828456417, "grad_norm": 0.8765792846679688, "learning_rate": 0.0004453391142768659, "loss": 2.7529, "step": 7190 }, { "epoch": 2.229137637405171, "grad_norm": 0.887290894985199, "learning_rate": 0.00044595850108392694, "loss": 2.7697, "step": 7200 }, { "epoch": 2.2322340919647004, "grad_norm": 0.842238187789917, "learning_rate": 0.0004465778878909879, "loss": 2.7521, "step": 7210 }, { "epoch": 2.2353305465242297, "grad_norm": 0.9190672039985657, "learning_rate": 0.00044719727469804896, "loss": 2.7611, "step": 7220 }, { "epoch": 2.238427001083759, "grad_norm": 0.8801867365837097, "learning_rate": 0.00044781666150511, "loss": 2.7656, "step": 7230 }, { "epoch": 2.2415234556432884, "grad_norm": 0.9014734029769897, "learning_rate": 0.0004484360483121709, "loss": 2.7855, "step": 7240 }, { "epoch": 2.2446199102028177, "grad_norm": 0.8749867081642151, "learning_rate": 0.00044905543511923195, "loss": 2.7683, "step": 7250 }, { "epoch": 2.247716364762347, "grad_norm": 0.8823255896568298, "learning_rate": 0.000449674821926293, "loss": 2.7468, "step": 7260 }, { "epoch": 2.2508128193218764, "grad_norm": 1.020506739616394, "learning_rate": 0.00045029420873335396, "loss": 2.7633, "step": 7270 }, { "epoch": 2.2539092738814057, "grad_norm": 0.9416619539260864, "learning_rate": 0.000450913595540415, "loss": 2.7598, "step": 7280 }, { "epoch": 2.257005728440935, "grad_norm": 0.8934683203697205, "learning_rate": 0.00045153298234747603, "loss": 2.767, "step": 7290 }, { "epoch": 2.2601021830004644, "grad_norm": 0.9301040768623352, "learning_rate": 0.000452152369154537, "loss": 2.768, "step": 7300 }, { "epoch": 2.2631986375599937, "grad_norm": 0.9030665159225464, "learning_rate": 0.00045277175596159804, "loss": 2.7468, "step": 7310 }, { "epoch": 2.266295092119523, "grad_norm": 0.8950912952423096, "learning_rate": 0.0004533911427686591, "loss": 2.7583, "step": 7320 }, { "epoch": 2.2693915466790524, "grad_norm": 0.9231360554695129, "learning_rate": 0.00045401052957572005, "loss": 2.768, "step": 7330 }, { "epoch": 2.2724880012385817, "grad_norm": 0.9247618317604065, "learning_rate": 0.0004546299163827811, "loss": 2.7679, "step": 7340 }, { "epoch": 2.275584455798111, "grad_norm": 0.8417907953262329, "learning_rate": 0.00045524930318984207, "loss": 2.7641, "step": 7350 }, { "epoch": 2.2786809103576404, "grad_norm": 0.881175696849823, "learning_rate": 0.00045586868999690305, "loss": 2.7377, "step": 7360 }, { "epoch": 2.2817773649171698, "grad_norm": 0.9351217746734619, "learning_rate": 0.0004564880768039641, "loss": 2.7521, "step": 7370 }, { "epoch": 2.284873819476699, "grad_norm": 0.8650684952735901, "learning_rate": 0.0004571074636110251, "loss": 2.7675, "step": 7380 }, { "epoch": 2.2879702740362284, "grad_norm": 0.922113299369812, "learning_rate": 0.0004577268504180861, "loss": 2.7401, "step": 7390 }, { "epoch": 2.2910667285957578, "grad_norm": 0.8902767896652222, "learning_rate": 0.0004583462372251471, "loss": 2.7772, "step": 7400 }, { "epoch": 2.294163183155287, "grad_norm": 0.8764835596084595, "learning_rate": 0.00045896562403220816, "loss": 2.7526, "step": 7410 }, { "epoch": 2.2972596377148164, "grad_norm": 0.8847823739051819, "learning_rate": 0.00045958501083926914, "loss": 2.7504, "step": 7420 }, { "epoch": 2.300356092274346, "grad_norm": 0.8462940454483032, "learning_rate": 0.00046020439764633017, "loss": 2.7209, "step": 7430 }, { "epoch": 2.303452546833875, "grad_norm": 0.8645547032356262, "learning_rate": 0.00046082378445339115, "loss": 2.7464, "step": 7440 }, { "epoch": 2.3065490013934045, "grad_norm": 0.8842138051986694, "learning_rate": 0.00046144317126045213, "loss": 2.7566, "step": 7450 }, { "epoch": 2.309645455952934, "grad_norm": 0.8625742197036743, "learning_rate": 0.00046206255806751316, "loss": 2.753, "step": 7460 }, { "epoch": 2.312741910512463, "grad_norm": 0.922121524810791, "learning_rate": 0.00046268194487457414, "loss": 2.75, "step": 7470 }, { "epoch": 2.3158383650719925, "grad_norm": 0.8739849925041199, "learning_rate": 0.0004633013316816352, "loss": 2.7513, "step": 7480 }, { "epoch": 2.318934819631522, "grad_norm": 0.8614432215690613, "learning_rate": 0.0004639207184886962, "loss": 2.75, "step": 7490 }, { "epoch": 2.322031274191051, "grad_norm": 0.8714541792869568, "learning_rate": 0.0004645401052957572, "loss": 2.7297, "step": 7500 }, { "epoch": 2.3251277287505805, "grad_norm": 0.9732015132904053, "learning_rate": 0.0004651594921028182, "loss": 2.7529, "step": 7510 }, { "epoch": 2.32822418331011, "grad_norm": 0.9061838388442993, "learning_rate": 0.00046577887890987925, "loss": 2.7541, "step": 7520 }, { "epoch": 2.331320637869639, "grad_norm": 1.0056427717208862, "learning_rate": 0.00046639826571694023, "loss": 2.7381, "step": 7530 }, { "epoch": 2.3344170924291685, "grad_norm": 0.9382318258285522, "learning_rate": 0.00046701765252400127, "loss": 2.758, "step": 7540 }, { "epoch": 2.337513546988698, "grad_norm": 0.9322879314422607, "learning_rate": 0.0004676370393310623, "loss": 2.7196, "step": 7550 }, { "epoch": 2.340610001548227, "grad_norm": 0.8709734678268433, "learning_rate": 0.0004682564261381232, "loss": 2.7259, "step": 7560 }, { "epoch": 2.3437064561077565, "grad_norm": 0.8605784177780151, "learning_rate": 0.00046887581294518426, "loss": 2.7116, "step": 7570 }, { "epoch": 2.346802910667286, "grad_norm": 0.8777926564216614, "learning_rate": 0.0004694951997522453, "loss": 2.7389, "step": 7580 }, { "epoch": 2.349899365226815, "grad_norm": 0.9535753130912781, "learning_rate": 0.00047011458655930627, "loss": 2.7402, "step": 7590 }, { "epoch": 2.3529958197863445, "grad_norm": 0.8377962708473206, "learning_rate": 0.0004707339733663673, "loss": 2.7672, "step": 7600 }, { "epoch": 2.356092274345874, "grad_norm": 0.9221674799919128, "learning_rate": 0.00047135336017342834, "loss": 2.7341, "step": 7610 }, { "epoch": 2.359188728905403, "grad_norm": 0.9175540804862976, "learning_rate": 0.0004719727469804893, "loss": 2.7332, "step": 7620 }, { "epoch": 2.3622851834649325, "grad_norm": 0.896039605140686, "learning_rate": 0.00047259213378755035, "loss": 2.7587, "step": 7630 }, { "epoch": 2.365381638024462, "grad_norm": 0.8460658192634583, "learning_rate": 0.0004732115205946114, "loss": 2.7378, "step": 7640 }, { "epoch": 2.3684780925839912, "grad_norm": 0.9001418352127075, "learning_rate": 0.00047383090740167236, "loss": 2.7374, "step": 7650 }, { "epoch": 2.3715745471435206, "grad_norm": 0.9807076454162598, "learning_rate": 0.00047445029420873334, "loss": 2.723, "step": 7660 }, { "epoch": 2.37467100170305, "grad_norm": 0.8731216192245483, "learning_rate": 0.0004750696810157944, "loss": 2.7112, "step": 7670 }, { "epoch": 2.3777674562625792, "grad_norm": 0.8750482201576233, "learning_rate": 0.00047568906782285536, "loss": 2.7016, "step": 7680 }, { "epoch": 2.3808639108221086, "grad_norm": 0.8985123634338379, "learning_rate": 0.0004763084546299164, "loss": 2.7462, "step": 7690 }, { "epoch": 2.383960365381638, "grad_norm": 0.8914074301719666, "learning_rate": 0.0004769278414369774, "loss": 2.7253, "step": 7700 }, { "epoch": 2.3870568199411673, "grad_norm": 0.8856596350669861, "learning_rate": 0.0004775472282440384, "loss": 2.7438, "step": 7710 }, { "epoch": 2.3901532745006966, "grad_norm": 0.9476223587989807, "learning_rate": 0.00047816661505109944, "loss": 2.7208, "step": 7720 }, { "epoch": 2.393249729060226, "grad_norm": 0.8765897750854492, "learning_rate": 0.00047878600185816047, "loss": 2.7302, "step": 7730 }, { "epoch": 2.3963461836197553, "grad_norm": 0.9087428450584412, "learning_rate": 0.00047940538866522145, "loss": 2.7225, "step": 7740 }, { "epoch": 2.3994426381792846, "grad_norm": 0.9276483058929443, "learning_rate": 0.0004800247754722825, "loss": 2.7297, "step": 7750 }, { "epoch": 2.402539092738814, "grad_norm": 0.8988469243049622, "learning_rate": 0.0004806441622793434, "loss": 2.7167, "step": 7760 }, { "epoch": 2.4056355472983433, "grad_norm": 0.865112841129303, "learning_rate": 0.00048126354908640444, "loss": 2.7187, "step": 7770 }, { "epoch": 2.4087320018578726, "grad_norm": 0.8832447528839111, "learning_rate": 0.0004818829358934655, "loss": 2.7123, "step": 7780 }, { "epoch": 2.411828456417402, "grad_norm": 0.8970694541931152, "learning_rate": 0.00048250232270052645, "loss": 2.7255, "step": 7790 }, { "epoch": 2.4149249109769313, "grad_norm": 0.8232760429382324, "learning_rate": 0.0004831217095075875, "loss": 2.7315, "step": 7800 }, { "epoch": 2.4180213655364606, "grad_norm": 0.9075847268104553, "learning_rate": 0.0004837410963146485, "loss": 2.7098, "step": 7810 }, { "epoch": 2.42111782009599, "grad_norm": 0.871097981929779, "learning_rate": 0.0004843604831217095, "loss": 2.7172, "step": 7820 }, { "epoch": 2.4242142746555193, "grad_norm": 0.8684946894645691, "learning_rate": 0.00048497986992877053, "loss": 2.7031, "step": 7830 }, { "epoch": 2.4273107292150486, "grad_norm": 0.9100140929222107, "learning_rate": 0.00048559925673583157, "loss": 2.7175, "step": 7840 }, { "epoch": 2.430407183774578, "grad_norm": 0.8607642650604248, "learning_rate": 0.00048621864354289254, "loss": 2.7149, "step": 7850 }, { "epoch": 2.4335036383341073, "grad_norm": 0.865871012210846, "learning_rate": 0.0004868380303499536, "loss": 2.7139, "step": 7860 }, { "epoch": 2.4366000928936367, "grad_norm": 0.9190123677253723, "learning_rate": 0.00048745741715701456, "loss": 2.7167, "step": 7870 }, { "epoch": 2.439696547453166, "grad_norm": 0.8954902291297913, "learning_rate": 0.00048807680396407554, "loss": 2.7041, "step": 7880 }, { "epoch": 2.4427930020126953, "grad_norm": 0.9070473313331604, "learning_rate": 0.0004886961907711366, "loss": 2.712, "step": 7890 }, { "epoch": 2.4458894565722247, "grad_norm": 1.2090919017791748, "learning_rate": 0.0004893155775781977, "loss": 2.7241, "step": 7900 }, { "epoch": 2.448985911131754, "grad_norm": 0.8956063985824585, "learning_rate": 0.0004899349643852586, "loss": 2.7089, "step": 7910 }, { "epoch": 2.4520823656912833, "grad_norm": 0.8796259164810181, "learning_rate": 0.0004905543511923196, "loss": 2.6996, "step": 7920 }, { "epoch": 2.4551788202508127, "grad_norm": 0.8752288222312927, "learning_rate": 0.0004911737379993806, "loss": 2.7141, "step": 7930 }, { "epoch": 2.458275274810342, "grad_norm": 0.8404427170753479, "learning_rate": 0.0004917931248064416, "loss": 2.7086, "step": 7940 }, { "epoch": 2.4613717293698714, "grad_norm": 0.8801198601722717, "learning_rate": 0.0004924125116135027, "loss": 2.716, "step": 7950 }, { "epoch": 2.4644681839294007, "grad_norm": 0.8937883377075195, "learning_rate": 0.0004930318984205636, "loss": 2.6963, "step": 7960 }, { "epoch": 2.46756463848893, "grad_norm": 0.8348713517189026, "learning_rate": 0.0004936512852276246, "loss": 2.7158, "step": 7970 }, { "epoch": 2.4706610930484594, "grad_norm": 0.9168616533279419, "learning_rate": 0.0004942706720346857, "loss": 2.7212, "step": 7980 }, { "epoch": 2.4737575476079887, "grad_norm": 0.8765811324119568, "learning_rate": 0.0004948900588417467, "loss": 2.7037, "step": 7990 }, { "epoch": 2.476854002167518, "grad_norm": 0.9563819766044617, "learning_rate": 0.0004955094456488077, "loss": 2.7076, "step": 8000 }, { "epoch": 2.4799504567270474, "grad_norm": 0.9105591177940369, "learning_rate": 0.0004961288324558688, "loss": 2.704, "step": 8010 }, { "epoch": 2.4830469112865767, "grad_norm": 0.8907128572463989, "learning_rate": 0.0004967482192629297, "loss": 2.711, "step": 8020 }, { "epoch": 2.486143365846106, "grad_norm": 0.9110057353973389, "learning_rate": 0.0004973676060699907, "loss": 2.715, "step": 8030 }, { "epoch": 2.4892398204056354, "grad_norm": 0.8938244581222534, "learning_rate": 0.0004979869928770517, "loss": 2.7236, "step": 8040 }, { "epoch": 2.4923362749651647, "grad_norm": 0.8680298328399658, "learning_rate": 0.0004986063796841128, "loss": 2.7141, "step": 8050 }, { "epoch": 2.495432729524694, "grad_norm": 1.2556971311569214, "learning_rate": 0.0004992257664911738, "loss": 2.7182, "step": 8060 }, { "epoch": 2.4985291840842234, "grad_norm": 0.8885079026222229, "learning_rate": 0.0004998451532982347, "loss": 2.7178, "step": 8070 }, { "epoch": 2.5016256386437528, "grad_norm": 0.8683394193649292, "learning_rate": 0.0005004645401052957, "loss": 2.713, "step": 8080 }, { "epoch": 2.504722093203282, "grad_norm": 0.8895092010498047, "learning_rate": 0.0005010839269123568, "loss": 2.7244, "step": 8090 }, { "epoch": 2.5078185477628114, "grad_norm": 0.9000723958015442, "learning_rate": 0.0005017033137194178, "loss": 2.7018, "step": 8100 }, { "epoch": 2.5109150023223408, "grad_norm": 0.8466011881828308, "learning_rate": 0.0005023227005264788, "loss": 2.7252, "step": 8110 }, { "epoch": 2.51401145688187, "grad_norm": 0.8740931749343872, "learning_rate": 0.0005029420873335399, "loss": 2.7023, "step": 8120 }, { "epoch": 2.5171079114413994, "grad_norm": 0.9173566102981567, "learning_rate": 0.0005035614741406008, "loss": 2.7158, "step": 8130 }, { "epoch": 2.520204366000929, "grad_norm": 0.9136703610420227, "learning_rate": 0.0005041808609476618, "loss": 2.7081, "step": 8140 }, { "epoch": 2.523300820560458, "grad_norm": 0.9001860022544861, "learning_rate": 0.0005048002477547229, "loss": 2.6879, "step": 8150 }, { "epoch": 2.5263972751199875, "grad_norm": 0.8756097555160522, "learning_rate": 0.0005054196345617839, "loss": 2.714, "step": 8160 }, { "epoch": 2.529493729679517, "grad_norm": 0.8774548768997192, "learning_rate": 0.0005060390213688449, "loss": 2.6751, "step": 8170 }, { "epoch": 2.532590184239046, "grad_norm": 0.8764857649803162, "learning_rate": 0.0005066584081759059, "loss": 2.7045, "step": 8180 }, { "epoch": 2.5356866387985755, "grad_norm": 0.8589802980422974, "learning_rate": 0.0005072777949829669, "loss": 2.7001, "step": 8190 }, { "epoch": 2.538783093358105, "grad_norm": 0.8591241836547852, "learning_rate": 0.0005078971817900279, "loss": 2.6838, "step": 8200 }, { "epoch": 2.541879547917634, "grad_norm": 0.8960736989974976, "learning_rate": 0.000508516568597089, "loss": 2.6847, "step": 8210 }, { "epoch": 2.5449760024771635, "grad_norm": 0.8818134069442749, "learning_rate": 0.00050913595540415, "loss": 2.6907, "step": 8220 }, { "epoch": 2.548072457036693, "grad_norm": 0.8439919948577881, "learning_rate": 0.0005097553422112108, "loss": 2.6649, "step": 8230 }, { "epoch": 2.551168911596222, "grad_norm": 0.953252911567688, "learning_rate": 0.0005103747290182718, "loss": 2.7113, "step": 8240 }, { "epoch": 2.5542653661557515, "grad_norm": 0.8814793825149536, "learning_rate": 0.0005109941158253329, "loss": 2.6971, "step": 8250 }, { "epoch": 2.557361820715281, "grad_norm": 0.8562922477722168, "learning_rate": 0.0005116135026323939, "loss": 2.6816, "step": 8260 }, { "epoch": 2.56045827527481, "grad_norm": 0.9286318421363831, "learning_rate": 0.0005122328894394549, "loss": 2.6976, "step": 8270 }, { "epoch": 2.5635547298343395, "grad_norm": 0.8571282029151917, "learning_rate": 0.000512852276246516, "loss": 2.6931, "step": 8280 }, { "epoch": 2.566651184393869, "grad_norm": 0.8638617396354675, "learning_rate": 0.0005134716630535769, "loss": 2.7112, "step": 8290 }, { "epoch": 2.569747638953398, "grad_norm": 0.8954980969429016, "learning_rate": 0.0005140910498606379, "loss": 2.6775, "step": 8300 }, { "epoch": 2.5728440935129275, "grad_norm": 0.8603184223175049, "learning_rate": 0.000514710436667699, "loss": 2.6962, "step": 8310 }, { "epoch": 2.575940548072457, "grad_norm": 0.8614330887794495, "learning_rate": 0.00051532982347476, "loss": 2.7119, "step": 8320 }, { "epoch": 2.5790370026319867, "grad_norm": 0.853256106376648, "learning_rate": 0.000515949210281821, "loss": 2.6701, "step": 8330 }, { "epoch": 2.5821334571915155, "grad_norm": 0.9329004883766174, "learning_rate": 0.000516568597088882, "loss": 2.7029, "step": 8340 }, { "epoch": 2.5852299117510453, "grad_norm": 0.8642740249633789, "learning_rate": 0.000517187983895943, "loss": 2.6927, "step": 8350 }, { "epoch": 2.5883263663105742, "grad_norm": 0.8851795196533203, "learning_rate": 0.000517807370703004, "loss": 2.6801, "step": 8360 }, { "epoch": 2.591422820870104, "grad_norm": 0.8649539947509766, "learning_rate": 0.0005184267575100651, "loss": 2.671, "step": 8370 }, { "epoch": 2.594519275429633, "grad_norm": 0.8715213537216187, "learning_rate": 0.0005190461443171261, "loss": 2.69, "step": 8380 }, { "epoch": 2.5976157299891627, "grad_norm": 0.8469790816307068, "learning_rate": 0.000519665531124187, "loss": 2.6773, "step": 8390 }, { "epoch": 2.6007121845486916, "grad_norm": 0.8525969982147217, "learning_rate": 0.0005202849179312481, "loss": 2.6728, "step": 8400 }, { "epoch": 2.6038086391082214, "grad_norm": 0.8539503812789917, "learning_rate": 0.0005209043047383091, "loss": 2.6869, "step": 8410 }, { "epoch": 2.6069050936677503, "grad_norm": 0.877877414226532, "learning_rate": 0.0005215236915453701, "loss": 2.6924, "step": 8420 }, { "epoch": 2.61000154822728, "grad_norm": 0.9159960150718689, "learning_rate": 0.0005221430783524312, "loss": 2.6827, "step": 8430 }, { "epoch": 2.613098002786809, "grad_norm": 0.9159612059593201, "learning_rate": 0.000522762465159492, "loss": 2.6715, "step": 8440 }, { "epoch": 2.6161944573463387, "grad_norm": 0.8842989802360535, "learning_rate": 0.000523381851966553, "loss": 2.6781, "step": 8450 }, { "epoch": 2.6192909119058676, "grad_norm": 0.981275737285614, "learning_rate": 0.0005240012387736141, "loss": 2.6975, "step": 8460 }, { "epoch": 2.6223873664653974, "grad_norm": 0.8604749441146851, "learning_rate": 0.0005246206255806751, "loss": 2.6785, "step": 8470 }, { "epoch": 2.6254838210249263, "grad_norm": 0.880984902381897, "learning_rate": 0.0005252400123877361, "loss": 2.6743, "step": 8480 }, { "epoch": 2.628580275584456, "grad_norm": 0.9086693525314331, "learning_rate": 0.0005258593991947972, "loss": 2.6827, "step": 8490 }, { "epoch": 2.631676730143985, "grad_norm": 0.9209759831428528, "learning_rate": 0.0005264787860018581, "loss": 2.6969, "step": 8500 }, { "epoch": 2.6347731847035147, "grad_norm": 1.1329649686813354, "learning_rate": 0.0005270981728089191, "loss": 2.6682, "step": 8510 }, { "epoch": 2.6378696392630436, "grad_norm": 0.904861569404602, "learning_rate": 0.0005277175596159802, "loss": 2.6765, "step": 8520 }, { "epoch": 2.6409660938225734, "grad_norm": 0.9609228372573853, "learning_rate": 0.0005283369464230412, "loss": 2.6777, "step": 8530 }, { "epoch": 2.6440625483821023, "grad_norm": 0.84135901927948, "learning_rate": 0.0005289563332301022, "loss": 2.6963, "step": 8540 }, { "epoch": 2.647159002941632, "grad_norm": 0.9496148228645325, "learning_rate": 0.0005295757200371633, "loss": 2.6755, "step": 8550 }, { "epoch": 2.650255457501161, "grad_norm": 0.9461915493011475, "learning_rate": 0.0005301951068442242, "loss": 2.6947, "step": 8560 }, { "epoch": 2.6533519120606908, "grad_norm": 0.8542360067367554, "learning_rate": 0.0005308144936512852, "loss": 2.6722, "step": 8570 }, { "epoch": 2.6564483666202197, "grad_norm": 0.9559420347213745, "learning_rate": 0.0005314338804583463, "loss": 2.6781, "step": 8580 }, { "epoch": 2.6595448211797494, "grad_norm": 0.9376833438873291, "learning_rate": 0.0005320532672654073, "loss": 2.7124, "step": 8590 }, { "epoch": 2.6626412757392783, "grad_norm": 0.8750305771827698, "learning_rate": 0.0005326726540724683, "loss": 2.695, "step": 8600 }, { "epoch": 2.665737730298808, "grad_norm": 0.8628771305084229, "learning_rate": 0.0005332920408795294, "loss": 2.6874, "step": 8610 }, { "epoch": 2.668834184858337, "grad_norm": 0.91616290807724, "learning_rate": 0.0005339114276865903, "loss": 2.6733, "step": 8620 }, { "epoch": 2.671930639417867, "grad_norm": 0.8734931349754333, "learning_rate": 0.0005345308144936513, "loss": 2.6805, "step": 8630 }, { "epoch": 2.6750270939773957, "grad_norm": 0.8667175769805908, "learning_rate": 0.0005351502013007124, "loss": 2.6863, "step": 8640 }, { "epoch": 2.6781235485369255, "grad_norm": 0.8947048783302307, "learning_rate": 0.0005357695881077733, "loss": 2.6906, "step": 8650 }, { "epoch": 2.6812200030964544, "grad_norm": 0.9095123410224915, "learning_rate": 0.0005363889749148342, "loss": 2.6741, "step": 8660 }, { "epoch": 2.684316457655984, "grad_norm": 0.8678126335144043, "learning_rate": 0.0005370083617218953, "loss": 2.6532, "step": 8670 }, { "epoch": 2.687412912215513, "grad_norm": 0.8941618800163269, "learning_rate": 0.0005376277485289563, "loss": 2.6733, "step": 8680 }, { "epoch": 2.690509366775043, "grad_norm": 0.9127388596534729, "learning_rate": 0.0005382471353360173, "loss": 2.6864, "step": 8690 }, { "epoch": 2.6936058213345717, "grad_norm": 0.8542888760566711, "learning_rate": 0.0005388665221430784, "loss": 2.6839, "step": 8700 }, { "epoch": 2.6967022758941015, "grad_norm": 0.8937285542488098, "learning_rate": 0.0005394859089501394, "loss": 2.6911, "step": 8710 }, { "epoch": 2.6997987304536304, "grad_norm": 0.9001040458679199, "learning_rate": 0.0005401052957572003, "loss": 2.6785, "step": 8720 }, { "epoch": 2.70289518501316, "grad_norm": 0.9357818365097046, "learning_rate": 0.0005407246825642614, "loss": 2.6959, "step": 8730 }, { "epoch": 2.705991639572689, "grad_norm": 0.9065813422203064, "learning_rate": 0.0005413440693713224, "loss": 2.6838, "step": 8740 }, { "epoch": 2.709088094132219, "grad_norm": 0.8821165561676025, "learning_rate": 0.0005419634561783834, "loss": 2.6618, "step": 8750 }, { "epoch": 2.7121845486917477, "grad_norm": 0.8667876720428467, "learning_rate": 0.0005425828429854445, "loss": 2.6849, "step": 8760 }, { "epoch": 2.7152810032512775, "grad_norm": 0.8643457889556885, "learning_rate": 0.0005432022297925055, "loss": 2.6629, "step": 8770 }, { "epoch": 2.7183774578108064, "grad_norm": 0.8841952681541443, "learning_rate": 0.0005438216165995664, "loss": 2.6605, "step": 8780 }, { "epoch": 2.721473912370336, "grad_norm": 0.9219385385513306, "learning_rate": 0.0005444410034066275, "loss": 2.6594, "step": 8790 }, { "epoch": 2.724570366929865, "grad_norm": 0.9676291942596436, "learning_rate": 0.0005450603902136885, "loss": 2.6796, "step": 8800 }, { "epoch": 2.727666821489395, "grad_norm": 0.9405499696731567, "learning_rate": 0.0005456797770207495, "loss": 2.6928, "step": 8810 }, { "epoch": 2.7307632760489238, "grad_norm": 0.9420516490936279, "learning_rate": 0.0005462991638278106, "loss": 2.6699, "step": 8820 }, { "epoch": 2.7338597306084536, "grad_norm": 0.9792620539665222, "learning_rate": 0.0005469185506348715, "loss": 2.6666, "step": 8830 }, { "epoch": 2.7369561851679824, "grad_norm": 0.9726955890655518, "learning_rate": 0.0005475379374419325, "loss": 2.645, "step": 8840 }, { "epoch": 2.7400526397275122, "grad_norm": 1.020033359527588, "learning_rate": 0.0005481573242489936, "loss": 2.6614, "step": 8850 }, { "epoch": 2.743149094287041, "grad_norm": 1.0454789400100708, "learning_rate": 0.0005487767110560545, "loss": 2.6565, "step": 8860 }, { "epoch": 2.746245548846571, "grad_norm": 0.8889420628547668, "learning_rate": 0.0005493960978631155, "loss": 2.6916, "step": 8870 }, { "epoch": 2.7493420034061, "grad_norm": 0.9025602340698242, "learning_rate": 0.0005500154846701765, "loss": 2.6999, "step": 8880 }, { "epoch": 2.7524384579656296, "grad_norm": 0.8665561079978943, "learning_rate": 0.0005506348714772375, "loss": 2.6385, "step": 8890 }, { "epoch": 2.7555349125251585, "grad_norm": 0.9031399488449097, "learning_rate": 0.0005512542582842985, "loss": 2.6505, "step": 8900 }, { "epoch": 2.7586313670846883, "grad_norm": 0.9555135369300842, "learning_rate": 0.0005518736450913595, "loss": 2.6816, "step": 8910 }, { "epoch": 2.761727821644217, "grad_norm": 0.9307361245155334, "learning_rate": 0.0005524930318984206, "loss": 2.6715, "step": 8920 }, { "epoch": 2.764824276203747, "grad_norm": 0.9591286778450012, "learning_rate": 0.0005531124187054816, "loss": 2.6822, "step": 8930 }, { "epoch": 2.767920730763276, "grad_norm": 0.9070897698402405, "learning_rate": 0.0005537318055125425, "loss": 2.6702, "step": 8940 }, { "epoch": 2.7710171853228056, "grad_norm": 0.9256467819213867, "learning_rate": 0.0005543511923196036, "loss": 2.6555, "step": 8950 }, { "epoch": 2.7741136398823345, "grad_norm": 0.993756115436554, "learning_rate": 0.0005549705791266646, "loss": 2.6547, "step": 8960 }, { "epoch": 2.7772100944418643, "grad_norm": 0.9043955206871033, "learning_rate": 0.0005555899659337256, "loss": 2.6905, "step": 8970 }, { "epoch": 2.780306549001393, "grad_norm": 0.9000112414360046, "learning_rate": 0.0005562093527407867, "loss": 2.6534, "step": 8980 }, { "epoch": 2.783403003560923, "grad_norm": 0.9210097789764404, "learning_rate": 0.0005568287395478476, "loss": 2.6717, "step": 8990 }, { "epoch": 2.786499458120452, "grad_norm": 0.8958888053894043, "learning_rate": 0.0005574481263549086, "loss": 2.6856, "step": 9000 }, { "epoch": 2.7895959126799816, "grad_norm": 1.0156104564666748, "learning_rate": 0.0005580675131619697, "loss": 2.6794, "step": 9010 }, { "epoch": 2.7926923672395105, "grad_norm": 0.9581423997879028, "learning_rate": 0.0005586868999690307, "loss": 2.6576, "step": 9020 }, { "epoch": 2.7957888217990403, "grad_norm": 0.9721694588661194, "learning_rate": 0.0005593062867760917, "loss": 2.6569, "step": 9030 }, { "epoch": 2.798885276358569, "grad_norm": 0.9453576803207397, "learning_rate": 0.0005599256735831528, "loss": 2.6664, "step": 9040 }, { "epoch": 2.801981730918099, "grad_norm": 0.9473662972450256, "learning_rate": 0.0005605450603902137, "loss": 2.6604, "step": 9050 }, { "epoch": 2.805078185477628, "grad_norm": 0.9190026521682739, "learning_rate": 0.0005611644471972746, "loss": 2.6632, "step": 9060 }, { "epoch": 2.8081746400371577, "grad_norm": 0.9677988886833191, "learning_rate": 0.0005617838340043357, "loss": 2.6574, "step": 9070 }, { "epoch": 2.8112710945966866, "grad_norm": 0.9148370623588562, "learning_rate": 0.0005624032208113967, "loss": 2.6801, "step": 9080 }, { "epoch": 2.8143675491562163, "grad_norm": 0.908485472202301, "learning_rate": 0.0005630226076184577, "loss": 2.6614, "step": 9090 }, { "epoch": 2.8174640037157452, "grad_norm": 0.9479948878288269, "learning_rate": 0.0005636419944255187, "loss": 2.6426, "step": 9100 }, { "epoch": 2.820560458275275, "grad_norm": 1.0165117979049683, "learning_rate": 0.0005642613812325797, "loss": 2.6608, "step": 9110 }, { "epoch": 2.823656912834804, "grad_norm": 0.870343029499054, "learning_rate": 0.0005648807680396407, "loss": 2.6729, "step": 9120 }, { "epoch": 2.8267533673943337, "grad_norm": 0.9335671067237854, "learning_rate": 0.0005655001548467018, "loss": 2.6655, "step": 9130 }, { "epoch": 2.8298498219538626, "grad_norm": 0.9250266551971436, "learning_rate": 0.0005661195416537628, "loss": 2.6793, "step": 9140 }, { "epoch": 2.8329462765133924, "grad_norm": 0.8737602233886719, "learning_rate": 0.0005667389284608237, "loss": 2.6552, "step": 9150 }, { "epoch": 2.8360427310729213, "grad_norm": 0.9168223142623901, "learning_rate": 0.0005673583152678848, "loss": 2.6725, "step": 9160 }, { "epoch": 2.839139185632451, "grad_norm": 0.9240823984146118, "learning_rate": 0.0005679777020749458, "loss": 2.642, "step": 9170 }, { "epoch": 2.84223564019198, "grad_norm": 0.9061072468757629, "learning_rate": 0.0005685970888820068, "loss": 2.6746, "step": 9180 }, { "epoch": 2.8453320947515097, "grad_norm": 0.8670341968536377, "learning_rate": 0.0005692164756890679, "loss": 2.6693, "step": 9190 }, { "epoch": 2.8484285493110386, "grad_norm": 0.9250338673591614, "learning_rate": 0.0005698358624961289, "loss": 2.6755, "step": 9200 }, { "epoch": 2.8515250038705684, "grad_norm": 0.9369593262672424, "learning_rate": 0.0005704552493031898, "loss": 2.6794, "step": 9210 }, { "epoch": 2.8546214584300973, "grad_norm": 0.9392365217208862, "learning_rate": 0.0005710746361102509, "loss": 2.6644, "step": 9220 }, { "epoch": 2.857717912989627, "grad_norm": 0.9542964696884155, "learning_rate": 0.0005716940229173119, "loss": 2.6785, "step": 9230 }, { "epoch": 2.860814367549156, "grad_norm": 0.9194208979606628, "learning_rate": 0.0005723134097243729, "loss": 2.6743, "step": 9240 }, { "epoch": 2.8639108221086857, "grad_norm": 0.9285315275192261, "learning_rate": 0.000572932796531434, "loss": 2.6589, "step": 9250 }, { "epoch": 2.8670072766682146, "grad_norm": 0.9268024563789368, "learning_rate": 0.000573552183338495, "loss": 2.6552, "step": 9260 }, { "epoch": 2.8701037312277444, "grad_norm": 0.904656171798706, "learning_rate": 0.0005741715701455558, "loss": 2.6657, "step": 9270 }, { "epoch": 2.8732001857872733, "grad_norm": 0.9420167207717896, "learning_rate": 0.0005747909569526169, "loss": 2.6572, "step": 9280 }, { "epoch": 2.876296640346803, "grad_norm": 0.9118287563323975, "learning_rate": 0.0005754103437596779, "loss": 2.6629, "step": 9290 }, { "epoch": 2.879393094906332, "grad_norm": 0.940430223941803, "learning_rate": 0.0005760297305667389, "loss": 2.6518, "step": 9300 }, { "epoch": 2.8824895494658618, "grad_norm": 1.3163542747497559, "learning_rate": 0.0005766491173738, "loss": 2.652, "step": 9310 }, { "epoch": 2.8855860040253907, "grad_norm": 0.9466584324836731, "learning_rate": 0.0005772685041808609, "loss": 2.6809, "step": 9320 }, { "epoch": 2.8886824585849205, "grad_norm": 2.4098305702209473, "learning_rate": 0.0005778878909879219, "loss": 2.6616, "step": 9330 }, { "epoch": 2.8917789131444493, "grad_norm": 1.0643264055252075, "learning_rate": 0.000578507277794983, "loss": 2.6719, "step": 9340 }, { "epoch": 2.894875367703979, "grad_norm": 0.9846721887588501, "learning_rate": 0.000579126664602044, "loss": 2.6706, "step": 9350 }, { "epoch": 2.897971822263508, "grad_norm": 0.9832435250282288, "learning_rate": 0.000579746051409105, "loss": 2.6725, "step": 9360 }, { "epoch": 2.901068276823038, "grad_norm": 0.8981136083602905, "learning_rate": 0.000580365438216166, "loss": 2.6475, "step": 9370 }, { "epoch": 2.9041647313825667, "grad_norm": 0.8961195349693298, "learning_rate": 0.000580984825023227, "loss": 2.6705, "step": 9380 }, { "epoch": 2.9072611859420965, "grad_norm": 1.0543441772460938, "learning_rate": 0.000581604211830288, "loss": 2.6666, "step": 9390 }, { "epoch": 2.9103576405016254, "grad_norm": 0.9041043519973755, "learning_rate": 0.0005822235986373491, "loss": 2.6608, "step": 9400 }, { "epoch": 2.913454095061155, "grad_norm": 0.9475833773612976, "learning_rate": 0.0005828429854444101, "loss": 2.6453, "step": 9410 }, { "epoch": 2.916550549620684, "grad_norm": 0.9282538890838623, "learning_rate": 0.000583462372251471, "loss": 2.6531, "step": 9420 }, { "epoch": 2.919647004180214, "grad_norm": 0.936406672000885, "learning_rate": 0.0005840817590585321, "loss": 2.6594, "step": 9430 }, { "epoch": 2.9227434587397427, "grad_norm": 0.9766597747802734, "learning_rate": 0.0005847011458655931, "loss": 2.6698, "step": 9440 }, { "epoch": 2.9258399132992725, "grad_norm": 0.9606243968009949, "learning_rate": 0.0005853205326726541, "loss": 2.6819, "step": 9450 }, { "epoch": 2.9289363678588014, "grad_norm": 0.9478334784507751, "learning_rate": 0.0005859399194797152, "loss": 2.6589, "step": 9460 }, { "epoch": 2.932032822418331, "grad_norm": 0.9398000836372375, "learning_rate": 0.0005865593062867762, "loss": 2.6414, "step": 9470 }, { "epoch": 2.93512927697786, "grad_norm": 0.8788222074508667, "learning_rate": 0.000587178693093837, "loss": 2.6327, "step": 9480 }, { "epoch": 2.93822573153739, "grad_norm": 0.945261538028717, "learning_rate": 0.0005877980799008981, "loss": 2.6642, "step": 9490 }, { "epoch": 2.9413221860969188, "grad_norm": 0.9153859615325928, "learning_rate": 0.0005884174667079591, "loss": 2.6528, "step": 9500 }, { "epoch": 2.9444186406564485, "grad_norm": 1.6933245658874512, "learning_rate": 0.0005890368535150201, "loss": 2.6594, "step": 9510 }, { "epoch": 2.9475150952159774, "grad_norm": 1.0047813653945923, "learning_rate": 0.0005896562403220812, "loss": 2.6867, "step": 9520 }, { "epoch": 2.950611549775507, "grad_norm": 1.006410002708435, "learning_rate": 0.0005902756271291422, "loss": 2.6551, "step": 9530 }, { "epoch": 2.953708004335036, "grad_norm": 0.987974226474762, "learning_rate": 0.0005908950139362031, "loss": 2.6563, "step": 9540 }, { "epoch": 2.956804458894566, "grad_norm": 0.9611511826515198, "learning_rate": 0.0005915144007432642, "loss": 2.6677, "step": 9550 }, { "epoch": 2.959900913454095, "grad_norm": 0.9569249153137207, "learning_rate": 0.0005921337875503252, "loss": 2.6368, "step": 9560 }, { "epoch": 2.9629973680136246, "grad_norm": 0.909783124923706, "learning_rate": 0.0005927531743573862, "loss": 2.6353, "step": 9570 }, { "epoch": 2.966093822573154, "grad_norm": 0.9167472720146179, "learning_rate": 0.0005933725611644472, "loss": 2.6469, "step": 9580 }, { "epoch": 2.9691902771326832, "grad_norm": 0.9903345108032227, "learning_rate": 0.0005939919479715082, "loss": 2.6567, "step": 9590 }, { "epoch": 2.9722867316922126, "grad_norm": 0.9372828006744385, "learning_rate": 0.0005946113347785692, "loss": 2.6597, "step": 9600 }, { "epoch": 2.975383186251742, "grad_norm": 1.0080912113189697, "learning_rate": 0.0005952307215856302, "loss": 2.6425, "step": 9610 }, { "epoch": 2.9784796408112713, "grad_norm": 0.9167620539665222, "learning_rate": 0.0005958501083926913, "loss": 2.666, "step": 9620 }, { "epoch": 2.9815760953708006, "grad_norm": 0.9428613781929016, "learning_rate": 0.0005964694951997523, "loss": 2.6486, "step": 9630 }, { "epoch": 2.98467254993033, "grad_norm": 1.0144000053405762, "learning_rate": 0.0005970888820068132, "loss": 2.6382, "step": 9640 }, { "epoch": 2.9877690044898593, "grad_norm": 0.8944305777549744, "learning_rate": 0.0005977082688138743, "loss": 2.6406, "step": 9650 }, { "epoch": 2.9908654590493886, "grad_norm": 0.9113066792488098, "learning_rate": 0.0005983276556209353, "loss": 2.6514, "step": 9660 }, { "epoch": 2.993961913608918, "grad_norm": 0.9131670594215393, "learning_rate": 0.0005989470424279963, "loss": 2.6314, "step": 9670 }, { "epoch": 2.9970583681684473, "grad_norm": 0.9719523787498474, "learning_rate": 0.0005995664292350574, "loss": 2.6578, "step": 9680 }, { "epoch": 3.0, "grad_norm": 0.8123937845230103, "learning_rate": 0.0006001858160421183, "loss": 2.5215, "step": 9690 }, { "epoch": 3.0030964545595293, "grad_norm": 1.055759310722351, "learning_rate": 0.0006008052028491792, "loss": 2.6481, "step": 9700 }, { "epoch": 3.0061929091190587, "grad_norm": 0.9894253611564636, "learning_rate": 0.0006014245896562403, "loss": 2.6389, "step": 9710 }, { "epoch": 3.009289363678588, "grad_norm": 0.9278469085693359, "learning_rate": 0.0006020439764633013, "loss": 2.6382, "step": 9720 }, { "epoch": 3.0123858182381174, "grad_norm": 0.9690927267074585, "learning_rate": 0.0006026633632703623, "loss": 2.6225, "step": 9730 }, { "epoch": 3.0154822727976467, "grad_norm": 0.8948525190353394, "learning_rate": 0.0006032827500774234, "loss": 2.6266, "step": 9740 }, { "epoch": 3.018578727357176, "grad_norm": 0.9562525749206543, "learning_rate": 0.0006039021368844843, "loss": 2.6251, "step": 9750 }, { "epoch": 3.0216751819167054, "grad_norm": 0.9463378190994263, "learning_rate": 0.0006045215236915453, "loss": 2.6405, "step": 9760 }, { "epoch": 3.0247716364762347, "grad_norm": 0.9799174070358276, "learning_rate": 0.0006051409104986064, "loss": 2.6381, "step": 9770 }, { "epoch": 3.027868091035764, "grad_norm": 0.9874619841575623, "learning_rate": 0.0006057602973056674, "loss": 2.6143, "step": 9780 }, { "epoch": 3.0309645455952934, "grad_norm": 1.083337426185608, "learning_rate": 0.0006063796841127284, "loss": 2.6153, "step": 9790 }, { "epoch": 3.0340610001548227, "grad_norm": 0.9509608745574951, "learning_rate": 0.0006069990709197895, "loss": 2.6379, "step": 9800 }, { "epoch": 3.037157454714352, "grad_norm": 0.9036940336227417, "learning_rate": 0.0006076184577268504, "loss": 2.6415, "step": 9810 }, { "epoch": 3.0402539092738814, "grad_norm": 0.9959449768066406, "learning_rate": 0.0006082378445339114, "loss": 2.6394, "step": 9820 }, { "epoch": 3.0433503638334107, "grad_norm": 0.9509766101837158, "learning_rate": 0.0006088572313409725, "loss": 2.6287, "step": 9830 }, { "epoch": 3.04644681839294, "grad_norm": 0.9667684435844421, "learning_rate": 0.0006094766181480335, "loss": 2.6518, "step": 9840 }, { "epoch": 3.0495432729524694, "grad_norm": 0.8897145986557007, "learning_rate": 0.0006100960049550945, "loss": 2.6333, "step": 9850 }, { "epoch": 3.0526397275119987, "grad_norm": 1.0284274816513062, "learning_rate": 0.0006107153917621555, "loss": 2.6348, "step": 9860 }, { "epoch": 3.055736182071528, "grad_norm": 0.9442754983901978, "learning_rate": 0.0006113347785692165, "loss": 2.6345, "step": 9870 }, { "epoch": 3.0588326366310574, "grad_norm": 0.9227479696273804, "learning_rate": 0.0006119541653762775, "loss": 2.6346, "step": 9880 }, { "epoch": 3.0619290911905868, "grad_norm": 0.9678612351417542, "learning_rate": 0.0006125735521833386, "loss": 2.6267, "step": 9890 }, { "epoch": 3.065025545750116, "grad_norm": 0.9622678160667419, "learning_rate": 0.0006131929389903995, "loss": 2.6205, "step": 9900 }, { "epoch": 3.0681220003096454, "grad_norm": 0.9785904288291931, "learning_rate": 0.0006138123257974604, "loss": 2.6316, "step": 9910 }, { "epoch": 3.0712184548691748, "grad_norm": 0.9019646644592285, "learning_rate": 0.0006144317126045215, "loss": 2.6322, "step": 9920 }, { "epoch": 3.074314909428704, "grad_norm": 0.9511599540710449, "learning_rate": 0.0006150510994115825, "loss": 2.6404, "step": 9930 }, { "epoch": 3.0774113639882335, "grad_norm": 1.1197845935821533, "learning_rate": 0.0006156704862186435, "loss": 2.6334, "step": 9940 }, { "epoch": 3.080507818547763, "grad_norm": 1.0321228504180908, "learning_rate": 0.0006162898730257046, "loss": 2.6578, "step": 9950 }, { "epoch": 3.083604273107292, "grad_norm": 0.933640718460083, "learning_rate": 0.0006169092598327656, "loss": 2.6498, "step": 9960 }, { "epoch": 3.0867007276668215, "grad_norm": 0.9308697581291199, "learning_rate": 0.0006175286466398265, "loss": 2.6403, "step": 9970 }, { "epoch": 3.089797182226351, "grad_norm": 1.0035881996154785, "learning_rate": 0.0006181480334468876, "loss": 2.6369, "step": 9980 }, { "epoch": 3.09289363678588, "grad_norm": 0.9733856916427612, "learning_rate": 0.0006187674202539486, "loss": 2.6434, "step": 9990 }, { "epoch": 3.0959900913454095, "grad_norm": 0.9512896537780762, "learning_rate": 0.0006193868070610096, "loss": 2.6433, "step": 10000 }, { "epoch": 3.099086545904939, "grad_norm": 1.1366065740585327, "learning_rate": 0.0006200061938680707, "loss": 2.6278, "step": 10010 }, { "epoch": 3.102183000464468, "grad_norm": 1.0089902877807617, "learning_rate": 0.0006206255806751317, "loss": 2.6198, "step": 10020 }, { "epoch": 3.1052794550239975, "grad_norm": 0.9710060358047485, "learning_rate": 0.0006212449674821926, "loss": 2.6299, "step": 10030 }, { "epoch": 3.108375909583527, "grad_norm": 1.0112597942352295, "learning_rate": 0.0006218643542892537, "loss": 2.629, "step": 10040 }, { "epoch": 3.111472364143056, "grad_norm": 0.8979578614234924, "learning_rate": 0.0006224837410963147, "loss": 2.6306, "step": 10050 }, { "epoch": 3.1145688187025855, "grad_norm": 0.985578715801239, "learning_rate": 0.0006231031279033757, "loss": 2.6214, "step": 10060 }, { "epoch": 3.117665273262115, "grad_norm": 1.0180467367172241, "learning_rate": 0.0006237225147104368, "loss": 2.6698, "step": 10070 }, { "epoch": 3.120761727821644, "grad_norm": 0.9561509490013123, "learning_rate": 0.0006243419015174977, "loss": 2.6295, "step": 10080 }, { "epoch": 3.1238581823811735, "grad_norm": 0.9035720229148865, "learning_rate": 0.0006249612883245587, "loss": 2.6356, "step": 10090 }, { "epoch": 3.126954636940703, "grad_norm": 0.9758944511413574, "learning_rate": 0.0006255806751316198, "loss": 2.6373, "step": 10100 }, { "epoch": 3.130051091500232, "grad_norm": 0.9201127290725708, "learning_rate": 0.0006262000619386807, "loss": 2.6354, "step": 10110 }, { "epoch": 3.1331475460597615, "grad_norm": 0.9586511850357056, "learning_rate": 0.0006268194487457417, "loss": 2.6286, "step": 10120 }, { "epoch": 3.136244000619291, "grad_norm": 1.3197758197784424, "learning_rate": 0.0006274388355528027, "loss": 2.6503, "step": 10130 }, { "epoch": 3.13934045517882, "grad_norm": 1.4489221572875977, "learning_rate": 0.0006280582223598637, "loss": 2.667, "step": 10140 }, { "epoch": 3.1424369097383495, "grad_norm": 1.1435356140136719, "learning_rate": 0.0006286776091669247, "loss": 2.6803, "step": 10150 }, { "epoch": 3.145533364297879, "grad_norm": 5.218364238739014, "learning_rate": 0.0006292969959739858, "loss": 2.7482, "step": 10160 }, { "epoch": 3.1486298188574082, "grad_norm": 1.0673755407333374, "learning_rate": 0.0006299163827810468, "loss": 2.6814, "step": 10170 }, { "epoch": 3.1517262734169376, "grad_norm": 0.9964536428451538, "learning_rate": 0.0006305357695881078, "loss": 2.6468, "step": 10180 }, { "epoch": 3.154822727976467, "grad_norm": 1.0818805694580078, "learning_rate": 0.0006311551563951688, "loss": 2.6687, "step": 10190 }, { "epoch": 3.1579191825359962, "grad_norm": 1.0229182243347168, "learning_rate": 0.0006317745432022298, "loss": 2.632, "step": 10200 }, { "epoch": 3.1610156370955256, "grad_norm": 0.9602491855621338, "learning_rate": 0.0006323939300092908, "loss": 2.6209, "step": 10210 }, { "epoch": 3.164112091655055, "grad_norm": 1.0441064834594727, "learning_rate": 0.0006330133168163518, "loss": 2.6421, "step": 10220 }, { "epoch": 3.1672085462145843, "grad_norm": 19.606216430664062, "learning_rate": 0.0006336327036234129, "loss": 2.6372, "step": 10230 }, { "epoch": 3.1703050007741136, "grad_norm": 1.115622878074646, "learning_rate": 0.0006342520904304738, "loss": 2.6775, "step": 10240 }, { "epoch": 3.173401455333643, "grad_norm": 1.1430797576904297, "learning_rate": 0.0006348714772375348, "loss": 2.6415, "step": 10250 }, { "epoch": 3.1764979098931723, "grad_norm": 7.035722255706787, "learning_rate": 0.0006354908640445959, "loss": 2.6995, "step": 10260 }, { "epoch": 3.1795943644527016, "grad_norm": 1.2375656366348267, "learning_rate": 0.0006361102508516569, "loss": 2.7278, "step": 10270 }, { "epoch": 3.182690819012231, "grad_norm": 1.0868054628372192, "learning_rate": 0.0006367296376587179, "loss": 2.6475, "step": 10280 }, { "epoch": 3.1857872735717603, "grad_norm": 1.0047295093536377, "learning_rate": 0.000637349024465779, "loss": 2.6195, "step": 10290 }, { "epoch": 3.1888837281312896, "grad_norm": 0.9876299500465393, "learning_rate": 0.0006379684112728399, "loss": 2.6392, "step": 10300 }, { "epoch": 3.191980182690819, "grad_norm": 1.021812081336975, "learning_rate": 0.0006385877980799008, "loss": 2.6468, "step": 10310 }, { "epoch": 3.1950766372503483, "grad_norm": 0.954329788684845, "learning_rate": 0.0006392071848869619, "loss": 2.6368, "step": 10320 }, { "epoch": 3.1981730918098776, "grad_norm": 0.9458587169647217, "learning_rate": 0.0006398265716940229, "loss": 2.6368, "step": 10330 }, { "epoch": 3.201269546369407, "grad_norm": 1.0526219606399536, "learning_rate": 0.0006404459585010839, "loss": 2.6389, "step": 10340 }, { "epoch": 3.2043660009289363, "grad_norm": 1.1330630779266357, "learning_rate": 0.0006410653453081449, "loss": 2.6666, "step": 10350 }, { "epoch": 3.2074624554884656, "grad_norm": 1.0522410869598389, "learning_rate": 0.0006416847321152059, "loss": 2.6405, "step": 10360 }, { "epoch": 3.210558910047995, "grad_norm": 0.973717451095581, "learning_rate": 0.0006423041189222669, "loss": 2.6198, "step": 10370 }, { "epoch": 3.2136553646075243, "grad_norm": 0.9188945889472961, "learning_rate": 0.000642923505729328, "loss": 2.6478, "step": 10380 }, { "epoch": 3.2167518191670537, "grad_norm": 0.9480977654457092, "learning_rate": 0.000643542892536389, "loss": 2.635, "step": 10390 }, { "epoch": 3.219848273726583, "grad_norm": 0.9224624633789062, "learning_rate": 0.0006441622793434499, "loss": 2.6362, "step": 10400 }, { "epoch": 3.2229447282861123, "grad_norm": 0.9448727965354919, "learning_rate": 0.000644781666150511, "loss": 2.6215, "step": 10410 }, { "epoch": 3.2260411828456417, "grad_norm": 0.9381209015846252, "learning_rate": 0.000645401052957572, "loss": 2.6305, "step": 10420 }, { "epoch": 3.229137637405171, "grad_norm": 1.0034310817718506, "learning_rate": 0.000646020439764633, "loss": 2.6344, "step": 10430 }, { "epoch": 3.2322340919647004, "grad_norm": 0.9512182474136353, "learning_rate": 0.0006466398265716941, "loss": 2.6397, "step": 10440 }, { "epoch": 3.2353305465242297, "grad_norm": 0.9563096761703491, "learning_rate": 0.0006472592133787551, "loss": 2.6354, "step": 10450 }, { "epoch": 3.238427001083759, "grad_norm": 1.0199220180511475, "learning_rate": 0.000647878600185816, "loss": 2.6379, "step": 10460 }, { "epoch": 3.2415234556432884, "grad_norm": 0.9473974108695984, "learning_rate": 0.0006484979869928771, "loss": 2.6262, "step": 10470 }, { "epoch": 3.2446199102028177, "grad_norm": 0.9869408011436462, "learning_rate": 0.0006491173737999381, "loss": 2.6495, "step": 10480 }, { "epoch": 3.247716364762347, "grad_norm": 0.9925758242607117, "learning_rate": 0.0006497367606069991, "loss": 2.631, "step": 10490 }, { "epoch": 3.2508128193218764, "grad_norm": 1.048644781112671, "learning_rate": 0.0006503561474140602, "loss": 2.6145, "step": 10500 }, { "epoch": 3.2539092738814057, "grad_norm": 0.9119939804077148, "learning_rate": 0.0006509755342211212, "loss": 2.6249, "step": 10510 }, { "epoch": 3.257005728440935, "grad_norm": 1.0462340116500854, "learning_rate": 0.000651594921028182, "loss": 2.6343, "step": 10520 }, { "epoch": 3.2601021830004644, "grad_norm": 0.9970148801803589, "learning_rate": 0.0006522143078352431, "loss": 2.6353, "step": 10530 }, { "epoch": 3.2631986375599937, "grad_norm": 0.9585279822349548, "learning_rate": 0.0006528336946423041, "loss": 2.6223, "step": 10540 }, { "epoch": 3.266295092119523, "grad_norm": 1.0489411354064941, "learning_rate": 0.0006534530814493651, "loss": 2.6246, "step": 10550 }, { "epoch": 3.2693915466790524, "grad_norm": 0.9942703247070312, "learning_rate": 0.0006540724682564262, "loss": 2.6163, "step": 10560 }, { "epoch": 3.2724880012385817, "grad_norm": 1.0939925909042358, "learning_rate": 0.0006546918550634871, "loss": 2.6485, "step": 10570 }, { "epoch": 3.275584455798111, "grad_norm": 0.9639611840248108, "learning_rate": 0.0006553112418705481, "loss": 2.6369, "step": 10580 }, { "epoch": 3.2786809103576404, "grad_norm": 1.003915786743164, "learning_rate": 0.0006559306286776092, "loss": 2.6324, "step": 10590 }, { "epoch": 3.2817773649171698, "grad_norm": 1.07323157787323, "learning_rate": 0.0006565500154846702, "loss": 2.6239, "step": 10600 }, { "epoch": 3.284873819476699, "grad_norm": 0.9782385230064392, "learning_rate": 0.0006571694022917312, "loss": 2.6276, "step": 10610 }, { "epoch": 3.2879702740362284, "grad_norm": 0.9947441816329956, "learning_rate": 0.0006577887890987922, "loss": 2.6258, "step": 10620 }, { "epoch": 3.2910667285957578, "grad_norm": 0.9401261806488037, "learning_rate": 0.0006584081759058532, "loss": 2.6054, "step": 10630 }, { "epoch": 3.294163183155287, "grad_norm": 0.9426921606063843, "learning_rate": 0.0006590275627129142, "loss": 2.634, "step": 10640 }, { "epoch": 3.2972596377148164, "grad_norm": 0.9457327127456665, "learning_rate": 0.0006596469495199753, "loss": 2.6319, "step": 10650 }, { "epoch": 3.300356092274346, "grad_norm": 1.1993708610534668, "learning_rate": 0.0006602663363270363, "loss": 2.6365, "step": 10660 }, { "epoch": 3.303452546833875, "grad_norm": 0.9889876842498779, "learning_rate": 0.0006608857231340973, "loss": 2.6503, "step": 10670 }, { "epoch": 3.3065490013934045, "grad_norm": 0.9788354635238647, "learning_rate": 0.0006615051099411583, "loss": 2.641, "step": 10680 }, { "epoch": 3.309645455952934, "grad_norm": 0.9262669682502747, "learning_rate": 0.0006621244967482193, "loss": 2.6473, "step": 10690 }, { "epoch": 3.312741910512463, "grad_norm": 0.9675087332725525, "learning_rate": 0.0006627438835552803, "loss": 2.6425, "step": 10700 }, { "epoch": 3.3158383650719925, "grad_norm": 0.9308109879493713, "learning_rate": 0.0006633632703623414, "loss": 2.6425, "step": 10710 }, { "epoch": 3.318934819631522, "grad_norm": 0.9837930202484131, "learning_rate": 0.0006639826571694024, "loss": 2.6309, "step": 10720 }, { "epoch": 3.322031274191051, "grad_norm": 0.9883390069007874, "learning_rate": 0.0006646020439764632, "loss": 2.5976, "step": 10730 }, { "epoch": 3.3251277287505805, "grad_norm": 0.9393827319145203, "learning_rate": 0.0006652214307835243, "loss": 2.6229, "step": 10740 }, { "epoch": 3.32822418331011, "grad_norm": 0.9329293370246887, "learning_rate": 0.0006658408175905853, "loss": 2.6102, "step": 10750 }, { "epoch": 3.331320637869639, "grad_norm": 0.8954689502716064, "learning_rate": 0.0006664602043976463, "loss": 2.6578, "step": 10760 }, { "epoch": 3.3344170924291685, "grad_norm": 0.92784583568573, "learning_rate": 0.0006670795912047074, "loss": 2.6127, "step": 10770 }, { "epoch": 3.337513546988698, "grad_norm": 0.9678082466125488, "learning_rate": 0.0006676989780117683, "loss": 2.6097, "step": 10780 }, { "epoch": 3.340610001548227, "grad_norm": 0.9594787955284119, "learning_rate": 0.0006683183648188293, "loss": 2.6068, "step": 10790 }, { "epoch": 3.3437064561077565, "grad_norm": 0.9914245009422302, "learning_rate": 0.0006689377516258904, "loss": 2.6173, "step": 10800 }, { "epoch": 3.346802910667286, "grad_norm": 0.9944581985473633, "learning_rate": 0.0006695571384329514, "loss": 2.6229, "step": 10810 }, { "epoch": 3.349899365226815, "grad_norm": 1.0383622646331787, "learning_rate": 0.0006701765252400124, "loss": 2.6341, "step": 10820 }, { "epoch": 3.3529958197863445, "grad_norm": 1.034728765487671, "learning_rate": 0.0006707959120470735, "loss": 2.637, "step": 10830 }, { "epoch": 3.356092274345874, "grad_norm": 1.0271577835083008, "learning_rate": 0.0006714152988541344, "loss": 2.6387, "step": 10840 }, { "epoch": 3.359188728905403, "grad_norm": 0.9783453345298767, "learning_rate": 0.0006720346856611954, "loss": 2.5978, "step": 10850 }, { "epoch": 3.3622851834649325, "grad_norm": 0.9792416095733643, "learning_rate": 0.0006726540724682565, "loss": 2.6349, "step": 10860 }, { "epoch": 3.365381638024462, "grad_norm": 0.9500912427902222, "learning_rate": 0.0006732734592753175, "loss": 2.6087, "step": 10870 }, { "epoch": 3.3684780925839912, "grad_norm": 0.9641538262367249, "learning_rate": 0.0006738928460823785, "loss": 2.613, "step": 10880 }, { "epoch": 3.3715745471435206, "grad_norm": 0.957671582698822, "learning_rate": 0.0006745122328894394, "loss": 2.6215, "step": 10890 }, { "epoch": 3.37467100170305, "grad_norm": 1.0475072860717773, "learning_rate": 0.0006751316196965005, "loss": 2.6092, "step": 10900 }, { "epoch": 3.3777674562625792, "grad_norm": 0.96811842918396, "learning_rate": 0.0006757510065035615, "loss": 2.6134, "step": 10910 }, { "epoch": 3.3808639108221086, "grad_norm": 1.0156564712524414, "learning_rate": 0.0006763703933106225, "loss": 2.6112, "step": 10920 }, { "epoch": 3.383960365381638, "grad_norm": 1.0434483289718628, "learning_rate": 0.0006769897801176836, "loss": 2.6183, "step": 10930 }, { "epoch": 3.3870568199411673, "grad_norm": 0.9763379096984863, "learning_rate": 0.0006776091669247445, "loss": 2.6063, "step": 10940 }, { "epoch": 3.3901532745006966, "grad_norm": 1.0185160636901855, "learning_rate": 0.0006782285537318054, "loss": 2.6491, "step": 10950 }, { "epoch": 3.393249729060226, "grad_norm": 0.9660173058509827, "learning_rate": 0.0006788479405388665, "loss": 2.6164, "step": 10960 }, { "epoch": 3.3963461836197553, "grad_norm": 0.9487093091011047, "learning_rate": 0.0006794673273459275, "loss": 2.614, "step": 10970 }, { "epoch": 3.3994426381792846, "grad_norm": 0.9912219643592834, "learning_rate": 0.0006800867141529885, "loss": 2.6369, "step": 10980 }, { "epoch": 3.402539092738814, "grad_norm": 0.9763176441192627, "learning_rate": 0.0006807061009600496, "loss": 2.6003, "step": 10990 }, { "epoch": 3.4056355472983433, "grad_norm": 1.007444977760315, "learning_rate": 0.0006813254877671105, "loss": 2.6294, "step": 11000 } ], "logging_steps": 10, "max_steps": 161450, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3128782178793882e+18, "train_batch_size": 24, "trial_name": null, "trial_params": null }