{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9993079584775086, "eval_steps": 500, "global_step": 4334, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00461361014994233, "grad_norm": 5.300843715667725, "learning_rate": 2.3041474654377884e-07, "loss": 0.806, "step": 10 }, { "epoch": 0.00922722029988466, "grad_norm": 5.792849540710449, "learning_rate": 4.608294930875577e-07, "loss": 0.8251, "step": 20 }, { "epoch": 0.01384083044982699, "grad_norm": 4.814388751983643, "learning_rate": 6.912442396313365e-07, "loss": 0.7555, "step": 30 }, { "epoch": 0.01845444059976932, "grad_norm": 2.745297431945801, "learning_rate": 9.216589861751154e-07, "loss": 0.7218, "step": 40 }, { "epoch": 0.02306805074971165, "grad_norm": 1.7126681804656982, "learning_rate": 1.1520737327188942e-06, "loss": 0.6751, "step": 50 }, { "epoch": 0.02768166089965398, "grad_norm": 1.703934669494629, "learning_rate": 1.382488479262673e-06, "loss": 0.6197, "step": 60 }, { "epoch": 0.03229527104959631, "grad_norm": 1.7122575044631958, "learning_rate": 1.6129032258064516e-06, "loss": 0.5083, "step": 70 }, { "epoch": 0.03690888119953864, "grad_norm": 2.8078525066375732, "learning_rate": 1.8433179723502307e-06, "loss": 0.5326, "step": 80 }, { "epoch": 0.04152249134948097, "grad_norm": 1.8321082592010498, "learning_rate": 2.0737327188940094e-06, "loss": 0.5463, "step": 90 }, { "epoch": 0.0461361014994233, "grad_norm": 1.547890305519104, "learning_rate": 2.3041474654377884e-06, "loss": 0.5445, "step": 100 }, { "epoch": 0.05074971164936563, "grad_norm": 1.5186538696289062, "learning_rate": 2.5345622119815673e-06, "loss": 0.4972, "step": 110 }, { "epoch": 0.05536332179930796, "grad_norm": 1.6117104291915894, "learning_rate": 2.764976958525346e-06, "loss": 0.519, "step": 120 }, { "epoch": 0.05997693194925029, "grad_norm": 1.8595815896987915, "learning_rate": 2.9953917050691243e-06, "loss": 0.453, "step": 130 }, { "epoch": 0.06459054209919261, "grad_norm": 1.2419018745422363, "learning_rate": 3.225806451612903e-06, "loss": 0.4799, "step": 140 }, { "epoch": 0.06920415224913495, "grad_norm": 1.5790776014328003, "learning_rate": 3.4562211981566825e-06, "loss": 0.4896, "step": 150 }, { "epoch": 0.07381776239907728, "grad_norm": 1.4088906049728394, "learning_rate": 3.6866359447004615e-06, "loss": 0.5167, "step": 160 }, { "epoch": 0.0784313725490196, "grad_norm": 1.2521541118621826, "learning_rate": 3.91705069124424e-06, "loss": 0.5127, "step": 170 }, { "epoch": 0.08304498269896193, "grad_norm": 2.0803122520446777, "learning_rate": 4.147465437788019e-06, "loss": 0.4465, "step": 180 }, { "epoch": 0.08765859284890427, "grad_norm": 1.523786187171936, "learning_rate": 4.377880184331797e-06, "loss": 0.4761, "step": 190 }, { "epoch": 0.0922722029988466, "grad_norm": 1.418717622756958, "learning_rate": 4.608294930875577e-06, "loss": 0.5055, "step": 200 }, { "epoch": 0.09688581314878893, "grad_norm": 1.483023762702942, "learning_rate": 4.838709677419355e-06, "loss": 0.4557, "step": 210 }, { "epoch": 0.10149942329873125, "grad_norm": 1.5981467962265015, "learning_rate": 5.0691244239631346e-06, "loss": 0.4686, "step": 220 }, { "epoch": 0.1061130334486736, "grad_norm": 1.130113959312439, "learning_rate": 5.299539170506913e-06, "loss": 0.4843, "step": 230 }, { "epoch": 0.11072664359861592, "grad_norm": 1.8060191869735718, "learning_rate": 5.529953917050692e-06, "loss": 0.5015, "step": 240 }, { "epoch": 0.11534025374855825, "grad_norm": 1.069687008857727, "learning_rate": 5.76036866359447e-06, "loss": 0.4622, "step": 250 }, { "epoch": 0.11995386389850057, "grad_norm": 1.1288576126098633, "learning_rate": 5.9907834101382485e-06, "loss": 0.4642, "step": 260 }, { "epoch": 0.1245674740484429, "grad_norm": 1.7959188222885132, "learning_rate": 6.221198156682028e-06, "loss": 0.4357, "step": 270 }, { "epoch": 0.12918108419838523, "grad_norm": 1.1214991807937622, "learning_rate": 6.451612903225806e-06, "loss": 0.4547, "step": 280 }, { "epoch": 0.13379469434832755, "grad_norm": 1.6405386924743652, "learning_rate": 6.682027649769586e-06, "loss": 0.4464, "step": 290 }, { "epoch": 0.1384083044982699, "grad_norm": 1.422194004058838, "learning_rate": 6.912442396313365e-06, "loss": 0.4957, "step": 300 }, { "epoch": 0.14302191464821223, "grad_norm": 1.5487645864486694, "learning_rate": 7.1428571428571436e-06, "loss": 0.4473, "step": 310 }, { "epoch": 0.14763552479815456, "grad_norm": 1.3289945125579834, "learning_rate": 7.373271889400923e-06, "loss": 0.4831, "step": 320 }, { "epoch": 0.1522491349480969, "grad_norm": 1.6698657274246216, "learning_rate": 7.603686635944701e-06, "loss": 0.4996, "step": 330 }, { "epoch": 0.1568627450980392, "grad_norm": 1.4635429382324219, "learning_rate": 7.83410138248848e-06, "loss": 0.4861, "step": 340 }, { "epoch": 0.16147635524798154, "grad_norm": 1.1806988716125488, "learning_rate": 8.064516129032258e-06, "loss": 0.4774, "step": 350 }, { "epoch": 0.16608996539792387, "grad_norm": 1.4619848728179932, "learning_rate": 8.294930875576038e-06, "loss": 0.505, "step": 360 }, { "epoch": 0.1707035755478662, "grad_norm": 1.3232295513153076, "learning_rate": 8.525345622119815e-06, "loss": 0.5116, "step": 370 }, { "epoch": 0.17531718569780855, "grad_norm": 1.5264451503753662, "learning_rate": 8.755760368663595e-06, "loss": 0.4307, "step": 380 }, { "epoch": 0.17993079584775087, "grad_norm": 1.2139824628829956, "learning_rate": 8.986175115207374e-06, "loss": 0.4525, "step": 390 }, { "epoch": 0.1845444059976932, "grad_norm": 1.5944982767105103, "learning_rate": 9.216589861751153e-06, "loss": 0.4557, "step": 400 }, { "epoch": 0.18915801614763553, "grad_norm": 1.3352501392364502, "learning_rate": 9.447004608294931e-06, "loss": 0.4927, "step": 410 }, { "epoch": 0.19377162629757785, "grad_norm": 1.0890029668807983, "learning_rate": 9.67741935483871e-06, "loss": 0.4926, "step": 420 }, { "epoch": 0.19838523644752018, "grad_norm": 1.675134539604187, "learning_rate": 9.90783410138249e-06, "loss": 0.4373, "step": 430 }, { "epoch": 0.2029988465974625, "grad_norm": 1.240625023841858, "learning_rate": 9.999941600087643e-06, "loss": 0.4877, "step": 440 }, { "epoch": 0.20761245674740483, "grad_norm": 1.6467055082321167, "learning_rate": 9.999584716674727e-06, "loss": 0.4628, "step": 450 }, { "epoch": 0.2122260668973472, "grad_norm": 1.0588369369506836, "learning_rate": 9.998903417374228e-06, "loss": 0.4757, "step": 460 }, { "epoch": 0.21683967704728951, "grad_norm": 1.150065302848816, "learning_rate": 9.997897746394684e-06, "loss": 0.486, "step": 470 }, { "epoch": 0.22145328719723184, "grad_norm": 1.382214903831482, "learning_rate": 9.996567768992642e-06, "loss": 0.4862, "step": 480 }, { "epoch": 0.22606689734717417, "grad_norm": 1.066063642501831, "learning_rate": 9.994913571468432e-06, "loss": 0.486, "step": 490 }, { "epoch": 0.2306805074971165, "grad_norm": 1.6976910829544067, "learning_rate": 9.992935261160559e-06, "loss": 0.4468, "step": 500 }, { "epoch": 0.2306805074971165, "eval_loss": 0.39874735474586487, "eval_runtime": 400.8977, "eval_samples_per_second": 8.409, "eval_steps_per_second": 1.202, "step": 500 }, { "epoch": 0.23529411764705882, "grad_norm": 1.5809332132339478, "learning_rate": 9.990632966438743e-06, "loss": 0.4171, "step": 510 }, { "epoch": 0.23990772779700115, "grad_norm": 1.19710111618042, "learning_rate": 9.988006836695593e-06, "loss": 0.4764, "step": 520 }, { "epoch": 0.24452133794694347, "grad_norm": 1.0074009895324707, "learning_rate": 9.985057042336898e-06, "loss": 0.4678, "step": 530 }, { "epoch": 0.2491349480968858, "grad_norm": 1.1784993410110474, "learning_rate": 9.981783774770595e-06, "loss": 0.4524, "step": 540 }, { "epoch": 0.2537485582468281, "grad_norm": 1.5833553075790405, "learning_rate": 9.97818724639432e-06, "loss": 0.4399, "step": 550 }, { "epoch": 0.25836216839677045, "grad_norm": 1.1442863941192627, "learning_rate": 9.974267690581646e-06, "loss": 0.4897, "step": 560 }, { "epoch": 0.2629757785467128, "grad_norm": 1.2376320362091064, "learning_rate": 9.970025361666934e-06, "loss": 0.4497, "step": 570 }, { "epoch": 0.2675893886966551, "grad_norm": 1.0321787595748901, "learning_rate": 9.965460534928827e-06, "loss": 0.4832, "step": 580 }, { "epoch": 0.2722029988465975, "grad_norm": 1.4388171434402466, "learning_rate": 9.960573506572391e-06, "loss": 0.4788, "step": 590 }, { "epoch": 0.2768166089965398, "grad_norm": 1.7396047115325928, "learning_rate": 9.95536459370989e-06, "loss": 0.4484, "step": 600 }, { "epoch": 0.28143021914648214, "grad_norm": 1.2319186925888062, "learning_rate": 9.949834134340219e-06, "loss": 0.4799, "step": 610 }, { "epoch": 0.28604382929642447, "grad_norm": 1.3797781467437744, "learning_rate": 9.94398248732696e-06, "loss": 0.4506, "step": 620 }, { "epoch": 0.2906574394463668, "grad_norm": 1.5929468870162964, "learning_rate": 9.9378100323751e-06, "loss": 0.5108, "step": 630 }, { "epoch": 0.2952710495963091, "grad_norm": 1.4704443216323853, "learning_rate": 9.931317170006398e-06, "loss": 0.4607, "step": 640 }, { "epoch": 0.29988465974625145, "grad_norm": 1.980751872062683, "learning_rate": 9.924504321533387e-06, "loss": 0.4564, "step": 650 }, { "epoch": 0.3044982698961938, "grad_norm": 1.143687129020691, "learning_rate": 9.91737192903204e-06, "loss": 0.447, "step": 660 }, { "epoch": 0.3091118800461361, "grad_norm": 1.0242226123809814, "learning_rate": 9.909920455313087e-06, "loss": 0.4712, "step": 670 }, { "epoch": 0.3137254901960784, "grad_norm": 1.3482781648635864, "learning_rate": 9.902150383891979e-06, "loss": 0.4831, "step": 680 }, { "epoch": 0.31833910034602075, "grad_norm": 1.1587884426116943, "learning_rate": 9.894062218957517e-06, "loss": 0.499, "step": 690 }, { "epoch": 0.3229527104959631, "grad_norm": 1.1495003700256348, "learning_rate": 9.885656485339129e-06, "loss": 0.4482, "step": 700 }, { "epoch": 0.3275663206459054, "grad_norm": 1.0492511987686157, "learning_rate": 9.876933728472826e-06, "loss": 0.4763, "step": 710 }, { "epoch": 0.33217993079584773, "grad_norm": 1.2462799549102783, "learning_rate": 9.867894514365802e-06, "loss": 0.4707, "step": 720 }, { "epoch": 0.33679354094579006, "grad_norm": 0.9900134205818176, "learning_rate": 9.858539429559705e-06, "loss": 0.472, "step": 730 }, { "epoch": 0.3414071510957324, "grad_norm": 1.1227929592132568, "learning_rate": 9.848869081092581e-06, "loss": 0.497, "step": 740 }, { "epoch": 0.3460207612456747, "grad_norm": 1.3729174137115479, "learning_rate": 9.838884096459486e-06, "loss": 0.4271, "step": 750 }, { "epoch": 0.3506343713956171, "grad_norm": 1.1883389949798584, "learning_rate": 9.828585123571763e-06, "loss": 0.4977, "step": 760 }, { "epoch": 0.3552479815455594, "grad_norm": 1.181907296180725, "learning_rate": 9.817972830715003e-06, "loss": 0.4605, "step": 770 }, { "epoch": 0.35986159169550175, "grad_norm": 0.9973541498184204, "learning_rate": 9.807047906505683e-06, "loss": 0.4414, "step": 780 }, { "epoch": 0.3644752018454441, "grad_norm": 1.148622989654541, "learning_rate": 9.795811059846476e-06, "loss": 0.4372, "step": 790 }, { "epoch": 0.3690888119953864, "grad_norm": 0.8280909061431885, "learning_rate": 9.78426301988026e-06, "loss": 0.486, "step": 800 }, { "epoch": 0.3737024221453287, "grad_norm": 1.027239203453064, "learning_rate": 9.772404535942802e-06, "loss": 0.4645, "step": 810 }, { "epoch": 0.37831603229527105, "grad_norm": 1.01218581199646, "learning_rate": 9.760236377514128e-06, "loss": 0.4503, "step": 820 }, { "epoch": 0.3829296424452134, "grad_norm": 1.0366742610931396, "learning_rate": 9.747759334168602e-06, "loss": 0.4602, "step": 830 }, { "epoch": 0.3875432525951557, "grad_norm": 1.3200701475143433, "learning_rate": 9.734974215523684e-06, "loss": 0.4535, "step": 840 }, { "epoch": 0.39215686274509803, "grad_norm": 1.200269103050232, "learning_rate": 9.721881851187406e-06, "loss": 0.4569, "step": 850 }, { "epoch": 0.39677047289504036, "grad_norm": 1.3125215768814087, "learning_rate": 9.708483090704524e-06, "loss": 0.4359, "step": 860 }, { "epoch": 0.4013840830449827, "grad_norm": 0.9996836185455322, "learning_rate": 9.694778803501404e-06, "loss": 0.4573, "step": 870 }, { "epoch": 0.405997693194925, "grad_norm": 1.1388590335845947, "learning_rate": 9.680769878829606e-06, "loss": 0.4737, "step": 880 }, { "epoch": 0.41061130334486734, "grad_norm": 1.0428895950317383, "learning_rate": 9.666457225708175e-06, "loss": 0.4532, "step": 890 }, { "epoch": 0.41522491349480967, "grad_norm": 1.0916321277618408, "learning_rate": 9.65184177286466e-06, "loss": 0.4475, "step": 900 }, { "epoch": 0.419838523644752, "grad_norm": 1.0428664684295654, "learning_rate": 9.636924468674856e-06, "loss": 0.4866, "step": 910 }, { "epoch": 0.4244521337946944, "grad_norm": 1.1310166120529175, "learning_rate": 9.62170628110125e-06, "loss": 0.4437, "step": 920 }, { "epoch": 0.4290657439446367, "grad_norm": 1.1518645286560059, "learning_rate": 9.606188197630224e-06, "loss": 0.4349, "step": 930 }, { "epoch": 0.43367935409457903, "grad_norm": 1.1595722436904907, "learning_rate": 9.590371225207981e-06, "loss": 0.4917, "step": 940 }, { "epoch": 0.43829296424452135, "grad_norm": 1.127502202987671, "learning_rate": 9.574256390175192e-06, "loss": 0.4682, "step": 950 }, { "epoch": 0.4429065743944637, "grad_norm": 0.9480658769607544, "learning_rate": 9.557844738200408e-06, "loss": 0.4912, "step": 960 }, { "epoch": 0.447520184544406, "grad_norm": 1.121092677116394, "learning_rate": 9.541137334212212e-06, "loss": 0.461, "step": 970 }, { "epoch": 0.45213379469434833, "grad_norm": 1.1865901947021484, "learning_rate": 9.524135262330098e-06, "loss": 0.4414, "step": 980 }, { "epoch": 0.45674740484429066, "grad_norm": 1.2148582935333252, "learning_rate": 9.506839625794152e-06, "loss": 0.4457, "step": 990 }, { "epoch": 0.461361014994233, "grad_norm": 1.0445005893707275, "learning_rate": 9.489251546893441e-06, "loss": 0.4457, "step": 1000 }, { "epoch": 0.461361014994233, "eval_loss": 0.386065274477005, "eval_runtime": 400.8996, "eval_samples_per_second": 8.409, "eval_steps_per_second": 1.202, "step": 1000 }, { "epoch": 0.4659746251441753, "grad_norm": 1.1309161186218262, "learning_rate": 9.4713721668932e-06, "loss": 0.4457, "step": 1010 }, { "epoch": 0.47058823529411764, "grad_norm": 1.0543358325958252, "learning_rate": 9.453202645960775e-06, "loss": 0.4343, "step": 1020 }, { "epoch": 0.47520184544405997, "grad_norm": 0.9181652665138245, "learning_rate": 9.434744163090341e-06, "loss": 0.402, "step": 1030 }, { "epoch": 0.4798154555940023, "grad_norm": 0.8402210474014282, "learning_rate": 9.415997916026401e-06, "loss": 0.4742, "step": 1040 }, { "epoch": 0.4844290657439446, "grad_norm": 0.9334933757781982, "learning_rate": 9.396965121186058e-06, "loss": 0.4487, "step": 1050 }, { "epoch": 0.48904267589388695, "grad_norm": 1.047976016998291, "learning_rate": 9.377647013580102e-06, "loss": 0.449, "step": 1060 }, { "epoch": 0.4936562860438293, "grad_norm": 1.0943602323532104, "learning_rate": 9.358044846732848e-06, "loss": 0.4591, "step": 1070 }, { "epoch": 0.4982698961937716, "grad_norm": 1.235120177268982, "learning_rate": 9.338159892600809e-06, "loss": 0.457, "step": 1080 }, { "epoch": 0.5028835063437139, "grad_norm": 1.1438275575637817, "learning_rate": 9.317993441490163e-06, "loss": 0.4863, "step": 1090 }, { "epoch": 0.5074971164936563, "grad_norm": 1.0768276453018188, "learning_rate": 9.297546801973027e-06, "loss": 0.4863, "step": 1100 }, { "epoch": 0.5121107266435986, "grad_norm": 0.9337517023086548, "learning_rate": 9.276821300802535e-06, "loss": 0.4445, "step": 1110 }, { "epoch": 0.5167243367935409, "grad_norm": 1.2967017889022827, "learning_rate": 9.255818282826755e-06, "loss": 0.4654, "step": 1120 }, { "epoch": 0.5213379469434832, "grad_norm": 1.256011962890625, "learning_rate": 9.23453911090143e-06, "loss": 0.4164, "step": 1130 }, { "epoch": 0.5259515570934256, "grad_norm": 0.8023069500923157, "learning_rate": 9.21298516580153e-06, "loss": 0.4471, "step": 1140 }, { "epoch": 0.5305651672433679, "grad_norm": 1.1097207069396973, "learning_rate": 9.191157846131662e-06, "loss": 0.438, "step": 1150 }, { "epoch": 0.5351787773933102, "grad_norm": 1.0085464715957642, "learning_rate": 9.169058568235324e-06, "loss": 0.4754, "step": 1160 }, { "epoch": 0.5397923875432526, "grad_norm": 1.2570720911026, "learning_rate": 9.146688766102985e-06, "loss": 0.4622, "step": 1170 }, { "epoch": 0.544405997693195, "grad_norm": 0.8718953728675842, "learning_rate": 9.124049891279052e-06, "loss": 0.4778, "step": 1180 }, { "epoch": 0.5490196078431373, "grad_norm": 1.1282092332839966, "learning_rate": 9.101143412767665e-06, "loss": 0.43, "step": 1190 }, { "epoch": 0.5536332179930796, "grad_norm": 1.1799993515014648, "learning_rate": 9.077970816937394e-06, "loss": 0.4398, "step": 1200 }, { "epoch": 0.558246828143022, "grad_norm": 1.0852458477020264, "learning_rate": 9.05453360742477e-06, "loss": 0.4509, "step": 1210 }, { "epoch": 0.5628604382929643, "grad_norm": 0.9532956480979919, "learning_rate": 9.030833305036732e-06, "loss": 0.4322, "step": 1220 }, { "epoch": 0.5674740484429066, "grad_norm": 1.1594491004943848, "learning_rate": 9.006871447651941e-06, "loss": 0.4463, "step": 1230 }, { "epoch": 0.5720876585928489, "grad_norm": 1.0407638549804688, "learning_rate": 8.982649590120982e-06, "loss": 0.4744, "step": 1240 }, { "epoch": 0.5767012687427913, "grad_norm": 0.9278146028518677, "learning_rate": 8.95816930416548e-06, "loss": 0.4506, "step": 1250 }, { "epoch": 0.5813148788927336, "grad_norm": 1.0362186431884766, "learning_rate": 8.933432178276108e-06, "loss": 0.5047, "step": 1260 }, { "epoch": 0.5859284890426759, "grad_norm": 1.1875795125961304, "learning_rate": 8.908439817609514e-06, "loss": 0.4331, "step": 1270 }, { "epoch": 0.5905420991926182, "grad_norm": 1.0592981576919556, "learning_rate": 8.883193843884169e-06, "loss": 0.4869, "step": 1280 }, { "epoch": 0.5951557093425606, "grad_norm": 1.037862777709961, "learning_rate": 8.857695895275127e-06, "loss": 0.4568, "step": 1290 }, { "epoch": 0.5997693194925029, "grad_norm": 1.190478801727295, "learning_rate": 8.831947626307735e-06, "loss": 0.4291, "step": 1300 }, { "epoch": 0.6043829296424452, "grad_norm": 1.168628215789795, "learning_rate": 8.805950707750268e-06, "loss": 0.4864, "step": 1310 }, { "epoch": 0.6089965397923875, "grad_norm": 1.1069689989089966, "learning_rate": 8.779706826505513e-06, "loss": 0.4755, "step": 1320 }, { "epoch": 0.6136101499423299, "grad_norm": 1.0567044019699097, "learning_rate": 8.753217685501317e-06, "loss": 0.4429, "step": 1330 }, { "epoch": 0.6182237600922722, "grad_norm": 1.3095778226852417, "learning_rate": 8.72648500358008e-06, "loss": 0.4799, "step": 1340 }, { "epoch": 0.6228373702422145, "grad_norm": 1.2443181276321411, "learning_rate": 8.699510515387222e-06, "loss": 0.4238, "step": 1350 }, { "epoch": 0.6274509803921569, "grad_norm": 0.9751482605934143, "learning_rate": 8.672295971258624e-06, "loss": 0.4621, "step": 1360 }, { "epoch": 0.6320645905420992, "grad_norm": 1.0337327718734741, "learning_rate": 8.644843137107058e-06, "loss": 0.482, "step": 1370 }, { "epoch": 0.6366782006920415, "grad_norm": 1.4924689531326294, "learning_rate": 8.617153794307588e-06, "loss": 0.4138, "step": 1380 }, { "epoch": 0.6412918108419838, "grad_norm": 1.1185983419418335, "learning_rate": 8.58922973958199e-06, "loss": 0.4808, "step": 1390 }, { "epoch": 0.6459054209919262, "grad_norm": 0.8987427949905396, "learning_rate": 8.561072784882156e-06, "loss": 0.4196, "step": 1400 }, { "epoch": 0.6505190311418685, "grad_norm": 0.9043972492218018, "learning_rate": 8.532684757272527e-06, "loss": 0.4675, "step": 1410 }, { "epoch": 0.6551326412918108, "grad_norm": 1.1553007364273071, "learning_rate": 8.504067498811533e-06, "loss": 0.4585, "step": 1420 }, { "epoch": 0.6597462514417531, "grad_norm": 1.2655616998672485, "learning_rate": 8.475222866432065e-06, "loss": 0.4557, "step": 1430 }, { "epoch": 0.6643598615916955, "grad_norm": 0.9298520684242249, "learning_rate": 8.446152731820984e-06, "loss": 0.4378, "step": 1440 }, { "epoch": 0.6689734717416378, "grad_norm": 1.0106518268585205, "learning_rate": 8.416858981297663e-06, "loss": 0.482, "step": 1450 }, { "epoch": 0.6735870818915801, "grad_norm": 0.7871996760368347, "learning_rate": 8.387343515691594e-06, "loss": 0.4153, "step": 1460 }, { "epoch": 0.6782006920415224, "grad_norm": 0.8685919046401978, "learning_rate": 8.357608250219046e-06, "loss": 0.4619, "step": 1470 }, { "epoch": 0.6828143021914648, "grad_norm": 1.3125975131988525, "learning_rate": 8.327655114358782e-06, "loss": 0.4327, "step": 1480 }, { "epoch": 0.6874279123414071, "grad_norm": 1.0396238565444946, "learning_rate": 8.297486051726864e-06, "loss": 0.4713, "step": 1490 }, { "epoch": 0.6920415224913494, "grad_norm": 0.7324469685554504, "learning_rate": 8.267103019950529e-06, "loss": 0.4197, "step": 1500 }, { "epoch": 0.6920415224913494, "eval_loss": 0.3744993507862091, "eval_runtime": 409.5922, "eval_samples_per_second": 8.23, "eval_steps_per_second": 1.177, "step": 1500 }, { "epoch": 0.6966551326412919, "grad_norm": 1.0459123849868774, "learning_rate": 8.23650799054117e-06, "loss": 0.4525, "step": 1510 }, { "epoch": 0.7012687427912342, "grad_norm": 0.972507655620575, "learning_rate": 8.2057029487664e-06, "loss": 0.4344, "step": 1520 }, { "epoch": 0.7058823529411765, "grad_norm": 0.9018703103065491, "learning_rate": 8.174689893521239e-06, "loss": 0.4456, "step": 1530 }, { "epoch": 0.7104959630911188, "grad_norm": 1.1698877811431885, "learning_rate": 8.143470837198394e-06, "loss": 0.4342, "step": 1540 }, { "epoch": 0.7151095732410612, "grad_norm": 0.8043988943099976, "learning_rate": 8.112047805557693e-06, "loss": 0.4407, "step": 1550 }, { "epoch": 0.7197231833910035, "grad_norm": 1.0644773244857788, "learning_rate": 8.080422837594627e-06, "loss": 0.4188, "step": 1560 }, { "epoch": 0.7243367935409458, "grad_norm": 1.3345856666564941, "learning_rate": 8.048597985408047e-06, "loss": 0.4594, "step": 1570 }, { "epoch": 0.7289504036908881, "grad_norm": 0.9245930910110474, "learning_rate": 8.016575314067005e-06, "loss": 0.4549, "step": 1580 }, { "epoch": 0.7335640138408305, "grad_norm": 0.8729799389839172, "learning_rate": 7.984356901476755e-06, "loss": 0.4548, "step": 1590 }, { "epoch": 0.7381776239907728, "grad_norm": 1.0106137990951538, "learning_rate": 7.951944838243916e-06, "loss": 0.4452, "step": 1600 }, { "epoch": 0.7427912341407151, "grad_norm": 1.196505069732666, "learning_rate": 7.919341227540828e-06, "loss": 0.4491, "step": 1610 }, { "epoch": 0.7474048442906575, "grad_norm": 1.1595311164855957, "learning_rate": 7.886548184969063e-06, "loss": 0.4731, "step": 1620 }, { "epoch": 0.7520184544405998, "grad_norm": 1.1693317890167236, "learning_rate": 7.85356783842216e-06, "loss": 0.432, "step": 1630 }, { "epoch": 0.7566320645905421, "grad_norm": 0.9775774478912354, "learning_rate": 7.820402327947543e-06, "loss": 0.461, "step": 1640 }, { "epoch": 0.7612456747404844, "grad_norm": 1.2050389051437378, "learning_rate": 7.78705380560766e-06, "loss": 0.4118, "step": 1650 }, { "epoch": 0.7658592848904268, "grad_norm": 0.976572573184967, "learning_rate": 7.753524435340334e-06, "loss": 0.445, "step": 1660 }, { "epoch": 0.7704728950403691, "grad_norm": 0.9844825863838196, "learning_rate": 7.719816392818354e-06, "loss": 0.453, "step": 1670 }, { "epoch": 0.7750865051903114, "grad_norm": 0.966995894908905, "learning_rate": 7.685931865308293e-06, "loss": 0.4424, "step": 1680 }, { "epoch": 0.7797001153402537, "grad_norm": 0.9509267807006836, "learning_rate": 7.651873051528582e-06, "loss": 0.4164, "step": 1690 }, { "epoch": 0.7843137254901961, "grad_norm": 0.89404296875, "learning_rate": 7.617642161506837e-06, "loss": 0.4345, "step": 1700 }, { "epoch": 0.7889273356401384, "grad_norm": 1.4994254112243652, "learning_rate": 7.583241416436462e-06, "loss": 0.4373, "step": 1710 }, { "epoch": 0.7935409457900807, "grad_norm": 1.2022879123687744, "learning_rate": 7.548673048532504e-06, "loss": 0.4146, "step": 1720 }, { "epoch": 0.798154555940023, "grad_norm": 1.1147469282150269, "learning_rate": 7.513939300886816e-06, "loss": 0.4008, "step": 1730 }, { "epoch": 0.8027681660899654, "grad_norm": 0.9766092896461487, "learning_rate": 7.479042427322509e-06, "loss": 0.4401, "step": 1740 }, { "epoch": 0.8073817762399077, "grad_norm": 1.0522454977035522, "learning_rate": 7.443984692247701e-06, "loss": 0.4565, "step": 1750 }, { "epoch": 0.81199538638985, "grad_norm": 0.9872923493385315, "learning_rate": 7.408768370508577e-06, "loss": 0.432, "step": 1760 }, { "epoch": 0.8166089965397924, "grad_norm": 0.729234516620636, "learning_rate": 7.373395747241792e-06, "loss": 0.3847, "step": 1770 }, { "epoch": 0.8212226066897347, "grad_norm": 0.9378695487976074, "learning_rate": 7.337869117726176e-06, "loss": 0.412, "step": 1780 }, { "epoch": 0.825836216839677, "grad_norm": 1.1060293912887573, "learning_rate": 7.302190787233808e-06, "loss": 0.4462, "step": 1790 }, { "epoch": 0.8304498269896193, "grad_norm": 1.1734408140182495, "learning_rate": 7.266363070880424e-06, "loss": 0.4321, "step": 1800 }, { "epoch": 0.8350634371395617, "grad_norm": 0.9876635670661926, "learning_rate": 7.2303882934751965e-06, "loss": 0.4477, "step": 1810 }, { "epoch": 0.839677047289504, "grad_norm": 1.1338772773742676, "learning_rate": 7.194268789369875e-06, "loss": 0.4028, "step": 1820 }, { "epoch": 0.8442906574394463, "grad_norm": 0.9537489414215088, "learning_rate": 7.158006902307322e-06, "loss": 0.457, "step": 1830 }, { "epoch": 0.8489042675893888, "grad_norm": 1.215729832649231, "learning_rate": 7.121604985269423e-06, "loss": 0.4248, "step": 1840 }, { "epoch": 0.8535178777393311, "grad_norm": 1.3123574256896973, "learning_rate": 7.085065400324407e-06, "loss": 0.4731, "step": 1850 }, { "epoch": 0.8581314878892734, "grad_norm": 0.9171858429908752, "learning_rate": 7.048390518473579e-06, "loss": 0.3925, "step": 1860 }, { "epoch": 0.8627450980392157, "grad_norm": 1.195125937461853, "learning_rate": 7.011582719497466e-06, "loss": 0.4481, "step": 1870 }, { "epoch": 0.8673587081891581, "grad_norm": 1.029279112815857, "learning_rate": 6.974644391801395e-06, "loss": 0.4487, "step": 1880 }, { "epoch": 0.8719723183391004, "grad_norm": 1.345962643623352, "learning_rate": 6.9375779322605154e-06, "loss": 0.4424, "step": 1890 }, { "epoch": 0.8765859284890427, "grad_norm": 0.9620792269706726, "learning_rate": 6.900385746064268e-06, "loss": 0.4628, "step": 1900 }, { "epoch": 0.881199538638985, "grad_norm": 1.1548868417739868, "learning_rate": 6.863070246560319e-06, "loss": 0.4194, "step": 1910 }, { "epoch": 0.8858131487889274, "grad_norm": 0.8851338624954224, "learning_rate": 6.825633855097954e-06, "loss": 0.4404, "step": 1920 }, { "epoch": 0.8904267589388697, "grad_norm": 1.1914703845977783, "learning_rate": 6.788079000870966e-06, "loss": 0.4654, "step": 1930 }, { "epoch": 0.895040369088812, "grad_norm": 0.8694286346435547, "learning_rate": 6.7504081207600295e-06, "loss": 0.4849, "step": 1940 }, { "epoch": 0.8996539792387543, "grad_norm": 0.9843218326568604, "learning_rate": 6.712623659174569e-06, "loss": 0.4286, "step": 1950 }, { "epoch": 0.9042675893886967, "grad_norm": 1.0082261562347412, "learning_rate": 6.674728067894149e-06, "loss": 0.4271, "step": 1960 }, { "epoch": 0.908881199538639, "grad_norm": 1.0179473161697388, "learning_rate": 6.636723805909384e-06, "loss": 0.4384, "step": 1970 }, { "epoch": 0.9134948096885813, "grad_norm": 1.102802038192749, "learning_rate": 6.598613339262369e-06, "loss": 0.4058, "step": 1980 }, { "epoch": 0.9181084198385236, "grad_norm": 1.0184437036514282, "learning_rate": 6.560399140886673e-06, "loss": 0.4047, "step": 1990 }, { "epoch": 0.922722029988466, "grad_norm": 0.9515882134437561, "learning_rate": 6.522083690446863e-06, "loss": 0.4264, "step": 2000 }, { "epoch": 0.922722029988466, "eval_loss": 0.3640458583831787, "eval_runtime": 405.1587, "eval_samples_per_second": 8.32, "eval_steps_per_second": 1.19, "step": 2000 }, { "epoch": 0.9273356401384083, "grad_norm": 0.9829747080802917, "learning_rate": 6.483669474177609e-06, "loss": 0.4309, "step": 2010 }, { "epoch": 0.9319492502883506, "grad_norm": 1.134294033050537, "learning_rate": 6.445158984722358e-06, "loss": 0.4321, "step": 2020 }, { "epoch": 0.936562860438293, "grad_norm": 0.8324179649353027, "learning_rate": 6.406554720971583e-06, "loss": 0.4118, "step": 2030 }, { "epoch": 0.9411764705882353, "grad_norm": 0.9672048091888428, "learning_rate": 6.367859187900635e-06, "loss": 0.4508, "step": 2040 }, { "epoch": 0.9457900807381776, "grad_norm": 0.7900782823562622, "learning_rate": 6.329074896407202e-06, "loss": 0.4088, "step": 2050 }, { "epoch": 0.9504036908881199, "grad_norm": 1.2132816314697266, "learning_rate": 6.29020436314838e-06, "loss": 0.4, "step": 2060 }, { "epoch": 0.9550173010380623, "grad_norm": 1.028160810470581, "learning_rate": 6.251250110377368e-06, "loss": 0.4122, "step": 2070 }, { "epoch": 0.9596309111880046, "grad_norm": 0.979695200920105, "learning_rate": 6.212214665779805e-06, "loss": 0.4449, "step": 2080 }, { "epoch": 0.9642445213379469, "grad_norm": 0.845983624458313, "learning_rate": 6.173100562309751e-06, "loss": 0.4229, "step": 2090 }, { "epoch": 0.9688581314878892, "grad_norm": 1.1386796236038208, "learning_rate": 6.133910338025329e-06, "loss": 0.4389, "step": 2100 }, { "epoch": 0.9734717416378316, "grad_norm": 1.0641363859176636, "learning_rate": 6.094646535924026e-06, "loss": 0.4459, "step": 2110 }, { "epoch": 0.9780853517877739, "grad_norm": 1.2342710494995117, "learning_rate": 6.055311703777699e-06, "loss": 0.4556, "step": 2120 }, { "epoch": 0.9826989619377162, "grad_norm": 0.9290037751197815, "learning_rate": 6.0159083939672326e-06, "loss": 0.4837, "step": 2130 }, { "epoch": 0.9873125720876585, "grad_norm": 1.0555449724197388, "learning_rate": 5.976439163316936e-06, "loss": 0.4119, "step": 2140 }, { "epoch": 0.9919261822376009, "grad_norm": 1.001559853553772, "learning_rate": 5.936906572928625e-06, "loss": 0.4391, "step": 2150 }, { "epoch": 0.9965397923875432, "grad_norm": 1.120397686958313, "learning_rate": 5.897313188015433e-06, "loss": 0.4175, "step": 2160 }, { "epoch": 1.0009227220299886, "grad_norm": 1.0661535263061523, "learning_rate": 5.8576615777353725e-06, "loss": 0.4176, "step": 2170 }, { "epoch": 1.0055363321799309, "grad_norm": 0.8464300036430359, "learning_rate": 5.81795431502461e-06, "loss": 0.3182, "step": 2180 }, { "epoch": 1.0101499423298732, "grad_norm": 1.150085687637329, "learning_rate": 5.778193976430518e-06, "loss": 0.3412, "step": 2190 }, { "epoch": 1.0147635524798155, "grad_norm": 1.1552358865737915, "learning_rate": 5.738383141944493e-06, "loss": 0.3254, "step": 2200 }, { "epoch": 1.0193771626297579, "grad_norm": 0.8325443863868713, "learning_rate": 5.698524394834531e-06, "loss": 0.3121, "step": 2210 }, { "epoch": 1.0239907727797002, "grad_norm": 0.9441822171211243, "learning_rate": 5.658620321477613e-06, "loss": 0.309, "step": 2220 }, { "epoch": 1.0286043829296425, "grad_norm": 0.683917224407196, "learning_rate": 5.6186735111918735e-06, "loss": 0.2945, "step": 2230 }, { "epoch": 1.0332179930795848, "grad_norm": 1.1293641328811646, "learning_rate": 5.5786865560685855e-06, "loss": 0.3277, "step": 2240 }, { "epoch": 1.0378316032295272, "grad_norm": 1.0378141403198242, "learning_rate": 5.538662050803965e-06, "loss": 0.3337, "step": 2250 }, { "epoch": 1.0424452133794695, "grad_norm": 0.8613712787628174, "learning_rate": 5.498602592530799e-06, "loss": 0.3145, "step": 2260 }, { "epoch": 1.0470588235294118, "grad_norm": 0.8895742297172546, "learning_rate": 5.458510780649932e-06, "loss": 0.3016, "step": 2270 }, { "epoch": 1.0516724336793541, "grad_norm": 0.8962990045547485, "learning_rate": 5.41838921666158e-06, "loss": 0.3107, "step": 2280 }, { "epoch": 1.0562860438292965, "grad_norm": 1.1359519958496094, "learning_rate": 5.378240503996531e-06, "loss": 0.313, "step": 2290 }, { "epoch": 1.0608996539792388, "grad_norm": 0.9063310027122498, "learning_rate": 5.338067247847219e-06, "loss": 0.3186, "step": 2300 }, { "epoch": 1.0655132641291811, "grad_norm": 0.927183985710144, "learning_rate": 5.297872054998663e-06, "loss": 0.3198, "step": 2310 }, { "epoch": 1.0701268742791235, "grad_norm": 1.573792815208435, "learning_rate": 5.257657533659326e-06, "loss": 0.3181, "step": 2320 }, { "epoch": 1.0747404844290658, "grad_norm": 1.2177760601043701, "learning_rate": 5.217426293291869e-06, "loss": 0.3369, "step": 2330 }, { "epoch": 1.079354094579008, "grad_norm": 1.1653475761413574, "learning_rate": 5.177180944443821e-06, "loss": 0.311, "step": 2340 }, { "epoch": 1.0839677047289504, "grad_norm": 0.874153196811676, "learning_rate": 5.136924098578201e-06, "loss": 0.3109, "step": 2350 }, { "epoch": 1.0885813148788928, "grad_norm": 1.03621244430542, "learning_rate": 5.096658367904043e-06, "loss": 0.2808, "step": 2360 }, { "epoch": 1.093194925028835, "grad_norm": 1.4676544666290283, "learning_rate": 5.056386365206908e-06, "loss": 0.3435, "step": 2370 }, { "epoch": 1.0978085351787774, "grad_norm": 1.0248422622680664, "learning_rate": 5.016110703679341e-06, "loss": 0.3141, "step": 2380 }, { "epoch": 1.1024221453287197, "grad_norm": 1.0083783864974976, "learning_rate": 4.9758339967512995e-06, "loss": 0.3074, "step": 2390 }, { "epoch": 1.107035755478662, "grad_norm": 0.9300906658172607, "learning_rate": 4.935558857920576e-06, "loss": 0.3255, "step": 2400 }, { "epoch": 1.1116493656286044, "grad_norm": 1.1581122875213623, "learning_rate": 4.895287900583216e-06, "loss": 0.3007, "step": 2410 }, { "epoch": 1.1162629757785467, "grad_norm": 1.3533753156661987, "learning_rate": 4.855023737863927e-06, "loss": 0.3383, "step": 2420 }, { "epoch": 1.120876585928489, "grad_norm": 1.157009482383728, "learning_rate": 4.814768982446532e-06, "loss": 0.3207, "step": 2430 }, { "epoch": 1.1254901960784314, "grad_norm": 1.2679253816604614, "learning_rate": 4.774526246404417e-06, "loss": 0.3069, "step": 2440 }, { "epoch": 1.1301038062283737, "grad_norm": 1.1905463933944702, "learning_rate": 4.734298141031057e-06, "loss": 0.2949, "step": 2450 }, { "epoch": 1.134717416378316, "grad_norm": 0.9034658670425415, "learning_rate": 4.69408727667056e-06, "loss": 0.3602, "step": 2460 }, { "epoch": 1.1393310265282584, "grad_norm": 0.964447021484375, "learning_rate": 4.653896262548291e-06, "loss": 0.2999, "step": 2470 }, { "epoch": 1.1439446366782007, "grad_norm": 0.8305296897888184, "learning_rate": 4.613727706601558e-06, "loss": 0.3186, "step": 2480 }, { "epoch": 1.148558246828143, "grad_norm": 1.3243507146835327, "learning_rate": 4.573584215310394e-06, "loss": 0.2857, "step": 2490 }, { "epoch": 1.1531718569780853, "grad_norm": 1.1306155920028687, "learning_rate": 4.533468393528421e-06, "loss": 0.3188, "step": 2500 }, { "epoch": 1.1531718569780853, "eval_loss": 0.36377301812171936, "eval_runtime": 432.1909, "eval_samples_per_second": 7.8, "eval_steps_per_second": 1.115, "step": 2500 }, { "epoch": 1.1577854671280277, "grad_norm": 1.311614751815796, "learning_rate": 4.493382844313826e-06, "loss": 0.3255, "step": 2510 }, { "epoch": 1.16239907727797, "grad_norm": 1.1632609367370605, "learning_rate": 4.453330168760451e-06, "loss": 0.3408, "step": 2520 }, { "epoch": 1.1670126874279123, "grad_norm": 0.9437416195869446, "learning_rate": 4.41331296582902e-06, "loss": 0.3562, "step": 2530 }, { "epoch": 1.1716262975778546, "grad_norm": 1.4374769926071167, "learning_rate": 4.373333832178478e-06, "loss": 0.3049, "step": 2540 }, { "epoch": 1.176239907727797, "grad_norm": 0.9964131712913513, "learning_rate": 4.333395361997521e-06, "loss": 0.3223, "step": 2550 }, { "epoch": 1.1808535178777393, "grad_norm": 0.7799270749092102, "learning_rate": 4.293500146836241e-06, "loss": 0.2913, "step": 2560 }, { "epoch": 1.1854671280276816, "grad_norm": 1.0871920585632324, "learning_rate": 4.25365077543798e-06, "loss": 0.2823, "step": 2570 }, { "epoch": 1.190080738177624, "grad_norm": 1.0069403648376465, "learning_rate": 4.213849833571341e-06, "loss": 0.3583, "step": 2580 }, { "epoch": 1.1946943483275663, "grad_norm": 0.9551932215690613, "learning_rate": 4.174099903862403e-06, "loss": 0.3101, "step": 2590 }, { "epoch": 1.1993079584775086, "grad_norm": 0.7922395467758179, "learning_rate": 4.134403565627144e-06, "loss": 0.3311, "step": 2600 }, { "epoch": 1.203921568627451, "grad_norm": 1.0218504667282104, "learning_rate": 4.0947633947040616e-06, "loss": 0.3437, "step": 2610 }, { "epoch": 1.2085351787773932, "grad_norm": 1.01131272315979, "learning_rate": 4.055181963287044e-06, "loss": 0.2788, "step": 2620 }, { "epoch": 1.2131487889273356, "grad_norm": 0.9900946021080017, "learning_rate": 4.01566183975845e-06, "loss": 0.3188, "step": 2630 }, { "epoch": 1.217762399077278, "grad_norm": 1.219028353691101, "learning_rate": 3.9762055885224614e-06, "loss": 0.2936, "step": 2640 }, { "epoch": 1.2223760092272202, "grad_norm": 1.4025200605392456, "learning_rate": 3.936815769838682e-06, "loss": 0.266, "step": 2650 }, { "epoch": 1.2269896193771626, "grad_norm": 1.1445423364639282, "learning_rate": 3.897494939655996e-06, "loss": 0.315, "step": 2660 }, { "epoch": 1.2316032295271049, "grad_norm": 1.1953898668289185, "learning_rate": 3.8582456494467214e-06, "loss": 0.3161, "step": 2670 }, { "epoch": 1.2362168396770472, "grad_norm": 1.501749873161316, "learning_rate": 3.819070446041059e-06, "loss": 0.3216, "step": 2680 }, { "epoch": 1.2408304498269895, "grad_norm": 0.9947803616523743, "learning_rate": 3.779971871461813e-06, "loss": 0.3184, "step": 2690 }, { "epoch": 1.2454440599769319, "grad_norm": 0.9146224856376648, "learning_rate": 3.7409524627594607e-06, "loss": 0.3097, "step": 2700 }, { "epoch": 1.2500576701268744, "grad_norm": 1.1721278429031372, "learning_rate": 3.702014751847514e-06, "loss": 0.2805, "step": 2710 }, { "epoch": 1.2546712802768165, "grad_norm": 0.8447152972221375, "learning_rate": 3.6631612653382354e-06, "loss": 0.3199, "step": 2720 }, { "epoch": 1.259284890426759, "grad_norm": 1.5338748693466187, "learning_rate": 3.624394524378684e-06, "loss": 0.3204, "step": 2730 }, { "epoch": 1.2638985005767012, "grad_norm": 0.9287798404693604, "learning_rate": 3.585717044487126e-06, "loss": 0.3378, "step": 2740 }, { "epoch": 1.2685121107266437, "grad_norm": 1.4134514331817627, "learning_rate": 3.5471313353898056e-06, "loss": 0.3073, "step": 2750 }, { "epoch": 1.2731257208765858, "grad_norm": 1.08121919631958, "learning_rate": 3.5086399008580885e-06, "loss": 0.3255, "step": 2760 }, { "epoch": 1.2777393310265284, "grad_norm": 1.100626826286316, "learning_rate": 3.470245238546002e-06, "loss": 0.3108, "step": 2770 }, { "epoch": 1.2823529411764705, "grad_norm": 0.8085044622421265, "learning_rate": 3.4319498398281638e-06, "loss": 0.2944, "step": 2780 }, { "epoch": 1.286966551326413, "grad_norm": 1.2880297899246216, "learning_rate": 3.393756189638115e-06, "loss": 0.3167, "step": 2790 }, { "epoch": 1.2915801614763551, "grad_norm": 1.2243609428405762, "learning_rate": 3.355666766307084e-06, "loss": 0.3009, "step": 2800 }, { "epoch": 1.2961937716262977, "grad_norm": 0.699437141418457, "learning_rate": 3.3176840414031653e-06, "loss": 0.2878, "step": 2810 }, { "epoch": 1.3008073817762398, "grad_norm": 1.0296318531036377, "learning_rate": 3.2798104795709484e-06, "loss": 0.2743, "step": 2820 }, { "epoch": 1.3054209919261823, "grad_norm": 1.024989366531372, "learning_rate": 3.242048538371585e-06, "loss": 0.3117, "step": 2830 }, { "epoch": 1.3100346020761244, "grad_norm": 1.111118197441101, "learning_rate": 3.2044006681233226e-06, "loss": 0.3065, "step": 2840 }, { "epoch": 1.314648212226067, "grad_norm": 1.0204840898513794, "learning_rate": 3.1668693117425128e-06, "loss": 0.3182, "step": 2850 }, { "epoch": 1.3192618223760093, "grad_norm": 0.909860372543335, "learning_rate": 3.1294569045850844e-06, "loss": 0.3362, "step": 2860 }, { "epoch": 1.3238754325259516, "grad_norm": 1.1789814233779907, "learning_rate": 3.092165874288525e-06, "loss": 0.3202, "step": 2870 }, { "epoch": 1.328489042675894, "grad_norm": 0.8858640193939209, "learning_rate": 3.05499864061435e-06, "loss": 0.3061, "step": 2880 }, { "epoch": 1.3331026528258363, "grad_norm": 0.8976421356201172, "learning_rate": 3.017957615291088e-06, "loss": 0.2937, "step": 2890 }, { "epoch": 1.3377162629757786, "grad_norm": 1.0524935722351074, "learning_rate": 2.981045201857796e-06, "loss": 0.3056, "step": 2900 }, { "epoch": 1.342329873125721, "grad_norm": 1.0822246074676514, "learning_rate": 2.9442637955080787e-06, "loss": 0.2964, "step": 2910 }, { "epoch": 1.3469434832756633, "grad_norm": 1.043286681175232, "learning_rate": 2.9076157829346883e-06, "loss": 0.322, "step": 2920 }, { "epoch": 1.3515570934256056, "grad_norm": 1.2808856964111328, "learning_rate": 2.871103542174637e-06, "loss": 0.3186, "step": 2930 }, { "epoch": 1.356170703575548, "grad_norm": 0.9058982729911804, "learning_rate": 2.8347294424549075e-06, "loss": 0.2989, "step": 2940 }, { "epoch": 1.3607843137254902, "grad_norm": 0.8408973217010498, "learning_rate": 2.7984958440387045e-06, "loss": 0.3095, "step": 2950 }, { "epoch": 1.3653979238754326, "grad_norm": 0.7515527606010437, "learning_rate": 2.7624050980723032e-06, "loss": 0.3209, "step": 2960 }, { "epoch": 1.370011534025375, "grad_norm": 0.9075823426246643, "learning_rate": 2.726459546432488e-06, "loss": 0.3238, "step": 2970 }, { "epoch": 1.3746251441753172, "grad_norm": 0.7237765192985535, "learning_rate": 2.690661521574596e-06, "loss": 0.2856, "step": 2980 }, { "epoch": 1.3792387543252596, "grad_norm": 1.110317349433899, "learning_rate": 2.655013346381158e-06, "loss": 0.3145, "step": 2990 }, { "epoch": 1.3838523644752019, "grad_norm": 0.925359845161438, "learning_rate": 2.6195173340111767e-06, "loss": 0.2938, "step": 3000 }, { "epoch": 1.3838523644752019, "eval_loss": 0.3571609854698181, "eval_runtime": 407.4112, "eval_samples_per_second": 8.274, "eval_steps_per_second": 1.183, "step": 3000 }, { "epoch": 1.3884659746251442, "grad_norm": 0.768183171749115, "learning_rate": 2.5841757877500245e-06, "loss": 0.2978, "step": 3010 }, { "epoch": 1.3930795847750865, "grad_norm": 0.9414054751396179, "learning_rate": 2.548991000859997e-06, "loss": 0.2824, "step": 3020 }, { "epoch": 1.3976931949250289, "grad_norm": 1.148766040802002, "learning_rate": 2.513965256431488e-06, "loss": 0.3256, "step": 3030 }, { "epoch": 1.4023068050749712, "grad_norm": 1.0877196788787842, "learning_rate": 2.4791008272348656e-06, "loss": 0.3297, "step": 3040 }, { "epoch": 1.4069204152249135, "grad_norm": 1.0412601232528687, "learning_rate": 2.444399975572974e-06, "loss": 0.3279, "step": 3050 }, { "epoch": 1.4115340253748558, "grad_norm": 1.1179814338684082, "learning_rate": 2.40986495313435e-06, "loss": 0.3103, "step": 3060 }, { "epoch": 1.4161476355247982, "grad_norm": 0.7637813091278076, "learning_rate": 2.3754980008471074e-06, "loss": 0.3231, "step": 3070 }, { "epoch": 1.4207612456747405, "grad_norm": 1.0383415222167969, "learning_rate": 2.3413013487335332e-06, "loss": 0.3138, "step": 3080 }, { "epoch": 1.4253748558246828, "grad_norm": 0.8388441801071167, "learning_rate": 2.307277215765377e-06, "loss": 0.2695, "step": 3090 }, { "epoch": 1.4299884659746251, "grad_norm": 0.9990552663803101, "learning_rate": 2.273427809719867e-06, "loss": 0.2983, "step": 3100 }, { "epoch": 1.4346020761245675, "grad_norm": 0.8428414463996887, "learning_rate": 2.2397553270364546e-06, "loss": 0.3141, "step": 3110 }, { "epoch": 1.4392156862745098, "grad_norm": 0.9415843486785889, "learning_rate": 2.206261952674284e-06, "loss": 0.2959, "step": 3120 }, { "epoch": 1.4438292964244521, "grad_norm": 1.3723379373550415, "learning_rate": 2.172949859970422e-06, "loss": 0.3348, "step": 3130 }, { "epoch": 1.4484429065743945, "grad_norm": 0.8914421796798706, "learning_rate": 2.1398212104988273e-06, "loss": 0.3098, "step": 3140 }, { "epoch": 1.4530565167243368, "grad_norm": 0.8774773478507996, "learning_rate": 2.1068781539300874e-06, "loss": 0.2701, "step": 3150 }, { "epoch": 1.457670126874279, "grad_norm": 1.0844688415527344, "learning_rate": 2.0741228278919347e-06, "loss": 0.3135, "step": 3160 }, { "epoch": 1.4622837370242214, "grad_norm": 1.0603561401367188, "learning_rate": 2.0415573578305343e-06, "loss": 0.3234, "step": 3170 }, { "epoch": 1.4668973471741638, "grad_norm": 0.9056031703948975, "learning_rate": 2.0091838568725685e-06, "loss": 0.3034, "step": 3180 }, { "epoch": 1.471510957324106, "grad_norm": 0.8055946230888367, "learning_rate": 1.977004425688126e-06, "loss": 0.3403, "step": 3190 }, { "epoch": 1.4761245674740484, "grad_norm": 1.1034411191940308, "learning_rate": 1.945021152354379e-06, "loss": 0.3235, "step": 3200 }, { "epoch": 1.4807381776239907, "grad_norm": 1.3559411764144897, "learning_rate": 1.913236112220101e-06, "loss": 0.2852, "step": 3210 }, { "epoch": 1.485351787773933, "grad_norm": 1.1443620920181274, "learning_rate": 1.8816513677709935e-06, "loss": 0.3362, "step": 3220 }, { "epoch": 1.4899653979238754, "grad_norm": 0.9599668979644775, "learning_rate": 1.8502689684958664e-06, "loss": 0.2814, "step": 3230 }, { "epoch": 1.4945790080738177, "grad_norm": 1.053106665611267, "learning_rate": 1.8190909507536326e-06, "loss": 0.3092, "step": 3240 }, { "epoch": 1.49919261822376, "grad_norm": 0.979612410068512, "learning_rate": 1.7881193376411822e-06, "loss": 0.2931, "step": 3250 }, { "epoch": 1.5038062283737024, "grad_norm": 1.0935841798782349, "learning_rate": 1.7573561388621102e-06, "loss": 0.2852, "step": 3260 }, { "epoch": 1.5084198385236447, "grad_norm": 1.003023386001587, "learning_rate": 1.7268033505962972e-06, "loss": 0.3252, "step": 3270 }, { "epoch": 1.5130334486735872, "grad_norm": 0.8895878195762634, "learning_rate": 1.6964629553703893e-06, "loss": 0.2965, "step": 3280 }, { "epoch": 1.5176470588235293, "grad_norm": 1.2238770723342896, "learning_rate": 1.6663369219291558e-06, "loss": 0.3256, "step": 3290 }, { "epoch": 1.522260668973472, "grad_norm": 0.9977489709854126, "learning_rate": 1.6364272051077335e-06, "loss": 0.3087, "step": 3300 }, { "epoch": 1.526874279123414, "grad_norm": 0.8793919682502747, "learning_rate": 1.606735745704784e-06, "loss": 0.3082, "step": 3310 }, { "epoch": 1.5314878892733566, "grad_norm": 1.015448808670044, "learning_rate": 1.5772644703565564e-06, "loss": 0.3089, "step": 3320 }, { "epoch": 1.5361014994232987, "grad_norm": 0.9907401204109192, "learning_rate": 1.5480152914118784e-06, "loss": 0.312, "step": 3330 }, { "epoch": 1.5407151095732412, "grad_norm": 1.1181472539901733, "learning_rate": 1.5189901068080536e-06, "loss": 0.2756, "step": 3340 }, { "epoch": 1.5453287197231833, "grad_norm": 0.7450747489929199, "learning_rate": 1.4901907999477167e-06, "loss": 0.2931, "step": 3350 }, { "epoch": 1.5499423298731259, "grad_norm": 0.7395336031913757, "learning_rate": 1.4616192395766189e-06, "loss": 0.3312, "step": 3360 }, { "epoch": 1.554555940023068, "grad_norm": 1.0844025611877441, "learning_rate": 1.4332772796623655e-06, "loss": 0.2877, "step": 3370 }, { "epoch": 1.5591695501730105, "grad_norm": 1.0162688493728638, "learning_rate": 1.405166759274123e-06, "loss": 0.2865, "step": 3380 }, { "epoch": 1.5637831603229526, "grad_norm": 1.459636926651001, "learning_rate": 1.3772895024632753e-06, "loss": 0.2772, "step": 3390 }, { "epoch": 1.5683967704728952, "grad_norm": 1.1167926788330078, "learning_rate": 1.349647318145067e-06, "loss": 0.2826, "step": 3400 }, { "epoch": 1.5730103806228373, "grad_norm": 1.4571030139923096, "learning_rate": 1.3222419999812248e-06, "loss": 0.2582, "step": 3410 }, { "epoch": 1.5776239907727798, "grad_norm": 1.154638648033142, "learning_rate": 1.2950753262635712e-06, "loss": 0.3361, "step": 3420 }, { "epoch": 1.582237600922722, "grad_norm": 0.5414898991584778, "learning_rate": 1.2681490597986313e-06, "loss": 0.305, "step": 3430 }, { "epoch": 1.5868512110726645, "grad_norm": 0.8521725535392761, "learning_rate": 1.2414649477932511e-06, "loss": 0.2935, "step": 3440 }, { "epoch": 1.5914648212226066, "grad_norm": 1.0056465864181519, "learning_rate": 1.2150247217412186e-06, "loss": 0.3227, "step": 3450 }, { "epoch": 1.5960784313725491, "grad_norm": 1.3629816770553589, "learning_rate": 1.1888300973109112e-06, "loss": 0.3037, "step": 3460 }, { "epoch": 1.6006920415224912, "grad_norm": 1.0090680122375488, "learning_rate": 1.1628827742339688e-06, "loss": 0.3172, "step": 3470 }, { "epoch": 1.6053056516724338, "grad_norm": 1.347844123840332, "learning_rate": 1.1371844361950045e-06, "loss": 0.3046, "step": 3480 }, { "epoch": 1.6099192618223759, "grad_norm": 0.7335895299911499, "learning_rate": 1.1117367507223452e-06, "loss": 0.3107, "step": 3490 }, { "epoch": 1.6145328719723184, "grad_norm": 0.8737802505493164, "learning_rate": 1.0865413690798321e-06, "loss": 0.2891, "step": 3500 }, { "epoch": 1.6145328719723184, "eval_loss": 0.3522779047489166, "eval_runtime": 405.2938, "eval_samples_per_second": 8.317, "eval_steps_per_second": 1.189, "step": 3500 }, { "epoch": 1.6191464821222605, "grad_norm": 0.9549174904823303, "learning_rate": 1.061599926159676e-06, "loss": 0.3177, "step": 3510 }, { "epoch": 1.623760092272203, "grad_norm": 1.2092400789260864, "learning_rate": 1.036914040376364e-06, "loss": 0.2951, "step": 3520 }, { "epoch": 1.6283737024221452, "grad_norm": 0.9136941432952881, "learning_rate": 1.0124853135616475e-06, "loss": 0.273, "step": 3530 }, { "epoch": 1.6329873125720877, "grad_norm": 0.8041252493858337, "learning_rate": 9.883153308606035e-07, "loss": 0.307, "step": 3540 }, { "epoch": 1.6376009227220298, "grad_norm": 2.2038888931274414, "learning_rate": 9.644056606287727e-07, "loss": 0.3031, "step": 3550 }, { "epoch": 1.6422145328719724, "grad_norm": 0.995631754398346, "learning_rate": 9.407578543303913e-07, "loss": 0.3121, "step": 3560 }, { "epoch": 1.6468281430219145, "grad_norm": 1.1409215927124023, "learning_rate": 9.173734464377204e-07, "loss": 0.2709, "step": 3570 }, { "epoch": 1.651441753171857, "grad_norm": 1.1905242204666138, "learning_rate": 8.942539543314799e-07, "loss": 0.2877, "step": 3580 }, { "epoch": 1.6560553633217991, "grad_norm": 1.2991387844085693, "learning_rate": 8.714008782023797e-07, "loss": 0.306, "step": 3590 }, { "epoch": 1.6606689734717417, "grad_norm": 1.122862696647644, "learning_rate": 8.488157009537796e-07, "loss": 0.3156, "step": 3600 }, { "epoch": 1.665282583621684, "grad_norm": 1.0552375316619873, "learning_rate": 8.264998881054659e-07, "loss": 0.3164, "step": 3610 }, { "epoch": 1.6698961937716263, "grad_norm": 1.0529013872146606, "learning_rate": 8.044548876985531e-07, "loss": 0.2823, "step": 3620 }, { "epoch": 1.6745098039215687, "grad_norm": 1.0816291570663452, "learning_rate": 7.826821302015275e-07, "loss": 0.3184, "step": 3630 }, { "epoch": 1.679123414071511, "grad_norm": 0.7152329683303833, "learning_rate": 7.61183028417422e-07, "loss": 0.2956, "step": 3640 }, { "epoch": 1.6837370242214533, "grad_norm": 0.9819076061248779, "learning_rate": 7.399589773921412e-07, "loss": 0.3187, "step": 3650 }, { "epoch": 1.6883506343713957, "grad_norm": 0.6662834286689758, "learning_rate": 7.190113543239408e-07, "loss": 0.3194, "step": 3660 }, { "epoch": 1.692964244521338, "grad_norm": 1.200137734413147, "learning_rate": 6.983415184740616e-07, "loss": 0.2958, "step": 3670 }, { "epoch": 1.6975778546712803, "grad_norm": 0.8711331486701965, "learning_rate": 6.779508110785332e-07, "loss": 0.2761, "step": 3680 }, { "epoch": 1.7021914648212226, "grad_norm": 1.2060991525650024, "learning_rate": 6.578405552611361e-07, "loss": 0.2758, "step": 3690 }, { "epoch": 1.706805074971165, "grad_norm": 0.7914460897445679, "learning_rate": 6.380120559475505e-07, "loss": 0.3272, "step": 3700 }, { "epoch": 1.7114186851211073, "grad_norm": 1.0925244092941284, "learning_rate": 6.184665997806832e-07, "loss": 0.2947, "step": 3710 }, { "epoch": 1.7160322952710496, "grad_norm": 0.9509474635124207, "learning_rate": 5.992054550371723e-07, "loss": 0.3304, "step": 3720 }, { "epoch": 1.720645905420992, "grad_norm": 1.3933912515640259, "learning_rate": 5.802298715451016e-07, "loss": 0.3214, "step": 3730 }, { "epoch": 1.7252595155709343, "grad_norm": 1.1641534566879272, "learning_rate": 5.615410806028875e-07, "loss": 0.2974, "step": 3740 }, { "epoch": 1.7298731257208766, "grad_norm": 0.8583273887634277, "learning_rate": 5.431402948993947e-07, "loss": 0.2869, "step": 3750 }, { "epoch": 1.734486735870819, "grad_norm": 1.1167171001434326, "learning_rate": 5.250287084352373e-07, "loss": 0.329, "step": 3760 }, { "epoch": 1.7391003460207612, "grad_norm": 1.1780617237091064, "learning_rate": 5.072074964453055e-07, "loss": 0.3262, "step": 3770 }, { "epoch": 1.7437139561707036, "grad_norm": 1.1685618162155151, "learning_rate": 4.896778153225062e-07, "loss": 0.2963, "step": 3780 }, { "epoch": 1.748327566320646, "grad_norm": 1.0674740076065063, "learning_rate": 4.7244080254272795e-07, "loss": 0.3124, "step": 3790 }, { "epoch": 1.7529411764705882, "grad_norm": 1.2227847576141357, "learning_rate": 4.55497576591028e-07, "loss": 0.293, "step": 3800 }, { "epoch": 1.7575547866205306, "grad_norm": 1.0834511518478394, "learning_rate": 4.3884923688905676e-07, "loss": 0.3092, "step": 3810 }, { "epoch": 1.7621683967704729, "grad_norm": 0.7183946371078491, "learning_rate": 4.224968637237198e-07, "loss": 0.2644, "step": 3820 }, { "epoch": 1.7667820069204152, "grad_norm": 1.1382250785827637, "learning_rate": 4.064415181770787e-07, "loss": 0.2823, "step": 3830 }, { "epoch": 1.7713956170703575, "grad_norm": 1.1042758226394653, "learning_rate": 3.90684242057498e-07, "loss": 0.3121, "step": 3840 }, { "epoch": 1.7760092272202999, "grad_norm": 0.8227053284645081, "learning_rate": 3.752260578320427e-07, "loss": 0.3145, "step": 3850 }, { "epoch": 1.7806228373702422, "grad_norm": 1.0205223560333252, "learning_rate": 3.600679685601349e-07, "loss": 0.3086, "step": 3860 }, { "epoch": 1.7852364475201845, "grad_norm": 0.8816052675247192, "learning_rate": 3.4521095782846623e-07, "loss": 0.2978, "step": 3870 }, { "epoch": 1.7898500576701268, "grad_norm": 1.44774329662323, "learning_rate": 3.306559896871714e-07, "loss": 0.3016, "step": 3880 }, { "epoch": 1.7944636678200692, "grad_norm": 0.9304317235946655, "learning_rate": 3.164040085872755e-07, "loss": 0.3066, "step": 3890 }, { "epoch": 1.7990772779700115, "grad_norm": 1.0888575315475464, "learning_rate": 3.0245593931940766e-07, "loss": 0.2851, "step": 3900 }, { "epoch": 1.8036908881199538, "grad_norm": 0.8086104989051819, "learning_rate": 2.8881268695379436e-07, "loss": 0.2901, "step": 3910 }, { "epoch": 1.8083044982698961, "grad_norm": 0.7356364727020264, "learning_rate": 2.7547513678153005e-07, "loss": 0.2997, "step": 3920 }, { "epoch": 1.8129181084198385, "grad_norm": 1.017858624458313, "learning_rate": 2.624441542571327e-07, "loss": 0.3282, "step": 3930 }, { "epoch": 1.8175317185697808, "grad_norm": 0.9155429601669312, "learning_rate": 2.497205849423834e-07, "loss": 0.2596, "step": 3940 }, { "epoch": 1.8221453287197233, "grad_norm": 0.9723671078681946, "learning_rate": 2.3730525445146146e-07, "loss": 0.3077, "step": 3950 }, { "epoch": 1.8267589388696654, "grad_norm": 1.2884184122085571, "learning_rate": 2.25198968397371e-07, "loss": 0.3411, "step": 3960 }, { "epoch": 1.831372549019608, "grad_norm": 0.9986656308174133, "learning_rate": 2.134025123396638e-07, "loss": 0.2816, "step": 3970 }, { "epoch": 1.83598615916955, "grad_norm": 0.7283441424369812, "learning_rate": 2.019166517334703e-07, "loss": 0.3093, "step": 3980 }, { "epoch": 1.8405997693194927, "grad_norm": 1.1408605575561523, "learning_rate": 1.9074213187982416e-07, "loss": 0.2848, "step": 3990 }, { "epoch": 1.8452133794694348, "grad_norm": 1.2907861471176147, "learning_rate": 1.7987967787730541e-07, "loss": 0.3013, "step": 4000 }, { "epoch": 1.8452133794694348, "eval_loss": 0.3492071032524109, "eval_runtime": 413.2138, "eval_samples_per_second": 8.158, "eval_steps_per_second": 1.166, "step": 4000 }, { "epoch": 1.8498269896193773, "grad_norm": 0.9750341176986694, "learning_rate": 1.6932999457498823e-07, "loss": 0.2951, "step": 4010 }, { "epoch": 1.8544405997693194, "grad_norm": 1.0746599435806274, "learning_rate": 1.5909376652670283e-07, "loss": 0.2518, "step": 4020 }, { "epoch": 1.859054209919262, "grad_norm": 1.2660961151123047, "learning_rate": 1.4917165794661849e-07, "loss": 0.3032, "step": 4030 }, { "epoch": 1.863667820069204, "grad_norm": 0.8381269574165344, "learning_rate": 1.395643126661428e-07, "loss": 0.2954, "step": 4040 }, { "epoch": 1.8682814302191466, "grad_norm": 0.9705010056495667, "learning_rate": 1.302723540921419e-07, "loss": 0.2756, "step": 4050 }, { "epoch": 1.8728950403690887, "grad_norm": 1.080946445465088, "learning_rate": 1.212963851664928e-07, "loss": 0.2792, "step": 4060 }, { "epoch": 1.8775086505190313, "grad_norm": 1.0065879821777344, "learning_rate": 1.1263698832695513e-07, "loss": 0.3286, "step": 4070 }, { "epoch": 1.8821222606689734, "grad_norm": 0.9605699181556702, "learning_rate": 1.0429472546938158e-07, "loss": 0.2919, "step": 4080 }, { "epoch": 1.886735870818916, "grad_norm": 1.2412244081497192, "learning_rate": 9.627013791125294e-08, "loss": 0.3285, "step": 4090 }, { "epoch": 1.891349480968858, "grad_norm": 1.0108641386032104, "learning_rate": 8.856374635655696e-08, "loss": 0.3172, "step": 4100 }, { "epoch": 1.8959630911188006, "grad_norm": 1.0112003087997437, "learning_rate": 8.117605086199686e-08, "loss": 0.2838, "step": 4110 }, { "epoch": 1.9005767012687427, "grad_norm": 0.8483492136001587, "learning_rate": 7.410753080454746e-08, "loss": 0.3001, "step": 4120 }, { "epoch": 1.9051903114186852, "grad_norm": 0.8492743372917175, "learning_rate": 6.735864485034493e-08, "loss": 0.2853, "step": 4130 }, { "epoch": 1.9098039215686273, "grad_norm": 1.6273894309997559, "learning_rate": 6.092983092492844e-08, "loss": 0.3221, "step": 4140 }, { "epoch": 1.9144175317185699, "grad_norm": 1.2614809274673462, "learning_rate": 5.482150618481952e-08, "loss": 0.2815, "step": 4150 }, { "epoch": 1.919031141868512, "grad_norm": 1.0911434888839722, "learning_rate": 4.9034066990457094e-08, "loss": 0.3051, "step": 4160 }, { "epoch": 1.9236447520184545, "grad_norm": 1.4605953693389893, "learning_rate": 4.356788888047747e-08, "loss": 0.301, "step": 4170 }, { "epoch": 1.9282583621683966, "grad_norm": 0.8996632099151611, "learning_rate": 3.8423326547344376e-08, "loss": 0.297, "step": 4180 }, { "epoch": 1.9328719723183392, "grad_norm": 0.7587381601333618, "learning_rate": 3.360071381433516e-08, "loss": 0.309, "step": 4190 }, { "epoch": 1.9374855824682813, "grad_norm": 0.9873301386833191, "learning_rate": 2.9100363613879246e-08, "loss": 0.2899, "step": 4200 }, { "epoch": 1.9420991926182238, "grad_norm": 1.2133296728134155, "learning_rate": 2.492256796725212e-08, "loss": 0.2905, "step": 4210 }, { "epoch": 1.946712802768166, "grad_norm": 1.017208456993103, "learning_rate": 2.1067597965624963e-08, "loss": 0.3042, "step": 4220 }, { "epoch": 1.9513264129181085, "grad_norm": 1.4077831506729126, "learning_rate": 1.753570375247815e-08, "loss": 0.2793, "step": 4230 }, { "epoch": 1.9559400230680506, "grad_norm": 1.1960562467575073, "learning_rate": 1.4327114507365347e-08, "loss": 0.3012, "step": 4240 }, { "epoch": 1.9605536332179931, "grad_norm": 1.1810780763626099, "learning_rate": 1.1442038431044856e-08, "loss": 0.2987, "step": 4250 }, { "epoch": 1.9651672433679352, "grad_norm": 1.0378133058547974, "learning_rate": 8.880662731968748e-09, "loss": 0.3028, "step": 4260 }, { "epoch": 1.9697808535178778, "grad_norm": 0.9532252550125122, "learning_rate": 6.6431536141348115e-09, "loss": 0.327, "step": 4270 }, { "epoch": 1.9743944636678201, "grad_norm": 0.8077103495597839, "learning_rate": 4.729656266304061e-09, "loss": 0.2861, "step": 4280 }, { "epoch": 1.9790080738177624, "grad_norm": 1.0243134498596191, "learning_rate": 3.1402948525766085e-09, "loss": 0.3224, "step": 4290 }, { "epoch": 1.9836216839677048, "grad_norm": 1.4665522575378418, "learning_rate": 1.8751725043375526e-09, "loss": 0.2968, "step": 4300 }, { "epoch": 1.988235294117647, "grad_norm": 1.1807401180267334, "learning_rate": 9.343713135623323e-10, "loss": 0.3063, "step": 4310 }, { "epoch": 1.9928489042675894, "grad_norm": 1.1453770399093628, "learning_rate": 3.1795232749320947e-10, "loss": 0.2943, "step": 4320 }, { "epoch": 1.9974625144175318, "grad_norm": 1.3085640668869019, "learning_rate": 2.5955544673550438e-11, "loss": 0.2894, "step": 4330 }, { "epoch": 1.9993079584775086, "step": 4334, "total_flos": 545751381377024.0, "train_loss": 0.3850643413100971, "train_runtime": 89295.2192, "train_samples_per_second": 0.68, "train_steps_per_second": 0.049 } ], "logging_steps": 10, "max_steps": 4334, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 545751381377024.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }