{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.956521739130435, "eval_steps": 500, "global_step": 2180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004576659038901602, "grad_norm": 0.795125424861908, "learning_rate": 9.174311926605506e-07, "loss": 2.0857, "step": 1 }, { "epoch": 0.02288329519450801, "grad_norm": 0.7591376900672913, "learning_rate": 4.587155963302753e-06, "loss": 2.0888, "step": 5 }, { "epoch": 0.04576659038901602, "grad_norm": 0.7638061046600342, "learning_rate": 9.174311926605506e-06, "loss": 2.0744, "step": 10 }, { "epoch": 0.06864988558352403, "grad_norm": 0.6205304265022278, "learning_rate": 1.3761467889908258e-05, "loss": 2.0578, "step": 15 }, { "epoch": 0.09153318077803203, "grad_norm": 0.40848588943481445, "learning_rate": 1.834862385321101e-05, "loss": 2.0541, "step": 20 }, { "epoch": 0.11441647597254005, "grad_norm": 0.35839515924453735, "learning_rate": 2.2935779816513765e-05, "loss": 2.0396, "step": 25 }, { "epoch": 0.13729977116704806, "grad_norm": 0.3498051166534424, "learning_rate": 2.7522935779816515e-05, "loss": 2.0186, "step": 30 }, { "epoch": 0.16018306636155608, "grad_norm": 0.34915778040885925, "learning_rate": 3.211009174311927e-05, "loss": 1.9692, "step": 35 }, { "epoch": 0.18306636155606407, "grad_norm": 0.2829169034957886, "learning_rate": 3.669724770642202e-05, "loss": 1.9478, "step": 40 }, { "epoch": 0.20594965675057209, "grad_norm": 0.29471325874328613, "learning_rate": 4.1284403669724776e-05, "loss": 1.9081, "step": 45 }, { "epoch": 0.2288329519450801, "grad_norm": 0.26286041736602783, "learning_rate": 4.587155963302753e-05, "loss": 1.8852, "step": 50 }, { "epoch": 0.2517162471395881, "grad_norm": 0.253055214881897, "learning_rate": 5.0458715596330276e-05, "loss": 1.8466, "step": 55 }, { "epoch": 0.2745995423340961, "grad_norm": 0.25780022144317627, "learning_rate": 5.504587155963303e-05, "loss": 1.7978, "step": 60 }, { "epoch": 0.2974828375286041, "grad_norm": 0.22376658022403717, "learning_rate": 5.9633027522935784e-05, "loss": 1.7513, "step": 65 }, { "epoch": 0.32036613272311215, "grad_norm": 0.20868130028247833, "learning_rate": 6.422018348623854e-05, "loss": 1.7581, "step": 70 }, { "epoch": 0.34324942791762014, "grad_norm": 0.20019210875034332, "learning_rate": 6.880733944954129e-05, "loss": 1.7225, "step": 75 }, { "epoch": 0.36613272311212813, "grad_norm": 0.20799387991428375, "learning_rate": 7.339449541284404e-05, "loss": 1.7073, "step": 80 }, { "epoch": 0.3890160183066362, "grad_norm": 0.19921623170375824, "learning_rate": 7.79816513761468e-05, "loss": 1.698, "step": 85 }, { "epoch": 0.41189931350114417, "grad_norm": 0.19697250425815582, "learning_rate": 8.256880733944955e-05, "loss": 1.6806, "step": 90 }, { "epoch": 0.43478260869565216, "grad_norm": 0.21829445660114288, "learning_rate": 8.715596330275229e-05, "loss": 1.661, "step": 95 }, { "epoch": 0.4576659038901602, "grad_norm": 0.20831452310085297, "learning_rate": 9.174311926605506e-05, "loss": 1.6499, "step": 100 }, { "epoch": 0.4805491990846682, "grad_norm": 0.2224440723657608, "learning_rate": 9.63302752293578e-05, "loss": 1.6504, "step": 105 }, { "epoch": 0.5034324942791762, "grad_norm": 0.22193986177444458, "learning_rate": 0.00010091743119266055, "loss": 1.6399, "step": 110 }, { "epoch": 0.5263157894736842, "grad_norm": 0.2367602288722992, "learning_rate": 0.00010550458715596329, "loss": 1.6236, "step": 115 }, { "epoch": 0.5491990846681922, "grad_norm": 0.2362162321805954, "learning_rate": 0.00011009174311926606, "loss": 1.6227, "step": 120 }, { "epoch": 0.5720823798627003, "grad_norm": 0.24902412295341492, "learning_rate": 0.00011467889908256881, "loss": 1.5984, "step": 125 }, { "epoch": 0.5949656750572082, "grad_norm": 0.2517242431640625, "learning_rate": 0.00011926605504587157, "loss": 1.6003, "step": 130 }, { "epoch": 0.6178489702517163, "grad_norm": 0.2526886463165283, "learning_rate": 0.00012385321100917432, "loss": 1.5961, "step": 135 }, { "epoch": 0.6407322654462243, "grad_norm": 0.2544921338558197, "learning_rate": 0.00012844036697247707, "loss": 1.5782, "step": 140 }, { "epoch": 0.6636155606407322, "grad_norm": 0.2370745986700058, "learning_rate": 0.00013302752293577983, "loss": 1.5817, "step": 145 }, { "epoch": 0.6864988558352403, "grad_norm": 0.2654547095298767, "learning_rate": 0.00013761467889908258, "loss": 1.5683, "step": 150 }, { "epoch": 0.7093821510297483, "grad_norm": 0.251737505197525, "learning_rate": 0.0001422018348623853, "loss": 1.5683, "step": 155 }, { "epoch": 0.7322654462242563, "grad_norm": 0.27123987674713135, "learning_rate": 0.0001467889908256881, "loss": 1.5492, "step": 160 }, { "epoch": 0.7551487414187643, "grad_norm": 0.2787420451641083, "learning_rate": 0.00015137614678899084, "loss": 1.5573, "step": 165 }, { "epoch": 0.7780320366132724, "grad_norm": 0.3056250810623169, "learning_rate": 0.0001559633027522936, "loss": 1.5535, "step": 170 }, { "epoch": 0.8009153318077803, "grad_norm": 0.4223385751247406, "learning_rate": 0.00016055045871559632, "loss": 1.5624, "step": 175 }, { "epoch": 0.8237986270022883, "grad_norm": 0.3226919174194336, "learning_rate": 0.0001651376146788991, "loss": 1.5548, "step": 180 }, { "epoch": 0.8466819221967964, "grad_norm": 0.42250385880470276, "learning_rate": 0.00016972477064220186, "loss": 1.5403, "step": 185 }, { "epoch": 0.8695652173913043, "grad_norm": 0.29832813143730164, "learning_rate": 0.00017431192660550458, "loss": 1.5406, "step": 190 }, { "epoch": 0.8924485125858124, "grad_norm": 0.318697988986969, "learning_rate": 0.00017889908256880734, "loss": 1.5423, "step": 195 }, { "epoch": 0.9153318077803204, "grad_norm": 0.30288153886795044, "learning_rate": 0.00018348623853211012, "loss": 1.5432, "step": 200 }, { "epoch": 0.9382151029748284, "grad_norm": 0.26959261298179626, "learning_rate": 0.00018807339449541284, "loss": 1.5264, "step": 205 }, { "epoch": 0.9610983981693364, "grad_norm": 0.27854207158088684, "learning_rate": 0.0001926605504587156, "loss": 1.5328, "step": 210 }, { "epoch": 0.9839816933638444, "grad_norm": 0.26367124915122986, "learning_rate": 0.00019724770642201835, "loss": 1.5142, "step": 215 }, { "epoch": 1.0, "eval_loss": 2.4457786083221436, "eval_runtime": 0.158, "eval_samples_per_second": 63.273, "eval_steps_per_second": 6.327, "step": 219 }, { "epoch": 1.0045766590389016, "grad_norm": 0.24696815013885498, "learning_rate": 0.00019999948721966259, "loss": 1.5267, "step": 220 }, { "epoch": 1.0274599542334095, "grad_norm": 0.2515549063682556, "learning_rate": 0.0001999937185012612, "loss": 1.5197, "step": 225 }, { "epoch": 1.0503432494279177, "grad_norm": 0.26250341534614563, "learning_rate": 0.00019998154046002822, "loss": 1.4932, "step": 230 }, { "epoch": 1.0732265446224256, "grad_norm": 0.2599635422229767, "learning_rate": 0.00019996295387654262, "loss": 1.5057, "step": 235 }, { "epoch": 1.0961098398169336, "grad_norm": 0.2875217795372009, "learning_rate": 0.0001999379599421534, "loss": 1.5123, "step": 240 }, { "epoch": 1.1189931350114417, "grad_norm": 0.2571655511856079, "learning_rate": 0.00019990656025890315, "loss": 1.5125, "step": 245 }, { "epoch": 1.1418764302059496, "grad_norm": 0.2501162588596344, "learning_rate": 0.00019986875683942535, "loss": 1.5125, "step": 250 }, { "epoch": 1.1647597254004576, "grad_norm": 0.2591252028942108, "learning_rate": 0.00019982455210681537, "loss": 1.491, "step": 255 }, { "epoch": 1.1876430205949657, "grad_norm": 0.2666991055011749, "learning_rate": 0.00019977394889447524, "loss": 1.4931, "step": 260 }, { "epoch": 1.2105263157894737, "grad_norm": 0.3265208601951599, "learning_rate": 0.00019971695044593196, "loss": 1.5088, "step": 265 }, { "epoch": 1.2334096109839816, "grad_norm": 0.3080615699291229, "learning_rate": 0.00019965356041462955, "loss": 1.4943, "step": 270 }, { "epoch": 1.2562929061784898, "grad_norm": 0.2847588062286377, "learning_rate": 0.00019958378286369502, "loss": 1.4829, "step": 275 }, { "epoch": 1.2791762013729977, "grad_norm": 0.2870901823043823, "learning_rate": 0.00019950762226567781, "loss": 1.4841, "step": 280 }, { "epoch": 1.3020594965675056, "grad_norm": 0.27676528692245483, "learning_rate": 0.00019942508350226314, "loss": 1.4901, "step": 285 }, { "epoch": 1.3249427917620138, "grad_norm": 0.26728203892707825, "learning_rate": 0.00019933617186395917, "loss": 1.4743, "step": 290 }, { "epoch": 1.3478260869565217, "grad_norm": 0.2857504189014435, "learning_rate": 0.0001992408930497578, "loss": 1.4919, "step": 295 }, { "epoch": 1.3707093821510297, "grad_norm": 0.36729517579078674, "learning_rate": 0.00019913925316676945, "loss": 1.4872, "step": 300 }, { "epoch": 1.3935926773455378, "grad_norm": 0.2691424489021301, "learning_rate": 0.0001990312587298316, "loss": 1.4785, "step": 305 }, { "epoch": 1.4164759725400458, "grad_norm": 0.3448796570301056, "learning_rate": 0.00019891691666109113, "loss": 1.4857, "step": 310 }, { "epoch": 1.4393592677345537, "grad_norm": 0.29720258712768555, "learning_rate": 0.0001987962342895607, "loss": 1.4844, "step": 315 }, { "epoch": 1.4622425629290619, "grad_norm": 0.25346481800079346, "learning_rate": 0.00019866921935064906, "loss": 1.4822, "step": 320 }, { "epoch": 1.4851258581235698, "grad_norm": 0.2484666258096695, "learning_rate": 0.0001985358799856651, "loss": 1.4804, "step": 325 }, { "epoch": 1.5080091533180777, "grad_norm": 0.2337537705898285, "learning_rate": 0.00019839622474129596, "loss": 1.4762, "step": 330 }, { "epoch": 1.5308924485125859, "grad_norm": 0.26846134662628174, "learning_rate": 0.00019825026256905948, "loss": 1.4656, "step": 335 }, { "epoch": 1.5537757437070938, "grad_norm": 0.239125594496727, "learning_rate": 0.00019809800282473013, "loss": 1.4802, "step": 340 }, { "epoch": 1.5766590389016018, "grad_norm": 0.2293742597103119, "learning_rate": 0.00019793945526773947, "loss": 1.4603, "step": 345 }, { "epoch": 1.59954233409611, "grad_norm": 0.2655136287212372, "learning_rate": 0.0001977746300605507, "loss": 1.4783, "step": 350 }, { "epoch": 1.6224256292906178, "grad_norm": 0.25485122203826904, "learning_rate": 0.00019760353776800704, "loss": 1.4755, "step": 355 }, { "epoch": 1.6453089244851258, "grad_norm": 0.2485138326883316, "learning_rate": 0.00019742618935665476, "loss": 1.464, "step": 360 }, { "epoch": 1.668192219679634, "grad_norm": 0.23952895402908325, "learning_rate": 0.0001972425961940401, "loss": 1.4652, "step": 365 }, { "epoch": 1.6910755148741419, "grad_norm": 0.22710928320884705, "learning_rate": 0.00019705277004798073, "loss": 1.4585, "step": 370 }, { "epoch": 1.7139588100686498, "grad_norm": 0.2363155037164688, "learning_rate": 0.00019685672308581152, "loss": 1.4725, "step": 375 }, { "epoch": 1.736842105263158, "grad_norm": 0.23678353428840637, "learning_rate": 0.0001966544678736044, "loss": 1.4654, "step": 380 }, { "epoch": 1.759725400457666, "grad_norm": 0.24644772708415985, "learning_rate": 0.00019644601737536338, "loss": 1.4473, "step": 385 }, { "epoch": 1.7826086956521738, "grad_norm": 0.24988573789596558, "learning_rate": 0.00019623138495219292, "loss": 1.4658, "step": 390 }, { "epoch": 1.805491990846682, "grad_norm": 0.2501707971096039, "learning_rate": 0.00019601058436144225, "loss": 1.4631, "step": 395 }, { "epoch": 1.82837528604119, "grad_norm": 0.22899797558784485, "learning_rate": 0.00019578362975582292, "loss": 1.4663, "step": 400 }, { "epoch": 1.8512585812356979, "grad_norm": 0.2957789897918701, "learning_rate": 0.0001955505356825021, "loss": 1.4649, "step": 405 }, { "epoch": 1.874141876430206, "grad_norm": 0.26199233531951904, "learning_rate": 0.00019531131708217005, "loss": 1.4568, "step": 410 }, { "epoch": 1.897025171624714, "grad_norm": 0.2635655403137207, "learning_rate": 0.00019506598928808216, "loss": 1.4748, "step": 415 }, { "epoch": 1.919908466819222, "grad_norm": 0.23314692080020905, "learning_rate": 0.0001948145680250766, "loss": 1.4615, "step": 420 }, { "epoch": 1.94279176201373, "grad_norm": 0.24421729147434235, "learning_rate": 0.000194557069408566, "loss": 1.463, "step": 425 }, { "epoch": 1.965675057208238, "grad_norm": 0.2970636188983917, "learning_rate": 0.00019429350994350483, "loss": 1.4702, "step": 430 }, { "epoch": 1.988558352402746, "grad_norm": 0.2254825085401535, "learning_rate": 0.0001940239065233311, "loss": 1.4574, "step": 435 }, { "epoch": 2.0, "eval_loss": 2.456493854522705, "eval_runtime": 0.1591, "eval_samples_per_second": 62.851, "eval_steps_per_second": 6.285, "step": 438 }, { "epoch": 2.009153318077803, "grad_norm": 0.2410830557346344, "learning_rate": 0.00019374827642888398, "loss": 1.4467, "step": 440 }, { "epoch": 2.0320366132723113, "grad_norm": 0.3021443784236908, "learning_rate": 0.00019346663732729572, "loss": 1.4415, "step": 445 }, { "epoch": 2.054919908466819, "grad_norm": 0.2840650975704193, "learning_rate": 0.0001931790072708596, "loss": 1.4438, "step": 450 }, { "epoch": 2.077803203661327, "grad_norm": 0.25511375069618225, "learning_rate": 0.0001928854046958725, "loss": 1.4412, "step": 455 }, { "epoch": 2.1006864988558354, "grad_norm": 0.25664883852005005, "learning_rate": 0.00019258584842145343, "loss": 1.4476, "step": 460 }, { "epoch": 2.123569794050343, "grad_norm": 0.29023540019989014, "learning_rate": 0.00019228035764833718, "loss": 1.4393, "step": 465 }, { "epoch": 2.1464530892448512, "grad_norm": 0.23540236055850983, "learning_rate": 0.00019196895195764362, "loss": 1.45, "step": 470 }, { "epoch": 2.1693363844393594, "grad_norm": 0.23571573197841644, "learning_rate": 0.0001916516513096226, "loss": 1.4292, "step": 475 }, { "epoch": 2.192219679633867, "grad_norm": 0.22221675515174866, "learning_rate": 0.0001913284760423745, "loss": 1.4385, "step": 480 }, { "epoch": 2.2151029748283753, "grad_norm": 0.24515533447265625, "learning_rate": 0.00019099944687054672, "loss": 1.438, "step": 485 }, { "epoch": 2.2379862700228834, "grad_norm": 0.22849489748477936, "learning_rate": 0.00019066458488400584, "loss": 1.4365, "step": 490 }, { "epoch": 2.260869565217391, "grad_norm": 0.23650361597537994, "learning_rate": 0.0001903239115464859, "loss": 1.4421, "step": 495 }, { "epoch": 2.2837528604118993, "grad_norm": 0.2418275624513626, "learning_rate": 0.00018997744869421246, "loss": 1.4275, "step": 500 }, { "epoch": 2.3066361556064074, "grad_norm": 0.2576926350593567, "learning_rate": 0.00018962521853450323, "loss": 1.4315, "step": 505 }, { "epoch": 2.329519450800915, "grad_norm": 0.26270532608032227, "learning_rate": 0.00018926724364434446, "loss": 1.4386, "step": 510 }, { "epoch": 2.3524027459954233, "grad_norm": 0.2584366798400879, "learning_rate": 0.00018890354696894375, "loss": 1.4288, "step": 515 }, { "epoch": 2.3752860411899315, "grad_norm": 0.2276887744665146, "learning_rate": 0.0001885341518202595, "loss": 1.4331, "step": 520 }, { "epoch": 2.398169336384439, "grad_norm": 0.25574877858161926, "learning_rate": 0.00018815908187550667, "loss": 1.4343, "step": 525 }, { "epoch": 2.4210526315789473, "grad_norm": 0.2539375424385071, "learning_rate": 0.00018777836117563892, "loss": 1.4283, "step": 530 }, { "epoch": 2.4439359267734555, "grad_norm": 0.23258014023303986, "learning_rate": 0.0001873920141238079, "loss": 1.4333, "step": 535 }, { "epoch": 2.466819221967963, "grad_norm": 0.24933487176895142, "learning_rate": 0.00018700006548379898, "loss": 1.4245, "step": 540 }, { "epoch": 2.4897025171624714, "grad_norm": 0.2398696094751358, "learning_rate": 0.00018660254037844388, "loss": 1.4373, "step": 545 }, { "epoch": 2.5125858123569795, "grad_norm": 0.2470606118440628, "learning_rate": 0.0001861994642880105, "loss": 1.4316, "step": 550 }, { "epoch": 2.5354691075514877, "grad_norm": 0.2763492166996002, "learning_rate": 0.0001857908630485696, "loss": 1.4229, "step": 555 }, { "epoch": 2.5583524027459954, "grad_norm": 0.25183382630348206, "learning_rate": 0.00018537676285033887, "loss": 1.4235, "step": 560 }, { "epoch": 2.5812356979405036, "grad_norm": 0.25710180401802063, "learning_rate": 0.00018495719023600414, "loss": 1.4203, "step": 565 }, { "epoch": 2.6041189931350113, "grad_norm": 0.22346384823322296, "learning_rate": 0.0001845321720990181, "loss": 1.4274, "step": 570 }, { "epoch": 2.6270022883295194, "grad_norm": 0.2290782779455185, "learning_rate": 0.00018410173568187647, "loss": 1.4304, "step": 575 }, { "epoch": 2.6498855835240276, "grad_norm": 0.25173699855804443, "learning_rate": 0.00018366590857437184, "loss": 1.4331, "step": 580 }, { "epoch": 2.6727688787185357, "grad_norm": 0.2373346984386444, "learning_rate": 0.00018322471871182528, "loss": 1.4257, "step": 585 }, { "epoch": 2.6956521739130435, "grad_norm": 0.2577199339866638, "learning_rate": 0.00018277819437329576, "loss": 1.4382, "step": 590 }, { "epoch": 2.7185354691075516, "grad_norm": 0.23805540800094604, "learning_rate": 0.00018232636417976744, "loss": 1.4365, "step": 595 }, { "epoch": 2.7414187643020593, "grad_norm": 0.25108346343040466, "learning_rate": 0.00018186925709231532, "loss": 1.424, "step": 600 }, { "epoch": 2.7643020594965675, "grad_norm": 0.25178954005241394, "learning_rate": 0.00018140690241024872, "loss": 1.429, "step": 605 }, { "epoch": 2.7871853546910756, "grad_norm": 0.2157055288553238, "learning_rate": 0.0001809393297692334, "loss": 1.4257, "step": 610 }, { "epoch": 2.8100686498855834, "grad_norm": 0.22406814992427826, "learning_rate": 0.00018046656913939195, "loss": 1.4248, "step": 615 }, { "epoch": 2.8329519450800915, "grad_norm": 0.22442927956581116, "learning_rate": 0.0001799886508233829, "loss": 1.4292, "step": 620 }, { "epoch": 2.8558352402745997, "grad_norm": 0.22204960882663727, "learning_rate": 0.00017950560545445813, "loss": 1.4399, "step": 625 }, { "epoch": 2.8787185354691074, "grad_norm": 0.2269710898399353, "learning_rate": 0.0001790174639944997, "loss": 1.4248, "step": 630 }, { "epoch": 2.9016018306636155, "grad_norm": 0.23222693800926208, "learning_rate": 0.000178524257732035, "loss": 1.4261, "step": 635 }, { "epoch": 2.9244851258581237, "grad_norm": 0.21854890882968903, "learning_rate": 0.00017802601828023138, "loss": 1.4172, "step": 640 }, { "epoch": 2.9473684210526314, "grad_norm": 0.2316950112581253, "learning_rate": 0.0001775227775748699, "loss": 1.4156, "step": 645 }, { "epoch": 2.9702517162471396, "grad_norm": 0.21298670768737793, "learning_rate": 0.00017701456787229804, "loss": 1.4288, "step": 650 }, { "epoch": 2.9931350114416477, "grad_norm": 0.21937227249145508, "learning_rate": 0.00017650142174736262, "loss": 1.4298, "step": 655 }, { "epoch": 3.0, "eval_loss": 2.4537367820739746, "eval_runtime": 0.1693, "eval_samples_per_second": 59.08, "eval_steps_per_second": 5.908, "step": 657 }, { "epoch": 3.013729977116705, "grad_norm": 0.22709442675113678, "learning_rate": 0.0001759833720913214, "loss": 1.4058, "step": 660 }, { "epoch": 3.0366132723112127, "grad_norm": 0.21663448214530945, "learning_rate": 0.00017546045210973507, "loss": 1.4108, "step": 665 }, { "epoch": 3.059496567505721, "grad_norm": 0.21900029480457306, "learning_rate": 0.00017493269532033883, "loss": 1.41, "step": 670 }, { "epoch": 3.082379862700229, "grad_norm": 0.23556417226791382, "learning_rate": 0.00017440013555089393, "loss": 1.4084, "step": 675 }, { "epoch": 3.1052631578947367, "grad_norm": 0.26198992133140564, "learning_rate": 0.0001738628069370195, "loss": 1.4108, "step": 680 }, { "epoch": 3.128146453089245, "grad_norm": 0.2543962299823761, "learning_rate": 0.00017332074392000454, "loss": 1.4124, "step": 685 }, { "epoch": 3.151029748283753, "grad_norm": 0.23739871382713318, "learning_rate": 0.00017277398124460023, "loss": 1.4048, "step": 690 }, { "epoch": 3.1739130434782608, "grad_norm": 0.23524770140647888, "learning_rate": 0.00017222255395679296, "loss": 1.4184, "step": 695 }, { "epoch": 3.196796338672769, "grad_norm": 0.2273031771183014, "learning_rate": 0.000171666497401558, "loss": 1.4122, "step": 700 }, { "epoch": 3.219679633867277, "grad_norm": 0.23595724999904633, "learning_rate": 0.00017110584722059393, "loss": 1.407, "step": 705 }, { "epoch": 3.242562929061785, "grad_norm": 0.21827439963817596, "learning_rate": 0.0001705406393500381, "loss": 1.4027, "step": 710 }, { "epoch": 3.265446224256293, "grad_norm": 0.22116319835186005, "learning_rate": 0.00016997091001816336, "loss": 1.4049, "step": 715 }, { "epoch": 3.288329519450801, "grad_norm": 0.23044751584529877, "learning_rate": 0.00016939669574305566, "loss": 1.4095, "step": 720 }, { "epoch": 3.311212814645309, "grad_norm": 0.22559754550457, "learning_rate": 0.00016881803333027362, "loss": 1.3991, "step": 725 }, { "epoch": 3.334096109839817, "grad_norm": 0.2260756939649582, "learning_rate": 0.0001682349598704892, "loss": 1.4123, "step": 730 }, { "epoch": 3.356979405034325, "grad_norm": 0.2145521193742752, "learning_rate": 0.00016764751273711044, "loss": 1.4119, "step": 735 }, { "epoch": 3.379862700228833, "grad_norm": 0.23654837906360626, "learning_rate": 0.00016705572958388576, "loss": 1.4024, "step": 740 }, { "epoch": 3.402745995423341, "grad_norm": 0.21862386167049408, "learning_rate": 0.0001664596483424906, "loss": 1.4104, "step": 745 }, { "epoch": 3.425629290617849, "grad_norm": 0.2240535169839859, "learning_rate": 0.00016585930722009601, "loss": 1.4012, "step": 750 }, { "epoch": 3.448512585812357, "grad_norm": 0.23204627633094788, "learning_rate": 0.00016525474469691984, "loss": 1.4044, "step": 755 }, { "epoch": 3.471395881006865, "grad_norm": 0.23645585775375366, "learning_rate": 0.00016464599952375998, "loss": 1.4135, "step": 760 }, { "epoch": 3.494279176201373, "grad_norm": 0.2199520468711853, "learning_rate": 0.00016403311071951082, "loss": 1.413, "step": 765 }, { "epoch": 3.517162471395881, "grad_norm": 0.23638251423835754, "learning_rate": 0.000163416117568662, "loss": 1.4101, "step": 770 }, { "epoch": 3.540045766590389, "grad_norm": 0.2419108748435974, "learning_rate": 0.00016279505961878064, "loss": 1.399, "step": 775 }, { "epoch": 3.5629290617848968, "grad_norm": 0.23601311445236206, "learning_rate": 0.0001621699766779763, "loss": 1.4084, "step": 780 }, { "epoch": 3.585812356979405, "grad_norm": 0.24460086226463318, "learning_rate": 0.0001615409088123493, "loss": 1.4144, "step": 785 }, { "epoch": 3.608695652173913, "grad_norm": 0.21848763525485992, "learning_rate": 0.00016090789634342278, "loss": 1.3929, "step": 790 }, { "epoch": 3.6315789473684212, "grad_norm": 0.22614765167236328, "learning_rate": 0.00016027097984555816, "loss": 1.399, "step": 795 }, { "epoch": 3.654462242562929, "grad_norm": 0.22731591761112213, "learning_rate": 0.00015963020014335438, "loss": 1.3998, "step": 800 }, { "epoch": 3.677345537757437, "grad_norm": 0.21770906448364258, "learning_rate": 0.00015898559830903106, "loss": 1.4064, "step": 805 }, { "epoch": 3.700228832951945, "grad_norm": 0.25158295035362244, "learning_rate": 0.0001583372156597961, "loss": 1.4046, "step": 810 }, { "epoch": 3.723112128146453, "grad_norm": 0.23650844395160675, "learning_rate": 0.00015768509375519726, "loss": 1.3942, "step": 815 }, { "epoch": 3.745995423340961, "grad_norm": 0.22743116319179535, "learning_rate": 0.00015702927439445826, "loss": 1.4022, "step": 820 }, { "epoch": 3.7688787185354693, "grad_norm": 0.23518233001232147, "learning_rate": 0.0001563697996137997, "loss": 1.3986, "step": 825 }, { "epoch": 3.791762013729977, "grad_norm": 0.22372908890247345, "learning_rate": 0.00015570671168374438, "loss": 1.3937, "step": 830 }, { "epoch": 3.814645308924485, "grad_norm": 0.2474096119403839, "learning_rate": 0.00015504005310640822, "loss": 1.3984, "step": 835 }, { "epoch": 3.837528604118993, "grad_norm": 0.23158776760101318, "learning_rate": 0.00015436986661277577, "loss": 1.3972, "step": 840 }, { "epoch": 3.860411899313501, "grad_norm": 0.239366814494133, "learning_rate": 0.0001536961951599613, "loss": 1.4023, "step": 845 }, { "epoch": 3.883295194508009, "grad_norm": 0.26158976554870605, "learning_rate": 0.0001530190819284555, "loss": 1.3918, "step": 850 }, { "epoch": 3.9061784897025174, "grad_norm": 0.2554950416088104, "learning_rate": 0.00015233857031935749, "loss": 1.3927, "step": 855 }, { "epoch": 3.929061784897025, "grad_norm": 0.23338022828102112, "learning_rate": 0.00015165470395159313, "loss": 1.3929, "step": 860 }, { "epoch": 3.9519450800915332, "grad_norm": 0.2260856032371521, "learning_rate": 0.00015096752665911913, "loss": 1.4055, "step": 865 }, { "epoch": 3.974828375286041, "grad_norm": 0.2310597449541092, "learning_rate": 0.0001502770824881133, "loss": 1.3984, "step": 870 }, { "epoch": 3.997711670480549, "grad_norm": 0.22370661795139313, "learning_rate": 0.00014958341569415147, "loss": 1.3972, "step": 875 }, { "epoch": 4.0, "eval_loss": 2.462189197540283, "eval_runtime": 0.1584, "eval_samples_per_second": 63.121, "eval_steps_per_second": 6.312, "step": 876 }, { "epoch": 4.018306636155606, "grad_norm": 0.24240370094776154, "learning_rate": 0.00014888657073937076, "loss": 1.3961, "step": 880 }, { "epoch": 4.0411899313501145, "grad_norm": 0.22996193170547485, "learning_rate": 0.0001481865922896196, "loss": 1.3808, "step": 885 }, { "epoch": 4.064073226544623, "grad_norm": 0.21983887255191803, "learning_rate": 0.00014748352521159493, "loss": 1.3764, "step": 890 }, { "epoch": 4.086956521739131, "grad_norm": 0.23078066110610962, "learning_rate": 0.00014677741456996617, "loss": 1.3826, "step": 895 }, { "epoch": 4.109839816933638, "grad_norm": 0.2580978274345398, "learning_rate": 0.0001460683056244869, "loss": 1.3816, "step": 900 }, { "epoch": 4.132723112128146, "grad_norm": 0.22216792404651642, "learning_rate": 0.00014535624382709382, "loss": 1.3767, "step": 905 }, { "epoch": 4.155606407322654, "grad_norm": 0.21901686489582062, "learning_rate": 0.00014464127481899312, "loss": 1.3951, "step": 910 }, { "epoch": 4.178489702517163, "grad_norm": 0.22333645820617676, "learning_rate": 0.0001439234444277354, "loss": 1.3946, "step": 915 }, { "epoch": 4.201372997711671, "grad_norm": 0.2606089413166046, "learning_rate": 0.00014320279866427796, "loss": 1.3836, "step": 920 }, { "epoch": 4.224256292906179, "grad_norm": 0.2596084475517273, "learning_rate": 0.00014247938372003582, "loss": 1.3849, "step": 925 }, { "epoch": 4.247139588100686, "grad_norm": 0.2835715115070343, "learning_rate": 0.00014175324596392075, "loss": 1.386, "step": 930 }, { "epoch": 4.270022883295194, "grad_norm": 0.2396639585494995, "learning_rate": 0.0001410244319393694, "loss": 1.3848, "step": 935 }, { "epoch": 4.2929061784897025, "grad_norm": 0.22530311346054077, "learning_rate": 0.00014029298836135988, "loss": 1.3839, "step": 940 }, { "epoch": 4.315789473684211, "grad_norm": 0.23546090722084045, "learning_rate": 0.0001395589621134174, "loss": 1.3948, "step": 945 }, { "epoch": 4.338672768878719, "grad_norm": 0.24042978882789612, "learning_rate": 0.00013882240024460927, "loss": 1.3891, "step": 950 }, { "epoch": 4.361556064073227, "grad_norm": 0.26460978388786316, "learning_rate": 0.00013808334996652904, "loss": 1.3821, "step": 955 }, { "epoch": 4.384439359267734, "grad_norm": 0.2298058718442917, "learning_rate": 0.0001373418586502706, "loss": 1.3796, "step": 960 }, { "epoch": 4.407322654462242, "grad_norm": 0.21995796263217926, "learning_rate": 0.0001365979738233916, "loss": 1.3892, "step": 965 }, { "epoch": 4.4302059496567505, "grad_norm": 0.22304576635360718, "learning_rate": 0.0001358517431668672, "loss": 1.3731, "step": 970 }, { "epoch": 4.453089244851259, "grad_norm": 0.2750610411167145, "learning_rate": 0.0001351032145120337, "loss": 1.3732, "step": 975 }, { "epoch": 4.475972540045767, "grad_norm": 0.24216437339782715, "learning_rate": 0.00013435243583752294, "loss": 1.3955, "step": 980 }, { "epoch": 4.498855835240275, "grad_norm": 0.23672589659690857, "learning_rate": 0.00013359945526618668, "loss": 1.3859, "step": 985 }, { "epoch": 4.521739130434782, "grad_norm": 0.21664075553417206, "learning_rate": 0.00013284432106201233, "loss": 1.3885, "step": 990 }, { "epoch": 4.54462242562929, "grad_norm": 0.22267000377178192, "learning_rate": 0.00013208708162702922, "loss": 1.3811, "step": 995 }, { "epoch": 4.567505720823799, "grad_norm": 0.23190075159072876, "learning_rate": 0.00013132778549820618, "loss": 1.3813, "step": 1000 }, { "epoch": 4.590389016018307, "grad_norm": 0.21617767214775085, "learning_rate": 0.0001305664813443405, "loss": 1.3821, "step": 1005 }, { "epoch": 4.613272311212815, "grad_norm": 0.22316230833530426, "learning_rate": 0.00012980321796293836, "loss": 1.3854, "step": 1010 }, { "epoch": 4.636155606407323, "grad_norm": 0.22770841419696808, "learning_rate": 0.00012903804427708704, "loss": 1.392, "step": 1015 }, { "epoch": 4.65903890160183, "grad_norm": 0.25319135189056396, "learning_rate": 0.00012827100933231905, "loss": 1.384, "step": 1020 }, { "epoch": 4.6819221967963385, "grad_norm": 0.24790321290493011, "learning_rate": 0.0001275021622934685, "loss": 1.3817, "step": 1025 }, { "epoch": 4.704805491990847, "grad_norm": 0.24224399030208588, "learning_rate": 0.00012673155244151985, "loss": 1.3869, "step": 1030 }, { "epoch": 4.727688787185355, "grad_norm": 0.2306058555841446, "learning_rate": 0.0001259592291704489, "loss": 1.3816, "step": 1035 }, { "epoch": 4.750572082379863, "grad_norm": 0.2296520620584488, "learning_rate": 0.000125185241984057, "loss": 1.3855, "step": 1040 }, { "epoch": 4.77345537757437, "grad_norm": 0.22120504081249237, "learning_rate": 0.00012440964049279787, "loss": 1.3839, "step": 1045 }, { "epoch": 4.796338672768878, "grad_norm": 0.23121312260627747, "learning_rate": 0.00012363247441059776, "loss": 1.3816, "step": 1050 }, { "epoch": 4.8192219679633865, "grad_norm": 0.22182002663612366, "learning_rate": 0.00012285379355166893, "loss": 1.3808, "step": 1055 }, { "epoch": 4.842105263157895, "grad_norm": 0.22699835896492004, "learning_rate": 0.00012207364782731655, "loss": 1.3829, "step": 1060 }, { "epoch": 4.864988558352403, "grad_norm": 0.24495604634284973, "learning_rate": 0.00012129208724273984, "loss": 1.3827, "step": 1065 }, { "epoch": 4.887871853546911, "grad_norm": 0.235788956284523, "learning_rate": 0.00012050916189382646, "loss": 1.3898, "step": 1070 }, { "epoch": 4.910755148741419, "grad_norm": 0.24433469772338867, "learning_rate": 0.00011972492196394187, "loss": 1.3913, "step": 1075 }, { "epoch": 4.933638443935926, "grad_norm": 0.2331000417470932, "learning_rate": 0.00011893941772071249, "loss": 1.3906, "step": 1080 }, { "epoch": 4.956521739130435, "grad_norm": 0.22595056891441345, "learning_rate": 0.0001181526995128038, "loss": 1.3981, "step": 1085 }, { "epoch": 4.979405034324943, "grad_norm": 0.22172114253044128, "learning_rate": 0.00011736481776669306, "loss": 1.3895, "step": 1090 }, { "epoch": 5.0, "grad_norm": 0.3044658601284027, "learning_rate": 0.0001165758229834371, "loss": 1.3868, "step": 1095 }, { "epoch": 5.0, "eval_loss": 2.481019973754883, "eval_runtime": 0.1574, "eval_samples_per_second": 63.534, "eval_steps_per_second": 6.353, "step": 1095 }, { "epoch": 5.022883295194508, "grad_norm": 0.24489176273345947, "learning_rate": 0.0001157857657354354, "loss": 1.376, "step": 1100 }, { "epoch": 5.045766590389016, "grad_norm": 0.2253871113061905, "learning_rate": 0.00011499469666318858, "loss": 1.3759, "step": 1105 }, { "epoch": 5.068649885583524, "grad_norm": 0.235123872756958, "learning_rate": 0.00011420266647205231, "loss": 1.3774, "step": 1110 }, { "epoch": 5.091533180778032, "grad_norm": 0.2346729189157486, "learning_rate": 0.00011340972592898744, "loss": 1.3556, "step": 1115 }, { "epoch": 5.11441647597254, "grad_norm": 0.2369905263185501, "learning_rate": 0.00011261592585930576, "loss": 1.3714, "step": 1120 }, { "epoch": 5.137299771167048, "grad_norm": 0.2265693098306656, "learning_rate": 0.00011182131714341247, "loss": 1.37, "step": 1125 }, { "epoch": 5.160183066361556, "grad_norm": 0.21262961626052856, "learning_rate": 0.00011102595071354472, "loss": 1.3639, "step": 1130 }, { "epoch": 5.183066361556064, "grad_norm": 0.2247275561094284, "learning_rate": 0.00011022987755050704, "loss": 1.3649, "step": 1135 }, { "epoch": 5.2059496567505725, "grad_norm": 0.22549380362033844, "learning_rate": 0.00010943314868040364, "loss": 1.3542, "step": 1140 }, { "epoch": 5.22883295194508, "grad_norm": 0.2336389124393463, "learning_rate": 0.00010863581517136776, "loss": 1.3659, "step": 1145 }, { "epoch": 5.251716247139588, "grad_norm": 0.24654635787010193, "learning_rate": 0.00010783792813028827, "loss": 1.3742, "step": 1150 }, { "epoch": 5.274599542334096, "grad_norm": 0.22848239541053772, "learning_rate": 0.00010703953869953403, "loss": 1.3783, "step": 1155 }, { "epoch": 5.297482837528604, "grad_norm": 0.241807758808136, "learning_rate": 0.00010624069805367559, "loss": 1.3731, "step": 1160 }, { "epoch": 5.320366132723112, "grad_norm": 0.23657450079917908, "learning_rate": 0.00010544145739620519, "loss": 1.3674, "step": 1165 }, { "epoch": 5.34324942791762, "grad_norm": 0.2408100813627243, "learning_rate": 0.00010464186795625482, "loss": 1.3732, "step": 1170 }, { "epoch": 5.366132723112128, "grad_norm": 0.23080427944660187, "learning_rate": 0.00010384198098531225, "loss": 1.3912, "step": 1175 }, { "epoch": 5.389016018306636, "grad_norm": 0.23397450149059296, "learning_rate": 0.00010304184775393642, "loss": 1.381, "step": 1180 }, { "epoch": 5.411899313501144, "grad_norm": 0.2350955307483673, "learning_rate": 0.00010224151954847064, "loss": 1.3768, "step": 1185 }, { "epoch": 5.434782608695652, "grad_norm": 0.23939336836338043, "learning_rate": 0.00010144104766775572, "loss": 1.3731, "step": 1190 }, { "epoch": 5.4576659038901605, "grad_norm": 0.23009169101715088, "learning_rate": 0.0001006404834198416, "loss": 1.3715, "step": 1195 }, { "epoch": 5.480549199084669, "grad_norm": 0.2236054390668869, "learning_rate": 9.983987811869862e-05, "loss": 1.3737, "step": 1200 }, { "epoch": 5.503432494279176, "grad_norm": 0.23314499855041504, "learning_rate": 9.903928308092865e-05, "loss": 1.369, "step": 1205 }, { "epoch": 5.526315789473684, "grad_norm": 0.22338473796844482, "learning_rate": 9.823874962247564e-05, "loss": 1.3717, "step": 1210 }, { "epoch": 5.549199084668192, "grad_norm": 0.22037141025066376, "learning_rate": 9.743832905533644e-05, "loss": 1.3789, "step": 1215 }, { "epoch": 5.5720823798627, "grad_norm": 0.2261509895324707, "learning_rate": 9.663807268427198e-05, "loss": 1.3741, "step": 1220 }, { "epoch": 5.5949656750572085, "grad_norm": 0.2503112256526947, "learning_rate": 9.583803180351852e-05, "loss": 1.3805, "step": 1225 }, { "epoch": 5.617848970251716, "grad_norm": 0.22896412014961243, "learning_rate": 9.503825769350017e-05, "loss": 1.3583, "step": 1230 }, { "epoch": 5.640732265446224, "grad_norm": 0.2552075982093811, "learning_rate": 9.423880161754158e-05, "loss": 1.3767, "step": 1235 }, { "epoch": 5.663615560640732, "grad_norm": 0.23098088800907135, "learning_rate": 9.343971481858246e-05, "loss": 1.3719, "step": 1240 }, { "epoch": 5.68649885583524, "grad_norm": 0.23425135016441345, "learning_rate": 9.264104851589273e-05, "loss": 1.3678, "step": 1245 }, { "epoch": 5.709382151029748, "grad_norm": 0.229757159948349, "learning_rate": 9.184285390178978e-05, "loss": 1.3652, "step": 1250 }, { "epoch": 5.732265446224257, "grad_norm": 0.22069285809993744, "learning_rate": 9.104518213835692e-05, "loss": 1.3628, "step": 1255 }, { "epoch": 5.755148741418765, "grad_norm": 0.23451228439807892, "learning_rate": 9.024808435416434e-05, "loss": 1.3658, "step": 1260 }, { "epoch": 5.778032036613272, "grad_norm": 0.2394520491361618, "learning_rate": 8.945161164099157e-05, "loss": 1.3775, "step": 1265 }, { "epoch": 5.80091533180778, "grad_norm": 0.23245683312416077, "learning_rate": 8.865581505055291e-05, "loss": 1.372, "step": 1270 }, { "epoch": 5.823798627002288, "grad_norm": 0.23066289722919464, "learning_rate": 8.7860745591225e-05, "loss": 1.3739, "step": 1275 }, { "epoch": 5.8466819221967965, "grad_norm": 0.23424118757247925, "learning_rate": 8.706645422477739e-05, "loss": 1.3597, "step": 1280 }, { "epoch": 5.869565217391305, "grad_norm": 0.2321302741765976, "learning_rate": 8.627299186310603e-05, "loss": 1.3732, "step": 1285 }, { "epoch": 5.892448512585812, "grad_norm": 0.2273801863193512, "learning_rate": 8.548040936496989e-05, "loss": 1.3685, "step": 1290 }, { "epoch": 5.91533180778032, "grad_norm": 0.2422792911529541, "learning_rate": 8.468875753273115e-05, "loss": 1.372, "step": 1295 }, { "epoch": 5.938215102974828, "grad_norm": 0.22604131698608398, "learning_rate": 8.389808710909881e-05, "loss": 1.3676, "step": 1300 }, { "epoch": 5.961098398169336, "grad_norm": 0.24549026787281036, "learning_rate": 8.310844877387637e-05, "loss": 1.358, "step": 1305 }, { "epoch": 5.983981693363845, "grad_norm": 0.23445729911327362, "learning_rate": 8.231989314071317e-05, "loss": 1.3692, "step": 1310 }, { "epoch": 6.0, "eval_loss": 2.5001282691955566, "eval_runtime": 0.1605, "eval_samples_per_second": 62.3, "eval_steps_per_second": 6.23, "step": 1314 }, { "epoch": 6.004576659038902, "grad_norm": 0.2388521283864975, "learning_rate": 8.153247075386043e-05, "loss": 1.3541, "step": 1315 }, { "epoch": 6.02745995423341, "grad_norm": 0.23299774527549744, "learning_rate": 8.07462320849313e-05, "loss": 1.3572, "step": 1320 }, { "epoch": 6.050343249427917, "grad_norm": 0.23086461424827576, "learning_rate": 7.996122752966595e-05, "loss": 1.3568, "step": 1325 }, { "epoch": 6.073226544622425, "grad_norm": 0.23091675341129303, "learning_rate": 7.917750740470117e-05, "loss": 1.3618, "step": 1330 }, { "epoch": 6.0961098398169336, "grad_norm": 0.23161986470222473, "learning_rate": 7.839512194434531e-05, "loss": 1.363, "step": 1335 }, { "epoch": 6.118993135011442, "grad_norm": 0.23545809090137482, "learning_rate": 7.761412129735852e-05, "loss": 1.3609, "step": 1340 }, { "epoch": 6.14187643020595, "grad_norm": 0.23319639265537262, "learning_rate": 7.683455552373799e-05, "loss": 1.365, "step": 1345 }, { "epoch": 6.164759725400458, "grad_norm": 0.23536434769630432, "learning_rate": 7.605647459150961e-05, "loss": 1.3673, "step": 1350 }, { "epoch": 6.187643020594965, "grad_norm": 0.2269584983587265, "learning_rate": 7.527992837352501e-05, "loss": 1.3635, "step": 1355 }, { "epoch": 6.2105263157894735, "grad_norm": 0.22587326169013977, "learning_rate": 7.450496664426477e-05, "loss": 1.3595, "step": 1360 }, { "epoch": 6.233409610983982, "grad_norm": 0.22287671267986298, "learning_rate": 7.37316390766482e-05, "loss": 1.3752, "step": 1365 }, { "epoch": 6.25629290617849, "grad_norm": 0.23099535703659058, "learning_rate": 7.295999523884921e-05, "loss": 1.3613, "step": 1370 }, { "epoch": 6.279176201372998, "grad_norm": 0.22423085570335388, "learning_rate": 7.219008459111937e-05, "loss": 1.3658, "step": 1375 }, { "epoch": 6.302059496567506, "grad_norm": 0.23212574422359467, "learning_rate": 7.142195648261747e-05, "loss": 1.361, "step": 1380 }, { "epoch": 6.324942791762013, "grad_norm": 0.22750739753246307, "learning_rate": 7.065566014824643e-05, "loss": 1.3602, "step": 1385 }, { "epoch": 6.3478260869565215, "grad_norm": 0.2309405356645584, "learning_rate": 6.989124470549745e-05, "loss": 1.366, "step": 1390 }, { "epoch": 6.37070938215103, "grad_norm": 0.24688778817653656, "learning_rate": 6.912875915130183e-05, "loss": 1.3645, "step": 1395 }, { "epoch": 6.393592677345538, "grad_norm": 0.23304879665374756, "learning_rate": 6.83682523588902e-05, "loss": 1.3519, "step": 1400 }, { "epoch": 6.416475972540046, "grad_norm": 0.2324698269367218, "learning_rate": 6.760977307466008e-05, "loss": 1.358, "step": 1405 }, { "epoch": 6.439359267734554, "grad_norm": 0.23698380589485168, "learning_rate": 6.685336991505122e-05, "loss": 1.3569, "step": 1410 }, { "epoch": 6.462242562929061, "grad_norm": 0.2366679459810257, "learning_rate": 6.609909136342955e-05, "loss": 1.3538, "step": 1415 }, { "epoch": 6.48512585812357, "grad_norm": 0.2314102202653885, "learning_rate": 6.534698576697939e-05, "loss": 1.3607, "step": 1420 }, { "epoch": 6.508009153318078, "grad_norm": 0.2393520474433899, "learning_rate": 6.459710133360464e-05, "loss": 1.3649, "step": 1425 }, { "epoch": 6.530892448512586, "grad_norm": 0.23024214804172516, "learning_rate": 6.384948612883873e-05, "loss": 1.367, "step": 1430 }, { "epoch": 6.553775743707094, "grad_norm": 0.23126175999641418, "learning_rate": 6.310418807276375e-05, "loss": 1.3623, "step": 1435 }, { "epoch": 6.576659038901602, "grad_norm": 0.2327370047569275, "learning_rate": 6.2361254936939e-05, "loss": 1.3532, "step": 1440 }, { "epoch": 6.5995423340961095, "grad_norm": 0.22811928391456604, "learning_rate": 6.162073434133876e-05, "loss": 1.3662, "step": 1445 }, { "epoch": 6.622425629290618, "grad_norm": 0.25274351239204407, "learning_rate": 6.088267375130023e-05, "loss": 1.3537, "step": 1450 }, { "epoch": 6.645308924485126, "grad_norm": 0.24889418482780457, "learning_rate": 6.01471204744809e-05, "loss": 1.3598, "step": 1455 }, { "epoch": 6.668192219679634, "grad_norm": 0.23805195093154907, "learning_rate": 5.941412165782645e-05, "loss": 1.3525, "step": 1460 }, { "epoch": 6.691075514874142, "grad_norm": 0.2376497983932495, "learning_rate": 5.868372428454861e-05, "loss": 1.3662, "step": 1465 }, { "epoch": 6.71395881006865, "grad_norm": 0.24028287827968597, "learning_rate": 5.79559751711138e-05, "loss": 1.3524, "step": 1470 }, { "epoch": 6.7368421052631575, "grad_norm": 0.23552508652210236, "learning_rate": 5.72309209642422e-05, "loss": 1.3435, "step": 1475 }, { "epoch": 6.759725400457666, "grad_norm": 0.23250797390937805, "learning_rate": 5.650860813791785e-05, "loss": 1.3576, "step": 1480 }, { "epoch": 6.782608695652174, "grad_norm": 0.23260493576526642, "learning_rate": 5.5789082990409945e-05, "loss": 1.3648, "step": 1485 }, { "epoch": 6.805491990846682, "grad_norm": 0.23065823316574097, "learning_rate": 5.507239164130501e-05, "loss": 1.3553, "step": 1490 }, { "epoch": 6.82837528604119, "grad_norm": 0.22898992896080017, "learning_rate": 5.4358580028550896e-05, "loss": 1.357, "step": 1495 }, { "epoch": 6.851258581235698, "grad_norm": 0.24019262194633484, "learning_rate": 5.364769390551225e-05, "loss": 1.3561, "step": 1500 }, { "epoch": 6.874141876430206, "grad_norm": 0.24061718583106995, "learning_rate": 5.293977883803797e-05, "loss": 1.3654, "step": 1505 }, { "epoch": 6.897025171624714, "grad_norm": 0.2238311618566513, "learning_rate": 5.2234880201540284e-05, "loss": 1.3542, "step": 1510 }, { "epoch": 6.919908466819222, "grad_norm": 0.22695277631282806, "learning_rate": 5.1533043178086536e-05, "loss": 1.354, "step": 1515 }, { "epoch": 6.94279176201373, "grad_norm": 0.22949448227882385, "learning_rate": 5.0834312753503124e-05, "loss": 1.3613, "step": 1520 }, { "epoch": 6.965675057208238, "grad_norm": 0.23366492986679077, "learning_rate": 5.0138733714492e-05, "loss": 1.3497, "step": 1525 }, { "epoch": 6.988558352402746, "grad_norm": 0.2318800687789917, "learning_rate": 4.9446350645759885e-05, "loss": 1.3509, "step": 1530 }, { "epoch": 7.0, "eval_loss": 2.50384783744812, "eval_runtime": 0.1625, "eval_samples_per_second": 61.536, "eval_steps_per_second": 6.154, "step": 1533 }, { "epoch": 7.009153318077804, "grad_norm": 0.24519094824790955, "learning_rate": 4.8757207927160584e-05, "loss": 1.3595, "step": 1535 }, { "epoch": 7.032036613272311, "grad_norm": 0.23380842804908752, "learning_rate": 4.807134973085036e-05, "loss": 1.3402, "step": 1540 }, { "epoch": 7.054919908466819, "grad_norm": 0.23818600177764893, "learning_rate": 4.738882001845668e-05, "loss": 1.3542, "step": 1545 }, { "epoch": 7.077803203661327, "grad_norm": 0.2337319701910019, "learning_rate": 4.6709662538260267e-05, "loss": 1.3603, "step": 1550 }, { "epoch": 7.100686498855835, "grad_norm": 0.23182658851146698, "learning_rate": 4.603392082239102e-05, "loss": 1.3513, "step": 1555 }, { "epoch": 7.1235697940503435, "grad_norm": 0.22734318673610687, "learning_rate": 4.53616381840377e-05, "loss": 1.3515, "step": 1560 }, { "epoch": 7.146453089244852, "grad_norm": 0.2282598614692688, "learning_rate": 4.469285771467181e-05, "loss": 1.3529, "step": 1565 }, { "epoch": 7.169336384439359, "grad_norm": 0.23265138268470764, "learning_rate": 4.402762228128531e-05, "loss": 1.3513, "step": 1570 }, { "epoch": 7.192219679633867, "grad_norm": 0.23814916610717773, "learning_rate": 4.336597452364309e-05, "loss": 1.3459, "step": 1575 }, { "epoch": 7.215102974828375, "grad_norm": 0.23314997553825378, "learning_rate": 4.2707956851550016e-05, "loss": 1.351, "step": 1580 }, { "epoch": 7.237986270022883, "grad_norm": 0.2370002567768097, "learning_rate": 4.205361144213227e-05, "loss": 1.3552, "step": 1585 }, { "epoch": 7.260869565217392, "grad_norm": 0.2338656485080719, "learning_rate": 4.140298023713416e-05, "loss": 1.3598, "step": 1590 }, { "epoch": 7.2837528604119, "grad_norm": 0.2293270081281662, "learning_rate": 4.075610494022964e-05, "loss": 1.3565, "step": 1595 }, { "epoch": 7.306636155606407, "grad_norm": 0.23523476719856262, "learning_rate": 4.011302701434937e-05, "loss": 1.3545, "step": 1600 }, { "epoch": 7.329519450800915, "grad_norm": 0.22826853394508362, "learning_rate": 3.947378767902284e-05, "loss": 1.3411, "step": 1605 }, { "epoch": 7.352402745995423, "grad_norm": 0.24477452039718628, "learning_rate": 3.8838427907736476e-05, "loss": 1.3482, "step": 1610 }, { "epoch": 7.3752860411899315, "grad_norm": 0.23173397779464722, "learning_rate": 3.8206988425307246e-05, "loss": 1.3602, "step": 1615 }, { "epoch": 7.39816933638444, "grad_norm": 0.2321847528219223, "learning_rate": 3.757950970527249e-05, "loss": 1.3572, "step": 1620 }, { "epoch": 7.421052631578947, "grad_norm": 0.2273811548948288, "learning_rate": 3.695603196729543e-05, "loss": 1.3575, "step": 1625 }, { "epoch": 7.443935926773455, "grad_norm": 0.22913090884685516, "learning_rate": 3.633659517458736e-05, "loss": 1.3566, "step": 1630 }, { "epoch": 7.466819221967963, "grad_norm": 0.23222309350967407, "learning_rate": 3.5721239031346066e-05, "loss": 1.3461, "step": 1635 }, { "epoch": 7.489702517162471, "grad_norm": 0.2386443018913269, "learning_rate": 3.5110002980210975e-05, "loss": 1.3335, "step": 1640 }, { "epoch": 7.5125858123569795, "grad_norm": 0.23243063688278198, "learning_rate": 3.450292619973483e-05, "loss": 1.3497, "step": 1645 }, { "epoch": 7.535469107551488, "grad_norm": 0.22777137160301208, "learning_rate": 3.3900047601872596e-05, "loss": 1.3526, "step": 1650 }, { "epoch": 7.558352402745996, "grad_norm": 0.23459511995315552, "learning_rate": 3.3301405829487195e-05, "loss": 1.3484, "step": 1655 }, { "epoch": 7.581235697940503, "grad_norm": 0.22907888889312744, "learning_rate": 3.270703925387279e-05, "loss": 1.3529, "step": 1660 }, { "epoch": 7.604118993135011, "grad_norm": 0.22607311606407166, "learning_rate": 3.2116985972295076e-05, "loss": 1.3557, "step": 1665 }, { "epoch": 7.627002288329519, "grad_norm": 0.2280961275100708, "learning_rate": 3.153128380554941e-05, "loss": 1.3485, "step": 1670 }, { "epoch": 7.649885583524028, "grad_norm": 0.22796593606472015, "learning_rate": 3.094997029553673e-05, "loss": 1.3524, "step": 1675 }, { "epoch": 7.672768878718536, "grad_norm": 0.22574880719184875, "learning_rate": 3.037308270285709e-05, "loss": 1.3415, "step": 1680 }, { "epoch": 7.695652173913043, "grad_norm": 0.22538483142852783, "learning_rate": 2.9800658004421366e-05, "loss": 1.3584, "step": 1685 }, { "epoch": 7.718535469107551, "grad_norm": 0.22863738238811493, "learning_rate": 2.923273289108115e-05, "loss": 1.3555, "step": 1690 }, { "epoch": 7.741418764302059, "grad_norm": 0.23403213918209076, "learning_rate": 2.8669343765277078e-05, "loss": 1.3455, "step": 1695 }, { "epoch": 7.7643020594965675, "grad_norm": 0.23343510925769806, "learning_rate": 2.8110526738705344e-05, "loss": 1.3472, "step": 1700 }, { "epoch": 7.787185354691076, "grad_norm": 0.2283450961112976, "learning_rate": 2.755631763000318e-05, "loss": 1.3594, "step": 1705 }, { "epoch": 7.810068649885584, "grad_norm": 0.2360517978668213, "learning_rate": 2.7006751962452882e-05, "loss": 1.347, "step": 1710 }, { "epoch": 7.832951945080092, "grad_norm": 0.2376154214143753, "learning_rate": 2.6461864961704975e-05, "loss": 1.3532, "step": 1715 }, { "epoch": 7.855835240274599, "grad_norm": 0.23671028017997742, "learning_rate": 2.592169155352031e-05, "loss": 1.3589, "step": 1720 }, { "epoch": 7.878718535469107, "grad_norm": 0.23020578920841217, "learning_rate": 2.538626636153131e-05, "loss": 1.3527, "step": 1725 }, { "epoch": 7.9016018306636155, "grad_norm": 0.2411489188671112, "learning_rate": 2.485562370502279e-05, "loss": 1.3508, "step": 1730 }, { "epoch": 7.924485125858124, "grad_norm": 0.23260930180549622, "learning_rate": 2.4329797596732252e-05, "loss": 1.3446, "step": 1735 }, { "epoch": 7.947368421052632, "grad_norm": 0.23165710270404816, "learning_rate": 2.3808821740669606e-05, "loss": 1.357, "step": 1740 }, { "epoch": 7.970251716247139, "grad_norm": 0.240895077586174, "learning_rate": 2.3292729529956935e-05, "loss": 1.3631, "step": 1745 }, { "epoch": 7.993135011441647, "grad_norm": 0.2322818636894226, "learning_rate": 2.2781554044688015e-05, "loss": 1.3413, "step": 1750 }, { "epoch": 8.0, "eval_loss": 2.5123343467712402, "eval_runtime": 0.1598, "eval_samples_per_second": 62.575, "eval_steps_per_second": 6.258, "step": 1752 }, { "epoch": 8.013729977116705, "grad_norm": 0.22951894998550415, "learning_rate": 2.227532804980813e-05, "loss": 1.3476, "step": 1755 }, { "epoch": 8.036613272311213, "grad_norm": 0.23239345848560333, "learning_rate": 2.1774083993013718e-05, "loss": 1.344, "step": 1760 }, { "epoch": 8.05949656750572, "grad_norm": 0.23161360621452332, "learning_rate": 2.1277854002672683e-05, "loss": 1.3465, "step": 1765 }, { "epoch": 8.082379862700229, "grad_norm": 0.24183310568332672, "learning_rate": 2.078666988576504e-05, "loss": 1.3395, "step": 1770 }, { "epoch": 8.105263157894736, "grad_norm": 0.23045741021633148, "learning_rate": 2.030056312584424e-05, "loss": 1.3489, "step": 1775 }, { "epoch": 8.128146453089245, "grad_norm": 0.23021478950977325, "learning_rate": 1.9819564881018983e-05, "loss": 1.3461, "step": 1780 }, { "epoch": 8.151029748283753, "grad_norm": 0.22722554206848145, "learning_rate": 1.934370598195622e-05, "loss": 1.344, "step": 1785 }, { "epoch": 8.173913043478262, "grad_norm": 0.23223134875297546, "learning_rate": 1.887301692990494e-05, "loss": 1.3585, "step": 1790 }, { "epoch": 8.196796338672769, "grad_norm": 0.2324499934911728, "learning_rate": 1.8407527894741184e-05, "loss": 1.3466, "step": 1795 }, { "epoch": 8.219679633867276, "grad_norm": 0.23666158318519592, "learning_rate": 1.7947268713034127e-05, "loss": 1.3353, "step": 1800 }, { "epoch": 8.242562929061785, "grad_norm": 0.2395636886358261, "learning_rate": 1.7492268886133676e-05, "loss": 1.3312, "step": 1805 }, { "epoch": 8.265446224256292, "grad_norm": 0.23455668985843658, "learning_rate": 1.7042557578279626e-05, "loss": 1.3536, "step": 1810 }, { "epoch": 8.288329519450802, "grad_norm": 0.24188482761383057, "learning_rate": 1.6598163614732154e-05, "loss": 1.3592, "step": 1815 }, { "epoch": 8.311212814645309, "grad_norm": 0.23694472014904022, "learning_rate": 1.6159115479924257e-05, "loss": 1.3485, "step": 1820 }, { "epoch": 8.334096109839816, "grad_norm": 0.2400536835193634, "learning_rate": 1.5725441315636002e-05, "loss": 1.3387, "step": 1825 }, { "epoch": 8.356979405034325, "grad_norm": 0.2341165691614151, "learning_rate": 1.529716891919074e-05, "loss": 1.3429, "step": 1830 }, { "epoch": 8.379862700228832, "grad_norm": 0.23209024965763092, "learning_rate": 1.4874325741673278e-05, "loss": 1.3537, "step": 1835 }, { "epoch": 8.402745995423341, "grad_norm": 0.2267337441444397, "learning_rate": 1.4456938886170412e-05, "loss": 1.3432, "step": 1840 }, { "epoch": 8.425629290617849, "grad_norm": 0.22982369363307953, "learning_rate": 1.4045035106033655e-05, "loss": 1.3401, "step": 1845 }, { "epoch": 8.448512585812358, "grad_norm": 0.23104478418827057, "learning_rate": 1.3638640803164516e-05, "loss": 1.3481, "step": 1850 }, { "epoch": 8.471395881006865, "grad_norm": 0.22922466695308685, "learning_rate": 1.3237782026322055e-05, "loss": 1.3548, "step": 1855 }, { "epoch": 8.494279176201372, "grad_norm": 0.2303161472082138, "learning_rate": 1.2842484469453365e-05, "loss": 1.3461, "step": 1860 }, { "epoch": 8.517162471395881, "grad_norm": 0.23580402135849, "learning_rate": 1.2452773470046541e-05, "loss": 1.3517, "step": 1865 }, { "epoch": 8.540045766590389, "grad_norm": 0.2284560203552246, "learning_rate": 1.2068674007506786e-05, "loss": 1.34, "step": 1870 }, { "epoch": 8.562929061784898, "grad_norm": 0.23408174514770508, "learning_rate": 1.1690210701555104e-05, "loss": 1.3382, "step": 1875 }, { "epoch": 8.585812356979405, "grad_norm": 0.22775082290172577, "learning_rate": 1.1317407810650372e-05, "loss": 1.3533, "step": 1880 }, { "epoch": 8.608695652173914, "grad_norm": 0.23187576234340668, "learning_rate": 1.0950289230434374e-05, "loss": 1.342, "step": 1885 }, { "epoch": 8.631578947368421, "grad_norm": 0.23138730227947235, "learning_rate": 1.058887849220026e-05, "loss": 1.3571, "step": 1890 }, { "epoch": 8.654462242562929, "grad_norm": 0.2307848185300827, "learning_rate": 1.02331987613841e-05, "loss": 1.3525, "step": 1895 }, { "epoch": 8.677345537757438, "grad_norm": 0.2283322662115097, "learning_rate": 9.883272836080116e-06, "loss": 1.3472, "step": 1900 }, { "epoch": 8.700228832951945, "grad_norm": 0.22654347121715546, "learning_rate": 9.539123145579476e-06, "loss": 1.3477, "step": 1905 }, { "epoch": 8.723112128146454, "grad_norm": 0.23034285008907318, "learning_rate": 9.200771748932513e-06, "loss": 1.345, "step": 1910 }, { "epoch": 8.745995423340961, "grad_norm": 0.23074457049369812, "learning_rate": 8.868240333534815e-06, "loss": 1.3389, "step": 1915 }, { "epoch": 8.768878718535468, "grad_norm": 0.22868825495243073, "learning_rate": 8.541550213737171e-06, "loss": 1.3417, "step": 1920 }, { "epoch": 8.791762013729977, "grad_norm": 0.22654058039188385, "learning_rate": 8.220722329479346e-06, "loss": 1.3458, "step": 1925 }, { "epoch": 8.814645308924485, "grad_norm": 0.22552727162837982, "learning_rate": 7.905777244947954e-06, "loss": 1.3444, "step": 1930 }, { "epoch": 8.837528604118994, "grad_norm": 0.22772538661956787, "learning_rate": 7.5967351472582275e-06, "loss": 1.355, "step": 1935 }, { "epoch": 8.860411899313501, "grad_norm": 0.2261328399181366, "learning_rate": 7.293615845160196e-06, "loss": 1.3314, "step": 1940 }, { "epoch": 8.883295194508008, "grad_norm": 0.23299558460712433, "learning_rate": 6.99643876776891e-06, "loss": 1.3461, "step": 1945 }, { "epoch": 8.906178489702517, "grad_norm": 0.22399510443210602, "learning_rate": 6.705222963319191e-06, "loss": 1.3412, "step": 1950 }, { "epoch": 8.929061784897025, "grad_norm": 0.22913098335266113, "learning_rate": 6.419987097944579e-06, "loss": 1.3519, "step": 1955 }, { "epoch": 8.951945080091534, "grad_norm": 0.22438156604766846, "learning_rate": 6.140749454480932e-06, "loss": 1.3513, "step": 1960 }, { "epoch": 8.974828375286041, "grad_norm": 0.23290272057056427, "learning_rate": 5.867527931294614e-06, "loss": 1.3495, "step": 1965 }, { "epoch": 8.99771167048055, "grad_norm": 0.2285740226507187, "learning_rate": 5.6003400411351325e-06, "loss": 1.3562, "step": 1970 }, { "epoch": 9.0, "eval_loss": 2.517554998397827, "eval_runtime": 0.1572, "eval_samples_per_second": 63.605, "eval_steps_per_second": 6.36, "step": 1971 }, { "epoch": 9.018306636155607, "grad_norm": 0.22652767598628998, "learning_rate": 5.339202910012708e-06, "loss": 1.3499, "step": 1975 }, { "epoch": 9.041189931350115, "grad_norm": 0.2265496850013733, "learning_rate": 5.0841332761005e-06, "loss": 1.3514, "step": 1980 }, { "epoch": 9.064073226544622, "grad_norm": 0.22489045560359955, "learning_rate": 4.835147488661795e-06, "loss": 1.3402, "step": 1985 }, { "epoch": 9.08695652173913, "grad_norm": 0.23145288228988647, "learning_rate": 4.592261507001993e-06, "loss": 1.343, "step": 1990 }, { "epoch": 9.109839816933638, "grad_norm": 0.228094220161438, "learning_rate": 4.355490899445691e-06, "loss": 1.3422, "step": 1995 }, { "epoch": 9.132723112128147, "grad_norm": 0.23417501151561737, "learning_rate": 4.124850842338779e-06, "loss": 1.3512, "step": 2000 }, { "epoch": 9.155606407322654, "grad_norm": 0.2259424477815628, "learning_rate": 3.900356119075743e-06, "loss": 1.3428, "step": 2005 }, { "epoch": 9.178489702517162, "grad_norm": 0.22936026751995087, "learning_rate": 3.6820211191520125e-06, "loss": 1.343, "step": 2010 }, { "epoch": 9.20137299771167, "grad_norm": 0.2265051156282425, "learning_rate": 3.469859837241651e-06, "loss": 1.3517, "step": 2015 }, { "epoch": 9.224256292906178, "grad_norm": 0.22797267138957977, "learning_rate": 3.263885872300343e-06, "loss": 1.3497, "step": 2020 }, { "epoch": 9.247139588100687, "grad_norm": 0.2263391762971878, "learning_rate": 3.064112426693799e-06, "loss": 1.3381, "step": 2025 }, { "epoch": 9.270022883295194, "grad_norm": 0.23144488036632538, "learning_rate": 2.8705523053513816e-06, "loss": 1.3453, "step": 2030 }, { "epoch": 9.292906178489703, "grad_norm": 0.2310468852519989, "learning_rate": 2.6832179149454793e-06, "loss": 1.338, "step": 2035 }, { "epoch": 9.31578947368421, "grad_norm": 0.2297661006450653, "learning_rate": 2.502121263096224e-06, "loss": 1.3424, "step": 2040 }, { "epoch": 9.338672768878718, "grad_norm": 0.23045673966407776, "learning_rate": 2.3272739576017945e-06, "loss": 1.3525, "step": 2045 }, { "epoch": 9.361556064073227, "grad_norm": 0.22856095433235168, "learning_rate": 2.1586872056944428e-06, "loss": 1.3441, "step": 2050 }, { "epoch": 9.384439359267734, "grad_norm": 0.23274967074394226, "learning_rate": 1.996371813322129e-06, "loss": 1.355, "step": 2055 }, { "epoch": 9.407322654462243, "grad_norm": 0.22686432301998138, "learning_rate": 1.840338184455881e-06, "loss": 1.341, "step": 2060 }, { "epoch": 9.43020594965675, "grad_norm": 0.22302532196044922, "learning_rate": 1.6905963204229436e-06, "loss": 1.3573, "step": 2065 }, { "epoch": 9.453089244851258, "grad_norm": 0.2239583283662796, "learning_rate": 1.5471558192656777e-06, "loss": 1.3357, "step": 2070 }, { "epoch": 9.475972540045767, "grad_norm": 0.22839844226837158, "learning_rate": 1.4100258751264195e-06, "loss": 1.3387, "step": 2075 }, { "epoch": 9.498855835240274, "grad_norm": 0.22954653203487396, "learning_rate": 1.2792152776580968e-06, "loss": 1.3428, "step": 2080 }, { "epoch": 9.521739130434783, "grad_norm": 0.22747966647148132, "learning_rate": 1.1547324114608904e-06, "loss": 1.3396, "step": 2085 }, { "epoch": 9.54462242562929, "grad_norm": 0.2284754067659378, "learning_rate": 1.036585255544764e-06, "loss": 1.3472, "step": 2090 }, { "epoch": 9.5675057208238, "grad_norm": 0.22196947038173676, "learning_rate": 9.247813828180407e-07, "loss": 1.3454, "step": 2095 }, { "epoch": 9.590389016018307, "grad_norm": 0.22999419271945953, "learning_rate": 8.193279596020121e-07, "loss": 1.3431, "step": 2100 }, { "epoch": 9.613272311212814, "grad_norm": 0.23028169572353363, "learning_rate": 7.202317451716067e-07, "loss": 1.3277, "step": 2105 }, { "epoch": 9.636155606407323, "grad_norm": 0.22792372107505798, "learning_rate": 6.274990913221035e-07, "loss": 1.3374, "step": 2110 }, { "epoch": 9.65903890160183, "grad_norm": 0.22512663900852203, "learning_rate": 5.411359419620232e-07, "loss": 1.3503, "step": 2115 }, { "epoch": 9.68192219679634, "grad_norm": 0.22875413298606873, "learning_rate": 4.6114783273213393e-07, "loss": 1.3451, "step": 2120 }, { "epoch": 9.704805491990847, "grad_norm": 0.22629615664482117, "learning_rate": 3.8753989065064557e-07, "loss": 1.3315, "step": 2125 }, { "epoch": 9.727688787185354, "grad_norm": 0.22211916744709015, "learning_rate": 3.203168337845508e-07, "loss": 1.35, "step": 2130 }, { "epoch": 9.750572082379863, "grad_norm": 0.22795149683952332, "learning_rate": 2.594829709472446e-07, "loss": 1.3314, "step": 2135 }, { "epoch": 9.77345537757437, "grad_norm": 0.231648787856102, "learning_rate": 2.05042201422323e-07, "loss": 1.3568, "step": 2140 }, { "epoch": 9.79633867276888, "grad_norm": 0.22703665494918823, "learning_rate": 1.5699801471364962e-07, "loss": 1.3411, "step": 2145 }, { "epoch": 9.819221967963387, "grad_norm": 0.2311718612909317, "learning_rate": 1.1535349032167908e-07, "loss": 1.3456, "step": 2150 }, { "epoch": 9.842105263157894, "grad_norm": 0.2319943606853485, "learning_rate": 8.011129754611491e-08, "loss": 1.3507, "step": 2155 }, { "epoch": 9.864988558352403, "grad_norm": 0.2376013547182083, "learning_rate": 5.127369531473525e-08, "loss": 1.3372, "step": 2160 }, { "epoch": 9.88787185354691, "grad_norm": 0.2285187989473343, "learning_rate": 2.884253203869758e-08, "loss": 1.3469, "step": 2165 }, { "epoch": 9.91075514874142, "grad_norm": 0.22958822548389435, "learning_rate": 1.2819245493955744e-08, "loss": 1.3467, "step": 2170 }, { "epoch": 9.933638443935926, "grad_norm": 0.22622478008270264, "learning_rate": 3.2048627292113887e-09, "loss": 1.3452, "step": 2175 }, { "epoch": 9.956521739130435, "grad_norm": 0.2237871289253235, "learning_rate": 0.0, "loss": 1.3538, "step": 2180 }, { "epoch": 9.956521739130435, "eval_loss": 2.5189335346221924, "eval_runtime": 0.1831, "eval_samples_per_second": 54.615, "eval_steps_per_second": 5.461, "step": 2180 }, { "epoch": 9.956521739130435, "step": 2180, "total_flos": 2.4191218891641324e+18, "train_loss": 1.418152675716155, "train_runtime": 3547.1741, "train_samples_per_second": 39.395, "train_steps_per_second": 0.615 } ], "logging_steps": 5, "max_steps": 2180, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4191218891641324e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }