{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998197093715069, "eval_steps": 500, "global_step": 3466, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014423250279450475, "grad_norm": 23.09968734754774, "learning_rate": 2.3054755043227666e-07, "loss": 12.1657, "step": 5 }, { "epoch": 0.002884650055890095, "grad_norm": 23.350567085111635, "learning_rate": 5.187319884726226e-07, "loss": 12.1499, "step": 10 }, { "epoch": 0.004326975083835142, "grad_norm": 22.840877913954497, "learning_rate": 8.069164265129684e-07, "loss": 12.0857, "step": 15 }, { "epoch": 0.00576930011178019, "grad_norm": 21.40321138460624, "learning_rate": 1.0951008645533142e-06, "loss": 11.8028, "step": 20 }, { "epoch": 0.007211625139725237, "grad_norm": 18.192353108517974, "learning_rate": 1.3832853025936602e-06, "loss": 11.3384, "step": 25 }, { "epoch": 0.008653950167670284, "grad_norm": 18.559232783911973, "learning_rate": 1.6714697406340058e-06, "loss": 10.3127, "step": 30 }, { "epoch": 0.010096275195615331, "grad_norm": 37.79150391064707, "learning_rate": 1.959654178674352e-06, "loss": 9.0664, "step": 35 }, { "epoch": 0.01153860022356038, "grad_norm": 33.772043740311254, "learning_rate": 2.247838616714698e-06, "loss": 7.409, "step": 40 }, { "epoch": 0.012980925251505427, "grad_norm": 23.04632172544007, "learning_rate": 2.5360230547550434e-06, "loss": 6.3338, "step": 45 }, { "epoch": 0.014423250279450473, "grad_norm": 25.32559722397877, "learning_rate": 2.8242074927953894e-06, "loss": 4.4908, "step": 50 }, { "epoch": 0.015865575307395522, "grad_norm": 9.143968031022688, "learning_rate": 3.1123919308357354e-06, "loss": 3.2978, "step": 55 }, { "epoch": 0.01730790033534057, "grad_norm": 2.3359297684099745, "learning_rate": 3.400576368876081e-06, "loss": 2.6887, "step": 60 }, { "epoch": 0.018750225363285616, "grad_norm": 1.5235792893524585, "learning_rate": 3.6887608069164266e-06, "loss": 2.6051, "step": 65 }, { "epoch": 0.020192550391230663, "grad_norm": 1.6452371227737381, "learning_rate": 3.976945244956772e-06, "loss": 2.5288, "step": 70 }, { "epoch": 0.021634875419175713, "grad_norm": 2.3877151673363133, "learning_rate": 4.265129682997119e-06, "loss": 2.4368, "step": 75 }, { "epoch": 0.02307720044712076, "grad_norm": 3.5448230000902283, "learning_rate": 4.553314121037464e-06, "loss": 2.2394, "step": 80 }, { "epoch": 0.024519525475065806, "grad_norm": 3.998099329525319, "learning_rate": 4.84149855907781e-06, "loss": 2.0687, "step": 85 }, { "epoch": 0.025961850503010853, "grad_norm": 5.3900301279889025, "learning_rate": 5.129682997118156e-06, "loss": 2.0427, "step": 90 }, { "epoch": 0.0274041755309559, "grad_norm": 7.2317244995568215, "learning_rate": 5.417867435158502e-06, "loss": 1.9167, "step": 95 }, { "epoch": 0.028846500558900947, "grad_norm": 4.190169407947923, "learning_rate": 5.706051873198848e-06, "loss": 1.8528, "step": 100 }, { "epoch": 0.030288825586845997, "grad_norm": 5.165106554451897, "learning_rate": 5.994236311239193e-06, "loss": 1.8751, "step": 105 }, { "epoch": 0.031731150614791044, "grad_norm": 3.2421300129897426, "learning_rate": 6.2824207492795395e-06, "loss": 1.7973, "step": 110 }, { "epoch": 0.03317347564273609, "grad_norm": 4.460292781455887, "learning_rate": 6.570605187319885e-06, "loss": 1.6292, "step": 115 }, { "epoch": 0.03461580067068114, "grad_norm": 4.913131259117871, "learning_rate": 6.8587896253602315e-06, "loss": 1.655, "step": 120 }, { "epoch": 0.03605812569862619, "grad_norm": 4.1881116653103945, "learning_rate": 7.146974063400577e-06, "loss": 1.664, "step": 125 }, { "epoch": 0.03750045072657123, "grad_norm": 5.723431293294362, "learning_rate": 7.4351585014409235e-06, "loss": 1.6202, "step": 130 }, { "epoch": 0.03894277575451628, "grad_norm": 4.909602119186479, "learning_rate": 7.723342939481268e-06, "loss": 1.5486, "step": 135 }, { "epoch": 0.040385100782461325, "grad_norm": 5.928676345818394, "learning_rate": 8.011527377521614e-06, "loss": 1.4965, "step": 140 }, { "epoch": 0.041827425810406375, "grad_norm": 5.5830317263384845, "learning_rate": 8.299711815561961e-06, "loss": 1.4195, "step": 145 }, { "epoch": 0.043269750838351426, "grad_norm": 5.587820490379444, "learning_rate": 8.587896253602305e-06, "loss": 1.3894, "step": 150 }, { "epoch": 0.04471207586629647, "grad_norm": 3.5851612990900836, "learning_rate": 8.876080691642652e-06, "loss": 1.4654, "step": 155 }, { "epoch": 0.04615440089424152, "grad_norm": 4.792344497245253, "learning_rate": 9.164265129682998e-06, "loss": 1.3801, "step": 160 }, { "epoch": 0.04759672592218656, "grad_norm": 3.5644574463856387, "learning_rate": 9.452449567723344e-06, "loss": 1.3527, "step": 165 }, { "epoch": 0.04903905095013161, "grad_norm": 4.245088356022904, "learning_rate": 9.740634005763689e-06, "loss": 1.3465, "step": 170 }, { "epoch": 0.050481375978076656, "grad_norm": 4.623244884122231, "learning_rate": 1.0028818443804036e-05, "loss": 1.3647, "step": 175 }, { "epoch": 0.05192370100602171, "grad_norm": 3.5591972450196043, "learning_rate": 1.031700288184438e-05, "loss": 1.261, "step": 180 }, { "epoch": 0.05336602603396676, "grad_norm": 3.6288737317693243, "learning_rate": 1.0605187319884726e-05, "loss": 1.2178, "step": 185 }, { "epoch": 0.0548083510619118, "grad_norm": 5.472679192029011, "learning_rate": 1.0893371757925073e-05, "loss": 1.2372, "step": 190 }, { "epoch": 0.05625067608985685, "grad_norm": 2.987171924181164, "learning_rate": 1.1181556195965419e-05, "loss": 1.1878, "step": 195 }, { "epoch": 0.057693001117801894, "grad_norm": 3.633711033064426, "learning_rate": 1.1469740634005764e-05, "loss": 1.1895, "step": 200 }, { "epoch": 0.059135326145746944, "grad_norm": 3.9402926571067978, "learning_rate": 1.175792507204611e-05, "loss": 1.1368, "step": 205 }, { "epoch": 0.060577651173691995, "grad_norm": 3.527134311033913, "learning_rate": 1.2046109510086457e-05, "loss": 1.1306, "step": 210 }, { "epoch": 0.06201997620163704, "grad_norm": 3.679407663475352, "learning_rate": 1.2334293948126803e-05, "loss": 1.0846, "step": 215 }, { "epoch": 0.06346230122958209, "grad_norm": 3.1104059182965047, "learning_rate": 1.2622478386167147e-05, "loss": 1.1201, "step": 220 }, { "epoch": 0.06490462625752713, "grad_norm": 4.203869282005421, "learning_rate": 1.2910662824207494e-05, "loss": 1.0694, "step": 225 }, { "epoch": 0.06634695128547217, "grad_norm": 3.936128919901792, "learning_rate": 1.319884726224784e-05, "loss": 1.0191, "step": 230 }, { "epoch": 0.06778927631341723, "grad_norm": 2.2362445033305804, "learning_rate": 1.3487031700288185e-05, "loss": 0.9774, "step": 235 }, { "epoch": 0.06923160134136228, "grad_norm": 2.757438827888907, "learning_rate": 1.377521613832853e-05, "loss": 1.0124, "step": 240 }, { "epoch": 0.07067392636930732, "grad_norm": 3.4599226565163783, "learning_rate": 1.4063400576368878e-05, "loss": 0.9295, "step": 245 }, { "epoch": 0.07211625139725238, "grad_norm": 2.0262096895794963, "learning_rate": 1.4351585014409224e-05, "loss": 0.9118, "step": 250 }, { "epoch": 0.07355857642519742, "grad_norm": 2.487400868386021, "learning_rate": 1.4639769452449568e-05, "loss": 0.9409, "step": 255 }, { "epoch": 0.07500090145314246, "grad_norm": 1.9303088742335475, "learning_rate": 1.4927953890489915e-05, "loss": 0.9211, "step": 260 }, { "epoch": 0.0764432264810875, "grad_norm": 2.175412817851971, "learning_rate": 1.521613832853026e-05, "loss": 0.9168, "step": 265 }, { "epoch": 0.07788555150903256, "grad_norm": 2.5796504124225033, "learning_rate": 1.5504322766570608e-05, "loss": 0.9527, "step": 270 }, { "epoch": 0.0793278765369776, "grad_norm": 1.9788435183920994, "learning_rate": 1.5792507204610953e-05, "loss": 0.8426, "step": 275 }, { "epoch": 0.08077020156492265, "grad_norm": 2.003074548053739, "learning_rate": 1.60806916426513e-05, "loss": 0.8527, "step": 280 }, { "epoch": 0.08221252659286771, "grad_norm": 2.1994335722383602, "learning_rate": 1.6368876080691644e-05, "loss": 0.8072, "step": 285 }, { "epoch": 0.08365485162081275, "grad_norm": 1.726445070641134, "learning_rate": 1.665706051873199e-05, "loss": 0.8163, "step": 290 }, { "epoch": 0.0850971766487578, "grad_norm": 2.350691118327581, "learning_rate": 1.6945244956772336e-05, "loss": 0.7651, "step": 295 }, { "epoch": 0.08653950167670285, "grad_norm": 2.6639655167915115, "learning_rate": 1.723342939481268e-05, "loss": 0.7535, "step": 300 }, { "epoch": 0.0879818267046479, "grad_norm": 1.3919563172463725, "learning_rate": 1.7521613832853027e-05, "loss": 0.785, "step": 305 }, { "epoch": 0.08942415173259294, "grad_norm": 1.2944766289360783, "learning_rate": 1.7809798270893372e-05, "loss": 0.7111, "step": 310 }, { "epoch": 0.09086647676053798, "grad_norm": 1.4798988266070112, "learning_rate": 1.8097982708933718e-05, "loss": 0.7293, "step": 315 }, { "epoch": 0.09230880178848304, "grad_norm": 1.1830162313483426, "learning_rate": 1.8386167146974067e-05, "loss": 0.7231, "step": 320 }, { "epoch": 0.09375112681642808, "grad_norm": 1.5568610974134778, "learning_rate": 1.867435158501441e-05, "loss": 0.7445, "step": 325 }, { "epoch": 0.09519345184437313, "grad_norm": 1.1492164494899182, "learning_rate": 1.8962536023054755e-05, "loss": 0.6959, "step": 330 }, { "epoch": 0.09663577687231818, "grad_norm": 1.0978857201097723, "learning_rate": 1.9250720461095104e-05, "loss": 0.7057, "step": 335 }, { "epoch": 0.09807810190026323, "grad_norm": 1.0096489653703298, "learning_rate": 1.953890489913545e-05, "loss": 0.6772, "step": 340 }, { "epoch": 0.09952042692820827, "grad_norm": 1.1232844613521993, "learning_rate": 1.9827089337175795e-05, "loss": 0.7246, "step": 345 }, { "epoch": 0.10096275195615331, "grad_norm": 1.02243795388932, "learning_rate": 1.9999979709215212e-05, "loss": 0.7024, "step": 350 }, { "epoch": 0.10240507698409837, "grad_norm": 1.1367801539352143, "learning_rate": 1.9999751438831965e-05, "loss": 0.6489, "step": 355 }, { "epoch": 0.10384740201204341, "grad_norm": 1.1572043181625398, "learning_rate": 1.9999269540393507e-05, "loss": 0.6489, "step": 360 }, { "epoch": 0.10528972703998846, "grad_norm": 1.0269240416486167, "learning_rate": 1.9998534026122433e-05, "loss": 0.6782, "step": 365 }, { "epoch": 0.10673205206793351, "grad_norm": 0.9511160065038861, "learning_rate": 1.9997544914673915e-05, "loss": 0.6312, "step": 370 }, { "epoch": 0.10817437709587856, "grad_norm": 1.1374311508874984, "learning_rate": 1.999630223113522e-05, "loss": 0.6628, "step": 375 }, { "epoch": 0.1096167021238236, "grad_norm": 1.450941328478541, "learning_rate": 1.9994806007025068e-05, "loss": 0.6389, "step": 380 }, { "epoch": 0.11105902715176866, "grad_norm": 0.8046806001901237, "learning_rate": 1.9993056280292845e-05, "loss": 0.6482, "step": 385 }, { "epoch": 0.1125013521797137, "grad_norm": 0.8216403494158578, "learning_rate": 1.999105309531763e-05, "loss": 0.6078, "step": 390 }, { "epoch": 0.11394367720765874, "grad_norm": 0.8600864577290717, "learning_rate": 1.9988796502907083e-05, "loss": 0.63, "step": 395 }, { "epoch": 0.11538600223560379, "grad_norm": 0.798579467879802, "learning_rate": 1.9986286560296134e-05, "loss": 0.6109, "step": 400 }, { "epoch": 0.11682832726354885, "grad_norm": 0.7668970837973854, "learning_rate": 1.998352333114556e-05, "loss": 0.5857, "step": 405 }, { "epoch": 0.11827065229149389, "grad_norm": 1.0143366745206854, "learning_rate": 1.998050688554034e-05, "loss": 0.6176, "step": 410 }, { "epoch": 0.11971297731943893, "grad_norm": 0.7114180483975799, "learning_rate": 1.9977237299987903e-05, "loss": 0.62, "step": 415 }, { "epoch": 0.12115530234738399, "grad_norm": 0.8179413343809848, "learning_rate": 1.997371465741617e-05, "loss": 0.6205, "step": 420 }, { "epoch": 0.12259762737532903, "grad_norm": 0.6435940720725398, "learning_rate": 1.996993904717146e-05, "loss": 0.5878, "step": 425 }, { "epoch": 0.12403995240327408, "grad_norm": 0.9102246188273324, "learning_rate": 1.9965910565016223e-05, "loss": 0.6021, "step": 430 }, { "epoch": 0.12548227743121912, "grad_norm": 0.6153476600060466, "learning_rate": 1.9961629313126608e-05, "loss": 0.5674, "step": 435 }, { "epoch": 0.12692460245916418, "grad_norm": 0.5823753109992822, "learning_rate": 1.9957095400089875e-05, "loss": 0.5819, "step": 440 }, { "epoch": 0.12836692748710923, "grad_norm": 0.6280650049871973, "learning_rate": 1.9952308940901634e-05, "loss": 0.6357, "step": 445 }, { "epoch": 0.12980925251505426, "grad_norm": 1.12163730124818, "learning_rate": 1.9947270056962934e-05, "loss": 0.5659, "step": 450 }, { "epoch": 0.13125157754299932, "grad_norm": 0.8453741002711367, "learning_rate": 1.994197887607719e-05, "loss": 0.5423, "step": 455 }, { "epoch": 0.13269390257094435, "grad_norm": 0.6945577095672939, "learning_rate": 1.993643553244693e-05, "loss": 0.6118, "step": 460 }, { "epoch": 0.1341362275988894, "grad_norm": 0.6080087347638511, "learning_rate": 1.993064016667039e-05, "loss": 0.5912, "step": 465 }, { "epoch": 0.13557855262683446, "grad_norm": 0.5072027520003524, "learning_rate": 1.992459292573796e-05, "loss": 0.6086, "step": 470 }, { "epoch": 0.1370208776547795, "grad_norm": 0.5194397753829619, "learning_rate": 1.991829396302845e-05, "loss": 0.5554, "step": 475 }, { "epoch": 0.13846320268272455, "grad_norm": 0.6531400636419847, "learning_rate": 1.9911743438305203e-05, "loss": 0.5738, "step": 480 }, { "epoch": 0.1399055277106696, "grad_norm": 0.8007993447245763, "learning_rate": 1.990494151771202e-05, "loss": 0.5698, "step": 485 }, { "epoch": 0.14134785273861464, "grad_norm": 0.7192330669398362, "learning_rate": 1.989788837376899e-05, "loss": 0.5629, "step": 490 }, { "epoch": 0.1427901777665597, "grad_norm": 0.688440868686088, "learning_rate": 1.989058418536807e-05, "loss": 0.5734, "step": 495 }, { "epoch": 0.14423250279450475, "grad_norm": 1.001172764554856, "learning_rate": 1.988302913776858e-05, "loss": 0.5745, "step": 500 }, { "epoch": 0.14423250279450475, "eval_loss": 0.568706750869751, "eval_runtime": 161.3667, "eval_samples_per_second": 11.161, "eval_steps_per_second": 2.795, "step": 500 }, { "epoch": 0.14567482782244978, "grad_norm": 1.0515733209433527, "learning_rate": 1.9875223422592485e-05, "loss": 0.5704, "step": 505 }, { "epoch": 0.14711715285039484, "grad_norm": 1.0276945765068186, "learning_rate": 1.986716723781954e-05, "loss": 0.6123, "step": 510 }, { "epoch": 0.1485594778783399, "grad_norm": 0.8043743845845657, "learning_rate": 1.985886078778227e-05, "loss": 0.5437, "step": 515 }, { "epoch": 0.15000180290628493, "grad_norm": 0.6535595881064415, "learning_rate": 1.9850304283160793e-05, "loss": 0.5527, "step": 520 }, { "epoch": 0.15144412793422998, "grad_norm": 0.7357564272936004, "learning_rate": 1.9841497940977464e-05, "loss": 0.5432, "step": 525 }, { "epoch": 0.152886452962175, "grad_norm": 0.7287222676647807, "learning_rate": 1.983244198459138e-05, "loss": 0.5811, "step": 530 }, { "epoch": 0.15432877799012007, "grad_norm": 0.5697752505815841, "learning_rate": 1.982313664369271e-05, "loss": 0.5627, "step": 535 }, { "epoch": 0.15577110301806513, "grad_norm": 0.5170616797914624, "learning_rate": 1.981358215429687e-05, "loss": 0.5592, "step": 540 }, { "epoch": 0.15721342804601016, "grad_norm": 0.619913426569597, "learning_rate": 1.9803778758738543e-05, "loss": 0.5435, "step": 545 }, { "epoch": 0.1586557530739552, "grad_norm": 0.9727823301261521, "learning_rate": 1.9793726705665524e-05, "loss": 0.5889, "step": 550 }, { "epoch": 0.16009807810190027, "grad_norm": 0.6044688838902901, "learning_rate": 1.9783426250032412e-05, "loss": 0.5678, "step": 555 }, { "epoch": 0.1615404031298453, "grad_norm": 0.46024598144245266, "learning_rate": 1.9772877653094165e-05, "loss": 0.5639, "step": 560 }, { "epoch": 0.16298272815779036, "grad_norm": 0.45100341602786603, "learning_rate": 1.9762081182399434e-05, "loss": 0.5717, "step": 565 }, { "epoch": 0.16442505318573541, "grad_norm": 0.5540308655652189, "learning_rate": 1.9751037111783818e-05, "loss": 0.5623, "step": 570 }, { "epoch": 0.16586737821368044, "grad_norm": 0.43976603899998645, "learning_rate": 1.9739745721362897e-05, "loss": 0.5319, "step": 575 }, { "epoch": 0.1673097032416255, "grad_norm": 0.4612500025708451, "learning_rate": 1.9728207297525125e-05, "loss": 0.5653, "step": 580 }, { "epoch": 0.16875202826957056, "grad_norm": 0.5752333041985558, "learning_rate": 1.9716422132924572e-05, "loss": 0.567, "step": 585 }, { "epoch": 0.1701943532975156, "grad_norm": 0.5369943570453672, "learning_rate": 1.9704390526473515e-05, "loss": 0.5609, "step": 590 }, { "epoch": 0.17163667832546065, "grad_norm": 0.5164720235053389, "learning_rate": 1.9692112783334826e-05, "loss": 0.5415, "step": 595 }, { "epoch": 0.1730790033534057, "grad_norm": 0.7665382521888024, "learning_rate": 1.967958921491426e-05, "loss": 0.5671, "step": 600 }, { "epoch": 0.17452132838135073, "grad_norm": 0.6256340257615823, "learning_rate": 1.966682013885255e-05, "loss": 0.5533, "step": 605 }, { "epoch": 0.1759636534092958, "grad_norm": 0.4893424331522886, "learning_rate": 1.9653805879017323e-05, "loss": 0.5589, "step": 610 }, { "epoch": 0.17740597843724082, "grad_norm": 0.4930248858437027, "learning_rate": 1.964054676549494e-05, "loss": 0.5418, "step": 615 }, { "epoch": 0.17884830346518588, "grad_norm": 0.45814407628412845, "learning_rate": 1.9627043134582068e-05, "loss": 0.5195, "step": 620 }, { "epoch": 0.18029062849313093, "grad_norm": 0.5315704703868885, "learning_rate": 1.9613295328777187e-05, "loss": 0.5095, "step": 625 }, { "epoch": 0.18173295352107596, "grad_norm": 0.43146076740416167, "learning_rate": 1.959930369677189e-05, "loss": 0.4929, "step": 630 }, { "epoch": 0.18317527854902102, "grad_norm": 0.4627882494650573, "learning_rate": 1.958506859344204e-05, "loss": 0.5141, "step": 635 }, { "epoch": 0.18461760357696608, "grad_norm": 0.621672972720691, "learning_rate": 1.9570590379838767e-05, "loss": 0.5486, "step": 640 }, { "epoch": 0.1860599286049111, "grad_norm": 0.5063460018719447, "learning_rate": 1.9555869423179316e-05, "loss": 0.5497, "step": 645 }, { "epoch": 0.18750225363285616, "grad_norm": 0.48895947210824475, "learning_rate": 1.9540906096837727e-05, "loss": 0.5465, "step": 650 }, { "epoch": 0.18894457866080122, "grad_norm": 0.47357663586358684, "learning_rate": 1.9525700780335372e-05, "loss": 0.529, "step": 655 }, { "epoch": 0.19038690368874625, "grad_norm": 0.43786638884850015, "learning_rate": 1.951025385933132e-05, "loss": 0.522, "step": 660 }, { "epoch": 0.1918292287166913, "grad_norm": 0.5828551791972233, "learning_rate": 1.9494565725612565e-05, "loss": 0.5334, "step": 665 }, { "epoch": 0.19327155374463637, "grad_norm": 0.4669699168406431, "learning_rate": 1.9478636777084077e-05, "loss": 0.4846, "step": 670 }, { "epoch": 0.1947138787725814, "grad_norm": 0.5626195687859905, "learning_rate": 1.946246741775873e-05, "loss": 0.556, "step": 675 }, { "epoch": 0.19615620380052645, "grad_norm": 0.5482755680769119, "learning_rate": 1.9446058057747025e-05, "loss": 0.4561, "step": 680 }, { "epoch": 0.1975985288284715, "grad_norm": 0.4878018831010534, "learning_rate": 1.9429409113246715e-05, "loss": 0.526, "step": 685 }, { "epoch": 0.19904085385641654, "grad_norm": 0.7436357434374212, "learning_rate": 1.9412521006532245e-05, "loss": 0.5088, "step": 690 }, { "epoch": 0.2004831788843616, "grad_norm": 0.45530676409796045, "learning_rate": 1.939539416594402e-05, "loss": 0.5214, "step": 695 }, { "epoch": 0.20192550391230663, "grad_norm": 0.6302948823981896, "learning_rate": 1.937802902587757e-05, "loss": 0.5591, "step": 700 }, { "epoch": 0.20336782894025168, "grad_norm": 0.4921513503843826, "learning_rate": 1.936042602677251e-05, "loss": 0.5288, "step": 705 }, { "epoch": 0.20481015396819674, "grad_norm": 0.5421091687931597, "learning_rate": 1.934258561510138e-05, "loss": 0.5151, "step": 710 }, { "epoch": 0.20625247899614177, "grad_norm": 0.7576428493111558, "learning_rate": 1.932450824335832e-05, "loss": 0.477, "step": 715 }, { "epoch": 0.20769480402408683, "grad_norm": 0.424961853700426, "learning_rate": 1.9306194370047592e-05, "loss": 0.5342, "step": 720 }, { "epoch": 0.20913712905203188, "grad_norm": 0.49906945581307455, "learning_rate": 1.9287644459671948e-05, "loss": 0.5334, "step": 725 }, { "epoch": 0.2105794540799769, "grad_norm": 0.46177937508565325, "learning_rate": 1.926885898272085e-05, "loss": 0.4989, "step": 730 }, { "epoch": 0.21202177910792197, "grad_norm": 0.4920606306275181, "learning_rate": 1.9249838415658543e-05, "loss": 0.5448, "step": 735 }, { "epoch": 0.21346410413586703, "grad_norm": 0.4191101613829332, "learning_rate": 1.9230583240911954e-05, "loss": 0.4694, "step": 740 }, { "epoch": 0.21490642916381206, "grad_norm": 0.48817506876963557, "learning_rate": 1.9211093946858484e-05, "loss": 0.5173, "step": 745 }, { "epoch": 0.21634875419175711, "grad_norm": 0.5126984233381934, "learning_rate": 1.919137102781359e-05, "loss": 0.5074, "step": 750 }, { "epoch": 0.21779107921970217, "grad_norm": 0.5334260917924061, "learning_rate": 1.9171414984018266e-05, "loss": 0.4917, "step": 755 }, { "epoch": 0.2192334042476472, "grad_norm": 0.5501541841297073, "learning_rate": 1.915122632162635e-05, "loss": 0.5152, "step": 760 }, { "epoch": 0.22067572927559226, "grad_norm": 0.4359723210170646, "learning_rate": 1.913080555269169e-05, "loss": 0.5215, "step": 765 }, { "epoch": 0.22211805430353732, "grad_norm": 0.5662077360043514, "learning_rate": 1.911015319515515e-05, "loss": 0.5253, "step": 770 }, { "epoch": 0.22356037933148235, "grad_norm": 0.4764077159702808, "learning_rate": 1.908926977283148e-05, "loss": 0.5066, "step": 775 }, { "epoch": 0.2250027043594274, "grad_norm": 0.5639009005172965, "learning_rate": 1.9068155815396018e-05, "loss": 0.474, "step": 780 }, { "epoch": 0.22644502938737243, "grad_norm": 0.6776509031874417, "learning_rate": 1.904681185837128e-05, "loss": 0.5025, "step": 785 }, { "epoch": 0.2278873544153175, "grad_norm": 0.3940863617407268, "learning_rate": 1.9025238443113346e-05, "loss": 0.4781, "step": 790 }, { "epoch": 0.22932967944326255, "grad_norm": 0.5731371374463607, "learning_rate": 1.9003436116798156e-05, "loss": 0.5325, "step": 795 }, { "epoch": 0.23077200447120758, "grad_norm": 0.44630504407580995, "learning_rate": 1.898140543240762e-05, "loss": 0.5094, "step": 800 }, { "epoch": 0.23221432949915263, "grad_norm": 0.5013841323056458, "learning_rate": 1.8959146948715582e-05, "loss": 0.5123, "step": 805 }, { "epoch": 0.2336566545270977, "grad_norm": 0.6517172353158069, "learning_rate": 1.8936661230273677e-05, "loss": 0.4944, "step": 810 }, { "epoch": 0.23509897955504272, "grad_norm": 0.5321704297258375, "learning_rate": 1.8913948847396978e-05, "loss": 0.5111, "step": 815 }, { "epoch": 0.23654130458298778, "grad_norm": 0.5733385459091142, "learning_rate": 1.8891010376149554e-05, "loss": 0.5255, "step": 820 }, { "epoch": 0.23798362961093283, "grad_norm": 0.6439828549708082, "learning_rate": 1.8867846398329856e-05, "loss": 0.5224, "step": 825 }, { "epoch": 0.23942595463887786, "grad_norm": 0.526933741666615, "learning_rate": 1.884445750145595e-05, "loss": 0.4987, "step": 830 }, { "epoch": 0.24086827966682292, "grad_norm": 0.4358091890203275, "learning_rate": 1.882084427875062e-05, "loss": 0.5151, "step": 835 }, { "epoch": 0.24231060469476798, "grad_norm": 0.42052312366605993, "learning_rate": 1.8797007329126336e-05, "loss": 0.5292, "step": 840 }, { "epoch": 0.243752929722713, "grad_norm": 0.5162254671712243, "learning_rate": 1.8772947257170034e-05, "loss": 0.4701, "step": 845 }, { "epoch": 0.24519525475065806, "grad_norm": 0.41421320556868774, "learning_rate": 1.8748664673127814e-05, "loss": 0.4869, "step": 850 }, { "epoch": 0.2466375797786031, "grad_norm": 0.44489422959937447, "learning_rate": 1.872416019288944e-05, "loss": 0.5107, "step": 855 }, { "epoch": 0.24807990480654815, "grad_norm": 0.5131502882549939, "learning_rate": 1.8699434437972726e-05, "loss": 0.5002, "step": 860 }, { "epoch": 0.2495222298344932, "grad_norm": 0.4410628046298298, "learning_rate": 1.8674488035507776e-05, "loss": 0.5033, "step": 865 }, { "epoch": 0.25096455486243824, "grad_norm": 0.424822720640458, "learning_rate": 1.864932161822107e-05, "loss": 0.459, "step": 870 }, { "epoch": 0.2524068798903833, "grad_norm": 0.546763650924181, "learning_rate": 1.8623935824419416e-05, "loss": 0.4782, "step": 875 }, { "epoch": 0.25384920491832835, "grad_norm": 0.571446149303962, "learning_rate": 1.859833129797378e-05, "loss": 0.4971, "step": 880 }, { "epoch": 0.2552915299462734, "grad_norm": 0.3881051890411508, "learning_rate": 1.857250868830292e-05, "loss": 0.4645, "step": 885 }, { "epoch": 0.25673385497421847, "grad_norm": 0.4365270093969844, "learning_rate": 1.8546468650356947e-05, "loss": 0.4999, "step": 890 }, { "epoch": 0.25817618000216347, "grad_norm": 0.39922925876114046, "learning_rate": 1.852021184460069e-05, "loss": 0.4607, "step": 895 }, { "epoch": 0.2596185050301085, "grad_norm": 0.4385372209974039, "learning_rate": 1.849373893699697e-05, "loss": 0.5032, "step": 900 }, { "epoch": 0.2610608300580536, "grad_norm": 0.4289486219739114, "learning_rate": 1.8467050598989677e-05, "loss": 0.5003, "step": 905 }, { "epoch": 0.26250315508599864, "grad_norm": 0.4045886984758963, "learning_rate": 1.8440147507486765e-05, "loss": 0.4644, "step": 910 }, { "epoch": 0.2639454801139437, "grad_norm": 0.43637212820672877, "learning_rate": 1.8413030344843064e-05, "loss": 0.5057, "step": 915 }, { "epoch": 0.2653878051418887, "grad_norm": 0.468355616591299, "learning_rate": 1.838569979884301e-05, "loss": 0.4967, "step": 920 }, { "epoch": 0.26683013016983376, "grad_norm": 0.4257178939942325, "learning_rate": 1.835815656268314e-05, "loss": 0.4848, "step": 925 }, { "epoch": 0.2682724551977788, "grad_norm": 0.6504232751090008, "learning_rate": 1.8330401334954567e-05, "loss": 0.4958, "step": 930 }, { "epoch": 0.26971478022572387, "grad_norm": 0.4492644770064815, "learning_rate": 1.8302434819625234e-05, "loss": 0.4868, "step": 935 }, { "epoch": 0.27115710525366893, "grad_norm": 0.37095796426726924, "learning_rate": 1.8274257726022054e-05, "loss": 0.4472, "step": 940 }, { "epoch": 0.272599430281614, "grad_norm": 0.4070852473871566, "learning_rate": 1.824587076881294e-05, "loss": 0.4686, "step": 945 }, { "epoch": 0.274041755309559, "grad_norm": 0.44023807834971757, "learning_rate": 1.821727466798867e-05, "loss": 0.471, "step": 950 }, { "epoch": 0.27548408033750404, "grad_norm": 0.5209872184391927, "learning_rate": 1.8188470148844602e-05, "loss": 0.4962, "step": 955 }, { "epoch": 0.2769264053654491, "grad_norm": 0.41685090109899176, "learning_rate": 1.8159457941962325e-05, "loss": 0.475, "step": 960 }, { "epoch": 0.27836873039339416, "grad_norm": 0.5171250899115861, "learning_rate": 1.8130238783191087e-05, "loss": 0.5163, "step": 965 }, { "epoch": 0.2798110554213392, "grad_norm": 0.47139497814149867, "learning_rate": 1.810081341362915e-05, "loss": 0.4641, "step": 970 }, { "epoch": 0.2812533804492843, "grad_norm": 0.3879518437836758, "learning_rate": 1.8071182579604986e-05, "loss": 0.4777, "step": 975 }, { "epoch": 0.2826957054772293, "grad_norm": 0.455341690737865, "learning_rate": 1.804134703265836e-05, "loss": 0.5271, "step": 980 }, { "epoch": 0.28413803050517433, "grad_norm": 0.39108612071221016, "learning_rate": 1.8011307529521255e-05, "loss": 0.4645, "step": 985 }, { "epoch": 0.2855803555331194, "grad_norm": 0.3865948965496386, "learning_rate": 1.7981064832098687e-05, "loss": 0.4578, "step": 990 }, { "epoch": 0.28702268056106445, "grad_norm": 0.40375523747783393, "learning_rate": 1.7950619707449374e-05, "loss": 0.4923, "step": 995 }, { "epoch": 0.2884650055890095, "grad_norm": 0.3376017909117174, "learning_rate": 1.7919972927766288e-05, "loss": 0.4658, "step": 1000 }, { "epoch": 0.2884650055890095, "eval_loss": 0.4833250343799591, "eval_runtime": 142.0125, "eval_samples_per_second": 12.682, "eval_steps_per_second": 3.176, "step": 1000 }, { "epoch": 0.2899073306169545, "grad_norm": 0.47138251586932034, "learning_rate": 1.7889125270357053e-05, "loss": 0.4851, "step": 1005 }, { "epoch": 0.29134965564489956, "grad_norm": 0.522686359505293, "learning_rate": 1.7858077517624265e-05, "loss": 0.4788, "step": 1010 }, { "epoch": 0.2927919806728446, "grad_norm": 0.6355398882354177, "learning_rate": 1.7826830457045608e-05, "loss": 0.4525, "step": 1015 }, { "epoch": 0.2942343057007897, "grad_norm": 0.44577505392395406, "learning_rate": 1.7795384881153896e-05, "loss": 0.4614, "step": 1020 }, { "epoch": 0.29567663072873474, "grad_norm": 0.454859759409631, "learning_rate": 1.7763741587516983e-05, "loss": 0.5021, "step": 1025 }, { "epoch": 0.2971189557566798, "grad_norm": 0.6161570485074761, "learning_rate": 1.7731901378717523e-05, "loss": 0.4903, "step": 1030 }, { "epoch": 0.2985612807846248, "grad_norm": 0.43940664169854093, "learning_rate": 1.769986506233261e-05, "loss": 0.4819, "step": 1035 }, { "epoch": 0.30000360581256985, "grad_norm": 0.4426640967510136, "learning_rate": 1.7667633450913307e-05, "loss": 0.4579, "step": 1040 }, { "epoch": 0.3014459308405149, "grad_norm": 0.5064920131450599, "learning_rate": 1.763520736196402e-05, "loss": 0.5066, "step": 1045 }, { "epoch": 0.30288825586845997, "grad_norm": 0.3628170152752897, "learning_rate": 1.7602587617921785e-05, "loss": 0.423, "step": 1050 }, { "epoch": 0.304330580896405, "grad_norm": 0.4756441564342862, "learning_rate": 1.7569775046135388e-05, "loss": 0.5278, "step": 1055 }, { "epoch": 0.30577290592435, "grad_norm": 0.40932967287449395, "learning_rate": 1.753677047884439e-05, "loss": 0.4565, "step": 1060 }, { "epoch": 0.3072152309522951, "grad_norm": 0.4148447936276441, "learning_rate": 1.7503574753158022e-05, "loss": 0.4819, "step": 1065 }, { "epoch": 0.30865755598024014, "grad_norm": 0.3868133979093347, "learning_rate": 1.747018871103395e-05, "loss": 0.4707, "step": 1070 }, { "epoch": 0.3100998810081852, "grad_norm": 0.39630255989567886, "learning_rate": 1.743661319925691e-05, "loss": 0.4387, "step": 1075 }, { "epoch": 0.31154220603613025, "grad_norm": 0.4233553435649959, "learning_rate": 1.7402849069417246e-05, "loss": 0.465, "step": 1080 }, { "epoch": 0.3129845310640753, "grad_norm": 0.37304393376464795, "learning_rate": 1.7368897177889307e-05, "loss": 0.4854, "step": 1085 }, { "epoch": 0.3144268560920203, "grad_norm": 0.41669096423193014, "learning_rate": 1.7334758385809715e-05, "loss": 0.4369, "step": 1090 }, { "epoch": 0.31586918111996537, "grad_norm": 0.3950040493214593, "learning_rate": 1.7300433559055533e-05, "loss": 0.4488, "step": 1095 }, { "epoch": 0.3173115061479104, "grad_norm": 0.4206456914262744, "learning_rate": 1.7265923568222315e-05, "loss": 0.4608, "step": 1100 }, { "epoch": 0.3187538311758555, "grad_norm": 0.5459001712618055, "learning_rate": 1.7231229288602e-05, "loss": 0.4419, "step": 1105 }, { "epoch": 0.32019615620380054, "grad_norm": 0.4002983479690819, "learning_rate": 1.7196351600160725e-05, "loss": 0.4575, "step": 1110 }, { "epoch": 0.3216384812317456, "grad_norm": 0.5400371185813517, "learning_rate": 1.716129138751651e-05, "loss": 0.4402, "step": 1115 }, { "epoch": 0.3230808062596906, "grad_norm": 0.4526337203461876, "learning_rate": 1.712604953991681e-05, "loss": 0.4923, "step": 1120 }, { "epoch": 0.32452313128763566, "grad_norm": 0.3924148895626424, "learning_rate": 1.709062695121597e-05, "loss": 0.4734, "step": 1125 }, { "epoch": 0.3259654563155807, "grad_norm": 0.45730078891879783, "learning_rate": 1.7055024519852554e-05, "loss": 0.4935, "step": 1130 }, { "epoch": 0.32740778134352577, "grad_norm": 0.41765126413107173, "learning_rate": 1.7019243148826547e-05, "loss": 0.4778, "step": 1135 }, { "epoch": 0.32885010637147083, "grad_norm": 0.48822731606676767, "learning_rate": 1.6983283745676464e-05, "loss": 0.4786, "step": 1140 }, { "epoch": 0.33029243139941583, "grad_norm": 0.47444702764857977, "learning_rate": 1.6947147222456318e-05, "loss": 0.4732, "step": 1145 }, { "epoch": 0.3317347564273609, "grad_norm": 0.36819652961308474, "learning_rate": 1.6910834495712504e-05, "loss": 0.49, "step": 1150 }, { "epoch": 0.33317708145530595, "grad_norm": 0.3963647053897705, "learning_rate": 1.6874346486460543e-05, "loss": 0.4599, "step": 1155 }, { "epoch": 0.334619406483251, "grad_norm": 0.3557684139157355, "learning_rate": 1.6837684120161723e-05, "loss": 0.4603, "step": 1160 }, { "epoch": 0.33606173151119606, "grad_norm": 0.42399774345522806, "learning_rate": 1.680084832669962e-05, "loss": 0.4322, "step": 1165 }, { "epoch": 0.3375040565391411, "grad_norm": 0.4013586249486658, "learning_rate": 1.6763840040356522e-05, "loss": 0.4398, "step": 1170 }, { "epoch": 0.3389463815670861, "grad_norm": 0.44604773948712173, "learning_rate": 1.6726660199789733e-05, "loss": 0.4265, "step": 1175 }, { "epoch": 0.3403887065950312, "grad_norm": 0.39551679284847074, "learning_rate": 1.6689309748007753e-05, "loss": 0.4418, "step": 1180 }, { "epoch": 0.34183103162297623, "grad_norm": 0.451264115692116, "learning_rate": 1.6651789632346377e-05, "loss": 0.4483, "step": 1185 }, { "epoch": 0.3432733566509213, "grad_norm": 0.4689614820007113, "learning_rate": 1.6614100804444657e-05, "loss": 0.467, "step": 1190 }, { "epoch": 0.34471568167886635, "grad_norm": 0.3841720473679624, "learning_rate": 1.6576244220220763e-05, "loss": 0.4313, "step": 1195 }, { "epoch": 0.3461580067068114, "grad_norm": 0.4091561009628973, "learning_rate": 1.6538220839847745e-05, "loss": 0.434, "step": 1200 }, { "epoch": 0.3476003317347564, "grad_norm": 0.4473483816905544, "learning_rate": 1.6500031627729178e-05, "loss": 0.4446, "step": 1205 }, { "epoch": 0.34904265676270146, "grad_norm": 0.4800983187244669, "learning_rate": 1.6461677552474698e-05, "loss": 0.4691, "step": 1210 }, { "epoch": 0.3504849817906465, "grad_norm": 0.388554374886088, "learning_rate": 1.642315958687543e-05, "loss": 0.4517, "step": 1215 }, { "epoch": 0.3519273068185916, "grad_norm": 0.4804591032499286, "learning_rate": 1.6384478707879337e-05, "loss": 0.4736, "step": 1220 }, { "epoch": 0.35336963184653664, "grad_norm": 0.4242345257393015, "learning_rate": 1.6345635896566415e-05, "loss": 0.4453, "step": 1225 }, { "epoch": 0.35481195687448164, "grad_norm": 0.5125929278365619, "learning_rate": 1.6306632138123814e-05, "loss": 0.4894, "step": 1230 }, { "epoch": 0.3562542819024267, "grad_norm": 0.4135575305051168, "learning_rate": 1.626746842182087e-05, "loss": 0.4516, "step": 1235 }, { "epoch": 0.35769660693037175, "grad_norm": 0.49733207897305337, "learning_rate": 1.6228145740983986e-05, "loss": 0.4676, "step": 1240 }, { "epoch": 0.3591389319583168, "grad_norm": 0.405324125927312, "learning_rate": 1.618866509297147e-05, "loss": 0.4539, "step": 1245 }, { "epoch": 0.36058125698626187, "grad_norm": 0.43290260214899146, "learning_rate": 1.61490274791482e-05, "loss": 0.43, "step": 1250 }, { "epoch": 0.3620235820142069, "grad_norm": 0.3648124960837181, "learning_rate": 1.6109233904860258e-05, "loss": 0.4516, "step": 1255 }, { "epoch": 0.3634659070421519, "grad_norm": 0.43358315460862995, "learning_rate": 1.606928537940942e-05, "loss": 0.4565, "step": 1260 }, { "epoch": 0.364908232070097, "grad_norm": 0.5070316730676355, "learning_rate": 1.602918291602755e-05, "loss": 0.4547, "step": 1265 }, { "epoch": 0.36635055709804204, "grad_norm": 0.4556281361017855, "learning_rate": 1.5988927531850913e-05, "loss": 0.4631, "step": 1270 }, { "epoch": 0.3677928821259871, "grad_norm": 0.4210598158384229, "learning_rate": 1.5948520247894363e-05, "loss": 0.4595, "step": 1275 }, { "epoch": 0.36923520715393215, "grad_norm": 0.4325982920205171, "learning_rate": 1.590796208902546e-05, "loss": 0.4698, "step": 1280 }, { "epoch": 0.3706775321818772, "grad_norm": 0.4263624320016057, "learning_rate": 1.5867254083938472e-05, "loss": 0.4371, "step": 1285 }, { "epoch": 0.3721198572098222, "grad_norm": 0.4792938379196713, "learning_rate": 1.582639726512828e-05, "loss": 0.4464, "step": 1290 }, { "epoch": 0.37356218223776727, "grad_norm": 0.43544663382731996, "learning_rate": 1.5785392668864186e-05, "loss": 0.4658, "step": 1295 }, { "epoch": 0.37500450726571233, "grad_norm": 0.38089232775082726, "learning_rate": 1.5744241335163642e-05, "loss": 0.4492, "step": 1300 }, { "epoch": 0.3764468322936574, "grad_norm": 0.3692067776356917, "learning_rate": 1.570294430776587e-05, "loss": 0.4402, "step": 1305 }, { "epoch": 0.37788915732160244, "grad_norm": 0.43939772643420716, "learning_rate": 1.5661502634105376e-05, "loss": 0.4413, "step": 1310 }, { "epoch": 0.37933148234954744, "grad_norm": 0.39362265905546057, "learning_rate": 1.5619917365285394e-05, "loss": 0.4314, "step": 1315 }, { "epoch": 0.3807738073774925, "grad_norm": 0.41565735116305985, "learning_rate": 1.557818955605123e-05, "loss": 0.4564, "step": 1320 }, { "epoch": 0.38221613240543756, "grad_norm": 0.3633587329212366, "learning_rate": 1.55363202647635e-05, "loss": 0.4568, "step": 1325 }, { "epoch": 0.3836584574333826, "grad_norm": 0.43886686943718484, "learning_rate": 1.5494310553371292e-05, "loss": 0.4408, "step": 1330 }, { "epoch": 0.3851007824613277, "grad_norm": 0.44313421551297705, "learning_rate": 1.545216148738523e-05, "loss": 0.4728, "step": 1335 }, { "epoch": 0.38654310748927273, "grad_norm": 0.43446763871019, "learning_rate": 1.5409874135850453e-05, "loss": 0.4413, "step": 1340 }, { "epoch": 0.38798543251721773, "grad_norm": 0.5046802087731463, "learning_rate": 1.5367449571319486e-05, "loss": 0.451, "step": 1345 }, { "epoch": 0.3894277575451628, "grad_norm": 0.4176799699807321, "learning_rate": 1.5324888869825062e-05, "loss": 0.4575, "step": 1350 }, { "epoch": 0.39087008257310785, "grad_norm": 0.4357723650429465, "learning_rate": 1.5282193110852806e-05, "loss": 0.4628, "step": 1355 }, { "epoch": 0.3923124076010529, "grad_norm": 0.47847755269517595, "learning_rate": 1.5239363377313864e-05, "loss": 0.4426, "step": 1360 }, { "epoch": 0.39375473262899796, "grad_norm": 0.42951183292967315, "learning_rate": 1.5196400755517445e-05, "loss": 0.4173, "step": 1365 }, { "epoch": 0.395197057656943, "grad_norm": 0.3712834304196652, "learning_rate": 1.5153306335143247e-05, "loss": 0.4185, "step": 1370 }, { "epoch": 0.396639382684888, "grad_norm": 0.40028893775485, "learning_rate": 1.5110081209213849e-05, "loss": 0.4404, "step": 1375 }, { "epoch": 0.3980817077128331, "grad_norm": 0.3524439650077371, "learning_rate": 1.5066726474066962e-05, "loss": 0.436, "step": 1380 }, { "epoch": 0.39952403274077813, "grad_norm": 0.41796871469443936, "learning_rate": 1.5023243229327631e-05, "loss": 0.4465, "step": 1385 }, { "epoch": 0.4009663577687232, "grad_norm": 0.39648024648913516, "learning_rate": 1.4979632577880355e-05, "loss": 0.4599, "step": 1390 }, { "epoch": 0.40240868279666825, "grad_norm": 0.4177593581987727, "learning_rate": 1.4935895625841095e-05, "loss": 0.4341, "step": 1395 }, { "epoch": 0.40385100782461325, "grad_norm": 0.39474357091689116, "learning_rate": 1.4892033482529233e-05, "loss": 0.4251, "step": 1400 }, { "epoch": 0.4052933328525583, "grad_norm": 0.3925865645135851, "learning_rate": 1.484804726043943e-05, "loss": 0.4188, "step": 1405 }, { "epoch": 0.40673565788050337, "grad_norm": 0.43881341912306815, "learning_rate": 1.480393807521342e-05, "loss": 0.4626, "step": 1410 }, { "epoch": 0.4081779829084484, "grad_norm": 0.38784235208087897, "learning_rate": 1.4759707045611694e-05, "loss": 0.4356, "step": 1415 }, { "epoch": 0.4096203079363935, "grad_norm": 0.4652349082201273, "learning_rate": 1.4715355293485134e-05, "loss": 0.4429, "step": 1420 }, { "epoch": 0.41106263296433854, "grad_norm": 0.5020179396910893, "learning_rate": 1.4670883943746575e-05, "loss": 0.4424, "step": 1425 }, { "epoch": 0.41250495799228354, "grad_norm": 0.46646941577755224, "learning_rate": 1.4626294124342237e-05, "loss": 0.4473, "step": 1430 }, { "epoch": 0.4139472830202286, "grad_norm": 0.3715580720003536, "learning_rate": 1.4581586966223156e-05, "loss": 0.457, "step": 1435 }, { "epoch": 0.41538960804817365, "grad_norm": 0.3913149158851186, "learning_rate": 1.453676360331647e-05, "loss": 0.4232, "step": 1440 }, { "epoch": 0.4168319330761187, "grad_norm": 0.3755928140913827, "learning_rate": 1.4491825172496675e-05, "loss": 0.4376, "step": 1445 }, { "epoch": 0.41827425810406377, "grad_norm": 0.4632236851893659, "learning_rate": 1.4446772813556784e-05, "loss": 0.4547, "step": 1450 }, { "epoch": 0.4197165831320088, "grad_norm": 0.3622221987812085, "learning_rate": 1.4401607669179415e-05, "loss": 0.4189, "step": 1455 }, { "epoch": 0.4211589081599538, "grad_norm": 0.4427510263617938, "learning_rate": 1.4356330884907823e-05, "loss": 0.4307, "step": 1460 }, { "epoch": 0.4226012331878989, "grad_norm": 0.40821656664051026, "learning_rate": 1.4310943609116815e-05, "loss": 0.4416, "step": 1465 }, { "epoch": 0.42404355821584394, "grad_norm": 0.45484460030870416, "learning_rate": 1.4265446992983661e-05, "loss": 0.449, "step": 1470 }, { "epoch": 0.425485883243789, "grad_norm": 0.38430976618751717, "learning_rate": 1.4219842190458865e-05, "loss": 0.4445, "step": 1475 }, { "epoch": 0.42692820827173406, "grad_norm": 0.40624625230940725, "learning_rate": 1.4174130358236924e-05, "loss": 0.4734, "step": 1480 }, { "epoch": 0.42837053329967906, "grad_norm": 0.38501281348072397, "learning_rate": 1.4128312655726957e-05, "loss": 0.4407, "step": 1485 }, { "epoch": 0.4298128583276241, "grad_norm": 0.5552503619067779, "learning_rate": 1.4082390245023337e-05, "loss": 0.4559, "step": 1490 }, { "epoch": 0.43125518335556917, "grad_norm": 0.41269951819834144, "learning_rate": 1.4036364290876176e-05, "loss": 0.4407, "step": 1495 }, { "epoch": 0.43269750838351423, "grad_norm": 0.4132538908060478, "learning_rate": 1.3990235960661824e-05, "loss": 0.4439, "step": 1500 }, { "epoch": 0.43269750838351423, "eval_loss": 0.43445292115211487, "eval_runtime": 142.5412, "eval_samples_per_second": 12.635, "eval_steps_per_second": 3.164, "step": 1500 }, { "epoch": 0.4341398334114593, "grad_norm": 0.42757706099004156, "learning_rate": 1.3944006424353229e-05, "loss": 0.4247, "step": 1505 }, { "epoch": 0.43558215843940434, "grad_norm": 0.36759037583277737, "learning_rate": 1.389767685449027e-05, "loss": 0.4306, "step": 1510 }, { "epoch": 0.43702448346734935, "grad_norm": 0.42042330760151675, "learning_rate": 1.3851248426150026e-05, "loss": 0.4244, "step": 1515 }, { "epoch": 0.4384668084952944, "grad_norm": 0.38414415773611094, "learning_rate": 1.380472231691697e-05, "loss": 0.4377, "step": 1520 }, { "epoch": 0.43990913352323946, "grad_norm": 0.4303765304251248, "learning_rate": 1.375809970685309e-05, "loss": 0.4574, "step": 1525 }, { "epoch": 0.4413514585511845, "grad_norm": 0.39045631524439356, "learning_rate": 1.3711381778467972e-05, "loss": 0.4487, "step": 1530 }, { "epoch": 0.4427937835791296, "grad_norm": 0.409923537347395, "learning_rate": 1.36645697166888e-05, "loss": 0.4155, "step": 1535 }, { "epoch": 0.44423610860707463, "grad_norm": 0.4590281734742793, "learning_rate": 1.3617664708830304e-05, "loss": 0.4211, "step": 1540 }, { "epoch": 0.44567843363501963, "grad_norm": 0.4340206380764746, "learning_rate": 1.3570667944564651e-05, "loss": 0.43, "step": 1545 }, { "epoch": 0.4471207586629647, "grad_norm": 0.3867702108735739, "learning_rate": 1.3523580615891258e-05, "loss": 0.4367, "step": 1550 }, { "epoch": 0.44856308369090975, "grad_norm": 0.45493644595260835, "learning_rate": 1.347640391710657e-05, "loss": 0.4336, "step": 1555 }, { "epoch": 0.4500054087188548, "grad_norm": 0.41557484865468924, "learning_rate": 1.3429139044773768e-05, "loss": 0.4128, "step": 1560 }, { "epoch": 0.45144773374679986, "grad_norm": 0.41564130897863455, "learning_rate": 1.3381787197692413e-05, "loss": 0.3957, "step": 1565 }, { "epoch": 0.45289005877474486, "grad_norm": 0.4011264197640641, "learning_rate": 1.3334349576868046e-05, "loss": 0.442, "step": 1570 }, { "epoch": 0.4543323838026899, "grad_norm": 0.4825855614290229, "learning_rate": 1.3286827385481726e-05, "loss": 0.4058, "step": 1575 }, { "epoch": 0.455774708830635, "grad_norm": 0.3921023793032671, "learning_rate": 1.3239221828859509e-05, "loss": 0.3884, "step": 1580 }, { "epoch": 0.45721703385858004, "grad_norm": 0.40627991293028837, "learning_rate": 1.3191534114441883e-05, "loss": 0.4333, "step": 1585 }, { "epoch": 0.4586593588865251, "grad_norm": 0.43891554498901797, "learning_rate": 1.3143765451753137e-05, "loss": 0.4166, "step": 1590 }, { "epoch": 0.46010168391447015, "grad_norm": 0.39830311047980305, "learning_rate": 1.3095917052370686e-05, "loss": 0.4235, "step": 1595 }, { "epoch": 0.46154400894241515, "grad_norm": 0.3980453207285396, "learning_rate": 1.3047990129894348e-05, "loss": 0.4001, "step": 1600 }, { "epoch": 0.4629863339703602, "grad_norm": 0.4136578166461488, "learning_rate": 1.299998589991555e-05, "loss": 0.4076, "step": 1605 }, { "epoch": 0.46442865899830527, "grad_norm": 0.4343208402620231, "learning_rate": 1.2951905579986506e-05, "loss": 0.4384, "step": 1610 }, { "epoch": 0.4658709840262503, "grad_norm": 0.45578762184210947, "learning_rate": 1.290375038958933e-05, "loss": 0.4048, "step": 1615 }, { "epoch": 0.4673133090541954, "grad_norm": 0.46943412662551365, "learning_rate": 1.285552155010511e-05, "loss": 0.401, "step": 1620 }, { "epoch": 0.46875563408214044, "grad_norm": 0.40848878753251544, "learning_rate": 1.2807220284782926e-05, "loss": 0.4461, "step": 1625 }, { "epoch": 0.47019795911008544, "grad_norm": 0.3921726292273481, "learning_rate": 1.2758847818708832e-05, "loss": 0.4205, "step": 1630 }, { "epoch": 0.4716402841380305, "grad_norm": 0.45781513572784016, "learning_rate": 1.2710405378774768e-05, "loss": 0.4423, "step": 1635 }, { "epoch": 0.47308260916597555, "grad_norm": 0.45862261759553535, "learning_rate": 1.2661894193647458e-05, "loss": 0.4, "step": 1640 }, { "epoch": 0.4745249341939206, "grad_norm": 0.3527899534786595, "learning_rate": 1.261331549373724e-05, "loss": 0.3998, "step": 1645 }, { "epoch": 0.47596725922186567, "grad_norm": 0.36297450328540837, "learning_rate": 1.2564670511166865e-05, "loss": 0.4206, "step": 1650 }, { "epoch": 0.47740958424981067, "grad_norm": 0.4030716124087903, "learning_rate": 1.2515960479740224e-05, "loss": 0.4047, "step": 1655 }, { "epoch": 0.4788519092777557, "grad_norm": 0.41175543047417906, "learning_rate": 1.246718663491108e-05, "loss": 0.4345, "step": 1660 }, { "epoch": 0.4802942343057008, "grad_norm": 0.3574092930784039, "learning_rate": 1.2418350213751728e-05, "loss": 0.4081, "step": 1665 }, { "epoch": 0.48173655933364584, "grad_norm": 0.3954039812545518, "learning_rate": 1.2369452454921604e-05, "loss": 0.4159, "step": 1670 }, { "epoch": 0.4831788843615909, "grad_norm": 0.4497181497561506, "learning_rate": 1.2320494598635886e-05, "loss": 0.4052, "step": 1675 }, { "epoch": 0.48462120938953596, "grad_norm": 0.44655082111096045, "learning_rate": 1.2271477886634023e-05, "loss": 0.4123, "step": 1680 }, { "epoch": 0.48606353441748096, "grad_norm": 0.40423139543908587, "learning_rate": 1.2222403562148252e-05, "loss": 0.4152, "step": 1685 }, { "epoch": 0.487505859445426, "grad_norm": 0.36806086858378434, "learning_rate": 1.2173272869872062e-05, "loss": 0.4252, "step": 1690 }, { "epoch": 0.4889481844733711, "grad_norm": 0.41722654899253564, "learning_rate": 1.2124087055928617e-05, "loss": 0.3879, "step": 1695 }, { "epoch": 0.49039050950131613, "grad_norm": 0.4329150355333478, "learning_rate": 1.207484736783916e-05, "loss": 0.3849, "step": 1700 }, { "epoch": 0.4918328345292612, "grad_norm": 0.4710085788902766, "learning_rate": 1.2025555054491367e-05, "loss": 0.4303, "step": 1705 }, { "epoch": 0.4932751595572062, "grad_norm": 0.443066548358196, "learning_rate": 1.1976211366107668e-05, "loss": 0.4198, "step": 1710 }, { "epoch": 0.49471748458515125, "grad_norm": 0.3338656609348242, "learning_rate": 1.1926817554213548e-05, "loss": 0.3911, "step": 1715 }, { "epoch": 0.4961598096130963, "grad_norm": 0.38270258610415053, "learning_rate": 1.1877374871605786e-05, "loss": 0.4068, "step": 1720 }, { "epoch": 0.49760213464104136, "grad_norm": 0.40504870451767916, "learning_rate": 1.18278845723207e-05, "loss": 0.4117, "step": 1725 }, { "epoch": 0.4990444596689864, "grad_norm": 0.4346348228563321, "learning_rate": 1.1778347911602329e-05, "loss": 0.4104, "step": 1730 }, { "epoch": 0.5004867846969314, "grad_norm": 0.4075021793881479, "learning_rate": 1.1728766145870587e-05, "loss": 0.4229, "step": 1735 }, { "epoch": 0.5019291097248765, "grad_norm": 0.418017099187981, "learning_rate": 1.167914053268942e-05, "loss": 0.407, "step": 1740 }, { "epoch": 0.5033714347528215, "grad_norm": 0.39895813955242926, "learning_rate": 1.1629472330734888e-05, "loss": 0.3978, "step": 1745 }, { "epoch": 0.5048137597807666, "grad_norm": 0.40383289208967305, "learning_rate": 1.1579762799763249e-05, "loss": 0.4175, "step": 1750 }, { "epoch": 0.5062560848087116, "grad_norm": 0.5225560587862472, "learning_rate": 1.1530013200579008e-05, "loss": 0.4131, "step": 1755 }, { "epoch": 0.5076984098366567, "grad_norm": 0.4004897787727647, "learning_rate": 1.1480224795002943e-05, "loss": 0.3888, "step": 1760 }, { "epoch": 0.5091407348646018, "grad_norm": 0.4248175503521806, "learning_rate": 1.1430398845840085e-05, "loss": 0.4324, "step": 1765 }, { "epoch": 0.5105830598925468, "grad_norm": 0.43829908182981264, "learning_rate": 1.1380536616847706e-05, "loss": 0.4079, "step": 1770 }, { "epoch": 0.5120253849204919, "grad_norm": 0.43570794658905476, "learning_rate": 1.1330639372703258e-05, "loss": 0.4045, "step": 1775 }, { "epoch": 0.5134677099484369, "grad_norm": 0.43500914045447153, "learning_rate": 1.12807083789723e-05, "loss": 0.419, "step": 1780 }, { "epoch": 0.5149100349763819, "grad_norm": 0.41351142363579385, "learning_rate": 1.123074490207639e-05, "loss": 0.3986, "step": 1785 }, { "epoch": 0.5163523600043269, "grad_norm": 0.37789765808010595, "learning_rate": 1.1180750209260972e-05, "loss": 0.4016, "step": 1790 }, { "epoch": 0.517794685032272, "grad_norm": 0.4013962679722207, "learning_rate": 1.1130725568563241e-05, "loss": 0.4081, "step": 1795 }, { "epoch": 0.519237010060217, "grad_norm": 0.38374761554210224, "learning_rate": 1.1080672248779964e-05, "loss": 0.4061, "step": 1800 }, { "epoch": 0.5206793350881621, "grad_norm": 0.44182386119487255, "learning_rate": 1.1030591519435316e-05, "loss": 0.3916, "step": 1805 }, { "epoch": 0.5221216601161072, "grad_norm": 0.44971294735945117, "learning_rate": 1.0980484650748666e-05, "loss": 0.3996, "step": 1810 }, { "epoch": 0.5235639851440522, "grad_norm": 0.35276497806950113, "learning_rate": 1.0930352913602371e-05, "loss": 0.3732, "step": 1815 }, { "epoch": 0.5250063101719973, "grad_norm": 0.42340138266599786, "learning_rate": 1.0880197579509532e-05, "loss": 0.4222, "step": 1820 }, { "epoch": 0.5264486351999423, "grad_norm": 0.39078797688993877, "learning_rate": 1.0830019920581753e-05, "loss": 0.4136, "step": 1825 }, { "epoch": 0.5278909602278874, "grad_norm": 0.4130289272161752, "learning_rate": 1.0779821209496876e-05, "loss": 0.4192, "step": 1830 }, { "epoch": 0.5293332852558325, "grad_norm": 0.41541974485384586, "learning_rate": 1.0729602719466692e-05, "loss": 0.4031, "step": 1835 }, { "epoch": 0.5307756102837774, "grad_norm": 0.44049659174573497, "learning_rate": 1.067936572420466e-05, "loss": 0.4069, "step": 1840 }, { "epoch": 0.5322179353117225, "grad_norm": 0.44056632399340595, "learning_rate": 1.0629111497893591e-05, "loss": 0.3964, "step": 1845 }, { "epoch": 0.5336602603396675, "grad_norm": 0.40575645379756525, "learning_rate": 1.0578841315153333e-05, "loss": 0.3953, "step": 1850 }, { "epoch": 0.5351025853676126, "grad_norm": 0.37056517023195357, "learning_rate": 1.0528556451008447e-05, "loss": 0.4058, "step": 1855 }, { "epoch": 0.5365449103955576, "grad_norm": 0.38961078802000476, "learning_rate": 1.0478258180855869e-05, "loss": 0.3783, "step": 1860 }, { "epoch": 0.5379872354235027, "grad_norm": 0.4278326171242378, "learning_rate": 1.0427947780432547e-05, "loss": 0.4025, "step": 1865 }, { "epoch": 0.5394295604514477, "grad_norm": 0.4487192036382051, "learning_rate": 1.0377626525783101e-05, "loss": 0.3933, "step": 1870 }, { "epoch": 0.5408718854793928, "grad_norm": 0.5348996401888022, "learning_rate": 1.0327295693227454e-05, "loss": 0.447, "step": 1875 }, { "epoch": 0.5423142105073379, "grad_norm": 0.527197311781129, "learning_rate": 1.0276956559328455e-05, "loss": 0.3949, "step": 1880 }, { "epoch": 0.5437565355352829, "grad_norm": 0.41151058505508553, "learning_rate": 1.0226610400859498e-05, "loss": 0.4051, "step": 1885 }, { "epoch": 0.545198860563228, "grad_norm": 0.37166405264306773, "learning_rate": 1.0176258494772153e-05, "loss": 0.3991, "step": 1890 }, { "epoch": 0.5466411855911729, "grad_norm": 0.4167614980577364, "learning_rate": 1.0125902118163762e-05, "loss": 0.4086, "step": 1895 }, { "epoch": 0.548083510619118, "grad_norm": 0.4002106455641225, "learning_rate": 1.007554254824506e-05, "loss": 0.4006, "step": 1900 }, { "epoch": 0.549525835647063, "grad_norm": 0.38648887792017217, "learning_rate": 1.0025181062307774e-05, "loss": 0.4009, "step": 1905 }, { "epoch": 0.5509681606750081, "grad_norm": 0.4402653770907521, "learning_rate": 9.974818937692228e-06, "loss": 0.3909, "step": 1910 }, { "epoch": 0.5524104857029531, "grad_norm": 0.39402192655503426, "learning_rate": 9.92445745175494e-06, "loss": 0.3793, "step": 1915 }, { "epoch": 0.5538528107308982, "grad_norm": 0.36447042674734037, "learning_rate": 9.874097881836241e-06, "loss": 0.3856, "step": 1920 }, { "epoch": 0.5552951357588433, "grad_norm": 0.38084863196798785, "learning_rate": 9.823741505227852e-06, "loss": 0.3821, "step": 1925 }, { "epoch": 0.5567374607867883, "grad_norm": 0.3689396281200298, "learning_rate": 9.773389599140504e-06, "loss": 0.3888, "step": 1930 }, { "epoch": 0.5581797858147334, "grad_norm": 0.42447241183482853, "learning_rate": 9.72304344067155e-06, "loss": 0.4018, "step": 1935 }, { "epoch": 0.5596221108426784, "grad_norm": 0.34840166562757835, "learning_rate": 9.672704306772547e-06, "loss": 0.381, "step": 1940 }, { "epoch": 0.5610644358706235, "grad_norm": 0.3824007554962182, "learning_rate": 9.6223734742169e-06, "loss": 0.405, "step": 1945 }, { "epoch": 0.5625067608985685, "grad_norm": 0.40567921647837246, "learning_rate": 9.572052219567455e-06, "loss": 0.3886, "step": 1950 }, { "epoch": 0.5639490859265135, "grad_norm": 0.4496361442646002, "learning_rate": 9.521741819144135e-06, "loss": 0.3926, "step": 1955 }, { "epoch": 0.5653914109544586, "grad_norm": 0.3771274201963948, "learning_rate": 9.471443548991557e-06, "loss": 0.4009, "step": 1960 }, { "epoch": 0.5668337359824036, "grad_norm": 0.3832741322922619, "learning_rate": 9.421158684846669e-06, "loss": 0.3926, "step": 1965 }, { "epoch": 0.5682760610103487, "grad_norm": 0.41676932794244004, "learning_rate": 9.370888502106414e-06, "loss": 0.4194, "step": 1970 }, { "epoch": 0.5697183860382937, "grad_norm": 0.4465176481054024, "learning_rate": 9.320634275795342e-06, "loss": 0.3885, "step": 1975 }, { "epoch": 0.5711607110662388, "grad_norm": 0.41454265589275485, "learning_rate": 9.270397280533311e-06, "loss": 0.4041, "step": 1980 }, { "epoch": 0.5726030360941838, "grad_norm": 0.37529076026198815, "learning_rate": 9.220178790503125e-06, "loss": 0.3784, "step": 1985 }, { "epoch": 0.5740453611221289, "grad_norm": 0.4006407856625201, "learning_rate": 9.169980079418248e-06, "loss": 0.3742, "step": 1990 }, { "epoch": 0.575487686150074, "grad_norm": 0.4075785016746068, "learning_rate": 9.119802420490473e-06, "loss": 0.4184, "step": 1995 }, { "epoch": 0.576930011178019, "grad_norm": 0.3892341916180056, "learning_rate": 9.06964708639763e-06, "loss": 0.3865, "step": 2000 }, { "epoch": 0.576930011178019, "eval_loss": 0.3948507606983185, "eval_runtime": 142.1685, "eval_samples_per_second": 12.668, "eval_steps_per_second": 3.172, "step": 2000 }, { "epoch": 0.5783723362059641, "grad_norm": 0.4476758638692534, "learning_rate": 9.019515349251337e-06, "loss": 0.4076, "step": 2005 }, { "epoch": 0.579814661233909, "grad_norm": 0.38084358148704506, "learning_rate": 8.969408480564684e-06, "loss": 0.3951, "step": 2010 }, { "epoch": 0.5812569862618541, "grad_norm": 0.3946160859854508, "learning_rate": 8.919327751220038e-06, "loss": 0.3737, "step": 2015 }, { "epoch": 0.5826993112897991, "grad_norm": 0.4376591903476801, "learning_rate": 8.86927443143676e-06, "loss": 0.3993, "step": 2020 }, { "epoch": 0.5841416363177442, "grad_norm": 0.4220093736158996, "learning_rate": 8.819249790739033e-06, "loss": 0.3896, "step": 2025 }, { "epoch": 0.5855839613456892, "grad_norm": 0.37781362600911217, "learning_rate": 8.769255097923617e-06, "loss": 0.358, "step": 2030 }, { "epoch": 0.5870262863736343, "grad_norm": 0.37752543573320735, "learning_rate": 8.719291621027703e-06, "loss": 0.4016, "step": 2035 }, { "epoch": 0.5884686114015794, "grad_norm": 0.4195162100656966, "learning_rate": 8.669360627296745e-06, "loss": 0.3755, "step": 2040 }, { "epoch": 0.5899109364295244, "grad_norm": 0.40866907101120203, "learning_rate": 8.619463383152296e-06, "loss": 0.3964, "step": 2045 }, { "epoch": 0.5913532614574695, "grad_norm": 0.4194072279329464, "learning_rate": 8.56960115415992e-06, "loss": 0.3853, "step": 2050 }, { "epoch": 0.5927955864854145, "grad_norm": 0.503872591140977, "learning_rate": 8.519775204997063e-06, "loss": 0.4161, "step": 2055 }, { "epoch": 0.5942379115133596, "grad_norm": 0.4656959686074043, "learning_rate": 8.469986799420993e-06, "loss": 0.4207, "step": 2060 }, { "epoch": 0.5956802365413045, "grad_norm": 0.4068362162842934, "learning_rate": 8.420237200236753e-06, "loss": 0.3717, "step": 2065 }, { "epoch": 0.5971225615692496, "grad_norm": 0.4469993385978865, "learning_rate": 8.370527669265114e-06, "loss": 0.4039, "step": 2070 }, { "epoch": 0.5985648865971946, "grad_norm": 0.43643202324029334, "learning_rate": 8.320859467310582e-06, "loss": 0.3749, "step": 2075 }, { "epoch": 0.6000072116251397, "grad_norm": 0.5297689595825736, "learning_rate": 8.271233854129413e-06, "loss": 0.376, "step": 2080 }, { "epoch": 0.6014495366530848, "grad_norm": 0.489056954944045, "learning_rate": 8.221652088397675e-06, "loss": 0.3933, "step": 2085 }, { "epoch": 0.6028918616810298, "grad_norm": 0.37378771704976776, "learning_rate": 8.172115427679304e-06, "loss": 0.3945, "step": 2090 }, { "epoch": 0.6043341867089749, "grad_norm": 0.4235226777306445, "learning_rate": 8.122625128394216e-06, "loss": 0.3826, "step": 2095 }, { "epoch": 0.6057765117369199, "grad_norm": 0.4021066843708137, "learning_rate": 8.073182445786455e-06, "loss": 0.3642, "step": 2100 }, { "epoch": 0.607218836764865, "grad_norm": 0.3735730097404964, "learning_rate": 8.023788633892334e-06, "loss": 0.3725, "step": 2105 }, { "epoch": 0.60866116179281, "grad_norm": 0.42115686535849983, "learning_rate": 7.974444945508637e-06, "loss": 0.3876, "step": 2110 }, { "epoch": 0.6101034868207551, "grad_norm": 0.42268328106794184, "learning_rate": 7.925152632160841e-06, "loss": 0.4042, "step": 2115 }, { "epoch": 0.6115458118487, "grad_norm": 0.4303350707681742, "learning_rate": 7.875912944071386e-06, "loss": 0.3718, "step": 2120 }, { "epoch": 0.6129881368766451, "grad_norm": 0.41179372110756424, "learning_rate": 7.826727130127942e-06, "loss": 0.3844, "step": 2125 }, { "epoch": 0.6144304619045902, "grad_norm": 0.3763060638976918, "learning_rate": 7.77759643785175e-06, "loss": 0.378, "step": 2130 }, { "epoch": 0.6158727869325352, "grad_norm": 0.40647467863126857, "learning_rate": 7.72852211336598e-06, "loss": 0.3633, "step": 2135 }, { "epoch": 0.6173151119604803, "grad_norm": 0.4427513530880047, "learning_rate": 7.679505401364116e-06, "loss": 0.3728, "step": 2140 }, { "epoch": 0.6187574369884253, "grad_norm": 0.40218277177425543, "learning_rate": 7.630547545078398e-06, "loss": 0.3936, "step": 2145 }, { "epoch": 0.6201997620163704, "grad_norm": 0.40266373448906506, "learning_rate": 7.581649786248276e-06, "loss": 0.3956, "step": 2150 }, { "epoch": 0.6216420870443155, "grad_norm": 0.4101360200980578, "learning_rate": 7.532813365088921e-06, "loss": 0.3935, "step": 2155 }, { "epoch": 0.6230844120722605, "grad_norm": 0.4360450388421823, "learning_rate": 7.484039520259781e-06, "loss": 0.393, "step": 2160 }, { "epoch": 0.6245267371002056, "grad_norm": 0.3984091507351705, "learning_rate": 7.435329488833137e-06, "loss": 0.3857, "step": 2165 }, { "epoch": 0.6259690621281506, "grad_norm": 0.4057039326760462, "learning_rate": 7.38668450626276e-06, "loss": 0.4013, "step": 2170 }, { "epoch": 0.6274113871560957, "grad_norm": 0.39301356289008293, "learning_rate": 7.338105806352542e-06, "loss": 0.3613, "step": 2175 }, { "epoch": 0.6288537121840406, "grad_norm": 0.4031222004525292, "learning_rate": 7.289594621225236e-06, "loss": 0.3775, "step": 2180 }, { "epoch": 0.6302960372119857, "grad_norm": 0.42389618462152223, "learning_rate": 7.241152181291173e-06, "loss": 0.3842, "step": 2185 }, { "epoch": 0.6317383622399307, "grad_norm": 0.4222447939654566, "learning_rate": 7.192779715217075e-06, "loss": 0.3747, "step": 2190 }, { "epoch": 0.6331806872678758, "grad_norm": 0.3616433078805121, "learning_rate": 7.144478449894894e-06, "loss": 0.3619, "step": 2195 }, { "epoch": 0.6346230122958209, "grad_norm": 0.40315108612725287, "learning_rate": 7.096249610410671e-06, "loss": 0.383, "step": 2200 }, { "epoch": 0.6360653373237659, "grad_norm": 0.39550949033278987, "learning_rate": 7.0480944200134975e-06, "loss": 0.3993, "step": 2205 }, { "epoch": 0.637507662351711, "grad_norm": 0.4061605042450912, "learning_rate": 7.00001410008445e-06, "loss": 0.3667, "step": 2210 }, { "epoch": 0.638949987379656, "grad_norm": 0.399669288075527, "learning_rate": 6.952009870105654e-06, "loss": 0.387, "step": 2215 }, { "epoch": 0.6403923124076011, "grad_norm": 0.4188823149502449, "learning_rate": 6.904082947629317e-06, "loss": 0.3814, "step": 2220 }, { "epoch": 0.6418346374355461, "grad_norm": 0.3729926900968089, "learning_rate": 6.856234548246866e-06, "loss": 0.3647, "step": 2225 }, { "epoch": 0.6432769624634912, "grad_norm": 0.3995200969127714, "learning_rate": 6.808465885558122e-06, "loss": 0.3778, "step": 2230 }, { "epoch": 0.6447192874914361, "grad_norm": 0.4182365028017815, "learning_rate": 6.760778171140492e-06, "loss": 0.4071, "step": 2235 }, { "epoch": 0.6461616125193812, "grad_norm": 0.419641094415173, "learning_rate": 6.713172614518278e-06, "loss": 0.3838, "step": 2240 }, { "epoch": 0.6476039375473263, "grad_norm": 0.455639932664125, "learning_rate": 6.665650423131953e-06, "loss": 0.3864, "step": 2245 }, { "epoch": 0.6490462625752713, "grad_norm": 0.42278667120966895, "learning_rate": 6.618212802307589e-06, "loss": 0.396, "step": 2250 }, { "epoch": 0.6504885876032164, "grad_norm": 0.44585454789944867, "learning_rate": 6.570860955226234e-06, "loss": 0.3811, "step": 2255 }, { "epoch": 0.6519309126311614, "grad_norm": 0.3966025625438823, "learning_rate": 6.5235960828934305e-06, "loss": 0.3732, "step": 2260 }, { "epoch": 0.6533732376591065, "grad_norm": 0.40489868259557904, "learning_rate": 6.476419384108745e-06, "loss": 0.3567, "step": 2265 }, { "epoch": 0.6548155626870515, "grad_norm": 0.39366736678335024, "learning_rate": 6.429332055435349e-06, "loss": 0.3623, "step": 2270 }, { "epoch": 0.6562578877149966, "grad_norm": 0.42529750592620424, "learning_rate": 6.382335291169698e-06, "loss": 0.3676, "step": 2275 }, { "epoch": 0.6577002127429417, "grad_norm": 0.44036040562921713, "learning_rate": 6.335430283311206e-06, "loss": 0.3889, "step": 2280 }, { "epoch": 0.6591425377708867, "grad_norm": 0.3787593063841428, "learning_rate": 6.288618221532031e-06, "loss": 0.386, "step": 2285 }, { "epoch": 0.6605848627988317, "grad_norm": 0.4169592811397764, "learning_rate": 6.241900293146915e-06, "loss": 0.3752, "step": 2290 }, { "epoch": 0.6620271878267767, "grad_norm": 0.4047539500558757, "learning_rate": 6.195277683083033e-06, "loss": 0.3658, "step": 2295 }, { "epoch": 0.6634695128547218, "grad_norm": 0.3845249122797127, "learning_rate": 6.148751573849976e-06, "loss": 0.3563, "step": 2300 }, { "epoch": 0.6649118378826668, "grad_norm": 0.4633041975142693, "learning_rate": 6.102323145509732e-06, "loss": 0.3852, "step": 2305 }, { "epoch": 0.6663541629106119, "grad_norm": 0.3985148240515743, "learning_rate": 6.055993575646775e-06, "loss": 0.3915, "step": 2310 }, { "epoch": 0.667796487938557, "grad_norm": 0.40716397694215495, "learning_rate": 6.00976403933818e-06, "loss": 0.3605, "step": 2315 }, { "epoch": 0.669238812966502, "grad_norm": 0.38795576025941675, "learning_rate": 5.963635709123825e-06, "loss": 0.37, "step": 2320 }, { "epoch": 0.6706811379944471, "grad_norm": 0.4110632294347015, "learning_rate": 5.91760975497667e-06, "loss": 0.3853, "step": 2325 }, { "epoch": 0.6721234630223921, "grad_norm": 0.3969166036791085, "learning_rate": 5.871687344273045e-06, "loss": 0.3672, "step": 2330 }, { "epoch": 0.6735657880503372, "grad_norm": 0.41207993758304634, "learning_rate": 5.8258696417630825e-06, "loss": 0.3547, "step": 2335 }, { "epoch": 0.6750081130782822, "grad_norm": 0.3680867654775724, "learning_rate": 5.780157809541134e-06, "loss": 0.3625, "step": 2340 }, { "epoch": 0.6764504381062273, "grad_norm": 0.4267438085961488, "learning_rate": 5.734553007016345e-06, "loss": 0.3999, "step": 2345 }, { "epoch": 0.6778927631341722, "grad_norm": 0.3986326036374569, "learning_rate": 5.68905639088319e-06, "loss": 0.3303, "step": 2350 }, { "epoch": 0.6793350881621173, "grad_norm": 0.42614206231420926, "learning_rate": 5.643669115092183e-06, "loss": 0.3589, "step": 2355 }, { "epoch": 0.6807774131900624, "grad_norm": 0.3776847045804154, "learning_rate": 5.598392330820586e-06, "loss": 0.3609, "step": 2360 }, { "epoch": 0.6822197382180074, "grad_norm": 0.41271036973705766, "learning_rate": 5.553227186443215e-06, "loss": 0.3615, "step": 2365 }, { "epoch": 0.6836620632459525, "grad_norm": 0.38781546784387094, "learning_rate": 5.508174827503328e-06, "loss": 0.3433, "step": 2370 }, { "epoch": 0.6851043882738975, "grad_norm": 0.39550012764434234, "learning_rate": 5.46323639668353e-06, "loss": 0.3691, "step": 2375 }, { "epoch": 0.6865467133018426, "grad_norm": 0.4203725670836375, "learning_rate": 5.4184130337768485e-06, "loss": 0.3882, "step": 2380 }, { "epoch": 0.6879890383297876, "grad_norm": 0.41719368579398214, "learning_rate": 5.373705875657766e-06, "loss": 0.3678, "step": 2385 }, { "epoch": 0.6894313633577327, "grad_norm": 0.408418654280754, "learning_rate": 5.329116056253429e-06, "loss": 0.3788, "step": 2390 }, { "epoch": 0.6908736883856778, "grad_norm": 0.4432414502444195, "learning_rate": 5.284644706514868e-06, "loss": 0.3733, "step": 2395 }, { "epoch": 0.6923160134136228, "grad_norm": 0.43523682450545426, "learning_rate": 5.240292954388306e-06, "loss": 0.3716, "step": 2400 }, { "epoch": 0.6937583384415678, "grad_norm": 0.4389694994462393, "learning_rate": 5.1960619247865815e-06, "loss": 0.3655, "step": 2405 }, { "epoch": 0.6952006634695128, "grad_norm": 0.3932614135155125, "learning_rate": 5.15195273956057e-06, "loss": 0.3971, "step": 2410 }, { "epoch": 0.6966429884974579, "grad_norm": 0.38979362609767165, "learning_rate": 5.107966517470771e-06, "loss": 0.3724, "step": 2415 }, { "epoch": 0.6980853135254029, "grad_norm": 0.4209080852390916, "learning_rate": 5.064104374158909e-06, "loss": 0.3911, "step": 2420 }, { "epoch": 0.699527638553348, "grad_norm": 0.45055904805315533, "learning_rate": 5.0203674221196485e-06, "loss": 0.3633, "step": 2425 }, { "epoch": 0.700969963581293, "grad_norm": 0.3868393099197903, "learning_rate": 4.9767567706723706e-06, "loss": 0.3515, "step": 2430 }, { "epoch": 0.7024122886092381, "grad_norm": 0.41826804531316264, "learning_rate": 4.933273525933041e-06, "loss": 0.3519, "step": 2435 }, { "epoch": 0.7038546136371832, "grad_norm": 0.45957339946847975, "learning_rate": 4.889918790786153e-06, "loss": 0.3807, "step": 2440 }, { "epoch": 0.7052969386651282, "grad_norm": 0.4540538141436769, "learning_rate": 4.846693664856754e-06, "loss": 0.3465, "step": 2445 }, { "epoch": 0.7067392636930733, "grad_norm": 0.47813500195150954, "learning_rate": 4.803599244482558e-06, "loss": 0.376, "step": 2450 }, { "epoch": 0.7081815887210183, "grad_norm": 0.3925519413949624, "learning_rate": 4.760636622686136e-06, "loss": 0.3404, "step": 2455 }, { "epoch": 0.7096239137489633, "grad_norm": 0.4289528139780234, "learning_rate": 4.717806889147196e-06, "loss": 0.3627, "step": 2460 }, { "epoch": 0.7110662387769083, "grad_norm": 0.41215198190870284, "learning_rate": 4.675111130174939e-06, "loss": 0.3716, "step": 2465 }, { "epoch": 0.7125085638048534, "grad_norm": 0.4403007485651443, "learning_rate": 4.632550428680515e-06, "loss": 0.3765, "step": 2470 }, { "epoch": 0.7139508888327984, "grad_norm": 0.4311724864201015, "learning_rate": 4.590125864149551e-06, "loss": 0.3743, "step": 2475 }, { "epoch": 0.7153932138607435, "grad_norm": 0.46098384046435353, "learning_rate": 4.547838512614773e-06, "loss": 0.3505, "step": 2480 }, { "epoch": 0.7168355388886886, "grad_norm": 0.40338840945222365, "learning_rate": 4.505689446628712e-06, "loss": 0.3691, "step": 2485 }, { "epoch": 0.7182778639166336, "grad_norm": 0.40824551867501546, "learning_rate": 4.4636797352365035e-06, "loss": 0.3585, "step": 2490 }, { "epoch": 0.7197201889445787, "grad_norm": 0.4297027171998161, "learning_rate": 4.421810443948774e-06, "loss": 0.3705, "step": 2495 }, { "epoch": 0.7211625139725237, "grad_norm": 0.40341531049143703, "learning_rate": 4.38008263471461e-06, "loss": 0.3815, "step": 2500 }, { "epoch": 0.7211625139725237, "eval_loss": 0.37222930788993835, "eval_runtime": 142.2441, "eval_samples_per_second": 12.661, "eval_steps_per_second": 3.171, "step": 2500 }, { "epoch": 0.7226048390004688, "grad_norm": 0.4407059294927956, "learning_rate": 4.338497365894628e-06, "loss": 0.3661, "step": 2505 }, { "epoch": 0.7240471640284138, "grad_norm": 0.43213340820969415, "learning_rate": 4.297055692234133e-06, "loss": 0.3548, "step": 2510 }, { "epoch": 0.7254894890563589, "grad_norm": 0.40790860794488015, "learning_rate": 4.25575866483636e-06, "loss": 0.3693, "step": 2515 }, { "epoch": 0.7269318140843039, "grad_norm": 0.39452605394978774, "learning_rate": 4.214607331135817e-06, "loss": 0.3629, "step": 2520 }, { "epoch": 0.7283741391122489, "grad_norm": 0.4535519104968178, "learning_rate": 4.173602734871723e-06, "loss": 0.3631, "step": 2525 }, { "epoch": 0.729816464140194, "grad_norm": 0.4215165521407461, "learning_rate": 4.132745916061528e-06, "loss": 0.3623, "step": 2530 }, { "epoch": 0.731258789168139, "grad_norm": 0.4369337778893739, "learning_rate": 4.09203791097454e-06, "loss": 0.3799, "step": 2535 }, { "epoch": 0.7327011141960841, "grad_norm": 0.4218365082776104, "learning_rate": 4.051479752105642e-06, "loss": 0.3281, "step": 2540 }, { "epoch": 0.7341434392240291, "grad_norm": 0.39141469492573994, "learning_rate": 4.01107246814909e-06, "loss": 0.3779, "step": 2545 }, { "epoch": 0.7355857642519742, "grad_norm": 0.4361183098017262, "learning_rate": 3.970817083972451e-06, "loss": 0.3677, "step": 2550 }, { "epoch": 0.7370280892799193, "grad_norm": 0.4212489079522315, "learning_rate": 3.930714620590582e-06, "loss": 0.3697, "step": 2555 }, { "epoch": 0.7384704143078643, "grad_norm": 0.42629366346781794, "learning_rate": 3.890766095139744e-06, "loss": 0.336, "step": 2560 }, { "epoch": 0.7399127393358094, "grad_norm": 0.39167597840940843, "learning_rate": 3.850972520851804e-06, "loss": 0.3297, "step": 2565 }, { "epoch": 0.7413550643637544, "grad_norm": 0.4233310284348778, "learning_rate": 3.8113349070285344e-06, "loss": 0.3613, "step": 2570 }, { "epoch": 0.7427973893916994, "grad_norm": 0.4263022461531563, "learning_rate": 3.771854259016019e-06, "loss": 0.3529, "step": 2575 }, { "epoch": 0.7442397144196444, "grad_norm": 0.3973240159937157, "learning_rate": 3.7325315781791337e-06, "loss": 0.3661, "step": 2580 }, { "epoch": 0.7456820394475895, "grad_norm": 0.39734045764738396, "learning_rate": 3.693367861876188e-06, "loss": 0.3815, "step": 2585 }, { "epoch": 0.7471243644755345, "grad_norm": 0.4473118684590064, "learning_rate": 3.6543641034335873e-06, "loss": 0.3488, "step": 2590 }, { "epoch": 0.7485666895034796, "grad_norm": 0.4071557714101167, "learning_rate": 3.615521292120663e-06, "loss": 0.36, "step": 2595 }, { "epoch": 0.7500090145314247, "grad_norm": 0.4149969887621353, "learning_rate": 3.5768404131245695e-06, "loss": 0.3619, "step": 2600 }, { "epoch": 0.7514513395593697, "grad_norm": 0.41064754239264667, "learning_rate": 3.5383224475253043e-06, "loss": 0.3623, "step": 2605 }, { "epoch": 0.7528936645873148, "grad_norm": 0.48731666991216727, "learning_rate": 3.4999683722708265e-06, "loss": 0.3824, "step": 2610 }, { "epoch": 0.7543359896152598, "grad_norm": 0.42149841198530297, "learning_rate": 3.4617791601522565e-06, "loss": 0.3658, "step": 2615 }, { "epoch": 0.7557783146432049, "grad_norm": 0.3936949177789515, "learning_rate": 3.423755779779243e-06, "loss": 0.3308, "step": 2620 }, { "epoch": 0.7572206396711499, "grad_norm": 0.43489944362821054, "learning_rate": 3.3858991955553455e-06, "loss": 0.3815, "step": 2625 }, { "epoch": 0.7586629646990949, "grad_norm": 0.3921717289554429, "learning_rate": 3.348210367653625e-06, "loss": 0.3531, "step": 2630 }, { "epoch": 0.76010528972704, "grad_norm": 0.44238912615157533, "learning_rate": 3.3106902519922523e-06, "loss": 0.3696, "step": 2635 }, { "epoch": 0.761547614754985, "grad_norm": 0.4536027992384981, "learning_rate": 3.27333980021027e-06, "loss": 0.37, "step": 2640 }, { "epoch": 0.7629899397829301, "grad_norm": 0.4564191707678332, "learning_rate": 3.236159959643482e-06, "loss": 0.3819, "step": 2645 }, { "epoch": 0.7644322648108751, "grad_norm": 0.5326593840798252, "learning_rate": 3.1991516733003813e-06, "loss": 0.3758, "step": 2650 }, { "epoch": 0.7658745898388202, "grad_norm": 0.43321441818668444, "learning_rate": 3.1623158798382813e-06, "loss": 0.3783, "step": 2655 }, { "epoch": 0.7673169148667652, "grad_norm": 0.4454237213343821, "learning_rate": 3.125653513539456e-06, "loss": 0.3607, "step": 2660 }, { "epoch": 0.7687592398947103, "grad_norm": 0.4107211963202732, "learning_rate": 3.089165504287499e-06, "loss": 0.3482, "step": 2665 }, { "epoch": 0.7702015649226553, "grad_norm": 0.3789782102911423, "learning_rate": 3.052852777543687e-06, "loss": 0.3543, "step": 2670 }, { "epoch": 0.7716438899506004, "grad_norm": 0.4079189291227377, "learning_rate": 3.0167162543235384e-06, "loss": 0.3276, "step": 2675 }, { "epoch": 0.7730862149785455, "grad_norm": 0.4472943997084153, "learning_rate": 2.9807568511734564e-06, "loss": 0.3825, "step": 2680 }, { "epoch": 0.7745285400064905, "grad_norm": 0.430008379042804, "learning_rate": 2.944975480147445e-06, "loss": 0.3595, "step": 2685 }, { "epoch": 0.7759708650344355, "grad_norm": 0.4401700574196651, "learning_rate": 2.909373048784032e-06, "loss": 0.3779, "step": 2690 }, { "epoch": 0.7774131900623805, "grad_norm": 0.4208383654033427, "learning_rate": 2.873950460083191e-06, "loss": 0.3749, "step": 2695 }, { "epoch": 0.7788555150903256, "grad_norm": 0.4174074736046765, "learning_rate": 2.8387086124834952e-06, "loss": 0.374, "step": 2700 }, { "epoch": 0.7802978401182706, "grad_norm": 0.42868575004589055, "learning_rate": 2.8036483998392784e-06, "loss": 0.3564, "step": 2705 }, { "epoch": 0.7817401651462157, "grad_norm": 0.3985935455753018, "learning_rate": 2.768770711398001e-06, "loss": 0.3667, "step": 2710 }, { "epoch": 0.7831824901741608, "grad_norm": 0.40569605016983845, "learning_rate": 2.734076431777688e-06, "loss": 0.3506, "step": 2715 }, { "epoch": 0.7846248152021058, "grad_norm": 0.39328145893392497, "learning_rate": 2.6995664409444665e-06, "loss": 0.3464, "step": 2720 }, { "epoch": 0.7860671402300509, "grad_norm": 0.4528233880552543, "learning_rate": 2.6652416141902913e-06, "loss": 0.3605, "step": 2725 }, { "epoch": 0.7875094652579959, "grad_norm": 0.4480705994704807, "learning_rate": 2.631102822110695e-06, "loss": 0.3726, "step": 2730 }, { "epoch": 0.788951790285941, "grad_norm": 0.4574022134374259, "learning_rate": 2.597150930582757e-06, "loss": 0.359, "step": 2735 }, { "epoch": 0.790394115313886, "grad_norm": 0.4078128321456425, "learning_rate": 2.563386800743094e-06, "loss": 0.3413, "step": 2740 }, { "epoch": 0.791836440341831, "grad_norm": 0.44464864656256, "learning_rate": 2.5298112889660544e-06, "loss": 0.3587, "step": 2745 }, { "epoch": 0.793278765369776, "grad_norm": 0.3890963843751233, "learning_rate": 2.4964252468419802e-06, "loss": 0.344, "step": 2750 }, { "epoch": 0.7947210903977211, "grad_norm": 0.42348428672207705, "learning_rate": 2.463229521155611e-06, "loss": 0.3835, "step": 2755 }, { "epoch": 0.7961634154256662, "grad_norm": 0.4244981524719468, "learning_rate": 2.430224953864617e-06, "loss": 0.3908, "step": 2760 }, { "epoch": 0.7976057404536112, "grad_norm": 0.4461589097043871, "learning_rate": 2.397412382078219e-06, "loss": 0.3493, "step": 2765 }, { "epoch": 0.7990480654815563, "grad_norm": 0.4226119316706504, "learning_rate": 2.364792638035982e-06, "loss": 0.3549, "step": 2770 }, { "epoch": 0.8004903905095013, "grad_norm": 0.43426124883547124, "learning_rate": 2.3323665490866964e-06, "loss": 0.3578, "step": 2775 }, { "epoch": 0.8019327155374464, "grad_norm": 0.42274869171496543, "learning_rate": 2.300134937667391e-06, "loss": 0.3805, "step": 2780 }, { "epoch": 0.8033750405653914, "grad_norm": 0.4841781161829471, "learning_rate": 2.2680986212824786e-06, "loss": 0.3499, "step": 2785 }, { "epoch": 0.8048173655933365, "grad_norm": 0.428134320224768, "learning_rate": 2.2362584124830167e-06, "loss": 0.3684, "step": 2790 }, { "epoch": 0.8062596906212816, "grad_norm": 0.4117804314200649, "learning_rate": 2.204615118846107e-06, "loss": 0.3869, "step": 2795 }, { "epoch": 0.8077020156492265, "grad_norm": 0.41413616917927765, "learning_rate": 2.1731695429543974e-06, "loss": 0.338, "step": 2800 }, { "epoch": 0.8091443406771716, "grad_norm": 0.4360068588380961, "learning_rate": 2.141922482375737e-06, "loss": 0.3665, "step": 2805 }, { "epoch": 0.8105866657051166, "grad_norm": 0.4334830193418244, "learning_rate": 2.1108747296429477e-06, "loss": 0.3721, "step": 2810 }, { "epoch": 0.8120289907330617, "grad_norm": 0.507519342034383, "learning_rate": 2.080027072233718e-06, "loss": 0.3646, "step": 2815 }, { "epoch": 0.8134713157610067, "grad_norm": 0.42834185576130923, "learning_rate": 2.049380292550629e-06, "loss": 0.3633, "step": 2820 }, { "epoch": 0.8149136407889518, "grad_norm": 0.453195030964312, "learning_rate": 2.018935167901316e-06, "loss": 0.3539, "step": 2825 }, { "epoch": 0.8163559658168968, "grad_norm": 0.4103347116873249, "learning_rate": 1.9886924704787482e-06, "loss": 0.3457, "step": 2830 }, { "epoch": 0.8177982908448419, "grad_norm": 0.4081898260751316, "learning_rate": 1.9586529673416433e-06, "loss": 0.347, "step": 2835 }, { "epoch": 0.819240615872787, "grad_norm": 0.40268175350554464, "learning_rate": 1.928817420395018e-06, "loss": 0.3772, "step": 2840 }, { "epoch": 0.820682940900732, "grad_norm": 0.43775696767862726, "learning_rate": 1.8991865863708547e-06, "loss": 0.3718, "step": 2845 }, { "epoch": 0.8221252659286771, "grad_norm": 0.43895036356232614, "learning_rate": 1.8697612168089152e-06, "loss": 0.3648, "step": 2850 }, { "epoch": 0.823567590956622, "grad_norm": 0.40821144604675824, "learning_rate": 1.8405420580376755e-06, "loss": 0.3422, "step": 2855 }, { "epoch": 0.8250099159845671, "grad_norm": 0.4577535204704979, "learning_rate": 1.811529851155398e-06, "loss": 0.3511, "step": 2860 }, { "epoch": 0.8264522410125121, "grad_norm": 0.40698416625428246, "learning_rate": 1.7827253320113347e-06, "loss": 0.3521, "step": 2865 }, { "epoch": 0.8278945660404572, "grad_norm": 0.48745985212369625, "learning_rate": 1.7541292311870616e-06, "loss": 0.3727, "step": 2870 }, { "epoch": 0.8293368910684022, "grad_norm": 0.4152788200688241, "learning_rate": 1.7257422739779495e-06, "loss": 0.3406, "step": 2875 }, { "epoch": 0.8307792160963473, "grad_norm": 0.42357457834820555, "learning_rate": 1.6975651803747716e-06, "loss": 0.3614, "step": 2880 }, { "epoch": 0.8322215411242924, "grad_norm": 0.4290601435620992, "learning_rate": 1.6695986650454355e-06, "loss": 0.349, "step": 2885 }, { "epoch": 0.8336638661522374, "grad_norm": 0.40830671063358515, "learning_rate": 1.6418434373168623e-06, "loss": 0.3592, "step": 2890 }, { "epoch": 0.8351061911801825, "grad_norm": 0.4097799963554095, "learning_rate": 1.614300201156994e-06, "loss": 0.3359, "step": 2895 }, { "epoch": 0.8365485162081275, "grad_norm": 0.43204146744095845, "learning_rate": 1.5869696551569346e-06, "loss": 0.3596, "step": 2900 }, { "epoch": 0.8379908412360726, "grad_norm": 0.46076233886580875, "learning_rate": 1.5598524925132396e-06, "loss": 0.3609, "step": 2905 }, { "epoch": 0.8394331662640176, "grad_norm": 0.4286297255981423, "learning_rate": 1.5329494010103263e-06, "loss": 0.3607, "step": 2910 }, { "epoch": 0.8408754912919626, "grad_norm": 0.3956440167259478, "learning_rate": 1.5062610630030317e-06, "loss": 0.316, "step": 2915 }, { "epoch": 0.8423178163199077, "grad_norm": 0.41432843943606673, "learning_rate": 1.4797881553993099e-06, "loss": 0.3589, "step": 2920 }, { "epoch": 0.8437601413478527, "grad_norm": 0.397270661772685, "learning_rate": 1.4535313496430558e-06, "loss": 0.3519, "step": 2925 }, { "epoch": 0.8452024663757978, "grad_norm": 0.41857285751070505, "learning_rate": 1.4274913116970846e-06, "loss": 0.3401, "step": 2930 }, { "epoch": 0.8466447914037428, "grad_norm": 0.3941031419777465, "learning_rate": 1.4016687020262231e-06, "loss": 0.3504, "step": 2935 }, { "epoch": 0.8480871164316879, "grad_norm": 0.428688446592497, "learning_rate": 1.3760641755805848e-06, "loss": 0.3614, "step": 2940 }, { "epoch": 0.8495294414596329, "grad_norm": 0.4097211469034453, "learning_rate": 1.3506783817789337e-06, "loss": 0.3384, "step": 2945 }, { "epoch": 0.850971766487578, "grad_norm": 0.44047116848231305, "learning_rate": 1.3255119644922266e-06, "loss": 0.3638, "step": 2950 }, { "epoch": 0.852414091515523, "grad_norm": 0.3994464624403052, "learning_rate": 1.300565562027276e-06, "loss": 0.3447, "step": 2955 }, { "epoch": 0.8538564165434681, "grad_norm": 0.44495457947302897, "learning_rate": 1.2758398071105626e-06, "loss": 0.3546, "step": 2960 }, { "epoch": 0.8552987415714132, "grad_norm": 0.4147516297268767, "learning_rate": 1.2513353268721907e-06, "loss": 0.3421, "step": 2965 }, { "epoch": 0.8567410665993581, "grad_norm": 0.422646250463158, "learning_rate": 1.2270527428299684e-06, "loss": 0.3579, "step": 2970 }, { "epoch": 0.8581833916273032, "grad_norm": 0.4189403344854125, "learning_rate": 1.2029926708736673e-06, "loss": 0.3425, "step": 2975 }, { "epoch": 0.8596257166552482, "grad_norm": 0.41547910036939945, "learning_rate": 1.179155721249381e-06, "loss": 0.3376, "step": 2980 }, { "epoch": 0.8610680416831933, "grad_norm": 0.42428858195226893, "learning_rate": 1.1555424985440522e-06, "loss": 0.3554, "step": 2985 }, { "epoch": 0.8625103667111383, "grad_norm": 0.4425537282272965, "learning_rate": 1.1321536016701473e-06, "loss": 0.351, "step": 2990 }, { "epoch": 0.8639526917390834, "grad_norm": 0.4161228925911087, "learning_rate": 1.1089896238504461e-06, "loss": 0.336, "step": 2995 }, { "epoch": 0.8653950167670285, "grad_norm": 0.37656047979276985, "learning_rate": 1.086051152603026e-06, "loss": 0.3509, "step": 3000 }, { "epoch": 0.8653950167670285, "eval_loss": 0.3611552119255066, "eval_runtime": 142.3229, "eval_samples_per_second": 12.654, "eval_steps_per_second": 3.169, "step": 3000 }, { "epoch": 0.8668373417949735, "grad_norm": 0.4463172354545017, "learning_rate": 1.0633387697263254e-06, "loss": 0.35, "step": 3005 }, { "epoch": 0.8682796668229186, "grad_norm": 0.43074983850708387, "learning_rate": 1.0408530512844196e-06, "loss": 0.3613, "step": 3010 }, { "epoch": 0.8697219918508636, "grad_norm": 0.39354733454334206, "learning_rate": 1.0185945675923813e-06, "loss": 0.3727, "step": 3015 }, { "epoch": 0.8711643168788087, "grad_norm": 0.44960602091132634, "learning_rate": 9.965638832018432e-07, "loss": 0.372, "step": 3020 }, { "epoch": 0.8726066419067536, "grad_norm": 0.42518881330063735, "learning_rate": 9.747615568866553e-07, "loss": 0.3516, "step": 3025 }, { "epoch": 0.8740489669346987, "grad_norm": 0.44741688383815076, "learning_rate": 9.531881416287203e-07, "loss": 0.3562, "step": 3030 }, { "epoch": 0.8754912919626437, "grad_norm": 0.4331522299966881, "learning_rate": 9.318441846039828e-07, "loss": 0.3548, "step": 3035 }, { "epoch": 0.8769336169905888, "grad_norm": 0.506237893255727, "learning_rate": 9.107302271685226e-07, "loss": 0.3412, "step": 3040 }, { "epoch": 0.8783759420185339, "grad_norm": 0.4658754493753741, "learning_rate": 8.898468048448528e-07, "loss": 0.3336, "step": 3045 }, { "epoch": 0.8798182670464789, "grad_norm": 0.438225563597408, "learning_rate": 8.691944473083114e-07, "loss": 0.3422, "step": 3050 }, { "epoch": 0.881260592074424, "grad_norm": 0.4170714809613398, "learning_rate": 8.487736783736533e-07, "loss": 0.3621, "step": 3055 }, { "epoch": 0.882702917102369, "grad_norm": 0.4590349478238853, "learning_rate": 8.285850159817388e-07, "loss": 0.3791, "step": 3060 }, { "epoch": 0.8841452421303141, "grad_norm": 0.4332258091307991, "learning_rate": 8.086289721864127e-07, "loss": 0.3404, "step": 3065 }, { "epoch": 0.8855875671582591, "grad_norm": 0.4452410333427778, "learning_rate": 7.889060531415193e-07, "loss": 0.3541, "step": 3070 }, { "epoch": 0.8870298921862042, "grad_norm": 0.42507300447077245, "learning_rate": 7.694167590880475e-07, "loss": 0.3549, "step": 3075 }, { "epoch": 0.8884722172141493, "grad_norm": 0.4227403053651907, "learning_rate": 7.501615843414623e-07, "loss": 0.3264, "step": 3080 }, { "epoch": 0.8899145422420942, "grad_norm": 0.4131961662824003, "learning_rate": 7.311410172791522e-07, "loss": 0.3369, "step": 3085 }, { "epoch": 0.8913568672700393, "grad_norm": 0.39579591570866374, "learning_rate": 7.123555403280558e-07, "loss": 0.3483, "step": 3090 }, { "epoch": 0.8927991922979843, "grad_norm": 0.42292696994848605, "learning_rate": 6.938056299524099e-07, "loss": 0.3398, "step": 3095 }, { "epoch": 0.8942415173259294, "grad_norm": 0.38022938922831223, "learning_rate": 6.754917566416796e-07, "loss": 0.3469, "step": 3100 }, { "epoch": 0.8956838423538744, "grad_norm": 0.4849805496701068, "learning_rate": 6.574143848986226e-07, "loss": 0.3618, "step": 3105 }, { "epoch": 0.8971261673818195, "grad_norm": 0.44465461522642474, "learning_rate": 6.395739732274919e-07, "loss": 0.3642, "step": 3110 }, { "epoch": 0.8985684924097646, "grad_norm": 0.44656695164750837, "learning_rate": 6.219709741224322e-07, "loss": 0.3563, "step": 3115 }, { "epoch": 0.9000108174377096, "grad_norm": 0.4269116876807273, "learning_rate": 6.046058340559824e-07, "loss": 0.3431, "step": 3120 }, { "epoch": 0.9014531424656547, "grad_norm": 0.4086865891433274, "learning_rate": 5.874789934677583e-07, "loss": 0.3505, "step": 3125 }, { "epoch": 0.9028954674935997, "grad_norm": 0.4404444466800333, "learning_rate": 5.705908867532862e-07, "loss": 0.3407, "step": 3130 }, { "epoch": 0.9043377925215448, "grad_norm": 0.45999537115175176, "learning_rate": 5.53941942252979e-07, "loss": 0.37, "step": 3135 }, { "epoch": 0.9057801175494897, "grad_norm": 0.4242568290280731, "learning_rate": 5.375325822412747e-07, "loss": 0.3316, "step": 3140 }, { "epoch": 0.9072224425774348, "grad_norm": 0.4753028820261241, "learning_rate": 5.213632229159227e-07, "loss": 0.3785, "step": 3145 }, { "epoch": 0.9086647676053798, "grad_norm": 0.4699691806857396, "learning_rate": 5.054342743874386e-07, "loss": 0.3617, "step": 3150 }, { "epoch": 0.9101070926333249, "grad_norm": 0.4352496762130561, "learning_rate": 4.897461406686821e-07, "loss": 0.3359, "step": 3155 }, { "epoch": 0.91154941766127, "grad_norm": 0.4316421343515809, "learning_rate": 4.742992196646301e-07, "loss": 0.3376, "step": 3160 }, { "epoch": 0.912991742689215, "grad_norm": 0.4001287994073788, "learning_rate": 4.590939031622743e-07, "loss": 0.3351, "step": 3165 }, { "epoch": 0.9144340677171601, "grad_norm": 0.4363788326973079, "learning_rate": 4.4413057682068606e-07, "loss": 0.3473, "step": 3170 }, { "epoch": 0.9158763927451051, "grad_norm": 0.44176842953481193, "learning_rate": 4.2940962016123524e-07, "loss": 0.3332, "step": 3175 }, { "epoch": 0.9173187177730502, "grad_norm": 0.43914474716543256, "learning_rate": 4.149314065579624e-07, "loss": 0.3383, "step": 3180 }, { "epoch": 0.9187610428009952, "grad_norm": 0.4540079519566383, "learning_rate": 4.0069630322811303e-07, "loss": 0.3786, "step": 3185 }, { "epoch": 0.9202033678289403, "grad_norm": 0.4612868459187327, "learning_rate": 3.867046712228162e-07, "loss": 0.3625, "step": 3190 }, { "epoch": 0.9216456928568852, "grad_norm": 0.40372545279617805, "learning_rate": 3.729568654179361e-07, "loss": 0.3308, "step": 3195 }, { "epoch": 0.9230880178848303, "grad_norm": 0.4204476032972304, "learning_rate": 3.5945323450506387e-07, "loss": 0.3346, "step": 3200 }, { "epoch": 0.9245303429127754, "grad_norm": 0.45260198781122246, "learning_rate": 3.4619412098267693e-07, "loss": 0.3795, "step": 3205 }, { "epoch": 0.9259726679407204, "grad_norm": 0.42527213346553855, "learning_rate": 3.331798611474535e-07, "loss": 0.3421, "step": 3210 }, { "epoch": 0.9274149929686655, "grad_norm": 0.414984415520749, "learning_rate": 3.204107850857374e-07, "loss": 0.3291, "step": 3215 }, { "epoch": 0.9288573179966105, "grad_norm": 0.4549260227056393, "learning_rate": 3.0788721666517365e-07, "loss": 0.3486, "step": 3220 }, { "epoch": 0.9302996430245556, "grad_norm": 0.4443023622951338, "learning_rate": 2.9560947352648697e-07, "loss": 0.3756, "step": 3225 }, { "epoch": 0.9317419680525006, "grad_norm": 0.4250192102717841, "learning_rate": 2.8357786707542854e-07, "loss": 0.3525, "step": 3230 }, { "epoch": 0.9331842930804457, "grad_norm": 0.41194820669384097, "learning_rate": 2.71792702474879e-07, "loss": 0.3562, "step": 3235 }, { "epoch": 0.9346266181083908, "grad_norm": 0.42277936484045997, "learning_rate": 2.602542786371065e-07, "loss": 0.3609, "step": 3240 }, { "epoch": 0.9360689431363358, "grad_norm": 0.402522590339594, "learning_rate": 2.489628882161832e-07, "loss": 0.3323, "step": 3245 }, { "epoch": 0.9375112681642809, "grad_norm": 0.42468823176649917, "learning_rate": 2.3791881760056756e-07, "loss": 0.3705, "step": 3250 }, { "epoch": 0.9389535931922258, "grad_norm": 0.42563197511583134, "learning_rate": 2.2712234690583813e-07, "loss": 0.3635, "step": 3255 }, { "epoch": 0.9403959182201709, "grad_norm": 0.4452148892270775, "learning_rate": 2.1657374996758795e-07, "loss": 0.3478, "step": 3260 }, { "epoch": 0.9418382432481159, "grad_norm": 0.4539015567282992, "learning_rate": 2.0627329433447917e-07, "loss": 0.3736, "step": 3265 }, { "epoch": 0.943280568276061, "grad_norm": 0.40270803503237657, "learning_rate": 1.9622124126145837e-07, "loss": 0.3378, "step": 3270 }, { "epoch": 0.944722893304006, "grad_norm": 0.4075396549757293, "learning_rate": 1.864178457031318e-07, "loss": 0.3562, "step": 3275 }, { "epoch": 0.9461652183319511, "grad_norm": 0.43266062909072267, "learning_rate": 1.768633563072919e-07, "loss": 0.3451, "step": 3280 }, { "epoch": 0.9476075433598962, "grad_norm": 0.418621662939926, "learning_rate": 1.6755801540862092e-07, "loss": 0.334, "step": 3285 }, { "epoch": 0.9490498683878412, "grad_norm": 0.4221481289163581, "learning_rate": 1.5850205902253613e-07, "loss": 0.3536, "step": 3290 }, { "epoch": 0.9504921934157863, "grad_norm": 0.40400229300396406, "learning_rate": 1.4969571683920768e-07, "loss": 0.3636, "step": 3295 }, { "epoch": 0.9519345184437313, "grad_norm": 0.4142859171614361, "learning_rate": 1.411392122177302e-07, "loss": 0.3302, "step": 3300 }, { "epoch": 0.9533768434716764, "grad_norm": 0.4259634616965583, "learning_rate": 1.3283276218046259e-07, "loss": 0.3674, "step": 3305 }, { "epoch": 0.9548191684996213, "grad_norm": 0.41429097541392035, "learning_rate": 1.2477657740751714e-07, "loss": 0.3483, "step": 3310 }, { "epoch": 0.9562614935275664, "grad_norm": 0.42353387168902784, "learning_rate": 1.169708622314214e-07, "loss": 0.3608, "step": 3315 }, { "epoch": 0.9577038185555115, "grad_norm": 0.42693212185785107, "learning_rate": 1.0941581463193129e-07, "loss": 0.3452, "step": 3320 }, { "epoch": 0.9591461435834565, "grad_norm": 0.4328702433520352, "learning_rate": 1.021116262310129e-07, "loss": 0.3413, "step": 3325 }, { "epoch": 0.9605884686114016, "grad_norm": 0.41956255025855793, "learning_rate": 9.505848228798076e-08, "loss": 0.3604, "step": 3330 }, { "epoch": 0.9620307936393466, "grad_norm": 0.4209071869524921, "learning_rate": 8.825656169480056e-08, "loss": 0.3384, "step": 3335 }, { "epoch": 0.9634731186672917, "grad_norm": 0.4118105753397592, "learning_rate": 8.170603697154944e-08, "loss": 0.3338, "step": 3340 }, { "epoch": 0.9649154436952367, "grad_norm": 0.43817584876124205, "learning_rate": 7.540707426204163e-08, "loss": 0.3281, "step": 3345 }, { "epoch": 0.9663577687231818, "grad_norm": 0.3903217050033041, "learning_rate": 6.935983332961305e-08, "loss": 0.3308, "step": 3350 }, { "epoch": 0.9678000937511269, "grad_norm": 0.41905865354117233, "learning_rate": 6.356446755307444e-08, "loss": 0.3509, "step": 3355 }, { "epoch": 0.9692424187790719, "grad_norm": 0.41394321455611666, "learning_rate": 5.802112392281123e-08, "loss": 0.3377, "step": 3360 }, { "epoch": 0.9706847438070169, "grad_norm": 0.4316304666724342, "learning_rate": 5.272994303706758e-08, "loss": 0.3592, "step": 3365 }, { "epoch": 0.9721270688349619, "grad_norm": 0.45454272140307556, "learning_rate": 4.769105909836924e-08, "loss": 0.3485, "step": 3370 }, { "epoch": 0.973569393862907, "grad_norm": 0.43202485000084534, "learning_rate": 4.2904599910127406e-08, "loss": 0.3538, "step": 3375 }, { "epoch": 0.975011718890852, "grad_norm": 0.44712558770756466, "learning_rate": 3.837068687339351e-08, "loss": 0.367, "step": 3380 }, { "epoch": 0.9764540439187971, "grad_norm": 0.423193248701901, "learning_rate": 3.408943498377726e-08, "loss": 0.3351, "step": 3385 }, { "epoch": 0.9778963689467421, "grad_norm": 0.47037763666404425, "learning_rate": 3.006095282854116e-08, "loss": 0.3966, "step": 3390 }, { "epoch": 0.9793386939746872, "grad_norm": 0.4314080592872779, "learning_rate": 2.628534258383164e-08, "loss": 0.357, "step": 3395 }, { "epoch": 0.9807810190026323, "grad_norm": 0.45121239415975073, "learning_rate": 2.2762700012097795e-08, "loss": 0.3564, "step": 3400 }, { "epoch": 0.9822233440305773, "grad_norm": 0.4226505971917229, "learning_rate": 1.9493114459659956e-08, "loss": 0.3625, "step": 3405 }, { "epoch": 0.9836656690585224, "grad_norm": 0.4197713049001792, "learning_rate": 1.6476668854440435e-08, "loss": 0.3526, "step": 3410 }, { "epoch": 0.9851079940864674, "grad_norm": 0.4575738762031232, "learning_rate": 1.3713439703865183e-08, "loss": 0.3762, "step": 3415 }, { "epoch": 0.9865503191144124, "grad_norm": 0.4574906098764045, "learning_rate": 1.120349709291868e-08, "loss": 0.3634, "step": 3420 }, { "epoch": 0.9879926441423574, "grad_norm": 0.43088006927461175, "learning_rate": 8.946904682370917e-09, "loss": 0.3675, "step": 3425 }, { "epoch": 0.9894349691703025, "grad_norm": 0.4103449101623024, "learning_rate": 6.943719707158681e-09, "loss": 0.3496, "step": 3430 }, { "epoch": 0.9908772941982475, "grad_norm": 0.40469613082222705, "learning_rate": 5.193992974935613e-09, "loss": 0.369, "step": 3435 }, { "epoch": 0.9923196192261926, "grad_norm": 0.46076258755412675, "learning_rate": 3.697768864782125e-09, "loss": 0.3588, "step": 3440 }, { "epoch": 0.9937619442541377, "grad_norm": 0.4334341619233562, "learning_rate": 2.4550853260851826e-09, "loss": 0.3345, "step": 3445 }, { "epoch": 0.9952042692820827, "grad_norm": 0.44568439209243566, "learning_rate": 1.4659738775679721e-09, "loss": 0.3459, "step": 3450 }, { "epoch": 0.9966465943100278, "grad_norm": 0.45951543969711284, "learning_rate": 7.30459606494982e-10, "loss": 0.3791, "step": 3455 }, { "epoch": 0.9980889193379728, "grad_norm": 0.4459520568434071, "learning_rate": 2.4856116803695375e-10, "loss": 0.3525, "step": 3460 }, { "epoch": 0.9995312443659179, "grad_norm": 0.4581327568157757, "learning_rate": 2.0290784791265893e-11, "loss": 0.3492, "step": 3465 }, { "epoch": 0.9998197093715069, "step": 3466, "total_flos": 4977616761913344.0, "train_loss": 0.6325558101381102, "train_runtime": 63848.9812, "train_samples_per_second": 3.475, "train_steps_per_second": 0.054 } ], "logging_steps": 5, "max_steps": 3466, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4977616761913344.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }