{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995385325334564, "eval_steps": 100, "global_step": 1083, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009229349330872173, "grad_norm": 9.21747479580034, "learning_rate": 4.587155963302753e-08, "loss": 1.1391, "step": 1 }, { "epoch": 0.0046146746654360865, "grad_norm": 9.32631104687394, "learning_rate": 2.2935779816513764e-07, "loss": 1.1363, "step": 5 }, { "epoch": 0.009229349330872173, "grad_norm": 6.904809666749362, "learning_rate": 4.587155963302753e-07, "loss": 1.1181, "step": 10 }, { "epoch": 0.01384402399630826, "grad_norm": 4.252191591840914, "learning_rate": 6.880733944954129e-07, "loss": 1.0599, "step": 15 }, { "epoch": 0.018458698661744346, "grad_norm": 2.42026912444101, "learning_rate": 9.174311926605506e-07, "loss": 1.036, "step": 20 }, { "epoch": 0.023073373327180433, "grad_norm": 2.149192317698554, "learning_rate": 1.1467889908256882e-06, "loss": 1.0183, "step": 25 }, { "epoch": 0.02768804799261652, "grad_norm": 1.946217058573502, "learning_rate": 1.3761467889908258e-06, "loss": 0.9811, "step": 30 }, { "epoch": 0.032302722658052604, "grad_norm": 1.8635683033872088, "learning_rate": 1.6055045871559635e-06, "loss": 1.0023, "step": 35 }, { "epoch": 0.03691739732348869, "grad_norm": 1.8627278195783645, "learning_rate": 1.8348623853211011e-06, "loss": 1.0052, "step": 40 }, { "epoch": 0.04153207198892478, "grad_norm": 1.8639839407230159, "learning_rate": 2.064220183486239e-06, "loss": 0.9805, "step": 45 }, { "epoch": 0.046146746654360866, "grad_norm": 1.8520384144815225, "learning_rate": 2.2935779816513764e-06, "loss": 0.9727, "step": 50 }, { "epoch": 0.050761421319796954, "grad_norm": 1.8187949601170375, "learning_rate": 2.522935779816514e-06, "loss": 0.9716, "step": 55 }, { "epoch": 0.05537609598523304, "grad_norm": 1.744535465844643, "learning_rate": 2.7522935779816517e-06, "loss": 0.9827, "step": 60 }, { "epoch": 0.05999077065066913, "grad_norm": 1.7884368898018332, "learning_rate": 2.981651376146789e-06, "loss": 0.9817, "step": 65 }, { "epoch": 0.06460544531610521, "grad_norm": 2.043892613490708, "learning_rate": 3.211009174311927e-06, "loss": 0.9579, "step": 70 }, { "epoch": 0.0692201199815413, "grad_norm": 1.954683604133804, "learning_rate": 3.4403669724770644e-06, "loss": 0.962, "step": 75 }, { "epoch": 0.07383479464697738, "grad_norm": 2.1294521693145834, "learning_rate": 3.6697247706422022e-06, "loss": 0.9531, "step": 80 }, { "epoch": 0.07844946931241348, "grad_norm": 1.724553323495733, "learning_rate": 3.89908256880734e-06, "loss": 0.9699, "step": 85 }, { "epoch": 0.08306414397784956, "grad_norm": 1.782808734986599, "learning_rate": 4.128440366972478e-06, "loss": 0.9672, "step": 90 }, { "epoch": 0.08767881864328565, "grad_norm": 1.9895808040539045, "learning_rate": 4.357798165137615e-06, "loss": 0.9713, "step": 95 }, { "epoch": 0.09229349330872173, "grad_norm": 2.454158326269674, "learning_rate": 4.587155963302753e-06, "loss": 0.977, "step": 100 }, { "epoch": 0.09229349330872173, "eval_loss": 0.9750353693962097, "eval_runtime": 897.2073, "eval_samples_per_second": 17.109, "eval_steps_per_second": 0.134, "step": 100 }, { "epoch": 0.09690816797415783, "grad_norm": 3.043630262591518, "learning_rate": 4.816513761467891e-06, "loss": 1.0005, "step": 105 }, { "epoch": 0.10152284263959391, "grad_norm": 2.250020293031914, "learning_rate": 4.999986995565604e-06, "loss": 0.983, "step": 110 }, { "epoch": 0.10613751730503, "grad_norm": 2.467248477924322, "learning_rate": 4.999531854567251e-06, "loss": 0.9869, "step": 115 }, { "epoch": 0.11075219197046608, "grad_norm": 1.8823588948860963, "learning_rate": 4.998426627134992e-06, "loss": 0.9653, "step": 120 }, { "epoch": 0.11536686663590216, "grad_norm": 2.5542023711742177, "learning_rate": 4.99667160072e-06, "loss": 0.9863, "step": 125 }, { "epoch": 0.11998154130133826, "grad_norm": 2.159080260322966, "learning_rate": 4.994267231775293e-06, "loss": 0.9587, "step": 130 }, { "epoch": 0.12459621596677434, "grad_norm": 1.947122711805113, "learning_rate": 4.9912141456370235e-06, "loss": 0.9877, "step": 135 }, { "epoch": 0.12921089063221042, "grad_norm": 1.9160522350665006, "learning_rate": 4.987513136361838e-06, "loss": 0.9823, "step": 140 }, { "epoch": 0.13382556529764653, "grad_norm": 1.643819359603758, "learning_rate": 4.983165166520355e-06, "loss": 0.9592, "step": 145 }, { "epoch": 0.1384402399630826, "grad_norm": 1.9111155898238734, "learning_rate": 4.978171366946815e-06, "loss": 0.9467, "step": 150 }, { "epoch": 0.1430549146285187, "grad_norm": 1.8962800405520117, "learning_rate": 4.972533036444971e-06, "loss": 0.9973, "step": 155 }, { "epoch": 0.14766958929395477, "grad_norm": 1.9916665078710525, "learning_rate": 4.966251641450295e-06, "loss": 0.9532, "step": 160 }, { "epoch": 0.15228426395939088, "grad_norm": 2.0789615238934216, "learning_rate": 4.959328815648577e-06, "loss": 0.9619, "step": 165 }, { "epoch": 0.15689893862482696, "grad_norm": 1.924282439427388, "learning_rate": 4.95176635955103e-06, "loss": 0.9643, "step": 170 }, { "epoch": 0.16151361329026304, "grad_norm": 1.8215270244225596, "learning_rate": 4.943566240026014e-06, "loss": 0.9493, "step": 175 }, { "epoch": 0.16612828795569912, "grad_norm": 1.7244089849981508, "learning_rate": 4.934730589787482e-06, "loss": 0.9632, "step": 180 }, { "epoch": 0.1707429626211352, "grad_norm": 1.7885441778481475, "learning_rate": 4.925261706840294e-06, "loss": 0.9697, "step": 185 }, { "epoch": 0.1753576372865713, "grad_norm": 1.7015821372313695, "learning_rate": 4.915162053882552e-06, "loss": 0.9518, "step": 190 }, { "epoch": 0.17997231195200739, "grad_norm": 1.8797017342958653, "learning_rate": 4.904434257665084e-06, "loss": 0.9582, "step": 195 }, { "epoch": 0.18458698661744347, "grad_norm": 1.9599193924367686, "learning_rate": 4.8930811083082805e-06, "loss": 0.962, "step": 200 }, { "epoch": 0.18458698661744347, "eval_loss": 0.97027587890625, "eval_runtime": 678.8685, "eval_samples_per_second": 22.611, "eval_steps_per_second": 0.177, "step": 200 }, { "epoch": 0.18920166128287955, "grad_norm": 1.6185669834732714, "learning_rate": 4.881105558576419e-06, "loss": 0.9527, "step": 205 }, { "epoch": 0.19381633594831565, "grad_norm": 1.9513047781566992, "learning_rate": 4.868510723109712e-06, "loss": 0.9337, "step": 210 }, { "epoch": 0.19843101061375173, "grad_norm": 2.2007134730719717, "learning_rate": 4.85529987761423e-06, "loss": 0.9475, "step": 215 }, { "epoch": 0.20304568527918782, "grad_norm": 1.7234029691654333, "learning_rate": 4.8414764580099536e-06, "loss": 0.9697, "step": 220 }, { "epoch": 0.2076603599446239, "grad_norm": 2.0041628584262874, "learning_rate": 4.827044059537141e-06, "loss": 0.9729, "step": 225 }, { "epoch": 0.21227503461006, "grad_norm": 1.8592735020606461, "learning_rate": 4.8120064358212726e-06, "loss": 0.9509, "step": 230 }, { "epoch": 0.21688970927549608, "grad_norm": 1.800207324441241, "learning_rate": 4.7963674978967916e-06, "loss": 0.9673, "step": 235 }, { "epoch": 0.22150438394093216, "grad_norm": 1.9519675562584446, "learning_rate": 4.780131313189912e-06, "loss": 0.9585, "step": 240 }, { "epoch": 0.22611905860636825, "grad_norm": 1.820917457510076, "learning_rate": 4.763302104460745e-06, "loss": 0.9582, "step": 245 }, { "epoch": 0.23073373327180433, "grad_norm": 1.849379990230495, "learning_rate": 4.745884248705035e-06, "loss": 0.958, "step": 250 }, { "epoch": 0.23534840793724043, "grad_norm": 1.756608683876342, "learning_rate": 4.727882276015773e-06, "loss": 0.9627, "step": 255 }, { "epoch": 0.23996308260267651, "grad_norm": 1.7882434321278007, "learning_rate": 4.7093008684049945e-06, "loss": 0.9662, "step": 260 }, { "epoch": 0.2445777572681126, "grad_norm": 1.767882784557367, "learning_rate": 4.690144858586071e-06, "loss": 0.9707, "step": 265 }, { "epoch": 0.24919243193354867, "grad_norm": 1.6242697429092763, "learning_rate": 4.670419228716794e-06, "loss": 0.9691, "step": 270 }, { "epoch": 0.25380710659898476, "grad_norm": 1.6345433829887448, "learning_rate": 4.6501291091036076e-06, "loss": 0.9545, "step": 275 }, { "epoch": 0.25842178126442084, "grad_norm": 2.022315638454952, "learning_rate": 4.629279776867298e-06, "loss": 0.9654, "step": 280 }, { "epoch": 0.26303645592985697, "grad_norm": 1.7035754028500263, "learning_rate": 4.6078766545705e-06, "loss": 0.9525, "step": 285 }, { "epoch": 0.26765113059529305, "grad_norm": 1.6433313155282105, "learning_rate": 4.585925308807385e-06, "loss": 0.9643, "step": 290 }, { "epoch": 0.27226580526072913, "grad_norm": 1.7430161713619263, "learning_rate": 4.563431448755876e-06, "loss": 0.9554, "step": 295 }, { "epoch": 0.2768804799261652, "grad_norm": 1.7944167443642705, "learning_rate": 4.540400924692793e-06, "loss": 0.9629, "step": 300 }, { "epoch": 0.2768804799261652, "eval_loss": 0.9640328288078308, "eval_runtime": 646.7806, "eval_samples_per_second": 23.733, "eval_steps_per_second": 0.186, "step": 300 }, { "epoch": 0.2814951545916013, "grad_norm": 1.6727578460910815, "learning_rate": 4.516839726472287e-06, "loss": 0.9352, "step": 305 }, { "epoch": 0.2861098292570374, "grad_norm": 1.6937159792463894, "learning_rate": 4.492753981967985e-06, "loss": 0.9612, "step": 310 }, { "epoch": 0.29072450392247345, "grad_norm": 1.6932559756624015, "learning_rate": 4.468149955479231e-06, "loss": 0.9502, "step": 315 }, { "epoch": 0.29533917858790953, "grad_norm": 1.7252625714395642, "learning_rate": 4.443034046101842e-06, "loss": 0.9529, "step": 320 }, { "epoch": 0.2999538532533456, "grad_norm": 1.7373868046279137, "learning_rate": 4.417412786063816e-06, "loss": 0.9582, "step": 325 }, { "epoch": 0.30456852791878175, "grad_norm": 1.6536825460108018, "learning_rate": 4.391292839026407e-06, "loss": 0.9737, "step": 330 }, { "epoch": 0.30918320258421783, "grad_norm": 1.675760556263245, "learning_rate": 4.364680998351016e-06, "loss": 0.9696, "step": 335 }, { "epoch": 0.3137978772496539, "grad_norm": 1.6398783987173011, "learning_rate": 4.337584185332353e-06, "loss": 0.9597, "step": 340 }, { "epoch": 0.31841255191509, "grad_norm": 1.6408572573338802, "learning_rate": 4.310009447398327e-06, "loss": 0.9668, "step": 345 }, { "epoch": 0.3230272265805261, "grad_norm": 1.6487510432775814, "learning_rate": 4.28196395627712e-06, "loss": 0.9442, "step": 350 }, { "epoch": 0.32764190124596215, "grad_norm": 1.6194603575446027, "learning_rate": 4.25345500613195e-06, "loss": 0.9555, "step": 355 }, { "epoch": 0.33225657591139823, "grad_norm": 1.759504209301756, "learning_rate": 4.224490011663972e-06, "loss": 0.954, "step": 360 }, { "epoch": 0.3368712505768343, "grad_norm": 1.6716810309336956, "learning_rate": 4.195076506183846e-06, "loss": 0.9487, "step": 365 }, { "epoch": 0.3414859252422704, "grad_norm": 1.7489967721807036, "learning_rate": 4.1652221396524435e-06, "loss": 0.9742, "step": 370 }, { "epoch": 0.34610059990770653, "grad_norm": 1.6985994364850043, "learning_rate": 4.134934676691224e-06, "loss": 0.9559, "step": 375 }, { "epoch": 0.3507152745731426, "grad_norm": 1.871025727941928, "learning_rate": 4.1042219945627835e-06, "loss": 0.9505, "step": 380 }, { "epoch": 0.3553299492385787, "grad_norm": 1.6075998703469527, "learning_rate": 4.0730920811221155e-06, "loss": 0.93, "step": 385 }, { "epoch": 0.35994462390401477, "grad_norm": 1.6457488125315527, "learning_rate": 4.041553032739096e-06, "loss": 0.9403, "step": 390 }, { "epoch": 0.36455929856945085, "grad_norm": 1.5924111716460287, "learning_rate": 4.009613052192759e-06, "loss": 0.9623, "step": 395 }, { "epoch": 0.36917397323488693, "grad_norm": 1.5590742628266465, "learning_rate": 3.977280446537893e-06, "loss": 0.9442, "step": 400 }, { "epoch": 0.36917397323488693, "eval_loss": 0.9574902653694153, "eval_runtime": 624.3544, "eval_samples_per_second": 24.585, "eval_steps_per_second": 0.192, "step": 400 }, { "epoch": 0.373788647900323, "grad_norm": 1.60263749640404, "learning_rate": 3.9445636249445015e-06, "loss": 0.9491, "step": 405 }, { "epoch": 0.3784033225657591, "grad_norm": 1.638940603978051, "learning_rate": 3.91147109651074e-06, "loss": 0.9383, "step": 410 }, { "epoch": 0.3830179972311952, "grad_norm": 1.755136588103001, "learning_rate": 3.87801146804982e-06, "loss": 0.9291, "step": 415 }, { "epoch": 0.3876326718966313, "grad_norm": 1.7055552089268673, "learning_rate": 3.8441934418515356e-06, "loss": 0.9718, "step": 420 }, { "epoch": 0.3922473465620674, "grad_norm": 1.6117946383083048, "learning_rate": 3.810025813418939e-06, "loss": 0.9441, "step": 425 }, { "epoch": 0.39686202122750347, "grad_norm": 1.696657745451938, "learning_rate": 3.775517469180775e-06, "loss": 0.9514, "step": 430 }, { "epoch": 0.40147669589293955, "grad_norm": 1.4828726478668206, "learning_rate": 3.7406773841802716e-06, "loss": 0.931, "step": 435 }, { "epoch": 0.40609137055837563, "grad_norm": 1.5867959288490177, "learning_rate": 3.705514619740883e-06, "loss": 0.9469, "step": 440 }, { "epoch": 0.4107060452238117, "grad_norm": 1.5387669296506232, "learning_rate": 3.670038321109586e-06, "loss": 0.9472, "step": 445 }, { "epoch": 0.4153207198892478, "grad_norm": 1.6036856664700139, "learning_rate": 3.634257715078361e-06, "loss": 0.9558, "step": 450 }, { "epoch": 0.41993539455468387, "grad_norm": 1.6323062322644255, "learning_rate": 3.5981821075844503e-06, "loss": 0.9367, "step": 455 }, { "epoch": 0.42455006922012, "grad_norm": 1.5312977997668584, "learning_rate": 3.5618208812900446e-06, "loss": 0.9339, "step": 460 }, { "epoch": 0.4291647438855561, "grad_norm": 1.5671296307753086, "learning_rate": 3.5251834931420096e-06, "loss": 0.9483, "step": 465 }, { "epoch": 0.43377941855099217, "grad_norm": 1.5643719028299428, "learning_rate": 3.4882794719122882e-06, "loss": 0.9567, "step": 470 }, { "epoch": 0.43839409321642825, "grad_norm": 1.5405866169718432, "learning_rate": 3.4511184157196297e-06, "loss": 0.9526, "step": 475 }, { "epoch": 0.44300876788186433, "grad_norm": 1.592504155638412, "learning_rate": 3.4137099895332794e-06, "loss": 0.9422, "step": 480 }, { "epoch": 0.4476234425473004, "grad_norm": 1.709428108076362, "learning_rate": 3.3760639226592804e-06, "loss": 0.9634, "step": 485 }, { "epoch": 0.4522381172127365, "grad_norm": 1.5945154883384607, "learning_rate": 3.3381900062100434e-06, "loss": 0.9395, "step": 490 }, { "epoch": 0.45685279187817257, "grad_norm": 1.540282704340126, "learning_rate": 3.300098090557846e-06, "loss": 0.9257, "step": 495 }, { "epoch": 0.46146746654360865, "grad_norm": 1.5625000616216127, "learning_rate": 3.261798082772909e-06, "loss": 0.9595, "step": 500 }, { "epoch": 0.46146746654360865, "eval_loss": 0.9511266350746155, "eval_runtime": 624.3502, "eval_samples_per_second": 24.586, "eval_steps_per_second": 0.192, "step": 500 }, { "epoch": 0.4660821412090448, "grad_norm": 1.531429813118766, "learning_rate": 3.223299944046739e-06, "loss": 0.9345, "step": 505 }, { "epoch": 0.47069681587448087, "grad_norm": 1.5789359710981146, "learning_rate": 3.184613687101388e-06, "loss": 0.9489, "step": 510 }, { "epoch": 0.47531149053991695, "grad_norm": 1.5205168738843033, "learning_rate": 3.1457493735853075e-06, "loss": 0.9572, "step": 515 }, { "epoch": 0.47992616520535303, "grad_norm": 1.6218298931024684, "learning_rate": 3.1067171114564887e-06, "loss": 0.9481, "step": 520 }, { "epoch": 0.4845408398707891, "grad_norm": 1.6417074181294504, "learning_rate": 3.0675270523535466e-06, "loss": 0.9318, "step": 525 }, { "epoch": 0.4891555145362252, "grad_norm": 1.460789566333901, "learning_rate": 3.028189388955449e-06, "loss": 0.9438, "step": 530 }, { "epoch": 0.49377018920166127, "grad_norm": 1.557851001181342, "learning_rate": 2.9887143523305756e-06, "loss": 0.9281, "step": 535 }, { "epoch": 0.49838486386709735, "grad_norm": 1.6248209544628205, "learning_rate": 2.9491122092757825e-06, "loss": 0.9395, "step": 540 }, { "epoch": 0.5029995385325334, "grad_norm": 1.5476409163555114, "learning_rate": 2.9093932596461822e-06, "loss": 0.9299, "step": 545 }, { "epoch": 0.5076142131979695, "grad_norm": 1.5437406288809876, "learning_rate": 2.8695678336763254e-06, "loss": 0.9358, "step": 550 }, { "epoch": 0.5122288878634056, "grad_norm": 1.6371553022227263, "learning_rate": 2.8296462892934784e-06, "loss": 0.9455, "step": 555 }, { "epoch": 0.5168435625288417, "grad_norm": 1.551651736499346, "learning_rate": 2.7896390094236976e-06, "loss": 0.9341, "step": 560 }, { "epoch": 0.5214582371942778, "grad_norm": 1.5008925482067859, "learning_rate": 2.749556399291407e-06, "loss": 0.9337, "step": 565 }, { "epoch": 0.5260729118597139, "grad_norm": 1.5792225714429848, "learning_rate": 2.7094088837131717e-06, "loss": 0.9309, "step": 570 }, { "epoch": 0.53068758652515, "grad_norm": 1.5264458652211905, "learning_rate": 2.6692069043863777e-06, "loss": 0.932, "step": 575 }, { "epoch": 0.5353022611905861, "grad_norm": 1.602337588848531, "learning_rate": 2.6289609171735216e-06, "loss": 0.9314, "step": 580 }, { "epoch": 0.5399169358560222, "grad_norm": 1.618050075538486, "learning_rate": 2.5886813893828144e-06, "loss": 0.9319, "step": 585 }, { "epoch": 0.5445316105214583, "grad_norm": 1.5144862695457302, "learning_rate": 2.5483787970458114e-06, "loss": 0.9244, "step": 590 }, { "epoch": 0.5491462851868943, "grad_norm": 1.559900327478649, "learning_rate": 2.50806362219277e-06, "loss": 0.9478, "step": 595 }, { "epoch": 0.5537609598523304, "grad_norm": 1.450110651237953, "learning_rate": 2.4677463501264513e-06, "loss": 0.9288, "step": 600 }, { "epoch": 0.5537609598523304, "eval_loss": 0.9444681406021118, "eval_runtime": 927.5923, "eval_samples_per_second": 16.548, "eval_steps_per_second": 0.129, "step": 600 }, { "epoch": 0.5583756345177665, "grad_norm": 1.5437479875464375, "learning_rate": 2.427437466695062e-06, "loss": 0.9371, "step": 605 }, { "epoch": 0.5629903091832026, "grad_norm": 1.4937755492391802, "learning_rate": 2.3871474555650704e-06, "loss": 0.9474, "step": 610 }, { "epoch": 0.5676049838486387, "grad_norm": 1.5424277719142725, "learning_rate": 2.3468867954945728e-06, "loss": 0.9303, "step": 615 }, { "epoch": 0.5722196585140747, "grad_norm": 1.558684456225011, "learning_rate": 2.3066659576079443e-06, "loss": 0.9432, "step": 620 }, { "epoch": 0.5768343331795108, "grad_norm": 1.4672708994343564, "learning_rate": 2.266495402672479e-06, "loss": 0.9298, "step": 625 }, { "epoch": 0.5814490078449469, "grad_norm": 1.5812210723829734, "learning_rate": 2.2263855783777114e-06, "loss": 0.9367, "step": 630 }, { "epoch": 0.586063682510383, "grad_norm": 1.5496204274404133, "learning_rate": 2.1863469166181596e-06, "loss": 0.9439, "step": 635 }, { "epoch": 0.5906783571758191, "grad_norm": 1.9271804906757715, "learning_rate": 2.1463898307801475e-06, "loss": 0.9449, "step": 640 }, { "epoch": 0.5952930318412551, "grad_norm": 1.5353867041625484, "learning_rate": 2.106524713033473e-06, "loss": 0.9232, "step": 645 }, { "epoch": 0.5999077065066912, "grad_norm": 1.55884597473951, "learning_rate": 2.0667619316285695e-06, "loss": 0.9427, "step": 650 }, { "epoch": 0.6045223811721273, "grad_norm": 1.5099166906913315, "learning_rate": 2.027111828199897e-06, "loss": 0.9373, "step": 655 }, { "epoch": 0.6091370558375635, "grad_norm": 1.5119802066459846, "learning_rate": 1.987584715076262e-06, "loss": 0.9264, "step": 660 }, { "epoch": 0.6137517305029996, "grad_norm": 1.523286025026604, "learning_rate": 1.9481908725987457e-06, "loss": 0.9361, "step": 665 }, { "epoch": 0.6183664051684357, "grad_norm": 1.5442251368295816, "learning_rate": 1.908940546446967e-06, "loss": 0.9389, "step": 670 }, { "epoch": 0.6229810798338717, "grad_norm": 1.5117545218990087, "learning_rate": 1.8698439449743448e-06, "loss": 0.9323, "step": 675 }, { "epoch": 0.6275957544993078, "grad_norm": 1.515458116184657, "learning_rate": 1.8309112365530829e-06, "loss": 0.9317, "step": 680 }, { "epoch": 0.6322104291647439, "grad_norm": 1.5539352126328116, "learning_rate": 1.792152546929541e-06, "loss": 0.9331, "step": 685 }, { "epoch": 0.63682510383018, "grad_norm": 1.4887363734079302, "learning_rate": 1.753577956590701e-06, "loss": 0.9287, "step": 690 }, { "epoch": 0.6414397784956161, "grad_norm": 1.5084549901000108, "learning_rate": 1.7151974981424001e-06, "loss": 0.9436, "step": 695 }, { "epoch": 0.6460544531610521, "grad_norm": 1.555620567622698, "learning_rate": 1.6770211537000162e-06, "loss": 0.9131, "step": 700 }, { "epoch": 0.6460544531610521, "eval_loss": 0.9387167096138, "eval_runtime": 624.2192, "eval_samples_per_second": 24.591, "eval_steps_per_second": 0.192, "step": 700 }, { "epoch": 0.6506691278264882, "grad_norm": 1.5415605370252414, "learning_rate": 1.6390588522922885e-06, "loss": 0.9108, "step": 705 }, { "epoch": 0.6552838024919243, "grad_norm": 1.465810880705891, "learning_rate": 1.601320467278942e-06, "loss": 0.9161, "step": 710 }, { "epoch": 0.6598984771573604, "grad_norm": 1.5655061299032593, "learning_rate": 1.563815813782793e-06, "loss": 0.9306, "step": 715 }, { "epoch": 0.6645131518227965, "grad_norm": 1.5149376469704838, "learning_rate": 1.526554646136998e-06, "loss": 0.9313, "step": 720 }, { "epoch": 0.6691278264882325, "grad_norm": 1.5174533273111726, "learning_rate": 1.4895466553481164e-06, "loss": 0.9193, "step": 725 }, { "epoch": 0.6737425011536686, "grad_norm": 1.5170716383670275, "learning_rate": 1.4528014665756426e-06, "loss": 0.9219, "step": 730 }, { "epoch": 0.6783571758191047, "grad_norm": 1.4428914011764975, "learning_rate": 1.4163286366286583e-06, "loss": 0.9265, "step": 735 }, { "epoch": 0.6829718504845408, "grad_norm": 1.5297889826974815, "learning_rate": 1.3801376514802728e-06, "loss": 0.9286, "step": 740 }, { "epoch": 0.687586525149977, "grad_norm": 1.4535828054395095, "learning_rate": 1.3442379238004735e-06, "loss": 0.9291, "step": 745 }, { "epoch": 0.6922011998154131, "grad_norm": 1.5004673906000858, "learning_rate": 1.3086387905080552e-06, "loss": 0.9212, "step": 750 }, { "epoch": 0.6968158744808491, "grad_norm": 1.6144659493936522, "learning_rate": 1.2733495103422356e-06, "loss": 0.9357, "step": 755 }, { "epoch": 0.7014305491462852, "grad_norm": 1.5182833736012866, "learning_rate": 1.2383792614546184e-06, "loss": 0.9291, "step": 760 }, { "epoch": 0.7060452238117213, "grad_norm": 1.4811237244228022, "learning_rate": 1.2037371390221075e-06, "loss": 0.9314, "step": 765 }, { "epoch": 0.7106598984771574, "grad_norm": 1.590380067709118, "learning_rate": 1.1694321528814012e-06, "loss": 0.9253, "step": 770 }, { "epoch": 0.7152745731425935, "grad_norm": 1.4752434806101777, "learning_rate": 1.1354732251856893e-06, "loss": 0.9168, "step": 775 }, { "epoch": 0.7198892478080295, "grad_norm": 1.5101231637035002, "learning_rate": 1.1018691880841439e-06, "loss": 0.9187, "step": 780 }, { "epoch": 0.7245039224734656, "grad_norm": 1.4280774491608867, "learning_rate": 1.0686287814248331e-06, "loss": 0.9192, "step": 785 }, { "epoch": 0.7291185971389017, "grad_norm": 1.4993688122643356, "learning_rate": 1.035760650481623e-06, "loss": 0.9301, "step": 790 }, { "epoch": 0.7337332718043378, "grad_norm": 1.4945371271836858, "learning_rate": 1.0032733437056972e-06, "loss": 0.9134, "step": 795 }, { "epoch": 0.7383479464697739, "grad_norm": 1.4373254469237389, "learning_rate": 9.711753105022373e-07, "loss": 0.9053, "step": 800 }, { "epoch": 0.7383479464697739, "eval_loss": 0.9340656995773315, "eval_runtime": 640.3734, "eval_samples_per_second": 23.97, "eval_steps_per_second": 0.187, "step": 800 }, { "epoch": 0.7429626211352099, "grad_norm": 1.4974834920489408, "learning_rate": 9.394748990328822e-07, "loss": 0.936, "step": 805 }, { "epoch": 0.747577295800646, "grad_norm": 1.519981282630895, "learning_rate": 9.081803540445072e-07, "loss": 0.9574, "step": 810 }, { "epoch": 0.7521919704660821, "grad_norm": 1.4743451364800968, "learning_rate": 8.772998147249007e-07, "loss": 0.9362, "step": 815 }, { "epoch": 0.7568066451315182, "grad_norm": 1.504464804764378, "learning_rate": 8.468413125859037e-07, "loss": 0.9341, "step": 820 }, { "epoch": 0.7614213197969543, "grad_norm": 1.5181624277683528, "learning_rate": 8.168127693745423e-07, "loss": 0.9179, "step": 825 }, { "epoch": 0.7660359944623903, "grad_norm": 1.4986157582249817, "learning_rate": 7.87221995012718e-07, "loss": 0.9251, "step": 830 }, { "epoch": 0.7706506691278265, "grad_norm": 1.5331062050181175, "learning_rate": 7.580766855659727e-07, "loss": 0.9346, "step": 835 }, { "epoch": 0.7752653437932626, "grad_norm": 1.4653042645263943, "learning_rate": 7.293844212418769e-07, "loss": 0.9469, "step": 840 }, { "epoch": 0.7798800184586987, "grad_norm": 1.4781975909182412, "learning_rate": 7.011526644185401e-07, "loss": 0.9157, "step": 845 }, { "epoch": 0.7844946931241348, "grad_norm": 1.4828206936520378, "learning_rate": 6.733887577037713e-07, "loss": 0.9217, "step": 850 }, { "epoch": 0.7891093677895709, "grad_norm": 1.5691486927339433, "learning_rate": 6.460999220253919e-07, "loss": 0.9227, "step": 855 }, { "epoch": 0.7937240424550069, "grad_norm": 1.50166727015388, "learning_rate": 6.192932547531904e-07, "loss": 0.9184, "step": 860 }, { "epoch": 0.798338717120443, "grad_norm": 1.4556654028230263, "learning_rate": 5.929757278530179e-07, "loss": 0.9198, "step": 865 }, { "epoch": 0.8029533917858791, "grad_norm": 1.4486962123298668, "learning_rate": 5.671541860734933e-07, "loss": 0.9248, "step": 870 }, { "epoch": 0.8075680664513152, "grad_norm": 1.448405322107658, "learning_rate": 5.418353451658026e-07, "loss": 0.9255, "step": 875 }, { "epoch": 0.8121827411167513, "grad_norm": 1.4852528389327857, "learning_rate": 5.170257901370391e-07, "loss": 0.9201, "step": 880 }, { "epoch": 0.8167974157821873, "grad_norm": 1.4489675730010414, "learning_rate": 4.927319735375591e-07, "loss": 0.9205, "step": 885 }, { "epoch": 0.8214120904476234, "grad_norm": 1.4888385189112359, "learning_rate": 4.689602137827762e-07, "loss": 0.9156, "step": 890 }, { "epoch": 0.8260267651130595, "grad_norm": 1.4598980376028965, "learning_rate": 4.457166935098511e-07, "loss": 0.9369, "step": 895 }, { "epoch": 0.8306414397784956, "grad_norm": 1.48422793438408, "learning_rate": 4.230074579696883e-07, "loss": 0.9221, "step": 900 }, { "epoch": 0.8306414397784956, "eval_loss": 0.9309639930725098, "eval_runtime": 623.2765, "eval_samples_per_second": 24.628, "eval_steps_per_second": 0.193, "step": 900 }, { "epoch": 0.8352561144439317, "grad_norm": 1.4610057134597918, "learning_rate": 4.0083841345466944e-07, "loss": 0.9251, "step": 905 }, { "epoch": 0.8398707891093677, "grad_norm": 1.4597814580939314, "learning_rate": 3.7921532576252305e-07, "loss": 0.9148, "step": 910 }, { "epoch": 0.8444854637748038, "grad_norm": 1.5030362710632983, "learning_rate": 3.5814381869674064e-07, "loss": 0.927, "step": 915 }, { "epoch": 0.84910013844024, "grad_norm": 1.4283564011576262, "learning_rate": 3.3762937260391484e-07, "loss": 0.9185, "step": 920 }, { "epoch": 0.8537148131056761, "grad_norm": 1.4666353451320664, "learning_rate": 3.1767732294839785e-07, "loss": 0.9298, "step": 925 }, { "epoch": 0.8583294877711122, "grad_norm": 1.465141344334021, "learning_rate": 2.9829285892463143e-07, "loss": 0.92, "step": 930 }, { "epoch": 0.8629441624365483, "grad_norm": 1.4780301160828582, "learning_rate": 2.7948102210752894e-07, "loss": 0.9104, "step": 935 }, { "epoch": 0.8675588371019843, "grad_norm": 1.4443139101470652, "learning_rate": 2.612467051412412e-07, "loss": 0.9028, "step": 940 }, { "epoch": 0.8721735117674204, "grad_norm": 1.448117060556846, "learning_rate": 2.4359465046666725e-07, "loss": 0.9276, "step": 945 }, { "epoch": 0.8767881864328565, "grad_norm": 1.4493259447942397, "learning_rate": 2.2652944908801878e-07, "loss": 0.9329, "step": 950 }, { "epoch": 0.8814028610982926, "grad_norm": 1.421093156453384, "learning_rate": 2.1005553937878094e-07, "loss": 0.9005, "step": 955 }, { "epoch": 0.8860175357637287, "grad_norm": 1.4355748172215945, "learning_rate": 1.9417720592736445e-07, "loss": 0.9266, "step": 960 }, { "epoch": 0.8906322104291647, "grad_norm": 1.4852268716913266, "learning_rate": 1.7889857842275483e-07, "loss": 0.9271, "step": 965 }, { "epoch": 0.8952468850946008, "grad_norm": 1.4934907730868012, "learning_rate": 1.642236305804512e-07, "loss": 0.9292, "step": 970 }, { "epoch": 0.8998615597600369, "grad_norm": 1.4220797215846808, "learning_rate": 1.50156179108967e-07, "loss": 0.908, "step": 975 }, { "epoch": 0.904476234425473, "grad_norm": 1.495083088875934, "learning_rate": 1.3669988271717032e-07, "loss": 0.9355, "step": 980 }, { "epoch": 0.9090909090909091, "grad_norm": 1.5182197209406392, "learning_rate": 1.2385824116271262e-07, "loss": 0.9212, "step": 985 }, { "epoch": 0.9137055837563451, "grad_norm": 1.4706555461385944, "learning_rate": 1.1163459434180318e-07, "loss": 0.9278, "step": 990 }, { "epoch": 0.9183202584217812, "grad_norm": 1.4386080105822332, "learning_rate": 1.0003212142055435e-07, "loss": 0.9293, "step": 995 }, { "epoch": 0.9229349330872173, "grad_norm": 1.424726265338342, "learning_rate": 8.905384000813727e-08, "loss": 0.9109, "step": 1000 }, { "epoch": 0.9229349330872173, "eval_loss": 0.9295699596405029, "eval_runtime": 1057.9145, "eval_samples_per_second": 14.51, "eval_steps_per_second": 0.113, "step": 1000 }, { "epoch": 0.9275496077526535, "grad_norm": 1.4832613038564573, "learning_rate": 7.870260537194918e-08, "loss": 0.9339, "step": 1005 }, { "epoch": 0.9321642824180896, "grad_norm": 1.497006066691023, "learning_rate": 6.898110969500799e-08, "loss": 0.9326, "step": 1010 }, { "epoch": 0.9367789570835257, "grad_norm": 1.4677754148709299, "learning_rate": 5.989188137575957e-08, "loss": 0.9159, "step": 1015 }, { "epoch": 0.9413936317489617, "grad_norm": 1.425614196561809, "learning_rate": 5.143728437048412e-08, "loss": 0.918, "step": 1020 }, { "epoch": 0.9460083064143978, "grad_norm": 1.4462912456963855, "learning_rate": 4.361951757847066e-08, "loss": 0.9225, "step": 1025 }, { "epoch": 0.9506229810798339, "grad_norm": 1.4641416653725123, "learning_rate": 3.64406142701193e-08, "loss": 0.9105, "step": 1030 }, { "epoch": 0.95523765574527, "grad_norm": 1.4748777780105025, "learning_rate": 2.990244155812255e-08, "loss": 0.9339, "step": 1035 }, { "epoch": 0.9598523304107061, "grad_norm": 1.4503913713599144, "learning_rate": 2.400669991186072e-08, "loss": 0.9106, "step": 1040 }, { "epoch": 0.9644670050761421, "grad_norm": 1.4765958461425874, "learning_rate": 1.875492271513679e-08, "loss": 0.9217, "step": 1045 }, { "epoch": 0.9690816797415782, "grad_norm": 1.4979865845055302, "learning_rate": 1.4148475867370448e-08, "loss": 0.9338, "step": 1050 }, { "epoch": 0.9736963544070143, "grad_norm": 1.45168993774126, "learning_rate": 1.0188557428350288e-08, "loss": 0.9317, "step": 1055 }, { "epoch": 0.9783110290724504, "grad_norm": 1.4646703636820788, "learning_rate": 6.876197306637244e-09, "loss": 0.9211, "step": 1060 }, { "epoch": 0.9829257037378865, "grad_norm": 1.442565338087296, "learning_rate": 4.212256991704134e-09, "loss": 0.9187, "step": 1065 }, { "epoch": 0.9875403784033225, "grad_norm": 1.4318946555647172, "learning_rate": 2.1974293298762662e-09, "loss": 0.9281, "step": 1070 }, { "epoch": 0.9921550530687586, "grad_norm": 1.4598414281806804, "learning_rate": 8.32238344132541e-10, "loss": 0.9226, "step": 1075 }, { "epoch": 0.9967697277341947, "grad_norm": 1.4827039534649538, "learning_rate": 1.1703909781751731e-10, "loss": 0.9144, "step": 1080 }, { "epoch": 0.9995385325334564, "step": 1083, "total_flos": 453306954547200.0, "train_loss": 0.945578172706611, "train_runtime": 38930.1546, "train_samples_per_second": 3.562, "train_steps_per_second": 0.028 } ], "logging_steps": 5, "max_steps": 1083, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 453306954547200.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }