{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995385325334564, "eval_steps": 100, "global_step": 1083, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009229349330872173, "grad_norm": 9.222650171740101, "learning_rate": 9.174311926605506e-08, "loss": 1.1391, "step": 1 }, { "epoch": 0.0046146746654360865, "grad_norm": 8.813354760736841, "learning_rate": 4.587155963302753e-07, "loss": 1.1346, "step": 5 }, { "epoch": 0.009229349330872173, "grad_norm": 5.082254385960609, "learning_rate": 9.174311926605506e-07, "loss": 1.0934, "step": 10 }, { "epoch": 0.01384402399630826, "grad_norm": 3.211536911547781, "learning_rate": 1.3761467889908258e-06, "loss": 1.0264, "step": 15 }, { "epoch": 0.018458698661744346, "grad_norm": 2.7181445543779494, "learning_rate": 1.8348623853211011e-06, "loss": 1.0199, "step": 20 }, { "epoch": 0.023073373327180433, "grad_norm": 2.3056635728248183, "learning_rate": 2.2935779816513764e-06, "loss": 1.0075, "step": 25 }, { "epoch": 0.02768804799261652, "grad_norm": 2.6315302800596365, "learning_rate": 2.7522935779816517e-06, "loss": 0.9732, "step": 30 }, { "epoch": 0.032302722658052604, "grad_norm": 2.0678619803720886, "learning_rate": 3.211009174311927e-06, "loss": 0.9964, "step": 35 }, { "epoch": 0.03691739732348869, "grad_norm": 3.005634145043816, "learning_rate": 3.6697247706422022e-06, "loss": 1.0015, "step": 40 }, { "epoch": 0.04153207198892478, "grad_norm": 2.495857277480277, "learning_rate": 4.128440366972478e-06, "loss": 0.979, "step": 45 }, { "epoch": 0.046146746654360866, "grad_norm": 2.442092473284365, "learning_rate": 4.587155963302753e-06, "loss": 0.9723, "step": 50 }, { "epoch": 0.050761421319796954, "grad_norm": 2.1284759016567136, "learning_rate": 5.045871559633028e-06, "loss": 0.9722, "step": 55 }, { "epoch": 0.05537609598523304, "grad_norm": 2.83564108049901, "learning_rate": 5.504587155963303e-06, "loss": 0.9852, "step": 60 }, { "epoch": 0.05999077065066913, "grad_norm": 2.0748240090940975, "learning_rate": 5.963302752293578e-06, "loss": 0.9845, "step": 65 }, { "epoch": 0.06460544531610521, "grad_norm": 2.9709334092183863, "learning_rate": 6.422018348623854e-06, "loss": 0.9615, "step": 70 }, { "epoch": 0.0692201199815413, "grad_norm": 2.184920669369361, "learning_rate": 6.880733944954129e-06, "loss": 0.9675, "step": 75 }, { "epoch": 0.07383479464697738, "grad_norm": 2.1738804546566817, "learning_rate": 7.3394495412844045e-06, "loss": 0.9589, "step": 80 }, { "epoch": 0.07844946931241348, "grad_norm": 2.223214536084122, "learning_rate": 7.79816513761468e-06, "loss": 0.977, "step": 85 }, { "epoch": 0.08306414397784956, "grad_norm": 1.990814551883062, "learning_rate": 8.256880733944956e-06, "loss": 0.9767, "step": 90 }, { "epoch": 0.08767881864328565, "grad_norm": 2.0700255239398375, "learning_rate": 8.71559633027523e-06, "loss": 0.9828, "step": 95 }, { "epoch": 0.09229349330872173, "grad_norm": 1.821385794436734, "learning_rate": 9.174311926605506e-06, "loss": 0.9896, "step": 100 }, { "epoch": 0.09229349330872173, "eval_loss": 0.9892959594726562, "eval_runtime": 901.1864, "eval_samples_per_second": 17.033, "eval_steps_per_second": 0.133, "step": 100 }, { "epoch": 0.09690816797415783, "grad_norm": 2.1363964627484155, "learning_rate": 9.633027522935781e-06, "loss": 1.015, "step": 105 }, { "epoch": 0.10152284263959391, "grad_norm": 1.9477999761197564, "learning_rate": 9.999973991131207e-06, "loss": 0.9983, "step": 110 }, { "epoch": 0.10613751730503, "grad_norm": 2.0399110459564516, "learning_rate": 9.999063709134502e-06, "loss": 1.0062, "step": 115 }, { "epoch": 0.11075219197046608, "grad_norm": 1.9098211009145907, "learning_rate": 9.996853254269984e-06, "loss": 0.9855, "step": 120 }, { "epoch": 0.11536686663590216, "grad_norm": 1.7833107957442789, "learning_rate": 9.99334320144e-06, "loss": 1.0093, "step": 125 }, { "epoch": 0.11998154130133826, "grad_norm": 1.7815777364460414, "learning_rate": 9.988534463550585e-06, "loss": 0.981, "step": 130 }, { "epoch": 0.12459621596677434, "grad_norm": 1.8694974323828528, "learning_rate": 9.982428291274047e-06, "loss": 1.0102, "step": 135 }, { "epoch": 0.12921089063221042, "grad_norm": 1.8237445983099982, "learning_rate": 9.975026272723677e-06, "loss": 1.006, "step": 140 }, { "epoch": 0.13382556529764653, "grad_norm": 1.8625032130850405, "learning_rate": 9.96633033304071e-06, "loss": 0.984, "step": 145 }, { "epoch": 0.1384402399630826, "grad_norm": 2.072572452539365, "learning_rate": 9.95634273389363e-06, "loss": 0.9712, "step": 150 }, { "epoch": 0.1430549146285187, "grad_norm": 2.0593175620343547, "learning_rate": 9.945066072889942e-06, "loss": 1.0211, "step": 155 }, { "epoch": 0.14766958929395477, "grad_norm": 2.0200152592170943, "learning_rate": 9.93250328290059e-06, "loss": 0.9775, "step": 160 }, { "epoch": 0.15228426395939088, "grad_norm": 1.875921122063454, "learning_rate": 9.918657631297153e-06, "loss": 0.9847, "step": 165 }, { "epoch": 0.15689893862482696, "grad_norm": 1.8527340279543534, "learning_rate": 9.90353271910206e-06, "loss": 0.9872, "step": 170 }, { "epoch": 0.16151361329026304, "grad_norm": 1.8908823561795056, "learning_rate": 9.887132480052028e-06, "loss": 0.9712, "step": 175 }, { "epoch": 0.16612828795569912, "grad_norm": 1.8296519425504616, "learning_rate": 9.869461179574963e-06, "loss": 0.9859, "step": 180 }, { "epoch": 0.1707429626211352, "grad_norm": 1.7574451917239733, "learning_rate": 9.850523413680588e-06, "loss": 0.9911, "step": 185 }, { "epoch": 0.1753576372865713, "grad_norm": 1.7389467033514425, "learning_rate": 9.830324107765104e-06, "loss": 0.9739, "step": 190 }, { "epoch": 0.17997231195200739, "grad_norm": 1.7050530099463075, "learning_rate": 9.808868515330169e-06, "loss": 0.9803, "step": 195 }, { "epoch": 0.18458698661744347, "grad_norm": 1.756999826792102, "learning_rate": 9.786162216616561e-06, "loss": 0.9838, "step": 200 }, { "epoch": 0.18458698661744347, "eval_loss": 0.9935115575790405, "eval_runtime": 751.2551, "eval_samples_per_second": 20.432, "eval_steps_per_second": 0.16, "step": 200 }, { "epoch": 0.18920166128287955, "grad_norm": 1.778225787399477, "learning_rate": 9.762211117152839e-06, "loss": 0.9754, "step": 205 }, { "epoch": 0.19381633594831565, "grad_norm": 1.6039072749046104, "learning_rate": 9.737021446219424e-06, "loss": 0.9577, "step": 210 }, { "epoch": 0.19843101061375173, "grad_norm": 1.8562210127250676, "learning_rate": 9.71059975522846e-06, "loss": 0.9708, "step": 215 }, { "epoch": 0.20304568527918782, "grad_norm": 1.602175292503488, "learning_rate": 9.682952916019907e-06, "loss": 0.9922, "step": 220 }, { "epoch": 0.2076603599446239, "grad_norm": 1.5958137821643004, "learning_rate": 9.654088119074282e-06, "loss": 0.9965, "step": 225 }, { "epoch": 0.21227503461006, "grad_norm": 1.8573846533843932, "learning_rate": 9.624012871642545e-06, "loss": 0.9753, "step": 230 }, { "epoch": 0.21688970927549608, "grad_norm": 1.640370350766078, "learning_rate": 9.592734995793583e-06, "loss": 0.9925, "step": 235 }, { "epoch": 0.22150438394093216, "grad_norm": 1.6631975332565576, "learning_rate": 9.560262626379824e-06, "loss": 0.9829, "step": 240 }, { "epoch": 0.22611905860636825, "grad_norm": 1.6393108832006937, "learning_rate": 9.52660420892149e-06, "loss": 0.9837, "step": 245 }, { "epoch": 0.23073373327180433, "grad_norm": 1.5999393221352023, "learning_rate": 9.49176849741007e-06, "loss": 0.9822, "step": 250 }, { "epoch": 0.23534840793724043, "grad_norm": 1.6562799165061401, "learning_rate": 9.455764552031546e-06, "loss": 0.9876, "step": 255 }, { "epoch": 0.23996308260267651, "grad_norm": 1.597298992345653, "learning_rate": 9.418601736809989e-06, "loss": 0.99, "step": 260 }, { "epoch": 0.2445777572681126, "grad_norm": 1.615615429575249, "learning_rate": 9.380289717172141e-06, "loss": 0.9944, "step": 265 }, { "epoch": 0.24919243193354867, "grad_norm": 1.6391764879253228, "learning_rate": 9.340838457433588e-06, "loss": 0.9941, "step": 270 }, { "epoch": 0.25380710659898476, "grad_norm": 1.7100035873468014, "learning_rate": 9.300258218207215e-06, "loss": 0.9785, "step": 275 }, { "epoch": 0.25842178126442084, "grad_norm": 2.095593373075969, "learning_rate": 9.258559553734597e-06, "loss": 0.9883, "step": 280 }, { "epoch": 0.26303645592985697, "grad_norm": 1.6273765059661172, "learning_rate": 9.215753309141e-06, "loss": 0.9772, "step": 285 }, { "epoch": 0.26765113059529305, "grad_norm": 1.6695304390680636, "learning_rate": 9.17185061761477e-06, "loss": 0.9881, "step": 290 }, { "epoch": 0.27226580526072913, "grad_norm": 1.6630409654600378, "learning_rate": 9.126862897511752e-06, "loss": 0.9785, "step": 295 }, { "epoch": 0.2768804799261652, "grad_norm": 1.6120537857852493, "learning_rate": 9.080801849385585e-06, "loss": 0.9853, "step": 300 }, { "epoch": 0.2768804799261652, "eval_loss": 0.9881044626235962, "eval_runtime": 648.9048, "eval_samples_per_second": 23.655, "eval_steps_per_second": 0.185, "step": 300 }, { "epoch": 0.2814951545916013, "grad_norm": 1.5682941966581576, "learning_rate": 9.033679452944574e-06, "loss": 0.9593, "step": 305 }, { "epoch": 0.2861098292570374, "grad_norm": 1.7509871813668145, "learning_rate": 8.98550796393597e-06, "loss": 0.9833, "step": 310 }, { "epoch": 0.29072450392247345, "grad_norm": 1.5617036787004468, "learning_rate": 8.936299910958461e-06, "loss": 0.9732, "step": 315 }, { "epoch": 0.29533917858790953, "grad_norm": 1.6332967875738513, "learning_rate": 8.886068092203684e-06, "loss": 0.9744, "step": 320 }, { "epoch": 0.2999538532533456, "grad_norm": 1.7354150495189573, "learning_rate": 8.834825572127632e-06, "loss": 0.9827, "step": 325 }, { "epoch": 0.30456852791878175, "grad_norm": 1.588233836037739, "learning_rate": 8.782585678052814e-06, "loss": 0.9962, "step": 330 }, { "epoch": 0.30918320258421783, "grad_norm": 1.5809425639553816, "learning_rate": 8.729361996702032e-06, "loss": 0.9903, "step": 335 }, { "epoch": 0.3137978772496539, "grad_norm": 1.603602687498353, "learning_rate": 8.675168370664706e-06, "loss": 0.981, "step": 340 }, { "epoch": 0.31841255191509, "grad_norm": 1.5782147372024207, "learning_rate": 8.620018894796654e-06, "loss": 0.9888, "step": 345 }, { "epoch": 0.3230272265805261, "grad_norm": 1.5566880880848435, "learning_rate": 8.56392791255424e-06, "loss": 0.9648, "step": 350 }, { "epoch": 0.32764190124596215, "grad_norm": 1.655119543708647, "learning_rate": 8.5069100122639e-06, "loss": 0.9761, "step": 355 }, { "epoch": 0.33225657591139823, "grad_norm": 1.9303593174940483, "learning_rate": 8.448980023327943e-06, "loss": 0.978, "step": 360 }, { "epoch": 0.3368712505768343, "grad_norm": 1.6875645085180389, "learning_rate": 8.390153012367692e-06, "loss": 0.9704, "step": 365 }, { "epoch": 0.3414859252422704, "grad_norm": 1.625080279365752, "learning_rate": 8.330444279304887e-06, "loss": 0.9933, "step": 370 }, { "epoch": 0.34610059990770653, "grad_norm": 1.5399521803126077, "learning_rate": 8.269869353382448e-06, "loss": 0.9768, "step": 375 }, { "epoch": 0.3507152745731426, "grad_norm": 1.5199082736016265, "learning_rate": 8.208443989125567e-06, "loss": 0.9702, "step": 380 }, { "epoch": 0.3553299492385787, "grad_norm": 1.6898798315423416, "learning_rate": 8.146184162244231e-06, "loss": 0.9508, "step": 385 }, { "epoch": 0.35994462390401477, "grad_norm": 1.5767639905295907, "learning_rate": 8.083106065478192e-06, "loss": 0.9617, "step": 390 }, { "epoch": 0.36455929856945085, "grad_norm": 1.6940722444661227, "learning_rate": 8.019226104385519e-06, "loss": 0.9827, "step": 395 }, { "epoch": 0.36917397323488693, "grad_norm": 1.542549043886959, "learning_rate": 7.954560893075785e-06, "loss": 0.9638, "step": 400 }, { "epoch": 0.36917397323488693, "eval_loss": 0.978050708770752, "eval_runtime": 631.4932, "eval_samples_per_second": 24.307, "eval_steps_per_second": 0.19, "step": 400 }, { "epoch": 0.373788647900323, "grad_norm": 1.5935524851120768, "learning_rate": 7.889127249889003e-06, "loss": 0.9673, "step": 405 }, { "epoch": 0.3784033225657591, "grad_norm": 1.497119604051003, "learning_rate": 7.82294219302148e-06, "loss": 0.9574, "step": 410 }, { "epoch": 0.3830179972311952, "grad_norm": 1.709877148318467, "learning_rate": 7.75602293609964e-06, "loss": 0.9483, "step": 415 }, { "epoch": 0.3876326718966313, "grad_norm": 1.5527701210281757, "learning_rate": 7.688386883703071e-06, "loss": 0.9904, "step": 420 }, { "epoch": 0.3922473465620674, "grad_norm": 1.651581387061671, "learning_rate": 7.620051626837878e-06, "loss": 0.9619, "step": 425 }, { "epoch": 0.39686202122750347, "grad_norm": 1.5992647254822225, "learning_rate": 7.55103493836155e-06, "loss": 0.9688, "step": 430 }, { "epoch": 0.40147669589293955, "grad_norm": 1.4883188672166754, "learning_rate": 7.481354768360543e-06, "loss": 0.9493, "step": 435 }, { "epoch": 0.40609137055837563, "grad_norm": 1.540282089268457, "learning_rate": 7.411029239481766e-06, "loss": 0.9656, "step": 440 }, { "epoch": 0.4107060452238117, "grad_norm": 1.57006447700181, "learning_rate": 7.340076642219172e-06, "loss": 0.963, "step": 445 }, { "epoch": 0.4153207198892478, "grad_norm": 1.6069506996123204, "learning_rate": 7.268515430156722e-06, "loss": 0.9729, "step": 450 }, { "epoch": 0.41993539455468387, "grad_norm": 1.5002386553771234, "learning_rate": 7.196364215168901e-06, "loss": 0.9521, "step": 455 }, { "epoch": 0.42455006922012, "grad_norm": 1.4646720705420488, "learning_rate": 7.123641762580089e-06, "loss": 0.9507, "step": 460 }, { "epoch": 0.4291647438855561, "grad_norm": 1.6149847953084777, "learning_rate": 7.050366986284019e-06, "loss": 0.9635, "step": 465 }, { "epoch": 0.43377941855099217, "grad_norm": 1.554087118833947, "learning_rate": 6.9765589438245765e-06, "loss": 0.9744, "step": 470 }, { "epoch": 0.43839409321642825, "grad_norm": 1.4890425150210163, "learning_rate": 6.9022368314392595e-06, "loss": 0.9694, "step": 475 }, { "epoch": 0.44300876788186433, "grad_norm": 1.5064709526763582, "learning_rate": 6.827419979066559e-06, "loss": 0.9577, "step": 480 }, { "epoch": 0.4476234425473004, "grad_norm": 1.5765372233113244, "learning_rate": 6.752127845318561e-06, "loss": 0.9777, "step": 485 }, { "epoch": 0.4522381172127365, "grad_norm": 1.502803008633363, "learning_rate": 6.676380012420087e-06, "loss": 0.9543, "step": 490 }, { "epoch": 0.45685279187817257, "grad_norm": 1.480113884170601, "learning_rate": 6.600196181115692e-06, "loss": 0.9413, "step": 495 }, { "epoch": 0.46146746654360865, "grad_norm": 1.518180225480369, "learning_rate": 6.523596165545818e-06, "loss": 0.9745, "step": 500 }, { "epoch": 0.46146746654360865, "eval_loss": 0.9680244326591492, "eval_runtime": 631.2665, "eval_samples_per_second": 24.316, "eval_steps_per_second": 0.19, "step": 500 }, { "epoch": 0.4660821412090448, "grad_norm": 1.5182752103495631, "learning_rate": 6.446599888093478e-06, "loss": 0.9493, "step": 505 }, { "epoch": 0.47069681587448087, "grad_norm": 1.6381019403661983, "learning_rate": 6.369227374202776e-06, "loss": 0.9655, "step": 510 }, { "epoch": 0.47531149053991695, "grad_norm": 1.480944986996014, "learning_rate": 6.291498747170615e-06, "loss": 0.973, "step": 515 }, { "epoch": 0.47992616520535303, "grad_norm": 1.515420838901527, "learning_rate": 6.213434222912977e-06, "loss": 0.9618, "step": 520 }, { "epoch": 0.4845408398707891, "grad_norm": 1.5303480066641486, "learning_rate": 6.135054104707093e-06, "loss": 0.9439, "step": 525 }, { "epoch": 0.4891555145362252, "grad_norm": 1.487348077399422, "learning_rate": 6.056378777910898e-06, "loss": 0.9565, "step": 530 }, { "epoch": 0.49377018920166127, "grad_norm": 1.5775060237163985, "learning_rate": 5.977428704661151e-06, "loss": 0.9407, "step": 535 }, { "epoch": 0.49838486386709735, "grad_norm": 1.6339716299433642, "learning_rate": 5.898224418551565e-06, "loss": 0.9532, "step": 540 }, { "epoch": 0.5029995385325334, "grad_norm": 1.5688231463762927, "learning_rate": 5.8187865192923644e-06, "loss": 0.9433, "step": 545 }, { "epoch": 0.5076142131979695, "grad_norm": 1.465375400099311, "learning_rate": 5.739135667352651e-06, "loss": 0.9494, "step": 550 }, { "epoch": 0.5122288878634056, "grad_norm": 1.590916658194805, "learning_rate": 5.659292578586957e-06, "loss": 0.9574, "step": 555 }, { "epoch": 0.5168435625288417, "grad_norm": 1.4545280778553549, "learning_rate": 5.579278018847395e-06, "loss": 0.9471, "step": 560 }, { "epoch": 0.5214582371942778, "grad_norm": 1.4258346370497963, "learning_rate": 5.499112798582814e-06, "loss": 0.9456, "step": 565 }, { "epoch": 0.5260729118597139, "grad_norm": 1.4412831745954977, "learning_rate": 5.418817767426343e-06, "loss": 0.9419, "step": 570 }, { "epoch": 0.53068758652515, "grad_norm": 1.5161519877588197, "learning_rate": 5.3384138087727555e-06, "loss": 0.9429, "step": 575 }, { "epoch": 0.5353022611905861, "grad_norm": 1.5365045658327852, "learning_rate": 5.257921834347043e-06, "loss": 0.9421, "step": 580 }, { "epoch": 0.5399169358560222, "grad_norm": 1.5096905798025197, "learning_rate": 5.177362778765629e-06, "loss": 0.9418, "step": 585 }, { "epoch": 0.5445316105214583, "grad_norm": 1.4931388458814197, "learning_rate": 5.096757594091623e-06, "loss": 0.9336, "step": 590 }, { "epoch": 0.5491462851868943, "grad_norm": 1.54787574418171, "learning_rate": 5.01612724438554e-06, "loss": 0.9594, "step": 595 }, { "epoch": 0.5537609598523304, "grad_norm": 1.4047855369892224, "learning_rate": 4.935492700252903e-06, "loss": 0.9396, "step": 600 }, { "epoch": 0.5537609598523304, "eval_loss": 0.9567832350730896, "eval_runtime": 1029.8073, "eval_samples_per_second": 14.906, "eval_steps_per_second": 0.117, "step": 600 }, { "epoch": 0.5583756345177665, "grad_norm": 1.532026589616403, "learning_rate": 4.854874933390124e-06, "loss": 0.9464, "step": 605 }, { "epoch": 0.5629903091832026, "grad_norm": 1.4445505648251018, "learning_rate": 4.774294911130141e-06, "loss": 0.9564, "step": 610 }, { "epoch": 0.5676049838486387, "grad_norm": 1.3806609000939527, "learning_rate": 4.6937735909891456e-06, "loss": 0.9401, "step": 615 }, { "epoch": 0.5722196585140747, "grad_norm": 1.4758989461572256, "learning_rate": 4.6133319152158886e-06, "loss": 0.9504, "step": 620 }, { "epoch": 0.5768343331795108, "grad_norm": 1.411552408489387, "learning_rate": 4.532990805344958e-06, "loss": 0.9382, "step": 625 }, { "epoch": 0.5814490078449469, "grad_norm": 1.4215358213534153, "learning_rate": 4.452771156755423e-06, "loss": 0.9457, "step": 630 }, { "epoch": 0.586063682510383, "grad_norm": 1.5194235066664028, "learning_rate": 4.372693833236319e-06, "loss": 0.9538, "step": 635 }, { "epoch": 0.5906783571758191, "grad_norm": 1.504251672226047, "learning_rate": 4.292779661560295e-06, "loss": 0.9541, "step": 640 }, { "epoch": 0.5952930318412551, "grad_norm": 1.477570181176633, "learning_rate": 4.213049426066946e-06, "loss": 0.932, "step": 645 }, { "epoch": 0.5999077065066912, "grad_norm": 1.4830574491662214, "learning_rate": 4.133523863257139e-06, "loss": 0.9499, "step": 650 }, { "epoch": 0.6045223811721273, "grad_norm": 1.4506011398131606, "learning_rate": 4.054223656399794e-06, "loss": 0.9432, "step": 655 }, { "epoch": 0.6091370558375635, "grad_norm": 1.5259712782334296, "learning_rate": 3.975169430152524e-06, "loss": 0.9336, "step": 660 }, { "epoch": 0.6137517305029996, "grad_norm": 1.4462086241646492, "learning_rate": 3.8963817451974915e-06, "loss": 0.9434, "step": 665 }, { "epoch": 0.6183664051684357, "grad_norm": 1.4279004522752485, "learning_rate": 3.817881092893934e-06, "loss": 0.9468, "step": 670 }, { "epoch": 0.6229810798338717, "grad_norm": 1.4169615015471333, "learning_rate": 3.7396878899486896e-06, "loss": 0.9416, "step": 675 }, { "epoch": 0.6275957544993078, "grad_norm": 1.4479507419493, "learning_rate": 3.6618224731061658e-06, "loss": 0.9388, "step": 680 }, { "epoch": 0.6322104291647439, "grad_norm": 1.4589224612269058, "learning_rate": 3.584305093859082e-06, "loss": 0.9384, "step": 685 }, { "epoch": 0.63682510383018, "grad_norm": 1.4183964058904668, "learning_rate": 3.507155913181402e-06, "loss": 0.9347, "step": 690 }, { "epoch": 0.6414397784956161, "grad_norm": 1.4542672740750342, "learning_rate": 3.4303949962848003e-06, "loss": 0.9494, "step": 695 }, { "epoch": 0.6460544531610521, "grad_norm": 1.466448569041173, "learning_rate": 3.3540423074000323e-06, "loss": 0.9176, "step": 700 }, { "epoch": 0.6460544531610521, "eval_loss": 0.9464961290359497, "eval_runtime": 630.6464, "eval_samples_per_second": 24.34, "eval_steps_per_second": 0.19, "step": 700 }, { "epoch": 0.6506691278264882, "grad_norm": 1.4112284829468522, "learning_rate": 3.278117704584577e-06, "loss": 0.9164, "step": 705 }, { "epoch": 0.6552838024919243, "grad_norm": 1.4108457010710305, "learning_rate": 3.202640934557884e-06, "loss": 0.9213, "step": 710 }, { "epoch": 0.6598984771573604, "grad_norm": 1.4189577161809213, "learning_rate": 3.127631627565586e-06, "loss": 0.9368, "step": 715 }, { "epoch": 0.6645131518227965, "grad_norm": 1.452002361321377, "learning_rate": 3.053109292273996e-06, "loss": 0.9372, "step": 720 }, { "epoch": 0.6691278264882325, "grad_norm": 1.4024056929659159, "learning_rate": 2.9790933106962328e-06, "loss": 0.925, "step": 725 }, { "epoch": 0.6737425011536686, "grad_norm": 1.4306957271780452, "learning_rate": 2.9056029331512853e-06, "loss": 0.9259, "step": 730 }, { "epoch": 0.6783571758191047, "grad_norm": 1.3638433368150233, "learning_rate": 2.8326572732573167e-06, "loss": 0.9298, "step": 735 }, { "epoch": 0.6829718504845408, "grad_norm": 1.438840741138399, "learning_rate": 2.7602753029605456e-06, "loss": 0.9312, "step": 740 }, { "epoch": 0.687586525149977, "grad_norm": 1.3538477302959735, "learning_rate": 2.688475847600947e-06, "loss": 0.9328, "step": 745 }, { "epoch": 0.6922011998154131, "grad_norm": 1.473692645733402, "learning_rate": 2.6172775810161104e-06, "loss": 0.9239, "step": 750 }, { "epoch": 0.6968158744808491, "grad_norm": 1.5623662932746458, "learning_rate": 2.546699020684471e-06, "loss": 0.9371, "step": 755 }, { "epoch": 0.7014305491462852, "grad_norm": 1.4016556607303827, "learning_rate": 2.4767585229092368e-06, "loss": 0.9308, "step": 760 }, { "epoch": 0.7060452238117213, "grad_norm": 1.3899751453383695, "learning_rate": 2.407474278044215e-06, "loss": 0.9332, "step": 765 }, { "epoch": 0.7106598984771574, "grad_norm": 1.4345839815007075, "learning_rate": 2.3388643057628025e-06, "loss": 0.9283, "step": 770 }, { "epoch": 0.7152745731425935, "grad_norm": 1.4014510611067819, "learning_rate": 2.2709464503713785e-06, "loss": 0.9196, "step": 775 }, { "epoch": 0.7198892478080295, "grad_norm": 1.4343098995975527, "learning_rate": 2.2037383761682877e-06, "loss": 0.9211, "step": 780 }, { "epoch": 0.7245039224734656, "grad_norm": 1.342137285764838, "learning_rate": 2.1372575628496662e-06, "loss": 0.9206, "step": 785 }, { "epoch": 0.7291185971389017, "grad_norm": 1.396553897850799, "learning_rate": 2.071521300963246e-06, "loss": 0.9324, "step": 790 }, { "epoch": 0.7337332718043378, "grad_norm": 1.4382708247406493, "learning_rate": 2.0065466874113944e-06, "loss": 0.9159, "step": 795 }, { "epoch": 0.7383479464697739, "grad_norm": 1.3548453547021793, "learning_rate": 1.9423506210044746e-06, "loss": 0.9067, "step": 800 }, { "epoch": 0.7383479464697739, "eval_loss": 0.9378637671470642, "eval_runtime": 663.1431, "eval_samples_per_second": 23.147, "eval_steps_per_second": 0.181, "step": 800 }, { "epoch": 0.7429626211352099, "grad_norm": 1.3924317571532645, "learning_rate": 1.8789497980657644e-06, "loss": 0.9387, "step": 805 }, { "epoch": 0.747577295800646, "grad_norm": 1.4165077817002125, "learning_rate": 1.8163607080890143e-06, "loss": 0.9593, "step": 810 }, { "epoch": 0.7521919704660821, "grad_norm": 1.4141204258596356, "learning_rate": 1.7545996294498013e-06, "loss": 0.9374, "step": 815 }, { "epoch": 0.7568066451315182, "grad_norm": 1.3739608338173115, "learning_rate": 1.6936826251718075e-06, "loss": 0.9345, "step": 820 }, { "epoch": 0.7614213197969543, "grad_norm": 1.4298339467420962, "learning_rate": 1.6336255387490846e-06, "loss": 0.9185, "step": 825 }, { "epoch": 0.7660359944623903, "grad_norm": 1.4280106425956387, "learning_rate": 1.574443990025436e-06, "loss": 0.9251, "step": 830 }, { "epoch": 0.7706506691278265, "grad_norm": 1.4182309514153764, "learning_rate": 1.5161533711319454e-06, "loss": 0.9337, "step": 835 }, { "epoch": 0.7752653437932626, "grad_norm": 1.3937433613798544, "learning_rate": 1.4587688424837538e-06, "loss": 0.9448, "step": 840 }, { "epoch": 0.7798800184586987, "grad_norm": 1.456151982504558, "learning_rate": 1.4023053288370803e-06, "loss": 0.9129, "step": 845 }, { "epoch": 0.7844946931241348, "grad_norm": 1.3853091041139631, "learning_rate": 1.3467775154075425e-06, "loss": 0.9213, "step": 850 }, { "epoch": 0.7891093677895709, "grad_norm": 1.4547278352297517, "learning_rate": 1.2921998440507838e-06, "loss": 0.9211, "step": 855 }, { "epoch": 0.7937240424550069, "grad_norm": 1.4019092163127755, "learning_rate": 1.2385865095063808e-06, "loss": 0.9189, "step": 860 }, { "epoch": 0.798338717120443, "grad_norm": 1.3640880739873995, "learning_rate": 1.1859514557060358e-06, "loss": 0.9184, "step": 865 }, { "epoch": 0.8029533917858791, "grad_norm": 1.4032390358058673, "learning_rate": 1.1343083721469867e-06, "loss": 0.9234, "step": 870 }, { "epoch": 0.8075680664513152, "grad_norm": 1.365688843862272, "learning_rate": 1.0836706903316052e-06, "loss": 0.9244, "step": 875 }, { "epoch": 0.8121827411167513, "grad_norm": 1.4146840493944073, "learning_rate": 1.0340515802740781e-06, "loss": 0.9197, "step": 880 }, { "epoch": 0.8167974157821873, "grad_norm": 1.3895617974460883, "learning_rate": 9.854639470751182e-07, "loss": 0.9194, "step": 885 }, { "epoch": 0.8214120904476234, "grad_norm": 1.4357287631638345, "learning_rate": 9.379204275655524e-07, "loss": 0.9156, "step": 890 }, { "epoch": 0.8260267651130595, "grad_norm": 1.3844689117152469, "learning_rate": 8.914333870197022e-07, "loss": 0.9355, "step": 895 }, { "epoch": 0.8306414397784956, "grad_norm": 1.3689079608163703, "learning_rate": 8.460149159393766e-07, "loss": 0.9221, "step": 900 }, { "epoch": 0.8306414397784956, "eval_loss": 0.9320199489593506, "eval_runtime": 630.5251, "eval_samples_per_second": 24.345, "eval_steps_per_second": 0.19, "step": 900 }, { "epoch": 0.8352561144439317, "grad_norm": 1.373835743305699, "learning_rate": 8.016768269093389e-07, "loss": 0.9225, "step": 905 }, { "epoch": 0.8398707891093677, "grad_norm": 1.3862642218872392, "learning_rate": 7.584306515250461e-07, "loss": 0.9118, "step": 910 }, { "epoch": 0.8444854637748038, "grad_norm": 1.427561354442013, "learning_rate": 7.162876373934813e-07, "loss": 0.9257, "step": 915 }, { "epoch": 0.84910013844024, "grad_norm": 1.3318980900547395, "learning_rate": 6.752587452078297e-07, "loss": 0.9168, "step": 920 }, { "epoch": 0.8537148131056761, "grad_norm": 1.426821326563428, "learning_rate": 6.353546458967957e-07, "loss": 0.9269, "step": 925 }, { "epoch": 0.8583294877711122, "grad_norm": 1.4130164875607825, "learning_rate": 5.965857178492629e-07, "loss": 0.9177, "step": 930 }, { "epoch": 0.8629441624365483, "grad_norm": 1.37338302698056, "learning_rate": 5.589620442150579e-07, "loss": 0.908, "step": 935 }, { "epoch": 0.8675588371019843, "grad_norm": 1.3675248953715173, "learning_rate": 5.224934102824824e-07, "loss": 0.9018, "step": 940 }, { "epoch": 0.8721735117674204, "grad_norm": 1.3837632491876184, "learning_rate": 4.871893009333345e-07, "loss": 0.9266, "step": 945 }, { "epoch": 0.8767881864328565, "grad_norm": 1.3573040010924988, "learning_rate": 4.5305889817603757e-07, "loss": 0.9303, "step": 950 }, { "epoch": 0.8814028610982926, "grad_norm": 1.3246603553725993, "learning_rate": 4.201110787575619e-07, "loss": 0.9003, "step": 955 }, { "epoch": 0.8860175357637287, "grad_norm": 1.3518273420371894, "learning_rate": 3.883544118547289e-07, "loss": 0.9223, "step": 960 }, { "epoch": 0.8906322104291647, "grad_norm": 1.3931507467693585, "learning_rate": 3.5779715684550966e-07, "loss": 0.9233, "step": 965 }, { "epoch": 0.8952468850946008, "grad_norm": 1.4917970308046273, "learning_rate": 3.284472611609024e-07, "loss": 0.9262, "step": 970 }, { "epoch": 0.8998615597600369, "grad_norm": 1.3457919919745878, "learning_rate": 3.00312358217934e-07, "loss": 0.9061, "step": 975 }, { "epoch": 0.904476234425473, "grad_norm": 1.399847886161994, "learning_rate": 2.7339976543434065e-07, "loss": 0.9303, "step": 980 }, { "epoch": 0.9090909090909091, "grad_norm": 1.4141769147196013, "learning_rate": 2.4771648232542524e-07, "loss": 0.9184, "step": 985 }, { "epoch": 0.9137055837563451, "grad_norm": 1.4105606054413926, "learning_rate": 2.2326918868360636e-07, "loss": 0.923, "step": 990 }, { "epoch": 0.9183202584217812, "grad_norm": 1.3589359885492762, "learning_rate": 2.000642428411087e-07, "loss": 0.9274, "step": 995 }, { "epoch": 0.9229349330872173, "grad_norm": 1.3455010064913675, "learning_rate": 1.7810768001627455e-07, "loss": 0.9087, "step": 1000 }, { "epoch": 0.9229349330872173, "eval_loss": 0.9292727112770081, "eval_runtime": 1081.8404, "eval_samples_per_second": 14.189, "eval_steps_per_second": 0.111, "step": 1000 }, { "epoch": 0.9275496077526535, "grad_norm": 1.3936222044373034, "learning_rate": 1.5740521074389837e-07, "loss": 0.9294, "step": 1005 }, { "epoch": 0.9321642824180896, "grad_norm": 1.6247273551888408, "learning_rate": 1.3796221939001598e-07, "loss": 0.9291, "step": 1010 }, { "epoch": 0.9367789570835257, "grad_norm": 1.386776941906468, "learning_rate": 1.1978376275151915e-07, "loss": 0.9116, "step": 1015 }, { "epoch": 0.9413936317489617, "grad_norm": 1.3387125746511046, "learning_rate": 1.0287456874096824e-07, "loss": 0.9137, "step": 1020 }, { "epoch": 0.9460083064143978, "grad_norm": 1.3731528656657377, "learning_rate": 8.723903515694132e-08, "loss": 0.9208, "step": 1025 }, { "epoch": 0.9506229810798339, "grad_norm": 1.3785247362215645, "learning_rate": 7.28812285402386e-08, "loss": 0.9062, "step": 1030 }, { "epoch": 0.95523765574527, "grad_norm": 1.4102762199212637, "learning_rate": 5.98048831162451e-08, "loss": 0.9286, "step": 1035 }, { "epoch": 0.9598523304107061, "grad_norm": 1.3925062428618178, "learning_rate": 4.801339982372144e-08, "loss": 0.9066, "step": 1040 }, { "epoch": 0.9644670050761421, "grad_norm": 1.3653301284738, "learning_rate": 3.750984543027358e-08, "loss": 0.9197, "step": 1045 }, { "epoch": 0.9690816797415782, "grad_norm": 1.3996435505145486, "learning_rate": 2.8296951734740896e-08, "loss": 0.9303, "step": 1050 }, { "epoch": 0.9736963544070143, "grad_norm": 1.367940583530902, "learning_rate": 2.0377114856700575e-08, "loss": 0.9286, "step": 1055 }, { "epoch": 0.9783110290724504, "grad_norm": 1.4028823114794313, "learning_rate": 1.3752394613274488e-08, "loss": 0.9169, "step": 1060 }, { "epoch": 0.9829257037378865, "grad_norm": 1.354080160910544, "learning_rate": 8.424513983408267e-09, "loss": 0.9156, "step": 1065 }, { "epoch": 0.9875403784033225, "grad_norm": 1.3348691278230316, "learning_rate": 4.3948586597525325e-09, "loss": 0.9243, "step": 1070 }, { "epoch": 0.9921550530687586, "grad_norm": 1.3552426228812648, "learning_rate": 1.664476688265082e-09, "loss": 0.9195, "step": 1075 }, { "epoch": 0.9967697277341947, "grad_norm": 1.3989954260059947, "learning_rate": 2.3407819563503463e-10, "loss": 0.9093, "step": 1080 }, { "epoch": 0.9995385325334564, "step": 1083, "total_flos": 453306954547200.0, "train_loss": 0.9547446678880179, "train_runtime": 38927.5133, "train_samples_per_second": 3.563, "train_steps_per_second": 0.028 } ], "logging_steps": 5, "max_steps": 1083, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 453306954547200.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }