{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 3846, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000260035754916301, "grad_norm": 5.25, "learning_rate": 0.0, "loss": 1.5204, "step": 1 }, { "epoch": 0.001300178774581505, "grad_norm": 5.6875, "learning_rate": 2.077922077922078e-07, "loss": 1.6753, "step": 5 }, { "epoch": 0.00260035754916301, "grad_norm": 8.5, "learning_rate": 4.675324675324676e-07, "loss": 1.5933, "step": 10 }, { "epoch": 0.0039005363237445147, "grad_norm": 18.25, "learning_rate": 7.272727272727273e-07, "loss": 1.5821, "step": 15 }, { "epoch": 0.00520071509832602, "grad_norm": 6.15625, "learning_rate": 9.870129870129872e-07, "loss": 1.5405, "step": 20 }, { "epoch": 0.006500893872907525, "grad_norm": 5.25, "learning_rate": 1.2467532467532468e-06, "loss": 1.6146, "step": 25 }, { "epoch": 0.0078010726474890294, "grad_norm": 4.15625, "learning_rate": 1.5064935064935066e-06, "loss": 1.5282, "step": 30 }, { "epoch": 0.009101251422070534, "grad_norm": 4.5625, "learning_rate": 1.7662337662337665e-06, "loss": 1.6264, "step": 35 }, { "epoch": 0.01040143019665204, "grad_norm": 4.75, "learning_rate": 2.0259740259740263e-06, "loss": 1.53, "step": 40 }, { "epoch": 0.011701608971233545, "grad_norm": 4.53125, "learning_rate": 2.285714285714286e-06, "loss": 1.5046, "step": 45 }, { "epoch": 0.01300178774581505, "grad_norm": 3.5, "learning_rate": 2.5454545454545456e-06, "loss": 1.5772, "step": 50 }, { "epoch": 0.014301966520396555, "grad_norm": 2.84375, "learning_rate": 2.8051948051948052e-06, "loss": 1.5543, "step": 55 }, { "epoch": 0.015602145294978059, "grad_norm": 2.84375, "learning_rate": 3.0649350649350653e-06, "loss": 1.5663, "step": 60 }, { "epoch": 0.016902324069559563, "grad_norm": 2.40625, "learning_rate": 3.324675324675325e-06, "loss": 1.5217, "step": 65 }, { "epoch": 0.01820250284414107, "grad_norm": 2.15625, "learning_rate": 3.584415584415585e-06, "loss": 1.4673, "step": 70 }, { "epoch": 0.019502681618722574, "grad_norm": 2.6875, "learning_rate": 3.844155844155845e-06, "loss": 1.5448, "step": 75 }, { "epoch": 0.02080286039330408, "grad_norm": 2.40625, "learning_rate": 4.103896103896105e-06, "loss": 1.515, "step": 80 }, { "epoch": 0.022103039167885585, "grad_norm": 2.15625, "learning_rate": 4.363636363636364e-06, "loss": 1.5072, "step": 85 }, { "epoch": 0.02340321794246709, "grad_norm": 2.140625, "learning_rate": 4.623376623376624e-06, "loss": 1.4343, "step": 90 }, { "epoch": 0.024703396717048593, "grad_norm": 1.984375, "learning_rate": 4.883116883116883e-06, "loss": 1.4416, "step": 95 }, { "epoch": 0.0260035754916301, "grad_norm": 2.109375, "learning_rate": 5.142857142857142e-06, "loss": 1.4342, "step": 100 }, { "epoch": 0.027303754266211604, "grad_norm": 1.9765625, "learning_rate": 5.4025974025974024e-06, "loss": 1.4649, "step": 105 }, { "epoch": 0.02860393304079311, "grad_norm": 1.671875, "learning_rate": 5.6623376623376625e-06, "loss": 1.3714, "step": 110 }, { "epoch": 0.029904111815374616, "grad_norm": 1.859375, "learning_rate": 5.9220779220779226e-06, "loss": 1.4987, "step": 115 }, { "epoch": 0.031204290589956118, "grad_norm": 1.734375, "learning_rate": 6.181818181818182e-06, "loss": 1.3712, "step": 120 }, { "epoch": 0.03250446936453762, "grad_norm": 1.765625, "learning_rate": 6.441558441558442e-06, "loss": 1.3568, "step": 125 }, { "epoch": 0.033804648139119126, "grad_norm": 1.84375, "learning_rate": 6.701298701298702e-06, "loss": 1.3799, "step": 130 }, { "epoch": 0.035104826913700635, "grad_norm": 1.71875, "learning_rate": 6.961038961038962e-06, "loss": 1.2913, "step": 135 }, { "epoch": 0.03640500568828214, "grad_norm": 1.734375, "learning_rate": 7.220779220779221e-06, "loss": 1.3968, "step": 140 }, { "epoch": 0.037705184462863646, "grad_norm": 1.734375, "learning_rate": 7.480519480519481e-06, "loss": 1.3804, "step": 145 }, { "epoch": 0.03900536323744515, "grad_norm": 1.796875, "learning_rate": 7.74025974025974e-06, "loss": 1.3666, "step": 150 }, { "epoch": 0.04030554201202665, "grad_norm": 1.796875, "learning_rate": 8.000000000000001e-06, "loss": 1.3184, "step": 155 }, { "epoch": 0.04160572078660816, "grad_norm": 1.671875, "learning_rate": 8.25974025974026e-06, "loss": 1.2788, "step": 160 }, { "epoch": 0.04290589956118966, "grad_norm": 1.828125, "learning_rate": 8.51948051948052e-06, "loss": 1.3343, "step": 165 }, { "epoch": 0.04420607833577117, "grad_norm": 1.859375, "learning_rate": 8.779220779220779e-06, "loss": 1.3861, "step": 170 }, { "epoch": 0.04550625711035267, "grad_norm": 1.7421875, "learning_rate": 9.03896103896104e-06, "loss": 1.2976, "step": 175 }, { "epoch": 0.04680643588493418, "grad_norm": 4.125, "learning_rate": 9.298701298701299e-06, "loss": 1.2974, "step": 180 }, { "epoch": 0.048106614659515684, "grad_norm": 1.6015625, "learning_rate": 9.558441558441558e-06, "loss": 1.2776, "step": 185 }, { "epoch": 0.049406793434097186, "grad_norm": 1.8671875, "learning_rate": 9.81818181818182e-06, "loss": 1.3431, "step": 190 }, { "epoch": 0.050706972208678695, "grad_norm": 1.6640625, "learning_rate": 1.0077922077922078e-05, "loss": 1.2846, "step": 195 }, { "epoch": 0.0520071509832602, "grad_norm": 1.7265625, "learning_rate": 1.0337662337662338e-05, "loss": 1.284, "step": 200 }, { "epoch": 0.0520071509832602, "eval_loss": 1.294779896736145, "eval_runtime": 577.8458, "eval_samples_per_second": 11.823, "eval_steps_per_second": 3.942, "step": 200 }, { "epoch": 0.05330732975784171, "grad_norm": 1.6328125, "learning_rate": 1.0597402597402597e-05, "loss": 1.3109, "step": 205 }, { "epoch": 0.05460750853242321, "grad_norm": 1.609375, "learning_rate": 1.0857142857142858e-05, "loss": 1.2279, "step": 210 }, { "epoch": 0.05590768730700471, "grad_norm": 1.4921875, "learning_rate": 1.1116883116883117e-05, "loss": 1.244, "step": 215 }, { "epoch": 0.05720786608158622, "grad_norm": 1.5625, "learning_rate": 1.1376623376623376e-05, "loss": 1.288, "step": 220 }, { "epoch": 0.05850804485616772, "grad_norm": 1.65625, "learning_rate": 1.1636363636363637e-05, "loss": 1.2457, "step": 225 }, { "epoch": 0.05980822363074923, "grad_norm": 1.515625, "learning_rate": 1.1896103896103896e-05, "loss": 1.2354, "step": 230 }, { "epoch": 0.06110840240533073, "grad_norm": 1.5, "learning_rate": 1.2155844155844157e-05, "loss": 1.2379, "step": 235 }, { "epoch": 0.062408581179912236, "grad_norm": 1.4765625, "learning_rate": 1.2415584415584416e-05, "loss": 1.2707, "step": 240 }, { "epoch": 0.06370875995449374, "grad_norm": 2.09375, "learning_rate": 1.2675324675324676e-05, "loss": 1.3032, "step": 245 }, { "epoch": 0.06500893872907525, "grad_norm": 1.640625, "learning_rate": 1.2935064935064937e-05, "loss": 1.2676, "step": 250 }, { "epoch": 0.06630911750365676, "grad_norm": 1.4140625, "learning_rate": 1.3194805194805196e-05, "loss": 1.311, "step": 255 }, { "epoch": 0.06760929627823825, "grad_norm": 1.6171875, "learning_rate": 1.3454545454545455e-05, "loss": 1.2658, "step": 260 }, { "epoch": 0.06890947505281976, "grad_norm": 1.40625, "learning_rate": 1.3714285714285716e-05, "loss": 1.2508, "step": 265 }, { "epoch": 0.07020965382740127, "grad_norm": 1.4296875, "learning_rate": 1.3974025974025975e-05, "loss": 1.2345, "step": 270 }, { "epoch": 0.07150983260198278, "grad_norm": 2.265625, "learning_rate": 1.4233766233766236e-05, "loss": 1.2223, "step": 275 }, { "epoch": 0.07281001137656427, "grad_norm": 1.40625, "learning_rate": 1.4493506493506495e-05, "loss": 1.2487, "step": 280 }, { "epoch": 0.07411019015114578, "grad_norm": 1.453125, "learning_rate": 1.4753246753246754e-05, "loss": 1.1905, "step": 285 }, { "epoch": 0.07541036892572729, "grad_norm": 1.5859375, "learning_rate": 1.5012987012987015e-05, "loss": 1.3005, "step": 290 }, { "epoch": 0.07671054770030879, "grad_norm": 1.4453125, "learning_rate": 1.5272727272727276e-05, "loss": 1.28, "step": 295 }, { "epoch": 0.0780107264748903, "grad_norm": 1.5625, "learning_rate": 1.5532467532467534e-05, "loss": 1.2268, "step": 300 }, { "epoch": 0.0793109052494718, "grad_norm": 1.59375, "learning_rate": 1.5792207792207795e-05, "loss": 1.2504, "step": 305 }, { "epoch": 0.0806110840240533, "grad_norm": 1.359375, "learning_rate": 1.6051948051948056e-05, "loss": 1.2441, "step": 310 }, { "epoch": 0.08191126279863481, "grad_norm": 1.4453125, "learning_rate": 1.6311688311688313e-05, "loss": 1.2219, "step": 315 }, { "epoch": 0.08321144157321632, "grad_norm": 1.578125, "learning_rate": 1.6571428571428574e-05, "loss": 1.2467, "step": 320 }, { "epoch": 0.08451162034779783, "grad_norm": 1.3984375, "learning_rate": 1.683116883116883e-05, "loss": 1.2455, "step": 325 }, { "epoch": 0.08581179912237932, "grad_norm": 1.5390625, "learning_rate": 1.7090909090909092e-05, "loss": 1.1899, "step": 330 }, { "epoch": 0.08711197789696083, "grad_norm": 1.34375, "learning_rate": 1.735064935064935e-05, "loss": 1.2434, "step": 335 }, { "epoch": 0.08841215667154234, "grad_norm": 1.3671875, "learning_rate": 1.761038961038961e-05, "loss": 1.2047, "step": 340 }, { "epoch": 0.08971233544612384, "grad_norm": 1.484375, "learning_rate": 1.7870129870129872e-05, "loss": 1.2164, "step": 345 }, { "epoch": 0.09101251422070535, "grad_norm": 1.6328125, "learning_rate": 1.812987012987013e-05, "loss": 1.2265, "step": 350 }, { "epoch": 0.09231269299528685, "grad_norm": 1.40625, "learning_rate": 1.838961038961039e-05, "loss": 1.243, "step": 355 }, { "epoch": 0.09361287176986836, "grad_norm": 1.6640625, "learning_rate": 1.864935064935065e-05, "loss": 1.2475, "step": 360 }, { "epoch": 0.09491305054444986, "grad_norm": 1.4453125, "learning_rate": 1.8909090909090912e-05, "loss": 1.2459, "step": 365 }, { "epoch": 0.09621322931903137, "grad_norm": 1.4375, "learning_rate": 1.916883116883117e-05, "loss": 1.2937, "step": 370 }, { "epoch": 0.09751340809361288, "grad_norm": 1.5234375, "learning_rate": 1.942857142857143e-05, "loss": 1.2286, "step": 375 }, { "epoch": 0.09881358686819437, "grad_norm": 1.390625, "learning_rate": 1.968831168831169e-05, "loss": 1.2505, "step": 380 }, { "epoch": 0.10011376564277588, "grad_norm": 1.5546875, "learning_rate": 1.994805194805195e-05, "loss": 1.2724, "step": 385 }, { "epoch": 0.10141394441735739, "grad_norm": 1.453125, "learning_rate": 1.9999934084726765e-05, "loss": 1.221, "step": 390 }, { "epoch": 0.10271412319193889, "grad_norm": 1.5625, "learning_rate": 1.9999666305418534e-05, "loss": 1.3084, "step": 395 }, { "epoch": 0.1040143019665204, "grad_norm": 1.453125, "learning_rate": 1.999919254788234e-05, "loss": 1.239, "step": 400 }, { "epoch": 0.1040143019665204, "eval_loss": 1.2270315885543823, "eval_runtime": 577.8859, "eval_samples_per_second": 11.822, "eval_steps_per_second": 3.942, "step": 400 }, { "epoch": 0.1053144807411019, "grad_norm": 1.4921875, "learning_rate": 1.999851282187689e-05, "loss": 1.2416, "step": 405 }, { "epoch": 0.10661465951568341, "grad_norm": 1.421875, "learning_rate": 1.9997627141403514e-05, "loss": 1.2305, "step": 410 }, { "epoch": 0.10791483829026491, "grad_norm": 1.4140625, "learning_rate": 1.9996535524705926e-05, "loss": 1.1928, "step": 415 }, { "epoch": 0.10921501706484642, "grad_norm": 1.5078125, "learning_rate": 1.999523799426979e-05, "loss": 1.2644, "step": 420 }, { "epoch": 0.11051519583942793, "grad_norm": 1.5078125, "learning_rate": 1.9993734576822303e-05, "loss": 1.2157, "step": 425 }, { "epoch": 0.11181537461400942, "grad_norm": 1.328125, "learning_rate": 1.9992025303331632e-05, "loss": 1.2573, "step": 430 }, { "epoch": 0.11311555338859093, "grad_norm": 1.390625, "learning_rate": 1.9990110209006258e-05, "loss": 1.2262, "step": 435 }, { "epoch": 0.11441573216317244, "grad_norm": 1.4375, "learning_rate": 1.998798933329427e-05, "loss": 1.2127, "step": 440 }, { "epoch": 0.11571591093775394, "grad_norm": 1.3828125, "learning_rate": 1.998566271988255e-05, "loss": 1.1727, "step": 445 }, { "epoch": 0.11701608971233544, "grad_norm": 1.3984375, "learning_rate": 1.998313041669587e-05, "loss": 1.2352, "step": 450 }, { "epoch": 0.11831626848691695, "grad_norm": 1.4609375, "learning_rate": 1.99803924758959e-05, "loss": 1.1963, "step": 455 }, { "epoch": 0.11961644726149846, "grad_norm": 1.4609375, "learning_rate": 1.9977448953880144e-05, "loss": 1.2486, "step": 460 }, { "epoch": 0.12091662603607996, "grad_norm": 1.34375, "learning_rate": 1.9974299911280766e-05, "loss": 1.1692, "step": 465 }, { "epoch": 0.12221680481066147, "grad_norm": 1.40625, "learning_rate": 1.9970945412963356e-05, "loss": 1.1331, "step": 470 }, { "epoch": 0.12351698358524298, "grad_norm": 1.4921875, "learning_rate": 1.9967385528025577e-05, "loss": 1.1713, "step": 475 }, { "epoch": 0.12481716235982447, "grad_norm": 1.3828125, "learning_rate": 1.9963620329795762e-05, "loss": 1.1785, "step": 480 }, { "epoch": 0.12611734113440598, "grad_norm": 1.3828125, "learning_rate": 1.9959649895831378e-05, "loss": 1.2178, "step": 485 }, { "epoch": 0.12741751990898748, "grad_norm": 1.375, "learning_rate": 1.9955474307917453e-05, "loss": 1.2124, "step": 490 }, { "epoch": 0.128717698683569, "grad_norm": 1.4765625, "learning_rate": 1.995109365206488e-05, "loss": 1.2345, "step": 495 }, { "epoch": 0.1300178774581505, "grad_norm": 1.453125, "learning_rate": 1.9946508018508637e-05, "loss": 1.2402, "step": 500 }, { "epoch": 0.131318056232732, "grad_norm": 1.390625, "learning_rate": 1.9941717501705945e-05, "loss": 1.241, "step": 505 }, { "epoch": 0.1326182350073135, "grad_norm": 1.4140625, "learning_rate": 1.9936722200334312e-05, "loss": 1.1926, "step": 510 }, { "epoch": 0.133918413781895, "grad_norm": 1.3671875, "learning_rate": 1.9931522217289512e-05, "loss": 1.1603, "step": 515 }, { "epoch": 0.1352185925564765, "grad_norm": 1.4375, "learning_rate": 1.9926117659683437e-05, "loss": 1.2126, "step": 520 }, { "epoch": 0.13651877133105803, "grad_norm": 1.359375, "learning_rate": 1.992050863884193e-05, "loss": 1.2186, "step": 525 }, { "epoch": 0.13781895010563952, "grad_norm": 1.4765625, "learning_rate": 1.991469527030246e-05, "loss": 1.1917, "step": 530 }, { "epoch": 0.13911912888022104, "grad_norm": 1.578125, "learning_rate": 1.9908677673811764e-05, "loss": 1.1894, "step": 535 }, { "epoch": 0.14041930765480254, "grad_norm": 1.40625, "learning_rate": 1.9902455973323352e-05, "loss": 1.1932, "step": 540 }, { "epoch": 0.14171948642938403, "grad_norm": 1.4140625, "learning_rate": 1.9896030296994996e-05, "loss": 1.2742, "step": 545 }, { "epoch": 0.14301966520396556, "grad_norm": 1.3359375, "learning_rate": 1.988940077718605e-05, "loss": 1.234, "step": 550 }, { "epoch": 0.14431984397854705, "grad_norm": 1.390625, "learning_rate": 1.9882567550454747e-05, "loss": 1.2243, "step": 555 }, { "epoch": 0.14562002275312855, "grad_norm": 1.4765625, "learning_rate": 1.9875530757555377e-05, "loss": 1.1907, "step": 560 }, { "epoch": 0.14692020152771007, "grad_norm": 1.3515625, "learning_rate": 1.986829054343539e-05, "loss": 1.2344, "step": 565 }, { "epoch": 0.14822038030229157, "grad_norm": 1.3515625, "learning_rate": 1.9860847057232407e-05, "loss": 1.142, "step": 570 }, { "epoch": 0.14952055907687306, "grad_norm": 1.40625, "learning_rate": 1.9853200452271157e-05, "loss": 1.1788, "step": 575 }, { "epoch": 0.15082073785145458, "grad_norm": 1.3828125, "learning_rate": 1.9845350886060314e-05, "loss": 1.1476, "step": 580 }, { "epoch": 0.15212091662603608, "grad_norm": 1.34375, "learning_rate": 1.9837298520289238e-05, "loss": 1.1865, "step": 585 }, { "epoch": 0.15342109540061757, "grad_norm": 1.359375, "learning_rate": 1.9829043520824676e-05, "loss": 1.258, "step": 590 }, { "epoch": 0.1547212741751991, "grad_norm": 1.3203125, "learning_rate": 1.9820586057707314e-05, "loss": 1.1719, "step": 595 }, { "epoch": 0.1560214529497806, "grad_norm": 1.3203125, "learning_rate": 1.9811926305148292e-05, "loss": 1.2676, "step": 600 }, { "epoch": 0.1560214529497806, "eval_loss": 1.2073785066604614, "eval_runtime": 578.0178, "eval_samples_per_second": 11.82, "eval_steps_per_second": 3.941, "step": 600 }, { "epoch": 0.1573216317243621, "grad_norm": 1.421875, "learning_rate": 1.980306444152561e-05, "loss": 1.2199, "step": 605 }, { "epoch": 0.1586218104989436, "grad_norm": 1.34375, "learning_rate": 1.979400064938046e-05, "loss": 1.2054, "step": 610 }, { "epoch": 0.1599219892735251, "grad_norm": 1.390625, "learning_rate": 1.9784735115413453e-05, "loss": 1.1933, "step": 615 }, { "epoch": 0.1612221680481066, "grad_norm": 1.40625, "learning_rate": 1.9775268030480785e-05, "loss": 1.1606, "step": 620 }, { "epoch": 0.16252234682268812, "grad_norm": 1.3984375, "learning_rate": 1.97655995895903e-05, "loss": 1.2031, "step": 625 }, { "epoch": 0.16382252559726962, "grad_norm": 1.3671875, "learning_rate": 1.975572999189748e-05, "loss": 1.17, "step": 630 }, { "epoch": 0.16512270437185114, "grad_norm": 1.2890625, "learning_rate": 1.974565944070133e-05, "loss": 1.1652, "step": 635 }, { "epoch": 0.16642288314643264, "grad_norm": 1.3984375, "learning_rate": 1.9735388143440203e-05, "loss": 1.2398, "step": 640 }, { "epoch": 0.16772306192101413, "grad_norm": 1.3046875, "learning_rate": 1.972491631168752e-05, "loss": 1.1702, "step": 645 }, { "epoch": 0.16902324069559566, "grad_norm": 1.46875, "learning_rate": 1.971424416114741e-05, "loss": 1.2149, "step": 650 }, { "epoch": 0.17032341947017715, "grad_norm": 1.4609375, "learning_rate": 1.9703371911650278e-05, "loss": 1.1942, "step": 655 }, { "epoch": 0.17162359824475865, "grad_norm": 1.3984375, "learning_rate": 1.9692299787148265e-05, "loss": 1.2498, "step": 660 }, { "epoch": 0.17292377701934017, "grad_norm": 1.3515625, "learning_rate": 1.968102801571064e-05, "loss": 1.2154, "step": 665 }, { "epoch": 0.17422395579392166, "grad_norm": 1.390625, "learning_rate": 1.96695568295191e-05, "loss": 1.1647, "step": 670 }, { "epoch": 0.17552413456850316, "grad_norm": 1.5703125, "learning_rate": 1.965788646486299e-05, "loss": 1.1973, "step": 675 }, { "epoch": 0.17682431334308468, "grad_norm": 1.375, "learning_rate": 1.964601716213444e-05, "loss": 1.1358, "step": 680 }, { "epoch": 0.17812449211766618, "grad_norm": 1.375, "learning_rate": 1.9633949165823395e-05, "loss": 1.2036, "step": 685 }, { "epoch": 0.17942467089224767, "grad_norm": 1.3671875, "learning_rate": 1.9621682724512606e-05, "loss": 1.1692, "step": 690 }, { "epoch": 0.1807248496668292, "grad_norm": 1.390625, "learning_rate": 1.9609218090872488e-05, "loss": 1.2024, "step": 695 }, { "epoch": 0.1820250284414107, "grad_norm": 1.2890625, "learning_rate": 1.9596555521655918e-05, "loss": 1.132, "step": 700 }, { "epoch": 0.1833252072159922, "grad_norm": 1.3359375, "learning_rate": 1.9583695277692965e-05, "loss": 1.1681, "step": 705 }, { "epoch": 0.1846253859905737, "grad_norm": 1.28125, "learning_rate": 1.957063762388549e-05, "loss": 1.2149, "step": 710 }, { "epoch": 0.1859255647651552, "grad_norm": 1.4140625, "learning_rate": 1.955738282920171e-05, "loss": 1.2244, "step": 715 }, { "epoch": 0.18722574353973673, "grad_norm": 1.4296875, "learning_rate": 1.9543931166670638e-05, "loss": 1.2104, "step": 720 }, { "epoch": 0.18852592231431822, "grad_norm": 1.328125, "learning_rate": 1.9530282913376497e-05, "loss": 1.1897, "step": 725 }, { "epoch": 0.18982610108889972, "grad_norm": 1.3359375, "learning_rate": 1.9516438350452966e-05, "loss": 1.1736, "step": 730 }, { "epoch": 0.19112627986348124, "grad_norm": 1.3125, "learning_rate": 1.9502397763077406e-05, "loss": 1.1593, "step": 735 }, { "epoch": 0.19242645863806274, "grad_norm": 1.4453125, "learning_rate": 1.9488161440465007e-05, "loss": 1.1957, "step": 740 }, { "epoch": 0.19372663741264423, "grad_norm": 1.4375, "learning_rate": 1.9473729675862807e-05, "loss": 1.1894, "step": 745 }, { "epoch": 0.19502681618722575, "grad_norm": 1.375, "learning_rate": 1.945910276654365e-05, "loss": 1.205, "step": 750 }, { "epoch": 0.19632699496180725, "grad_norm": 1.359375, "learning_rate": 1.9444281013800075e-05, "loss": 1.1771, "step": 755 }, { "epoch": 0.19762717373638874, "grad_norm": 1.3984375, "learning_rate": 1.9429264722938113e-05, "loss": 1.2281, "step": 760 }, { "epoch": 0.19892735251097027, "grad_norm": 1.25, "learning_rate": 1.941405420327098e-05, "loss": 1.1867, "step": 765 }, { "epoch": 0.20022753128555176, "grad_norm": 1.3359375, "learning_rate": 1.939864976811272e-05, "loss": 1.1832, "step": 770 }, { "epoch": 0.20152771006013326, "grad_norm": 1.3125, "learning_rate": 1.9383051734771747e-05, "loss": 1.2205, "step": 775 }, { "epoch": 0.20282788883471478, "grad_norm": 1.3125, "learning_rate": 1.936726042454431e-05, "loss": 1.1497, "step": 780 }, { "epoch": 0.20412806760929628, "grad_norm": 1.453125, "learning_rate": 1.9351276162707873e-05, "loss": 1.228, "step": 785 }, { "epoch": 0.20542824638387777, "grad_norm": 1.2734375, "learning_rate": 1.933509927851442e-05, "loss": 1.1728, "step": 790 }, { "epoch": 0.2067284251584593, "grad_norm": 1.375, "learning_rate": 1.9318730105183654e-05, "loss": 1.1638, "step": 795 }, { "epoch": 0.2080286039330408, "grad_norm": 1.421875, "learning_rate": 1.9302168979896162e-05, "loss": 1.1911, "step": 800 }, { "epoch": 0.2080286039330408, "eval_loss": 1.1971442699432373, "eval_runtime": 577.9872, "eval_samples_per_second": 11.82, "eval_steps_per_second": 3.941, "step": 800 }, { "epoch": 0.20932878270762229, "grad_norm": 1.3203125, "learning_rate": 1.928541624378646e-05, "loss": 1.1981, "step": 805 }, { "epoch": 0.2106289614822038, "grad_norm": 1.3125, "learning_rate": 1.9268472241935944e-05, "loss": 1.1914, "step": 810 }, { "epoch": 0.2119291402567853, "grad_norm": 1.4453125, "learning_rate": 1.925133732336581e-05, "loss": 1.2269, "step": 815 }, { "epoch": 0.21322931903136683, "grad_norm": 1.40625, "learning_rate": 1.923401184102985e-05, "loss": 1.1337, "step": 820 }, { "epoch": 0.21452949780594832, "grad_norm": 1.4140625, "learning_rate": 1.9216496151807197e-05, "loss": 1.2179, "step": 825 }, { "epoch": 0.21582967658052982, "grad_norm": 1.3671875, "learning_rate": 1.9198790616494942e-05, "loss": 1.1693, "step": 830 }, { "epoch": 0.21712985535511134, "grad_norm": 1.5078125, "learning_rate": 1.918089559980074e-05, "loss": 1.1724, "step": 835 }, { "epoch": 0.21843003412969283, "grad_norm": 1.5078125, "learning_rate": 1.9162811470335273e-05, "loss": 1.2139, "step": 840 }, { "epoch": 0.21973021290427433, "grad_norm": 1.359375, "learning_rate": 1.914453860060467e-05, "loss": 1.1259, "step": 845 }, { "epoch": 0.22103039167885585, "grad_norm": 1.3828125, "learning_rate": 1.912607736700281e-05, "loss": 1.1635, "step": 850 }, { "epoch": 0.22233057045343735, "grad_norm": 1.4140625, "learning_rate": 1.910742814980362e-05, "loss": 1.2043, "step": 855 }, { "epoch": 0.22363074922801884, "grad_norm": 1.3515625, "learning_rate": 1.9088591333153177e-05, "loss": 1.1439, "step": 860 }, { "epoch": 0.22493092800260037, "grad_norm": 1.296875, "learning_rate": 1.906956730506184e-05, "loss": 1.1846, "step": 865 }, { "epoch": 0.22623110677718186, "grad_norm": 1.40625, "learning_rate": 1.9050356457396253e-05, "loss": 1.1619, "step": 870 }, { "epoch": 0.22753128555176336, "grad_norm": 1.375, "learning_rate": 1.9030959185871252e-05, "loss": 1.1945, "step": 875 }, { "epoch": 0.22883146432634488, "grad_norm": 1.40625, "learning_rate": 1.9011375890041727e-05, "loss": 1.1457, "step": 880 }, { "epoch": 0.23013164310092638, "grad_norm": 1.3828125, "learning_rate": 1.8991606973294402e-05, "loss": 1.1881, "step": 885 }, { "epoch": 0.23143182187550787, "grad_norm": 1.4765625, "learning_rate": 1.897165284283951e-05, "loss": 1.2191, "step": 890 }, { "epoch": 0.2327320006500894, "grad_norm": 1.3515625, "learning_rate": 1.8951513909702406e-05, "loss": 1.1946, "step": 895 }, { "epoch": 0.2340321794246709, "grad_norm": 1.359375, "learning_rate": 1.893119058871511e-05, "loss": 1.1576, "step": 900 }, { "epoch": 0.23533235819925238, "grad_norm": 1.3984375, "learning_rate": 1.891068329850776e-05, "loss": 1.2013, "step": 905 }, { "epoch": 0.2366325369738339, "grad_norm": 1.4375, "learning_rate": 1.8889992461499974e-05, "loss": 1.2041, "step": 910 }, { "epoch": 0.2379327157484154, "grad_norm": 1.3515625, "learning_rate": 1.8869118503892173e-05, "loss": 1.1634, "step": 915 }, { "epoch": 0.23923289452299693, "grad_norm": 1.390625, "learning_rate": 1.8848061855656786e-05, "loss": 1.2705, "step": 920 }, { "epoch": 0.24053307329757842, "grad_norm": 1.328125, "learning_rate": 1.882682295052939e-05, "loss": 1.1535, "step": 925 }, { "epoch": 0.24183325207215992, "grad_norm": 1.3125, "learning_rate": 1.8805402225999795e-05, "loss": 1.1677, "step": 930 }, { "epoch": 0.24313343084674144, "grad_norm": 1.3359375, "learning_rate": 1.8783800123303008e-05, "loss": 1.1444, "step": 935 }, { "epoch": 0.24443360962132293, "grad_norm": 1.375, "learning_rate": 1.8762017087410162e-05, "loss": 1.184, "step": 940 }, { "epoch": 0.24573378839590443, "grad_norm": 1.3515625, "learning_rate": 1.8740053567019344e-05, "loss": 1.1915, "step": 945 }, { "epoch": 0.24703396717048595, "grad_norm": 1.3203125, "learning_rate": 1.871791001454635e-05, "loss": 1.0851, "step": 950 }, { "epoch": 0.24833414594506745, "grad_norm": 1.3671875, "learning_rate": 1.8695586886115372e-05, "loss": 1.2117, "step": 955 }, { "epoch": 0.24963432471964894, "grad_norm": 1.2578125, "learning_rate": 1.86730846415496e-05, "loss": 1.1424, "step": 960 }, { "epoch": 0.25093450349423047, "grad_norm": 1.3359375, "learning_rate": 1.865040374436174e-05, "loss": 1.1457, "step": 965 }, { "epoch": 0.25223468226881196, "grad_norm": 1.3125, "learning_rate": 1.862754466174449e-05, "loss": 1.1142, "step": 970 }, { "epoch": 0.25353486104339346, "grad_norm": 1.3671875, "learning_rate": 1.8604507864560895e-05, "loss": 1.1479, "step": 975 }, { "epoch": 0.25483503981797495, "grad_norm": 1.3515625, "learning_rate": 1.858129382733465e-05, "loss": 1.1608, "step": 980 }, { "epoch": 0.2561352185925565, "grad_norm": 1.3046875, "learning_rate": 1.855790302824034e-05, "loss": 1.1776, "step": 985 }, { "epoch": 0.257435397367138, "grad_norm": 1.390625, "learning_rate": 1.8534335949093586e-05, "loss": 1.1316, "step": 990 }, { "epoch": 0.2587355761417195, "grad_norm": 1.3828125, "learning_rate": 1.8510593075341093e-05, "loss": 1.2005, "step": 995 }, { "epoch": 0.260035754916301, "grad_norm": 1.3671875, "learning_rate": 1.84866748960507e-05, "loss": 1.1766, "step": 1000 }, { "epoch": 0.260035754916301, "eval_loss": 1.1860284805297852, "eval_runtime": 578.04, "eval_samples_per_second": 11.819, "eval_steps_per_second": 3.941, "step": 1000 }, { "epoch": 0.2613359336908825, "grad_norm": 1.5, "learning_rate": 1.846258190390125e-05, "loss": 1.1841, "step": 1005 }, { "epoch": 0.262636112465464, "grad_norm": 1.3671875, "learning_rate": 1.8438314595172502e-05, "loss": 1.1468, "step": 1010 }, { "epoch": 0.26393629124004553, "grad_norm": 1.3515625, "learning_rate": 1.8413873469734856e-05, "loss": 1.1377, "step": 1015 }, { "epoch": 0.265236470014627, "grad_norm": 1.34375, "learning_rate": 1.8389259031039083e-05, "loss": 1.0577, "step": 1020 }, { "epoch": 0.2665366487892085, "grad_norm": 1.421875, "learning_rate": 1.8364471786105946e-05, "loss": 1.1714, "step": 1025 }, { "epoch": 0.26783682756379, "grad_norm": 1.609375, "learning_rate": 1.833951224551576e-05, "loss": 1.1308, "step": 1030 }, { "epoch": 0.2691370063383715, "grad_norm": 1.328125, "learning_rate": 1.8314380923397884e-05, "loss": 1.129, "step": 1035 }, { "epoch": 0.270437185112953, "grad_norm": 1.359375, "learning_rate": 1.82890783374201e-05, "loss": 1.2, "step": 1040 }, { "epoch": 0.27173736388753456, "grad_norm": 1.3828125, "learning_rate": 1.8263605008777995e-05, "loss": 1.1621, "step": 1045 }, { "epoch": 0.27303754266211605, "grad_norm": 1.3046875, "learning_rate": 1.823796146218418e-05, "loss": 1.1301, "step": 1050 }, { "epoch": 0.27433772143669755, "grad_norm": 1.3125, "learning_rate": 1.821214822585751e-05, "loss": 1.1889, "step": 1055 }, { "epoch": 0.27563790021127904, "grad_norm": 1.3046875, "learning_rate": 1.81861658315122e-05, "loss": 1.1556, "step": 1060 }, { "epoch": 0.27693807898586054, "grad_norm": 1.6640625, "learning_rate": 1.8160014814346854e-05, "loss": 1.141, "step": 1065 }, { "epoch": 0.2782382577604421, "grad_norm": 1.375, "learning_rate": 1.813369571303348e-05, "loss": 1.219, "step": 1070 }, { "epoch": 0.2795384365350236, "grad_norm": 1.3359375, "learning_rate": 1.8107209069706342e-05, "loss": 1.1646, "step": 1075 }, { "epoch": 0.2808386153096051, "grad_norm": 1.34375, "learning_rate": 1.8080555429950832e-05, "loss": 1.1884, "step": 1080 }, { "epoch": 0.2821387940841866, "grad_norm": 1.3359375, "learning_rate": 1.8053735342792227e-05, "loss": 1.1783, "step": 1085 }, { "epoch": 0.28343897285876807, "grad_norm": 1.296875, "learning_rate": 1.8026749360684358e-05, "loss": 1.2179, "step": 1090 }, { "epoch": 0.28473915163334956, "grad_norm": 1.390625, "learning_rate": 1.7999598039498255e-05, "loss": 1.1706, "step": 1095 }, { "epoch": 0.2860393304079311, "grad_norm": 1.515625, "learning_rate": 1.7972281938510678e-05, "loss": 1.1852, "step": 1100 }, { "epoch": 0.2873395091825126, "grad_norm": 1.359375, "learning_rate": 1.7944801620392617e-05, "loss": 1.1764, "step": 1105 }, { "epoch": 0.2886396879570941, "grad_norm": 1.2734375, "learning_rate": 1.791715765119768e-05, "loss": 1.1539, "step": 1110 }, { "epoch": 0.2899398667316756, "grad_norm": 1.328125, "learning_rate": 1.788935060035045e-05, "loss": 1.1427, "step": 1115 }, { "epoch": 0.2912400455062571, "grad_norm": 1.296875, "learning_rate": 1.7861381040634745e-05, "loss": 1.1732, "step": 1120 }, { "epoch": 0.2925402242808386, "grad_norm": 1.3359375, "learning_rate": 1.7833249548181822e-05, "loss": 1.1609, "step": 1125 }, { "epoch": 0.29384040305542014, "grad_norm": 1.3046875, "learning_rate": 1.780495670245852e-05, "loss": 1.1417, "step": 1130 }, { "epoch": 0.29514058183000164, "grad_norm": 1.375, "learning_rate": 1.7776503086255306e-05, "loss": 1.1847, "step": 1135 }, { "epoch": 0.29644076060458313, "grad_norm": 1.3671875, "learning_rate": 1.774788928567428e-05, "loss": 1.1358, "step": 1140 }, { "epoch": 0.2977409393791646, "grad_norm": 1.34375, "learning_rate": 1.7719115890117108e-05, "loss": 1.1387, "step": 1145 }, { "epoch": 0.2990411181537461, "grad_norm": 1.484375, "learning_rate": 1.7690183492272868e-05, "loss": 1.2286, "step": 1150 }, { "epoch": 0.3003412969283277, "grad_norm": 1.5234375, "learning_rate": 1.7661092688105847e-05, "loss": 1.1997, "step": 1155 }, { "epoch": 0.30164147570290917, "grad_norm": 1.3515625, "learning_rate": 1.763184407684328e-05, "loss": 1.1797, "step": 1160 }, { "epoch": 0.30294165447749066, "grad_norm": 1.375, "learning_rate": 1.7602438260962977e-05, "loss": 1.1671, "step": 1165 }, { "epoch": 0.30424183325207216, "grad_norm": 1.3359375, "learning_rate": 1.7572875846180934e-05, "loss": 1.1743, "step": 1170 }, { "epoch": 0.30554201202665365, "grad_norm": 1.390625, "learning_rate": 1.7543157441438858e-05, "loss": 1.1965, "step": 1175 }, { "epoch": 0.30684219080123515, "grad_norm": 1.359375, "learning_rate": 1.7513283658891606e-05, "loss": 1.1047, "step": 1180 }, { "epoch": 0.3081423695758167, "grad_norm": 1.3046875, "learning_rate": 1.7483255113894587e-05, "loss": 1.1886, "step": 1185 }, { "epoch": 0.3094425483503982, "grad_norm": 1.296875, "learning_rate": 1.74530724249911e-05, "loss": 1.1184, "step": 1190 }, { "epoch": 0.3107427271249797, "grad_norm": 2.046875, "learning_rate": 1.7422736213899563e-05, "loss": 1.1979, "step": 1195 }, { "epoch": 0.3120429058995612, "grad_norm": 1.2734375, "learning_rate": 1.7392247105500733e-05, "loss": 1.1083, "step": 1200 }, { "epoch": 0.3120429058995612, "eval_loss": 1.1814509630203247, "eval_runtime": 578.0545, "eval_samples_per_second": 11.819, "eval_steps_per_second": 3.941, "step": 1200 }, { "epoch": 0.3133430846741427, "grad_norm": 1.46875, "learning_rate": 1.7361605727824814e-05, "loss": 1.2137, "step": 1205 }, { "epoch": 0.3146432634487242, "grad_norm": 1.3984375, "learning_rate": 1.7330812712038535e-05, "loss": 1.2345, "step": 1210 }, { "epoch": 0.3159434422233057, "grad_norm": 1.390625, "learning_rate": 1.7299868692432147e-05, "loss": 1.1602, "step": 1215 }, { "epoch": 0.3172436209978872, "grad_norm": 1.453125, "learning_rate": 1.7268774306406354e-05, "loss": 1.2263, "step": 1220 }, { "epoch": 0.3185437997724687, "grad_norm": 1.3125, "learning_rate": 1.723753019445918e-05, "loss": 1.1895, "step": 1225 }, { "epoch": 0.3198439785470502, "grad_norm": 1.359375, "learning_rate": 1.7206137000172777e-05, "loss": 1.1771, "step": 1230 }, { "epoch": 0.3211441573216317, "grad_norm": 1.3046875, "learning_rate": 1.7174595370200188e-05, "loss": 1.1801, "step": 1235 }, { "epoch": 0.3224443360962132, "grad_norm": 1.4765625, "learning_rate": 1.7142905954251986e-05, "loss": 1.1529, "step": 1240 }, { "epoch": 0.32374451487079475, "grad_norm": 1.3515625, "learning_rate": 1.7111069405082937e-05, "loss": 1.2316, "step": 1245 }, { "epoch": 0.32504469364537625, "grad_norm": 1.234375, "learning_rate": 1.7079086378478518e-05, "loss": 1.1408, "step": 1250 }, { "epoch": 0.32634487241995774, "grad_norm": 1.328125, "learning_rate": 1.7046957533241427e-05, "loss": 1.1577, "step": 1255 }, { "epoch": 0.32764505119453924, "grad_norm": 1.3125, "learning_rate": 1.7014683531178015e-05, "loss": 1.1236, "step": 1260 }, { "epoch": 0.32894522996912073, "grad_norm": 1.3203125, "learning_rate": 1.698226503708463e-05, "loss": 1.1672, "step": 1265 }, { "epoch": 0.3302454087437023, "grad_norm": 1.328125, "learning_rate": 1.694970271873396e-05, "loss": 1.1739, "step": 1270 }, { "epoch": 0.3315455875182838, "grad_norm": 1.359375, "learning_rate": 1.6916997246861253e-05, "loss": 1.1744, "step": 1275 }, { "epoch": 0.3328457662928653, "grad_norm": 1.34375, "learning_rate": 1.6884149295150495e-05, "loss": 1.2045, "step": 1280 }, { "epoch": 0.33414594506744677, "grad_norm": 1.34375, "learning_rate": 1.6851159540220558e-05, "loss": 1.1179, "step": 1285 }, { "epoch": 0.33544612384202827, "grad_norm": 1.3125, "learning_rate": 1.6818028661611242e-05, "loss": 1.2262, "step": 1290 }, { "epoch": 0.33674630261660976, "grad_norm": 1.4140625, "learning_rate": 1.6784757341769288e-05, "loss": 1.1726, "step": 1295 }, { "epoch": 0.3380464813911913, "grad_norm": 1.328125, "learning_rate": 1.6751346266034315e-05, "loss": 1.1737, "step": 1300 }, { "epoch": 0.3393466601657728, "grad_norm": 1.4921875, "learning_rate": 1.671779612262471e-05, "loss": 1.1418, "step": 1305 }, { "epoch": 0.3406468389403543, "grad_norm": 1.34375, "learning_rate": 1.6684107602623438e-05, "loss": 1.1819, "step": 1310 }, { "epoch": 0.3419470177149358, "grad_norm": 1.3828125, "learning_rate": 1.6650281399963818e-05, "loss": 1.1842, "step": 1315 }, { "epoch": 0.3432471964895173, "grad_norm": 1.3671875, "learning_rate": 1.661631821141523e-05, "loss": 1.1729, "step": 1320 }, { "epoch": 0.3445473752640988, "grad_norm": 1.3984375, "learning_rate": 1.6582218736568766e-05, "loss": 1.2309, "step": 1325 }, { "epoch": 0.34584755403868034, "grad_norm": 1.3046875, "learning_rate": 1.65479836778228e-05, "loss": 1.1648, "step": 1330 }, { "epoch": 0.34714773281326183, "grad_norm": 1.40625, "learning_rate": 1.6513613740368534e-05, "loss": 1.215, "step": 1335 }, { "epoch": 0.34844791158784333, "grad_norm": 1.3125, "learning_rate": 1.6479109632175482e-05, "loss": 1.12, "step": 1340 }, { "epoch": 0.3497480903624248, "grad_norm": 1.2421875, "learning_rate": 1.6444472063976858e-05, "loss": 1.102, "step": 1345 }, { "epoch": 0.3510482691370063, "grad_norm": 1.3046875, "learning_rate": 1.6409701749254966e-05, "loss": 1.1744, "step": 1350 }, { "epoch": 0.35234844791158787, "grad_norm": 1.3046875, "learning_rate": 1.6374799404226486e-05, "loss": 1.1909, "step": 1355 }, { "epoch": 0.35364862668616937, "grad_norm": 1.34375, "learning_rate": 1.633976574782773e-05, "loss": 1.2021, "step": 1360 }, { "epoch": 0.35494880546075086, "grad_norm": 1.5234375, "learning_rate": 1.6304601501699823e-05, "loss": 1.1905, "step": 1365 }, { "epoch": 0.35624898423533236, "grad_norm": 1.3359375, "learning_rate": 1.6269307390173844e-05, "loss": 1.1524, "step": 1370 }, { "epoch": 0.35754916300991385, "grad_norm": 1.453125, "learning_rate": 1.6233884140255917e-05, "loss": 1.1761, "step": 1375 }, { "epoch": 0.35884934178449535, "grad_norm": 1.34375, "learning_rate": 1.61983324816122e-05, "loss": 1.1735, "step": 1380 }, { "epoch": 0.3601495205590769, "grad_norm": 1.75, "learning_rate": 1.6162653146553907e-05, "loss": 1.1371, "step": 1385 }, { "epoch": 0.3614496993336584, "grad_norm": 1.390625, "learning_rate": 1.6126846870022176e-05, "loss": 1.1505, "step": 1390 }, { "epoch": 0.3627498781082399, "grad_norm": 1.328125, "learning_rate": 1.6090914389572964e-05, "loss": 1.1551, "step": 1395 }, { "epoch": 0.3640500568828214, "grad_norm": 1.3203125, "learning_rate": 1.605485644536183e-05, "loss": 1.2159, "step": 1400 }, { "epoch": 0.3640500568828214, "eval_loss": 1.1779258251190186, "eval_runtime": 577.7202, "eval_samples_per_second": 11.826, "eval_steps_per_second": 3.943, "step": 1400 }, { "epoch": 0.3653502356574029, "grad_norm": 1.390625, "learning_rate": 1.6018673780128704e-05, "loss": 1.1869, "step": 1405 }, { "epoch": 0.3666504144319844, "grad_norm": 1.328125, "learning_rate": 1.5982367139182588e-05, "loss": 1.2128, "step": 1410 }, { "epoch": 0.3679505932065659, "grad_norm": 1.3359375, "learning_rate": 1.5945937270386192e-05, "loss": 1.1418, "step": 1415 }, { "epoch": 0.3692507719811474, "grad_norm": 1.3359375, "learning_rate": 1.5909384924140535e-05, "loss": 1.1597, "step": 1420 }, { "epoch": 0.3705509507557289, "grad_norm": 1.4296875, "learning_rate": 1.5872710853369498e-05, "loss": 1.1802, "step": 1425 }, { "epoch": 0.3718511295303104, "grad_norm": 1.703125, "learning_rate": 1.5835915813504293e-05, "loss": 1.1699, "step": 1430 }, { "epoch": 0.3731513083048919, "grad_norm": 1.375, "learning_rate": 1.579900056246793e-05, "loss": 1.1655, "step": 1435 }, { "epoch": 0.37445148707947346, "grad_norm": 1.328125, "learning_rate": 1.576196586065958e-05, "loss": 1.1901, "step": 1440 }, { "epoch": 0.37575166585405495, "grad_norm": 1.3515625, "learning_rate": 1.5724812470938923e-05, "loss": 1.1667, "step": 1445 }, { "epoch": 0.37705184462863645, "grad_norm": 1.2734375, "learning_rate": 1.5687541158610437e-05, "loss": 1.1392, "step": 1450 }, { "epoch": 0.37835202340321794, "grad_norm": 1.359375, "learning_rate": 1.5650152691407625e-05, "loss": 1.1604, "step": 1455 }, { "epoch": 0.37965220217779944, "grad_norm": 1.3125, "learning_rate": 1.5612647839477205e-05, "loss": 1.1651, "step": 1460 }, { "epoch": 0.38095238095238093, "grad_norm": 1.3515625, "learning_rate": 1.557502737536326e-05, "loss": 1.1835, "step": 1465 }, { "epoch": 0.3822525597269625, "grad_norm": 1.328125, "learning_rate": 1.55372920739913e-05, "loss": 1.1055, "step": 1470 }, { "epoch": 0.383552738501544, "grad_norm": 1.359375, "learning_rate": 1.54994427126523e-05, "loss": 1.2111, "step": 1475 }, { "epoch": 0.3848529172761255, "grad_norm": 1.3984375, "learning_rate": 1.546148007098673e-05, "loss": 1.1328, "step": 1480 }, { "epoch": 0.38615309605070697, "grad_norm": 1.3515625, "learning_rate": 1.5423404930968444e-05, "loss": 1.1878, "step": 1485 }, { "epoch": 0.38745327482528846, "grad_norm": 1.2734375, "learning_rate": 1.53852180768886e-05, "loss": 1.1724, "step": 1490 }, { "epoch": 0.38875345359986996, "grad_norm": 1.328125, "learning_rate": 1.53469202953395e-05, "loss": 1.1638, "step": 1495 }, { "epoch": 0.3900536323744515, "grad_norm": 1.2890625, "learning_rate": 1.5308512375198388e-05, "loss": 1.2015, "step": 1500 }, { "epoch": 0.391353811149033, "grad_norm": 1.375, "learning_rate": 1.5269995107611197e-05, "loss": 1.1696, "step": 1505 }, { "epoch": 0.3926539899236145, "grad_norm": 1.3125, "learning_rate": 1.523136928597625e-05, "loss": 1.1485, "step": 1510 }, { "epoch": 0.393954168698196, "grad_norm": 1.3671875, "learning_rate": 1.5192635705927932e-05, "loss": 1.1349, "step": 1515 }, { "epoch": 0.3952543474727775, "grad_norm": 1.296875, "learning_rate": 1.5153795165320284e-05, "loss": 1.2, "step": 1520 }, { "epoch": 0.396554526247359, "grad_norm": 1.34375, "learning_rate": 1.5114848464210572e-05, "loss": 1.1329, "step": 1525 }, { "epoch": 0.39785470502194054, "grad_norm": 1.3515625, "learning_rate": 1.5075796404842819e-05, "loss": 1.1924, "step": 1530 }, { "epoch": 0.39915488379652203, "grad_norm": 1.3203125, "learning_rate": 1.5036639791631264e-05, "loss": 1.1772, "step": 1535 }, { "epoch": 0.4004550625711035, "grad_norm": 1.328125, "learning_rate": 1.4997379431143806e-05, "loss": 1.1209, "step": 1540 }, { "epoch": 0.401755241345685, "grad_norm": 1.3125, "learning_rate": 1.4958016132085371e-05, "loss": 1.1651, "step": 1545 }, { "epoch": 0.4030554201202665, "grad_norm": 1.3515625, "learning_rate": 1.491855070528128e-05, "loss": 1.1517, "step": 1550 }, { "epoch": 0.40435559889484807, "grad_norm": 1.3671875, "learning_rate": 1.4878983963660532e-05, "loss": 1.1456, "step": 1555 }, { "epoch": 0.40565577766942956, "grad_norm": 1.3515625, "learning_rate": 1.4839316722239047e-05, "loss": 1.2196, "step": 1560 }, { "epoch": 0.40695595644401106, "grad_norm": 1.3203125, "learning_rate": 1.4799549798102909e-05, "loss": 1.1362, "step": 1565 }, { "epoch": 0.40825613521859255, "grad_norm": 1.359375, "learning_rate": 1.4759684010391503e-05, "loss": 1.1681, "step": 1570 }, { "epoch": 0.40955631399317405, "grad_norm": 1.296875, "learning_rate": 1.4719720180280665e-05, "loss": 1.1182, "step": 1575 }, { "epoch": 0.41085649276775554, "grad_norm": 1.3359375, "learning_rate": 1.4679659130965751e-05, "loss": 1.2005, "step": 1580 }, { "epoch": 0.4121566715423371, "grad_norm": 1.3515625, "learning_rate": 1.46395016876447e-05, "loss": 1.1363, "step": 1585 }, { "epoch": 0.4134568503169186, "grad_norm": 1.3359375, "learning_rate": 1.4599248677501007e-05, "loss": 1.1791, "step": 1590 }, { "epoch": 0.4147570290915001, "grad_norm": 2.78125, "learning_rate": 1.4558900929686718e-05, "loss": 1.1162, "step": 1595 }, { "epoch": 0.4160572078660816, "grad_norm": 1.3359375, "learning_rate": 1.4518459275305321e-05, "loss": 1.248, "step": 1600 }, { "epoch": 0.4160572078660816, "eval_loss": 1.1750705242156982, "eval_runtime": 577.3745, "eval_samples_per_second": 11.833, "eval_steps_per_second": 3.945, "step": 1600 }, { "epoch": 0.4173573866406631, "grad_norm": 1.3203125, "learning_rate": 1.4477924547394649e-05, "loss": 1.1601, "step": 1605 }, { "epoch": 0.41865756541524457, "grad_norm": 1.40625, "learning_rate": 1.4437297580909703e-05, "loss": 1.1764, "step": 1610 }, { "epoch": 0.4199577441898261, "grad_norm": 1.2734375, "learning_rate": 1.4396579212705467e-05, "loss": 1.193, "step": 1615 }, { "epoch": 0.4212579229644076, "grad_norm": 1.3203125, "learning_rate": 1.4355770281519657e-05, "loss": 1.1243, "step": 1620 }, { "epoch": 0.4225581017389891, "grad_norm": 1.3125, "learning_rate": 1.4314871627955466e-05, "loss": 1.1671, "step": 1625 }, { "epoch": 0.4238582805135706, "grad_norm": 1.2890625, "learning_rate": 1.4273884094464218e-05, "loss": 1.1673, "step": 1630 }, { "epoch": 0.4251584592881521, "grad_norm": 1.328125, "learning_rate": 1.4232808525328037e-05, "loss": 1.1627, "step": 1635 }, { "epoch": 0.42645863806273365, "grad_norm": 1.2890625, "learning_rate": 1.4191645766642458e-05, "loss": 1.1335, "step": 1640 }, { "epoch": 0.42775881683731515, "grad_norm": 1.2890625, "learning_rate": 1.4150396666298976e-05, "loss": 1.1656, "step": 1645 }, { "epoch": 0.42905899561189664, "grad_norm": 1.40625, "learning_rate": 1.4109062073967606e-05, "loss": 1.1479, "step": 1650 }, { "epoch": 0.43035917438647814, "grad_norm": 1.296875, "learning_rate": 1.4067642841079372e-05, "loss": 1.1742, "step": 1655 }, { "epoch": 0.43165935316105963, "grad_norm": 1.4140625, "learning_rate": 1.4026139820808756e-05, "loss": 1.0978, "step": 1660 }, { "epoch": 0.43295953193564113, "grad_norm": 1.3125, "learning_rate": 1.3984553868056147e-05, "loss": 1.157, "step": 1665 }, { "epoch": 0.4342597107102227, "grad_norm": 1.296875, "learning_rate": 1.394288583943022e-05, "loss": 1.192, "step": 1670 }, { "epoch": 0.4355598894848042, "grad_norm": 1.546875, "learning_rate": 1.3901136593230275e-05, "loss": 1.1656, "step": 1675 }, { "epoch": 0.43686006825938567, "grad_norm": 1.34375, "learning_rate": 1.3859306989428594e-05, "loss": 1.1592, "step": 1680 }, { "epoch": 0.43816024703396717, "grad_norm": 1.34375, "learning_rate": 1.381739788965269e-05, "loss": 1.1105, "step": 1685 }, { "epoch": 0.43946042580854866, "grad_norm": 1.3125, "learning_rate": 1.3775410157167581e-05, "loss": 1.1577, "step": 1690 }, { "epoch": 0.44076060458313016, "grad_norm": 1.7890625, "learning_rate": 1.3733344656858006e-05, "loss": 1.1232, "step": 1695 }, { "epoch": 0.4420607833577117, "grad_norm": 1.3515625, "learning_rate": 1.3691202255210589e-05, "loss": 1.1988, "step": 1700 }, { "epoch": 0.4433609621322932, "grad_norm": 1.3046875, "learning_rate": 1.3648983820296028e-05, "loss": 1.1351, "step": 1705 }, { "epoch": 0.4446611409068747, "grad_norm": 1.2890625, "learning_rate": 1.3606690221751185e-05, "loss": 1.1691, "step": 1710 }, { "epoch": 0.4459613196814562, "grad_norm": 1.4140625, "learning_rate": 1.3564322330761172e-05, "loss": 1.1539, "step": 1715 }, { "epoch": 0.4472614984560377, "grad_norm": 1.6875, "learning_rate": 1.3521881020041424e-05, "loss": 1.2068, "step": 1720 }, { "epoch": 0.4485616772306192, "grad_norm": 1.2578125, "learning_rate": 1.347936716381972e-05, "loss": 1.1514, "step": 1725 }, { "epoch": 0.44986185600520073, "grad_norm": 1.2578125, "learning_rate": 1.3436781637818142e-05, "loss": 1.1453, "step": 1730 }, { "epoch": 0.45116203477978223, "grad_norm": 1.3046875, "learning_rate": 1.3394125319235094e-05, "loss": 1.1573, "step": 1735 }, { "epoch": 0.4524622135543637, "grad_norm": 1.390625, "learning_rate": 1.3351399086727181e-05, "loss": 1.1841, "step": 1740 }, { "epoch": 0.4537623923289452, "grad_norm": 1.359375, "learning_rate": 1.3308603820391142e-05, "loss": 1.1869, "step": 1745 }, { "epoch": 0.4550625711035267, "grad_norm": 1.359375, "learning_rate": 1.3265740401745701e-05, "loss": 1.1788, "step": 1750 }, { "epoch": 0.45636274987810826, "grad_norm": 1.3515625, "learning_rate": 1.3222809713713422e-05, "loss": 1.1538, "step": 1755 }, { "epoch": 0.45766292865268976, "grad_norm": 1.9375, "learning_rate": 1.317981264060252e-05, "loss": 1.1333, "step": 1760 }, { "epoch": 0.45896310742727126, "grad_norm": 1.484375, "learning_rate": 1.3136750068088646e-05, "loss": 1.1902, "step": 1765 }, { "epoch": 0.46026328620185275, "grad_norm": 1.5703125, "learning_rate": 1.309362288319663e-05, "loss": 1.0974, "step": 1770 }, { "epoch": 0.46156346497643425, "grad_norm": 1.296875, "learning_rate": 1.3050431974282235e-05, "loss": 1.1717, "step": 1775 }, { "epoch": 0.46286364375101574, "grad_norm": 1.375, "learning_rate": 1.3007178231013833e-05, "loss": 1.1546, "step": 1780 }, { "epoch": 0.4641638225255973, "grad_norm": 1.390625, "learning_rate": 1.2963862544354093e-05, "loss": 1.1782, "step": 1785 }, { "epoch": 0.4654640013001788, "grad_norm": 1.328125, "learning_rate": 1.2920485806541633e-05, "loss": 1.1684, "step": 1790 }, { "epoch": 0.4667641800747603, "grad_norm": 1.453125, "learning_rate": 1.2877048911072618e-05, "loss": 1.1934, "step": 1795 }, { "epoch": 0.4680643588493418, "grad_norm": 1.3359375, "learning_rate": 1.283355275268239e-05, "loss": 1.1644, "step": 1800 }, { "epoch": 0.4680643588493418, "eval_loss": 1.1729602813720703, "eval_runtime": 577.3935, "eval_samples_per_second": 11.832, "eval_steps_per_second": 3.945, "step": 1800 }, { "epoch": 0.4693645376239233, "grad_norm": 1.296875, "learning_rate": 1.2789998227327001e-05, "loss": 1.1698, "step": 1805 }, { "epoch": 0.47066471639850477, "grad_norm": 1.3125, "learning_rate": 1.274638623216479e-05, "loss": 1.1127, "step": 1810 }, { "epoch": 0.4719648951730863, "grad_norm": 1.2734375, "learning_rate": 1.2702717665537877e-05, "loss": 1.1298, "step": 1815 }, { "epoch": 0.4732650739476678, "grad_norm": 1.3359375, "learning_rate": 1.2658993426953684e-05, "loss": 1.179, "step": 1820 }, { "epoch": 0.4745652527222493, "grad_norm": 1.3203125, "learning_rate": 1.2615214417066373e-05, "loss": 1.2056, "step": 1825 }, { "epoch": 0.4758654314968308, "grad_norm": 1.25, "learning_rate": 1.257138153765833e-05, "loss": 1.1546, "step": 1830 }, { "epoch": 0.4771656102714123, "grad_norm": 1.2734375, "learning_rate": 1.2527495691621568e-05, "loss": 1.1705, "step": 1835 }, { "epoch": 0.47846578904599385, "grad_norm": 1.328125, "learning_rate": 1.2483557782939126e-05, "loss": 1.1299, "step": 1840 }, { "epoch": 0.47976596782057535, "grad_norm": 1.4765625, "learning_rate": 1.2439568716666468e-05, "loss": 1.142, "step": 1845 }, { "epoch": 0.48106614659515684, "grad_norm": 1.3046875, "learning_rate": 1.239552939891282e-05, "loss": 1.1879, "step": 1850 }, { "epoch": 0.48236632536973834, "grad_norm": 1.4140625, "learning_rate": 1.2351440736822517e-05, "loss": 1.186, "step": 1855 }, { "epoch": 0.48366650414431983, "grad_norm": 1.34375, "learning_rate": 1.2307303638556314e-05, "loss": 1.2062, "step": 1860 }, { "epoch": 0.4849666829189013, "grad_norm": 1.25, "learning_rate": 1.2263119013272676e-05, "loss": 1.1163, "step": 1865 }, { "epoch": 0.4862668616934829, "grad_norm": 1.265625, "learning_rate": 1.2218887771109054e-05, "loss": 1.1832, "step": 1870 }, { "epoch": 0.48756704046806437, "grad_norm": 1.328125, "learning_rate": 1.217461082316314e-05, "loss": 1.1123, "step": 1875 }, { "epoch": 0.48886721924264587, "grad_norm": 1.3125, "learning_rate": 1.21302890814741e-05, "loss": 1.1759, "step": 1880 }, { "epoch": 0.49016739801722736, "grad_norm": 1.2890625, "learning_rate": 1.2085923459003776e-05, "loss": 1.1703, "step": 1885 }, { "epoch": 0.49146757679180886, "grad_norm": 1.515625, "learning_rate": 1.2041514869617895e-05, "loss": 1.1375, "step": 1890 }, { "epoch": 0.49276775556639035, "grad_norm": 1.421875, "learning_rate": 1.1997064228067237e-05, "loss": 1.1785, "step": 1895 }, { "epoch": 0.4940679343409719, "grad_norm": 1.4609375, "learning_rate": 1.195257244996879e-05, "loss": 1.1617, "step": 1900 }, { "epoch": 0.4953681131155534, "grad_norm": 1.3984375, "learning_rate": 1.1908040451786903e-05, "loss": 1.1691, "step": 1905 }, { "epoch": 0.4966682918901349, "grad_norm": 1.3203125, "learning_rate": 1.1863469150814388e-05, "loss": 1.1731, "step": 1910 }, { "epoch": 0.4979684706647164, "grad_norm": 1.3671875, "learning_rate": 1.1818859465153643e-05, "loss": 1.1272, "step": 1915 }, { "epoch": 0.4992686494392979, "grad_norm": 1.359375, "learning_rate": 1.177421231369773e-05, "loss": 1.1913, "step": 1920 }, { "epoch": 0.5005688282138794, "grad_norm": 1.3203125, "learning_rate": 1.1729528616111449e-05, "loss": 1.158, "step": 1925 }, { "epoch": 0.5018690069884609, "grad_norm": 1.2890625, "learning_rate": 1.1684809292812405e-05, "loss": 1.2267, "step": 1930 }, { "epoch": 0.5031691857630424, "grad_norm": 1.3203125, "learning_rate": 1.164005526495203e-05, "loss": 1.1303, "step": 1935 }, { "epoch": 0.5044693645376239, "grad_norm": 1.3984375, "learning_rate": 1.1595267454396622e-05, "loss": 1.1648, "step": 1940 }, { "epoch": 0.5057695433122055, "grad_norm": 1.2890625, "learning_rate": 1.1550446783708349e-05, "loss": 1.1854, "step": 1945 }, { "epoch": 0.5070697220867869, "grad_norm": 1.34375, "learning_rate": 1.1505594176126255e-05, "loss": 1.1947, "step": 1950 }, { "epoch": 0.5083699008613685, "grad_norm": 1.40625, "learning_rate": 1.1460710555547232e-05, "loss": 1.1526, "step": 1955 }, { "epoch": 0.5096700796359499, "grad_norm": 2.0, "learning_rate": 1.1415796846506993e-05, "loss": 1.104, "step": 1960 }, { "epoch": 0.5109702584105315, "grad_norm": 1.3515625, "learning_rate": 1.1370853974161032e-05, "loss": 1.1826, "step": 1965 }, { "epoch": 0.512270437185113, "grad_norm": 1.34375, "learning_rate": 1.1325882864265558e-05, "loss": 1.16, "step": 1970 }, { "epoch": 0.5135706159596944, "grad_norm": 1.3046875, "learning_rate": 1.128088444315844e-05, "loss": 1.1895, "step": 1975 }, { "epoch": 0.514870794734276, "grad_norm": 1.3203125, "learning_rate": 1.1235859637740105e-05, "loss": 1.1841, "step": 1980 }, { "epoch": 0.5161709735088574, "grad_norm": 1.421875, "learning_rate": 1.1190809375454472e-05, "loss": 1.1491, "step": 1985 }, { "epoch": 0.517471152283439, "grad_norm": 1.2890625, "learning_rate": 1.1145734584269826e-05, "loss": 1.1759, "step": 1990 }, { "epoch": 0.5187713310580204, "grad_norm": 1.3359375, "learning_rate": 1.1100636192659702e-05, "loss": 1.1424, "step": 1995 }, { "epoch": 0.520071509832602, "grad_norm": 1.3359375, "learning_rate": 1.1055515129583787e-05, "loss": 1.1479, "step": 2000 }, { "epoch": 0.520071509832602, "eval_loss": 1.1712779998779297, "eval_runtime": 588.6026, "eval_samples_per_second": 11.607, "eval_steps_per_second": 3.87, "step": 2000 }, { "epoch": 0.5213716886071835, "grad_norm": 1.296875, "learning_rate": 1.1010372324468756e-05, "loss": 1.1576, "step": 2005 }, { "epoch": 0.522671867381765, "grad_norm": 1.328125, "learning_rate": 1.0965208707189134e-05, "loss": 1.1916, "step": 2010 }, { "epoch": 0.5239720461563465, "grad_norm": 1.3125, "learning_rate": 1.0920025208048157e-05, "loss": 1.1546, "step": 2015 }, { "epoch": 0.525272224930928, "grad_norm": 1.359375, "learning_rate": 1.0874822757758585e-05, "loss": 1.1511, "step": 2020 }, { "epoch": 0.5265724037055095, "grad_norm": 1.2421875, "learning_rate": 1.082960228742355e-05, "loss": 1.1292, "step": 2025 }, { "epoch": 0.5278725824800911, "grad_norm": 1.375, "learning_rate": 1.0784364728517374e-05, "loss": 1.1967, "step": 2030 }, { "epoch": 0.5291727612546725, "grad_norm": 1.296875, "learning_rate": 1.073911101286637e-05, "loss": 1.1418, "step": 2035 }, { "epoch": 0.530472940029254, "grad_norm": 1.3203125, "learning_rate": 1.0693842072629657e-05, "loss": 1.1617, "step": 2040 }, { "epoch": 0.5317731188038355, "grad_norm": 1.3359375, "learning_rate": 1.0648558840279967e-05, "loss": 1.1569, "step": 2045 }, { "epoch": 0.533073297578417, "grad_norm": 1.375, "learning_rate": 1.0603262248584416e-05, "loss": 1.1853, "step": 2050 }, { "epoch": 0.5343734763529986, "grad_norm": 1.3203125, "learning_rate": 1.0557953230585309e-05, "loss": 1.1583, "step": 2055 }, { "epoch": 0.53567365512758, "grad_norm": 1.3671875, "learning_rate": 1.0512632719580917e-05, "loss": 1.1792, "step": 2060 }, { "epoch": 0.5369738339021616, "grad_norm": 1.3203125, "learning_rate": 1.0467301649106243e-05, "loss": 1.1767, "step": 2065 }, { "epoch": 0.538274012676743, "grad_norm": 1.3125, "learning_rate": 1.0421960952913806e-05, "loss": 1.165, "step": 2070 }, { "epoch": 0.5395741914513246, "grad_norm": 1.3828125, "learning_rate": 1.0376611564954391e-05, "loss": 1.1523, "step": 2075 }, { "epoch": 0.540874370225906, "grad_norm": 1.390625, "learning_rate": 1.0331254419357833e-05, "loss": 1.1779, "step": 2080 }, { "epoch": 0.5421745490004876, "grad_norm": 1.5078125, "learning_rate": 1.028589045041375e-05, "loss": 1.1025, "step": 2085 }, { "epoch": 0.5434747277750691, "grad_norm": 1.2734375, "learning_rate": 1.0240520592552321e-05, "loss": 1.13, "step": 2090 }, { "epoch": 0.5447749065496506, "grad_norm": 1.359375, "learning_rate": 1.0195145780325021e-05, "loss": 1.1924, "step": 2095 }, { "epoch": 0.5460750853242321, "grad_norm": 1.5546875, "learning_rate": 1.0149766948385387e-05, "loss": 1.1454, "step": 2100 }, { "epoch": 0.5473752640988135, "grad_norm": 1.3515625, "learning_rate": 1.0104385031469746e-05, "loss": 1.147, "step": 2105 }, { "epoch": 0.5486754428733951, "grad_norm": 1.328125, "learning_rate": 1.005900096437797e-05, "loss": 1.1673, "step": 2110 }, { "epoch": 0.5499756216479766, "grad_norm": 1.3046875, "learning_rate": 1.0013615681954231e-05, "loss": 1.1619, "step": 2115 }, { "epoch": 0.5512758004225581, "grad_norm": 1.3671875, "learning_rate": 9.96823011906773e-06, "loss": 1.1852, "step": 2120 }, { "epoch": 0.5525759791971396, "grad_norm": 1.34375, "learning_rate": 9.922845210593445e-06, "loss": 1.1982, "step": 2125 }, { "epoch": 0.5538761579717211, "grad_norm": 1.3046875, "learning_rate": 9.877461891392874e-06, "loss": 1.1411, "step": 2130 }, { "epoch": 0.5551763367463026, "grad_norm": 1.3359375, "learning_rate": 9.832081096294775e-06, "loss": 1.1854, "step": 2135 }, { "epoch": 0.5564765155208842, "grad_norm": 1.25, "learning_rate": 9.786703760075924e-06, "loss": 1.1622, "step": 2140 }, { "epoch": 0.5577766942954656, "grad_norm": 1.3359375, "learning_rate": 9.741330817441832e-06, "loss": 1.0954, "step": 2145 }, { "epoch": 0.5590768730700472, "grad_norm": 1.3203125, "learning_rate": 9.695963203007526e-06, "loss": 1.1648, "step": 2150 }, { "epoch": 0.5603770518446286, "grad_norm": 1.265625, "learning_rate": 9.650601851278271e-06, "loss": 1.1763, "step": 2155 }, { "epoch": 0.5616772306192102, "grad_norm": 1.328125, "learning_rate": 9.60524769663033e-06, "loss": 1.2054, "step": 2160 }, { "epoch": 0.5629774093937916, "grad_norm": 1.328125, "learning_rate": 9.559901673291725e-06, "loss": 1.083, "step": 2165 }, { "epoch": 0.5642775881683731, "grad_norm": 1.4296875, "learning_rate": 9.51456471532297e-06, "loss": 1.1765, "step": 2170 }, { "epoch": 0.5655777669429547, "grad_norm": 1.3515625, "learning_rate": 9.469237756597854e-06, "loss": 1.1779, "step": 2175 }, { "epoch": 0.5668779457175361, "grad_norm": 1.3671875, "learning_rate": 9.423921730784201e-06, "loss": 1.172, "step": 2180 }, { "epoch": 0.5681781244921177, "grad_norm": 1.515625, "learning_rate": 9.378617571324631e-06, "loss": 1.1612, "step": 2185 }, { "epoch": 0.5694783032666991, "grad_norm": 1.3046875, "learning_rate": 9.333326211417327e-06, "loss": 1.1727, "step": 2190 }, { "epoch": 0.5707784820412807, "grad_norm": 1.34375, "learning_rate": 9.288048583996831e-06, "loss": 1.1389, "step": 2195 }, { "epoch": 0.5720786608158622, "grad_norm": 1.2890625, "learning_rate": 9.242785621714807e-06, "loss": 1.1292, "step": 2200 }, { "epoch": 0.5720786608158622, "eval_loss": 1.1700196266174316, "eval_runtime": 578.4636, "eval_samples_per_second": 11.811, "eval_steps_per_second": 3.938, "step": 2200 }, { "epoch": 0.5733788395904437, "grad_norm": 1.2265625, "learning_rate": 9.197538256920845e-06, "loss": 1.1811, "step": 2205 }, { "epoch": 0.5746790183650252, "grad_norm": 1.34375, "learning_rate": 9.152307421643248e-06, "loss": 1.1209, "step": 2210 }, { "epoch": 0.5759791971396067, "grad_norm": 1.3671875, "learning_rate": 9.107094047569836e-06, "loss": 1.1806, "step": 2215 }, { "epoch": 0.5772793759141882, "grad_norm": 1.2890625, "learning_rate": 9.061899066028755e-06, "loss": 1.1631, "step": 2220 }, { "epoch": 0.5785795546887698, "grad_norm": 1.3125, "learning_rate": 9.016723407969297e-06, "loss": 1.219, "step": 2225 }, { "epoch": 0.5798797334633512, "grad_norm": 1.328125, "learning_rate": 8.971568003942701e-06, "loss": 1.2102, "step": 2230 }, { "epoch": 0.5811799122379327, "grad_norm": 1.3984375, "learning_rate": 8.926433784083022e-06, "loss": 1.1547, "step": 2235 }, { "epoch": 0.5824800910125142, "grad_norm": 1.6953125, "learning_rate": 8.881321678087948e-06, "loss": 1.1607, "step": 2240 }, { "epoch": 0.5837802697870957, "grad_norm": 1.3125, "learning_rate": 8.83623261519965e-06, "loss": 1.1417, "step": 2245 }, { "epoch": 0.5850804485616772, "grad_norm": 1.2890625, "learning_rate": 8.791167524185651e-06, "loss": 1.1859, "step": 2250 }, { "epoch": 0.5863806273362587, "grad_norm": 1.3125, "learning_rate": 8.746127333319679e-06, "loss": 1.1413, "step": 2255 }, { "epoch": 0.5876808061108403, "grad_norm": 1.3203125, "learning_rate": 8.701112970362569e-06, "loss": 1.1803, "step": 2260 }, { "epoch": 0.5889809848854217, "grad_norm": 1.359375, "learning_rate": 8.65612536254313e-06, "loss": 1.1832, "step": 2265 }, { "epoch": 0.5902811636600033, "grad_norm": 1.328125, "learning_rate": 8.611165436539062e-06, "loss": 1.1154, "step": 2270 }, { "epoch": 0.5915813424345847, "grad_norm": 1.34375, "learning_rate": 8.566234118457857e-06, "loss": 1.1777, "step": 2275 }, { "epoch": 0.5928815212091663, "grad_norm": 1.53125, "learning_rate": 8.52133233381773e-06, "loss": 1.2159, "step": 2280 }, { "epoch": 0.5941816999837478, "grad_norm": 1.328125, "learning_rate": 8.476461007528538e-06, "loss": 1.1776, "step": 2285 }, { "epoch": 0.5954818787583293, "grad_norm": 1.3046875, "learning_rate": 8.431621063872759e-06, "loss": 1.1542, "step": 2290 }, { "epoch": 0.5967820575329108, "grad_norm": 1.3203125, "learning_rate": 8.386813426486427e-06, "loss": 1.132, "step": 2295 }, { "epoch": 0.5980822363074922, "grad_norm": 1.34375, "learning_rate": 8.342039018340114e-06, "loss": 1.1944, "step": 2300 }, { "epoch": 0.5993824150820738, "grad_norm": 1.203125, "learning_rate": 8.297298761719923e-06, "loss": 1.0801, "step": 2305 }, { "epoch": 0.6006825938566553, "grad_norm": 1.2890625, "learning_rate": 8.252593578208485e-06, "loss": 1.1818, "step": 2310 }, { "epoch": 0.6019827726312368, "grad_norm": 1.40625, "learning_rate": 8.207924388665971e-06, "loss": 1.1598, "step": 2315 }, { "epoch": 0.6032829514058183, "grad_norm": 1.2734375, "learning_rate": 8.16329211321114e-06, "loss": 1.1533, "step": 2320 }, { "epoch": 0.6045831301803998, "grad_norm": 1.453125, "learning_rate": 8.118697671202373e-06, "loss": 1.1676, "step": 2325 }, { "epoch": 0.6058833089549813, "grad_norm": 1.265625, "learning_rate": 8.074141981218731e-06, "loss": 1.1403, "step": 2330 }, { "epoch": 0.6071834877295628, "grad_norm": 1.2734375, "learning_rate": 8.029625961041058e-06, "loss": 1.1452, "step": 2335 }, { "epoch": 0.6084836665041443, "grad_norm": 1.3203125, "learning_rate": 7.985150527633034e-06, "loss": 1.1834, "step": 2340 }, { "epoch": 0.6097838452787259, "grad_norm": 1.3515625, "learning_rate": 7.940716597122331e-06, "loss": 1.1147, "step": 2345 }, { "epoch": 0.6110840240533073, "grad_norm": 1.3203125, "learning_rate": 7.896325084781723e-06, "loss": 1.1177, "step": 2350 }, { "epoch": 0.6123842028278889, "grad_norm": 1.296875, "learning_rate": 7.85197690501022e-06, "loss": 1.1552, "step": 2355 }, { "epoch": 0.6136843816024703, "grad_norm": 1.359375, "learning_rate": 7.807672971314258e-06, "loss": 1.1636, "step": 2360 }, { "epoch": 0.6149845603770518, "grad_norm": 1.3125, "learning_rate": 7.763414196288868e-06, "loss": 1.1521, "step": 2365 }, { "epoch": 0.6162847391516334, "grad_norm": 1.3125, "learning_rate": 7.71920149159887e-06, "loss": 1.1818, "step": 2370 }, { "epoch": 0.6175849179262148, "grad_norm": 1.3203125, "learning_rate": 7.67503576796011e-06, "loss": 1.1336, "step": 2375 }, { "epoch": 0.6188850967007964, "grad_norm": 1.3203125, "learning_rate": 7.630917935120703e-06, "loss": 1.1676, "step": 2380 }, { "epoch": 0.6201852754753778, "grad_norm": 3.125, "learning_rate": 7.586848901842271e-06, "loss": 1.1335, "step": 2385 }, { "epoch": 0.6214854542499594, "grad_norm": 1.3828125, "learning_rate": 7.54282957588125e-06, "loss": 1.1636, "step": 2390 }, { "epoch": 0.6227856330245408, "grad_norm": 1.296875, "learning_rate": 7.498860863970163e-06, "loss": 1.1389, "step": 2395 }, { "epoch": 0.6240858117991224, "grad_norm": 1.3671875, "learning_rate": 7.454943671798976e-06, "loss": 1.1441, "step": 2400 }, { "epoch": 0.6240858117991224, "eval_loss": 1.1691473722457886, "eval_runtime": 577.5415, "eval_samples_per_second": 11.829, "eval_steps_per_second": 3.944, "step": 2400 }, { "epoch": 0.6253859905737039, "grad_norm": 1.34375, "learning_rate": 7.411078903996417e-06, "loss": 1.1497, "step": 2405 }, { "epoch": 0.6266861693482854, "grad_norm": 1.2734375, "learning_rate": 7.367267464111349e-06, "loss": 1.1678, "step": 2410 }, { "epoch": 0.6279863481228669, "grad_norm": 1.3984375, "learning_rate": 7.323510254594162e-06, "loss": 1.1213, "step": 2415 }, { "epoch": 0.6292865268974484, "grad_norm": 1.5546875, "learning_rate": 7.279808176778185e-06, "loss": 1.2333, "step": 2420 }, { "epoch": 0.6305867056720299, "grad_norm": 1.390625, "learning_rate": 7.2361621308611055e-06, "loss": 1.1808, "step": 2425 }, { "epoch": 0.6318868844466115, "grad_norm": 1.3359375, "learning_rate": 7.192573015886446e-06, "loss": 1.1564, "step": 2430 }, { "epoch": 0.6331870632211929, "grad_norm": 1.4375, "learning_rate": 7.149041729725033e-06, "loss": 1.1548, "step": 2435 }, { "epoch": 0.6344872419957744, "grad_norm": 1.3203125, "learning_rate": 7.1055691690565094e-06, "loss": 1.1565, "step": 2440 }, { "epoch": 0.6357874207703559, "grad_norm": 1.390625, "learning_rate": 7.062156229350856e-06, "loss": 1.1174, "step": 2445 }, { "epoch": 0.6370875995449374, "grad_norm": 1.296875, "learning_rate": 7.018803804849956e-06, "loss": 1.2232, "step": 2450 }, { "epoch": 0.638387778319519, "grad_norm": 1.2734375, "learning_rate": 6.975512788549159e-06, "loss": 1.1557, "step": 2455 }, { "epoch": 0.6396879570941004, "grad_norm": 1.3046875, "learning_rate": 6.932284072178907e-06, "loss": 1.165, "step": 2460 }, { "epoch": 0.640988135868682, "grad_norm": 1.3359375, "learning_rate": 6.889118546186357e-06, "loss": 1.1225, "step": 2465 }, { "epoch": 0.6422883146432634, "grad_norm": 1.515625, "learning_rate": 6.846017099717028e-06, "loss": 1.1471, "step": 2470 }, { "epoch": 0.643588493417845, "grad_norm": 1.3046875, "learning_rate": 6.802980620596512e-06, "loss": 1.1728, "step": 2475 }, { "epoch": 0.6448886721924264, "grad_norm": 1.296875, "learning_rate": 6.76000999531215e-06, "loss": 1.1687, "step": 2480 }, { "epoch": 0.646188850967008, "grad_norm": 1.28125, "learning_rate": 6.717106108994809e-06, "loss": 1.1786, "step": 2485 }, { "epoch": 0.6474890297415895, "grad_norm": 1.3046875, "learning_rate": 6.6742698454006275e-06, "loss": 1.141, "step": 2490 }, { "epoch": 0.648789208516171, "grad_norm": 1.3203125, "learning_rate": 6.6315020868928135e-06, "loss": 1.173, "step": 2495 }, { "epoch": 0.6500893872907525, "grad_norm": 1.265625, "learning_rate": 6.588803714423479e-06, "loss": 1.1473, "step": 2500 }, { "epoch": 0.6513895660653339, "grad_norm": 1.3828125, "learning_rate": 6.546175607515486e-06, "loss": 1.1715, "step": 2505 }, { "epoch": 0.6526897448399155, "grad_norm": 1.296875, "learning_rate": 6.5036186442443175e-06, "loss": 1.1714, "step": 2510 }, { "epoch": 0.653989923614497, "grad_norm": 1.3125, "learning_rate": 6.461133701220019e-06, "loss": 1.1497, "step": 2515 }, { "epoch": 0.6552901023890785, "grad_norm": 1.3125, "learning_rate": 6.418721653569123e-06, "loss": 1.1169, "step": 2520 }, { "epoch": 0.65659028116366, "grad_norm": 1.359375, "learning_rate": 6.376383374916621e-06, "loss": 1.1773, "step": 2525 }, { "epoch": 0.6578904599382415, "grad_norm": 1.359375, "learning_rate": 6.334119737367978e-06, "loss": 1.1745, "step": 2530 }, { "epoch": 0.659190638712823, "grad_norm": 1.296875, "learning_rate": 6.291931611491159e-06, "loss": 1.1896, "step": 2535 }, { "epoch": 0.6604908174874046, "grad_norm": 1.3671875, "learning_rate": 6.249819866298703e-06, "loss": 1.1269, "step": 2540 }, { "epoch": 0.661790996261986, "grad_norm": 1.265625, "learning_rate": 6.207785369229822e-06, "loss": 1.1391, "step": 2545 }, { "epoch": 0.6630911750365676, "grad_norm": 1.3359375, "learning_rate": 6.165828986132531e-06, "loss": 1.1687, "step": 2550 }, { "epoch": 0.664391353811149, "grad_norm": 1.3515625, "learning_rate": 6.123951581245815e-06, "loss": 1.2048, "step": 2555 }, { "epoch": 0.6656915325857305, "grad_norm": 1.2890625, "learning_rate": 6.082154017181825e-06, "loss": 1.1827, "step": 2560 }, { "epoch": 0.666991711360312, "grad_norm": 1.28125, "learning_rate": 6.0404371549081046e-06, "loss": 1.1623, "step": 2565 }, { "epoch": 0.6682918901348935, "grad_norm": 1.2890625, "learning_rate": 5.998801853729865e-06, "loss": 1.1126, "step": 2570 }, { "epoch": 0.6695920689094751, "grad_norm": 1.3671875, "learning_rate": 5.957248971272285e-06, "loss": 1.1112, "step": 2575 }, { "epoch": 0.6708922476840565, "grad_norm": 1.3125, "learning_rate": 5.915779363462832e-06, "loss": 1.1893, "step": 2580 }, { "epoch": 0.6721924264586381, "grad_norm": 1.3203125, "learning_rate": 5.874393884513644e-06, "loss": 1.1891, "step": 2585 }, { "epoch": 0.6734926052332195, "grad_norm": 1.2578125, "learning_rate": 5.833093386903933e-06, "loss": 1.1045, "step": 2590 }, { "epoch": 0.6747927840078011, "grad_norm": 1.2578125, "learning_rate": 5.791878721362416e-06, "loss": 1.219, "step": 2595 }, { "epoch": 0.6760929627823826, "grad_norm": 1.390625, "learning_rate": 5.750750736849795e-06, "loss": 1.169, "step": 2600 }, { "epoch": 0.6760929627823826, "eval_loss": 1.1685597896575928, "eval_runtime": 577.6155, "eval_samples_per_second": 11.828, "eval_steps_per_second": 3.944, "step": 2600 }, { "epoch": 0.6773931415569641, "grad_norm": 1.3125, "learning_rate": 5.709710280541288e-06, "loss": 1.1884, "step": 2605 }, { "epoch": 0.6786933203315456, "grad_norm": 1.3125, "learning_rate": 5.668758197809142e-06, "loss": 1.1738, "step": 2610 }, { "epoch": 0.679993499106127, "grad_norm": 1.3046875, "learning_rate": 5.627895332205253e-06, "loss": 1.1211, "step": 2615 }, { "epoch": 0.6812936778807086, "grad_norm": 1.3984375, "learning_rate": 5.587122525443771e-06, "loss": 1.1684, "step": 2620 }, { "epoch": 0.6825938566552902, "grad_norm": 1.3515625, "learning_rate": 5.546440617383768e-06, "loss": 1.1769, "step": 2625 }, { "epoch": 0.6838940354298716, "grad_norm": 1.2578125, "learning_rate": 5.5058504460119475e-06, "loss": 1.134, "step": 2630 }, { "epoch": 0.6851942142044531, "grad_norm": 1.3125, "learning_rate": 5.4653528474253596e-06, "loss": 1.1147, "step": 2635 }, { "epoch": 0.6864943929790346, "grad_norm": 1.3515625, "learning_rate": 5.4249486558142065e-06, "loss": 1.2026, "step": 2640 }, { "epoch": 0.6877945717536161, "grad_norm": 1.375, "learning_rate": 5.384638703444636e-06, "loss": 1.1563, "step": 2645 }, { "epoch": 0.6890947505281976, "grad_norm": 1.296875, "learning_rate": 5.3444238206416135e-06, "loss": 1.1765, "step": 2650 }, { "epoch": 0.6903949293027791, "grad_norm": 1.421875, "learning_rate": 5.304304835771803e-06, "loss": 1.1534, "step": 2655 }, { "epoch": 0.6916951080773607, "grad_norm": 1.359375, "learning_rate": 5.264282575226534e-06, "loss": 1.1435, "step": 2660 }, { "epoch": 0.6929952868519421, "grad_norm": 1.390625, "learning_rate": 5.224357863404739e-06, "loss": 1.1568, "step": 2665 }, { "epoch": 0.6942954656265237, "grad_norm": 1.28125, "learning_rate": 5.1845315226960035e-06, "loss": 1.1412, "step": 2670 }, { "epoch": 0.6955956444011051, "grad_norm": 1.25, "learning_rate": 5.144804373463607e-06, "loss": 1.1559, "step": 2675 }, { "epoch": 0.6968958231756867, "grad_norm": 1.3984375, "learning_rate": 5.105177234027639e-06, "loss": 1.1444, "step": 2680 }, { "epoch": 0.6981960019502682, "grad_norm": 1.3203125, "learning_rate": 5.0656509206481355e-06, "loss": 1.1654, "step": 2685 }, { "epoch": 0.6994961807248496, "grad_norm": 1.515625, "learning_rate": 5.026226247508258e-06, "loss": 1.2098, "step": 2690 }, { "epoch": 0.7007963594994312, "grad_norm": 1.4140625, "learning_rate": 4.986904026697544e-06, "loss": 1.1362, "step": 2695 }, { "epoch": 0.7020965382740126, "grad_norm": 1.2578125, "learning_rate": 4.947685068195157e-06, "loss": 1.1308, "step": 2700 }, { "epoch": 0.7033967170485942, "grad_norm": 1.28125, "learning_rate": 4.908570179853209e-06, "loss": 1.1441, "step": 2705 }, { "epoch": 0.7046968958231757, "grad_norm": 1.3671875, "learning_rate": 4.869560167380123e-06, "loss": 1.2202, "step": 2710 }, { "epoch": 0.7059970745977572, "grad_norm": 1.3046875, "learning_rate": 4.830655834324044e-06, "loss": 1.1379, "step": 2715 }, { "epoch": 0.7072972533723387, "grad_norm": 1.34375, "learning_rate": 4.791857982056267e-06, "loss": 1.1414, "step": 2720 }, { "epoch": 0.7085974321469202, "grad_norm": 1.3046875, "learning_rate": 4.753167409754744e-06, "loss": 1.1287, "step": 2725 }, { "epoch": 0.7098976109215017, "grad_norm": 1.3203125, "learning_rate": 4.714584914387632e-06, "loss": 1.2116, "step": 2730 }, { "epoch": 0.7111977896960832, "grad_norm": 1.2578125, "learning_rate": 4.676111290696837e-06, "loss": 1.1359, "step": 2735 }, { "epoch": 0.7124979684706647, "grad_norm": 1.34375, "learning_rate": 4.6377473311817025e-06, "loss": 1.1473, "step": 2740 }, { "epoch": 0.7137981472452463, "grad_norm": 1.375, "learning_rate": 4.599493826082626e-06, "loss": 1.1692, "step": 2745 }, { "epoch": 0.7150983260198277, "grad_norm": 1.28125, "learning_rate": 4.561351563364833e-06, "loss": 1.1306, "step": 2750 }, { "epoch": 0.7163985047944093, "grad_norm": 1.4375, "learning_rate": 4.523321328702101e-06, "loss": 1.1438, "step": 2755 }, { "epoch": 0.7176986835689907, "grad_norm": 1.265625, "learning_rate": 4.48540390546061e-06, "loss": 1.1375, "step": 2760 }, { "epoch": 0.7189988623435722, "grad_norm": 1.3203125, "learning_rate": 4.447600074682781e-06, "loss": 1.1413, "step": 2765 }, { "epoch": 0.7202990411181538, "grad_norm": 1.375, "learning_rate": 4.4099106150712134e-06, "loss": 1.1749, "step": 2770 }, { "epoch": 0.7215992198927352, "grad_norm": 1.2890625, "learning_rate": 4.372336302972622e-06, "loss": 1.155, "step": 2775 }, { "epoch": 0.7228993986673168, "grad_norm": 1.5703125, "learning_rate": 4.334877912361851e-06, "loss": 1.1569, "step": 2780 }, { "epoch": 0.7241995774418982, "grad_norm": 1.296875, "learning_rate": 4.297536214825951e-06, "loss": 1.1328, "step": 2785 }, { "epoch": 0.7254997562164798, "grad_norm": 1.296875, "learning_rate": 4.260311979548245e-06, "loss": 1.1654, "step": 2790 }, { "epoch": 0.7267999349910613, "grad_norm": 1.296875, "learning_rate": 4.2232059732925305e-06, "loss": 1.174, "step": 2795 }, { "epoch": 0.7281001137656428, "grad_norm": 1.3125, "learning_rate": 4.186218960387247e-06, "loss": 1.123, "step": 2800 }, { "epoch": 0.7281001137656428, "eval_loss": 1.1681982278823853, "eval_runtime": 577.5234, "eval_samples_per_second": 11.83, "eval_steps_per_second": 3.944, "step": 2800 }, { "epoch": 0.7294002925402243, "grad_norm": 1.3671875, "learning_rate": 4.1493517027097644e-06, "loss": 1.1895, "step": 2805 }, { "epoch": 0.7307004713148058, "grad_norm": 1.265625, "learning_rate": 4.112604959670655e-06, "loss": 1.1323, "step": 2810 }, { "epoch": 0.7320006500893873, "grad_norm": 1.28125, "learning_rate": 4.075979488198092e-06, "loss": 1.1629, "step": 2815 }, { "epoch": 0.7333008288639687, "grad_norm": 1.328125, "learning_rate": 4.039476042722205e-06, "loss": 1.1688, "step": 2820 }, { "epoch": 0.7346010076385503, "grad_norm": 1.3203125, "learning_rate": 4.003095375159598e-06, "loss": 1.124, "step": 2825 }, { "epoch": 0.7359011864131318, "grad_norm": 1.3203125, "learning_rate": 3.966838234897817e-06, "loss": 1.1642, "step": 2830 }, { "epoch": 0.7372013651877133, "grad_norm": 1.421875, "learning_rate": 3.930705368779931e-06, "loss": 1.1488, "step": 2835 }, { "epoch": 0.7385015439622948, "grad_norm": 1.3203125, "learning_rate": 3.894697521089161e-06, "loss": 1.1468, "step": 2840 }, { "epoch": 0.7398017227368763, "grad_norm": 1.3671875, "learning_rate": 3.858815433533509e-06, "loss": 1.1851, "step": 2845 }, { "epoch": 0.7411019015114578, "grad_norm": 1.3359375, "learning_rate": 3.82305984523053e-06, "loss": 1.2236, "step": 2850 }, { "epoch": 0.7424020802860394, "grad_norm": 1.3203125, "learning_rate": 3.7874314926920664e-06, "loss": 1.1327, "step": 2855 }, { "epoch": 0.7437022590606208, "grad_norm": 1.28125, "learning_rate": 3.751931109809105e-06, "loss": 1.1473, "step": 2860 }, { "epoch": 0.7450024378352024, "grad_norm": 1.2890625, "learning_rate": 3.716559427836639e-06, "loss": 1.1859, "step": 2865 }, { "epoch": 0.7463026166097838, "grad_norm": 1.3359375, "learning_rate": 3.681317175378627e-06, "loss": 1.2249, "step": 2870 }, { "epoch": 0.7476027953843654, "grad_norm": 1.5, "learning_rate": 3.646205078372952e-06, "loss": 1.1738, "step": 2875 }, { "epoch": 0.7489029741589469, "grad_norm": 1.328125, "learning_rate": 3.6112238600765104e-06, "loss": 1.1335, "step": 2880 }, { "epoch": 0.7502031529335283, "grad_norm": 1.34375, "learning_rate": 3.576374241050279e-06, "loss": 1.1776, "step": 2885 }, { "epoch": 0.7515033317081099, "grad_norm": 1.3671875, "learning_rate": 3.5416569391444857e-06, "loss": 1.1905, "step": 2890 }, { "epoch": 0.7528035104826913, "grad_norm": 1.46875, "learning_rate": 3.5070726694838376e-06, "loss": 1.1748, "step": 2895 }, { "epoch": 0.7541036892572729, "grad_norm": 1.3046875, "learning_rate": 3.4726221444527496e-06, "loss": 1.1324, "step": 2900 }, { "epoch": 0.7554038680318543, "grad_norm": 1.234375, "learning_rate": 3.438306073680724e-06, "loss": 1.1313, "step": 2905 }, { "epoch": 0.7567040468064359, "grad_norm": 1.3125, "learning_rate": 3.404125164027684e-06, "loss": 1.1154, "step": 2910 }, { "epoch": 0.7580042255810174, "grad_norm": 1.34375, "learning_rate": 3.370080119569451e-06, "loss": 1.1652, "step": 2915 }, { "epoch": 0.7593044043555989, "grad_norm": 1.3828125, "learning_rate": 3.3361716415832147e-06, "loss": 1.1534, "step": 2920 }, { "epoch": 0.7606045831301804, "grad_norm": 1.25, "learning_rate": 3.30240042853311e-06, "loss": 1.1351, "step": 2925 }, { "epoch": 0.7619047619047619, "grad_norm": 1.3671875, "learning_rate": 3.2687671760558016e-06, "loss": 1.0693, "step": 2930 }, { "epoch": 0.7632049406793434, "grad_norm": 1.296875, "learning_rate": 3.235272576946191e-06, "loss": 1.1752, "step": 2935 }, { "epoch": 0.764505119453925, "grad_norm": 1.3359375, "learning_rate": 3.2019173211431154e-06, "loss": 1.1319, "step": 2940 }, { "epoch": 0.7658052982285064, "grad_norm": 1.2421875, "learning_rate": 3.168702095715146e-06, "loss": 1.1176, "step": 2945 }, { "epoch": 0.767105477003088, "grad_norm": 1.296875, "learning_rate": 3.135627584846449e-06, "loss": 1.1512, "step": 2950 }, { "epoch": 0.7684056557776694, "grad_norm": 1.28125, "learning_rate": 3.102694469822668e-06, "loss": 1.1601, "step": 2955 }, { "epoch": 0.769705834552251, "grad_norm": 1.2578125, "learning_rate": 3.0699034290169073e-06, "loss": 1.1384, "step": 2960 }, { "epoch": 0.7710060133268324, "grad_norm": 1.40625, "learning_rate": 3.037255137875751e-06, "loss": 1.1272, "step": 2965 }, { "epoch": 0.7723061921014139, "grad_norm": 1.3203125, "learning_rate": 3.004750268905363e-06, "loss": 1.1597, "step": 2970 }, { "epoch": 0.7736063708759955, "grad_norm": 1.375, "learning_rate": 2.9723894916576103e-06, "loss": 1.1599, "step": 2975 }, { "epoch": 0.7749065496505769, "grad_norm": 1.3125, "learning_rate": 2.940173472716301e-06, "loss": 1.1005, "step": 2980 }, { "epoch": 0.7762067284251585, "grad_norm": 1.359375, "learning_rate": 2.908102875683415e-06, "loss": 1.173, "step": 2985 }, { "epoch": 0.7775069071997399, "grad_norm": 1.28125, "learning_rate": 2.8761783611654826e-06, "loss": 1.1409, "step": 2990 }, { "epoch": 0.7788070859743215, "grad_norm": 1.3359375, "learning_rate": 2.8444005867599343e-06, "loss": 1.1506, "step": 2995 }, { "epoch": 0.780107264748903, "grad_norm": 1.234375, "learning_rate": 2.8127702070415787e-06, "loss": 1.0825, "step": 3000 }, { "epoch": 0.780107264748903, "eval_loss": 1.1680325269699097, "eval_runtime": 577.7067, "eval_samples_per_second": 11.826, "eval_steps_per_second": 3.943, "step": 3000 }, { "epoch": 0.7814074435234845, "grad_norm": 1.296875, "learning_rate": 2.781287873549119e-06, "loss": 1.1626, "step": 3005 }, { "epoch": 0.782707622298066, "grad_norm": 1.2890625, "learning_rate": 2.7499542347717166e-06, "loss": 1.1208, "step": 3010 }, { "epoch": 0.7840078010726474, "grad_norm": 1.453125, "learning_rate": 2.71876993613565e-06, "loss": 1.1578, "step": 3015 }, { "epoch": 0.785307979847229, "grad_norm": 1.3125, "learning_rate": 2.687735619991009e-06, "loss": 1.166, "step": 3020 }, { "epoch": 0.7866081586218105, "grad_norm": 1.390625, "learning_rate": 2.6568519255984736e-06, "loss": 1.1816, "step": 3025 }, { "epoch": 0.787908337396392, "grad_norm": 1.34375, "learning_rate": 2.6261194891161325e-06, "loss": 1.1414, "step": 3030 }, { "epoch": 0.7892085161709735, "grad_norm": 1.265625, "learning_rate": 2.5955389435863953e-06, "loss": 1.1011, "step": 3035 }, { "epoch": 0.790508694945555, "grad_norm": 1.4140625, "learning_rate": 2.5651109189229375e-06, "loss": 1.1629, "step": 3040 }, { "epoch": 0.7918088737201365, "grad_norm": 1.359375, "learning_rate": 2.5348360418977334e-06, "loss": 1.1627, "step": 3045 }, { "epoch": 0.793109052494718, "grad_norm": 1.3984375, "learning_rate": 2.5047149361281462e-06, "loss": 1.1426, "step": 3050 }, { "epoch": 0.7944092312692995, "grad_norm": 1.328125, "learning_rate": 2.4747482220640764e-06, "loss": 1.1909, "step": 3055 }, { "epoch": 0.7957094100438811, "grad_norm": 1.3203125, "learning_rate": 2.4449365169751916e-06, "loss": 1.2272, "step": 3060 }, { "epoch": 0.7970095888184625, "grad_norm": 1.34375, "learning_rate": 2.4152804349382007e-06, "loss": 1.1961, "step": 3065 }, { "epoch": 0.7983097675930441, "grad_norm": 1.3046875, "learning_rate": 2.3857805868242103e-06, "loss": 1.1709, "step": 3070 }, { "epoch": 0.7996099463676255, "grad_norm": 1.4453125, "learning_rate": 2.3564375802861383e-06, "loss": 1.1964, "step": 3075 }, { "epoch": 0.800910125142207, "grad_norm": 1.328125, "learning_rate": 2.32725201974621e-06, "loss": 1.1836, "step": 3080 }, { "epoch": 0.8022103039167886, "grad_norm": 1.328125, "learning_rate": 2.2982245063834828e-06, "loss": 1.1997, "step": 3085 }, { "epoch": 0.80351048269137, "grad_norm": 1.2578125, "learning_rate": 2.2693556381214934e-06, "loss": 1.1136, "step": 3090 }, { "epoch": 0.8048106614659516, "grad_norm": 1.2890625, "learning_rate": 2.240646009615912e-06, "loss": 1.1887, "step": 3095 }, { "epoch": 0.806110840240533, "grad_norm": 1.3359375, "learning_rate": 2.212096212242316e-06, "loss": 1.1503, "step": 3100 }, { "epoch": 0.8074110190151146, "grad_norm": 1.34375, "learning_rate": 2.1837068340839964e-06, "loss": 1.1761, "step": 3105 }, { "epoch": 0.8087111977896961, "grad_norm": 1.390625, "learning_rate": 2.155478459919843e-06, "loss": 1.1851, "step": 3110 }, { "epoch": 0.8100113765642776, "grad_norm": 1.2734375, "learning_rate": 2.127411671212315e-06, "loss": 1.1357, "step": 3115 }, { "epoch": 0.8113115553388591, "grad_norm": 1.3046875, "learning_rate": 2.09950704609544e-06, "loss": 1.1792, "step": 3120 }, { "epoch": 0.8126117341134406, "grad_norm": 1.3203125, "learning_rate": 2.071765159362924e-06, "loss": 1.1707, "step": 3125 }, { "epoch": 0.8139119128880221, "grad_norm": 1.265625, "learning_rate": 2.044186582456298e-06, "loss": 1.1201, "step": 3130 }, { "epoch": 0.8152120916626036, "grad_norm": 1.25, "learning_rate": 2.016771883453165e-06, "loss": 1.1181, "step": 3135 }, { "epoch": 0.8165122704371851, "grad_norm": 1.40625, "learning_rate": 1.9895216270554764e-06, "loss": 1.1251, "step": 3140 }, { "epoch": 0.8178124492117667, "grad_norm": 1.3984375, "learning_rate": 1.9624363745779184e-06, "loss": 1.17, "step": 3145 }, { "epoch": 0.8191126279863481, "grad_norm": 1.296875, "learning_rate": 1.935516683936338e-06, "loss": 1.1976, "step": 3150 }, { "epoch": 0.8204128067609296, "grad_norm": 1.2734375, "learning_rate": 1.908763109636257e-06, "loss": 1.1667, "step": 3155 }, { "epoch": 0.8217129855355111, "grad_norm": 1.3203125, "learning_rate": 1.8821762027614464e-06, "loss": 1.1874, "step": 3160 }, { "epoch": 0.8230131643100926, "grad_norm": 1.28125, "learning_rate": 1.8557565109625753e-06, "loss": 1.1837, "step": 3165 }, { "epoch": 0.8243133430846742, "grad_norm": 1.296875, "learning_rate": 1.8295045784459386e-06, "loss": 1.0932, "step": 3170 }, { "epoch": 0.8256135218592556, "grad_norm": 1.2890625, "learning_rate": 1.8034209459622288e-06, "loss": 1.1958, "step": 3175 }, { "epoch": 0.8269137006338372, "grad_norm": 1.2890625, "learning_rate": 1.7775061507954239e-06, "loss": 1.1478, "step": 3180 }, { "epoch": 0.8282138794084186, "grad_norm": 1.3203125, "learning_rate": 1.7517607267516845e-06, "loss": 1.2405, "step": 3185 }, { "epoch": 0.8295140581830002, "grad_norm": 1.453125, "learning_rate": 1.7261852041483961e-06, "loss": 1.1547, "step": 3190 }, { "epoch": 0.8308142369575817, "grad_norm": 1.3046875, "learning_rate": 1.7007801098032162e-06, "loss": 1.2307, "step": 3195 }, { "epoch": 0.8321144157321632, "grad_norm": 1.3671875, "learning_rate": 1.6755459670232454e-06, "loss": 1.2208, "step": 3200 }, { "epoch": 0.8321144157321632, "eval_loss": 1.1679521799087524, "eval_runtime": 577.5831, "eval_samples_per_second": 11.829, "eval_steps_per_second": 3.944, "step": 3200 }, { "epoch": 0.8334145945067447, "grad_norm": 1.3984375, "learning_rate": 1.6504832955942251e-06, "loss": 1.1363, "step": 3205 }, { "epoch": 0.8347147732813262, "grad_norm": 1.2734375, "learning_rate": 1.6255926117698485e-06, "loss": 1.2079, "step": 3210 }, { "epoch": 0.8360149520559077, "grad_norm": 1.3359375, "learning_rate": 1.6008744282611189e-06, "loss": 1.205, "step": 3215 }, { "epoch": 0.8373151308304891, "grad_norm": 1.3671875, "learning_rate": 1.576329254225788e-06, "loss": 1.1157, "step": 3220 }, { "epoch": 0.8386153096050707, "grad_norm": 1.28125, "learning_rate": 1.551957595257878e-06, "loss": 1.1672, "step": 3225 }, { "epoch": 0.8399154883796522, "grad_norm": 1.3046875, "learning_rate": 1.527759953377247e-06, "loss": 1.1382, "step": 3230 }, { "epoch": 0.8412156671542337, "grad_norm": 1.25, "learning_rate": 1.5037368270192765e-06, "loss": 1.145, "step": 3235 }, { "epoch": 0.8425158459288152, "grad_norm": 1.3203125, "learning_rate": 1.4798887110245686e-06, "loss": 1.1859, "step": 3240 }, { "epoch": 0.8438160247033967, "grad_norm": 1.34375, "learning_rate": 1.4562160966287886e-06, "loss": 1.1264, "step": 3245 }, { "epoch": 0.8451162034779782, "grad_norm": 1.34375, "learning_rate": 1.4327194714525217e-06, "loss": 1.121, "step": 3250 }, { "epoch": 0.8464163822525598, "grad_norm": 1.296875, "learning_rate": 1.4093993194912437e-06, "loss": 1.1104, "step": 3255 }, { "epoch": 0.8477165610271412, "grad_norm": 1.3359375, "learning_rate": 1.3862561211053405e-06, "loss": 1.1791, "step": 3260 }, { "epoch": 0.8490167398017228, "grad_norm": 1.3203125, "learning_rate": 1.3632903530102171e-06, "loss": 1.1554, "step": 3265 }, { "epoch": 0.8503169185763042, "grad_norm": 1.3046875, "learning_rate": 1.3405024882664808e-06, "loss": 1.1313, "step": 3270 }, { "epoch": 0.8516170973508858, "grad_norm": 1.296875, "learning_rate": 1.3178929962701981e-06, "loss": 1.16, "step": 3275 }, { "epoch": 0.8529172761254673, "grad_norm": 1.3046875, "learning_rate": 1.2954623427432177e-06, "loss": 1.1518, "step": 3280 }, { "epoch": 0.8542174549000487, "grad_norm": 1.46875, "learning_rate": 1.2732109897235822e-06, "loss": 1.1473, "step": 3285 }, { "epoch": 0.8555176336746303, "grad_norm": 1.2890625, "learning_rate": 1.2511393955560192e-06, "loss": 1.1437, "step": 3290 }, { "epoch": 0.8568178124492117, "grad_norm": 1.3515625, "learning_rate": 1.229248014882476e-06, "loss": 1.1226, "step": 3295 }, { "epoch": 0.8581179912237933, "grad_norm": 1.328125, "learning_rate": 1.207537298632787e-06, "loss": 1.1858, "step": 3300 }, { "epoch": 0.8594181699983747, "grad_norm": 1.3671875, "learning_rate": 1.1860076940153575e-06, "loss": 1.1549, "step": 3305 }, { "epoch": 0.8607183487729563, "grad_norm": 2.15625, "learning_rate": 1.1646596445079705e-06, "loss": 1.1501, "step": 3310 }, { "epoch": 0.8620185275475378, "grad_norm": 1.3046875, "learning_rate": 1.1434935898486377e-06, "loss": 1.1422, "step": 3315 }, { "epoch": 0.8633187063221193, "grad_norm": 1.3125, "learning_rate": 1.1225099660265526e-06, "loss": 1.1394, "step": 3320 }, { "epoch": 0.8646188850967008, "grad_norm": 1.2890625, "learning_rate": 1.1017092052731004e-06, "loss": 1.1111, "step": 3325 }, { "epoch": 0.8659190638712823, "grad_norm": 1.390625, "learning_rate": 1.0810917360529681e-06, "loss": 1.2065, "step": 3330 }, { "epoch": 0.8672192426458638, "grad_norm": 1.296875, "learning_rate": 1.0606579830553009e-06, "loss": 1.1393, "step": 3335 }, { "epoch": 0.8685194214204454, "grad_norm": 1.3125, "learning_rate": 1.0404083671849674e-06, "loss": 1.1753, "step": 3340 }, { "epoch": 0.8698196001950268, "grad_norm": 1.3828125, "learning_rate": 1.0203433055538904e-06, "loss": 1.1847, "step": 3345 }, { "epoch": 0.8711197789696083, "grad_norm": 1.3125, "learning_rate": 1.0004632114724388e-06, "loss": 1.1618, "step": 3350 }, { "epoch": 0.8724199577441898, "grad_norm": 1.4453125, "learning_rate": 9.807684944409368e-07, "loss": 1.1432, "step": 3355 }, { "epoch": 0.8737201365187713, "grad_norm": 1.3125, "learning_rate": 9.61259560141211e-07, "loss": 1.2046, "step": 3360 }, { "epoch": 0.8750203152933529, "grad_norm": 1.34375, "learning_rate": 9.419368104282467e-07, "loss": 1.1266, "step": 3365 }, { "epoch": 0.8763204940679343, "grad_norm": 1.3046875, "learning_rate": 9.228006433218961e-07, "loss": 1.1734, "step": 3370 }, { "epoch": 0.8776206728425159, "grad_norm": 1.3671875, "learning_rate": 9.038514529986941e-07, "loss": 1.1328, "step": 3375 }, { "epoch": 0.8789208516170973, "grad_norm": 1.6171875, "learning_rate": 8.850896297837275e-07, "loss": 1.2079, "step": 3380 }, { "epoch": 0.8802210303916789, "grad_norm": 1.265625, "learning_rate": 8.665155601426046e-07, "loss": 1.1265, "step": 3385 }, { "epoch": 0.8815212091662603, "grad_norm": 1.296875, "learning_rate": 8.481296266734873e-07, "loss": 1.141, "step": 3390 }, { "epoch": 0.8828213879408419, "grad_norm": 1.3125, "learning_rate": 8.299322080992112e-07, "loss": 1.183, "step": 3395 }, { "epoch": 0.8841215667154234, "grad_norm": 1.3203125, "learning_rate": 8.119236792594897e-07, "loss": 1.129, "step": 3400 }, { "epoch": 0.8841215667154234, "eval_loss": 1.1679151058197021, "eval_runtime": 577.5628, "eval_samples_per_second": 11.829, "eval_steps_per_second": 3.944, "step": 3400 }, { "epoch": 0.8854217454900049, "grad_norm": 1.3203125, "learning_rate": 7.94104411103187e-07, "loss": 1.1262, "step": 3405 }, { "epoch": 0.8867219242645864, "grad_norm": 1.296875, "learning_rate": 7.764747706806786e-07, "loss": 1.1873, "step": 3410 }, { "epoch": 0.8880221030391678, "grad_norm": 1.34375, "learning_rate": 7.59035121136289e-07, "loss": 1.2039, "step": 3415 }, { "epoch": 0.8893222818137494, "grad_norm": 1.3515625, "learning_rate": 7.417858217008222e-07, "loss": 1.1622, "step": 3420 }, { "epoch": 0.8906224605883309, "grad_norm": 1.2890625, "learning_rate": 7.247272276841422e-07, "loss": 1.1862, "step": 3425 }, { "epoch": 0.8919226393629124, "grad_norm": 1.3515625, "learning_rate": 7.07859690467877e-07, "loss": 1.1924, "step": 3430 }, { "epoch": 0.8932228181374939, "grad_norm": 1.3515625, "learning_rate": 6.911835574981573e-07, "loss": 1.1518, "step": 3435 }, { "epoch": 0.8945229969120754, "grad_norm": 1.296875, "learning_rate": 6.746991722784813e-07, "loss": 1.1507, "step": 3440 }, { "epoch": 0.8958231756866569, "grad_norm": 1.296875, "learning_rate": 6.584068743626238e-07, "loss": 1.16, "step": 3445 }, { "epoch": 0.8971233544612384, "grad_norm": 1.2890625, "learning_rate": 6.423069993476483e-07, "loss": 1.1328, "step": 3450 }, { "epoch": 0.8984235332358199, "grad_norm": 1.3125, "learning_rate": 6.263998788669956e-07, "loss": 1.1424, "step": 3455 }, { "epoch": 0.8997237120104015, "grad_norm": 1.3125, "learning_rate": 6.106858405836491e-07, "loss": 1.0907, "step": 3460 }, { "epoch": 0.9010238907849829, "grad_norm": 1.296875, "learning_rate": 5.951652081833836e-07, "loss": 1.2033, "step": 3465 }, { "epoch": 0.9023240695595645, "grad_norm": 1.3359375, "learning_rate": 5.79838301368103e-07, "loss": 1.2056, "step": 3470 }, { "epoch": 0.9036242483341459, "grad_norm": 1.328125, "learning_rate": 5.647054358492554e-07, "loss": 1.1362, "step": 3475 }, { "epoch": 0.9049244271087274, "grad_norm": 1.3359375, "learning_rate": 5.497669233413227e-07, "loss": 1.1422, "step": 3480 }, { "epoch": 0.906224605883309, "grad_norm": 1.3359375, "learning_rate": 5.350230715554105e-07, "loss": 1.2309, "step": 3485 }, { "epoch": 0.9075247846578904, "grad_norm": 1.3515625, "learning_rate": 5.20474184192894e-07, "loss": 1.1572, "step": 3490 }, { "epoch": 0.908824963432472, "grad_norm": 1.3515625, "learning_rate": 5.061205609391807e-07, "loss": 1.1473, "step": 3495 }, { "epoch": 0.9101251422070534, "grad_norm": 1.328125, "learning_rate": 4.919624974575254e-07, "loss": 1.1713, "step": 3500 }, { "epoch": 0.911425320981635, "grad_norm": 1.3515625, "learning_rate": 4.78000285382939e-07, "loss": 1.1811, "step": 3505 }, { "epoch": 0.9127254997562165, "grad_norm": 1.3046875, "learning_rate": 4.642342123161925e-07, "loss": 1.1762, "step": 3510 }, { "epoch": 0.914025678530798, "grad_norm": 1.3046875, "learning_rate": 4.5066456181788063e-07, "loss": 1.1077, "step": 3515 }, { "epoch": 0.9153258573053795, "grad_norm": 1.4140625, "learning_rate": 4.372916134025873e-07, "loss": 1.1273, "step": 3520 }, { "epoch": 0.916626036079961, "grad_norm": 1.2890625, "learning_rate": 4.241156425331239e-07, "loss": 1.1142, "step": 3525 }, { "epoch": 0.9179262148545425, "grad_norm": 1.2578125, "learning_rate": 4.111369206148641e-07, "loss": 1.136, "step": 3530 }, { "epoch": 0.919226393629124, "grad_norm": 1.3125, "learning_rate": 3.983557149901396e-07, "loss": 1.1332, "step": 3535 }, { "epoch": 0.9205265724037055, "grad_norm": 1.3046875, "learning_rate": 3.857722889327464e-07, "loss": 1.1709, "step": 3540 }, { "epoch": 0.921826751178287, "grad_norm": 1.4140625, "learning_rate": 3.73386901642514e-07, "loss": 1.1097, "step": 3545 }, { "epoch": 0.9231269299528685, "grad_norm": 1.3046875, "learning_rate": 3.6119980823996705e-07, "loss": 1.1426, "step": 3550 }, { "epoch": 0.92442710872745, "grad_norm": 1.3359375, "learning_rate": 3.492112597610742e-07, "loss": 1.132, "step": 3555 }, { "epoch": 0.9257272875020315, "grad_norm": 1.2890625, "learning_rate": 3.374215031520711e-07, "loss": 1.1641, "step": 3560 }, { "epoch": 0.927027466276613, "grad_norm": 1.2734375, "learning_rate": 3.258307812643813e-07, "loss": 1.1326, "step": 3565 }, { "epoch": 0.9283276450511946, "grad_norm": 1.328125, "learning_rate": 3.1443933284960535e-07, "loss": 1.1717, "step": 3570 }, { "epoch": 0.929627823825776, "grad_norm": 1.3671875, "learning_rate": 3.032473925546109e-07, "loss": 1.1891, "step": 3575 }, { "epoch": 0.9309280026003576, "grad_norm": 1.3203125, "learning_rate": 2.9225519091669153e-07, "loss": 1.1511, "step": 3580 }, { "epoch": 0.932228181374939, "grad_norm": 1.3203125, "learning_rate": 2.8146295435882767e-07, "loss": 1.1461, "step": 3585 }, { "epoch": 0.9335283601495206, "grad_norm": 1.3671875, "learning_rate": 2.7087090518501004e-07, "loss": 1.1733, "step": 3590 }, { "epoch": 0.9348285389241021, "grad_norm": 1.2890625, "learning_rate": 2.604792615756735e-07, "loss": 1.1656, "step": 3595 }, { "epoch": 0.9361287176986836, "grad_norm": 1.2734375, "learning_rate": 2.502882375831939e-07, "loss": 1.1435, "step": 3600 }, { "epoch": 0.9361287176986836, "eval_loss": 1.1679141521453857, "eval_runtime": 577.8338, "eval_samples_per_second": 11.823, "eval_steps_per_second": 3.942, "step": 3600 }, { "epoch": 0.9374288964732651, "grad_norm": 1.296875, "learning_rate": 2.4029804312748264e-07, "loss": 1.1184, "step": 3605 }, { "epoch": 0.9387290752478465, "grad_norm": 1.296875, "learning_rate": 2.3050888399166026e-07, "loss": 1.2475, "step": 3610 }, { "epoch": 0.9400292540224281, "grad_norm": 1.2421875, "learning_rate": 2.2092096181782208e-07, "loss": 1.1822, "step": 3615 }, { "epoch": 0.9413294327970095, "grad_norm": 1.2578125, "learning_rate": 2.115344741028813e-07, "loss": 1.1474, "step": 3620 }, { "epoch": 0.9426296115715911, "grad_norm": 1.3203125, "learning_rate": 2.0234961419449693e-07, "loss": 1.1979, "step": 3625 }, { "epoch": 0.9439297903461726, "grad_norm": 1.2890625, "learning_rate": 1.9336657128710467e-07, "loss": 1.1783, "step": 3630 }, { "epoch": 0.9452299691207541, "grad_norm": 1.3203125, "learning_rate": 1.8458553041799998e-07, "loss": 1.1873, "step": 3635 }, { "epoch": 0.9465301478953356, "grad_norm": 1.3125, "learning_rate": 1.7600667246354452e-07, "loss": 1.1468, "step": 3640 }, { "epoch": 0.9478303266699171, "grad_norm": 1.3125, "learning_rate": 1.6763017413542916e-07, "loss": 1.1716, "step": 3645 }, { "epoch": 0.9491305054444986, "grad_norm": 1.3515625, "learning_rate": 1.5945620797704343e-07, "loss": 1.1424, "step": 3650 }, { "epoch": 0.9504306842190802, "grad_norm": 1.359375, "learning_rate": 1.5148494235990963e-07, "loss": 1.1885, "step": 3655 }, { "epoch": 0.9517308629936616, "grad_norm": 1.3359375, "learning_rate": 1.437165414802244e-07, "loss": 1.1242, "step": 3660 }, { "epoch": 0.9530310417682432, "grad_norm": 1.4453125, "learning_rate": 1.3615116535547146e-07, "loss": 1.1875, "step": 3665 }, { "epoch": 0.9543312205428246, "grad_norm": 1.328125, "learning_rate": 1.2878896982112755e-07, "loss": 1.1149, "step": 3670 }, { "epoch": 0.9556313993174061, "grad_norm": 1.2890625, "learning_rate": 1.2163010652745387e-07, "loss": 1.1524, "step": 3675 }, { "epoch": 0.9569315780919877, "grad_norm": 1.359375, "learning_rate": 1.146747229363665e-07, "loss": 1.1703, "step": 3680 }, { "epoch": 0.9582317568665691, "grad_norm": 1.328125, "learning_rate": 1.0792296231840748e-07, "loss": 1.199, "step": 3685 }, { "epoch": 0.9595319356411507, "grad_norm": 1.3515625, "learning_rate": 1.0137496374978407e-07, "loss": 1.1646, "step": 3690 }, { "epoch": 0.9608321144157321, "grad_norm": 1.515625, "learning_rate": 9.503086210951306e-08, "loss": 1.1367, "step": 3695 }, { "epoch": 0.9621322931903137, "grad_norm": 1.5, "learning_rate": 8.889078807663542e-08, "loss": 1.1301, "step": 3700 }, { "epoch": 0.9634324719648951, "grad_norm": 1.3046875, "learning_rate": 8.295486812752829e-08, "loss": 1.1291, "step": 3705 }, { "epoch": 0.9647326507394767, "grad_norm": 1.7734375, "learning_rate": 7.722322453330045e-08, "loss": 1.1076, "step": 3710 }, { "epoch": 0.9660328295140582, "grad_norm": 1.2734375, "learning_rate": 7.16959753572699e-08, "loss": 1.1985, "step": 3715 }, { "epoch": 0.9673330082886397, "grad_norm": 1.296875, "learning_rate": 6.637323445253475e-08, "loss": 1.1227, "step": 3720 }, { "epoch": 0.9686331870632212, "grad_norm": 1.328125, "learning_rate": 6.125511145962715e-08, "loss": 1.125, "step": 3725 }, { "epoch": 0.9699333658378027, "grad_norm": 1.3671875, "learning_rate": 5.634171180425641e-08, "loss": 1.1624, "step": 3730 }, { "epoch": 0.9712335446123842, "grad_norm": 1.421875, "learning_rate": 5.163313669513392e-08, "loss": 1.1263, "step": 3735 }, { "epoch": 0.9725337233869658, "grad_norm": 1.3359375, "learning_rate": 4.712948312189491e-08, "loss": 1.2215, "step": 3740 }, { "epoch": 0.9738339021615472, "grad_norm": 1.359375, "learning_rate": 4.283084385309e-08, "loss": 1.127, "step": 3745 }, { "epoch": 0.9751340809361287, "grad_norm": 1.3203125, "learning_rate": 3.873730743428561e-08, "loss": 1.1148, "step": 3750 }, { "epoch": 0.9764342597107102, "grad_norm": 1.3671875, "learning_rate": 3.484895818623213e-08, "loss": 1.1234, "step": 3755 }, { "epoch": 0.9777344384852917, "grad_norm": 1.453125, "learning_rate": 3.116587620313194e-08, "loss": 1.1412, "step": 3760 }, { "epoch": 0.9790346172598733, "grad_norm": 1.2734375, "learning_rate": 2.76881373509863e-08, "loss": 1.0984, "step": 3765 }, { "epoch": 0.9803347960344547, "grad_norm": 1.2734375, "learning_rate": 2.4415813266034373e-08, "loss": 1.1885, "step": 3770 }, { "epoch": 0.9816349748090363, "grad_norm": 1.375, "learning_rate": 2.1348971353277738e-08, "loss": 1.1402, "step": 3775 }, { "epoch": 0.9829351535836177, "grad_norm": 1.296875, "learning_rate": 1.848767478508928e-08, "loss": 1.1229, "step": 3780 }, { "epoch": 0.9842353323581993, "grad_norm": 1.34375, "learning_rate": 1.5831982499915355e-08, "loss": 1.2113, "step": 3785 }, { "epoch": 0.9855355111327807, "grad_norm": 1.328125, "learning_rate": 1.3381949201062283e-08, "loss": 1.1621, "step": 3790 }, { "epoch": 0.9868356899073623, "grad_norm": 1.3359375, "learning_rate": 1.1137625355565063e-08, "loss": 1.1413, "step": 3795 }, { "epoch": 0.9881358686819438, "grad_norm": 1.4140625, "learning_rate": 9.099057193150406e-09, "loss": 1.2007, "step": 3800 }, { "epoch": 0.9881358686819438, "eval_loss": 1.1679197549819946, "eval_runtime": 577.8185, "eval_samples_per_second": 11.824, "eval_steps_per_second": 3.942, "step": 3800 }, { "epoch": 0.9894360474565252, "grad_norm": 1.3046875, "learning_rate": 7.266286705286396e-09, "loss": 1.1516, "step": 3805 }, { "epoch": 0.9907362262311068, "grad_norm": 1.4375, "learning_rate": 5.6393516443131735e-09, "loss": 1.1979, "step": 3810 }, { "epoch": 0.9920364050056882, "grad_norm": 1.3046875, "learning_rate": 4.218285522670229e-09, "loss": 1.14, "step": 3815 }, { "epoch": 0.9933365837802698, "grad_norm": 1.2421875, "learning_rate": 3.003117612202511e-09, "loss": 1.1325, "step": 3820 }, { "epoch": 0.9946367625548513, "grad_norm": 1.2578125, "learning_rate": 1.9938729435586833e-09, "loss": 1.173, "step": 3825 }, { "epoch": 0.9959369413294328, "grad_norm": 1.2578125, "learning_rate": 1.190572305673765e-09, "loss": 1.1441, "step": 3830 }, { "epoch": 0.9972371201040143, "grad_norm": 1.3046875, "learning_rate": 5.932322453461315e-10, "loss": 1.1291, "step": 3835 }, { "epoch": 0.9985372988785958, "grad_norm": 1.2890625, "learning_rate": 2.0186506689001682e-10, "loss": 1.1295, "step": 3840 }, { "epoch": 0.9998374776531773, "grad_norm": 1.2734375, "learning_rate": 1.647883188682364e-11, "loss": 1.1633, "step": 3845 }, { "epoch": 1.0, "step": 3846, "total_flos": 2.1312002805936947e+18, "train_loss": 1.184760999939929, "train_runtime": 31570.962, "train_samples_per_second": 1.949, "train_steps_per_second": 0.122 } ], "logging_steps": 5, "max_steps": 3846, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1312002805936947e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }