{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00125, "grad_norm": 5.53125, "learning_rate": 4.999980723453676e-05, "loss": 0.6803, "step": 5 }, { "epoch": 0.0025, "grad_norm": 3.0, "learning_rate": 4.9999228941119745e-05, "loss": 0.6176, "step": 10 }, { "epoch": 0.00375, "grad_norm": 1.8984375, "learning_rate": 4.999826512866693e-05, "loss": 0.5313, "step": 15 }, { "epoch": 0.005, "grad_norm": 1.6015625, "learning_rate": 4.999691581204152e-05, "loss": 0.5659, "step": 20 }, { "epoch": 0.00625, "grad_norm": 1.5703125, "learning_rate": 4.9995181012051625e-05, "loss": 0.5822, "step": 25 }, { "epoch": 0.0075, "grad_norm": 1.515625, "learning_rate": 4.9993060755450015e-05, "loss": 0.5849, "step": 30 }, { "epoch": 0.00875, "grad_norm": 1.4375, "learning_rate": 4.999055507493368e-05, "loss": 0.55, "step": 35 }, { "epoch": 0.01, "grad_norm": 1.515625, "learning_rate": 4.998766400914329e-05, "loss": 0.5323, "step": 40 }, { "epoch": 0.01125, "grad_norm": 1.6640625, "learning_rate": 4.9984387602662675e-05, "loss": 0.5775, "step": 45 }, { "epoch": 0.0125, "grad_norm": 1.8359375, "learning_rate": 4.9980725906018074e-05, "loss": 0.5523, "step": 50 }, { "epoch": 0.01375, "grad_norm": 1.3984375, "learning_rate": 4.9976678975677376e-05, "loss": 0.6089, "step": 55 }, { "epoch": 0.015, "grad_norm": 1.40625, "learning_rate": 4.9972246874049254e-05, "loss": 0.5445, "step": 60 }, { "epoch": 0.01625, "grad_norm": 1.375, "learning_rate": 4.996742966948219e-05, "loss": 0.5256, "step": 65 }, { "epoch": 0.0175, "grad_norm": 1.359375, "learning_rate": 4.9962227436263453e-05, "loss": 0.5118, "step": 70 }, { "epoch": 0.01875, "grad_norm": 1.2421875, "learning_rate": 4.9956640254617906e-05, "loss": 0.5458, "step": 75 }, { "epoch": 0.02, "grad_norm": 1.484375, "learning_rate": 4.995066821070679e-05, "loss": 0.5946, "step": 80 }, { "epoch": 0.02125, "grad_norm": 1.4375, "learning_rate": 4.994431139662643e-05, "loss": 0.515, "step": 85 }, { "epoch": 0.0225, "grad_norm": 1.4140625, "learning_rate": 4.9937569910406756e-05, "loss": 0.5501, "step": 90 }, { "epoch": 0.02375, "grad_norm": 1.265625, "learning_rate": 4.9930443856009826e-05, "loss": 0.5475, "step": 95 }, { "epoch": 0.025, "grad_norm": 1.46875, "learning_rate": 4.99229333433282e-05, "loss": 0.5625, "step": 100 }, { "epoch": 0.02625, "grad_norm": 1.578125, "learning_rate": 4.9915038488183295e-05, "loss": 0.5627, "step": 105 }, { "epoch": 0.0275, "grad_norm": 1.640625, "learning_rate": 4.990675941232353e-05, "loss": 0.5561, "step": 110 }, { "epoch": 0.02875, "grad_norm": 1.3046875, "learning_rate": 4.989809624342251e-05, "loss": 0.5254, "step": 115 }, { "epoch": 0.03, "grad_norm": 1.0859375, "learning_rate": 4.9889049115077005e-05, "loss": 0.5184, "step": 120 }, { "epoch": 0.03125, "grad_norm": 1.390625, "learning_rate": 4.987961816680492e-05, "loss": 0.5563, "step": 125 }, { "epoch": 0.0325, "grad_norm": 1.6875, "learning_rate": 4.9869803544043166e-05, "loss": 0.5536, "step": 130 }, { "epoch": 0.03375, "grad_norm": 1.109375, "learning_rate": 4.985960539814535e-05, "loss": 0.5544, "step": 135 }, { "epoch": 0.035, "grad_norm": 1.453125, "learning_rate": 4.98490238863795e-05, "loss": 0.5117, "step": 140 }, { "epoch": 0.03625, "grad_norm": 1.421875, "learning_rate": 4.983805917192561e-05, "loss": 0.5125, "step": 145 }, { "epoch": 0.0375, "grad_norm": 1.1875, "learning_rate": 4.982671142387316e-05, "loss": 0.5563, "step": 150 }, { "epoch": 0.03875, "grad_norm": 1.375, "learning_rate": 4.9814980817218447e-05, "loss": 0.5408, "step": 155 }, { "epoch": 0.04, "grad_norm": 1.1796875, "learning_rate": 4.980286753286195e-05, "loss": 0.5249, "step": 160 }, { "epoch": 0.04125, "grad_norm": 1.546875, "learning_rate": 4.979037175760548e-05, "loss": 0.546, "step": 165 }, { "epoch": 0.0425, "grad_norm": 1.28125, "learning_rate": 4.9777493684149375e-05, "loss": 0.5019, "step": 170 }, { "epoch": 0.04375, "grad_norm": 1.453125, "learning_rate": 4.976423351108943e-05, "loss": 0.5364, "step": 175 }, { "epoch": 0.045, "grad_norm": 1.5234375, "learning_rate": 4.975059144291394e-05, "loss": 0.5504, "step": 180 }, { "epoch": 0.04625, "grad_norm": 1.2109375, "learning_rate": 4.973656769000046e-05, "loss": 0.4682, "step": 185 }, { "epoch": 0.0475, "grad_norm": 1.328125, "learning_rate": 4.972216246861262e-05, "loss": 0.5262, "step": 190 }, { "epoch": 0.04875, "grad_norm": 1.09375, "learning_rate": 4.9707376000896736e-05, "loss": 0.5343, "step": 195 }, { "epoch": 0.05, "grad_norm": 1.3671875, "learning_rate": 4.9692208514878444e-05, "loss": 0.5171, "step": 200 }, { "epoch": 0.05125, "grad_norm": 1.1640625, "learning_rate": 4.967666024445914e-05, "loss": 0.5454, "step": 205 }, { "epoch": 0.0525, "grad_norm": 1.25, "learning_rate": 4.966073142941239e-05, "loss": 0.5378, "step": 210 }, { "epoch": 0.05375, "grad_norm": 1.140625, "learning_rate": 4.9644422315380225e-05, "loss": 0.4792, "step": 215 }, { "epoch": 0.055, "grad_norm": 1.46875, "learning_rate": 4.962773315386935e-05, "loss": 0.5336, "step": 220 }, { "epoch": 0.05625, "grad_norm": 1.359375, "learning_rate": 4.9610664202247294e-05, "loss": 0.5293, "step": 225 }, { "epoch": 0.0575, "grad_norm": 1.0546875, "learning_rate": 4.9593215723738404e-05, "loss": 0.4678, "step": 230 }, { "epoch": 0.05875, "grad_norm": 1.421875, "learning_rate": 4.957538798741979e-05, "loss": 0.549, "step": 235 }, { "epoch": 0.06, "grad_norm": 1.3046875, "learning_rate": 4.9557181268217227e-05, "loss": 0.5642, "step": 240 }, { "epoch": 0.06125, "grad_norm": 1.2734375, "learning_rate": 4.953859584690082e-05, "loss": 0.5544, "step": 245 }, { "epoch": 0.0625, "grad_norm": 1.25, "learning_rate": 4.951963201008076e-05, "loss": 0.5604, "step": 250 }, { "epoch": 0.06375, "grad_norm": 1.15625, "learning_rate": 4.9500290050202894e-05, "loss": 0.5349, "step": 255 }, { "epoch": 0.065, "grad_norm": 1.5234375, "learning_rate": 4.9480570265544144e-05, "loss": 0.5393, "step": 260 }, { "epoch": 0.06625, "grad_norm": 1.2109375, "learning_rate": 4.9460472960208e-05, "loss": 0.527, "step": 265 }, { "epoch": 0.0675, "grad_norm": 1.203125, "learning_rate": 4.943999844411977e-05, "loss": 0.4982, "step": 270 }, { "epoch": 0.06875, "grad_norm": 0.9765625, "learning_rate": 4.9419147033021814e-05, "loss": 0.4377, "step": 275 }, { "epoch": 0.07, "grad_norm": 1.453125, "learning_rate": 4.939791904846869e-05, "loss": 0.4919, "step": 280 }, { "epoch": 0.07125, "grad_norm": 1.7421875, "learning_rate": 4.937631481782218e-05, "loss": 0.5107, "step": 285 }, { "epoch": 0.0725, "grad_norm": 1.4921875, "learning_rate": 4.935433467424624e-05, "loss": 0.5611, "step": 290 }, { "epoch": 0.07375, "grad_norm": 1.8984375, "learning_rate": 4.9331978956701875e-05, "loss": 0.534, "step": 295 }, { "epoch": 0.075, "grad_norm": 1.3515625, "learning_rate": 4.9309248009941914e-05, "loss": 0.5319, "step": 300 }, { "epoch": 0.07625, "grad_norm": 1.2265625, "learning_rate": 4.928614218450568e-05, "loss": 0.4805, "step": 305 }, { "epoch": 0.0775, "grad_norm": 1.390625, "learning_rate": 4.9262661836713564e-05, "loss": 0.4656, "step": 310 }, { "epoch": 0.07875, "grad_norm": 1.0546875, "learning_rate": 4.923880732866159e-05, "loss": 0.5328, "step": 315 }, { "epoch": 0.08, "grad_norm": 1.0, "learning_rate": 4.9214579028215776e-05, "loss": 0.4949, "step": 320 }, { "epoch": 0.08125, "grad_norm": 1.2109375, "learning_rate": 4.9189977309006495e-05, "loss": 0.5222, "step": 325 }, { "epoch": 0.0825, "grad_norm": 1.2421875, "learning_rate": 4.916500255042268e-05, "loss": 0.5029, "step": 330 }, { "epoch": 0.08375, "grad_norm": 1.0546875, "learning_rate": 4.9139655137606015e-05, "loss": 0.5188, "step": 335 }, { "epoch": 0.085, "grad_norm": 1.109375, "learning_rate": 4.9113935461444955e-05, "loss": 0.5651, "step": 340 }, { "epoch": 0.08625, "grad_norm": 1.046875, "learning_rate": 4.908784391856872e-05, "loss": 0.4586, "step": 345 }, { "epoch": 0.0875, "grad_norm": 1.0625, "learning_rate": 4.906138091134118e-05, "loss": 0.539, "step": 350 }, { "epoch": 0.08875, "grad_norm": 1.3828125, "learning_rate": 4.9034546847854656e-05, "loss": 0.5331, "step": 355 }, { "epoch": 0.09, "grad_norm": 1.015625, "learning_rate": 4.900734214192358e-05, "loss": 0.4227, "step": 360 }, { "epoch": 0.09125, "grad_norm": 1.28125, "learning_rate": 4.897976721307819e-05, "loss": 0.5005, "step": 365 }, { "epoch": 0.0925, "grad_norm": 1.1171875, "learning_rate": 4.8951822486557986e-05, "loss": 0.5294, "step": 370 }, { "epoch": 0.09375, "grad_norm": 1.375, "learning_rate": 4.892350839330522e-05, "loss": 0.5729, "step": 375 }, { "epoch": 0.095, "grad_norm": 0.95703125, "learning_rate": 4.8894825369958255e-05, "loss": 0.4837, "step": 380 }, { "epoch": 0.09625, "grad_norm": 1.5078125, "learning_rate": 4.8865773858844776e-05, "loss": 0.5266, "step": 385 }, { "epoch": 0.0975, "grad_norm": 1.375, "learning_rate": 4.8836354307975026e-05, "loss": 0.5329, "step": 390 }, { "epoch": 0.09875, "grad_norm": 1.4921875, "learning_rate": 4.880656717103489e-05, "loss": 0.5096, "step": 395 }, { "epoch": 0.1, "grad_norm": 1.171875, "learning_rate": 4.877641290737884e-05, "loss": 0.4919, "step": 400 }, { "epoch": 0.10125, "grad_norm": 1.078125, "learning_rate": 4.874589198202294e-05, "loss": 0.4633, "step": 405 }, { "epoch": 0.1025, "grad_norm": 1.4296875, "learning_rate": 4.8715004865637614e-05, "loss": 0.4981, "step": 410 }, { "epoch": 0.10375, "grad_norm": 1.109375, "learning_rate": 4.868375203454041e-05, "loss": 0.4699, "step": 415 }, { "epoch": 0.105, "grad_norm": 1.1796875, "learning_rate": 4.8652133970688636e-05, "loss": 0.5086, "step": 420 }, { "epoch": 0.10625, "grad_norm": 1.1015625, "learning_rate": 4.862015116167196e-05, "loss": 0.5406, "step": 425 }, { "epoch": 0.1075, "grad_norm": 1.640625, "learning_rate": 4.8587804100704845e-05, "loss": 0.5456, "step": 430 }, { "epoch": 0.10875, "grad_norm": 1.1875, "learning_rate": 4.8555093286618995e-05, "loss": 0.5107, "step": 435 }, { "epoch": 0.11, "grad_norm": 1.34375, "learning_rate": 4.852201922385564e-05, "loss": 0.4078, "step": 440 }, { "epoch": 0.11125, "grad_norm": 1.3046875, "learning_rate": 4.848858242245773e-05, "loss": 0.4958, "step": 445 }, { "epoch": 0.1125, "grad_norm": 1.4765625, "learning_rate": 4.8454783398062106e-05, "loss": 0.4822, "step": 450 }, { "epoch": 0.11375, "grad_norm": 1.546875, "learning_rate": 4.8420622671891533e-05, "loss": 0.5489, "step": 455 }, { "epoch": 0.115, "grad_norm": 1.0703125, "learning_rate": 4.838610077074669e-05, "loss": 0.4884, "step": 460 }, { "epoch": 0.11625, "grad_norm": 1.609375, "learning_rate": 4.835121822699796e-05, "loss": 0.529, "step": 465 }, { "epoch": 0.1175, "grad_norm": 1.234375, "learning_rate": 4.8315975578577355e-05, "loss": 0.5414, "step": 470 }, { "epoch": 0.11875, "grad_norm": 1.5078125, "learning_rate": 4.828037336897009e-05, "loss": 0.4749, "step": 475 }, { "epoch": 0.12, "grad_norm": 0.9765625, "learning_rate": 4.8244412147206284e-05, "loss": 0.5205, "step": 480 }, { "epoch": 0.12125, "grad_norm": 1.2890625, "learning_rate": 4.820809246785247e-05, "loss": 0.5343, "step": 485 }, { "epoch": 0.1225, "grad_norm": 1.3828125, "learning_rate": 4.817141489100302e-05, "loss": 0.5324, "step": 490 }, { "epoch": 0.12375, "grad_norm": 1.546875, "learning_rate": 4.8134379982271556e-05, "loss": 0.5451, "step": 495 }, { "epoch": 0.125, "grad_norm": 1.390625, "learning_rate": 4.8096988312782174e-05, "loss": 0.5428, "step": 500 }, { "epoch": 0.12625, "grad_norm": 1.0546875, "learning_rate": 4.805924045916067e-05, "loss": 0.5002, "step": 505 }, { "epoch": 0.1275, "grad_norm": 1.2890625, "learning_rate": 4.8021137003525664e-05, "loss": 0.5277, "step": 510 }, { "epoch": 0.12875, "grad_norm": 1.3125, "learning_rate": 4.7982678533479555e-05, "loss": 0.5185, "step": 515 }, { "epoch": 0.13, "grad_norm": 1.3125, "learning_rate": 4.794386564209953e-05, "loss": 0.501, "step": 520 }, { "epoch": 0.13125, "grad_norm": 1.4453125, "learning_rate": 4.7904698927928406e-05, "loss": 0.4903, "step": 525 }, { "epoch": 0.1325, "grad_norm": 1.1484375, "learning_rate": 4.7865178994965344e-05, "loss": 0.4764, "step": 530 }, { "epoch": 0.13375, "grad_norm": 1.3515625, "learning_rate": 4.782530645265661e-05, "loss": 0.5046, "step": 535 }, { "epoch": 0.135, "grad_norm": 1.25, "learning_rate": 4.7785081915886134e-05, "loss": 0.4849, "step": 540 }, { "epoch": 0.13625, "grad_norm": 1.4375, "learning_rate": 4.7744506004966025e-05, "loss": 0.4874, "step": 545 }, { "epoch": 0.1375, "grad_norm": 1.1796875, "learning_rate": 4.7703579345627035e-05, "loss": 0.5632, "step": 550 }, { "epoch": 0.13875, "grad_norm": 1.1953125, "learning_rate": 4.766230256900887e-05, "loss": 0.4894, "step": 555 }, { "epoch": 0.14, "grad_norm": 0.9140625, "learning_rate": 4.762067631165049e-05, "loss": 0.4819, "step": 560 }, { "epoch": 0.14125, "grad_norm": 1.3203125, "learning_rate": 4.7578701215480284e-05, "loss": 0.4872, "step": 565 }, { "epoch": 0.1425, "grad_norm": 1.6328125, "learning_rate": 4.753637792780614e-05, "loss": 0.5274, "step": 570 }, { "epoch": 0.14375, "grad_norm": 1.359375, "learning_rate": 4.749370710130554e-05, "loss": 0.5052, "step": 575 }, { "epoch": 0.145, "grad_norm": 1.2109375, "learning_rate": 4.745068939401539e-05, "loss": 0.4819, "step": 580 }, { "epoch": 0.14625, "grad_norm": 1.4296875, "learning_rate": 4.740732546932197e-05, "loss": 0.5159, "step": 585 }, { "epoch": 0.1475, "grad_norm": 1.203125, "learning_rate": 4.7363615995950626e-05, "loss": 0.5338, "step": 590 }, { "epoch": 0.14875, "grad_norm": 1.1171875, "learning_rate": 4.7319561647955526e-05, "loss": 0.4797, "step": 595 }, { "epoch": 0.15, "grad_norm": 1.2578125, "learning_rate": 4.72751631047092e-05, "loss": 0.5453, "step": 600 }, { "epoch": 0.15125, "grad_norm": 1.234375, "learning_rate": 4.7230421050892116e-05, "loss": 0.5009, "step": 605 }, { "epoch": 0.1525, "grad_norm": 1.5625, "learning_rate": 4.718533617648209e-05, "loss": 0.4602, "step": 610 }, { "epoch": 0.15375, "grad_norm": 1.484375, "learning_rate": 4.713990917674365e-05, "loss": 0.5399, "step": 615 }, { "epoch": 0.155, "grad_norm": 1.3828125, "learning_rate": 4.709414075221734e-05, "loss": 0.5006, "step": 620 }, { "epoch": 0.15625, "grad_norm": 1.0546875, "learning_rate": 4.7048031608708876e-05, "loss": 0.4784, "step": 625 }, { "epoch": 0.1575, "grad_norm": 1.1953125, "learning_rate": 4.7001582457278304e-05, "loss": 0.4764, "step": 630 }, { "epoch": 0.15875, "grad_norm": 1.03125, "learning_rate": 4.695479401422898e-05, "loss": 0.5003, "step": 635 }, { "epoch": 0.16, "grad_norm": 1.9921875, "learning_rate": 4.690766700109659e-05, "loss": 0.4586, "step": 640 }, { "epoch": 0.16125, "grad_norm": 1.5234375, "learning_rate": 4.686020214463798e-05, "loss": 0.5272, "step": 645 }, { "epoch": 0.1625, "grad_norm": 1.3203125, "learning_rate": 4.681240017681993e-05, "loss": 0.5626, "step": 650 }, { "epoch": 0.16375, "grad_norm": 1.3984375, "learning_rate": 4.676426183480794e-05, "loss": 0.5696, "step": 655 }, { "epoch": 0.165, "grad_norm": 1.1640625, "learning_rate": 4.671578786095478e-05, "loss": 0.5391, "step": 660 }, { "epoch": 0.16625, "grad_norm": 1.1484375, "learning_rate": 4.6666979002789105e-05, "loss": 0.5195, "step": 665 }, { "epoch": 0.1675, "grad_norm": 1.5, "learning_rate": 4.661783601300388e-05, "loss": 0.4973, "step": 670 }, { "epoch": 0.16875, "grad_norm": 1.2890625, "learning_rate": 4.65683596494448e-05, "loss": 0.4741, "step": 675 }, { "epoch": 0.17, "grad_norm": 1.265625, "learning_rate": 4.65185506750986e-05, "loss": 0.4684, "step": 680 }, { "epoch": 0.17125, "grad_norm": 1.5390625, "learning_rate": 4.646840985808126e-05, "loss": 0.5307, "step": 685 }, { "epoch": 0.1725, "grad_norm": 1.296875, "learning_rate": 4.6417937971626245e-05, "loss": 0.5154, "step": 690 }, { "epoch": 0.17375, "grad_norm": 1.265625, "learning_rate": 4.636713579407245e-05, "loss": 0.5348, "step": 695 }, { "epoch": 0.175, "grad_norm": 1.1640625, "learning_rate": 4.6316004108852305e-05, "loss": 0.477, "step": 700 }, { "epoch": 0.17625, "grad_norm": 1.1640625, "learning_rate": 4.6264543704479655e-05, "loss": 0.4989, "step": 705 }, { "epoch": 0.1775, "grad_norm": 1.2421875, "learning_rate": 4.6212755374537596e-05, "loss": 0.5109, "step": 710 }, { "epoch": 0.17875, "grad_norm": 1.0625, "learning_rate": 4.616063991766623e-05, "loss": 0.48, "step": 715 }, { "epoch": 0.18, "grad_norm": 1.3984375, "learning_rate": 4.610819813755038e-05, "loss": 0.5151, "step": 720 }, { "epoch": 0.18125, "grad_norm": 0.9453125, "learning_rate": 4.6055430842907167e-05, "loss": 0.5235, "step": 725 }, { "epoch": 0.1825, "grad_norm": 1.5078125, "learning_rate": 4.600233884747355e-05, "loss": 0.5006, "step": 730 }, { "epoch": 0.18375, "grad_norm": 1.234375, "learning_rate": 4.594892296999378e-05, "loss": 0.479, "step": 735 }, { "epoch": 0.185, "grad_norm": 1.015625, "learning_rate": 4.5895184034206765e-05, "loss": 0.4807, "step": 740 }, { "epoch": 0.18625, "grad_norm": 1.40625, "learning_rate": 4.5841122868833364e-05, "loss": 0.5189, "step": 745 }, { "epoch": 0.1875, "grad_norm": 1.1953125, "learning_rate": 4.5786740307563636e-05, "loss": 0.4768, "step": 750 }, { "epoch": 0.18875, "grad_norm": 1.5234375, "learning_rate": 4.573203718904394e-05, "loss": 0.4747, "step": 755 }, { "epoch": 0.19, "grad_norm": 1.5703125, "learning_rate": 4.567701435686404e-05, "loss": 0.4756, "step": 760 }, { "epoch": 0.19125, "grad_norm": 1.1171875, "learning_rate": 4.562167265954409e-05, "loss": 0.5102, "step": 765 }, { "epoch": 0.1925, "grad_norm": 1.953125, "learning_rate": 4.55660129505215e-05, "loss": 0.5229, "step": 770 }, { "epoch": 0.19375, "grad_norm": 1.203125, "learning_rate": 4.551003608813784e-05, "loss": 0.5103, "step": 775 }, { "epoch": 0.195, "grad_norm": 1.21875, "learning_rate": 4.545374293562559e-05, "loss": 0.5216, "step": 780 }, { "epoch": 0.19625, "grad_norm": 1.078125, "learning_rate": 4.5397134361094786e-05, "loss": 0.5039, "step": 785 }, { "epoch": 0.1975, "grad_norm": 1.3828125, "learning_rate": 4.534021123751968e-05, "loss": 0.4834, "step": 790 }, { "epoch": 0.19875, "grad_norm": 1.3828125, "learning_rate": 4.528297444272525e-05, "loss": 0.4386, "step": 795 }, { "epoch": 0.2, "grad_norm": 1.328125, "learning_rate": 4.522542485937369e-05, "loss": 0.5097, "step": 800 }, { "epoch": 0.20125, "grad_norm": 1.125, "learning_rate": 4.516756337495075e-05, "loss": 0.5574, "step": 805 }, { "epoch": 0.2025, "grad_norm": 1.1953125, "learning_rate": 4.5109390881752114e-05, "loss": 0.5492, "step": 810 }, { "epoch": 0.20375, "grad_norm": 1.109375, "learning_rate": 4.5050908276869586e-05, "loss": 0.5281, "step": 815 }, { "epoch": 0.205, "grad_norm": 1.6484375, "learning_rate": 4.499211646217727e-05, "loss": 0.5042, "step": 820 }, { "epoch": 0.20625, "grad_norm": 0.9765625, "learning_rate": 4.493301634431768e-05, "loss": 0.4746, "step": 825 }, { "epoch": 0.2075, "grad_norm": 1.3046875, "learning_rate": 4.487360883468775e-05, "loss": 0.5611, "step": 830 }, { "epoch": 0.20875, "grad_norm": 1.703125, "learning_rate": 4.481389484942478e-05, "loss": 0.5058, "step": 835 }, { "epoch": 0.21, "grad_norm": 1.0703125, "learning_rate": 4.4753875309392266e-05, "loss": 0.4352, "step": 840 }, { "epoch": 0.21125, "grad_norm": 1.125, "learning_rate": 4.469355114016577e-05, "loss": 0.4849, "step": 845 }, { "epoch": 0.2125, "grad_norm": 1.1171875, "learning_rate": 4.463292327201862e-05, "loss": 0.5195, "step": 850 }, { "epoch": 0.21375, "grad_norm": 1.09375, "learning_rate": 4.4571992639907545e-05, "loss": 0.3864, "step": 855 }, { "epoch": 0.215, "grad_norm": 1.2578125, "learning_rate": 4.451076018345825e-05, "loss": 0.4903, "step": 860 }, { "epoch": 0.21625, "grad_norm": 1.21875, "learning_rate": 4.444922684695097e-05, "loss": 0.5126, "step": 865 }, { "epoch": 0.2175, "grad_norm": 1.1796875, "learning_rate": 4.4387393579305865e-05, "loss": 0.4958, "step": 870 }, { "epoch": 0.21875, "grad_norm": 1.5078125, "learning_rate": 4.4325261334068426e-05, "loss": 0.5307, "step": 875 }, { "epoch": 0.22, "grad_norm": 1.5, "learning_rate": 4.426283106939474e-05, "loss": 0.5048, "step": 880 }, { "epoch": 0.22125, "grad_norm": 1.0, "learning_rate": 4.4200103748036695e-05, "loss": 0.4757, "step": 885 }, { "epoch": 0.2225, "grad_norm": 1.359375, "learning_rate": 4.4137080337327205e-05, "loss": 0.5525, "step": 890 }, { "epoch": 0.22375, "grad_norm": 1.5390625, "learning_rate": 4.407376180916522e-05, "loss": 0.4781, "step": 895 }, { "epoch": 0.225, "grad_norm": 1.28125, "learning_rate": 4.401014914000078e-05, "loss": 0.4797, "step": 900 }, { "epoch": 0.22625, "grad_norm": 1.484375, "learning_rate": 4.3946243310819926e-05, "loss": 0.5529, "step": 905 }, { "epoch": 0.2275, "grad_norm": 1.265625, "learning_rate": 4.3882045307129594e-05, "loss": 0.4669, "step": 910 }, { "epoch": 0.22875, "grad_norm": 1.234375, "learning_rate": 4.3817556118942425e-05, "loss": 0.5328, "step": 915 }, { "epoch": 0.23, "grad_norm": 1.4140625, "learning_rate": 4.375277674076149e-05, "loss": 0.5347, "step": 920 }, { "epoch": 0.23125, "grad_norm": 1.15625, "learning_rate": 4.3687708171564925e-05, "loss": 0.4615, "step": 925 }, { "epoch": 0.2325, "grad_norm": 1.1328125, "learning_rate": 4.3622351414790554e-05, "loss": 0.5132, "step": 930 }, { "epoch": 0.23375, "grad_norm": 1.1171875, "learning_rate": 4.355670747832042e-05, "loss": 0.4063, "step": 935 }, { "epoch": 0.235, "grad_norm": 1.1171875, "learning_rate": 4.349077737446525e-05, "loss": 0.493, "step": 940 }, { "epoch": 0.23625, "grad_norm": 1.078125, "learning_rate": 4.3424562119948776e-05, "loss": 0.4826, "step": 945 }, { "epoch": 0.2375, "grad_norm": 1.0546875, "learning_rate": 4.335806273589214e-05, "loss": 0.4726, "step": 950 }, { "epoch": 0.23875, "grad_norm": 1.59375, "learning_rate": 4.329128024779812e-05, "loss": 0.4672, "step": 955 }, { "epoch": 0.24, "grad_norm": 0.91796875, "learning_rate": 4.3224215685535294e-05, "loss": 0.4467, "step": 960 }, { "epoch": 0.24125, "grad_norm": 1.484375, "learning_rate": 4.315687008332217e-05, "loss": 0.5019, "step": 965 }, { "epoch": 0.2425, "grad_norm": 1.2265625, "learning_rate": 4.3089244479711236e-05, "loss": 0.526, "step": 970 }, { "epoch": 0.24375, "grad_norm": 1.2890625, "learning_rate": 4.302133991757297e-05, "loss": 0.509, "step": 975 }, { "epoch": 0.245, "grad_norm": 0.953125, "learning_rate": 4.295315744407972e-05, "loss": 0.447, "step": 980 }, { "epoch": 0.24625, "grad_norm": 1.2578125, "learning_rate": 4.2884698110689575e-05, "loss": 0.4927, "step": 985 }, { "epoch": 0.2475, "grad_norm": 1.578125, "learning_rate": 4.281596297313013e-05, "loss": 0.4891, "step": 990 }, { "epoch": 0.24875, "grad_norm": 2.21875, "learning_rate": 4.274695309138226e-05, "loss": 0.5046, "step": 995 }, { "epoch": 0.25, "grad_norm": 1.1796875, "learning_rate": 4.267766952966369e-05, "loss": 0.4642, "step": 1000 }, { "epoch": 0.25125, "grad_norm": 1.109375, "learning_rate": 4.260811335641266e-05, "loss": 0.4396, "step": 1005 }, { "epoch": 0.2525, "grad_norm": 1.2265625, "learning_rate": 4.25382856442714e-05, "loss": 0.4386, "step": 1010 }, { "epoch": 0.25375, "grad_norm": 1.546875, "learning_rate": 4.2468187470069607e-05, "loss": 0.5335, "step": 1015 }, { "epoch": 0.255, "grad_norm": 1.59375, "learning_rate": 4.2397819914807856e-05, "loss": 0.4703, "step": 1020 }, { "epoch": 0.25625, "grad_norm": 1.0625, "learning_rate": 4.23271840636409e-05, "loss": 0.5011, "step": 1025 }, { "epoch": 0.2575, "grad_norm": 1.046875, "learning_rate": 4.225628100586093e-05, "loss": 0.5234, "step": 1030 }, { "epoch": 0.25875, "grad_norm": 1.0078125, "learning_rate": 4.218511183488082e-05, "loss": 0.4749, "step": 1035 }, { "epoch": 0.26, "grad_norm": 1.34375, "learning_rate": 4.211367764821722e-05, "loss": 0.5361, "step": 1040 }, { "epoch": 0.26125, "grad_norm": 1.1875, "learning_rate": 4.2041979547473665e-05, "loss": 0.4458, "step": 1045 }, { "epoch": 0.2625, "grad_norm": 1.140625, "learning_rate": 4.197001863832355e-05, "loss": 0.4517, "step": 1050 }, { "epoch": 0.26375, "grad_norm": 0.9921875, "learning_rate": 4.189779603049312e-05, "loss": 0.4571, "step": 1055 }, { "epoch": 0.265, "grad_norm": 1.15625, "learning_rate": 4.182531283774434e-05, "loss": 0.487, "step": 1060 }, { "epoch": 0.26625, "grad_norm": 1.140625, "learning_rate": 4.17525701778577e-05, "loss": 0.5186, "step": 1065 }, { "epoch": 0.2675, "grad_norm": 1.2578125, "learning_rate": 4.1679569172614996e-05, "loss": 0.4815, "step": 1070 }, { "epoch": 0.26875, "grad_norm": 0.98046875, "learning_rate": 4.1606310947782044e-05, "loss": 0.4563, "step": 1075 }, { "epoch": 0.27, "grad_norm": 1.0, "learning_rate": 4.1532796633091296e-05, "loss": 0.4585, "step": 1080 }, { "epoch": 0.27125, "grad_norm": 1.1640625, "learning_rate": 4.1459027362224436e-05, "loss": 0.4846, "step": 1085 }, { "epoch": 0.2725, "grad_norm": 1.03125, "learning_rate": 4.138500427279485e-05, "loss": 0.505, "step": 1090 }, { "epoch": 0.27375, "grad_norm": 1.2421875, "learning_rate": 4.1310728506330174e-05, "loss": 0.4765, "step": 1095 }, { "epoch": 0.275, "grad_norm": 1.5546875, "learning_rate": 4.123620120825459e-05, "loss": 0.5105, "step": 1100 }, { "epoch": 0.27625, "grad_norm": 0.94140625, "learning_rate": 4.116142352787125e-05, "loss": 0.4193, "step": 1105 }, { "epoch": 0.2775, "grad_norm": 1.203125, "learning_rate": 4.1086396618344476e-05, "loss": 0.4953, "step": 1110 }, { "epoch": 0.27875, "grad_norm": 1.265625, "learning_rate": 4.101112163668203e-05, "loss": 0.4572, "step": 1115 }, { "epoch": 0.28, "grad_norm": 1.1015625, "learning_rate": 4.093559974371725e-05, "loss": 0.4247, "step": 1120 }, { "epoch": 0.28125, "grad_norm": 1.484375, "learning_rate": 4.085983210409114e-05, "loss": 0.483, "step": 1125 }, { "epoch": 0.2825, "grad_norm": 0.98828125, "learning_rate": 4.0783819886234445e-05, "loss": 0.4787, "step": 1130 }, { "epoch": 0.28375, "grad_norm": 1.015625, "learning_rate": 4.0707564262349595e-05, "loss": 0.4891, "step": 1135 }, { "epoch": 0.285, "grad_norm": 1.5859375, "learning_rate": 4.063106640839264e-05, "loss": 0.503, "step": 1140 }, { "epoch": 0.28625, "grad_norm": 1.4140625, "learning_rate": 4.05543275040551e-05, "loss": 0.5003, "step": 1145 }, { "epoch": 0.2875, "grad_norm": 1.328125, "learning_rate": 4.047734873274586e-05, "loss": 0.5444, "step": 1150 }, { "epoch": 0.28875, "grad_norm": 0.86328125, "learning_rate": 4.040013128157275e-05, "loss": 0.4193, "step": 1155 }, { "epoch": 0.29, "grad_norm": 1.1328125, "learning_rate": 4.0322676341324415e-05, "loss": 0.497, "step": 1160 }, { "epoch": 0.29125, "grad_norm": 0.94140625, "learning_rate": 4.024498510645185e-05, "loss": 0.377, "step": 1165 }, { "epoch": 0.2925, "grad_norm": 1.2421875, "learning_rate": 4.0167058775049996e-05, "loss": 0.5118, "step": 1170 }, { "epoch": 0.29375, "grad_norm": 1.59375, "learning_rate": 4.008889854883929e-05, "loss": 0.4941, "step": 1175 }, { "epoch": 0.295, "grad_norm": 1.15625, "learning_rate": 4.0010505633147106e-05, "loss": 0.5302, "step": 1180 }, { "epoch": 0.29625, "grad_norm": 1.1796875, "learning_rate": 3.993188123688918e-05, "loss": 0.5273, "step": 1185 }, { "epoch": 0.2975, "grad_norm": 1.4921875, "learning_rate": 3.985302657255097e-05, "loss": 0.463, "step": 1190 }, { "epoch": 0.29875, "grad_norm": 1.4453125, "learning_rate": 3.977394285616893e-05, "loss": 0.5116, "step": 1195 }, { "epoch": 0.3, "grad_norm": 1.0078125, "learning_rate": 3.969463130731183e-05, "loss": 0.5089, "step": 1200 }, { "epoch": 0.30125, "grad_norm": 0.890625, "learning_rate": 3.961509314906184e-05, "loss": 0.5043, "step": 1205 }, { "epoch": 0.3025, "grad_norm": 1.2265625, "learning_rate": 3.953532960799577e-05, "loss": 0.4877, "step": 1210 }, { "epoch": 0.30375, "grad_norm": 0.953125, "learning_rate": 3.9455341914166075e-05, "loss": 0.5368, "step": 1215 }, { "epoch": 0.305, "grad_norm": 1.1015625, "learning_rate": 3.937513130108197e-05, "loss": 0.4303, "step": 1220 }, { "epoch": 0.30625, "grad_norm": 0.953125, "learning_rate": 3.9294699005690305e-05, "loss": 0.4978, "step": 1225 }, { "epoch": 0.3075, "grad_norm": 1.4453125, "learning_rate": 3.92140462683566e-05, "loss": 0.4898, "step": 1230 }, { "epoch": 0.30875, "grad_norm": 1.3828125, "learning_rate": 3.913317433284582e-05, "loss": 0.4307, "step": 1235 }, { "epoch": 0.31, "grad_norm": 1.078125, "learning_rate": 3.905208444630327e-05, "loss": 0.4599, "step": 1240 }, { "epoch": 0.31125, "grad_norm": 0.94921875, "learning_rate": 3.897077785923529e-05, "loss": 0.4449, "step": 1245 }, { "epoch": 0.3125, "grad_norm": 1.1796875, "learning_rate": 3.888925582549006e-05, "loss": 0.4508, "step": 1250 }, { "epoch": 0.31375, "grad_norm": 1.0703125, "learning_rate": 3.880751960223817e-05, "loss": 0.4523, "step": 1255 }, { "epoch": 0.315, "grad_norm": 1.203125, "learning_rate": 3.87255704499533e-05, "loss": 0.4782, "step": 1260 }, { "epoch": 0.31625, "grad_norm": 1.09375, "learning_rate": 3.864340963239275e-05, "loss": 0.4821, "step": 1265 }, { "epoch": 0.3175, "grad_norm": 1.0625, "learning_rate": 3.856103841657797e-05, "loss": 0.393, "step": 1270 }, { "epoch": 0.31875, "grad_norm": 1.1328125, "learning_rate": 3.847845807277502e-05, "loss": 0.4731, "step": 1275 }, { "epoch": 0.32, "grad_norm": 0.94140625, "learning_rate": 3.8395669874474915e-05, "loss": 0.4644, "step": 1280 }, { "epoch": 0.32125, "grad_norm": 1.171875, "learning_rate": 3.831267509837414e-05, "loss": 0.5069, "step": 1285 }, { "epoch": 0.3225, "grad_norm": 1.1328125, "learning_rate": 3.822947502435477e-05, "loss": 0.4767, "step": 1290 }, { "epoch": 0.32375, "grad_norm": 1.296875, "learning_rate": 3.814607093546489e-05, "loss": 0.472, "step": 1295 }, { "epoch": 0.325, "grad_norm": 1.109375, "learning_rate": 3.8062464117898724e-05, "loss": 0.4598, "step": 1300 }, { "epoch": 0.32625, "grad_norm": 1.71875, "learning_rate": 3.7978655860976824e-05, "loss": 0.4794, "step": 1305 }, { "epoch": 0.3275, "grad_norm": 1.046875, "learning_rate": 3.789464745712619e-05, "loss": 0.4728, "step": 1310 }, { "epoch": 0.32875, "grad_norm": 1.2421875, "learning_rate": 3.7810440201860334e-05, "loss": 0.4535, "step": 1315 }, { "epoch": 0.33, "grad_norm": 1.2578125, "learning_rate": 3.7726035393759285e-05, "loss": 0.4646, "step": 1320 }, { "epoch": 0.33125, "grad_norm": 1.109375, "learning_rate": 3.764143433444962e-05, "loss": 0.4597, "step": 1325 }, { "epoch": 0.3325, "grad_norm": 0.9921875, "learning_rate": 3.755663832858432e-05, "loss": 0.516, "step": 1330 }, { "epoch": 0.33375, "grad_norm": 1.078125, "learning_rate": 3.747164868382269e-05, "loss": 0.4492, "step": 1335 }, { "epoch": 0.335, "grad_norm": 1.4765625, "learning_rate": 3.7386466710810194e-05, "loss": 0.4644, "step": 1340 }, { "epoch": 0.33625, "grad_norm": 1.03125, "learning_rate": 3.730109372315822e-05, "loss": 0.5028, "step": 1345 }, { "epoch": 0.3375, "grad_norm": 1.359375, "learning_rate": 3.721553103742388e-05, "loss": 0.424, "step": 1350 }, { "epoch": 0.33875, "grad_norm": 1.1015625, "learning_rate": 3.71297799730896e-05, "loss": 0.4592, "step": 1355 }, { "epoch": 0.34, "grad_norm": 1.0390625, "learning_rate": 3.704384185254288e-05, "loss": 0.4678, "step": 1360 }, { "epoch": 0.34125, "grad_norm": 1.0703125, "learning_rate": 3.695771800105586e-05, "loss": 0.4809, "step": 1365 }, { "epoch": 0.3425, "grad_norm": 1.171875, "learning_rate": 3.6871409746764865e-05, "loss": 0.5093, "step": 1370 }, { "epoch": 0.34375, "grad_norm": 1.5625, "learning_rate": 3.678491842064995e-05, "loss": 0.4937, "step": 1375 }, { "epoch": 0.345, "grad_norm": 1.5625, "learning_rate": 3.6698245356514335e-05, "loss": 0.4107, "step": 1380 }, { "epoch": 0.34625, "grad_norm": 1.0, "learning_rate": 3.661139189096391e-05, "loss": 0.4578, "step": 1385 }, { "epoch": 0.3475, "grad_norm": 0.8359375, "learning_rate": 3.652435936338656e-05, "loss": 0.3964, "step": 1390 }, { "epoch": 0.34875, "grad_norm": 1.1171875, "learning_rate": 3.6437149115931514e-05, "loss": 0.5011, "step": 1395 }, { "epoch": 0.35, "grad_norm": 1.28125, "learning_rate": 3.634976249348867e-05, "loss": 0.494, "step": 1400 }, { "epoch": 0.35125, "grad_norm": 1.125, "learning_rate": 3.626220084366786e-05, "loss": 0.4773, "step": 1405 }, { "epoch": 0.3525, "grad_norm": 1.0390625, "learning_rate": 3.6174465516778035e-05, "loss": 0.4338, "step": 1410 }, { "epoch": 0.35375, "grad_norm": 0.99609375, "learning_rate": 3.608655786580647e-05, "loss": 0.4538, "step": 1415 }, { "epoch": 0.355, "grad_norm": 1.1875, "learning_rate": 3.599847924639788e-05, "loss": 0.4537, "step": 1420 }, { "epoch": 0.35625, "grad_norm": 1.0078125, "learning_rate": 3.591023101683355e-05, "loss": 0.448, "step": 1425 }, { "epoch": 0.3575, "grad_norm": 1.125, "learning_rate": 3.582181453801036e-05, "loss": 0.4645, "step": 1430 }, { "epoch": 0.35875, "grad_norm": 1.6015625, "learning_rate": 3.5733231173419754e-05, "loss": 0.4578, "step": 1435 }, { "epoch": 0.36, "grad_norm": 1.0, "learning_rate": 3.564448228912682e-05, "loss": 0.4704, "step": 1440 }, { "epoch": 0.36125, "grad_norm": 1.8671875, "learning_rate": 3.555556925374914e-05, "loss": 0.4383, "step": 1445 }, { "epoch": 0.3625, "grad_norm": 1.2734375, "learning_rate": 3.54664934384357e-05, "loss": 0.4192, "step": 1450 }, { "epoch": 0.36375, "grad_norm": 0.94921875, "learning_rate": 3.5377256216845785e-05, "loss": 0.5063, "step": 1455 }, { "epoch": 0.365, "grad_norm": 1.203125, "learning_rate": 3.528785896512772e-05, "loss": 0.4711, "step": 1460 }, { "epoch": 0.36625, "grad_norm": 1.15625, "learning_rate": 3.519830306189773e-05, "loss": 0.4494, "step": 1465 }, { "epoch": 0.3675, "grad_norm": 1.1796875, "learning_rate": 3.510858988821863e-05, "loss": 0.4972, "step": 1470 }, { "epoch": 0.36875, "grad_norm": 1.4765625, "learning_rate": 3.5018720827578524e-05, "loss": 0.4312, "step": 1475 }, { "epoch": 0.37, "grad_norm": 0.99609375, "learning_rate": 3.4928697265869515e-05, "loss": 0.4267, "step": 1480 }, { "epoch": 0.37125, "grad_norm": 1.390625, "learning_rate": 3.483852059136629e-05, "loss": 0.4563, "step": 1485 }, { "epoch": 0.3725, "grad_norm": 0.99609375, "learning_rate": 3.474819219470471e-05, "loss": 0.4642, "step": 1490 }, { "epoch": 0.37375, "grad_norm": 1.1796875, "learning_rate": 3.4657713468860405e-05, "loss": 0.414, "step": 1495 }, { "epoch": 0.375, "grad_norm": 1.140625, "learning_rate": 3.456708580912725e-05, "loss": 0.4919, "step": 1500 }, { "epoch": 0.37625, "grad_norm": 1.25, "learning_rate": 3.447631061309587e-05, "loss": 0.5023, "step": 1505 }, { "epoch": 0.3775, "grad_norm": 1.140625, "learning_rate": 3.438538928063208e-05, "loss": 0.469, "step": 1510 }, { "epoch": 0.37875, "grad_norm": 1.0859375, "learning_rate": 3.4294323213855305e-05, "loss": 0.4322, "step": 1515 }, { "epoch": 0.38, "grad_norm": 0.9296875, "learning_rate": 3.4203113817116957e-05, "loss": 0.4393, "step": 1520 }, { "epoch": 0.38125, "grad_norm": 1.1015625, "learning_rate": 3.411176249697875e-05, "loss": 0.4005, "step": 1525 }, { "epoch": 0.3825, "grad_norm": 1.4609375, "learning_rate": 3.402027066219105e-05, "loss": 0.4094, "step": 1530 }, { "epoch": 0.38375, "grad_norm": 1.4921875, "learning_rate": 3.392863972367114e-05, "loss": 0.4474, "step": 1535 }, { "epoch": 0.385, "grad_norm": 1.265625, "learning_rate": 3.383687109448143e-05, "loss": 0.399, "step": 1540 }, { "epoch": 0.38625, "grad_norm": 1.3515625, "learning_rate": 3.374496618980772e-05, "loss": 0.4342, "step": 1545 }, { "epoch": 0.3875, "grad_norm": 1.2890625, "learning_rate": 3.365292642693732e-05, "loss": 0.4847, "step": 1550 }, { "epoch": 0.38875, "grad_norm": 1.046875, "learning_rate": 3.356075322523725e-05, "loss": 0.4343, "step": 1555 }, { "epoch": 0.39, "grad_norm": 0.9921875, "learning_rate": 3.346844800613229e-05, "loss": 0.498, "step": 1560 }, { "epoch": 0.39125, "grad_norm": 1.0703125, "learning_rate": 3.33760121930831e-05, "loss": 0.4737, "step": 1565 }, { "epoch": 0.3925, "grad_norm": 0.99609375, "learning_rate": 3.3283447211564276e-05, "loss": 0.4965, "step": 1570 }, { "epoch": 0.39375, "grad_norm": 1.3359375, "learning_rate": 3.319075448904234e-05, "loss": 0.4626, "step": 1575 }, { "epoch": 0.395, "grad_norm": 1.1640625, "learning_rate": 3.309793545495374e-05, "loss": 0.5161, "step": 1580 }, { "epoch": 0.39625, "grad_norm": 1.21875, "learning_rate": 3.3004991540682796e-05, "loss": 0.4371, "step": 1585 }, { "epoch": 0.3975, "grad_norm": 1.109375, "learning_rate": 3.2911924179539656e-05, "loss": 0.4427, "step": 1590 }, { "epoch": 0.39875, "grad_norm": 0.93359375, "learning_rate": 3.281873480673815e-05, "loss": 0.4318, "step": 1595 }, { "epoch": 0.4, "grad_norm": 1.046875, "learning_rate": 3.272542485937369e-05, "loss": 0.4756, "step": 1600 }, { "epoch": 0.40125, "grad_norm": 1.0390625, "learning_rate": 3.2631995776401094e-05, "loss": 0.4507, "step": 1605 }, { "epoch": 0.4025, "grad_norm": 1.03125, "learning_rate": 3.253844899861239e-05, "loss": 0.4444, "step": 1610 }, { "epoch": 0.40375, "grad_norm": 1.046875, "learning_rate": 3.244478596861464e-05, "loss": 0.4291, "step": 1615 }, { "epoch": 0.405, "grad_norm": 1.125, "learning_rate": 3.23510081308076e-05, "loss": 0.4615, "step": 1620 }, { "epoch": 0.40625, "grad_norm": 1.0546875, "learning_rate": 3.225711693136156e-05, "loss": 0.4347, "step": 1625 }, { "epoch": 0.4075, "grad_norm": 1.3046875, "learning_rate": 3.2163113818194964e-05, "loss": 0.4349, "step": 1630 }, { "epoch": 0.40875, "grad_norm": 1.609375, "learning_rate": 3.206900024095208e-05, "loss": 0.4814, "step": 1635 }, { "epoch": 0.41, "grad_norm": 1.3828125, "learning_rate": 3.1974777650980735e-05, "loss": 0.502, "step": 1640 }, { "epoch": 0.41125, "grad_norm": 1.5234375, "learning_rate": 3.188044750130979e-05, "loss": 0.4457, "step": 1645 }, { "epoch": 0.4125, "grad_norm": 1.140625, "learning_rate": 3.178601124662686e-05, "loss": 0.505, "step": 1650 }, { "epoch": 0.41375, "grad_norm": 1.109375, "learning_rate": 3.169147034325582e-05, "loss": 0.4941, "step": 1655 }, { "epoch": 0.415, "grad_norm": 1.078125, "learning_rate": 3.1596826249134324e-05, "loss": 0.4524, "step": 1660 }, { "epoch": 0.41625, "grad_norm": 1.1015625, "learning_rate": 3.150208042379142e-05, "loss": 0.4826, "step": 1665 }, { "epoch": 0.4175, "grad_norm": 1.109375, "learning_rate": 3.140723432832492e-05, "loss": 0.4101, "step": 1670 }, { "epoch": 0.41875, "grad_norm": 1.3359375, "learning_rate": 3.131228942537895e-05, "loss": 0.4068, "step": 1675 }, { "epoch": 0.42, "grad_norm": 1.53125, "learning_rate": 3.121724717912138e-05, "loss": 0.4341, "step": 1680 }, { "epoch": 0.42125, "grad_norm": 1.2109375, "learning_rate": 3.112210905522119e-05, "loss": 0.4197, "step": 1685 }, { "epoch": 0.4225, "grad_norm": 1.3046875, "learning_rate": 3.102687652082597e-05, "loss": 0.4257, "step": 1690 }, { "epoch": 0.42375, "grad_norm": 1.25, "learning_rate": 3.0931551044539194e-05, "loss": 0.4513, "step": 1695 }, { "epoch": 0.425, "grad_norm": 1.21875, "learning_rate": 3.083613409639764e-05, "loss": 0.4757, "step": 1700 }, { "epoch": 0.42625, "grad_norm": 1.2421875, "learning_rate": 3.0740627147848675e-05, "loss": 0.441, "step": 1705 }, { "epoch": 0.4275, "grad_norm": 1.2265625, "learning_rate": 3.06450316717276e-05, "loss": 0.4249, "step": 1710 }, { "epoch": 0.42875, "grad_norm": 1.1953125, "learning_rate": 3.05493491422349e-05, "loss": 0.4189, "step": 1715 }, { "epoch": 0.43, "grad_norm": 1.3203125, "learning_rate": 3.045358103491357e-05, "loss": 0.4315, "step": 1720 }, { "epoch": 0.43125, "grad_norm": 1.15625, "learning_rate": 3.035772882662627e-05, "loss": 0.4641, "step": 1725 }, { "epoch": 0.4325, "grad_norm": 1.390625, "learning_rate": 3.026179399553264e-05, "loss": 0.4701, "step": 1730 }, { "epoch": 0.43375, "grad_norm": 1.421875, "learning_rate": 3.0165778021066453e-05, "loss": 0.4827, "step": 1735 }, { "epoch": 0.435, "grad_norm": 1.3203125, "learning_rate": 3.0069682383912813e-05, "loss": 0.4439, "step": 1740 }, { "epoch": 0.43625, "grad_norm": 1.28125, "learning_rate": 2.9973508565985313e-05, "loss": 0.4916, "step": 1745 }, { "epoch": 0.4375, "grad_norm": 1.4140625, "learning_rate": 2.9877258050403212e-05, "loss": 0.464, "step": 1750 }, { "epoch": 0.43875, "grad_norm": 1.0078125, "learning_rate": 2.9780932321468515e-05, "loss": 0.4105, "step": 1755 }, { "epoch": 0.44, "grad_norm": 1.3125, "learning_rate": 2.9684532864643122e-05, "loss": 0.4312, "step": 1760 }, { "epoch": 0.44125, "grad_norm": 1.1875, "learning_rate": 2.9588061166525914e-05, "loss": 0.4465, "step": 1765 }, { "epoch": 0.4425, "grad_norm": 1.5, "learning_rate": 2.949151871482982e-05, "loss": 0.4136, "step": 1770 }, { "epoch": 0.44375, "grad_norm": 1.21875, "learning_rate": 2.9394906998358868e-05, "loss": 0.4107, "step": 1775 }, { "epoch": 0.445, "grad_norm": 0.98828125, "learning_rate": 2.929822750698524e-05, "loss": 0.4327, "step": 1780 }, { "epoch": 0.44625, "grad_norm": 1.2890625, "learning_rate": 2.92014817316263e-05, "loss": 0.4597, "step": 1785 }, { "epoch": 0.4475, "grad_norm": 1.046875, "learning_rate": 2.9104671164221576e-05, "loss": 0.4685, "step": 1790 }, { "epoch": 0.44875, "grad_norm": 1.046875, "learning_rate": 2.9007797297709782e-05, "loss": 0.451, "step": 1795 }, { "epoch": 0.45, "grad_norm": 1.40625, "learning_rate": 2.8910861626005776e-05, "loss": 0.4101, "step": 1800 }, { "epoch": 0.45125, "grad_norm": 1.2109375, "learning_rate": 2.8813865643977526e-05, "loss": 0.4775, "step": 1805 }, { "epoch": 0.4525, "grad_norm": 1.234375, "learning_rate": 2.871681084742308e-05, "loss": 0.4588, "step": 1810 }, { "epoch": 0.45375, "grad_norm": 1.265625, "learning_rate": 2.8619698733047447e-05, "loss": 0.4476, "step": 1815 }, { "epoch": 0.455, "grad_norm": 1.140625, "learning_rate": 2.8522530798439567e-05, "loss": 0.4375, "step": 1820 }, { "epoch": 0.45625, "grad_norm": 1.015625, "learning_rate": 2.8425308542049206e-05, "loss": 0.422, "step": 1825 }, { "epoch": 0.4575, "grad_norm": 0.94921875, "learning_rate": 2.832803346316381e-05, "loss": 0.4887, "step": 1830 }, { "epoch": 0.45875, "grad_norm": 1.1796875, "learning_rate": 2.8230707061885443e-05, "loss": 0.4136, "step": 1835 }, { "epoch": 0.46, "grad_norm": 1.140625, "learning_rate": 2.8133330839107608e-05, "loss": 0.4236, "step": 1840 }, { "epoch": 0.46125, "grad_norm": 1.078125, "learning_rate": 2.803590629649212e-05, "loss": 0.4983, "step": 1845 }, { "epoch": 0.4625, "grad_norm": 1.265625, "learning_rate": 2.7938434936445945e-05, "loss": 0.4988, "step": 1850 }, { "epoch": 0.46375, "grad_norm": 0.8828125, "learning_rate": 2.784091826209803e-05, "loss": 0.4337, "step": 1855 }, { "epoch": 0.465, "grad_norm": 1.1640625, "learning_rate": 2.774335777727613e-05, "loss": 0.4574, "step": 1860 }, { "epoch": 0.46625, "grad_norm": 0.95703125, "learning_rate": 2.764575498648362e-05, "loss": 0.4606, "step": 1865 }, { "epoch": 0.4675, "grad_norm": 1.1328125, "learning_rate": 2.754811139487625e-05, "loss": 0.4489, "step": 1870 }, { "epoch": 0.46875, "grad_norm": 1.1171875, "learning_rate": 2.7450428508239024e-05, "loss": 0.4016, "step": 1875 }, { "epoch": 0.47, "grad_norm": 1.1328125, "learning_rate": 2.7352707832962865e-05, "loss": 0.4191, "step": 1880 }, { "epoch": 0.47125, "grad_norm": 1.2109375, "learning_rate": 2.725495087602148e-05, "loss": 0.5397, "step": 1885 }, { "epoch": 0.4725, "grad_norm": 1.4609375, "learning_rate": 2.7157159144948092e-05, "loss": 0.4646, "step": 1890 }, { "epoch": 0.47375, "grad_norm": 0.9453125, "learning_rate": 2.7059334147812142e-05, "loss": 0.4443, "step": 1895 }, { "epoch": 0.475, "grad_norm": 0.95703125, "learning_rate": 2.6961477393196126e-05, "loss": 0.4943, "step": 1900 }, { "epoch": 0.47625, "grad_norm": 0.8046875, "learning_rate": 2.6863590390172243e-05, "loss": 0.4654, "step": 1905 }, { "epoch": 0.4775, "grad_norm": 1.2890625, "learning_rate": 2.6765674648279172e-05, "loss": 0.4517, "step": 1910 }, { "epoch": 0.47875, "grad_norm": 1.1875, "learning_rate": 2.666773167749878e-05, "loss": 0.4525, "step": 1915 }, { "epoch": 0.48, "grad_norm": 1.09375, "learning_rate": 2.656976298823284e-05, "loss": 0.4676, "step": 1920 }, { "epoch": 0.48125, "grad_norm": 1.4609375, "learning_rate": 2.6471770091279724e-05, "loss": 0.495, "step": 1925 }, { "epoch": 0.4825, "grad_norm": 1.0234375, "learning_rate": 2.637375449781115e-05, "loss": 0.4322, "step": 1930 }, { "epoch": 0.48375, "grad_norm": 1.1328125, "learning_rate": 2.627571771934879e-05, "loss": 0.4147, "step": 1935 }, { "epoch": 0.485, "grad_norm": 1.078125, "learning_rate": 2.6177661267741065e-05, "loss": 0.4204, "step": 1940 }, { "epoch": 0.48625, "grad_norm": 0.98828125, "learning_rate": 2.607958665513976e-05, "loss": 0.4245, "step": 1945 }, { "epoch": 0.4875, "grad_norm": 0.95703125, "learning_rate": 2.598149539397672e-05, "loss": 0.4582, "step": 1950 }, { "epoch": 0.48875, "grad_norm": 1.46875, "learning_rate": 2.5883388996940534e-05, "loss": 0.4445, "step": 1955 }, { "epoch": 0.49, "grad_norm": 1.25, "learning_rate": 2.578526897695321e-05, "loss": 0.4533, "step": 1960 }, { "epoch": 0.49125, "grad_norm": 1.0390625, "learning_rate": 2.5687136847146838e-05, "loss": 0.4334, "step": 1965 }, { "epoch": 0.4925, "grad_norm": 0.89453125, "learning_rate": 2.558899412084026e-05, "loss": 0.434, "step": 1970 }, { "epoch": 0.49375, "grad_norm": 1.25, "learning_rate": 2.5490842311515707e-05, "loss": 0.4257, "step": 1975 }, { "epoch": 0.495, "grad_norm": 1.6953125, "learning_rate": 2.539268293279552e-05, "loss": 0.4503, "step": 1980 }, { "epoch": 0.49625, "grad_norm": 1.234375, "learning_rate": 2.529451749841873e-05, "loss": 0.5045, "step": 1985 }, { "epoch": 0.4975, "grad_norm": 1.3046875, "learning_rate": 2.5196347522217784e-05, "loss": 0.4307, "step": 1990 }, { "epoch": 0.49875, "grad_norm": 1.0078125, "learning_rate": 2.509817451809515e-05, "loss": 0.4701, "step": 1995 }, { "epoch": 0.5, "grad_norm": 1.3046875, "learning_rate": 2.5e-05, "loss": 0.4573, "step": 2000 }, { "epoch": 0.50125, "grad_norm": 1.2421875, "learning_rate": 2.4901825481904855e-05, "loss": 0.4304, "step": 2005 }, { "epoch": 0.5025, "grad_norm": 1.578125, "learning_rate": 2.480365247778223e-05, "loss": 0.4334, "step": 2010 }, { "epoch": 0.50375, "grad_norm": 1.0390625, "learning_rate": 2.4705482501581266e-05, "loss": 0.4507, "step": 2015 }, { "epoch": 0.505, "grad_norm": 1.3984375, "learning_rate": 2.460731706720449e-05, "loss": 0.4555, "step": 2020 }, { "epoch": 0.50625, "grad_norm": 1.1328125, "learning_rate": 2.4509157688484295e-05, "loss": 0.4791, "step": 2025 }, { "epoch": 0.5075, "grad_norm": 1.171875, "learning_rate": 2.4411005879159753e-05, "loss": 0.4324, "step": 2030 }, { "epoch": 0.50875, "grad_norm": 1.3828125, "learning_rate": 2.4312863152853165e-05, "loss": 0.4534, "step": 2035 }, { "epoch": 0.51, "grad_norm": 1.5546875, "learning_rate": 2.4214731023046793e-05, "loss": 0.4411, "step": 2040 }, { "epoch": 0.51125, "grad_norm": 1.3359375, "learning_rate": 2.4116611003059472e-05, "loss": 0.4333, "step": 2045 }, { "epoch": 0.5125, "grad_norm": 1.46875, "learning_rate": 2.4018504606023293e-05, "loss": 0.4231, "step": 2050 }, { "epoch": 0.51375, "grad_norm": 1.4140625, "learning_rate": 2.392041334486024e-05, "loss": 0.3752, "step": 2055 }, { "epoch": 0.515, "grad_norm": 1.1328125, "learning_rate": 2.3822338732258937e-05, "loss": 0.4876, "step": 2060 }, { "epoch": 0.51625, "grad_norm": 0.91796875, "learning_rate": 2.3724282280651214e-05, "loss": 0.3989, "step": 2065 }, { "epoch": 0.5175, "grad_norm": 0.859375, "learning_rate": 2.3626245502188864e-05, "loss": 0.4102, "step": 2070 }, { "epoch": 0.51875, "grad_norm": 0.9921875, "learning_rate": 2.3528229908720272e-05, "loss": 0.3997, "step": 2075 }, { "epoch": 0.52, "grad_norm": 0.71484375, "learning_rate": 2.3430237011767167e-05, "loss": 0.4009, "step": 2080 }, { "epoch": 0.52125, "grad_norm": 1.046875, "learning_rate": 2.3332268322501228e-05, "loss": 0.4769, "step": 2085 }, { "epoch": 0.5225, "grad_norm": 1.078125, "learning_rate": 2.323432535172084e-05, "loss": 0.4405, "step": 2090 }, { "epoch": 0.52375, "grad_norm": 1.0625, "learning_rate": 2.313640960982776e-05, "loss": 0.4436, "step": 2095 }, { "epoch": 0.525, "grad_norm": 1.09375, "learning_rate": 2.303852260680388e-05, "loss": 0.4027, "step": 2100 }, { "epoch": 0.52625, "grad_norm": 1.2890625, "learning_rate": 2.294066585218786e-05, "loss": 0.4086, "step": 2105 }, { "epoch": 0.5275, "grad_norm": 1.046875, "learning_rate": 2.284284085505192e-05, "loss": 0.4262, "step": 2110 }, { "epoch": 0.52875, "grad_norm": 1.484375, "learning_rate": 2.274504912397852e-05, "loss": 0.4605, "step": 2115 }, { "epoch": 0.53, "grad_norm": 1.2734375, "learning_rate": 2.2647292167037144e-05, "loss": 0.4534, "step": 2120 }, { "epoch": 0.53125, "grad_norm": 0.890625, "learning_rate": 2.2549571491760986e-05, "loss": 0.3628, "step": 2125 }, { "epoch": 0.5325, "grad_norm": 1.3046875, "learning_rate": 2.2451888605123754e-05, "loss": 0.4879, "step": 2130 }, { "epoch": 0.53375, "grad_norm": 0.96875, "learning_rate": 2.2354245013516393e-05, "loss": 0.4517, "step": 2135 }, { "epoch": 0.535, "grad_norm": 1.1640625, "learning_rate": 2.225664222272387e-05, "loss": 0.4303, "step": 2140 }, { "epoch": 0.53625, "grad_norm": 0.83203125, "learning_rate": 2.2159081737901975e-05, "loss": 0.4172, "step": 2145 }, { "epoch": 0.5375, "grad_norm": 1.125, "learning_rate": 2.2061565063554064e-05, "loss": 0.4169, "step": 2150 }, { "epoch": 0.53875, "grad_norm": 1.25, "learning_rate": 2.1964093703507893e-05, "loss": 0.4839, "step": 2155 }, { "epoch": 0.54, "grad_norm": 0.8359375, "learning_rate": 2.186666916089239e-05, "loss": 0.3919, "step": 2160 }, { "epoch": 0.54125, "grad_norm": 1.2421875, "learning_rate": 2.1769292938114563e-05, "loss": 0.4435, "step": 2165 }, { "epoch": 0.5425, "grad_norm": 1.3046875, "learning_rate": 2.1671966536836196e-05, "loss": 0.4902, "step": 2170 }, { "epoch": 0.54375, "grad_norm": 1.296875, "learning_rate": 2.1574691457950803e-05, "loss": 0.4667, "step": 2175 }, { "epoch": 0.545, "grad_norm": 1.1015625, "learning_rate": 2.1477469201560435e-05, "loss": 0.3795, "step": 2180 }, { "epoch": 0.54625, "grad_norm": 1.140625, "learning_rate": 2.1380301266952556e-05, "loss": 0.4658, "step": 2185 }, { "epoch": 0.5475, "grad_norm": 1.6171875, "learning_rate": 2.1283189152576925e-05, "loss": 0.4589, "step": 2190 }, { "epoch": 0.54875, "grad_norm": 1.3828125, "learning_rate": 2.118613435602248e-05, "loss": 0.4394, "step": 2195 }, { "epoch": 0.55, "grad_norm": 1.0390625, "learning_rate": 2.1089138373994223e-05, "loss": 0.4321, "step": 2200 }, { "epoch": 0.55125, "grad_norm": 1.3046875, "learning_rate": 2.0992202702290227e-05, "loss": 0.4084, "step": 2205 }, { "epoch": 0.5525, "grad_norm": 1.2890625, "learning_rate": 2.089532883577843e-05, "loss": 0.4489, "step": 2210 }, { "epoch": 0.55375, "grad_norm": 1.2421875, "learning_rate": 2.0798518268373706e-05, "loss": 0.4403, "step": 2215 }, { "epoch": 0.555, "grad_norm": 1.1796875, "learning_rate": 2.070177249301476e-05, "loss": 0.4286, "step": 2220 }, { "epoch": 0.55625, "grad_norm": 1.28125, "learning_rate": 2.0605093001641138e-05, "loss": 0.4557, "step": 2225 }, { "epoch": 0.5575, "grad_norm": 1.171875, "learning_rate": 2.0508481285170186e-05, "loss": 0.4686, "step": 2230 }, { "epoch": 0.55875, "grad_norm": 1.375, "learning_rate": 2.04119388334741e-05, "loss": 0.4402, "step": 2235 }, { "epoch": 0.56, "grad_norm": 1.1953125, "learning_rate": 2.031546713535688e-05, "loss": 0.3973, "step": 2240 }, { "epoch": 0.56125, "grad_norm": 1.1015625, "learning_rate": 2.0219067678531494e-05, "loss": 0.4349, "step": 2245 }, { "epoch": 0.5625, "grad_norm": 1.1015625, "learning_rate": 2.0122741949596797e-05, "loss": 0.4329, "step": 2250 }, { "epoch": 0.56375, "grad_norm": 1.1875, "learning_rate": 2.002649143401469e-05, "loss": 0.4402, "step": 2255 }, { "epoch": 0.565, "grad_norm": 1.0546875, "learning_rate": 1.9930317616087196e-05, "loss": 0.4342, "step": 2260 }, { "epoch": 0.56625, "grad_norm": 0.98046875, "learning_rate": 1.9834221978933543e-05, "loss": 0.4537, "step": 2265 }, { "epoch": 0.5675, "grad_norm": 1.2890625, "learning_rate": 1.9738206004467363e-05, "loss": 0.4597, "step": 2270 }, { "epoch": 0.56875, "grad_norm": 1.4296875, "learning_rate": 1.9642271173373737e-05, "loss": 0.4372, "step": 2275 }, { "epoch": 0.57, "grad_norm": 1.171875, "learning_rate": 1.9546418965086442e-05, "loss": 0.4062, "step": 2280 }, { "epoch": 0.57125, "grad_norm": 1.453125, "learning_rate": 1.9450650857765102e-05, "loss": 0.4698, "step": 2285 }, { "epoch": 0.5725, "grad_norm": 1.140625, "learning_rate": 1.935496832827241e-05, "loss": 0.4312, "step": 2290 }, { "epoch": 0.57375, "grad_norm": 1.21875, "learning_rate": 1.925937285215133e-05, "loss": 0.4643, "step": 2295 }, { "epoch": 0.575, "grad_norm": 1.1015625, "learning_rate": 1.9163865903602374e-05, "loss": 0.4256, "step": 2300 }, { "epoch": 0.57625, "grad_norm": 1.1171875, "learning_rate": 1.9068448955460805e-05, "loss": 0.3879, "step": 2305 }, { "epoch": 0.5775, "grad_norm": 1.5390625, "learning_rate": 1.897312347917404e-05, "loss": 0.4048, "step": 2310 }, { "epoch": 0.57875, "grad_norm": 1.03125, "learning_rate": 1.8877890944778815e-05, "loss": 0.4572, "step": 2315 }, { "epoch": 0.58, "grad_norm": 1.1171875, "learning_rate": 1.8782752820878634e-05, "loss": 0.4634, "step": 2320 }, { "epoch": 0.58125, "grad_norm": 1.2265625, "learning_rate": 1.868771057462105e-05, "loss": 0.3985, "step": 2325 }, { "epoch": 0.5825, "grad_norm": 1.1328125, "learning_rate": 1.8592765671675084e-05, "loss": 0.4463, "step": 2330 }, { "epoch": 0.58375, "grad_norm": 0.91015625, "learning_rate": 1.8497919576208585e-05, "loss": 0.4083, "step": 2335 }, { "epoch": 0.585, "grad_norm": 1.4921875, "learning_rate": 1.8403173750865685e-05, "loss": 0.3929, "step": 2340 }, { "epoch": 0.58625, "grad_norm": 1.390625, "learning_rate": 1.830852965674419e-05, "loss": 0.4659, "step": 2345 }, { "epoch": 0.5875, "grad_norm": 1.2109375, "learning_rate": 1.8213988753373146e-05, "loss": 0.3986, "step": 2350 }, { "epoch": 0.58875, "grad_norm": 1.046875, "learning_rate": 1.8119552498690215e-05, "loss": 0.4043, "step": 2355 }, { "epoch": 0.59, "grad_norm": 2.84375, "learning_rate": 1.802522234901927e-05, "loss": 0.449, "step": 2360 }, { "epoch": 0.59125, "grad_norm": 1.40625, "learning_rate": 1.793099975904791e-05, "loss": 0.4178, "step": 2365 }, { "epoch": 0.5925, "grad_norm": 1.390625, "learning_rate": 1.783688618180504e-05, "loss": 0.4422, "step": 2370 }, { "epoch": 0.59375, "grad_norm": 0.94921875, "learning_rate": 1.7742883068638447e-05, "loss": 0.4666, "step": 2375 }, { "epoch": 0.595, "grad_norm": 1.015625, "learning_rate": 1.7648991869192405e-05, "loss": 0.4226, "step": 2380 }, { "epoch": 0.59625, "grad_norm": 1.40625, "learning_rate": 1.7555214031385375e-05, "loss": 0.408, "step": 2385 }, { "epoch": 0.5975, "grad_norm": 1.7109375, "learning_rate": 1.746155100138761e-05, "loss": 0.4778, "step": 2390 }, { "epoch": 0.59875, "grad_norm": 1.25, "learning_rate": 1.7368004223598912e-05, "loss": 0.4059, "step": 2395 }, { "epoch": 0.6, "grad_norm": 1.296875, "learning_rate": 1.7274575140626318e-05, "loss": 0.4398, "step": 2400 }, { "epoch": 0.60125, "grad_norm": 1.1015625, "learning_rate": 1.7181265193261865e-05, "loss": 0.482, "step": 2405 }, { "epoch": 0.6025, "grad_norm": 0.84765625, "learning_rate": 1.7088075820460346e-05, "loss": 0.4192, "step": 2410 }, { "epoch": 0.60375, "grad_norm": 1.3125, "learning_rate": 1.6995008459317206e-05, "loss": 0.4748, "step": 2415 }, { "epoch": 0.605, "grad_norm": 1.5703125, "learning_rate": 1.690206454504627e-05, "loss": 0.4276, "step": 2420 }, { "epoch": 0.60625, "grad_norm": 1.125, "learning_rate": 1.6809245510957665e-05, "loss": 0.3548, "step": 2425 }, { "epoch": 0.6075, "grad_norm": 1.0078125, "learning_rate": 1.6716552788435724e-05, "loss": 0.4122, "step": 2430 }, { "epoch": 0.60875, "grad_norm": 1.1484375, "learning_rate": 1.66239878069169e-05, "loss": 0.432, "step": 2435 }, { "epoch": 0.61, "grad_norm": 1.1015625, "learning_rate": 1.6531551993867717e-05, "loss": 0.4467, "step": 2440 }, { "epoch": 0.61125, "grad_norm": 1.3828125, "learning_rate": 1.643924677476276e-05, "loss": 0.4652, "step": 2445 }, { "epoch": 0.6125, "grad_norm": 1.1328125, "learning_rate": 1.6347073573062672e-05, "loss": 0.4024, "step": 2450 }, { "epoch": 0.61375, "grad_norm": 1.1953125, "learning_rate": 1.6255033810192282e-05, "loss": 0.3974, "step": 2455 }, { "epoch": 0.615, "grad_norm": 1.09375, "learning_rate": 1.6163128905518578e-05, "loss": 0.3891, "step": 2460 }, { "epoch": 0.61625, "grad_norm": 1.328125, "learning_rate": 1.6071360276328874e-05, "loss": 0.3499, "step": 2465 }, { "epoch": 0.6175, "grad_norm": 1.109375, "learning_rate": 1.5979729337808955e-05, "loss": 0.4386, "step": 2470 }, { "epoch": 0.61875, "grad_norm": 1.1328125, "learning_rate": 1.588823750302126e-05, "loss": 0.4494, "step": 2475 }, { "epoch": 0.62, "grad_norm": 1.4296875, "learning_rate": 1.5796886182883053e-05, "loss": 0.4025, "step": 2480 }, { "epoch": 0.62125, "grad_norm": 1.6796875, "learning_rate": 1.57056767861447e-05, "loss": 0.4192, "step": 2485 }, { "epoch": 0.6225, "grad_norm": 1.3046875, "learning_rate": 1.561461071936792e-05, "loss": 0.4509, "step": 2490 }, { "epoch": 0.62375, "grad_norm": 1.2109375, "learning_rate": 1.552368938690414e-05, "loss": 0.3897, "step": 2495 }, { "epoch": 0.625, "grad_norm": 1.28125, "learning_rate": 1.5432914190872757e-05, "loss": 0.473, "step": 2500 }, { "epoch": 0.62625, "grad_norm": 1.1328125, "learning_rate": 1.5342286531139605e-05, "loss": 0.4333, "step": 2505 }, { "epoch": 0.6275, "grad_norm": 1.0546875, "learning_rate": 1.5251807805295302e-05, "loss": 0.4245, "step": 2510 }, { "epoch": 0.62875, "grad_norm": 1.1953125, "learning_rate": 1.5161479408633713e-05, "loss": 0.4342, "step": 2515 }, { "epoch": 0.63, "grad_norm": 1.0703125, "learning_rate": 1.5071302734130489e-05, "loss": 0.3951, "step": 2520 }, { "epoch": 0.63125, "grad_norm": 1.2890625, "learning_rate": 1.498127917242148e-05, "loss": 0.4367, "step": 2525 }, { "epoch": 0.6325, "grad_norm": 1.2578125, "learning_rate": 1.4891410111781378e-05, "loss": 0.4766, "step": 2530 }, { "epoch": 0.63375, "grad_norm": 1.3828125, "learning_rate": 1.4801696938102272e-05, "loss": 0.373, "step": 2535 }, { "epoch": 0.635, "grad_norm": 1.3046875, "learning_rate": 1.4712141034872282e-05, "loss": 0.3804, "step": 2540 }, { "epoch": 0.63625, "grad_norm": 1.09375, "learning_rate": 1.4622743783154223e-05, "loss": 0.4206, "step": 2545 }, { "epoch": 0.6375, "grad_norm": 1.1875, "learning_rate": 1.4533506561564306e-05, "loss": 0.4585, "step": 2550 }, { "epoch": 0.63875, "grad_norm": 0.84765625, "learning_rate": 1.4444430746250867e-05, "loss": 0.3796, "step": 2555 }, { "epoch": 0.64, "grad_norm": 1.0625, "learning_rate": 1.4355517710873184e-05, "loss": 0.4296, "step": 2560 }, { "epoch": 0.64125, "grad_norm": 1.6796875, "learning_rate": 1.4266768826580257e-05, "loss": 0.5008, "step": 2565 }, { "epoch": 0.6425, "grad_norm": 1.0625, "learning_rate": 1.4178185461989662e-05, "loss": 0.3952, "step": 2570 }, { "epoch": 0.64375, "grad_norm": 1.0703125, "learning_rate": 1.4089768983166444e-05, "loss": 0.4494, "step": 2575 }, { "epoch": 0.645, "grad_norm": 0.88671875, "learning_rate": 1.4001520753602121e-05, "loss": 0.3944, "step": 2580 }, { "epoch": 0.64625, "grad_norm": 1.328125, "learning_rate": 1.3913442134193544e-05, "loss": 0.4276, "step": 2585 }, { "epoch": 0.6475, "grad_norm": 1.4296875, "learning_rate": 1.3825534483221974e-05, "loss": 0.4433, "step": 2590 }, { "epoch": 0.64875, "grad_norm": 1.1640625, "learning_rate": 1.3737799156332143e-05, "loss": 0.3992, "step": 2595 }, { "epoch": 0.65, "grad_norm": 0.96875, "learning_rate": 1.3650237506511331e-05, "loss": 0.4488, "step": 2600 }, { "epoch": 0.65125, "grad_norm": 1.3828125, "learning_rate": 1.3562850884068487e-05, "loss": 0.4243, "step": 2605 }, { "epoch": 0.6525, "grad_norm": 1.1171875, "learning_rate": 1.3475640636613446e-05, "loss": 0.3477, "step": 2610 }, { "epoch": 0.65375, "grad_norm": 1.2734375, "learning_rate": 1.3388608109036086e-05, "loss": 0.4413, "step": 2615 }, { "epoch": 0.655, "grad_norm": 1.0625, "learning_rate": 1.330175464348567e-05, "loss": 0.4487, "step": 2620 }, { "epoch": 0.65625, "grad_norm": 1.28125, "learning_rate": 1.3215081579350058e-05, "loss": 0.4122, "step": 2625 }, { "epoch": 0.6575, "grad_norm": 1.09375, "learning_rate": 1.312859025323514e-05, "loss": 0.424, "step": 2630 }, { "epoch": 0.65875, "grad_norm": 1.078125, "learning_rate": 1.3042281998944151e-05, "loss": 0.4013, "step": 2635 }, { "epoch": 0.66, "grad_norm": 1.9375, "learning_rate": 1.2956158147457115e-05, "loss": 0.5066, "step": 2640 }, { "epoch": 0.66125, "grad_norm": 1.25, "learning_rate": 1.2870220026910407e-05, "loss": 0.3935, "step": 2645 }, { "epoch": 0.6625, "grad_norm": 1.21875, "learning_rate": 1.2784468962576136e-05, "loss": 0.4039, "step": 2650 }, { "epoch": 0.66375, "grad_norm": 1.1875, "learning_rate": 1.2698906276841776e-05, "loss": 0.4817, "step": 2655 }, { "epoch": 0.665, "grad_norm": 0.99609375, "learning_rate": 1.261353328918981e-05, "loss": 0.3917, "step": 2660 }, { "epoch": 0.66625, "grad_norm": 1.2421875, "learning_rate": 1.2528351316177319e-05, "loss": 0.425, "step": 2665 }, { "epoch": 0.6675, "grad_norm": 1.015625, "learning_rate": 1.2443361671415687e-05, "loss": 0.4234, "step": 2670 }, { "epoch": 0.66875, "grad_norm": 1.1171875, "learning_rate": 1.235856566555039e-05, "loss": 0.4414, "step": 2675 }, { "epoch": 0.67, "grad_norm": 1.40625, "learning_rate": 1.2273964606240718e-05, "loss": 0.4563, "step": 2680 }, { "epoch": 0.67125, "grad_norm": 1.15625, "learning_rate": 1.2189559798139682e-05, "loss": 0.4132, "step": 2685 }, { "epoch": 0.6725, "grad_norm": 1.765625, "learning_rate": 1.2105352542873815e-05, "loss": 0.4317, "step": 2690 }, { "epoch": 0.67375, "grad_norm": 0.9296875, "learning_rate": 1.2021344139023186e-05, "loss": 0.4073, "step": 2695 }, { "epoch": 0.675, "grad_norm": 1.4765625, "learning_rate": 1.1937535882101281e-05, "loss": 0.4147, "step": 2700 }, { "epoch": 0.67625, "grad_norm": 1.5625, "learning_rate": 1.1853929064535111e-05, "loss": 0.4394, "step": 2705 }, { "epoch": 0.6775, "grad_norm": 1.40625, "learning_rate": 1.1770524975645238e-05, "loss": 0.461, "step": 2710 }, { "epoch": 0.67875, "grad_norm": 1.0, "learning_rate": 1.1687324901625879e-05, "loss": 0.4279, "step": 2715 }, { "epoch": 0.68, "grad_norm": 0.9765625, "learning_rate": 1.1604330125525079e-05, "loss": 0.4201, "step": 2720 }, { "epoch": 0.68125, "grad_norm": 0.98828125, "learning_rate": 1.1521541927224994e-05, "loss": 0.4392, "step": 2725 }, { "epoch": 0.6825, "grad_norm": 1.140625, "learning_rate": 1.1438961583422037e-05, "loss": 0.4064, "step": 2730 }, { "epoch": 0.68375, "grad_norm": 1.1796875, "learning_rate": 1.1356590367607252e-05, "loss": 0.4081, "step": 2735 }, { "epoch": 0.685, "grad_norm": 1.1171875, "learning_rate": 1.1274429550046704e-05, "loss": 0.4629, "step": 2740 }, { "epoch": 0.68625, "grad_norm": 0.9453125, "learning_rate": 1.1192480397761837e-05, "loss": 0.3942, "step": 2745 }, { "epoch": 0.6875, "grad_norm": 0.9765625, "learning_rate": 1.1110744174509952e-05, "loss": 0.4581, "step": 2750 }, { "epoch": 0.68875, "grad_norm": 1.1015625, "learning_rate": 1.1029222140764712e-05, "loss": 0.4079, "step": 2755 }, { "epoch": 0.69, "grad_norm": 1.1328125, "learning_rate": 1.0947915553696742e-05, "loss": 0.3924, "step": 2760 }, { "epoch": 0.69125, "grad_norm": 0.94921875, "learning_rate": 1.0866825667154182e-05, "loss": 0.3715, "step": 2765 }, { "epoch": 0.6925, "grad_norm": 1.09375, "learning_rate": 1.07859537316434e-05, "loss": 0.4238, "step": 2770 }, { "epoch": 0.69375, "grad_norm": 1.3203125, "learning_rate": 1.0705300994309697e-05, "loss": 0.4465, "step": 2775 }, { "epoch": 0.695, "grad_norm": 1.046875, "learning_rate": 1.0624868698918045e-05, "loss": 0.4295, "step": 2780 }, { "epoch": 0.69625, "grad_norm": 1.140625, "learning_rate": 1.0544658085833919e-05, "loss": 0.4527, "step": 2785 }, { "epoch": 0.6975, "grad_norm": 1.1015625, "learning_rate": 1.0464670392004235e-05, "loss": 0.4721, "step": 2790 }, { "epoch": 0.69875, "grad_norm": 1.234375, "learning_rate": 1.0384906850938166e-05, "loss": 0.4632, "step": 2795 }, { "epoch": 0.7, "grad_norm": 1.03125, "learning_rate": 1.0305368692688174e-05, "loss": 0.4382, "step": 2800 }, { "epoch": 0.70125, "grad_norm": 1.046875, "learning_rate": 1.0226057143831064e-05, "loss": 0.4699, "step": 2805 }, { "epoch": 0.7025, "grad_norm": 1.4296875, "learning_rate": 1.0146973427449038e-05, "loss": 0.4368, "step": 2810 }, { "epoch": 0.70375, "grad_norm": 0.9765625, "learning_rate": 1.0068118763110824e-05, "loss": 0.4513, "step": 2815 }, { "epoch": 0.705, "grad_norm": 1.0, "learning_rate": 9.989494366852904e-06, "loss": 0.3863, "step": 2820 }, { "epoch": 0.70625, "grad_norm": 1.0390625, "learning_rate": 9.911101451160715e-06, "loss": 0.3907, "step": 2825 }, { "epoch": 0.7075, "grad_norm": 1.078125, "learning_rate": 9.832941224950012e-06, "loss": 0.4537, "step": 2830 }, { "epoch": 0.70875, "grad_norm": 1.3125, "learning_rate": 9.755014893548157e-06, "loss": 0.4082, "step": 2835 }, { "epoch": 0.71, "grad_norm": 1.1015625, "learning_rate": 9.677323658675594e-06, "loss": 0.3992, "step": 2840 }, { "epoch": 0.71125, "grad_norm": 1.3125, "learning_rate": 9.599868718427257e-06, "loss": 0.4512, "step": 2845 }, { "epoch": 0.7125, "grad_norm": 0.8984375, "learning_rate": 9.522651267254149e-06, "loss": 0.419, "step": 2850 }, { "epoch": 0.71375, "grad_norm": 1.3671875, "learning_rate": 9.445672495944899e-06, "loss": 0.4542, "step": 2855 }, { "epoch": 0.715, "grad_norm": 1.1640625, "learning_rate": 9.368933591607378e-06, "loss": 0.4554, "step": 2860 }, { "epoch": 0.71625, "grad_norm": 1.3359375, "learning_rate": 9.292435737650407e-06, "loss": 0.4158, "step": 2865 }, { "epoch": 0.7175, "grad_norm": 0.8671875, "learning_rate": 9.216180113765558e-06, "loss": 0.4145, "step": 2870 }, { "epoch": 0.71875, "grad_norm": 1.1640625, "learning_rate": 9.140167895908867e-06, "loss": 0.4276, "step": 2875 }, { "epoch": 0.72, "grad_norm": 1.125, "learning_rate": 9.064400256282757e-06, "loss": 0.4477, "step": 2880 }, { "epoch": 0.72125, "grad_norm": 1.1796875, "learning_rate": 8.988878363317979e-06, "loss": 0.4563, "step": 2885 }, { "epoch": 0.7225, "grad_norm": 1.234375, "learning_rate": 8.913603381655528e-06, "loss": 0.4396, "step": 2890 }, { "epoch": 0.72375, "grad_norm": 1.5390625, "learning_rate": 8.838576472128756e-06, "loss": 0.4831, "step": 2895 }, { "epoch": 0.725, "grad_norm": 1.03125, "learning_rate": 8.763798791745411e-06, "loss": 0.4437, "step": 2900 }, { "epoch": 0.72625, "grad_norm": 1.140625, "learning_rate": 8.689271493669837e-06, "loss": 0.4639, "step": 2905 }, { "epoch": 0.7275, "grad_norm": 1.421875, "learning_rate": 8.614995727205156e-06, "loss": 0.4215, "step": 2910 }, { "epoch": 0.72875, "grad_norm": 1.28125, "learning_rate": 8.540972637775572e-06, "loss": 0.4615, "step": 2915 }, { "epoch": 0.73, "grad_norm": 1.0703125, "learning_rate": 8.467203366908707e-06, "loss": 0.4043, "step": 2920 }, { "epoch": 0.73125, "grad_norm": 1.3046875, "learning_rate": 8.393689052217966e-06, "loss": 0.4634, "step": 2925 }, { "epoch": 0.7325, "grad_norm": 1.328125, "learning_rate": 8.320430827385003e-06, "loss": 0.4411, "step": 2930 }, { "epoch": 0.73375, "grad_norm": 1.2265625, "learning_rate": 8.24742982214231e-06, "loss": 0.3556, "step": 2935 }, { "epoch": 0.735, "grad_norm": 1.0234375, "learning_rate": 8.174687162255672e-06, "loss": 0.4456, "step": 2940 }, { "epoch": 0.73625, "grad_norm": 0.8984375, "learning_rate": 8.102203969506886e-06, "loss": 0.4254, "step": 2945 }, { "epoch": 0.7375, "grad_norm": 1.234375, "learning_rate": 8.029981361676456e-06, "loss": 0.427, "step": 2950 }, { "epoch": 0.73875, "grad_norm": 1.0390625, "learning_rate": 7.958020452526346e-06, "loss": 0.4323, "step": 2955 }, { "epoch": 0.74, "grad_norm": 1.125, "learning_rate": 7.886322351782783e-06, "loss": 0.3968, "step": 2960 }, { "epoch": 0.74125, "grad_norm": 1.0234375, "learning_rate": 7.814888165119186e-06, "loss": 0.4628, "step": 2965 }, { "epoch": 0.7425, "grad_norm": 1.0078125, "learning_rate": 7.743718994139071e-06, "loss": 0.4388, "step": 2970 }, { "epoch": 0.74375, "grad_norm": 1.34375, "learning_rate": 7.672815936359107e-06, "loss": 0.4029, "step": 2975 }, { "epoch": 0.745, "grad_norm": 1.0703125, "learning_rate": 7.602180085192143e-06, "loss": 0.4214, "step": 2980 }, { "epoch": 0.74625, "grad_norm": 1.4296875, "learning_rate": 7.531812529930398e-06, "loss": 0.4165, "step": 2985 }, { "epoch": 0.7475, "grad_norm": 1.1171875, "learning_rate": 7.461714355728608e-06, "loss": 0.4016, "step": 2990 }, { "epoch": 0.74875, "grad_norm": 1.203125, "learning_rate": 7.391886643587342e-06, "loss": 0.4527, "step": 2995 }, { "epoch": 0.75, "grad_norm": 1.15625, "learning_rate": 7.3223304703363135e-06, "loss": 0.4143, "step": 3000 }, { "epoch": 0.75125, "grad_norm": 1.328125, "learning_rate": 7.253046908617747e-06, "loss": 0.4667, "step": 3005 }, { "epoch": 0.7525, "grad_norm": 1.2734375, "learning_rate": 7.184037026869867e-06, "loss": 0.4032, "step": 3010 }, { "epoch": 0.75375, "grad_norm": 1.0859375, "learning_rate": 7.115301889310427e-06, "loss": 0.433, "step": 3015 }, { "epoch": 0.755, "grad_norm": 1.046875, "learning_rate": 7.046842555920283e-06, "loss": 0.4017, "step": 3020 }, { "epoch": 0.75625, "grad_norm": 1.0546875, "learning_rate": 6.9786600824270296e-06, "loss": 0.4006, "step": 3025 }, { "epoch": 0.7575, "grad_norm": 1.3515625, "learning_rate": 6.91075552028877e-06, "loss": 0.4536, "step": 3030 }, { "epoch": 0.75875, "grad_norm": 1.4609375, "learning_rate": 6.84312991667784e-06, "loss": 0.4295, "step": 3035 }, { "epoch": 0.76, "grad_norm": 0.9375, "learning_rate": 6.775784314464717e-06, "loss": 0.4216, "step": 3040 }, { "epoch": 0.76125, "grad_norm": 1.1328125, "learning_rate": 6.708719752201884e-06, "loss": 0.4071, "step": 3045 }, { "epoch": 0.7625, "grad_norm": 0.87109375, "learning_rate": 6.641937264107867e-06, "loss": 0.4518, "step": 3050 }, { "epoch": 0.76375, "grad_norm": 1.453125, "learning_rate": 6.575437880051233e-06, "loss": 0.4776, "step": 3055 }, { "epoch": 0.765, "grad_norm": 1.1796875, "learning_rate": 6.509222625534755e-06, "loss": 0.4084, "step": 3060 }, { "epoch": 0.76625, "grad_norm": 1.4609375, "learning_rate": 6.443292521679578e-06, "loss": 0.4825, "step": 3065 }, { "epoch": 0.7675, "grad_norm": 1.171875, "learning_rate": 6.377648585209456e-06, "loss": 0.4788, "step": 3070 }, { "epoch": 0.76875, "grad_norm": 1.1953125, "learning_rate": 6.312291828435077e-06, "loss": 0.4077, "step": 3075 }, { "epoch": 0.77, "grad_norm": 0.98046875, "learning_rate": 6.247223259238511e-06, "loss": 0.4103, "step": 3080 }, { "epoch": 0.77125, "grad_norm": 1.21875, "learning_rate": 6.182443881057576e-06, "loss": 0.4401, "step": 3085 }, { "epoch": 0.7725, "grad_norm": 1.3984375, "learning_rate": 6.117954692870412e-06, "loss": 0.4628, "step": 3090 }, { "epoch": 0.77375, "grad_norm": 1.15625, "learning_rate": 6.053756689180082e-06, "loss": 0.3789, "step": 3095 }, { "epoch": 0.775, "grad_norm": 0.9609375, "learning_rate": 5.989850859999227e-06, "loss": 0.4261, "step": 3100 }, { "epoch": 0.77625, "grad_norm": 1.3359375, "learning_rate": 5.926238190834779e-06, "loss": 0.4548, "step": 3105 }, { "epoch": 0.7775, "grad_norm": 1.125, "learning_rate": 5.8629196626728e-06, "loss": 0.4496, "step": 3110 }, { "epoch": 0.77875, "grad_norm": 0.984375, "learning_rate": 5.7998962519633045e-06, "loss": 0.3764, "step": 3115 }, { "epoch": 0.78, "grad_norm": 1.6015625, "learning_rate": 5.737168930605272e-06, "loss": 0.3888, "step": 3120 }, { "epoch": 0.78125, "grad_norm": 1.015625, "learning_rate": 5.674738665931575e-06, "loss": 0.4209, "step": 3125 }, { "epoch": 0.7825, "grad_norm": 1.3203125, "learning_rate": 5.612606420694141e-06, "loss": 0.4727, "step": 3130 }, { "epoch": 0.78375, "grad_norm": 1.0546875, "learning_rate": 5.550773153049046e-06, "loss": 0.4365, "step": 3135 }, { "epoch": 0.785, "grad_norm": 1.2421875, "learning_rate": 5.489239816541755e-06, "loss": 0.4403, "step": 3140 }, { "epoch": 0.78625, "grad_norm": 1.390625, "learning_rate": 5.428007360092463e-06, "loss": 0.4521, "step": 3145 }, { "epoch": 0.7875, "grad_norm": 1.25, "learning_rate": 5.367076727981382e-06, "loss": 0.4657, "step": 3150 }, { "epoch": 0.78875, "grad_norm": 1.015625, "learning_rate": 5.306448859834228e-06, "loss": 0.4367, "step": 3155 }, { "epoch": 0.79, "grad_norm": 1.1015625, "learning_rate": 5.24612469060774e-06, "loss": 0.4053, "step": 3160 }, { "epoch": 0.79125, "grad_norm": 1.515625, "learning_rate": 5.186105150575232e-06, "loss": 0.3926, "step": 3165 }, { "epoch": 0.7925, "grad_norm": 1.34375, "learning_rate": 5.12639116531225e-06, "loss": 0.4534, "step": 3170 }, { "epoch": 0.79375, "grad_norm": 1.3125, "learning_rate": 5.066983655682325e-06, "loss": 0.4551, "step": 3175 }, { "epoch": 0.795, "grad_norm": 1.375, "learning_rate": 5.007883537822736e-06, "loss": 0.4066, "step": 3180 }, { "epoch": 0.79625, "grad_norm": 0.75, "learning_rate": 4.949091723130425e-06, "loss": 0.4247, "step": 3185 }, { "epoch": 0.7975, "grad_norm": 1.6640625, "learning_rate": 4.890609118247888e-06, "loss": 0.4215, "step": 3190 }, { "epoch": 0.79875, "grad_norm": 1.4296875, "learning_rate": 4.832436625049256e-06, "loss": 0.4385, "step": 3195 }, { "epoch": 0.8, "grad_norm": 0.98046875, "learning_rate": 4.7745751406263165e-06, "loss": 0.4393, "step": 3200 }, { "epoch": 0.80125, "grad_norm": 0.9140625, "learning_rate": 4.717025557274749e-06, "loss": 0.42, "step": 3205 }, { "epoch": 0.8025, "grad_norm": 0.90234375, "learning_rate": 4.659788762480327e-06, "loss": 0.3758, "step": 3210 }, { "epoch": 0.80375, "grad_norm": 1.1640625, "learning_rate": 4.602865638905224e-06, "loss": 0.4448, "step": 3215 }, { "epoch": 0.805, "grad_norm": 1.078125, "learning_rate": 4.54625706437441e-06, "loss": 0.4453, "step": 3220 }, { "epoch": 0.80625, "grad_norm": 1.3125, "learning_rate": 4.48996391186216e-06, "loss": 0.4359, "step": 3225 }, { "epoch": 0.8075, "grad_norm": 1.3515625, "learning_rate": 4.433987049478508e-06, "loss": 0.3974, "step": 3230 }, { "epoch": 0.80875, "grad_norm": 1.125, "learning_rate": 4.378327340455915e-06, "loss": 0.4194, "step": 3235 }, { "epoch": 0.81, "grad_norm": 0.87890625, "learning_rate": 4.322985643135952e-06, "loss": 0.4214, "step": 3240 }, { "epoch": 0.81125, "grad_norm": 1.1640625, "learning_rate": 4.267962810956061e-06, "loss": 0.3592, "step": 3245 }, { "epoch": 0.8125, "grad_norm": 1.2421875, "learning_rate": 4.213259692436367e-06, "loss": 0.3997, "step": 3250 }, { "epoch": 0.81375, "grad_norm": 1.484375, "learning_rate": 4.158877131166641e-06, "loss": 0.4471, "step": 3255 }, { "epoch": 0.815, "grad_norm": 0.953125, "learning_rate": 4.104815965793249e-06, "loss": 0.4293, "step": 3260 }, { "epoch": 0.81625, "grad_norm": 1.203125, "learning_rate": 4.051077030006228e-06, "loss": 0.4562, "step": 3265 }, { "epoch": 0.8175, "grad_norm": 1.0703125, "learning_rate": 3.9976611525264525e-06, "loss": 0.434, "step": 3270 }, { "epoch": 0.81875, "grad_norm": 0.8671875, "learning_rate": 3.944569157092839e-06, "loss": 0.4524, "step": 3275 }, { "epoch": 0.82, "grad_norm": 1.296875, "learning_rate": 3.891801862449629e-06, "loss": 0.4498, "step": 3280 }, { "epoch": 0.82125, "grad_norm": 1.0625, "learning_rate": 3.839360082333771e-06, "loss": 0.4329, "step": 3285 }, { "epoch": 0.8225, "grad_norm": 1.375, "learning_rate": 3.7872446254624104e-06, "loss": 0.3884, "step": 3290 }, { "epoch": 0.82375, "grad_norm": 1.046875, "learning_rate": 3.735456295520348e-06, "loss": 0.4114, "step": 3295 }, { "epoch": 0.825, "grad_norm": 1.3125, "learning_rate": 3.6839958911476957e-06, "loss": 0.4404, "step": 3300 }, { "epoch": 0.82625, "grad_norm": 1.171875, "learning_rate": 3.6328642059275524e-06, "loss": 0.4548, "step": 3305 }, { "epoch": 0.8275, "grad_norm": 0.92578125, "learning_rate": 3.5820620283737616e-06, "loss": 0.4648, "step": 3310 }, { "epoch": 0.82875, "grad_norm": 0.875, "learning_rate": 3.5315901419187363e-06, "loss": 0.4233, "step": 3315 }, { "epoch": 0.83, "grad_norm": 0.953125, "learning_rate": 3.4814493249014116e-06, "loss": 0.4021, "step": 3320 }, { "epoch": 0.83125, "grad_norm": 1.0546875, "learning_rate": 3.431640350555204e-06, "loss": 0.4732, "step": 3325 }, { "epoch": 0.8325, "grad_norm": 1.0546875, "learning_rate": 3.382163986996126e-06, "loss": 0.4174, "step": 3330 }, { "epoch": 0.83375, "grad_norm": 1.3984375, "learning_rate": 3.3330209972108976e-06, "loss": 0.4284, "step": 3335 }, { "epoch": 0.835, "grad_norm": 1.109375, "learning_rate": 3.284212139045223e-06, "loss": 0.4183, "step": 3340 }, { "epoch": 0.83625, "grad_norm": 1.390625, "learning_rate": 3.2357381651920648e-06, "loss": 0.3996, "step": 3345 }, { "epoch": 0.8375, "grad_norm": 1.2578125, "learning_rate": 3.187599823180071e-06, "loss": 0.4317, "step": 3350 }, { "epoch": 0.83875, "grad_norm": 1.21875, "learning_rate": 3.139797855362031e-06, "loss": 0.4341, "step": 3355 }, { "epoch": 0.84, "grad_norm": 1.59375, "learning_rate": 3.092332998903416e-06, "loss": 0.4516, "step": 3360 }, { "epoch": 0.84125, "grad_norm": 1.0703125, "learning_rate": 3.0452059857710186e-06, "loss": 0.4371, "step": 3365 }, { "epoch": 0.8425, "grad_norm": 0.9921875, "learning_rate": 2.9984175427217016e-06, "loss": 0.4346, "step": 3370 }, { "epoch": 0.84375, "grad_norm": 0.96875, "learning_rate": 2.9519683912911266e-06, "loss": 0.3893, "step": 3375 }, { "epoch": 0.845, "grad_norm": 1.046875, "learning_rate": 2.9058592477826636e-06, "loss": 0.4086, "step": 3380 }, { "epoch": 0.84625, "grad_norm": 0.9375, "learning_rate": 2.860090823256359e-06, "loss": 0.4211, "step": 3385 }, { "epoch": 0.8475, "grad_norm": 1.296875, "learning_rate": 2.8146638235179213e-06, "loss": 0.422, "step": 3390 }, { "epoch": 0.84875, "grad_norm": 1.0234375, "learning_rate": 2.769578949107893e-06, "loss": 0.4117, "step": 3395 }, { "epoch": 0.85, "grad_norm": 0.90625, "learning_rate": 2.7248368952908053e-06, "loss": 0.3836, "step": 3400 }, { "epoch": 0.85125, "grad_norm": 1.046875, "learning_rate": 2.6804383520444815e-06, "loss": 0.3996, "step": 3405 }, { "epoch": 0.8525, "grad_norm": 1.234375, "learning_rate": 2.6363840040493747e-06, "loss": 0.4007, "step": 3410 }, { "epoch": 0.85375, "grad_norm": 1.0625, "learning_rate": 2.5926745306780324e-06, "loss": 0.4431, "step": 3415 }, { "epoch": 0.855, "grad_norm": 1.09375, "learning_rate": 2.5493106059846116e-06, "loss": 0.4013, "step": 3420 }, { "epoch": 0.85625, "grad_norm": 1.3125, "learning_rate": 2.506292898694468e-06, "loss": 0.4748, "step": 3425 }, { "epoch": 0.8575, "grad_norm": 1.3828125, "learning_rate": 2.4636220721938554e-06, "loss": 0.4454, "step": 3430 }, { "epoch": 0.85875, "grad_norm": 1.546875, "learning_rate": 2.421298784519724e-06, "loss": 0.3844, "step": 3435 }, { "epoch": 0.86, "grad_norm": 1.296875, "learning_rate": 2.379323688349516e-06, "loss": 0.4772, "step": 3440 }, { "epoch": 0.86125, "grad_norm": 1.09375, "learning_rate": 2.3376974309911343e-06, "loss": 0.4668, "step": 3445 }, { "epoch": 0.8625, "grad_norm": 1.359375, "learning_rate": 2.296420654372966e-06, "loss": 0.4191, "step": 3450 }, { "epoch": 0.86375, "grad_norm": 0.8828125, "learning_rate": 2.2554939950339747e-06, "loss": 0.3971, "step": 3455 }, { "epoch": 0.865, "grad_norm": 1.296875, "learning_rate": 2.2149180841138676e-06, "loss": 0.4282, "step": 3460 }, { "epoch": 0.86625, "grad_norm": 1.0, "learning_rate": 2.1746935473433928e-06, "loss": 0.4406, "step": 3465 }, { "epoch": 0.8675, "grad_norm": 1.3828125, "learning_rate": 2.1348210050346595e-06, "loss": 0.3914, "step": 3470 }, { "epoch": 0.86875, "grad_norm": 1.1015625, "learning_rate": 2.0953010720716037e-06, "loss": 0.3676, "step": 3475 }, { "epoch": 0.87, "grad_norm": 1.234375, "learning_rate": 2.0561343579004715e-06, "loss": 0.3973, "step": 3480 }, { "epoch": 0.87125, "grad_norm": 1.4375, "learning_rate": 2.0173214665204555e-06, "loss": 0.4067, "step": 3485 }, { "epoch": 0.8725, "grad_norm": 1.2109375, "learning_rate": 1.9788629964743455e-06, "loss": 0.4279, "step": 3490 }, { "epoch": 0.87375, "grad_norm": 1.3359375, "learning_rate": 1.940759540839329e-06, "loss": 0.4449, "step": 3495 }, { "epoch": 0.875, "grad_norm": 1.2890625, "learning_rate": 1.9030116872178316e-06, "loss": 0.4364, "step": 3500 }, { "epoch": 0.87625, "grad_norm": 1.1015625, "learning_rate": 1.8656200177284505e-06, "loss": 0.3991, "step": 3505 }, { "epoch": 0.8775, "grad_norm": 1.109375, "learning_rate": 1.8285851089969802e-06, "loss": 0.3922, "step": 3510 }, { "epoch": 0.87875, "grad_norm": 1.015625, "learning_rate": 1.7919075321475325e-06, "loss": 0.4559, "step": 3515 }, { "epoch": 0.88, "grad_norm": 1.15625, "learning_rate": 1.7555878527937164e-06, "loss": 0.3947, "step": 3520 }, { "epoch": 0.88125, "grad_norm": 0.98828125, "learning_rate": 1.7196266310299108e-06, "loss": 0.3853, "step": 3525 }, { "epoch": 0.8825, "grad_norm": 1.1015625, "learning_rate": 1.6840244214226502e-06, "loss": 0.4429, "step": 3530 }, { "epoch": 0.88375, "grad_norm": 1.2109375, "learning_rate": 1.6487817730020365e-06, "loss": 0.4092, "step": 3535 }, { "epoch": 0.885, "grad_norm": 1.046875, "learning_rate": 1.6138992292533183e-06, "loss": 0.4348, "step": 3540 }, { "epoch": 0.88625, "grad_norm": 0.92578125, "learning_rate": 1.579377328108464e-06, "loss": 0.4362, "step": 3545 }, { "epoch": 0.8875, "grad_norm": 1.3359375, "learning_rate": 1.5452166019378989e-06, "loss": 0.431, "step": 3550 }, { "epoch": 0.88875, "grad_norm": 1.0703125, "learning_rate": 1.5114175775422762e-06, "loss": 0.4164, "step": 3555 }, { "epoch": 0.89, "grad_norm": 1.296875, "learning_rate": 1.4779807761443636e-06, "loss": 0.4154, "step": 3560 }, { "epoch": 0.89125, "grad_norm": 1.3046875, "learning_rate": 1.4449067133810056e-06, "loss": 0.4108, "step": 3565 }, { "epoch": 0.8925, "grad_norm": 1.1796875, "learning_rate": 1.4121958992951629e-06, "loss": 0.4024, "step": 3570 }, { "epoch": 0.89375, "grad_norm": 1.0703125, "learning_rate": 1.379848838328049e-06, "loss": 0.4041, "step": 3575 }, { "epoch": 0.895, "grad_norm": 1.2109375, "learning_rate": 1.3478660293113676e-06, "loss": 0.4291, "step": 3580 }, { "epoch": 0.89625, "grad_norm": 1.0703125, "learning_rate": 1.3162479654595938e-06, "loss": 0.4561, "step": 3585 }, { "epoch": 0.8975, "grad_norm": 2.03125, "learning_rate": 1.284995134362385e-06, "loss": 0.4599, "step": 3590 }, { "epoch": 0.89875, "grad_norm": 0.92578125, "learning_rate": 1.2541080179770571e-06, "loss": 0.371, "step": 3595 }, { "epoch": 0.9, "grad_norm": 1.1171875, "learning_rate": 1.2235870926211619e-06, "loss": 0.428, "step": 3600 }, { "epoch": 0.90125, "grad_norm": 1.1796875, "learning_rate": 1.193432828965113e-06, "loss": 0.4093, "step": 3605 }, { "epoch": 0.9025, "grad_norm": 1.0078125, "learning_rate": 1.16364569202497e-06, "loss": 0.4123, "step": 3610 }, { "epoch": 0.90375, "grad_norm": 1.0625, "learning_rate": 1.134226141155223e-06, "loss": 0.4212, "step": 3615 }, { "epoch": 0.905, "grad_norm": 0.99609375, "learning_rate": 1.105174630041747e-06, "loss": 0.4379, "step": 3620 }, { "epoch": 0.90625, "grad_norm": 1.28125, "learning_rate": 1.0764916066947794e-06, "loss": 0.4788, "step": 3625 }, { "epoch": 0.9075, "grad_norm": 1.1875, "learning_rate": 1.0481775134420225e-06, "loss": 0.4523, "step": 3630 }, { "epoch": 0.90875, "grad_norm": 1.5078125, "learning_rate": 1.020232786921821e-06, "loss": 0.4141, "step": 3635 }, { "epoch": 0.91, "grad_norm": 1.0078125, "learning_rate": 9.926578580764234e-07, "loss": 0.3628, "step": 3640 }, { "epoch": 0.91125, "grad_norm": 1.109375, "learning_rate": 9.654531521453513e-07, "loss": 0.4142, "step": 3645 }, { "epoch": 0.9125, "grad_norm": 1.3046875, "learning_rate": 9.386190886588208e-07, "loss": 0.4348, "step": 3650 }, { "epoch": 0.91375, "grad_norm": 1.6796875, "learning_rate": 9.121560814312813e-07, "loss": 0.4138, "step": 3655 }, { "epoch": 0.915, "grad_norm": 1.078125, "learning_rate": 8.860645385550481e-07, "loss": 0.4332, "step": 3660 }, { "epoch": 0.91625, "grad_norm": 1.1484375, "learning_rate": 8.603448623939858e-07, "loss": 0.4577, "step": 3665 }, { "epoch": 0.9175, "grad_norm": 1.1484375, "learning_rate": 8.349974495773183e-07, "loss": 0.4456, "step": 3670 }, { "epoch": 0.91875, "grad_norm": 1.28125, "learning_rate": 8.10022690993506e-07, "loss": 0.3946, "step": 3675 }, { "epoch": 0.92, "grad_norm": 1.3203125, "learning_rate": 7.854209717842231e-07, "loss": 0.478, "step": 3680 }, { "epoch": 0.92125, "grad_norm": 1.1171875, "learning_rate": 7.611926713384121e-07, "loss": 0.3592, "step": 3685 }, { "epoch": 0.9225, "grad_norm": 1.234375, "learning_rate": 7.373381632864384e-07, "loss": 0.4425, "step": 3690 }, { "epoch": 0.92375, "grad_norm": 1.1796875, "learning_rate": 7.138578154943288e-07, "loss": 0.4219, "step": 3695 }, { "epoch": 0.925, "grad_norm": 1.3125, "learning_rate": 6.907519900580861e-07, "loss": 0.4419, "step": 3700 }, { "epoch": 0.92625, "grad_norm": 1.1796875, "learning_rate": 6.680210432981254e-07, "loss": 0.3983, "step": 3705 }, { "epoch": 0.9275, "grad_norm": 1.046875, "learning_rate": 6.456653257537665e-07, "loss": 0.4417, "step": 3710 }, { "epoch": 0.92875, "grad_norm": 1.03125, "learning_rate": 6.2368518217783e-07, "loss": 0.4469, "step": 3715 }, { "epoch": 0.93, "grad_norm": 1.0234375, "learning_rate": 6.020809515313142e-07, "loss": 0.435, "step": 3720 }, { "epoch": 0.93125, "grad_norm": 1.6875, "learning_rate": 5.808529669781904e-07, "loss": 0.3856, "step": 3725 }, { "epoch": 0.9325, "grad_norm": 1.1875, "learning_rate": 5.600015558802352e-07, "loss": 0.4587, "step": 3730 }, { "epoch": 0.93375, "grad_norm": 1.203125, "learning_rate": 5.39527039792001e-07, "loss": 0.4325, "step": 3735 }, { "epoch": 0.935, "grad_norm": 1.484375, "learning_rate": 5.194297344558536e-07, "loss": 0.4166, "step": 3740 }, { "epoch": 0.93625, "grad_norm": 1.2890625, "learning_rate": 4.997099497971114e-07, "loss": 0.4347, "step": 3745 }, { "epoch": 0.9375, "grad_norm": 0.95703125, "learning_rate": 4.803679899192392e-07, "loss": 0.4252, "step": 3750 }, { "epoch": 0.93875, "grad_norm": 1.1875, "learning_rate": 4.614041530991903e-07, "loss": 0.4036, "step": 3755 }, { "epoch": 0.94, "grad_norm": 1.375, "learning_rate": 4.4281873178278475e-07, "loss": 0.4359, "step": 3760 }, { "epoch": 0.94125, "grad_norm": 1.359375, "learning_rate": 4.246120125802111e-07, "loss": 0.4566, "step": 3765 }, { "epoch": 0.9425, "grad_norm": 1.0078125, "learning_rate": 4.067842762616014e-07, "loss": 0.4226, "step": 3770 }, { "epoch": 0.94375, "grad_norm": 1.1953125, "learning_rate": 3.8933579775271013e-07, "loss": 0.3903, "step": 3775 }, { "epoch": 0.945, "grad_norm": 1.125, "learning_rate": 3.7226684613065333e-07, "loss": 0.4119, "step": 3780 }, { "epoch": 0.94625, "grad_norm": 1.3515625, "learning_rate": 3.555776846197817e-07, "loss": 0.4268, "step": 3785 }, { "epoch": 0.9475, "grad_norm": 0.9765625, "learning_rate": 3.3926857058761417e-07, "loss": 0.405, "step": 3790 }, { "epoch": 0.94875, "grad_norm": 1.2734375, "learning_rate": 3.233397555408607e-07, "loss": 0.4161, "step": 3795 }, { "epoch": 0.95, "grad_norm": 1.2578125, "learning_rate": 3.077914851215585e-07, "loss": 0.4293, "step": 3800 }, { "epoch": 0.95125, "grad_norm": 1.3671875, "learning_rate": 2.92623999103267e-07, "loss": 0.4066, "step": 3805 }, { "epoch": 0.9525, "grad_norm": 1.0859375, "learning_rate": 2.778375313873871e-07, "loss": 0.4309, "step": 3810 }, { "epoch": 0.95375, "grad_norm": 1.2421875, "learning_rate": 2.634323099995395e-07, "loss": 0.4623, "step": 3815 }, { "epoch": 0.955, "grad_norm": 1.6015625, "learning_rate": 2.494085570860616e-07, "loss": 0.3977, "step": 3820 }, { "epoch": 0.95625, "grad_norm": 1.28125, "learning_rate": 2.3576648891056875e-07, "loss": 0.4135, "step": 3825 }, { "epoch": 0.9575, "grad_norm": 1.1640625, "learning_rate": 2.2250631585063186e-07, "loss": 0.3874, "step": 3830 }, { "epoch": 0.95875, "grad_norm": 1.21875, "learning_rate": 2.0962824239451894e-07, "loss": 0.4494, "step": 3835 }, { "epoch": 0.96, "grad_norm": 1.1328125, "learning_rate": 1.9713246713805588e-07, "loss": 0.3946, "step": 3840 }, { "epoch": 0.96125, "grad_norm": 0.85546875, "learning_rate": 1.8501918278155393e-07, "loss": 0.4312, "step": 3845 }, { "epoch": 0.9625, "grad_norm": 1.359375, "learning_rate": 1.732885761268427e-07, "loss": 0.4453, "step": 3850 }, { "epoch": 0.96375, "grad_norm": 0.984375, "learning_rate": 1.619408280743917e-07, "loss": 0.3943, "step": 3855 }, { "epoch": 0.965, "grad_norm": 0.90234375, "learning_rate": 1.509761136205101e-07, "loss": 0.4144, "step": 3860 }, { "epoch": 0.96625, "grad_norm": 1.046875, "learning_rate": 1.4039460185465703e-07, "loss": 0.4026, "step": 3865 }, { "epoch": 0.9675, "grad_norm": 1.0625, "learning_rate": 1.3019645595683806e-07, "loss": 0.432, "step": 3870 }, { "epoch": 0.96875, "grad_norm": 1.2890625, "learning_rate": 1.2038183319507955e-07, "loss": 0.4384, "step": 3875 }, { "epoch": 0.97, "grad_norm": 1.328125, "learning_rate": 1.109508849230001e-07, "loss": 0.4622, "step": 3880 }, { "epoch": 0.97125, "grad_norm": 1.265625, "learning_rate": 1.0190375657749274e-07, "loss": 0.4136, "step": 3885 }, { "epoch": 0.9725, "grad_norm": 1.359375, "learning_rate": 9.324058767646859e-08, "loss": 0.4417, "step": 3890 }, { "epoch": 0.97375, "grad_norm": 1.0625, "learning_rate": 8.496151181670852e-08, "loss": 0.4316, "step": 3895 }, { "epoch": 0.975, "grad_norm": 0.8515625, "learning_rate": 7.706665667180091e-08, "loss": 0.4246, "step": 3900 }, { "epoch": 0.97625, "grad_norm": 1.3515625, "learning_rate": 6.955614399018206e-08, "loss": 0.4481, "step": 3905 }, { "epoch": 0.9775, "grad_norm": 1.1171875, "learning_rate": 6.243008959324892e-08, "loss": 0.4544, "step": 3910 }, { "epoch": 0.97875, "grad_norm": 1.3671875, "learning_rate": 5.568860337357151e-08, "loss": 0.4005, "step": 3915 }, { "epoch": 0.98, "grad_norm": 1.265625, "learning_rate": 4.9331789293211026e-08, "loss": 0.4955, "step": 3920 }, { "epoch": 0.98125, "grad_norm": 1.5546875, "learning_rate": 4.335974538210441e-08, "loss": 0.4102, "step": 3925 }, { "epoch": 0.9825, "grad_norm": 1.2421875, "learning_rate": 3.7772563736551694e-08, "loss": 0.4542, "step": 3930 }, { "epoch": 0.98375, "grad_norm": 1.2421875, "learning_rate": 3.2570330517811555e-08, "loss": 0.4691, "step": 3935 }, { "epoch": 0.985, "grad_norm": 0.9140625, "learning_rate": 2.7753125950752413e-08, "loss": 0.4155, "step": 3940 }, { "epoch": 0.98625, "grad_norm": 1.3125, "learning_rate": 2.3321024322625617e-08, "loss": 0.4268, "step": 3945 }, { "epoch": 0.9875, "grad_norm": 1.0546875, "learning_rate": 1.9274093981927478e-08, "loss": 0.4098, "step": 3950 }, { "epoch": 0.98875, "grad_norm": 1.171875, "learning_rate": 1.5612397337325113e-08, "loss": 0.3965, "step": 3955 }, { "epoch": 0.99, "grad_norm": 1.2421875, "learning_rate": 1.233599085671e-08, "loss": 0.4047, "step": 3960 }, { "epoch": 0.99125, "grad_norm": 1.3125, "learning_rate": 9.444925066329213e-09, "loss": 0.4358, "step": 3965 }, { "epoch": 0.9925, "grad_norm": 0.88671875, "learning_rate": 6.939244549986068e-09, "loss": 0.465, "step": 3970 }, { "epoch": 0.99375, "grad_norm": 1.109375, "learning_rate": 4.818987948379539e-09, "loss": 0.4483, "step": 3975 }, { "epoch": 0.995, "grad_norm": 1.46875, "learning_rate": 3.0841879584853073e-09, "loss": 0.4986, "step": 3980 }, { "epoch": 0.99625, "grad_norm": 1.2734375, "learning_rate": 1.7348713330672671e-09, "loss": 0.4121, "step": 3985 }, { "epoch": 0.9975, "grad_norm": 1.078125, "learning_rate": 7.710588802584129e-10, "loss": 0.3764, "step": 3990 }, { "epoch": 0.99875, "grad_norm": 1.125, "learning_rate": 1.9276546323609978e-10, "loss": 0.4443, "step": 3995 }, { "epoch": 1.0, "grad_norm": 0.8515625, "learning_rate": 0.0, "loss": 0.4357, "step": 4000 } ], "logging_steps": 5, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.184022656018907e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }