diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,5633 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 500, - "global_step": 4000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.00125, - "grad_norm": 5.53125, - "learning_rate": 4.999980723453676e-05, - "loss": 0.6803, - "step": 5 - }, - { - "epoch": 0.0025, - "grad_norm": 3.0, - "learning_rate": 4.9999228941119745e-05, - "loss": 0.6176, - "step": 10 - }, - { - "epoch": 0.00375, - "grad_norm": 1.8984375, - "learning_rate": 4.999826512866693e-05, - "loss": 0.5313, - "step": 15 - }, - { - "epoch": 0.005, - "grad_norm": 1.6015625, - "learning_rate": 4.999691581204152e-05, - "loss": 0.5659, - "step": 20 - }, - { - "epoch": 0.00625, - "grad_norm": 1.5703125, - "learning_rate": 4.9995181012051625e-05, - "loss": 0.5822, - "step": 25 - }, - { - "epoch": 0.0075, - "grad_norm": 1.515625, - "learning_rate": 4.9993060755450015e-05, - "loss": 0.5849, - "step": 30 - }, - { - "epoch": 0.00875, - "grad_norm": 1.4375, - "learning_rate": 4.999055507493368e-05, - "loss": 0.55, - "step": 35 - }, - { - "epoch": 0.01, - "grad_norm": 1.515625, - "learning_rate": 4.998766400914329e-05, - "loss": 0.5323, - "step": 40 - }, - { - "epoch": 0.01125, - "grad_norm": 1.6640625, - "learning_rate": 4.9984387602662675e-05, - "loss": 0.5775, - "step": 45 - }, - { - "epoch": 0.0125, - "grad_norm": 1.8359375, - "learning_rate": 4.9980725906018074e-05, - "loss": 0.5523, - "step": 50 - }, - { - "epoch": 0.01375, - "grad_norm": 1.3984375, - "learning_rate": 4.9976678975677376e-05, - "loss": 0.6089, - "step": 55 - }, - { - "epoch": 0.015, - "grad_norm": 1.40625, - "learning_rate": 4.9972246874049254e-05, - "loss": 0.5445, - "step": 60 - }, - { - "epoch": 0.01625, - "grad_norm": 1.375, - "learning_rate": 4.996742966948219e-05, - "loss": 0.5256, - "step": 65 - }, - { - "epoch": 0.0175, - "grad_norm": 1.359375, - "learning_rate": 4.9962227436263453e-05, - "loss": 0.5118, - "step": 70 - }, - { - "epoch": 0.01875, - "grad_norm": 1.2421875, - "learning_rate": 4.9956640254617906e-05, - "loss": 0.5458, - "step": 75 - }, - { - "epoch": 0.02, - "grad_norm": 1.484375, - "learning_rate": 4.995066821070679e-05, - "loss": 0.5946, - "step": 80 - }, - { - "epoch": 0.02125, - "grad_norm": 1.4375, - "learning_rate": 4.994431139662643e-05, - "loss": 0.515, - "step": 85 - }, - { - "epoch": 0.0225, - "grad_norm": 1.4140625, - "learning_rate": 4.9937569910406756e-05, - "loss": 0.5501, - "step": 90 - }, - { - "epoch": 0.02375, - "grad_norm": 1.265625, - "learning_rate": 4.9930443856009826e-05, - "loss": 0.5475, - "step": 95 - }, - { - "epoch": 0.025, - "grad_norm": 1.46875, - "learning_rate": 4.99229333433282e-05, - "loss": 0.5625, - "step": 100 - }, - { - "epoch": 0.02625, - "grad_norm": 1.578125, - "learning_rate": 4.9915038488183295e-05, - "loss": 0.5627, - "step": 105 - }, - { - "epoch": 0.0275, - "grad_norm": 1.640625, - "learning_rate": 4.990675941232353e-05, - "loss": 0.5561, - "step": 110 - }, - { - "epoch": 0.02875, - "grad_norm": 1.3046875, - "learning_rate": 4.989809624342251e-05, - "loss": 0.5254, - "step": 115 - }, - { - "epoch": 0.03, - "grad_norm": 1.0859375, - "learning_rate": 4.9889049115077005e-05, - "loss": 0.5184, - "step": 120 - }, - { - "epoch": 0.03125, - "grad_norm": 1.390625, - "learning_rate": 4.987961816680492e-05, - "loss": 0.5563, - "step": 125 - }, - { - "epoch": 0.0325, - "grad_norm": 1.6875, - "learning_rate": 4.9869803544043166e-05, - "loss": 0.5536, - "step": 130 - }, - { - "epoch": 0.03375, - "grad_norm": 1.109375, - "learning_rate": 4.985960539814535e-05, - "loss": 0.5544, - "step": 135 - }, - { - "epoch": 0.035, - "grad_norm": 1.453125, - "learning_rate": 4.98490238863795e-05, - "loss": 0.5117, - "step": 140 - }, - { - "epoch": 0.03625, - "grad_norm": 1.421875, - "learning_rate": 4.983805917192561e-05, - "loss": 0.5125, - "step": 145 - }, - { - "epoch": 0.0375, - "grad_norm": 1.1875, - "learning_rate": 4.982671142387316e-05, - "loss": 0.5563, - "step": 150 - }, - { - "epoch": 0.03875, - "grad_norm": 1.375, - "learning_rate": 4.9814980817218447e-05, - "loss": 0.5408, - "step": 155 - }, - { - "epoch": 0.04, - "grad_norm": 1.1796875, - "learning_rate": 4.980286753286195e-05, - "loss": 0.5249, - "step": 160 - }, - { - "epoch": 0.04125, - "grad_norm": 1.546875, - "learning_rate": 4.979037175760548e-05, - "loss": 0.546, - "step": 165 - }, - { - "epoch": 0.0425, - "grad_norm": 1.28125, - "learning_rate": 4.9777493684149375e-05, - "loss": 0.5019, - "step": 170 - }, - { - "epoch": 0.04375, - "grad_norm": 1.453125, - "learning_rate": 4.976423351108943e-05, - "loss": 0.5364, - "step": 175 - }, - { - "epoch": 0.045, - "grad_norm": 1.5234375, - "learning_rate": 4.975059144291394e-05, - "loss": 0.5504, - "step": 180 - }, - { - "epoch": 0.04625, - "grad_norm": 1.2109375, - "learning_rate": 4.973656769000046e-05, - "loss": 0.4682, - "step": 185 - }, - { - "epoch": 0.0475, - "grad_norm": 1.328125, - "learning_rate": 4.972216246861262e-05, - "loss": 0.5262, - "step": 190 - }, - { - "epoch": 0.04875, - "grad_norm": 1.09375, - "learning_rate": 4.9707376000896736e-05, - "loss": 0.5343, - "step": 195 - }, - { - "epoch": 0.05, - "grad_norm": 1.3671875, - "learning_rate": 4.9692208514878444e-05, - "loss": 0.5171, - "step": 200 - }, - { - "epoch": 0.05125, - "grad_norm": 1.1640625, - "learning_rate": 4.967666024445914e-05, - "loss": 0.5454, - "step": 205 - }, - { - "epoch": 0.0525, - "grad_norm": 1.25, - "learning_rate": 4.966073142941239e-05, - "loss": 0.5378, - "step": 210 - }, - { - "epoch": 0.05375, - "grad_norm": 1.140625, - "learning_rate": 4.9644422315380225e-05, - "loss": 0.4792, - "step": 215 - }, - { - "epoch": 0.055, - "grad_norm": 1.46875, - "learning_rate": 4.962773315386935e-05, - "loss": 0.5336, - "step": 220 - }, - { - "epoch": 0.05625, - "grad_norm": 1.359375, - "learning_rate": 4.9610664202247294e-05, - "loss": 0.5293, - "step": 225 - }, - { - "epoch": 0.0575, - "grad_norm": 1.0546875, - "learning_rate": 4.9593215723738404e-05, - "loss": 0.4678, - "step": 230 - }, - { - "epoch": 0.05875, - "grad_norm": 1.421875, - "learning_rate": 4.957538798741979e-05, - "loss": 0.549, - "step": 235 - }, - { - "epoch": 0.06, - "grad_norm": 1.3046875, - "learning_rate": 4.9557181268217227e-05, - "loss": 0.5642, - "step": 240 - }, - { - "epoch": 0.06125, - "grad_norm": 1.2734375, - "learning_rate": 4.953859584690082e-05, - "loss": 0.5544, - "step": 245 - }, - { - "epoch": 0.0625, - "grad_norm": 1.25, - "learning_rate": 4.951963201008076e-05, - "loss": 0.5604, - "step": 250 - }, - { - "epoch": 0.06375, - "grad_norm": 1.15625, - "learning_rate": 4.9500290050202894e-05, - "loss": 0.5349, - "step": 255 - }, - { - "epoch": 0.065, - "grad_norm": 1.5234375, - "learning_rate": 4.9480570265544144e-05, - "loss": 0.5393, - "step": 260 - }, - { - "epoch": 0.06625, - "grad_norm": 1.2109375, - "learning_rate": 4.9460472960208e-05, - "loss": 0.527, - "step": 265 - }, - { - "epoch": 0.0675, - "grad_norm": 1.203125, - "learning_rate": 4.943999844411977e-05, - "loss": 0.4982, - "step": 270 - }, - { - "epoch": 0.06875, - "grad_norm": 0.9765625, - "learning_rate": 4.9419147033021814e-05, - "loss": 0.4377, - "step": 275 - }, - { - "epoch": 0.07, - "grad_norm": 1.453125, - "learning_rate": 4.939791904846869e-05, - "loss": 0.4919, - "step": 280 - }, - { - "epoch": 0.07125, - "grad_norm": 1.7421875, - "learning_rate": 4.937631481782218e-05, - "loss": 0.5107, - "step": 285 - }, - { - "epoch": 0.0725, - "grad_norm": 1.4921875, - "learning_rate": 4.935433467424624e-05, - "loss": 0.5611, - "step": 290 - }, - { - "epoch": 0.07375, - "grad_norm": 1.8984375, - "learning_rate": 4.9331978956701875e-05, - "loss": 0.534, - "step": 295 - }, - { - "epoch": 0.075, - "grad_norm": 1.3515625, - "learning_rate": 4.9309248009941914e-05, - "loss": 0.5319, - "step": 300 - }, - { - "epoch": 0.07625, - "grad_norm": 1.2265625, - "learning_rate": 4.928614218450568e-05, - "loss": 0.4805, - "step": 305 - }, - { - "epoch": 0.0775, - "grad_norm": 1.390625, - "learning_rate": 4.9262661836713564e-05, - "loss": 0.4656, - "step": 310 - }, - { - "epoch": 0.07875, - "grad_norm": 1.0546875, - "learning_rate": 4.923880732866159e-05, - "loss": 0.5328, - "step": 315 - }, - { - "epoch": 0.08, - "grad_norm": 1.0, - "learning_rate": 4.9214579028215776e-05, - "loss": 0.4949, - "step": 320 - }, - { - "epoch": 0.08125, - "grad_norm": 1.2109375, - "learning_rate": 4.9189977309006495e-05, - "loss": 0.5222, - "step": 325 - }, - { - "epoch": 0.0825, - "grad_norm": 1.2421875, - "learning_rate": 4.916500255042268e-05, - "loss": 0.5029, - "step": 330 - }, - { - "epoch": 0.08375, - "grad_norm": 1.0546875, - "learning_rate": 4.9139655137606015e-05, - "loss": 0.5188, - "step": 335 - }, - { - "epoch": 0.085, - "grad_norm": 1.109375, - "learning_rate": 4.9113935461444955e-05, - "loss": 0.5651, - "step": 340 - }, - { - "epoch": 0.08625, - "grad_norm": 1.046875, - "learning_rate": 4.908784391856872e-05, - "loss": 0.4586, - "step": 345 - }, - { - "epoch": 0.0875, - "grad_norm": 1.0625, - "learning_rate": 4.906138091134118e-05, - "loss": 0.539, - "step": 350 - }, - { - "epoch": 0.08875, - "grad_norm": 1.3828125, - "learning_rate": 4.9034546847854656e-05, - "loss": 0.5331, - "step": 355 - }, - { - "epoch": 0.09, - "grad_norm": 1.015625, - "learning_rate": 4.900734214192358e-05, - "loss": 0.4227, - "step": 360 - }, - { - "epoch": 0.09125, - "grad_norm": 1.28125, - "learning_rate": 4.897976721307819e-05, - "loss": 0.5005, - "step": 365 - }, - { - "epoch": 0.0925, - "grad_norm": 1.1171875, - "learning_rate": 4.8951822486557986e-05, - "loss": 0.5294, - "step": 370 - }, - { - "epoch": 0.09375, - "grad_norm": 1.375, - "learning_rate": 4.892350839330522e-05, - "loss": 0.5729, - "step": 375 - }, - { - "epoch": 0.095, - "grad_norm": 0.95703125, - "learning_rate": 4.8894825369958255e-05, - "loss": 0.4837, - "step": 380 - }, - { - "epoch": 0.09625, - "grad_norm": 1.5078125, - "learning_rate": 4.8865773858844776e-05, - "loss": 0.5266, - "step": 385 - }, - { - "epoch": 0.0975, - "grad_norm": 1.375, - "learning_rate": 4.8836354307975026e-05, - "loss": 0.5329, - "step": 390 - }, - { - "epoch": 0.09875, - "grad_norm": 1.4921875, - "learning_rate": 4.880656717103489e-05, - "loss": 0.5096, - "step": 395 - }, - { - "epoch": 0.1, - "grad_norm": 1.171875, - "learning_rate": 4.877641290737884e-05, - "loss": 0.4919, - "step": 400 - }, - { - "epoch": 0.10125, - "grad_norm": 1.078125, - "learning_rate": 4.874589198202294e-05, - "loss": 0.4633, - "step": 405 - }, - { - "epoch": 0.1025, - "grad_norm": 1.4296875, - "learning_rate": 4.8715004865637614e-05, - "loss": 0.4981, - "step": 410 - }, - { - "epoch": 0.10375, - "grad_norm": 1.109375, - "learning_rate": 4.868375203454041e-05, - "loss": 0.4699, - "step": 415 - }, - { - "epoch": 0.105, - "grad_norm": 1.1796875, - "learning_rate": 4.8652133970688636e-05, - "loss": 0.5086, - "step": 420 - }, - { - "epoch": 0.10625, - "grad_norm": 1.1015625, - "learning_rate": 4.862015116167196e-05, - "loss": 0.5406, - "step": 425 - }, - { - "epoch": 0.1075, - "grad_norm": 1.640625, - "learning_rate": 4.8587804100704845e-05, - "loss": 0.5456, - "step": 430 - }, - { - "epoch": 0.10875, - "grad_norm": 1.1875, - "learning_rate": 4.8555093286618995e-05, - "loss": 0.5107, - "step": 435 - }, - { - "epoch": 0.11, - "grad_norm": 1.34375, - "learning_rate": 4.852201922385564e-05, - "loss": 0.4078, - "step": 440 - }, - { - "epoch": 0.11125, - "grad_norm": 1.3046875, - "learning_rate": 4.848858242245773e-05, - "loss": 0.4958, - "step": 445 - }, - { - "epoch": 0.1125, - "grad_norm": 1.4765625, - "learning_rate": 4.8454783398062106e-05, - "loss": 0.4822, - "step": 450 - }, - { - "epoch": 0.11375, - "grad_norm": 1.546875, - "learning_rate": 4.8420622671891533e-05, - "loss": 0.5489, - "step": 455 - }, - { - "epoch": 0.115, - "grad_norm": 1.0703125, - "learning_rate": 4.838610077074669e-05, - "loss": 0.4884, - "step": 460 - }, - { - "epoch": 0.11625, - "grad_norm": 1.609375, - "learning_rate": 4.835121822699796e-05, - "loss": 0.529, - "step": 465 - }, - { - "epoch": 0.1175, - "grad_norm": 1.234375, - "learning_rate": 4.8315975578577355e-05, - "loss": 0.5414, - "step": 470 - }, - { - "epoch": 0.11875, - "grad_norm": 1.5078125, - "learning_rate": 4.828037336897009e-05, - "loss": 0.4749, - "step": 475 - }, - { - "epoch": 0.12, - "grad_norm": 0.9765625, - "learning_rate": 4.8244412147206284e-05, - "loss": 0.5205, - "step": 480 - }, - { - "epoch": 0.12125, - "grad_norm": 1.2890625, - "learning_rate": 4.820809246785247e-05, - "loss": 0.5343, - "step": 485 - }, - { - "epoch": 0.1225, - "grad_norm": 1.3828125, - "learning_rate": 4.817141489100302e-05, - "loss": 0.5324, - "step": 490 - }, - { - "epoch": 0.12375, - "grad_norm": 1.546875, - "learning_rate": 4.8134379982271556e-05, - "loss": 0.5451, - "step": 495 - }, - { - "epoch": 0.125, - "grad_norm": 1.390625, - "learning_rate": 4.8096988312782174e-05, - "loss": 0.5428, - "step": 500 - }, - { - "epoch": 0.12625, - "grad_norm": 1.0546875, - "learning_rate": 4.805924045916067e-05, - "loss": 0.5002, - "step": 505 - }, - { - "epoch": 0.1275, - "grad_norm": 1.2890625, - "learning_rate": 4.8021137003525664e-05, - "loss": 0.5277, - "step": 510 - }, - { - "epoch": 0.12875, - "grad_norm": 1.3125, - "learning_rate": 4.7982678533479555e-05, - "loss": 0.5185, - "step": 515 - }, - { - "epoch": 0.13, - "grad_norm": 1.3125, - "learning_rate": 4.794386564209953e-05, - "loss": 0.501, - "step": 520 - }, - { - "epoch": 0.13125, - "grad_norm": 1.4453125, - "learning_rate": 4.7904698927928406e-05, - "loss": 0.4903, - "step": 525 - }, - { - "epoch": 0.1325, - "grad_norm": 1.1484375, - "learning_rate": 4.7865178994965344e-05, - "loss": 0.4764, - "step": 530 - }, - { - "epoch": 0.13375, - "grad_norm": 1.3515625, - "learning_rate": 4.782530645265661e-05, - "loss": 0.5046, - "step": 535 - }, - { - "epoch": 0.135, - "grad_norm": 1.25, - "learning_rate": 4.7785081915886134e-05, - "loss": 0.4849, - "step": 540 - }, - { - "epoch": 0.13625, - "grad_norm": 1.4375, - "learning_rate": 4.7744506004966025e-05, - "loss": 0.4874, - "step": 545 - }, - { - "epoch": 0.1375, - "grad_norm": 1.1796875, - "learning_rate": 4.7703579345627035e-05, - "loss": 0.5632, - "step": 550 - }, - { - "epoch": 0.13875, - "grad_norm": 1.1953125, - "learning_rate": 4.766230256900887e-05, - "loss": 0.4894, - "step": 555 - }, - { - "epoch": 0.14, - "grad_norm": 0.9140625, - "learning_rate": 4.762067631165049e-05, - "loss": 0.4819, - "step": 560 - }, - { - "epoch": 0.14125, - "grad_norm": 1.3203125, - "learning_rate": 4.7578701215480284e-05, - "loss": 0.4872, - "step": 565 - }, - { - "epoch": 0.1425, - "grad_norm": 1.6328125, - "learning_rate": 4.753637792780614e-05, - "loss": 0.5274, - "step": 570 - }, - { - "epoch": 0.14375, - "grad_norm": 1.359375, - "learning_rate": 4.749370710130554e-05, - "loss": 0.5052, - "step": 575 - }, - { - "epoch": 0.145, - "grad_norm": 1.2109375, - "learning_rate": 4.745068939401539e-05, - "loss": 0.4819, - "step": 580 - }, - { - "epoch": 0.14625, - "grad_norm": 1.4296875, - "learning_rate": 4.740732546932197e-05, - "loss": 0.5159, - "step": 585 - }, - { - "epoch": 0.1475, - "grad_norm": 1.203125, - "learning_rate": 4.7363615995950626e-05, - "loss": 0.5338, - "step": 590 - }, - { - "epoch": 0.14875, - "grad_norm": 1.1171875, - "learning_rate": 4.7319561647955526e-05, - "loss": 0.4797, - "step": 595 - }, - { - "epoch": 0.15, - "grad_norm": 1.2578125, - "learning_rate": 4.72751631047092e-05, - "loss": 0.5453, - "step": 600 - }, - { - "epoch": 0.15125, - "grad_norm": 1.234375, - "learning_rate": 4.7230421050892116e-05, - "loss": 0.5009, - "step": 605 - }, - { - "epoch": 0.1525, - "grad_norm": 1.5625, - "learning_rate": 4.718533617648209e-05, - "loss": 0.4602, - "step": 610 - }, - { - "epoch": 0.15375, - "grad_norm": 1.484375, - "learning_rate": 4.713990917674365e-05, - "loss": 0.5399, - "step": 615 - }, - { - "epoch": 0.155, - "grad_norm": 1.3828125, - "learning_rate": 4.709414075221734e-05, - "loss": 0.5006, - "step": 620 - }, - { - "epoch": 0.15625, - "grad_norm": 1.0546875, - "learning_rate": 4.7048031608708876e-05, - "loss": 0.4784, - "step": 625 - }, - { - "epoch": 0.1575, - "grad_norm": 1.1953125, - "learning_rate": 4.7001582457278304e-05, - "loss": 0.4764, - "step": 630 - }, - { - "epoch": 0.15875, - "grad_norm": 1.03125, - "learning_rate": 4.695479401422898e-05, - "loss": 0.5003, - "step": 635 - }, - { - "epoch": 0.16, - "grad_norm": 1.9921875, - "learning_rate": 4.690766700109659e-05, - "loss": 0.4586, - "step": 640 - }, - { - "epoch": 0.16125, - "grad_norm": 1.5234375, - "learning_rate": 4.686020214463798e-05, - "loss": 0.5272, - "step": 645 - }, - { - "epoch": 0.1625, - "grad_norm": 1.3203125, - "learning_rate": 4.681240017681993e-05, - "loss": 0.5626, - "step": 650 - }, - { - "epoch": 0.16375, - "grad_norm": 1.3984375, - "learning_rate": 4.676426183480794e-05, - "loss": 0.5696, - "step": 655 - }, - { - "epoch": 0.165, - "grad_norm": 1.1640625, - "learning_rate": 4.671578786095478e-05, - "loss": 0.5391, - "step": 660 - }, - { - "epoch": 0.16625, - "grad_norm": 1.1484375, - "learning_rate": 4.6666979002789105e-05, - "loss": 0.5195, - "step": 665 - }, - { - "epoch": 0.1675, - "grad_norm": 1.5, - "learning_rate": 4.661783601300388e-05, - "loss": 0.4973, - "step": 670 - }, - { - "epoch": 0.16875, - "grad_norm": 1.2890625, - "learning_rate": 4.65683596494448e-05, - "loss": 0.4741, - "step": 675 - }, - { - "epoch": 0.17, - "grad_norm": 1.265625, - "learning_rate": 4.65185506750986e-05, - "loss": 0.4684, - "step": 680 - }, - { - "epoch": 0.17125, - "grad_norm": 1.5390625, - "learning_rate": 4.646840985808126e-05, - "loss": 0.5307, - "step": 685 - }, - { - "epoch": 0.1725, - "grad_norm": 1.296875, - "learning_rate": 4.6417937971626245e-05, - "loss": 0.5154, - "step": 690 - }, - { - "epoch": 0.17375, - "grad_norm": 1.265625, - "learning_rate": 4.636713579407245e-05, - "loss": 0.5348, - "step": 695 - }, - { - "epoch": 0.175, - "grad_norm": 1.1640625, - "learning_rate": 4.6316004108852305e-05, - "loss": 0.477, - "step": 700 - }, - { - "epoch": 0.17625, - "grad_norm": 1.1640625, - "learning_rate": 4.6264543704479655e-05, - "loss": 0.4989, - "step": 705 - }, - { - "epoch": 0.1775, - "grad_norm": 1.2421875, - "learning_rate": 4.6212755374537596e-05, - "loss": 0.5109, - "step": 710 - }, - { - "epoch": 0.17875, - "grad_norm": 1.0625, - "learning_rate": 4.616063991766623e-05, - "loss": 0.48, - "step": 715 - }, - { - "epoch": 0.18, - "grad_norm": 1.3984375, - "learning_rate": 4.610819813755038e-05, - "loss": 0.5151, - "step": 720 - }, - { - "epoch": 0.18125, - "grad_norm": 0.9453125, - "learning_rate": 4.6055430842907167e-05, - "loss": 0.5235, - "step": 725 - }, - { - "epoch": 0.1825, - "grad_norm": 1.5078125, - "learning_rate": 4.600233884747355e-05, - "loss": 0.5006, - "step": 730 - }, - { - "epoch": 0.18375, - "grad_norm": 1.234375, - "learning_rate": 4.594892296999378e-05, - "loss": 0.479, - "step": 735 - }, - { - "epoch": 0.185, - "grad_norm": 1.015625, - "learning_rate": 4.5895184034206765e-05, - "loss": 0.4807, - "step": 740 - }, - { - "epoch": 0.18625, - "grad_norm": 1.40625, - "learning_rate": 4.5841122868833364e-05, - "loss": 0.5189, - "step": 745 - }, - { - "epoch": 0.1875, - "grad_norm": 1.1953125, - "learning_rate": 4.5786740307563636e-05, - "loss": 0.4768, - "step": 750 - }, - { - "epoch": 0.18875, - "grad_norm": 1.5234375, - "learning_rate": 4.573203718904394e-05, - "loss": 0.4747, - "step": 755 - }, - { - "epoch": 0.19, - "grad_norm": 1.5703125, - "learning_rate": 4.567701435686404e-05, - "loss": 0.4756, - "step": 760 - }, - { - "epoch": 0.19125, - "grad_norm": 1.1171875, - "learning_rate": 4.562167265954409e-05, - "loss": 0.5102, - "step": 765 - }, - { - "epoch": 0.1925, - "grad_norm": 1.953125, - "learning_rate": 4.55660129505215e-05, - "loss": 0.5229, - "step": 770 - }, - { - "epoch": 0.19375, - "grad_norm": 1.203125, - "learning_rate": 4.551003608813784e-05, - "loss": 0.5103, - "step": 775 - }, - { - "epoch": 0.195, - "grad_norm": 1.21875, - "learning_rate": 4.545374293562559e-05, - "loss": 0.5216, - "step": 780 - }, - { - "epoch": 0.19625, - "grad_norm": 1.078125, - "learning_rate": 4.5397134361094786e-05, - "loss": 0.5039, - "step": 785 - }, - { - "epoch": 0.1975, - "grad_norm": 1.3828125, - "learning_rate": 4.534021123751968e-05, - "loss": 0.4834, - "step": 790 - }, - { - "epoch": 0.19875, - "grad_norm": 1.3828125, - "learning_rate": 4.528297444272525e-05, - "loss": 0.4386, - "step": 795 - }, - { - "epoch": 0.2, - "grad_norm": 1.328125, - "learning_rate": 4.522542485937369e-05, - "loss": 0.5097, - "step": 800 - }, - { - "epoch": 0.20125, - "grad_norm": 1.125, - "learning_rate": 4.516756337495075e-05, - "loss": 0.5574, - "step": 805 - }, - { - "epoch": 0.2025, - "grad_norm": 1.1953125, - "learning_rate": 4.5109390881752114e-05, - "loss": 0.5492, - "step": 810 - }, - { - "epoch": 0.20375, - "grad_norm": 1.109375, - "learning_rate": 4.5050908276869586e-05, - "loss": 0.5281, - "step": 815 - }, - { - "epoch": 0.205, - "grad_norm": 1.6484375, - "learning_rate": 4.499211646217727e-05, - "loss": 0.5042, - "step": 820 - }, - { - "epoch": 0.20625, - "grad_norm": 0.9765625, - "learning_rate": 4.493301634431768e-05, - "loss": 0.4746, - "step": 825 - }, - { - "epoch": 0.2075, - "grad_norm": 1.3046875, - "learning_rate": 4.487360883468775e-05, - "loss": 0.5611, - "step": 830 - }, - { - "epoch": 0.20875, - "grad_norm": 1.703125, - "learning_rate": 4.481389484942478e-05, - "loss": 0.5058, - "step": 835 - }, - { - "epoch": 0.21, - "grad_norm": 1.0703125, - "learning_rate": 4.4753875309392266e-05, - "loss": 0.4352, - "step": 840 - }, - { - "epoch": 0.21125, - "grad_norm": 1.125, - "learning_rate": 4.469355114016577e-05, - "loss": 0.4849, - "step": 845 - }, - { - "epoch": 0.2125, - "grad_norm": 1.1171875, - "learning_rate": 4.463292327201862e-05, - "loss": 0.5195, - "step": 850 - }, - { - "epoch": 0.21375, - "grad_norm": 1.09375, - "learning_rate": 4.4571992639907545e-05, - "loss": 0.3864, - "step": 855 - }, - { - "epoch": 0.215, - "grad_norm": 1.2578125, - "learning_rate": 4.451076018345825e-05, - "loss": 0.4903, - "step": 860 - }, - { - "epoch": 0.21625, - "grad_norm": 1.21875, - "learning_rate": 4.444922684695097e-05, - "loss": 0.5126, - "step": 865 - }, - { - "epoch": 0.2175, - "grad_norm": 1.1796875, - "learning_rate": 4.4387393579305865e-05, - "loss": 0.4958, - "step": 870 - }, - { - "epoch": 0.21875, - "grad_norm": 1.5078125, - "learning_rate": 4.4325261334068426e-05, - "loss": 0.5307, - "step": 875 - }, - { - "epoch": 0.22, - "grad_norm": 1.5, - "learning_rate": 4.426283106939474e-05, - "loss": 0.5048, - "step": 880 - }, - { - "epoch": 0.22125, - "grad_norm": 1.0, - "learning_rate": 4.4200103748036695e-05, - "loss": 0.4757, - "step": 885 - }, - { - "epoch": 0.2225, - "grad_norm": 1.359375, - "learning_rate": 4.4137080337327205e-05, - "loss": 0.5525, - "step": 890 - }, - { - "epoch": 0.22375, - "grad_norm": 1.5390625, - "learning_rate": 4.407376180916522e-05, - "loss": 0.4781, - "step": 895 - }, - { - "epoch": 0.225, - "grad_norm": 1.28125, - "learning_rate": 4.401014914000078e-05, - "loss": 0.4797, - "step": 900 - }, - { - "epoch": 0.22625, - "grad_norm": 1.484375, - "learning_rate": 4.3946243310819926e-05, - "loss": 0.5529, - "step": 905 - }, - { - "epoch": 0.2275, - "grad_norm": 1.265625, - "learning_rate": 4.3882045307129594e-05, - "loss": 0.4669, - "step": 910 - }, - { - "epoch": 0.22875, - "grad_norm": 1.234375, - "learning_rate": 4.3817556118942425e-05, - "loss": 0.5328, - "step": 915 - }, - { - "epoch": 0.23, - "grad_norm": 1.4140625, - "learning_rate": 4.375277674076149e-05, - "loss": 0.5347, - "step": 920 - }, - { - "epoch": 0.23125, - "grad_norm": 1.15625, - "learning_rate": 4.3687708171564925e-05, - "loss": 0.4615, - "step": 925 - }, - { - "epoch": 0.2325, - "grad_norm": 1.1328125, - "learning_rate": 4.3622351414790554e-05, - "loss": 0.5132, - "step": 930 - }, - { - "epoch": 0.23375, - "grad_norm": 1.1171875, - "learning_rate": 4.355670747832042e-05, - "loss": 0.4063, - "step": 935 - }, - { - "epoch": 0.235, - "grad_norm": 1.1171875, - "learning_rate": 4.349077737446525e-05, - "loss": 0.493, - "step": 940 - }, - { - "epoch": 0.23625, - "grad_norm": 1.078125, - "learning_rate": 4.3424562119948776e-05, - "loss": 0.4826, - "step": 945 - }, - { - "epoch": 0.2375, - "grad_norm": 1.0546875, - "learning_rate": 4.335806273589214e-05, - "loss": 0.4726, - "step": 950 - }, - { - "epoch": 0.23875, - "grad_norm": 1.59375, - "learning_rate": 4.329128024779812e-05, - "loss": 0.4672, - "step": 955 - }, - { - "epoch": 0.24, - "grad_norm": 0.91796875, - "learning_rate": 4.3224215685535294e-05, - "loss": 0.4467, - "step": 960 - }, - { - "epoch": 0.24125, - "grad_norm": 1.484375, - "learning_rate": 4.315687008332217e-05, - "loss": 0.5019, - "step": 965 - }, - { - "epoch": 0.2425, - "grad_norm": 1.2265625, - "learning_rate": 4.3089244479711236e-05, - "loss": 0.526, - "step": 970 - }, - { - "epoch": 0.24375, - "grad_norm": 1.2890625, - "learning_rate": 4.302133991757297e-05, - "loss": 0.509, - "step": 975 - }, - { - "epoch": 0.245, - "grad_norm": 0.953125, - "learning_rate": 4.295315744407972e-05, - "loss": 0.447, - "step": 980 - }, - { - "epoch": 0.24625, - "grad_norm": 1.2578125, - "learning_rate": 4.2884698110689575e-05, - "loss": 0.4927, - "step": 985 - }, - { - "epoch": 0.2475, - "grad_norm": 1.578125, - "learning_rate": 4.281596297313013e-05, - "loss": 0.4891, - "step": 990 - }, - { - "epoch": 0.24875, - "grad_norm": 2.21875, - "learning_rate": 4.274695309138226e-05, - "loss": 0.5046, - "step": 995 - }, - { - "epoch": 0.25, - "grad_norm": 1.1796875, - "learning_rate": 4.267766952966369e-05, - "loss": 0.4642, - "step": 1000 - }, - { - "epoch": 0.25125, - "grad_norm": 1.109375, - "learning_rate": 4.260811335641266e-05, - "loss": 0.4396, - "step": 1005 - }, - { - "epoch": 0.2525, - "grad_norm": 1.2265625, - "learning_rate": 4.25382856442714e-05, - "loss": 0.4386, - "step": 1010 - }, - { - "epoch": 0.25375, - "grad_norm": 1.546875, - "learning_rate": 4.2468187470069607e-05, - "loss": 0.5335, - "step": 1015 - }, - { - "epoch": 0.255, - "grad_norm": 1.59375, - "learning_rate": 4.2397819914807856e-05, - "loss": 0.4703, - "step": 1020 - }, - { - "epoch": 0.25625, - "grad_norm": 1.0625, - "learning_rate": 4.23271840636409e-05, - "loss": 0.5011, - "step": 1025 - }, - { - "epoch": 0.2575, - "grad_norm": 1.046875, - "learning_rate": 4.225628100586093e-05, - "loss": 0.5234, - "step": 1030 - }, - { - "epoch": 0.25875, - "grad_norm": 1.0078125, - "learning_rate": 4.218511183488082e-05, - "loss": 0.4749, - "step": 1035 - }, - { - "epoch": 0.26, - "grad_norm": 1.34375, - "learning_rate": 4.211367764821722e-05, - "loss": 0.5361, - "step": 1040 - }, - { - "epoch": 0.26125, - "grad_norm": 1.1875, - "learning_rate": 4.2041979547473665e-05, - "loss": 0.4458, - "step": 1045 - }, - { - "epoch": 0.2625, - "grad_norm": 1.140625, - "learning_rate": 4.197001863832355e-05, - "loss": 0.4517, - "step": 1050 - }, - { - "epoch": 0.26375, - "grad_norm": 0.9921875, - "learning_rate": 4.189779603049312e-05, - "loss": 0.4571, - "step": 1055 - }, - { - "epoch": 0.265, - "grad_norm": 1.15625, - "learning_rate": 4.182531283774434e-05, - "loss": 0.487, - "step": 1060 - }, - { - "epoch": 0.26625, - "grad_norm": 1.140625, - "learning_rate": 4.17525701778577e-05, - "loss": 0.5186, - "step": 1065 - }, - { - "epoch": 0.2675, - "grad_norm": 1.2578125, - "learning_rate": 4.1679569172614996e-05, - "loss": 0.4815, - "step": 1070 - }, - { - "epoch": 0.26875, - "grad_norm": 0.98046875, - "learning_rate": 4.1606310947782044e-05, - "loss": 0.4563, - "step": 1075 - }, - { - "epoch": 0.27, - "grad_norm": 1.0, - "learning_rate": 4.1532796633091296e-05, - "loss": 0.4585, - "step": 1080 - }, - { - "epoch": 0.27125, - "grad_norm": 1.1640625, - "learning_rate": 4.1459027362224436e-05, - "loss": 0.4846, - "step": 1085 - }, - { - "epoch": 0.2725, - "grad_norm": 1.03125, - "learning_rate": 4.138500427279485e-05, - "loss": 0.505, - "step": 1090 - }, - { - "epoch": 0.27375, - "grad_norm": 1.2421875, - "learning_rate": 4.1310728506330174e-05, - "loss": 0.4765, - "step": 1095 - }, - { - "epoch": 0.275, - "grad_norm": 1.5546875, - "learning_rate": 4.123620120825459e-05, - "loss": 0.5105, - "step": 1100 - }, - { - "epoch": 0.27625, - "grad_norm": 0.94140625, - "learning_rate": 4.116142352787125e-05, - "loss": 0.4193, - "step": 1105 - }, - { - "epoch": 0.2775, - "grad_norm": 1.203125, - "learning_rate": 4.1086396618344476e-05, - "loss": 0.4953, - "step": 1110 - }, - { - "epoch": 0.27875, - "grad_norm": 1.265625, - "learning_rate": 4.101112163668203e-05, - "loss": 0.4572, - "step": 1115 - }, - { - "epoch": 0.28, - "grad_norm": 1.1015625, - "learning_rate": 4.093559974371725e-05, - "loss": 0.4247, - "step": 1120 - }, - { - "epoch": 0.28125, - "grad_norm": 1.484375, - "learning_rate": 4.085983210409114e-05, - "loss": 0.483, - "step": 1125 - }, - { - "epoch": 0.2825, - "grad_norm": 0.98828125, - "learning_rate": 4.0783819886234445e-05, - "loss": 0.4787, - "step": 1130 - }, - { - "epoch": 0.28375, - "grad_norm": 1.015625, - "learning_rate": 4.0707564262349595e-05, - "loss": 0.4891, - "step": 1135 - }, - { - "epoch": 0.285, - "grad_norm": 1.5859375, - "learning_rate": 4.063106640839264e-05, - "loss": 0.503, - "step": 1140 - }, - { - "epoch": 0.28625, - "grad_norm": 1.4140625, - "learning_rate": 4.05543275040551e-05, - "loss": 0.5003, - "step": 1145 - }, - { - "epoch": 0.2875, - "grad_norm": 1.328125, - "learning_rate": 4.047734873274586e-05, - "loss": 0.5444, - "step": 1150 - }, - { - "epoch": 0.28875, - "grad_norm": 0.86328125, - "learning_rate": 4.040013128157275e-05, - "loss": 0.4193, - "step": 1155 - }, - { - "epoch": 0.29, - "grad_norm": 1.1328125, - "learning_rate": 4.0322676341324415e-05, - "loss": 0.497, - "step": 1160 - }, - { - "epoch": 0.29125, - "grad_norm": 0.94140625, - "learning_rate": 4.024498510645185e-05, - "loss": 0.377, - "step": 1165 - }, - { - "epoch": 0.2925, - "grad_norm": 1.2421875, - "learning_rate": 4.0167058775049996e-05, - "loss": 0.5118, - "step": 1170 - }, - { - "epoch": 0.29375, - "grad_norm": 1.59375, - "learning_rate": 4.008889854883929e-05, - "loss": 0.4941, - "step": 1175 - }, - { - "epoch": 0.295, - "grad_norm": 1.15625, - "learning_rate": 4.0010505633147106e-05, - "loss": 0.5302, - "step": 1180 - }, - { - "epoch": 0.29625, - "grad_norm": 1.1796875, - "learning_rate": 3.993188123688918e-05, - "loss": 0.5273, - "step": 1185 - }, - { - "epoch": 0.2975, - "grad_norm": 1.4921875, - "learning_rate": 3.985302657255097e-05, - "loss": 0.463, - "step": 1190 - }, - { - "epoch": 0.29875, - "grad_norm": 1.4453125, - "learning_rate": 3.977394285616893e-05, - "loss": 0.5116, - "step": 1195 - }, - { - "epoch": 0.3, - "grad_norm": 1.0078125, - "learning_rate": 3.969463130731183e-05, - "loss": 0.5089, - "step": 1200 - }, - { - "epoch": 0.30125, - "grad_norm": 0.890625, - "learning_rate": 3.961509314906184e-05, - "loss": 0.5043, - "step": 1205 - }, - { - "epoch": 0.3025, - "grad_norm": 1.2265625, - "learning_rate": 3.953532960799577e-05, - "loss": 0.4877, - "step": 1210 - }, - { - "epoch": 0.30375, - "grad_norm": 0.953125, - "learning_rate": 3.9455341914166075e-05, - "loss": 0.5368, - "step": 1215 - }, - { - "epoch": 0.305, - "grad_norm": 1.1015625, - "learning_rate": 3.937513130108197e-05, - "loss": 0.4303, - "step": 1220 - }, - { - "epoch": 0.30625, - "grad_norm": 0.953125, - "learning_rate": 3.9294699005690305e-05, - "loss": 0.4978, - "step": 1225 - }, - { - "epoch": 0.3075, - "grad_norm": 1.4453125, - "learning_rate": 3.92140462683566e-05, - "loss": 0.4898, - "step": 1230 - }, - { - "epoch": 0.30875, - "grad_norm": 1.3828125, - "learning_rate": 3.913317433284582e-05, - "loss": 0.4307, - "step": 1235 - }, - { - "epoch": 0.31, - "grad_norm": 1.078125, - "learning_rate": 3.905208444630327e-05, - "loss": 0.4599, - "step": 1240 - }, - { - "epoch": 0.31125, - "grad_norm": 0.94921875, - "learning_rate": 3.897077785923529e-05, - "loss": 0.4449, - "step": 1245 - }, - { - "epoch": 0.3125, - "grad_norm": 1.1796875, - "learning_rate": 3.888925582549006e-05, - "loss": 0.4508, - "step": 1250 - }, - { - "epoch": 0.31375, - "grad_norm": 1.0703125, - "learning_rate": 3.880751960223817e-05, - "loss": 0.4523, - "step": 1255 - }, - { - "epoch": 0.315, - "grad_norm": 1.203125, - "learning_rate": 3.87255704499533e-05, - "loss": 0.4782, - "step": 1260 - }, - { - "epoch": 0.31625, - "grad_norm": 1.09375, - "learning_rate": 3.864340963239275e-05, - "loss": 0.4821, - "step": 1265 - }, - { - "epoch": 0.3175, - "grad_norm": 1.0625, - "learning_rate": 3.856103841657797e-05, - "loss": 0.393, - "step": 1270 - }, - { - "epoch": 0.31875, - "grad_norm": 1.1328125, - "learning_rate": 3.847845807277502e-05, - "loss": 0.4731, - "step": 1275 - }, - { - "epoch": 0.32, - "grad_norm": 0.94140625, - "learning_rate": 3.8395669874474915e-05, - "loss": 0.4644, - "step": 1280 - }, - { - "epoch": 0.32125, - "grad_norm": 1.171875, - "learning_rate": 3.831267509837414e-05, - "loss": 0.5069, - "step": 1285 - }, - { - "epoch": 0.3225, - "grad_norm": 1.1328125, - "learning_rate": 3.822947502435477e-05, - "loss": 0.4767, - "step": 1290 - }, - { - "epoch": 0.32375, - "grad_norm": 1.296875, - "learning_rate": 3.814607093546489e-05, - "loss": 0.472, - "step": 1295 - }, - { - "epoch": 0.325, - "grad_norm": 1.109375, - "learning_rate": 3.8062464117898724e-05, - "loss": 0.4598, - "step": 1300 - }, - { - "epoch": 0.32625, - "grad_norm": 1.71875, - "learning_rate": 3.7978655860976824e-05, - "loss": 0.4794, - "step": 1305 - }, - { - "epoch": 0.3275, - "grad_norm": 1.046875, - "learning_rate": 3.789464745712619e-05, - "loss": 0.4728, - "step": 1310 - }, - { - "epoch": 0.32875, - "grad_norm": 1.2421875, - "learning_rate": 3.7810440201860334e-05, - "loss": 0.4535, - "step": 1315 - }, - { - "epoch": 0.33, - "grad_norm": 1.2578125, - "learning_rate": 3.7726035393759285e-05, - "loss": 0.4646, - "step": 1320 - }, - { - "epoch": 0.33125, - "grad_norm": 1.109375, - "learning_rate": 3.764143433444962e-05, - "loss": 0.4597, - "step": 1325 - }, - { - "epoch": 0.3325, - "grad_norm": 0.9921875, - "learning_rate": 3.755663832858432e-05, - "loss": 0.516, - "step": 1330 - }, - { - "epoch": 0.33375, - "grad_norm": 1.078125, - "learning_rate": 3.747164868382269e-05, - "loss": 0.4492, - "step": 1335 - }, - { - "epoch": 0.335, - "grad_norm": 1.4765625, - "learning_rate": 3.7386466710810194e-05, - "loss": 0.4644, - "step": 1340 - }, - { - "epoch": 0.33625, - "grad_norm": 1.03125, - "learning_rate": 3.730109372315822e-05, - "loss": 0.5028, - "step": 1345 - }, - { - "epoch": 0.3375, - "grad_norm": 1.359375, - "learning_rate": 3.721553103742388e-05, - "loss": 0.424, - "step": 1350 - }, - { - "epoch": 0.33875, - "grad_norm": 1.1015625, - "learning_rate": 3.71297799730896e-05, - "loss": 0.4592, - "step": 1355 - }, - { - "epoch": 0.34, - "grad_norm": 1.0390625, - "learning_rate": 3.704384185254288e-05, - "loss": 0.4678, - "step": 1360 - }, - { - "epoch": 0.34125, - "grad_norm": 1.0703125, - "learning_rate": 3.695771800105586e-05, - "loss": 0.4809, - "step": 1365 - }, - { - "epoch": 0.3425, - "grad_norm": 1.171875, - "learning_rate": 3.6871409746764865e-05, - "loss": 0.5093, - "step": 1370 - }, - { - "epoch": 0.34375, - "grad_norm": 1.5625, - "learning_rate": 3.678491842064995e-05, - "loss": 0.4937, - "step": 1375 - }, - { - "epoch": 0.345, - "grad_norm": 1.5625, - "learning_rate": 3.6698245356514335e-05, - "loss": 0.4107, - "step": 1380 - }, - { - "epoch": 0.34625, - "grad_norm": 1.0, - "learning_rate": 3.661139189096391e-05, - "loss": 0.4578, - "step": 1385 - }, - { - "epoch": 0.3475, - "grad_norm": 0.8359375, - "learning_rate": 3.652435936338656e-05, - "loss": 0.3964, - "step": 1390 - }, - { - "epoch": 0.34875, - "grad_norm": 1.1171875, - "learning_rate": 3.6437149115931514e-05, - "loss": 0.5011, - "step": 1395 - }, - { - "epoch": 0.35, - "grad_norm": 1.28125, - "learning_rate": 3.634976249348867e-05, - "loss": 0.494, - "step": 1400 - }, - { - "epoch": 0.35125, - "grad_norm": 1.125, - "learning_rate": 3.626220084366786e-05, - "loss": 0.4773, - "step": 1405 - }, - { - "epoch": 0.3525, - "grad_norm": 1.0390625, - "learning_rate": 3.6174465516778035e-05, - "loss": 0.4338, - "step": 1410 - }, - { - "epoch": 0.35375, - "grad_norm": 0.99609375, - "learning_rate": 3.608655786580647e-05, - "loss": 0.4538, - "step": 1415 - }, - { - "epoch": 0.355, - "grad_norm": 1.1875, - "learning_rate": 3.599847924639788e-05, - "loss": 0.4537, - "step": 1420 - }, - { - "epoch": 0.35625, - "grad_norm": 1.0078125, - "learning_rate": 3.591023101683355e-05, - "loss": 0.448, - "step": 1425 - }, - { - "epoch": 0.3575, - "grad_norm": 1.125, - "learning_rate": 3.582181453801036e-05, - "loss": 0.4645, - "step": 1430 - }, - { - "epoch": 0.35875, - "grad_norm": 1.6015625, - "learning_rate": 3.5733231173419754e-05, - "loss": 0.4578, - "step": 1435 - }, - { - "epoch": 0.36, - "grad_norm": 1.0, - "learning_rate": 3.564448228912682e-05, - "loss": 0.4704, - "step": 1440 - }, - { - "epoch": 0.36125, - "grad_norm": 1.8671875, - "learning_rate": 3.555556925374914e-05, - "loss": 0.4383, - "step": 1445 - }, - { - "epoch": 0.3625, - "grad_norm": 1.2734375, - "learning_rate": 3.54664934384357e-05, - "loss": 0.4192, - "step": 1450 - }, - { - "epoch": 0.36375, - "grad_norm": 0.94921875, - "learning_rate": 3.5377256216845785e-05, - "loss": 0.5063, - "step": 1455 - }, - { - "epoch": 0.365, - "grad_norm": 1.203125, - "learning_rate": 3.528785896512772e-05, - "loss": 0.4711, - "step": 1460 - }, - { - "epoch": 0.36625, - "grad_norm": 1.15625, - "learning_rate": 3.519830306189773e-05, - "loss": 0.4494, - "step": 1465 - }, - { - "epoch": 0.3675, - "grad_norm": 1.1796875, - "learning_rate": 3.510858988821863e-05, - "loss": 0.4972, - "step": 1470 - }, - { - "epoch": 0.36875, - "grad_norm": 1.4765625, - "learning_rate": 3.5018720827578524e-05, - "loss": 0.4312, - "step": 1475 - }, - { - "epoch": 0.37, - "grad_norm": 0.99609375, - "learning_rate": 3.4928697265869515e-05, - "loss": 0.4267, - "step": 1480 - }, - { - "epoch": 0.37125, - "grad_norm": 1.390625, - "learning_rate": 3.483852059136629e-05, - "loss": 0.4563, - "step": 1485 - }, - { - "epoch": 0.3725, - "grad_norm": 0.99609375, - "learning_rate": 3.474819219470471e-05, - "loss": 0.4642, - "step": 1490 - }, - { - "epoch": 0.37375, - "grad_norm": 1.1796875, - "learning_rate": 3.4657713468860405e-05, - "loss": 0.414, - "step": 1495 - }, - { - "epoch": 0.375, - "grad_norm": 1.140625, - "learning_rate": 3.456708580912725e-05, - "loss": 0.4919, - "step": 1500 - }, - { - "epoch": 0.37625, - "grad_norm": 1.25, - "learning_rate": 3.447631061309587e-05, - "loss": 0.5023, - "step": 1505 - }, - { - "epoch": 0.3775, - "grad_norm": 1.140625, - "learning_rate": 3.438538928063208e-05, - "loss": 0.469, - "step": 1510 - }, - { - "epoch": 0.37875, - "grad_norm": 1.0859375, - "learning_rate": 3.4294323213855305e-05, - "loss": 0.4322, - "step": 1515 - }, - { - "epoch": 0.38, - "grad_norm": 0.9296875, - "learning_rate": 3.4203113817116957e-05, - "loss": 0.4393, - "step": 1520 - }, - { - "epoch": 0.38125, - "grad_norm": 1.1015625, - "learning_rate": 3.411176249697875e-05, - "loss": 0.4005, - "step": 1525 - }, - { - "epoch": 0.3825, - "grad_norm": 1.4609375, - "learning_rate": 3.402027066219105e-05, - "loss": 0.4094, - "step": 1530 - }, - { - "epoch": 0.38375, - "grad_norm": 1.4921875, - "learning_rate": 3.392863972367114e-05, - "loss": 0.4474, - "step": 1535 - }, - { - "epoch": 0.385, - "grad_norm": 1.265625, - "learning_rate": 3.383687109448143e-05, - "loss": 0.399, - "step": 1540 - }, - { - "epoch": 0.38625, - "grad_norm": 1.3515625, - "learning_rate": 3.374496618980772e-05, - "loss": 0.4342, - "step": 1545 - }, - { - "epoch": 0.3875, - "grad_norm": 1.2890625, - "learning_rate": 3.365292642693732e-05, - "loss": 0.4847, - "step": 1550 - }, - { - "epoch": 0.38875, - "grad_norm": 1.046875, - "learning_rate": 3.356075322523725e-05, - "loss": 0.4343, - "step": 1555 - }, - { - "epoch": 0.39, - "grad_norm": 0.9921875, - "learning_rate": 3.346844800613229e-05, - "loss": 0.498, - "step": 1560 - }, - { - "epoch": 0.39125, - "grad_norm": 1.0703125, - "learning_rate": 3.33760121930831e-05, - "loss": 0.4737, - "step": 1565 - }, - { - "epoch": 0.3925, - "grad_norm": 0.99609375, - "learning_rate": 3.3283447211564276e-05, - "loss": 0.4965, - "step": 1570 - }, - { - "epoch": 0.39375, - "grad_norm": 1.3359375, - "learning_rate": 3.319075448904234e-05, - "loss": 0.4626, - "step": 1575 - }, - { - "epoch": 0.395, - "grad_norm": 1.1640625, - "learning_rate": 3.309793545495374e-05, - "loss": 0.5161, - "step": 1580 - }, - { - "epoch": 0.39625, - "grad_norm": 1.21875, - "learning_rate": 3.3004991540682796e-05, - "loss": 0.4371, - "step": 1585 - }, - { - "epoch": 0.3975, - "grad_norm": 1.109375, - "learning_rate": 3.2911924179539656e-05, - "loss": 0.4427, - "step": 1590 - }, - { - "epoch": 0.39875, - "grad_norm": 0.93359375, - "learning_rate": 3.281873480673815e-05, - "loss": 0.4318, - "step": 1595 - }, - { - "epoch": 0.4, - "grad_norm": 1.046875, - "learning_rate": 3.272542485937369e-05, - "loss": 0.4756, - "step": 1600 - }, - { - "epoch": 0.40125, - "grad_norm": 1.0390625, - "learning_rate": 3.2631995776401094e-05, - "loss": 0.4507, - "step": 1605 - }, - { - "epoch": 0.4025, - "grad_norm": 1.03125, - "learning_rate": 3.253844899861239e-05, - "loss": 0.4444, - "step": 1610 - }, - { - "epoch": 0.40375, - "grad_norm": 1.046875, - "learning_rate": 3.244478596861464e-05, - "loss": 0.4291, - "step": 1615 - }, - { - "epoch": 0.405, - "grad_norm": 1.125, - "learning_rate": 3.23510081308076e-05, - "loss": 0.4615, - "step": 1620 - }, - { - "epoch": 0.40625, - "grad_norm": 1.0546875, - "learning_rate": 3.225711693136156e-05, - "loss": 0.4347, - "step": 1625 - }, - { - "epoch": 0.4075, - "grad_norm": 1.3046875, - "learning_rate": 3.2163113818194964e-05, - "loss": 0.4349, - "step": 1630 - }, - { - "epoch": 0.40875, - "grad_norm": 1.609375, - "learning_rate": 3.206900024095208e-05, - "loss": 0.4814, - "step": 1635 - }, - { - "epoch": 0.41, - "grad_norm": 1.3828125, - "learning_rate": 3.1974777650980735e-05, - "loss": 0.502, - "step": 1640 - }, - { - "epoch": 0.41125, - "grad_norm": 1.5234375, - "learning_rate": 3.188044750130979e-05, - "loss": 0.4457, - "step": 1645 - }, - { - "epoch": 0.4125, - "grad_norm": 1.140625, - "learning_rate": 3.178601124662686e-05, - "loss": 0.505, - "step": 1650 - }, - { - "epoch": 0.41375, - "grad_norm": 1.109375, - "learning_rate": 3.169147034325582e-05, - "loss": 0.4941, - "step": 1655 - }, - { - "epoch": 0.415, - "grad_norm": 1.078125, - "learning_rate": 3.1596826249134324e-05, - "loss": 0.4524, - "step": 1660 - }, - { - "epoch": 0.41625, - "grad_norm": 1.1015625, - "learning_rate": 3.150208042379142e-05, - "loss": 0.4826, - "step": 1665 - }, - { - "epoch": 0.4175, - "grad_norm": 1.109375, - "learning_rate": 3.140723432832492e-05, - "loss": 0.4101, - "step": 1670 - }, - { - "epoch": 0.41875, - "grad_norm": 1.3359375, - "learning_rate": 3.131228942537895e-05, - "loss": 0.4068, - "step": 1675 - }, - { - "epoch": 0.42, - "grad_norm": 1.53125, - "learning_rate": 3.121724717912138e-05, - "loss": 0.4341, - "step": 1680 - }, - { - "epoch": 0.42125, - "grad_norm": 1.2109375, - "learning_rate": 3.112210905522119e-05, - "loss": 0.4197, - "step": 1685 - }, - { - "epoch": 0.4225, - "grad_norm": 1.3046875, - "learning_rate": 3.102687652082597e-05, - "loss": 0.4257, - "step": 1690 - }, - { - "epoch": 0.42375, - "grad_norm": 1.25, - "learning_rate": 3.0931551044539194e-05, - "loss": 0.4513, - "step": 1695 - }, - { - "epoch": 0.425, - "grad_norm": 1.21875, - "learning_rate": 3.083613409639764e-05, - "loss": 0.4757, - "step": 1700 - }, - { - "epoch": 0.42625, - "grad_norm": 1.2421875, - "learning_rate": 3.0740627147848675e-05, - "loss": 0.441, - "step": 1705 - }, - { - "epoch": 0.4275, - "grad_norm": 1.2265625, - "learning_rate": 3.06450316717276e-05, - "loss": 0.4249, - "step": 1710 - }, - { - "epoch": 0.42875, - "grad_norm": 1.1953125, - "learning_rate": 3.05493491422349e-05, - "loss": 0.4189, - "step": 1715 - }, - { - "epoch": 0.43, - "grad_norm": 1.3203125, - "learning_rate": 3.045358103491357e-05, - "loss": 0.4315, - "step": 1720 - }, - { - "epoch": 0.43125, - "grad_norm": 1.15625, - "learning_rate": 3.035772882662627e-05, - "loss": 0.4641, - "step": 1725 - }, - { - "epoch": 0.4325, - "grad_norm": 1.390625, - "learning_rate": 3.026179399553264e-05, - "loss": 0.4701, - "step": 1730 - }, - { - "epoch": 0.43375, - "grad_norm": 1.421875, - "learning_rate": 3.0165778021066453e-05, - "loss": 0.4827, - "step": 1735 - }, - { - "epoch": 0.435, - "grad_norm": 1.3203125, - "learning_rate": 3.0069682383912813e-05, - "loss": 0.4439, - "step": 1740 - }, - { - "epoch": 0.43625, - "grad_norm": 1.28125, - "learning_rate": 2.9973508565985313e-05, - "loss": 0.4916, - "step": 1745 - }, - { - "epoch": 0.4375, - "grad_norm": 1.4140625, - "learning_rate": 2.9877258050403212e-05, - "loss": 0.464, - "step": 1750 - }, - { - "epoch": 0.43875, - "grad_norm": 1.0078125, - "learning_rate": 2.9780932321468515e-05, - "loss": 0.4105, - "step": 1755 - }, - { - "epoch": 0.44, - "grad_norm": 1.3125, - "learning_rate": 2.9684532864643122e-05, - "loss": 0.4312, - "step": 1760 - }, - { - "epoch": 0.44125, - "grad_norm": 1.1875, - "learning_rate": 2.9588061166525914e-05, - "loss": 0.4465, - "step": 1765 - }, - { - "epoch": 0.4425, - "grad_norm": 1.5, - "learning_rate": 2.949151871482982e-05, - "loss": 0.4136, - "step": 1770 - }, - { - "epoch": 0.44375, - "grad_norm": 1.21875, - "learning_rate": 2.9394906998358868e-05, - "loss": 0.4107, - "step": 1775 - }, - { - "epoch": 0.445, - "grad_norm": 0.98828125, - "learning_rate": 2.929822750698524e-05, - "loss": 0.4327, - "step": 1780 - }, - { - "epoch": 0.44625, - "grad_norm": 1.2890625, - "learning_rate": 2.92014817316263e-05, - "loss": 0.4597, - "step": 1785 - }, - { - "epoch": 0.4475, - "grad_norm": 1.046875, - "learning_rate": 2.9104671164221576e-05, - "loss": 0.4685, - "step": 1790 - }, - { - "epoch": 0.44875, - "grad_norm": 1.046875, - "learning_rate": 2.9007797297709782e-05, - "loss": 0.451, - "step": 1795 - }, - { - "epoch": 0.45, - "grad_norm": 1.40625, - "learning_rate": 2.8910861626005776e-05, - "loss": 0.4101, - "step": 1800 - }, - { - "epoch": 0.45125, - "grad_norm": 1.2109375, - "learning_rate": 2.8813865643977526e-05, - "loss": 0.4775, - "step": 1805 - }, - { - "epoch": 0.4525, - "grad_norm": 1.234375, - "learning_rate": 2.871681084742308e-05, - "loss": 0.4588, - "step": 1810 - }, - { - "epoch": 0.45375, - "grad_norm": 1.265625, - "learning_rate": 2.8619698733047447e-05, - "loss": 0.4476, - "step": 1815 - }, - { - "epoch": 0.455, - "grad_norm": 1.140625, - "learning_rate": 2.8522530798439567e-05, - "loss": 0.4375, - "step": 1820 - }, - { - "epoch": 0.45625, - "grad_norm": 1.015625, - "learning_rate": 2.8425308542049206e-05, - "loss": 0.422, - "step": 1825 - }, - { - "epoch": 0.4575, - "grad_norm": 0.94921875, - "learning_rate": 2.832803346316381e-05, - "loss": 0.4887, - "step": 1830 - }, - { - "epoch": 0.45875, - "grad_norm": 1.1796875, - "learning_rate": 2.8230707061885443e-05, - "loss": 0.4136, - "step": 1835 - }, - { - "epoch": 0.46, - "grad_norm": 1.140625, - "learning_rate": 2.8133330839107608e-05, - "loss": 0.4236, - "step": 1840 - }, - { - "epoch": 0.46125, - "grad_norm": 1.078125, - "learning_rate": 2.803590629649212e-05, - "loss": 0.4983, - "step": 1845 - }, - { - "epoch": 0.4625, - "grad_norm": 1.265625, - "learning_rate": 2.7938434936445945e-05, - "loss": 0.4988, - "step": 1850 - }, - { - "epoch": 0.46375, - "grad_norm": 0.8828125, - "learning_rate": 2.784091826209803e-05, - "loss": 0.4337, - "step": 1855 - }, - { - "epoch": 0.465, - "grad_norm": 1.1640625, - "learning_rate": 2.774335777727613e-05, - "loss": 0.4574, - "step": 1860 - }, - { - "epoch": 0.46625, - "grad_norm": 0.95703125, - "learning_rate": 2.764575498648362e-05, - "loss": 0.4606, - "step": 1865 - }, - { - "epoch": 0.4675, - "grad_norm": 1.1328125, - "learning_rate": 2.754811139487625e-05, - "loss": 0.4489, - "step": 1870 - }, - { - "epoch": 0.46875, - "grad_norm": 1.1171875, - "learning_rate": 2.7450428508239024e-05, - "loss": 0.4016, - "step": 1875 - }, - { - "epoch": 0.47, - "grad_norm": 1.1328125, - "learning_rate": 2.7352707832962865e-05, - "loss": 0.4191, - "step": 1880 - }, - { - "epoch": 0.47125, - "grad_norm": 1.2109375, - "learning_rate": 2.725495087602148e-05, - "loss": 0.5397, - "step": 1885 - }, - { - "epoch": 0.4725, - "grad_norm": 1.4609375, - "learning_rate": 2.7157159144948092e-05, - "loss": 0.4646, - "step": 1890 - }, - { - "epoch": 0.47375, - "grad_norm": 0.9453125, - "learning_rate": 2.7059334147812142e-05, - "loss": 0.4443, - "step": 1895 - }, - { - "epoch": 0.475, - "grad_norm": 0.95703125, - "learning_rate": 2.6961477393196126e-05, - "loss": 0.4943, - "step": 1900 - }, - { - "epoch": 0.47625, - "grad_norm": 0.8046875, - "learning_rate": 2.6863590390172243e-05, - "loss": 0.4654, - "step": 1905 - }, - { - "epoch": 0.4775, - "grad_norm": 1.2890625, - "learning_rate": 2.6765674648279172e-05, - "loss": 0.4517, - "step": 1910 - }, - { - "epoch": 0.47875, - "grad_norm": 1.1875, - "learning_rate": 2.666773167749878e-05, - "loss": 0.4525, - "step": 1915 - }, - { - "epoch": 0.48, - "grad_norm": 1.09375, - "learning_rate": 2.656976298823284e-05, - "loss": 0.4676, - "step": 1920 - }, - { - "epoch": 0.48125, - "grad_norm": 1.4609375, - "learning_rate": 2.6471770091279724e-05, - "loss": 0.495, - "step": 1925 - }, - { - "epoch": 0.4825, - "grad_norm": 1.0234375, - "learning_rate": 2.637375449781115e-05, - "loss": 0.4322, - "step": 1930 - }, - { - "epoch": 0.48375, - "grad_norm": 1.1328125, - "learning_rate": 2.627571771934879e-05, - "loss": 0.4147, - "step": 1935 - }, - { - "epoch": 0.485, - "grad_norm": 1.078125, - "learning_rate": 2.6177661267741065e-05, - "loss": 0.4204, - "step": 1940 - }, - { - "epoch": 0.48625, - "grad_norm": 0.98828125, - "learning_rate": 2.607958665513976e-05, - "loss": 0.4245, - "step": 1945 - }, - { - "epoch": 0.4875, - "grad_norm": 0.95703125, - "learning_rate": 2.598149539397672e-05, - "loss": 0.4582, - "step": 1950 - }, - { - "epoch": 0.48875, - "grad_norm": 1.46875, - "learning_rate": 2.5883388996940534e-05, - "loss": 0.4445, - "step": 1955 - }, - { - "epoch": 0.49, - "grad_norm": 1.25, - "learning_rate": 2.578526897695321e-05, - "loss": 0.4533, - "step": 1960 - }, - { - "epoch": 0.49125, - "grad_norm": 1.0390625, - "learning_rate": 2.5687136847146838e-05, - "loss": 0.4334, - "step": 1965 - }, - { - "epoch": 0.4925, - "grad_norm": 0.89453125, - "learning_rate": 2.558899412084026e-05, - "loss": 0.434, - "step": 1970 - }, - { - "epoch": 0.49375, - "grad_norm": 1.25, - "learning_rate": 2.5490842311515707e-05, - "loss": 0.4257, - "step": 1975 - }, - { - "epoch": 0.495, - "grad_norm": 1.6953125, - "learning_rate": 2.539268293279552e-05, - "loss": 0.4503, - "step": 1980 - }, - { - "epoch": 0.49625, - "grad_norm": 1.234375, - "learning_rate": 2.529451749841873e-05, - "loss": 0.5045, - "step": 1985 - }, - { - "epoch": 0.4975, - "grad_norm": 1.3046875, - "learning_rate": 2.5196347522217784e-05, - "loss": 0.4307, - "step": 1990 - }, - { - "epoch": 0.49875, - "grad_norm": 1.0078125, - "learning_rate": 2.509817451809515e-05, - "loss": 0.4701, - "step": 1995 - }, - { - "epoch": 0.5, - "grad_norm": 1.3046875, - "learning_rate": 2.5e-05, - "loss": 0.4573, - "step": 2000 - }, - { - "epoch": 0.50125, - "grad_norm": 1.2421875, - "learning_rate": 2.4901825481904855e-05, - "loss": 0.4304, - "step": 2005 - }, - { - "epoch": 0.5025, - "grad_norm": 1.578125, - "learning_rate": 2.480365247778223e-05, - "loss": 0.4334, - "step": 2010 - }, - { - "epoch": 0.50375, - "grad_norm": 1.0390625, - "learning_rate": 2.4705482501581266e-05, - "loss": 0.4507, - "step": 2015 - }, - { - "epoch": 0.505, - "grad_norm": 1.3984375, - "learning_rate": 2.460731706720449e-05, - "loss": 0.4555, - "step": 2020 - }, - { - "epoch": 0.50625, - "grad_norm": 1.1328125, - "learning_rate": 2.4509157688484295e-05, - "loss": 0.4791, - "step": 2025 - }, - { - "epoch": 0.5075, - "grad_norm": 1.171875, - "learning_rate": 2.4411005879159753e-05, - "loss": 0.4324, - "step": 2030 - }, - { - "epoch": 0.50875, - "grad_norm": 1.3828125, - "learning_rate": 2.4312863152853165e-05, - "loss": 0.4534, - "step": 2035 - }, - { - "epoch": 0.51, - "grad_norm": 1.5546875, - "learning_rate": 2.4214731023046793e-05, - "loss": 0.4411, - "step": 2040 - }, - { - "epoch": 0.51125, - "grad_norm": 1.3359375, - "learning_rate": 2.4116611003059472e-05, - "loss": 0.4333, - "step": 2045 - }, - { - "epoch": 0.5125, - "grad_norm": 1.46875, - "learning_rate": 2.4018504606023293e-05, - "loss": 0.4231, - "step": 2050 - }, - { - "epoch": 0.51375, - "grad_norm": 1.4140625, - "learning_rate": 2.392041334486024e-05, - "loss": 0.3752, - "step": 2055 - }, - { - "epoch": 0.515, - "grad_norm": 1.1328125, - "learning_rate": 2.3822338732258937e-05, - "loss": 0.4876, - "step": 2060 - }, - { - "epoch": 0.51625, - "grad_norm": 0.91796875, - "learning_rate": 2.3724282280651214e-05, - "loss": 0.3989, - "step": 2065 - }, - { - "epoch": 0.5175, - "grad_norm": 0.859375, - "learning_rate": 2.3626245502188864e-05, - "loss": 0.4102, - "step": 2070 - }, - { - "epoch": 0.51875, - "grad_norm": 0.9921875, - "learning_rate": 2.3528229908720272e-05, - "loss": 0.3997, - "step": 2075 - }, - { - "epoch": 0.52, - "grad_norm": 0.71484375, - "learning_rate": 2.3430237011767167e-05, - "loss": 0.4009, - "step": 2080 - }, - { - "epoch": 0.52125, - "grad_norm": 1.046875, - "learning_rate": 2.3332268322501228e-05, - "loss": 0.4769, - "step": 2085 - }, - { - "epoch": 0.5225, - "grad_norm": 1.078125, - "learning_rate": 2.323432535172084e-05, - "loss": 0.4405, - "step": 2090 - }, - { - "epoch": 0.52375, - "grad_norm": 1.0625, - "learning_rate": 2.313640960982776e-05, - "loss": 0.4436, - "step": 2095 - }, - { - "epoch": 0.525, - "grad_norm": 1.09375, - "learning_rate": 2.303852260680388e-05, - "loss": 0.4027, - "step": 2100 - }, - { - "epoch": 0.52625, - "grad_norm": 1.2890625, - "learning_rate": 2.294066585218786e-05, - "loss": 0.4086, - "step": 2105 - }, - { - "epoch": 0.5275, - "grad_norm": 1.046875, - "learning_rate": 2.284284085505192e-05, - "loss": 0.4262, - "step": 2110 - }, - { - "epoch": 0.52875, - "grad_norm": 1.484375, - "learning_rate": 2.274504912397852e-05, - "loss": 0.4605, - "step": 2115 - }, - { - "epoch": 0.53, - "grad_norm": 1.2734375, - "learning_rate": 2.2647292167037144e-05, - "loss": 0.4534, - "step": 2120 - }, - { - "epoch": 0.53125, - "grad_norm": 0.890625, - "learning_rate": 2.2549571491760986e-05, - "loss": 0.3628, - "step": 2125 - }, - { - "epoch": 0.5325, - "grad_norm": 1.3046875, - "learning_rate": 2.2451888605123754e-05, - "loss": 0.4879, - "step": 2130 - }, - { - "epoch": 0.53375, - "grad_norm": 0.96875, - "learning_rate": 2.2354245013516393e-05, - "loss": 0.4517, - "step": 2135 - }, - { - "epoch": 0.535, - "grad_norm": 1.1640625, - "learning_rate": 2.225664222272387e-05, - "loss": 0.4303, - "step": 2140 - }, - { - "epoch": 0.53625, - "grad_norm": 0.83203125, - "learning_rate": 2.2159081737901975e-05, - "loss": 0.4172, - "step": 2145 - }, - { - "epoch": 0.5375, - "grad_norm": 1.125, - "learning_rate": 2.2061565063554064e-05, - "loss": 0.4169, - "step": 2150 - }, - { - "epoch": 0.53875, - "grad_norm": 1.25, - "learning_rate": 2.1964093703507893e-05, - "loss": 0.4839, - "step": 2155 - }, - { - "epoch": 0.54, - "grad_norm": 0.8359375, - "learning_rate": 2.186666916089239e-05, - "loss": 0.3919, - "step": 2160 - }, - { - "epoch": 0.54125, - "grad_norm": 1.2421875, - "learning_rate": 2.1769292938114563e-05, - "loss": 0.4435, - "step": 2165 - }, - { - "epoch": 0.5425, - "grad_norm": 1.3046875, - "learning_rate": 2.1671966536836196e-05, - "loss": 0.4902, - "step": 2170 - }, - { - "epoch": 0.54375, - "grad_norm": 1.296875, - "learning_rate": 2.1574691457950803e-05, - "loss": 0.4667, - "step": 2175 - }, - { - "epoch": 0.545, - "grad_norm": 1.1015625, - "learning_rate": 2.1477469201560435e-05, - "loss": 0.3795, - "step": 2180 - }, - { - "epoch": 0.54625, - "grad_norm": 1.140625, - "learning_rate": 2.1380301266952556e-05, - "loss": 0.4658, - "step": 2185 - }, - { - "epoch": 0.5475, - "grad_norm": 1.6171875, - "learning_rate": 2.1283189152576925e-05, - "loss": 0.4589, - "step": 2190 - }, - { - "epoch": 0.54875, - "grad_norm": 1.3828125, - "learning_rate": 2.118613435602248e-05, - "loss": 0.4394, - "step": 2195 - }, - { - "epoch": 0.55, - "grad_norm": 1.0390625, - "learning_rate": 2.1089138373994223e-05, - "loss": 0.4321, - "step": 2200 - }, - { - "epoch": 0.55125, - "grad_norm": 1.3046875, - "learning_rate": 2.0992202702290227e-05, - "loss": 0.4084, - "step": 2205 - }, - { - "epoch": 0.5525, - "grad_norm": 1.2890625, - "learning_rate": 2.089532883577843e-05, - "loss": 0.4489, - "step": 2210 - }, - { - "epoch": 0.55375, - "grad_norm": 1.2421875, - "learning_rate": 2.0798518268373706e-05, - "loss": 0.4403, - "step": 2215 - }, - { - "epoch": 0.555, - "grad_norm": 1.1796875, - "learning_rate": 2.070177249301476e-05, - "loss": 0.4286, - "step": 2220 - }, - { - "epoch": 0.55625, - "grad_norm": 1.28125, - "learning_rate": 2.0605093001641138e-05, - "loss": 0.4557, - "step": 2225 - }, - { - "epoch": 0.5575, - "grad_norm": 1.171875, - "learning_rate": 2.0508481285170186e-05, - "loss": 0.4686, - "step": 2230 - }, - { - "epoch": 0.55875, - "grad_norm": 1.375, - "learning_rate": 2.04119388334741e-05, - "loss": 0.4402, - "step": 2235 - }, - { - "epoch": 0.56, - "grad_norm": 1.1953125, - "learning_rate": 2.031546713535688e-05, - "loss": 0.3973, - "step": 2240 - }, - { - "epoch": 0.56125, - "grad_norm": 1.1015625, - "learning_rate": 2.0219067678531494e-05, - "loss": 0.4349, - "step": 2245 - }, - { - "epoch": 0.5625, - "grad_norm": 1.1015625, - "learning_rate": 2.0122741949596797e-05, - "loss": 0.4329, - "step": 2250 - }, - { - "epoch": 0.56375, - "grad_norm": 1.1875, - "learning_rate": 2.002649143401469e-05, - "loss": 0.4402, - "step": 2255 - }, - { - "epoch": 0.565, - "grad_norm": 1.0546875, - "learning_rate": 1.9930317616087196e-05, - "loss": 0.4342, - "step": 2260 - }, - { - "epoch": 0.56625, - "grad_norm": 0.98046875, - "learning_rate": 1.9834221978933543e-05, - "loss": 0.4537, - "step": 2265 - }, - { - "epoch": 0.5675, - "grad_norm": 1.2890625, - "learning_rate": 1.9738206004467363e-05, - "loss": 0.4597, - "step": 2270 - }, - { - "epoch": 0.56875, - "grad_norm": 1.4296875, - "learning_rate": 1.9642271173373737e-05, - "loss": 0.4372, - "step": 2275 - }, - { - "epoch": 0.57, - "grad_norm": 1.171875, - "learning_rate": 1.9546418965086442e-05, - "loss": 0.4062, - "step": 2280 - }, - { - "epoch": 0.57125, - "grad_norm": 1.453125, - "learning_rate": 1.9450650857765102e-05, - "loss": 0.4698, - "step": 2285 - }, - { - "epoch": 0.5725, - "grad_norm": 1.140625, - "learning_rate": 1.935496832827241e-05, - "loss": 0.4312, - "step": 2290 - }, - { - "epoch": 0.57375, - "grad_norm": 1.21875, - "learning_rate": 1.925937285215133e-05, - "loss": 0.4643, - "step": 2295 - }, - { - "epoch": 0.575, - "grad_norm": 1.1015625, - "learning_rate": 1.9163865903602374e-05, - "loss": 0.4256, - "step": 2300 - }, - { - "epoch": 0.57625, - "grad_norm": 1.1171875, - "learning_rate": 1.9068448955460805e-05, - "loss": 0.3879, - "step": 2305 - }, - { - "epoch": 0.5775, - "grad_norm": 1.5390625, - "learning_rate": 1.897312347917404e-05, - "loss": 0.4048, - "step": 2310 - }, - { - "epoch": 0.57875, - "grad_norm": 1.03125, - "learning_rate": 1.8877890944778815e-05, - "loss": 0.4572, - "step": 2315 - }, - { - "epoch": 0.58, - "grad_norm": 1.1171875, - "learning_rate": 1.8782752820878634e-05, - "loss": 0.4634, - "step": 2320 - }, - { - "epoch": 0.58125, - "grad_norm": 1.2265625, - "learning_rate": 1.868771057462105e-05, - "loss": 0.3985, - "step": 2325 - }, - { - "epoch": 0.5825, - "grad_norm": 1.1328125, - "learning_rate": 1.8592765671675084e-05, - "loss": 0.4463, - "step": 2330 - }, - { - "epoch": 0.58375, - "grad_norm": 0.91015625, - "learning_rate": 1.8497919576208585e-05, - "loss": 0.4083, - "step": 2335 - }, - { - "epoch": 0.585, - "grad_norm": 1.4921875, - "learning_rate": 1.8403173750865685e-05, - "loss": 0.3929, - "step": 2340 - }, - { - "epoch": 0.58625, - "grad_norm": 1.390625, - "learning_rate": 1.830852965674419e-05, - "loss": 0.4659, - "step": 2345 - }, - { - "epoch": 0.5875, - "grad_norm": 1.2109375, - "learning_rate": 1.8213988753373146e-05, - "loss": 0.3986, - "step": 2350 - }, - { - "epoch": 0.58875, - "grad_norm": 1.046875, - "learning_rate": 1.8119552498690215e-05, - "loss": 0.4043, - "step": 2355 - }, - { - "epoch": 0.59, - "grad_norm": 2.84375, - "learning_rate": 1.802522234901927e-05, - "loss": 0.449, - "step": 2360 - }, - { - "epoch": 0.59125, - "grad_norm": 1.40625, - "learning_rate": 1.793099975904791e-05, - "loss": 0.4178, - "step": 2365 - }, - { - "epoch": 0.5925, - "grad_norm": 1.390625, - "learning_rate": 1.783688618180504e-05, - "loss": 0.4422, - "step": 2370 - }, - { - "epoch": 0.59375, - "grad_norm": 0.94921875, - "learning_rate": 1.7742883068638447e-05, - "loss": 0.4666, - "step": 2375 - }, - { - "epoch": 0.595, - "grad_norm": 1.015625, - "learning_rate": 1.7648991869192405e-05, - "loss": 0.4226, - "step": 2380 - }, - { - "epoch": 0.59625, - "grad_norm": 1.40625, - "learning_rate": 1.7555214031385375e-05, - "loss": 0.408, - "step": 2385 - }, - { - "epoch": 0.5975, - "grad_norm": 1.7109375, - "learning_rate": 1.746155100138761e-05, - "loss": 0.4778, - "step": 2390 - }, - { - "epoch": 0.59875, - "grad_norm": 1.25, - "learning_rate": 1.7368004223598912e-05, - "loss": 0.4059, - "step": 2395 - }, - { - "epoch": 0.6, - "grad_norm": 1.296875, - "learning_rate": 1.7274575140626318e-05, - "loss": 0.4398, - "step": 2400 - }, - { - "epoch": 0.60125, - "grad_norm": 1.1015625, - "learning_rate": 1.7181265193261865e-05, - "loss": 0.482, - "step": 2405 - }, - { - "epoch": 0.6025, - "grad_norm": 0.84765625, - "learning_rate": 1.7088075820460346e-05, - "loss": 0.4192, - "step": 2410 - }, - { - "epoch": 0.60375, - "grad_norm": 1.3125, - "learning_rate": 1.6995008459317206e-05, - "loss": 0.4748, - "step": 2415 - }, - { - "epoch": 0.605, - "grad_norm": 1.5703125, - "learning_rate": 1.690206454504627e-05, - "loss": 0.4276, - "step": 2420 - }, - { - "epoch": 0.60625, - "grad_norm": 1.125, - "learning_rate": 1.6809245510957665e-05, - "loss": 0.3548, - "step": 2425 - }, - { - "epoch": 0.6075, - "grad_norm": 1.0078125, - "learning_rate": 1.6716552788435724e-05, - "loss": 0.4122, - "step": 2430 - }, - { - "epoch": 0.60875, - "grad_norm": 1.1484375, - "learning_rate": 1.66239878069169e-05, - "loss": 0.432, - "step": 2435 - }, - { - "epoch": 0.61, - "grad_norm": 1.1015625, - "learning_rate": 1.6531551993867717e-05, - "loss": 0.4467, - "step": 2440 - }, - { - "epoch": 0.61125, - "grad_norm": 1.3828125, - "learning_rate": 1.643924677476276e-05, - "loss": 0.4652, - "step": 2445 - }, - { - "epoch": 0.6125, - "grad_norm": 1.1328125, - "learning_rate": 1.6347073573062672e-05, - "loss": 0.4024, - "step": 2450 - }, - { - "epoch": 0.61375, - "grad_norm": 1.1953125, - "learning_rate": 1.6255033810192282e-05, - "loss": 0.3974, - "step": 2455 - }, - { - "epoch": 0.615, - "grad_norm": 1.09375, - "learning_rate": 1.6163128905518578e-05, - "loss": 0.3891, - "step": 2460 - }, - { - "epoch": 0.61625, - "grad_norm": 1.328125, - "learning_rate": 1.6071360276328874e-05, - "loss": 0.3499, - "step": 2465 - }, - { - "epoch": 0.6175, - "grad_norm": 1.109375, - "learning_rate": 1.5979729337808955e-05, - "loss": 0.4386, - "step": 2470 - }, - { - "epoch": 0.61875, - "grad_norm": 1.1328125, - "learning_rate": 1.588823750302126e-05, - "loss": 0.4494, - "step": 2475 - }, - { - "epoch": 0.62, - "grad_norm": 1.4296875, - "learning_rate": 1.5796886182883053e-05, - "loss": 0.4025, - "step": 2480 - }, - { - "epoch": 0.62125, - "grad_norm": 1.6796875, - "learning_rate": 1.57056767861447e-05, - "loss": 0.4192, - "step": 2485 - }, - { - "epoch": 0.6225, - "grad_norm": 1.3046875, - "learning_rate": 1.561461071936792e-05, - "loss": 0.4509, - "step": 2490 - }, - { - "epoch": 0.62375, - "grad_norm": 1.2109375, - "learning_rate": 1.552368938690414e-05, - "loss": 0.3897, - "step": 2495 - }, - { - "epoch": 0.625, - "grad_norm": 1.28125, - "learning_rate": 1.5432914190872757e-05, - "loss": 0.473, - "step": 2500 - }, - { - "epoch": 0.62625, - "grad_norm": 1.1328125, - "learning_rate": 1.5342286531139605e-05, - "loss": 0.4333, - "step": 2505 - }, - { - "epoch": 0.6275, - "grad_norm": 1.0546875, - "learning_rate": 1.5251807805295302e-05, - "loss": 0.4245, - "step": 2510 - }, - { - "epoch": 0.62875, - "grad_norm": 1.1953125, - "learning_rate": 1.5161479408633713e-05, - "loss": 0.4342, - "step": 2515 - }, - { - "epoch": 0.63, - "grad_norm": 1.0703125, - "learning_rate": 1.5071302734130489e-05, - "loss": 0.3951, - "step": 2520 - }, - { - "epoch": 0.63125, - "grad_norm": 1.2890625, - "learning_rate": 1.498127917242148e-05, - "loss": 0.4367, - "step": 2525 - }, - { - "epoch": 0.6325, - "grad_norm": 1.2578125, - "learning_rate": 1.4891410111781378e-05, - "loss": 0.4766, - "step": 2530 - }, - { - "epoch": 0.63375, - "grad_norm": 1.3828125, - "learning_rate": 1.4801696938102272e-05, - "loss": 0.373, - "step": 2535 - }, - { - "epoch": 0.635, - "grad_norm": 1.3046875, - "learning_rate": 1.4712141034872282e-05, - "loss": 0.3804, - "step": 2540 - }, - { - "epoch": 0.63625, - "grad_norm": 1.09375, - "learning_rate": 1.4622743783154223e-05, - "loss": 0.4206, - "step": 2545 - }, - { - "epoch": 0.6375, - "grad_norm": 1.1875, - "learning_rate": 1.4533506561564306e-05, - "loss": 0.4585, - "step": 2550 - }, - { - "epoch": 0.63875, - "grad_norm": 0.84765625, - "learning_rate": 1.4444430746250867e-05, - "loss": 0.3796, - "step": 2555 - }, - { - "epoch": 0.64, - "grad_norm": 1.0625, - "learning_rate": 1.4355517710873184e-05, - "loss": 0.4296, - "step": 2560 - }, - { - "epoch": 0.64125, - "grad_norm": 1.6796875, - "learning_rate": 1.4266768826580257e-05, - "loss": 0.5008, - "step": 2565 - }, - { - "epoch": 0.6425, - "grad_norm": 1.0625, - "learning_rate": 1.4178185461989662e-05, - "loss": 0.3952, - "step": 2570 - }, - { - "epoch": 0.64375, - "grad_norm": 1.0703125, - "learning_rate": 1.4089768983166444e-05, - "loss": 0.4494, - "step": 2575 - }, - { - "epoch": 0.645, - "grad_norm": 0.88671875, - "learning_rate": 1.4001520753602121e-05, - "loss": 0.3944, - "step": 2580 - }, - { - "epoch": 0.64625, - "grad_norm": 1.328125, - "learning_rate": 1.3913442134193544e-05, - "loss": 0.4276, - "step": 2585 - }, - { - "epoch": 0.6475, - "grad_norm": 1.4296875, - "learning_rate": 1.3825534483221974e-05, - "loss": 0.4433, - "step": 2590 - }, - { - "epoch": 0.64875, - "grad_norm": 1.1640625, - "learning_rate": 1.3737799156332143e-05, - "loss": 0.3992, - "step": 2595 - }, - { - "epoch": 0.65, - "grad_norm": 0.96875, - "learning_rate": 1.3650237506511331e-05, - "loss": 0.4488, - "step": 2600 - }, - { - "epoch": 0.65125, - "grad_norm": 1.3828125, - "learning_rate": 1.3562850884068487e-05, - "loss": 0.4243, - "step": 2605 - }, - { - "epoch": 0.6525, - "grad_norm": 1.1171875, - "learning_rate": 1.3475640636613446e-05, - "loss": 0.3477, - "step": 2610 - }, - { - "epoch": 0.65375, - "grad_norm": 1.2734375, - "learning_rate": 1.3388608109036086e-05, - "loss": 0.4413, - "step": 2615 - }, - { - "epoch": 0.655, - "grad_norm": 1.0625, - "learning_rate": 1.330175464348567e-05, - "loss": 0.4487, - "step": 2620 - }, - { - "epoch": 0.65625, - "grad_norm": 1.28125, - "learning_rate": 1.3215081579350058e-05, - "loss": 0.4122, - "step": 2625 - }, - { - "epoch": 0.6575, - "grad_norm": 1.09375, - "learning_rate": 1.312859025323514e-05, - "loss": 0.424, - "step": 2630 - }, - { - "epoch": 0.65875, - "grad_norm": 1.078125, - "learning_rate": 1.3042281998944151e-05, - "loss": 0.4013, - "step": 2635 - }, - { - "epoch": 0.66, - "grad_norm": 1.9375, - "learning_rate": 1.2956158147457115e-05, - "loss": 0.5066, - "step": 2640 - }, - { - "epoch": 0.66125, - "grad_norm": 1.25, - "learning_rate": 1.2870220026910407e-05, - "loss": 0.3935, - "step": 2645 - }, - { - "epoch": 0.6625, - "grad_norm": 1.21875, - "learning_rate": 1.2784468962576136e-05, - "loss": 0.4039, - "step": 2650 - }, - { - "epoch": 0.66375, - "grad_norm": 1.1875, - "learning_rate": 1.2698906276841776e-05, - "loss": 0.4817, - "step": 2655 - }, - { - "epoch": 0.665, - "grad_norm": 0.99609375, - "learning_rate": 1.261353328918981e-05, - "loss": 0.3917, - "step": 2660 - }, - { - "epoch": 0.66625, - "grad_norm": 1.2421875, - "learning_rate": 1.2528351316177319e-05, - "loss": 0.425, - "step": 2665 - }, - { - "epoch": 0.6675, - "grad_norm": 1.015625, - "learning_rate": 1.2443361671415687e-05, - "loss": 0.4234, - "step": 2670 - }, - { - "epoch": 0.66875, - "grad_norm": 1.1171875, - "learning_rate": 1.235856566555039e-05, - "loss": 0.4414, - "step": 2675 - }, - { - "epoch": 0.67, - "grad_norm": 1.40625, - "learning_rate": 1.2273964606240718e-05, - "loss": 0.4563, - "step": 2680 - }, - { - "epoch": 0.67125, - "grad_norm": 1.15625, - "learning_rate": 1.2189559798139682e-05, - "loss": 0.4132, - "step": 2685 - }, - { - "epoch": 0.6725, - "grad_norm": 1.765625, - "learning_rate": 1.2105352542873815e-05, - "loss": 0.4317, - "step": 2690 - }, - { - "epoch": 0.67375, - "grad_norm": 0.9296875, - "learning_rate": 1.2021344139023186e-05, - "loss": 0.4073, - "step": 2695 - }, - { - "epoch": 0.675, - "grad_norm": 1.4765625, - "learning_rate": 1.1937535882101281e-05, - "loss": 0.4147, - "step": 2700 - }, - { - "epoch": 0.67625, - "grad_norm": 1.5625, - "learning_rate": 1.1853929064535111e-05, - "loss": 0.4394, - "step": 2705 - }, - { - "epoch": 0.6775, - "grad_norm": 1.40625, - "learning_rate": 1.1770524975645238e-05, - "loss": 0.461, - "step": 2710 - }, - { - "epoch": 0.67875, - "grad_norm": 1.0, - "learning_rate": 1.1687324901625879e-05, - "loss": 0.4279, - "step": 2715 - }, - { - "epoch": 0.68, - "grad_norm": 0.9765625, - "learning_rate": 1.1604330125525079e-05, - "loss": 0.4201, - "step": 2720 - }, - { - "epoch": 0.68125, - "grad_norm": 0.98828125, - "learning_rate": 1.1521541927224994e-05, - "loss": 0.4392, - "step": 2725 - }, - { - "epoch": 0.6825, - "grad_norm": 1.140625, - "learning_rate": 1.1438961583422037e-05, - "loss": 0.4064, - "step": 2730 - }, - { - "epoch": 0.68375, - "grad_norm": 1.1796875, - "learning_rate": 1.1356590367607252e-05, - "loss": 0.4081, - "step": 2735 - }, - { - "epoch": 0.685, - "grad_norm": 1.1171875, - "learning_rate": 1.1274429550046704e-05, - "loss": 0.4629, - "step": 2740 - }, - { - "epoch": 0.68625, - "grad_norm": 0.9453125, - "learning_rate": 1.1192480397761837e-05, - "loss": 0.3942, - "step": 2745 - }, - { - "epoch": 0.6875, - "grad_norm": 0.9765625, - "learning_rate": 1.1110744174509952e-05, - "loss": 0.4581, - "step": 2750 - }, - { - "epoch": 0.68875, - "grad_norm": 1.1015625, - "learning_rate": 1.1029222140764712e-05, - "loss": 0.4079, - "step": 2755 - }, - { - "epoch": 0.69, - "grad_norm": 1.1328125, - "learning_rate": 1.0947915553696742e-05, - "loss": 0.3924, - "step": 2760 - }, - { - "epoch": 0.69125, - "grad_norm": 0.94921875, - "learning_rate": 1.0866825667154182e-05, - "loss": 0.3715, - "step": 2765 - }, - { - "epoch": 0.6925, - "grad_norm": 1.09375, - "learning_rate": 1.07859537316434e-05, - "loss": 0.4238, - "step": 2770 - }, - { - "epoch": 0.69375, - "grad_norm": 1.3203125, - "learning_rate": 1.0705300994309697e-05, - "loss": 0.4465, - "step": 2775 - }, - { - "epoch": 0.695, - "grad_norm": 1.046875, - "learning_rate": 1.0624868698918045e-05, - "loss": 0.4295, - "step": 2780 - }, - { - "epoch": 0.69625, - "grad_norm": 1.140625, - "learning_rate": 1.0544658085833919e-05, - "loss": 0.4527, - "step": 2785 - }, - { - "epoch": 0.6975, - "grad_norm": 1.1015625, - "learning_rate": 1.0464670392004235e-05, - "loss": 0.4721, - "step": 2790 - }, - { - "epoch": 0.69875, - "grad_norm": 1.234375, - "learning_rate": 1.0384906850938166e-05, - "loss": 0.4632, - "step": 2795 - }, - { - "epoch": 0.7, - "grad_norm": 1.03125, - "learning_rate": 1.0305368692688174e-05, - "loss": 0.4382, - "step": 2800 - }, - { - "epoch": 0.70125, - "grad_norm": 1.046875, - "learning_rate": 1.0226057143831064e-05, - "loss": 0.4699, - "step": 2805 - }, - { - "epoch": 0.7025, - "grad_norm": 1.4296875, - "learning_rate": 1.0146973427449038e-05, - "loss": 0.4368, - "step": 2810 - }, - { - "epoch": 0.70375, - "grad_norm": 0.9765625, - "learning_rate": 1.0068118763110824e-05, - "loss": 0.4513, - "step": 2815 - }, - { - "epoch": 0.705, - "grad_norm": 1.0, - "learning_rate": 9.989494366852904e-06, - "loss": 0.3863, - "step": 2820 - }, - { - "epoch": 0.70625, - "grad_norm": 1.0390625, - "learning_rate": 9.911101451160715e-06, - "loss": 0.3907, - "step": 2825 - }, - { - "epoch": 0.7075, - "grad_norm": 1.078125, - "learning_rate": 9.832941224950012e-06, - "loss": 0.4537, - "step": 2830 - }, - { - "epoch": 0.70875, - "grad_norm": 1.3125, - "learning_rate": 9.755014893548157e-06, - "loss": 0.4082, - "step": 2835 - }, - { - "epoch": 0.71, - "grad_norm": 1.1015625, - "learning_rate": 9.677323658675594e-06, - "loss": 0.3992, - "step": 2840 - }, - { - "epoch": 0.71125, - "grad_norm": 1.3125, - "learning_rate": 9.599868718427257e-06, - "loss": 0.4512, - "step": 2845 - }, - { - "epoch": 0.7125, - "grad_norm": 0.8984375, - "learning_rate": 9.522651267254149e-06, - "loss": 0.419, - "step": 2850 - }, - { - "epoch": 0.71375, - "grad_norm": 1.3671875, - "learning_rate": 9.445672495944899e-06, - "loss": 0.4542, - "step": 2855 - }, - { - "epoch": 0.715, - "grad_norm": 1.1640625, - "learning_rate": 9.368933591607378e-06, - "loss": 0.4554, - "step": 2860 - }, - { - "epoch": 0.71625, - "grad_norm": 1.3359375, - "learning_rate": 9.292435737650407e-06, - "loss": 0.4158, - "step": 2865 - }, - { - "epoch": 0.7175, - "grad_norm": 0.8671875, - "learning_rate": 9.216180113765558e-06, - "loss": 0.4145, - "step": 2870 - }, - { - "epoch": 0.71875, - "grad_norm": 1.1640625, - "learning_rate": 9.140167895908867e-06, - "loss": 0.4276, - "step": 2875 - }, - { - "epoch": 0.72, - "grad_norm": 1.125, - "learning_rate": 9.064400256282757e-06, - "loss": 0.4477, - "step": 2880 - }, - { - "epoch": 0.72125, - "grad_norm": 1.1796875, - "learning_rate": 8.988878363317979e-06, - "loss": 0.4563, - "step": 2885 - }, - { - "epoch": 0.7225, - "grad_norm": 1.234375, - "learning_rate": 8.913603381655528e-06, - "loss": 0.4396, - "step": 2890 - }, - { - "epoch": 0.72375, - "grad_norm": 1.5390625, - "learning_rate": 8.838576472128756e-06, - "loss": 0.4831, - "step": 2895 - }, - { - "epoch": 0.725, - "grad_norm": 1.03125, - "learning_rate": 8.763798791745411e-06, - "loss": 0.4437, - "step": 2900 - }, - { - "epoch": 0.72625, - "grad_norm": 1.140625, - "learning_rate": 8.689271493669837e-06, - "loss": 0.4639, - "step": 2905 - }, - { - "epoch": 0.7275, - "grad_norm": 1.421875, - "learning_rate": 8.614995727205156e-06, - "loss": 0.4215, - "step": 2910 - }, - { - "epoch": 0.72875, - "grad_norm": 1.28125, - "learning_rate": 8.540972637775572e-06, - "loss": 0.4615, - "step": 2915 - }, - { - "epoch": 0.73, - "grad_norm": 1.0703125, - "learning_rate": 8.467203366908707e-06, - "loss": 0.4043, - "step": 2920 - }, - { - "epoch": 0.73125, - "grad_norm": 1.3046875, - "learning_rate": 8.393689052217966e-06, - "loss": 0.4634, - "step": 2925 - }, - { - "epoch": 0.7325, - "grad_norm": 1.328125, - "learning_rate": 8.320430827385003e-06, - "loss": 0.4411, - "step": 2930 - }, - { - "epoch": 0.73375, - "grad_norm": 1.2265625, - "learning_rate": 8.24742982214231e-06, - "loss": 0.3556, - "step": 2935 - }, - { - "epoch": 0.735, - "grad_norm": 1.0234375, - "learning_rate": 8.174687162255672e-06, - "loss": 0.4456, - "step": 2940 - }, - { - "epoch": 0.73625, - "grad_norm": 0.8984375, - "learning_rate": 8.102203969506886e-06, - "loss": 0.4254, - "step": 2945 - }, - { - "epoch": 0.7375, - "grad_norm": 1.234375, - "learning_rate": 8.029981361676456e-06, - "loss": 0.427, - "step": 2950 - }, - { - "epoch": 0.73875, - "grad_norm": 1.0390625, - "learning_rate": 7.958020452526346e-06, - "loss": 0.4323, - "step": 2955 - }, - { - "epoch": 0.74, - "grad_norm": 1.125, - "learning_rate": 7.886322351782783e-06, - "loss": 0.3968, - "step": 2960 - }, - { - "epoch": 0.74125, - "grad_norm": 1.0234375, - "learning_rate": 7.814888165119186e-06, - "loss": 0.4628, - "step": 2965 - }, - { - "epoch": 0.7425, - "grad_norm": 1.0078125, - "learning_rate": 7.743718994139071e-06, - "loss": 0.4388, - "step": 2970 - }, - { - "epoch": 0.74375, - "grad_norm": 1.34375, - "learning_rate": 7.672815936359107e-06, - "loss": 0.4029, - "step": 2975 - }, - { - "epoch": 0.745, - "grad_norm": 1.0703125, - "learning_rate": 7.602180085192143e-06, - "loss": 0.4214, - "step": 2980 - }, - { - "epoch": 0.74625, - "grad_norm": 1.4296875, - "learning_rate": 7.531812529930398e-06, - "loss": 0.4165, - "step": 2985 - }, - { - "epoch": 0.7475, - "grad_norm": 1.1171875, - "learning_rate": 7.461714355728608e-06, - "loss": 0.4016, - "step": 2990 - }, - { - "epoch": 0.74875, - "grad_norm": 1.203125, - "learning_rate": 7.391886643587342e-06, - "loss": 0.4527, - "step": 2995 - }, - { - "epoch": 0.75, - "grad_norm": 1.15625, - "learning_rate": 7.3223304703363135e-06, - "loss": 0.4143, - "step": 3000 - }, - { - "epoch": 0.75125, - "grad_norm": 1.328125, - "learning_rate": 7.253046908617747e-06, - "loss": 0.4667, - "step": 3005 - }, - { - "epoch": 0.7525, - "grad_norm": 1.2734375, - "learning_rate": 7.184037026869867e-06, - "loss": 0.4032, - "step": 3010 - }, - { - "epoch": 0.75375, - "grad_norm": 1.0859375, - "learning_rate": 7.115301889310427e-06, - "loss": 0.433, - "step": 3015 - }, - { - "epoch": 0.755, - "grad_norm": 1.046875, - "learning_rate": 7.046842555920283e-06, - "loss": 0.4017, - "step": 3020 - }, - { - "epoch": 0.75625, - "grad_norm": 1.0546875, - "learning_rate": 6.9786600824270296e-06, - "loss": 0.4006, - "step": 3025 - }, - { - "epoch": 0.7575, - "grad_norm": 1.3515625, - "learning_rate": 6.91075552028877e-06, - "loss": 0.4536, - "step": 3030 - }, - { - "epoch": 0.75875, - "grad_norm": 1.4609375, - "learning_rate": 6.84312991667784e-06, - "loss": 0.4295, - "step": 3035 - }, - { - "epoch": 0.76, - "grad_norm": 0.9375, - "learning_rate": 6.775784314464717e-06, - "loss": 0.4216, - "step": 3040 - }, - { - "epoch": 0.76125, - "grad_norm": 1.1328125, - "learning_rate": 6.708719752201884e-06, - "loss": 0.4071, - "step": 3045 - }, - { - "epoch": 0.7625, - "grad_norm": 0.87109375, - "learning_rate": 6.641937264107867e-06, - "loss": 0.4518, - "step": 3050 - }, - { - "epoch": 0.76375, - "grad_norm": 1.453125, - "learning_rate": 6.575437880051233e-06, - "loss": 0.4776, - "step": 3055 - }, - { - "epoch": 0.765, - "grad_norm": 1.1796875, - "learning_rate": 6.509222625534755e-06, - "loss": 0.4084, - "step": 3060 - }, - { - "epoch": 0.76625, - "grad_norm": 1.4609375, - "learning_rate": 6.443292521679578e-06, - "loss": 0.4825, - "step": 3065 - }, - { - "epoch": 0.7675, - "grad_norm": 1.171875, - "learning_rate": 6.377648585209456e-06, - "loss": 0.4788, - "step": 3070 - }, - { - "epoch": 0.76875, - "grad_norm": 1.1953125, - "learning_rate": 6.312291828435077e-06, - "loss": 0.4077, - "step": 3075 - }, - { - "epoch": 0.77, - "grad_norm": 0.98046875, - "learning_rate": 6.247223259238511e-06, - "loss": 0.4103, - "step": 3080 - }, - { - "epoch": 0.77125, - "grad_norm": 1.21875, - "learning_rate": 6.182443881057576e-06, - "loss": 0.4401, - "step": 3085 - }, - { - "epoch": 0.7725, - "grad_norm": 1.3984375, - "learning_rate": 6.117954692870412e-06, - "loss": 0.4628, - "step": 3090 - }, - { - "epoch": 0.77375, - "grad_norm": 1.15625, - "learning_rate": 6.053756689180082e-06, - "loss": 0.3789, - "step": 3095 - }, - { - "epoch": 0.775, - "grad_norm": 0.9609375, - "learning_rate": 5.989850859999227e-06, - "loss": 0.4261, - "step": 3100 - }, - { - "epoch": 0.77625, - "grad_norm": 1.3359375, - "learning_rate": 5.926238190834779e-06, - "loss": 0.4548, - "step": 3105 - }, - { - "epoch": 0.7775, - "grad_norm": 1.125, - "learning_rate": 5.8629196626728e-06, - "loss": 0.4496, - "step": 3110 - }, - { - "epoch": 0.77875, - "grad_norm": 0.984375, - "learning_rate": 5.7998962519633045e-06, - "loss": 0.3764, - "step": 3115 - }, - { - "epoch": 0.78, - "grad_norm": 1.6015625, - "learning_rate": 5.737168930605272e-06, - "loss": 0.3888, - "step": 3120 - }, - { - "epoch": 0.78125, - "grad_norm": 1.015625, - "learning_rate": 5.674738665931575e-06, - "loss": 0.4209, - "step": 3125 - }, - { - "epoch": 0.7825, - "grad_norm": 1.3203125, - "learning_rate": 5.612606420694141e-06, - "loss": 0.4727, - "step": 3130 - }, - { - "epoch": 0.78375, - "grad_norm": 1.0546875, - "learning_rate": 5.550773153049046e-06, - "loss": 0.4365, - "step": 3135 - }, - { - "epoch": 0.785, - "grad_norm": 1.2421875, - "learning_rate": 5.489239816541755e-06, - "loss": 0.4403, - "step": 3140 - }, - { - "epoch": 0.78625, - "grad_norm": 1.390625, - "learning_rate": 5.428007360092463e-06, - "loss": 0.4521, - "step": 3145 - }, - { - "epoch": 0.7875, - "grad_norm": 1.25, - "learning_rate": 5.367076727981382e-06, - "loss": 0.4657, - "step": 3150 - }, - { - "epoch": 0.78875, - "grad_norm": 1.015625, - "learning_rate": 5.306448859834228e-06, - "loss": 0.4367, - "step": 3155 - }, - { - "epoch": 0.79, - "grad_norm": 1.1015625, - "learning_rate": 5.24612469060774e-06, - "loss": 0.4053, - "step": 3160 - }, - { - "epoch": 0.79125, - "grad_norm": 1.515625, - "learning_rate": 5.186105150575232e-06, - "loss": 0.3926, - "step": 3165 - }, - { - "epoch": 0.7925, - "grad_norm": 1.34375, - "learning_rate": 5.12639116531225e-06, - "loss": 0.4534, - "step": 3170 - }, - { - "epoch": 0.79375, - "grad_norm": 1.3125, - "learning_rate": 5.066983655682325e-06, - "loss": 0.4551, - "step": 3175 - }, - { - "epoch": 0.795, - "grad_norm": 1.375, - "learning_rate": 5.007883537822736e-06, - "loss": 0.4066, - "step": 3180 - }, - { - "epoch": 0.79625, - "grad_norm": 0.75, - "learning_rate": 4.949091723130425e-06, - "loss": 0.4247, - "step": 3185 - }, - { - "epoch": 0.7975, - "grad_norm": 1.6640625, - "learning_rate": 4.890609118247888e-06, - "loss": 0.4215, - "step": 3190 - }, - { - "epoch": 0.79875, - "grad_norm": 1.4296875, - "learning_rate": 4.832436625049256e-06, - "loss": 0.4385, - "step": 3195 - }, - { - "epoch": 0.8, - "grad_norm": 0.98046875, - "learning_rate": 4.7745751406263165e-06, - "loss": 0.4393, - "step": 3200 - }, - { - "epoch": 0.80125, - "grad_norm": 0.9140625, - "learning_rate": 4.717025557274749e-06, - "loss": 0.42, - "step": 3205 - }, - { - "epoch": 0.8025, - "grad_norm": 0.90234375, - "learning_rate": 4.659788762480327e-06, - "loss": 0.3758, - "step": 3210 - }, - { - "epoch": 0.80375, - "grad_norm": 1.1640625, - "learning_rate": 4.602865638905224e-06, - "loss": 0.4448, - "step": 3215 - }, - { - "epoch": 0.805, - "grad_norm": 1.078125, - "learning_rate": 4.54625706437441e-06, - "loss": 0.4453, - "step": 3220 - }, - { - "epoch": 0.80625, - "grad_norm": 1.3125, - "learning_rate": 4.48996391186216e-06, - "loss": 0.4359, - "step": 3225 - }, - { - "epoch": 0.8075, - "grad_norm": 1.3515625, - "learning_rate": 4.433987049478508e-06, - "loss": 0.3974, - "step": 3230 - }, - { - "epoch": 0.80875, - "grad_norm": 1.125, - "learning_rate": 4.378327340455915e-06, - "loss": 0.4194, - "step": 3235 - }, - { - "epoch": 0.81, - "grad_norm": 0.87890625, - "learning_rate": 4.322985643135952e-06, - "loss": 0.4214, - "step": 3240 - }, - { - "epoch": 0.81125, - "grad_norm": 1.1640625, - "learning_rate": 4.267962810956061e-06, - "loss": 0.3592, - "step": 3245 - }, - { - "epoch": 0.8125, - "grad_norm": 1.2421875, - "learning_rate": 4.213259692436367e-06, - "loss": 0.3997, - "step": 3250 - }, - { - "epoch": 0.81375, - "grad_norm": 1.484375, - "learning_rate": 4.158877131166641e-06, - "loss": 0.4471, - "step": 3255 - }, - { - "epoch": 0.815, - "grad_norm": 0.953125, - "learning_rate": 4.104815965793249e-06, - "loss": 0.4293, - "step": 3260 - }, - { - "epoch": 0.81625, - "grad_norm": 1.203125, - "learning_rate": 4.051077030006228e-06, - "loss": 0.4562, - "step": 3265 - }, - { - "epoch": 0.8175, - "grad_norm": 1.0703125, - "learning_rate": 3.9976611525264525e-06, - "loss": 0.434, - "step": 3270 - }, - { - "epoch": 0.81875, - "grad_norm": 0.8671875, - "learning_rate": 3.944569157092839e-06, - "loss": 0.4524, - "step": 3275 - }, - { - "epoch": 0.82, - "grad_norm": 1.296875, - "learning_rate": 3.891801862449629e-06, - "loss": 0.4498, - "step": 3280 - }, - { - "epoch": 0.82125, - "grad_norm": 1.0625, - "learning_rate": 3.839360082333771e-06, - "loss": 0.4329, - "step": 3285 - }, - { - "epoch": 0.8225, - "grad_norm": 1.375, - "learning_rate": 3.7872446254624104e-06, - "loss": 0.3884, - "step": 3290 - }, - { - "epoch": 0.82375, - "grad_norm": 1.046875, - "learning_rate": 3.735456295520348e-06, - "loss": 0.4114, - "step": 3295 - }, - { - "epoch": 0.825, - "grad_norm": 1.3125, - "learning_rate": 3.6839958911476957e-06, - "loss": 0.4404, - "step": 3300 - }, - { - "epoch": 0.82625, - "grad_norm": 1.171875, - "learning_rate": 3.6328642059275524e-06, - "loss": 0.4548, - "step": 3305 - }, - { - "epoch": 0.8275, - "grad_norm": 0.92578125, - "learning_rate": 3.5820620283737616e-06, - "loss": 0.4648, - "step": 3310 - }, - { - "epoch": 0.82875, - "grad_norm": 0.875, - "learning_rate": 3.5315901419187363e-06, - "loss": 0.4233, - "step": 3315 - }, - { - "epoch": 0.83, - "grad_norm": 0.953125, - "learning_rate": 3.4814493249014116e-06, - "loss": 0.4021, - "step": 3320 - }, - { - "epoch": 0.83125, - "grad_norm": 1.0546875, - "learning_rate": 3.431640350555204e-06, - "loss": 0.4732, - "step": 3325 - }, - { - "epoch": 0.8325, - "grad_norm": 1.0546875, - "learning_rate": 3.382163986996126e-06, - "loss": 0.4174, - "step": 3330 - }, - { - "epoch": 0.83375, - "grad_norm": 1.3984375, - "learning_rate": 3.3330209972108976e-06, - "loss": 0.4284, - "step": 3335 - }, - { - "epoch": 0.835, - "grad_norm": 1.109375, - "learning_rate": 3.284212139045223e-06, - "loss": 0.4183, - "step": 3340 - }, - { - "epoch": 0.83625, - "grad_norm": 1.390625, - "learning_rate": 3.2357381651920648e-06, - "loss": 0.3996, - "step": 3345 - }, - { - "epoch": 0.8375, - "grad_norm": 1.2578125, - "learning_rate": 3.187599823180071e-06, - "loss": 0.4317, - "step": 3350 - }, - { - "epoch": 0.83875, - "grad_norm": 1.21875, - "learning_rate": 3.139797855362031e-06, - "loss": 0.4341, - "step": 3355 - }, - { - "epoch": 0.84, - "grad_norm": 1.59375, - "learning_rate": 3.092332998903416e-06, - "loss": 0.4516, - "step": 3360 - }, - { - "epoch": 0.84125, - "grad_norm": 1.0703125, - "learning_rate": 3.0452059857710186e-06, - "loss": 0.4371, - "step": 3365 - }, - { - "epoch": 0.8425, - "grad_norm": 0.9921875, - "learning_rate": 2.9984175427217016e-06, - "loss": 0.4346, - "step": 3370 - }, - { - "epoch": 0.84375, - "grad_norm": 0.96875, - "learning_rate": 2.9519683912911266e-06, - "loss": 0.3893, - "step": 3375 - }, - { - "epoch": 0.845, - "grad_norm": 1.046875, - "learning_rate": 2.9058592477826636e-06, - "loss": 0.4086, - "step": 3380 - }, - { - "epoch": 0.84625, - "grad_norm": 0.9375, - "learning_rate": 2.860090823256359e-06, - "loss": 0.4211, - "step": 3385 - }, - { - "epoch": 0.8475, - "grad_norm": 1.296875, - "learning_rate": 2.8146638235179213e-06, - "loss": 0.422, - "step": 3390 - }, - { - "epoch": 0.84875, - "grad_norm": 1.0234375, - "learning_rate": 2.769578949107893e-06, - "loss": 0.4117, - "step": 3395 - }, - { - "epoch": 0.85, - "grad_norm": 0.90625, - "learning_rate": 2.7248368952908053e-06, - "loss": 0.3836, - "step": 3400 - }, - { - "epoch": 0.85125, - "grad_norm": 1.046875, - "learning_rate": 2.6804383520444815e-06, - "loss": 0.3996, - "step": 3405 - }, - { - "epoch": 0.8525, - "grad_norm": 1.234375, - "learning_rate": 2.6363840040493747e-06, - "loss": 0.4007, - "step": 3410 - }, - { - "epoch": 0.85375, - "grad_norm": 1.0625, - "learning_rate": 2.5926745306780324e-06, - "loss": 0.4431, - "step": 3415 - }, - { - "epoch": 0.855, - "grad_norm": 1.09375, - "learning_rate": 2.5493106059846116e-06, - "loss": 0.4013, - "step": 3420 - }, - { - "epoch": 0.85625, - "grad_norm": 1.3125, - "learning_rate": 2.506292898694468e-06, - "loss": 0.4748, - "step": 3425 - }, - { - "epoch": 0.8575, - "grad_norm": 1.3828125, - "learning_rate": 2.4636220721938554e-06, - "loss": 0.4454, - "step": 3430 - }, - { - "epoch": 0.85875, - "grad_norm": 1.546875, - "learning_rate": 2.421298784519724e-06, - "loss": 0.3844, - "step": 3435 - }, - { - "epoch": 0.86, - "grad_norm": 1.296875, - "learning_rate": 2.379323688349516e-06, - "loss": 0.4772, - "step": 3440 - }, - { - "epoch": 0.86125, - "grad_norm": 1.09375, - "learning_rate": 2.3376974309911343e-06, - "loss": 0.4668, - "step": 3445 - }, - { - "epoch": 0.8625, - "grad_norm": 1.359375, - "learning_rate": 2.296420654372966e-06, - "loss": 0.4191, - "step": 3450 - }, - { - "epoch": 0.86375, - "grad_norm": 0.8828125, - "learning_rate": 2.2554939950339747e-06, - "loss": 0.3971, - "step": 3455 - }, - { - "epoch": 0.865, - "grad_norm": 1.296875, - "learning_rate": 2.2149180841138676e-06, - "loss": 0.4282, - "step": 3460 - }, - { - "epoch": 0.86625, - "grad_norm": 1.0, - "learning_rate": 2.1746935473433928e-06, - "loss": 0.4406, - "step": 3465 - }, - { - "epoch": 0.8675, - "grad_norm": 1.3828125, - "learning_rate": 2.1348210050346595e-06, - "loss": 0.3914, - "step": 3470 - }, - { - "epoch": 0.86875, - "grad_norm": 1.1015625, - "learning_rate": 2.0953010720716037e-06, - "loss": 0.3676, - "step": 3475 - }, - { - "epoch": 0.87, - "grad_norm": 1.234375, - "learning_rate": 2.0561343579004715e-06, - "loss": 0.3973, - "step": 3480 - }, - { - "epoch": 0.87125, - "grad_norm": 1.4375, - "learning_rate": 2.0173214665204555e-06, - "loss": 0.4067, - "step": 3485 - }, - { - "epoch": 0.8725, - "grad_norm": 1.2109375, - "learning_rate": 1.9788629964743455e-06, - "loss": 0.4279, - "step": 3490 - }, - { - "epoch": 0.87375, - "grad_norm": 1.3359375, - "learning_rate": 1.940759540839329e-06, - "loss": 0.4449, - "step": 3495 - }, - { - "epoch": 0.875, - "grad_norm": 1.2890625, - "learning_rate": 1.9030116872178316e-06, - "loss": 0.4364, - "step": 3500 - }, - { - "epoch": 0.87625, - "grad_norm": 1.1015625, - "learning_rate": 1.8656200177284505e-06, - "loss": 0.3991, - "step": 3505 - }, - { - "epoch": 0.8775, - "grad_norm": 1.109375, - "learning_rate": 1.8285851089969802e-06, - "loss": 0.3922, - "step": 3510 - }, - { - "epoch": 0.87875, - "grad_norm": 1.015625, - "learning_rate": 1.7919075321475325e-06, - "loss": 0.4559, - "step": 3515 - }, - { - "epoch": 0.88, - "grad_norm": 1.15625, - "learning_rate": 1.7555878527937164e-06, - "loss": 0.3947, - "step": 3520 - }, - { - "epoch": 0.88125, - "grad_norm": 0.98828125, - "learning_rate": 1.7196266310299108e-06, - "loss": 0.3853, - "step": 3525 - }, - { - "epoch": 0.8825, - "grad_norm": 1.1015625, - "learning_rate": 1.6840244214226502e-06, - "loss": 0.4429, - "step": 3530 - }, - { - "epoch": 0.88375, - "grad_norm": 1.2109375, - "learning_rate": 1.6487817730020365e-06, - "loss": 0.4092, - "step": 3535 - }, - { - "epoch": 0.885, - "grad_norm": 1.046875, - "learning_rate": 1.6138992292533183e-06, - "loss": 0.4348, - "step": 3540 - }, - { - "epoch": 0.88625, - "grad_norm": 0.92578125, - "learning_rate": 1.579377328108464e-06, - "loss": 0.4362, - "step": 3545 - }, - { - "epoch": 0.8875, - "grad_norm": 1.3359375, - "learning_rate": 1.5452166019378989e-06, - "loss": 0.431, - "step": 3550 - }, - { - "epoch": 0.88875, - "grad_norm": 1.0703125, - "learning_rate": 1.5114175775422762e-06, - "loss": 0.4164, - "step": 3555 - }, - { - "epoch": 0.89, - "grad_norm": 1.296875, - "learning_rate": 1.4779807761443636e-06, - "loss": 0.4154, - "step": 3560 - }, - { - "epoch": 0.89125, - "grad_norm": 1.3046875, - "learning_rate": 1.4449067133810056e-06, - "loss": 0.4108, - "step": 3565 - }, - { - "epoch": 0.8925, - "grad_norm": 1.1796875, - "learning_rate": 1.4121958992951629e-06, - "loss": 0.4024, - "step": 3570 - }, - { - "epoch": 0.89375, - "grad_norm": 1.0703125, - "learning_rate": 1.379848838328049e-06, - "loss": 0.4041, - "step": 3575 - }, - { - "epoch": 0.895, - "grad_norm": 1.2109375, - "learning_rate": 1.3478660293113676e-06, - "loss": 0.4291, - "step": 3580 - }, - { - "epoch": 0.89625, - "grad_norm": 1.0703125, - "learning_rate": 1.3162479654595938e-06, - "loss": 0.4561, - "step": 3585 - }, - { - "epoch": 0.8975, - "grad_norm": 2.03125, - "learning_rate": 1.284995134362385e-06, - "loss": 0.4599, - "step": 3590 - }, - { - "epoch": 0.89875, - "grad_norm": 0.92578125, - "learning_rate": 1.2541080179770571e-06, - "loss": 0.371, - "step": 3595 - }, - { - "epoch": 0.9, - "grad_norm": 1.1171875, - "learning_rate": 1.2235870926211619e-06, - "loss": 0.428, - "step": 3600 - }, - { - "epoch": 0.90125, - "grad_norm": 1.1796875, - "learning_rate": 1.193432828965113e-06, - "loss": 0.4093, - "step": 3605 - }, - { - "epoch": 0.9025, - "grad_norm": 1.0078125, - "learning_rate": 1.16364569202497e-06, - "loss": 0.4123, - "step": 3610 - }, - { - "epoch": 0.90375, - "grad_norm": 1.0625, - "learning_rate": 1.134226141155223e-06, - "loss": 0.4212, - "step": 3615 - }, - { - "epoch": 0.905, - "grad_norm": 0.99609375, - "learning_rate": 1.105174630041747e-06, - "loss": 0.4379, - "step": 3620 - }, - { - "epoch": 0.90625, - "grad_norm": 1.28125, - "learning_rate": 1.0764916066947794e-06, - "loss": 0.4788, - "step": 3625 - }, - { - "epoch": 0.9075, - "grad_norm": 1.1875, - "learning_rate": 1.0481775134420225e-06, - "loss": 0.4523, - "step": 3630 - }, - { - "epoch": 0.90875, - "grad_norm": 1.5078125, - "learning_rate": 1.020232786921821e-06, - "loss": 0.4141, - "step": 3635 - }, - { - "epoch": 0.91, - "grad_norm": 1.0078125, - "learning_rate": 9.926578580764234e-07, - "loss": 0.3628, - "step": 3640 - }, - { - "epoch": 0.91125, - "grad_norm": 1.109375, - "learning_rate": 9.654531521453513e-07, - "loss": 0.4142, - "step": 3645 - }, - { - "epoch": 0.9125, - "grad_norm": 1.3046875, - "learning_rate": 9.386190886588208e-07, - "loss": 0.4348, - "step": 3650 - }, - { - "epoch": 0.91375, - "grad_norm": 1.6796875, - "learning_rate": 9.121560814312813e-07, - "loss": 0.4138, - "step": 3655 - }, - { - "epoch": 0.915, - "grad_norm": 1.078125, - "learning_rate": 8.860645385550481e-07, - "loss": 0.4332, - "step": 3660 - }, - { - "epoch": 0.91625, - "grad_norm": 1.1484375, - "learning_rate": 8.603448623939858e-07, - "loss": 0.4577, - "step": 3665 - }, - { - "epoch": 0.9175, - "grad_norm": 1.1484375, - "learning_rate": 8.349974495773183e-07, - "loss": 0.4456, - "step": 3670 - }, - { - "epoch": 0.91875, - "grad_norm": 1.28125, - "learning_rate": 8.10022690993506e-07, - "loss": 0.3946, - "step": 3675 - }, - { - "epoch": 0.92, - "grad_norm": 1.3203125, - "learning_rate": 7.854209717842231e-07, - "loss": 0.478, - "step": 3680 - }, - { - "epoch": 0.92125, - "grad_norm": 1.1171875, - "learning_rate": 7.611926713384121e-07, - "loss": 0.3592, - "step": 3685 - }, - { - "epoch": 0.9225, - "grad_norm": 1.234375, - "learning_rate": 7.373381632864384e-07, - "loss": 0.4425, - "step": 3690 - }, - { - "epoch": 0.92375, - "grad_norm": 1.1796875, - "learning_rate": 7.138578154943288e-07, - "loss": 0.4219, - "step": 3695 - }, - { - "epoch": 0.925, - "grad_norm": 1.3125, - "learning_rate": 6.907519900580861e-07, - "loss": 0.4419, - "step": 3700 - }, - { - "epoch": 0.92625, - "grad_norm": 1.1796875, - "learning_rate": 6.680210432981254e-07, - "loss": 0.3983, - "step": 3705 - }, - { - "epoch": 0.9275, - "grad_norm": 1.046875, - "learning_rate": 6.456653257537665e-07, - "loss": 0.4417, - "step": 3710 - }, - { - "epoch": 0.92875, - "grad_norm": 1.03125, - "learning_rate": 6.2368518217783e-07, - "loss": 0.4469, - "step": 3715 - }, - { - "epoch": 0.93, - "grad_norm": 1.0234375, - "learning_rate": 6.020809515313142e-07, - "loss": 0.435, - "step": 3720 - }, - { - "epoch": 0.93125, - "grad_norm": 1.6875, - "learning_rate": 5.808529669781904e-07, - "loss": 0.3856, - "step": 3725 - }, - { - "epoch": 0.9325, - "grad_norm": 1.1875, - "learning_rate": 5.600015558802352e-07, - "loss": 0.4587, - "step": 3730 - }, - { - "epoch": 0.93375, - "grad_norm": 1.203125, - "learning_rate": 5.39527039792001e-07, - "loss": 0.4325, - "step": 3735 - }, - { - "epoch": 0.935, - "grad_norm": 1.484375, - "learning_rate": 5.194297344558536e-07, - "loss": 0.4166, - "step": 3740 - }, - { - "epoch": 0.93625, - "grad_norm": 1.2890625, - "learning_rate": 4.997099497971114e-07, - "loss": 0.4347, - "step": 3745 - }, - { - "epoch": 0.9375, - "grad_norm": 0.95703125, - "learning_rate": 4.803679899192392e-07, - "loss": 0.4252, - "step": 3750 - }, - { - "epoch": 0.93875, - "grad_norm": 1.1875, - "learning_rate": 4.614041530991903e-07, - "loss": 0.4036, - "step": 3755 - }, - { - "epoch": 0.94, - "grad_norm": 1.375, - "learning_rate": 4.4281873178278475e-07, - "loss": 0.4359, - "step": 3760 - }, - { - "epoch": 0.94125, - "grad_norm": 1.359375, - "learning_rate": 4.246120125802111e-07, - "loss": 0.4566, - "step": 3765 - }, - { - "epoch": 0.9425, - "grad_norm": 1.0078125, - "learning_rate": 4.067842762616014e-07, - "loss": 0.4226, - "step": 3770 - }, - { - "epoch": 0.94375, - "grad_norm": 1.1953125, - "learning_rate": 3.8933579775271013e-07, - "loss": 0.3903, - "step": 3775 - }, - { - "epoch": 0.945, - "grad_norm": 1.125, - "learning_rate": 3.7226684613065333e-07, - "loss": 0.4119, - "step": 3780 - }, - { - "epoch": 0.94625, - "grad_norm": 1.3515625, - "learning_rate": 3.555776846197817e-07, - "loss": 0.4268, - "step": 3785 - }, - { - "epoch": 0.9475, - "grad_norm": 0.9765625, - "learning_rate": 3.3926857058761417e-07, - "loss": 0.405, - "step": 3790 - }, - { - "epoch": 0.94875, - "grad_norm": 1.2734375, - "learning_rate": 3.233397555408607e-07, - "loss": 0.4161, - "step": 3795 - }, - { - "epoch": 0.95, - "grad_norm": 1.2578125, - "learning_rate": 3.077914851215585e-07, - "loss": 0.4293, - "step": 3800 - }, - { - "epoch": 0.95125, - "grad_norm": 1.3671875, - "learning_rate": 2.92623999103267e-07, - "loss": 0.4066, - "step": 3805 - }, - { - "epoch": 0.9525, - "grad_norm": 1.0859375, - "learning_rate": 2.778375313873871e-07, - "loss": 0.4309, - "step": 3810 - }, - { - "epoch": 0.95375, - "grad_norm": 1.2421875, - "learning_rate": 2.634323099995395e-07, - "loss": 0.4623, - "step": 3815 - }, - { - "epoch": 0.955, - "grad_norm": 1.6015625, - "learning_rate": 2.494085570860616e-07, - "loss": 0.3977, - "step": 3820 - }, - { - "epoch": 0.95625, - "grad_norm": 1.28125, - "learning_rate": 2.3576648891056875e-07, - "loss": 0.4135, - "step": 3825 - }, - { - "epoch": 0.9575, - "grad_norm": 1.1640625, - "learning_rate": 2.2250631585063186e-07, - "loss": 0.3874, - "step": 3830 - }, - { - "epoch": 0.95875, - "grad_norm": 1.21875, - "learning_rate": 2.0962824239451894e-07, - "loss": 0.4494, - "step": 3835 - }, - { - "epoch": 0.96, - "grad_norm": 1.1328125, - "learning_rate": 1.9713246713805588e-07, - "loss": 0.3946, - "step": 3840 - }, - { - "epoch": 0.96125, - "grad_norm": 0.85546875, - "learning_rate": 1.8501918278155393e-07, - "loss": 0.4312, - "step": 3845 - }, - { - "epoch": 0.9625, - "grad_norm": 1.359375, - "learning_rate": 1.732885761268427e-07, - "loss": 0.4453, - "step": 3850 - }, - { - "epoch": 0.96375, - "grad_norm": 0.984375, - "learning_rate": 1.619408280743917e-07, - "loss": 0.3943, - "step": 3855 - }, - { - "epoch": 0.965, - "grad_norm": 0.90234375, - "learning_rate": 1.509761136205101e-07, - "loss": 0.4144, - "step": 3860 - }, - { - "epoch": 0.96625, - "grad_norm": 1.046875, - "learning_rate": 1.4039460185465703e-07, - "loss": 0.4026, - "step": 3865 - }, - { - "epoch": 0.9675, - "grad_norm": 1.0625, - "learning_rate": 1.3019645595683806e-07, - "loss": 0.432, - "step": 3870 - }, - { - "epoch": 0.96875, - "grad_norm": 1.2890625, - "learning_rate": 1.2038183319507955e-07, - "loss": 0.4384, - "step": 3875 - }, - { - "epoch": 0.97, - "grad_norm": 1.328125, - "learning_rate": 1.109508849230001e-07, - "loss": 0.4622, - "step": 3880 - }, - { - "epoch": 0.97125, - "grad_norm": 1.265625, - "learning_rate": 1.0190375657749274e-07, - "loss": 0.4136, - "step": 3885 - }, - { - "epoch": 0.9725, - "grad_norm": 1.359375, - "learning_rate": 9.324058767646859e-08, - "loss": 0.4417, - "step": 3890 - }, - { - "epoch": 0.97375, - "grad_norm": 1.0625, - "learning_rate": 8.496151181670852e-08, - "loss": 0.4316, - "step": 3895 - }, - { - "epoch": 0.975, - "grad_norm": 0.8515625, - "learning_rate": 7.706665667180091e-08, - "loss": 0.4246, - "step": 3900 - }, - { - "epoch": 0.97625, - "grad_norm": 1.3515625, - "learning_rate": 6.955614399018206e-08, - "loss": 0.4481, - "step": 3905 - }, - { - "epoch": 0.9775, - "grad_norm": 1.1171875, - "learning_rate": 6.243008959324892e-08, - "loss": 0.4544, - "step": 3910 - }, - { - "epoch": 0.97875, - "grad_norm": 1.3671875, - "learning_rate": 5.568860337357151e-08, - "loss": 0.4005, - "step": 3915 - }, - { - "epoch": 0.98, - "grad_norm": 1.265625, - "learning_rate": 4.9331789293211026e-08, - "loss": 0.4955, - "step": 3920 - }, - { - "epoch": 0.98125, - "grad_norm": 1.5546875, - "learning_rate": 4.335974538210441e-08, - "loss": 0.4102, - "step": 3925 - }, - { - "epoch": 0.9825, - "grad_norm": 1.2421875, - "learning_rate": 3.7772563736551694e-08, - "loss": 0.4542, - "step": 3930 - }, - { - "epoch": 0.98375, - "grad_norm": 1.2421875, - "learning_rate": 3.2570330517811555e-08, - "loss": 0.4691, - "step": 3935 - }, - { - "epoch": 0.985, - "grad_norm": 0.9140625, - "learning_rate": 2.7753125950752413e-08, - "loss": 0.4155, - "step": 3940 - }, - { - "epoch": 0.98625, - "grad_norm": 1.3125, - "learning_rate": 2.3321024322625617e-08, - "loss": 0.4268, - "step": 3945 - }, - { - "epoch": 0.9875, - "grad_norm": 1.0546875, - "learning_rate": 1.9274093981927478e-08, - "loss": 0.4098, - "step": 3950 - }, - { - "epoch": 0.98875, - "grad_norm": 1.171875, - "learning_rate": 1.5612397337325113e-08, - "loss": 0.3965, - "step": 3955 - }, - { - "epoch": 0.99, - "grad_norm": 1.2421875, - "learning_rate": 1.233599085671e-08, - "loss": 0.4047, - "step": 3960 - }, - { - "epoch": 0.99125, - "grad_norm": 1.3125, - "learning_rate": 9.444925066329213e-09, - "loss": 0.4358, - "step": 3965 - }, - { - "epoch": 0.9925, - "grad_norm": 0.88671875, - "learning_rate": 6.939244549986068e-09, - "loss": 0.465, - "step": 3970 - }, - { - "epoch": 0.99375, - "grad_norm": 1.109375, - "learning_rate": 4.818987948379539e-09, - "loss": 0.4483, - "step": 3975 - }, - { - "epoch": 0.995, - "grad_norm": 1.46875, - "learning_rate": 3.0841879584853073e-09, - "loss": 0.4986, - "step": 3980 - }, - { - "epoch": 0.99625, - "grad_norm": 1.2734375, - "learning_rate": 1.7348713330672671e-09, - "loss": 0.4121, - "step": 3985 - }, - { - "epoch": 0.9975, - "grad_norm": 1.078125, - "learning_rate": 7.710588802584129e-10, - "loss": 0.3764, - "step": 3990 - }, - { - "epoch": 0.99875, - "grad_norm": 1.125, - "learning_rate": 1.9276546323609978e-10, - "loss": 0.4443, - "step": 3995 - }, - { - "epoch": 1.0, - "grad_norm": 0.8515625, - "learning_rate": 0.0, - "loss": 0.4357, - "step": 4000 - } - ], - "logging_steps": 5, - "max_steps": 4000, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 400, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 3.184022656018907e+18, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}