{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.498581560283688, "eval_steps": 500, "global_step": 10573, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0070921985815602835, "grad_norm": 5.856144428253174, "learning_rate": 3.780718336483932e-06, "loss": 0.8655, "step": 10 }, { "epoch": 0.014184397163120567, "grad_norm": 5.8862433433532715, "learning_rate": 7.561436672967864e-06, "loss": 0.7361, "step": 20 }, { "epoch": 0.02127659574468085, "grad_norm": 3.5518908500671387, "learning_rate": 1.1342155009451797e-05, "loss": 0.551, "step": 30 }, { "epoch": 0.028368794326241134, "grad_norm": 3.55409836769104, "learning_rate": 1.5122873345935728e-05, "loss": 0.3709, "step": 40 }, { "epoch": 0.03546099290780142, "grad_norm": 2.7126119136810303, "learning_rate": 1.890359168241966e-05, "loss": 0.3237, "step": 50 }, { "epoch": 0.0425531914893617, "grad_norm": 2.9090919494628906, "learning_rate": 2.2684310018903593e-05, "loss": 0.2899, "step": 60 }, { "epoch": 0.04964539007092199, "grad_norm": 1.7755730152130127, "learning_rate": 2.646502835538752e-05, "loss": 0.1869, "step": 70 }, { "epoch": 0.05673758865248227, "grad_norm": 1.6960084438323975, "learning_rate": 3.0245746691871456e-05, "loss": 0.1864, "step": 80 }, { "epoch": 0.06382978723404255, "grad_norm": 1.7639371156692505, "learning_rate": 3.4026465028355385e-05, "loss": 0.1408, "step": 90 }, { "epoch": 0.07092198581560284, "grad_norm": 2.5512263774871826, "learning_rate": 3.780718336483932e-05, "loss": 0.1363, "step": 100 }, { "epoch": 0.07801418439716312, "grad_norm": 1.9627578258514404, "learning_rate": 4.158790170132325e-05, "loss": 0.129, "step": 110 }, { "epoch": 0.0851063829787234, "grad_norm": 0.9527886509895325, "learning_rate": 4.5368620037807186e-05, "loss": 0.1181, "step": 120 }, { "epoch": 0.09219858156028368, "grad_norm": 3.0496978759765625, "learning_rate": 4.914933837429112e-05, "loss": 0.0903, "step": 130 }, { "epoch": 0.09929078014184398, "grad_norm": 1.1979912519454956, "learning_rate": 5.293005671077504e-05, "loss": 0.0906, "step": 140 }, { "epoch": 0.10638297872340426, "grad_norm": 1.2007324695587158, "learning_rate": 5.671077504725898e-05, "loss": 0.0997, "step": 150 }, { "epoch": 0.11347517730496454, "grad_norm": 2.112391948699951, "learning_rate": 6.049149338374291e-05, "loss": 0.0903, "step": 160 }, { "epoch": 0.12056737588652482, "grad_norm": 1.144476056098938, "learning_rate": 6.427221172022685e-05, "loss": 0.0853, "step": 170 }, { "epoch": 0.1276595744680851, "grad_norm": 0.9079101085662842, "learning_rate": 6.805293005671077e-05, "loss": 0.0813, "step": 180 }, { "epoch": 0.1347517730496454, "grad_norm": 2.0803258419036865, "learning_rate": 7.183364839319471e-05, "loss": 0.0929, "step": 190 }, { "epoch": 0.14184397163120568, "grad_norm": 1.2291367053985596, "learning_rate": 7.561436672967865e-05, "loss": 0.0875, "step": 200 }, { "epoch": 0.14893617021276595, "grad_norm": 1.3166685104370117, "learning_rate": 7.939508506616258e-05, "loss": 0.0903, "step": 210 }, { "epoch": 0.15602836879432624, "grad_norm": 1.6771767139434814, "learning_rate": 8.31758034026465e-05, "loss": 0.0837, "step": 220 }, { "epoch": 0.16312056737588654, "grad_norm": 1.185477375984192, "learning_rate": 8.695652173913044e-05, "loss": 0.0774, "step": 230 }, { "epoch": 0.1702127659574468, "grad_norm": 0.8530003428459167, "learning_rate": 9.073724007561437e-05, "loss": 0.0767, "step": 240 }, { "epoch": 0.1773049645390071, "grad_norm": 0.8007742762565613, "learning_rate": 9.45179584120983e-05, "loss": 0.0733, "step": 250 }, { "epoch": 0.18439716312056736, "grad_norm": 0.5508751273155212, "learning_rate": 9.829867674858224e-05, "loss": 0.0853, "step": 260 }, { "epoch": 0.19148936170212766, "grad_norm": 0.8425294756889343, "learning_rate": 0.00010207939508506617, "loss": 0.0725, "step": 270 }, { "epoch": 0.19858156028368795, "grad_norm": 1.2945622205734253, "learning_rate": 0.00010586011342155009, "loss": 0.0879, "step": 280 }, { "epoch": 0.20567375886524822, "grad_norm": 0.6478763818740845, "learning_rate": 0.00010964083175803403, "loss": 0.0553, "step": 290 }, { "epoch": 0.2127659574468085, "grad_norm": 0.9865133166313171, "learning_rate": 0.00011342155009451796, "loss": 0.0793, "step": 300 }, { "epoch": 0.2198581560283688, "grad_norm": 1.046968936920166, "learning_rate": 0.00011720226843100191, "loss": 0.0825, "step": 310 }, { "epoch": 0.22695035460992907, "grad_norm": 0.9418226480484009, "learning_rate": 0.00012098298676748583, "loss": 0.0793, "step": 320 }, { "epoch": 0.23404255319148937, "grad_norm": 1.2901511192321777, "learning_rate": 0.00012476370510396974, "loss": 0.0753, "step": 330 }, { "epoch": 0.24113475177304963, "grad_norm": 1.3087291717529297, "learning_rate": 0.0001285444234404537, "loss": 0.0615, "step": 340 }, { "epoch": 0.24822695035460993, "grad_norm": 0.9991538524627686, "learning_rate": 0.00013232514177693763, "loss": 0.0626, "step": 350 }, { "epoch": 0.2553191489361702, "grad_norm": 0.6831763386726379, "learning_rate": 0.00013610586011342154, "loss": 0.0626, "step": 360 }, { "epoch": 0.2624113475177305, "grad_norm": 0.7626124024391174, "learning_rate": 0.0001398865784499055, "loss": 0.0622, "step": 370 }, { "epoch": 0.2695035460992908, "grad_norm": 0.6531655192375183, "learning_rate": 0.00014366729678638943, "loss": 0.0607, "step": 380 }, { "epoch": 0.2765957446808511, "grad_norm": 0.8742074966430664, "learning_rate": 0.00014744801512287336, "loss": 0.0768, "step": 390 }, { "epoch": 0.28368794326241137, "grad_norm": 0.8710255026817322, "learning_rate": 0.0001512287334593573, "loss": 0.0576, "step": 400 }, { "epoch": 0.2907801418439716, "grad_norm": 0.8089184761047363, "learning_rate": 0.0001550094517958412, "loss": 0.0659, "step": 410 }, { "epoch": 0.2978723404255319, "grad_norm": 1.0766539573669434, "learning_rate": 0.00015879017013232515, "loss": 0.0748, "step": 420 }, { "epoch": 0.3049645390070922, "grad_norm": 0.9432766437530518, "learning_rate": 0.0001625708884688091, "loss": 0.0659, "step": 430 }, { "epoch": 0.3120567375886525, "grad_norm": 0.7996474504470825, "learning_rate": 0.000166351606805293, "loss": 0.0659, "step": 440 }, { "epoch": 0.3191489361702128, "grad_norm": 1.3181546926498413, "learning_rate": 0.00017013232514177695, "loss": 0.0658, "step": 450 }, { "epoch": 0.3262411347517731, "grad_norm": 0.8984364867210388, "learning_rate": 0.00017391304347826088, "loss": 0.0596, "step": 460 }, { "epoch": 0.3333333333333333, "grad_norm": 1.0370538234710693, "learning_rate": 0.0001776937618147448, "loss": 0.066, "step": 470 }, { "epoch": 0.3404255319148936, "grad_norm": 1.0649698972702026, "learning_rate": 0.00018147448015122874, "loss": 0.0597, "step": 480 }, { "epoch": 0.3475177304964539, "grad_norm": 0.5405861735343933, "learning_rate": 0.00018525519848771268, "loss": 0.0603, "step": 490 }, { "epoch": 0.3546099290780142, "grad_norm": 0.8146863579750061, "learning_rate": 0.0001890359168241966, "loss": 0.0524, "step": 500 }, { "epoch": 0.3617021276595745, "grad_norm": 0.6537788510322571, "learning_rate": 0.00019281663516068054, "loss": 0.0564, "step": 510 }, { "epoch": 0.36879432624113473, "grad_norm": 0.8714485764503479, "learning_rate": 0.00019659735349716447, "loss": 0.0525, "step": 520 }, { "epoch": 0.375886524822695, "grad_norm": 0.5386486649513245, "learning_rate": 0.00019999999510833915, "loss": 0.0557, "step": 530 }, { "epoch": 0.3829787234042553, "grad_norm": 0.6375821828842163, "learning_rate": 0.00019999940810961714, "loss": 0.0614, "step": 540 }, { "epoch": 0.3900709219858156, "grad_norm": 0.6789309978485107, "learning_rate": 0.00019999784278530695, "loss": 0.0537, "step": 550 }, { "epoch": 0.3971631205673759, "grad_norm": 0.8208333253860474, "learning_rate": 0.00019999529915072262, "loss": 0.0668, "step": 560 }, { "epoch": 0.40425531914893614, "grad_norm": 0.6849876642227173, "learning_rate": 0.00019999177723074935, "loss": 0.0612, "step": 570 }, { "epoch": 0.41134751773049644, "grad_norm": 0.690582811832428, "learning_rate": 0.00019998727705984316, "loss": 0.0652, "step": 580 }, { "epoch": 0.41843971631205673, "grad_norm": 0.5250919461250305, "learning_rate": 0.00019998179868203068, "loss": 0.0596, "step": 590 }, { "epoch": 0.425531914893617, "grad_norm": 0.6307488679885864, "learning_rate": 0.00019997534215090857, "loss": 0.057, "step": 600 }, { "epoch": 0.4326241134751773, "grad_norm": 0.630332350730896, "learning_rate": 0.00019996790752964305, "loss": 0.066, "step": 610 }, { "epoch": 0.4397163120567376, "grad_norm": 0.40226250886917114, "learning_rate": 0.00019995949489096945, "loss": 0.0555, "step": 620 }, { "epoch": 0.44680851063829785, "grad_norm": 0.5462756752967834, "learning_rate": 0.00019995010431719118, "loss": 0.0507, "step": 630 }, { "epoch": 0.45390070921985815, "grad_norm": 0.5090711116790771, "learning_rate": 0.00019993973590017922, "loss": 0.0458, "step": 640 }, { "epoch": 0.46099290780141844, "grad_norm": 0.47854822874069214, "learning_rate": 0.00019992838974137103, "loss": 0.0459, "step": 650 }, { "epoch": 0.46808510638297873, "grad_norm": 0.7866285443305969, "learning_rate": 0.00019991606595176964, "loss": 0.0585, "step": 660 }, { "epoch": 0.475177304964539, "grad_norm": 0.8126336932182312, "learning_rate": 0.0001999027646519425, "loss": 0.0448, "step": 670 }, { "epoch": 0.48226950354609927, "grad_norm": 0.5472203493118286, "learning_rate": 0.0001998884859720205, "loss": 0.0514, "step": 680 }, { "epoch": 0.48936170212765956, "grad_norm": 0.6094168424606323, "learning_rate": 0.00019987323005169638, "loss": 0.0459, "step": 690 }, { "epoch": 0.49645390070921985, "grad_norm": 0.5688044428825378, "learning_rate": 0.00019985699704022357, "loss": 0.053, "step": 700 }, { "epoch": 0.5035460992907801, "grad_norm": 0.469110906124115, "learning_rate": 0.00019983978709641481, "loss": 0.0524, "step": 710 }, { "epoch": 0.5106382978723404, "grad_norm": 0.8332406282424927, "learning_rate": 0.00019982160038864032, "loss": 0.0507, "step": 720 }, { "epoch": 0.5177304964539007, "grad_norm": 0.8524491190910339, "learning_rate": 0.00019980243709482633, "loss": 0.0573, "step": 730 }, { "epoch": 0.524822695035461, "grad_norm": 0.7200371026992798, "learning_rate": 0.00019978229740245343, "loss": 0.0502, "step": 740 }, { "epoch": 0.5319148936170213, "grad_norm": 0.4582567811012268, "learning_rate": 0.0001997611815085545, "loss": 0.0503, "step": 750 }, { "epoch": 0.5390070921985816, "grad_norm": 0.5496141910552979, "learning_rate": 0.000199739089619713, "loss": 0.0493, "step": 760 }, { "epoch": 0.5460992907801419, "grad_norm": 0.8712863326072693, "learning_rate": 0.0001997160219520608, "loss": 0.0469, "step": 770 }, { "epoch": 0.5531914893617021, "grad_norm": 0.7600995302200317, "learning_rate": 0.0001996919787312761, "loss": 0.0544, "step": 780 }, { "epoch": 0.5602836879432624, "grad_norm": 0.6321051716804504, "learning_rate": 0.00019966696019258127, "loss": 0.0418, "step": 790 }, { "epoch": 0.5673758865248227, "grad_norm": 0.5661709904670715, "learning_rate": 0.00019964096658074056, "loss": 0.0437, "step": 800 }, { "epoch": 0.574468085106383, "grad_norm": 0.322308748960495, "learning_rate": 0.00019961399815005763, "loss": 0.0379, "step": 810 }, { "epoch": 0.5815602836879432, "grad_norm": 0.5047584176063538, "learning_rate": 0.00019958605516437307, "loss": 0.0628, "step": 820 }, { "epoch": 0.5886524822695035, "grad_norm": 0.45054513216018677, "learning_rate": 0.0001995571378970619, "loss": 0.0475, "step": 830 }, { "epoch": 0.5957446808510638, "grad_norm": 0.7547653913497925, "learning_rate": 0.00019952724663103083, "loss": 0.0413, "step": 840 }, { "epoch": 0.6028368794326241, "grad_norm": 0.3875257670879364, "learning_rate": 0.00019949638165871547, "loss": 0.039, "step": 850 }, { "epoch": 0.6099290780141844, "grad_norm": 0.6647422313690186, "learning_rate": 0.00019946454328207753, "loss": 0.0559, "step": 860 }, { "epoch": 0.6170212765957447, "grad_norm": 0.3902786076068878, "learning_rate": 0.00019943173181260186, "loss": 0.0407, "step": 870 }, { "epoch": 0.624113475177305, "grad_norm": 0.5156275033950806, "learning_rate": 0.00019939794757129332, "loss": 0.0443, "step": 880 }, { "epoch": 0.6312056737588653, "grad_norm": 0.5711575746536255, "learning_rate": 0.0001993631908886738, "loss": 0.0407, "step": 890 }, { "epoch": 0.6382978723404256, "grad_norm": 0.33694812655448914, "learning_rate": 0.0001993274621047788, "loss": 0.0402, "step": 900 }, { "epoch": 0.6453900709219859, "grad_norm": 0.869691014289856, "learning_rate": 0.00019929076156915425, "loss": 0.0506, "step": 910 }, { "epoch": 0.6524822695035462, "grad_norm": 0.5810511112213135, "learning_rate": 0.00019925308964085297, "loss": 0.0537, "step": 920 }, { "epoch": 0.6595744680851063, "grad_norm": 0.666899561882019, "learning_rate": 0.00019921444668843125, "loss": 0.0574, "step": 930 }, { "epoch": 0.6666666666666666, "grad_norm": 0.7992278337478638, "learning_rate": 0.00019917483308994527, "loss": 0.0385, "step": 940 }, { "epoch": 0.6737588652482269, "grad_norm": 0.4858379662036896, "learning_rate": 0.00019913424923294722, "loss": 0.0473, "step": 950 }, { "epoch": 0.6808510638297872, "grad_norm": 0.5866559147834778, "learning_rate": 0.0001990926955144818, "loss": 0.0465, "step": 960 }, { "epoch": 0.6879432624113475, "grad_norm": 0.6053078770637512, "learning_rate": 0.000199050172341082, "loss": 0.0498, "step": 970 }, { "epoch": 0.6950354609929078, "grad_norm": 0.5185685157775879, "learning_rate": 0.00019900668012876543, "loss": 0.0444, "step": 980 }, { "epoch": 0.7021276595744681, "grad_norm": 0.6573126316070557, "learning_rate": 0.00019896221930303, "loss": 0.046, "step": 990 }, { "epoch": 0.7092198581560284, "grad_norm": 0.6348971724510193, "learning_rate": 0.00019891679029884993, "loss": 0.0439, "step": 1000 }, { "epoch": 0.7163120567375887, "grad_norm": 0.6452783346176147, "learning_rate": 0.00019887039356067146, "loss": 0.0607, "step": 1010 }, { "epoch": 0.723404255319149, "grad_norm": 0.6160244941711426, "learning_rate": 0.00019882302954240836, "loss": 0.0397, "step": 1020 }, { "epoch": 0.7304964539007093, "grad_norm": 0.5438401699066162, "learning_rate": 0.00019877469870743778, "loss": 0.0417, "step": 1030 }, { "epoch": 0.7375886524822695, "grad_norm": 0.5617598295211792, "learning_rate": 0.00019872540152859536, "loss": 0.0405, "step": 1040 }, { "epoch": 0.7446808510638298, "grad_norm": 0.4101731479167938, "learning_rate": 0.00019867513848817093, "loss": 0.0392, "step": 1050 }, { "epoch": 0.75177304964539, "grad_norm": 0.4860725998878479, "learning_rate": 0.00019862391007790354, "loss": 0.049, "step": 1060 }, { "epoch": 0.7588652482269503, "grad_norm": 0.3138566315174103, "learning_rate": 0.00019857171679897687, "loss": 0.0372, "step": 1070 }, { "epoch": 0.7659574468085106, "grad_norm": 0.5368508100509644, "learning_rate": 0.00019851855916201404, "loss": 0.0466, "step": 1080 }, { "epoch": 0.7730496453900709, "grad_norm": 0.5401434898376465, "learning_rate": 0.000198464437687073, "loss": 0.0489, "step": 1090 }, { "epoch": 0.7801418439716312, "grad_norm": 0.3905884623527527, "learning_rate": 0.00019840935290364105, "loss": 0.0389, "step": 1100 }, { "epoch": 0.7872340425531915, "grad_norm": 0.3689773976802826, "learning_rate": 0.00019835330535062994, "loss": 0.0432, "step": 1110 }, { "epoch": 0.7943262411347518, "grad_norm": 0.5504758954048157, "learning_rate": 0.0001982962955763705, "loss": 0.04, "step": 1120 }, { "epoch": 0.8014184397163121, "grad_norm": 0.3219192326068878, "learning_rate": 0.00019823832413860714, "loss": 0.0373, "step": 1130 }, { "epoch": 0.8085106382978723, "grad_norm": 0.5545524954795837, "learning_rate": 0.00019817939160449272, "loss": 0.0367, "step": 1140 }, { "epoch": 0.8156028368794326, "grad_norm": 0.5093657374382019, "learning_rate": 0.0001981194985505827, "loss": 0.0426, "step": 1150 }, { "epoch": 0.8226950354609929, "grad_norm": 0.4452705979347229, "learning_rate": 0.00019805864556282957, "loss": 0.0357, "step": 1160 }, { "epoch": 0.8297872340425532, "grad_norm": 0.5040962100028992, "learning_rate": 0.00019799683323657726, "loss": 0.0552, "step": 1170 }, { "epoch": 0.8368794326241135, "grad_norm": 0.5125325322151184, "learning_rate": 0.00019793406217655517, "loss": 0.0455, "step": 1180 }, { "epoch": 0.8439716312056738, "grad_norm": 0.5588168501853943, "learning_rate": 0.0001978703329968722, "loss": 0.0432, "step": 1190 }, { "epoch": 0.851063829787234, "grad_norm": 0.5096802711486816, "learning_rate": 0.00019780564632101096, "loss": 0.0488, "step": 1200 }, { "epoch": 0.8581560283687943, "grad_norm": 0.2617908716201782, "learning_rate": 0.00019774000278182147, "loss": 0.0454, "step": 1210 }, { "epoch": 0.8652482269503546, "grad_norm": 0.5790386199951172, "learning_rate": 0.00019767340302151513, "loss": 0.039, "step": 1220 }, { "epoch": 0.8723404255319149, "grad_norm": 0.9014220237731934, "learning_rate": 0.00019760584769165824, "loss": 0.0325, "step": 1230 }, { "epoch": 0.8794326241134752, "grad_norm": 0.41618219017982483, "learning_rate": 0.0001975373374531658, "loss": 0.0442, "step": 1240 }, { "epoch": 0.8865248226950354, "grad_norm": 0.4697210192680359, "learning_rate": 0.00019746787297629496, "loss": 0.0436, "step": 1250 }, { "epoch": 0.8936170212765957, "grad_norm": 0.4056944251060486, "learning_rate": 0.00019739745494063855, "loss": 0.0375, "step": 1260 }, { "epoch": 0.900709219858156, "grad_norm": 0.6047598123550415, "learning_rate": 0.00019732608403511822, "loss": 0.0323, "step": 1270 }, { "epoch": 0.9078014184397163, "grad_norm": 0.411379873752594, "learning_rate": 0.00019725376095797804, "loss": 0.0483, "step": 1280 }, { "epoch": 0.9148936170212766, "grad_norm": 0.2770962119102478, "learning_rate": 0.00019718048641677728, "loss": 0.0379, "step": 1290 }, { "epoch": 0.9219858156028369, "grad_norm": 0.6620250940322876, "learning_rate": 0.00019710626112838382, "loss": 0.042, "step": 1300 }, { "epoch": 0.9290780141843972, "grad_norm": 0.4887118339538574, "learning_rate": 0.0001970310858189669, "loss": 0.0419, "step": 1310 }, { "epoch": 0.9361702127659575, "grad_norm": 0.4877367615699768, "learning_rate": 0.0001969549612239902, "loss": 0.0386, "step": 1320 }, { "epoch": 0.9432624113475178, "grad_norm": 0.6139522790908813, "learning_rate": 0.00019687788808820452, "loss": 0.0411, "step": 1330 }, { "epoch": 0.950354609929078, "grad_norm": 0.628237247467041, "learning_rate": 0.0001967998671656405, "loss": 0.0532, "step": 1340 }, { "epoch": 0.9574468085106383, "grad_norm": 0.5345218777656555, "learning_rate": 0.00019672089921960137, "loss": 0.0414, "step": 1350 }, { "epoch": 0.9645390070921985, "grad_norm": 0.35509511828422546, "learning_rate": 0.00019664098502265525, "loss": 0.0464, "step": 1360 }, { "epoch": 0.9716312056737588, "grad_norm": 0.4468687176704407, "learning_rate": 0.00019656012535662786, "loss": 0.0395, "step": 1370 }, { "epoch": 0.9787234042553191, "grad_norm": 0.5955891013145447, "learning_rate": 0.0001964783210125946, "loss": 0.045, "step": 1380 }, { "epoch": 0.9858156028368794, "grad_norm": 0.48954424262046814, "learning_rate": 0.0001963955727908732, "loss": 0.0471, "step": 1390 }, { "epoch": 0.9929078014184397, "grad_norm": 0.450183242559433, "learning_rate": 0.00019631188150101534, "loss": 0.0366, "step": 1400 }, { "epoch": 1.0, "grad_norm": 0.41711631417274475, "learning_rate": 0.0001962272479617992, "loss": 0.0528, "step": 1410 }, { "epoch": 1.0070921985815602, "grad_norm": 0.3645707666873932, "learning_rate": 0.00019614167300122126, "loss": 0.043, "step": 1420 }, { "epoch": 1.0141843971631206, "grad_norm": 0.42011067271232605, "learning_rate": 0.00019605515745648822, "loss": 0.0402, "step": 1430 }, { "epoch": 1.0212765957446808, "grad_norm": 0.4155680537223816, "learning_rate": 0.0001959677021740088, "loss": 0.0392, "step": 1440 }, { "epoch": 1.0283687943262412, "grad_norm": 0.49862140417099, "learning_rate": 0.00019587930800938545, "loss": 0.0484, "step": 1450 }, { "epoch": 1.0354609929078014, "grad_norm": 0.42772340774536133, "learning_rate": 0.00019578997582740603, "loss": 0.0349, "step": 1460 }, { "epoch": 1.0425531914893618, "grad_norm": 0.40276241302490234, "learning_rate": 0.00019569970650203534, "loss": 0.0335, "step": 1470 }, { "epoch": 1.049645390070922, "grad_norm": 0.369478315114975, "learning_rate": 0.00019560850091640647, "loss": 0.0402, "step": 1480 }, { "epoch": 1.0567375886524824, "grad_norm": 0.7682146430015564, "learning_rate": 0.00019551635996281231, "loss": 0.0392, "step": 1490 }, { "epoch": 1.0638297872340425, "grad_norm": 0.619755744934082, "learning_rate": 0.0001954232845426967, "loss": 0.0426, "step": 1500 }, { "epoch": 1.070921985815603, "grad_norm": 0.3280376195907593, "learning_rate": 0.00019532927556664573, "loss": 0.0311, "step": 1510 }, { "epoch": 1.0780141843971631, "grad_norm": 0.42268431186676025, "learning_rate": 0.00019523433395437866, "loss": 0.0354, "step": 1520 }, { "epoch": 1.0851063829787233, "grad_norm": 0.34740039706230164, "learning_rate": 0.00019513846063473907, "loss": 0.0374, "step": 1530 }, { "epoch": 1.0921985815602837, "grad_norm": 0.36237838864326477, "learning_rate": 0.00019504165654568576, "loss": 0.028, "step": 1540 }, { "epoch": 1.099290780141844, "grad_norm": 0.4167526364326477, "learning_rate": 0.00019494392263428353, "loss": 0.0447, "step": 1550 }, { "epoch": 1.1063829787234043, "grad_norm": 0.5500767827033997, "learning_rate": 0.00019484525985669383, "loss": 0.0305, "step": 1560 }, { "epoch": 1.1134751773049645, "grad_norm": 0.4666343927383423, "learning_rate": 0.00019474566917816565, "loss": 0.0323, "step": 1570 }, { "epoch": 1.1205673758865249, "grad_norm": 0.43880197405815125, "learning_rate": 0.0001946451515730258, "loss": 0.0305, "step": 1580 }, { "epoch": 1.127659574468085, "grad_norm": 0.4338850677013397, "learning_rate": 0.00019454370802466953, "loss": 0.035, "step": 1590 }, { "epoch": 1.1347517730496455, "grad_norm": 0.4450472295284271, "learning_rate": 0.00019444133952555096, "loss": 0.0371, "step": 1600 }, { "epoch": 1.1418439716312057, "grad_norm": 0.38660725951194763, "learning_rate": 0.00019433804707717328, "loss": 0.0388, "step": 1610 }, { "epoch": 1.148936170212766, "grad_norm": 0.5273442268371582, "learning_rate": 0.0001942338316900788, "loss": 0.0392, "step": 1620 }, { "epoch": 1.1560283687943262, "grad_norm": 0.5786083340644836, "learning_rate": 0.00019412869438383945, "loss": 0.0409, "step": 1630 }, { "epoch": 1.1631205673758864, "grad_norm": 0.3235854506492615, "learning_rate": 0.00019402263618704642, "loss": 0.0325, "step": 1640 }, { "epoch": 1.1702127659574468, "grad_norm": 0.4242108166217804, "learning_rate": 0.0001939156581373004, "loss": 0.0295, "step": 1650 }, { "epoch": 1.177304964539007, "grad_norm": 0.518750011920929, "learning_rate": 0.00019380776128120116, "loss": 0.0412, "step": 1660 }, { "epoch": 1.1843971631205674, "grad_norm": 0.3119601607322693, "learning_rate": 0.00019369894667433754, "loss": 0.027, "step": 1670 }, { "epoch": 1.1914893617021276, "grad_norm": 0.3405968248844147, "learning_rate": 0.00019358921538127697, "loss": 0.036, "step": 1680 }, { "epoch": 1.198581560283688, "grad_norm": 0.62176913022995, "learning_rate": 0.00019347856847555512, "loss": 0.0527, "step": 1690 }, { "epoch": 1.2056737588652482, "grad_norm": 0.432858943939209, "learning_rate": 0.00019336700703966538, "loss": 0.0381, "step": 1700 }, { "epoch": 1.2127659574468086, "grad_norm": 0.23901493847370148, "learning_rate": 0.0001932545321650483, "loss": 0.0295, "step": 1710 }, { "epoch": 1.2198581560283688, "grad_norm": 0.3481459319591522, "learning_rate": 0.00019314114495208086, "loss": 0.0418, "step": 1720 }, { "epoch": 1.226950354609929, "grad_norm": 0.3281635344028473, "learning_rate": 0.00019302684651006574, "loss": 0.0282, "step": 1730 }, { "epoch": 1.2340425531914894, "grad_norm": 0.3726864755153656, "learning_rate": 0.00019291163795722048, "loss": 0.0384, "step": 1740 }, { "epoch": 1.2411347517730495, "grad_norm": 0.35322147607803345, "learning_rate": 0.00019279552042066652, "loss": 0.0267, "step": 1750 }, { "epoch": 1.24822695035461, "grad_norm": 0.3819480538368225, "learning_rate": 0.0001926784950364181, "loss": 0.0317, "step": 1760 }, { "epoch": 1.2553191489361701, "grad_norm": 0.5167778134346008, "learning_rate": 0.00019256056294937132, "loss": 0.0365, "step": 1770 }, { "epoch": 1.2624113475177305, "grad_norm": 0.2764042615890503, "learning_rate": 0.00019244172531329278, "loss": 0.03, "step": 1780 }, { "epoch": 1.2695035460992907, "grad_norm": 0.568465530872345, "learning_rate": 0.00019232198329080836, "loss": 0.042, "step": 1790 }, { "epoch": 1.2765957446808511, "grad_norm": 0.47287610173225403, "learning_rate": 0.00019220133805339184, "loss": 0.0431, "step": 1800 }, { "epoch": 1.2836879432624113, "grad_norm": 0.5068013072013855, "learning_rate": 0.00019207979078135346, "loss": 0.0326, "step": 1810 }, { "epoch": 1.2907801418439715, "grad_norm": 0.4016683101654053, "learning_rate": 0.00019195734266382828, "loss": 0.0327, "step": 1820 }, { "epoch": 1.297872340425532, "grad_norm": 0.21608802676200867, "learning_rate": 0.00019183399489876467, "loss": 0.0332, "step": 1830 }, { "epoch": 1.3049645390070923, "grad_norm": 0.5429771542549133, "learning_rate": 0.00019170974869291255, "loss": 0.0349, "step": 1840 }, { "epoch": 1.3120567375886525, "grad_norm": 0.3801655173301697, "learning_rate": 0.00019158460526181152, "loss": 0.0377, "step": 1850 }, { "epoch": 1.3191489361702127, "grad_norm": 0.3311924934387207, "learning_rate": 0.00019145856582977904, "loss": 0.0346, "step": 1860 }, { "epoch": 1.326241134751773, "grad_norm": 0.5218601226806641, "learning_rate": 0.0001913316316298984, "loss": 0.0302, "step": 1870 }, { "epoch": 1.3333333333333333, "grad_norm": 0.37110060453414917, "learning_rate": 0.0001912038039040067, "loss": 0.0387, "step": 1880 }, { "epoch": 1.3404255319148937, "grad_norm": 0.46655604243278503, "learning_rate": 0.00019107508390268276, "loss": 0.0337, "step": 1890 }, { "epoch": 1.3475177304964538, "grad_norm": 0.33353927731513977, "learning_rate": 0.00019094547288523467, "loss": 0.0466, "step": 1900 }, { "epoch": 1.3546099290780143, "grad_norm": 0.44526827335357666, "learning_rate": 0.00019081497211968773, "loss": 0.0352, "step": 1910 }, { "epoch": 1.3617021276595744, "grad_norm": 0.4011070430278778, "learning_rate": 0.00019068358288277187, "loss": 0.0294, "step": 1920 }, { "epoch": 1.3687943262411348, "grad_norm": 0.3325769007205963, "learning_rate": 0.0001905513064599092, "loss": 0.0361, "step": 1930 }, { "epoch": 1.375886524822695, "grad_norm": 0.2585453987121582, "learning_rate": 0.0001904181441452015, "loss": 0.0302, "step": 1940 }, { "epoch": 1.3829787234042552, "grad_norm": 0.441824734210968, "learning_rate": 0.00019028409724141746, "loss": 0.0418, "step": 1950 }, { "epoch": 1.3900709219858156, "grad_norm": 0.41769495606422424, "learning_rate": 0.00019014916705998002, "loss": 0.0301, "step": 1960 }, { "epoch": 1.397163120567376, "grad_norm": 0.24308504164218903, "learning_rate": 0.00019001335492095347, "loss": 0.0353, "step": 1970 }, { "epoch": 1.4042553191489362, "grad_norm": 0.3190462589263916, "learning_rate": 0.00018987666215303058, "loss": 0.0385, "step": 1980 }, { "epoch": 1.4113475177304964, "grad_norm": 0.452863484621048, "learning_rate": 0.0001897390900935196, "loss": 0.0367, "step": 1990 }, { "epoch": 1.4184397163120568, "grad_norm": 0.3386947214603424, "learning_rate": 0.00018960064008833116, "loss": 0.0372, "step": 2000 }, { "epoch": 1.425531914893617, "grad_norm": 0.40042638778686523, "learning_rate": 0.0001894613134919651, "loss": 0.0354, "step": 2010 }, { "epoch": 1.4326241134751774, "grad_norm": 0.3207855820655823, "learning_rate": 0.00018932111166749724, "loss": 0.0304, "step": 2020 }, { "epoch": 1.4397163120567376, "grad_norm": 0.277885377407074, "learning_rate": 0.000189180035986566, "loss": 0.0303, "step": 2030 }, { "epoch": 1.4468085106382977, "grad_norm": 0.4140087366104126, "learning_rate": 0.00018903808782935904, "loss": 0.0344, "step": 2040 }, { "epoch": 1.4539007092198581, "grad_norm": 0.3237980306148529, "learning_rate": 0.00018889526858459975, "loss": 0.0317, "step": 2050 }, { "epoch": 1.4609929078014185, "grad_norm": 0.38879090547561646, "learning_rate": 0.00018875157964953358, "loss": 0.0344, "step": 2060 }, { "epoch": 1.4680851063829787, "grad_norm": 0.39974260330200195, "learning_rate": 0.0001886070224299145, "loss": 0.0339, "step": 2070 }, { "epoch": 1.475177304964539, "grad_norm": 0.3247739374637604, "learning_rate": 0.00018846159833999114, "loss": 0.0317, "step": 2080 }, { "epoch": 1.4822695035460993, "grad_norm": 0.5183299779891968, "learning_rate": 0.000188315308802493, "loss": 0.0262, "step": 2090 }, { "epoch": 1.4893617021276595, "grad_norm": 0.44669198989868164, "learning_rate": 0.00018816815524861654, "loss": 0.0323, "step": 2100 }, { "epoch": 1.49645390070922, "grad_norm": 0.5130909085273743, "learning_rate": 0.00018802013911801112, "loss": 0.0361, "step": 2110 }, { "epoch": 1.50354609929078, "grad_norm": 0.47592759132385254, "learning_rate": 0.00018787126185876502, "loss": 0.0423, "step": 2120 }, { "epoch": 1.5106382978723403, "grad_norm": 0.27714022994041443, "learning_rate": 0.0001877215249273912, "loss": 0.0398, "step": 2130 }, { "epoch": 1.5177304964539007, "grad_norm": 0.37813687324523926, "learning_rate": 0.00018757092978881302, "loss": 0.0285, "step": 2140 }, { "epoch": 1.524822695035461, "grad_norm": 0.2609386742115021, "learning_rate": 0.00018741947791634994, "loss": 0.0303, "step": 2150 }, { "epoch": 1.5319148936170213, "grad_norm": 0.36917203664779663, "learning_rate": 0.00018726717079170323, "loss": 0.0473, "step": 2160 }, { "epoch": 1.5390070921985815, "grad_norm": 0.18295632302761078, "learning_rate": 0.00018711400990494123, "loss": 0.0246, "step": 2170 }, { "epoch": 1.5460992907801419, "grad_norm": 0.2882087826728821, "learning_rate": 0.00018695999675448496, "loss": 0.0224, "step": 2180 }, { "epoch": 1.5531914893617023, "grad_norm": 0.35880571603775024, "learning_rate": 0.00018680513284709344, "loss": 0.0299, "step": 2190 }, { "epoch": 1.5602836879432624, "grad_norm": 0.4506542384624481, "learning_rate": 0.00018664941969784882, "loss": 0.0312, "step": 2200 }, { "epoch": 1.5673758865248226, "grad_norm": 0.3101454973220825, "learning_rate": 0.00018649285883014173, "loss": 0.036, "step": 2210 }, { "epoch": 1.574468085106383, "grad_norm": 0.3249278664588928, "learning_rate": 0.00018633545177565623, "loss": 0.0357, "step": 2220 }, { "epoch": 1.5815602836879432, "grad_norm": 0.23713338375091553, "learning_rate": 0.00018617720007435497, "loss": 0.0346, "step": 2230 }, { "epoch": 1.5886524822695036, "grad_norm": 0.2550758421421051, "learning_rate": 0.00018601810527446398, "loss": 0.0265, "step": 2240 }, { "epoch": 1.5957446808510638, "grad_norm": 0.41524937748908997, "learning_rate": 0.00018585816893245763, "loss": 0.0299, "step": 2250 }, { "epoch": 1.602836879432624, "grad_norm": 0.3899802565574646, "learning_rate": 0.00018569739261304328, "loss": 0.0361, "step": 2260 }, { "epoch": 1.6099290780141844, "grad_norm": 0.4563800096511841, "learning_rate": 0.00018553577788914618, "loss": 0.0358, "step": 2270 }, { "epoch": 1.6170212765957448, "grad_norm": 0.8882343769073486, "learning_rate": 0.00018537332634189384, "loss": 0.0419, "step": 2280 }, { "epoch": 1.624113475177305, "grad_norm": 0.5693446397781372, "learning_rate": 0.00018521003956060078, "loss": 0.0401, "step": 2290 }, { "epoch": 1.6312056737588652, "grad_norm": 0.3942001461982727, "learning_rate": 0.00018504591914275274, "loss": 0.035, "step": 2300 }, { "epoch": 1.6382978723404256, "grad_norm": 0.40895143151283264, "learning_rate": 0.00018488096669399133, "loss": 0.0292, "step": 2310 }, { "epoch": 1.645390070921986, "grad_norm": 0.43449512124061584, "learning_rate": 0.0001847151838280981, "loss": 0.0367, "step": 2320 }, { "epoch": 1.6524822695035462, "grad_norm": 0.4020148515701294, "learning_rate": 0.00018454857216697882, "loss": 0.0288, "step": 2330 }, { "epoch": 1.6595744680851063, "grad_norm": 0.4451272785663605, "learning_rate": 0.0001843811333406477, "loss": 0.0375, "step": 2340 }, { "epoch": 1.6666666666666665, "grad_norm": 0.26918983459472656, "learning_rate": 0.00018421286898721127, "loss": 0.0362, "step": 2350 }, { "epoch": 1.673758865248227, "grad_norm": 0.3901691436767578, "learning_rate": 0.0001840437807528525, "loss": 0.0285, "step": 2360 }, { "epoch": 1.6808510638297873, "grad_norm": 0.3481754660606384, "learning_rate": 0.00018387387029181472, "loss": 0.0292, "step": 2370 }, { "epoch": 1.6879432624113475, "grad_norm": 0.40919333696365356, "learning_rate": 0.00018370313926638522, "loss": 0.0353, "step": 2380 }, { "epoch": 1.6950354609929077, "grad_norm": 0.24539948999881744, "learning_rate": 0.0001835315893468792, "loss": 0.0407, "step": 2390 }, { "epoch": 1.702127659574468, "grad_norm": 0.25068360567092896, "learning_rate": 0.00018335922221162336, "loss": 0.0313, "step": 2400 }, { "epoch": 1.7092198581560285, "grad_norm": 0.45453083515167236, "learning_rate": 0.00018318603954693948, "loss": 0.0328, "step": 2410 }, { "epoch": 1.7163120567375887, "grad_norm": 0.5006433129310608, "learning_rate": 0.0001830120430471279, "loss": 0.03, "step": 2420 }, { "epoch": 1.7234042553191489, "grad_norm": 0.3591817021369934, "learning_rate": 0.00018283723441445097, "loss": 0.0325, "step": 2430 }, { "epoch": 1.7304964539007093, "grad_norm": 0.6227660775184631, "learning_rate": 0.00018266161535911642, "loss": 0.032, "step": 2440 }, { "epoch": 1.7375886524822695, "grad_norm": 0.24666891992092133, "learning_rate": 0.00018248518759926053, "loss": 0.0388, "step": 2450 }, { "epoch": 1.7446808510638299, "grad_norm": 0.34065425395965576, "learning_rate": 0.0001823079528609315, "loss": 0.0331, "step": 2460 }, { "epoch": 1.75177304964539, "grad_norm": 0.340526819229126, "learning_rate": 0.00018212991287807232, "loss": 0.0297, "step": 2470 }, { "epoch": 1.7588652482269502, "grad_norm": 0.3198091983795166, "learning_rate": 0.00018195106939250408, "loss": 0.0337, "step": 2480 }, { "epoch": 1.7659574468085106, "grad_norm": 0.3750438988208771, "learning_rate": 0.00018177142415390867, "loss": 0.0341, "step": 2490 }, { "epoch": 1.773049645390071, "grad_norm": 0.3867601454257965, "learning_rate": 0.00018159097891981186, "loss": 0.0318, "step": 2500 }, { "epoch": 1.7801418439716312, "grad_norm": 0.37601733207702637, "learning_rate": 0.00018140973545556594, "loss": 0.0349, "step": 2510 }, { "epoch": 1.7872340425531914, "grad_norm": 0.3913877606391907, "learning_rate": 0.00018122769553433266, "loss": 0.0257, "step": 2520 }, { "epoch": 1.7943262411347518, "grad_norm": 0.30526429414749146, "learning_rate": 0.00018104486093706567, "loss": 0.0275, "step": 2530 }, { "epoch": 1.8014184397163122, "grad_norm": 0.6147098541259766, "learning_rate": 0.0001808612334524932, "loss": 0.0408, "step": 2540 }, { "epoch": 1.8085106382978724, "grad_norm": 0.3850766718387604, "learning_rate": 0.00018067681487710053, "loss": 0.0291, "step": 2550 }, { "epoch": 1.8156028368794326, "grad_norm": 0.4822274148464203, "learning_rate": 0.00018049160701511248, "loss": 0.0441, "step": 2560 }, { "epoch": 1.8226950354609928, "grad_norm": 0.3193504810333252, "learning_rate": 0.00018030561167847568, "loss": 0.0349, "step": 2570 }, { "epoch": 1.8297872340425532, "grad_norm": 0.2928631901741028, "learning_rate": 0.00018011883068684085, "loss": 0.0401, "step": 2580 }, { "epoch": 1.8368794326241136, "grad_norm": 0.3406616151332855, "learning_rate": 0.00017993126586754508, "loss": 0.031, "step": 2590 }, { "epoch": 1.8439716312056738, "grad_norm": 0.33067846298217773, "learning_rate": 0.00017974291905559382, "loss": 0.043, "step": 2600 }, { "epoch": 1.851063829787234, "grad_norm": 0.39099758863449097, "learning_rate": 0.00017955379209364303, "loss": 0.0315, "step": 2610 }, { "epoch": 1.8581560283687943, "grad_norm": 0.34113165736198425, "learning_rate": 0.00017936388683198112, "loss": 0.0328, "step": 2620 }, { "epoch": 1.8652482269503547, "grad_norm": 0.5652561783790588, "learning_rate": 0.0001791732051285109, "loss": 0.0356, "step": 2630 }, { "epoch": 1.872340425531915, "grad_norm": 0.38378405570983887, "learning_rate": 0.0001789817488487313, "loss": 0.0324, "step": 2640 }, { "epoch": 1.8794326241134751, "grad_norm": 0.34848251938819885, "learning_rate": 0.00017878951986571913, "loss": 0.0361, "step": 2650 }, { "epoch": 1.8865248226950353, "grad_norm": 0.34506094455718994, "learning_rate": 0.00017859652006011088, "loss": 0.0254, "step": 2660 }, { "epoch": 1.8936170212765957, "grad_norm": 0.26408687233924866, "learning_rate": 0.00017840275132008422, "loss": 0.0316, "step": 2670 }, { "epoch": 1.900709219858156, "grad_norm": 0.4670010805130005, "learning_rate": 0.0001782082155413395, "loss": 0.0274, "step": 2680 }, { "epoch": 1.9078014184397163, "grad_norm": 0.2338956594467163, "learning_rate": 0.00017801291462708134, "loss": 0.0227, "step": 2690 }, { "epoch": 1.9148936170212765, "grad_norm": 0.3896683156490326, "learning_rate": 0.00017781685048799984, "loss": 0.0311, "step": 2700 }, { "epoch": 1.9219858156028369, "grad_norm": 0.3847581446170807, "learning_rate": 0.000177620025042252, "loss": 0.0265, "step": 2710 }, { "epoch": 1.9290780141843973, "grad_norm": 0.47572168707847595, "learning_rate": 0.00017742244021544293, "loss": 0.0318, "step": 2720 }, { "epoch": 1.9361702127659575, "grad_norm": 0.256161630153656, "learning_rate": 0.00017722409794060693, "loss": 0.0228, "step": 2730 }, { "epoch": 1.9432624113475176, "grad_norm": 0.4607986807823181, "learning_rate": 0.00017702500015818876, "loss": 0.0289, "step": 2740 }, { "epoch": 1.950354609929078, "grad_norm": 0.26722726225852966, "learning_rate": 0.0001768251488160245, "loss": 0.0256, "step": 2750 }, { "epoch": 1.9574468085106385, "grad_norm": 0.3193138539791107, "learning_rate": 0.00017662454586932254, "loss": 0.0277, "step": 2760 }, { "epoch": 1.9645390070921986, "grad_norm": 0.3791826665401459, "learning_rate": 0.00017642319328064446, "loss": 0.029, "step": 2770 }, { "epoch": 1.9716312056737588, "grad_norm": 0.24767844378948212, "learning_rate": 0.0001762210930198858, "loss": 0.0243, "step": 2780 }, { "epoch": 1.978723404255319, "grad_norm": 0.2745870053768158, "learning_rate": 0.00017601824706425684, "loss": 0.0343, "step": 2790 }, { "epoch": 1.9858156028368794, "grad_norm": 0.3385297656059265, "learning_rate": 0.0001758146573982632, "loss": 0.0382, "step": 2800 }, { "epoch": 1.9929078014184398, "grad_norm": 0.5617783665657043, "learning_rate": 0.0001756103260136865, "loss": 0.0285, "step": 2810 }, { "epoch": 2.0, "grad_norm": 0.40970340371131897, "learning_rate": 0.0001754052549095648, "loss": 0.0259, "step": 2820 }, { "epoch": 2.00709219858156, "grad_norm": 0.27524593472480774, "learning_rate": 0.00017519944609217295, "loss": 0.0315, "step": 2830 }, { "epoch": 2.0141843971631204, "grad_norm": 0.42446693778038025, "learning_rate": 0.00017499290157500333, "loss": 0.0284, "step": 2840 }, { "epoch": 2.021276595744681, "grad_norm": 0.42036786675453186, "learning_rate": 0.00017478562337874568, "loss": 0.0337, "step": 2850 }, { "epoch": 2.028368794326241, "grad_norm": 0.5692403316497803, "learning_rate": 0.00017457761353126765, "loss": 0.0298, "step": 2860 }, { "epoch": 2.0354609929078014, "grad_norm": 0.470859557390213, "learning_rate": 0.00017436887406759488, "loss": 0.0343, "step": 2870 }, { "epoch": 2.0425531914893615, "grad_norm": 0.36999377608299255, "learning_rate": 0.00017415940702989103, "loss": 0.0305, "step": 2880 }, { "epoch": 2.049645390070922, "grad_norm": 0.4802422523498535, "learning_rate": 0.00017394921446743783, "loss": 0.0326, "step": 2890 }, { "epoch": 2.0567375886524824, "grad_norm": 0.26723712682724, "learning_rate": 0.0001737382984366151, "loss": 0.0294, "step": 2900 }, { "epoch": 2.0638297872340425, "grad_norm": 0.5008073449134827, "learning_rate": 0.00017352666100088051, "loss": 0.0364, "step": 2910 }, { "epoch": 2.0709219858156027, "grad_norm": 0.35637590289115906, "learning_rate": 0.0001733143042307496, "loss": 0.0272, "step": 2920 }, { "epoch": 2.078014184397163, "grad_norm": 0.35349592566490173, "learning_rate": 0.00017310123020377517, "loss": 0.0284, "step": 2930 }, { "epoch": 2.0851063829787235, "grad_norm": 0.401716023683548, "learning_rate": 0.00017288744100452737, "loss": 0.0334, "step": 2940 }, { "epoch": 2.0921985815602837, "grad_norm": 0.4555324912071228, "learning_rate": 0.000172672938724573, "loss": 0.0377, "step": 2950 }, { "epoch": 2.099290780141844, "grad_norm": 0.349065363407135, "learning_rate": 0.00017245772546245518, "loss": 0.0326, "step": 2960 }, { "epoch": 2.106382978723404, "grad_norm": 0.42119070887565613, "learning_rate": 0.00017224180332367275, "loss": 0.0325, "step": 2970 }, { "epoch": 2.1134751773049647, "grad_norm": 0.4668595492839813, "learning_rate": 0.00017202517442065974, "loss": 0.0275, "step": 2980 }, { "epoch": 2.120567375886525, "grad_norm": 0.29753541946411133, "learning_rate": 0.00017180784087276476, "loss": 0.0347, "step": 2990 }, { "epoch": 2.127659574468085, "grad_norm": 0.34885773062705994, "learning_rate": 0.00017158980480623003, "loss": 0.0427, "step": 3000 }, { "epoch": 2.1347517730496453, "grad_norm": 0.3699316680431366, "learning_rate": 0.00017137106835417084, "loss": 0.0302, "step": 3010 }, { "epoch": 2.141843971631206, "grad_norm": 0.40523409843444824, "learning_rate": 0.00017115163365655456, "loss": 0.0322, "step": 3020 }, { "epoch": 2.148936170212766, "grad_norm": 0.24978692829608917, "learning_rate": 0.00017093150286017964, "loss": 0.0321, "step": 3030 }, { "epoch": 2.1560283687943262, "grad_norm": 0.2739085853099823, "learning_rate": 0.00017071067811865476, "loss": 0.0302, "step": 3040 }, { "epoch": 2.1631205673758864, "grad_norm": 0.3967369794845581, "learning_rate": 0.00017048916159237768, "loss": 0.0328, "step": 3050 }, { "epoch": 2.1702127659574466, "grad_norm": 0.2717509865760803, "learning_rate": 0.00017026695544851403, "loss": 0.0253, "step": 3060 }, { "epoch": 2.1773049645390072, "grad_norm": 0.46482571959495544, "learning_rate": 0.0001700440618609763, "loss": 0.0265, "step": 3070 }, { "epoch": 2.1843971631205674, "grad_norm": 0.42749258875846863, "learning_rate": 0.00016982048301040237, "loss": 0.0362, "step": 3080 }, { "epoch": 2.1914893617021276, "grad_norm": 0.49205732345581055, "learning_rate": 0.00016959622108413428, "loss": 0.0332, "step": 3090 }, { "epoch": 2.198581560283688, "grad_norm": 0.3003978729248047, "learning_rate": 0.00016937127827619685, "loss": 0.0338, "step": 3100 }, { "epoch": 2.2056737588652484, "grad_norm": 0.41363534331321716, "learning_rate": 0.00016914565678727617, "loss": 0.0317, "step": 3110 }, { "epoch": 2.2127659574468086, "grad_norm": 0.25217387080192566, "learning_rate": 0.000168919358824698, "loss": 0.0292, "step": 3120 }, { "epoch": 2.219858156028369, "grad_norm": 0.26034829020500183, "learning_rate": 0.00016869238660240638, "loss": 0.0286, "step": 3130 }, { "epoch": 2.226950354609929, "grad_norm": 0.333636999130249, "learning_rate": 0.00016846474234094176, "loss": 0.0305, "step": 3140 }, { "epoch": 2.2340425531914896, "grad_norm": 0.45352891087532043, "learning_rate": 0.00016823642826741938, "loss": 0.0315, "step": 3150 }, { "epoch": 2.2411347517730498, "grad_norm": 0.2589890658855438, "learning_rate": 0.00016800744661550745, "loss": 0.0259, "step": 3160 }, { "epoch": 2.24822695035461, "grad_norm": 0.39509129524230957, "learning_rate": 0.00016777779962540534, "loss": 0.0331, "step": 3170 }, { "epoch": 2.25531914893617, "grad_norm": 0.42908716201782227, "learning_rate": 0.00016754748954382165, "loss": 0.0342, "step": 3180 }, { "epoch": 2.2624113475177303, "grad_norm": 0.28686532378196716, "learning_rate": 0.0001673165186239521, "loss": 0.0288, "step": 3190 }, { "epoch": 2.269503546099291, "grad_norm": 0.5497453808784485, "learning_rate": 0.0001670848891254577, "loss": 0.0361, "step": 3200 }, { "epoch": 2.276595744680851, "grad_norm": 0.4589519798755646, "learning_rate": 0.00016685260331444253, "loss": 0.0266, "step": 3210 }, { "epoch": 2.2836879432624113, "grad_norm": 0.4259999990463257, "learning_rate": 0.0001666196634634316, "loss": 0.0273, "step": 3220 }, { "epoch": 2.2907801418439715, "grad_norm": 0.4129974842071533, "learning_rate": 0.00016638607185134852, "loss": 0.029, "step": 3230 }, { "epoch": 2.297872340425532, "grad_norm": 0.3736423850059509, "learning_rate": 0.00016615183076349336, "loss": 0.0255, "step": 3240 }, { "epoch": 2.3049645390070923, "grad_norm": 0.38349997997283936, "learning_rate": 0.00016591694249152013, "loss": 0.026, "step": 3250 }, { "epoch": 2.3120567375886525, "grad_norm": 0.4713188409805298, "learning_rate": 0.0001656814093334146, "loss": 0.031, "step": 3260 }, { "epoch": 2.3191489361702127, "grad_norm": 0.5241472721099854, "learning_rate": 0.00016544523359347143, "loss": 0.0298, "step": 3270 }, { "epoch": 2.326241134751773, "grad_norm": 0.25673630833625793, "learning_rate": 0.0001652084175822721, "loss": 0.0277, "step": 3280 }, { "epoch": 2.3333333333333335, "grad_norm": 0.2788539528846741, "learning_rate": 0.0001649709636166619, "loss": 0.0293, "step": 3290 }, { "epoch": 2.3404255319148937, "grad_norm": 0.33457013964653015, "learning_rate": 0.00016473287401972756, "loss": 0.0331, "step": 3300 }, { "epoch": 2.347517730496454, "grad_norm": 0.240670844912529, "learning_rate": 0.0001644941511207742, "loss": 0.0223, "step": 3310 }, { "epoch": 2.354609929078014, "grad_norm": 0.20768260955810547, "learning_rate": 0.00016425479725530292, "loss": 0.0239, "step": 3320 }, { "epoch": 2.3617021276595747, "grad_norm": 0.4587162733078003, "learning_rate": 0.00016401481476498772, "loss": 0.0278, "step": 3330 }, { "epoch": 2.368794326241135, "grad_norm": 0.4795977473258972, "learning_rate": 0.00016377420599765255, "loss": 0.0263, "step": 3340 }, { "epoch": 2.375886524822695, "grad_norm": 0.20740842819213867, "learning_rate": 0.0001635329733072485, "loss": 0.0273, "step": 3350 }, { "epoch": 2.382978723404255, "grad_norm": 0.28621232509613037, "learning_rate": 0.0001632911190538307, "loss": 0.0289, "step": 3360 }, { "epoch": 2.3900709219858154, "grad_norm": 0.4048430621623993, "learning_rate": 0.00016304864560353518, "loss": 0.0337, "step": 3370 }, { "epoch": 2.397163120567376, "grad_norm": 0.3549022674560547, "learning_rate": 0.00016280555532855576, "loss": 0.021, "step": 3380 }, { "epoch": 2.404255319148936, "grad_norm": 0.3781268000602722, "learning_rate": 0.00016256185060712093, "loss": 0.0278, "step": 3390 }, { "epoch": 2.4113475177304964, "grad_norm": 0.372190922498703, "learning_rate": 0.00016231753382347047, "loss": 0.0255, "step": 3400 }, { "epoch": 2.4184397163120566, "grad_norm": 0.20026937127113342, "learning_rate": 0.00016207260736783203, "loss": 0.0261, "step": 3410 }, { "epoch": 2.425531914893617, "grad_norm": 0.24735766649246216, "learning_rate": 0.00016182707363639808, "loss": 0.0252, "step": 3420 }, { "epoch": 2.4326241134751774, "grad_norm": 0.36811962723731995, "learning_rate": 0.00016158093503130215, "loss": 0.0302, "step": 3430 }, { "epoch": 2.4397163120567376, "grad_norm": 0.41325879096984863, "learning_rate": 0.0001613341939605954, "loss": 0.0402, "step": 3440 }, { "epoch": 2.4468085106382977, "grad_norm": 0.3262059688568115, "learning_rate": 0.00016108685283822317, "loss": 0.027, "step": 3450 }, { "epoch": 2.453900709219858, "grad_norm": 0.34455767273902893, "learning_rate": 0.0001608389140840013, "loss": 0.0296, "step": 3460 }, { "epoch": 2.4609929078014185, "grad_norm": 0.42494380474090576, "learning_rate": 0.0001605903801235924, "loss": 0.0307, "step": 3470 }, { "epoch": 2.4680851063829787, "grad_norm": 0.289614200592041, "learning_rate": 0.00016034125338848222, "loss": 0.0274, "step": 3480 }, { "epoch": 2.475177304964539, "grad_norm": 0.5783960223197937, "learning_rate": 0.0001600915363159557, "loss": 0.0355, "step": 3490 }, { "epoch": 2.482269503546099, "grad_norm": 0.28593072295188904, "learning_rate": 0.00015984123134907345, "loss": 0.0291, "step": 3500 }, { "epoch": 2.4893617021276597, "grad_norm": 0.2484228014945984, "learning_rate": 0.00015959034093664738, "loss": 0.0324, "step": 3510 }, { "epoch": 2.49645390070922, "grad_norm": 0.35120323300361633, "learning_rate": 0.00015933886753321722, "loss": 0.0324, "step": 3520 }, { "epoch": 2.50354609929078, "grad_norm": 0.2902463376522064, "learning_rate": 0.0001590868135990261, "loss": 0.0265, "step": 3530 }, { "epoch": 2.5106382978723403, "grad_norm": 0.3429517149925232, "learning_rate": 0.0001588341815999968, "loss": 0.0376, "step": 3540 }, { "epoch": 2.5177304964539005, "grad_norm": 0.3964468240737915, "learning_rate": 0.0001585809740077074, "loss": 0.0263, "step": 3550 }, { "epoch": 2.524822695035461, "grad_norm": 0.1953706592321396, "learning_rate": 0.0001583271932993673, "loss": 0.0314, "step": 3560 }, { "epoch": 2.5319148936170213, "grad_norm": 0.35029205679893494, "learning_rate": 0.00015807284195779272, "loss": 0.0347, "step": 3570 }, { "epoch": 2.5390070921985815, "grad_norm": 0.229562446475029, "learning_rate": 0.0001578179224713827, "loss": 0.0281, "step": 3580 }, { "epoch": 2.546099290780142, "grad_norm": 0.379069447517395, "learning_rate": 0.00015756243733409456, "loss": 0.0296, "step": 3590 }, { "epoch": 2.5531914893617023, "grad_norm": 0.17396724224090576, "learning_rate": 0.00015730638904541957, "loss": 0.0252, "step": 3600 }, { "epoch": 2.5602836879432624, "grad_norm": 0.29691052436828613, "learning_rate": 0.00015704978011035845, "loss": 0.0292, "step": 3610 }, { "epoch": 2.5673758865248226, "grad_norm": 0.39194944500923157, "learning_rate": 0.000156792613039397, "loss": 0.0274, "step": 3620 }, { "epoch": 2.574468085106383, "grad_norm": 0.1824718415737152, "learning_rate": 0.00015653489034848125, "loss": 0.0252, "step": 3630 }, { "epoch": 2.581560283687943, "grad_norm": 0.22263765335083008, "learning_rate": 0.00015627661455899327, "loss": 0.0208, "step": 3640 }, { "epoch": 2.5886524822695036, "grad_norm": 0.2892332077026367, "learning_rate": 0.00015601778819772613, "loss": 0.027, "step": 3650 }, { "epoch": 2.595744680851064, "grad_norm": 0.2670251429080963, "learning_rate": 0.00015575841379685928, "loss": 0.023, "step": 3660 }, { "epoch": 2.602836879432624, "grad_norm": 0.2950701415538788, "learning_rate": 0.00015549849389393395, "loss": 0.0257, "step": 3670 }, { "epoch": 2.6099290780141846, "grad_norm": 0.3107167184352875, "learning_rate": 0.00015523803103182805, "loss": 0.0244, "step": 3680 }, { "epoch": 2.617021276595745, "grad_norm": 0.35476645827293396, "learning_rate": 0.00015497702775873156, "loss": 0.0229, "step": 3690 }, { "epoch": 2.624113475177305, "grad_norm": 0.37949270009994507, "learning_rate": 0.00015471548662812133, "loss": 0.029, "step": 3700 }, { "epoch": 2.631205673758865, "grad_norm": 0.39569422602653503, "learning_rate": 0.00015445341019873634, "loss": 0.0312, "step": 3710 }, { "epoch": 2.6382978723404253, "grad_norm": 0.45384731888771057, "learning_rate": 0.0001541908010345525, "loss": 0.0293, "step": 3720 }, { "epoch": 2.645390070921986, "grad_norm": 0.3378404676914215, "learning_rate": 0.0001539276617047577, "loss": 0.029, "step": 3730 }, { "epoch": 2.652482269503546, "grad_norm": 0.4775332510471344, "learning_rate": 0.00015366399478372662, "loss": 0.0294, "step": 3740 }, { "epoch": 2.6595744680851063, "grad_norm": 0.3015674352645874, "learning_rate": 0.0001533998028509954, "loss": 0.0315, "step": 3750 }, { "epoch": 2.6666666666666665, "grad_norm": 0.27968233823776245, "learning_rate": 0.00015313508849123668, "loss": 0.0273, "step": 3760 }, { "epoch": 2.673758865248227, "grad_norm": 0.40850046277046204, "learning_rate": 0.00015286985429423404, "loss": 0.0247, "step": 3770 }, { "epoch": 2.6808510638297873, "grad_norm": 0.2494577169418335, "learning_rate": 0.00015260410285485693, "loss": 0.0235, "step": 3780 }, { "epoch": 2.6879432624113475, "grad_norm": 0.24964429438114166, "learning_rate": 0.00015233783677303498, "loss": 0.0259, "step": 3790 }, { "epoch": 2.6950354609929077, "grad_norm": 0.28433412313461304, "learning_rate": 0.00015207105865373295, "loss": 0.0372, "step": 3800 }, { "epoch": 2.702127659574468, "grad_norm": 0.2023301124572754, "learning_rate": 0.0001518037711069248, "loss": 0.0275, "step": 3810 }, { "epoch": 2.7092198581560285, "grad_norm": 0.26986822485923767, "learning_rate": 0.0001515359767475685, "loss": 0.0284, "step": 3820 }, { "epoch": 2.7163120567375887, "grad_norm": 0.2430671602487564, "learning_rate": 0.00015126767819558022, "loss": 0.0241, "step": 3830 }, { "epoch": 2.723404255319149, "grad_norm": 0.2965194582939148, "learning_rate": 0.00015099887807580904, "loss": 0.029, "step": 3840 }, { "epoch": 2.7304964539007095, "grad_norm": 0.2736563980579376, "learning_rate": 0.00015072957901801076, "loss": 0.0264, "step": 3850 }, { "epoch": 2.7375886524822697, "grad_norm": 0.233638733625412, "learning_rate": 0.00015045978365682257, "loss": 0.0298, "step": 3860 }, { "epoch": 2.74468085106383, "grad_norm": 0.350392609834671, "learning_rate": 0.0001501894946317372, "loss": 0.0278, "step": 3870 }, { "epoch": 2.75177304964539, "grad_norm": 0.2875712215900421, "learning_rate": 0.00014991871458707698, "loss": 0.0302, "step": 3880 }, { "epoch": 2.7588652482269502, "grad_norm": 0.3458658754825592, "learning_rate": 0.000149647446171968, "loss": 0.0316, "step": 3890 }, { "epoch": 2.7659574468085104, "grad_norm": 0.2638990879058838, "learning_rate": 0.00014937569204031436, "loss": 0.0248, "step": 3900 }, { "epoch": 2.773049645390071, "grad_norm": 0.34576040506362915, "learning_rate": 0.00014910345485077197, "loss": 0.0211, "step": 3910 }, { "epoch": 2.780141843971631, "grad_norm": 0.3976755738258362, "learning_rate": 0.00014883073726672269, "loss": 0.0225, "step": 3920 }, { "epoch": 2.7872340425531914, "grad_norm": 0.36923620104789734, "learning_rate": 0.00014855754195624822, "loss": 0.0283, "step": 3930 }, { "epoch": 2.794326241134752, "grad_norm": 0.3352384865283966, "learning_rate": 0.00014828387159210397, "loss": 0.0334, "step": 3940 }, { "epoch": 2.801418439716312, "grad_norm": 0.30438244342803955, "learning_rate": 0.00014800972885169303, "loss": 0.0322, "step": 3950 }, { "epoch": 2.8085106382978724, "grad_norm": 0.3594920337200165, "learning_rate": 0.00014773511641703987, "loss": 0.0399, "step": 3960 }, { "epoch": 2.8156028368794326, "grad_norm": 0.31048035621643066, "learning_rate": 0.00014746003697476404, "loss": 0.0273, "step": 3970 }, { "epoch": 2.8226950354609928, "grad_norm": 0.40405237674713135, "learning_rate": 0.0001471844932160541, "loss": 0.0271, "step": 3980 }, { "epoch": 2.829787234042553, "grad_norm": 0.2698522210121155, "learning_rate": 0.00014690848783664108, "loss": 0.0241, "step": 3990 }, { "epoch": 2.8368794326241136, "grad_norm": 0.3636997640132904, "learning_rate": 0.00014663202353677222, "loss": 0.0233, "step": 4000 }, { "epoch": 2.8439716312056738, "grad_norm": 0.24219240248203278, "learning_rate": 0.00014635510302118452, "loss": 0.0228, "step": 4010 }, { "epoch": 2.851063829787234, "grad_norm": 0.309758722782135, "learning_rate": 0.00014607772899907824, "loss": 0.0217, "step": 4020 }, { "epoch": 2.8581560283687946, "grad_norm": 0.33416473865509033, "learning_rate": 0.0001457999041840906, "loss": 0.0236, "step": 4030 }, { "epoch": 2.8652482269503547, "grad_norm": 0.30040785670280457, "learning_rate": 0.00014552163129426875, "loss": 0.0273, "step": 4040 }, { "epoch": 2.872340425531915, "grad_norm": 0.35119491815567017, "learning_rate": 0.00014524291305204382, "loss": 0.0286, "step": 4050 }, { "epoch": 2.879432624113475, "grad_norm": 0.31673938035964966, "learning_rate": 0.00014496375218420383, "loss": 0.0292, "step": 4060 }, { "epoch": 2.8865248226950353, "grad_norm": 0.20967337489128113, "learning_rate": 0.00014468415142186708, "loss": 0.0298, "step": 4070 }, { "epoch": 2.8936170212765955, "grad_norm": 0.5264660120010376, "learning_rate": 0.0001444041135004556, "loss": 0.0216, "step": 4080 }, { "epoch": 2.900709219858156, "grad_norm": 0.3425130248069763, "learning_rate": 0.0001441236411596683, "loss": 0.0239, "step": 4090 }, { "epoch": 2.9078014184397163, "grad_norm": 0.2979462742805481, "learning_rate": 0.00014384273714345403, "loss": 0.0243, "step": 4100 }, { "epoch": 2.9148936170212765, "grad_norm": 0.2593551576137543, "learning_rate": 0.00014356140419998493, "loss": 0.0236, "step": 4110 }, { "epoch": 2.921985815602837, "grad_norm": 0.41503509879112244, "learning_rate": 0.0001432796450816295, "loss": 0.0235, "step": 4120 }, { "epoch": 2.9290780141843973, "grad_norm": 0.30138614773750305, "learning_rate": 0.00014299746254492555, "loss": 0.0209, "step": 4130 }, { "epoch": 2.9361702127659575, "grad_norm": 0.34066081047058105, "learning_rate": 0.00014271485935055347, "loss": 0.022, "step": 4140 }, { "epoch": 2.9432624113475176, "grad_norm": 0.45455053448677063, "learning_rate": 0.00014243183826330894, "loss": 0.0303, "step": 4150 }, { "epoch": 2.950354609929078, "grad_norm": 0.19702021777629852, "learning_rate": 0.00014214840205207605, "loss": 0.0226, "step": 4160 }, { "epoch": 2.9574468085106385, "grad_norm": 0.3477088212966919, "learning_rate": 0.0001418645534898002, "loss": 0.0233, "step": 4170 }, { "epoch": 2.9645390070921986, "grad_norm": 0.349435955286026, "learning_rate": 0.00014158029535346096, "loss": 0.0252, "step": 4180 }, { "epoch": 2.971631205673759, "grad_norm": 0.1625533252954483, "learning_rate": 0.00014129563042404483, "loss": 0.0217, "step": 4190 }, { "epoch": 2.978723404255319, "grad_norm": 0.19161160290241241, "learning_rate": 0.00014101056148651823, "loss": 0.0173, "step": 4200 }, { "epoch": 2.9858156028368796, "grad_norm": 0.22051677107810974, "learning_rate": 0.00014072509132979994, "loss": 0.0263, "step": 4210 }, { "epoch": 2.99290780141844, "grad_norm": 0.3433932960033417, "learning_rate": 0.0001404392227467341, "loss": 0.02, "step": 4220 }, { "epoch": 3.0, "grad_norm": 0.5353679060935974, "learning_rate": 0.0001401529585340628, "loss": 0.0218, "step": 4230 }, { "epoch": 3.00709219858156, "grad_norm": 0.2597915232181549, "learning_rate": 0.0001398663014923986, "loss": 0.0175, "step": 4240 }, { "epoch": 3.0141843971631204, "grad_norm": 0.38695409893989563, "learning_rate": 0.00013957925442619737, "loss": 0.0281, "step": 4250 }, { "epoch": 3.021276595744681, "grad_norm": 0.33511126041412354, "learning_rate": 0.00013929182014373054, "loss": 0.0211, "step": 4260 }, { "epoch": 3.028368794326241, "grad_norm": 0.2662104368209839, "learning_rate": 0.00013900400145705794, "loss": 0.0252, "step": 4270 }, { "epoch": 3.0354609929078014, "grad_norm": 0.4052877128124237, "learning_rate": 0.00013871580118200006, "loss": 0.0245, "step": 4280 }, { "epoch": 3.0425531914893615, "grad_norm": 0.428830623626709, "learning_rate": 0.0001384272221381107, "loss": 0.0396, "step": 4290 }, { "epoch": 3.049645390070922, "grad_norm": 0.20592275261878967, "learning_rate": 0.0001381382671486491, "loss": 0.0195, "step": 4300 }, { "epoch": 3.0567375886524824, "grad_norm": 0.29792851209640503, "learning_rate": 0.00013784893904055266, "loss": 0.0221, "step": 4310 }, { "epoch": 3.0638297872340425, "grad_norm": 0.3490282893180847, "learning_rate": 0.00013755924064440904, "loss": 0.0255, "step": 4320 }, { "epoch": 3.0709219858156027, "grad_norm": 0.245464026927948, "learning_rate": 0.00013726917479442855, "loss": 0.0239, "step": 4330 }, { "epoch": 3.078014184397163, "grad_norm": 0.28532829880714417, "learning_rate": 0.00013697874432841637, "loss": 0.0206, "step": 4340 }, { "epoch": 3.0851063829787235, "grad_norm": 0.45370057225227356, "learning_rate": 0.00013668795208774496, "loss": 0.0261, "step": 4350 }, { "epoch": 3.0921985815602837, "grad_norm": 0.39404305815696716, "learning_rate": 0.00013639680091732603, "loss": 0.0244, "step": 4360 }, { "epoch": 3.099290780141844, "grad_norm": 0.3023253083229065, "learning_rate": 0.00013610529366558282, "loss": 0.0237, "step": 4370 }, { "epoch": 3.106382978723404, "grad_norm": 0.22088029980659485, "learning_rate": 0.00013581343318442226, "loss": 0.0185, "step": 4380 }, { "epoch": 3.1134751773049647, "grad_norm": 0.28878822922706604, "learning_rate": 0.00013552122232920707, "loss": 0.0264, "step": 4390 }, { "epoch": 3.120567375886525, "grad_norm": 0.2687053084373474, "learning_rate": 0.00013522866395872758, "loss": 0.0206, "step": 4400 }, { "epoch": 3.127659574468085, "grad_norm": 0.32471123337745667, "learning_rate": 0.00013493576093517434, "loss": 0.0211, "step": 4410 }, { "epoch": 3.1347517730496453, "grad_norm": 0.19945155084133148, "learning_rate": 0.00013464251612410936, "loss": 0.0225, "step": 4420 }, { "epoch": 3.141843971631206, "grad_norm": 0.203238844871521, "learning_rate": 0.00013434893239443877, "loss": 0.019, "step": 4430 }, { "epoch": 3.148936170212766, "grad_norm": 0.2069423794746399, "learning_rate": 0.00013405501261838423, "loss": 0.0246, "step": 4440 }, { "epoch": 3.1560283687943262, "grad_norm": 0.3187784254550934, "learning_rate": 0.00013376075967145524, "loss": 0.0222, "step": 4450 }, { "epoch": 3.1631205673758864, "grad_norm": 0.31714534759521484, "learning_rate": 0.00013346617643242062, "loss": 0.0246, "step": 4460 }, { "epoch": 3.1702127659574466, "grad_norm": 0.4024612605571747, "learning_rate": 0.00013317126578328065, "loss": 0.0282, "step": 4470 }, { "epoch": 3.1773049645390072, "grad_norm": 0.2583388388156891, "learning_rate": 0.00013287603060923876, "loss": 0.0184, "step": 4480 }, { "epoch": 3.1843971631205674, "grad_norm": 0.28229662775993347, "learning_rate": 0.00013258047379867334, "loss": 0.0259, "step": 4490 }, { "epoch": 3.1914893617021276, "grad_norm": 0.2962813973426819, "learning_rate": 0.00013228459824310936, "loss": 0.0327, "step": 4500 }, { "epoch": 3.198581560283688, "grad_norm": 0.2835753858089447, "learning_rate": 0.00013198840683719022, "loss": 0.0174, "step": 4510 }, { "epoch": 3.2056737588652484, "grad_norm": 0.3171481788158417, "learning_rate": 0.00013169190247864943, "loss": 0.0315, "step": 4520 }, { "epoch": 3.2127659574468086, "grad_norm": 0.3521289527416229, "learning_rate": 0.0001313950880682821, "loss": 0.0209, "step": 4530 }, { "epoch": 3.219858156028369, "grad_norm": 0.3271763324737549, "learning_rate": 0.00013109796650991683, "loss": 0.0183, "step": 4540 }, { "epoch": 3.226950354609929, "grad_norm": 0.29443103075027466, "learning_rate": 0.00013080054071038698, "loss": 0.0285, "step": 4550 }, { "epoch": 3.2340425531914896, "grad_norm": 0.35706964135169983, "learning_rate": 0.00013050281357950255, "loss": 0.026, "step": 4560 }, { "epoch": 3.2411347517730498, "grad_norm": 0.3586270809173584, "learning_rate": 0.00013020478803002142, "loss": 0.025, "step": 4570 }, { "epoch": 3.24822695035461, "grad_norm": 0.2771762013435364, "learning_rate": 0.00012990646697762107, "loss": 0.0222, "step": 4580 }, { "epoch": 3.25531914893617, "grad_norm": 0.22596819698810577, "learning_rate": 0.00012960785334087, "loss": 0.0211, "step": 4590 }, { "epoch": 3.2624113475177303, "grad_norm": 0.32333022356033325, "learning_rate": 0.00012930895004119907, "loss": 0.0189, "step": 4600 }, { "epoch": 3.269503546099291, "grad_norm": 0.35000088810920715, "learning_rate": 0.00012900976000287313, "loss": 0.0284, "step": 4610 }, { "epoch": 3.276595744680851, "grad_norm": 0.3429652154445648, "learning_rate": 0.00012871028615296212, "loss": 0.0307, "step": 4620 }, { "epoch": 3.2836879432624113, "grad_norm": 0.2919664978981018, "learning_rate": 0.00012841053142131272, "loss": 0.0226, "step": 4630 }, { "epoch": 3.2907801418439715, "grad_norm": 0.30846384167671204, "learning_rate": 0.00012811049874051955, "loss": 0.0199, "step": 4640 }, { "epoch": 3.297872340425532, "grad_norm": 0.4330957531929016, "learning_rate": 0.00012781019104589645, "loss": 0.0283, "step": 4650 }, { "epoch": 3.3049645390070923, "grad_norm": 0.2327142059803009, "learning_rate": 0.0001275096112754478, "loss": 0.0295, "step": 4660 }, { "epoch": 3.3120567375886525, "grad_norm": 0.32682859897613525, "learning_rate": 0.00012720876236983988, "loss": 0.0364, "step": 4670 }, { "epoch": 3.3191489361702127, "grad_norm": 0.25689056515693665, "learning_rate": 0.00012690764727237193, "loss": 0.02, "step": 4680 }, { "epoch": 3.326241134751773, "grad_norm": 0.275511234998703, "learning_rate": 0.0001266062689289474, "loss": 0.0307, "step": 4690 }, { "epoch": 3.3333333333333335, "grad_norm": 0.3178229331970215, "learning_rate": 0.00012630463028804513, "loss": 0.0269, "step": 4700 }, { "epoch": 3.3404255319148937, "grad_norm": 0.2833107113838196, "learning_rate": 0.00012600273430069073, "loss": 0.0247, "step": 4710 }, { "epoch": 3.347517730496454, "grad_norm": 0.3253328800201416, "learning_rate": 0.0001257005839204273, "loss": 0.0217, "step": 4720 }, { "epoch": 3.354609929078014, "grad_norm": 0.26066187024116516, "learning_rate": 0.00012539818210328683, "loss": 0.0201, "step": 4730 }, { "epoch": 3.3617021276595747, "grad_norm": 0.24645408987998962, "learning_rate": 0.0001250955318077612, "loss": 0.0315, "step": 4740 }, { "epoch": 3.368794326241135, "grad_norm": 0.3252389132976532, "learning_rate": 0.00012479263599477318, "loss": 0.0203, "step": 4750 }, { "epoch": 3.375886524822695, "grad_norm": 0.21474260091781616, "learning_rate": 0.00012448949762764762, "loss": 0.0207, "step": 4760 }, { "epoch": 3.382978723404255, "grad_norm": 0.22793646156787872, "learning_rate": 0.00012418611967208223, "loss": 0.0211, "step": 4770 }, { "epoch": 3.3900709219858154, "grad_norm": 0.2615390419960022, "learning_rate": 0.00012388250509611876, "loss": 0.0344, "step": 4780 }, { "epoch": 3.397163120567376, "grad_norm": 0.4156443476676941, "learning_rate": 0.00012357865687011389, "loss": 0.0299, "step": 4790 }, { "epoch": 3.404255319148936, "grad_norm": 0.49900200963020325, "learning_rate": 0.00012327457796671015, "loss": 0.0245, "step": 4800 }, { "epoch": 3.4113475177304964, "grad_norm": 0.37122806906700134, "learning_rate": 0.00012297027136080687, "loss": 0.0276, "step": 4810 }, { "epoch": 3.4184397163120566, "grad_norm": 0.28465649485588074, "learning_rate": 0.00012266574002953108, "loss": 0.0196, "step": 4820 }, { "epoch": 3.425531914893617, "grad_norm": 0.24766407907009125, "learning_rate": 0.00012236098695220831, "loss": 0.0256, "step": 4830 }, { "epoch": 3.4326241134751774, "grad_norm": 0.27211466431617737, "learning_rate": 0.0001220560151103336, "loss": 0.0284, "step": 4840 }, { "epoch": 3.4397163120567376, "grad_norm": 0.2607908546924591, "learning_rate": 0.00012175082748754212, "loss": 0.0213, "step": 4850 }, { "epoch": 3.4468085106382977, "grad_norm": 0.22450798749923706, "learning_rate": 0.0001214454270695802, "loss": 0.0243, "step": 4860 }, { "epoch": 3.453900709219858, "grad_norm": 0.2559250295162201, "learning_rate": 0.00012113981684427591, "loss": 0.0302, "step": 4870 }, { "epoch": 3.4609929078014185, "grad_norm": 0.4311963617801666, "learning_rate": 0.00012083399980151, "loss": 0.0275, "step": 4880 }, { "epoch": 3.4680851063829787, "grad_norm": 0.24473054707050323, "learning_rate": 0.00012052797893318657, "loss": 0.0251, "step": 4890 }, { "epoch": 3.475177304964539, "grad_norm": 0.24619214236736298, "learning_rate": 0.00012022175723320381, "loss": 0.0198, "step": 4900 }, { "epoch": 3.482269503546099, "grad_norm": 0.3668628931045532, "learning_rate": 0.00011991533769742469, "loss": 0.0313, "step": 4910 }, { "epoch": 3.4893617021276597, "grad_norm": 0.4206676483154297, "learning_rate": 0.00011960872332364765, "loss": 0.0296, "step": 4920 }, { "epoch": 3.49645390070922, "grad_norm": 0.28001976013183594, "learning_rate": 0.00011930191711157737, "loss": 0.0243, "step": 4930 }, { "epoch": 3.50354609929078, "grad_norm": 0.2904788553714752, "learning_rate": 0.00011899492206279524, "loss": 0.0215, "step": 4940 }, { "epoch": 3.5106382978723403, "grad_norm": 0.3145068883895874, "learning_rate": 0.0001186877411807302, "loss": 0.0254, "step": 4950 }, { "epoch": 3.5177304964539005, "grad_norm": 0.42452743649482727, "learning_rate": 0.0001183803774706292, "loss": 0.0249, "step": 4960 }, { "epoch": 3.524822695035461, "grad_norm": 0.4683021903038025, "learning_rate": 0.00011807283393952786, "loss": 0.0218, "step": 4970 }, { "epoch": 3.5319148936170213, "grad_norm": 0.20090143382549286, "learning_rate": 0.00011776511359622105, "loss": 0.0187, "step": 4980 }, { "epoch": 3.5390070921985815, "grad_norm": 0.20158180594444275, "learning_rate": 0.00011745721945123343, "loss": 0.0263, "step": 4990 }, { "epoch": 3.546099290780142, "grad_norm": 0.2160181701183319, "learning_rate": 0.00011714915451679003, "loss": 0.0253, "step": 5000 }, { "epoch": 3.5531914893617023, "grad_norm": 0.2958919405937195, "learning_rate": 0.00011684092180678683, "loss": 0.0276, "step": 5010 }, { "epoch": 3.5602836879432624, "grad_norm": 0.3748587667942047, "learning_rate": 0.00011653252433676108, "loss": 0.0244, "step": 5020 }, { "epoch": 3.5673758865248226, "grad_norm": 0.1788649708032608, "learning_rate": 0.00011622396512386202, "loss": 0.0217, "step": 5030 }, { "epoch": 3.574468085106383, "grad_norm": 0.21001924574375153, "learning_rate": 0.00011591524718682127, "loss": 0.019, "step": 5040 }, { "epoch": 3.581560283687943, "grad_norm": 0.22885674238204956, "learning_rate": 0.00011560637354592332, "loss": 0.0185, "step": 5050 }, { "epoch": 3.5886524822695036, "grad_norm": 0.29409468173980713, "learning_rate": 0.0001152973472229758, "loss": 0.0167, "step": 5060 }, { "epoch": 3.595744680851064, "grad_norm": 0.30863863229751587, "learning_rate": 0.00011498817124128032, "loss": 0.0254, "step": 5070 }, { "epoch": 3.602836879432624, "grad_norm": 0.18340665102005005, "learning_rate": 0.00011467884862560245, "loss": 0.0255, "step": 5080 }, { "epoch": 3.6099290780141846, "grad_norm": 0.28949764370918274, "learning_rate": 0.00011436938240214241, "loss": 0.0303, "step": 5090 }, { "epoch": 3.617021276595745, "grad_norm": 0.19286899268627167, "learning_rate": 0.0001140597755985054, "loss": 0.0284, "step": 5100 }, { "epoch": 3.624113475177305, "grad_norm": 0.4090544879436493, "learning_rate": 0.00011375003124367192, "loss": 0.0218, "step": 5110 }, { "epoch": 3.631205673758865, "grad_norm": 0.32162442803382874, "learning_rate": 0.00011344015236796822, "loss": 0.0253, "step": 5120 }, { "epoch": 3.6382978723404253, "grad_norm": 0.2677665054798126, "learning_rate": 0.00011313014200303647, "loss": 0.0169, "step": 5130 }, { "epoch": 3.645390070921986, "grad_norm": 0.4189298152923584, "learning_rate": 0.00011282000318180545, "loss": 0.0205, "step": 5140 }, { "epoch": 3.652482269503546, "grad_norm": 0.2897457778453827, "learning_rate": 0.00011250973893846055, "loss": 0.0207, "step": 5150 }, { "epoch": 3.6595744680851063, "grad_norm": 0.21248039603233337, "learning_rate": 0.00011219935230841421, "loss": 0.0221, "step": 5160 }, { "epoch": 3.6666666666666665, "grad_norm": 0.23816858232021332, "learning_rate": 0.00011188884632827619, "loss": 0.0209, "step": 5170 }, { "epoch": 3.673758865248227, "grad_norm": 0.24563542008399963, "learning_rate": 0.00011157822403582399, "loss": 0.0233, "step": 5180 }, { "epoch": 3.6808510638297873, "grad_norm": 0.23103229701519012, "learning_rate": 0.0001112674884699729, "loss": 0.0157, "step": 5190 }, { "epoch": 3.6879432624113475, "grad_norm": 0.27050310373306274, "learning_rate": 0.00011095664267074655, "loss": 0.0214, "step": 5200 }, { "epoch": 3.6950354609929077, "grad_norm": 0.16521809995174408, "learning_rate": 0.00011064568967924683, "loss": 0.0224, "step": 5210 }, { "epoch": 3.702127659574468, "grad_norm": 0.19685564935207367, "learning_rate": 0.00011033463253762452, "loss": 0.0157, "step": 5220 }, { "epoch": 3.7092198581560285, "grad_norm": 0.2071635127067566, "learning_rate": 0.0001100234742890492, "loss": 0.0196, "step": 5230 }, { "epoch": 3.7163120567375887, "grad_norm": 0.19163092970848083, "learning_rate": 0.00010971221797767966, "loss": 0.0183, "step": 5240 }, { "epoch": 3.723404255319149, "grad_norm": 0.43572723865509033, "learning_rate": 0.00010940086664863404, "loss": 0.0189, "step": 5250 }, { "epoch": 3.7304964539007095, "grad_norm": 0.19704580307006836, "learning_rate": 0.00010908942334796015, "loss": 0.0213, "step": 5260 }, { "epoch": 3.7375886524822697, "grad_norm": 0.30172115564346313, "learning_rate": 0.00010877789112260551, "loss": 0.0242, "step": 5270 }, { "epoch": 3.74468085106383, "grad_norm": 0.2991447150707245, "learning_rate": 0.00010846627302038756, "loss": 0.0163, "step": 5280 }, { "epoch": 3.75177304964539, "grad_norm": 0.2086688131093979, "learning_rate": 0.00010815457208996407, "loss": 0.0162, "step": 5290 }, { "epoch": 3.7588652482269502, "grad_norm": 0.25282007455825806, "learning_rate": 0.000107842791380803, "loss": 0.0292, "step": 5300 }, { "epoch": 3.7659574468085104, "grad_norm": 0.21109668910503387, "learning_rate": 0.0001075309339431529, "loss": 0.0283, "step": 5310 }, { "epoch": 3.773049645390071, "grad_norm": 0.212016299366951, "learning_rate": 0.00010721900282801287, "loss": 0.0248, "step": 5320 }, { "epoch": 3.780141843971631, "grad_norm": 0.19006933271884918, "learning_rate": 0.00010690700108710297, "loss": 0.0247, "step": 5330 }, { "epoch": 3.7872340425531914, "grad_norm": 0.28409719467163086, "learning_rate": 0.00010659493177283408, "loss": 0.0236, "step": 5340 }, { "epoch": 3.794326241134752, "grad_norm": 0.3740707337856293, "learning_rate": 0.00010628279793827825, "loss": 0.0244, "step": 5350 }, { "epoch": 3.801418439716312, "grad_norm": 0.20624110102653503, "learning_rate": 0.00010597060263713872, "loss": 0.0213, "step": 5360 }, { "epoch": 3.8085106382978724, "grad_norm": 0.2841550409793854, "learning_rate": 0.0001056583489237201, "loss": 0.0156, "step": 5370 }, { "epoch": 3.8156028368794326, "grad_norm": 0.3967061936855316, "learning_rate": 0.00010534603985289844, "loss": 0.0233, "step": 5380 }, { "epoch": 3.8226950354609928, "grad_norm": 0.33291056752204895, "learning_rate": 0.00010503367848009133, "loss": 0.0214, "step": 5390 }, { "epoch": 3.829787234042553, "grad_norm": 0.3669842779636383, "learning_rate": 0.00010472126786122818, "loss": 0.0255, "step": 5400 }, { "epoch": 3.8368794326241136, "grad_norm": 0.2933257520198822, "learning_rate": 0.00010440881105272007, "loss": 0.021, "step": 5410 }, { "epoch": 3.8439716312056738, "grad_norm": 0.1543368250131607, "learning_rate": 0.00010409631111142997, "loss": 0.0206, "step": 5420 }, { "epoch": 3.851063829787234, "grad_norm": 0.2014048546552658, "learning_rate": 0.0001037837710946429, "loss": 0.0198, "step": 5430 }, { "epoch": 3.8581560283687946, "grad_norm": 0.2781049907207489, "learning_rate": 0.00010347119406003592, "loss": 0.0261, "step": 5440 }, { "epoch": 3.8652482269503547, "grad_norm": 0.26720258593559265, "learning_rate": 0.0001031585830656482, "loss": 0.0216, "step": 5450 }, { "epoch": 3.872340425531915, "grad_norm": 0.23284177482128143, "learning_rate": 0.00010284594116985125, "loss": 0.0195, "step": 5460 }, { "epoch": 3.879432624113475, "grad_norm": 0.36926499009132385, "learning_rate": 0.00010253327143131879, "loss": 0.0226, "step": 5470 }, { "epoch": 3.8865248226950353, "grad_norm": 0.3114331066608429, "learning_rate": 0.00010222057690899705, "loss": 0.0242, "step": 5480 }, { "epoch": 3.8936170212765955, "grad_norm": 0.2598218023777008, "learning_rate": 0.00010190786066207458, "loss": 0.0174, "step": 5490 }, { "epoch": 3.900709219858156, "grad_norm": 0.21172010898590088, "learning_rate": 0.00010159512574995258, "loss": 0.0164, "step": 5500 }, { "epoch": 3.9078014184397163, "grad_norm": 0.23221197724342346, "learning_rate": 0.00010128237523221487, "loss": 0.0174, "step": 5510 }, { "epoch": 3.9148936170212765, "grad_norm": 0.21310311555862427, "learning_rate": 0.00010096961216859787, "loss": 0.0186, "step": 5520 }, { "epoch": 3.921985815602837, "grad_norm": 0.22581326961517334, "learning_rate": 0.00010065683961896074, "loss": 0.0201, "step": 5530 }, { "epoch": 3.9290780141843973, "grad_norm": 0.36404818296432495, "learning_rate": 0.00010034406064325553, "loss": 0.0188, "step": 5540 }, { "epoch": 3.9361702127659575, "grad_norm": 0.2163950651884079, "learning_rate": 0.00010003127830149706, "loss": 0.0159, "step": 5550 }, { "epoch": 3.9432624113475176, "grad_norm": 0.5740591883659363, "learning_rate": 9.971849565373317e-05, "loss": 0.0293, "step": 5560 }, { "epoch": 3.950354609929078, "grad_norm": 0.31515783071517944, "learning_rate": 9.940571576001465e-05, "loss": 0.0214, "step": 5570 }, { "epoch": 3.9574468085106385, "grad_norm": 0.3176766335964203, "learning_rate": 9.909294168036531e-05, "loss": 0.018, "step": 5580 }, { "epoch": 3.9645390070921986, "grad_norm": 0.2528095245361328, "learning_rate": 9.87801764747521e-05, "loss": 0.0221, "step": 5590 }, { "epoch": 3.971631205673759, "grad_norm": 0.30647730827331543, "learning_rate": 9.846742320305527e-05, "loss": 0.0261, "step": 5600 }, { "epoch": 3.978723404255319, "grad_norm": 0.2308386266231537, "learning_rate": 9.815468492503812e-05, "loss": 0.017, "step": 5610 }, { "epoch": 3.9858156028368796, "grad_norm": 0.21092812716960907, "learning_rate": 9.78419647003174e-05, "loss": 0.023, "step": 5620 }, { "epoch": 3.99290780141844, "grad_norm": 0.3230001628398895, "learning_rate": 9.752926558833317e-05, "loss": 0.0174, "step": 5630 }, { "epoch": 4.0, "grad_norm": 0.3302082419395447, "learning_rate": 9.721659064831895e-05, "loss": 0.0206, "step": 5640 }, { "epoch": 4.00709219858156, "grad_norm": 0.2825748026371002, "learning_rate": 9.690394293927189e-05, "loss": 0.0203, "step": 5650 }, { "epoch": 4.01418439716312, "grad_norm": 0.2544749081134796, "learning_rate": 9.659132551992248e-05, "loss": 0.0191, "step": 5660 }, { "epoch": 4.0212765957446805, "grad_norm": 0.19003655016422272, "learning_rate": 9.627874144870514e-05, "loss": 0.0167, "step": 5670 }, { "epoch": 4.028368794326241, "grad_norm": 0.18228965997695923, "learning_rate": 9.596619378372794e-05, "loss": 0.0286, "step": 5680 }, { "epoch": 4.035460992907802, "grad_norm": 0.27924972772598267, "learning_rate": 9.565368558274266e-05, "loss": 0.0126, "step": 5690 }, { "epoch": 4.042553191489362, "grad_norm": 0.1970324069261551, "learning_rate": 9.534121990311515e-05, "loss": 0.0192, "step": 5700 }, { "epoch": 4.049645390070922, "grad_norm": 0.37134411931037903, "learning_rate": 9.502879980179525e-05, "loss": 0.0233, "step": 5710 }, { "epoch": 4.056737588652482, "grad_norm": 0.20801763236522675, "learning_rate": 9.471642833528673e-05, "loss": 0.0176, "step": 5720 }, { "epoch": 4.0638297872340425, "grad_norm": 0.19383108615875244, "learning_rate": 9.440410855961776e-05, "loss": 0.0228, "step": 5730 }, { "epoch": 4.070921985815603, "grad_norm": 0.21508440375328064, "learning_rate": 9.409184353031068e-05, "loss": 0.0163, "step": 5740 }, { "epoch": 4.078014184397163, "grad_norm": 0.2944611608982086, "learning_rate": 9.377963630235225e-05, "loss": 0.0219, "step": 5750 }, { "epoch": 4.085106382978723, "grad_norm": 0.23027314245700836, "learning_rate": 9.346748993016377e-05, "loss": 0.0177, "step": 5760 }, { "epoch": 4.092198581560283, "grad_norm": 0.17456182837486267, "learning_rate": 9.315540746757108e-05, "loss": 0.0157, "step": 5770 }, { "epoch": 4.099290780141844, "grad_norm": 0.34717416763305664, "learning_rate": 9.284339196777491e-05, "loss": 0.0244, "step": 5780 }, { "epoch": 4.1063829787234045, "grad_norm": 0.38114133477211, "learning_rate": 9.25314464833208e-05, "loss": 0.0189, "step": 5790 }, { "epoch": 4.113475177304965, "grad_norm": 0.3224876821041107, "learning_rate": 9.221957406606926e-05, "loss": 0.0196, "step": 5800 }, { "epoch": 4.120567375886525, "grad_norm": 0.38748404383659363, "learning_rate": 9.190777776716606e-05, "loss": 0.0271, "step": 5810 }, { "epoch": 4.127659574468085, "grad_norm": 0.3015083074569702, "learning_rate": 9.159606063701221e-05, "loss": 0.0263, "step": 5820 }, { "epoch": 4.134751773049645, "grad_norm": 0.23368023335933685, "learning_rate": 9.128442572523417e-05, "loss": 0.0199, "step": 5830 }, { "epoch": 4.141843971631205, "grad_norm": 0.209278866648674, "learning_rate": 9.097287608065414e-05, "loss": 0.0157, "step": 5840 }, { "epoch": 4.148936170212766, "grad_norm": 0.21174615621566772, "learning_rate": 9.066141475126003e-05, "loss": 0.0161, "step": 5850 }, { "epoch": 4.156028368794326, "grad_norm": 0.22363576292991638, "learning_rate": 9.035004478417573e-05, "loss": 0.0202, "step": 5860 }, { "epoch": 4.163120567375887, "grad_norm": 0.22810956835746765, "learning_rate": 9.003876922563137e-05, "loss": 0.0179, "step": 5870 }, { "epoch": 4.170212765957447, "grad_norm": 0.1800043135881424, "learning_rate": 8.972759112093336e-05, "loss": 0.0235, "step": 5880 }, { "epoch": 4.177304964539007, "grad_norm": 0.2977979779243469, "learning_rate": 8.941651351443476e-05, "loss": 0.0167, "step": 5890 }, { "epoch": 4.184397163120567, "grad_norm": 0.1779855191707611, "learning_rate": 8.910553944950549e-05, "loss": 0.0156, "step": 5900 }, { "epoch": 4.191489361702128, "grad_norm": 0.20011846721172333, "learning_rate": 8.879467196850229e-05, "loss": 0.0252, "step": 5910 }, { "epoch": 4.198581560283688, "grad_norm": 0.12876836955547333, "learning_rate": 8.848391411273933e-05, "loss": 0.0191, "step": 5920 }, { "epoch": 4.205673758865248, "grad_norm": 0.24333477020263672, "learning_rate": 8.817326892245825e-05, "loss": 0.0196, "step": 5930 }, { "epoch": 4.212765957446808, "grad_norm": 0.1953645497560501, "learning_rate": 8.786273943679835e-05, "loss": 0.0225, "step": 5940 }, { "epoch": 4.219858156028369, "grad_norm": 0.25837603211402893, "learning_rate": 8.755232869376706e-05, "loss": 0.0155, "step": 5950 }, { "epoch": 4.226950354609929, "grad_norm": 0.24299311637878418, "learning_rate": 8.724203973021015e-05, "loss": 0.0168, "step": 5960 }, { "epoch": 4.23404255319149, "grad_norm": 0.27927613258361816, "learning_rate": 8.693187558178181e-05, "loss": 0.0174, "step": 5970 }, { "epoch": 4.24113475177305, "grad_norm": 0.23098881542682648, "learning_rate": 8.662183928291532e-05, "loss": 0.0119, "step": 5980 }, { "epoch": 4.24822695035461, "grad_norm": 0.187413290143013, "learning_rate": 8.631193386679301e-05, "loss": 0.017, "step": 5990 }, { "epoch": 4.25531914893617, "grad_norm": 0.36277079582214355, "learning_rate": 8.600216236531682e-05, "loss": 0.0249, "step": 6000 }, { "epoch": 4.26241134751773, "grad_norm": 0.2662680745124817, "learning_rate": 8.569252780907862e-05, "loss": 0.0165, "step": 6010 }, { "epoch": 4.2695035460992905, "grad_norm": 0.12048438936471939, "learning_rate": 8.538303322733032e-05, "loss": 0.0155, "step": 6020 }, { "epoch": 4.276595744680851, "grad_norm": 0.2885691523551941, "learning_rate": 8.507368164795462e-05, "loss": 0.0259, "step": 6030 }, { "epoch": 4.283687943262412, "grad_norm": 0.2707173526287079, "learning_rate": 8.476447609743508e-05, "loss": 0.0206, "step": 6040 }, { "epoch": 4.290780141843972, "grad_norm": 0.32506418228149414, "learning_rate": 8.44554196008266e-05, "loss": 0.0167, "step": 6050 }, { "epoch": 4.297872340425532, "grad_norm": 0.2805072069168091, "learning_rate": 8.414651518172583e-05, "loss": 0.0192, "step": 6060 }, { "epoch": 4.304964539007092, "grad_norm": 0.34258946776390076, "learning_rate": 8.383776586224175e-05, "loss": 0.0187, "step": 6070 }, { "epoch": 4.3120567375886525, "grad_norm": 0.21396967768669128, "learning_rate": 8.35291746629657e-05, "loss": 0.0139, "step": 6080 }, { "epoch": 4.319148936170213, "grad_norm": 0.2685137987136841, "learning_rate": 8.322074460294231e-05, "loss": 0.0153, "step": 6090 }, { "epoch": 4.326241134751773, "grad_norm": 0.22136232256889343, "learning_rate": 8.291247869963959e-05, "loss": 0.0256, "step": 6100 }, { "epoch": 4.333333333333333, "grad_norm": 0.2527240514755249, "learning_rate": 8.26043799689196e-05, "loss": 0.0219, "step": 6110 }, { "epoch": 4.340425531914893, "grad_norm": 0.12039966136217117, "learning_rate": 8.229645142500897e-05, "loss": 0.0169, "step": 6120 }, { "epoch": 4.347517730496454, "grad_norm": 0.2619229853153229, "learning_rate": 8.198869608046915e-05, "loss": 0.0245, "step": 6130 }, { "epoch": 4.3546099290780145, "grad_norm": 0.3446950316429138, "learning_rate": 8.168111694616733e-05, "loss": 0.0196, "step": 6140 }, { "epoch": 4.361702127659575, "grad_norm": 0.18836474418640137, "learning_rate": 8.137371703124671e-05, "loss": 0.0154, "step": 6150 }, { "epoch": 4.368794326241135, "grad_norm": 0.2591850757598877, "learning_rate": 8.106649934309706e-05, "loss": 0.0214, "step": 6160 }, { "epoch": 4.375886524822695, "grad_norm": 0.2989495098590851, "learning_rate": 8.075946688732545e-05, "loss": 0.0169, "step": 6170 }, { "epoch": 4.382978723404255, "grad_norm": 0.33893871307373047, "learning_rate": 8.045262266772675e-05, "loss": 0.0234, "step": 6180 }, { "epoch": 4.390070921985815, "grad_norm": 0.3758526146411896, "learning_rate": 8.01459696862542e-05, "loss": 0.0184, "step": 6190 }, { "epoch": 4.397163120567376, "grad_norm": 0.26383623480796814, "learning_rate": 7.983951094299022e-05, "loss": 0.0221, "step": 6200 }, { "epoch": 4.404255319148936, "grad_norm": 0.44497719407081604, "learning_rate": 7.953324943611677e-05, "loss": 0.024, "step": 6210 }, { "epoch": 4.411347517730497, "grad_norm": 0.3149394392967224, "learning_rate": 7.92271881618863e-05, "loss": 0.0266, "step": 6220 }, { "epoch": 4.418439716312057, "grad_norm": 0.19326764345169067, "learning_rate": 7.892133011459237e-05, "loss": 0.0179, "step": 6230 }, { "epoch": 4.425531914893617, "grad_norm": 0.2152886539697647, "learning_rate": 7.861567828654013e-05, "loss": 0.0213, "step": 6240 }, { "epoch": 4.432624113475177, "grad_norm": 0.22995711863040924, "learning_rate": 7.831023566801734e-05, "loss": 0.0152, "step": 6250 }, { "epoch": 4.439716312056738, "grad_norm": 0.28582632541656494, "learning_rate": 7.800500524726505e-05, "loss": 0.0237, "step": 6260 }, { "epoch": 4.446808510638298, "grad_norm": 0.2682250142097473, "learning_rate": 7.769999001044818e-05, "loss": 0.0198, "step": 6270 }, { "epoch": 4.453900709219858, "grad_norm": 0.3235504627227783, "learning_rate": 7.739519294162652e-05, "loss": 0.0186, "step": 6280 }, { "epoch": 4.460992907801418, "grad_norm": 0.3280167281627655, "learning_rate": 7.709061702272546e-05, "loss": 0.0168, "step": 6290 }, { "epoch": 4.468085106382979, "grad_norm": 0.2376112937927246, "learning_rate": 7.678626523350674e-05, "loss": 0.0208, "step": 6300 }, { "epoch": 4.475177304964539, "grad_norm": 0.38887420296669006, "learning_rate": 7.648214055153946e-05, "loss": 0.0146, "step": 6310 }, { "epoch": 4.4822695035460995, "grad_norm": 0.24783270061016083, "learning_rate": 7.617824595217074e-05, "loss": 0.0172, "step": 6320 }, { "epoch": 4.48936170212766, "grad_norm": 0.22741125524044037, "learning_rate": 7.587458440849691e-05, "loss": 0.0202, "step": 6330 }, { "epoch": 4.49645390070922, "grad_norm": 0.3385200798511505, "learning_rate": 7.557115889133408e-05, "loss": 0.0232, "step": 6340 }, { "epoch": 4.50354609929078, "grad_norm": 0.2820025384426117, "learning_rate": 7.526797236918929e-05, "loss": 0.0148, "step": 6350 }, { "epoch": 4.51063829787234, "grad_norm": 0.27540770173072815, "learning_rate": 7.496502780823141e-05, "loss": 0.0173, "step": 6360 }, { "epoch": 4.5177304964539005, "grad_norm": 0.23689982295036316, "learning_rate": 7.466232817226224e-05, "loss": 0.0192, "step": 6370 }, { "epoch": 4.524822695035461, "grad_norm": 0.31511813402175903, "learning_rate": 7.435987642268715e-05, "loss": 0.019, "step": 6380 }, { "epoch": 4.531914893617021, "grad_norm": 0.2491617202758789, "learning_rate": 7.405767551848662e-05, "loss": 0.0233, "step": 6390 }, { "epoch": 4.539007092198582, "grad_norm": 0.3146982192993164, "learning_rate": 7.37557284161869e-05, "loss": 0.02, "step": 6400 }, { "epoch": 4.546099290780142, "grad_norm": 0.1645408272743225, "learning_rate": 7.345403806983121e-05, "loss": 0.0195, "step": 6410 }, { "epoch": 4.553191489361702, "grad_norm": 0.23353220522403717, "learning_rate": 7.31526074309509e-05, "loss": 0.0197, "step": 6420 }, { "epoch": 4.560283687943262, "grad_norm": 0.2860185205936432, "learning_rate": 7.285143944853652e-05, "loss": 0.0199, "step": 6430 }, { "epoch": 4.567375886524823, "grad_norm": 0.23552881181240082, "learning_rate": 7.255053706900887e-05, "loss": 0.0145, "step": 6440 }, { "epoch": 4.574468085106383, "grad_norm": 0.41338714957237244, "learning_rate": 7.224990323619044e-05, "loss": 0.0194, "step": 6450 }, { "epoch": 4.581560283687943, "grad_norm": 0.23148952424526215, "learning_rate": 7.194954089127628e-05, "loss": 0.0166, "step": 6460 }, { "epoch": 4.588652482269503, "grad_norm": 0.26471659541130066, "learning_rate": 7.16494529728055e-05, "loss": 0.0147, "step": 6470 }, { "epoch": 4.595744680851064, "grad_norm": 0.22270764410495758, "learning_rate": 7.134964241663237e-05, "loss": 0.0132, "step": 6480 }, { "epoch": 4.602836879432624, "grad_norm": 0.1745089590549469, "learning_rate": 7.105011215589759e-05, "loss": 0.0122, "step": 6490 }, { "epoch": 4.609929078014185, "grad_norm": 0.12301220744848251, "learning_rate": 7.075086512099973e-05, "loss": 0.0143, "step": 6500 }, { "epoch": 4.617021276595745, "grad_norm": 0.29022157192230225, "learning_rate": 7.045190423956646e-05, "loss": 0.0279, "step": 6510 }, { "epoch": 4.624113475177305, "grad_norm": 0.2177857905626297, "learning_rate": 7.015323243642584e-05, "loss": 0.0216, "step": 6520 }, { "epoch": 4.631205673758865, "grad_norm": 0.31943878531455994, "learning_rate": 6.985485263357785e-05, "loss": 0.016, "step": 6530 }, { "epoch": 4.638297872340425, "grad_norm": 0.2381797432899475, "learning_rate": 6.955676775016579e-05, "loss": 0.0205, "step": 6540 }, { "epoch": 4.6453900709219855, "grad_norm": 0.26750218868255615, "learning_rate": 6.925898070244752e-05, "loss": 0.0231, "step": 6550 }, { "epoch": 4.652482269503546, "grad_norm": 0.2351475954055786, "learning_rate": 6.896149440376725e-05, "loss": 0.0128, "step": 6560 }, { "epoch": 4.659574468085106, "grad_norm": 0.2950522005558014, "learning_rate": 6.86643117645267e-05, "loss": 0.0223, "step": 6570 }, { "epoch": 4.666666666666667, "grad_norm": 0.24356043338775635, "learning_rate": 6.836743569215696e-05, "loss": 0.0151, "step": 6580 }, { "epoch": 4.673758865248227, "grad_norm": 0.20342320203781128, "learning_rate": 6.807086909108978e-05, "loss": 0.015, "step": 6590 }, { "epoch": 4.680851063829787, "grad_norm": 0.22289365530014038, "learning_rate": 6.777461486272925e-05, "loss": 0.0134, "step": 6600 }, { "epoch": 4.6879432624113475, "grad_norm": 0.17356853187084198, "learning_rate": 6.747867590542345e-05, "loss": 0.0111, "step": 6610 }, { "epoch": 4.695035460992908, "grad_norm": 0.1519446223974228, "learning_rate": 6.718305511443612e-05, "loss": 0.0172, "step": 6620 }, { "epoch": 4.702127659574468, "grad_norm": 0.22093936800956726, "learning_rate": 6.688775538191816e-05, "loss": 0.0144, "step": 6630 }, { "epoch": 4.709219858156028, "grad_norm": 0.29076847434043884, "learning_rate": 6.659277959687954e-05, "loss": 0.0147, "step": 6640 }, { "epoch": 4.716312056737589, "grad_norm": 0.4279550611972809, "learning_rate": 6.629813064516094e-05, "loss": 0.015, "step": 6650 }, { "epoch": 4.723404255319149, "grad_norm": 0.2095872014760971, "learning_rate": 6.600381140940544e-05, "loss": 0.017, "step": 6660 }, { "epoch": 4.7304964539007095, "grad_norm": 0.19063495099544525, "learning_rate": 6.570982476903061e-05, "loss": 0.0218, "step": 6670 }, { "epoch": 4.73758865248227, "grad_norm": 0.3333429992198944, "learning_rate": 6.541617360019985e-05, "loss": 0.0175, "step": 6680 }, { "epoch": 4.74468085106383, "grad_norm": 0.20596669614315033, "learning_rate": 6.512286077579478e-05, "loss": 0.0143, "step": 6690 }, { "epoch": 4.75177304964539, "grad_norm": 0.36356234550476074, "learning_rate": 6.48298891653868e-05, "loss": 0.0218, "step": 6700 }, { "epoch": 4.75886524822695, "grad_norm": 0.18473972380161285, "learning_rate": 6.453726163520906e-05, "loss": 0.0133, "step": 6710 }, { "epoch": 4.76595744680851, "grad_norm": 0.7571324110031128, "learning_rate": 6.424498104812852e-05, "loss": 0.016, "step": 6720 }, { "epoch": 4.773049645390071, "grad_norm": 0.24869713187217712, "learning_rate": 6.395305026361795e-05, "loss": 0.0212, "step": 6730 }, { "epoch": 4.780141843971631, "grad_norm": 0.1801852136850357, "learning_rate": 6.366147213772772e-05, "loss": 0.022, "step": 6740 }, { "epoch": 4.787234042553192, "grad_norm": 0.31297555565834045, "learning_rate": 6.337024952305819e-05, "loss": 0.0178, "step": 6750 }, { "epoch": 4.794326241134752, "grad_norm": 0.1869523674249649, "learning_rate": 6.307938526873157e-05, "loss": 0.02, "step": 6760 }, { "epoch": 4.801418439716312, "grad_norm": 0.10907835513353348, "learning_rate": 6.278888222036411e-05, "loss": 0.0205, "step": 6770 }, { "epoch": 4.808510638297872, "grad_norm": 0.23566560447216034, "learning_rate": 6.249874322003833e-05, "loss": 0.0164, "step": 6780 }, { "epoch": 4.815602836879433, "grad_norm": 0.2034124732017517, "learning_rate": 6.220897110627504e-05, "loss": 0.014, "step": 6790 }, { "epoch": 4.822695035460993, "grad_norm": 0.2759285271167755, "learning_rate": 6.191956871400582e-05, "loss": 0.0257, "step": 6800 }, { "epoch": 4.829787234042553, "grad_norm": 0.36838316917419434, "learning_rate": 6.163053887454509e-05, "loss": 0.0189, "step": 6810 }, { "epoch": 4.836879432624113, "grad_norm": 0.21057933568954468, "learning_rate": 6.134188441556241e-05, "loss": 0.0168, "step": 6820 }, { "epoch": 4.843971631205674, "grad_norm": 0.22402699291706085, "learning_rate": 6.105360816105498e-05, "loss": 0.0191, "step": 6830 }, { "epoch": 4.851063829787234, "grad_norm": 0.20568257570266724, "learning_rate": 6.0765712931319826e-05, "loss": 0.02, "step": 6840 }, { "epoch": 4.858156028368795, "grad_norm": 0.1438162624835968, "learning_rate": 6.0478201542926316e-05, "loss": 0.0135, "step": 6850 }, { "epoch": 4.865248226950355, "grad_norm": 0.18207921087741852, "learning_rate": 6.019107680868859e-05, "loss": 0.0154, "step": 6860 }, { "epoch": 4.872340425531915, "grad_norm": 0.3172309398651123, "learning_rate": 5.990434153763804e-05, "loss": 0.0143, "step": 6870 }, { "epoch": 4.879432624113475, "grad_norm": 0.2266601026058197, "learning_rate": 5.9617998534995766e-05, "loss": 0.0212, "step": 6880 }, { "epoch": 4.886524822695035, "grad_norm": 0.20986783504486084, "learning_rate": 5.933205060214525e-05, "loss": 0.016, "step": 6890 }, { "epoch": 4.8936170212765955, "grad_norm": 0.19340595602989197, "learning_rate": 5.9046500536604796e-05, "loss": 0.0161, "step": 6900 }, { "epoch": 4.900709219858156, "grad_norm": 0.2367888242006302, "learning_rate": 5.8761351132000295e-05, "loss": 0.0196, "step": 6910 }, { "epoch": 4.907801418439716, "grad_norm": 0.30044275522232056, "learning_rate": 5.8476605178037925e-05, "loss": 0.0176, "step": 6920 }, { "epoch": 4.914893617021277, "grad_norm": 0.3268476128578186, "learning_rate": 5.819226546047667e-05, "loss": 0.018, "step": 6930 }, { "epoch": 4.921985815602837, "grad_norm": 0.30031996965408325, "learning_rate": 5.790833476110113e-05, "loss": 0.0155, "step": 6940 }, { "epoch": 4.929078014184397, "grad_norm": 0.2088995724916458, "learning_rate": 5.762481585769455e-05, "loss": 0.013, "step": 6950 }, { "epoch": 4.9361702127659575, "grad_norm": 0.23445673286914825, "learning_rate": 5.7341711524011224e-05, "loss": 0.019, "step": 6960 }, { "epoch": 4.943262411347518, "grad_norm": 0.1619795709848404, "learning_rate": 5.705902452974978e-05, "loss": 0.0147, "step": 6970 }, { "epoch": 4.950354609929078, "grad_norm": 0.16455408930778503, "learning_rate": 5.6776757640525736e-05, "loss": 0.015, "step": 6980 }, { "epoch": 4.957446808510638, "grad_norm": 0.1724604368209839, "learning_rate": 5.6494913617844604e-05, "loss": 0.0255, "step": 6990 }, { "epoch": 4.964539007092198, "grad_norm": 0.2437286525964737, "learning_rate": 5.6213495219074975e-05, "loss": 0.0194, "step": 7000 }, { "epoch": 4.971631205673759, "grad_norm": 0.20035170018672943, "learning_rate": 5.593250519742127e-05, "loss": 0.0197, "step": 7010 }, { "epoch": 4.9787234042553195, "grad_norm": 0.2261771708726883, "learning_rate": 5.5651946301897126e-05, "loss": 0.0173, "step": 7020 }, { "epoch": 4.98581560283688, "grad_norm": 0.24441353976726532, "learning_rate": 5.537182127729822e-05, "loss": 0.0154, "step": 7030 }, { "epoch": 4.99290780141844, "grad_norm": 0.2184004932641983, "learning_rate": 5.509213286417551e-05, "loss": 0.0145, "step": 7040 }, { "epoch": 5.0, "grad_norm": 0.3220185339450836, "learning_rate": 5.481288379880857e-05, "loss": 0.0204, "step": 7050 }, { "epoch": 5.00709219858156, "grad_norm": 0.37926825881004333, "learning_rate": 5.453407681317868e-05, "loss": 0.0158, "step": 7060 }, { "epoch": 5.01418439716312, "grad_norm": 0.37592944502830505, "learning_rate": 5.4255714634941936e-05, "loss": 0.0203, "step": 7070 }, { "epoch": 5.0212765957446805, "grad_norm": 0.32261180877685547, "learning_rate": 5.397779998740293e-05, "loss": 0.0187, "step": 7080 }, { "epoch": 5.028368794326241, "grad_norm": 0.24909910559654236, "learning_rate": 5.3700335589487925e-05, "loss": 0.0182, "step": 7090 }, { "epoch": 5.035460992907802, "grad_norm": 0.2592754065990448, "learning_rate": 5.3423324155718144e-05, "loss": 0.0145, "step": 7100 }, { "epoch": 5.042553191489362, "grad_norm": 0.30807894468307495, "learning_rate": 5.314676839618332e-05, "loss": 0.0134, "step": 7110 }, { "epoch": 5.049645390070922, "grad_norm": 0.1335049420595169, "learning_rate": 5.287067101651533e-05, "loss": 0.015, "step": 7120 }, { "epoch": 5.056737588652482, "grad_norm": 0.3497454822063446, "learning_rate": 5.259503471786136e-05, "loss": 0.0204, "step": 7130 }, { "epoch": 5.0638297872340425, "grad_norm": 0.19706270098686218, "learning_rate": 5.2319862196857914e-05, "loss": 0.017, "step": 7140 }, { "epoch": 5.070921985815603, "grad_norm": 0.31677910685539246, "learning_rate": 5.204515614560407e-05, "loss": 0.015, "step": 7150 }, { "epoch": 5.078014184397163, "grad_norm": 0.24632439017295837, "learning_rate": 5.177091925163529e-05, "loss": 0.0257, "step": 7160 }, { "epoch": 5.085106382978723, "grad_norm": 0.26360684633255005, "learning_rate": 5.149715419789723e-05, "loss": 0.0119, "step": 7170 }, { "epoch": 5.092198581560283, "grad_norm": 0.23850224912166595, "learning_rate": 5.122386366271923e-05, "loss": 0.0196, "step": 7180 }, { "epoch": 5.099290780141844, "grad_norm": 0.3354696035385132, "learning_rate": 5.0951050319788444e-05, "loss": 0.0138, "step": 7190 }, { "epoch": 5.1063829787234045, "grad_norm": 0.18239453434944153, "learning_rate": 5.067871683812338e-05, "loss": 0.0206, "step": 7200 }, { "epoch": 5.113475177304965, "grad_norm": 0.18862655758857727, "learning_rate": 5.0406865882047884e-05, "loss": 0.011, "step": 7210 }, { "epoch": 5.120567375886525, "grad_norm": 0.1917518526315689, "learning_rate": 5.0135500111165215e-05, "loss": 0.0139, "step": 7220 }, { "epoch": 5.127659574468085, "grad_norm": 0.19659163057804108, "learning_rate": 4.986462218033192e-05, "loss": 0.0161, "step": 7230 }, { "epoch": 5.134751773049645, "grad_norm": 0.23764602839946747, "learning_rate": 4.959423473963167e-05, "loss": 0.017, "step": 7240 }, { "epoch": 5.141843971631205, "grad_norm": 0.18340301513671875, "learning_rate": 4.932434043434975e-05, "loss": 0.0163, "step": 7250 }, { "epoch": 5.148936170212766, "grad_norm": 0.2168426811695099, "learning_rate": 4.905494190494674e-05, "loss": 0.0141, "step": 7260 }, { "epoch": 5.156028368794326, "grad_norm": 0.2772137522697449, "learning_rate": 4.878604178703308e-05, "loss": 0.0186, "step": 7270 }, { "epoch": 5.163120567375887, "grad_norm": 0.17947837710380554, "learning_rate": 4.851764271134296e-05, "loss": 0.0141, "step": 7280 }, { "epoch": 5.170212765957447, "grad_norm": 0.2101341038942337, "learning_rate": 4.824974730370871e-05, "loss": 0.0129, "step": 7290 }, { "epoch": 5.177304964539007, "grad_norm": 0.28040558099746704, "learning_rate": 4.798235818503522e-05, "loss": 0.0218, "step": 7300 }, { "epoch": 5.184397163120567, "grad_norm": 0.15831856429576874, "learning_rate": 4.771547797127418e-05, "loss": 0.0114, "step": 7310 }, { "epoch": 5.191489361702128, "grad_norm": 0.08963089436292648, "learning_rate": 4.744910927339842e-05, "loss": 0.0113, "step": 7320 }, { "epoch": 5.198581560283688, "grad_norm": 0.24577274918556213, "learning_rate": 4.7183254697376456e-05, "loss": 0.0145, "step": 7330 }, { "epoch": 5.205673758865248, "grad_norm": 0.31053343415260315, "learning_rate": 4.69179168441471e-05, "loss": 0.0133, "step": 7340 }, { "epoch": 5.212765957446808, "grad_norm": 0.1162974014878273, "learning_rate": 4.665309830959377e-05, "loss": 0.0167, "step": 7350 }, { "epoch": 5.219858156028369, "grad_norm": 0.30117878317832947, "learning_rate": 4.638880168451938e-05, "loss": 0.022, "step": 7360 }, { "epoch": 5.226950354609929, "grad_norm": 0.316582590341568, "learning_rate": 4.61250295546206e-05, "loss": 0.0186, "step": 7370 }, { "epoch": 5.23404255319149, "grad_norm": 0.3584196865558624, "learning_rate": 4.586178450046303e-05, "loss": 0.0182, "step": 7380 }, { "epoch": 5.24113475177305, "grad_norm": 0.2816001772880554, "learning_rate": 4.559906909745567e-05, "loss": 0.0175, "step": 7390 }, { "epoch": 5.24822695035461, "grad_norm": 0.29749003052711487, "learning_rate": 4.533688591582571e-05, "loss": 0.0132, "step": 7400 }, { "epoch": 5.25531914893617, "grad_norm": 0.15811972320079803, "learning_rate": 4.5075237520593435e-05, "loss": 0.0151, "step": 7410 }, { "epoch": 5.26241134751773, "grad_norm": 0.22752103209495544, "learning_rate": 4.4814126471547293e-05, "loss": 0.0272, "step": 7420 }, { "epoch": 5.2695035460992905, "grad_norm": 0.2540184259414673, "learning_rate": 4.455355532321852e-05, "loss": 0.0201, "step": 7430 }, { "epoch": 5.276595744680851, "grad_norm": 0.24097682535648346, "learning_rate": 4.429352662485652e-05, "loss": 0.0139, "step": 7440 }, { "epoch": 5.283687943262412, "grad_norm": 0.17041516304016113, "learning_rate": 4.403404292040357e-05, "loss": 0.014, "step": 7450 }, { "epoch": 5.290780141843972, "grad_norm": 0.2444879710674286, "learning_rate": 4.377510674847017e-05, "loss": 0.0128, "step": 7460 }, { "epoch": 5.297872340425532, "grad_norm": 0.17711535096168518, "learning_rate": 4.3516720642310204e-05, "loss": 0.0163, "step": 7470 }, { "epoch": 5.304964539007092, "grad_norm": 0.14002487063407898, "learning_rate": 4.3258887129795945e-05, "loss": 0.0164, "step": 7480 }, { "epoch": 5.3120567375886525, "grad_norm": 0.10432898253202438, "learning_rate": 4.300160873339364e-05, "loss": 0.0172, "step": 7490 }, { "epoch": 5.319148936170213, "grad_norm": 0.16279327869415283, "learning_rate": 4.2744887970138516e-05, "loss": 0.0226, "step": 7500 }, { "epoch": 5.326241134751773, "grad_norm": 0.13522747159004211, "learning_rate": 4.2488727351610335e-05, "loss": 0.0121, "step": 7510 }, { "epoch": 5.333333333333333, "grad_norm": 0.23896059393882751, "learning_rate": 4.2233129383908874e-05, "loss": 0.0193, "step": 7520 }, { "epoch": 5.340425531914893, "grad_norm": 0.20990808308124542, "learning_rate": 4.197809656762922e-05, "loss": 0.022, "step": 7530 }, { "epoch": 5.347517730496454, "grad_norm": 0.25492557883262634, "learning_rate": 4.1723631397837416e-05, "loss": 0.0138, "step": 7540 }, { "epoch": 5.3546099290780145, "grad_norm": 0.30456793308258057, "learning_rate": 4.1469736364046086e-05, "loss": 0.0174, "step": 7550 }, { "epoch": 5.361702127659575, "grad_norm": 0.18763889372348785, "learning_rate": 4.121641395019006e-05, "loss": 0.0136, "step": 7560 }, { "epoch": 5.368794326241135, "grad_norm": 0.17302776873111725, "learning_rate": 4.096366663460195e-05, "loss": 0.012, "step": 7570 }, { "epoch": 5.375886524822695, "grad_norm": 0.20956365764141083, "learning_rate": 4.0711496889988076e-05, "loss": 0.0147, "step": 7580 }, { "epoch": 5.382978723404255, "grad_norm": 0.24955029785633087, "learning_rate": 4.0459907183404135e-05, "loss": 0.0195, "step": 7590 }, { "epoch": 5.390070921985815, "grad_norm": 0.16770239174365997, "learning_rate": 4.02088999762312e-05, "loss": 0.0152, "step": 7600 }, { "epoch": 5.397163120567376, "grad_norm": 0.17319859564304352, "learning_rate": 3.995847772415159e-05, "loss": 0.0127, "step": 7610 }, { "epoch": 5.404255319148936, "grad_norm": 0.13932038843631744, "learning_rate": 3.9708642877124724e-05, "loss": 0.0121, "step": 7620 }, { "epoch": 5.411347517730497, "grad_norm": 0.2718459665775299, "learning_rate": 3.945939787936329e-05, "loss": 0.0244, "step": 7630 }, { "epoch": 5.418439716312057, "grad_norm": 0.15254083275794983, "learning_rate": 3.9210745169309374e-05, "loss": 0.0147, "step": 7640 }, { "epoch": 5.425531914893617, "grad_norm": 0.3398546576499939, "learning_rate": 3.896268717961041e-05, "loss": 0.0175, "step": 7650 }, { "epoch": 5.432624113475177, "grad_norm": 0.1195686087012291, "learning_rate": 3.871522633709555e-05, "loss": 0.018, "step": 7660 }, { "epoch": 5.439716312056738, "grad_norm": 0.22691002488136292, "learning_rate": 3.84683650627519e-05, "loss": 0.0126, "step": 7670 }, { "epoch": 5.446808510638298, "grad_norm": 0.27147871255874634, "learning_rate": 3.8222105771700725e-05, "loss": 0.0162, "step": 7680 }, { "epoch": 5.453900709219858, "grad_norm": 0.18269909918308258, "learning_rate": 3.7976450873174005e-05, "loss": 0.0134, "step": 7690 }, { "epoch": 5.460992907801418, "grad_norm": 0.1839030236005783, "learning_rate": 3.7731402770490654e-05, "loss": 0.0122, "step": 7700 }, { "epoch": 5.468085106382979, "grad_norm": 0.20004524290561676, "learning_rate": 3.748696386103313e-05, "loss": 0.0137, "step": 7710 }, { "epoch": 5.475177304964539, "grad_norm": 0.19872575998306274, "learning_rate": 3.724313653622404e-05, "loss": 0.0191, "step": 7720 }, { "epoch": 5.4822695035460995, "grad_norm": 0.23203954100608826, "learning_rate": 3.699992318150256e-05, "loss": 0.0146, "step": 7730 }, { "epoch": 5.48936170212766, "grad_norm": 0.3198186159133911, "learning_rate": 3.675732617630132e-05, "loss": 0.011, "step": 7740 }, { "epoch": 5.49645390070922, "grad_norm": 0.195682555437088, "learning_rate": 3.6515347894022914e-05, "loss": 0.0166, "step": 7750 }, { "epoch": 5.50354609929078, "grad_norm": 0.25143593549728394, "learning_rate": 3.627399070201676e-05, "loss": 0.0155, "step": 7760 }, { "epoch": 5.51063829787234, "grad_norm": 0.19740962982177734, "learning_rate": 3.603325696155605e-05, "loss": 0.0107, "step": 7770 }, { "epoch": 5.5177304964539005, "grad_norm": 0.1158263236284256, "learning_rate": 3.579314902781458e-05, "loss": 0.0162, "step": 7780 }, { "epoch": 5.524822695035461, "grad_norm": 0.29272812604904175, "learning_rate": 3.555366924984346e-05, "loss": 0.0199, "step": 7790 }, { "epoch": 5.531914893617021, "grad_norm": 0.13803668320178986, "learning_rate": 3.531481997054861e-05, "loss": 0.0105, "step": 7800 }, { "epoch": 5.539007092198582, "grad_norm": 0.16399915516376495, "learning_rate": 3.5076603526667404e-05, "loss": 0.0115, "step": 7810 }, { "epoch": 5.546099290780142, "grad_norm": 0.23237183690071106, "learning_rate": 3.4839022248746136e-05, "loss": 0.0152, "step": 7820 }, { "epoch": 5.553191489361702, "grad_norm": 0.22134317457675934, "learning_rate": 3.460207846111697e-05, "loss": 0.0128, "step": 7830 }, { "epoch": 5.560283687943262, "grad_norm": 0.14723555743694305, "learning_rate": 3.436577448187529e-05, "loss": 0.0126, "step": 7840 }, { "epoch": 5.567375886524823, "grad_norm": 0.17128515243530273, "learning_rate": 3.41301126228571e-05, "loss": 0.0145, "step": 7850 }, { "epoch": 5.574468085106383, "grad_norm": 0.20742635428905487, "learning_rate": 3.389509518961637e-05, "loss": 0.0151, "step": 7860 }, { "epoch": 5.581560283687943, "grad_norm": 0.16497284173965454, "learning_rate": 3.3660724481402326e-05, "loss": 0.0151, "step": 7870 }, { "epoch": 5.588652482269503, "grad_norm": 0.24318110942840576, "learning_rate": 3.3427002791137164e-05, "loss": 0.0159, "step": 7880 }, { "epoch": 5.595744680851064, "grad_norm": 0.18855370581150055, "learning_rate": 3.319393240539355e-05, "loss": 0.015, "step": 7890 }, { "epoch": 5.602836879432624, "grad_norm": 0.14175598323345184, "learning_rate": 3.296151560437214e-05, "loss": 0.0181, "step": 7900 }, { "epoch": 5.609929078014185, "grad_norm": 0.27144867181777954, "learning_rate": 3.272975466187951e-05, "loss": 0.0148, "step": 7910 }, { "epoch": 5.617021276595745, "grad_norm": 0.2221544086933136, "learning_rate": 3.249865184530563e-05, "loss": 0.0129, "step": 7920 }, { "epoch": 5.624113475177305, "grad_norm": 0.2543604075908661, "learning_rate": 3.226820941560186e-05, "loss": 0.014, "step": 7930 }, { "epoch": 5.631205673758865, "grad_norm": 0.10194625705480576, "learning_rate": 3.2038429627258845e-05, "loss": 0.0138, "step": 7940 }, { "epoch": 5.638297872340425, "grad_norm": 0.18883180618286133, "learning_rate": 3.180931472828435e-05, "loss": 0.0143, "step": 7950 }, { "epoch": 5.6453900709219855, "grad_norm": 0.1743205338716507, "learning_rate": 3.158086696018126e-05, "loss": 0.0128, "step": 7960 }, { "epoch": 5.652482269503546, "grad_norm": 0.2825423777103424, "learning_rate": 3.135308855792587e-05, "loss": 0.015, "step": 7970 }, { "epoch": 5.659574468085106, "grad_norm": 0.3754797577857971, "learning_rate": 3.1125981749945686e-05, "loss": 0.0117, "step": 7980 }, { "epoch": 5.666666666666667, "grad_norm": 0.19825135171413422, "learning_rate": 3.089954875809794e-05, "loss": 0.0129, "step": 7990 }, { "epoch": 5.673758865248227, "grad_norm": 0.33323025703430176, "learning_rate": 3.06737917976476e-05, "loss": 0.0155, "step": 8000 }, { "epoch": 5.680851063829787, "grad_norm": 0.09978597611188889, "learning_rate": 3.0448713077245838e-05, "loss": 0.0136, "step": 8010 }, { "epoch": 5.6879432624113475, "grad_norm": 0.14637133479118347, "learning_rate": 3.0224314798908414e-05, "loss": 0.0164, "step": 8020 }, { "epoch": 5.695035460992908, "grad_norm": 0.11906511336565018, "learning_rate": 3.0000599157994148e-05, "loss": 0.0134, "step": 8030 }, { "epoch": 5.702127659574468, "grad_norm": 0.21006911993026733, "learning_rate": 2.9777568343183303e-05, "loss": 0.0128, "step": 8040 }, { "epoch": 5.709219858156028, "grad_norm": 0.24388642609119415, "learning_rate": 2.955522453645635e-05, "loss": 0.0148, "step": 8050 }, { "epoch": 5.716312056737589, "grad_norm": 0.40704119205474854, "learning_rate": 2.9333569913072466e-05, "loss": 0.0142, "step": 8060 }, { "epoch": 5.723404255319149, "grad_norm": 0.24392291903495789, "learning_rate": 2.9112606641548436e-05, "loss": 0.0133, "step": 8070 }, { "epoch": 5.7304964539007095, "grad_norm": 0.22407054901123047, "learning_rate": 2.8892336883637327e-05, "loss": 0.0133, "step": 8080 }, { "epoch": 5.73758865248227, "grad_norm": 0.20735864341259003, "learning_rate": 2.8672762794307173e-05, "loss": 0.0108, "step": 8090 }, { "epoch": 5.74468085106383, "grad_norm": 0.20651739835739136, "learning_rate": 2.8453886521720264e-05, "loss": 0.0155, "step": 8100 }, { "epoch": 5.75177304964539, "grad_norm": 0.1309831440448761, "learning_rate": 2.8235710207211874e-05, "loss": 0.0171, "step": 8110 }, { "epoch": 5.75886524822695, "grad_norm": 0.29270994663238525, "learning_rate": 2.8018235985269325e-05, "loss": 0.0179, "step": 8120 }, { "epoch": 5.76595744680851, "grad_norm": 0.3431891202926636, "learning_rate": 2.7801465983511143e-05, "loss": 0.0156, "step": 8130 }, { "epoch": 5.773049645390071, "grad_norm": 0.12922121584415436, "learning_rate": 2.7585402322666333e-05, "loss": 0.0196, "step": 8140 }, { "epoch": 5.780141843971631, "grad_norm": 0.19691234827041626, "learning_rate": 2.737004711655342e-05, "loss": 0.0116, "step": 8150 }, { "epoch": 5.787234042553192, "grad_norm": 0.17019961774349213, "learning_rate": 2.7155402472060043e-05, "loss": 0.0145, "step": 8160 }, { "epoch": 5.794326241134752, "grad_norm": 0.18791693449020386, "learning_rate": 2.6941470489122056e-05, "loss": 0.0166, "step": 8170 }, { "epoch": 5.801418439716312, "grad_norm": 0.1223282665014267, "learning_rate": 2.6728253260703163e-05, "loss": 0.0117, "step": 8180 }, { "epoch": 5.808510638297872, "grad_norm": 0.16401457786560059, "learning_rate": 2.6515752872774458e-05, "loss": 0.0147, "step": 8190 }, { "epoch": 5.815602836879433, "grad_norm": 0.13671791553497314, "learning_rate": 2.6303971404293882e-05, "loss": 0.0128, "step": 8200 }, { "epoch": 5.822695035460993, "grad_norm": 0.21030209958553314, "learning_rate": 2.609291092718604e-05, "loss": 0.0157, "step": 8210 }, { "epoch": 5.829787234042553, "grad_norm": 0.19939203560352325, "learning_rate": 2.5882573506321772e-05, "loss": 0.0139, "step": 8220 }, { "epoch": 5.836879432624113, "grad_norm": 0.26017311215400696, "learning_rate": 2.5672961199498058e-05, "loss": 0.0133, "step": 8230 }, { "epoch": 5.843971631205674, "grad_norm": 0.29861557483673096, "learning_rate": 2.5464076057417883e-05, "loss": 0.0154, "step": 8240 }, { "epoch": 5.851063829787234, "grad_norm": 0.17012879252433777, "learning_rate": 2.5255920123670196e-05, "loss": 0.0163, "step": 8250 }, { "epoch": 5.858156028368795, "grad_norm": 0.20148979127407074, "learning_rate": 2.5048495434709708e-05, "loss": 0.0117, "step": 8260 }, { "epoch": 5.865248226950355, "grad_norm": 0.2007923573255539, "learning_rate": 2.4841804019837323e-05, "loss": 0.0146, "step": 8270 }, { "epoch": 5.872340425531915, "grad_norm": 0.1671822965145111, "learning_rate": 2.4635847901179932e-05, "loss": 0.0174, "step": 8280 }, { "epoch": 5.879432624113475, "grad_norm": 0.22692982852458954, "learning_rate": 2.4430629093670963e-05, "loss": 0.0173, "step": 8290 }, { "epoch": 5.886524822695035, "grad_norm": 0.142043337225914, "learning_rate": 2.4226149605030344e-05, "loss": 0.0099, "step": 8300 }, { "epoch": 5.8936170212765955, "grad_norm": 0.1621280163526535, "learning_rate": 2.4022411435745074e-05, "loss": 0.0111, "step": 8310 }, { "epoch": 5.900709219858156, "grad_norm": 0.21993504464626312, "learning_rate": 2.3819416579049603e-05, "loss": 0.01, "step": 8320 }, { "epoch": 5.907801418439716, "grad_norm": 0.2549281716346741, "learning_rate": 2.361716702090634e-05, "loss": 0.0156, "step": 8330 }, { "epoch": 5.914893617021277, "grad_norm": 0.13125896453857422, "learning_rate": 2.3415664739986165e-05, "loss": 0.0179, "step": 8340 }, { "epoch": 5.921985815602837, "grad_norm": 0.2061365842819214, "learning_rate": 2.321491170764908e-05, "loss": 0.0149, "step": 8350 }, { "epoch": 5.929078014184397, "grad_norm": 0.2736816108226776, "learning_rate": 2.3014909887925042e-05, "loss": 0.0186, "step": 8360 }, { "epoch": 5.9361702127659575, "grad_norm": 0.1882363259792328, "learning_rate": 2.281566123749458e-05, "loss": 0.0211, "step": 8370 }, { "epoch": 5.943262411347518, "grad_norm": 0.2734488844871521, "learning_rate": 2.2617167705669827e-05, "loss": 0.0132, "step": 8380 }, { "epoch": 5.950354609929078, "grad_norm": 0.20115582644939423, "learning_rate": 2.2419431234375178e-05, "loss": 0.0121, "step": 8390 }, { "epoch": 5.957446808510638, "grad_norm": 0.1607801914215088, "learning_rate": 2.2222453758128648e-05, "loss": 0.0128, "step": 8400 }, { "epoch": 5.964539007092198, "grad_norm": 0.20927660167217255, "learning_rate": 2.2026237204022716e-05, "loss": 0.0097, "step": 8410 }, { "epoch": 5.971631205673759, "grad_norm": 0.15127459168434143, "learning_rate": 2.1830783491705477e-05, "loss": 0.0096, "step": 8420 }, { "epoch": 5.9787234042553195, "grad_norm": 0.25664299726486206, "learning_rate": 2.1636094533361896e-05, "loss": 0.0135, "step": 8430 }, { "epoch": 5.98581560283688, "grad_norm": 0.1857176572084427, "learning_rate": 2.14421722336952e-05, "loss": 0.0131, "step": 8440 }, { "epoch": 5.99290780141844, "grad_norm": 0.11627791076898575, "learning_rate": 2.1249018489908056e-05, "loss": 0.0101, "step": 8450 }, { "epoch": 6.0, "grad_norm": 0.42432013154029846, "learning_rate": 2.1056635191684183e-05, "loss": 0.0128, "step": 8460 }, { "epoch": 6.00709219858156, "grad_norm": 0.21808061003684998, "learning_rate": 2.086502422116974e-05, "loss": 0.0136, "step": 8470 }, { "epoch": 6.01418439716312, "grad_norm": 0.2082509845495224, "learning_rate": 2.067418745295494e-05, "loss": 0.0171, "step": 8480 }, { "epoch": 6.0212765957446805, "grad_norm": 0.1480574756860733, "learning_rate": 2.0484126754055842e-05, "loss": 0.0125, "step": 8490 }, { "epoch": 6.028368794326241, "grad_norm": 0.21675336360931396, "learning_rate": 2.0294843983895828e-05, "loss": 0.0148, "step": 8500 }, { "epoch": 6.035460992907802, "grad_norm": 0.22429972887039185, "learning_rate": 2.0106340994287698e-05, "loss": 0.018, "step": 8510 }, { "epoch": 6.042553191489362, "grad_norm": 0.18754935264587402, "learning_rate": 1.9918619629415314e-05, "loss": 0.0116, "step": 8520 }, { "epoch": 6.049645390070922, "grad_norm": 0.1608024388551712, "learning_rate": 1.9731681725815676e-05, "loss": 0.0083, "step": 8530 }, { "epoch": 6.056737588652482, "grad_norm": 0.19191201031208038, "learning_rate": 1.9545529112361005e-05, "loss": 0.0173, "step": 8540 }, { "epoch": 6.0638297872340425, "grad_norm": 0.18507297337055206, "learning_rate": 1.93601636102407e-05, "loss": 0.0095, "step": 8550 }, { "epoch": 6.070921985815603, "grad_norm": 0.18908648192882538, "learning_rate": 1.917558703294361e-05, "loss": 0.0129, "step": 8560 }, { "epoch": 6.078014184397163, "grad_norm": 0.23118513822555542, "learning_rate": 1.8991801186240342e-05, "loss": 0.0089, "step": 8570 }, { "epoch": 6.085106382978723, "grad_norm": 0.24339988827705383, "learning_rate": 1.8808807868165512e-05, "loss": 0.0116, "step": 8580 }, { "epoch": 6.092198581560283, "grad_norm": 0.26911652088165283, "learning_rate": 1.862660886900016e-05, "loss": 0.0145, "step": 8590 }, { "epoch": 6.099290780141844, "grad_norm": 0.15160147845745087, "learning_rate": 1.8445205971254243e-05, "loss": 0.0101, "step": 8600 }, { "epoch": 6.1063829787234045, "grad_norm": 0.1248091533780098, "learning_rate": 1.826460094964928e-05, "loss": 0.0143, "step": 8610 }, { "epoch": 6.113475177304965, "grad_norm": 0.19379976391792297, "learning_rate": 1.808479557110081e-05, "loss": 0.0112, "step": 8620 }, { "epoch": 6.120567375886525, "grad_norm": 0.16450917720794678, "learning_rate": 1.7905791594701337e-05, "loss": 0.0202, "step": 8630 }, { "epoch": 6.127659574468085, "grad_norm": 0.3321121335029602, "learning_rate": 1.7727590771702894e-05, "loss": 0.0107, "step": 8640 }, { "epoch": 6.134751773049645, "grad_norm": 0.1483653485774994, "learning_rate": 1.7550194845500025e-05, "loss": 0.0123, "step": 8650 }, { "epoch": 6.141843971631205, "grad_norm": 0.17642395198345184, "learning_rate": 1.7373605551612805e-05, "loss": 0.009, "step": 8660 }, { "epoch": 6.148936170212766, "grad_norm": 0.24330726265907288, "learning_rate": 1.7197824617669655e-05, "loss": 0.0113, "step": 8670 }, { "epoch": 6.156028368794326, "grad_norm": 0.2407854199409485, "learning_rate": 1.7022853763390623e-05, "loss": 0.013, "step": 8680 }, { "epoch": 6.163120567375887, "grad_norm": 0.237819641828537, "learning_rate": 1.68486947005705e-05, "loss": 0.013, "step": 8690 }, { "epoch": 6.170212765957447, "grad_norm": 0.16382509469985962, "learning_rate": 1.6675349133062e-05, "loss": 0.0128, "step": 8700 }, { "epoch": 6.177304964539007, "grad_norm": 0.2901313602924347, "learning_rate": 1.6502818756759276e-05, "loss": 0.0085, "step": 8710 }, { "epoch": 6.184397163120567, "grad_norm": 0.23529794812202454, "learning_rate": 1.633110525958108e-05, "loss": 0.009, "step": 8720 }, { "epoch": 6.191489361702128, "grad_norm": 0.21968472003936768, "learning_rate": 1.616021032145444e-05, "loss": 0.0158, "step": 8730 }, { "epoch": 6.198581560283688, "grad_norm": 0.21688121557235718, "learning_rate": 1.5990135614298184e-05, "loss": 0.0114, "step": 8740 }, { "epoch": 6.205673758865248, "grad_norm": 0.19126644730567932, "learning_rate": 1.582088280200652e-05, "loss": 0.0111, "step": 8750 }, { "epoch": 6.212765957446808, "grad_norm": 0.2854389548301697, "learning_rate": 1.5652453540432856e-05, "loss": 0.0124, "step": 8760 }, { "epoch": 6.219858156028369, "grad_norm": 0.23691484332084656, "learning_rate": 1.5484849477373463e-05, "loss": 0.0163, "step": 8770 }, { "epoch": 6.226950354609929, "grad_norm": 0.10352014750242233, "learning_rate": 1.5318072252551498e-05, "loss": 0.0121, "step": 8780 }, { "epoch": 6.23404255319149, "grad_norm": 0.1918143928050995, "learning_rate": 1.5152123497600879e-05, "loss": 0.0166, "step": 8790 }, { "epoch": 6.24113475177305, "grad_norm": 0.27419474720954895, "learning_rate": 1.49870048360504e-05, "loss": 0.0176, "step": 8800 }, { "epoch": 6.24822695035461, "grad_norm": 0.1279487907886505, "learning_rate": 1.4822717883307658e-05, "loss": 0.0165, "step": 8810 }, { "epoch": 6.25531914893617, "grad_norm": 0.20851808786392212, "learning_rate": 1.46592642466435e-05, "loss": 0.0111, "step": 8820 }, { "epoch": 6.26241134751773, "grad_norm": 0.20219635963439941, "learning_rate": 1.4496645525176166e-05, "loss": 0.0105, "step": 8830 }, { "epoch": 6.2695035460992905, "grad_norm": 0.17594772577285767, "learning_rate": 1.4334863309855617e-05, "loss": 0.02, "step": 8840 }, { "epoch": 6.276595744680851, "grad_norm": 0.1454489380121231, "learning_rate": 1.4173919183448026e-05, "loss": 0.0147, "step": 8850 }, { "epoch": 6.283687943262412, "grad_norm": 0.36037492752075195, "learning_rate": 1.4013814720520258e-05, "loss": 0.0104, "step": 8860 }, { "epoch": 6.290780141843972, "grad_norm": 0.29365283250808716, "learning_rate": 1.385455148742455e-05, "loss": 0.0169, "step": 8870 }, { "epoch": 6.297872340425532, "grad_norm": 0.2135290801525116, "learning_rate": 1.36961310422831e-05, "loss": 0.0183, "step": 8880 }, { "epoch": 6.304964539007092, "grad_norm": 0.27943718433380127, "learning_rate": 1.3538554934972813e-05, "loss": 0.0148, "step": 8890 }, { "epoch": 6.3120567375886525, "grad_norm": 0.18066227436065674, "learning_rate": 1.3381824707110157e-05, "loss": 0.0115, "step": 8900 }, { "epoch": 6.319148936170213, "grad_norm": 0.087078757584095, "learning_rate": 1.3225941892036198e-05, "loss": 0.0121, "step": 8910 }, { "epoch": 6.326241134751773, "grad_norm": 0.26353609561920166, "learning_rate": 1.3070908014801375e-05, "loss": 0.0087, "step": 8920 }, { "epoch": 6.333333333333333, "grad_norm": 0.11882209032773972, "learning_rate": 1.2916724592150798e-05, "loss": 0.0088, "step": 8930 }, { "epoch": 6.340425531914893, "grad_norm": 0.17999006807804108, "learning_rate": 1.276339313250925e-05, "loss": 0.0094, "step": 8940 }, { "epoch": 6.347517730496454, "grad_norm": 0.16146190464496613, "learning_rate": 1.2610915135966495e-05, "loss": 0.0112, "step": 8950 }, { "epoch": 6.3546099290780145, "grad_norm": 0.26943373680114746, "learning_rate": 1.2459292094262664e-05, "loss": 0.014, "step": 8960 }, { "epoch": 6.361702127659575, "grad_norm": 0.22631198167800903, "learning_rate": 1.2308525490773526e-05, "loss": 0.0103, "step": 8970 }, { "epoch": 6.368794326241135, "grad_norm": 0.25705012679100037, "learning_rate": 1.2158616800496059e-05, "loss": 0.0152, "step": 8980 }, { "epoch": 6.375886524822695, "grad_norm": 0.23096348345279694, "learning_rate": 1.2009567490034046e-05, "loss": 0.0128, "step": 8990 }, { "epoch": 6.382978723404255, "grad_norm": 0.19161191582679749, "learning_rate": 1.186137901758364e-05, "loss": 0.0142, "step": 9000 }, { "epoch": 6.390070921985815, "grad_norm": 0.17679370939731598, "learning_rate": 1.1714052832919187e-05, "loss": 0.0163, "step": 9010 }, { "epoch": 6.397163120567376, "grad_norm": 0.1845289170742035, "learning_rate": 1.1567590377378979e-05, "loss": 0.0137, "step": 9020 }, { "epoch": 6.404255319148936, "grad_norm": 0.10619324445724487, "learning_rate": 1.1421993083851145e-05, "loss": 0.0119, "step": 9030 }, { "epoch": 6.411347517730497, "grad_norm": 0.17025884985923767, "learning_rate": 1.1277262376759712e-05, "loss": 0.0147, "step": 9040 }, { "epoch": 6.418439716312057, "grad_norm": 0.24711932241916656, "learning_rate": 1.1133399672050638e-05, "loss": 0.0128, "step": 9050 }, { "epoch": 6.425531914893617, "grad_norm": 0.24352525174617767, "learning_rate": 1.0990406377177865e-05, "loss": 0.0163, "step": 9060 }, { "epoch": 6.432624113475177, "grad_norm": 0.11696803569793701, "learning_rate": 1.0848283891089683e-05, "loss": 0.0108, "step": 9070 }, { "epoch": 6.439716312056738, "grad_norm": 0.14711810648441315, "learning_rate": 1.0707033604214944e-05, "loss": 0.0091, "step": 9080 }, { "epoch": 6.446808510638298, "grad_norm": 0.08239645510911942, "learning_rate": 1.0566656898449546e-05, "loss": 0.0119, "step": 9090 }, { "epoch": 6.453900709219858, "grad_norm": 0.1815529316663742, "learning_rate": 1.0427155147142887e-05, "loss": 0.0122, "step": 9100 }, { "epoch": 6.460992907801418, "grad_norm": 0.16181941330432892, "learning_rate": 1.0288529715084293e-05, "loss": 0.0091, "step": 9110 }, { "epoch": 6.468085106382979, "grad_norm": 0.1356610655784607, "learning_rate": 1.0150781958489919e-05, "loss": 0.0115, "step": 9120 }, { "epoch": 6.475177304964539, "grad_norm": 0.10379713028669357, "learning_rate": 1.0013913224989303e-05, "loss": 0.0152, "step": 9130 }, { "epoch": 6.4822695035460995, "grad_norm": 0.1019928902387619, "learning_rate": 9.877924853612186e-06, "loss": 0.011, "step": 9140 }, { "epoch": 6.48936170212766, "grad_norm": 0.2234257161617279, "learning_rate": 9.74281817477547e-06, "loss": 0.0108, "step": 9150 }, { "epoch": 6.49645390070922, "grad_norm": 0.2257377654314041, "learning_rate": 9.608594510270218e-06, "loss": 0.0111, "step": 9160 }, { "epoch": 6.50354609929078, "grad_norm": 0.1945466548204422, "learning_rate": 9.47525517324862e-06, "loss": 0.0106, "step": 9170 }, { "epoch": 6.51063829787234, "grad_norm": 0.39022570848464966, "learning_rate": 9.342801468211283e-06, "loss": 0.0112, "step": 9180 }, { "epoch": 6.5177304964539005, "grad_norm": 0.16999700665473938, "learning_rate": 9.211234690994364e-06, "loss": 0.0161, "step": 9190 }, { "epoch": 6.524822695035461, "grad_norm": 0.38921093940734863, "learning_rate": 9.080556128756901e-06, "loss": 0.0107, "step": 9200 }, { "epoch": 6.531914893617021, "grad_norm": 0.26619699597358704, "learning_rate": 8.950767059968302e-06, "loss": 0.0164, "step": 9210 }, { "epoch": 6.539007092198582, "grad_norm": 0.19151495397090912, "learning_rate": 8.821868754395734e-06, "loss": 0.0111, "step": 9220 }, { "epoch": 6.546099290780142, "grad_norm": 0.09225843101739883, "learning_rate": 8.693862473091785e-06, "loss": 0.0113, "step": 9230 }, { "epoch": 6.553191489361702, "grad_norm": 0.2490764856338501, "learning_rate": 8.566749468382074e-06, "loss": 0.0163, "step": 9240 }, { "epoch": 6.560283687943262, "grad_norm": 0.2187177836894989, "learning_rate": 8.440530983852978e-06, "loss": 0.0132, "step": 9250 }, { "epoch": 6.567375886524823, "grad_norm": 0.12911982834339142, "learning_rate": 8.315208254339557e-06, "loss": 0.011, "step": 9260 }, { "epoch": 6.574468085106383, "grad_norm": 0.20145867764949799, "learning_rate": 8.190782505913442e-06, "loss": 0.0134, "step": 9270 }, { "epoch": 6.581560283687943, "grad_norm": 0.2898944914340973, "learning_rate": 8.067254955870707e-06, "loss": 0.017, "step": 9280 }, { "epoch": 6.588652482269503, "grad_norm": 0.08676121383905411, "learning_rate": 7.944626812720169e-06, "loss": 0.0096, "step": 9290 }, { "epoch": 6.595744680851064, "grad_norm": 0.2594836950302124, "learning_rate": 7.822899276171403e-06, "loss": 0.0156, "step": 9300 }, { "epoch": 6.602836879432624, "grad_norm": 0.15736845135688782, "learning_rate": 7.702073537123145e-06, "loss": 0.0109, "step": 9310 }, { "epoch": 6.609929078014185, "grad_norm": 0.25035277009010315, "learning_rate": 7.5821507776514866e-06, "loss": 0.0261, "step": 9320 }, { "epoch": 6.617021276595745, "grad_norm": 0.361659973859787, "learning_rate": 7.463132170998388e-06, "loss": 0.0117, "step": 9330 }, { "epoch": 6.624113475177305, "grad_norm": 0.21234659850597382, "learning_rate": 7.345018881560251e-06, "loss": 0.0114, "step": 9340 }, { "epoch": 6.631205673758865, "grad_norm": 0.1619425266981125, "learning_rate": 7.227812064876471e-06, "loss": 0.0095, "step": 9350 }, { "epoch": 6.638297872340425, "grad_norm": 0.15260903537273407, "learning_rate": 7.1115128676180975e-06, "loss": 0.0129, "step": 9360 }, { "epoch": 6.6453900709219855, "grad_norm": 0.13201937079429626, "learning_rate": 6.996122427576635e-06, "loss": 0.0216, "step": 9370 }, { "epoch": 6.652482269503546, "grad_norm": 0.17256666719913483, "learning_rate": 6.881641873653022e-06, "loss": 0.0079, "step": 9380 }, { "epoch": 6.659574468085106, "grad_norm": 0.3167516589164734, "learning_rate": 6.768072325846387e-06, "loss": 0.0187, "step": 9390 }, { "epoch": 6.666666666666667, "grad_norm": 0.17999926209449768, "learning_rate": 6.655414895243306e-06, "loss": 0.0135, "step": 9400 }, { "epoch": 6.673758865248227, "grad_norm": 0.20833925902843475, "learning_rate": 6.543670684006742e-06, "loss": 0.0117, "step": 9410 }, { "epoch": 6.680851063829787, "grad_norm": 0.19959613680839539, "learning_rate": 6.432840785365368e-06, "loss": 0.0113, "step": 9420 }, { "epoch": 6.6879432624113475, "grad_norm": 0.16003111004829407, "learning_rate": 6.3229262836028924e-06, "loss": 0.013, "step": 9430 }, { "epoch": 6.695035460992908, "grad_norm": 0.295195609331131, "learning_rate": 6.213928254047352e-06, "loss": 0.0137, "step": 9440 }, { "epoch": 6.702127659574468, "grad_norm": 0.26150402426719666, "learning_rate": 6.105847763060668e-06, "loss": 0.0152, "step": 9450 }, { "epoch": 6.709219858156028, "grad_norm": 0.10643389075994492, "learning_rate": 5.998685868028231e-06, "loss": 0.0076, "step": 9460 }, { "epoch": 6.716312056737589, "grad_norm": 0.1366465985774994, "learning_rate": 5.892443617348431e-06, "loss": 0.0107, "step": 9470 }, { "epoch": 6.723404255319149, "grad_norm": 0.22469396889209747, "learning_rate": 5.7871220504226e-06, "loss": 0.0099, "step": 9480 }, { "epoch": 6.7304964539007095, "grad_norm": 0.17117497324943542, "learning_rate": 5.682722197644652e-06, "loss": 0.0108, "step": 9490 }, { "epoch": 6.73758865248227, "grad_norm": 0.21108581125736237, "learning_rate": 5.579245080391094e-06, "loss": 0.0083, "step": 9500 }, { "epoch": 6.74468085106383, "grad_norm": 0.1274946630001068, "learning_rate": 5.47669171101105e-06, "loss": 0.0141, "step": 9510 }, { "epoch": 6.75177304964539, "grad_norm": 0.22684040665626526, "learning_rate": 5.375063092816313e-06, "loss": 0.0107, "step": 9520 }, { "epoch": 6.75886524822695, "grad_norm": 0.12946170568466187, "learning_rate": 5.2743602200715505e-06, "loss": 0.0124, "step": 9530 }, { "epoch": 6.76595744680851, "grad_norm": 0.19293436408042908, "learning_rate": 5.1745840779845455e-06, "loss": 0.0103, "step": 9540 }, { "epoch": 6.773049645390071, "grad_norm": 0.17891868948936462, "learning_rate": 5.075735642696611e-06, "loss": 0.0172, "step": 9550 }, { "epoch": 6.780141843971631, "grad_norm": 0.15858317911624908, "learning_rate": 4.977815881273018e-06, "loss": 0.0094, "step": 9560 }, { "epoch": 6.787234042553192, "grad_norm": 0.1298726350069046, "learning_rate": 4.880825751693518e-06, "loss": 0.0129, "step": 9570 }, { "epoch": 6.794326241134752, "grad_norm": 0.14093485474586487, "learning_rate": 4.784766202842961e-06, "loss": 0.0063, "step": 9580 }, { "epoch": 6.801418439716312, "grad_norm": 0.3153584599494934, "learning_rate": 4.689638174502076e-06, "loss": 0.0099, "step": 9590 }, { "epoch": 6.808510638297872, "grad_norm": 0.16759419441223145, "learning_rate": 4.595442597338217e-06, "loss": 0.0163, "step": 9600 }, { "epoch": 6.815602836879433, "grad_norm": 0.2294328510761261, "learning_rate": 4.502180392896272e-06, "loss": 0.0084, "step": 9610 }, { "epoch": 6.822695035460993, "grad_norm": 0.25540658831596375, "learning_rate": 4.409852473589626e-06, "loss": 0.0109, "step": 9620 }, { "epoch": 6.829787234042553, "grad_norm": 0.16674602031707764, "learning_rate": 4.318459742691316e-06, "loss": 0.0098, "step": 9630 }, { "epoch": 6.836879432624113, "grad_norm": 0.18922989070415497, "learning_rate": 4.228003094325084e-06, "loss": 0.0077, "step": 9640 }, { "epoch": 6.843971631205674, "grad_norm": 0.12138693034648895, "learning_rate": 4.13848341345674e-06, "loss": 0.012, "step": 9650 }, { "epoch": 6.851063829787234, "grad_norm": 0.2116478681564331, "learning_rate": 4.049901575885373e-06, "loss": 0.0114, "step": 9660 }, { "epoch": 6.858156028368795, "grad_norm": 0.22467593848705292, "learning_rate": 3.962258448234912e-06, "loss": 0.0211, "step": 9670 }, { "epoch": 6.865248226950355, "grad_norm": 0.13237184286117554, "learning_rate": 3.875554887945576e-06, "loss": 0.0195, "step": 9680 }, { "epoch": 6.872340425531915, "grad_norm": 0.15521298348903656, "learning_rate": 3.789791743265503e-06, "loss": 0.0107, "step": 9690 }, { "epoch": 6.879432624113475, "grad_norm": 0.23321086168289185, "learning_rate": 3.704969853242446e-06, "loss": 0.0177, "step": 9700 }, { "epoch": 6.886524822695035, "grad_norm": 0.1563076227903366, "learning_rate": 3.6210900477155696e-06, "loss": 0.0094, "step": 9710 }, { "epoch": 6.8936170212765955, "grad_norm": 0.12247644364833832, "learning_rate": 3.5381531473073326e-06, "loss": 0.016, "step": 9720 }, { "epoch": 6.900709219858156, "grad_norm": 0.19630227982997894, "learning_rate": 3.456159963415473e-06, "loss": 0.0108, "step": 9730 }, { "epoch": 6.907801418439716, "grad_norm": 0.25300222635269165, "learning_rate": 3.3751112982050135e-06, "loss": 0.0112, "step": 9740 }, { "epoch": 6.914893617021277, "grad_norm": 0.17951223254203796, "learning_rate": 3.295007944600481e-06, "loss": 0.0158, "step": 9750 }, { "epoch": 6.921985815602837, "grad_norm": 0.09103976935148239, "learning_rate": 3.215850686278132e-06, "loss": 0.0127, "step": 9760 }, { "epoch": 6.929078014184397, "grad_norm": 0.13495738804340363, "learning_rate": 3.1376402976582507e-06, "loss": 0.008, "step": 9770 }, { "epoch": 6.9361702127659575, "grad_norm": 0.12648430466651917, "learning_rate": 3.060377543897619e-06, "loss": 0.0106, "step": 9780 }, { "epoch": 6.943262411347518, "grad_norm": 0.22089631855487823, "learning_rate": 2.984063180882013e-06, "loss": 0.0137, "step": 9790 }, { "epoch": 6.950354609929078, "grad_norm": 0.12765297293663025, "learning_rate": 2.908697955218753e-06, "loss": 0.0088, "step": 9800 }, { "epoch": 6.957446808510638, "grad_norm": 0.33271655440330505, "learning_rate": 2.834282604229521e-06, "loss": 0.0111, "step": 9810 }, { "epoch": 6.964539007092198, "grad_norm": 0.2782209515571594, "learning_rate": 2.7608178559430653e-06, "loss": 0.0102, "step": 9820 }, { "epoch": 6.971631205673759, "grad_norm": 0.21345843374729156, "learning_rate": 2.6883044290880178e-06, "loss": 0.0105, "step": 9830 }, { "epoch": 6.9787234042553195, "grad_norm": 0.10163545608520508, "learning_rate": 2.616743033086022e-06, "loss": 0.0081, "step": 9840 }, { "epoch": 6.98581560283688, "grad_norm": 0.13964958488941193, "learning_rate": 2.5461343680446727e-06, "loss": 0.0121, "step": 9850 }, { "epoch": 6.99290780141844, "grad_norm": 0.13182148337364197, "learning_rate": 2.476479124750697e-06, "loss": 0.0128, "step": 9860 }, { "epoch": 7.0, "grad_norm": 0.24027769267559052, "learning_rate": 2.4077779846631732e-06, "loss": 0.0111, "step": 9870 }, { "epoch": 7.00709219858156, "grad_norm": 0.3481847941875458, "learning_rate": 2.3400316199069238e-06, "loss": 0.0103, "step": 9880 }, { "epoch": 7.01418439716312, "grad_norm": 0.19290268421173096, "learning_rate": 2.273240693265899e-06, "loss": 0.014, "step": 9890 }, { "epoch": 7.0212765957446805, "grad_norm": 0.059760428965091705, "learning_rate": 2.207405858176692e-06, "loss": 0.0111, "step": 9900 }, { "epoch": 7.028368794326241, "grad_norm": 0.2962135374546051, "learning_rate": 2.142527758722157e-06, "loss": 0.0125, "step": 9910 }, { "epoch": 7.035460992907802, "grad_norm": 0.06278011202812195, "learning_rate": 2.0786070296250793e-06, "loss": 0.0076, "step": 9920 }, { "epoch": 7.042553191489362, "grad_norm": 0.06869203597307205, "learning_rate": 2.0156442962420252e-06, "loss": 0.0101, "step": 9930 }, { "epoch": 7.049645390070922, "grad_norm": 0.250283420085907, "learning_rate": 1.95364017455717e-06, "loss": 0.0195, "step": 9940 }, { "epoch": 7.056737588652482, "grad_norm": 0.1912689059972763, "learning_rate": 1.8925952711763006e-06, "loss": 0.0138, "step": 9950 }, { "epoch": 7.0638297872340425, "grad_norm": 0.120729461312294, "learning_rate": 1.8325101833208457e-06, "loss": 0.0158, "step": 9960 }, { "epoch": 7.070921985815603, "grad_norm": 0.2449929565191269, "learning_rate": 1.7733854988220778e-06, "loss": 0.0111, "step": 9970 }, { "epoch": 7.078014184397163, "grad_norm": 0.11768271028995514, "learning_rate": 1.7152217961153405e-06, "loss": 0.011, "step": 9980 }, { "epoch": 7.085106382978723, "grad_norm": 0.09861616790294647, "learning_rate": 1.6580196442343987e-06, "loss": 0.0097, "step": 9990 }, { "epoch": 7.092198581560283, "grad_norm": 0.12583206593990326, "learning_rate": 1.601779602805842e-06, "loss": 0.0089, "step": 10000 }, { "epoch": 7.099290780141844, "grad_norm": 0.13632884621620178, "learning_rate": 1.5465022220436442e-06, "loss": 0.01, "step": 10010 }, { "epoch": 7.1063829787234045, "grad_norm": 0.19436778128147125, "learning_rate": 1.4921880427437584e-06, "loss": 0.0166, "step": 10020 }, { "epoch": 7.113475177304965, "grad_norm": 0.10597945749759674, "learning_rate": 1.4388375962788637e-06, "loss": 0.0091, "step": 10030 }, { "epoch": 7.120567375886525, "grad_norm": 0.08205860108137131, "learning_rate": 1.3864514045931032e-06, "loss": 0.0105, "step": 10040 }, { "epoch": 7.127659574468085, "grad_norm": 0.19386780261993408, "learning_rate": 1.3350299801970335e-06, "loss": 0.0149, "step": 10050 }, { "epoch": 7.134751773049645, "grad_norm": 0.22453084588050842, "learning_rate": 1.2845738261625828e-06, "loss": 0.0116, "step": 10060 }, { "epoch": 7.141843971631205, "grad_norm": 0.1868603378534317, "learning_rate": 1.235083436118145e-06, "loss": 0.0089, "step": 10070 }, { "epoch": 7.148936170212766, "grad_norm": 0.12683051824569702, "learning_rate": 1.1865592942437275e-06, "loss": 0.0102, "step": 10080 }, { "epoch": 7.156028368794326, "grad_norm": 0.17270110547542572, "learning_rate": 1.1390018752662436e-06, "loss": 0.0141, "step": 10090 }, { "epoch": 7.163120567375887, "grad_norm": 0.3085339069366455, "learning_rate": 1.0924116444548383e-06, "loss": 0.0101, "step": 10100 }, { "epoch": 7.170212765957447, "grad_norm": 0.12679530680179596, "learning_rate": 1.0467890576163707e-06, "loss": 0.0111, "step": 10110 }, { "epoch": 7.177304964539007, "grad_norm": 0.23530429601669312, "learning_rate": 1.0021345610909171e-06, "loss": 0.0136, "step": 10120 }, { "epoch": 7.184397163120567, "grad_norm": 0.06431034952402115, "learning_rate": 9.584485917474185e-07, "loss": 0.0089, "step": 10130 }, { "epoch": 7.191489361702128, "grad_norm": 0.15055204927921295, "learning_rate": 9.157315769794284e-07, "loss": 0.009, "step": 10140 }, { "epoch": 7.198581560283688, "grad_norm": 0.1731652468442917, "learning_rate": 8.739839347009171e-07, "loss": 0.0135, "step": 10150 }, { "epoch": 7.205673758865248, "grad_norm": 0.1978446990251541, "learning_rate": 8.332060733421631e-07, "loss": 0.0116, "step": 10160 }, { "epoch": 7.212765957446808, "grad_norm": 0.29635027050971985, "learning_rate": 7.933983918457677e-07, "loss": 0.0117, "step": 10170 }, { "epoch": 7.219858156028369, "grad_norm": 0.23973168432712555, "learning_rate": 7.54561279662791e-07, "loss": 0.0146, "step": 10180 }, { "epoch": 7.226950354609929, "grad_norm": 0.12211582064628601, "learning_rate": 7.166951167488667e-07, "loss": 0.0076, "step": 10190 }, { "epoch": 7.23404255319149, "grad_norm": 0.1157701313495636, "learning_rate": 6.798002735605602e-07, "loss": 0.0166, "step": 10200 }, { "epoch": 7.24113475177305, "grad_norm": 0.08518693596124649, "learning_rate": 6.43877111051705e-07, "loss": 0.0116, "step": 10210 }, { "epoch": 7.24822695035461, "grad_norm": 0.3765665888786316, "learning_rate": 6.089259806698611e-07, "loss": 0.0174, "step": 10220 }, { "epoch": 7.25531914893617, "grad_norm": 0.315969854593277, "learning_rate": 5.749472243529064e-07, "loss": 0.0179, "step": 10230 }, { "epoch": 7.26241134751773, "grad_norm": 0.1261627972126007, "learning_rate": 5.419411745256841e-07, "loss": 0.0145, "step": 10240 }, { "epoch": 7.2695035460992905, "grad_norm": 0.14078722894191742, "learning_rate": 5.099081540967277e-07, "loss": 0.0082, "step": 10250 }, { "epoch": 7.276595744680851, "grad_norm": 0.19919142127037048, "learning_rate": 4.788484764551293e-07, "loss": 0.0096, "step": 10260 }, { "epoch": 7.283687943262412, "grad_norm": 0.1621677577495575, "learning_rate": 4.487624454674544e-07, "loss": 0.0101, "step": 10270 }, { "epoch": 7.290780141843972, "grad_norm": 0.07020825147628784, "learning_rate": 4.196503554747988e-07, "loss": 0.0081, "step": 10280 }, { "epoch": 7.297872340425532, "grad_norm": 0.4086223244667053, "learning_rate": 3.9151249128988043e-07, "loss": 0.0108, "step": 10290 }, { "epoch": 7.304964539007092, "grad_norm": 0.19677507877349854, "learning_rate": 3.643491281942302e-07, "loss": 0.0088, "step": 10300 }, { "epoch": 7.3120567375886525, "grad_norm": 0.18708615005016327, "learning_rate": 3.3816053193556073e-07, "loss": 0.0137, "step": 10310 }, { "epoch": 7.319148936170213, "grad_norm": 0.21218359470367432, "learning_rate": 3.129469587251466e-07, "loss": 0.0122, "step": 10320 }, { "epoch": 7.326241134751773, "grad_norm": 0.057501133531332016, "learning_rate": 2.8870865523525915e-07, "loss": 0.0073, "step": 10330 }, { "epoch": 7.333333333333333, "grad_norm": 0.09904036670923233, "learning_rate": 2.6544585859683556e-07, "loss": 0.0098, "step": 10340 }, { "epoch": 7.340425531914893, "grad_norm": 0.2506350874900818, "learning_rate": 2.431587963971138e-07, "loss": 0.0082, "step": 10350 }, { "epoch": 7.347517730496454, "grad_norm": 0.16516119241714478, "learning_rate": 2.218476866774344e-07, "loss": 0.0116, "step": 10360 }, { "epoch": 7.3546099290780145, "grad_norm": 0.18242092430591583, "learning_rate": 2.015127379310422e-07, "loss": 0.0145, "step": 10370 }, { "epoch": 7.361702127659575, "grad_norm": 0.1638125777244568, "learning_rate": 1.821541491011547e-07, "loss": 0.0089, "step": 10380 }, { "epoch": 7.368794326241135, "grad_norm": 0.192805677652359, "learning_rate": 1.6377210957888579e-07, "loss": 0.0138, "step": 10390 }, { "epoch": 7.375886524822695, "grad_norm": 0.16740106046199799, "learning_rate": 1.4636679920152496e-07, "loss": 0.0127, "step": 10400 }, { "epoch": 7.382978723404255, "grad_norm": 0.20716732740402222, "learning_rate": 1.2993838825066107e-07, "loss": 0.0104, "step": 10410 }, { "epoch": 7.390070921985815, "grad_norm": 0.2054859846830368, "learning_rate": 1.1448703745061684e-07, "loss": 0.0121, "step": 10420 }, { "epoch": 7.397163120567376, "grad_norm": 0.19400155544281006, "learning_rate": 1.0001289796678359e-07, "loss": 0.0158, "step": 10430 }, { "epoch": 7.404255319148936, "grad_norm": 0.257386714220047, "learning_rate": 8.651611140423344e-08, "loss": 0.0125, "step": 10440 }, { "epoch": 7.411347517730497, "grad_norm": 0.1650981456041336, "learning_rate": 7.399680980624268e-08, "loss": 0.0131, "step": 10450 }, { "epoch": 7.418439716312057, "grad_norm": 0.07822942733764648, "learning_rate": 6.24551156530817e-08, "loss": 0.0107, "step": 10460 }, { "epoch": 7.425531914893617, "grad_norm": 0.2906341850757599, "learning_rate": 5.1891141860760387e-08, "loss": 0.0074, "step": 10470 }, { "epoch": 7.432624113475177, "grad_norm": 0.10687454789876938, "learning_rate": 4.230499177994007e-08, "loss": 0.0099, "step": 10480 }, { "epoch": 7.439716312056738, "grad_norm": 0.22731392085552216, "learning_rate": 3.369675919495663e-08, "loss": 0.012, "step": 10490 }, { "epoch": 7.446808510638298, "grad_norm": 0.0800376906991005, "learning_rate": 2.6066528322832294e-08, "loss": 0.0108, "step": 10500 }, { "epoch": 7.453900709219858, "grad_norm": 0.26024994254112244, "learning_rate": 1.9414373812509655e-08, "loss": 0.0159, "step": 10510 }, { "epoch": 7.460992907801418, "grad_norm": 0.14578887820243835, "learning_rate": 1.3740360744118886e-08, "loss": 0.0163, "step": 10520 }, { "epoch": 7.468085106382979, "grad_norm": 0.12513095140457153, "learning_rate": 9.04454462830051e-09, "loss": 0.0114, "step": 10530 }, { "epoch": 7.475177304964539, "grad_norm": 0.20727184414863586, "learning_rate": 5.326971405694714e-09, "loss": 0.0126, "step": 10540 }, { "epoch": 7.4822695035460995, "grad_norm": 0.22372283041477203, "learning_rate": 2.5876774464972387e-09, "loss": 0.0101, "step": 10550 }, { "epoch": 7.48936170212766, "grad_norm": 0.26353222131729126, "learning_rate": 8.266895500708138e-10, "loss": 0.011, "step": 10560 }, { "epoch": 7.49645390070922, "grad_norm": 0.1274196356534958, "learning_rate": 4.402494471200669e-11, "loss": 0.0129, "step": 10570 }, { "epoch": 7.498581560283688, "step": 10573, "total_flos": 3.707114868479735e+17, "train_loss": 0.02894638727507956, "train_runtime": 4663.694, "train_samples_per_second": 36.273, "train_steps_per_second": 2.267 } ], "logging_steps": 10, "max_steps": 10573, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.707114868479735e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }