diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,6008 +3,394 @@ "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, - "global_step": 4275, + "global_step": 268, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0011695906432748538, - "grad_norm": 4.3862214979637395, - "learning_rate": 1.1682242990654206e-06, - "loss": 1.4545, + "epoch": 0.018656716417910446, + "grad_norm": 23.549664007081883, + "learning_rate": 1.785714285714286e-05, + "loss": 6.105, "step": 5 }, { - "epoch": 0.0023391812865497076, - "grad_norm": 2.898308786930061, - "learning_rate": 2.3364485981308413e-06, - "loss": 1.4508, + "epoch": 0.03731343283582089, + "grad_norm": 13.350189408858572, + "learning_rate": 3.571428571428572e-05, + "loss": 4.6199, "step": 10 }, { - "epoch": 0.0035087719298245615, - "grad_norm": 3.3436616124637424, - "learning_rate": 3.5046728971962617e-06, - "loss": 1.3998, + "epoch": 0.055970149253731345, + "grad_norm": 9.366720060199096, + "learning_rate": 4.999827900623038e-05, + "loss": 3.3379, "step": 15 }, { - "epoch": 0.004678362573099415, - "grad_norm": 2.34446456390598, - "learning_rate": 4.6728971962616825e-06, - "loss": 1.3392, + "epoch": 0.07462686567164178, + "grad_norm": 6.751610634894579, + "learning_rate": 4.993807186343243e-05, + "loss": 2.6927, "step": 20 }, { - "epoch": 0.005847953216374269, - "grad_norm": 1.4686088375623854, - "learning_rate": 5.841121495327103e-06, - "loss": 1.2714, + "epoch": 0.09328358208955224, + "grad_norm": 6.488714411639678, + "learning_rate": 4.979207812402531e-05, + "loss": 2.0578, "step": 25 }, { - "epoch": 0.007017543859649123, - "grad_norm": 1.320686406588424, - "learning_rate": 7.009345794392523e-06, - "loss": 1.2414, + "epoch": 0.11194029850746269, + "grad_norm": 3.5421676034059377, + "learning_rate": 4.956085596012407e-05, + "loss": 1.5831, "step": 30 }, { - "epoch": 0.008187134502923977, - "grad_norm": 1.2402467459230129, - "learning_rate": 8.177570093457943e-06, - "loss": 1.1882, + "epoch": 0.13059701492537312, + "grad_norm": 2.571834851270616, + "learning_rate": 4.924528939432311e-05, + "loss": 1.315, "step": 35 }, { - "epoch": 0.00935672514619883, - "grad_norm": 1.2376435378586828, - "learning_rate": 9.345794392523365e-06, - "loss": 1.1749, + "epoch": 0.14925373134328357, + "grad_norm": 1.5963085542775126, + "learning_rate": 4.884658491984735e-05, + "loss": 1.1863, "step": 40 }, { - "epoch": 0.010526315789473684, - "grad_norm": 1.1524990455288477, - "learning_rate": 1.0514018691588785e-05, - "loss": 1.1588, + "epoch": 0.16791044776119404, + "grad_norm": 2.074329344904024, + "learning_rate": 4.8366266887814235e-05, + "loss": 1.1028, "step": 45 }, { - "epoch": 0.011695906432748537, - "grad_norm": 1.301145432712544, - "learning_rate": 1.1682242990654207e-05, - "loss": 1.1424, + "epoch": 0.1865671641791045, + "grad_norm": 1.3297400462887774, + "learning_rate": 4.780617167924209e-05, + "loss": 1.04, "step": 50 }, { - "epoch": 0.012865497076023392, - "grad_norm": 1.1323491606855578, - "learning_rate": 1.2850467289719625e-05, - "loss": 1.1172, + "epoch": 0.20522388059701493, + "grad_norm": 1.7003158095532473, + "learning_rate": 4.716844068408693e-05, + "loss": 0.9969, "step": 55 }, { - "epoch": 0.014035087719298246, - "grad_norm": 1.201565879775835, - "learning_rate": 1.4018691588785047e-05, - "loss": 1.1029, + "epoch": 0.22388059701492538, + "grad_norm": 1.3298211305507253, + "learning_rate": 4.6455512114150546e-05, + "loss": 0.9653, "step": 60 }, { - "epoch": 0.0152046783625731, - "grad_norm": 1.1433240312787192, - "learning_rate": 1.5186915887850467e-05, - "loss": 1.086, + "epoch": 0.24253731343283583, + "grad_norm": 1.5188363324223721, + "learning_rate": 4.5670111681161296e-05, + "loss": 0.9325, "step": 65 }, { - "epoch": 0.016374269005847954, - "grad_norm": 1.1855044769686904, - "learning_rate": 1.6355140186915887e-05, - "loss": 1.1019, + "epoch": 0.26119402985074625, + "grad_norm": 1.455002995998784, + "learning_rate": 4.481524217566783e-05, + "loss": 0.8993, "step": 70 }, { - "epoch": 0.017543859649122806, - "grad_norm": 1.2419830025546306, - "learning_rate": 1.752336448598131e-05, - "loss": 1.0782, + "epoch": 0.2798507462686567, + "grad_norm": 1.2015534696464851, + "learning_rate": 4.3894171986588217e-05, + "loss": 0.8828, "step": 75 }, { - "epoch": 0.01871345029239766, - "grad_norm": 1.1871478349287299, - "learning_rate": 1.869158878504673e-05, - "loss": 1.0558, + "epoch": 0.29850746268656714, + "grad_norm": 2.042471802842113, + "learning_rate": 4.29104226053073e-05, + "loss": 0.873, "step": 80 }, { - "epoch": 0.019883040935672516, - "grad_norm": 1.4158109594901271, - "learning_rate": 1.985981308411215e-05, - "loss": 1.0733, + "epoch": 0.31716417910447764, + "grad_norm": 1.6348429954668364, + "learning_rate": 4.186775516209732e-05, + "loss": 0.8589, "step": 85 }, { - "epoch": 0.021052631578947368, - "grad_norm": 1.3659345116154782, - "learning_rate": 2.102803738317757e-05, - "loss": 1.0608, + "epoch": 0.3358208955223881, + "grad_norm": 1.2929068924137062, + "learning_rate": 4.077015604633669e-05, + "loss": 0.8499, "step": 90 }, { - "epoch": 0.022222222222222223, - "grad_norm": 1.1708966657531445, - "learning_rate": 2.2196261682242992e-05, - "loss": 1.0539, + "epoch": 0.35447761194029853, + "grad_norm": 1.2588102670148498, + "learning_rate": 3.962182166550441e-05, + "loss": 0.8296, "step": 95 }, { - "epoch": 0.023391812865497075, - "grad_norm": 1.4057049542204068, - "learning_rate": 2.3364485981308414e-05, - "loss": 1.0491, + "epoch": 0.373134328358209, + "grad_norm": 1.016824958444882, + "learning_rate": 3.8427142401220634e-05, + "loss": 0.8183, "step": 100 }, { - "epoch": 0.02456140350877193, - "grad_norm": 1.177573164358867, - "learning_rate": 2.4532710280373832e-05, - "loss": 1.0382, + "epoch": 0.3917910447761194, + "grad_norm": 1.076029269402344, + "learning_rate": 3.71906858236735e-05, + "loss": 0.8175, "step": 105 }, { - "epoch": 0.025730994152046785, - "grad_norm": 1.2987611554079812, - "learning_rate": 2.570093457943925e-05, - "loss": 1.0269, + "epoch": 0.41044776119402987, + "grad_norm": 0.9643240409515527, + "learning_rate": 3.591717922860785e-05, + "loss": 0.8024, "step": 110 }, { - "epoch": 0.026900584795321637, - "grad_norm": 1.2805282242501288, - "learning_rate": 2.6869158878504675e-05, - "loss": 1.0303, + "epoch": 0.4291044776119403, + "grad_norm": 1.0996417684236115, + "learning_rate": 3.46114915636416e-05, + "loss": 0.8028, "step": 115 }, { - "epoch": 0.028070175438596492, - "grad_norm": 1.2521388436600536, - "learning_rate": 2.8037383177570094e-05, - "loss": 1.0186, + "epoch": 0.44776119402985076, + "grad_norm": 0.6236520681960638, + "learning_rate": 3.3278614813010034e-05, + "loss": 0.7843, "step": 120 }, { - "epoch": 0.029239766081871343, - "grad_norm": 1.2503867172495808, - "learning_rate": 2.9205607476635515e-05, - "loss": 1.0338, + "epoch": 0.4664179104477612, + "grad_norm": 0.7490461585227262, + "learning_rate": 3.1923644911909e-05, + "loss": 0.7802, "step": 125 }, { - "epoch": 0.0304093567251462, - "grad_norm": 1.2707180378214313, - "learning_rate": 3.0373831775700934e-05, - "loss": 1.0244, + "epoch": 0.48507462686567165, + "grad_norm": 0.7753398709862848, + "learning_rate": 3.0551762263406576e-05, + "loss": 0.7718, "step": 130 }, { - "epoch": 0.031578947368421054, - "grad_norm": 1.3243403485631375, - "learning_rate": 3.1542056074766355e-05, - "loss": 0.9971, + "epoch": 0.503731343283582, + "grad_norm": 0.8263114166565009, + "learning_rate": 2.9168211932412042e-05, + "loss": 0.7694, "step": 135 }, { - "epoch": 0.03274853801169591, - "grad_norm": 1.3071755226415636, - "learning_rate": 3.2710280373831774e-05, - "loss": 1.0018, + "epoch": 0.5223880597014925, + "grad_norm": 0.5977157830288298, + "learning_rate": 2.777828359242567e-05, + "loss": 0.7638, "step": 140 }, { - "epoch": 0.03391812865497076, - "grad_norm": 1.30013028710232, - "learning_rate": 3.38785046728972e-05, - "loss": 1.0152, + "epoch": 0.5410447761194029, + "grad_norm": 0.6960446376964153, + "learning_rate": 2.6387291301738377e-05, + "loss": 0.7548, "step": 145 }, { - "epoch": 0.03508771929824561, - "grad_norm": 1.646624872748112, - "learning_rate": 3.504672897196262e-05, - "loss": 1.0132, + "epoch": 0.5597014925373134, + "grad_norm": 0.5170782529619016, + "learning_rate": 2.50005531864019e-05, + "loss": 0.7465, "step": 150 }, { - "epoch": 0.03625730994152047, - "grad_norm": 1.3045097017626717, - "learning_rate": 3.621495327102804e-05, - "loss": 1.0012, + "epoch": 0.5783582089552238, + "grad_norm": 0.48159615421876123, + "learning_rate": 2.362337110764688e-05, + "loss": 0.7486, "step": 155 }, { - "epoch": 0.03742690058479532, - "grad_norm": 1.462682033635244, - "learning_rate": 3.738317757009346e-05, - "loss": 1.0128, + "epoch": 0.5970149253731343, + "grad_norm": 0.5432350714014745, + "learning_rate": 2.226101039148557e-05, + "loss": 0.741, "step": 160 }, { - "epoch": 0.03859649122807018, - "grad_norm": 1.2429985745730512, - "learning_rate": 3.855140186915888e-05, - "loss": 0.9975, + "epoch": 0.6156716417910447, + "grad_norm": 0.4696819234168871, + "learning_rate": 2.0918679697998252e-05, + "loss": 0.74, "step": 165 }, { - "epoch": 0.03976608187134503, - "grad_norm": 1.4950336114052705, - "learning_rate": 3.97196261682243e-05, - "loss": 0.9831, + "epoch": 0.6343283582089553, + "grad_norm": 0.46483548295455934, + "learning_rate": 1.9601511107268255e-05, + "loss": 0.7425, "step": 170 }, { - "epoch": 0.04093567251461988, - "grad_norm": 1.3381138350136945, - "learning_rate": 4.088785046728972e-05, - "loss": 0.9917, + "epoch": 0.6529850746268657, + "grad_norm": 0.5530022968626128, + "learning_rate": 1.8314540498102216e-05, + "loss": 0.7338, "step": 175 }, { - "epoch": 0.042105263157894736, - "grad_norm": 1.3781491921537325, - "learning_rate": 4.205607476635514e-05, - "loss": 0.9728, + "epoch": 0.6716417910447762, + "grad_norm": 0.4915552805413531, + "learning_rate": 1.7062688294552992e-05, + "loss": 0.7374, "step": 180 }, { - "epoch": 0.04327485380116959, - "grad_norm": 1.2707812943966086, - "learning_rate": 4.3224299065420565e-05, - "loss": 1.0, + "epoch": 0.6902985074626866, + "grad_norm": 0.4452556265823543, + "learning_rate": 1.5850740653856096e-05, + "loss": 0.7268, "step": 185 }, { - "epoch": 0.044444444444444446, - "grad_norm": 1.1629390254978238, - "learning_rate": 4.4392523364485984e-05, - "loss": 0.9795, + "epoch": 0.7089552238805971, + "grad_norm": 0.46333067364783953, + "learning_rate": 1.4683331167703218e-05, + "loss": 0.7275, "step": 190 }, { - "epoch": 0.0456140350877193, - "grad_norm": 1.380163782528444, - "learning_rate": 4.556074766355141e-05, - "loss": 0.9728, + "epoch": 0.7276119402985075, + "grad_norm": 0.39997819847068644, + "learning_rate": 1.356492314681356e-05, + "loss": 0.7264, "step": 195 }, { - "epoch": 0.04678362573099415, - "grad_norm": 1.2042238039537414, - "learning_rate": 4.672897196261683e-05, - "loss": 0.9758, + "epoch": 0.746268656716418, + "grad_norm": 0.405044297741591, + "learning_rate": 1.2499792556533716e-05, + "loss": 0.7238, "step": 200 }, { - "epoch": 0.047953216374269005, - "grad_norm": 1.279604288513107, - "learning_rate": 4.7897196261682245e-05, - "loss": 0.9972, + "epoch": 0.7649253731343284, + "grad_norm": 0.42435372517376513, + "learning_rate": 1.1492011668707753e-05, + "loss": 0.7191, "step": 205 }, { - "epoch": 0.04912280701754386, - "grad_norm": 1.2721504709190998, - "learning_rate": 4.9065420560747664e-05, - "loss": 0.9741, + "epoch": 0.7835820895522388, + "grad_norm": 0.3991508216227003, + "learning_rate": 1.0545433492320603e-05, + "loss": 0.7192, "step": 210 }, { - "epoch": 0.050292397660818715, - "grad_norm": 1.2259731220679402, - "learning_rate": 4.9999993267346444e-05, - "loss": 0.9679, + "epoch": 0.8022388059701493, + "grad_norm": 0.4035129969039292, + "learning_rate": 9.663677042440537e-06, + "loss": 0.7175, "step": 215 }, { - "epoch": 0.05146198830409357, - "grad_norm": 1.298654189786969, - "learning_rate": 4.999975762489519e-05, - "loss": 0.9852, + "epoch": 0.8208955223880597, + "grad_norm": 0.3656087904784262, + "learning_rate": 8.850113503781367e-06, + "loss": 0.7129, "step": 220 }, { - "epoch": 0.05263157894736842, - "grad_norm": 1.2924316625203613, - "learning_rate": 4.9999185353795504e-05, - "loss": 0.9673, + "epoch": 0.8395522388059702, + "grad_norm": 0.3363321384740739, + "learning_rate": 8.107853341784671e-06, + "loss": 0.7197, "step": 225 }, { - "epoch": 0.05380116959064327, - "grad_norm": 1.366376650054188, - "learning_rate": 4.99982764626094e-05, - "loss": 0.9815, + "epoch": 0.8582089552238806, + "grad_norm": 1.564164016405774, + "learning_rate": 7.439734410499752e-06, + "loss": 0.716, "step": 230 }, { - "epoch": 0.05497076023391813, - "grad_norm": 1.274755614910159, - "learning_rate": 4.9997030964935195e-05, - "loss": 0.987, + "epoch": 0.8768656716417911, + "grad_norm": 0.38113827793929866, + "learning_rate": 6.848311102728011e-06, + "loss": 0.7114, "step": 235 }, { - "epoch": 0.056140350877192984, - "grad_norm": 1.6362492872363399, - "learning_rate": 4.9995448879407316e-05, - "loss": 0.961, + "epoch": 0.8955223880597015, + "grad_norm": 0.35881991263382823, + "learning_rate": 6.335844583913515e-06, + "loss": 0.7117, "step": 240 }, { - "epoch": 0.05730994152046784, - "grad_norm": 1.2051858843379946, - "learning_rate": 4.999353022969603e-05, - "loss": 0.9604, + "epoch": 0.914179104477612, + "grad_norm": 0.3389853140732799, + "learning_rate": 5.904294147118193e-06, + "loss": 0.7089, "step": 245 }, { - "epoch": 0.05847953216374269, - "grad_norm": 1.3793185249466788, - "learning_rate": 4.999127504450709e-05, - "loss": 0.9591, + "epoch": 0.9328358208955224, + "grad_norm": 0.3017636686149359, + "learning_rate": 5.555309722133842e-06, + "loss": 0.7098, "step": 250 }, { - "epoch": 0.05964912280701754, - "grad_norm": 1.2084655281854946, - "learning_rate": 4.998868335758132e-05, - "loss": 0.9535, + "epoch": 0.9514925373134329, + "grad_norm": 0.32612384040343395, + "learning_rate": 5.290225567370509e-06, + "loss": 0.7085, "step": 255 }, { - "epoch": 0.0608187134502924, - "grad_norm": 1.1876622728566322, - "learning_rate": 4.998575520769404e-05, - "loss": 0.9772, + "epoch": 0.9701492537313433, + "grad_norm": 0.3215733932661547, + "learning_rate": 5.110055168638854e-06, + "loss": 0.7185, "step": 260 }, { - "epoch": 0.06198830409356725, - "grad_norm": 1.4161498983349252, - "learning_rate": 4.99824906386546e-05, - "loss": 0.9452, + "epoch": 0.9888059701492538, + "grad_norm": 0.32235402928737444, + "learning_rate": 5.0154873643297575e-06, + "loss": 0.7153, "step": 265 }, - { - "epoch": 0.06315789473684211, - "grad_norm": 1.3383887094892624, - "learning_rate": 4.997888969930562e-05, - "loss": 0.95, - "step": 270 - }, - { - "epoch": 0.06432748538011696, - "grad_norm": 1.2656564068860054, - "learning_rate": 4.997495244352232e-05, - "loss": 0.9365, - "step": 275 - }, - { - "epoch": 0.06549707602339182, - "grad_norm": 1.0862939966775438, - "learning_rate": 4.9970678930211704e-05, - "loss": 0.9525, - "step": 280 - }, - { - "epoch": 0.06666666666666667, - "grad_norm": 1.2919679051885777, - "learning_rate": 4.996606922331165e-05, - "loss": 0.9529, - "step": 285 - }, - { - "epoch": 0.06783625730994151, - "grad_norm": 1.0647888476826568, - "learning_rate": 4.996112339179e-05, - "loss": 0.9324, - "step": 290 - }, - { - "epoch": 0.06900584795321638, - "grad_norm": 1.305388044138423, - "learning_rate": 4.995584150964347e-05, - "loss": 0.9259, - "step": 295 - }, - { - "epoch": 0.07017543859649122, - "grad_norm": 1.231889595549824, - "learning_rate": 4.99502236558966e-05, - "loss": 0.948, - "step": 300 - }, - { - "epoch": 0.07134502923976609, - "grad_norm": 1.092939130481041, - "learning_rate": 4.994426991460055e-05, - "loss": 0.932, - "step": 305 - }, - { - "epoch": 0.07251461988304093, - "grad_norm": 1.3112640010430932, - "learning_rate": 4.993798037483182e-05, - "loss": 0.9327, - "step": 310 - }, - { - "epoch": 0.07368421052631578, - "grad_norm": 1.1476970342552901, - "learning_rate": 4.993135513069094e-05, - "loss": 0.9482, - "step": 315 - }, - { - "epoch": 0.07485380116959064, - "grad_norm": 1.1633564547545994, - "learning_rate": 4.992439428130109e-05, - "loss": 0.9217, - "step": 320 - }, - { - "epoch": 0.07602339181286549, - "grad_norm": 1.0735521991747736, - "learning_rate": 4.991709793080655e-05, - "loss": 0.9124, - "step": 325 - }, - { - "epoch": 0.07719298245614035, - "grad_norm": 1.0492850297391971, - "learning_rate": 4.990946618837117e-05, - "loss": 0.9393, - "step": 330 - }, - { - "epoch": 0.0783625730994152, - "grad_norm": 1.3559203599333753, - "learning_rate": 4.9901499168176786e-05, - "loss": 0.9256, - "step": 335 - }, - { - "epoch": 0.07953216374269007, - "grad_norm": 1.5371317714373305, - "learning_rate": 4.989319698942145e-05, - "loss": 0.9273, - "step": 340 - }, - { - "epoch": 0.08070175438596491, - "grad_norm": 1.9231609823006572, - "learning_rate": 4.9884559776317644e-05, - "loss": 0.9181, - "step": 345 - }, - { - "epoch": 0.08187134502923976, - "grad_norm": 1.1887874506902512, - "learning_rate": 4.987558765809048e-05, - "loss": 0.9159, - "step": 350 - }, - { - "epoch": 0.08304093567251462, - "grad_norm": 1.1113774961997955, - "learning_rate": 4.986628076897572e-05, - "loss": 0.9255, - "step": 355 - }, - { - "epoch": 0.08421052631578947, - "grad_norm": 1.2529780917385798, - "learning_rate": 4.985663924821778e-05, - "loss": 0.9134, - "step": 360 - }, - { - "epoch": 0.08538011695906433, - "grad_norm": 1.1687004195151305, - "learning_rate": 4.984666324006763e-05, - "loss": 0.939, - "step": 365 - }, - { - "epoch": 0.08654970760233918, - "grad_norm": 1.2034837816155135, - "learning_rate": 4.983635289378065e-05, - "loss": 0.9166, - "step": 370 - }, - { - "epoch": 0.08771929824561403, - "grad_norm": 1.1152012751772677, - "learning_rate": 4.9825708363614434e-05, - "loss": 0.9326, - "step": 375 - }, - { - "epoch": 0.08888888888888889, - "grad_norm": 1.282352801615335, - "learning_rate": 4.981472980882641e-05, - "loss": 0.9097, - "step": 380 - }, - { - "epoch": 0.09005847953216374, - "grad_norm": 1.3555014384579822, - "learning_rate": 4.980341739367151e-05, - "loss": 0.9566, - "step": 385 - }, - { - "epoch": 0.0912280701754386, - "grad_norm": 0.9969319217580619, - "learning_rate": 4.979177128739968e-05, - "loss": 0.9183, - "step": 390 - }, - { - "epoch": 0.09239766081871345, - "grad_norm": 1.112979405264304, - "learning_rate": 4.977979166425339e-05, - "loss": 0.9087, - "step": 395 - }, - { - "epoch": 0.0935672514619883, - "grad_norm": 7.571188451889394, - "learning_rate": 4.976747870346498e-05, - "loss": 0.9075, - "step": 400 - }, - { - "epoch": 0.09473684210526316, - "grad_norm": 1.2588282458972042, - "learning_rate": 4.9754832589254e-05, - "loss": 0.9133, - "step": 405 - }, - { - "epoch": 0.09590643274853801, - "grad_norm": 1.1389033831691695, - "learning_rate": 4.974185351082447e-05, - "loss": 0.9071, - "step": 410 - }, - { - "epoch": 0.09707602339181287, - "grad_norm": 1.0464472246368346, - "learning_rate": 4.972854166236201e-05, - "loss": 0.9033, - "step": 415 - }, - { - "epoch": 0.09824561403508772, - "grad_norm": 1.0609546057929313, - "learning_rate": 4.9714897243030984e-05, - "loss": 0.9054, - "step": 420 - }, - { - "epoch": 0.09941520467836257, - "grad_norm": 0.9766945821904102, - "learning_rate": 4.970092045697146e-05, - "loss": 0.892, - "step": 425 - }, - { - "epoch": 0.10058479532163743, - "grad_norm": 1.0824192119502347, - "learning_rate": 4.9686611513296216e-05, - "loss": 0.9076, - "step": 430 - }, - { - "epoch": 0.10175438596491228, - "grad_norm": 1.069699233932774, - "learning_rate": 4.9671970626087574e-05, - "loss": 0.9131, - "step": 435 - }, - { - "epoch": 0.10292397660818714, - "grad_norm": 1.0973125930518677, - "learning_rate": 4.96569980143942e-05, - "loss": 0.8959, - "step": 440 - }, - { - "epoch": 0.10409356725146199, - "grad_norm": 1.1776527069272567, - "learning_rate": 4.964169390222784e-05, - "loss": 0.901, - "step": 445 - }, - { - "epoch": 0.10526315789473684, - "grad_norm": 1.0756769524406018, - "learning_rate": 4.9626058518559975e-05, - "loss": 0.8986, - "step": 450 - }, - { - "epoch": 0.1064327485380117, - "grad_norm": 1.1614986981567177, - "learning_rate": 4.961009209731837e-05, - "loss": 0.9064, - "step": 455 - }, - { - "epoch": 0.10760233918128655, - "grad_norm": 1.1282199066528726, - "learning_rate": 4.959379487738359e-05, - "loss": 0.8956, - "step": 460 - }, - { - "epoch": 0.10877192982456141, - "grad_norm": 0.9009086073580999, - "learning_rate": 4.957716710258543e-05, - "loss": 0.893, - "step": 465 - }, - { - "epoch": 0.10994152046783626, - "grad_norm": 0.9495009535418211, - "learning_rate": 4.956020902169924e-05, - "loss": 0.8879, - "step": 470 - }, - { - "epoch": 0.1111111111111111, - "grad_norm": 1.1217739879192823, - "learning_rate": 4.954292088844223e-05, - "loss": 0.8931, - "step": 475 - }, - { - "epoch": 0.11228070175438597, - "grad_norm": 0.9788914879224979, - "learning_rate": 4.952530296146969e-05, - "loss": 0.893, - "step": 480 - }, - { - "epoch": 0.11345029239766082, - "grad_norm": 0.9997978391833926, - "learning_rate": 4.9507355504371064e-05, - "loss": 0.8889, - "step": 485 - }, - { - "epoch": 0.11461988304093568, - "grad_norm": 0.9268894518304199, - "learning_rate": 4.948907878566607e-05, - "loss": 0.8903, - "step": 490 - }, - { - "epoch": 0.11578947368421053, - "grad_norm": 72.60231360053689, - "learning_rate": 4.947047307880062e-05, - "loss": 0.9251, - "step": 495 - }, - { - "epoch": 0.11695906432748537, - "grad_norm": 1.0770947596244338, - "learning_rate": 4.945153866214278e-05, - "loss": 0.9046, - "step": 500 - }, - { - "epoch": 0.11812865497076024, - "grad_norm": 1.1295561944286985, - "learning_rate": 4.9432275818978595e-05, - "loss": 0.9037, - "step": 505 - }, - { - "epoch": 0.11929824561403508, - "grad_norm": 1.181923829984784, - "learning_rate": 4.941268483750782e-05, - "loss": 0.881, - "step": 510 - }, - { - "epoch": 0.12046783625730995, - "grad_norm": 0.9224413133089209, - "learning_rate": 4.939276601083965e-05, - "loss": 0.887, - "step": 515 - }, - { - "epoch": 0.1216374269005848, - "grad_norm": 0.9833800064012921, - "learning_rate": 4.937251963698829e-05, - "loss": 0.8942, - "step": 520 - }, - { - "epoch": 0.12280701754385964, - "grad_norm": 1.0462609066040864, - "learning_rate": 4.935194601886855e-05, - "loss": 0.9002, - "step": 525 - }, - { - "epoch": 0.1239766081871345, - "grad_norm": 0.8684931504628074, - "learning_rate": 4.9331045464291246e-05, - "loss": 0.8851, - "step": 530 - }, - { - "epoch": 0.12514619883040937, - "grad_norm": 1.1786968981044232, - "learning_rate": 4.9309818285958685e-05, - "loss": 0.8716, - "step": 535 - }, - { - "epoch": 0.12631578947368421, - "grad_norm": 1.0073700153107397, - "learning_rate": 4.928826480145988e-05, - "loss": 0.8618, - "step": 540 - }, - { - "epoch": 0.12748538011695906, - "grad_norm": 1.238751684745195, - "learning_rate": 4.9266385333265884e-05, - "loss": 0.932, - "step": 545 - }, - { - "epoch": 0.1286549707602339, - "grad_norm": 23.55004868122488, - "learning_rate": 4.924418020872493e-05, - "loss": 0.9185, - "step": 550 - }, - { - "epoch": 0.12982456140350876, - "grad_norm": 1.0626923904298728, - "learning_rate": 4.922164976005753e-05, - "loss": 0.8921, - "step": 555 - }, - { - "epoch": 0.13099415204678364, - "grad_norm": 0.9971583275794336, - "learning_rate": 4.91987943243515e-05, - "loss": 0.877, - "step": 560 - }, - { - "epoch": 0.13216374269005848, - "grad_norm": 2.6820593238577715, - "learning_rate": 4.917561424355696e-05, - "loss": 0.8934, - "step": 565 - }, - { - "epoch": 0.13333333333333333, - "grad_norm": 0.8203046983625998, - "learning_rate": 4.915210986448117e-05, - "loss": 0.8856, - "step": 570 - }, - { - "epoch": 0.13450292397660818, - "grad_norm": 0.9484997849093751, - "learning_rate": 4.912828153878335e-05, - "loss": 0.8754, - "step": 575 - }, - { - "epoch": 0.13567251461988303, - "grad_norm": 0.9387568040621501, - "learning_rate": 4.910412962296944e-05, - "loss": 0.876, - "step": 580 - }, - { - "epoch": 0.1368421052631579, - "grad_norm": 1.7811309672532951, - "learning_rate": 4.9079654478386724e-05, - "loss": 0.8642, - "step": 585 - }, - { - "epoch": 0.13801169590643275, - "grad_norm": 0.9033524326022147, - "learning_rate": 4.90548564712185e-05, - "loss": 0.87, - "step": 590 - }, - { - "epoch": 0.1391812865497076, - "grad_norm": 1.0353234405443796, - "learning_rate": 4.9029735972478505e-05, - "loss": 0.8671, - "step": 595 - }, - { - "epoch": 0.14035087719298245, - "grad_norm": 0.9727686082063292, - "learning_rate": 4.900429335800545e-05, - "loss": 0.8696, - "step": 600 - }, - { - "epoch": 0.1415204678362573, - "grad_norm": 1.0701058421505383, - "learning_rate": 4.897852900845733e-05, - "loss": 0.8763, - "step": 605 - }, - { - "epoch": 0.14269005847953217, - "grad_norm": 0.9599394196131743, - "learning_rate": 4.8952443309305777e-05, - "loss": 0.877, - "step": 610 - }, - { - "epoch": 0.14385964912280702, - "grad_norm": 0.9746668570633114, - "learning_rate": 4.892603665083027e-05, - "loss": 0.8663, - "step": 615 - }, - { - "epoch": 0.14502923976608187, - "grad_norm": 1.3661549067902636, - "learning_rate": 4.88993094281123e-05, - "loss": 0.8626, - "step": 620 - }, - { - "epoch": 0.14619883040935672, - "grad_norm": 1.1330888516399584, - "learning_rate": 4.887226204102945e-05, - "loss": 0.8786, - "step": 625 - }, - { - "epoch": 0.14736842105263157, - "grad_norm": 1.0877568219198543, - "learning_rate": 4.8844894894249424e-05, - "loss": 0.8638, - "step": 630 - }, - { - "epoch": 0.14853801169590644, - "grad_norm": 0.8283677070229845, - "learning_rate": 4.8817208397224015e-05, - "loss": 0.8544, - "step": 635 - }, - { - "epoch": 0.1497076023391813, - "grad_norm": 0.9792293752478846, - "learning_rate": 4.878920296418292e-05, - "loss": 0.8601, - "step": 640 - }, - { - "epoch": 0.15087719298245614, - "grad_norm": 1.0042722495236052, - "learning_rate": 4.876087901412758e-05, - "loss": 0.8611, - "step": 645 - }, - { - "epoch": 0.15204678362573099, - "grad_norm": 0.9753390505738546, - "learning_rate": 4.873223697082493e-05, - "loss": 0.8319, - "step": 650 - }, - { - "epoch": 0.15321637426900586, - "grad_norm": 0.8555144137662696, - "learning_rate": 4.870327726280103e-05, - "loss": 0.8643, - "step": 655 - }, - { - "epoch": 0.1543859649122807, - "grad_norm": 0.9466302026901982, - "learning_rate": 4.867400032333463e-05, - "loss": 0.8618, - "step": 660 - }, - { - "epoch": 0.15555555555555556, - "grad_norm": 0.9514870975188799, - "learning_rate": 4.8644406590450744e-05, - "loss": 0.858, - "step": 665 - }, - { - "epoch": 0.1567251461988304, - "grad_norm": 0.9414562779147595, - "learning_rate": 4.8614496506914087e-05, - "loss": 0.855, - "step": 670 - }, - { - "epoch": 0.15789473684210525, - "grad_norm": 0.924267446091397, - "learning_rate": 4.85842705202224e-05, - "loss": 0.8514, - "step": 675 - }, - { - "epoch": 0.15906432748538013, - "grad_norm": 0.8932378986259512, - "learning_rate": 4.8553729082599795e-05, - "loss": 0.8585, - "step": 680 - }, - { - "epoch": 0.16023391812865498, - "grad_norm": 0.9151852425271583, - "learning_rate": 4.852287265099e-05, - "loss": 0.8616, - "step": 685 - }, - { - "epoch": 0.16140350877192983, - "grad_norm": 0.9255147269629532, - "learning_rate": 4.849170168704948e-05, - "loss": 0.8405, - "step": 690 - }, - { - "epoch": 0.16257309941520467, - "grad_norm": 0.9039561346670378, - "learning_rate": 4.8460216657140586e-05, - "loss": 0.8516, - "step": 695 - }, - { - "epoch": 0.16374269005847952, - "grad_norm": 0.8504653507531936, - "learning_rate": 4.84284180323245e-05, - "loss": 0.8455, - "step": 700 - }, - { - "epoch": 0.1649122807017544, - "grad_norm": 0.8953131531650742, - "learning_rate": 4.8396306288354294e-05, - "loss": 0.8464, - "step": 705 - }, - { - "epoch": 0.16608187134502925, - "grad_norm": 0.907830157602171, - "learning_rate": 4.83638819056677e-05, - "loss": 0.858, - "step": 710 - }, - { - "epoch": 0.1672514619883041, - "grad_norm": 0.9199662102205977, - "learning_rate": 4.833114536938e-05, - "loss": 0.8461, - "step": 715 - }, - { - "epoch": 0.16842105263157894, - "grad_norm": 0.9459462552675176, - "learning_rate": 4.829809716927674e-05, - "loss": 0.8387, - "step": 720 - }, - { - "epoch": 0.1695906432748538, - "grad_norm": 0.9000637268586222, - "learning_rate": 4.8264737799806395e-05, - "loss": 0.8392, - "step": 725 - }, - { - "epoch": 0.17076023391812867, - "grad_norm": 0.8933049571672156, - "learning_rate": 4.823106776007298e-05, - "loss": 0.8467, - "step": 730 - }, - { - "epoch": 0.17192982456140352, - "grad_norm": 0.8789216469954654, - "learning_rate": 4.819708755382858e-05, - "loss": 0.8514, - "step": 735 - }, - { - "epoch": 0.17309941520467836, - "grad_norm": 0.9596321090665773, - "learning_rate": 4.816279768946584e-05, - "loss": 0.8413, - "step": 740 - }, - { - "epoch": 0.1742690058479532, - "grad_norm": 0.9644057901284268, - "learning_rate": 4.8128198680010314e-05, - "loss": 0.8516, - "step": 745 - }, - { - "epoch": 0.17543859649122806, - "grad_norm": 1.0517939442567537, - "learning_rate": 4.8093291043112796e-05, - "loss": 0.8624, - "step": 750 - }, - { - "epoch": 0.17660818713450294, - "grad_norm": 0.890785305956098, - "learning_rate": 4.8058075301041627e-05, - "loss": 0.8303, - "step": 755 - }, - { - "epoch": 0.17777777777777778, - "grad_norm": 0.8003592875071304, - "learning_rate": 4.802255198067482e-05, - "loss": 0.8363, - "step": 760 - }, - { - "epoch": 0.17894736842105263, - "grad_norm": 0.9862618280417637, - "learning_rate": 4.7986721613492184e-05, - "loss": 0.8293, - "step": 765 - }, - { - "epoch": 0.18011695906432748, - "grad_norm": 0.9484996630364185, - "learning_rate": 4.795058473556744e-05, - "loss": 0.8274, - "step": 770 - }, - { - "epoch": 0.18128654970760233, - "grad_norm": 0.8735616854937872, - "learning_rate": 4.791414188756009e-05, - "loss": 0.8492, - "step": 775 - }, - { - "epoch": 0.1824561403508772, - "grad_norm": 0.752039927949095, - "learning_rate": 4.787739361470743e-05, - "loss": 0.8576, - "step": 780 - }, - { - "epoch": 0.18362573099415205, - "grad_norm": 3.062282513190117, - "learning_rate": 4.7840340466816316e-05, - "loss": 0.8292, - "step": 785 - }, - { - "epoch": 0.1847953216374269, - "grad_norm": 0.8316980304126362, - "learning_rate": 4.780298299825503e-05, - "loss": 0.8351, - "step": 790 - }, - { - "epoch": 0.18596491228070175, - "grad_norm": 1.5278125736107897, - "learning_rate": 4.776532176794485e-05, - "loss": 0.8375, - "step": 795 - }, - { - "epoch": 0.1871345029239766, - "grad_norm": 0.8681275682341193, - "learning_rate": 4.7727357339351806e-05, - "loss": 0.8411, - "step": 800 - }, - { - "epoch": 0.18830409356725147, - "grad_norm": 0.9663714659586506, - "learning_rate": 4.768909028047823e-05, - "loss": 0.8427, - "step": 805 - }, - { - "epoch": 0.18947368421052632, - "grad_norm": 0.8699869778471631, - "learning_rate": 4.7650521163854205e-05, - "loss": 0.8448, - "step": 810 - }, - { - "epoch": 0.19064327485380117, - "grad_norm": 0.86171284732001, - "learning_rate": 4.761165056652903e-05, - "loss": 0.8372, - "step": 815 - }, - { - "epoch": 0.19181286549707602, - "grad_norm": 0.8957979278563446, - "learning_rate": 4.7572479070062616e-05, - "loss": 0.8417, - "step": 820 - }, - { - "epoch": 0.19298245614035087, - "grad_norm": 0.8114691833201496, - "learning_rate": 4.753300726051671e-05, - "loss": 0.8185, - "step": 825 - }, - { - "epoch": 0.19415204678362574, - "grad_norm": 0.8454664849785094, - "learning_rate": 4.7493235728446244e-05, - "loss": 0.8365, - "step": 830 - }, - { - "epoch": 0.1953216374269006, - "grad_norm": 0.9905108822443424, - "learning_rate": 4.745316506889035e-05, - "loss": 0.8457, - "step": 835 - }, - { - "epoch": 0.19649122807017544, - "grad_norm": 1.061405796747107, - "learning_rate": 4.74127958813636e-05, - "loss": 0.8406, - "step": 840 - }, - { - "epoch": 0.1976608187134503, - "grad_norm": 0.8473537941451442, - "learning_rate": 4.7372128769846924e-05, - "loss": 0.8338, - "step": 845 - }, - { - "epoch": 0.19883040935672514, - "grad_norm": 1.1466583458521185, - "learning_rate": 4.733116434277866e-05, - "loss": 0.8258, - "step": 850 - }, - { - "epoch": 0.2, - "grad_norm": 0.7785799226749461, - "learning_rate": 4.7289903213045386e-05, - "loss": 0.8289, - "step": 855 - }, - { - "epoch": 0.20116959064327486, - "grad_norm": 0.8590755241548019, - "learning_rate": 4.7248345997972805e-05, - "loss": 0.8117, - "step": 860 - }, - { - "epoch": 0.2023391812865497, - "grad_norm": 0.7855870003132649, - "learning_rate": 4.720649331931645e-05, - "loss": 0.8202, - "step": 865 - }, - { - "epoch": 0.20350877192982456, - "grad_norm": 1.001202081783778, - "learning_rate": 4.716434580325243e-05, - "loss": 0.8216, - "step": 870 - }, - { - "epoch": 0.2046783625730994, - "grad_norm": 0.827480178465293, - "learning_rate": 4.712190408036805e-05, - "loss": 0.8262, - "step": 875 - }, - { - "epoch": 0.20584795321637428, - "grad_norm": 0.846609703440099, - "learning_rate": 4.7079168785652367e-05, - "loss": 0.8344, - "step": 880 - }, - { - "epoch": 0.20701754385964913, - "grad_norm": 0.9219532959757525, - "learning_rate": 4.703614055848668e-05, - "loss": 0.8172, - "step": 885 - }, - { - "epoch": 0.20818713450292398, - "grad_norm": 0.7637828460875458, - "learning_rate": 4.699282004263499e-05, - "loss": 0.8388, - "step": 890 - }, - { - "epoch": 0.20935672514619882, - "grad_norm": 0.7482566919726855, - "learning_rate": 4.6949207886234364e-05, - "loss": 0.8186, - "step": 895 - }, - { - "epoch": 0.21052631578947367, - "grad_norm": 0.925006863117072, - "learning_rate": 4.690530474178522e-05, - "loss": 0.8272, - "step": 900 - }, - { - "epoch": 0.21169590643274855, - "grad_norm": 0.7993467537605942, - "learning_rate": 4.686111126614156e-05, - "loss": 0.8197, - "step": 905 - }, - { - "epoch": 0.2128654970760234, - "grad_norm": 0.8829285193487382, - "learning_rate": 4.681662812050118e-05, - "loss": 0.8193, - "step": 910 - }, - { - "epoch": 0.21403508771929824, - "grad_norm": 0.9311623970289097, - "learning_rate": 4.6771855970395756e-05, - "loss": 0.827, - "step": 915 - }, - { - "epoch": 0.2152046783625731, - "grad_norm": 0.8106256897838203, - "learning_rate": 4.6726795485680866e-05, - "loss": 0.8165, - "step": 920 - }, - { - "epoch": 0.21637426900584794, - "grad_norm": 1.0733559910224837, - "learning_rate": 4.6681447340526e-05, - "loss": 0.8253, - "step": 925 - }, - { - "epoch": 0.21754385964912282, - "grad_norm": 0.8454361163690003, - "learning_rate": 4.663581221340445e-05, - "loss": 0.8181, - "step": 930 - }, - { - "epoch": 0.21871345029239767, - "grad_norm": 0.9381694065110877, - "learning_rate": 4.65898907870832e-05, - "loss": 0.8285, - "step": 935 - }, - { - "epoch": 0.2198830409356725, - "grad_norm": 0.9095410632213895, - "learning_rate": 4.654368374861264e-05, - "loss": 0.8294, - "step": 940 - }, - { - "epoch": 0.22105263157894736, - "grad_norm": 0.8991983031636669, - "learning_rate": 4.649719178931634e-05, - "loss": 0.8219, - "step": 945 - }, - { - "epoch": 0.2222222222222222, - "grad_norm": 0.8640964269521182, - "learning_rate": 4.645041560478073e-05, - "loss": 0.8182, - "step": 950 - }, - { - "epoch": 0.22339181286549709, - "grad_norm": 0.7717008178376892, - "learning_rate": 4.6403355894844603e-05, - "loss": 0.828, - "step": 955 - }, - { - "epoch": 0.22456140350877193, - "grad_norm": 0.7548344263953722, - "learning_rate": 4.635601336358873e-05, - "loss": 0.8118, - "step": 960 - }, - { - "epoch": 0.22573099415204678, - "grad_norm": 0.8125417926022064, - "learning_rate": 4.630838871932529e-05, - "loss": 0.8173, - "step": 965 - }, - { - "epoch": 0.22690058479532163, - "grad_norm": 0.8562232744379992, - "learning_rate": 4.626048267458727e-05, - "loss": 0.8018, - "step": 970 - }, - { - "epoch": 0.22807017543859648, - "grad_norm": 0.7282902246521552, - "learning_rate": 4.621229594611783e-05, - "loss": 0.8095, - "step": 975 - }, - { - "epoch": 0.22923976608187135, - "grad_norm": 0.8873506288089448, - "learning_rate": 4.616382925485953e-05, - "loss": 0.8132, - "step": 980 - }, - { - "epoch": 0.2304093567251462, - "grad_norm": 0.8104153922565714, - "learning_rate": 4.6115083325943606e-05, - "loss": 0.8158, - "step": 985 - }, - { - "epoch": 0.23157894736842105, - "grad_norm": 0.799851963773728, - "learning_rate": 4.606605888867908e-05, - "loss": 0.7857, - "step": 990 - }, - { - "epoch": 0.2327485380116959, - "grad_norm": 0.7738461797934784, - "learning_rate": 4.6016756676541847e-05, - "loss": 0.8218, - "step": 995 - }, - { - "epoch": 0.23391812865497075, - "grad_norm": 0.8576192649795367, - "learning_rate": 4.596717742716372e-05, - "loss": 0.8175, - "step": 1000 - }, - { - "epoch": 0.23508771929824562, - "grad_norm": 0.923961104746636, - "learning_rate": 4.5917321882321396e-05, - "loss": 0.8081, - "step": 1005 - }, - { - "epoch": 0.23625730994152047, - "grad_norm": 0.8893668795457819, - "learning_rate": 4.5867190787925334e-05, - "loss": 0.8058, - "step": 1010 - }, - { - "epoch": 0.23742690058479532, - "grad_norm": 0.8897258963021679, - "learning_rate": 4.5816784894008616e-05, - "loss": 0.825, - "step": 1015 - }, - { - "epoch": 0.23859649122807017, - "grad_norm": 0.9015776574616987, - "learning_rate": 4.576610495471573e-05, - "loss": 0.7981, - "step": 1020 - }, - { - "epoch": 0.23976608187134502, - "grad_norm": 0.8415340622158813, - "learning_rate": 4.571515172829125e-05, - "loss": 0.8081, - "step": 1025 - }, - { - "epoch": 0.2409356725146199, - "grad_norm": 0.8534832428040439, - "learning_rate": 4.5663925977068534e-05, - "loss": 0.8052, - "step": 1030 - }, - { - "epoch": 0.24210526315789474, - "grad_norm": 0.7671642203239843, - "learning_rate": 4.561242846745831e-05, - "loss": 0.8083, - "step": 1035 - }, - { - "epoch": 0.2432748538011696, - "grad_norm": 0.7947438206908344, - "learning_rate": 4.556065996993718e-05, - "loss": 0.8094, - "step": 1040 - }, - { - "epoch": 0.24444444444444444, - "grad_norm": 0.7944046005545552, - "learning_rate": 4.550862125903613e-05, - "loss": 0.7997, - "step": 1045 - }, - { - "epoch": 0.24561403508771928, - "grad_norm": 0.7962491651355799, - "learning_rate": 4.5456313113328925e-05, - "loss": 0.8055, - "step": 1050 - }, - { - "epoch": 0.24678362573099416, - "grad_norm": 0.7846208889540649, - "learning_rate": 4.540373631542045e-05, - "loss": 0.8017, - "step": 1055 - }, - { - "epoch": 0.247953216374269, - "grad_norm": 0.9255444869098531, - "learning_rate": 4.5350891651935024e-05, - "loss": 0.7956, - "step": 1060 - }, - { - "epoch": 0.24912280701754386, - "grad_norm": 0.8264644211740068, - "learning_rate": 4.529777991350462e-05, - "loss": 0.787, - "step": 1065 - }, - { - "epoch": 0.25029239766081873, - "grad_norm": 0.8612407756813709, - "learning_rate": 4.524440189475702e-05, - "loss": 0.804, - "step": 1070 - }, - { - "epoch": 0.25146198830409355, - "grad_norm": 0.9201193312825702, - "learning_rate": 4.519075839430395e-05, - "loss": 0.8139, - "step": 1075 - }, - { - "epoch": 0.25263157894736843, - "grad_norm": 0.8434760312094466, - "learning_rate": 4.513685021472913e-05, - "loss": 0.8084, - "step": 1080 - }, - { - "epoch": 0.25380116959064325, - "grad_norm": 0.8097744734009323, - "learning_rate": 4.5082678162576266e-05, - "loss": 0.792, - "step": 1085 - }, - { - "epoch": 0.2549707602339181, - "grad_norm": 0.8258410598348814, - "learning_rate": 4.502824304833694e-05, - "loss": 0.8018, - "step": 1090 - }, - { - "epoch": 0.256140350877193, - "grad_norm": 0.8898423105962325, - "learning_rate": 4.497354568643856e-05, - "loss": 0.801, - "step": 1095 - }, - { - "epoch": 0.2573099415204678, - "grad_norm": 0.7682588506389413, - "learning_rate": 4.491858689523212e-05, - "loss": 0.7997, - "step": 1100 - }, - { - "epoch": 0.2584795321637427, - "grad_norm": 0.8436536091119384, - "learning_rate": 4.486336749697996e-05, - "loss": 0.7958, - "step": 1105 - }, - { - "epoch": 0.2596491228070175, - "grad_norm": 0.9134657570403645, - "learning_rate": 4.48078883178435e-05, - "loss": 0.7979, - "step": 1110 - }, - { - "epoch": 0.2608187134502924, - "grad_norm": 0.9063533205659801, - "learning_rate": 4.4752150187870835e-05, - "loss": 0.8046, - "step": 1115 - }, - { - "epoch": 0.26198830409356727, - "grad_norm": 0.9728687505767328, - "learning_rate": 4.4696153940984336e-05, - "loss": 0.8005, - "step": 1120 - }, - { - "epoch": 0.2631578947368421, - "grad_norm": 0.8465165107087465, - "learning_rate": 4.463990041496819e-05, - "loss": 0.7928, - "step": 1125 - }, - { - "epoch": 0.26432748538011697, - "grad_norm": 0.8703041775291581, - "learning_rate": 4.4583390451455825e-05, - "loss": 0.7993, - "step": 1130 - }, - { - "epoch": 0.2654970760233918, - "grad_norm": 0.8930384455063077, - "learning_rate": 4.4526624895917374e-05, - "loss": 0.8187, - "step": 1135 - }, - { - "epoch": 0.26666666666666666, - "grad_norm": 0.9533325500126925, - "learning_rate": 4.4469604597646955e-05, - "loss": 0.7915, - "step": 1140 - }, - { - "epoch": 0.26783625730994154, - "grad_norm": 0.849949068893564, - "learning_rate": 4.441233040975003e-05, - "loss": 0.8084, - "step": 1145 - }, - { - "epoch": 0.26900584795321636, - "grad_norm": 0.8296019860693888, - "learning_rate": 4.435480318913061e-05, - "loss": 0.8099, - "step": 1150 - }, - { - "epoch": 0.27017543859649124, - "grad_norm": 0.8897148979616573, - "learning_rate": 4.429702379647842e-05, - "loss": 0.8076, - "step": 1155 - }, - { - "epoch": 0.27134502923976606, - "grad_norm": 1.015942373286734, - "learning_rate": 4.4238993096256074e-05, - "loss": 0.7987, - "step": 1160 - }, - { - "epoch": 0.27251461988304093, - "grad_norm": 0.9170669198858242, - "learning_rate": 4.418071195668607e-05, - "loss": 0.8026, - "step": 1165 - }, - { - "epoch": 0.2736842105263158, - "grad_norm": 0.8544721449457394, - "learning_rate": 4.412218124973787e-05, - "loss": 0.8037, - "step": 1170 - }, - { - "epoch": 0.27485380116959063, - "grad_norm": 0.8109007540099256, - "learning_rate": 4.40634018511148e-05, - "loss": 0.7853, - "step": 1175 - }, - { - "epoch": 0.2760233918128655, - "grad_norm": 0.784688745908909, - "learning_rate": 4.4004374640240984e-05, - "loss": 0.7829, - "step": 1180 - }, - { - "epoch": 0.2771929824561403, - "grad_norm": 0.8426675242380559, - "learning_rate": 4.394510050024816e-05, - "loss": 0.7865, - "step": 1185 - }, - { - "epoch": 0.2783625730994152, - "grad_norm": 0.7417145435611461, - "learning_rate": 4.388558031796249e-05, - "loss": 0.7912, - "step": 1190 - }, - { - "epoch": 0.2795321637426901, - "grad_norm": 0.6946069237835871, - "learning_rate": 4.382581498389129e-05, - "loss": 0.7889, - "step": 1195 - }, - { - "epoch": 0.2807017543859649, - "grad_norm": 0.830664235233533, - "learning_rate": 4.376580539220967e-05, - "loss": 0.8007, - "step": 1200 - }, - { - "epoch": 0.2818713450292398, - "grad_norm": 4.044757818989429, - "learning_rate": 4.370555244074721e-05, - "loss": 0.803, - "step": 1205 - }, - { - "epoch": 0.2830409356725146, - "grad_norm": 0.9700515861829565, - "learning_rate": 4.364505703097449e-05, - "loss": 0.8076, - "step": 1210 - }, - { - "epoch": 0.28421052631578947, - "grad_norm": 0.8624986906785574, - "learning_rate": 4.358432006798962e-05, - "loss": 0.8062, - "step": 1215 - }, - { - "epoch": 0.28538011695906434, - "grad_norm": 0.8400857543523832, - "learning_rate": 4.352334246050468e-05, - "loss": 0.7977, - "step": 1220 - }, - { - "epoch": 0.28654970760233917, - "grad_norm": 0.7507912757298415, - "learning_rate": 4.346212512083216e-05, - "loss": 0.7961, - "step": 1225 - }, - { - "epoch": 0.28771929824561404, - "grad_norm": 0.8662690316314504, - "learning_rate": 4.3400668964871255e-05, - "loss": 0.8004, - "step": 1230 - }, - { - "epoch": 0.28888888888888886, - "grad_norm": 0.7711462814621782, - "learning_rate": 4.333897491209424e-05, - "loss": 0.7935, - "step": 1235 - }, - { - "epoch": 0.29005847953216374, - "grad_norm": 0.814472115106665, - "learning_rate": 4.327704388553262e-05, - "loss": 0.7875, - "step": 1240 - }, - { - "epoch": 0.2912280701754386, - "grad_norm": 0.7777758368706338, - "learning_rate": 4.321487681176338e-05, - "loss": 0.7767, - "step": 1245 - }, - { - "epoch": 0.29239766081871343, - "grad_norm": 3.7426108288257525, - "learning_rate": 4.315247462089514e-05, - "loss": 0.7869, - "step": 1250 - }, - { - "epoch": 0.2935672514619883, - "grad_norm": 0.8796776373301245, - "learning_rate": 4.308983824655418e-05, - "loss": 0.7885, - "step": 1255 - }, - { - "epoch": 0.29473684210526313, - "grad_norm": 0.8152628960370694, - "learning_rate": 4.30269686258705e-05, - "loss": 0.7793, - "step": 1260 - }, - { - "epoch": 0.295906432748538, - "grad_norm": 0.8531873491227141, - "learning_rate": 4.296386669946382e-05, - "loss": 0.7908, - "step": 1265 - }, - { - "epoch": 0.2970760233918129, - "grad_norm": 0.8766898462050617, - "learning_rate": 4.290053341142945e-05, - "loss": 0.7912, - "step": 1270 - }, - { - "epoch": 0.2982456140350877, - "grad_norm": 0.9437011508912876, - "learning_rate": 4.283696970932426e-05, - "loss": 0.7937, - "step": 1275 - }, - { - "epoch": 0.2994152046783626, - "grad_norm": 0.760727957693321, - "learning_rate": 4.27731765441524e-05, - "loss": 0.7803, - "step": 1280 - }, - { - "epoch": 0.30058479532163745, - "grad_norm": 0.8030072219606168, - "learning_rate": 4.27091548703511e-05, - "loss": 0.7789, - "step": 1285 - }, - { - "epoch": 0.3017543859649123, - "grad_norm": 0.7394799267095162, - "learning_rate": 4.264490564577647e-05, - "loss": 0.7743, - "step": 1290 - }, - { - "epoch": 0.30292397660818715, - "grad_norm": 0.8034110876662692, - "learning_rate": 4.258042983168906e-05, - "loss": 0.7964, - "step": 1295 - }, - { - "epoch": 0.30409356725146197, - "grad_norm": 0.8836489925526694, - "learning_rate": 4.251572839273953e-05, - "loss": 0.7843, - "step": 1300 - }, - { - "epoch": 0.30526315789473685, - "grad_norm": 0.8099715904918315, - "learning_rate": 4.245080229695422e-05, - "loss": 0.7765, - "step": 1305 - }, - { - "epoch": 0.3064327485380117, - "grad_norm": 0.7211770553327171, - "learning_rate": 4.238565251572065e-05, - "loss": 0.7777, - "step": 1310 - }, - { - "epoch": 0.30760233918128654, - "grad_norm": 0.8430080190315516, - "learning_rate": 4.2320280023773004e-05, - "loss": 0.792, - "step": 1315 - }, - { - "epoch": 0.3087719298245614, - "grad_norm": 0.7529220584999547, - "learning_rate": 4.225468579917755e-05, - "loss": 0.772, - "step": 1320 - }, - { - "epoch": 0.30994152046783624, - "grad_norm": 0.8246451276524118, - "learning_rate": 4.218887082331795e-05, - "loss": 0.7953, - "step": 1325 - }, - { - "epoch": 0.3111111111111111, - "grad_norm": 0.7572320411115961, - "learning_rate": 4.2122836080880656e-05, - "loss": 0.782, - "step": 1330 - }, - { - "epoch": 0.312280701754386, - "grad_norm": 0.8474595360345947, - "learning_rate": 4.2056582559840156e-05, - "loss": 0.7904, - "step": 1335 - }, - { - "epoch": 0.3134502923976608, - "grad_norm": 0.939458085785471, - "learning_rate": 4.199011125144414e-05, - "loss": 0.7763, - "step": 1340 - }, - { - "epoch": 0.3146198830409357, - "grad_norm": 0.9781432479632555, - "learning_rate": 4.192342315019875e-05, - "loss": 0.7824, - "step": 1345 - }, - { - "epoch": 0.3157894736842105, - "grad_norm": 0.7908263498688014, - "learning_rate": 4.185651925385361e-05, - "loss": 0.78, - "step": 1350 - }, - { - "epoch": 0.3169590643274854, - "grad_norm": 0.8281872357852522, - "learning_rate": 4.1789400563387014e-05, - "loss": 0.7786, - "step": 1355 - }, - { - "epoch": 0.31812865497076026, - "grad_norm": 0.7780377594605055, - "learning_rate": 4.172206808299082e-05, - "loss": 0.7729, - "step": 1360 - }, - { - "epoch": 0.3192982456140351, - "grad_norm": 0.994616363239153, - "learning_rate": 4.1654522820055543e-05, - "loss": 0.7804, - "step": 1365 - }, - { - "epoch": 0.32046783625730996, - "grad_norm": 0.7754504675615862, - "learning_rate": 4.158676578515518e-05, - "loss": 0.7753, - "step": 1370 - }, - { - "epoch": 0.3216374269005848, - "grad_norm": 0.7599612940476631, - "learning_rate": 4.1518797992032186e-05, - "loss": 0.773, - "step": 1375 - }, - { - "epoch": 0.32280701754385965, - "grad_norm": 0.7752784188686452, - "learning_rate": 4.145062045758223e-05, - "loss": 0.7763, - "step": 1380 - }, - { - "epoch": 0.32397660818713453, - "grad_norm": 0.9495320531498634, - "learning_rate": 4.138223420183902e-05, - "loss": 0.7654, - "step": 1385 - }, - { - "epoch": 0.32514619883040935, - "grad_norm": 0.7827188590394395, - "learning_rate": 4.1313640247959056e-05, - "loss": 0.7718, - "step": 1390 - }, - { - "epoch": 0.3263157894736842, - "grad_norm": 0.7318405468594709, - "learning_rate": 4.124483962220627e-05, - "loss": 0.77, - "step": 1395 - }, - { - "epoch": 0.32748538011695905, - "grad_norm": 0.763847628926599, - "learning_rate": 4.11758333539367e-05, - "loss": 0.7821, - "step": 1400 - }, - { - "epoch": 0.3286549707602339, - "grad_norm": 0.7009721668324501, - "learning_rate": 4.1106622475583125e-05, - "loss": 0.7696, - "step": 1405 - }, - { - "epoch": 0.3298245614035088, - "grad_norm": 0.7615443102851462, - "learning_rate": 4.1037208022639553e-05, - "loss": 0.7652, - "step": 1410 - }, - { - "epoch": 0.3309941520467836, - "grad_norm": 0.7973051424443336, - "learning_rate": 4.0967591033645774e-05, - "loss": 0.7804, - "step": 1415 - }, - { - "epoch": 0.3321637426900585, - "grad_norm": 0.7265512805457267, - "learning_rate": 4.08977725501718e-05, - "loss": 0.7744, - "step": 1420 - }, - { - "epoch": 0.3333333333333333, - "grad_norm": 0.7572181228671228, - "learning_rate": 4.08277536168023e-05, - "loss": 0.7692, - "step": 1425 - }, - { - "epoch": 0.3345029239766082, - "grad_norm": 0.750546140392922, - "learning_rate": 4.075753528112095e-05, - "loss": 0.7731, - "step": 1430 - }, - { - "epoch": 0.33567251461988307, - "grad_norm": 0.707104320903681, - "learning_rate": 4.068711859369478e-05, - "loss": 0.7584, - "step": 1435 - }, - { - "epoch": 0.3368421052631579, - "grad_norm": 0.8018923237653571, - "learning_rate": 4.061650460805843e-05, - "loss": 0.7801, - "step": 1440 - }, - { - "epoch": 0.33801169590643276, - "grad_norm": 0.7830790535393684, - "learning_rate": 4.054569438069843e-05, - "loss": 0.7665, - "step": 1445 - }, - { - "epoch": 0.3391812865497076, - "grad_norm": 0.8337132040033033, - "learning_rate": 4.047468897103734e-05, - "loss": 0.7676, - "step": 1450 - }, - { - "epoch": 0.34035087719298246, - "grad_norm": 0.7352284392082878, - "learning_rate": 4.040348944141795e-05, - "loss": 0.7859, - "step": 1455 - }, - { - "epoch": 0.34152046783625734, - "grad_norm": 0.7819042171394409, - "learning_rate": 4.0332096857087346e-05, - "loss": 0.7802, - "step": 1460 - }, - { - "epoch": 0.34269005847953216, - "grad_norm": 0.6839132320063686, - "learning_rate": 4.026051228618101e-05, - "loss": 0.7556, - "step": 1465 - }, - { - "epoch": 0.34385964912280703, - "grad_norm": 0.7721614295992741, - "learning_rate": 4.018873679970679e-05, - "loss": 0.7572, - "step": 1470 - }, - { - "epoch": 0.34502923976608185, - "grad_norm": 0.7881351916456097, - "learning_rate": 4.0116771471528946e-05, - "loss": 0.7836, - "step": 1475 - }, - { - "epoch": 0.34619883040935673, - "grad_norm": 17.824492025260795, - "learning_rate": 4.004461737835199e-05, - "loss": 0.8288, - "step": 1480 - }, - { - "epoch": 0.3473684210526316, - "grad_norm": 0.9522727006560597, - "learning_rate": 3.9972275599704675e-05, - "loss": 0.7831, - "step": 1485 - }, - { - "epoch": 0.3485380116959064, - "grad_norm": 1.0123330953920058, - "learning_rate": 3.989974721792376e-05, - "loss": 0.7834, - "step": 1490 - }, - { - "epoch": 0.3497076023391813, - "grad_norm": 0.9256023414663554, - "learning_rate": 3.982703331813789e-05, - "loss": 0.7629, - "step": 1495 - }, - { - "epoch": 0.3508771929824561, - "grad_norm": 0.8885054731766725, - "learning_rate": 3.97541349882513e-05, - "loss": 0.7872, - "step": 1500 - }, - { - "epoch": 0.352046783625731, - "grad_norm": 0.895363232481551, - "learning_rate": 3.9681053318927576e-05, - "loss": 0.7787, - "step": 1505 - }, - { - "epoch": 0.3532163742690059, - "grad_norm": 0.7774622671008159, - "learning_rate": 3.960778940357332e-05, - "loss": 0.7595, - "step": 1510 - }, - { - "epoch": 0.3543859649122807, - "grad_norm": 0.8521171304891902, - "learning_rate": 3.9534344338321804e-05, - "loss": 0.7752, - "step": 1515 - }, - { - "epoch": 0.35555555555555557, - "grad_norm": 0.7186359807168464, - "learning_rate": 3.946071922201654e-05, - "loss": 0.766, - "step": 1520 - }, - { - "epoch": 0.3567251461988304, - "grad_norm": 0.7748704605835699, - "learning_rate": 3.9386915156194896e-05, - "loss": 0.7729, - "step": 1525 - }, - { - "epoch": 0.35789473684210527, - "grad_norm": 0.7519175958314207, - "learning_rate": 3.931293324507157e-05, - "loss": 0.7622, - "step": 1530 - }, - { - "epoch": 0.35906432748538014, - "grad_norm": 0.9436517074298926, - "learning_rate": 3.9238774595522035e-05, - "loss": 0.7733, - "step": 1535 - }, - { - "epoch": 0.36023391812865496, - "grad_norm": 0.8056589850419347, - "learning_rate": 3.9164440317066106e-05, - "loss": 0.7508, - "step": 1540 - }, - { - "epoch": 0.36140350877192984, - "grad_norm": 0.7582238115959192, - "learning_rate": 3.9089931521851196e-05, - "loss": 0.7731, - "step": 1545 - }, - { - "epoch": 0.36257309941520466, - "grad_norm": 0.8651560934781966, - "learning_rate": 3.9015249324635765e-05, - "loss": 0.7841, - "step": 1550 - }, - { - "epoch": 0.36374269005847953, - "grad_norm": 0.7138257240856716, - "learning_rate": 3.89403948427726e-05, - "loss": 0.7738, - "step": 1555 - }, - { - "epoch": 0.3649122807017544, - "grad_norm": 0.7982217557908895, - "learning_rate": 3.8865369196192134e-05, - "loss": 0.7571, - "step": 1560 - }, - { - "epoch": 0.36608187134502923, - "grad_norm": 0.8705834592275034, - "learning_rate": 3.8790173507385664e-05, - "loss": 0.7634, - "step": 1565 - }, - { - "epoch": 0.3672514619883041, - "grad_norm": 0.7921869201643883, - "learning_rate": 3.871480890138854e-05, - "loss": 0.7665, - "step": 1570 - }, - { - "epoch": 0.3684210526315789, - "grad_norm": 0.7999595873888212, - "learning_rate": 3.863927650576339e-05, - "loss": 0.7547, - "step": 1575 - }, - { - "epoch": 0.3695906432748538, - "grad_norm": 0.7509863075089731, - "learning_rate": 3.856357745058318e-05, - "loss": 0.759, - "step": 1580 - }, - { - "epoch": 0.3707602339181287, - "grad_norm": 0.7307808913797633, - "learning_rate": 3.848771286841439e-05, - "loss": 0.7758, - "step": 1585 - }, - { - "epoch": 0.3719298245614035, - "grad_norm": 0.7909239920658141, - "learning_rate": 3.841168389429996e-05, - "loss": 0.7675, - "step": 1590 - }, - { - "epoch": 0.3730994152046784, - "grad_norm": 0.7482298719782716, - "learning_rate": 3.8335491665742405e-05, - "loss": 0.756, - "step": 1595 - }, - { - "epoch": 0.3742690058479532, - "grad_norm": 0.67253397627471, - "learning_rate": 3.825913732268677e-05, - "loss": 0.7717, - "step": 1600 - }, - { - "epoch": 0.37543859649122807, - "grad_norm": 0.7953984210357787, - "learning_rate": 3.818262200750356e-05, - "loss": 0.7666, - "step": 1605 - }, - { - "epoch": 0.37660818713450295, - "grad_norm": 0.7822921314337719, - "learning_rate": 3.810594686497163e-05, - "loss": 0.7709, - "step": 1610 - }, - { - "epoch": 0.37777777777777777, - "grad_norm": 0.6928025407807362, - "learning_rate": 3.8029113042261097e-05, - "loss": 0.7351, - "step": 1615 - }, - { - "epoch": 0.37894736842105264, - "grad_norm": 0.8574680022140106, - "learning_rate": 3.795212168891618e-05, - "loss": 0.7742, - "step": 1620 - }, - { - "epoch": 0.38011695906432746, - "grad_norm": 0.835331112988705, - "learning_rate": 3.787497395683794e-05, - "loss": 0.7625, - "step": 1625 - }, - { - "epoch": 0.38128654970760234, - "grad_norm": 0.8993766649941991, - "learning_rate": 3.779767100026711e-05, - "loss": 0.7649, - "step": 1630 - }, - { - "epoch": 0.3824561403508772, - "grad_norm": 0.8136822440130091, - "learning_rate": 3.772021397576683e-05, - "loss": 0.7564, - "step": 1635 - }, - { - "epoch": 0.38362573099415204, - "grad_norm": 0.8295895321913241, - "learning_rate": 3.764260404220529e-05, - "loss": 0.7745, - "step": 1640 - }, - { - "epoch": 0.3847953216374269, - "grad_norm": 0.772989242303164, - "learning_rate": 3.75648423607384e-05, - "loss": 0.7668, - "step": 1645 - }, - { - "epoch": 0.38596491228070173, - "grad_norm": 0.7401867656024123, - "learning_rate": 3.748693009479248e-05, - "loss": 0.7564, - "step": 1650 - }, - { - "epoch": 0.3871345029239766, - "grad_norm": 0.7597052025519834, - "learning_rate": 3.740886841004678e-05, - "loss": 0.7544, - "step": 1655 - }, - { - "epoch": 0.3883040935672515, - "grad_norm": 0.7177753699525617, - "learning_rate": 3.7330658474416076e-05, - "loss": 0.7442, - "step": 1660 - }, - { - "epoch": 0.3894736842105263, - "grad_norm": 0.7417872144963117, - "learning_rate": 3.725230145803319e-05, - "loss": 0.7633, - "step": 1665 - }, - { - "epoch": 0.3906432748538012, - "grad_norm": 0.6613723653550772, - "learning_rate": 3.7173798533231493e-05, - "loss": 0.7633, - "step": 1670 - }, - { - "epoch": 0.391812865497076, - "grad_norm": 0.6732226054962314, - "learning_rate": 3.709515087452734e-05, - "loss": 0.767, - "step": 1675 - }, - { - "epoch": 0.3929824561403509, - "grad_norm": 0.7377795343279899, - "learning_rate": 3.701635965860252e-05, - "loss": 0.7639, - "step": 1680 - }, - { - "epoch": 0.39415204678362575, - "grad_norm": 0.6914484317276971, - "learning_rate": 3.693742606428666e-05, - "loss": 0.748, - "step": 1685 - }, - { - "epoch": 0.3953216374269006, - "grad_norm": 0.7827509721068225, - "learning_rate": 3.6858351272539524e-05, - "loss": 0.7716, - "step": 1690 - }, - { - "epoch": 0.39649122807017545, - "grad_norm": 0.7243740975471005, - "learning_rate": 3.677913646643346e-05, - "loss": 0.7461, - "step": 1695 - }, - { - "epoch": 0.39766081871345027, - "grad_norm": 0.7271096610255222, - "learning_rate": 3.669978283113557e-05, - "loss": 0.7499, - "step": 1700 - }, - { - "epoch": 0.39883040935672515, - "grad_norm": 0.8171882683333956, - "learning_rate": 3.662029155389007e-05, - "loss": 0.7592, - "step": 1705 - }, - { - "epoch": 0.4, - "grad_norm": 0.729897299282628, - "learning_rate": 3.65406638240005e-05, - "loss": 0.7563, - "step": 1710 - }, - { - "epoch": 0.40116959064327484, - "grad_norm": 0.7843875646778868, - "learning_rate": 3.646090083281191e-05, - "loss": 0.7465, - "step": 1715 - }, - { - "epoch": 0.4023391812865497, - "grad_norm": 0.8337858126407102, - "learning_rate": 3.638100377369308e-05, - "loss": 0.7525, - "step": 1720 - }, - { - "epoch": 0.40350877192982454, - "grad_norm": 0.7746287703227326, - "learning_rate": 3.630097384201859e-05, - "loss": 0.7474, - "step": 1725 - }, - { - "epoch": 0.4046783625730994, - "grad_norm": 0.752728483401707, - "learning_rate": 3.6220812235151054e-05, - "loss": 0.7671, - "step": 1730 - }, - { - "epoch": 0.4058479532163743, - "grad_norm": 0.7384298994892262, - "learning_rate": 3.614052015242307e-05, - "loss": 0.7569, - "step": 1735 - }, - { - "epoch": 0.4070175438596491, - "grad_norm": 0.7301640418761658, - "learning_rate": 3.606009879511937e-05, - "loss": 0.7517, - "step": 1740 - }, - { - "epoch": 0.408187134502924, - "grad_norm": 0.7606719895102059, - "learning_rate": 3.597954936645883e-05, - "loss": 0.7627, - "step": 1745 - }, - { - "epoch": 0.4093567251461988, - "grad_norm": 0.7993038146039282, - "learning_rate": 3.589887307157644e-05, - "loss": 0.762, - "step": 1750 - }, - { - "epoch": 0.4105263157894737, - "grad_norm": 0.8077703903929386, - "learning_rate": 3.5818071117505285e-05, - "loss": 0.7449, - "step": 1755 - }, - { - "epoch": 0.41169590643274856, - "grad_norm": 0.7099492412004633, - "learning_rate": 3.573714471315852e-05, - "loss": 0.7631, - "step": 1760 - }, - { - "epoch": 0.4128654970760234, - "grad_norm": 0.7784601423237745, - "learning_rate": 3.565609506931124e-05, - "loss": 0.7557, - "step": 1765 - }, - { - "epoch": 0.41403508771929826, - "grad_norm": 0.7159896176820426, - "learning_rate": 3.557492339858236e-05, - "loss": 0.7536, - "step": 1770 - }, - { - "epoch": 0.4152046783625731, - "grad_norm": 0.7469578404845566, - "learning_rate": 3.549363091541652e-05, - "loss": 0.7393, - "step": 1775 - }, - { - "epoch": 0.41637426900584795, - "grad_norm": 0.6837755812631964, - "learning_rate": 3.541221883606587e-05, - "loss": 0.7673, - "step": 1780 - }, - { - "epoch": 0.41754385964912283, - "grad_norm": 0.7526930864574983, - "learning_rate": 3.533068837857191e-05, - "loss": 0.7502, - "step": 1785 - }, - { - "epoch": 0.41871345029239765, - "grad_norm": 0.6964791661553479, - "learning_rate": 3.5249040762747216e-05, - "loss": 0.7643, - "step": 1790 - }, - { - "epoch": 0.4198830409356725, - "grad_norm": 0.7176587460315915, - "learning_rate": 3.516727721015725e-05, - "loss": 0.7445, - "step": 1795 - }, - { - "epoch": 0.42105263157894735, - "grad_norm": 0.6949913252110833, - "learning_rate": 3.508539894410204e-05, - "loss": 0.7384, - "step": 1800 - }, - { - "epoch": 0.4222222222222222, - "grad_norm": 0.6879985495866497, - "learning_rate": 3.500340718959789e-05, - "loss": 0.7474, - "step": 1805 - }, - { - "epoch": 0.4233918128654971, - "grad_norm": 0.7255729290182649, - "learning_rate": 3.492130317335908e-05, - "loss": 0.7642, - "step": 1810 - }, - { - "epoch": 0.4245614035087719, - "grad_norm": 0.7320558645901049, - "learning_rate": 3.483908812377944e-05, - "loss": 0.7485, - "step": 1815 - }, - { - "epoch": 0.4257309941520468, - "grad_norm": 0.7458875901771815, - "learning_rate": 3.475676327091405e-05, - "loss": 0.7486, - "step": 1820 - }, - { - "epoch": 0.4269005847953216, - "grad_norm": 0.8578208276395972, - "learning_rate": 3.46743298464608e-05, - "loss": 0.7495, - "step": 1825 - }, - { - "epoch": 0.4280701754385965, - "grad_norm": 0.7859910962069061, - "learning_rate": 3.459178908374198e-05, - "loss": 0.7513, - "step": 1830 - }, - { - "epoch": 0.42923976608187137, - "grad_norm": 0.6911557004560248, - "learning_rate": 3.450914221768577e-05, - "loss": 0.7367, - "step": 1835 - }, - { - "epoch": 0.4304093567251462, - "grad_norm": 0.7456771712242715, - "learning_rate": 3.442639048480786e-05, - "loss": 0.7529, - "step": 1840 - }, - { - "epoch": 0.43157894736842106, - "grad_norm": 0.6893752794828176, - "learning_rate": 3.434353512319287e-05, - "loss": 0.7534, - "step": 1845 - }, - { - "epoch": 0.4327485380116959, - "grad_norm": 0.715785646529028, - "learning_rate": 3.426057737247585e-05, - "loss": 0.7528, - "step": 1850 - }, - { - "epoch": 0.43391812865497076, - "grad_norm": 0.8076534821677026, - "learning_rate": 3.4177518473823765e-05, - "loss": 0.7629, - "step": 1855 - }, - { - "epoch": 0.43508771929824563, - "grad_norm": 0.7529189699100864, - "learning_rate": 3.409435966991687e-05, - "loss": 0.7558, - "step": 1860 - }, - { - "epoch": 0.43625730994152045, - "grad_norm": 0.7513403291393522, - "learning_rate": 3.4011102204930164e-05, - "loss": 0.7406, - "step": 1865 - }, - { - "epoch": 0.43742690058479533, - "grad_norm": 0.7777527987715089, - "learning_rate": 3.392774732451474e-05, - "loss": 0.7491, - "step": 1870 - }, - { - "epoch": 0.43859649122807015, - "grad_norm": 0.743482516351825, - "learning_rate": 3.384429627577919e-05, - "loss": 0.73, - "step": 1875 - }, - { - "epoch": 0.439766081871345, - "grad_norm": 0.7648732013556975, - "learning_rate": 3.3760750307270885e-05, - "loss": 0.7582, - "step": 1880 - }, - { - "epoch": 0.4409356725146199, - "grad_norm": 3.3134636103773465, - "learning_rate": 3.367711066895737e-05, - "loss": 0.7523, - "step": 1885 - }, - { - "epoch": 0.4421052631578947, - "grad_norm": 0.7590057881358742, - "learning_rate": 3.359337861220762e-05, - "loss": 0.7581, - "step": 1890 - }, - { - "epoch": 0.4432748538011696, - "grad_norm": 0.9364115069402698, - "learning_rate": 3.3509555389773295e-05, - "loss": 0.7489, - "step": 1895 - }, - { - "epoch": 0.4444444444444444, - "grad_norm": 0.7556042031825977, - "learning_rate": 3.3425642255770044e-05, - "loss": 0.7386, - "step": 1900 - }, - { - "epoch": 0.4456140350877193, - "grad_norm": 0.7613584930553752, - "learning_rate": 3.334164046565873e-05, - "loss": 0.7366, - "step": 1905 - }, - { - "epoch": 0.44678362573099417, - "grad_norm": 0.7369071648445169, - "learning_rate": 3.3257551276226617e-05, - "loss": 0.733, - "step": 1910 - }, - { - "epoch": 0.447953216374269, - "grad_norm": 0.801402699988636, - "learning_rate": 3.31733759455686e-05, - "loss": 0.7411, - "step": 1915 - }, - { - "epoch": 0.44912280701754387, - "grad_norm": 0.7238254574287333, - "learning_rate": 3.308911573306837e-05, - "loss": 0.7387, - "step": 1920 - }, - { - "epoch": 0.4502923976608187, - "grad_norm": 0.6952586214256408, - "learning_rate": 3.300477189937958e-05, - "loss": 0.7366, - "step": 1925 - }, - { - "epoch": 0.45146198830409356, - "grad_norm": 0.6960451113389217, - "learning_rate": 3.292034570640695e-05, - "loss": 0.7514, - "step": 1930 - }, - { - "epoch": 0.45263157894736844, - "grad_norm": 0.6934571085329695, - "learning_rate": 3.2835838417287446e-05, - "loss": 0.7409, - "step": 1935 - }, - { - "epoch": 0.45380116959064326, - "grad_norm": 0.7794450187246763, - "learning_rate": 3.2751251296371325e-05, - "loss": 0.7309, - "step": 1940 - }, - { - "epoch": 0.45497076023391814, - "grad_norm": 0.7783530977017811, - "learning_rate": 3.266658560920326e-05, - "loss": 0.7363, - "step": 1945 - }, - { - "epoch": 0.45614035087719296, - "grad_norm": 0.7530850469408922, - "learning_rate": 3.2581842622503366e-05, - "loss": 0.7474, - "step": 1950 - }, - { - "epoch": 0.45730994152046783, - "grad_norm": 0.7293168415552727, - "learning_rate": 3.249702360414829e-05, - "loss": 0.742, - "step": 1955 - }, - { - "epoch": 0.4584795321637427, - "grad_norm": 0.7976185776852677, - "learning_rate": 3.24121298231522e-05, - "loss": 0.7252, - "step": 1960 - }, - { - "epoch": 0.45964912280701753, - "grad_norm": 0.7634063554800602, - "learning_rate": 3.232716254964785e-05, - "loss": 0.7505, - "step": 1965 - }, - { - "epoch": 0.4608187134502924, - "grad_norm": 0.7865100028320574, - "learning_rate": 3.224212305486753e-05, - "loss": 0.7541, - "step": 1970 - }, - { - "epoch": 0.4619883040935672, - "grad_norm": 0.7166973372102381, - "learning_rate": 3.215701261112406e-05, - "loss": 0.7503, - "step": 1975 - }, - { - "epoch": 0.4631578947368421, - "grad_norm": 0.7887863496852786, - "learning_rate": 3.207183249179177e-05, - "loss": 0.7402, - "step": 1980 - }, - { - "epoch": 0.464327485380117, - "grad_norm": 0.6861005458337223, - "learning_rate": 3.198658397128742e-05, - "loss": 0.7403, - "step": 1985 - }, - { - "epoch": 0.4654970760233918, - "grad_norm": 0.713511921412075, - "learning_rate": 3.190126832505116e-05, - "loss": 0.7448, - "step": 1990 - }, - { - "epoch": 0.4666666666666667, - "grad_norm": 0.7395465637381757, - "learning_rate": 3.181588682952745e-05, - "loss": 0.742, - "step": 1995 - }, - { - "epoch": 0.4678362573099415, - "grad_norm": 0.716843196936708, - "learning_rate": 3.173044076214592e-05, - "loss": 0.7215, - "step": 2000 - }, - { - "epoch": 0.46900584795321637, - "grad_norm": 0.7143308102409406, - "learning_rate": 3.164493140130232e-05, - "loss": 0.7358, - "step": 2005 - }, - { - "epoch": 0.47017543859649125, - "grad_norm": 0.7158629240615204, - "learning_rate": 3.1559360026339335e-05, - "loss": 0.7454, - "step": 2010 - }, - { - "epoch": 0.47134502923976607, - "grad_norm": 0.8189451275414514, - "learning_rate": 3.1473727917527485e-05, - "loss": 0.7352, - "step": 2015 - }, - { - "epoch": 0.47251461988304094, - "grad_norm": 0.7565575619571483, - "learning_rate": 3.138803635604596e-05, - "loss": 0.7237, - "step": 2020 - }, - { - "epoch": 0.47368421052631576, - "grad_norm": 0.7072878072619535, - "learning_rate": 3.1302286623963414e-05, - "loss": 0.7476, - "step": 2025 - }, - { - "epoch": 0.47485380116959064, - "grad_norm": 0.7391386470759366, - "learning_rate": 3.121648000421886e-05, - "loss": 0.7454, - "step": 2030 - }, - { - "epoch": 0.4760233918128655, - "grad_norm": 0.754752084236822, - "learning_rate": 3.113061778060241e-05, - "loss": 0.7392, - "step": 2035 - }, - { - "epoch": 0.47719298245614034, - "grad_norm": 0.7258136873037546, - "learning_rate": 3.10447012377361e-05, - "loss": 0.7485, - "step": 2040 - }, - { - "epoch": 0.4783625730994152, - "grad_norm": 0.740460400541068, - "learning_rate": 3.0958731661054636e-05, - "loss": 0.7345, - "step": 2045 - }, - { - "epoch": 0.47953216374269003, - "grad_norm": 0.6894454665827823, - "learning_rate": 3.08727103367862e-05, - "loss": 0.7436, - "step": 2050 - }, - { - "epoch": 0.4807017543859649, - "grad_norm": 0.7276684254100515, - "learning_rate": 3.078663855193322e-05, - "loss": 0.7316, - "step": 2055 - }, - { - "epoch": 0.4818713450292398, - "grad_norm": 0.6805155330798511, - "learning_rate": 3.070051759425305e-05, - "loss": 0.7305, - "step": 2060 - }, - { - "epoch": 0.4830409356725146, - "grad_norm": 0.7546120968812617, - "learning_rate": 3.0614348752238746e-05, - "loss": 0.739, - "step": 2065 - }, - { - "epoch": 0.4842105263157895, - "grad_norm": 0.7229476827464224, - "learning_rate": 3.052813331509978e-05, - "loss": 0.7353, - "step": 2070 - }, - { - "epoch": 0.4853801169590643, - "grad_norm": 0.6897066828114895, - "learning_rate": 3.0441872572742785e-05, - "loss": 0.7428, - "step": 2075 - }, - { - "epoch": 0.4865497076023392, - "grad_norm": 0.6803167588334946, - "learning_rate": 3.035556781575219e-05, - "loss": 0.7377, - "step": 2080 - }, - { - "epoch": 0.48771929824561405, - "grad_norm": 0.7154975306867315, - "learning_rate": 3.0269220335370945e-05, - "loss": 0.751, - "step": 2085 - }, - { - "epoch": 0.4888888888888889, - "grad_norm": 0.7036402820624741, - "learning_rate": 3.0182831423481227e-05, - "loss": 0.7372, - "step": 2090 - }, - { - "epoch": 0.49005847953216375, - "grad_norm": 0.6928895572678453, - "learning_rate": 3.0096402372585075e-05, - "loss": 0.7324, - "step": 2095 - }, - { - "epoch": 0.49122807017543857, - "grad_norm": 0.7467539933803762, - "learning_rate": 3.0009934475785083e-05, - "loss": 0.7292, - "step": 2100 - }, - { - "epoch": 0.49239766081871345, - "grad_norm": 0.7420950776012448, - "learning_rate": 2.9923429026765003e-05, - "loss": 0.7322, - "step": 2105 - }, - { - "epoch": 0.4935672514619883, - "grad_norm": 0.7072542140173895, - "learning_rate": 2.983688731977044e-05, - "loss": 0.7339, - "step": 2110 - }, - { - "epoch": 0.49473684210526314, - "grad_norm": 0.6860501654248469, - "learning_rate": 2.9750310649589465e-05, - "loss": 0.7218, - "step": 2115 - }, - { - "epoch": 0.495906432748538, - "grad_norm": 0.8043839479752058, - "learning_rate": 2.966370031153326e-05, - "loss": 0.7347, - "step": 2120 - }, - { - "epoch": 0.49707602339181284, - "grad_norm": 0.7213757854384905, - "learning_rate": 2.9577057601416717e-05, - "loss": 0.7378, - "step": 2125 - }, - { - "epoch": 0.4982456140350877, - "grad_norm": 0.6834101797567773, - "learning_rate": 2.9490383815539058e-05, - "loss": 0.7298, - "step": 2130 - }, - { - "epoch": 0.4994152046783626, - "grad_norm": 0.7213077762540968, - "learning_rate": 2.9403680250664445e-05, - "loss": 0.7431, - "step": 2135 - }, - { - "epoch": 0.5005847953216375, - "grad_norm": 0.7289094379202761, - "learning_rate": 2.931694820400259e-05, - "loss": 0.7347, - "step": 2140 - }, - { - "epoch": 0.5017543859649123, - "grad_norm": 0.8099640413051555, - "learning_rate": 2.923018897318932e-05, - "loss": 0.7303, - "step": 2145 - }, - { - "epoch": 0.5029239766081871, - "grad_norm": 0.7449546312217549, - "learning_rate": 2.914340385626717e-05, - "loss": 0.7347, - "step": 2150 - }, - { - "epoch": 0.504093567251462, - "grad_norm": 0.8184676100727405, - "learning_rate": 2.9056594151665985e-05, - "loss": 0.7081, - "step": 2155 - }, - { - "epoch": 0.5052631578947369, - "grad_norm": 0.8611040013444063, - "learning_rate": 2.8969761158183466e-05, - "loss": 0.7311, - "step": 2160 - }, - { - "epoch": 0.5064327485380117, - "grad_norm": 0.7244547799769302, - "learning_rate": 2.8882906174965742e-05, - "loss": 0.7468, - "step": 2165 - }, - { - "epoch": 0.5076023391812865, - "grad_norm": 0.7499135951738628, - "learning_rate": 2.879603050148796e-05, - "loss": 0.7457, - "step": 2170 - }, - { - "epoch": 0.5087719298245614, - "grad_norm": 0.6938064677300961, - "learning_rate": 2.8709135437534806e-05, - "loss": 0.7354, - "step": 2175 - }, - { - "epoch": 0.5099415204678363, - "grad_norm": 0.6650288090095613, - "learning_rate": 2.8622222283181087e-05, - "loss": 0.7414, - "step": 2180 - }, - { - "epoch": 0.5111111111111111, - "grad_norm": 0.7133364365712555, - "learning_rate": 2.853529233877227e-05, - "loss": 0.73, - "step": 2185 - }, - { - "epoch": 0.512280701754386, - "grad_norm": 0.6540426370572834, - "learning_rate": 2.8448346904905e-05, - "loss": 0.7294, - "step": 2190 - }, - { - "epoch": 0.5134502923976608, - "grad_norm": 0.6939489433904377, - "learning_rate": 2.8361387282407704e-05, - "loss": 0.7114, - "step": 2195 - }, - { - "epoch": 0.5146198830409356, - "grad_norm": 0.732811254127614, - "learning_rate": 2.827441477232105e-05, - "loss": 0.7296, - "step": 2200 - }, - { - "epoch": 0.5157894736842106, - "grad_norm": 0.6531853113547864, - "learning_rate": 2.818743067587857e-05, - "loss": 0.72, - "step": 2205 - }, - { - "epoch": 0.5169590643274854, - "grad_norm": 0.6766250203305809, - "learning_rate": 2.8100436294487092e-05, - "loss": 0.746, - "step": 2210 - }, - { - "epoch": 0.5181286549707602, - "grad_norm": 0.6629062039445471, - "learning_rate": 2.8013432929707374e-05, - "loss": 0.7245, - "step": 2215 - }, - { - "epoch": 0.519298245614035, - "grad_norm": 0.6740973772958735, - "learning_rate": 2.7926421883234544e-05, - "loss": 0.7156, - "step": 2220 - }, - { - "epoch": 0.52046783625731, - "grad_norm": 0.7106665581583453, - "learning_rate": 2.7839404456878666e-05, - "loss": 0.7324, - "step": 2225 - }, - { - "epoch": 0.5216374269005848, - "grad_norm": 0.6367158771455951, - "learning_rate": 2.775238195254526e-05, - "loss": 0.7169, - "step": 2230 - }, - { - "epoch": 0.5228070175438596, - "grad_norm": 0.690435470611476, - "learning_rate": 2.7665355672215824e-05, - "loss": 0.7402, - "step": 2235 - }, - { - "epoch": 0.5239766081871345, - "grad_norm": 0.7138742813319091, - "learning_rate": 2.757832691792834e-05, - "loss": 0.7374, - "step": 2240 - }, - { - "epoch": 0.5251461988304094, - "grad_norm": 0.9702115835714958, - "learning_rate": 2.7491296991757804e-05, - "loss": 0.7251, - "step": 2245 - }, - { - "epoch": 0.5263157894736842, - "grad_norm": 1.5558673963991443, - "learning_rate": 2.7404267195796752e-05, - "loss": 0.7293, - "step": 2250 - }, - { - "epoch": 0.5274853801169591, - "grad_norm": 0.852336833207038, - "learning_rate": 2.7317238832135783e-05, - "loss": 0.7434, - "step": 2255 - }, - { - "epoch": 0.5286549707602339, - "grad_norm": 0.7594137186423496, - "learning_rate": 2.723021320284404e-05, - "loss": 0.7215, - "step": 2260 - }, - { - "epoch": 0.5298245614035088, - "grad_norm": 0.6842736411453925, - "learning_rate": 2.7143191609949764e-05, - "loss": 0.7221, - "step": 2265 - }, - { - "epoch": 0.5309941520467836, - "grad_norm": 0.7448450500927407, - "learning_rate": 2.705617535542083e-05, - "loss": 0.7122, - "step": 2270 - }, - { - "epoch": 0.5321637426900585, - "grad_norm": 0.7440720641697212, - "learning_rate": 2.6969165741145213e-05, - "loss": 0.7248, - "step": 2275 - }, - { - "epoch": 0.5333333333333333, - "grad_norm": 0.7031441607842945, - "learning_rate": 2.6882164068911554e-05, - "loss": 0.7308, - "step": 2280 - }, - { - "epoch": 0.5345029239766081, - "grad_norm": 0.7425827882634033, - "learning_rate": 2.6795171640389673e-05, - "loss": 0.724, - "step": 2285 - }, - { - "epoch": 0.5356725146198831, - "grad_norm": 0.8104245018702781, - "learning_rate": 2.670818975711107e-05, - "loss": 0.7185, - "step": 2290 - }, - { - "epoch": 0.5368421052631579, - "grad_norm": 0.7033113831122282, - "learning_rate": 2.66212197204495e-05, - "loss": 0.731, - "step": 2295 - }, - { - "epoch": 0.5380116959064327, - "grad_norm": 0.6838589814643976, - "learning_rate": 2.6534262831601464e-05, - "loss": 0.721, - "step": 2300 - }, - { - "epoch": 0.5391812865497077, - "grad_norm": 0.712100800360047, - "learning_rate": 2.6447320391566738e-05, - "loss": 0.7245, - "step": 2305 - }, - { - "epoch": 0.5403508771929825, - "grad_norm": 0.6723092101148208, - "learning_rate": 2.6360393701128968e-05, - "loss": 0.7247, - "step": 2310 - }, - { - "epoch": 0.5415204678362573, - "grad_norm": 0.6805561969651415, - "learning_rate": 2.6273484060836113e-05, - "loss": 0.7222, - "step": 2315 - }, - { - "epoch": 0.5426900584795321, - "grad_norm": 0.6306098328964164, - "learning_rate": 2.618659277098105e-05, - "loss": 0.7102, - "step": 2320 - }, - { - "epoch": 0.543859649122807, - "grad_norm": 0.8205802441393052, - "learning_rate": 2.6099721131582134e-05, - "loss": 0.7242, - "step": 2325 - }, - { - "epoch": 0.5450292397660819, - "grad_norm": 0.6949882040032072, - "learning_rate": 2.6012870442363686e-05, - "loss": 0.7311, - "step": 2330 - }, - { - "epoch": 0.5461988304093567, - "grad_norm": 0.6981026562322318, - "learning_rate": 2.592604200273661e-05, - "loss": 0.7251, - "step": 2335 - }, - { - "epoch": 0.5473684210526316, - "grad_norm": 0.7024702109216225, - "learning_rate": 2.583923711177891e-05, - "loss": 0.7246, - "step": 2340 - }, - { - "epoch": 0.5485380116959064, - "grad_norm": 0.6977023494940424, - "learning_rate": 2.5752457068216256e-05, - "loss": 0.7219, - "step": 2345 - }, - { - "epoch": 0.5497076023391813, - "grad_norm": 0.6577178559620495, - "learning_rate": 2.56657031704026e-05, - "loss": 0.716, - "step": 2350 - }, - { - "epoch": 0.5508771929824562, - "grad_norm": 0.6876417494610477, - "learning_rate": 2.557897671630069e-05, - "loss": 0.7305, - "step": 2355 - }, - { - "epoch": 0.552046783625731, - "grad_norm": 0.705663481497826, - "learning_rate": 2.549227900346267e-05, - "loss": 0.7264, - "step": 2360 - }, - { - "epoch": 0.5532163742690058, - "grad_norm": 0.6381128045940445, - "learning_rate": 2.5405611329010703e-05, - "loss": 0.7194, - "step": 2365 - }, - { - "epoch": 0.5543859649122806, - "grad_norm": 0.7563891767825265, - "learning_rate": 2.53189749896175e-05, - "loss": 0.7194, - "step": 2370 - }, - { - "epoch": 0.5555555555555556, - "grad_norm": 0.773436863349914, - "learning_rate": 2.5232371281487e-05, - "loss": 0.7328, - "step": 2375 - }, - { - "epoch": 0.5567251461988304, - "grad_norm": 0.7188605088844063, - "learning_rate": 2.514580150033487e-05, - "loss": 0.7084, - "step": 2380 - }, - { - "epoch": 0.5578947368421052, - "grad_norm": 0.7060166679772149, - "learning_rate": 2.5059266941369235e-05, - "loss": 0.7298, - "step": 2385 - }, - { - "epoch": 0.5590643274853802, - "grad_norm": 0.7085718046130549, - "learning_rate": 2.4972768899271216e-05, - "loss": 0.739, - "step": 2390 - }, - { - "epoch": 0.560233918128655, - "grad_norm": 0.7549743099712579, - "learning_rate": 2.4886308668175613e-05, - "loss": 0.7265, - "step": 2395 - }, - { - "epoch": 0.5614035087719298, - "grad_norm": 0.6538332655299417, - "learning_rate": 2.479988754165148e-05, - "loss": 0.7206, - "step": 2400 - }, - { - "epoch": 0.5625730994152047, - "grad_norm": 0.7162742971948052, - "learning_rate": 2.4713506812682864e-05, - "loss": 0.7241, - "step": 2405 - }, - { - "epoch": 0.5637426900584795, - "grad_norm": 0.644979711634962, - "learning_rate": 2.4627167773649347e-05, - "loss": 0.7037, - "step": 2410 - }, - { - "epoch": 0.5649122807017544, - "grad_norm": 0.6988424837088039, - "learning_rate": 2.454087171630683e-05, - "loss": 0.7311, - "step": 2415 - }, - { - "epoch": 0.5660818713450292, - "grad_norm": 0.6592590124650294, - "learning_rate": 2.445461993176809e-05, - "loss": 0.7136, - "step": 2420 - }, - { - "epoch": 0.5672514619883041, - "grad_norm": 0.7241992097338749, - "learning_rate": 2.4368413710483563e-05, - "loss": 0.7085, - "step": 2425 - }, - { - "epoch": 0.5684210526315789, - "grad_norm": 0.7177458102026997, - "learning_rate": 2.4282254342221972e-05, - "loss": 0.716, - "step": 2430 - }, - { - "epoch": 0.5695906432748538, - "grad_norm": 0.7014993276429681, - "learning_rate": 2.419614311605106e-05, - "loss": 0.7235, - "step": 2435 - }, - { - "epoch": 0.5707602339181287, - "grad_norm": 0.6974967208078983, - "learning_rate": 2.411008132031827e-05, - "loss": 0.7176, - "step": 2440 - }, - { - "epoch": 0.5719298245614035, - "grad_norm": 0.6428857393584856, - "learning_rate": 2.402407024263155e-05, - "loss": 0.7207, - "step": 2445 - }, - { - "epoch": 0.5730994152046783, - "grad_norm": 0.6587656996704534, - "learning_rate": 2.3938111169839983e-05, - "loss": 0.7218, - "step": 2450 - }, - { - "epoch": 0.5742690058479533, - "grad_norm": 0.6511884593126578, - "learning_rate": 2.3852205388014587e-05, - "loss": 0.7215, - "step": 2455 - }, - { - "epoch": 0.5754385964912281, - "grad_norm": 0.7305618449614039, - "learning_rate": 2.3766354182429102e-05, - "loss": 0.7289, - "step": 2460 - }, - { - "epoch": 0.5766081871345029, - "grad_norm": 0.6734495634477441, - "learning_rate": 2.3680558837540696e-05, - "loss": 0.7209, - "step": 2465 - }, - { - "epoch": 0.5777777777777777, - "grad_norm": 0.693329829649744, - "learning_rate": 2.359482063697081e-05, - "loss": 0.7085, - "step": 2470 - }, - { - "epoch": 0.5789473684210527, - "grad_norm": 0.6572826509123701, - "learning_rate": 2.3509140863485913e-05, - "loss": 0.7072, - "step": 2475 - }, - { - "epoch": 0.5801169590643275, - "grad_norm": 0.7132176217171703, - "learning_rate": 2.34235207989783e-05, - "loss": 0.7197, - "step": 2480 - }, - { - "epoch": 0.5812865497076023, - "grad_norm": 0.7471511475477799, - "learning_rate": 2.3337961724446967e-05, - "loss": 0.7143, - "step": 2485 - }, - { - "epoch": 0.5824561403508772, - "grad_norm": 0.7569514713011818, - "learning_rate": 2.3252464919978394e-05, - "loss": 0.7197, - "step": 2490 - }, - { - "epoch": 0.583625730994152, - "grad_norm": 0.672678541624009, - "learning_rate": 2.3167031664727406e-05, - "loss": 0.7131, - "step": 2495 - }, - { - "epoch": 0.5847953216374269, - "grad_norm": 0.7138771698681281, - "learning_rate": 2.3081663236898065e-05, - "loss": 0.715, - "step": 2500 - }, - { - "epoch": 0.5859649122807018, - "grad_norm": 0.6632737853478047, - "learning_rate": 2.299636091372449e-05, - "loss": 0.716, - "step": 2505 - }, - { - "epoch": 0.5871345029239766, - "grad_norm": 0.646368439045504, - "learning_rate": 2.2911125971451814e-05, - "loss": 0.7105, - "step": 2510 - }, - { - "epoch": 0.5883040935672514, - "grad_norm": 0.7502430275200059, - "learning_rate": 2.2825959685317026e-05, - "loss": 0.7018, - "step": 2515 - }, - { - "epoch": 0.5894736842105263, - "grad_norm": 0.6686250670322178, - "learning_rate": 2.274086332952993e-05, - "loss": 0.7142, - "step": 2520 - }, - { - "epoch": 0.5906432748538012, - "grad_norm": 0.6713488705252936, - "learning_rate": 2.2655838177254084e-05, - "loss": 0.7096, - "step": 2525 - }, - { - "epoch": 0.591812865497076, - "grad_norm": 0.74066155087446, - "learning_rate": 2.2570885500587724e-05, - "loss": 0.7161, - "step": 2530 - }, - { - "epoch": 0.5929824561403508, - "grad_norm": 0.7203926540322918, - "learning_rate": 2.248600657054474e-05, - "loss": 0.7169, - "step": 2535 - }, - { - "epoch": 0.5941520467836258, - "grad_norm": 0.7338466015320976, - "learning_rate": 2.2401202657035695e-05, - "loss": 0.7333, - "step": 2540 - }, - { - "epoch": 0.5953216374269006, - "grad_norm": 0.6914730478070041, - "learning_rate": 2.231647502884877e-05, - "loss": 0.7093, - "step": 2545 - }, - { - "epoch": 0.5964912280701754, - "grad_norm": 0.7190763244458849, - "learning_rate": 2.2231824953630826e-05, - "loss": 0.7194, - "step": 2550 - }, - { - "epoch": 0.5976608187134503, - "grad_norm": 0.7443168685764103, - "learning_rate": 2.2147253697868404e-05, - "loss": 0.7148, - "step": 2555 - }, - { - "epoch": 0.5988304093567252, - "grad_norm": 0.6671658986596564, - "learning_rate": 2.2062762526868802e-05, - "loss": 0.7106, - "step": 2560 - }, - { - "epoch": 0.6, - "grad_norm": 0.9259171966978712, - "learning_rate": 2.1978352704741144e-05, - "loss": 0.7091, - "step": 2565 - }, - { - "epoch": 0.6011695906432749, - "grad_norm": 0.647797375674224, - "learning_rate": 2.189402549437745e-05, - "loss": 0.6978, - "step": 2570 - }, - { - "epoch": 0.6023391812865497, - "grad_norm": 0.6738560804966286, - "learning_rate": 2.1809782157433738e-05, - "loss": 0.7093, - "step": 2575 - }, - { - "epoch": 0.6035087719298246, - "grad_norm": 0.6388985088846765, - "learning_rate": 2.172562395431118e-05, - "loss": 0.7045, - "step": 2580 - }, - { - "epoch": 0.6046783625730994, - "grad_norm": 0.628701446161192, - "learning_rate": 2.1641552144137206e-05, - "loss": 0.7085, - "step": 2585 - }, - { - "epoch": 0.6058479532163743, - "grad_norm": 0.671782052313191, - "learning_rate": 2.1557567984746696e-05, - "loss": 0.7209, - "step": 2590 - }, - { - "epoch": 0.6070175438596491, - "grad_norm": 0.6644467772573169, - "learning_rate": 2.147367273266314e-05, - "loss": 0.7205, - "step": 2595 - }, - { - "epoch": 0.6081871345029239, - "grad_norm": 0.6954352839860809, - "learning_rate": 2.1389867643079848e-05, - "loss": 0.7204, - "step": 2600 - }, - { - "epoch": 0.6093567251461989, - "grad_norm": 0.6959121822967036, - "learning_rate": 2.1306153969841192e-05, - "loss": 0.7214, - "step": 2605 - }, - { - "epoch": 0.6105263157894737, - "grad_norm": 0.6785383454163235, - "learning_rate": 2.1222532965423792e-05, - "loss": 0.7071, - "step": 2610 - }, - { - "epoch": 0.6116959064327485, - "grad_norm": 0.7766968741058371, - "learning_rate": 2.1139005880917805e-05, - "loss": 0.7117, - "step": 2615 - }, - { - "epoch": 0.6128654970760234, - "grad_norm": 0.6514576309987281, - "learning_rate": 2.1055573966008264e-05, - "loss": 0.7241, - "step": 2620 - }, - { - "epoch": 0.6140350877192983, - "grad_norm": 0.7024878214812843, - "learning_rate": 2.0972238468956267e-05, - "loss": 0.7149, - "step": 2625 - }, - { - "epoch": 0.6152046783625731, - "grad_norm": 0.7189809009653396, - "learning_rate": 2.0889000636580398e-05, - "loss": 0.698, - "step": 2630 - }, - { - "epoch": 0.6163742690058479, - "grad_norm": 0.681435970631106, - "learning_rate": 2.080586171423803e-05, - "loss": 0.7188, - "step": 2635 - }, - { - "epoch": 0.6175438596491228, - "grad_norm": 0.6766943134449012, - "learning_rate": 2.0722822945806697e-05, - "loss": 0.7073, - "step": 2640 - }, - { - "epoch": 0.6187134502923977, - "grad_norm": 0.6479985354365313, - "learning_rate": 2.063988557366548e-05, - "loss": 0.7145, - "step": 2645 - }, - { - "epoch": 0.6198830409356725, - "grad_norm": 0.749913572914423, - "learning_rate": 2.0557050838676445e-05, - "loss": 0.7132, - "step": 2650 - }, - { - "epoch": 0.6210526315789474, - "grad_norm": 0.6054029968957726, - "learning_rate": 2.047431998016604e-05, - "loss": 0.7085, - "step": 2655 - }, - { - "epoch": 0.6222222222222222, - "grad_norm": 0.6625793556031936, - "learning_rate": 2.0391694235906594e-05, - "loss": 0.7207, - "step": 2660 - }, - { - "epoch": 0.623391812865497, - "grad_norm": 0.6265379430575465, - "learning_rate": 2.0309174842097755e-05, - "loss": 0.7193, - "step": 2665 - }, - { - "epoch": 0.624561403508772, - "grad_norm": 0.7859616347118907, - "learning_rate": 2.0226763033348005e-05, - "loss": 0.7123, - "step": 2670 - }, - { - "epoch": 0.6257309941520468, - "grad_norm": 0.6856709481316304, - "learning_rate": 2.0144460042656244e-05, - "loss": 0.7056, - "step": 2675 - }, - { - "epoch": 0.6269005847953216, - "grad_norm": 0.6640584443200321, - "learning_rate": 2.0062267101393255e-05, - "loss": 0.7096, - "step": 2680 - }, - { - "epoch": 0.6280701754385964, - "grad_norm": 0.7024631624759632, - "learning_rate": 1.9980185439283343e-05, - "loss": 0.7144, - "step": 2685 - }, - { - "epoch": 0.6292397660818714, - "grad_norm": 0.6513218786027092, - "learning_rate": 1.9898216284385924e-05, - "loss": 0.7207, - "step": 2690 - }, - { - "epoch": 0.6304093567251462, - "grad_norm": 0.6736080868317267, - "learning_rate": 1.9816360863077106e-05, - "loss": 0.7134, - "step": 2695 - }, - { - "epoch": 0.631578947368421, - "grad_norm": 0.7136228459371672, - "learning_rate": 1.973462040003144e-05, - "loss": 0.6967, - "step": 2700 - }, - { - "epoch": 0.632748538011696, - "grad_norm": 0.6503816533129796, - "learning_rate": 1.9652996118203487e-05, - "loss": 0.7028, - "step": 2705 - }, - { - "epoch": 0.6339181286549708, - "grad_norm": 0.6716498502463782, - "learning_rate": 1.9571489238809586e-05, - "loss": 0.6952, - "step": 2710 - }, - { - "epoch": 0.6350877192982456, - "grad_norm": 0.702194650319827, - "learning_rate": 1.949010098130958e-05, - "loss": 0.7117, - "step": 2715 - }, - { - "epoch": 0.6362573099415205, - "grad_norm": 0.7035074020264389, - "learning_rate": 1.940883256338854e-05, - "loss": 0.7086, - "step": 2720 - }, - { - "epoch": 0.6374269005847953, - "grad_norm": 0.6724554904596103, - "learning_rate": 1.9327685200938567e-05, - "loss": 0.6959, - "step": 2725 - }, - { - "epoch": 0.6385964912280702, - "grad_norm": 0.6703506374383876, - "learning_rate": 1.9246660108040615e-05, - "loss": 0.7074, - "step": 2730 - }, - { - "epoch": 0.639766081871345, - "grad_norm": 0.7136435424759008, - "learning_rate": 1.9165758496946296e-05, - "loss": 0.6949, - "step": 2735 - }, - { - "epoch": 0.6409356725146199, - "grad_norm": 0.6811443394345809, - "learning_rate": 1.9084981578059745e-05, - "loss": 0.7249, - "step": 2740 - }, - { - "epoch": 0.6421052631578947, - "grad_norm": 0.7047496550502742, - "learning_rate": 1.900433055991956e-05, - "loss": 0.6977, - "step": 2745 - }, - { - "epoch": 0.6432748538011696, - "grad_norm": 0.6693712668333812, - "learning_rate": 1.8923806649180636e-05, - "loss": 0.7042, - "step": 2750 - }, - { - "epoch": 0.6444444444444445, - "grad_norm": 0.7062577866647437, - "learning_rate": 1.8843411050596215e-05, - "loss": 0.7255, - "step": 2755 - }, - { - "epoch": 0.6456140350877193, - "grad_norm": 0.7457329699470406, - "learning_rate": 1.8763144966999742e-05, - "loss": 0.7131, - "step": 2760 - }, - { - "epoch": 0.6467836257309941, - "grad_norm": 0.6949977616539524, - "learning_rate": 1.8683009599286976e-05, - "loss": 0.7032, - "step": 2765 - }, - { - "epoch": 0.6479532163742691, - "grad_norm": 0.6446049989679632, - "learning_rate": 1.8603006146397984e-05, - "loss": 0.7045, - "step": 2770 - }, - { - "epoch": 0.6491228070175439, - "grad_norm": 0.6868201452976932, - "learning_rate": 1.8523135805299164e-05, - "loss": 0.7078, - "step": 2775 - }, - { - "epoch": 0.6502923976608187, - "grad_norm": 0.6549013173661192, - "learning_rate": 1.8443399770965368e-05, - "loss": 0.7034, - "step": 2780 - }, - { - "epoch": 0.6514619883040935, - "grad_norm": 0.6747132830101538, - "learning_rate": 1.836379923636209e-05, - "loss": 0.7047, - "step": 2785 - }, - { - "epoch": 0.6526315789473685, - "grad_norm": 0.6189077937045445, - "learning_rate": 1.8284335392427464e-05, - "loss": 0.7065, - "step": 2790 - }, - { - "epoch": 0.6538011695906433, - "grad_norm": 0.6712899258383389, - "learning_rate": 1.8205009428054616e-05, - "loss": 0.7226, - "step": 2795 - }, - { - "epoch": 0.6549707602339181, - "grad_norm": 0.6476379586448503, - "learning_rate": 1.812582253007375e-05, - "loss": 0.7098, - "step": 2800 - }, - { - "epoch": 0.656140350877193, - "grad_norm": 0.6632887980494508, - "learning_rate": 1.804677588323443e-05, - "loss": 0.6931, - "step": 2805 - }, - { - "epoch": 0.6573099415204678, - "grad_norm": 0.6676152770497438, - "learning_rate": 1.7967870670187903e-05, - "loss": 0.7026, - "step": 2810 - }, - { - "epoch": 0.6584795321637427, - "grad_norm": 0.6249710453695605, - "learning_rate": 1.7889108071469323e-05, - "loss": 0.7122, - "step": 2815 - }, - { - "epoch": 0.6596491228070176, - "grad_norm": 0.6910085759118031, - "learning_rate": 1.781048926548016e-05, - "loss": 0.6948, - "step": 2820 - }, - { - "epoch": 0.6608187134502924, - "grad_norm": 0.7039245820842112, - "learning_rate": 1.7732015428470522e-05, - "loss": 0.7057, - "step": 2825 - }, - { - "epoch": 0.6619883040935672, - "grad_norm": 0.6560416507502473, - "learning_rate": 1.7653687734521572e-05, - "loss": 0.7162, - "step": 2830 - }, - { - "epoch": 0.6631578947368421, - "grad_norm": 0.6369401839904856, - "learning_rate": 1.7575507355527965e-05, - "loss": 0.7022, - "step": 2835 - }, - { - "epoch": 0.664327485380117, - "grad_norm": 0.7618791478324256, - "learning_rate": 1.7497475461180324e-05, - "loss": 0.6871, - "step": 2840 - }, - { - "epoch": 0.6654970760233918, - "grad_norm": 0.7035877403668513, - "learning_rate": 1.7419593218947706e-05, - "loss": 0.7037, - "step": 2845 - }, - { - "epoch": 0.6666666666666666, - "grad_norm": 3.171993920451794, - "learning_rate": 1.734186179406019e-05, - "loss": 0.7086, - "step": 2850 - }, - { - "epoch": 0.6678362573099416, - "grad_norm": 0.7752177765101442, - "learning_rate": 1.7264282349491382e-05, - "loss": 0.7163, - "step": 2855 - }, - { - "epoch": 0.6690058479532164, - "grad_norm": 0.6730042085183996, - "learning_rate": 1.7186856045941044e-05, - "loss": 0.7016, - "step": 2860 - }, - { - "epoch": 0.6701754385964912, - "grad_norm": 0.6538520154513303, - "learning_rate": 1.7109584041817765e-05, - "loss": 0.6934, - "step": 2865 - }, - { - "epoch": 0.6713450292397661, - "grad_norm": 0.702951285614432, - "learning_rate": 1.7032467493221556e-05, - "loss": 0.704, - "step": 2870 - }, - { - "epoch": 0.672514619883041, - "grad_norm": 0.7222284007051577, - "learning_rate": 1.6955507553926584e-05, - "loss": 0.6891, - "step": 2875 - }, - { - "epoch": 0.6736842105263158, - "grad_norm": 0.655031348765028, - "learning_rate": 1.6878705375363964e-05, - "loss": 0.6959, - "step": 2880 - }, - { - "epoch": 0.6748538011695906, - "grad_norm": 0.650372228862769, - "learning_rate": 1.6802062106604435e-05, - "loss": 0.7037, - "step": 2885 - }, - { - "epoch": 0.6760233918128655, - "grad_norm": 0.6648549632114491, - "learning_rate": 1.6725578894341253e-05, - "loss": 0.6955, - "step": 2890 - }, - { - "epoch": 0.6771929824561403, - "grad_norm": 0.6717090275475817, - "learning_rate": 1.664925688287297e-05, - "loss": 0.7103, - "step": 2895 - }, - { - "epoch": 0.6783625730994152, - "grad_norm": 0.6499960801937501, - "learning_rate": 1.657309721408636e-05, - "loss": 0.7011, - "step": 2900 - }, - { - "epoch": 0.6795321637426901, - "grad_norm": 0.6839419015653185, - "learning_rate": 1.649710102743931e-05, - "loss": 0.6903, - "step": 2905 - }, - { - "epoch": 0.6807017543859649, - "grad_norm": 0.6488668540739176, - "learning_rate": 1.64212694599438e-05, - "loss": 0.705, - "step": 2910 - }, - { - "epoch": 0.6818713450292397, - "grad_norm": 0.6715390084742834, - "learning_rate": 1.634560364614883e-05, - "loss": 0.6983, - "step": 2915 - }, - { - "epoch": 0.6830409356725147, - "grad_norm": 0.7229792154843714, - "learning_rate": 1.6270104718123535e-05, - "loss": 0.7059, - "step": 2920 - }, - { - "epoch": 0.6842105263157895, - "grad_norm": 0.7217490768423835, - "learning_rate": 1.6194773805440166e-05, - "loss": 0.709, - "step": 2925 - }, - { - "epoch": 0.6853801169590643, - "grad_norm": 0.7323151054068626, - "learning_rate": 1.6119612035157227e-05, - "loss": 0.7249, - "step": 2930 - }, - { - "epoch": 0.6865497076023391, - "grad_norm": 0.693099764226788, - "learning_rate": 1.604462053180263e-05, - "loss": 0.6886, - "step": 2935 - }, - { - "epoch": 0.6877192982456141, - "grad_norm": 0.7447053266775585, - "learning_rate": 1.5969800417356817e-05, - "loss": 0.6943, - "step": 2940 - }, - { - "epoch": 0.6888888888888889, - "grad_norm": 0.7056733168103325, - "learning_rate": 1.5895152811236046e-05, - "loss": 0.6987, - "step": 2945 - }, - { - "epoch": 0.6900584795321637, - "grad_norm": 0.6333643279860763, - "learning_rate": 1.582067883027557e-05, - "loss": 0.6834, - "step": 2950 - }, - { - "epoch": 0.6912280701754386, - "grad_norm": 0.6657715044830338, - "learning_rate": 1.574637958871297e-05, - "loss": 0.6901, - "step": 2955 - }, - { - "epoch": 0.6923976608187135, - "grad_norm": 0.6305769577184307, - "learning_rate": 1.567225619817148e-05, - "loss": 0.6985, - "step": 2960 - }, - { - "epoch": 0.6935672514619883, - "grad_norm": 0.6912315845824749, - "learning_rate": 1.5598309767643355e-05, - "loss": 0.6909, - "step": 2965 - }, - { - "epoch": 0.6947368421052632, - "grad_norm": 0.6874615844912404, - "learning_rate": 1.5524541403473244e-05, - "loss": 0.7061, - "step": 2970 - }, - { - "epoch": 0.695906432748538, - "grad_norm": 0.6300180071584462, - "learning_rate": 1.5450952209341717e-05, - "loss": 0.6993, - "step": 2975 - }, - { - "epoch": 0.6970760233918128, - "grad_norm": 0.6805832683256726, - "learning_rate": 1.5377543286248653e-05, - "loss": 0.7066, - "step": 2980 - }, - { - "epoch": 0.6982456140350877, - "grad_norm": 0.6492844517838456, - "learning_rate": 1.5304315732496867e-05, - "loss": 0.7011, - "step": 2985 - }, - { - "epoch": 0.6994152046783626, - "grad_norm": 0.6412533888420374, - "learning_rate": 1.5231270643675577e-05, - "loss": 0.7033, - "step": 2990 - }, - { - "epoch": 0.7005847953216374, - "grad_norm": 0.6316222267104177, - "learning_rate": 1.5158409112644103e-05, - "loss": 0.7052, - "step": 2995 - }, - { - "epoch": 0.7017543859649122, - "grad_norm": 0.7274584018785283, - "learning_rate": 1.5085732229515476e-05, - "loss": 0.7037, - "step": 3000 - }, - { - "epoch": 0.7029239766081872, - "grad_norm": 0.6553959454076399, - "learning_rate": 1.5013241081640101e-05, - "loss": 0.6985, - "step": 3005 - }, - { - "epoch": 0.704093567251462, - "grad_norm": 0.6640325261209403, - "learning_rate": 1.4940936753589533e-05, - "loss": 0.7042, - "step": 3010 - }, - { - "epoch": 0.7052631578947368, - "grad_norm": 0.7105270721109682, - "learning_rate": 1.4868820327140249e-05, - "loss": 0.691, - "step": 3015 - }, - { - "epoch": 0.7064327485380117, - "grad_norm": 0.703378689555471, - "learning_rate": 1.479689288125742e-05, - "loss": 0.6955, - "step": 3020 - }, - { - "epoch": 0.7076023391812866, - "grad_norm": 0.692679701776449, - "learning_rate": 1.4725155492078813e-05, - "loss": 0.6876, - "step": 3025 - }, - { - "epoch": 0.7087719298245614, - "grad_norm": 0.7203262346843042, - "learning_rate": 1.4653609232898684e-05, - "loss": 0.6972, - "step": 3030 - }, - { - "epoch": 0.7099415204678362, - "grad_norm": 0.7433252096654863, - "learning_rate": 1.4582255174151683e-05, - "loss": 0.6978, - "step": 3035 - }, - { - "epoch": 0.7111111111111111, - "grad_norm": 0.6965363631959524, - "learning_rate": 1.45110943833969e-05, - "loss": 0.705, - "step": 3040 - }, - { - "epoch": 0.712280701754386, - "grad_norm": 0.6282817213921031, - "learning_rate": 1.4440127925301827e-05, - "loss": 0.6826, - "step": 3045 - }, - { - "epoch": 0.7134502923976608, - "grad_norm": 0.6974830082717185, - "learning_rate": 1.4369356861626467e-05, - "loss": 0.6962, - "step": 3050 - }, - { - "epoch": 0.7146198830409357, - "grad_norm": 0.6601146384075475, - "learning_rate": 1.4298782251207468e-05, - "loss": 0.6906, - "step": 3055 - }, - { - "epoch": 0.7157894736842105, - "grad_norm": 0.6814736421646006, - "learning_rate": 1.4228405149942226e-05, - "loss": 0.6911, - "step": 3060 - }, - { - "epoch": 0.7169590643274854, - "grad_norm": 2.666703566903681, - "learning_rate": 1.4158226610773117e-05, - "loss": 0.7079, - "step": 3065 - }, - { - "epoch": 0.7181286549707603, - "grad_norm": 0.7144980012945954, - "learning_rate": 1.4088247683671768e-05, - "loss": 0.6949, - "step": 3070 - }, - { - "epoch": 0.7192982456140351, - "grad_norm": 0.66028336916713, - "learning_rate": 1.4018469415623309e-05, - "loss": 0.6844, - "step": 3075 - }, - { - "epoch": 0.7204678362573099, - "grad_norm": 0.7116011959517159, - "learning_rate": 1.3948892850610709e-05, - "loss": 0.6944, - "step": 3080 - }, - { - "epoch": 0.7216374269005847, - "grad_norm": 0.719826020172388, - "learning_rate": 1.3879519029599197e-05, - "loss": 0.6855, - "step": 3085 - }, - { - "epoch": 0.7228070175438597, - "grad_norm": 0.646119503804467, - "learning_rate": 1.3810348990520635e-05, - "loss": 0.698, - "step": 3090 - }, - { - "epoch": 0.7239766081871345, - "grad_norm": 0.6590521039258358, - "learning_rate": 1.3741383768258043e-05, - "loss": 0.7111, - "step": 3095 - }, - { - "epoch": 0.7251461988304093, - "grad_norm": 0.6618851155256175, - "learning_rate": 1.3672624394630062e-05, - "loss": 0.694, - "step": 3100 - }, - { - "epoch": 0.7263157894736842, - "grad_norm": 0.6259226479372295, - "learning_rate": 1.3604071898375548e-05, - "loss": 0.6899, - "step": 3105 - }, - { - "epoch": 0.7274853801169591, - "grad_norm": 0.6544836178913599, - "learning_rate": 1.3535727305138185e-05, - "loss": 0.6802, - "step": 3110 - }, - { - "epoch": 0.7286549707602339, - "grad_norm": 0.6291569958373459, - "learning_rate": 1.3467591637451126e-05, - "loss": 0.6929, - "step": 3115 - }, - { - "epoch": 0.7298245614035088, - "grad_norm": 0.6512874151441448, - "learning_rate": 1.3399665914721682e-05, - "loss": 0.6873, - "step": 3120 - }, - { - "epoch": 0.7309941520467836, - "grad_norm": 0.6448721183547438, - "learning_rate": 1.3331951153216115e-05, - "loss": 0.6929, - "step": 3125 - }, - { - "epoch": 0.7321637426900585, - "grad_norm": 0.6512033452820375, - "learning_rate": 1.326444836604438e-05, - "loss": 0.6978, - "step": 3130 - }, - { - "epoch": 0.7333333333333333, - "grad_norm": 0.6699223613337226, - "learning_rate": 1.3197158563145013e-05, - "loss": 0.6878, - "step": 3135 - }, - { - "epoch": 0.7345029239766082, - "grad_norm": 0.6785655173965759, - "learning_rate": 1.3130082751269973e-05, - "loss": 0.7074, - "step": 3140 - }, - { - "epoch": 0.735672514619883, - "grad_norm": 0.6893282702509805, - "learning_rate": 1.3063221933969627e-05, - "loss": 0.6884, - "step": 3145 - }, - { - "epoch": 0.7368421052631579, - "grad_norm": 0.6801941856058638, - "learning_rate": 1.2996577111577714e-05, - "loss": 0.6956, - "step": 3150 - }, - { - "epoch": 0.7380116959064328, - "grad_norm": 0.7429147439910412, - "learning_rate": 1.2930149281196366e-05, - "loss": 0.6935, - "step": 3155 - }, - { - "epoch": 0.7391812865497076, - "grad_norm": 0.7584692264067732, - "learning_rate": 1.2863939436681211e-05, - "loss": 0.7084, - "step": 3160 - }, - { - "epoch": 0.7403508771929824, - "grad_norm": 0.6496501154726846, - "learning_rate": 1.2797948568626514e-05, - "loss": 0.6811, - "step": 3165 - }, - { - "epoch": 0.7415204678362574, - "grad_norm": 0.6947826629815981, - "learning_rate": 1.2732177664350297e-05, - "loss": 0.7068, - "step": 3170 - }, - { - "epoch": 0.7426900584795322, - "grad_norm": 0.6322459875062904, - "learning_rate": 1.266662770787965e-05, - "loss": 0.6771, - "step": 3175 - }, - { - "epoch": 0.743859649122807, - "grad_norm": 0.6610295189441215, - "learning_rate": 1.2601299679935944e-05, - "loss": 0.6976, - "step": 3180 - }, - { - "epoch": 0.7450292397660818, - "grad_norm": 0.6090430443039708, - "learning_rate": 1.2536194557920173e-05, - "loss": 0.6877, - "step": 3185 - }, - { - "epoch": 0.7461988304093568, - "grad_norm": 0.657566522164305, - "learning_rate": 1.2471313315898369e-05, - "loss": 0.6919, - "step": 3190 - }, - { - "epoch": 0.7473684210526316, - "grad_norm": 0.6442799866282577, - "learning_rate": 1.2406656924586971e-05, - "loss": 0.7097, - "step": 3195 - }, - { - "epoch": 0.7485380116959064, - "grad_norm": 0.6517637658675814, - "learning_rate": 1.2342226351338333e-05, - "loss": 0.6964, - "step": 3200 - }, - { - "epoch": 0.7497076023391813, - "grad_norm": 0.6298193875835328, - "learning_rate": 1.227802256012627e-05, - "loss": 0.7067, - "step": 3205 - }, - { - "epoch": 0.7508771929824561, - "grad_norm": 0.6400435584951044, - "learning_rate": 1.2214046511531579e-05, - "loss": 0.6996, - "step": 3210 - }, - { - "epoch": 0.752046783625731, - "grad_norm": 0.6682447845425866, - "learning_rate": 1.215029916272771e-05, - "loss": 0.7052, - "step": 3215 - }, - { - "epoch": 0.7532163742690059, - "grad_norm": 0.6345008179067555, - "learning_rate": 1.2086781467466466e-05, - "loss": 0.6815, - "step": 3220 - }, - { - "epoch": 0.7543859649122807, - "grad_norm": 0.6269106487695227, - "learning_rate": 1.2023494376063655e-05, - "loss": 0.6889, - "step": 3225 - }, - { - "epoch": 0.7555555555555555, - "grad_norm": 0.6766539926521694, - "learning_rate": 1.196043883538496e-05, - "loss": 0.6896, - "step": 3230 - }, - { - "epoch": 0.7567251461988304, - "grad_norm": 0.657654184164444, - "learning_rate": 1.1897615788831715e-05, - "loss": 0.697, - "step": 3235 - }, - { - "epoch": 0.7578947368421053, - "grad_norm": 0.6479759415667077, - "learning_rate": 1.1835026176326817e-05, - "loss": 0.7061, - "step": 3240 - }, - { - "epoch": 0.7590643274853801, - "grad_norm": 0.6454453155585288, - "learning_rate": 1.1772670934300637e-05, - "loss": 0.6944, - "step": 3245 - }, - { - "epoch": 0.7602339181286549, - "grad_norm": 0.6717289664011613, - "learning_rate": 1.171055099567705e-05, - "loss": 0.6975, - "step": 3250 - }, - { - "epoch": 0.7614035087719299, - "grad_norm": 0.6446562340482254, - "learning_rate": 1.164866728985944e-05, - "loss": 0.6868, - "step": 3255 - }, - { - "epoch": 0.7625730994152047, - "grad_norm": 0.6842764077833369, - "learning_rate": 1.1587020742716822e-05, - "loss": 0.6902, - "step": 3260 - }, - { - "epoch": 0.7637426900584795, - "grad_norm": 0.6529175563359729, - "learning_rate": 1.1525612276569954e-05, - "loss": 0.686, - "step": 3265 - }, - { - "epoch": 0.7649122807017544, - "grad_norm": 0.6878863831926055, - "learning_rate": 1.1464442810177591e-05, - "loss": 0.6847, - "step": 3270 - }, - { - "epoch": 0.7660818713450293, - "grad_norm": 0.634988623157636, - "learning_rate": 1.1403513258722689e-05, - "loss": 0.689, - "step": 3275 - }, - { - "epoch": 0.7672514619883041, - "grad_norm": 0.6427892453571996, - "learning_rate": 1.134282453379873e-05, - "loss": 0.6855, - "step": 3280 - }, - { - "epoch": 0.7684210526315789, - "grad_norm": 0.6571927205325351, - "learning_rate": 1.12823775433961e-05, - "loss": 0.689, - "step": 3285 - }, - { - "epoch": 0.7695906432748538, - "grad_norm": 0.6553286498653648, - "learning_rate": 1.1222173191888482e-05, - "loss": 0.701, - "step": 3290 - }, - { - "epoch": 0.7707602339181286, - "grad_norm": 0.6477793773584922, - "learning_rate": 1.1162212380019327e-05, - "loss": 0.6864, - "step": 3295 - }, - { - "epoch": 0.7719298245614035, - "grad_norm": 0.677952780272042, - "learning_rate": 1.11024960048884e-05, - "loss": 0.6795, - "step": 3300 - }, - { - "epoch": 0.7730994152046784, - "grad_norm": 0.6183556845058249, - "learning_rate": 1.1043024959938327e-05, - "loss": 0.6839, - "step": 3305 - }, - { - "epoch": 0.7742690058479532, - "grad_norm": 0.6685159928157001, - "learning_rate": 1.098380013494124e-05, - "loss": 0.6852, - "step": 3310 - }, - { - "epoch": 0.775438596491228, - "grad_norm": 0.6119355012203586, - "learning_rate": 1.0924822415985483e-05, - "loss": 0.6804, - "step": 3315 - }, - { - "epoch": 0.776608187134503, - "grad_norm": 0.6311449859816257, - "learning_rate": 1.086609268546234e-05, - "loss": 0.6725, - "step": 3320 - }, - { - "epoch": 0.7777777777777778, - "grad_norm": 0.6980417098074275, - "learning_rate": 1.0807611822052802e-05, - "loss": 0.678, - "step": 3325 - }, - { - "epoch": 0.7789473684210526, - "grad_norm": 0.5801631810377051, - "learning_rate": 1.0749380700714495e-05, - "loss": 0.6763, - "step": 3330 - }, - { - "epoch": 0.7801169590643274, - "grad_norm": 0.6589353523991406, - "learning_rate": 1.0691400192668502e-05, - "loss": 0.7009, - "step": 3335 - }, - { - "epoch": 0.7812865497076024, - "grad_norm": 0.6130234769951651, - "learning_rate": 1.063367116538641e-05, - "loss": 0.6876, - "step": 3340 - }, - { - "epoch": 0.7824561403508772, - "grad_norm": 0.609038354087779, - "learning_rate": 1.0576194482577268e-05, - "loss": 0.6821, - "step": 3345 - }, - { - "epoch": 0.783625730994152, - "grad_norm": 0.6075902184182757, - "learning_rate": 1.0518971004174691e-05, - "loss": 0.6971, - "step": 3350 - }, - { - "epoch": 0.7847953216374269, - "grad_norm": 0.6404325058010116, - "learning_rate": 1.0462001586324009e-05, - "loss": 0.6743, - "step": 3355 - }, - { - "epoch": 0.7859649122807018, - "grad_norm": 0.6755723087064721, - "learning_rate": 1.0405287081369425e-05, - "loss": 0.6946, - "step": 3360 - }, - { - "epoch": 0.7871345029239766, - "grad_norm": 0.64889270230153, - "learning_rate": 1.034882833784129e-05, - "loss": 0.6976, - "step": 3365 - }, - { - "epoch": 0.7883040935672515, - "grad_norm": 0.6371962111123631, - "learning_rate": 1.0292626200443396e-05, - "loss": 0.6785, - "step": 3370 - }, - { - "epoch": 0.7894736842105263, - "grad_norm": 0.6338899877498075, - "learning_rate": 1.0236681510040328e-05, - "loss": 0.6846, - "step": 3375 - }, - { - "epoch": 0.7906432748538011, - "grad_norm": 0.6073429500137454, - "learning_rate": 1.018099510364491e-05, - "loss": 0.6806, - "step": 3380 - }, - { - "epoch": 0.791812865497076, - "grad_norm": 0.6901673440168534, - "learning_rate": 1.0125567814405661e-05, - "loss": 0.6718, - "step": 3385 - }, - { - "epoch": 0.7929824561403509, - "grad_norm": 0.6557491619211937, - "learning_rate": 1.0070400471594324e-05, - "loss": 0.6714, - "step": 3390 - }, - { - "epoch": 0.7941520467836257, - "grad_norm": 0.6610369238302171, - "learning_rate": 1.0015493900593495e-05, - "loss": 0.6859, - "step": 3395 - }, - { - "epoch": 0.7953216374269005, - "grad_norm": 0.6249323170970883, - "learning_rate": 9.960848922884225e-06, - "loss": 0.681, - "step": 3400 - }, - { - "epoch": 0.7964912280701755, - "grad_norm": 0.6801586512490546, - "learning_rate": 9.906466356033766e-06, - "loss": 0.6912, - "step": 3405 - }, - { - "epoch": 0.7976608187134503, - "grad_norm": 0.6227084687338438, - "learning_rate": 9.852347013683339e-06, - "loss": 0.6985, - "step": 3410 - }, - { - "epoch": 0.7988304093567251, - "grad_norm": 0.6623667229073018, - "learning_rate": 9.79849170553592e-06, - "loss": 0.6674, - "step": 3415 - }, - { - "epoch": 0.8, - "grad_norm": 0.65595305832523, - "learning_rate": 9.744901237344183e-06, - "loss": 0.709, - "step": 3420 - }, - { - "epoch": 0.8011695906432749, - "grad_norm": 0.6675026240893706, - "learning_rate": 9.691576410898398e-06, - "loss": 0.6693, - "step": 3425 - }, - { - "epoch": 0.8023391812865497, - "grad_norm": 0.6930655599750254, - "learning_rate": 9.638518024014453e-06, - "loss": 0.6883, - "step": 3430 - }, - { - "epoch": 0.8035087719298246, - "grad_norm": 0.6733979146956586, - "learning_rate": 9.585726870521938e-06, - "loss": 0.6892, - "step": 3435 - }, - { - "epoch": 0.8046783625730994, - "grad_norm": 0.6450346538264267, - "learning_rate": 9.53320374025223e-06, - "loss": 0.6862, - "step": 3440 - }, - { - "epoch": 0.8058479532163743, - "grad_norm": 0.6637412669868189, - "learning_rate": 9.480949419026689e-06, - "loss": 0.6829, - "step": 3445 - }, - { - "epoch": 0.8070175438596491, - "grad_norm": 0.6846438436543126, - "learning_rate": 9.428964688644927e-06, - "loss": 0.6852, - "step": 3450 - }, - { - "epoch": 0.808187134502924, - "grad_norm": 0.6575226232119401, - "learning_rate": 9.377250326873071e-06, - "loss": 0.6781, - "step": 3455 - }, - { - "epoch": 0.8093567251461988, - "grad_norm": 0.629796602059127, - "learning_rate": 9.325807107432164e-06, - "loss": 0.6771, - "step": 3460 - }, - { - "epoch": 0.8105263157894737, - "grad_norm": 0.5978452605861834, - "learning_rate": 9.274635799986554e-06, - "loss": 0.6798, - "step": 3465 - }, - { - "epoch": 0.8116959064327486, - "grad_norm": 0.6129382768821403, - "learning_rate": 9.223737170132394e-06, - "loss": 0.6845, - "step": 3470 - }, - { - "epoch": 0.8128654970760234, - "grad_norm": 0.5979463163900788, - "learning_rate": 9.173111979386215e-06, - "loss": 0.6961, - "step": 3475 - }, - { - "epoch": 0.8140350877192982, - "grad_norm": 0.6342223790039097, - "learning_rate": 9.122760985173471e-06, - "loss": 0.6935, - "step": 3480 - }, - { - "epoch": 0.8152046783625732, - "grad_norm": 0.6718113091931592, - "learning_rate": 9.072684940817275e-06, - "loss": 0.6849, - "step": 3485 - }, - { - "epoch": 0.816374269005848, - "grad_norm": 0.6081957293761066, - "learning_rate": 9.022884595527074e-06, - "loss": 0.6813, - "step": 3490 - }, - { - "epoch": 0.8175438596491228, - "grad_norm": 0.6766450913418733, - "learning_rate": 8.97336069438747e-06, - "loss": 0.695, - "step": 3495 - }, - { - "epoch": 0.8187134502923976, - "grad_norm": 0.6127087903695808, - "learning_rate": 8.92411397834706e-06, - "loss": 0.6841, - "step": 3500 - }, - { - "epoch": 0.8198830409356725, - "grad_norm": 0.637488440649515, - "learning_rate": 8.875145184207363e-06, - "loss": 0.6755, - "step": 3505 - }, - { - "epoch": 0.8210526315789474, - "grad_norm": 0.6558356891747749, - "learning_rate": 8.826455044611775e-06, - "loss": 0.6864, - "step": 3510 - }, - { - "epoch": 0.8222222222222222, - "grad_norm": 0.6909295666801539, - "learning_rate": 8.778044288034635e-06, - "loss": 0.6932, - "step": 3515 - }, - { - "epoch": 0.8233918128654971, - "grad_norm": 0.6216064629061299, - "learning_rate": 8.729913638770295e-06, - "loss": 0.6708, - "step": 3520 - }, - { - "epoch": 0.8245614035087719, - "grad_norm": 0.6470556870374983, - "learning_rate": 8.682063816922312e-06, - "loss": 0.6805, - "step": 3525 - }, - { - "epoch": 0.8257309941520468, - "grad_norm": 0.6441674769859189, - "learning_rate": 8.634495538392659e-06, - "loss": 0.6723, - "step": 3530 - }, - { - "epoch": 0.8269005847953217, - "grad_norm": 0.6294239666284314, - "learning_rate": 8.587209514871018e-06, - "loss": 0.6833, - "step": 3535 - }, - { - "epoch": 0.8280701754385965, - "grad_norm": 0.649777714204493, - "learning_rate": 8.540206453824119e-06, - "loss": 0.6818, - "step": 3540 - }, - { - "epoch": 0.8292397660818713, - "grad_norm": 0.6581271790880595, - "learning_rate": 8.493487058485191e-06, - "loss": 0.6907, - "step": 3545 - }, - { - "epoch": 0.8304093567251462, - "grad_norm": 0.6316641622404449, - "learning_rate": 8.44705202784339e-06, - "loss": 0.6752, - "step": 3550 - }, - { - "epoch": 0.8315789473684211, - "grad_norm": 0.6299397681956188, - "learning_rate": 8.4009020566334e-06, - "loss": 0.6763, - "step": 3555 - }, - { - "epoch": 0.8327485380116959, - "grad_norm": 0.6428132759357637, - "learning_rate": 8.355037835324978e-06, - "loss": 0.6911, - "step": 3560 - }, - { - "epoch": 0.8339181286549707, - "grad_norm": 0.6492937756009263, - "learning_rate": 8.30946005011266e-06, - "loss": 0.6768, - "step": 3565 - }, - { - "epoch": 0.8350877192982457, - "grad_norm": 0.630015343422525, - "learning_rate": 8.264169382905499e-06, - "loss": 0.6807, - "step": 3570 - }, - { - "epoch": 0.8362573099415205, - "grad_norm": 0.6721711166095163, - "learning_rate": 8.219166511316844e-06, - "loss": 0.6856, - "step": 3575 - }, - { - "epoch": 0.8374269005847953, - "grad_norm": 0.6845672226540863, - "learning_rate": 8.174452108654198e-06, - "loss": 0.676, - "step": 3580 - }, - { - "epoch": 0.8385964912280702, - "grad_norm": 0.610978989043903, - "learning_rate": 8.130026843909174e-06, - "loss": 0.675, - "step": 3585 - }, - { - "epoch": 0.839766081871345, - "grad_norm": 0.6285639064741724, - "learning_rate": 8.08589138174746e-06, - "loss": 0.67, - "step": 3590 - }, - { - "epoch": 0.8409356725146199, - "grad_norm": 0.6696016237475442, - "learning_rate": 8.042046382498862e-06, - "loss": 0.6896, - "step": 3595 - }, - { - "epoch": 0.8421052631578947, - "grad_norm": 0.6489876686398364, - "learning_rate": 7.998492502147478e-06, - "loss": 0.6765, - "step": 3600 - }, - { - "epoch": 0.8432748538011696, - "grad_norm": 0.642937395823205, - "learning_rate": 7.955230392321826e-06, - "loss": 0.6797, - "step": 3605 - }, - { - "epoch": 0.8444444444444444, - "grad_norm": 0.6564324117611929, - "learning_rate": 7.91226070028513e-06, - "loss": 0.6744, - "step": 3610 - }, - { - "epoch": 0.8456140350877193, - "grad_norm": 0.646811905002716, - "learning_rate": 7.869584068925617e-06, - "loss": 0.6752, - "step": 3615 - }, - { - "epoch": 0.8467836257309942, - "grad_norm": 0.6675140856614825, - "learning_rate": 7.827201136746903e-06, - "loss": 0.6868, - "step": 3620 - }, - { - "epoch": 0.847953216374269, - "grad_norm": 0.6822010982710452, - "learning_rate": 7.78511253785846e-06, - "loss": 0.6715, - "step": 3625 - }, - { - "epoch": 0.8491228070175438, - "grad_norm": 0.6392666243584267, - "learning_rate": 7.743318901966097e-06, - "loss": 0.6868, - "step": 3630 - }, - { - "epoch": 0.8502923976608188, - "grad_norm": 0.6113947733753667, - "learning_rate": 7.701820854362548e-06, - "loss": 0.6785, - "step": 3635 - }, - { - "epoch": 0.8514619883040936, - "grad_norm": 0.6747821078646472, - "learning_rate": 7.660619015918146e-06, - "loss": 0.6718, - "step": 3640 - }, - { - "epoch": 0.8526315789473684, - "grad_norm": 0.63975289517103, - "learning_rate": 7.6197140030714796e-06, - "loss": 0.6694, - "step": 3645 - }, - { - "epoch": 0.8538011695906432, - "grad_norm": 0.626650114047046, - "learning_rate": 7.579106427820232e-06, - "loss": 0.6805, - "step": 3650 - }, - { - "epoch": 0.8549707602339182, - "grad_norm": 0.6409849046483511, - "learning_rate": 7.538796897711965e-06, - "loss": 0.6761, - "step": 3655 - }, - { - "epoch": 0.856140350877193, - "grad_norm": 0.6471022035342423, - "learning_rate": 7.498786015835073e-06, - "loss": 0.673, - "step": 3660 - }, - { - "epoch": 0.8573099415204678, - "grad_norm": 0.6407581110519617, - "learning_rate": 7.459074380809753e-06, - "loss": 0.6877, - "step": 3665 - }, - { - "epoch": 0.8584795321637427, - "grad_norm": 0.6272955753937088, - "learning_rate": 7.419662586779016e-06, - "loss": 0.6798, - "step": 3670 - }, - { - "epoch": 0.8596491228070176, - "grad_norm": 0.6367532485529698, - "learning_rate": 7.380551223399836e-06, - "loss": 0.6817, - "step": 3675 - }, - { - "epoch": 0.8608187134502924, - "grad_norm": 0.6005127501375619, - "learning_rate": 7.341740875834319e-06, - "loss": 0.6746, - "step": 3680 - }, - { - "epoch": 0.8619883040935673, - "grad_norm": 0.648068901146122, - "learning_rate": 7.303232124740925e-06, - "loss": 0.6702, - "step": 3685 - }, - { - "epoch": 0.8631578947368421, - "grad_norm": 0.6960242714907998, - "learning_rate": 7.265025546265813e-06, - "loss": 0.6734, - "step": 3690 - }, - { - "epoch": 0.8643274853801169, - "grad_norm": 0.6594167870334117, - "learning_rate": 7.227121712034209e-06, - "loss": 0.6655, - "step": 3695 - }, - { - "epoch": 0.8654970760233918, - "grad_norm": 0.6910368581395679, - "learning_rate": 7.189521189141829e-06, - "loss": 0.6782, - "step": 3700 - }, - { - "epoch": 0.8666666666666667, - "grad_norm": 0.6174471063653484, - "learning_rate": 7.152224540146443e-06, - "loss": 0.6675, - "step": 3705 - }, - { - "epoch": 0.8678362573099415, - "grad_norm": 0.6567058868642893, - "learning_rate": 7.115232323059417e-06, - "loss": 0.6704, - "step": 3710 - }, - { - "epoch": 0.8690058479532163, - "grad_norm": 0.6424216241729048, - "learning_rate": 7.078545091337374e-06, - "loss": 0.6748, - "step": 3715 - }, - { - "epoch": 0.8701754385964913, - "grad_norm": 0.6487145669681874, - "learning_rate": 7.042163393873935e-06, - "loss": 0.687, - "step": 3720 - }, - { - "epoch": 0.8713450292397661, - "grad_norm": 0.647202081451824, - "learning_rate": 7.006087774991478e-06, - "loss": 0.6773, - "step": 3725 - }, - { - "epoch": 0.8725146198830409, - "grad_norm": 0.6436919512408469, - "learning_rate": 6.970318774433005e-06, - "loss": 0.6618, - "step": 3730 - }, - { - "epoch": 0.8736842105263158, - "grad_norm": 0.6534618918546282, - "learning_rate": 6.934856927354077e-06, - "loss": 0.6869, - "step": 3735 - }, - { - "epoch": 0.8748538011695907, - "grad_norm": 0.6766619943500544, - "learning_rate": 6.899702764314796e-06, - "loss": 0.6872, - "step": 3740 - }, - { - "epoch": 0.8760233918128655, - "grad_norm": 0.609566488590824, - "learning_rate": 6.8648568112718606e-06, - "loss": 0.6729, - "step": 3745 - }, - { - "epoch": 0.8771929824561403, - "grad_norm": 0.6371626902320939, - "learning_rate": 6.830319589570722e-06, - "loss": 0.6718, - "step": 3750 - }, - { - "epoch": 0.8783625730994152, - "grad_norm": 0.6378883436857535, - "learning_rate": 6.796091615937747e-06, - "loss": 0.6601, - "step": 3755 - }, - { - "epoch": 0.87953216374269, - "grad_norm": 0.6586370697757771, - "learning_rate": 6.76217340247253e-06, - "loss": 0.6709, - "step": 3760 - }, - { - "epoch": 0.8807017543859649, - "grad_norm": 0.6137769168492903, - "learning_rate": 6.728565456640189e-06, - "loss": 0.6776, - "step": 3765 - }, - { - "epoch": 0.8818713450292398, - "grad_norm": 0.6101836408040017, - "learning_rate": 6.695268281263803e-06, - "loss": 0.6901, - "step": 3770 - }, - { - "epoch": 0.8830409356725146, - "grad_norm": 0.6329628323094153, - "learning_rate": 6.6622823745168844e-06, - "loss": 0.6897, - "step": 3775 - }, - { - "epoch": 0.8842105263157894, - "grad_norm": 0.6462015161934214, - "learning_rate": 6.629608229915907e-06, - "loss": 0.6808, - "step": 3780 - }, - { - "epoch": 0.8853801169590644, - "grad_norm": 0.6218725318180922, - "learning_rate": 6.597246336312947e-06, - "loss": 0.6934, - "step": 3785 - }, - { - "epoch": 0.8865497076023392, - "grad_norm": 0.6325806500322131, - "learning_rate": 6.565197177888353e-06, - "loss": 0.6797, - "step": 3790 - }, - { - "epoch": 0.887719298245614, - "grad_norm": 0.6680853833728703, - "learning_rate": 6.533461234143503e-06, - "loss": 0.6805, - "step": 3795 - }, - { - "epoch": 0.8888888888888888, - "grad_norm": 0.6327716959909178, - "learning_rate": 6.502038979893646e-06, - "loss": 0.6805, - "step": 3800 - }, - { - "epoch": 0.8900584795321638, - "grad_norm": 0.6591393204504788, - "learning_rate": 6.4709308852607755e-06, - "loss": 0.6671, - "step": 3805 - }, - { - "epoch": 0.8912280701754386, - "grad_norm": 0.6952150402441917, - "learning_rate": 6.440137415666606e-06, - "loss": 0.6844, - "step": 3810 - }, - { - "epoch": 0.8923976608187134, - "grad_norm": 0.6353053040972749, - "learning_rate": 6.409659031825618e-06, - "loss": 0.6858, - "step": 3815 - }, - { - "epoch": 0.8935672514619883, - "grad_norm": 0.6049168982376912, - "learning_rate": 6.379496189738146e-06, - "loss": 0.6706, - "step": 3820 - }, - { - "epoch": 0.8947368421052632, - "grad_norm": 0.606413815250955, - "learning_rate": 6.3496493406835835e-06, - "loss": 0.664, - "step": 3825 - }, - { - "epoch": 0.895906432748538, - "grad_norm": 0.6784185660908973, - "learning_rate": 6.320118931213605e-06, - "loss": 0.6732, - "step": 3830 - }, - { - "epoch": 0.8970760233918129, - "grad_norm": 0.6731290698360186, - "learning_rate": 6.290905403145488e-06, - "loss": 0.6769, - "step": 3835 - }, - { - "epoch": 0.8982456140350877, - "grad_norm": 0.6804372100085168, - "learning_rate": 6.262009193555523e-06, - "loss": 0.6741, - "step": 3840 - }, - { - "epoch": 0.8994152046783626, - "grad_norm": 0.6284090122583083, - "learning_rate": 6.233430734772457e-06, - "loss": 0.6779, - "step": 3845 - }, - { - "epoch": 0.9005847953216374, - "grad_norm": 0.6821827933733495, - "learning_rate": 6.205170454371017e-06, - "loss": 0.6665, - "step": 3850 - }, - { - "epoch": 0.9017543859649123, - "grad_norm": 0.6189927650661127, - "learning_rate": 6.1772287751655465e-06, - "loss": 0.6837, - "step": 3855 - }, - { - "epoch": 0.9029239766081871, - "grad_norm": 0.6632044649328058, - "learning_rate": 6.149606115203644e-06, - "loss": 0.6943, - "step": 3860 - }, - { - "epoch": 0.904093567251462, - "grad_norm": 0.620002257673005, - "learning_rate": 6.122302887759918e-06, - "loss": 0.68, - "step": 3865 - }, - { - "epoch": 0.9052631578947369, - "grad_norm": 0.6441357917286101, - "learning_rate": 6.0953195013298255e-06, - "loss": 0.6825, - "step": 3870 - }, - { - "epoch": 0.9064327485380117, - "grad_norm": 2.236180657709731, - "learning_rate": 6.068656359623525e-06, - "loss": 0.6714, - "step": 3875 - }, - { - "epoch": 0.9076023391812865, - "grad_norm": 0.6171661564106794, - "learning_rate": 6.042313861559872e-06, - "loss": 0.6791, - "step": 3880 - }, - { - "epoch": 0.9087719298245615, - "grad_norm": 0.6121228446850321, - "learning_rate": 6.016292401260419e-06, - "loss": 0.6866, - "step": 3885 - }, - { - "epoch": 0.9099415204678363, - "grad_norm": 0.6768230538525649, - "learning_rate": 5.990592368043533e-06, - "loss": 0.6844, - "step": 3890 - }, - { - "epoch": 0.9111111111111111, - "grad_norm": 0.6713551675375381, - "learning_rate": 5.965214146418583e-06, - "loss": 0.6865, - "step": 3895 - }, - { - "epoch": 0.9122807017543859, - "grad_norm": 0.629363611094676, - "learning_rate": 5.9401581160801645e-06, - "loss": 0.6657, - "step": 3900 - }, - { - "epoch": 0.9134502923976608, - "grad_norm": 0.6612627229951298, - "learning_rate": 5.915424651902437e-06, - "loss": 0.6911, - "step": 3905 - }, - { - "epoch": 0.9146198830409357, - "grad_norm": 0.6167590167685036, - "learning_rate": 5.891014123933495e-06, - "loss": 0.6764, - "step": 3910 - }, - { - "epoch": 0.9157894736842105, - "grad_norm": 0.6280604076161018, - "learning_rate": 5.866926897389862e-06, - "loss": 0.6725, - "step": 3915 - }, - { - "epoch": 0.9169590643274854, - "grad_norm": 0.6363040677586755, - "learning_rate": 5.8431633326509895e-06, - "loss": 0.6839, - "step": 3920 - }, - { - "epoch": 0.9181286549707602, - "grad_norm": 0.6233483061515912, - "learning_rate": 5.819723785253901e-06, - "loss": 0.67, - "step": 3925 - }, - { - "epoch": 0.9192982456140351, - "grad_norm": 0.6151583619535353, - "learning_rate": 5.796608605887838e-06, - "loss": 0.6695, - "step": 3930 - }, - { - "epoch": 0.92046783625731, - "grad_norm": 0.6038641602208659, - "learning_rate": 5.773818140389052e-06, - "loss": 0.6741, - "step": 3935 - }, - { - "epoch": 0.9216374269005848, - "grad_norm": 0.5940675664971363, - "learning_rate": 5.751352729735594e-06, - "loss": 0.6845, - "step": 3940 - }, - { - "epoch": 0.9228070175438596, - "grad_norm": 0.619934570260161, - "learning_rate": 5.729212710042228e-06, - "loss": 0.6749, - "step": 3945 - }, - { - "epoch": 0.9239766081871345, - "grad_norm": 0.6403366226216299, - "learning_rate": 5.707398412555415e-06, - "loss": 0.6787, - "step": 3950 - }, - { - "epoch": 0.9251461988304094, - "grad_norm": 0.6392263250591611, - "learning_rate": 5.685910163648331e-06, - "loss": 0.6751, - "step": 3955 - }, - { - "epoch": 0.9263157894736842, - "grad_norm": 0.6037302371230343, - "learning_rate": 5.664748284815999e-06, - "loss": 0.6745, - "step": 3960 - }, - { - "epoch": 0.927485380116959, - "grad_norm": 0.6454567383685781, - "learning_rate": 5.6439130926704926e-06, - "loss": 0.6875, - "step": 3965 - }, - { - "epoch": 0.928654970760234, - "grad_norm": 0.6680699529149163, - "learning_rate": 5.623404898936162e-06, - "loss": 0.6682, - "step": 3970 - }, - { - "epoch": 0.9298245614035088, - "grad_norm": 0.6464856994915182, - "learning_rate": 5.603224010445013e-06, - "loss": 0.6853, - "step": 3975 - }, - { - "epoch": 0.9309941520467836, - "grad_norm": 0.6425767818922832, - "learning_rate": 5.5833707291320785e-06, - "loss": 0.68, - "step": 3980 - }, - { - "epoch": 0.9321637426900585, - "grad_norm": 0.662544717871915, - "learning_rate": 5.563845352030928e-06, - "loss": 0.6757, - "step": 3985 - }, - { - "epoch": 0.9333333333333333, - "grad_norm": 0.6297971690649151, - "learning_rate": 5.544648171269207e-06, - "loss": 0.6907, - "step": 3990 - }, - { - "epoch": 0.9345029239766082, - "grad_norm": 0.6337592219063408, - "learning_rate": 5.525779474064284e-06, - "loss": 0.6799, - "step": 3995 - }, - { - "epoch": 0.935672514619883, - "grad_norm": 0.6059097225565415, - "learning_rate": 5.507239542718928e-06, - "loss": 0.6764, - "step": 4000 - }, - { - "epoch": 0.9368421052631579, - "grad_norm": 0.644532218298727, - "learning_rate": 5.48902865461711e-06, - "loss": 0.6902, - "step": 4005 - }, - { - "epoch": 0.9380116959064327, - "grad_norm": 0.6775880658086887, - "learning_rate": 5.471147082219839e-06, - "loss": 0.66, - "step": 4010 - }, - { - "epoch": 0.9391812865497076, - "grad_norm": 0.6140932767025524, - "learning_rate": 5.453595093061084e-06, - "loss": 0.6827, - "step": 4015 - }, - { - "epoch": 0.9403508771929825, - "grad_norm": 0.6442340334722829, - "learning_rate": 5.436372949743784e-06, - "loss": 0.6721, - "step": 4020 - }, - { - "epoch": 0.9415204678362573, - "grad_norm": 0.6188202885625294, - "learning_rate": 5.4194809099359016e-06, - "loss": 0.6723, - "step": 4025 - }, - { - "epoch": 0.9426900584795321, - "grad_norm": 0.6015661608114689, - "learning_rate": 5.402919226366589e-06, - "loss": 0.6683, - "step": 4030 - }, - { - "epoch": 0.9438596491228071, - "grad_norm": 0.6298388121183706, - "learning_rate": 5.386688146822386e-06, - "loss": 0.6669, - "step": 4035 - }, - { - "epoch": 0.9450292397660819, - "grad_norm": 0.6643089494356405, - "learning_rate": 5.370787914143523e-06, - "loss": 0.6758, - "step": 4040 - }, - { - "epoch": 0.9461988304093567, - "grad_norm": 0.7117885834075908, - "learning_rate": 5.355218766220295e-06, - "loss": 0.6747, - "step": 4045 - }, - { - "epoch": 0.9473684210526315, - "grad_norm": 1.069267992100781, - "learning_rate": 5.3399809359894815e-06, - "loss": 0.6923, - "step": 4050 - }, - { - "epoch": 0.9485380116959065, - "grad_norm": 0.6155712114839152, - "learning_rate": 5.325074651430884e-06, - "loss": 0.6778, - "step": 4055 - }, - { - "epoch": 0.9497076023391813, - "grad_norm": 0.6171532400917544, - "learning_rate": 5.3105001355638965e-06, - "loss": 0.6702, - "step": 4060 - }, - { - "epoch": 0.9508771929824561, - "grad_norm": 0.6746041528422164, - "learning_rate": 5.296257606444188e-06, - "loss": 0.6903, - "step": 4065 - }, - { - "epoch": 0.952046783625731, - "grad_norm": 0.6261575844875003, - "learning_rate": 5.2823472771604235e-06, - "loss": 0.6792, - "step": 4070 - }, - { - "epoch": 0.9532163742690059, - "grad_norm": 0.6213266374401143, - "learning_rate": 5.268769355831078e-06, - "loss": 0.6756, - "step": 4075 - }, - { - "epoch": 0.9543859649122807, - "grad_norm": 0.6235253860423335, - "learning_rate": 5.255524045601336e-06, - "loss": 0.6695, - "step": 4080 - }, - { - "epoch": 0.9555555555555556, - "grad_norm": 0.6199005302450762, - "learning_rate": 5.2426115446400375e-06, - "loss": 0.674, - "step": 4085 - }, - { - "epoch": 0.9567251461988304, - "grad_norm": 0.6472698916651094, - "learning_rate": 5.230032046136718e-06, - "loss": 0.677, - "step": 4090 - }, - { - "epoch": 0.9578947368421052, - "grad_norm": 0.6508423399986626, - "learning_rate": 5.217785738298716e-06, - "loss": 0.6818, - "step": 4095 - }, - { - "epoch": 0.9590643274853801, - "grad_norm": 0.6272752070860681, - "learning_rate": 5.20587280434837e-06, - "loss": 0.6655, - "step": 4100 - }, - { - "epoch": 0.960233918128655, - "grad_norm": 0.6249813913743417, - "learning_rate": 5.1942934225202566e-06, - "loss": 0.6649, - "step": 4105 - }, - { - "epoch": 0.9614035087719298, - "grad_norm": 0.6088008404693435, - "learning_rate": 5.183047766058537e-06, - "loss": 0.6673, - "step": 4110 - }, - { - "epoch": 0.9625730994152046, - "grad_norm": 0.6157262304244052, - "learning_rate": 5.172136003214364e-06, - "loss": 0.661, - "step": 4115 - }, - { - "epoch": 0.9637426900584796, - "grad_norm": 0.6202554098999495, - "learning_rate": 5.16155829724336e-06, - "loss": 0.6788, - "step": 4120 - }, - { - "epoch": 0.9649122807017544, - "grad_norm": 0.6460466051369329, - "learning_rate": 5.151314806403183e-06, - "loss": 0.6722, - "step": 4125 - }, - { - "epoch": 0.9660818713450292, - "grad_norm": 0.6197826502012264, - "learning_rate": 5.141405683951148e-06, - "loss": 0.6757, - "step": 4130 - }, - { - "epoch": 0.9672514619883041, - "grad_norm": 0.6430775828131704, - "learning_rate": 5.13183107814194e-06, - "loss": 0.6714, - "step": 4135 - }, - { - "epoch": 0.968421052631579, - "grad_norm": 0.6716734720725397, - "learning_rate": 5.1225911322253975e-06, - "loss": 0.6689, - "step": 4140 - }, - { - "epoch": 0.9695906432748538, - "grad_norm": 0.7005363493700107, - "learning_rate": 5.113685984444362e-06, - "loss": 0.6775, - "step": 4145 - }, - { - "epoch": 0.9707602339181286, - "grad_norm": 0.6210358032380061, - "learning_rate": 5.105115768032622e-06, - "loss": 0.6705, - "step": 4150 - }, - { - "epoch": 0.9719298245614035, - "grad_norm": 0.6552414802641572, - "learning_rate": 5.096880611212908e-06, - "loss": 0.6747, - "step": 4155 - }, - { - "epoch": 0.9730994152046784, - "grad_norm": 0.61884873350533, - "learning_rate": 5.0889806371949775e-06, - "loss": 0.6682, - "step": 4160 - }, - { - "epoch": 0.9742690058479532, - "grad_norm": 0.611707842587258, - "learning_rate": 5.081415964173772e-06, - "loss": 0.6815, - "step": 4165 - }, - { - "epoch": 0.9754385964912281, - "grad_norm": 0.6309765692328979, - "learning_rate": 5.074186705327656e-06, - "loss": 0.6673, - "step": 4170 - }, - { - "epoch": 0.9766081871345029, - "grad_norm": 0.6488169707206964, - "learning_rate": 5.067292968816706e-06, - "loss": 0.6785, - "step": 4175 - }, - { - "epoch": 0.9777777777777777, - "grad_norm": 0.6039253712245951, - "learning_rate": 5.060734857781115e-06, - "loss": 0.6771, - "step": 4180 - }, - { - "epoch": 0.9789473684210527, - "grad_norm": 0.6120032767383402, - "learning_rate": 5.0545124703396265e-06, - "loss": 0.6723, - "step": 4185 - }, - { - "epoch": 0.9801169590643275, - "grad_norm": 0.6868334998745559, - "learning_rate": 5.048625899588081e-06, - "loss": 0.6613, - "step": 4190 - }, - { - "epoch": 0.9812865497076023, - "grad_norm": 0.6296139609807546, - "learning_rate": 5.043075233598026e-06, - "loss": 0.6633, - "step": 4195 - }, - { - "epoch": 0.9824561403508771, - "grad_norm": 0.6349010729713203, - "learning_rate": 5.037860555415383e-06, - "loss": 0.6718, - "step": 4200 - }, - { - "epoch": 0.9836257309941521, - "grad_norm": 0.6739022750199818, - "learning_rate": 5.032981943059216e-06, - "loss": 0.6772, - "step": 4205 - }, - { - "epoch": 0.9847953216374269, - "grad_norm": 0.650862024747413, - "learning_rate": 5.028439469520571e-06, - "loss": 0.6711, - "step": 4210 - }, - { - "epoch": 0.9859649122807017, - "grad_norm": 0.6323528177012202, - "learning_rate": 5.024233202761362e-06, - "loss": 0.6672, - "step": 4215 - }, - { - "epoch": 0.9871345029239766, - "grad_norm": 0.6237317952427651, - "learning_rate": 5.020363205713377e-06, - "loss": 0.661, - "step": 4220 - }, - { - "epoch": 0.9883040935672515, - "grad_norm": 0.6005441441484929, - "learning_rate": 5.016829536277317e-06, - "loss": 0.6823, - "step": 4225 - }, - { - "epoch": 0.9894736842105263, - "grad_norm": 0.6095441780661974, - "learning_rate": 5.0136322473219525e-06, - "loss": 0.6764, - "step": 4230 - }, - { - "epoch": 0.9906432748538012, - "grad_norm": 0.6162120041059115, - "learning_rate": 5.010771386683308e-06, - "loss": 0.6639, - "step": 4235 - }, - { - "epoch": 0.991812865497076, - "grad_norm": 0.6852116289986068, - "learning_rate": 5.008246997163965e-06, - "loss": 0.6782, - "step": 4240 - }, - { - "epoch": 0.9929824561403509, - "grad_norm": 0.6310290602971912, - "learning_rate": 5.006059116532412e-06, - "loss": 0.6858, - "step": 4245 - }, - { - "epoch": 0.9941520467836257, - "grad_norm": 0.6277161545850843, - "learning_rate": 5.00420777752248e-06, - "loss": 0.6857, - "step": 4250 - }, - { - "epoch": 0.9953216374269006, - "grad_norm": 0.627301841338485, - "learning_rate": 5.002693007832853e-06, - "loss": 0.6743, - "step": 4255 - }, - { - "epoch": 0.9964912280701754, - "grad_norm": 0.6332268702872285, - "learning_rate": 5.0015148301266646e-06, - "loss": 0.6542, - "step": 4260 - }, - { - "epoch": 0.9976608187134502, - "grad_norm": 0.6299167135136249, - "learning_rate": 5.000673262031141e-06, - "loss": 0.6787, - "step": 4265 - }, - { - "epoch": 0.9988304093567252, - "grad_norm": 0.6180934069153444, - "learning_rate": 5.000168316137349e-06, - "loss": 0.6723, - "step": 4270 - }, - { - "epoch": 1.0, - "grad_norm": 0.6381738406824355, - "learning_rate": 5e-06, - "loss": 0.6605, - "step": 4275 - }, { "epoch": 1.0, - "step": 4275, - "total_flos": 588699087667200.0, - "train_loss": 0.7670444376984535, - "train_runtime": 29567.5701, - "train_samples_per_second": 18.507, - "train_steps_per_second": 0.145 + "step": 268, + "total_flos": 590489283723264.0, + "train_loss": 1.0968534403772496, + "train_runtime": 7318.0016, + "train_samples_per_second": 4.68, + "train_steps_per_second": 0.037 } ], "logging_steps": 5, - "max_steps": 4275, + "max_steps": 268, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, @@ -6020,7 +406,7 @@ "attributes": {} } }, - "total_flos": 588699087667200.0, + "total_flos": 590489283723264.0, "train_batch_size": 16, "trial_name": null, "trial_params": null