{ "best_metric": null, "best_model_checkpoint": null, "epoch": 100.0, "eval_steps": 900, "global_step": 22500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10222222222222223, "grad_norm": 6382.4189453125, "learning_rate": 3.4074074074074077e-06, "loss": 306.5843, "step": 23 }, { "epoch": 0.20444444444444446, "grad_norm": 154.1435546875, "learning_rate": 6.814814814814815e-06, "loss": 62.983, "step": 46 }, { "epoch": 0.30666666666666664, "grad_norm": 10.364509582519531, "learning_rate": 1.0222222222222223e-05, "loss": 7.6227, "step": 69 }, { "epoch": 0.4088888888888889, "grad_norm": 13.348139762878418, "learning_rate": 1.362962962962963e-05, "loss": 7.322, "step": 92 }, { "epoch": 0.5111111111111111, "grad_norm": 1.03432297706604, "learning_rate": 1.7037037037037038e-05, "loss": 7.2767, "step": 115 }, { "epoch": 0.6133333333333333, "grad_norm": 2.311262369155884, "learning_rate": 2.0444444444444446e-05, "loss": 7.2605, "step": 138 }, { "epoch": 0.7155555555555555, "grad_norm": 1.2174512147903442, "learning_rate": 2.3851851851851854e-05, "loss": 7.2589, "step": 161 }, { "epoch": 0.8177777777777778, "grad_norm": 1.1917160749435425, "learning_rate": 2.725925925925926e-05, "loss": 7.2573, "step": 184 }, { "epoch": 0.92, "grad_norm": 0.802689254283905, "learning_rate": 3.066666666666667e-05, "loss": 7.2555, "step": 207 }, { "epoch": 1.0222222222222221, "grad_norm": 0.8915572762489319, "learning_rate": 3.4074074074074077e-05, "loss": 7.2539, "step": 230 }, { "epoch": 1.1244444444444444, "grad_norm": 1.1943933963775635, "learning_rate": 3.7481481481481484e-05, "loss": 7.2509, "step": 253 }, { "epoch": 1.2266666666666666, "grad_norm": 0.9069448709487915, "learning_rate": 4.088888888888889e-05, "loss": 7.2492, "step": 276 }, { "epoch": 1.3288888888888888, "grad_norm": 0.7575523853302002, "learning_rate": 4.42962962962963e-05, "loss": 7.2472, "step": 299 }, { "epoch": 1.431111111111111, "grad_norm": 0.5182924866676331, "learning_rate": 4.770370370370371e-05, "loss": 7.2453, "step": 322 }, { "epoch": 1.5333333333333332, "grad_norm": 0.5943706631660461, "learning_rate": 5.111111111111111e-05, "loss": 7.2433, "step": 345 }, { "epoch": 1.6355555555555554, "grad_norm": 0.5987505912780762, "learning_rate": 5.451851851851852e-05, "loss": 7.2415, "step": 368 }, { "epoch": 1.7377777777777776, "grad_norm": 0.8691719770431519, "learning_rate": 5.792592592592593e-05, "loss": 7.2388, "step": 391 }, { "epoch": 1.8399999999999999, "grad_norm": 0.9830289483070374, "learning_rate": 6.133333333333334e-05, "loss": 7.2337, "step": 414 }, { "epoch": 1.942222222222222, "grad_norm": 0.9140754342079163, "learning_rate": 6.474074074074075e-05, "loss": 7.2245, "step": 437 }, { "epoch": 2.0444444444444443, "grad_norm": 1.7775914669036865, "learning_rate": 6.814814814814815e-05, "loss": 7.2109, "step": 460 }, { "epoch": 2.1466666666666665, "grad_norm": 1.317034363746643, "learning_rate": 7.155555555555555e-05, "loss": 7.1871, "step": 483 }, { "epoch": 2.2488888888888887, "grad_norm": 1.6885732412338257, "learning_rate": 7.496296296296297e-05, "loss": 7.1349, "step": 506 }, { "epoch": 2.351111111111111, "grad_norm": 2.464526653289795, "learning_rate": 7.837037037037037e-05, "loss": 7.0371, "step": 529 }, { "epoch": 2.453333333333333, "grad_norm": 3.4147374629974365, "learning_rate": 8.177777777777778e-05, "loss": 6.9365, "step": 552 }, { "epoch": 2.5555555555555554, "grad_norm": 3.6264212131500244, "learning_rate": 8.518518518518518e-05, "loss": 6.8548, "step": 575 }, { "epoch": 2.6577777777777776, "grad_norm": 2.2979955673217773, "learning_rate": 8.85925925925926e-05, "loss": 6.7637, "step": 598 }, { "epoch": 2.76, "grad_norm": 2.969346046447754, "learning_rate": 9.200000000000001e-05, "loss": 6.6778, "step": 621 }, { "epoch": 2.862222222222222, "grad_norm": 4.26610803604126, "learning_rate": 9.540740740740741e-05, "loss": 6.5954, "step": 644 }, { "epoch": 2.964444444444444, "grad_norm": 1.7684084177017212, "learning_rate": 9.881481481481482e-05, "loss": 6.5164, "step": 667 }, { "epoch": 3.066666666666667, "grad_norm": 3.720853090286255, "learning_rate": 9.999988344964554e-05, "loss": 6.4356, "step": 690 }, { "epoch": 3.168888888888889, "grad_norm": 2.5611510276794434, "learning_rate": 9.99992520072995e-05, "loss": 6.3594, "step": 713 }, { "epoch": 3.2711111111111113, "grad_norm": 5.3843255043029785, "learning_rate": 9.999807252777301e-05, "loss": 6.3057, "step": 736 }, { "epoch": 3.3733333333333335, "grad_norm": 4.412026882171631, "learning_rate": 9.999634502399426e-05, "loss": 6.25, "step": 759 }, { "epoch": 3.4755555555555557, "grad_norm": 3.188660144805908, "learning_rate": 9.999406951489825e-05, "loss": 6.1975, "step": 782 }, { "epoch": 3.5777777777777775, "grad_norm": 4.5765156745910645, "learning_rate": 9.999124602542662e-05, "loss": 6.1516, "step": 805 }, { "epoch": 3.68, "grad_norm": 5.967836856842041, "learning_rate": 9.998787458652739e-05, "loss": 6.1038, "step": 828 }, { "epoch": 3.7822222222222224, "grad_norm": 6.038416385650635, "learning_rate": 9.998395523515457e-05, "loss": 6.078, "step": 851 }, { "epoch": 3.8844444444444446, "grad_norm": 2.577953577041626, "learning_rate": 9.997948801426783e-05, "loss": 6.0297, "step": 874 }, { "epoch": 3.986666666666667, "grad_norm": 3.8739564418792725, "learning_rate": 9.997447297283196e-05, "loss": 5.9847, "step": 897 }, { "epoch": 4.088888888888889, "grad_norm": 5.759775161743164, "learning_rate": 9.996891016581633e-05, "loss": 5.9452, "step": 920 }, { "epoch": 4.191111111111111, "grad_norm": 5.758726596832275, "learning_rate": 9.996279965419441e-05, "loss": 5.9283, "step": 943 }, { "epoch": 4.293333333333333, "grad_norm": 3.345691204071045, "learning_rate": 9.995614150494293e-05, "loss": 5.8792, "step": 966 }, { "epoch": 4.395555555555555, "grad_norm": 5.426297664642334, "learning_rate": 9.994893579104123e-05, "loss": 5.8526, "step": 989 }, { "epoch": 4.497777777777777, "grad_norm": 4.649121284484863, "learning_rate": 9.994118259147049e-05, "loss": 5.8266, "step": 1012 }, { "epoch": 4.6, "grad_norm": 5.175451278686523, "learning_rate": 9.993288199121283e-05, "loss": 5.8114, "step": 1035 }, { "epoch": 4.702222222222222, "grad_norm": 4.655645370483398, "learning_rate": 9.992403408125033e-05, "loss": 5.7801, "step": 1058 }, { "epoch": 4.804444444444444, "grad_norm": 5.830355644226074, "learning_rate": 9.991463895856414e-05, "loss": 5.7576, "step": 1081 }, { "epoch": 4.906666666666666, "grad_norm": 3.2799057960510254, "learning_rate": 9.990469672613331e-05, "loss": 5.7327, "step": 1104 }, { "epoch": 5.0088888888888885, "grad_norm": 5.891563415527344, "learning_rate": 9.989420749293375e-05, "loss": 5.7139, "step": 1127 }, { "epoch": 5.111111111111111, "grad_norm": 6.125003337860107, "learning_rate": 9.988317137393697e-05, "loss": 5.6823, "step": 1150 }, { "epoch": 5.213333333333333, "grad_norm": 4.9209442138671875, "learning_rate": 9.987158849010885e-05, "loss": 5.6534, "step": 1173 }, { "epoch": 5.315555555555555, "grad_norm": 3.9249610900878906, "learning_rate": 9.985945896840829e-05, "loss": 5.6601, "step": 1196 }, { "epoch": 5.417777777777777, "grad_norm": 7.975271701812744, "learning_rate": 9.984678294178589e-05, "loss": 5.6278, "step": 1219 }, { "epoch": 5.52, "grad_norm": 4.072458267211914, "learning_rate": 9.983356054918238e-05, "loss": 5.6104, "step": 1242 }, { "epoch": 5.622222222222222, "grad_norm": 5.122928142547607, "learning_rate": 9.981979193552721e-05, "loss": 5.5991, "step": 1265 }, { "epoch": 5.724444444444444, "grad_norm": 6.029202461242676, "learning_rate": 9.980547725173685e-05, "loss": 5.5761, "step": 1288 }, { "epoch": 5.826666666666666, "grad_norm": 4.795958042144775, "learning_rate": 9.979061665471326e-05, "loss": 5.5573, "step": 1311 }, { "epoch": 5.928888888888888, "grad_norm": 3.8007431030273438, "learning_rate": 9.977521030734203e-05, "loss": 5.5274, "step": 1334 }, { "epoch": 6.0311111111111115, "grad_norm": 5.163888931274414, "learning_rate": 9.975925837849073e-05, "loss": 5.5212, "step": 1357 }, { "epoch": 6.133333333333334, "grad_norm": 5.857538223266602, "learning_rate": 9.9742761043007e-05, "loss": 5.5039, "step": 1380 }, { "epoch": 6.235555555555556, "grad_norm": 4.817676067352295, "learning_rate": 9.972571848171657e-05, "loss": 5.4863, "step": 1403 }, { "epoch": 6.337777777777778, "grad_norm": 4.5216450691223145, "learning_rate": 9.97081308814214e-05, "loss": 5.4866, "step": 1426 }, { "epoch": 6.44, "grad_norm": 5.7964630126953125, "learning_rate": 9.968999843489754e-05, "loss": 5.4544, "step": 1449 }, { "epoch": 6.542222222222223, "grad_norm": 7.403745174407959, "learning_rate": 9.967132134089309e-05, "loss": 5.4383, "step": 1472 }, { "epoch": 6.644444444444445, "grad_norm": 5.906863689422607, "learning_rate": 9.965209980412593e-05, "loss": 5.4435, "step": 1495 }, { "epoch": 6.746666666666667, "grad_norm": 4.985208511352539, "learning_rate": 9.963233403528154e-05, "loss": 5.4271, "step": 1518 }, { "epoch": 6.848888888888889, "grad_norm": 5.670632839202881, "learning_rate": 9.96120242510107e-05, "loss": 5.4023, "step": 1541 }, { "epoch": 6.9511111111111115, "grad_norm": 4.155480861663818, "learning_rate": 9.959117067392709e-05, "loss": 5.3781, "step": 1564 }, { "epoch": 7.053333333333334, "grad_norm": 6.202167987823486, "learning_rate": 9.95697735326048e-05, "loss": 5.3696, "step": 1587 }, { "epoch": 7.155555555555556, "grad_norm": 5.649682998657227, "learning_rate": 9.954783306157595e-05, "loss": 5.3255, "step": 1610 }, { "epoch": 7.257777777777778, "grad_norm": 6.699223518371582, "learning_rate": 9.952534950132802e-05, "loss": 5.3186, "step": 1633 }, { "epoch": 7.36, "grad_norm": 5.433987140655518, "learning_rate": 9.95023230983012e-05, "loss": 5.3147, "step": 1656 }, { "epoch": 7.4622222222222225, "grad_norm": 4.822690010070801, "learning_rate": 9.947875410488581e-05, "loss": 5.3022, "step": 1679 }, { "epoch": 7.564444444444445, "grad_norm": 5.345188617706299, "learning_rate": 9.945464277941939e-05, "loss": 5.2828, "step": 1702 }, { "epoch": 7.666666666666667, "grad_norm": 4.902531623840332, "learning_rate": 9.942998938618394e-05, "loss": 5.2818, "step": 1725 }, { "epoch": 7.768888888888889, "grad_norm": 7.8368353843688965, "learning_rate": 9.940479419540304e-05, "loss": 5.2735, "step": 1748 }, { "epoch": 7.871111111111111, "grad_norm": 5.669989585876465, "learning_rate": 9.937905748323883e-05, "loss": 5.2554, "step": 1771 }, { "epoch": 7.973333333333334, "grad_norm": 4.463327407836914, "learning_rate": 9.935277953178905e-05, "loss": 5.2421, "step": 1794 }, { "epoch": 8.075555555555555, "grad_norm": 4.2700629234313965, "learning_rate": 9.93259606290839e-05, "loss": 5.1956, "step": 1817 }, { "epoch": 8.177777777777777, "grad_norm": 5.543842315673828, "learning_rate": 9.929860106908289e-05, "loss": 5.1719, "step": 1840 }, { "epoch": 8.28, "grad_norm": 10.465546607971191, "learning_rate": 9.927070115167161e-05, "loss": 5.1691, "step": 1863 }, { "epoch": 8.382222222222222, "grad_norm": 5.517487525939941, "learning_rate": 9.924226118265849e-05, "loss": 5.1431, "step": 1886 }, { "epoch": 8.484444444444444, "grad_norm": 6.022068977355957, "learning_rate": 9.921328147377143e-05, "loss": 5.1507, "step": 1909 }, { "epoch": 8.586666666666666, "grad_norm": 4.770472526550293, "learning_rate": 9.918376234265428e-05, "loss": 5.1385, "step": 1932 }, { "epoch": 8.688888888888888, "grad_norm": 6.177302360534668, "learning_rate": 9.915370411286356e-05, "loss": 5.1091, "step": 1955 }, { "epoch": 8.79111111111111, "grad_norm": 6.306371688842773, "learning_rate": 9.912310711386473e-05, "loss": 5.1276, "step": 1978 }, { "epoch": 8.893333333333333, "grad_norm": 7.086174488067627, "learning_rate": 9.909197168102867e-05, "loss": 5.0997, "step": 2001 }, { "epoch": 8.995555555555555, "grad_norm": 5.590447902679443, "learning_rate": 9.906029815562797e-05, "loss": 5.0776, "step": 2024 }, { "epoch": 9.097777777777777, "grad_norm": 4.786597728729248, "learning_rate": 9.902808688483323e-05, "loss": 5.0244, "step": 2047 }, { "epoch": 9.2, "grad_norm": 7.7961015701293945, "learning_rate": 9.899533822170922e-05, "loss": 5.0232, "step": 2070 }, { "epoch": 9.302222222222222, "grad_norm": 5.857214450836182, "learning_rate": 9.896205252521099e-05, "loss": 5.0213, "step": 2093 }, { "epoch": 9.404444444444444, "grad_norm": 6.194970607757568, "learning_rate": 9.892823016017999e-05, "loss": 4.984, "step": 2116 }, { "epoch": 9.506666666666666, "grad_norm": 7.040445804595947, "learning_rate": 9.889387149734004e-05, "loss": 4.9845, "step": 2139 }, { "epoch": 9.608888888888888, "grad_norm": 6.245872497558594, "learning_rate": 9.885897691329327e-05, "loss": 4.9771, "step": 2162 }, { "epoch": 9.71111111111111, "grad_norm": 4.590968608856201, "learning_rate": 9.882354679051598e-05, "loss": 4.9565, "step": 2185 }, { "epoch": 9.813333333333333, "grad_norm": 5.94847297668457, "learning_rate": 9.87875815173545e-05, "loss": 4.9531, "step": 2208 }, { "epoch": 9.915555555555555, "grad_norm": 8.10450267791748, "learning_rate": 9.875108148802082e-05, "loss": 4.9557, "step": 2231 }, { "epoch": 10.017777777777777, "grad_norm": 5.512363910675049, "learning_rate": 9.871404710258841e-05, "loss": 4.9295, "step": 2254 }, { "epoch": 10.12, "grad_norm": 5.455718517303467, "learning_rate": 9.867647876698775e-05, "loss": 4.8753, "step": 2277 }, { "epoch": 10.222222222222221, "grad_norm": 5.959130764007568, "learning_rate": 9.86383768930019e-05, "loss": 4.8732, "step": 2300 }, { "epoch": 10.324444444444444, "grad_norm": 6.239514350891113, "learning_rate": 9.859974189826198e-05, "loss": 4.8707, "step": 2323 }, { "epoch": 10.426666666666666, "grad_norm": 7.127731800079346, "learning_rate": 9.856057420624259e-05, "loss": 4.846, "step": 2346 }, { "epoch": 10.528888888888888, "grad_norm": 6.327420234680176, "learning_rate": 9.852087424625717e-05, "loss": 4.8457, "step": 2369 }, { "epoch": 10.63111111111111, "grad_norm": 6.398340225219727, "learning_rate": 9.848064245345333e-05, "loss": 4.8295, "step": 2392 }, { "epoch": 10.733333333333333, "grad_norm": 5.890859603881836, "learning_rate": 9.843987926880803e-05, "loss": 4.8091, "step": 2415 }, { "epoch": 10.835555555555555, "grad_norm": 7.191392421722412, "learning_rate": 9.839858513912276e-05, "loss": 4.8022, "step": 2438 }, { "epoch": 10.937777777777779, "grad_norm": 6.238222122192383, "learning_rate": 9.835676051701867e-05, "loss": 4.7898, "step": 2461 }, { "epoch": 11.04, "grad_norm": 6.7246551513671875, "learning_rate": 9.831440586093157e-05, "loss": 4.7692, "step": 2484 }, { "epoch": 11.142222222222221, "grad_norm": 5.07949161529541, "learning_rate": 9.827152163510693e-05, "loss": 4.7251, "step": 2507 }, { "epoch": 11.244444444444444, "grad_norm": 7.340390682220459, "learning_rate": 9.82281083095948e-05, "loss": 4.7188, "step": 2530 }, { "epoch": 11.346666666666668, "grad_norm": 5.695153713226318, "learning_rate": 9.818416636024461e-05, "loss": 4.7111, "step": 2553 }, { "epoch": 11.448888888888888, "grad_norm": 5.70296573638916, "learning_rate": 9.813969626870002e-05, "loss": 4.7043, "step": 2576 }, { "epoch": 11.551111111111112, "grad_norm": 5.775058269500732, "learning_rate": 9.809469852239359e-05, "loss": 4.6924, "step": 2599 }, { "epoch": 11.653333333333332, "grad_norm": 7.319630146026611, "learning_rate": 9.804917361454145e-05, "loss": 4.6848, "step": 2622 }, { "epoch": 11.755555555555556, "grad_norm": 7.945709705352783, "learning_rate": 9.800312204413793e-05, "loss": 4.6667, "step": 2645 }, { "epoch": 11.857777777777777, "grad_norm": 7.591863632202148, "learning_rate": 9.795654431595e-05, "loss": 4.6778, "step": 2668 }, { "epoch": 11.96, "grad_norm": 6.433276653289795, "learning_rate": 9.790944094051187e-05, "loss": 4.6699, "step": 2691 }, { "epoch": 12.062222222222223, "grad_norm": 6.956933975219727, "learning_rate": 9.786181243411926e-05, "loss": 4.6113, "step": 2714 }, { "epoch": 12.164444444444445, "grad_norm": 5.551136016845703, "learning_rate": 9.781365931882387e-05, "loss": 4.582, "step": 2737 }, { "epoch": 12.266666666666667, "grad_norm": 7.214599609375, "learning_rate": 9.776498212242749e-05, "loss": 4.5932, "step": 2760 }, { "epoch": 12.36888888888889, "grad_norm": 6.5685715675354, "learning_rate": 9.771578137847639e-05, "loss": 4.5896, "step": 2783 }, { "epoch": 12.471111111111112, "grad_norm": 8.017729759216309, "learning_rate": 9.766605762625541e-05, "loss": 4.5579, "step": 2806 }, { "epoch": 12.573333333333334, "grad_norm": 6.021265983581543, "learning_rate": 9.761581141078194e-05, "loss": 4.5715, "step": 2829 }, { "epoch": 12.675555555555556, "grad_norm": 7.427931785583496, "learning_rate": 9.756504328280016e-05, "loss": 4.5681, "step": 2852 }, { "epoch": 12.777777777777779, "grad_norm": 6.325420379638672, "learning_rate": 9.751375379877481e-05, "loss": 4.5695, "step": 2875 }, { "epoch": 12.88, "grad_norm": 4.837381839752197, "learning_rate": 9.746194352088518e-05, "loss": 4.5321, "step": 2898 }, { "epoch": 12.982222222222223, "grad_norm": 6.933470726013184, "learning_rate": 9.740961301701894e-05, "loss": 4.5286, "step": 2921 }, { "epoch": 13.084444444444445, "grad_norm": 5.810832977294922, "learning_rate": 9.73567628607659e-05, "loss": 4.463, "step": 2944 }, { "epoch": 13.186666666666667, "grad_norm": 7.62490177154541, "learning_rate": 9.730339363141175e-05, "loss": 4.462, "step": 2967 }, { "epoch": 13.28888888888889, "grad_norm": 6.67575216293335, "learning_rate": 9.72495059139317e-05, "loss": 4.4402, "step": 2990 }, { "epoch": 13.391111111111112, "grad_norm": 6.110825538635254, "learning_rate": 9.719510029898398e-05, "loss": 4.443, "step": 3013 }, { "epoch": 13.493333333333334, "grad_norm": 7.317692279815674, "learning_rate": 9.714017738290358e-05, "loss": 4.4456, "step": 3036 }, { "epoch": 13.595555555555556, "grad_norm": 6.189058303833008, "learning_rate": 9.708473776769544e-05, "loss": 4.4524, "step": 3059 }, { "epoch": 13.697777777777778, "grad_norm": 5.6017632484436035, "learning_rate": 9.702878206102811e-05, "loss": 4.4234, "step": 3082 }, { "epoch": 13.8, "grad_norm": 5.7952189445495605, "learning_rate": 9.697231087622691e-05, "loss": 4.4016, "step": 3105 }, { "epoch": 13.902222222222223, "grad_norm": 5.7486677169799805, "learning_rate": 9.691532483226723e-05, "loss": 4.4106, "step": 3128 }, { "epoch": 14.004444444444445, "grad_norm": 6.603976249694824, "learning_rate": 9.68578245537679e-05, "loss": 4.367, "step": 3151 }, { "epoch": 14.106666666666667, "grad_norm": 6.593631744384766, "learning_rate": 9.679981067098414e-05, "loss": 4.3122, "step": 3174 }, { "epoch": 14.20888888888889, "grad_norm": 6.519464015960693, "learning_rate": 9.674128381980072e-05, "loss": 4.3038, "step": 3197 }, { "epoch": 14.311111111111112, "grad_norm": 7.691238880157471, "learning_rate": 9.668224464172508e-05, "loss": 4.3305, "step": 3220 }, { "epoch": 14.413333333333334, "grad_norm": 5.136379718780518, "learning_rate": 9.66226937838802e-05, "loss": 4.3137, "step": 3243 }, { "epoch": 14.515555555555556, "grad_norm": 5.727292537689209, "learning_rate": 9.65626318989975e-05, "loss": 4.3126, "step": 3266 }, { "epoch": 14.617777777777778, "grad_norm": 7.882863998413086, "learning_rate": 9.650205964540978e-05, "loss": 4.2942, "step": 3289 }, { "epoch": 14.72, "grad_norm": 7.945621013641357, "learning_rate": 9.64409776870439e-05, "loss": 4.3076, "step": 3312 }, { "epoch": 14.822222222222223, "grad_norm": 6.543049335479736, "learning_rate": 9.637938669341356e-05, "loss": 4.2815, "step": 3335 }, { "epoch": 14.924444444444445, "grad_norm": 5.685489654541016, "learning_rate": 9.631728733961194e-05, "loss": 4.2873, "step": 3358 }, { "epoch": 15.026666666666667, "grad_norm": 5.528294563293457, "learning_rate": 9.625468030630432e-05, "loss": 4.2617, "step": 3381 }, { "epoch": 15.12888888888889, "grad_norm": 7.666279315948486, "learning_rate": 9.619156627972064e-05, "loss": 4.2157, "step": 3404 }, { "epoch": 15.231111111111112, "grad_norm": 6.189380645751953, "learning_rate": 9.612794595164786e-05, "loss": 4.207, "step": 3427 }, { "epoch": 15.333333333333334, "grad_norm": 6.782273292541504, "learning_rate": 9.606382001942255e-05, "loss": 4.1977, "step": 3450 }, { "epoch": 15.435555555555556, "grad_norm": 6.819105625152588, "learning_rate": 9.599918918592313e-05, "loss": 4.2046, "step": 3473 }, { "epoch": 15.537777777777778, "grad_norm": 6.558395862579346, "learning_rate": 9.593405415956216e-05, "loss": 4.1959, "step": 3496 }, { "epoch": 15.64, "grad_norm": 7.579700946807861, "learning_rate": 9.58684156542787e-05, "loss": 4.2004, "step": 3519 }, { "epoch": 15.742222222222223, "grad_norm": 5.998022556304932, "learning_rate": 9.580227438953028e-05, "loss": 4.1972, "step": 3542 }, { "epoch": 15.844444444444445, "grad_norm": 8.631059646606445, "learning_rate": 9.573563109028523e-05, "loss": 4.1674, "step": 3565 }, { "epoch": 15.946666666666667, "grad_norm": 6.702101230621338, "learning_rate": 9.566848648701457e-05, "loss": 4.1303, "step": 3588 }, { "epoch": 16.04888888888889, "grad_norm": 7.247947692871094, "learning_rate": 9.56008413156841e-05, "loss": 4.0834, "step": 3611 }, { "epoch": 16.15111111111111, "grad_norm": 6.5919575691223145, "learning_rate": 9.553269631774631e-05, "loss": 4.0488, "step": 3634 }, { "epoch": 16.253333333333334, "grad_norm": 8.66784381866455, "learning_rate": 9.54640522401322e-05, "loss": 4.0754, "step": 3657 }, { "epoch": 16.355555555555554, "grad_norm": 7.605900764465332, "learning_rate": 9.539490983524316e-05, "loss": 4.0721, "step": 3680 }, { "epoch": 16.45777777777778, "grad_norm": 7.925562381744385, "learning_rate": 9.532526986094273e-05, "loss": 4.0685, "step": 3703 }, { "epoch": 16.56, "grad_norm": 7.180625915527344, "learning_rate": 9.525513308054819e-05, "loss": 4.0581, "step": 3726 }, { "epoch": 16.662222222222223, "grad_norm": 6.261662483215332, "learning_rate": 9.518450026282233e-05, "loss": 4.0405, "step": 3749 }, { "epoch": 16.764444444444443, "grad_norm": 5.739262580871582, "learning_rate": 9.511337218196494e-05, "loss": 4.0315, "step": 3772 }, { "epoch": 16.866666666666667, "grad_norm": 6.229343891143799, "learning_rate": 9.504174961760435e-05, "loss": 4.036, "step": 3795 }, { "epoch": 16.968888888888888, "grad_norm": 7.991888046264648, "learning_rate": 9.496963335478884e-05, "loss": 4.0707, "step": 3818 }, { "epoch": 17.07111111111111, "grad_norm": 5.881919860839844, "learning_rate": 9.489702418397814e-05, "loss": 3.9782, "step": 3841 }, { "epoch": 17.173333333333332, "grad_norm": 5.624960899353027, "learning_rate": 9.482392290103462e-05, "loss": 3.9473, "step": 3864 }, { "epoch": 17.275555555555556, "grad_norm": 5.786345481872559, "learning_rate": 9.475033030721471e-05, "loss": 3.9561, "step": 3887 }, { "epoch": 17.377777777777776, "grad_norm": 7.602824687957764, "learning_rate": 9.467624720916002e-05, "loss": 3.9605, "step": 3910 }, { "epoch": 17.48, "grad_norm": 6.39411735534668, "learning_rate": 9.460167441888854e-05, "loss": 3.9324, "step": 3933 }, { "epoch": 17.58222222222222, "grad_norm": 6.903740882873535, "learning_rate": 9.452661275378576e-05, "loss": 3.9302, "step": 3956 }, { "epoch": 17.684444444444445, "grad_norm": 7.515189170837402, "learning_rate": 9.445106303659562e-05, "loss": 3.911, "step": 3979 }, { "epoch": 17.786666666666665, "grad_norm": 6.514119625091553, "learning_rate": 9.43750260954116e-05, "loss": 3.9168, "step": 4002 }, { "epoch": 17.88888888888889, "grad_norm": 5.5810370445251465, "learning_rate": 9.429850276366758e-05, "loss": 3.9236, "step": 4025 }, { "epoch": 17.99111111111111, "grad_norm": 6.529542446136475, "learning_rate": 9.422149388012875e-05, "loss": 3.9076, "step": 4048 }, { "epoch": 18.093333333333334, "grad_norm": 5.167507171630859, "learning_rate": 9.414400028888235e-05, "loss": 3.8211, "step": 4071 }, { "epoch": 18.195555555555554, "grad_norm": 6.467238426208496, "learning_rate": 9.406602283932845e-05, "loss": 3.8423, "step": 4094 }, { "epoch": 18.297777777777778, "grad_norm": 7.490845203399658, "learning_rate": 9.398756238617071e-05, "loss": 3.8308, "step": 4117 }, { "epoch": 18.4, "grad_norm": 5.916659832000732, "learning_rate": 9.390861978940686e-05, "loss": 3.8273, "step": 4140 }, { "epoch": 18.502222222222223, "grad_norm": 6.601635456085205, "learning_rate": 9.382919591431945e-05, "loss": 3.8316, "step": 4163 }, { "epoch": 18.604444444444443, "grad_norm": 7.86677885055542, "learning_rate": 9.374929163146621e-05, "loss": 3.8223, "step": 4186 }, { "epoch": 18.706666666666667, "grad_norm": 6.863983154296875, "learning_rate": 9.36689078166706e-05, "loss": 3.8244, "step": 4209 }, { "epoch": 18.808888888888887, "grad_norm": 6.3487467765808105, "learning_rate": 9.35880453510122e-05, "loss": 3.7945, "step": 4232 }, { "epoch": 18.91111111111111, "grad_norm": 7.521273612976074, "learning_rate": 9.350670512081702e-05, "loss": 3.8077, "step": 4255 }, { "epoch": 19.013333333333332, "grad_norm": 7.834831714630127, "learning_rate": 9.34248880176478e-05, "loss": 3.7712, "step": 4278 }, { "epoch": 19.115555555555556, "grad_norm": 6.245793342590332, "learning_rate": 9.334259493829423e-05, "loss": 3.6992, "step": 4301 }, { "epoch": 19.217777777777776, "grad_norm": 7.780862808227539, "learning_rate": 9.325982678476317e-05, "loss": 3.6929, "step": 4324 }, { "epoch": 19.32, "grad_norm": 7.378338813781738, "learning_rate": 9.317658446426871e-05, "loss": 3.7204, "step": 4347 }, { "epoch": 19.42222222222222, "grad_norm": 6.953887462615967, "learning_rate": 9.309286888922219e-05, "loss": 3.7305, "step": 4370 }, { "epoch": 19.524444444444445, "grad_norm": 6.669604301452637, "learning_rate": 9.300868097722235e-05, "loss": 3.7116, "step": 4393 }, { "epoch": 19.626666666666665, "grad_norm": 6.265507221221924, "learning_rate": 9.292402165104506e-05, "loss": 3.6961, "step": 4416 }, { "epoch": 19.72888888888889, "grad_norm": 6.823009967803955, "learning_rate": 9.28388918386334e-05, "loss": 3.6913, "step": 4439 }, { "epoch": 19.83111111111111, "grad_norm": 6.928945064544678, "learning_rate": 9.275329247308737e-05, "loss": 3.7144, "step": 4462 }, { "epoch": 19.933333333333334, "grad_norm": 7.16089391708374, "learning_rate": 9.26672244926537e-05, "loss": 3.7168, "step": 4485 }, { "epoch": 20.035555555555554, "grad_norm": 6.4695563316345215, "learning_rate": 9.258068884071559e-05, "loss": 3.655, "step": 4508 }, { "epoch": 20.137777777777778, "grad_norm": 6.902811527252197, "learning_rate": 9.249368646578227e-05, "loss": 3.5855, "step": 4531 }, { "epoch": 20.24, "grad_norm": 7.336968898773193, "learning_rate": 9.24062183214788e-05, "loss": 3.5716, "step": 4554 }, { "epoch": 20.342222222222222, "grad_norm": 6.539813995361328, "learning_rate": 9.231828536653537e-05, "loss": 3.6035, "step": 4577 }, { "epoch": 20.444444444444443, "grad_norm": 8.689528465270996, "learning_rate": 9.222988856477702e-05, "loss": 3.6179, "step": 4600 }, { "epoch": 20.546666666666667, "grad_norm": 8.209162712097168, "learning_rate": 9.214102888511287e-05, "loss": 3.6182, "step": 4623 }, { "epoch": 20.648888888888887, "grad_norm": 7.320056438446045, "learning_rate": 9.20517073015257e-05, "loss": 3.5944, "step": 4646 }, { "epoch": 20.75111111111111, "grad_norm": 7.204301357269287, "learning_rate": 9.196192479306114e-05, "loss": 3.5922, "step": 4669 }, { "epoch": 20.85333333333333, "grad_norm": 7.85291051864624, "learning_rate": 9.187168234381692e-05, "loss": 3.5992, "step": 4692 }, { "epoch": 20.955555555555556, "grad_norm": 6.276856422424316, "learning_rate": 9.178098094293222e-05, "loss": 3.5929, "step": 4715 }, { "epoch": 21.057777777777776, "grad_norm": 5.835750102996826, "learning_rate": 9.168982158457672e-05, "loss": 3.5289, "step": 4738 }, { "epoch": 21.16, "grad_norm": 6.322780609130859, "learning_rate": 9.159820526793969e-05, "loss": 3.4881, "step": 4761 }, { "epoch": 21.26222222222222, "grad_norm": 7.341971397399902, "learning_rate": 9.150613299721916e-05, "loss": 3.4799, "step": 4784 }, { "epoch": 21.364444444444445, "grad_norm": 6.387499809265137, "learning_rate": 9.14136057816107e-05, "loss": 3.4747, "step": 4807 }, { "epoch": 21.466666666666665, "grad_norm": 7.271056175231934, "learning_rate": 9.132062463529665e-05, "loss": 3.4783, "step": 4830 }, { "epoch": 21.56888888888889, "grad_norm": 7.323966026306152, "learning_rate": 9.122719057743473e-05, "loss": 3.4756, "step": 4853 }, { "epoch": 21.67111111111111, "grad_norm": 7.535403251647949, "learning_rate": 9.113330463214699e-05, "loss": 3.4825, "step": 4876 }, { "epoch": 21.773333333333333, "grad_norm": 5.771243095397949, "learning_rate": 9.103896782850865e-05, "loss": 3.4737, "step": 4899 }, { "epoch": 21.875555555555554, "grad_norm": 8.020050048828125, "learning_rate": 9.094418120053667e-05, "loss": 3.4709, "step": 4922 }, { "epoch": 21.977777777777778, "grad_norm": 6.315218448638916, "learning_rate": 9.08489457871785e-05, "loss": 3.4751, "step": 4945 }, { "epoch": 22.08, "grad_norm": 7.77646017074585, "learning_rate": 9.075326263230073e-05, "loss": 3.4052, "step": 4968 }, { "epoch": 22.182222222222222, "grad_norm": 6.549800872802734, "learning_rate": 9.065713278467755e-05, "loss": 3.3815, "step": 4991 }, { "epoch": 22.284444444444443, "grad_norm": 6.499227523803711, "learning_rate": 9.056055729797938e-05, "loss": 3.3818, "step": 5014 }, { "epoch": 22.386666666666667, "grad_norm": 7.827967643737793, "learning_rate": 9.046353723076117e-05, "loss": 3.3781, "step": 5037 }, { "epoch": 22.488888888888887, "grad_norm": 8.61707592010498, "learning_rate": 9.036607364645094e-05, "loss": 3.362, "step": 5060 }, { "epoch": 22.59111111111111, "grad_norm": 8.287631034851074, "learning_rate": 9.026816761333799e-05, "loss": 3.3951, "step": 5083 }, { "epoch": 22.693333333333335, "grad_norm": 6.027954578399658, "learning_rate": 9.016982020456133e-05, "loss": 3.3988, "step": 5106 }, { "epoch": 22.795555555555556, "grad_norm": 5.422713756561279, "learning_rate": 9.00710324980978e-05, "loss": 3.3986, "step": 5129 }, { "epoch": 22.897777777777776, "grad_norm": 6.52266788482666, "learning_rate": 8.997180557675034e-05, "loss": 3.3685, "step": 5152 }, { "epoch": 23.0, "grad_norm": 8.5319242477417, "learning_rate": 8.987214052813604e-05, "loss": 3.3852, "step": 5175 }, { "epoch": 23.102222222222224, "grad_norm": 5.753627300262451, "learning_rate": 8.977203844467432e-05, "loss": 3.2861, "step": 5198 }, { "epoch": 23.204444444444444, "grad_norm": 6.238333225250244, "learning_rate": 8.967150042357484e-05, "loss": 3.297, "step": 5221 }, { "epoch": 23.306666666666665, "grad_norm": 7.126039505004883, "learning_rate": 8.957052756682556e-05, "loss": 3.3114, "step": 5244 }, { "epoch": 23.40888888888889, "grad_norm": 7.4155426025390625, "learning_rate": 8.946912098118066e-05, "loss": 3.3054, "step": 5267 }, { "epoch": 23.511111111111113, "grad_norm": 6.702388763427734, "learning_rate": 8.93672817781483e-05, "loss": 3.2675, "step": 5290 }, { "epoch": 23.613333333333333, "grad_norm": 7.878185272216797, "learning_rate": 8.926501107397863e-05, "loss": 3.2968, "step": 5313 }, { "epoch": 23.715555555555554, "grad_norm": 7.802605152130127, "learning_rate": 8.916230998965134e-05, "loss": 3.2743, "step": 5336 }, { "epoch": 23.817777777777778, "grad_norm": 6.080660820007324, "learning_rate": 8.905917965086356e-05, "loss": 3.287, "step": 5359 }, { "epoch": 23.92, "grad_norm": 7.292867660522461, "learning_rate": 8.895562118801738e-05, "loss": 3.2723, "step": 5382 }, { "epoch": 24.022222222222222, "grad_norm": 6.736908435821533, "learning_rate": 8.885163573620754e-05, "loss": 3.2406, "step": 5405 }, { "epoch": 24.124444444444446, "grad_norm": 5.8357343673706055, "learning_rate": 8.874722443520899e-05, "loss": 3.1797, "step": 5428 }, { "epoch": 24.226666666666667, "grad_norm": 6.487481117248535, "learning_rate": 8.864238842946433e-05, "loss": 3.1876, "step": 5451 }, { "epoch": 24.32888888888889, "grad_norm": 7.854300498962402, "learning_rate": 8.853712886807132e-05, "loss": 3.2056, "step": 5474 }, { "epoch": 24.43111111111111, "grad_norm": 8.225058555603027, "learning_rate": 8.84314469047703e-05, "loss": 3.2518, "step": 5497 }, { "epoch": 24.533333333333335, "grad_norm": 7.0223236083984375, "learning_rate": 8.832534369793153e-05, "loss": 3.2102, "step": 5520 }, { "epoch": 24.635555555555555, "grad_norm": 7.004525661468506, "learning_rate": 8.821882041054239e-05, "loss": 3.1674, "step": 5543 }, { "epoch": 24.73777777777778, "grad_norm": 7.304614543914795, "learning_rate": 8.811187821019486e-05, "loss": 3.188, "step": 5566 }, { "epoch": 24.84, "grad_norm": 6.002228736877441, "learning_rate": 8.800451826907245e-05, "loss": 3.1785, "step": 5589 }, { "epoch": 24.942222222222224, "grad_norm": 6.998710632324219, "learning_rate": 8.789674176393761e-05, "loss": 3.1713, "step": 5612 }, { "epoch": 25.044444444444444, "grad_norm": 7.029483795166016, "learning_rate": 8.77885498761186e-05, "loss": 3.1521, "step": 5635 }, { "epoch": 25.14666666666667, "grad_norm": 6.024033069610596, "learning_rate": 8.767994379149675e-05, "loss": 3.0885, "step": 5658 }, { "epoch": 25.24888888888889, "grad_norm": 7.233892440795898, "learning_rate": 8.757092470049329e-05, "loss": 3.0891, "step": 5681 }, { "epoch": 25.351111111111113, "grad_norm": 7.917546272277832, "learning_rate": 8.74614937980564e-05, "loss": 3.1085, "step": 5704 }, { "epoch": 25.453333333333333, "grad_norm": 7.8942437171936035, "learning_rate": 8.735165228364809e-05, "loss": 3.0931, "step": 5727 }, { "epoch": 25.555555555555557, "grad_norm": 7.184880256652832, "learning_rate": 8.724140136123106e-05, "loss": 3.1079, "step": 5750 }, { "epoch": 25.657777777777778, "grad_norm": 5.8746137619018555, "learning_rate": 8.713074223925546e-05, "loss": 3.0924, "step": 5773 }, { "epoch": 25.76, "grad_norm": 6.722870826721191, "learning_rate": 8.701967613064575e-05, "loss": 3.0918, "step": 5796 }, { "epoch": 25.862222222222222, "grad_norm": 8.777771949768066, "learning_rate": 8.690820425278721e-05, "loss": 3.1046, "step": 5819 }, { "epoch": 25.964444444444446, "grad_norm": 7.208896636962891, "learning_rate": 8.679632782751283e-05, "loss": 3.1053, "step": 5842 }, { "epoch": 26.066666666666666, "grad_norm": 12.179716110229492, "learning_rate": 8.668404808108978e-05, "loss": 3.034, "step": 5865 }, { "epoch": 26.16888888888889, "grad_norm": 6.9270501136779785, "learning_rate": 8.657136624420596e-05, "loss": 2.982, "step": 5888 }, { "epoch": 26.27111111111111, "grad_norm": 6.495911598205566, "learning_rate": 8.645828355195658e-05, "loss": 2.9953, "step": 5911 }, { "epoch": 26.373333333333335, "grad_norm": 6.193568229675293, "learning_rate": 8.634480124383057e-05, "loss": 3.0264, "step": 5934 }, { "epoch": 26.475555555555555, "grad_norm": 7.5366034507751465, "learning_rate": 8.623092056369704e-05, "loss": 3.029, "step": 5957 }, { "epoch": 26.57777777777778, "grad_norm": 7.380651473999023, "learning_rate": 8.611664275979157e-05, "loss": 3.0148, "step": 5980 }, { "epoch": 26.68, "grad_norm": 6.579084396362305, "learning_rate": 8.600196908470265e-05, "loss": 3.0019, "step": 6003 }, { "epoch": 26.782222222222224, "grad_norm": 7.964267253875732, "learning_rate": 8.588690079535779e-05, "loss": 3.0102, "step": 6026 }, { "epoch": 26.884444444444444, "grad_norm": 7.465826034545898, "learning_rate": 8.577143915300993e-05, "loss": 2.9759, "step": 6049 }, { "epoch": 26.986666666666668, "grad_norm": 6.584536552429199, "learning_rate": 8.56555854232234e-05, "loss": 2.9609, "step": 6072 }, { "epoch": 27.08888888888889, "grad_norm": 6.6631550788879395, "learning_rate": 8.553934087586026e-05, "loss": 2.8921, "step": 6095 }, { "epoch": 27.191111111111113, "grad_norm": 7.030783176422119, "learning_rate": 8.542270678506625e-05, "loss": 2.8946, "step": 6118 }, { "epoch": 27.293333333333333, "grad_norm": 6.412444114685059, "learning_rate": 8.530568442925684e-05, "loss": 2.9002, "step": 6141 }, { "epoch": 27.395555555555557, "grad_norm": 8.111526489257812, "learning_rate": 8.518827509110328e-05, "loss": 2.9037, "step": 6164 }, { "epoch": 27.497777777777777, "grad_norm": 6.402091026306152, "learning_rate": 8.507048005751847e-05, "loss": 2.9006, "step": 6187 }, { "epoch": 27.6, "grad_norm": 7.210970878601074, "learning_rate": 8.495230061964288e-05, "loss": 2.911, "step": 6210 }, { "epoch": 27.702222222222222, "grad_norm": 9.301465034484863, "learning_rate": 8.48337380728304e-05, "loss": 2.915, "step": 6233 }, { "epoch": 27.804444444444446, "grad_norm": 10.22038745880127, "learning_rate": 8.471479371663417e-05, "loss": 2.9234, "step": 6256 }, { "epoch": 27.906666666666666, "grad_norm": 8.557666778564453, "learning_rate": 8.459546885479226e-05, "loss": 2.9312, "step": 6279 }, { "epoch": 28.00888888888889, "grad_norm": 8.308337211608887, "learning_rate": 8.447576479521348e-05, "loss": 2.9055, "step": 6302 }, { "epoch": 28.11111111111111, "grad_norm": 9.826993942260742, "learning_rate": 8.435568284996294e-05, "loss": 2.795, "step": 6325 }, { "epoch": 28.213333333333335, "grad_norm": 7.39091157913208, "learning_rate": 8.423522433524776e-05, "loss": 2.7985, "step": 6348 }, { "epoch": 28.315555555555555, "grad_norm": 7.943458557128906, "learning_rate": 8.411439057140257e-05, "loss": 2.804, "step": 6371 }, { "epoch": 28.41777777777778, "grad_norm": 7.037588119506836, "learning_rate": 8.399318288287512e-05, "loss": 2.8196, "step": 6394 }, { "epoch": 28.52, "grad_norm": 6.966550350189209, "learning_rate": 8.387160259821166e-05, "loss": 2.8037, "step": 6417 }, { "epoch": 28.622222222222224, "grad_norm": 6.990281105041504, "learning_rate": 8.374965105004244e-05, "loss": 2.8049, "step": 6440 }, { "epoch": 28.724444444444444, "grad_norm": 8.029483795166016, "learning_rate": 8.362732957506714e-05, "loss": 2.8056, "step": 6463 }, { "epoch": 28.826666666666668, "grad_norm": 6.398525714874268, "learning_rate": 8.350463951404012e-05, "loss": 2.8254, "step": 6486 }, { "epoch": 28.92888888888889, "grad_norm": 9.660991668701172, "learning_rate": 8.338158221175581e-05, "loss": 2.8516, "step": 6509 }, { "epoch": 29.031111111111112, "grad_norm": 7.429766654968262, "learning_rate": 8.325815901703394e-05, "loss": 2.8115, "step": 6532 }, { "epoch": 29.133333333333333, "grad_norm": 6.842705726623535, "learning_rate": 8.313437128270469e-05, "loss": 2.7238, "step": 6555 }, { "epoch": 29.235555555555557, "grad_norm": 9.195459365844727, "learning_rate": 8.301022036559405e-05, "loss": 2.7192, "step": 6578 }, { "epoch": 29.337777777777777, "grad_norm": 7.685567378997803, "learning_rate": 8.288570762650869e-05, "loss": 2.7009, "step": 6601 }, { "epoch": 29.44, "grad_norm": 6.384602069854736, "learning_rate": 8.276083443022126e-05, "loss": 2.7286, "step": 6624 }, { "epoch": 29.54222222222222, "grad_norm": 7.564410209655762, "learning_rate": 8.263560214545532e-05, "loss": 2.7405, "step": 6647 }, { "epoch": 29.644444444444446, "grad_norm": 6.835319995880127, "learning_rate": 8.251001214487039e-05, "loss": 2.7197, "step": 6670 }, { "epoch": 29.746666666666666, "grad_norm": 7.009396553039551, "learning_rate": 8.238406580504683e-05, "loss": 2.7322, "step": 6693 }, { "epoch": 29.84888888888889, "grad_norm": 6.862404823303223, "learning_rate": 8.225776450647082e-05, "loss": 2.7476, "step": 6716 }, { "epoch": 29.95111111111111, "grad_norm": 6.345396041870117, "learning_rate": 8.213110963351928e-05, "loss": 2.7317, "step": 6739 }, { "epoch": 30.053333333333335, "grad_norm": 7.607011795043945, "learning_rate": 8.200410257444451e-05, "loss": 2.6859, "step": 6762 }, { "epoch": 30.155555555555555, "grad_norm": 6.952041149139404, "learning_rate": 8.187674472135915e-05, "loss": 2.6587, "step": 6785 }, { "epoch": 30.25777777777778, "grad_norm": 6.717074394226074, "learning_rate": 8.17490374702209e-05, "loss": 2.6636, "step": 6808 }, { "epoch": 30.36, "grad_norm": 7.299156665802002, "learning_rate": 8.162098222081711e-05, "loss": 2.6731, "step": 6831 }, { "epoch": 30.462222222222223, "grad_norm": 7.86132287979126, "learning_rate": 8.149258037674952e-05, "loss": 2.6568, "step": 6854 }, { "epoch": 30.564444444444444, "grad_norm": 6.957241535186768, "learning_rate": 8.13638333454189e-05, "loss": 2.621, "step": 6877 }, { "epoch": 30.666666666666668, "grad_norm": 7.0929741859436035, "learning_rate": 8.123474253800957e-05, "loss": 2.6453, "step": 6900 }, { "epoch": 30.76888888888889, "grad_norm": 7.3665385246276855, "learning_rate": 8.110530936947392e-05, "loss": 2.6668, "step": 6923 }, { "epoch": 30.871111111111112, "grad_norm": 8.744823455810547, "learning_rate": 8.097553525851693e-05, "loss": 2.6759, "step": 6946 }, { "epoch": 30.973333333333333, "grad_norm": 6.603512287139893, "learning_rate": 8.084542162758067e-05, "loss": 2.6677, "step": 6969 }, { "epoch": 31.075555555555557, "grad_norm": 6.355960369110107, "learning_rate": 8.071496990282861e-05, "loss": 2.6044, "step": 6992 }, { "epoch": 31.177777777777777, "grad_norm": 6.957365989685059, "learning_rate": 8.058418151413005e-05, "loss": 2.5647, "step": 7015 }, { "epoch": 31.28, "grad_norm": 7.455416679382324, "learning_rate": 8.045305789504444e-05, "loss": 2.5981, "step": 7038 }, { "epoch": 31.38222222222222, "grad_norm": 6.41038703918457, "learning_rate": 8.032160048280566e-05, "loss": 2.6026, "step": 7061 }, { "epoch": 31.484444444444446, "grad_norm": 8.298896789550781, "learning_rate": 8.018981071830622e-05, "loss": 2.5975, "step": 7084 }, { "epoch": 31.586666666666666, "grad_norm": 9.506787300109863, "learning_rate": 8.005769004608156e-05, "loss": 2.6356, "step": 7107 }, { "epoch": 31.68888888888889, "grad_norm": 8.870840072631836, "learning_rate": 7.992523991429419e-05, "loss": 2.6015, "step": 7130 }, { "epoch": 31.79111111111111, "grad_norm": 8.160204887390137, "learning_rate": 7.979246177471773e-05, "loss": 2.593, "step": 7153 }, { "epoch": 31.893333333333334, "grad_norm": 6.366309642791748, "learning_rate": 7.96593570827211e-05, "loss": 2.5548, "step": 7176 }, { "epoch": 31.995555555555555, "grad_norm": 6.812814712524414, "learning_rate": 7.952592729725254e-05, "loss": 2.5352, "step": 7199 }, { "epoch": 32.09777777777778, "grad_norm": 6.476632118225098, "learning_rate": 7.939217388082361e-05, "loss": 2.4694, "step": 7222 }, { "epoch": 32.2, "grad_norm": 7.325323104858398, "learning_rate": 7.925809829949312e-05, "loss": 2.4581, "step": 7245 }, { "epoch": 32.30222222222222, "grad_norm": 7.190999984741211, "learning_rate": 7.912370202285113e-05, "loss": 2.4829, "step": 7268 }, { "epoch": 32.404444444444444, "grad_norm": 7.949245452880859, "learning_rate": 7.898898652400281e-05, "loss": 2.5134, "step": 7291 }, { "epoch": 32.50666666666667, "grad_norm": 7.711633682250977, "learning_rate": 7.88539532795523e-05, "loss": 2.5374, "step": 7314 }, { "epoch": 32.60888888888889, "grad_norm": 7.286764621734619, "learning_rate": 7.87186037695865e-05, "loss": 2.4946, "step": 7337 }, { "epoch": 32.71111111111111, "grad_norm": 7.322375774383545, "learning_rate": 7.858293947765892e-05, "loss": 2.5086, "step": 7360 }, { "epoch": 32.81333333333333, "grad_norm": 7.134939670562744, "learning_rate": 7.844696189077328e-05, "loss": 2.4963, "step": 7383 }, { "epoch": 32.91555555555556, "grad_norm": 7.648177623748779, "learning_rate": 7.831067249936734e-05, "loss": 2.4857, "step": 7406 }, { "epoch": 33.01777777777778, "grad_norm": 6.730453968048096, "learning_rate": 7.817407279729657e-05, "loss": 2.4906, "step": 7429 }, { "epoch": 33.12, "grad_norm": 6.662753105163574, "learning_rate": 7.803716428181763e-05, "loss": 2.4054, "step": 7452 }, { "epoch": 33.22222222222222, "grad_norm": 6.583335876464844, "learning_rate": 7.789994845357212e-05, "loss": 2.3762, "step": 7475 }, { "epoch": 33.324444444444445, "grad_norm": 6.661638259887695, "learning_rate": 7.776242681657006e-05, "loss": 2.4166, "step": 7498 }, { "epoch": 33.42666666666667, "grad_norm": 6.506235599517822, "learning_rate": 7.762460087817343e-05, "loss": 2.4081, "step": 7521 }, { "epoch": 33.528888888888886, "grad_norm": 8.114941596984863, "learning_rate": 7.748647214907954e-05, "loss": 2.4189, "step": 7544 }, { "epoch": 33.63111111111111, "grad_norm": 7.059467315673828, "learning_rate": 7.73480421433047e-05, "loss": 2.4416, "step": 7567 }, { "epoch": 33.733333333333334, "grad_norm": 9.18146800994873, "learning_rate": 7.720931237816735e-05, "loss": 2.4374, "step": 7590 }, { "epoch": 33.83555555555556, "grad_norm": 7.458983898162842, "learning_rate": 7.707028437427164e-05, "loss": 2.4392, "step": 7613 }, { "epoch": 33.937777777777775, "grad_norm": 6.761877536773682, "learning_rate": 7.693095965549069e-05, "loss": 2.4354, "step": 7636 }, { "epoch": 34.04, "grad_norm": 7.720556735992432, "learning_rate": 7.679133974894983e-05, "loss": 2.3844, "step": 7659 }, { "epoch": 34.14222222222222, "grad_norm": 6.558327674865723, "learning_rate": 7.665142618501e-05, "loss": 2.3599, "step": 7682 }, { "epoch": 34.24444444444445, "grad_norm": 6.790546894073486, "learning_rate": 7.651122049725082e-05, "loss": 2.3541, "step": 7705 }, { "epoch": 34.346666666666664, "grad_norm": 6.559151649475098, "learning_rate": 7.637072422245386e-05, "loss": 2.3684, "step": 7728 }, { "epoch": 34.44888888888889, "grad_norm": 8.255489349365234, "learning_rate": 7.622993890058582e-05, "loss": 2.3799, "step": 7751 }, { "epoch": 34.55111111111111, "grad_norm": 8.185545921325684, "learning_rate": 7.60888660747816e-05, "loss": 2.3723, "step": 7774 }, { "epoch": 34.653333333333336, "grad_norm": 7.4899516105651855, "learning_rate": 7.594750729132743e-05, "loss": 2.3813, "step": 7797 }, { "epoch": 34.75555555555555, "grad_norm": 6.652093887329102, "learning_rate": 7.580586409964382e-05, "loss": 2.3641, "step": 7820 }, { "epoch": 34.85777777777778, "grad_norm": 6.916318893432617, "learning_rate": 7.566393805226874e-05, "loss": 2.3689, "step": 7843 }, { "epoch": 34.96, "grad_norm": 7.0521559715271, "learning_rate": 7.552173070484048e-05, "loss": 2.3528, "step": 7866 }, { "epoch": 35.062222222222225, "grad_norm": 7.043063163757324, "learning_rate": 7.537924361608062e-05, "loss": 2.2977, "step": 7889 }, { "epoch": 35.16444444444444, "grad_norm": 6.285613059997559, "learning_rate": 7.523647834777698e-05, "loss": 2.2593, "step": 7912 }, { "epoch": 35.266666666666666, "grad_norm": 7.13001012802124, "learning_rate": 7.509343646476646e-05, "loss": 2.268, "step": 7935 }, { "epoch": 35.36888888888889, "grad_norm": 6.38799524307251, "learning_rate": 7.495011953491793e-05, "loss": 2.291, "step": 7958 }, { "epoch": 35.471111111111114, "grad_norm": 7.488864421844482, "learning_rate": 7.480652912911501e-05, "loss": 2.3234, "step": 7981 }, { "epoch": 35.57333333333333, "grad_norm": 6.8178558349609375, "learning_rate": 7.466266682123888e-05, "loss": 2.3204, "step": 8004 }, { "epoch": 35.675555555555555, "grad_norm": 7.1541748046875, "learning_rate": 7.451853418815097e-05, "loss": 2.3137, "step": 8027 }, { "epoch": 35.77777777777778, "grad_norm": 8.040066719055176, "learning_rate": 7.437413280967578e-05, "loss": 2.3173, "step": 8050 }, { "epoch": 35.88, "grad_norm": 8.158806800842285, "learning_rate": 7.422946426858345e-05, "loss": 2.2952, "step": 8073 }, { "epoch": 35.98222222222222, "grad_norm": 7.60796594619751, "learning_rate": 7.408453015057252e-05, "loss": 2.2707, "step": 8096 }, { "epoch": 36.08444444444444, "grad_norm": 6.903555870056152, "learning_rate": 7.393933204425244e-05, "loss": 2.2153, "step": 8119 }, { "epoch": 36.18666666666667, "grad_norm": 7.1362624168396, "learning_rate": 7.379387154112625e-05, "loss": 2.2045, "step": 8142 }, { "epoch": 36.28888888888889, "grad_norm": 7.824875354766846, "learning_rate": 7.364815023557306e-05, "loss": 2.215, "step": 8165 }, { "epoch": 36.39111111111111, "grad_norm": 10.668073654174805, "learning_rate": 7.350216972483064e-05, "loss": 2.2303, "step": 8188 }, { "epoch": 36.49333333333333, "grad_norm": 5.577554225921631, "learning_rate": 7.33559316089779e-05, "loss": 2.2175, "step": 8211 }, { "epoch": 36.595555555555556, "grad_norm": 6.902368545532227, "learning_rate": 7.320943749091728e-05, "loss": 2.2207, "step": 8234 }, { "epoch": 36.69777777777778, "grad_norm": 6.997749328613281, "learning_rate": 7.30626889763573e-05, "loss": 2.2525, "step": 8257 }, { "epoch": 36.8, "grad_norm": 7.666829586029053, "learning_rate": 7.291568767379484e-05, "loss": 2.2427, "step": 8280 }, { "epoch": 36.90222222222222, "grad_norm": 6.811129093170166, "learning_rate": 7.27684351944976e-05, "loss": 2.25, "step": 8303 }, { "epoch": 37.004444444444445, "grad_norm": 5.935613632202148, "learning_rate": 7.262093315248641e-05, "loss": 2.2459, "step": 8326 }, { "epoch": 37.10666666666667, "grad_norm": 6.339777946472168, "learning_rate": 7.24731831645175e-05, "loss": 2.167, "step": 8349 }, { "epoch": 37.208888888888886, "grad_norm": 7.560238361358643, "learning_rate": 7.232518685006485e-05, "loss": 2.1952, "step": 8372 }, { "epoch": 37.31111111111111, "grad_norm": 6.586178779602051, "learning_rate": 7.21769458313024e-05, "loss": 2.1791, "step": 8395 }, { "epoch": 37.413333333333334, "grad_norm": 7.019660949707031, "learning_rate": 7.20284617330862e-05, "loss": 2.1754, "step": 8418 }, { "epoch": 37.51555555555556, "grad_norm": 7.03871488571167, "learning_rate": 7.187973618293678e-05, "loss": 2.1585, "step": 8441 }, { "epoch": 37.617777777777775, "grad_norm": 6.066256046295166, "learning_rate": 7.173077081102114e-05, "loss": 2.1424, "step": 8464 }, { "epoch": 37.72, "grad_norm": 6.991265773773193, "learning_rate": 7.158156725013493e-05, "loss": 2.1577, "step": 8487 }, { "epoch": 37.82222222222222, "grad_norm": 8.248811721801758, "learning_rate": 7.14321271356846e-05, "loss": 2.1603, "step": 8510 }, { "epoch": 37.92444444444445, "grad_norm": 8.15676212310791, "learning_rate": 7.128245210566947e-05, "loss": 2.1695, "step": 8533 }, { "epoch": 38.026666666666664, "grad_norm": 7.107559680938721, "learning_rate": 7.113254380066367e-05, "loss": 2.1488, "step": 8556 }, { "epoch": 38.12888888888889, "grad_norm": 8.755867004394531, "learning_rate": 7.098240386379831e-05, "loss": 2.1009, "step": 8579 }, { "epoch": 38.23111111111111, "grad_norm": 7.037129878997803, "learning_rate": 7.083203394074334e-05, "loss": 2.0954, "step": 8602 }, { "epoch": 38.333333333333336, "grad_norm": 6.437880039215088, "learning_rate": 7.068143567968957e-05, "loss": 2.085, "step": 8625 }, { "epoch": 38.43555555555555, "grad_norm": 10.530925750732422, "learning_rate": 7.053061073133067e-05, "loss": 2.1242, "step": 8648 }, { "epoch": 38.53777777777778, "grad_norm": 7.10654878616333, "learning_rate": 7.037956074884493e-05, "loss": 2.1354, "step": 8671 }, { "epoch": 38.64, "grad_norm": 6.740297794342041, "learning_rate": 7.022828738787724e-05, "loss": 2.1365, "step": 8694 }, { "epoch": 38.742222222222225, "grad_norm": 7.16520357131958, "learning_rate": 7.007679230652095e-05, "loss": 2.1163, "step": 8717 }, { "epoch": 38.84444444444444, "grad_norm": 7.305176258087158, "learning_rate": 6.992507716529965e-05, "loss": 2.1429, "step": 8740 }, { "epoch": 38.946666666666665, "grad_norm": 5.924234390258789, "learning_rate": 6.977314362714898e-05, "loss": 2.1132, "step": 8763 }, { "epoch": 39.04888888888889, "grad_norm": 8.262660026550293, "learning_rate": 6.962099335739837e-05, "loss": 2.0614, "step": 8786 }, { "epoch": 39.15111111111111, "grad_norm": 7.352762699127197, "learning_rate": 6.946862802375292e-05, "loss": 2.0194, "step": 8809 }, { "epoch": 39.25333333333333, "grad_norm": 6.5161824226379395, "learning_rate": 6.931604929627495e-05, "loss": 2.0356, "step": 8832 }, { "epoch": 39.355555555555554, "grad_norm": 6.718994140625, "learning_rate": 6.916325884736576e-05, "loss": 2.0442, "step": 8855 }, { "epoch": 39.45777777777778, "grad_norm": 6.267631530761719, "learning_rate": 6.901025835174739e-05, "loss": 2.0456, "step": 8878 }, { "epoch": 39.56, "grad_norm": 6.105040550231934, "learning_rate": 6.885704948644411e-05, "loss": 2.0319, "step": 8901 }, { "epoch": 39.66222222222222, "grad_norm": 6.807146072387695, "learning_rate": 6.870363393076413e-05, "loss": 2.051, "step": 8924 }, { "epoch": 39.76444444444444, "grad_norm": 6.0141987800598145, "learning_rate": 6.855001336628118e-05, "loss": 2.0376, "step": 8947 }, { "epoch": 39.86666666666667, "grad_norm": 7.84182596206665, "learning_rate": 6.839618947681609e-05, "loss": 2.0596, "step": 8970 }, { "epoch": 39.96888888888889, "grad_norm": 8.566624641418457, "learning_rate": 6.824216394841825e-05, "loss": 2.0607, "step": 8993 }, { "epoch": 40.07111111111111, "grad_norm": 6.4133992195129395, "learning_rate": 6.808793846934729e-05, "loss": 1.9994, "step": 9016 }, { "epoch": 40.17333333333333, "grad_norm": 10.160492897033691, "learning_rate": 6.79335147300544e-05, "loss": 1.9999, "step": 9039 }, { "epoch": 40.275555555555556, "grad_norm": 6.391870021820068, "learning_rate": 6.777889442316394e-05, "loss": 1.9972, "step": 9062 }, { "epoch": 40.37777777777778, "grad_norm": 9.107426643371582, "learning_rate": 6.762407924345479e-05, "loss": 1.9891, "step": 9085 }, { "epoch": 40.48, "grad_norm": 6.959272861480713, "learning_rate": 6.746907088784182e-05, "loss": 1.9765, "step": 9108 }, { "epoch": 40.58222222222222, "grad_norm": 6.614034175872803, "learning_rate": 6.73138710553573e-05, "loss": 1.993, "step": 9131 }, { "epoch": 40.684444444444445, "grad_norm": 7.331613063812256, "learning_rate": 6.715848144713227e-05, "loss": 1.9826, "step": 9154 }, { "epoch": 40.78666666666667, "grad_norm": 8.619832992553711, "learning_rate": 6.700290376637782e-05, "loss": 2.0247, "step": 9177 }, { "epoch": 40.888888888888886, "grad_norm": 7.282753944396973, "learning_rate": 6.684713971836656e-05, "loss": 2.0123, "step": 9200 }, { "epoch": 40.99111111111111, "grad_norm": 7.198232173919678, "learning_rate": 6.669119101041383e-05, "loss": 2.0095, "step": 9223 }, { "epoch": 41.093333333333334, "grad_norm": 6.148073673248291, "learning_rate": 6.6535059351859e-05, "loss": 1.9284, "step": 9246 }, { "epoch": 41.19555555555556, "grad_norm": 7.000942230224609, "learning_rate": 6.637874645404673e-05, "loss": 1.9308, "step": 9269 }, { "epoch": 41.297777777777775, "grad_norm": 9.497756004333496, "learning_rate": 6.622225403030828e-05, "loss": 1.9316, "step": 9292 }, { "epoch": 41.4, "grad_norm": 6.189666748046875, "learning_rate": 6.606558379594262e-05, "loss": 1.9304, "step": 9315 }, { "epoch": 41.50222222222222, "grad_norm": 6.823606014251709, "learning_rate": 6.590873746819772e-05, "loss": 1.9582, "step": 9338 }, { "epoch": 41.60444444444445, "grad_norm": 6.261486530303955, "learning_rate": 6.575171676625169e-05, "loss": 1.9322, "step": 9361 }, { "epoch": 41.70666666666666, "grad_norm": 6.920318603515625, "learning_rate": 6.559452341119389e-05, "loss": 1.9533, "step": 9384 }, { "epoch": 41.80888888888889, "grad_norm": 7.246551513671875, "learning_rate": 6.543715912600621e-05, "loss": 1.9548, "step": 9407 }, { "epoch": 41.91111111111111, "grad_norm": 6.377082824707031, "learning_rate": 6.527962563554402e-05, "loss": 1.9709, "step": 9430 }, { "epoch": 42.013333333333335, "grad_norm": 7.362649440765381, "learning_rate": 6.512192466651735e-05, "loss": 1.9402, "step": 9453 }, { "epoch": 42.11555555555555, "grad_norm": 9.08193588256836, "learning_rate": 6.496405794747193e-05, "loss": 1.8674, "step": 9476 }, { "epoch": 42.217777777777776, "grad_norm": 6.658238410949707, "learning_rate": 6.480602720877029e-05, "loss": 1.8556, "step": 9499 }, { "epoch": 42.32, "grad_norm": 6.951099395751953, "learning_rate": 6.464783418257277e-05, "loss": 1.8759, "step": 9522 }, { "epoch": 42.422222222222224, "grad_norm": 8.758234977722168, "learning_rate": 6.448948060281847e-05, "loss": 1.8712, "step": 9545 }, { "epoch": 42.52444444444444, "grad_norm": 6.225131988525391, "learning_rate": 6.433096820520639e-05, "loss": 1.8857, "step": 9568 }, { "epoch": 42.626666666666665, "grad_norm": 7.351943492889404, "learning_rate": 6.417229872717624e-05, "loss": 1.8809, "step": 9591 }, { "epoch": 42.72888888888889, "grad_norm": 7.482339859008789, "learning_rate": 6.401347390788952e-05, "loss": 1.8694, "step": 9614 }, { "epoch": 42.83111111111111, "grad_norm": 6.971664905548096, "learning_rate": 6.385449548821037e-05, "loss": 1.8744, "step": 9637 }, { "epoch": 42.93333333333333, "grad_norm": 6.296336650848389, "learning_rate": 6.36953652106866e-05, "loss": 1.8966, "step": 9660 }, { "epoch": 43.035555555555554, "grad_norm": 6.986079216003418, "learning_rate": 6.353608481953042e-05, "loss": 1.8555, "step": 9683 }, { "epoch": 43.13777777777778, "grad_norm": 5.542973041534424, "learning_rate": 6.337665606059953e-05, "loss": 1.8185, "step": 9706 }, { "epoch": 43.24, "grad_norm": 7.133216381072998, "learning_rate": 6.321708068137779e-05, "loss": 1.8241, "step": 9729 }, { "epoch": 43.34222222222222, "grad_norm": 6.318929672241211, "learning_rate": 6.305736043095619e-05, "loss": 1.8372, "step": 9752 }, { "epoch": 43.44444444444444, "grad_norm": 6.268241882324219, "learning_rate": 6.289749706001365e-05, "loss": 1.8602, "step": 9775 }, { "epoch": 43.54666666666667, "grad_norm": 5.881213665008545, "learning_rate": 6.273749232079778e-05, "loss": 1.8439, "step": 9798 }, { "epoch": 43.64888888888889, "grad_norm": 6.6124186515808105, "learning_rate": 6.257734796710575e-05, "loss": 1.8428, "step": 9821 }, { "epoch": 43.75111111111111, "grad_norm": 7.996447563171387, "learning_rate": 6.241706575426504e-05, "loss": 1.8354, "step": 9844 }, { "epoch": 43.85333333333333, "grad_norm": 7.1598639488220215, "learning_rate": 6.225664743911414e-05, "loss": 1.8185, "step": 9867 }, { "epoch": 43.955555555555556, "grad_norm": 7.8854265213012695, "learning_rate": 6.209609477998338e-05, "loss": 1.832, "step": 9890 }, { "epoch": 44.05777777777778, "grad_norm": 8.291993141174316, "learning_rate": 6.193540953667564e-05, "loss": 1.7871, "step": 9913 }, { "epoch": 44.16, "grad_norm": 8.600836753845215, "learning_rate": 6.177459347044703e-05, "loss": 1.7882, "step": 9936 }, { "epoch": 44.26222222222222, "grad_norm": 8.065147399902344, "learning_rate": 6.161364834398755e-05, "loss": 1.7799, "step": 9959 }, { "epoch": 44.364444444444445, "grad_norm": 8.459796905517578, "learning_rate": 6.145257592140188e-05, "loss": 1.763, "step": 9982 }, { "epoch": 44.46666666666667, "grad_norm": 6.006131649017334, "learning_rate": 6.129137796818997e-05, "loss": 1.7885, "step": 10005 }, { "epoch": 44.568888888888885, "grad_norm": 8.034002304077148, "learning_rate": 6.113005625122767e-05, "loss": 1.8008, "step": 10028 }, { "epoch": 44.67111111111111, "grad_norm": 6.57339334487915, "learning_rate": 6.09686125387474e-05, "loss": 1.786, "step": 10051 }, { "epoch": 44.77333333333333, "grad_norm": 7.233739376068115, "learning_rate": 6.080704860031879e-05, "loss": 1.7973, "step": 10074 }, { "epoch": 44.87555555555556, "grad_norm": 7.365921497344971, "learning_rate": 6.0645366206829244e-05, "loss": 1.8094, "step": 10097 }, { "epoch": 44.977777777777774, "grad_norm": 7.772608280181885, "learning_rate": 6.048356713046452e-05, "loss": 1.7963, "step": 10120 }, { "epoch": 45.08, "grad_norm": 6.320626258850098, "learning_rate": 6.032165314468935e-05, "loss": 1.7384, "step": 10143 }, { "epoch": 45.18222222222222, "grad_norm": 6.214219093322754, "learning_rate": 6.015962602422796e-05, "loss": 1.7253, "step": 10166 }, { "epoch": 45.284444444444446, "grad_norm": 6.484301567077637, "learning_rate": 5.999748754504465e-05, "loss": 1.7361, "step": 10189 }, { "epoch": 45.38666666666666, "grad_norm": 8.989522933959961, "learning_rate": 5.9835239484324304e-05, "loss": 1.7443, "step": 10212 }, { "epoch": 45.48888888888889, "grad_norm": 10.29185676574707, "learning_rate": 5.967288362045291e-05, "loss": 1.7423, "step": 10235 }, { "epoch": 45.59111111111111, "grad_norm": 7.059528350830078, "learning_rate": 5.951042173299811e-05, "loss": 1.7292, "step": 10258 }, { "epoch": 45.693333333333335, "grad_norm": 6.192359447479248, "learning_rate": 5.9347855602689616e-05, "loss": 1.7204, "step": 10281 }, { "epoch": 45.79555555555555, "grad_norm": 6.398216247558594, "learning_rate": 5.918518701139978e-05, "loss": 1.7395, "step": 10304 }, { "epoch": 45.897777777777776, "grad_norm": 6.21365213394165, "learning_rate": 5.902241774212398e-05, "loss": 1.7343, "step": 10327 }, { "epoch": 46.0, "grad_norm": 6.119551658630371, "learning_rate": 5.885954957896115e-05, "loss": 1.7463, "step": 10350 }, { "epoch": 46.102222222222224, "grad_norm": 5.506466865539551, "learning_rate": 5.8696584307094146e-05, "loss": 1.657, "step": 10373 }, { "epoch": 46.20444444444445, "grad_norm": 6.575307369232178, "learning_rate": 5.853352371277029e-05, "loss": 1.6622, "step": 10396 }, { "epoch": 46.306666666666665, "grad_norm": 6.451313018798828, "learning_rate": 5.8370369583281634e-05, "loss": 1.6861, "step": 10419 }, { "epoch": 46.40888888888889, "grad_norm": 7.1156816482543945, "learning_rate": 5.820712370694558e-05, "loss": 1.6859, "step": 10442 }, { "epoch": 46.51111111111111, "grad_norm": 6.124991416931152, "learning_rate": 5.8043787873085044e-05, "loss": 1.6763, "step": 10465 }, { "epoch": 46.61333333333333, "grad_norm": 8.477898597717285, "learning_rate": 5.7880363872009016e-05, "loss": 1.6952, "step": 10488 }, { "epoch": 46.715555555555554, "grad_norm": 7.237541198730469, "learning_rate": 5.771685349499288e-05, "loss": 1.676, "step": 10511 }, { "epoch": 46.81777777777778, "grad_norm": 5.890578269958496, "learning_rate": 5.7553258534258756e-05, "loss": 1.6964, "step": 10534 }, { "epoch": 46.92, "grad_norm": 6.47843074798584, "learning_rate": 5.7389580782955896e-05, "loss": 1.7098, "step": 10557 }, { "epoch": 47.022222222222226, "grad_norm": 9.489853858947754, "learning_rate": 5.722582203514099e-05, "loss": 1.6894, "step": 10580 }, { "epoch": 47.12444444444444, "grad_norm": 5.722830295562744, "learning_rate": 5.7061984085758555e-05, "loss": 1.6463, "step": 10603 }, { "epoch": 47.22666666666667, "grad_norm": 5.548519134521484, "learning_rate": 5.689806873062122e-05, "loss": 1.6358, "step": 10626 }, { "epoch": 47.32888888888889, "grad_norm": 5.543103218078613, "learning_rate": 5.6734077766390023e-05, "loss": 1.6249, "step": 10649 }, { "epoch": 47.431111111111115, "grad_norm": 7.334754467010498, "learning_rate": 5.6570012990554774e-05, "loss": 1.6144, "step": 10672 }, { "epoch": 47.53333333333333, "grad_norm": 6.74175500869751, "learning_rate": 5.6405876201414334e-05, "loss": 1.6413, "step": 10695 }, { "epoch": 47.635555555555555, "grad_norm": 8.000964164733887, "learning_rate": 5.624166919805686e-05, "loss": 1.6583, "step": 10718 }, { "epoch": 47.73777777777778, "grad_norm": 6.7785797119140625, "learning_rate": 5.607739378034015e-05, "loss": 1.6346, "step": 10741 }, { "epoch": 47.84, "grad_norm": 8.0484619140625, "learning_rate": 5.591305174887185e-05, "loss": 1.6615, "step": 10764 }, { "epoch": 47.94222222222222, "grad_norm": 6.589325428009033, "learning_rate": 5.574864490498982e-05, "loss": 1.6556, "step": 10787 }, { "epoch": 48.044444444444444, "grad_norm": 5.7148942947387695, "learning_rate": 5.558417505074226e-05, "loss": 1.6129, "step": 10810 }, { "epoch": 48.14666666666667, "grad_norm": 6.063688278198242, "learning_rate": 5.541964398886805e-05, "loss": 1.5707, "step": 10833 }, { "epoch": 48.24888888888889, "grad_norm": 7.891332626342773, "learning_rate": 5.525505352277695e-05, "loss": 1.5966, "step": 10856 }, { "epoch": 48.35111111111111, "grad_norm": 6.462911605834961, "learning_rate": 5.509040545652984e-05, "loss": 1.5979, "step": 10879 }, { "epoch": 48.45333333333333, "grad_norm": 6.627693176269531, "learning_rate": 5.492570159481897e-05, "loss": 1.5835, "step": 10902 }, { "epoch": 48.55555555555556, "grad_norm": 7.016481399536133, "learning_rate": 5.4760943742948126e-05, "loss": 1.6114, "step": 10925 }, { "epoch": 48.65777777777778, "grad_norm": 6.203521251678467, "learning_rate": 5.4596133706812925e-05, "loss": 1.6261, "step": 10948 }, { "epoch": 48.76, "grad_norm": 8.625542640686035, "learning_rate": 5.443127329288092e-05, "loss": 1.6152, "step": 10971 }, { "epoch": 48.86222222222222, "grad_norm": 8.934986114501953, "learning_rate": 5.426636430817189e-05, "loss": 1.6155, "step": 10994 }, { "epoch": 48.964444444444446, "grad_norm": 6.330492973327637, "learning_rate": 5.4101408560237964e-05, "loss": 1.624, "step": 11017 }, { "epoch": 49.06666666666667, "grad_norm": 7.745333671569824, "learning_rate": 5.393640785714386e-05, "loss": 1.5832, "step": 11040 }, { "epoch": 49.16888888888889, "grad_norm": 7.9969682693481445, "learning_rate": 5.377136400744701e-05, "loss": 1.5664, "step": 11063 }, { "epoch": 49.27111111111111, "grad_norm": 6.262273788452148, "learning_rate": 5.3606278820177824e-05, "loss": 1.5464, "step": 11086 }, { "epoch": 49.373333333333335, "grad_norm": 6.109494686126709, "learning_rate": 5.344115410481977e-05, "loss": 1.5242, "step": 11109 }, { "epoch": 49.47555555555556, "grad_norm": 6.395167827606201, "learning_rate": 5.3275991671289594e-05, "loss": 1.5514, "step": 11132 }, { "epoch": 49.577777777777776, "grad_norm": 8.812541961669922, "learning_rate": 5.311079332991748e-05, "loss": 1.527, "step": 11155 }, { "epoch": 49.68, "grad_norm": 8.040874481201172, "learning_rate": 5.294556089142716e-05, "loss": 1.5469, "step": 11178 }, { "epoch": 49.782222222222224, "grad_norm": 6.935076713562012, "learning_rate": 5.278029616691613e-05, "loss": 1.566, "step": 11201 }, { "epoch": 49.88444444444445, "grad_norm": 7.0155181884765625, "learning_rate": 5.261500096783577e-05, "loss": 1.5642, "step": 11224 }, { "epoch": 49.986666666666665, "grad_norm": 8.399476051330566, "learning_rate": 5.2449677105971476e-05, "loss": 1.5664, "step": 11247 }, { "epoch": 50.08888888888889, "grad_norm": 6.229375839233398, "learning_rate": 5.22843263934228e-05, "loss": 1.5044, "step": 11270 }, { "epoch": 50.19111111111111, "grad_norm": 8.590860366821289, "learning_rate": 5.211895064258365e-05, "loss": 1.5104, "step": 11293 }, { "epoch": 50.29333333333334, "grad_norm": 6.563053607940674, "learning_rate": 5.195355166612234e-05, "loss": 1.5279, "step": 11316 }, { "epoch": 50.39555555555555, "grad_norm": 6.139184474945068, "learning_rate": 5.178813127696175e-05, "loss": 1.5323, "step": 11339 }, { "epoch": 50.49777777777778, "grad_norm": 6.862679958343506, "learning_rate": 5.162269128825949e-05, "loss": 1.526, "step": 11362 }, { "epoch": 50.6, "grad_norm": 7.023072719573975, "learning_rate": 5.1457233513387994e-05, "loss": 1.5244, "step": 11385 }, { "epoch": 50.702222222222225, "grad_norm": 6.219864368438721, "learning_rate": 5.1291759765914625e-05, "loss": 1.5333, "step": 11408 }, { "epoch": 50.80444444444444, "grad_norm": 6.453531265258789, "learning_rate": 5.112627185958184e-05, "loss": 1.5319, "step": 11431 }, { "epoch": 50.906666666666666, "grad_norm": 5.3879876136779785, "learning_rate": 5.096077160828728e-05, "loss": 1.5279, "step": 11454 }, { "epoch": 51.00888888888889, "grad_norm": 6.174513339996338, "learning_rate": 5.079526082606394e-05, "loss": 1.5157, "step": 11477 }, { "epoch": 51.111111111111114, "grad_norm": 8.612546920776367, "learning_rate": 5.062974132706016e-05, "loss": 1.4655, "step": 11500 }, { "epoch": 51.21333333333333, "grad_norm": 6.833427429199219, "learning_rate": 5.046421492551992e-05, "loss": 1.4723, "step": 11523 }, { "epoch": 51.315555555555555, "grad_norm": 6.863546371459961, "learning_rate": 5.029868343576276e-05, "loss": 1.4848, "step": 11546 }, { "epoch": 51.41777777777778, "grad_norm": 7.937037467956543, "learning_rate": 5.013314867216407e-05, "loss": 1.4613, "step": 11569 }, { "epoch": 51.52, "grad_norm": 6.1333699226379395, "learning_rate": 4.996761244913508e-05, "loss": 1.478, "step": 11592 }, { "epoch": 51.62222222222222, "grad_norm": 9.617277145385742, "learning_rate": 4.980207658110305e-05, "loss": 1.4705, "step": 11615 }, { "epoch": 51.724444444444444, "grad_norm": 6.086880207061768, "learning_rate": 4.963654288249134e-05, "loss": 1.4673, "step": 11638 }, { "epoch": 51.82666666666667, "grad_norm": 5.924047470092773, "learning_rate": 4.9471013167699476e-05, "loss": 1.4855, "step": 11661 }, { "epoch": 51.92888888888889, "grad_norm": 5.790915489196777, "learning_rate": 4.930548925108342e-05, "loss": 1.4879, "step": 11684 }, { "epoch": 52.03111111111111, "grad_norm": 10.055533409118652, "learning_rate": 4.913997294693547e-05, "loss": 1.4776, "step": 11707 }, { "epoch": 52.13333333333333, "grad_norm": 5.994448661804199, "learning_rate": 4.8974466069464586e-05, "loss": 1.4281, "step": 11730 }, { "epoch": 52.23555555555556, "grad_norm": 6.34792947769165, "learning_rate": 4.880897043277632e-05, "loss": 1.4232, "step": 11753 }, { "epoch": 52.33777777777778, "grad_norm": 6.8388285636901855, "learning_rate": 4.8643487850853093e-05, "loss": 1.4415, "step": 11776 }, { "epoch": 52.44, "grad_norm": 6.194220542907715, "learning_rate": 4.847802013753414e-05, "loss": 1.4363, "step": 11799 }, { "epoch": 52.54222222222222, "grad_norm": 7.254870891571045, "learning_rate": 4.831256910649582e-05, "loss": 1.445, "step": 11822 }, { "epoch": 52.644444444444446, "grad_norm": 6.243785858154297, "learning_rate": 4.814713657123158e-05, "loss": 1.4399, "step": 11845 }, { "epoch": 52.74666666666667, "grad_norm": 7.5753607749938965, "learning_rate": 4.798172434503213e-05, "loss": 1.4521, "step": 11868 }, { "epoch": 52.84888888888889, "grad_norm": 6.7162861824035645, "learning_rate": 4.781633424096562e-05, "loss": 1.4446, "step": 11891 }, { "epoch": 52.95111111111111, "grad_norm": 8.405692100524902, "learning_rate": 4.765096807185767e-05, "loss": 1.4712, "step": 11914 }, { "epoch": 53.053333333333335, "grad_norm": 5.832555294036865, "learning_rate": 4.748562765027162e-05, "loss": 1.4306, "step": 11937 }, { "epoch": 53.15555555555556, "grad_norm": 5.443018436431885, "learning_rate": 4.7320314788488496e-05, "loss": 1.3977, "step": 11960 }, { "epoch": 53.257777777777775, "grad_norm": 6.506402969360352, "learning_rate": 4.715503129848733e-05, "loss": 1.419, "step": 11983 }, { "epoch": 53.36, "grad_norm": 7.063472747802734, "learning_rate": 4.69897789919252e-05, "loss": 1.4188, "step": 12006 }, { "epoch": 53.46222222222222, "grad_norm": 6.49618673324585, "learning_rate": 4.682455968011731e-05, "loss": 1.421, "step": 12029 }, { "epoch": 53.56444444444445, "grad_norm": 7.384080410003662, "learning_rate": 4.6659375174017316e-05, "loss": 1.4157, "step": 12052 }, { "epoch": 53.666666666666664, "grad_norm": 6.499640464782715, "learning_rate": 4.6494227284197294e-05, "loss": 1.3914, "step": 12075 }, { "epoch": 53.76888888888889, "grad_norm": 8.480474472045898, "learning_rate": 4.632911782082804e-05, "loss": 1.387, "step": 12098 }, { "epoch": 53.87111111111111, "grad_norm": 7.255825519561768, "learning_rate": 4.616404859365907e-05, "loss": 1.4147, "step": 12121 }, { "epoch": 53.973333333333336, "grad_norm": 5.0700249671936035, "learning_rate": 4.599902141199897e-05, "loss": 1.389, "step": 12144 }, { "epoch": 54.07555555555555, "grad_norm": 5.912162780761719, "learning_rate": 4.583403808469542e-05, "loss": 1.3623, "step": 12167 }, { "epoch": 54.17777777777778, "grad_norm": 5.70848274230957, "learning_rate": 4.566910042011539e-05, "loss": 1.3513, "step": 12190 }, { "epoch": 54.28, "grad_norm": 8.14360523223877, "learning_rate": 4.550421022612542e-05, "loss": 1.3729, "step": 12213 }, { "epoch": 54.382222222222225, "grad_norm": 5.549880027770996, "learning_rate": 4.5339369310071654e-05, "loss": 1.3797, "step": 12236 }, { "epoch": 54.48444444444444, "grad_norm": 6.507516384124756, "learning_rate": 4.517457947876018e-05, "loss": 1.3824, "step": 12259 }, { "epoch": 54.586666666666666, "grad_norm": 6.413192272186279, "learning_rate": 4.500984253843707e-05, "loss": 1.3718, "step": 12282 }, { "epoch": 54.68888888888889, "grad_norm": 6.168595790863037, "learning_rate": 4.484516029476873e-05, "loss": 1.3726, "step": 12305 }, { "epoch": 54.791111111111114, "grad_norm": 6.176178932189941, "learning_rate": 4.4680534552821996e-05, "loss": 1.3776, "step": 12328 }, { "epoch": 54.89333333333333, "grad_norm": 6.18988561630249, "learning_rate": 4.45159671170444e-05, "loss": 1.3764, "step": 12351 }, { "epoch": 54.995555555555555, "grad_norm": 6.998044490814209, "learning_rate": 4.4351459791244435e-05, "loss": 1.375, "step": 12374 }, { "epoch": 55.09777777777778, "grad_norm": 6.069551467895508, "learning_rate": 4.418701437857166e-05, "loss": 1.3324, "step": 12397 }, { "epoch": 55.2, "grad_norm": 6.534727096557617, "learning_rate": 4.402263268149706e-05, "loss": 1.3301, "step": 12420 }, { "epoch": 55.30222222222222, "grad_norm": 6.363480567932129, "learning_rate": 4.385831650179322e-05, "loss": 1.3524, "step": 12443 }, { "epoch": 55.404444444444444, "grad_norm": 6.515593528747559, "learning_rate": 4.3694067640514614e-05, "loss": 1.3353, "step": 12466 }, { "epoch": 55.50666666666667, "grad_norm": 6.400863170623779, "learning_rate": 4.352988789797781e-05, "loss": 1.3292, "step": 12489 }, { "epoch": 55.60888888888889, "grad_norm": 6.897211074829102, "learning_rate": 4.336577907374181e-05, "loss": 1.3591, "step": 12512 }, { "epoch": 55.71111111111111, "grad_norm": 7.05909538269043, "learning_rate": 4.320174296658827e-05, "loss": 1.3636, "step": 12535 }, { "epoch": 55.81333333333333, "grad_norm": 5.776651859283447, "learning_rate": 4.303778137450178e-05, "loss": 1.3475, "step": 12558 }, { "epoch": 55.91555555555556, "grad_norm": 6.0230021476745605, "learning_rate": 4.287389609465022e-05, "loss": 1.3681, "step": 12581 }, { "epoch": 56.01777777777778, "grad_norm": 6.32971715927124, "learning_rate": 4.271008892336497e-05, "loss": 1.3458, "step": 12604 }, { "epoch": 56.12, "grad_norm": 8.626049041748047, "learning_rate": 4.2546361656121346e-05, "loss": 1.2829, "step": 12627 }, { "epoch": 56.22222222222222, "grad_norm": 6.015228748321533, "learning_rate": 4.238271608751874e-05, "loss": 1.2816, "step": 12650 }, { "epoch": 56.324444444444445, "grad_norm": 5.704399108886719, "learning_rate": 4.221915401126113e-05, "loss": 1.3026, "step": 12673 }, { "epoch": 56.42666666666667, "grad_norm": 5.911527156829834, "learning_rate": 4.205567722013733e-05, "loss": 1.2857, "step": 12696 }, { "epoch": 56.528888888888886, "grad_norm": 6.171534538269043, "learning_rate": 4.18922875060013e-05, "loss": 1.2873, "step": 12719 }, { "epoch": 56.63111111111111, "grad_norm": 7.097690105438232, "learning_rate": 4.1728986659752636e-05, "loss": 1.3012, "step": 12742 }, { "epoch": 56.733333333333334, "grad_norm": 5.469725608825684, "learning_rate": 4.156577647131679e-05, "loss": 1.2895, "step": 12765 }, { "epoch": 56.83555555555556, "grad_norm": 6.386800765991211, "learning_rate": 4.1402658729625596e-05, "loss": 1.3026, "step": 12788 }, { "epoch": 56.937777777777775, "grad_norm": 5.86681604385376, "learning_rate": 4.1239635222597494e-05, "loss": 1.3072, "step": 12811 }, { "epoch": 57.04, "grad_norm": 6.062530517578125, "learning_rate": 4.107670773711812e-05, "loss": 1.284, "step": 12834 }, { "epoch": 57.14222222222222, "grad_norm": 5.922295570373535, "learning_rate": 4.091387805902058e-05, "loss": 1.2621, "step": 12857 }, { "epoch": 57.24444444444445, "grad_norm": 5.438425064086914, "learning_rate": 4.075114797306589e-05, "loss": 1.264, "step": 12880 }, { "epoch": 57.346666666666664, "grad_norm": 7.964729309082031, "learning_rate": 4.058851926292353e-05, "loss": 1.2781, "step": 12903 }, { "epoch": 57.44888888888889, "grad_norm": 6.432003498077393, "learning_rate": 4.042599371115172e-05, "loss": 1.2787, "step": 12926 }, { "epoch": 57.55111111111111, "grad_norm": 5.485337257385254, "learning_rate": 4.026357309917806e-05, "loss": 1.2663, "step": 12949 }, { "epoch": 57.653333333333336, "grad_norm": 6.874802112579346, "learning_rate": 4.010125920727982e-05, "loss": 1.2733, "step": 12972 }, { "epoch": 57.75555555555555, "grad_norm": 5.767955303192139, "learning_rate": 3.993905381456462e-05, "loss": 1.2763, "step": 12995 }, { "epoch": 57.85777777777778, "grad_norm": 5.2443389892578125, "learning_rate": 3.977695869895073e-05, "loss": 1.273, "step": 13018 }, { "epoch": 57.96, "grad_norm": 7.763814926147461, "learning_rate": 3.961497563714774e-05, "loss": 1.2851, "step": 13041 }, { "epoch": 58.062222222222225, "grad_norm": 6.231062412261963, "learning_rate": 3.945310640463705e-05, "loss": 1.2581, "step": 13064 }, { "epoch": 58.16444444444444, "grad_norm": 5.801052093505859, "learning_rate": 3.9291352775652325e-05, "loss": 1.2376, "step": 13087 }, { "epoch": 58.266666666666666, "grad_norm": 8.022377967834473, "learning_rate": 3.9129716523160165e-05, "loss": 1.2403, "step": 13110 }, { "epoch": 58.36888888888889, "grad_norm": 6.449449062347412, "learning_rate": 3.8968199418840575e-05, "loss": 1.2353, "step": 13133 }, { "epoch": 58.471111111111114, "grad_norm": 5.934969902038574, "learning_rate": 3.880680323306765e-05, "loss": 1.2575, "step": 13156 }, { "epoch": 58.57333333333333, "grad_norm": 6.265482425689697, "learning_rate": 3.8645529734890014e-05, "loss": 1.247, "step": 13179 }, { "epoch": 58.675555555555555, "grad_norm": 5.975387096405029, "learning_rate": 3.8484380692011605e-05, "loss": 1.2634, "step": 13202 }, { "epoch": 58.77777777777778, "grad_norm": 6.401468753814697, "learning_rate": 3.83233578707722e-05, "loss": 1.244, "step": 13225 }, { "epoch": 58.88, "grad_norm": 5.331010341644287, "learning_rate": 3.816246303612802e-05, "loss": 1.2459, "step": 13248 }, { "epoch": 58.98222222222222, "grad_norm": 5.550204277038574, "learning_rate": 3.800169795163252e-05, "loss": 1.2541, "step": 13271 }, { "epoch": 59.08444444444444, "grad_norm": 5.241280555725098, "learning_rate": 3.7841064379416903e-05, "loss": 1.2155, "step": 13294 }, { "epoch": 59.18666666666667, "grad_norm": 6.312388896942139, "learning_rate": 3.768056408017094e-05, "loss": 1.2055, "step": 13317 }, { "epoch": 59.28888888888889, "grad_norm": 5.525976181030273, "learning_rate": 3.752019881312354e-05, "loss": 1.211, "step": 13340 }, { "epoch": 59.39111111111111, "grad_norm": 6.092748641967773, "learning_rate": 3.735997033602361e-05, "loss": 1.2133, "step": 13363 }, { "epoch": 59.49333333333333, "grad_norm": 5.471757888793945, "learning_rate": 3.719988040512067e-05, "loss": 1.2267, "step": 13386 }, { "epoch": 59.595555555555556, "grad_norm": 6.422407150268555, "learning_rate": 3.703993077514563e-05, "loss": 1.2223, "step": 13409 }, { "epoch": 59.69777777777778, "grad_norm": 5.488748073577881, "learning_rate": 3.6880123199291635e-05, "loss": 1.2217, "step": 13432 }, { "epoch": 59.8, "grad_norm": 5.826624393463135, "learning_rate": 3.672045942919474e-05, "loss": 1.2216, "step": 13455 }, { "epoch": 59.90222222222222, "grad_norm": 5.7313008308410645, "learning_rate": 3.656094121491479e-05, "loss": 1.2271, "step": 13478 }, { "epoch": 60.004444444444445, "grad_norm": 7.073070526123047, "learning_rate": 3.6401570304916166e-05, "loss": 1.222, "step": 13501 }, { "epoch": 60.10666666666667, "grad_norm": 5.049999713897705, "learning_rate": 3.624234844604869e-05, "loss": 1.1695, "step": 13524 }, { "epoch": 60.208888888888886, "grad_norm": 5.1560211181640625, "learning_rate": 3.6083277383528466e-05, "loss": 1.1792, "step": 13547 }, { "epoch": 60.31111111111111, "grad_norm": 5.553138256072998, "learning_rate": 3.592435886091867e-05, "loss": 1.1853, "step": 13570 }, { "epoch": 60.413333333333334, "grad_norm": 5.489965438842773, "learning_rate": 3.576559462011057e-05, "loss": 1.1918, "step": 13593 }, { "epoch": 60.51555555555556, "grad_norm": 6.636351108551025, "learning_rate": 3.5606986401304324e-05, "loss": 1.2002, "step": 13616 }, { "epoch": 60.617777777777775, "grad_norm": 8.49821662902832, "learning_rate": 3.544853594298997e-05, "loss": 1.2062, "step": 13639 }, { "epoch": 60.72, "grad_norm": 5.866752624511719, "learning_rate": 3.529024498192832e-05, "loss": 1.205, "step": 13662 }, { "epoch": 60.82222222222222, "grad_norm": 12.07309627532959, "learning_rate": 3.5132115253132005e-05, "loss": 1.2112, "step": 13685 }, { "epoch": 60.92444444444445, "grad_norm": 7.421104431152344, "learning_rate": 3.4974148489846315e-05, "loss": 1.2229, "step": 13708 }, { "epoch": 61.026666666666664, "grad_norm": 5.546532154083252, "learning_rate": 3.4816346423530385e-05, "loss": 1.1952, "step": 13731 }, { "epoch": 61.12888888888889, "grad_norm": 5.055679798126221, "learning_rate": 3.465871078383809e-05, "loss": 1.1628, "step": 13754 }, { "epoch": 61.23111111111111, "grad_norm": 6.14479923248291, "learning_rate": 3.4501243298599055e-05, "loss": 1.1767, "step": 13777 }, { "epoch": 61.333333333333336, "grad_norm": 5.632229328155518, "learning_rate": 3.434394569379988e-05, "loss": 1.179, "step": 13800 }, { "epoch": 61.43555555555555, "grad_norm": 5.1467671394348145, "learning_rate": 3.4186819693565046e-05, "loss": 1.1745, "step": 13823 }, { "epoch": 61.53777777777778, "grad_norm": 5.162554740905762, "learning_rate": 3.4029867020138155e-05, "loss": 1.1672, "step": 13846 }, { "epoch": 61.64, "grad_norm": 5.325419902801514, "learning_rate": 3.387308939386291e-05, "loss": 1.1793, "step": 13869 }, { "epoch": 61.742222222222225, "grad_norm": 5.7772626876831055, "learning_rate": 3.371648853316442e-05, "loss": 1.1706, "step": 13892 }, { "epoch": 61.84444444444444, "grad_norm": 7.251054763793945, "learning_rate": 3.356006615453025e-05, "loss": 1.1572, "step": 13915 }, { "epoch": 61.946666666666665, "grad_norm": 6.169683933258057, "learning_rate": 3.340382397249159e-05, "loss": 1.1553, "step": 13938 }, { "epoch": 62.04888888888889, "grad_norm": 6.773545742034912, "learning_rate": 3.324776369960461e-05, "loss": 1.1603, "step": 13961 }, { "epoch": 62.15111111111111, "grad_norm": 6.104127407073975, "learning_rate": 3.309188704643149e-05, "loss": 1.1209, "step": 13984 }, { "epoch": 62.25333333333333, "grad_norm": 5.433740615844727, "learning_rate": 3.2936195721521866e-05, "loss": 1.1373, "step": 14007 }, { "epoch": 62.355555555555554, "grad_norm": 5.472240924835205, "learning_rate": 3.2780691431393926e-05, "loss": 1.143, "step": 14030 }, { "epoch": 62.45777777777778, "grad_norm": 5.382284164428711, "learning_rate": 3.2625375880515854e-05, "loss": 1.1471, "step": 14053 }, { "epoch": 62.56, "grad_norm": 5.667013168334961, "learning_rate": 3.2470250771287036e-05, "loss": 1.1391, "step": 14076 }, { "epoch": 62.66222222222222, "grad_norm": 5.519725322723389, "learning_rate": 3.231531780401943e-05, "loss": 1.1335, "step": 14099 }, { "epoch": 62.76444444444444, "grad_norm": 5.530640125274658, "learning_rate": 3.2160578676919016e-05, "loss": 1.1386, "step": 14122 }, { "epoch": 62.86666666666667, "grad_norm": 6.683435440063477, "learning_rate": 3.200603508606703e-05, "loss": 1.1362, "step": 14145 }, { "epoch": 62.96888888888889, "grad_norm": 5.929420471191406, "learning_rate": 3.185168872540153e-05, "loss": 1.1455, "step": 14168 }, { "epoch": 63.07111111111111, "grad_norm": 6.305390357971191, "learning_rate": 3.169754128669866e-05, "loss": 1.1242, "step": 14191 }, { "epoch": 63.17333333333333, "grad_norm": 6.4048542976379395, "learning_rate": 3.154359445955429e-05, "loss": 1.1263, "step": 14214 }, { "epoch": 63.275555555555556, "grad_norm": 5.409482002258301, "learning_rate": 3.138984993136535e-05, "loss": 1.1052, "step": 14237 }, { "epoch": 63.37777777777778, "grad_norm": 5.47636079788208, "learning_rate": 3.12363093873114e-05, "loss": 1.1196, "step": 14260 }, { "epoch": 63.48, "grad_norm": 5.092154026031494, "learning_rate": 3.108297451033616e-05, "loss": 1.1193, "step": 14283 }, { "epoch": 63.58222222222222, "grad_norm": 5.453930377960205, "learning_rate": 3.092984698112904e-05, "loss": 1.1182, "step": 14306 }, { "epoch": 63.684444444444445, "grad_norm": 6.511165618896484, "learning_rate": 3.0776928478106754e-05, "loss": 1.1295, "step": 14329 }, { "epoch": 63.78666666666667, "grad_norm": 5.347112655639648, "learning_rate": 3.062422067739485e-05, "loss": 1.1239, "step": 14352 }, { "epoch": 63.888888888888886, "grad_norm": 5.500729084014893, "learning_rate": 3.0471725252809458e-05, "loss": 1.1227, "step": 14375 }, { "epoch": 63.99111111111111, "grad_norm": 5.913949489593506, "learning_rate": 3.0319443875838794e-05, "loss": 1.1306, "step": 14398 }, { "epoch": 64.09333333333333, "grad_norm": 5.112490177154541, "learning_rate": 3.0167378215624974e-05, "loss": 1.0993, "step": 14421 }, { "epoch": 64.19555555555556, "grad_norm": 5.541341304779053, "learning_rate": 3.0015529938945668e-05, "loss": 1.0976, "step": 14444 }, { "epoch": 64.29777777777778, "grad_norm": 5.937663555145264, "learning_rate": 2.9863900710195758e-05, "loss": 1.0953, "step": 14467 }, { "epoch": 64.4, "grad_norm": 5.4565558433532715, "learning_rate": 2.9712492191369244e-05, "loss": 1.0998, "step": 14490 }, { "epoch": 64.50222222222222, "grad_norm": 6.276011943817139, "learning_rate": 2.956130604204089e-05, "loss": 1.1113, "step": 14513 }, { "epoch": 64.60444444444444, "grad_norm": 5.444122791290283, "learning_rate": 2.9410343919348127e-05, "loss": 1.108, "step": 14536 }, { "epoch": 64.70666666666666, "grad_norm": 5.791774749755859, "learning_rate": 2.9259607477972794e-05, "loss": 1.1149, "step": 14559 }, { "epoch": 64.80888888888889, "grad_norm": 6.028242588043213, "learning_rate": 2.9109098370123132e-05, "loss": 1.1236, "step": 14582 }, { "epoch": 64.91111111111111, "grad_norm": 6.835079193115234, "learning_rate": 2.8958818245515533e-05, "loss": 1.1148, "step": 14605 }, { "epoch": 65.01333333333334, "grad_norm": 5.5959792137146, "learning_rate": 2.8808768751356564e-05, "loss": 1.1054, "step": 14628 }, { "epoch": 65.11555555555556, "grad_norm": 5.705920219421387, "learning_rate": 2.865895153232489e-05, "loss": 1.0824, "step": 14651 }, { "epoch": 65.21777777777778, "grad_norm": 4.9849934577941895, "learning_rate": 2.8509368230553157e-05, "loss": 1.077, "step": 14674 }, { "epoch": 65.32, "grad_norm": 5.702665328979492, "learning_rate": 2.8360020485610163e-05, "loss": 1.0514, "step": 14697 }, { "epoch": 65.42222222222222, "grad_norm": 5.4493207931518555, "learning_rate": 2.8210909934482678e-05, "loss": 1.0653, "step": 14720 }, { "epoch": 65.52444444444444, "grad_norm": 5.684943199157715, "learning_rate": 2.8062038211557728e-05, "loss": 1.0641, "step": 14743 }, { "epoch": 65.62666666666667, "grad_norm": 5.757254123687744, "learning_rate": 2.791340694860446e-05, "loss": 1.0754, "step": 14766 }, { "epoch": 65.72888888888889, "grad_norm": 5.588274955749512, "learning_rate": 2.776501777475644e-05, "loss": 1.0768, "step": 14789 }, { "epoch": 65.83111111111111, "grad_norm": 5.547431945800781, "learning_rate": 2.7616872316493708e-05, "loss": 1.078, "step": 14812 }, { "epoch": 65.93333333333334, "grad_norm": 5.201080322265625, "learning_rate": 2.7468972197624897e-05, "loss": 1.0824, "step": 14835 }, { "epoch": 66.03555555555556, "grad_norm": 6.8083271980285645, "learning_rate": 2.7321319039269576e-05, "loss": 1.07, "step": 14858 }, { "epoch": 66.13777777777777, "grad_norm": 6.262781620025635, "learning_rate": 2.7173914459840342e-05, "loss": 1.0395, "step": 14881 }, { "epoch": 66.24, "grad_norm": 5.109470844268799, "learning_rate": 2.7026760075025192e-05, "loss": 1.0467, "step": 14904 }, { "epoch": 66.34222222222222, "grad_norm": 5.397584915161133, "learning_rate": 2.6879857497769712e-05, "loss": 1.0531, "step": 14927 }, { "epoch": 66.44444444444444, "grad_norm": 5.602553844451904, "learning_rate": 2.6733208338259486e-05, "loss": 1.045, "step": 14950 }, { "epoch": 66.54666666666667, "grad_norm": 5.551428318023682, "learning_rate": 2.6586814203902422e-05, "loss": 1.042, "step": 14973 }, { "epoch": 66.64888888888889, "grad_norm": 5.80933952331543, "learning_rate": 2.6440676699311062e-05, "loss": 1.0555, "step": 14996 }, { "epoch": 66.75111111111111, "grad_norm": 5.058752536773682, "learning_rate": 2.6294797426285112e-05, "loss": 1.0507, "step": 15019 }, { "epoch": 66.85333333333334, "grad_norm": 7.067930221557617, "learning_rate": 2.6149177983793783e-05, "loss": 1.0599, "step": 15042 }, { "epoch": 66.95555555555555, "grad_norm": 5.901451587677002, "learning_rate": 2.6003819967958344e-05, "loss": 1.0527, "step": 15065 }, { "epoch": 67.05777777777777, "grad_norm": 5.727104663848877, "learning_rate": 2.5858724972034555e-05, "loss": 1.0395, "step": 15088 }, { "epoch": 67.16, "grad_norm": 7.644411563873291, "learning_rate": 2.5713894586395283e-05, "loss": 1.0326, "step": 15111 }, { "epoch": 67.26222222222222, "grad_norm": 4.788581848144531, "learning_rate": 2.5569330398512957e-05, "loss": 1.0388, "step": 15134 }, { "epoch": 67.36444444444444, "grad_norm": 4.921880722045898, "learning_rate": 2.5425033992942316e-05, "loss": 1.0413, "step": 15157 }, { "epoch": 67.46666666666667, "grad_norm": 5.7385735511779785, "learning_rate": 2.5281006951302934e-05, "loss": 1.0328, "step": 15180 }, { "epoch": 67.56888888888889, "grad_norm": 5.9198689460754395, "learning_rate": 2.5137250852261862e-05, "loss": 1.0416, "step": 15203 }, { "epoch": 67.67111111111112, "grad_norm": 5.01896858215332, "learning_rate": 2.499376727151646e-05, "loss": 1.0455, "step": 15226 }, { "epoch": 67.77333333333333, "grad_norm": 5.580973148345947, "learning_rate": 2.485055778177696e-05, "loss": 1.0487, "step": 15249 }, { "epoch": 67.87555555555555, "grad_norm": 4.777526378631592, "learning_rate": 2.470762395274938e-05, "loss": 1.0434, "step": 15272 }, { "epoch": 67.97777777777777, "grad_norm": 7.526794910430908, "learning_rate": 2.4564967351118175e-05, "loss": 1.0477, "step": 15295 }, { "epoch": 68.08, "grad_norm": 6.90614128112793, "learning_rate": 2.4422589540529185e-05, "loss": 1.0341, "step": 15318 }, { "epoch": 68.18222222222222, "grad_norm": 6.120336532592773, "learning_rate": 2.4280492081572455e-05, "loss": 1.0169, "step": 15341 }, { "epoch": 68.28444444444445, "grad_norm": 5.239770889282227, "learning_rate": 2.413867653176506e-05, "loss": 1.0155, "step": 15364 }, { "epoch": 68.38666666666667, "grad_norm": 5.342464923858643, "learning_rate": 2.3997144445534175e-05, "loss": 1.0343, "step": 15387 }, { "epoch": 68.4888888888889, "grad_norm": 6.170787811279297, "learning_rate": 2.3855897374199883e-05, "loss": 1.0101, "step": 15410 }, { "epoch": 68.5911111111111, "grad_norm": 7.313038349151611, "learning_rate": 2.371493686595831e-05, "loss": 1.0369, "step": 15433 }, { "epoch": 68.69333333333333, "grad_norm": 5.434996604919434, "learning_rate": 2.3574264465864527e-05, "loss": 1.0345, "step": 15456 }, { "epoch": 68.79555555555555, "grad_norm": 6.723358631134033, "learning_rate": 2.343388171581573e-05, "loss": 1.0309, "step": 15479 }, { "epoch": 68.89777777777778, "grad_norm": 5.317188262939453, "learning_rate": 2.3293790154534283e-05, "loss": 1.0314, "step": 15502 }, { "epoch": 69.0, "grad_norm": 6.149099349975586, "learning_rate": 2.315399131755081e-05, "loss": 1.0313, "step": 15525 }, { "epoch": 69.10222222222222, "grad_norm": 5.885578632354736, "learning_rate": 2.3014486737187475e-05, "loss": 1.0127, "step": 15548 }, { "epoch": 69.20444444444445, "grad_norm": 5.442347049713135, "learning_rate": 2.2875277942541057e-05, "loss": 1.0002, "step": 15571 }, { "epoch": 69.30666666666667, "grad_norm": 5.002798080444336, "learning_rate": 2.2736366459466326e-05, "loss": 1.0208, "step": 15594 }, { "epoch": 69.4088888888889, "grad_norm": 4.764693737030029, "learning_rate": 2.259775381055917e-05, "loss": 1.0147, "step": 15617 }, { "epoch": 69.5111111111111, "grad_norm": 5.556255340576172, "learning_rate": 2.2459441515140044e-05, "loss": 0.9888, "step": 15640 }, { "epoch": 69.61333333333333, "grad_norm": 5.241755485534668, "learning_rate": 2.2321431089237256e-05, "loss": 0.9846, "step": 15663 }, { "epoch": 69.71555555555555, "grad_norm": 5.701202869415283, "learning_rate": 2.2183724045570286e-05, "loss": 0.9872, "step": 15686 }, { "epoch": 69.81777777777778, "grad_norm": 8.224358558654785, "learning_rate": 2.2046321893533362e-05, "loss": 0.9898, "step": 15709 }, { "epoch": 69.92, "grad_norm": 5.965829849243164, "learning_rate": 2.1909226139178723e-05, "loss": 0.9831, "step": 15732 }, { "epoch": 70.02222222222223, "grad_norm": 5.391206741333008, "learning_rate": 2.1772438285200312e-05, "loss": 0.9954, "step": 15755 }, { "epoch": 70.12444444444445, "grad_norm": 6.74372673034668, "learning_rate": 2.1635959830917107e-05, "loss": 0.9651, "step": 15778 }, { "epoch": 70.22666666666667, "grad_norm": 5.2756123542785645, "learning_rate": 2.149979227225688e-05, "loss": 0.9698, "step": 15801 }, { "epoch": 70.32888888888888, "grad_norm": 6.822518825531006, "learning_rate": 2.1363937101739613e-05, "loss": 0.9771, "step": 15824 }, { "epoch": 70.43111111111111, "grad_norm": 5.256137847900391, "learning_rate": 2.1228395808461294e-05, "loss": 0.9962, "step": 15847 }, { "epoch": 70.53333333333333, "grad_norm": 4.483437538146973, "learning_rate": 2.1093169878077533e-05, "loss": 0.9735, "step": 15870 }, { "epoch": 70.63555555555556, "grad_norm": 6.114633083343506, "learning_rate": 2.0958260792787215e-05, "loss": 0.9839, "step": 15893 }, { "epoch": 70.73777777777778, "grad_norm": 5.309250831604004, "learning_rate": 2.08236700313164e-05, "loss": 0.9745, "step": 15916 }, { "epoch": 70.84, "grad_norm": 5.820844650268555, "learning_rate": 2.068939906890194e-05, "loss": 0.9786, "step": 15939 }, { "epoch": 70.94222222222223, "grad_norm": 5.038022041320801, "learning_rate": 2.055544937727549e-05, "loss": 0.9912, "step": 15962 }, { "epoch": 71.04444444444445, "grad_norm": 5.100025177001953, "learning_rate": 2.042182242464719e-05, "loss": 0.9748, "step": 15985 }, { "epoch": 71.14666666666666, "grad_norm": 5.8269829750061035, "learning_rate": 2.0288519675689755e-05, "loss": 0.9614, "step": 16008 }, { "epoch": 71.24888888888889, "grad_norm": 5.484350681304932, "learning_rate": 2.0155542591522303e-05, "loss": 0.9655, "step": 16031 }, { "epoch": 71.35111111111111, "grad_norm": 5.463179111480713, "learning_rate": 2.0022892629694335e-05, "loss": 0.9633, "step": 16054 }, { "epoch": 71.45333333333333, "grad_norm": 6.4749579429626465, "learning_rate": 1.9890571244169854e-05, "loss": 0.9643, "step": 16077 }, { "epoch": 71.55555555555556, "grad_norm": 5.12134313583374, "learning_rate": 1.97585798853113e-05, "loss": 0.9771, "step": 16100 }, { "epoch": 71.65777777777778, "grad_norm": 5.494293212890625, "learning_rate": 1.9626919999863802e-05, "loss": 0.9833, "step": 16123 }, { "epoch": 71.76, "grad_norm": 6.645090579986572, "learning_rate": 1.9495593030939157e-05, "loss": 0.966, "step": 16146 }, { "epoch": 71.86222222222223, "grad_norm": 5.469064235687256, "learning_rate": 1.9364600418000156e-05, "loss": 0.9752, "step": 16169 }, { "epoch": 71.96444444444444, "grad_norm": 7.400743007659912, "learning_rate": 1.9233943596844734e-05, "loss": 0.9729, "step": 16192 }, { "epoch": 72.06666666666666, "grad_norm": 5.228180408477783, "learning_rate": 1.9103623999590202e-05, "loss": 0.9706, "step": 16215 }, { "epoch": 72.16888888888889, "grad_norm": 5.571268081665039, "learning_rate": 1.897364305465766e-05, "loss": 0.9544, "step": 16238 }, { "epoch": 72.27111111111111, "grad_norm": 5.692650318145752, "learning_rate": 1.884400218675619e-05, "loss": 0.9577, "step": 16261 }, { "epoch": 72.37333333333333, "grad_norm": 5.098461151123047, "learning_rate": 1.87147028168674e-05, "loss": 0.952, "step": 16284 }, { "epoch": 72.47555555555556, "grad_norm": 5.3133745193481445, "learning_rate": 1.8585746362229706e-05, "loss": 0.9623, "step": 16307 }, { "epoch": 72.57777777777778, "grad_norm": 5.299659729003906, "learning_rate": 1.8457134236322903e-05, "loss": 0.9505, "step": 16330 }, { "epoch": 72.68, "grad_norm": 6.57431173324585, "learning_rate": 1.832886784885263e-05, "loss": 0.9665, "step": 16353 }, { "epoch": 72.78222222222222, "grad_norm": 5.018616199493408, "learning_rate": 1.820094860573488e-05, "loss": 0.9565, "step": 16376 }, { "epoch": 72.88444444444444, "grad_norm": 5.487111568450928, "learning_rate": 1.8073377909080685e-05, "loss": 0.9551, "step": 16399 }, { "epoch": 72.98666666666666, "grad_norm": 6.0984086990356445, "learning_rate": 1.7946157157180628e-05, "loss": 0.9743, "step": 16422 }, { "epoch": 73.08888888888889, "grad_norm": 5.412441730499268, "learning_rate": 1.7819287744489636e-05, "loss": 0.9316, "step": 16445 }, { "epoch": 73.19111111111111, "grad_norm": 5.8434929847717285, "learning_rate": 1.7692771061611603e-05, "loss": 0.947, "step": 16468 }, { "epoch": 73.29333333333334, "grad_norm": 5.178957462310791, "learning_rate": 1.756660849528422e-05, "loss": 0.9455, "step": 16491 }, { "epoch": 73.39555555555556, "grad_norm": 6.5831499099731445, "learning_rate": 1.7440801428363677e-05, "loss": 0.9469, "step": 16514 }, { "epoch": 73.49777777777778, "grad_norm": 5.628024101257324, "learning_rate": 1.731535123980964e-05, "loss": 0.961, "step": 16537 }, { "epoch": 73.6, "grad_norm": 4.770416736602783, "learning_rate": 1.7190259304670038e-05, "loss": 0.9489, "step": 16560 }, { "epoch": 73.70222222222222, "grad_norm": 5.419926166534424, "learning_rate": 1.7065526994065973e-05, "loss": 0.9384, "step": 16583 }, { "epoch": 73.80444444444444, "grad_norm": 5.695985794067383, "learning_rate": 1.6941155675176823e-05, "loss": 0.9386, "step": 16606 }, { "epoch": 73.90666666666667, "grad_norm": 5.251271724700928, "learning_rate": 1.6817146711225073e-05, "loss": 0.9577, "step": 16629 }, { "epoch": 74.00888888888889, "grad_norm": 5.220533847808838, "learning_rate": 1.669350146146156e-05, "loss": 0.9513, "step": 16652 }, { "epoch": 74.11111111111111, "grad_norm": 5.326650142669678, "learning_rate": 1.65702212811504e-05, "loss": 0.9399, "step": 16675 }, { "epoch": 74.21333333333334, "grad_norm": 5.140909194946289, "learning_rate": 1.6447307521554273e-05, "loss": 0.9273, "step": 16698 }, { "epoch": 74.31555555555556, "grad_norm": 5.344797611236572, "learning_rate": 1.6324761529919556e-05, "loss": 0.942, "step": 16721 }, { "epoch": 74.41777777777777, "grad_norm": 5.0787835121154785, "learning_rate": 1.6202584649461505e-05, "loss": 0.9358, "step": 16744 }, { "epoch": 74.52, "grad_norm": 4.678197383880615, "learning_rate": 1.608077821934965e-05, "loss": 0.9313, "step": 16767 }, { "epoch": 74.62222222222222, "grad_norm": 5.813838005065918, "learning_rate": 1.5959343574692982e-05, "loss": 0.9375, "step": 16790 }, { "epoch": 74.72444444444444, "grad_norm": 7.276843070983887, "learning_rate": 1.5838282046525444e-05, "loss": 0.9359, "step": 16813 }, { "epoch": 74.82666666666667, "grad_norm": 5.635644435882568, "learning_rate": 1.571759496179123e-05, "loss": 0.9444, "step": 16836 }, { "epoch": 74.92888888888889, "grad_norm": 5.5287556648254395, "learning_rate": 1.5597283643330347e-05, "loss": 0.9345, "step": 16859 }, { "epoch": 75.03111111111112, "grad_norm": 5.956721782684326, "learning_rate": 1.547734940986404e-05, "loss": 0.9618, "step": 16882 }, { "epoch": 75.13333333333334, "grad_norm": 6.450102806091309, "learning_rate": 1.535779357598033e-05, "loss": 0.9266, "step": 16905 }, { "epoch": 75.23555555555555, "grad_norm": 5.966337203979492, "learning_rate": 1.5238617452119697e-05, "loss": 0.9089, "step": 16928 }, { "epoch": 75.33777777777777, "grad_norm": 5.400455474853516, "learning_rate": 1.5119822344560591e-05, "loss": 0.8967, "step": 16951 }, { "epoch": 75.44, "grad_norm": 5.6878180503845215, "learning_rate": 1.5001409555405238e-05, "loss": 0.9058, "step": 16974 }, { "epoch": 75.54222222222222, "grad_norm": 5.092850685119629, "learning_rate": 1.4883380382565244e-05, "loss": 0.9037, "step": 16997 }, { "epoch": 75.64444444444445, "grad_norm": 7.444413185119629, "learning_rate": 1.4765736119747475e-05, "loss": 0.9191, "step": 17020 }, { "epoch": 75.74666666666667, "grad_norm": 5.114320755004883, "learning_rate": 1.4648478056439847e-05, "loss": 0.9132, "step": 17043 }, { "epoch": 75.8488888888889, "grad_norm": 5.615855693817139, "learning_rate": 1.453160747789712e-05, "loss": 0.9064, "step": 17066 }, { "epoch": 75.95111111111112, "grad_norm": 5.120584964752197, "learning_rate": 1.4415125665126933e-05, "loss": 0.9149, "step": 17089 }, { "epoch": 76.05333333333333, "grad_norm": 5.242557048797607, "learning_rate": 1.4299033894875647e-05, "loss": 0.8938, "step": 17112 }, { "epoch": 76.15555555555555, "grad_norm": 5.4338297843933105, "learning_rate": 1.4183333439614449e-05, "loss": 0.8982, "step": 17135 }, { "epoch": 76.25777777777778, "grad_norm": 4.58558988571167, "learning_rate": 1.4068025567525317e-05, "loss": 0.8992, "step": 17158 }, { "epoch": 76.36, "grad_norm": 5.754461765289307, "learning_rate": 1.3953111542487202e-05, "loss": 0.91, "step": 17181 }, { "epoch": 76.46222222222222, "grad_norm": 4.953834533691406, "learning_rate": 1.383859262406208e-05, "loss": 0.9014, "step": 17204 }, { "epoch": 76.56444444444445, "grad_norm": 5.375875473022461, "learning_rate": 1.3724470067481255e-05, "loss": 0.9027, "step": 17227 }, { "epoch": 76.66666666666667, "grad_norm": 5.019064426422119, "learning_rate": 1.3610745123631535e-05, "loss": 0.8902, "step": 17250 }, { "epoch": 76.7688888888889, "grad_norm": 7.214736461639404, "learning_rate": 1.3497419039041488e-05, "loss": 0.9004, "step": 17273 }, { "epoch": 76.8711111111111, "grad_norm": 5.181694507598877, "learning_rate": 1.3384493055867885e-05, "loss": 0.8949, "step": 17296 }, { "epoch": 76.97333333333333, "grad_norm": 5.116537094116211, "learning_rate": 1.3271968411881963e-05, "loss": 0.8958, "step": 17319 }, { "epoch": 77.07555555555555, "grad_norm": 4.765411853790283, "learning_rate": 1.3159846340455967e-05, "loss": 0.8901, "step": 17342 }, { "epoch": 77.17777777777778, "grad_norm": 4.765920639038086, "learning_rate": 1.3048128070549543e-05, "loss": 0.8875, "step": 17365 }, { "epoch": 77.28, "grad_norm": 4.69777250289917, "learning_rate": 1.2936814826696324e-05, "loss": 0.881, "step": 17388 }, { "epoch": 77.38222222222223, "grad_norm": 4.7684550285339355, "learning_rate": 1.2825907828990518e-05, "loss": 0.8835, "step": 17411 }, { "epoch": 77.48444444444445, "grad_norm": 4.776817321777344, "learning_rate": 1.271540829307344e-05, "loss": 0.8896, "step": 17434 }, { "epoch": 77.58666666666667, "grad_norm": 4.983736038208008, "learning_rate": 1.2605317430120311e-05, "loss": 0.8845, "step": 17457 }, { "epoch": 77.68888888888888, "grad_norm": 5.313802719116211, "learning_rate": 1.2495636446826891e-05, "loss": 0.8922, "step": 17480 }, { "epoch": 77.7911111111111, "grad_norm": 4.997971534729004, "learning_rate": 1.2386366545396328e-05, "loss": 0.8856, "step": 17503 }, { "epoch": 77.89333333333333, "grad_norm": 5.876720905303955, "learning_rate": 1.2277508923525876e-05, "loss": 0.8838, "step": 17526 }, { "epoch": 77.99555555555555, "grad_norm": 4.762071132659912, "learning_rate": 1.216906477439389e-05, "loss": 0.8814, "step": 17549 }, { "epoch": 78.09777777777778, "grad_norm": 4.621342658996582, "learning_rate": 1.2061035286646677e-05, "loss": 0.8764, "step": 17572 }, { "epoch": 78.2, "grad_norm": 5.084928035736084, "learning_rate": 1.1953421644385443e-05, "loss": 0.8747, "step": 17595 }, { "epoch": 78.30222222222223, "grad_norm": 4.952382564544678, "learning_rate": 1.1846225027153401e-05, "loss": 0.8886, "step": 17618 }, { "epoch": 78.40444444444445, "grad_norm": 4.579256534576416, "learning_rate": 1.1739446609922739e-05, "loss": 0.8729, "step": 17641 }, { "epoch": 78.50666666666666, "grad_norm": 5.518742561340332, "learning_rate": 1.1633087563081847e-05, "loss": 0.8863, "step": 17664 }, { "epoch": 78.60888888888888, "grad_norm": 4.966059684753418, "learning_rate": 1.1527149052422382e-05, "loss": 0.8839, "step": 17687 }, { "epoch": 78.71111111111111, "grad_norm": 5.001364707946777, "learning_rate": 1.1421632239126578e-05, "loss": 0.8893, "step": 17710 }, { "epoch": 78.81333333333333, "grad_norm": 4.7873854637146, "learning_rate": 1.131653827975449e-05, "loss": 0.8695, "step": 17733 }, { "epoch": 78.91555555555556, "grad_norm": 5.2424516677856445, "learning_rate": 1.1211868326231273e-05, "loss": 0.8857, "step": 17756 }, { "epoch": 79.01777777777778, "grad_norm": 4.72099494934082, "learning_rate": 1.1107623525834631e-05, "loss": 0.8844, "step": 17779 }, { "epoch": 79.12, "grad_norm": 5.387650489807129, "learning_rate": 1.1003805021182168e-05, "loss": 0.8672, "step": 17802 }, { "epoch": 79.22222222222223, "grad_norm": 6.549093246459961, "learning_rate": 1.0900413950218947e-05, "loss": 0.8639, "step": 17825 }, { "epoch": 79.32444444444444, "grad_norm": 5.805511951446533, "learning_rate": 1.0797451446204904e-05, "loss": 0.8738, "step": 17848 }, { "epoch": 79.42666666666666, "grad_norm": 5.417078018188477, "learning_rate": 1.0694918637702562e-05, "loss": 0.8815, "step": 17871 }, { "epoch": 79.52888888888889, "grad_norm": 4.696217060089111, "learning_rate": 1.0592816648564535e-05, "loss": 0.8824, "step": 17894 }, { "epoch": 79.63111111111111, "grad_norm": 4.98297119140625, "learning_rate": 1.0491146597921309e-05, "loss": 0.8617, "step": 17917 }, { "epoch": 79.73333333333333, "grad_norm": 4.85457181930542, "learning_rate": 1.0389909600168911e-05, "loss": 0.8715, "step": 17940 }, { "epoch": 79.83555555555556, "grad_norm": 5.266817092895508, "learning_rate": 1.0289106764956702e-05, "loss": 0.8754, "step": 17963 }, { "epoch": 79.93777777777778, "grad_norm": 5.948962688446045, "learning_rate": 1.0188739197175268e-05, "loss": 0.8806, "step": 17986 }, { "epoch": 80.04, "grad_norm": 6.155448913574219, "learning_rate": 1.0088807996944211e-05, "loss": 0.8767, "step": 18009 }, { "epoch": 80.14222222222222, "grad_norm": 4.785376071929932, "learning_rate": 9.989314259600219e-06, "loss": 0.8719, "step": 18032 }, { "epoch": 80.24444444444444, "grad_norm": 4.980493545532227, "learning_rate": 9.890259075684915e-06, "loss": 0.866, "step": 18055 }, { "epoch": 80.34666666666666, "grad_norm": 6.7485032081604, "learning_rate": 9.791643530933032e-06, "loss": 0.8639, "step": 18078 }, { "epoch": 80.44888888888889, "grad_norm": 5.030679225921631, "learning_rate": 9.693468706260456e-06, "loss": 0.8707, "step": 18101 }, { "epoch": 80.55111111111111, "grad_norm": 5.043888568878174, "learning_rate": 9.595735677752343e-06, "loss": 0.8603, "step": 18124 }, { "epoch": 80.65333333333334, "grad_norm": 5.022198677062988, "learning_rate": 9.49844551665141e-06, "loss": 0.8598, "step": 18147 }, { "epoch": 80.75555555555556, "grad_norm": 6.346147060394287, "learning_rate": 9.401599289346091e-06, "loss": 0.8663, "step": 18170 }, { "epoch": 80.85777777777778, "grad_norm": 5.1296610832214355, "learning_rate": 9.305198057358972e-06, "loss": 0.8703, "step": 18193 }, { "epoch": 80.96, "grad_norm": 5.117784023284912, "learning_rate": 9.209242877335005e-06, "loss": 0.8624, "step": 18216 }, { "epoch": 81.06222222222222, "grad_norm": 4.949360370635986, "learning_rate": 9.113734801030076e-06, "loss": 0.8559, "step": 18239 }, { "epoch": 81.16444444444444, "grad_norm": 4.507094860076904, "learning_rate": 9.018674875299393e-06, "loss": 0.861, "step": 18262 }, { "epoch": 81.26666666666667, "grad_norm": 5.280154705047607, "learning_rate": 8.924064142085985e-06, "loss": 0.8558, "step": 18285 }, { "epoch": 81.36888888888889, "grad_norm": 4.777374267578125, "learning_rate": 8.829903638409388e-06, "loss": 0.8598, "step": 18308 }, { "epoch": 81.47111111111111, "grad_norm": 5.726168632507324, "learning_rate": 8.736194396354153e-06, "loss": 0.8649, "step": 18331 }, { "epoch": 81.57333333333334, "grad_norm": 5.1066484451293945, "learning_rate": 8.642937443058646e-06, "loss": 0.8558, "step": 18354 }, { "epoch": 81.67555555555556, "grad_norm": 5.291098117828369, "learning_rate": 8.550133800703686e-06, "loss": 0.8572, "step": 18377 }, { "epoch": 81.77777777777777, "grad_norm": 4.3951334953308105, "learning_rate": 8.457784486501452e-06, "loss": 0.8713, "step": 18400 }, { "epoch": 81.88, "grad_norm": 4.807311058044434, "learning_rate": 8.36589051268421e-06, "loss": 0.8704, "step": 18423 }, { "epoch": 81.98222222222222, "grad_norm": 6.832765579223633, "learning_rate": 8.274452886493333e-06, "loss": 0.862, "step": 18446 }, { "epoch": 82.08444444444444, "grad_norm": 4.566845417022705, "learning_rate": 8.183472610168197e-06, "loss": 0.8604, "step": 18469 }, { "epoch": 82.18666666666667, "grad_norm": 4.8708648681640625, "learning_rate": 8.092950680935185e-06, "loss": 0.8589, "step": 18492 }, { "epoch": 82.28888888888889, "grad_norm": 5.396876335144043, "learning_rate": 8.002888090996814e-06, "loss": 0.8608, "step": 18515 }, { "epoch": 82.39111111111112, "grad_norm": 4.885883808135986, "learning_rate": 7.913285827520794e-06, "loss": 0.8484, "step": 18538 }, { "epoch": 82.49333333333334, "grad_norm": 4.598787307739258, "learning_rate": 7.824144872629269e-06, "loss": 0.8576, "step": 18561 }, { "epoch": 82.59555555555555, "grad_norm": 4.590323448181152, "learning_rate": 7.735466203387992e-06, "loss": 0.8554, "step": 18584 }, { "epoch": 82.69777777777777, "grad_norm": 5.497690200805664, "learning_rate": 7.647250791795668e-06, "loss": 0.855, "step": 18607 }, { "epoch": 82.8, "grad_norm": 4.905009746551514, "learning_rate": 7.559499604773279e-06, "loss": 0.8563, "step": 18630 }, { "epoch": 82.90222222222222, "grad_norm": 4.675111770629883, "learning_rate": 7.47221360415346e-06, "loss": 0.8597, "step": 18653 }, { "epoch": 83.00444444444445, "grad_norm": 5.6808576583862305, "learning_rate": 7.385393746670022e-06, "loss": 0.8566, "step": 18676 }, { "epoch": 83.10666666666667, "grad_norm": 6.699379920959473, "learning_rate": 7.299040983947369e-06, "loss": 0.856, "step": 18699 }, { "epoch": 83.2088888888889, "grad_norm": 5.053982257843018, "learning_rate": 7.213156262490173e-06, "loss": 0.8481, "step": 18722 }, { "epoch": 83.31111111111112, "grad_norm": 5.297053337097168, "learning_rate": 7.127740523672915e-06, "loss": 0.85, "step": 18745 }, { "epoch": 83.41333333333333, "grad_norm": 5.744291305541992, "learning_rate": 7.042794703729622e-06, "loss": 0.8618, "step": 18768 }, { "epoch": 83.51555555555555, "grad_norm": 4.679412364959717, "learning_rate": 6.95831973374359e-06, "loss": 0.8403, "step": 18791 }, { "epoch": 83.61777777777777, "grad_norm": 4.38852596282959, "learning_rate": 6.874316539637127e-06, "loss": 0.8464, "step": 18814 }, { "epoch": 83.72, "grad_norm": 4.899384021759033, "learning_rate": 6.7907860421615066e-06, "loss": 0.8523, "step": 18837 }, { "epoch": 83.82222222222222, "grad_norm": 5.16193962097168, "learning_rate": 6.707729156886777e-06, "loss": 0.8502, "step": 18860 }, { "epoch": 83.92444444444445, "grad_norm": 4.833446979522705, "learning_rate": 6.625146794191794e-06, "loss": 0.8551, "step": 18883 }, { "epoch": 84.02666666666667, "grad_norm": 4.920324325561523, "learning_rate": 6.543039859254185e-06, "loss": 0.8525, "step": 18906 }, { "epoch": 84.1288888888889, "grad_norm": 5.322509765625, "learning_rate": 6.4614092520404905e-06, "loss": 0.8534, "step": 18929 }, { "epoch": 84.2311111111111, "grad_norm": 5.062963485717773, "learning_rate": 6.380255867296253e-06, "loss": 0.8519, "step": 18952 }, { "epoch": 84.33333333333333, "grad_norm": 5.186446666717529, "learning_rate": 6.299580594536214e-06, "loss": 0.8445, "step": 18975 }, { "epoch": 84.43555555555555, "grad_norm": 5.609063148498535, "learning_rate": 6.219384318034588e-06, "loss": 0.8432, "step": 18998 }, { "epoch": 84.53777777777778, "grad_norm": 4.684319972991943, "learning_rate": 6.1396679168153445e-06, "loss": 0.8434, "step": 19021 }, { "epoch": 84.64, "grad_norm": 4.717188835144043, "learning_rate": 6.060432264642601e-06, "loss": 0.8451, "step": 19044 }, { "epoch": 84.74222222222222, "grad_norm": 6.810020446777344, "learning_rate": 5.981678230011006e-06, "loss": 0.8425, "step": 19067 }, { "epoch": 84.84444444444445, "grad_norm": 4.562713146209717, "learning_rate": 5.903406676136264e-06, "loss": 0.8468, "step": 19090 }, { "epoch": 84.94666666666667, "grad_norm": 5.388665199279785, "learning_rate": 5.825618460945636e-06, "loss": 0.8418, "step": 19113 }, { "epoch": 85.04888888888888, "grad_norm": 5.054759979248047, "learning_rate": 5.748314437068558e-06, "loss": 0.8417, "step": 19136 }, { "epoch": 85.1511111111111, "grad_norm": 4.943572521209717, "learning_rate": 5.671495451827308e-06, "loss": 0.8444, "step": 19159 }, { "epoch": 85.25333333333333, "grad_norm": 4.801841735839844, "learning_rate": 5.595162347227661e-06, "loss": 0.8407, "step": 19182 }, { "epoch": 85.35555555555555, "grad_norm": 4.94541072845459, "learning_rate": 5.519315959949745e-06, "loss": 0.8413, "step": 19205 }, { "epoch": 85.45777777777778, "grad_norm": 5.529304027557373, "learning_rate": 5.443957121338777e-06, "loss": 0.8462, "step": 19228 }, { "epoch": 85.56, "grad_norm": 4.735396385192871, "learning_rate": 5.36908665739605e-06, "loss": 0.8491, "step": 19251 }, { "epoch": 85.66222222222223, "grad_norm": 5.091115474700928, "learning_rate": 5.294705388769772e-06, "loss": 0.8444, "step": 19274 }, { "epoch": 85.76444444444445, "grad_norm": 4.820996284484863, "learning_rate": 5.220814130746165e-06, "loss": 0.8509, "step": 19297 }, { "epoch": 85.86666666666666, "grad_norm": 4.448352336883545, "learning_rate": 5.1474136932404935e-06, "loss": 0.8339, "step": 19320 }, { "epoch": 85.96888888888888, "grad_norm": 4.6064019203186035, "learning_rate": 5.07450488078815e-06, "loss": 0.8115, "step": 19343 }, { "epoch": 86.07111111111111, "grad_norm": 6.598939895629883, "learning_rate": 5.002088492535906e-06, "loss": 0.818, "step": 19366 }, { "epoch": 86.17333333333333, "grad_norm": 4.426856994628906, "learning_rate": 4.930165322233082e-06, "loss": 0.8147, "step": 19389 }, { "epoch": 86.27555555555556, "grad_norm": 4.873010635375977, "learning_rate": 4.858736158222921e-06, "loss": 0.8146, "step": 19412 }, { "epoch": 86.37777777777778, "grad_norm": 4.8856520652771, "learning_rate": 4.787801783433871e-06, "loss": 0.8158, "step": 19435 }, { "epoch": 86.48, "grad_norm": 5.177906513214111, "learning_rate": 4.717362975371059e-06, "loss": 0.8187, "step": 19458 }, { "epoch": 86.58222222222223, "grad_norm": 4.954709529876709, "learning_rate": 4.647420506107775e-06, "loss": 0.8131, "step": 19481 }, { "epoch": 86.68444444444444, "grad_norm": 4.427014350891113, "learning_rate": 4.577975142276925e-06, "loss": 0.8263, "step": 19504 }, { "epoch": 86.78666666666666, "grad_norm": 5.581162929534912, "learning_rate": 4.509027645062758e-06, "loss": 0.8201, "step": 19527 }, { "epoch": 86.88888888888889, "grad_norm": 4.889328479766846, "learning_rate": 4.4405787701923885e-06, "loss": 0.8239, "step": 19550 }, { "epoch": 86.99111111111111, "grad_norm": 4.658565998077393, "learning_rate": 4.3726292679276305e-06, "loss": 0.8211, "step": 19573 }, { "epoch": 87.09333333333333, "grad_norm": 5.102555751800537, "learning_rate": 4.305179883056687e-06, "loss": 0.8154, "step": 19596 }, { "epoch": 87.19555555555556, "grad_norm": 4.951329231262207, "learning_rate": 4.23823135488603e-06, "loss": 0.8182, "step": 19619 }, { "epoch": 87.29777777777778, "grad_norm": 5.642242908477783, "learning_rate": 4.171784417232305e-06, "loss": 0.8076, "step": 19642 }, { "epoch": 87.4, "grad_norm": 5.003154277801514, "learning_rate": 4.10583979841424e-06, "loss": 0.8129, "step": 19665 }, { "epoch": 87.50222222222222, "grad_norm": 5.778168678283691, "learning_rate": 4.040398221244718e-06, "loss": 0.8123, "step": 19688 }, { "epoch": 87.60444444444444, "grad_norm": 5.08914852142334, "learning_rate": 3.975460403022801e-06, "loss": 0.8149, "step": 19711 }, { "epoch": 87.70666666666666, "grad_norm": 4.585403919219971, "learning_rate": 3.9110270555259345e-06, "loss": 0.8197, "step": 19734 }, { "epoch": 87.80888888888889, "grad_norm": 4.91745138168335, "learning_rate": 3.84709888500207e-06, "loss": 0.8175, "step": 19757 }, { "epoch": 87.91111111111111, "grad_norm": 5.540400981903076, "learning_rate": 3.7836765921619888e-06, "loss": 0.8115, "step": 19780 }, { "epoch": 88.01333333333334, "grad_norm": 4.485517501831055, "learning_rate": 3.720760872171569e-06, "loss": 0.8122, "step": 19803 }, { "epoch": 88.11555555555556, "grad_norm": 4.355061054229736, "learning_rate": 3.658352414644206e-06, "loss": 0.8105, "step": 19826 }, { "epoch": 88.21777777777778, "grad_norm": 5.2161784172058105, "learning_rate": 3.596451903633247e-06, "loss": 0.8115, "step": 19849 }, { "epoch": 88.32, "grad_norm": 4.382901191711426, "learning_rate": 3.535060017624453e-06, "loss": 0.8118, "step": 19872 }, { "epoch": 88.42222222222222, "grad_norm": 5.805255889892578, "learning_rate": 3.47417742952863e-06, "loss": 0.8046, "step": 19895 }, { "epoch": 88.52444444444444, "grad_norm": 4.063962936401367, "learning_rate": 3.4138048066741867e-06, "loss": 0.8136, "step": 19918 }, { "epoch": 88.62666666666667, "grad_norm": 5.049718379974365, "learning_rate": 3.3539428107998814e-06, "loss": 0.8071, "step": 19941 }, { "epoch": 88.72888888888889, "grad_norm": 4.287143230438232, "learning_rate": 3.294592098047494e-06, "loss": 0.8064, "step": 19964 }, { "epoch": 88.83111111111111, "grad_norm": 5.841145992279053, "learning_rate": 3.2357533189547098e-06, "loss": 0.8188, "step": 19987 }, { "epoch": 88.93333333333334, "grad_norm": 6.014995098114014, "learning_rate": 3.1774271184479675e-06, "loss": 0.8114, "step": 20010 }, { "epoch": 89.03555555555556, "grad_norm": 4.5376386642456055, "learning_rate": 3.1196141358353357e-06, "loss": 0.8135, "step": 20033 }, { "epoch": 89.13777777777777, "grad_norm": 4.438096523284912, "learning_rate": 3.0623150047995873e-06, "loss": 0.8091, "step": 20056 }, { "epoch": 89.24, "grad_norm": 4.940515518188477, "learning_rate": 3.005530353391195e-06, "loss": 0.812, "step": 20079 }, { "epoch": 89.34222222222222, "grad_norm": 4.826828479766846, "learning_rate": 2.9492608040214862e-06, "loss": 0.8123, "step": 20102 }, { "epoch": 89.44444444444444, "grad_norm": 4.983479976654053, "learning_rate": 2.893506973455773e-06, "loss": 0.8081, "step": 20125 }, { "epoch": 89.54666666666667, "grad_norm": 6.005835056304932, "learning_rate": 2.838269472806654e-06, "loss": 0.8095, "step": 20148 }, { "epoch": 89.64888888888889, "grad_norm": 4.9561662673950195, "learning_rate": 2.7835489075272727e-06, "loss": 0.8061, "step": 20171 }, { "epoch": 89.75111111111111, "grad_norm": 5.078367233276367, "learning_rate": 2.729345877404671e-06, "loss": 0.7997, "step": 20194 }, { "epoch": 89.85333333333334, "grad_norm": 4.345983505249023, "learning_rate": 2.675660976553268e-06, "loss": 0.8101, "step": 20217 }, { "epoch": 89.95555555555555, "grad_norm": 4.390908241271973, "learning_rate": 2.6224947934082923e-06, "loss": 0.8016, "step": 20240 }, { "epoch": 90.05777777777777, "grad_norm": 4.5562028884887695, "learning_rate": 2.5698479107193697e-06, "loss": 0.8039, "step": 20263 }, { "epoch": 90.16, "grad_norm": 4.685390472412109, "learning_rate": 2.517720905544102e-06, "loss": 0.7952, "step": 20286 }, { "epoch": 90.26222222222222, "grad_norm": 4.973295211791992, "learning_rate": 2.466114349241794e-06, "loss": 0.809, "step": 20309 }, { "epoch": 90.36444444444444, "grad_norm": 5.430562496185303, "learning_rate": 2.4150288074671346e-06, "loss": 0.8088, "step": 20332 }, { "epoch": 90.46666666666667, "grad_norm": 4.49529504776001, "learning_rate": 2.3644648401640156e-06, "loss": 0.8057, "step": 20355 }, { "epoch": 90.56888888888889, "grad_norm": 5.173520565032959, "learning_rate": 2.314423001559424e-06, "loss": 0.8205, "step": 20378 }, { "epoch": 90.67111111111112, "grad_norm": 5.084122657775879, "learning_rate": 2.264903840157312e-06, "loss": 0.8096, "step": 20401 }, { "epoch": 90.77333333333333, "grad_norm": 4.675368309020996, "learning_rate": 2.2159078987326554e-06, "loss": 0.8109, "step": 20424 }, { "epoch": 90.87555555555555, "grad_norm": 4.598373889923096, "learning_rate": 2.167435714325411e-06, "loss": 0.7989, "step": 20447 }, { "epoch": 90.97777777777777, "grad_norm": 4.149188995361328, "learning_rate": 2.1194878182347334e-06, "loss": 0.8142, "step": 20470 }, { "epoch": 91.08, "grad_norm": 5.164962291717529, "learning_rate": 2.0720647360130685e-06, "loss": 0.8096, "step": 20493 }, { "epoch": 91.18222222222222, "grad_norm": 5.351869106292725, "learning_rate": 2.0251669874604474e-06, "loss": 0.8036, "step": 20516 }, { "epoch": 91.28444444444445, "grad_norm": 5.2852935791015625, "learning_rate": 1.9787950866187565e-06, "loss": 0.8057, "step": 20539 }, { "epoch": 91.38666666666667, "grad_norm": 6.784205436706543, "learning_rate": 1.9329495417661046e-06, "loss": 0.8031, "step": 20562 }, { "epoch": 91.4888888888889, "grad_norm": 4.940450668334961, "learning_rate": 1.887630855411282e-06, "loss": 0.8066, "step": 20585 }, { "epoch": 91.5911111111111, "grad_norm": 4.77994441986084, "learning_rate": 1.84283952428822e-06, "loss": 0.8038, "step": 20608 }, { "epoch": 91.69333333333333, "grad_norm": 4.902866840362549, "learning_rate": 1.798576039350558e-06, "loss": 0.8043, "step": 20631 }, { "epoch": 91.79555555555555, "grad_norm": 5.100454330444336, "learning_rate": 1.7548408857662623e-06, "loss": 0.8008, "step": 20654 }, { "epoch": 91.89777777777778, "grad_norm": 4.9377264976501465, "learning_rate": 1.7116345429123104e-06, "loss": 0.8098, "step": 20677 }, { "epoch": 92.0, "grad_norm": 5.0082292556762695, "learning_rate": 1.6689574843694433e-06, "loss": 0.7992, "step": 20700 }, { "epoch": 92.10222222222222, "grad_norm": 4.688179016113281, "learning_rate": 1.6268101779169375e-06, "loss": 0.7928, "step": 20723 }, { "epoch": 92.20444444444445, "grad_norm": 4.243449687957764, "learning_rate": 1.5851930855275365e-06, "loss": 0.7957, "step": 20746 }, { "epoch": 92.30666666666667, "grad_norm": 4.956583499908447, "learning_rate": 1.544106663362338e-06, "loss": 0.8073, "step": 20769 }, { "epoch": 92.4088888888889, "grad_norm": 4.556548118591309, "learning_rate": 1.503551361765826e-06, "loss": 0.8019, "step": 20792 }, { "epoch": 92.5111111111111, "grad_norm": 6.762635707855225, "learning_rate": 1.4635276252608965e-06, "loss": 0.8084, "step": 20815 }, { "epoch": 92.61333333333333, "grad_norm": 5.724966049194336, "learning_rate": 1.4240358925440457e-06, "loss": 0.8008, "step": 20838 }, { "epoch": 92.71555555555555, "grad_norm": 5.445995330810547, "learning_rate": 1.3850765964805e-06, "loss": 0.802, "step": 20861 }, { "epoch": 92.81777777777778, "grad_norm": 4.807301044464111, "learning_rate": 1.3466501640994944e-06, "loss": 0.8038, "step": 20884 }, { "epoch": 92.92, "grad_norm": 5.612717151641846, "learning_rate": 1.308757016589618e-06, "loss": 0.7996, "step": 20907 }, { "epoch": 93.02222222222223, "grad_norm": 4.5359296798706055, "learning_rate": 1.2713975692941415e-06, "loss": 0.801, "step": 20930 }, { "epoch": 93.12444444444445, "grad_norm": 4.222482681274414, "learning_rate": 1.2345722317065267e-06, "loss": 0.7996, "step": 20953 }, { "epoch": 93.22666666666667, "grad_norm": 4.250333786010742, "learning_rate": 1.19828140746589e-06, "loss": 0.8072, "step": 20976 }, { "epoch": 93.32888888888888, "grad_norm": 4.197777271270752, "learning_rate": 1.1625254943526065e-06, "loss": 0.795, "step": 20999 }, { "epoch": 93.43111111111111, "grad_norm": 5.79392671585083, "learning_rate": 1.1273048842839307e-06, "loss": 0.8076, "step": 21022 }, { "epoch": 93.53333333333333, "grad_norm": 4.919564723968506, "learning_rate": 1.0926199633097157e-06, "loss": 0.802, "step": 21045 }, { "epoch": 93.63555555555556, "grad_norm": 5.422025203704834, "learning_rate": 1.0584711116081837e-06, "loss": 0.8141, "step": 21068 }, { "epoch": 93.73777777777778, "grad_norm": 4.949449062347412, "learning_rate": 1.0248587034817237e-06, "loss": 0.8001, "step": 21091 }, { "epoch": 93.84, "grad_norm": 4.578461647033691, "learning_rate": 9.917831073528504e-07, "loss": 0.7959, "step": 21114 }, { "epoch": 93.94222222222223, "grad_norm": 4.7736592292785645, "learning_rate": 9.59244685760108e-07, "loss": 0.8007, "step": 21137 }, { "epoch": 94.04444444444445, "grad_norm": 4.64253044128418, "learning_rate": 9.27243795354138e-07, "loss": 0.8042, "step": 21160 }, { "epoch": 94.14666666666666, "grad_norm": 5.671309471130371, "learning_rate": 8.957807868937296e-07, "loss": 0.7971, "step": 21183 }, { "epoch": 94.24888888888889, "grad_norm": 4.637156963348389, "learning_rate": 8.648560052420151e-07, "loss": 0.8008, "step": 21206 }, { "epoch": 94.35111111111111, "grad_norm": 4.140064239501953, "learning_rate": 8.344697893626741e-07, "loss": 0.7955, "step": 21229 }, { "epoch": 94.45333333333333, "grad_norm": 4.615813732147217, "learning_rate": 8.046224723162077e-07, "loss": 0.7998, "step": 21252 }, { "epoch": 94.55555555555556, "grad_norm": 5.006037712097168, "learning_rate": 7.75314381256298e-07, "loss": 0.7944, "step": 21275 }, { "epoch": 94.65777777777778, "grad_norm": 4.940041542053223, "learning_rate": 7.465458374262213e-07, "loss": 0.7944, "step": 21298 }, { "epoch": 94.76, "grad_norm": 4.452148914337158, "learning_rate": 7.183171561553348e-07, "loss": 0.8021, "step": 21321 }, { "epoch": 94.86222222222223, "grad_norm": 4.3342509269714355, "learning_rate": 6.906286468555955e-07, "loss": 0.8016, "step": 21344 }, { "epoch": 94.96444444444444, "grad_norm": 5.098360538482666, "learning_rate": 6.634806130182025e-07, "loss": 0.7997, "step": 21367 }, { "epoch": 95.06666666666666, "grad_norm": 4.704761028289795, "learning_rate": 6.368733522102432e-07, "loss": 0.8007, "step": 21390 }, { "epoch": 95.16888888888889, "grad_norm": 4.529531002044678, "learning_rate": 6.108071560714413e-07, "loss": 0.7976, "step": 21413 }, { "epoch": 95.27111111111111, "grad_norm": 4.470498561859131, "learning_rate": 5.852823103109639e-07, "loss": 0.7871, "step": 21436 }, { "epoch": 95.37333333333333, "grad_norm": 4.434628486633301, "learning_rate": 5.602990947042919e-07, "loss": 0.8027, "step": 21459 }, { "epoch": 95.47555555555556, "grad_norm": 4.518807411193848, "learning_rate": 5.358577830901435e-07, "loss": 0.7986, "step": 21482 }, { "epoch": 95.57777777777778, "grad_norm": 4.176888942718506, "learning_rate": 5.119586433674661e-07, "loss": 0.7951, "step": 21505 }, { "epoch": 95.68, "grad_norm": 4.806949138641357, "learning_rate": 4.886019374925333e-07, "loss": 0.7995, "step": 21528 }, { "epoch": 95.78222222222222, "grad_norm": 4.371096611022949, "learning_rate": 4.657879214760297e-07, "loss": 0.7991, "step": 21551 }, { "epoch": 95.88444444444444, "grad_norm": 4.214781761169434, "learning_rate": 4.435168453802874e-07, "loss": 0.7912, "step": 21574 }, { "epoch": 95.98666666666666, "grad_norm": 4.71865177154541, "learning_rate": 4.2178895331650427e-07, "loss": 0.804, "step": 21597 }, { "epoch": 96.08888888888889, "grad_norm": 4.573912143707275, "learning_rate": 4.0060448344209634e-07, "loss": 0.7969, "step": 21620 }, { "epoch": 96.19111111111111, "grad_norm": 5.047268390655518, "learning_rate": 3.799636679580887e-07, "loss": 0.7964, "step": 21643 }, { "epoch": 96.29333333333334, "grad_norm": 4.307917594909668, "learning_rate": 3.598667331065397e-07, "loss": 0.7957, "step": 21666 }, { "epoch": 96.39555555555556, "grad_norm": 4.763662815093994, "learning_rate": 3.403138991681043e-07, "loss": 0.7958, "step": 21689 }, { "epoch": 96.49777777777778, "grad_norm": 4.808367729187012, "learning_rate": 3.213053804595911e-07, "loss": 0.809, "step": 21712 }, { "epoch": 96.6, "grad_norm": 5.026544570922852, "learning_rate": 3.0284138533160924e-07, "loss": 0.8024, "step": 21735 }, { "epoch": 96.70222222222222, "grad_norm": 6.12026834487915, "learning_rate": 2.849221161663085e-07, "loss": 0.8041, "step": 21758 }, { "epoch": 96.80444444444444, "grad_norm": 4.895252227783203, "learning_rate": 2.6754776937513717e-07, "loss": 0.7966, "step": 21781 }, { "epoch": 96.90666666666667, "grad_norm": 4.611559867858887, "learning_rate": 2.507185353967101e-07, "loss": 0.8041, "step": 21804 }, { "epoch": 97.00888888888889, "grad_norm": 4.198352813720703, "learning_rate": 2.344345986946994e-07, "loss": 0.8013, "step": 21827 }, { "epoch": 97.11111111111111, "grad_norm": 4.63875675201416, "learning_rate": 2.186961377558361e-07, "loss": 0.8015, "step": 21850 }, { "epoch": 97.21333333333334, "grad_norm": 4.243088245391846, "learning_rate": 2.0350332508793367e-07, "loss": 0.7829, "step": 21873 }, { "epoch": 97.31555555555556, "grad_norm": 4.228803634643555, "learning_rate": 1.8885632721800106e-07, "loss": 0.7999, "step": 21896 }, { "epoch": 97.41777777777777, "grad_norm": 5.103250980377197, "learning_rate": 1.7475530469044376e-07, "loss": 0.7979, "step": 21919 }, { "epoch": 97.52, "grad_norm": 4.691418170928955, "learning_rate": 1.6120041206524883e-07, "loss": 0.7972, "step": 21942 }, { "epoch": 97.62222222222222, "grad_norm": 4.644149303436279, "learning_rate": 1.481917979163583e-07, "loss": 0.7897, "step": 21965 }, { "epoch": 97.72444444444444, "grad_norm": 4.451114654541016, "learning_rate": 1.357296048299761e-07, "loss": 0.8001, "step": 21988 }, { "epoch": 97.82666666666667, "grad_norm": 4.836966037750244, "learning_rate": 1.2381396940305824e-07, "loss": 0.7994, "step": 22011 }, { "epoch": 97.92888888888889, "grad_norm": 4.453198432922363, "learning_rate": 1.12445022241775e-07, "loss": 0.7969, "step": 22034 }, { "epoch": 98.03111111111112, "grad_norm": 5.4233903884887695, "learning_rate": 1.0162288796011221e-07, "loss": 0.8006, "step": 22057 }, { "epoch": 98.13333333333334, "grad_norm": 4.528837203979492, "learning_rate": 9.134768517848336e-08, "loss": 0.8031, "step": 22080 }, { "epoch": 98.23555555555555, "grad_norm": 5.245551586151123, "learning_rate": 8.161952652243621e-08, "loss": 0.8005, "step": 22103 }, { "epoch": 98.33777777777777, "grad_norm": 4.625002861022949, "learning_rate": 7.243851862141492e-08, "loss": 0.8075, "step": 22126 }, { "epoch": 98.44, "grad_norm": 4.824587345123291, "learning_rate": 6.38047621075999e-08, "loss": 0.7925, "step": 22149 }, { "epoch": 98.54222222222222, "grad_norm": 4.704883098602295, "learning_rate": 5.5718351614797437e-08, "loss": 0.7953, "step": 22172 }, { "epoch": 98.64444444444445, "grad_norm": 4.561920642852783, "learning_rate": 4.817937577741294e-08, "loss": 0.7976, "step": 22195 }, { "epoch": 98.74666666666667, "grad_norm": 4.796523094177246, "learning_rate": 4.118791722945159e-08, "loss": 0.8026, "step": 22218 }, { "epoch": 98.8488888888889, "grad_norm": 4.576013565063477, "learning_rate": 3.474405260365798e-08, "loss": 0.794, "step": 22241 }, { "epoch": 98.95111111111112, "grad_norm": 5.13820743560791, "learning_rate": 2.8847852530622387e-08, "loss": 0.7895, "step": 22264 }, { "epoch": 99.05333333333333, "grad_norm": 4.2987060546875, "learning_rate": 2.3499381638064645e-08, "loss": 0.7919, "step": 22287 }, { "epoch": 99.15555555555555, "grad_norm": 4.3480305671691895, "learning_rate": 1.8698698550068117e-08, "loss": 0.798, "step": 22310 }, { "epoch": 99.25777777777778, "grad_norm": 5.037069797515869, "learning_rate": 1.4445855886480176e-08, "loss": 0.8026, "step": 22333 }, { "epoch": 99.36, "grad_norm": 4.374788284301758, "learning_rate": 1.074090026231267e-08, "loss": 0.7926, "step": 22356 }, { "epoch": 99.46222222222222, "grad_norm": 4.93529748916626, "learning_rate": 7.583872287253436e-09, "loss": 0.8044, "step": 22379 }, { "epoch": 99.56444444444445, "grad_norm": 4.404996395111084, "learning_rate": 4.974806565177792e-09, "loss": 0.802, "step": 22402 }, { "epoch": 99.66666666666667, "grad_norm": 4.556636333465576, "learning_rate": 2.9137316938265825e-09, "loss": 0.793, "step": 22425 }, { "epoch": 99.7688888888889, "grad_norm": 4.4638190269470215, "learning_rate": 1.4006702644453474e-09, "loss": 0.7999, "step": 22448 }, { "epoch": 99.8711111111111, "grad_norm": 4.293120861053467, "learning_rate": 4.3563886156228196e-10, "loss": 0.8048, "step": 22471 }, { "epoch": 99.97333333333333, "grad_norm": 4.100605010986328, "learning_rate": 1.8648062799497822e-11, "loss": 0.7996, "step": 22494 }, { "epoch": 100.0, "step": 22500, "total_flos": 2.1925440120390943e+18, "train_loss": 2.6133422136730617, "train_runtime": 133573.7106, "train_samples_per_second": 86.157, "train_steps_per_second": 0.168 } ], "logging_steps": 23, "max_steps": 22500, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1925440120390943e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }