{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.88888888888889, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008888888888888889, "grad_norm": 5.700418949127197, "learning_rate": 8.88888888888889e-07, "loss": 1.01, "step": 10 }, { "epoch": 0.017777777777777778, "grad_norm": 6.929300308227539, "learning_rate": 1.777777777777778e-06, "loss": 1.0799, "step": 20 }, { "epoch": 0.02666666666666667, "grad_norm": 7.455889701843262, "learning_rate": 2.666666666666667e-06, "loss": 1.0056, "step": 30 }, { "epoch": 0.035555555555555556, "grad_norm": 5.365911960601807, "learning_rate": 3.555555555555556e-06, "loss": 0.9002, "step": 40 }, { "epoch": 0.044444444444444446, "grad_norm": 4.195066928863525, "learning_rate": 4.444444444444445e-06, "loss": 0.7579, "step": 50 }, { "epoch": 0.05333333333333334, "grad_norm": 2.68858003616333, "learning_rate": 5.333333333333334e-06, "loss": 0.5492, "step": 60 }, { "epoch": 0.06222222222222222, "grad_norm": 2.9468464851379395, "learning_rate": 6.222222222222222e-06, "loss": 0.4148, "step": 70 }, { "epoch": 0.07111111111111111, "grad_norm": 1.3413068056106567, "learning_rate": 7.111111111111112e-06, "loss": 0.3223, "step": 80 }, { "epoch": 0.08, "grad_norm": 1.8903324604034424, "learning_rate": 8.000000000000001e-06, "loss": 0.2846, "step": 90 }, { "epoch": 0.08888888888888889, "grad_norm": 1.3434109687805176, "learning_rate": 8.88888888888889e-06, "loss": 0.2368, "step": 100 }, { "epoch": 0.09777777777777778, "grad_norm": 0.9191005229949951, "learning_rate": 9.777777777777779e-06, "loss": 0.2334, "step": 110 }, { "epoch": 0.10666666666666667, "grad_norm": 0.827496349811554, "learning_rate": 1.0666666666666667e-05, "loss": 0.2261, "step": 120 }, { "epoch": 0.11555555555555555, "grad_norm": 1.0630714893341064, "learning_rate": 1.1555555555555556e-05, "loss": 0.2081, "step": 130 }, { "epoch": 0.12444444444444444, "grad_norm": 1.075667142868042, "learning_rate": 1.2444444444444445e-05, "loss": 0.2039, "step": 140 }, { "epoch": 0.13333333333333333, "grad_norm": 0.9288878440856934, "learning_rate": 1.3333333333333333e-05, "loss": 0.204, "step": 150 }, { "epoch": 0.14222222222222222, "grad_norm": 0.8703528642654419, "learning_rate": 1.4222222222222224e-05, "loss": 0.1891, "step": 160 }, { "epoch": 0.1511111111111111, "grad_norm": 0.6811290383338928, "learning_rate": 1.5111111111111112e-05, "loss": 0.1736, "step": 170 }, { "epoch": 0.16, "grad_norm": 0.7906114459037781, "learning_rate": 1.6000000000000003e-05, "loss": 0.1795, "step": 180 }, { "epoch": 0.1688888888888889, "grad_norm": 1.077010154724121, "learning_rate": 1.688888888888889e-05, "loss": 0.1638, "step": 190 }, { "epoch": 0.17777777777777778, "grad_norm": 0.9026978015899658, "learning_rate": 1.777777777777778e-05, "loss": 0.1538, "step": 200 }, { "epoch": 0.18666666666666668, "grad_norm": 0.9826441407203674, "learning_rate": 1.866666666666667e-05, "loss": 0.1611, "step": 210 }, { "epoch": 0.19555555555555557, "grad_norm": 2.254016637802124, "learning_rate": 1.9555555555555557e-05, "loss": 0.1539, "step": 220 }, { "epoch": 0.20444444444444446, "grad_norm": 0.807654619216919, "learning_rate": 2.0444444444444446e-05, "loss": 0.157, "step": 230 }, { "epoch": 0.21333333333333335, "grad_norm": 0.6680651307106018, "learning_rate": 2.1333333333333335e-05, "loss": 0.1365, "step": 240 }, { "epoch": 0.2222222222222222, "grad_norm": 1.279850721359253, "learning_rate": 2.2222222222222223e-05, "loss": 0.1411, "step": 250 }, { "epoch": 0.2311111111111111, "grad_norm": 0.5587814450263977, "learning_rate": 2.3111111111111112e-05, "loss": 0.1352, "step": 260 }, { "epoch": 0.24, "grad_norm": 0.8034347295761108, "learning_rate": 2.4e-05, "loss": 0.1257, "step": 270 }, { "epoch": 0.24888888888888888, "grad_norm": 0.9014438390731812, "learning_rate": 2.488888888888889e-05, "loss": 0.1176, "step": 280 }, { "epoch": 0.2577777777777778, "grad_norm": 0.9729371666908264, "learning_rate": 2.5777777777777778e-05, "loss": 0.1121, "step": 290 }, { "epoch": 0.26666666666666666, "grad_norm": 0.7709713578224182, "learning_rate": 2.6666666666666667e-05, "loss": 0.0996, "step": 300 }, { "epoch": 0.27555555555555555, "grad_norm": 1.095412254333496, "learning_rate": 2.7555555555555555e-05, "loss": 0.097, "step": 310 }, { "epoch": 0.28444444444444444, "grad_norm": 0.764116644859314, "learning_rate": 2.8444444444444447e-05, "loss": 0.1138, "step": 320 }, { "epoch": 0.29333333333333333, "grad_norm": 0.9186961054801941, "learning_rate": 2.9333333333333336e-05, "loss": 0.1014, "step": 330 }, { "epoch": 0.3022222222222222, "grad_norm": 0.66915363073349, "learning_rate": 3.0222222222222225e-05, "loss": 0.102, "step": 340 }, { "epoch": 0.3111111111111111, "grad_norm": 0.9105449318885803, "learning_rate": 3.111111111111111e-05, "loss": 0.0982, "step": 350 }, { "epoch": 0.32, "grad_norm": 0.6602293848991394, "learning_rate": 3.2000000000000005e-05, "loss": 0.0996, "step": 360 }, { "epoch": 0.3288888888888889, "grad_norm": 0.7929656505584717, "learning_rate": 3.2888888888888894e-05, "loss": 0.0898, "step": 370 }, { "epoch": 0.3377777777777778, "grad_norm": 0.7161681056022644, "learning_rate": 3.377777777777778e-05, "loss": 0.0908, "step": 380 }, { "epoch": 0.3466666666666667, "grad_norm": 0.7836205363273621, "learning_rate": 3.466666666666667e-05, "loss": 0.0851, "step": 390 }, { "epoch": 0.35555555555555557, "grad_norm": 0.7205038070678711, "learning_rate": 3.555555555555556e-05, "loss": 0.0785, "step": 400 }, { "epoch": 0.36444444444444446, "grad_norm": 0.7583833336830139, "learning_rate": 3.644444444444445e-05, "loss": 0.0849, "step": 410 }, { "epoch": 0.37333333333333335, "grad_norm": 0.6197006106376648, "learning_rate": 3.733333333333334e-05, "loss": 0.0853, "step": 420 }, { "epoch": 0.38222222222222224, "grad_norm": 0.7356877326965332, "learning_rate": 3.8222222222222226e-05, "loss": 0.0814, "step": 430 }, { "epoch": 0.39111111111111113, "grad_norm": 0.559291660785675, "learning_rate": 3.9111111111111115e-05, "loss": 0.0809, "step": 440 }, { "epoch": 0.4, "grad_norm": 0.8557582497596741, "learning_rate": 4e-05, "loss": 0.0857, "step": 450 }, { "epoch": 0.4088888888888889, "grad_norm": 0.5861983299255371, "learning_rate": 4.088888888888889e-05, "loss": 0.0804, "step": 460 }, { "epoch": 0.4177777777777778, "grad_norm": 0.5860317945480347, "learning_rate": 4.177777777777778e-05, "loss": 0.0674, "step": 470 }, { "epoch": 0.4266666666666667, "grad_norm": 0.6448118686676025, "learning_rate": 4.266666666666667e-05, "loss": 0.0683, "step": 480 }, { "epoch": 0.43555555555555553, "grad_norm": 0.574352502822876, "learning_rate": 4.355555555555556e-05, "loss": 0.0899, "step": 490 }, { "epoch": 0.4444444444444444, "grad_norm": 0.5665810108184814, "learning_rate": 4.4444444444444447e-05, "loss": 0.0696, "step": 500 }, { "epoch": 0.4533333333333333, "grad_norm": 0.6638944745063782, "learning_rate": 4.5333333333333335e-05, "loss": 0.0754, "step": 510 }, { "epoch": 0.4622222222222222, "grad_norm": 0.630711019039154, "learning_rate": 4.6222222222222224e-05, "loss": 0.0749, "step": 520 }, { "epoch": 0.4711111111111111, "grad_norm": 0.6063206195831299, "learning_rate": 4.711111111111111e-05, "loss": 0.0726, "step": 530 }, { "epoch": 0.48, "grad_norm": 0.7545419335365295, "learning_rate": 4.8e-05, "loss": 0.0725, "step": 540 }, { "epoch": 0.4888888888888889, "grad_norm": 0.9865553975105286, "learning_rate": 4.888888888888889e-05, "loss": 0.0859, "step": 550 }, { "epoch": 0.49777777777777776, "grad_norm": 0.5210093259811401, "learning_rate": 4.977777777777778e-05, "loss": 0.0697, "step": 560 }, { "epoch": 0.5066666666666667, "grad_norm": 0.9670521020889282, "learning_rate": 5.0666666666666674e-05, "loss": 0.0751, "step": 570 }, { "epoch": 0.5155555555555555, "grad_norm": 0.9070740342140198, "learning_rate": 5.1555555555555556e-05, "loss": 0.0645, "step": 580 }, { "epoch": 0.5244444444444445, "grad_norm": 0.610295832157135, "learning_rate": 5.244444444444445e-05, "loss": 0.0611, "step": 590 }, { "epoch": 0.5333333333333333, "grad_norm": 0.5589671730995178, "learning_rate": 5.333333333333333e-05, "loss": 0.0711, "step": 600 }, { "epoch": 0.5422222222222223, "grad_norm": 0.6363654136657715, "learning_rate": 5.422222222222223e-05, "loss": 0.0666, "step": 610 }, { "epoch": 0.5511111111111111, "grad_norm": 0.5367708802223206, "learning_rate": 5.511111111111111e-05, "loss": 0.0658, "step": 620 }, { "epoch": 0.56, "grad_norm": 0.7059729099273682, "learning_rate": 5.6000000000000006e-05, "loss": 0.0726, "step": 630 }, { "epoch": 0.5688888888888889, "grad_norm": 0.45579686760902405, "learning_rate": 5.6888888888888895e-05, "loss": 0.0623, "step": 640 }, { "epoch": 0.5777777777777777, "grad_norm": 0.4652188718318939, "learning_rate": 5.7777777777777776e-05, "loss": 0.0664, "step": 650 }, { "epoch": 0.5866666666666667, "grad_norm": 0.5640103816986084, "learning_rate": 5.866666666666667e-05, "loss": 0.0548, "step": 660 }, { "epoch": 0.5955555555555555, "grad_norm": 0.6014713644981384, "learning_rate": 5.9555555555555554e-05, "loss": 0.0586, "step": 670 }, { "epoch": 0.6044444444444445, "grad_norm": 0.7811099886894226, "learning_rate": 6.044444444444445e-05, "loss": 0.0651, "step": 680 }, { "epoch": 0.6133333333333333, "grad_norm": 0.6935920715332031, "learning_rate": 6.133333333333334e-05, "loss": 0.0671, "step": 690 }, { "epoch": 0.6222222222222222, "grad_norm": 0.5297024250030518, "learning_rate": 6.222222222222222e-05, "loss": 0.0669, "step": 700 }, { "epoch": 0.6311111111111111, "grad_norm": 0.7475340962409973, "learning_rate": 6.311111111111112e-05, "loss": 0.0708, "step": 710 }, { "epoch": 0.64, "grad_norm": 0.5169958472251892, "learning_rate": 6.400000000000001e-05, "loss": 0.07, "step": 720 }, { "epoch": 0.6488888888888888, "grad_norm": 0.5551416873931885, "learning_rate": 6.488888888888889e-05, "loss": 0.0641, "step": 730 }, { "epoch": 0.6577777777777778, "grad_norm": 0.5010764598846436, "learning_rate": 6.577777777777779e-05, "loss": 0.0597, "step": 740 }, { "epoch": 0.6666666666666666, "grad_norm": 0.42308464646339417, "learning_rate": 6.666666666666667e-05, "loss": 0.0519, "step": 750 }, { "epoch": 0.6755555555555556, "grad_norm": 0.6996558904647827, "learning_rate": 6.755555555555557e-05, "loss": 0.0628, "step": 760 }, { "epoch": 0.6844444444444444, "grad_norm": 0.7874552607536316, "learning_rate": 6.844444444444445e-05, "loss": 0.0697, "step": 770 }, { "epoch": 0.6933333333333334, "grad_norm": 0.5602414011955261, "learning_rate": 6.933333333333334e-05, "loss": 0.0623, "step": 780 }, { "epoch": 0.7022222222222222, "grad_norm": 0.5839990973472595, "learning_rate": 7.022222222222222e-05, "loss": 0.0727, "step": 790 }, { "epoch": 0.7111111111111111, "grad_norm": 0.47553110122680664, "learning_rate": 7.111111111111112e-05, "loss": 0.0706, "step": 800 }, { "epoch": 0.72, "grad_norm": 0.5351859331130981, "learning_rate": 7.2e-05, "loss": 0.07, "step": 810 }, { "epoch": 0.7288888888888889, "grad_norm": 0.5547653436660767, "learning_rate": 7.28888888888889e-05, "loss": 0.0642, "step": 820 }, { "epoch": 0.7377777777777778, "grad_norm": 0.5916814208030701, "learning_rate": 7.377777777777778e-05, "loss": 0.0649, "step": 830 }, { "epoch": 0.7466666666666667, "grad_norm": 0.6574834585189819, "learning_rate": 7.466666666666667e-05, "loss": 0.07, "step": 840 }, { "epoch": 0.7555555555555555, "grad_norm": 0.5202444791793823, "learning_rate": 7.555555555555556e-05, "loss": 0.0617, "step": 850 }, { "epoch": 0.7644444444444445, "grad_norm": 0.6287102103233337, "learning_rate": 7.644444444444445e-05, "loss": 0.0605, "step": 860 }, { "epoch": 0.7733333333333333, "grad_norm": 0.4295113682746887, "learning_rate": 7.733333333333333e-05, "loss": 0.0568, "step": 870 }, { "epoch": 0.7822222222222223, "grad_norm": 0.5854935050010681, "learning_rate": 7.822222222222223e-05, "loss": 0.0546, "step": 880 }, { "epoch": 0.7911111111111111, "grad_norm": 0.5637120604515076, "learning_rate": 7.911111111111111e-05, "loss": 0.0593, "step": 890 }, { "epoch": 0.8, "grad_norm": 0.5082525610923767, "learning_rate": 8e-05, "loss": 0.0559, "step": 900 }, { "epoch": 0.8088888888888889, "grad_norm": 0.5577961802482605, "learning_rate": 8.088888888888889e-05, "loss": 0.0586, "step": 910 }, { "epoch": 0.8177777777777778, "grad_norm": 0.8455150127410889, "learning_rate": 8.177777777777778e-05, "loss": 0.0656, "step": 920 }, { "epoch": 0.8266666666666667, "grad_norm": 0.7705115079879761, "learning_rate": 8.266666666666667e-05, "loss": 0.0558, "step": 930 }, { "epoch": 0.8355555555555556, "grad_norm": 0.5148472189903259, "learning_rate": 8.355555555555556e-05, "loss": 0.0627, "step": 940 }, { "epoch": 0.8444444444444444, "grad_norm": 0.4516676068305969, "learning_rate": 8.444444444444444e-05, "loss": 0.0616, "step": 950 }, { "epoch": 0.8533333333333334, "grad_norm": 0.4560335576534271, "learning_rate": 8.533333333333334e-05, "loss": 0.0699, "step": 960 }, { "epoch": 0.8622222222222222, "grad_norm": 0.46958062052726746, "learning_rate": 8.622222222222222e-05, "loss": 0.0665, "step": 970 }, { "epoch": 0.8711111111111111, "grad_norm": 0.495043009519577, "learning_rate": 8.711111111111112e-05, "loss": 0.0615, "step": 980 }, { "epoch": 0.88, "grad_norm": 0.6721417307853699, "learning_rate": 8.800000000000001e-05, "loss": 0.0648, "step": 990 }, { "epoch": 0.8888888888888888, "grad_norm": 0.6050969958305359, "learning_rate": 8.888888888888889e-05, "loss": 0.0585, "step": 1000 }, { "epoch": 0.8977777777777778, "grad_norm": 0.5422479510307312, "learning_rate": 8.977777777777779e-05, "loss": 0.066, "step": 1010 }, { "epoch": 0.9066666666666666, "grad_norm": 0.5164414644241333, "learning_rate": 9.066666666666667e-05, "loss": 0.0631, "step": 1020 }, { "epoch": 0.9155555555555556, "grad_norm": 0.582086443901062, "learning_rate": 9.155555555555557e-05, "loss": 0.0632, "step": 1030 }, { "epoch": 0.9244444444444444, "grad_norm": 0.7132883667945862, "learning_rate": 9.244444444444445e-05, "loss": 0.0545, "step": 1040 }, { "epoch": 0.9333333333333333, "grad_norm": 0.5442332625389099, "learning_rate": 9.333333333333334e-05, "loss": 0.0547, "step": 1050 }, { "epoch": 0.9422222222222222, "grad_norm": 0.35314828157424927, "learning_rate": 9.422222222222223e-05, "loss": 0.0579, "step": 1060 }, { "epoch": 0.9511111111111111, "grad_norm": 0.46566030383110046, "learning_rate": 9.511111111111112e-05, "loss": 0.0564, "step": 1070 }, { "epoch": 0.96, "grad_norm": 0.39371681213378906, "learning_rate": 9.6e-05, "loss": 0.0493, "step": 1080 }, { "epoch": 0.9688888888888889, "grad_norm": 0.5286294221878052, "learning_rate": 9.68888888888889e-05, "loss": 0.0638, "step": 1090 }, { "epoch": 0.9777777777777777, "grad_norm": 0.5336845517158508, "learning_rate": 9.777777777777778e-05, "loss": 0.0541, "step": 1100 }, { "epoch": 0.9866666666666667, "grad_norm": 0.4992632269859314, "learning_rate": 9.866666666666668e-05, "loss": 0.053, "step": 1110 }, { "epoch": 0.9955555555555555, "grad_norm": 0.5677465796470642, "learning_rate": 9.955555555555556e-05, "loss": 0.0559, "step": 1120 }, { "epoch": 1.0044444444444445, "grad_norm": 0.45878276228904724, "learning_rate": 9.999998649895154e-05, "loss": 0.0609, "step": 1130 }, { "epoch": 1.0133333333333334, "grad_norm": 0.520175039768219, "learning_rate": 9.999987849060753e-05, "loss": 0.0557, "step": 1140 }, { "epoch": 1.0222222222222221, "grad_norm": 0.5627009272575378, "learning_rate": 9.999966247415285e-05, "loss": 0.0504, "step": 1150 }, { "epoch": 1.031111111111111, "grad_norm": 0.3478066325187683, "learning_rate": 9.999933845005409e-05, "loss": 0.0594, "step": 1160 }, { "epoch": 1.04, "grad_norm": 0.5413442850112915, "learning_rate": 9.999890641901125e-05, "loss": 0.0585, "step": 1170 }, { "epoch": 1.048888888888889, "grad_norm": 0.4987482726573944, "learning_rate": 9.999836638195753e-05, "loss": 0.0592, "step": 1180 }, { "epoch": 1.0577777777777777, "grad_norm": 0.4062700569629669, "learning_rate": 9.999771834005954e-05, "loss": 0.0531, "step": 1190 }, { "epoch": 1.0666666666666667, "grad_norm": 0.6570175290107727, "learning_rate": 9.999696229471716e-05, "loss": 0.0525, "step": 1200 }, { "epoch": 1.0755555555555556, "grad_norm": 0.49096977710723877, "learning_rate": 9.999609824756355e-05, "loss": 0.055, "step": 1210 }, { "epoch": 1.0844444444444445, "grad_norm": 0.40465685725212097, "learning_rate": 9.999512620046522e-05, "loss": 0.0523, "step": 1220 }, { "epoch": 1.0933333333333333, "grad_norm": 0.4358631670475006, "learning_rate": 9.999404615552194e-05, "loss": 0.057, "step": 1230 }, { "epoch": 1.1022222222222222, "grad_norm": 0.3530769348144531, "learning_rate": 9.999285811506678e-05, "loss": 0.0446, "step": 1240 }, { "epoch": 1.1111111111111112, "grad_norm": 0.5127022862434387, "learning_rate": 9.999156208166614e-05, "loss": 0.0513, "step": 1250 }, { "epoch": 1.12, "grad_norm": 0.6078258752822876, "learning_rate": 9.999015805811965e-05, "loss": 0.0666, "step": 1260 }, { "epoch": 1.1288888888888888, "grad_norm": 0.4968441426753998, "learning_rate": 9.998864604746022e-05, "loss": 0.0523, "step": 1270 }, { "epoch": 1.1377777777777778, "grad_norm": 0.47872912883758545, "learning_rate": 9.998702605295407e-05, "loss": 0.0525, "step": 1280 }, { "epoch": 1.1466666666666667, "grad_norm": 0.46508821845054626, "learning_rate": 9.998529807810064e-05, "loss": 0.0444, "step": 1290 }, { "epoch": 1.1555555555555554, "grad_norm": 0.3523375391960144, "learning_rate": 9.998346212663266e-05, "loss": 0.0462, "step": 1300 }, { "epoch": 1.1644444444444444, "grad_norm": 0.42386460304260254, "learning_rate": 9.998151820251608e-05, "loss": 0.0542, "step": 1310 }, { "epoch": 1.1733333333333333, "grad_norm": 0.36166882514953613, "learning_rate": 9.997946630995013e-05, "loss": 0.0533, "step": 1320 }, { "epoch": 1.1822222222222223, "grad_norm": 0.48401138186454773, "learning_rate": 9.997730645336721e-05, "loss": 0.0515, "step": 1330 }, { "epoch": 1.1911111111111112, "grad_norm": 0.4822772443294525, "learning_rate": 9.997503863743298e-05, "loss": 0.0623, "step": 1340 }, { "epoch": 1.2, "grad_norm": 0.5253335237503052, "learning_rate": 9.997266286704631e-05, "loss": 0.0538, "step": 1350 }, { "epoch": 1.208888888888889, "grad_norm": 0.5290883183479309, "learning_rate": 9.997017914733925e-05, "loss": 0.059, "step": 1360 }, { "epoch": 1.2177777777777778, "grad_norm": 0.478901207447052, "learning_rate": 9.996758748367706e-05, "loss": 0.0488, "step": 1370 }, { "epoch": 1.2266666666666666, "grad_norm": 0.3946424722671509, "learning_rate": 9.996488788165816e-05, "loss": 0.0591, "step": 1380 }, { "epoch": 1.2355555555555555, "grad_norm": 0.5172801613807678, "learning_rate": 9.996208034711416e-05, "loss": 0.0561, "step": 1390 }, { "epoch": 1.2444444444444445, "grad_norm": 0.3520701229572296, "learning_rate": 9.995916488610978e-05, "loss": 0.0475, "step": 1400 }, { "epoch": 1.2533333333333334, "grad_norm": 0.405266135931015, "learning_rate": 9.995614150494293e-05, "loss": 0.0499, "step": 1410 }, { "epoch": 1.2622222222222224, "grad_norm": 0.5158333778381348, "learning_rate": 9.995301021014458e-05, "loss": 0.0586, "step": 1420 }, { "epoch": 1.271111111111111, "grad_norm": 0.3017141819000244, "learning_rate": 9.99497710084789e-05, "loss": 0.0549, "step": 1430 }, { "epoch": 1.28, "grad_norm": 0.5214844942092896, "learning_rate": 9.994642390694308e-05, "loss": 0.0501, "step": 1440 }, { "epoch": 1.2888888888888888, "grad_norm": 0.4382994771003723, "learning_rate": 9.994296891276741e-05, "loss": 0.0538, "step": 1450 }, { "epoch": 1.2977777777777777, "grad_norm": 0.3737899661064148, "learning_rate": 9.99394060334153e-05, "loss": 0.0537, "step": 1460 }, { "epoch": 1.3066666666666666, "grad_norm": 0.40200427174568176, "learning_rate": 9.99357352765831e-05, "loss": 0.0535, "step": 1470 }, { "epoch": 1.3155555555555556, "grad_norm": 0.49049437046051025, "learning_rate": 9.993195665020033e-05, "loss": 0.0535, "step": 1480 }, { "epoch": 1.3244444444444445, "grad_norm": 0.4697207510471344, "learning_rate": 9.992807016242941e-05, "loss": 0.0551, "step": 1490 }, { "epoch": 1.3333333333333333, "grad_norm": 0.5618184804916382, "learning_rate": 9.992407582166581e-05, "loss": 0.0441, "step": 1500 }, { "epoch": 1.3422222222222222, "grad_norm": 0.5449051856994629, "learning_rate": 9.9919973636538e-05, "loss": 0.0622, "step": 1510 }, { "epoch": 1.3511111111111112, "grad_norm": 0.3462349772453308, "learning_rate": 9.991576361590736e-05, "loss": 0.0585, "step": 1520 }, { "epoch": 1.3599999999999999, "grad_norm": 0.4024447798728943, "learning_rate": 9.991144576886823e-05, "loss": 0.0565, "step": 1530 }, { "epoch": 1.3688888888888888, "grad_norm": 0.5131449699401855, "learning_rate": 9.990702010474791e-05, "loss": 0.052, "step": 1540 }, { "epoch": 1.3777777777777778, "grad_norm": 0.3706686794757843, "learning_rate": 9.990248663310658e-05, "loss": 0.045, "step": 1550 }, { "epoch": 1.3866666666666667, "grad_norm": 0.3904135227203369, "learning_rate": 9.989784536373726e-05, "loss": 0.0464, "step": 1560 }, { "epoch": 1.3955555555555557, "grad_norm": 0.43984469771385193, "learning_rate": 9.989309630666592e-05, "loss": 0.0516, "step": 1570 }, { "epoch": 1.4044444444444444, "grad_norm": 0.4207358956336975, "learning_rate": 9.988823947215127e-05, "loss": 0.0472, "step": 1580 }, { "epoch": 1.4133333333333333, "grad_norm": 0.2589600384235382, "learning_rate": 9.988327487068492e-05, "loss": 0.0427, "step": 1590 }, { "epoch": 1.4222222222222223, "grad_norm": 0.4724588394165039, "learning_rate": 9.987820251299122e-05, "loss": 0.0444, "step": 1600 }, { "epoch": 1.431111111111111, "grad_norm": 0.4696759879589081, "learning_rate": 9.987302241002732e-05, "loss": 0.0445, "step": 1610 }, { "epoch": 1.44, "grad_norm": 0.5009663701057434, "learning_rate": 9.986773457298311e-05, "loss": 0.0434, "step": 1620 }, { "epoch": 1.448888888888889, "grad_norm": 0.3957982063293457, "learning_rate": 9.986233901328122e-05, "loss": 0.0478, "step": 1630 }, { "epoch": 1.4577777777777778, "grad_norm": 0.4240823984146118, "learning_rate": 9.985683574257692e-05, "loss": 0.0473, "step": 1640 }, { "epoch": 1.4666666666666668, "grad_norm": 0.5002963542938232, "learning_rate": 9.985122477275824e-05, "loss": 0.0463, "step": 1650 }, { "epoch": 1.4755555555555555, "grad_norm": 0.292977511882782, "learning_rate": 9.98455061159458e-05, "loss": 0.0466, "step": 1660 }, { "epoch": 1.4844444444444445, "grad_norm": 0.35600346326828003, "learning_rate": 9.983967978449285e-05, "loss": 0.0439, "step": 1670 }, { "epoch": 1.4933333333333334, "grad_norm": 0.27484193444252014, "learning_rate": 9.983374579098523e-05, "loss": 0.0494, "step": 1680 }, { "epoch": 1.5022222222222221, "grad_norm": 0.41110509634017944, "learning_rate": 9.982770414824139e-05, "loss": 0.0506, "step": 1690 }, { "epoch": 1.511111111111111, "grad_norm": 0.4308290183544159, "learning_rate": 9.982155486931227e-05, "loss": 0.0518, "step": 1700 }, { "epoch": 1.52, "grad_norm": 0.4264397621154785, "learning_rate": 9.981529796748134e-05, "loss": 0.0502, "step": 1710 }, { "epoch": 1.528888888888889, "grad_norm": 0.416526198387146, "learning_rate": 9.980893345626459e-05, "loss": 0.0459, "step": 1720 }, { "epoch": 1.537777777777778, "grad_norm": 0.39369890093803406, "learning_rate": 9.980246134941037e-05, "loss": 0.0512, "step": 1730 }, { "epoch": 1.5466666666666666, "grad_norm": 0.45721301436424255, "learning_rate": 9.979588166089958e-05, "loss": 0.0467, "step": 1740 }, { "epoch": 1.5555555555555556, "grad_norm": 0.4951644837856293, "learning_rate": 9.978919440494539e-05, "loss": 0.058, "step": 1750 }, { "epoch": 1.5644444444444443, "grad_norm": 0.5478604435920715, "learning_rate": 9.978239959599341e-05, "loss": 0.0505, "step": 1760 }, { "epoch": 1.5733333333333333, "grad_norm": 0.5187038779258728, "learning_rate": 9.97754972487216e-05, "loss": 0.0448, "step": 1770 }, { "epoch": 1.5822222222222222, "grad_norm": 0.437528133392334, "learning_rate": 9.976848737804015e-05, "loss": 0.0569, "step": 1780 }, { "epoch": 1.5911111111111111, "grad_norm": 0.4384775161743164, "learning_rate": 9.976136999909156e-05, "loss": 0.0506, "step": 1790 }, { "epoch": 1.6, "grad_norm": 0.46364232897758484, "learning_rate": 9.975414512725057e-05, "loss": 0.0462, "step": 1800 }, { "epoch": 1.608888888888889, "grad_norm": 0.3214699923992157, "learning_rate": 9.974681277812412e-05, "loss": 0.0515, "step": 1810 }, { "epoch": 1.6177777777777778, "grad_norm": 4.810063362121582, "learning_rate": 9.97393729675513e-05, "loss": 0.0453, "step": 1820 }, { "epoch": 1.6266666666666667, "grad_norm": 0.33709824085235596, "learning_rate": 9.973182571160332e-05, "loss": 0.058, "step": 1830 }, { "epoch": 1.6355555555555554, "grad_norm": 0.37568166851997375, "learning_rate": 9.972417102658356e-05, "loss": 0.0439, "step": 1840 }, { "epoch": 1.6444444444444444, "grad_norm": 0.49844491481781006, "learning_rate": 9.97164089290274e-05, "loss": 0.0437, "step": 1850 }, { "epoch": 1.6533333333333333, "grad_norm": 0.48624005913734436, "learning_rate": 9.97085394357023e-05, "loss": 0.0443, "step": 1860 }, { "epoch": 1.6622222222222223, "grad_norm": 0.4012000262737274, "learning_rate": 9.970056256360765e-05, "loss": 0.0445, "step": 1870 }, { "epoch": 1.6711111111111112, "grad_norm": 0.45026034116744995, "learning_rate": 9.969247832997481e-05, "loss": 0.0476, "step": 1880 }, { "epoch": 1.6800000000000002, "grad_norm": 0.28989550471305847, "learning_rate": 9.968428675226714e-05, "loss": 0.0452, "step": 1890 }, { "epoch": 1.6888888888888889, "grad_norm": 0.43861567974090576, "learning_rate": 9.967598784817977e-05, "loss": 0.0405, "step": 1900 }, { "epoch": 1.6977777777777778, "grad_norm": 0.34019795060157776, "learning_rate": 9.966758163563975e-05, "loss": 0.0452, "step": 1910 }, { "epoch": 1.7066666666666666, "grad_norm": 0.3666574954986572, "learning_rate": 9.96590681328059e-05, "loss": 0.0388, "step": 1920 }, { "epoch": 1.7155555555555555, "grad_norm": 0.30438700318336487, "learning_rate": 9.96504473580688e-05, "loss": 0.0375, "step": 1930 }, { "epoch": 1.7244444444444444, "grad_norm": 0.36628085374832153, "learning_rate": 9.964171933005077e-05, "loss": 0.0465, "step": 1940 }, { "epoch": 1.7333333333333334, "grad_norm": 0.2996879518032074, "learning_rate": 9.963288406760582e-05, "loss": 0.042, "step": 1950 }, { "epoch": 1.7422222222222223, "grad_norm": 0.43508851528167725, "learning_rate": 9.96239415898196e-05, "loss": 0.0474, "step": 1960 }, { "epoch": 1.751111111111111, "grad_norm": 0.38386306166648865, "learning_rate": 9.961489191600936e-05, "loss": 0.0365, "step": 1970 }, { "epoch": 1.76, "grad_norm": 0.3365028500556946, "learning_rate": 9.96057350657239e-05, "loss": 0.0434, "step": 1980 }, { "epoch": 1.7688888888888887, "grad_norm": 0.34734559059143066, "learning_rate": 9.959647105874354e-05, "loss": 0.0487, "step": 1990 }, { "epoch": 1.7777777777777777, "grad_norm": 0.365913987159729, "learning_rate": 9.958709991508012e-05, "loss": 0.0495, "step": 2000 }, { "epoch": 1.7866666666666666, "grad_norm": 0.43949899077415466, "learning_rate": 9.957762165497686e-05, "loss": 0.0438, "step": 2010 }, { "epoch": 1.7955555555555556, "grad_norm": 0.412881076335907, "learning_rate": 9.956803629890838e-05, "loss": 0.0462, "step": 2020 }, { "epoch": 1.8044444444444445, "grad_norm": 0.3077094256877899, "learning_rate": 9.955834386758068e-05, "loss": 0.0473, "step": 2030 }, { "epoch": 1.8133333333333335, "grad_norm": 0.3257523477077484, "learning_rate": 9.9548544381931e-05, "loss": 0.0464, "step": 2040 }, { "epoch": 1.8222222222222222, "grad_norm": 0.3986411690711975, "learning_rate": 9.95386378631279e-05, "loss": 0.0445, "step": 2050 }, { "epoch": 1.8311111111111111, "grad_norm": 0.2742040753364563, "learning_rate": 9.952862433257109e-05, "loss": 0.0416, "step": 2060 }, { "epoch": 1.8399999999999999, "grad_norm": 0.3750251233577728, "learning_rate": 9.95185038118915e-05, "loss": 0.0459, "step": 2070 }, { "epoch": 1.8488888888888888, "grad_norm": 0.4271152913570404, "learning_rate": 9.950827632295114e-05, "loss": 0.044, "step": 2080 }, { "epoch": 1.8577777777777778, "grad_norm": 0.44808468222618103, "learning_rate": 9.949794188784311e-05, "loss": 0.044, "step": 2090 }, { "epoch": 1.8666666666666667, "grad_norm": 0.4963094890117645, "learning_rate": 9.94875005288915e-05, "loss": 0.0471, "step": 2100 }, { "epoch": 1.8755555555555556, "grad_norm": 0.3342769742012024, "learning_rate": 9.947695226865142e-05, "loss": 0.0443, "step": 2110 }, { "epoch": 1.8844444444444446, "grad_norm": 0.5500453114509583, "learning_rate": 9.946629712990885e-05, "loss": 0.0498, "step": 2120 }, { "epoch": 1.8933333333333333, "grad_norm": 0.40924954414367676, "learning_rate": 9.945553513568068e-05, "loss": 0.0465, "step": 2130 }, { "epoch": 1.9022222222222223, "grad_norm": 0.5623714923858643, "learning_rate": 9.944466630921465e-05, "loss": 0.0464, "step": 2140 }, { "epoch": 1.911111111111111, "grad_norm": 0.2649368345737457, "learning_rate": 9.943369067398921e-05, "loss": 0.0466, "step": 2150 }, { "epoch": 1.92, "grad_norm": 0.37465882301330566, "learning_rate": 9.942260825371358e-05, "loss": 0.0386, "step": 2160 }, { "epoch": 1.9288888888888889, "grad_norm": 0.4072251617908478, "learning_rate": 9.941141907232765e-05, "loss": 0.0431, "step": 2170 }, { "epoch": 1.9377777777777778, "grad_norm": 0.3971821963787079, "learning_rate": 9.94001231540019e-05, "loss": 0.0423, "step": 2180 }, { "epoch": 1.9466666666666668, "grad_norm": 0.41692420840263367, "learning_rate": 9.938872052313746e-05, "loss": 0.0443, "step": 2190 }, { "epoch": 1.9555555555555557, "grad_norm": 0.4185730218887329, "learning_rate": 9.937721120436587e-05, "loss": 0.0454, "step": 2200 }, { "epoch": 1.9644444444444444, "grad_norm": 0.44983744621276855, "learning_rate": 9.93655952225492e-05, "loss": 0.0423, "step": 2210 }, { "epoch": 1.9733333333333334, "grad_norm": 0.3681102693080902, "learning_rate": 9.935387260277993e-05, "loss": 0.0476, "step": 2220 }, { "epoch": 1.982222222222222, "grad_norm": 0.2631477415561676, "learning_rate": 9.934204337038087e-05, "loss": 0.0341, "step": 2230 }, { "epoch": 1.991111111111111, "grad_norm": 0.33804768323898315, "learning_rate": 9.933010755090515e-05, "loss": 0.041, "step": 2240 }, { "epoch": 2.0, "grad_norm": 0.9031186699867249, "learning_rate": 9.931806517013612e-05, "loss": 0.0437, "step": 2250 }, { "epoch": 2.008888888888889, "grad_norm": 0.32650598883628845, "learning_rate": 9.930591625408736e-05, "loss": 0.0462, "step": 2260 }, { "epoch": 2.017777777777778, "grad_norm": 0.4269265830516815, "learning_rate": 9.929366082900256e-05, "loss": 0.0414, "step": 2270 }, { "epoch": 2.026666666666667, "grad_norm": 0.310558021068573, "learning_rate": 9.92812989213555e-05, "loss": 0.043, "step": 2280 }, { "epoch": 2.0355555555555553, "grad_norm": 0.4473124146461487, "learning_rate": 9.926883055784994e-05, "loss": 0.0451, "step": 2290 }, { "epoch": 2.0444444444444443, "grad_norm": 0.3853514790534973, "learning_rate": 9.925625576541965e-05, "loss": 0.044, "step": 2300 }, { "epoch": 2.0533333333333332, "grad_norm": 0.3843619227409363, "learning_rate": 9.924357457122828e-05, "loss": 0.0461, "step": 2310 }, { "epoch": 2.062222222222222, "grad_norm": 0.3739428222179413, "learning_rate": 9.923078700266937e-05, "loss": 0.039, "step": 2320 }, { "epoch": 2.071111111111111, "grad_norm": 0.34303751587867737, "learning_rate": 9.921789308736615e-05, "loss": 0.0423, "step": 2330 }, { "epoch": 2.08, "grad_norm": 0.32219573855400085, "learning_rate": 9.92048928531717e-05, "loss": 0.0471, "step": 2340 }, { "epoch": 2.088888888888889, "grad_norm": 0.32170942425727844, "learning_rate": 9.919178632816864e-05, "loss": 0.0483, "step": 2350 }, { "epoch": 2.097777777777778, "grad_norm": 0.33931663632392883, "learning_rate": 9.917857354066931e-05, "loss": 0.0416, "step": 2360 }, { "epoch": 2.1066666666666665, "grad_norm": 0.3088505268096924, "learning_rate": 9.91652545192155e-05, "loss": 0.0389, "step": 2370 }, { "epoch": 2.1155555555555554, "grad_norm": 0.43251898884773254, "learning_rate": 9.915182929257856e-05, "loss": 0.0395, "step": 2380 }, { "epoch": 2.1244444444444444, "grad_norm": 0.32791194319725037, "learning_rate": 9.913829788975922e-05, "loss": 0.0446, "step": 2390 }, { "epoch": 2.1333333333333333, "grad_norm": 0.371288925409317, "learning_rate": 9.912466033998757e-05, "loss": 0.0491, "step": 2400 }, { "epoch": 2.1422222222222222, "grad_norm": 0.3457043766975403, "learning_rate": 9.911091667272302e-05, "loss": 0.0422, "step": 2410 }, { "epoch": 2.151111111111111, "grad_norm": 0.38191157579421997, "learning_rate": 9.909706691765417e-05, "loss": 0.0525, "step": 2420 }, { "epoch": 2.16, "grad_norm": 0.3175460398197174, "learning_rate": 9.90831111046988e-05, "loss": 0.039, "step": 2430 }, { "epoch": 2.168888888888889, "grad_norm": 0.33168870210647583, "learning_rate": 9.906904926400384e-05, "loss": 0.0362, "step": 2440 }, { "epoch": 2.1777777777777776, "grad_norm": 0.4228513538837433, "learning_rate": 9.905488142594519e-05, "loss": 0.0385, "step": 2450 }, { "epoch": 2.1866666666666665, "grad_norm": 0.44055965542793274, "learning_rate": 9.904060762112777e-05, "loss": 0.0381, "step": 2460 }, { "epoch": 2.1955555555555555, "grad_norm": 0.2829914689064026, "learning_rate": 9.902622788038538e-05, "loss": 0.0379, "step": 2470 }, { "epoch": 2.2044444444444444, "grad_norm": 0.3622589707374573, "learning_rate": 9.901174223478068e-05, "loss": 0.0421, "step": 2480 }, { "epoch": 2.2133333333333334, "grad_norm": 0.3114979863166809, "learning_rate": 9.899715071560508e-05, "loss": 0.0432, "step": 2490 }, { "epoch": 2.2222222222222223, "grad_norm": 0.30967316031455994, "learning_rate": 9.89824533543787e-05, "loss": 0.0403, "step": 2500 }, { "epoch": 2.2311111111111113, "grad_norm": 0.3376723527908325, "learning_rate": 9.896765018285031e-05, "loss": 0.039, "step": 2510 }, { "epoch": 2.24, "grad_norm": 0.4390922784805298, "learning_rate": 9.895274123299723e-05, "loss": 0.0403, "step": 2520 }, { "epoch": 2.2488888888888887, "grad_norm": 0.36367088556289673, "learning_rate": 9.893772653702532e-05, "loss": 0.0389, "step": 2530 }, { "epoch": 2.2577777777777777, "grad_norm": 0.3615938127040863, "learning_rate": 9.89226061273688e-05, "loss": 0.0443, "step": 2540 }, { "epoch": 2.2666666666666666, "grad_norm": 0.3793148994445801, "learning_rate": 9.890738003669029e-05, "loss": 0.0421, "step": 2550 }, { "epoch": 2.2755555555555556, "grad_norm": 0.523744523525238, "learning_rate": 9.889204829788071e-05, "loss": 0.0378, "step": 2560 }, { "epoch": 2.2844444444444445, "grad_norm": 0.3601992130279541, "learning_rate": 9.887661094405918e-05, "loss": 0.0436, "step": 2570 }, { "epoch": 2.2933333333333334, "grad_norm": 0.3968031108379364, "learning_rate": 9.886106800857298e-05, "loss": 0.0358, "step": 2580 }, { "epoch": 2.3022222222222224, "grad_norm": 0.3358360528945923, "learning_rate": 9.884541952499743e-05, "loss": 0.0415, "step": 2590 }, { "epoch": 2.311111111111111, "grad_norm": 0.304015189409256, "learning_rate": 9.88296655271359e-05, "loss": 0.0398, "step": 2600 }, { "epoch": 2.32, "grad_norm": 0.3838290572166443, "learning_rate": 9.881380604901964e-05, "loss": 0.035, "step": 2610 }, { "epoch": 2.328888888888889, "grad_norm": 0.37335821986198425, "learning_rate": 9.87978411249078e-05, "loss": 0.0325, "step": 2620 }, { "epoch": 2.3377777777777777, "grad_norm": 0.39212650060653687, "learning_rate": 9.878177078928727e-05, "loss": 0.037, "step": 2630 }, { "epoch": 2.3466666666666667, "grad_norm": 0.35716789960861206, "learning_rate": 9.876559507687267e-05, "loss": 0.0396, "step": 2640 }, { "epoch": 2.3555555555555556, "grad_norm": 0.3476065695285797, "learning_rate": 9.874931402260627e-05, "loss": 0.0411, "step": 2650 }, { "epoch": 2.3644444444444446, "grad_norm": 0.37928491830825806, "learning_rate": 9.873292766165785e-05, "loss": 0.0306, "step": 2660 }, { "epoch": 2.3733333333333335, "grad_norm": 0.3795747756958008, "learning_rate": 9.871643602942469e-05, "loss": 0.0457, "step": 2670 }, { "epoch": 2.3822222222222225, "grad_norm": 0.2743997275829315, "learning_rate": 9.86998391615315e-05, "loss": 0.0407, "step": 2680 }, { "epoch": 2.391111111111111, "grad_norm": 0.2923840284347534, "learning_rate": 9.868313709383028e-05, "loss": 0.0356, "step": 2690 }, { "epoch": 2.4, "grad_norm": 0.3541683256626129, "learning_rate": 9.86663298624003e-05, "loss": 0.0396, "step": 2700 }, { "epoch": 2.408888888888889, "grad_norm": 0.2716972827911377, "learning_rate": 9.864941750354798e-05, "loss": 0.0356, "step": 2710 }, { "epoch": 2.417777777777778, "grad_norm": 0.2902292311191559, "learning_rate": 9.863240005380687e-05, "loss": 0.0356, "step": 2720 }, { "epoch": 2.4266666666666667, "grad_norm": 0.2947423756122589, "learning_rate": 9.861527754993749e-05, "loss": 0.0358, "step": 2730 }, { "epoch": 2.4355555555555557, "grad_norm": 0.4348752200603485, "learning_rate": 9.859805002892732e-05, "loss": 0.0427, "step": 2740 }, { "epoch": 2.4444444444444446, "grad_norm": 0.3590121865272522, "learning_rate": 9.85807175279907e-05, "loss": 0.0382, "step": 2750 }, { "epoch": 2.453333333333333, "grad_norm": 0.3993508219718933, "learning_rate": 9.856328008456872e-05, "loss": 0.0474, "step": 2760 }, { "epoch": 2.462222222222222, "grad_norm": 0.42127183079719543, "learning_rate": 9.85457377363292e-05, "loss": 0.0449, "step": 2770 }, { "epoch": 2.471111111111111, "grad_norm": 0.2630346417427063, "learning_rate": 9.852809052116653e-05, "loss": 0.0408, "step": 2780 }, { "epoch": 2.48, "grad_norm": 0.4207378923892975, "learning_rate": 9.851033847720166e-05, "loss": 0.0396, "step": 2790 }, { "epoch": 2.488888888888889, "grad_norm": 0.31925681233406067, "learning_rate": 9.849248164278198e-05, "loss": 0.0354, "step": 2800 }, { "epoch": 2.497777777777778, "grad_norm": 0.4610537886619568, "learning_rate": 9.847452005648123e-05, "loss": 0.0376, "step": 2810 }, { "epoch": 2.506666666666667, "grad_norm": 0.21612519025802612, "learning_rate": 9.845645375709945e-05, "loss": 0.0357, "step": 2820 }, { "epoch": 2.5155555555555553, "grad_norm": 0.3914960026741028, "learning_rate": 9.843828278366287e-05, "loss": 0.0421, "step": 2830 }, { "epoch": 2.5244444444444447, "grad_norm": 0.4255249500274658, "learning_rate": 9.842000717542384e-05, "loss": 0.0438, "step": 2840 }, { "epoch": 2.533333333333333, "grad_norm": 0.323687344789505, "learning_rate": 9.840162697186075e-05, "loss": 0.0356, "step": 2850 }, { "epoch": 2.542222222222222, "grad_norm": 0.32032763957977295, "learning_rate": 9.838314221267788e-05, "loss": 0.0363, "step": 2860 }, { "epoch": 2.551111111111111, "grad_norm": 0.2584310472011566, "learning_rate": 9.836455293780544e-05, "loss": 0.0334, "step": 2870 }, { "epoch": 2.56, "grad_norm": 0.25427907705307007, "learning_rate": 9.834585918739936e-05, "loss": 0.033, "step": 2880 }, { "epoch": 2.568888888888889, "grad_norm": 0.38529977202415466, "learning_rate": 9.832706100184128e-05, "loss": 0.0385, "step": 2890 }, { "epoch": 2.5777777777777775, "grad_norm": 0.41247913241386414, "learning_rate": 9.830815842173842e-05, "loss": 0.0394, "step": 2900 }, { "epoch": 2.586666666666667, "grad_norm": 0.3500446379184723, "learning_rate": 9.828915148792352e-05, "loss": 0.0407, "step": 2910 }, { "epoch": 2.5955555555555554, "grad_norm": 0.3676256239414215, "learning_rate": 9.827004024145475e-05, "loss": 0.0439, "step": 2920 }, { "epoch": 2.6044444444444443, "grad_norm": 0.3094874620437622, "learning_rate": 9.825082472361557e-05, "loss": 0.035, "step": 2930 }, { "epoch": 2.6133333333333333, "grad_norm": 0.2368018627166748, "learning_rate": 9.823150497591476e-05, "loss": 0.0355, "step": 2940 }, { "epoch": 2.6222222222222222, "grad_norm": 0.29669561982154846, "learning_rate": 9.82120810400862e-05, "loss": 0.0361, "step": 2950 }, { "epoch": 2.631111111111111, "grad_norm": 0.30996474623680115, "learning_rate": 9.819255295808882e-05, "loss": 0.0375, "step": 2960 }, { "epoch": 2.64, "grad_norm": 0.3094545900821686, "learning_rate": 9.817292077210659e-05, "loss": 0.0398, "step": 2970 }, { "epoch": 2.648888888888889, "grad_norm": 0.360730916261673, "learning_rate": 9.815318452454826e-05, "loss": 0.0432, "step": 2980 }, { "epoch": 2.6577777777777776, "grad_norm": 0.36050188541412354, "learning_rate": 9.813334425804747e-05, "loss": 0.034, "step": 2990 }, { "epoch": 2.6666666666666665, "grad_norm": 0.3863266408443451, "learning_rate": 9.811340001546251e-05, "loss": 0.0342, "step": 3000 }, { "epoch": 2.6755555555555555, "grad_norm": 0.28646644949913025, "learning_rate": 9.809335183987631e-05, "loss": 0.041, "step": 3010 }, { "epoch": 2.6844444444444444, "grad_norm": 0.3396134674549103, "learning_rate": 9.807319977459623e-05, "loss": 0.0398, "step": 3020 }, { "epoch": 2.6933333333333334, "grad_norm": 0.26546743512153625, "learning_rate": 9.805294386315415e-05, "loss": 0.0388, "step": 3030 }, { "epoch": 2.7022222222222223, "grad_norm": 0.2608436942100525, "learning_rate": 9.803258414930622e-05, "loss": 0.0352, "step": 3040 }, { "epoch": 2.7111111111111112, "grad_norm": 0.2912008762359619, "learning_rate": 9.801212067703284e-05, "loss": 0.0424, "step": 3050 }, { "epoch": 2.7199999999999998, "grad_norm": 0.3557053804397583, "learning_rate": 9.799155349053851e-05, "loss": 0.0337, "step": 3060 }, { "epoch": 2.728888888888889, "grad_norm": 0.372562974691391, "learning_rate": 9.797088263425182e-05, "loss": 0.0429, "step": 3070 }, { "epoch": 2.7377777777777776, "grad_norm": 0.3825114965438843, "learning_rate": 9.795010815282526e-05, "loss": 0.0353, "step": 3080 }, { "epoch": 2.7466666666666666, "grad_norm": 0.3432016670703888, "learning_rate": 9.792923009113522e-05, "loss": 0.039, "step": 3090 }, { "epoch": 2.7555555555555555, "grad_norm": 0.2322668582201004, "learning_rate": 9.790824849428179e-05, "loss": 0.0355, "step": 3100 }, { "epoch": 2.7644444444444445, "grad_norm": 0.2659859359264374, "learning_rate": 9.788716340758874e-05, "loss": 0.0346, "step": 3110 }, { "epoch": 2.7733333333333334, "grad_norm": 0.29680147767066956, "learning_rate": 9.786597487660337e-05, "loss": 0.0369, "step": 3120 }, { "epoch": 2.7822222222222224, "grad_norm": 0.32307639718055725, "learning_rate": 9.784468294709648e-05, "loss": 0.0288, "step": 3130 }, { "epoch": 2.7911111111111113, "grad_norm": 0.23442520201206207, "learning_rate": 9.78232876650622e-05, "loss": 0.0345, "step": 3140 }, { "epoch": 2.8, "grad_norm": 0.24266482889652252, "learning_rate": 9.780178907671789e-05, "loss": 0.0469, "step": 3150 }, { "epoch": 2.8088888888888888, "grad_norm": 0.24101509153842926, "learning_rate": 9.778018722850413e-05, "loss": 0.039, "step": 3160 }, { "epoch": 2.8177777777777777, "grad_norm": 0.3673695921897888, "learning_rate": 9.775848216708455e-05, "loss": 0.0342, "step": 3170 }, { "epoch": 2.8266666666666667, "grad_norm": 0.24255426228046417, "learning_rate": 9.773667393934567e-05, "loss": 0.0425, "step": 3180 }, { "epoch": 2.8355555555555556, "grad_norm": 0.41061174869537354, "learning_rate": 9.771476259239695e-05, "loss": 0.04, "step": 3190 }, { "epoch": 2.8444444444444446, "grad_norm": 0.41657042503356934, "learning_rate": 9.769274817357054e-05, "loss": 0.0419, "step": 3200 }, { "epoch": 2.8533333333333335, "grad_norm": 0.3159908056259155, "learning_rate": 9.76706307304213e-05, "loss": 0.0359, "step": 3210 }, { "epoch": 2.862222222222222, "grad_norm": 0.33408939838409424, "learning_rate": 9.76484103107266e-05, "loss": 0.0372, "step": 3220 }, { "epoch": 2.871111111111111, "grad_norm": 0.35909169912338257, "learning_rate": 9.762608696248625e-05, "loss": 0.0433, "step": 3230 }, { "epoch": 2.88, "grad_norm": 0.3484880030155182, "learning_rate": 9.760366073392246e-05, "loss": 0.0377, "step": 3240 }, { "epoch": 2.888888888888889, "grad_norm": 0.38622426986694336, "learning_rate": 9.75811316734796e-05, "loss": 0.0354, "step": 3250 }, { "epoch": 2.897777777777778, "grad_norm": 0.3925771415233612, "learning_rate": 9.755849982982423e-05, "loss": 0.0359, "step": 3260 }, { "epoch": 2.9066666666666667, "grad_norm": 0.36158132553100586, "learning_rate": 9.753576525184492e-05, "loss": 0.0344, "step": 3270 }, { "epoch": 2.9155555555555557, "grad_norm": 0.3591807782649994, "learning_rate": 9.751292798865217e-05, "loss": 0.0354, "step": 3280 }, { "epoch": 2.924444444444444, "grad_norm": 0.33845221996307373, "learning_rate": 9.748998808957828e-05, "loss": 0.0316, "step": 3290 }, { "epoch": 2.9333333333333336, "grad_norm": 0.24767133593559265, "learning_rate": 9.746694560417731e-05, "loss": 0.0354, "step": 3300 }, { "epoch": 2.942222222222222, "grad_norm": 0.292439728975296, "learning_rate": 9.744380058222483e-05, "loss": 0.0321, "step": 3310 }, { "epoch": 2.951111111111111, "grad_norm": 0.3415641486644745, "learning_rate": 9.742055307371801e-05, "loss": 0.0364, "step": 3320 }, { "epoch": 2.96, "grad_norm": 0.3149973154067993, "learning_rate": 9.739720312887535e-05, "loss": 0.0392, "step": 3330 }, { "epoch": 2.968888888888889, "grad_norm": 0.2787880003452301, "learning_rate": 9.737375079813662e-05, "loss": 0.0301, "step": 3340 }, { "epoch": 2.977777777777778, "grad_norm": 0.2951167821884155, "learning_rate": 9.73501961321628e-05, "loss": 0.0368, "step": 3350 }, { "epoch": 2.986666666666667, "grad_norm": 0.3122771382331848, "learning_rate": 9.732653918183592e-05, "loss": 0.0304, "step": 3360 }, { "epoch": 2.9955555555555557, "grad_norm": 0.3808428943157196, "learning_rate": 9.730277999825895e-05, "loss": 0.0346, "step": 3370 }, { "epoch": 3.0044444444444443, "grad_norm": 0.3741139769554138, "learning_rate": 9.727891863275569e-05, "loss": 0.0405, "step": 3380 }, { "epoch": 3.013333333333333, "grad_norm": 0.37048643827438354, "learning_rate": 9.72549551368707e-05, "loss": 0.0356, "step": 3390 }, { "epoch": 3.022222222222222, "grad_norm": 0.32506686449050903, "learning_rate": 9.723088956236915e-05, "loss": 0.033, "step": 3400 }, { "epoch": 3.031111111111111, "grad_norm": 0.23321156203746796, "learning_rate": 9.720672196123667e-05, "loss": 0.0344, "step": 3410 }, { "epoch": 3.04, "grad_norm": 0.31639111042022705, "learning_rate": 9.718245238567939e-05, "loss": 0.0341, "step": 3420 }, { "epoch": 3.048888888888889, "grad_norm": 0.32793188095092773, "learning_rate": 9.715808088812361e-05, "loss": 0.0339, "step": 3430 }, { "epoch": 3.057777777777778, "grad_norm": 0.38049063086509705, "learning_rate": 9.713360752121588e-05, "loss": 0.0365, "step": 3440 }, { "epoch": 3.066666666666667, "grad_norm": 0.2897101044654846, "learning_rate": 9.710903233782272e-05, "loss": 0.044, "step": 3450 }, { "epoch": 3.0755555555555554, "grad_norm": 0.2086770087480545, "learning_rate": 9.708435539103069e-05, "loss": 0.0305, "step": 3460 }, { "epoch": 3.0844444444444443, "grad_norm": 0.31852009892463684, "learning_rate": 9.705957673414612e-05, "loss": 0.0328, "step": 3470 }, { "epoch": 3.0933333333333333, "grad_norm": 0.2999953329563141, "learning_rate": 9.703469642069503e-05, "loss": 0.0336, "step": 3480 }, { "epoch": 3.102222222222222, "grad_norm": 0.23380529880523682, "learning_rate": 9.70097145044231e-05, "loss": 0.0326, "step": 3490 }, { "epoch": 3.111111111111111, "grad_norm": 0.33831268548965454, "learning_rate": 9.698463103929542e-05, "loss": 0.0363, "step": 3500 }, { "epoch": 3.12, "grad_norm": 0.27403244376182556, "learning_rate": 9.695944607949649e-05, "loss": 0.0356, "step": 3510 }, { "epoch": 3.128888888888889, "grad_norm": 0.3632008731365204, "learning_rate": 9.693415967943006e-05, "loss": 0.0382, "step": 3520 }, { "epoch": 3.137777777777778, "grad_norm": 0.6398510932922363, "learning_rate": 9.690877189371896e-05, "loss": 0.039, "step": 3530 }, { "epoch": 3.1466666666666665, "grad_norm": 0.33499160408973694, "learning_rate": 9.688328277720507e-05, "loss": 0.0329, "step": 3540 }, { "epoch": 3.1555555555555554, "grad_norm": 0.2875252068042755, "learning_rate": 9.685769238494915e-05, "loss": 0.0315, "step": 3550 }, { "epoch": 3.1644444444444444, "grad_norm": 0.29723408818244934, "learning_rate": 9.683200077223072e-05, "loss": 0.0401, "step": 3560 }, { "epoch": 3.1733333333333333, "grad_norm": 0.28652524948120117, "learning_rate": 9.6806207994548e-05, "loss": 0.0386, "step": 3570 }, { "epoch": 3.1822222222222223, "grad_norm": 0.2795201539993286, "learning_rate": 9.678031410761766e-05, "loss": 0.0398, "step": 3580 }, { "epoch": 3.1911111111111112, "grad_norm": 0.42683014273643494, "learning_rate": 9.675431916737488e-05, "loss": 0.0374, "step": 3590 }, { "epoch": 3.2, "grad_norm": 0.25263598561286926, "learning_rate": 9.672822322997305e-05, "loss": 0.0309, "step": 3600 }, { "epoch": 3.2088888888888887, "grad_norm": 0.32438334822654724, "learning_rate": 9.670202635178378e-05, "loss": 0.0323, "step": 3610 }, { "epoch": 3.2177777777777776, "grad_norm": 0.2848886549472809, "learning_rate": 9.66757285893967e-05, "loss": 0.041, "step": 3620 }, { "epoch": 3.2266666666666666, "grad_norm": 0.3264160752296448, "learning_rate": 9.664932999961942e-05, "loss": 0.0365, "step": 3630 }, { "epoch": 3.2355555555555555, "grad_norm": 0.32695403695106506, "learning_rate": 9.662283063947727e-05, "loss": 0.0353, "step": 3640 }, { "epoch": 3.2444444444444445, "grad_norm": 0.32455602288246155, "learning_rate": 9.659623056621332e-05, "loss": 0.0312, "step": 3650 }, { "epoch": 3.2533333333333334, "grad_norm": 0.3134697377681732, "learning_rate": 9.65695298372882e-05, "loss": 0.0367, "step": 3660 }, { "epoch": 3.2622222222222224, "grad_norm": 0.3112470805644989, "learning_rate": 9.654272851037994e-05, "loss": 0.0367, "step": 3670 }, { "epoch": 3.2711111111111113, "grad_norm": 0.2979118525981903, "learning_rate": 9.651582664338388e-05, "loss": 0.0388, "step": 3680 }, { "epoch": 3.2800000000000002, "grad_norm": 0.29712149500846863, "learning_rate": 9.648882429441257e-05, "loss": 0.0434, "step": 3690 }, { "epoch": 3.2888888888888888, "grad_norm": 0.32076647877693176, "learning_rate": 9.646172152179562e-05, "loss": 0.0322, "step": 3700 }, { "epoch": 3.2977777777777777, "grad_norm": 0.31056782603263855, "learning_rate": 9.643451838407955e-05, "loss": 0.0394, "step": 3710 }, { "epoch": 3.3066666666666666, "grad_norm": 0.33347660303115845, "learning_rate": 9.640721494002769e-05, "loss": 0.0352, "step": 3720 }, { "epoch": 3.3155555555555556, "grad_norm": 0.29113152623176575, "learning_rate": 9.637981124862005e-05, "loss": 0.0341, "step": 3730 }, { "epoch": 3.3244444444444445, "grad_norm": 0.25592583417892456, "learning_rate": 9.635230736905319e-05, "loss": 0.0365, "step": 3740 }, { "epoch": 3.3333333333333335, "grad_norm": 0.307113915681839, "learning_rate": 9.632470336074009e-05, "loss": 0.0321, "step": 3750 }, { "epoch": 3.3422222222222224, "grad_norm": 0.2403087615966797, "learning_rate": 9.629699928331006e-05, "loss": 0.0385, "step": 3760 }, { "epoch": 3.351111111111111, "grad_norm": 0.2530773878097534, "learning_rate": 9.62691951966085e-05, "loss": 0.0335, "step": 3770 }, { "epoch": 3.36, "grad_norm": 0.2887554466724396, "learning_rate": 9.624129116069694e-05, "loss": 0.0327, "step": 3780 }, { "epoch": 3.368888888888889, "grad_norm": 0.3695181608200073, "learning_rate": 9.621328723585276e-05, "loss": 0.0428, "step": 3790 }, { "epoch": 3.3777777777777778, "grad_norm": 0.20996980369091034, "learning_rate": 9.618518348256909e-05, "loss": 0.0402, "step": 3800 }, { "epoch": 3.3866666666666667, "grad_norm": 0.3566212058067322, "learning_rate": 9.61569799615548e-05, "loss": 0.0344, "step": 3810 }, { "epoch": 3.3955555555555557, "grad_norm": 0.32204535603523254, "learning_rate": 9.612867673373418e-05, "loss": 0.033, "step": 3820 }, { "epoch": 3.4044444444444446, "grad_norm": 0.4201919138431549, "learning_rate": 9.610027386024693e-05, "loss": 0.0347, "step": 3830 }, { "epoch": 3.413333333333333, "grad_norm": 0.34567514061927795, "learning_rate": 9.607177140244806e-05, "loss": 0.0309, "step": 3840 }, { "epoch": 3.422222222222222, "grad_norm": 0.3397578299045563, "learning_rate": 9.604316942190763e-05, "loss": 0.0297, "step": 3850 }, { "epoch": 3.431111111111111, "grad_norm": 0.41883495450019836, "learning_rate": 9.60144679804107e-05, "loss": 0.0409, "step": 3860 }, { "epoch": 3.44, "grad_norm": 0.3135182857513428, "learning_rate": 9.598566713995718e-05, "loss": 0.0332, "step": 3870 }, { "epoch": 3.448888888888889, "grad_norm": 0.2924269437789917, "learning_rate": 9.595676696276172e-05, "loss": 0.0385, "step": 3880 }, { "epoch": 3.457777777777778, "grad_norm": 0.23524484038352966, "learning_rate": 9.592776751125356e-05, "loss": 0.0292, "step": 3890 }, { "epoch": 3.466666666666667, "grad_norm": 0.2670474648475647, "learning_rate": 9.589866884807635e-05, "loss": 0.0365, "step": 3900 }, { "epoch": 3.4755555555555557, "grad_norm": 0.23938554525375366, "learning_rate": 9.58694710360881e-05, "loss": 0.0314, "step": 3910 }, { "epoch": 3.4844444444444447, "grad_norm": 0.3153280019760132, "learning_rate": 9.584017413836093e-05, "loss": 0.027, "step": 3920 }, { "epoch": 3.493333333333333, "grad_norm": 0.27695003151893616, "learning_rate": 9.581077821818109e-05, "loss": 0.036, "step": 3930 }, { "epoch": 3.502222222222222, "grad_norm": 0.3147580921649933, "learning_rate": 9.578128333904867e-05, "loss": 0.033, "step": 3940 }, { "epoch": 3.511111111111111, "grad_norm": 0.3648083806037903, "learning_rate": 9.575168956467755e-05, "loss": 0.0358, "step": 3950 }, { "epoch": 3.52, "grad_norm": 0.4754103124141693, "learning_rate": 9.572199695899522e-05, "loss": 0.0328, "step": 3960 }, { "epoch": 3.528888888888889, "grad_norm": 0.3163406252861023, "learning_rate": 9.569220558614272e-05, "loss": 0.0349, "step": 3970 }, { "epoch": 3.537777777777778, "grad_norm": 0.239278182387352, "learning_rate": 9.566231551047438e-05, "loss": 0.037, "step": 3980 }, { "epoch": 3.546666666666667, "grad_norm": 0.28450435400009155, "learning_rate": 9.563232679655776e-05, "loss": 0.0427, "step": 3990 }, { "epoch": 3.5555555555555554, "grad_norm": 0.20281273126602173, "learning_rate": 9.560223950917353e-05, "loss": 0.0377, "step": 4000 }, { "epoch": 3.5644444444444443, "grad_norm": 0.30739882588386536, "learning_rate": 9.557205371331526e-05, "loss": 0.0371, "step": 4010 }, { "epoch": 3.5733333333333333, "grad_norm": 0.35005420446395874, "learning_rate": 9.554176947418931e-05, "loss": 0.0334, "step": 4020 }, { "epoch": 3.582222222222222, "grad_norm": 0.40436676144599915, "learning_rate": 9.551138685721471e-05, "loss": 0.0443, "step": 4030 }, { "epoch": 3.591111111111111, "grad_norm": 0.3816026449203491, "learning_rate": 9.548090592802302e-05, "loss": 0.0348, "step": 4040 }, { "epoch": 3.6, "grad_norm": 0.34424254298210144, "learning_rate": 9.545032675245813e-05, "loss": 0.0373, "step": 4050 }, { "epoch": 3.608888888888889, "grad_norm": 0.41114187240600586, "learning_rate": 9.541964939657619e-05, "loss": 0.0315, "step": 4060 }, { "epoch": 3.6177777777777775, "grad_norm": 0.2818847596645355, "learning_rate": 9.538887392664544e-05, "loss": 0.0282, "step": 4070 }, { "epoch": 3.626666666666667, "grad_norm": 0.3051854074001312, "learning_rate": 9.535800040914601e-05, "loss": 0.0373, "step": 4080 }, { "epoch": 3.6355555555555554, "grad_norm": 0.20587944984436035, "learning_rate": 9.532702891076993e-05, "loss": 0.0296, "step": 4090 }, { "epoch": 3.6444444444444444, "grad_norm": 0.2924690544605255, "learning_rate": 9.529595949842077e-05, "loss": 0.0359, "step": 4100 }, { "epoch": 3.6533333333333333, "grad_norm": 0.3009898364543915, "learning_rate": 9.526479223921366e-05, "loss": 0.0306, "step": 4110 }, { "epoch": 3.6622222222222223, "grad_norm": 0.27144601941108704, "learning_rate": 9.523352720047513e-05, "loss": 0.0289, "step": 4120 }, { "epoch": 3.671111111111111, "grad_norm": 0.31827253103256226, "learning_rate": 9.52021644497429e-05, "loss": 0.0294, "step": 4130 }, { "epoch": 3.68, "grad_norm": 0.33864107728004456, "learning_rate": 9.517070405476575e-05, "loss": 0.0318, "step": 4140 }, { "epoch": 3.688888888888889, "grad_norm": 0.37844815850257874, "learning_rate": 9.513914608350339e-05, "loss": 0.0421, "step": 4150 }, { "epoch": 3.6977777777777776, "grad_norm": 0.24675294756889343, "learning_rate": 9.510749060412634e-05, "loss": 0.0344, "step": 4160 }, { "epoch": 3.7066666666666666, "grad_norm": 0.4113079607486725, "learning_rate": 9.507573768501574e-05, "loss": 0.0309, "step": 4170 }, { "epoch": 3.7155555555555555, "grad_norm": 0.33354371786117554, "learning_rate": 9.50438873947632e-05, "loss": 0.032, "step": 4180 }, { "epoch": 3.7244444444444444, "grad_norm": 0.27452439069747925, "learning_rate": 9.501193980217068e-05, "loss": 0.0314, "step": 4190 }, { "epoch": 3.7333333333333334, "grad_norm": 0.38070064783096313, "learning_rate": 9.497989497625035e-05, "loss": 0.0363, "step": 4200 }, { "epoch": 3.7422222222222223, "grad_norm": 0.32076600193977356, "learning_rate": 9.494775298622438e-05, "loss": 0.0337, "step": 4210 }, { "epoch": 3.7511111111111113, "grad_norm": 0.26348283886909485, "learning_rate": 9.491551390152487e-05, "loss": 0.035, "step": 4220 }, { "epoch": 3.76, "grad_norm": 0.3817702829837799, "learning_rate": 9.488317779179361e-05, "loss": 0.0355, "step": 4230 }, { "epoch": 3.7688888888888887, "grad_norm": 0.28240111470222473, "learning_rate": 9.485074472688205e-05, "loss": 0.0307, "step": 4240 }, { "epoch": 3.7777777777777777, "grad_norm": 0.2909921109676361, "learning_rate": 9.481821477685101e-05, "loss": 0.0326, "step": 4250 }, { "epoch": 3.7866666666666666, "grad_norm": 0.2704683542251587, "learning_rate": 9.478558801197065e-05, "loss": 0.032, "step": 4260 }, { "epoch": 3.7955555555555556, "grad_norm": 0.35608425736427307, "learning_rate": 9.475286450272023e-05, "loss": 0.035, "step": 4270 }, { "epoch": 3.8044444444444445, "grad_norm": 0.23778478801250458, "learning_rate": 9.472004431978802e-05, "loss": 0.0312, "step": 4280 }, { "epoch": 3.8133333333333335, "grad_norm": 0.30667850375175476, "learning_rate": 9.468712753407112e-05, "loss": 0.0374, "step": 4290 }, { "epoch": 3.822222222222222, "grad_norm": 0.22654017806053162, "learning_rate": 9.465411421667528e-05, "loss": 0.034, "step": 4300 }, { "epoch": 3.8311111111111114, "grad_norm": 0.2786659300327301, "learning_rate": 9.462100443891481e-05, "loss": 0.0342, "step": 4310 }, { "epoch": 3.84, "grad_norm": 0.3615601658821106, "learning_rate": 9.458779827231237e-05, "loss": 0.0284, "step": 4320 }, { "epoch": 3.848888888888889, "grad_norm": 0.284096896648407, "learning_rate": 9.455449578859883e-05, "loss": 0.0346, "step": 4330 }, { "epoch": 3.8577777777777778, "grad_norm": 0.26605576276779175, "learning_rate": 9.452109705971314e-05, "loss": 0.0321, "step": 4340 }, { "epoch": 3.8666666666666667, "grad_norm": 0.2558707594871521, "learning_rate": 9.448760215780217e-05, "loss": 0.0344, "step": 4350 }, { "epoch": 3.8755555555555556, "grad_norm": 0.5096879005432129, "learning_rate": 9.445401115522048e-05, "loss": 0.0332, "step": 4360 }, { "epoch": 3.8844444444444446, "grad_norm": 0.3304046094417572, "learning_rate": 9.442032412453027e-05, "loss": 0.0351, "step": 4370 }, { "epoch": 3.8933333333333335, "grad_norm": 0.2874262034893036, "learning_rate": 9.438654113850118e-05, "loss": 0.0323, "step": 4380 }, { "epoch": 3.902222222222222, "grad_norm": 0.2614710330963135, "learning_rate": 9.43526622701101e-05, "loss": 0.0322, "step": 4390 }, { "epoch": 3.911111111111111, "grad_norm": 0.28675758838653564, "learning_rate": 9.431868759254109e-05, "loss": 0.0411, "step": 4400 }, { "epoch": 3.92, "grad_norm": 0.278188019990921, "learning_rate": 9.428461717918511e-05, "loss": 0.0376, "step": 4410 }, { "epoch": 3.928888888888889, "grad_norm": 0.30171114206314087, "learning_rate": 9.425045110363998e-05, "loss": 0.034, "step": 4420 }, { "epoch": 3.937777777777778, "grad_norm": 0.2379283457994461, "learning_rate": 9.421618943971013e-05, "loss": 0.0328, "step": 4430 }, { "epoch": 3.9466666666666668, "grad_norm": 0.3122510612010956, "learning_rate": 9.41818322614065e-05, "loss": 0.0386, "step": 4440 }, { "epoch": 3.9555555555555557, "grad_norm": 0.25919103622436523, "learning_rate": 9.414737964294636e-05, "loss": 0.0324, "step": 4450 }, { "epoch": 3.964444444444444, "grad_norm": 0.30704838037490845, "learning_rate": 9.41128316587531e-05, "loss": 0.0339, "step": 4460 }, { "epoch": 3.9733333333333336, "grad_norm": 0.27548304200172424, "learning_rate": 9.407818838345619e-05, "loss": 0.0396, "step": 4470 }, { "epoch": 3.982222222222222, "grad_norm": 0.23227204382419586, "learning_rate": 9.404344989189089e-05, "loss": 0.0329, "step": 4480 }, { "epoch": 3.991111111111111, "grad_norm": 0.42115721106529236, "learning_rate": 9.400861625909814e-05, "loss": 0.029, "step": 4490 }, { "epoch": 4.0, "grad_norm": 0.8465108275413513, "learning_rate": 9.397368756032445e-05, "loss": 0.0408, "step": 4500 }, { "epoch": 4.0088888888888885, "grad_norm": 0.32668155431747437, "learning_rate": 9.393866387102165e-05, "loss": 0.0346, "step": 4510 }, { "epoch": 4.017777777777778, "grad_norm": 0.2699815332889557, "learning_rate": 9.390354526684678e-05, "loss": 0.0276, "step": 4520 }, { "epoch": 4.026666666666666, "grad_norm": 0.18220765888690948, "learning_rate": 9.38683318236619e-05, "loss": 0.0363, "step": 4530 }, { "epoch": 4.035555555555556, "grad_norm": 0.3447783589363098, "learning_rate": 9.383302361753392e-05, "loss": 0.0392, "step": 4540 }, { "epoch": 4.044444444444444, "grad_norm": 0.24282586574554443, "learning_rate": 9.379762072473452e-05, "loss": 0.0256, "step": 4550 }, { "epoch": 4.053333333333334, "grad_norm": 0.23750334978103638, "learning_rate": 9.376212322173985e-05, "loss": 0.0338, "step": 4560 }, { "epoch": 4.062222222222222, "grad_norm": 0.2933289110660553, "learning_rate": 9.372653118523048e-05, "loss": 0.0311, "step": 4570 }, { "epoch": 4.071111111111111, "grad_norm": 0.2606777250766754, "learning_rate": 9.369084469209114e-05, "loss": 0.0413, "step": 4580 }, { "epoch": 4.08, "grad_norm": 0.33872371912002563, "learning_rate": 9.365506381941066e-05, "loss": 0.0311, "step": 4590 }, { "epoch": 4.088888888888889, "grad_norm": 0.2913971543312073, "learning_rate": 9.36191886444817e-05, "loss": 0.0316, "step": 4600 }, { "epoch": 4.097777777777778, "grad_norm": 0.3347288966178894, "learning_rate": 9.358321924480065e-05, "loss": 0.0271, "step": 4610 }, { "epoch": 4.1066666666666665, "grad_norm": 0.3797815442085266, "learning_rate": 9.354715569806744e-05, "loss": 0.0398, "step": 4620 }, { "epoch": 4.115555555555556, "grad_norm": 0.4615563452243805, "learning_rate": 9.351099808218537e-05, "loss": 0.0326, "step": 4630 }, { "epoch": 4.124444444444444, "grad_norm": 0.2890847623348236, "learning_rate": 9.347474647526095e-05, "loss": 0.0314, "step": 4640 }, { "epoch": 4.133333333333334, "grad_norm": 0.37725111842155457, "learning_rate": 9.343840095560372e-05, "loss": 0.031, "step": 4650 }, { "epoch": 4.142222222222222, "grad_norm": 0.3183709681034088, "learning_rate": 9.340196160172607e-05, "loss": 0.0359, "step": 4660 }, { "epoch": 4.151111111111111, "grad_norm": 0.40993157029151917, "learning_rate": 9.336542849234313e-05, "loss": 0.0368, "step": 4670 }, { "epoch": 4.16, "grad_norm": 0.2408754974603653, "learning_rate": 9.332880170637252e-05, "loss": 0.0376, "step": 4680 }, { "epoch": 4.168888888888889, "grad_norm": 0.3505978286266327, "learning_rate": 9.329208132293425e-05, "loss": 0.035, "step": 4690 }, { "epoch": 4.177777777777778, "grad_norm": 0.26639479398727417, "learning_rate": 9.325526742135047e-05, "loss": 0.0338, "step": 4700 }, { "epoch": 4.1866666666666665, "grad_norm": 0.399189829826355, "learning_rate": 9.321836008114539e-05, "loss": 0.0366, "step": 4710 }, { "epoch": 4.195555555555556, "grad_norm": 0.3193024694919586, "learning_rate": 9.318135938204507e-05, "loss": 0.0309, "step": 4720 }, { "epoch": 4.204444444444444, "grad_norm": 0.4050881564617157, "learning_rate": 9.314426540397716e-05, "loss": 0.0372, "step": 4730 }, { "epoch": 4.213333333333333, "grad_norm": 0.3356577157974243, "learning_rate": 9.31070782270709e-05, "loss": 0.0336, "step": 4740 }, { "epoch": 4.222222222222222, "grad_norm": 0.40196847915649414, "learning_rate": 9.306979793165681e-05, "loss": 0.0311, "step": 4750 }, { "epoch": 4.231111111111111, "grad_norm": 0.32312798500061035, "learning_rate": 9.303242459826658e-05, "loss": 0.0327, "step": 4760 }, { "epoch": 4.24, "grad_norm": 0.3864130973815918, "learning_rate": 9.299495830763286e-05, "loss": 0.0334, "step": 4770 }, { "epoch": 4.248888888888889, "grad_norm": 0.3033246099948883, "learning_rate": 9.29573991406891e-05, "loss": 0.0336, "step": 4780 }, { "epoch": 4.257777777777778, "grad_norm": 0.3268554210662842, "learning_rate": 9.291974717856943e-05, "loss": 0.035, "step": 4790 }, { "epoch": 4.266666666666667, "grad_norm": 0.29068896174430847, "learning_rate": 9.288200250260836e-05, "loss": 0.0303, "step": 4800 }, { "epoch": 4.275555555555556, "grad_norm": 0.34848421812057495, "learning_rate": 9.284416519434072e-05, "loss": 0.0359, "step": 4810 }, { "epoch": 4.2844444444444445, "grad_norm": 0.2770916223526001, "learning_rate": 9.280623533550143e-05, "loss": 0.0366, "step": 4820 }, { "epoch": 4.293333333333333, "grad_norm": 0.21983428299427032, "learning_rate": 9.276821300802534e-05, "loss": 0.0331, "step": 4830 }, { "epoch": 4.302222222222222, "grad_norm": 0.26840636134147644, "learning_rate": 9.273009829404704e-05, "loss": 0.0307, "step": 4840 }, { "epoch": 4.311111111111111, "grad_norm": 0.2498561441898346, "learning_rate": 9.26918912759007e-05, "loss": 0.0276, "step": 4850 }, { "epoch": 4.32, "grad_norm": 0.3338160514831543, "learning_rate": 9.265359203611987e-05, "loss": 0.0382, "step": 4860 }, { "epoch": 4.328888888888889, "grad_norm": 0.2833396792411804, "learning_rate": 9.261520065743734e-05, "loss": 0.0352, "step": 4870 }, { "epoch": 4.337777777777778, "grad_norm": 0.31678640842437744, "learning_rate": 9.257671722278491e-05, "loss": 0.0324, "step": 4880 }, { "epoch": 4.346666666666667, "grad_norm": 0.36985212564468384, "learning_rate": 9.253814181529323e-05, "loss": 0.0323, "step": 4890 }, { "epoch": 4.355555555555555, "grad_norm": 0.3558400869369507, "learning_rate": 9.249947451829164e-05, "loss": 0.0315, "step": 4900 }, { "epoch": 4.364444444444445, "grad_norm": 0.3022688031196594, "learning_rate": 9.246071541530801e-05, "loss": 0.0344, "step": 4910 }, { "epoch": 4.373333333333333, "grad_norm": 0.32495608925819397, "learning_rate": 9.242186459006845e-05, "loss": 0.0303, "step": 4920 }, { "epoch": 4.3822222222222225, "grad_norm": 0.2471638172864914, "learning_rate": 9.238292212649727e-05, "loss": 0.0249, "step": 4930 }, { "epoch": 4.391111111111111, "grad_norm": 0.23440200090408325, "learning_rate": 9.23438881087167e-05, "loss": 0.0337, "step": 4940 }, { "epoch": 4.4, "grad_norm": 0.20676815509796143, "learning_rate": 9.230476262104677e-05, "loss": 0.0362, "step": 4950 }, { "epoch": 4.408888888888889, "grad_norm": 0.29860973358154297, "learning_rate": 9.22655457480051e-05, "loss": 0.0372, "step": 4960 }, { "epoch": 4.417777777777777, "grad_norm": 0.27144888043403625, "learning_rate": 9.222623757430666e-05, "loss": 0.0312, "step": 4970 }, { "epoch": 4.426666666666667, "grad_norm": 0.30471858382225037, "learning_rate": 9.218683818486372e-05, "loss": 0.0297, "step": 4980 }, { "epoch": 4.435555555555555, "grad_norm": 0.24851693212985992, "learning_rate": 9.214734766478555e-05, "loss": 0.029, "step": 4990 }, { "epoch": 4.444444444444445, "grad_norm": 0.26683375239372253, "learning_rate": 9.210776609937829e-05, "loss": 0.0321, "step": 5000 }, { "epoch": 4.453333333333333, "grad_norm": 0.5162234902381897, "learning_rate": 9.206809357414474e-05, "loss": 0.0305, "step": 5010 }, { "epoch": 4.4622222222222225, "grad_norm": 0.26474708318710327, "learning_rate": 9.202833017478422e-05, "loss": 0.0291, "step": 5020 }, { "epoch": 4.471111111111111, "grad_norm": 0.25687724351882935, "learning_rate": 9.198847598719232e-05, "loss": 0.034, "step": 5030 }, { "epoch": 4.48, "grad_norm": 0.2707546651363373, "learning_rate": 9.194853109746074e-05, "loss": 0.0345, "step": 5040 }, { "epoch": 4.488888888888889, "grad_norm": 0.2860361635684967, "learning_rate": 9.190849559187714e-05, "loss": 0.0351, "step": 5050 }, { "epoch": 4.497777777777777, "grad_norm": 0.2473740428686142, "learning_rate": 9.186836955692493e-05, "loss": 0.0289, "step": 5060 }, { "epoch": 4.506666666666667, "grad_norm": 0.303381085395813, "learning_rate": 9.182815307928307e-05, "loss": 0.0287, "step": 5070 }, { "epoch": 4.515555555555555, "grad_norm": 0.27258095145225525, "learning_rate": 9.178784624582588e-05, "loss": 0.0298, "step": 5080 }, { "epoch": 4.524444444444445, "grad_norm": 0.34528905153274536, "learning_rate": 9.174744914362285e-05, "loss": 0.0322, "step": 5090 }, { "epoch": 4.533333333333333, "grad_norm": 0.28558841347694397, "learning_rate": 9.17069618599385e-05, "loss": 0.03, "step": 5100 }, { "epoch": 4.542222222222223, "grad_norm": 0.3177531063556671, "learning_rate": 9.166638448223214e-05, "loss": 0.032, "step": 5110 }, { "epoch": 4.551111111111111, "grad_norm": 0.16659627854824066, "learning_rate": 9.162571709815773e-05, "loss": 0.0336, "step": 5120 }, { "epoch": 4.5600000000000005, "grad_norm": 0.21066249907016754, "learning_rate": 9.158495979556358e-05, "loss": 0.0308, "step": 5130 }, { "epoch": 4.568888888888889, "grad_norm": 0.3087247610092163, "learning_rate": 9.154411266249233e-05, "loss": 0.0293, "step": 5140 }, { "epoch": 4.5777777777777775, "grad_norm": 0.3003707528114319, "learning_rate": 9.150317578718062e-05, "loss": 0.031, "step": 5150 }, { "epoch": 4.586666666666667, "grad_norm": 0.21863174438476562, "learning_rate": 9.146214925805891e-05, "loss": 0.0296, "step": 5160 }, { "epoch": 4.595555555555555, "grad_norm": 0.3071073889732361, "learning_rate": 9.142103316375145e-05, "loss": 0.0335, "step": 5170 }, { "epoch": 4.604444444444445, "grad_norm": 0.2671109437942505, "learning_rate": 9.137982759307584e-05, "loss": 0.0282, "step": 5180 }, { "epoch": 4.613333333333333, "grad_norm": 0.2070036381483078, "learning_rate": 9.133853263504302e-05, "loss": 0.0294, "step": 5190 }, { "epoch": 4.622222222222222, "grad_norm": 0.2777647376060486, "learning_rate": 9.129714837885702e-05, "loss": 0.033, "step": 5200 }, { "epoch": 4.631111111111111, "grad_norm": 0.25252506136894226, "learning_rate": 9.125567491391476e-05, "loss": 0.0353, "step": 5210 }, { "epoch": 4.64, "grad_norm": 0.24646121263504028, "learning_rate": 9.121411232980588e-05, "loss": 0.0383, "step": 5220 }, { "epoch": 4.648888888888889, "grad_norm": 0.21079258620738983, "learning_rate": 9.11724607163125e-05, "loss": 0.0299, "step": 5230 }, { "epoch": 4.657777777777778, "grad_norm": 0.24304281175136566, "learning_rate": 9.11307201634091e-05, "loss": 0.0353, "step": 5240 }, { "epoch": 4.666666666666667, "grad_norm": 0.20178978145122528, "learning_rate": 9.108889076126226e-05, "loss": 0.0302, "step": 5250 }, { "epoch": 4.6755555555555555, "grad_norm": 0.25041839480400085, "learning_rate": 9.104697260023049e-05, "loss": 0.0287, "step": 5260 }, { "epoch": 4.684444444444445, "grad_norm": 0.3615833818912506, "learning_rate": 9.100496577086404e-05, "loss": 0.0353, "step": 5270 }, { "epoch": 4.693333333333333, "grad_norm": 0.3499181866645813, "learning_rate": 9.09628703639047e-05, "loss": 0.0312, "step": 5280 }, { "epoch": 4.702222222222222, "grad_norm": 0.40948736667633057, "learning_rate": 9.092068647028558e-05, "loss": 0.0286, "step": 5290 }, { "epoch": 4.711111111111111, "grad_norm": 0.2539764940738678, "learning_rate": 9.0878414181131e-05, "loss": 0.0309, "step": 5300 }, { "epoch": 4.72, "grad_norm": 0.28651413321495056, "learning_rate": 9.083605358775612e-05, "loss": 0.0317, "step": 5310 }, { "epoch": 4.728888888888889, "grad_norm": 0.28653913736343384, "learning_rate": 9.079360478166695e-05, "loss": 0.0309, "step": 5320 }, { "epoch": 4.737777777777778, "grad_norm": 0.27438831329345703, "learning_rate": 9.075106785456002e-05, "loss": 0.0312, "step": 5330 }, { "epoch": 4.746666666666667, "grad_norm": 0.25500354170799255, "learning_rate": 9.070844289832224e-05, "loss": 0.0337, "step": 5340 }, { "epoch": 4.7555555555555555, "grad_norm": 0.39056479930877686, "learning_rate": 9.066573000503059e-05, "loss": 0.0324, "step": 5350 }, { "epoch": 4.764444444444445, "grad_norm": 0.24575889110565186, "learning_rate": 9.062292926695213e-05, "loss": 0.0288, "step": 5360 }, { "epoch": 4.773333333333333, "grad_norm": 0.3391013443470001, "learning_rate": 9.058004077654359e-05, "loss": 0.0323, "step": 5370 }, { "epoch": 4.782222222222222, "grad_norm": 0.2727068066596985, "learning_rate": 9.05370646264513e-05, "loss": 0.0316, "step": 5380 }, { "epoch": 4.791111111111111, "grad_norm": 0.29957178235054016, "learning_rate": 9.049400090951094e-05, "loss": 0.0315, "step": 5390 }, { "epoch": 4.8, "grad_norm": 0.26901039481163025, "learning_rate": 9.045084971874738e-05, "loss": 0.0269, "step": 5400 }, { "epoch": 4.808888888888889, "grad_norm": 0.1908525675535202, "learning_rate": 9.040761114737437e-05, "loss": 0.0316, "step": 5410 }, { "epoch": 4.817777777777778, "grad_norm": 0.2559043765068054, "learning_rate": 9.03642852887945e-05, "loss": 0.0295, "step": 5420 }, { "epoch": 4.826666666666666, "grad_norm": 0.27997031807899475, "learning_rate": 9.032087223659885e-05, "loss": 0.0282, "step": 5430 }, { "epoch": 4.835555555555556, "grad_norm": 0.2120792120695114, "learning_rate": 9.027737208456691e-05, "loss": 0.0297, "step": 5440 }, { "epoch": 4.844444444444444, "grad_norm": 0.21166673302650452, "learning_rate": 9.023378492666628e-05, "loss": 0.0295, "step": 5450 }, { "epoch": 4.8533333333333335, "grad_norm": 0.303838312625885, "learning_rate": 9.019011085705253e-05, "loss": 0.0333, "step": 5460 }, { "epoch": 4.862222222222222, "grad_norm": 0.26026326417922974, "learning_rate": 9.014634997006896e-05, "loss": 0.0272, "step": 5470 }, { "epoch": 4.871111111111111, "grad_norm": 0.38109803199768066, "learning_rate": 9.01025023602464e-05, "loss": 0.0317, "step": 5480 }, { "epoch": 4.88, "grad_norm": 0.3488011062145233, "learning_rate": 9.005856812230304e-05, "loss": 0.0283, "step": 5490 }, { "epoch": 4.888888888888889, "grad_norm": 0.24160318076610565, "learning_rate": 9.001454735114421e-05, "loss": 0.0266, "step": 5500 }, { "epoch": 4.897777777777778, "grad_norm": 0.26617929339408875, "learning_rate": 8.997044014186212e-05, "loss": 0.033, "step": 5510 }, { "epoch": 4.906666666666666, "grad_norm": 0.2577374577522278, "learning_rate": 8.992624658973574e-05, "loss": 0.0303, "step": 5520 }, { "epoch": 4.915555555555556, "grad_norm": 0.27222734689712524, "learning_rate": 8.988196679023054e-05, "loss": 0.0373, "step": 5530 }, { "epoch": 4.924444444444444, "grad_norm": 0.28233540058135986, "learning_rate": 8.98376008389983e-05, "loss": 0.0341, "step": 5540 }, { "epoch": 4.933333333333334, "grad_norm": 0.23355074226856232, "learning_rate": 8.979314883187693e-05, "loss": 0.0244, "step": 5550 }, { "epoch": 4.942222222222222, "grad_norm": 0.32290029525756836, "learning_rate": 8.974861086489017e-05, "loss": 0.0322, "step": 5560 }, { "epoch": 4.9511111111111115, "grad_norm": 0.28826332092285156, "learning_rate": 8.970398703424751e-05, "loss": 0.036, "step": 5570 }, { "epoch": 4.96, "grad_norm": 0.2596275806427002, "learning_rate": 8.965927743634391e-05, "loss": 0.032, "step": 5580 }, { "epoch": 4.968888888888889, "grad_norm": 0.2161322385072708, "learning_rate": 8.961448216775954e-05, "loss": 0.0319, "step": 5590 }, { "epoch": 4.977777777777778, "grad_norm": 0.2368537187576294, "learning_rate": 8.956960132525974e-05, "loss": 0.0293, "step": 5600 }, { "epoch": 4.986666666666666, "grad_norm": 0.29512253403663635, "learning_rate": 8.95246350057946e-05, "loss": 0.0317, "step": 5610 }, { "epoch": 4.995555555555556, "grad_norm": 0.3062151074409485, "learning_rate": 8.947958330649893e-05, "loss": 0.0292, "step": 5620 }, { "epoch": 5.004444444444444, "grad_norm": 0.29166528582572937, "learning_rate": 8.943444632469192e-05, "loss": 0.0278, "step": 5630 }, { "epoch": 5.013333333333334, "grad_norm": 0.3154290020465851, "learning_rate": 8.938922415787703e-05, "loss": 0.03, "step": 5640 }, { "epoch": 5.022222222222222, "grad_norm": 0.24332383275032043, "learning_rate": 8.934391690374171e-05, "loss": 0.0294, "step": 5650 }, { "epoch": 5.0311111111111115, "grad_norm": 0.3086906671524048, "learning_rate": 8.929852466015722e-05, "loss": 0.0327, "step": 5660 }, { "epoch": 5.04, "grad_norm": 0.24998405575752258, "learning_rate": 8.92530475251784e-05, "loss": 0.0304, "step": 5670 }, { "epoch": 5.0488888888888885, "grad_norm": 0.3011724352836609, "learning_rate": 8.920748559704347e-05, "loss": 0.0267, "step": 5680 }, { "epoch": 5.057777777777778, "grad_norm": 0.3268841505050659, "learning_rate": 8.916183897417383e-05, "loss": 0.0277, "step": 5690 }, { "epoch": 5.066666666666666, "grad_norm": 0.2826198935508728, "learning_rate": 8.911610775517382e-05, "loss": 0.0261, "step": 5700 }, { "epoch": 5.075555555555556, "grad_norm": 0.3765130043029785, "learning_rate": 8.907029203883058e-05, "loss": 0.0314, "step": 5710 }, { "epoch": 5.084444444444444, "grad_norm": 0.27179816365242004, "learning_rate": 8.902439192411367e-05, "loss": 0.0267, "step": 5720 }, { "epoch": 5.093333333333334, "grad_norm": 0.22206132113933563, "learning_rate": 8.897840751017506e-05, "loss": 0.0289, "step": 5730 }, { "epoch": 5.102222222222222, "grad_norm": 0.2394196093082428, "learning_rate": 8.893233889634875e-05, "loss": 0.0378, "step": 5740 }, { "epoch": 5.111111111111111, "grad_norm": 0.4401729106903076, "learning_rate": 8.88861861821507e-05, "loss": 0.0311, "step": 5750 }, { "epoch": 5.12, "grad_norm": 0.287504643201828, "learning_rate": 8.883994946727849e-05, "loss": 0.027, "step": 5760 }, { "epoch": 5.128888888888889, "grad_norm": 0.29627490043640137, "learning_rate": 8.879362885161115e-05, "loss": 0.0295, "step": 5770 }, { "epoch": 5.137777777777778, "grad_norm": 0.22109301388263702, "learning_rate": 8.874722443520899e-05, "loss": 0.0339, "step": 5780 }, { "epoch": 5.1466666666666665, "grad_norm": 0.3001507818698883, "learning_rate": 8.87007363183133e-05, "loss": 0.032, "step": 5790 }, { "epoch": 5.155555555555556, "grad_norm": 0.22820651531219482, "learning_rate": 8.86541646013462e-05, "loss": 0.0321, "step": 5800 }, { "epoch": 5.164444444444444, "grad_norm": 0.3296657204627991, "learning_rate": 8.860750938491042e-05, "loss": 0.0307, "step": 5810 }, { "epoch": 5.173333333333334, "grad_norm": 0.21773795783519745, "learning_rate": 8.856077076978902e-05, "loss": 0.028, "step": 5820 }, { "epoch": 5.182222222222222, "grad_norm": 0.23407134413719177, "learning_rate": 8.851394885694524e-05, "loss": 0.0266, "step": 5830 }, { "epoch": 5.191111111111111, "grad_norm": 0.28203344345092773, "learning_rate": 8.846704374752227e-05, "loss": 0.0308, "step": 5840 }, { "epoch": 5.2, "grad_norm": 0.2717295289039612, "learning_rate": 8.842005554284296e-05, "loss": 0.0318, "step": 5850 }, { "epoch": 5.208888888888889, "grad_norm": 0.24596557021141052, "learning_rate": 8.837298434440972e-05, "loss": 0.0322, "step": 5860 }, { "epoch": 5.217777777777778, "grad_norm": 0.2646559476852417, "learning_rate": 8.832583025390425e-05, "loss": 0.0302, "step": 5870 }, { "epoch": 5.226666666666667, "grad_norm": 0.3697127401828766, "learning_rate": 8.827859337318725e-05, "loss": 0.0281, "step": 5880 }, { "epoch": 5.235555555555556, "grad_norm": 0.3839709758758545, "learning_rate": 8.82312738042983e-05, "loss": 0.0327, "step": 5890 }, { "epoch": 5.2444444444444445, "grad_norm": 0.33191561698913574, "learning_rate": 8.818387164945561e-05, "loss": 0.0419, "step": 5900 }, { "epoch": 5.253333333333333, "grad_norm": 0.2258535772562027, "learning_rate": 8.813638701105573e-05, "loss": 0.0282, "step": 5910 }, { "epoch": 5.262222222222222, "grad_norm": 0.2534967362880707, "learning_rate": 8.808881999167348e-05, "loss": 0.0314, "step": 5920 }, { "epoch": 5.271111111111111, "grad_norm": 0.28560617566108704, "learning_rate": 8.804117069406155e-05, "loss": 0.0307, "step": 5930 }, { "epoch": 5.28, "grad_norm": 0.2788536846637726, "learning_rate": 8.799343922115044e-05, "loss": 0.0307, "step": 5940 }, { "epoch": 5.288888888888889, "grad_norm": 0.24603256583213806, "learning_rate": 8.794562567604808e-05, "loss": 0.0272, "step": 5950 }, { "epoch": 5.297777777777778, "grad_norm": 0.21625715494155884, "learning_rate": 8.789773016203977e-05, "loss": 0.0304, "step": 5960 }, { "epoch": 5.306666666666667, "grad_norm": 0.2691044509410858, "learning_rate": 8.784975278258783e-05, "loss": 0.029, "step": 5970 }, { "epoch": 5.315555555555555, "grad_norm": 0.20362047851085663, "learning_rate": 8.780169364133142e-05, "loss": 0.0305, "step": 5980 }, { "epoch": 5.3244444444444445, "grad_norm": 0.23990501463413239, "learning_rate": 8.775355284208634e-05, "loss": 0.0282, "step": 5990 }, { "epoch": 5.333333333333333, "grad_norm": 0.2770455777645111, "learning_rate": 8.770533048884482e-05, "loss": 0.0339, "step": 6000 }, { "epoch": 5.342222222222222, "grad_norm": 0.2872175872325897, "learning_rate": 8.765702668577516e-05, "loss": 0.0251, "step": 6010 }, { "epoch": 5.351111111111111, "grad_norm": 0.28999388217926025, "learning_rate": 8.760864153722168e-05, "loss": 0.0293, "step": 6020 }, { "epoch": 5.36, "grad_norm": 0.2877536118030548, "learning_rate": 8.756017514770443e-05, "loss": 0.0282, "step": 6030 }, { "epoch": 5.368888888888889, "grad_norm": 0.19941496849060059, "learning_rate": 8.75116276219189e-05, "loss": 0.0312, "step": 6040 }, { "epoch": 5.377777777777778, "grad_norm": 0.27108559012413025, "learning_rate": 8.746299906473587e-05, "loss": 0.0296, "step": 6050 }, { "epoch": 5.386666666666667, "grad_norm": 0.32190003991127014, "learning_rate": 8.741428958120118e-05, "loss": 0.0289, "step": 6060 }, { "epoch": 5.395555555555555, "grad_norm": 0.25298449397087097, "learning_rate": 8.736549927653548e-05, "loss": 0.0389, "step": 6070 }, { "epoch": 5.404444444444445, "grad_norm": 0.33017855882644653, "learning_rate": 8.7316628256134e-05, "loss": 0.0283, "step": 6080 }, { "epoch": 5.413333333333333, "grad_norm": 0.2532486319541931, "learning_rate": 8.72676766255663e-05, "loss": 0.0252, "step": 6090 }, { "epoch": 5.4222222222222225, "grad_norm": 0.3070535361766815, "learning_rate": 8.721864449057613e-05, "loss": 0.0261, "step": 6100 }, { "epoch": 5.431111111111111, "grad_norm": 0.2998232841491699, "learning_rate": 8.716953195708109e-05, "loss": 0.0308, "step": 6110 }, { "epoch": 5.44, "grad_norm": 0.3111470937728882, "learning_rate": 8.71203391311725e-05, "loss": 0.0299, "step": 6120 }, { "epoch": 5.448888888888889, "grad_norm": 0.29436376690864563, "learning_rate": 8.707106611911509e-05, "loss": 0.0231, "step": 6130 }, { "epoch": 5.457777777777777, "grad_norm": 0.3064485192298889, "learning_rate": 8.702171302734681e-05, "loss": 0.0271, "step": 6140 }, { "epoch": 5.466666666666667, "grad_norm": 0.30283525586128235, "learning_rate": 8.697227996247861e-05, "loss": 0.0297, "step": 6150 }, { "epoch": 5.475555555555555, "grad_norm": 0.2780840992927551, "learning_rate": 8.692276703129421e-05, "loss": 0.0345, "step": 6160 }, { "epoch": 5.484444444444445, "grad_norm": 0.2647765874862671, "learning_rate": 8.687317434074977e-05, "loss": 0.0259, "step": 6170 }, { "epoch": 5.493333333333333, "grad_norm": 0.3319759666919708, "learning_rate": 8.682350199797388e-05, "loss": 0.0331, "step": 6180 }, { "epoch": 5.502222222222223, "grad_norm": 0.24724747240543365, "learning_rate": 8.677375011026708e-05, "loss": 0.0292, "step": 6190 }, { "epoch": 5.511111111111111, "grad_norm": 0.23594297468662262, "learning_rate": 8.672391878510178e-05, "loss": 0.0317, "step": 6200 }, { "epoch": 5.52, "grad_norm": 0.22257651388645172, "learning_rate": 8.6674008130122e-05, "loss": 0.0232, "step": 6210 }, { "epoch": 5.528888888888889, "grad_norm": 0.20600296556949615, "learning_rate": 8.66240182531431e-05, "loss": 0.028, "step": 6220 }, { "epoch": 5.5377777777777775, "grad_norm": 0.24483610689640045, "learning_rate": 8.657394926215159e-05, "loss": 0.0285, "step": 6230 }, { "epoch": 5.546666666666667, "grad_norm": 0.21987171471118927, "learning_rate": 8.652380126530488e-05, "loss": 0.0305, "step": 6240 }, { "epoch": 5.555555555555555, "grad_norm": 0.3559325039386749, "learning_rate": 8.647357437093105e-05, "loss": 0.0279, "step": 6250 }, { "epoch": 5.564444444444445, "grad_norm": 0.21600252389907837, "learning_rate": 8.642326868752858e-05, "loss": 0.0327, "step": 6260 }, { "epoch": 5.573333333333333, "grad_norm": 0.3064497113227844, "learning_rate": 8.637288432376618e-05, "loss": 0.0305, "step": 6270 }, { "epoch": 5.582222222222223, "grad_norm": 0.3038145899772644, "learning_rate": 8.632242138848252e-05, "loss": 0.0311, "step": 6280 }, { "epoch": 5.591111111111111, "grad_norm": 0.23511594533920288, "learning_rate": 8.627187999068598e-05, "loss": 0.0288, "step": 6290 }, { "epoch": 5.6, "grad_norm": 0.2400660365819931, "learning_rate": 8.622126023955446e-05, "loss": 0.0283, "step": 6300 }, { "epoch": 5.608888888888889, "grad_norm": 0.3130846321582794, "learning_rate": 8.617056224443507e-05, "loss": 0.0328, "step": 6310 }, { "epoch": 5.6177777777777775, "grad_norm": 0.19325688481330872, "learning_rate": 8.6119786114844e-05, "loss": 0.0259, "step": 6320 }, { "epoch": 5.626666666666667, "grad_norm": 0.1934719979763031, "learning_rate": 8.606893196046619e-05, "loss": 0.0313, "step": 6330 }, { "epoch": 5.635555555555555, "grad_norm": 0.20586548745632172, "learning_rate": 8.60179998911551e-05, "loss": 0.0265, "step": 6340 }, { "epoch": 5.644444444444445, "grad_norm": 0.3628253638744354, "learning_rate": 8.596699001693255e-05, "loss": 0.0325, "step": 6350 }, { "epoch": 5.653333333333333, "grad_norm": 0.26107966899871826, "learning_rate": 8.591590244798844e-05, "loss": 0.0312, "step": 6360 }, { "epoch": 5.662222222222223, "grad_norm": 0.22643977403640747, "learning_rate": 8.58647372946804e-05, "loss": 0.0297, "step": 6370 }, { "epoch": 5.671111111111111, "grad_norm": 0.29038503766059875, "learning_rate": 8.581349466753381e-05, "loss": 0.0271, "step": 6380 }, { "epoch": 5.68, "grad_norm": 0.3971521556377411, "learning_rate": 8.576217467724128e-05, "loss": 0.0304, "step": 6390 }, { "epoch": 5.688888888888889, "grad_norm": 0.3059302270412445, "learning_rate": 8.571077743466259e-05, "loss": 0.0267, "step": 6400 }, { "epoch": 5.697777777777778, "grad_norm": 0.28859710693359375, "learning_rate": 8.56593030508244e-05, "loss": 0.0303, "step": 6410 }, { "epoch": 5.706666666666667, "grad_norm": 0.2836757004261017, "learning_rate": 8.560775163691999e-05, "loss": 0.0262, "step": 6420 }, { "epoch": 5.7155555555555555, "grad_norm": 0.2497064769268036, "learning_rate": 8.555612330430904e-05, "loss": 0.024, "step": 6430 }, { "epoch": 5.724444444444444, "grad_norm": 0.2817823588848114, "learning_rate": 8.550441816451742e-05, "loss": 0.0287, "step": 6440 }, { "epoch": 5.733333333333333, "grad_norm": 0.25843819975852966, "learning_rate": 8.545263632923687e-05, "loss": 0.0263, "step": 6450 }, { "epoch": 5.742222222222222, "grad_norm": 0.19444498419761658, "learning_rate": 8.540077791032484e-05, "loss": 0.0353, "step": 6460 }, { "epoch": 5.751111111111111, "grad_norm": 0.301430881023407, "learning_rate": 8.534884301980418e-05, "loss": 0.0262, "step": 6470 }, { "epoch": 5.76, "grad_norm": 0.2067270129919052, "learning_rate": 8.529683176986295e-05, "loss": 0.0304, "step": 6480 }, { "epoch": 5.768888888888889, "grad_norm": 0.26122286915779114, "learning_rate": 8.524474427285419e-05, "loss": 0.0232, "step": 6490 }, { "epoch": 5.777777777777778, "grad_norm": 0.27038395404815674, "learning_rate": 8.519258064129558e-05, "loss": 0.0259, "step": 6500 }, { "epoch": 5.786666666666667, "grad_norm": 0.28756049275398254, "learning_rate": 8.514034098786933e-05, "loss": 0.0252, "step": 6510 }, { "epoch": 5.795555555555556, "grad_norm": 0.22862713038921356, "learning_rate": 8.508802542542181e-05, "loss": 0.0246, "step": 6520 }, { "epoch": 5.804444444444444, "grad_norm": 0.25050899386405945, "learning_rate": 8.503563406696342e-05, "loss": 0.0273, "step": 6530 }, { "epoch": 5.8133333333333335, "grad_norm": 0.17922230064868927, "learning_rate": 8.498316702566828e-05, "loss": 0.0257, "step": 6540 }, { "epoch": 5.822222222222222, "grad_norm": 0.25937119126319885, "learning_rate": 8.493062441487395e-05, "loss": 0.0263, "step": 6550 }, { "epoch": 5.831111111111111, "grad_norm": 0.2133234292268753, "learning_rate": 8.487800634808128e-05, "loss": 0.0338, "step": 6560 }, { "epoch": 5.84, "grad_norm": 0.25495123863220215, "learning_rate": 8.482531293895412e-05, "loss": 0.0253, "step": 6570 }, { "epoch": 5.848888888888889, "grad_norm": 0.1914951205253601, "learning_rate": 8.477254430131903e-05, "loss": 0.0228, "step": 6580 }, { "epoch": 5.857777777777778, "grad_norm": 0.26710835099220276, "learning_rate": 8.471970054916513e-05, "loss": 0.0284, "step": 6590 }, { "epoch": 5.866666666666667, "grad_norm": 0.23956747353076935, "learning_rate": 8.466678179664379e-05, "loss": 0.0247, "step": 6600 }, { "epoch": 5.875555555555556, "grad_norm": 0.24413150548934937, "learning_rate": 8.461378815806834e-05, "loss": 0.0265, "step": 6610 }, { "epoch": 5.884444444444444, "grad_norm": 0.3248586356639862, "learning_rate": 8.456071974791391e-05, "loss": 0.0286, "step": 6620 }, { "epoch": 5.8933333333333335, "grad_norm": 0.2924213707447052, "learning_rate": 8.450757668081716e-05, "loss": 0.0283, "step": 6630 }, { "epoch": 5.902222222222222, "grad_norm": 0.31495195627212524, "learning_rate": 8.445435907157605e-05, "loss": 0.0293, "step": 6640 }, { "epoch": 5.911111111111111, "grad_norm": 0.2383049726486206, "learning_rate": 8.440106703514949e-05, "loss": 0.0323, "step": 6650 }, { "epoch": 5.92, "grad_norm": 0.2581128776073456, "learning_rate": 8.434770068665723e-05, "loss": 0.0276, "step": 6660 }, { "epoch": 5.928888888888888, "grad_norm": 0.24448233842849731, "learning_rate": 8.429426014137948e-05, "loss": 0.0261, "step": 6670 }, { "epoch": 5.937777777777778, "grad_norm": 0.24104978144168854, "learning_rate": 8.424074551475683e-05, "loss": 0.0255, "step": 6680 }, { "epoch": 5.946666666666666, "grad_norm": 0.2276896834373474, "learning_rate": 8.418715692238978e-05, "loss": 0.0305, "step": 6690 }, { "epoch": 5.955555555555556, "grad_norm": 0.22446025907993317, "learning_rate": 8.413349448003866e-05, "loss": 0.0226, "step": 6700 }, { "epoch": 5.964444444444444, "grad_norm": 0.2728880047798157, "learning_rate": 8.407975830362338e-05, "loss": 0.0286, "step": 6710 }, { "epoch": 5.973333333333334, "grad_norm": 0.24272333085536957, "learning_rate": 8.402594850922305e-05, "loss": 0.0321, "step": 6720 }, { "epoch": 5.982222222222222, "grad_norm": 0.3038065731525421, "learning_rate": 8.397206521307584e-05, "loss": 0.0266, "step": 6730 }, { "epoch": 5.9911111111111115, "grad_norm": 0.2604829668998718, "learning_rate": 8.391810853157868e-05, "loss": 0.0287, "step": 6740 }, { "epoch": 6.0, "grad_norm": 0.4214931130409241, "learning_rate": 8.386407858128706e-05, "loss": 0.0326, "step": 6750 }, { "epoch": 6.0088888888888885, "grad_norm": 0.1896793246269226, "learning_rate": 8.380997547891471e-05, "loss": 0.0242, "step": 6760 }, { "epoch": 6.017777777777778, "grad_norm": 0.24815690517425537, "learning_rate": 8.37557993413334e-05, "loss": 0.0309, "step": 6770 }, { "epoch": 6.026666666666666, "grad_norm": 0.1926945447921753, "learning_rate": 8.370155028557265e-05, "loss": 0.0315, "step": 6780 }, { "epoch": 6.035555555555556, "grad_norm": 0.325992614030838, "learning_rate": 8.364722842881949e-05, "loss": 0.0343, "step": 6790 }, { "epoch": 6.044444444444444, "grad_norm": 0.14764641225337982, "learning_rate": 8.359283388841827e-05, "loss": 0.0266, "step": 6800 }, { "epoch": 6.053333333333334, "grad_norm": 0.2205577790737152, "learning_rate": 8.353836678187027e-05, "loss": 0.0254, "step": 6810 }, { "epoch": 6.062222222222222, "grad_norm": 0.20670156180858612, "learning_rate": 8.348382722683358e-05, "loss": 0.0251, "step": 6820 }, { "epoch": 6.071111111111111, "grad_norm": 0.1988840401172638, "learning_rate": 8.342921534112276e-05, "loss": 0.0281, "step": 6830 }, { "epoch": 6.08, "grad_norm": 0.3168070912361145, "learning_rate": 8.337453124270863e-05, "loss": 0.0294, "step": 6840 }, { "epoch": 6.088888888888889, "grad_norm": 0.2893161177635193, "learning_rate": 8.331977504971801e-05, "loss": 0.0324, "step": 6850 }, { "epoch": 6.097777777777778, "grad_norm": 0.22281889617443085, "learning_rate": 8.326494688043344e-05, "loss": 0.0268, "step": 6860 }, { "epoch": 6.1066666666666665, "grad_norm": 0.2670470178127289, "learning_rate": 8.321004685329296e-05, "loss": 0.0295, "step": 6870 }, { "epoch": 6.115555555555556, "grad_norm": 0.25628477334976196, "learning_rate": 8.31550750868898e-05, "loss": 0.0257, "step": 6880 }, { "epoch": 6.124444444444444, "grad_norm": 0.26609164476394653, "learning_rate": 8.310003169997218e-05, "loss": 0.0258, "step": 6890 }, { "epoch": 6.133333333333334, "grad_norm": 0.26989054679870605, "learning_rate": 8.304491681144306e-05, "loss": 0.0248, "step": 6900 }, { "epoch": 6.142222222222222, "grad_norm": 0.2946019768714905, "learning_rate": 8.298973054035981e-05, "loss": 0.0299, "step": 6910 }, { "epoch": 6.151111111111111, "grad_norm": 0.2558068037033081, "learning_rate": 8.293447300593402e-05, "loss": 0.032, "step": 6920 }, { "epoch": 6.16, "grad_norm": 0.25369948148727417, "learning_rate": 8.287914432753123e-05, "loss": 0.0275, "step": 6930 }, { "epoch": 6.168888888888889, "grad_norm": 0.24253907799720764, "learning_rate": 8.282374462467064e-05, "loss": 0.0338, "step": 6940 }, { "epoch": 6.177777777777778, "grad_norm": 0.24077919125556946, "learning_rate": 8.276827401702487e-05, "loss": 0.0274, "step": 6950 }, { "epoch": 6.1866666666666665, "grad_norm": 0.2945677936077118, "learning_rate": 8.271273262441975e-05, "loss": 0.0287, "step": 6960 }, { "epoch": 6.195555555555556, "grad_norm": 0.2240440994501114, "learning_rate": 8.265712056683397e-05, "loss": 0.0301, "step": 6970 }, { "epoch": 6.204444444444444, "grad_norm": 0.2557741105556488, "learning_rate": 8.26014379643989e-05, "loss": 0.0301, "step": 6980 }, { "epoch": 6.213333333333333, "grad_norm": 0.2755835950374603, "learning_rate": 8.254568493739828e-05, "loss": 0.0264, "step": 6990 }, { "epoch": 6.222222222222222, "grad_norm": 0.19644346833229065, "learning_rate": 8.2489861606268e-05, "loss": 0.0306, "step": 7000 }, { "epoch": 6.231111111111111, "grad_norm": 0.24307610094547272, "learning_rate": 8.243396809159575e-05, "loss": 0.0251, "step": 7010 }, { "epoch": 6.24, "grad_norm": 0.2883116900920868, "learning_rate": 8.237800451412095e-05, "loss": 0.0285, "step": 7020 }, { "epoch": 6.248888888888889, "grad_norm": 0.29675206542015076, "learning_rate": 8.232197099473427e-05, "loss": 0.0232, "step": 7030 }, { "epoch": 6.257777777777778, "grad_norm": 0.23401504755020142, "learning_rate": 8.226586765447748e-05, "loss": 0.027, "step": 7040 }, { "epoch": 6.266666666666667, "grad_norm": 0.31487202644348145, "learning_rate": 8.220969461454322e-05, "loss": 0.0278, "step": 7050 }, { "epoch": 6.275555555555556, "grad_norm": 0.259353369474411, "learning_rate": 8.215345199627463e-05, "loss": 0.0274, "step": 7060 }, { "epoch": 6.2844444444444445, "grad_norm": 0.25168901681900024, "learning_rate": 8.209713992116521e-05, "loss": 0.0251, "step": 7070 }, { "epoch": 6.293333333333333, "grad_norm": 0.2118161916732788, "learning_rate": 8.204075851085849e-05, "loss": 0.0217, "step": 7080 }, { "epoch": 6.302222222222222, "grad_norm": 0.2092578113079071, "learning_rate": 8.198430788714771e-05, "loss": 0.0235, "step": 7090 }, { "epoch": 6.311111111111111, "grad_norm": 0.33864957094192505, "learning_rate": 8.192778817197569e-05, "loss": 0.0244, "step": 7100 }, { "epoch": 6.32, "grad_norm": 0.26961442828178406, "learning_rate": 8.18711994874345e-05, "loss": 0.0305, "step": 7110 }, { "epoch": 6.328888888888889, "grad_norm": 0.2798379957675934, "learning_rate": 8.181454195576515e-05, "loss": 0.0264, "step": 7120 }, { "epoch": 6.337777777777778, "grad_norm": 0.20808854699134827, "learning_rate": 8.17578156993574e-05, "loss": 0.026, "step": 7130 }, { "epoch": 6.346666666666667, "grad_norm": 0.23184262216091156, "learning_rate": 8.170102084074946e-05, "loss": 0.0273, "step": 7140 }, { "epoch": 6.355555555555555, "grad_norm": 0.23005138337612152, "learning_rate": 8.164415750262777e-05, "loss": 0.0258, "step": 7150 }, { "epoch": 6.364444444444445, "grad_norm": 0.24796970188617706, "learning_rate": 8.158722580782662e-05, "loss": 0.0294, "step": 7160 }, { "epoch": 6.373333333333333, "grad_norm": 0.2472086250782013, "learning_rate": 8.153022587932803e-05, "loss": 0.024, "step": 7170 }, { "epoch": 6.3822222222222225, "grad_norm": 0.19740913808345795, "learning_rate": 8.147315784026138e-05, "loss": 0.0195, "step": 7180 }, { "epoch": 6.391111111111111, "grad_norm": 0.30975431203842163, "learning_rate": 8.14160218139032e-05, "loss": 0.0255, "step": 7190 }, { "epoch": 6.4, "grad_norm": 0.23974847793579102, "learning_rate": 8.135881792367686e-05, "loss": 0.0249, "step": 7200 }, { "epoch": 6.408888888888889, "grad_norm": 0.2577337324619293, "learning_rate": 8.130154629315234e-05, "loss": 0.0286, "step": 7210 }, { "epoch": 6.417777777777777, "grad_norm": 0.21682016551494598, "learning_rate": 8.124420704604598e-05, "loss": 0.0261, "step": 7220 }, { "epoch": 6.426666666666667, "grad_norm": 0.25370943546295166, "learning_rate": 8.118680030622014e-05, "loss": 0.0219, "step": 7230 }, { "epoch": 6.435555555555555, "grad_norm": 0.21826443076133728, "learning_rate": 8.1129326197683e-05, "loss": 0.0239, "step": 7240 }, { "epoch": 6.444444444444445, "grad_norm": 0.28323155641555786, "learning_rate": 8.107178484458824e-05, "loss": 0.0248, "step": 7250 }, { "epoch": 6.453333333333333, "grad_norm": 0.49537038803100586, "learning_rate": 8.101417637123484e-05, "loss": 0.0288, "step": 7260 }, { "epoch": 6.4622222222222225, "grad_norm": 0.24792195856571198, "learning_rate": 8.095650090206672e-05, "loss": 0.0274, "step": 7270 }, { "epoch": 6.471111111111111, "grad_norm": 0.2700432538986206, "learning_rate": 8.089875856167259e-05, "loss": 0.0249, "step": 7280 }, { "epoch": 6.48, "grad_norm": 0.2627910375595093, "learning_rate": 8.084094947478556e-05, "loss": 0.0222, "step": 7290 }, { "epoch": 6.488888888888889, "grad_norm": 0.2635471522808075, "learning_rate": 8.07830737662829e-05, "loss": 0.0287, "step": 7300 }, { "epoch": 6.497777777777777, "grad_norm": 0.22499562799930573, "learning_rate": 8.07251315611859e-05, "loss": 0.0266, "step": 7310 }, { "epoch": 6.506666666666667, "grad_norm": 0.2660355269908905, "learning_rate": 8.06671229846594e-05, "loss": 0.0289, "step": 7320 }, { "epoch": 6.515555555555555, "grad_norm": 0.23469319939613342, "learning_rate": 8.060904816201162e-05, "loss": 0.0271, "step": 7330 }, { "epoch": 6.524444444444445, "grad_norm": 0.30020564794540405, "learning_rate": 8.055090721869392e-05, "loss": 0.03, "step": 7340 }, { "epoch": 6.533333333333333, "grad_norm": 0.1877342313528061, "learning_rate": 8.049270028030046e-05, "loss": 0.0266, "step": 7350 }, { "epoch": 6.542222222222223, "grad_norm": 0.21152740716934204, "learning_rate": 8.0434427472568e-05, "loss": 0.0282, "step": 7360 }, { "epoch": 6.551111111111111, "grad_norm": 0.16452142596244812, "learning_rate": 8.037608892137553e-05, "loss": 0.0238, "step": 7370 }, { "epoch": 6.5600000000000005, "grad_norm": 0.2752535045146942, "learning_rate": 8.031768475274413e-05, "loss": 0.0285, "step": 7380 }, { "epoch": 6.568888888888889, "grad_norm": 0.26848486065864563, "learning_rate": 8.025921509283653e-05, "loss": 0.0249, "step": 7390 }, { "epoch": 6.5777777777777775, "grad_norm": 0.23145653307437897, "learning_rate": 8.020068006795705e-05, "loss": 0.0259, "step": 7400 }, { "epoch": 6.586666666666667, "grad_norm": 0.23849451541900635, "learning_rate": 8.01420798045511e-05, "loss": 0.0306, "step": 7410 }, { "epoch": 6.595555555555555, "grad_norm": 0.2694615125656128, "learning_rate": 8.008341442920508e-05, "loss": 0.0282, "step": 7420 }, { "epoch": 6.604444444444445, "grad_norm": 0.25809165835380554, "learning_rate": 8.002468406864603e-05, "loss": 0.029, "step": 7430 }, { "epoch": 6.613333333333333, "grad_norm": 0.18999096751213074, "learning_rate": 7.996588884974135e-05, "loss": 0.0258, "step": 7440 }, { "epoch": 6.622222222222222, "grad_norm": 0.35795027017593384, "learning_rate": 7.990702889949858e-05, "loss": 0.0328, "step": 7450 }, { "epoch": 6.631111111111111, "grad_norm": 0.2555767297744751, "learning_rate": 7.984810434506503e-05, "loss": 0.0256, "step": 7460 }, { "epoch": 6.64, "grad_norm": 0.25635913014411926, "learning_rate": 7.978911531372765e-05, "loss": 0.0248, "step": 7470 }, { "epoch": 6.648888888888889, "grad_norm": 0.30037885904312134, "learning_rate": 7.973006193291259e-05, "loss": 0.0304, "step": 7480 }, { "epoch": 6.657777777777778, "grad_norm": 0.29638251662254333, "learning_rate": 7.967094433018508e-05, "loss": 0.0212, "step": 7490 }, { "epoch": 6.666666666666667, "grad_norm": 0.2659394145011902, "learning_rate": 7.961176263324901e-05, "loss": 0.0253, "step": 7500 }, { "epoch": 6.6755555555555555, "grad_norm": 0.2688281834125519, "learning_rate": 7.955251696994676e-05, "loss": 0.0259, "step": 7510 }, { "epoch": 6.684444444444445, "grad_norm": 0.22831323742866516, "learning_rate": 7.94932074682589e-05, "loss": 0.0227, "step": 7520 }, { "epoch": 6.693333333333333, "grad_norm": 0.26453372836112976, "learning_rate": 7.943383425630387e-05, "loss": 0.0191, "step": 7530 }, { "epoch": 6.702222222222222, "grad_norm": 0.2413475066423416, "learning_rate": 7.937439746233777e-05, "loss": 0.0265, "step": 7540 }, { "epoch": 6.711111111111111, "grad_norm": 0.33436426520347595, "learning_rate": 7.9314897214754e-05, "loss": 0.0279, "step": 7550 }, { "epoch": 6.72, "grad_norm": 0.21935497224330902, "learning_rate": 7.925533364208309e-05, "loss": 0.0263, "step": 7560 }, { "epoch": 6.728888888888889, "grad_norm": 0.22483137249946594, "learning_rate": 7.91957068729923e-05, "loss": 0.0281, "step": 7570 }, { "epoch": 6.737777777777778, "grad_norm": 0.2041652649641037, "learning_rate": 7.913601703628546e-05, "loss": 0.0282, "step": 7580 }, { "epoch": 6.746666666666667, "grad_norm": 0.20766614377498627, "learning_rate": 7.907626426090262e-05, "loss": 0.0264, "step": 7590 }, { "epoch": 6.7555555555555555, "grad_norm": 0.35556066036224365, "learning_rate": 7.901644867591976e-05, "loss": 0.0309, "step": 7600 }, { "epoch": 6.764444444444445, "grad_norm": 0.24783955514431, "learning_rate": 7.895657041054856e-05, "loss": 0.0278, "step": 7610 }, { "epoch": 6.773333333333333, "grad_norm": 0.2949122488498688, "learning_rate": 7.88966295941361e-05, "loss": 0.0265, "step": 7620 }, { "epoch": 6.782222222222222, "grad_norm": 0.2504091262817383, "learning_rate": 7.88366263561646e-05, "loss": 0.0262, "step": 7630 }, { "epoch": 6.791111111111111, "grad_norm": 0.2638102173805237, "learning_rate": 7.87765608262511e-05, "loss": 0.0244, "step": 7640 }, { "epoch": 6.8, "grad_norm": 0.24820135533809662, "learning_rate": 7.871643313414718e-05, "loss": 0.0226, "step": 7650 }, { "epoch": 6.808888888888889, "grad_norm": 0.32193705439567566, "learning_rate": 7.865624340973876e-05, "loss": 0.0273, "step": 7660 }, { "epoch": 6.817777777777778, "grad_norm": 0.2583996057510376, "learning_rate": 7.859599178304568e-05, "loss": 0.0239, "step": 7670 }, { "epoch": 6.826666666666666, "grad_norm": 0.2252609133720398, "learning_rate": 7.85356783842216e-05, "loss": 0.0287, "step": 7680 }, { "epoch": 6.835555555555556, "grad_norm": 0.1531561017036438, "learning_rate": 7.84753033435535e-05, "loss": 0.0216, "step": 7690 }, { "epoch": 6.844444444444444, "grad_norm": 0.19102151691913605, "learning_rate": 7.841486679146162e-05, "loss": 0.0235, "step": 7700 }, { "epoch": 6.8533333333333335, "grad_norm": 0.20853540301322937, "learning_rate": 7.835436885849902e-05, "loss": 0.0213, "step": 7710 }, { "epoch": 6.862222222222222, "grad_norm": 0.28289639949798584, "learning_rate": 7.829380967535136e-05, "loss": 0.0291, "step": 7720 }, { "epoch": 6.871111111111111, "grad_norm": 0.23775836825370789, "learning_rate": 7.823318937283663e-05, "loss": 0.0261, "step": 7730 }, { "epoch": 6.88, "grad_norm": 0.3127186596393585, "learning_rate": 7.817250808190483e-05, "loss": 0.0317, "step": 7740 }, { "epoch": 6.888888888888889, "grad_norm": 0.23988620936870575, "learning_rate": 7.811176593363772e-05, "loss": 0.0254, "step": 7750 }, { "epoch": 6.897777777777778, "grad_norm": 0.2750106453895569, "learning_rate": 7.805096305924848e-05, "loss": 0.022, "step": 7760 }, { "epoch": 6.906666666666666, "grad_norm": 0.2889787554740906, "learning_rate": 7.799009959008155e-05, "loss": 0.0269, "step": 7770 }, { "epoch": 6.915555555555556, "grad_norm": 0.2642170190811157, "learning_rate": 7.792917565761216e-05, "loss": 0.0313, "step": 7780 }, { "epoch": 6.924444444444444, "grad_norm": 0.20323745906352997, "learning_rate": 7.786819139344625e-05, "loss": 0.0317, "step": 7790 }, { "epoch": 6.933333333333334, "grad_norm": 0.2732918858528137, "learning_rate": 7.780714692932002e-05, "loss": 0.0387, "step": 7800 }, { "epoch": 6.942222222222222, "grad_norm": 0.22289887070655823, "learning_rate": 7.774604239709977e-05, "loss": 0.0252, "step": 7810 }, { "epoch": 6.9511111111111115, "grad_norm": 0.23524457216262817, "learning_rate": 7.768487792878148e-05, "loss": 0.0254, "step": 7820 }, { "epoch": 6.96, "grad_norm": 0.19289687275886536, "learning_rate": 7.762365365649067e-05, "loss": 0.0275, "step": 7830 }, { "epoch": 6.968888888888889, "grad_norm": 0.21291349828243256, "learning_rate": 7.7562369712482e-05, "loss": 0.0264, "step": 7840 }, { "epoch": 6.977777777777778, "grad_norm": 0.3168800473213196, "learning_rate": 7.750102622913906e-05, "loss": 0.0237, "step": 7850 }, { "epoch": 6.986666666666666, "grad_norm": 0.28059613704681396, "learning_rate": 7.743962333897405e-05, "loss": 0.0287, "step": 7860 }, { "epoch": 6.995555555555556, "grad_norm": 0.14995969831943512, "learning_rate": 7.737816117462752e-05, "loss": 0.0257, "step": 7870 }, { "epoch": 7.004444444444444, "grad_norm": 0.24372100830078125, "learning_rate": 7.731663986886799e-05, "loss": 0.0228, "step": 7880 }, { "epoch": 7.013333333333334, "grad_norm": 0.20484502613544464, "learning_rate": 7.725505955459183e-05, "loss": 0.0246, "step": 7890 }, { "epoch": 7.022222222222222, "grad_norm": 0.23513025045394897, "learning_rate": 7.719342036482278e-05, "loss": 0.0242, "step": 7900 }, { "epoch": 7.0311111111111115, "grad_norm": 0.23059958219528198, "learning_rate": 7.713172243271186e-05, "loss": 0.0247, "step": 7910 }, { "epoch": 7.04, "grad_norm": 0.27934902906417847, "learning_rate": 7.70699658915369e-05, "loss": 0.0289, "step": 7920 }, { "epoch": 7.0488888888888885, "grad_norm": 0.23871836066246033, "learning_rate": 7.700815087470241e-05, "loss": 0.0205, "step": 7930 }, { "epoch": 7.057777777777778, "grad_norm": 0.24214226007461548, "learning_rate": 7.694627751573913e-05, "loss": 0.0284, "step": 7940 }, { "epoch": 7.066666666666666, "grad_norm": 0.308454692363739, "learning_rate": 7.688434594830392e-05, "loss": 0.0251, "step": 7950 }, { "epoch": 7.075555555555556, "grad_norm": 0.3364754319190979, "learning_rate": 7.68223563061793e-05, "loss": 0.0244, "step": 7960 }, { "epoch": 7.084444444444444, "grad_norm": 0.3038824200630188, "learning_rate": 7.676030872327331e-05, "loss": 0.0252, "step": 7970 }, { "epoch": 7.093333333333334, "grad_norm": 0.17861983180046082, "learning_rate": 7.66982033336191e-05, "loss": 0.025, "step": 7980 }, { "epoch": 7.102222222222222, "grad_norm": 0.4512441158294678, "learning_rate": 7.663604027137473e-05, "loss": 0.0312, "step": 7990 }, { "epoch": 7.111111111111111, "grad_norm": 0.20532359182834625, "learning_rate": 7.65738196708228e-05, "loss": 0.0231, "step": 8000 }, { "epoch": 7.12, "grad_norm": 0.2222631871700287, "learning_rate": 7.651154166637025e-05, "loss": 0.0205, "step": 8010 }, { "epoch": 7.128888888888889, "grad_norm": 0.24204786121845245, "learning_rate": 7.644920639254798e-05, "loss": 0.022, "step": 8020 }, { "epoch": 7.137777777777778, "grad_norm": 0.33409610390663147, "learning_rate": 7.638681398401062e-05, "loss": 0.023, "step": 8030 }, { "epoch": 7.1466666666666665, "grad_norm": 0.2129717916250229, "learning_rate": 7.632436457553625e-05, "loss": 0.0242, "step": 8040 }, { "epoch": 7.155555555555556, "grad_norm": 0.2354777604341507, "learning_rate": 7.626185830202602e-05, "loss": 0.0253, "step": 8050 }, { "epoch": 7.164444444444444, "grad_norm": 0.20029151439666748, "learning_rate": 7.619929529850397e-05, "loss": 0.0296, "step": 8060 }, { "epoch": 7.173333333333334, "grad_norm": 0.2851219177246094, "learning_rate": 7.613667570011663e-05, "loss": 0.0258, "step": 8070 }, { "epoch": 7.182222222222222, "grad_norm": 0.2836199402809143, "learning_rate": 7.607399964213287e-05, "loss": 0.0252, "step": 8080 }, { "epoch": 7.191111111111111, "grad_norm": 0.23780612647533417, "learning_rate": 7.601126725994341e-05, "loss": 0.0214, "step": 8090 }, { "epoch": 7.2, "grad_norm": 0.19909189641475677, "learning_rate": 7.594847868906076e-05, "loss": 0.0216, "step": 8100 }, { "epoch": 7.208888888888889, "grad_norm": 0.2124917209148407, "learning_rate": 7.588563406511871e-05, "loss": 0.0233, "step": 8110 }, { "epoch": 7.217777777777778, "grad_norm": 0.20021606981754303, "learning_rate": 7.58227335238722e-05, "loss": 0.0214, "step": 8120 }, { "epoch": 7.226666666666667, "grad_norm": 0.20000521838665009, "learning_rate": 7.57597772011969e-05, "loss": 0.0219, "step": 8130 }, { "epoch": 7.235555555555556, "grad_norm": 0.25364893674850464, "learning_rate": 7.569676523308904e-05, "loss": 0.0296, "step": 8140 }, { "epoch": 7.2444444444444445, "grad_norm": 0.2427048534154892, "learning_rate": 7.563369775566499e-05, "loss": 0.0242, "step": 8150 }, { "epoch": 7.253333333333333, "grad_norm": 0.4558410942554474, "learning_rate": 7.557057490516111e-05, "loss": 0.0337, "step": 8160 }, { "epoch": 7.262222222222222, "grad_norm": 0.20566076040267944, "learning_rate": 7.550739681793326e-05, "loss": 0.0262, "step": 8170 }, { "epoch": 7.271111111111111, "grad_norm": 0.22102034091949463, "learning_rate": 7.544416363045676e-05, "loss": 0.0221, "step": 8180 }, { "epoch": 7.28, "grad_norm": 0.15369313955307007, "learning_rate": 7.538087547932585e-05, "loss": 0.0281, "step": 8190 }, { "epoch": 7.288888888888889, "grad_norm": 0.19951918721199036, "learning_rate": 7.531753250125354e-05, "loss": 0.0272, "step": 8200 }, { "epoch": 7.297777777777778, "grad_norm": 0.2944194972515106, "learning_rate": 7.525413483307126e-05, "loss": 0.0312, "step": 8210 }, { "epoch": 7.306666666666667, "grad_norm": 0.20238344371318817, "learning_rate": 7.519068261172859e-05, "loss": 0.0221, "step": 8220 }, { "epoch": 7.315555555555555, "grad_norm": 0.18214763700962067, "learning_rate": 7.512717597429297e-05, "loss": 0.0228, "step": 8230 }, { "epoch": 7.3244444444444445, "grad_norm": 0.19634908437728882, "learning_rate": 7.506361505794936e-05, "loss": 0.0281, "step": 8240 }, { "epoch": 7.333333333333333, "grad_norm": 0.3210787773132324, "learning_rate": 7.500000000000001e-05, "loss": 0.0255, "step": 8250 }, { "epoch": 7.342222222222222, "grad_norm": 0.33083415031433105, "learning_rate": 7.493633093786406e-05, "loss": 0.024, "step": 8260 }, { "epoch": 7.351111111111111, "grad_norm": 0.20220117270946503, "learning_rate": 7.487260800907735e-05, "loss": 0.0274, "step": 8270 }, { "epoch": 7.36, "grad_norm": 0.2255803346633911, "learning_rate": 7.480883135129211e-05, "loss": 0.0254, "step": 8280 }, { "epoch": 7.368888888888889, "grad_norm": 0.2168605774641037, "learning_rate": 7.474500110227657e-05, "loss": 0.0263, "step": 8290 }, { "epoch": 7.377777777777778, "grad_norm": 0.4022749364376068, "learning_rate": 7.468111739991477e-05, "loss": 0.0227, "step": 8300 }, { "epoch": 7.386666666666667, "grad_norm": 0.21674712002277374, "learning_rate": 7.461718038220621e-05, "loss": 0.026, "step": 8310 }, { "epoch": 7.395555555555555, "grad_norm": 0.17594152688980103, "learning_rate": 7.455319018726553e-05, "loss": 0.0208, "step": 8320 }, { "epoch": 7.404444444444445, "grad_norm": 0.2003270983695984, "learning_rate": 7.44891469533223e-05, "loss": 0.0224, "step": 8330 }, { "epoch": 7.413333333333333, "grad_norm": 0.2584877610206604, "learning_rate": 7.44250508187206e-05, "loss": 0.0226, "step": 8340 }, { "epoch": 7.4222222222222225, "grad_norm": 0.21114584803581238, "learning_rate": 7.436090192191886e-05, "loss": 0.0265, "step": 8350 }, { "epoch": 7.431111111111111, "grad_norm": 0.21279163658618927, "learning_rate": 7.429670040148938e-05, "loss": 0.0307, "step": 8360 }, { "epoch": 7.44, "grad_norm": 0.25956496596336365, "learning_rate": 7.423244639611826e-05, "loss": 0.0277, "step": 8370 }, { "epoch": 7.448888888888889, "grad_norm": 0.1839926391839981, "learning_rate": 7.416814004460484e-05, "loss": 0.0252, "step": 8380 }, { "epoch": 7.457777777777777, "grad_norm": 0.25493964552879333, "learning_rate": 7.410378148586166e-05, "loss": 0.0236, "step": 8390 }, { "epoch": 7.466666666666667, "grad_norm": 0.2386111468076706, "learning_rate": 7.403937085891397e-05, "loss": 0.0243, "step": 8400 }, { "epoch": 7.475555555555555, "grad_norm": 0.1950986385345459, "learning_rate": 7.397490830289952e-05, "loss": 0.0238, "step": 8410 }, { "epoch": 7.484444444444445, "grad_norm": 0.21159300208091736, "learning_rate": 7.39103939570682e-05, "loss": 0.0243, "step": 8420 }, { "epoch": 7.493333333333333, "grad_norm": 0.2791748046875, "learning_rate": 7.384582796078184e-05, "loss": 0.0265, "step": 8430 }, { "epoch": 7.502222222222223, "grad_norm": 0.2403152734041214, "learning_rate": 7.378121045351378e-05, "loss": 0.0213, "step": 8440 }, { "epoch": 7.511111111111111, "grad_norm": 0.21562573313713074, "learning_rate": 7.371654157484864e-05, "loss": 0.025, "step": 8450 }, { "epoch": 7.52, "grad_norm": 0.20237354934215546, "learning_rate": 7.365182146448205e-05, "loss": 0.0278, "step": 8460 }, { "epoch": 7.528888888888889, "grad_norm": 0.2715177536010742, "learning_rate": 7.358705026222029e-05, "loss": 0.0275, "step": 8470 }, { "epoch": 7.5377777777777775, "grad_norm": 0.24359120428562164, "learning_rate": 7.352222810797998e-05, "loss": 0.0257, "step": 8480 }, { "epoch": 7.546666666666667, "grad_norm": 0.3335295617580414, "learning_rate": 7.345735514178787e-05, "loss": 0.024, "step": 8490 }, { "epoch": 7.555555555555555, "grad_norm": 0.2723658084869385, "learning_rate": 7.33924315037804e-05, "loss": 0.0254, "step": 8500 }, { "epoch": 7.564444444444445, "grad_norm": 0.22384099662303925, "learning_rate": 7.332745733420349e-05, "loss": 0.0248, "step": 8510 }, { "epoch": 7.573333333333333, "grad_norm": 0.2647709250450134, "learning_rate": 7.326243277341227e-05, "loss": 0.0297, "step": 8520 }, { "epoch": 7.582222222222223, "grad_norm": 0.29931676387786865, "learning_rate": 7.319735796187063e-05, "loss": 0.0264, "step": 8530 }, { "epoch": 7.591111111111111, "grad_norm": 0.19541478157043457, "learning_rate": 7.31322330401511e-05, "loss": 0.0235, "step": 8540 }, { "epoch": 7.6, "grad_norm": 0.22403022646903992, "learning_rate": 7.30670581489344e-05, "loss": 0.0211, "step": 8550 }, { "epoch": 7.608888888888889, "grad_norm": 0.17009246349334717, "learning_rate": 7.30018334290092e-05, "loss": 0.0274, "step": 8560 }, { "epoch": 7.6177777777777775, "grad_norm": 0.24999922513961792, "learning_rate": 7.293655902127183e-05, "loss": 0.0261, "step": 8570 }, { "epoch": 7.626666666666667, "grad_norm": 0.22635561227798462, "learning_rate": 7.287123506672595e-05, "loss": 0.0202, "step": 8580 }, { "epoch": 7.635555555555555, "grad_norm": 0.24938105046749115, "learning_rate": 7.280586170648223e-05, "loss": 0.0248, "step": 8590 }, { "epoch": 7.644444444444445, "grad_norm": 0.2041708081960678, "learning_rate": 7.274043908175808e-05, "loss": 0.0219, "step": 8600 }, { "epoch": 7.653333333333333, "grad_norm": 0.21436285972595215, "learning_rate": 7.267496733387731e-05, "loss": 0.0231, "step": 8610 }, { "epoch": 7.662222222222223, "grad_norm": 0.19197438657283783, "learning_rate": 7.260944660426989e-05, "loss": 0.0237, "step": 8620 }, { "epoch": 7.671111111111111, "grad_norm": 0.19595466554164886, "learning_rate": 7.254387703447154e-05, "loss": 0.0215, "step": 8630 }, { "epoch": 7.68, "grad_norm": 0.22495333850383759, "learning_rate": 7.247825876612353e-05, "loss": 0.0217, "step": 8640 }, { "epoch": 7.688888888888889, "grad_norm": 0.1938040852546692, "learning_rate": 7.241259194097228e-05, "loss": 0.0283, "step": 8650 }, { "epoch": 7.697777777777778, "grad_norm": 0.25733432173728943, "learning_rate": 7.234687670086917e-05, "loss": 0.0228, "step": 8660 }, { "epoch": 7.706666666666667, "grad_norm": 0.25317463278770447, "learning_rate": 7.22811131877701e-05, "loss": 0.0261, "step": 8670 }, { "epoch": 7.7155555555555555, "grad_norm": 0.2505670189857483, "learning_rate": 7.221530154373528e-05, "loss": 0.0231, "step": 8680 }, { "epoch": 7.724444444444444, "grad_norm": 0.22940559685230255, "learning_rate": 7.214944191092886e-05, "loss": 0.0279, "step": 8690 }, { "epoch": 7.733333333333333, "grad_norm": 0.21911337971687317, "learning_rate": 7.20835344316187e-05, "loss": 0.0251, "step": 8700 }, { "epoch": 7.742222222222222, "grad_norm": 0.2333977073431015, "learning_rate": 7.201757924817598e-05, "loss": 0.0272, "step": 8710 }, { "epoch": 7.751111111111111, "grad_norm": 0.18560375273227692, "learning_rate": 7.195157650307496e-05, "loss": 0.0202, "step": 8720 }, { "epoch": 7.76, "grad_norm": 0.3697771728038788, "learning_rate": 7.188552633889259e-05, "loss": 0.0212, "step": 8730 }, { "epoch": 7.768888888888889, "grad_norm": 0.20837579667568207, "learning_rate": 7.181942889830832e-05, "loss": 0.0212, "step": 8740 }, { "epoch": 7.777777777777778, "grad_norm": 0.2323990762233734, "learning_rate": 7.175328432410366e-05, "loss": 0.0246, "step": 8750 }, { "epoch": 7.786666666666667, "grad_norm": 0.15073710680007935, "learning_rate": 7.1687092759162e-05, "loss": 0.0258, "step": 8760 }, { "epoch": 7.795555555555556, "grad_norm": 0.19346582889556885, "learning_rate": 7.162085434646817e-05, "loss": 0.0242, "step": 8770 }, { "epoch": 7.804444444444444, "grad_norm": 0.1623661071062088, "learning_rate": 7.155456922910825e-05, "loss": 0.0193, "step": 8780 }, { "epoch": 7.8133333333333335, "grad_norm": 0.19126679003238678, "learning_rate": 7.148823755026921e-05, "loss": 0.024, "step": 8790 }, { "epoch": 7.822222222222222, "grad_norm": 0.19638970494270325, "learning_rate": 7.142185945323856e-05, "loss": 0.0261, "step": 8800 }, { "epoch": 7.831111111111111, "grad_norm": 0.29566171765327454, "learning_rate": 7.135543508140412e-05, "loss": 0.0243, "step": 8810 }, { "epoch": 7.84, "grad_norm": 0.2520422339439392, "learning_rate": 7.128896457825364e-05, "loss": 0.0273, "step": 8820 }, { "epoch": 7.848888888888889, "grad_norm": 0.15805794298648834, "learning_rate": 7.122244808737455e-05, "loss": 0.0203, "step": 8830 }, { "epoch": 7.857777777777778, "grad_norm": 0.22796449065208435, "learning_rate": 7.115588575245361e-05, "loss": 0.0193, "step": 8840 }, { "epoch": 7.866666666666667, "grad_norm": 0.263602077960968, "learning_rate": 7.108927771727661e-05, "loss": 0.031, "step": 8850 }, { "epoch": 7.875555555555556, "grad_norm": 0.22126036882400513, "learning_rate": 7.102262412572807e-05, "loss": 0.0254, "step": 8860 }, { "epoch": 7.884444444444444, "grad_norm": 0.22073419392108917, "learning_rate": 7.09559251217909e-05, "loss": 0.0243, "step": 8870 }, { "epoch": 7.8933333333333335, "grad_norm": 0.27728477120399475, "learning_rate": 7.08891808495461e-05, "loss": 0.0253, "step": 8880 }, { "epoch": 7.902222222222222, "grad_norm": 0.20619475841522217, "learning_rate": 7.08223914531725e-05, "loss": 0.0243, "step": 8890 }, { "epoch": 7.911111111111111, "grad_norm": 0.27480584383010864, "learning_rate": 7.075555707694636e-05, "loss": 0.021, "step": 8900 }, { "epoch": 7.92, "grad_norm": 0.2239075005054474, "learning_rate": 7.068867786524116e-05, "loss": 0.0269, "step": 8910 }, { "epoch": 7.928888888888888, "grad_norm": 0.19923384487628937, "learning_rate": 7.062175396252716e-05, "loss": 0.0227, "step": 8920 }, { "epoch": 7.937777777777778, "grad_norm": 0.2173166275024414, "learning_rate": 7.055478551337123e-05, "loss": 0.0284, "step": 8930 }, { "epoch": 7.946666666666666, "grad_norm": 0.19013693928718567, "learning_rate": 7.04877726624364e-05, "loss": 0.0229, "step": 8940 }, { "epoch": 7.955555555555556, "grad_norm": 0.19402934610843658, "learning_rate": 7.042071555448168e-05, "loss": 0.0205, "step": 8950 }, { "epoch": 7.964444444444444, "grad_norm": 0.1340809166431427, "learning_rate": 7.035361433436163e-05, "loss": 0.0216, "step": 8960 }, { "epoch": 7.973333333333334, "grad_norm": 0.17898128926753998, "learning_rate": 7.028646914702614e-05, "loss": 0.023, "step": 8970 }, { "epoch": 7.982222222222222, "grad_norm": 0.18558812141418457, "learning_rate": 7.021928013752005e-05, "loss": 0.0254, "step": 8980 }, { "epoch": 7.9911111111111115, "grad_norm": 0.19384290277957916, "learning_rate": 7.015204745098287e-05, "loss": 0.0251, "step": 8990 }, { "epoch": 8.0, "grad_norm": 0.21121573448181152, "learning_rate": 7.008477123264848e-05, "loss": 0.0227, "step": 9000 }, { "epoch": 8.008888888888889, "grad_norm": 0.20985658466815948, "learning_rate": 7.001745162784477e-05, "loss": 0.0283, "step": 9010 }, { "epoch": 8.017777777777777, "grad_norm": 0.16636261343955994, "learning_rate": 6.995008878199332e-05, "loss": 0.0191, "step": 9020 }, { "epoch": 8.026666666666667, "grad_norm": 0.20505701005458832, "learning_rate": 6.988268284060922e-05, "loss": 0.0211, "step": 9030 }, { "epoch": 8.035555555555556, "grad_norm": 0.23082955181598663, "learning_rate": 6.981523394930055e-05, "loss": 0.0215, "step": 9040 }, { "epoch": 8.044444444444444, "grad_norm": 0.29650309681892395, "learning_rate": 6.974774225376822e-05, "loss": 0.0281, "step": 9050 }, { "epoch": 8.053333333333333, "grad_norm": 0.23264449834823608, "learning_rate": 6.968020789980562e-05, "loss": 0.0213, "step": 9060 }, { "epoch": 8.062222222222223, "grad_norm": 0.24937617778778076, "learning_rate": 6.961263103329822e-05, "loss": 0.0306, "step": 9070 }, { "epoch": 8.071111111111112, "grad_norm": 0.1883297562599182, "learning_rate": 6.954501180022341e-05, "loss": 0.0232, "step": 9080 }, { "epoch": 8.08, "grad_norm": 0.16494864225387573, "learning_rate": 6.947735034665002e-05, "loss": 0.0261, "step": 9090 }, { "epoch": 8.088888888888889, "grad_norm": 0.31599846482276917, "learning_rate": 6.940964681873814e-05, "loss": 0.0222, "step": 9100 }, { "epoch": 8.097777777777777, "grad_norm": 0.27846190333366394, "learning_rate": 6.934190136273872e-05, "loss": 0.0231, "step": 9110 }, { "epoch": 8.106666666666667, "grad_norm": 0.3501579463481903, "learning_rate": 6.927411412499332e-05, "loss": 0.0224, "step": 9120 }, { "epoch": 8.115555555555556, "grad_norm": 0.19710171222686768, "learning_rate": 6.920628525193369e-05, "loss": 0.0217, "step": 9130 }, { "epoch": 8.124444444444444, "grad_norm": 0.2555966377258301, "learning_rate": 6.913841489008157e-05, "loss": 0.025, "step": 9140 }, { "epoch": 8.133333333333333, "grad_norm": 0.23690535128116608, "learning_rate": 6.90705031860483e-05, "loss": 0.024, "step": 9150 }, { "epoch": 8.142222222222221, "grad_norm": 0.2689349353313446, "learning_rate": 6.900255028653455e-05, "loss": 0.0266, "step": 9160 }, { "epoch": 8.151111111111112, "grad_norm": 0.2167505919933319, "learning_rate": 6.893455633832992e-05, "loss": 0.024, "step": 9170 }, { "epoch": 8.16, "grad_norm": 0.22795771062374115, "learning_rate": 6.886652148831279e-05, "loss": 0.0208, "step": 9180 }, { "epoch": 8.168888888888889, "grad_norm": 0.18795914947986603, "learning_rate": 6.879844588344978e-05, "loss": 0.021, "step": 9190 }, { "epoch": 8.177777777777777, "grad_norm": 0.28372564911842346, "learning_rate": 6.873032967079561e-05, "loss": 0.0211, "step": 9200 }, { "epoch": 8.186666666666667, "grad_norm": 0.2872115671634674, "learning_rate": 6.86621729974927e-05, "loss": 0.0196, "step": 9210 }, { "epoch": 8.195555555555556, "grad_norm": 0.3066284656524658, "learning_rate": 6.85939760107709e-05, "loss": 0.0236, "step": 9220 }, { "epoch": 8.204444444444444, "grad_norm": 0.2632872462272644, "learning_rate": 6.852573885794709e-05, "loss": 0.023, "step": 9230 }, { "epoch": 8.213333333333333, "grad_norm": 0.11840223520994186, "learning_rate": 6.845746168642497e-05, "loss": 0.0215, "step": 9240 }, { "epoch": 8.222222222222221, "grad_norm": 0.13074280321598053, "learning_rate": 6.838914464369467e-05, "loss": 0.023, "step": 9250 }, { "epoch": 8.231111111111112, "grad_norm": 0.2713388204574585, "learning_rate": 6.832078787733242e-05, "loss": 0.0249, "step": 9260 }, { "epoch": 8.24, "grad_norm": 0.22531013190746307, "learning_rate": 6.825239153500029e-05, "loss": 0.0259, "step": 9270 }, { "epoch": 8.248888888888889, "grad_norm": 0.3672671616077423, "learning_rate": 6.818395576444585e-05, "loss": 0.0279, "step": 9280 }, { "epoch": 8.257777777777777, "grad_norm": 0.24043358862400055, "learning_rate": 6.81154807135018e-05, "loss": 0.0273, "step": 9290 }, { "epoch": 8.266666666666667, "grad_norm": 0.3103722035884857, "learning_rate": 6.804696653008575e-05, "loss": 0.0278, "step": 9300 }, { "epoch": 8.275555555555556, "grad_norm": 0.23513801395893097, "learning_rate": 6.797841336219977e-05, "loss": 0.0187, "step": 9310 }, { "epoch": 8.284444444444444, "grad_norm": 0.3011358082294464, "learning_rate": 6.79098213579302e-05, "loss": 0.0214, "step": 9320 }, { "epoch": 8.293333333333333, "grad_norm": 0.2111138105392456, "learning_rate": 6.784119066544727e-05, "loss": 0.0201, "step": 9330 }, { "epoch": 8.302222222222222, "grad_norm": 0.1790749579668045, "learning_rate": 6.777252143300474e-05, "loss": 0.0261, "step": 9340 }, { "epoch": 8.311111111111112, "grad_norm": 0.18994136154651642, "learning_rate": 6.770381380893968e-05, "loss": 0.0254, "step": 9350 }, { "epoch": 8.32, "grad_norm": 0.24281306564807892, "learning_rate": 6.763506794167208e-05, "loss": 0.0243, "step": 9360 }, { "epoch": 8.328888888888889, "grad_norm": 0.2572516202926636, "learning_rate": 6.756628397970445e-05, "loss": 0.0193, "step": 9370 }, { "epoch": 8.337777777777777, "grad_norm": 0.27604877948760986, "learning_rate": 6.749746207162174e-05, "loss": 0.0275, "step": 9380 }, { "epoch": 8.346666666666668, "grad_norm": 0.2154282182455063, "learning_rate": 6.742860236609077e-05, "loss": 0.0252, "step": 9390 }, { "epoch": 8.355555555555556, "grad_norm": 0.26248568296432495, "learning_rate": 6.735970501186003e-05, "loss": 0.0226, "step": 9400 }, { "epoch": 8.364444444444445, "grad_norm": 0.2937508225440979, "learning_rate": 6.729077015775936e-05, "loss": 0.0254, "step": 9410 }, { "epoch": 8.373333333333333, "grad_norm": 0.22078022360801697, "learning_rate": 6.722179795269956e-05, "loss": 0.0225, "step": 9420 }, { "epoch": 8.382222222222222, "grad_norm": 0.22824744880199432, "learning_rate": 6.715278854567217e-05, "loss": 0.0318, "step": 9430 }, { "epoch": 8.391111111111112, "grad_norm": 0.28823742270469666, "learning_rate": 6.708374208574907e-05, "loss": 0.0287, "step": 9440 }, { "epoch": 8.4, "grad_norm": 0.3006422221660614, "learning_rate": 6.701465872208216e-05, "loss": 0.0261, "step": 9450 }, { "epoch": 8.408888888888889, "grad_norm": 0.26386314630508423, "learning_rate": 6.694553860390307e-05, "loss": 0.0293, "step": 9460 }, { "epoch": 8.417777777777777, "grad_norm": 0.21579484641551971, "learning_rate": 6.687638188052284e-05, "loss": 0.0241, "step": 9470 }, { "epoch": 8.426666666666666, "grad_norm": 0.20836317539215088, "learning_rate": 6.680718870133156e-05, "loss": 0.0247, "step": 9480 }, { "epoch": 8.435555555555556, "grad_norm": 0.2054918110370636, "learning_rate": 6.673795921579813e-05, "loss": 0.0254, "step": 9490 }, { "epoch": 8.444444444444445, "grad_norm": 0.3080763816833496, "learning_rate": 6.666869357346978e-05, "loss": 0.0246, "step": 9500 }, { "epoch": 8.453333333333333, "grad_norm": 0.18736578524112701, "learning_rate": 6.659939192397192e-05, "loss": 0.0272, "step": 9510 }, { "epoch": 8.462222222222222, "grad_norm": 0.22466592490673065, "learning_rate": 6.65300544170077e-05, "loss": 0.0249, "step": 9520 }, { "epoch": 8.471111111111112, "grad_norm": 0.20963740348815918, "learning_rate": 6.646068120235778e-05, "loss": 0.0208, "step": 9530 }, { "epoch": 8.48, "grad_norm": 0.22102439403533936, "learning_rate": 6.639127242987988e-05, "loss": 0.0194, "step": 9540 }, { "epoch": 8.488888888888889, "grad_norm": 0.21163538098335266, "learning_rate": 6.632182824950861e-05, "loss": 0.0218, "step": 9550 }, { "epoch": 8.497777777777777, "grad_norm": 0.23654435575008392, "learning_rate": 6.625234881125501e-05, "loss": 0.0255, "step": 9560 }, { "epoch": 8.506666666666666, "grad_norm": 0.2121993452310562, "learning_rate": 6.61828342652063e-05, "loss": 0.0192, "step": 9570 }, { "epoch": 8.515555555555556, "grad_norm": 0.1997184157371521, "learning_rate": 6.611328476152557e-05, "loss": 0.0231, "step": 9580 }, { "epoch": 8.524444444444445, "grad_norm": 0.21559667587280273, "learning_rate": 6.604370045045133e-05, "loss": 0.018, "step": 9590 }, { "epoch": 8.533333333333333, "grad_norm": 0.24597884714603424, "learning_rate": 6.59740814822974e-05, "loss": 0.0209, "step": 9600 }, { "epoch": 8.542222222222222, "grad_norm": 0.17290206253528595, "learning_rate": 6.590442800745241e-05, "loss": 0.0181, "step": 9610 }, { "epoch": 8.551111111111112, "grad_norm": 0.19596514105796814, "learning_rate": 6.583474017637952e-05, "loss": 0.0239, "step": 9620 }, { "epoch": 8.56, "grad_norm": 0.17282547056674957, "learning_rate": 6.576501813961609e-05, "loss": 0.022, "step": 9630 }, { "epoch": 8.568888888888889, "grad_norm": 0.21759827435016632, "learning_rate": 6.569526204777341e-05, "loss": 0.0201, "step": 9640 }, { "epoch": 8.577777777777778, "grad_norm": 0.23493897914886475, "learning_rate": 6.562547205153632e-05, "loss": 0.0216, "step": 9650 }, { "epoch": 8.586666666666666, "grad_norm": 0.2594851851463318, "learning_rate": 6.555564830166293e-05, "loss": 0.0219, "step": 9660 }, { "epoch": 8.595555555555556, "grad_norm": 0.30423596501350403, "learning_rate": 6.548579094898419e-05, "loss": 0.0219, "step": 9670 }, { "epoch": 8.604444444444445, "grad_norm": 0.5663008093833923, "learning_rate": 6.541590014440371e-05, "loss": 0.0209, "step": 9680 }, { "epoch": 8.613333333333333, "grad_norm": 0.2782978117465973, "learning_rate": 6.534597603889732e-05, "loss": 0.0219, "step": 9690 }, { "epoch": 8.622222222222222, "grad_norm": 0.2207699865102768, "learning_rate": 6.52760187835128e-05, "loss": 0.0219, "step": 9700 }, { "epoch": 8.63111111111111, "grad_norm": 0.24818424880504608, "learning_rate": 6.520602852936951e-05, "loss": 0.0213, "step": 9710 }, { "epoch": 8.64, "grad_norm": 0.23251044750213623, "learning_rate": 6.513600542765817e-05, "loss": 0.0246, "step": 9720 }, { "epoch": 8.648888888888889, "grad_norm": 0.1856372356414795, "learning_rate": 6.506594962964036e-05, "loss": 0.0236, "step": 9730 }, { "epoch": 8.657777777777778, "grad_norm": 0.16605663299560547, "learning_rate": 6.499586128664836e-05, "loss": 0.0241, "step": 9740 }, { "epoch": 8.666666666666666, "grad_norm": 0.1832137405872345, "learning_rate": 6.492574055008473e-05, "loss": 0.0212, "step": 9750 }, { "epoch": 8.675555555555556, "grad_norm": 0.42195653915405273, "learning_rate": 6.4855587571422e-05, "loss": 0.0259, "step": 9760 }, { "epoch": 8.684444444444445, "grad_norm": 0.2403748482465744, "learning_rate": 6.478540250220234e-05, "loss": 0.021, "step": 9770 }, { "epoch": 8.693333333333333, "grad_norm": 0.21305827796459198, "learning_rate": 6.471518549403726e-05, "loss": 0.0203, "step": 9780 }, { "epoch": 8.702222222222222, "grad_norm": 0.20438934862613678, "learning_rate": 6.464493669860727e-05, "loss": 0.0191, "step": 9790 }, { "epoch": 8.71111111111111, "grad_norm": 0.19585122168064117, "learning_rate": 6.457465626766152e-05, "loss": 0.0262, "step": 9800 }, { "epoch": 8.72, "grad_norm": 0.27267688512802124, "learning_rate": 6.450434435301751e-05, "loss": 0.0296, "step": 9810 }, { "epoch": 8.72888888888889, "grad_norm": 0.32654160261154175, "learning_rate": 6.443400110656075e-05, "loss": 0.0261, "step": 9820 }, { "epoch": 8.737777777777778, "grad_norm": 0.21250808238983154, "learning_rate": 6.436362668024442e-05, "loss": 0.0219, "step": 9830 }, { "epoch": 8.746666666666666, "grad_norm": 0.22815300524234772, "learning_rate": 6.42932212260891e-05, "loss": 0.0202, "step": 9840 }, { "epoch": 8.755555555555556, "grad_norm": 0.3127756714820862, "learning_rate": 6.42227848961823e-05, "loss": 0.021, "step": 9850 }, { "epoch": 8.764444444444445, "grad_norm": 0.32240206003189087, "learning_rate": 6.415231784267838e-05, "loss": 0.0282, "step": 9860 }, { "epoch": 8.773333333333333, "grad_norm": 0.31195443868637085, "learning_rate": 6.408182021779791e-05, "loss": 0.0211, "step": 9870 }, { "epoch": 8.782222222222222, "grad_norm": 0.23848415911197662, "learning_rate": 6.401129217382755e-05, "loss": 0.0229, "step": 9880 }, { "epoch": 8.79111111111111, "grad_norm": 0.18539327383041382, "learning_rate": 6.394073386311976e-05, "loss": 0.0218, "step": 9890 }, { "epoch": 8.8, "grad_norm": 0.19724710285663605, "learning_rate": 6.387014543809223e-05, "loss": 0.0256, "step": 9900 }, { "epoch": 8.80888888888889, "grad_norm": 0.31520766019821167, "learning_rate": 6.379952705122782e-05, "loss": 0.0219, "step": 9910 }, { "epoch": 8.817777777777778, "grad_norm": 0.210577130317688, "learning_rate": 6.372887885507408e-05, "loss": 0.0199, "step": 9920 }, { "epoch": 8.826666666666666, "grad_norm": 0.26865074038505554, "learning_rate": 6.365820100224292e-05, "loss": 0.0199, "step": 9930 }, { "epoch": 8.835555555555555, "grad_norm": 0.2878722548484802, "learning_rate": 6.358749364541034e-05, "loss": 0.0246, "step": 9940 }, { "epoch": 8.844444444444445, "grad_norm": 0.19988656044006348, "learning_rate": 6.35167569373161e-05, "loss": 0.021, "step": 9950 }, { "epoch": 8.853333333333333, "grad_norm": 0.17245405912399292, "learning_rate": 6.344599103076329e-05, "loss": 0.0198, "step": 9960 }, { "epoch": 8.862222222222222, "grad_norm": 0.2354762852191925, "learning_rate": 6.337519607861815e-05, "loss": 0.0205, "step": 9970 }, { "epoch": 8.87111111111111, "grad_norm": 0.24840585887432098, "learning_rate": 6.330437223380963e-05, "loss": 0.0226, "step": 9980 }, { "epoch": 8.88, "grad_norm": 0.2577451765537262, "learning_rate": 6.323351964932908e-05, "loss": 0.02, "step": 9990 }, { "epoch": 8.88888888888889, "grad_norm": 0.2501932978630066, "learning_rate": 6.316263847822997e-05, "loss": 0.0207, "step": 10000 } ], "logging_steps": 10, "max_steps": 22500, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.55059805532521e+17, "train_batch_size": 27, "trial_name": null, "trial_params": null }