{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 39.603960396039604, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019801980198019802, "grad_norm": 11.700119972229004, "learning_rate": 9e-07, "loss": 1.4308, "step": 10 }, { "epoch": 0.039603960396039604, "grad_norm": 9.809730529785156, "learning_rate": 1.9e-06, "loss": 1.4327, "step": 20 }, { "epoch": 0.0594059405940594, "grad_norm": 5.3152313232421875, "learning_rate": 2.9e-06, "loss": 0.9695, "step": 30 }, { "epoch": 0.07920792079207921, "grad_norm": 1.5061944723129272, "learning_rate": 3.9e-06, "loss": 0.6159, "step": 40 }, { "epoch": 0.09900990099009901, "grad_norm": 1.9035465717315674, "learning_rate": 4.9000000000000005e-06, "loss": 0.408, "step": 50 }, { "epoch": 0.1188118811881188, "grad_norm": 2.195225238800049, "learning_rate": 5.9e-06, "loss": 0.3545, "step": 60 }, { "epoch": 0.13861386138613863, "grad_norm": 1.1098666191101074, "learning_rate": 6.900000000000001e-06, "loss": 0.2932, "step": 70 }, { "epoch": 0.15841584158415842, "grad_norm": 1.617100715637207, "learning_rate": 7.9e-06, "loss": 0.2501, "step": 80 }, { "epoch": 0.1782178217821782, "grad_norm": 1.6840183734893799, "learning_rate": 8.9e-06, "loss": 0.2288, "step": 90 }, { "epoch": 0.19801980198019803, "grad_norm": 1.1760646104812622, "learning_rate": 9.900000000000002e-06, "loss": 0.2132, "step": 100 }, { "epoch": 0.21782178217821782, "grad_norm": 0.9506794810295105, "learning_rate": 1.09e-05, "loss": 0.2121, "step": 110 }, { "epoch": 0.2376237623762376, "grad_norm": 0.8373862504959106, "learning_rate": 1.19e-05, "loss": 0.1989, "step": 120 }, { "epoch": 0.25742574257425743, "grad_norm": 1.066789984703064, "learning_rate": 1.29e-05, "loss": 0.1816, "step": 130 }, { "epoch": 0.27722772277227725, "grad_norm": 0.9394069314002991, "learning_rate": 1.3900000000000002e-05, "loss": 0.1968, "step": 140 }, { "epoch": 0.297029702970297, "grad_norm": 1.3922629356384277, "learning_rate": 1.49e-05, "loss": 0.1882, "step": 150 }, { "epoch": 0.31683168316831684, "grad_norm": 1.1546082496643066, "learning_rate": 1.59e-05, "loss": 0.1816, "step": 160 }, { "epoch": 0.33663366336633666, "grad_norm": 0.7293896675109863, "learning_rate": 1.69e-05, "loss": 0.1659, "step": 170 }, { "epoch": 0.3564356435643564, "grad_norm": 1.0924512147903442, "learning_rate": 1.79e-05, "loss": 0.1521, "step": 180 }, { "epoch": 0.37623762376237624, "grad_norm": 0.6947932243347168, "learning_rate": 1.8900000000000002e-05, "loss": 0.161, "step": 190 }, { "epoch": 0.39603960396039606, "grad_norm": 0.5404964685440063, "learning_rate": 1.9900000000000003e-05, "loss": 0.1445, "step": 200 }, { "epoch": 0.4158415841584158, "grad_norm": 0.9344349503517151, "learning_rate": 2.09e-05, "loss": 0.1535, "step": 210 }, { "epoch": 0.43564356435643564, "grad_norm": 1.0715430974960327, "learning_rate": 2.19e-05, "loss": 0.1526, "step": 220 }, { "epoch": 0.45544554455445546, "grad_norm": 0.7890714406967163, "learning_rate": 2.29e-05, "loss": 0.1496, "step": 230 }, { "epoch": 0.4752475247524752, "grad_norm": 0.6856286525726318, "learning_rate": 2.39e-05, "loss": 0.143, "step": 240 }, { "epoch": 0.49504950495049505, "grad_norm": 1.0171067714691162, "learning_rate": 2.4900000000000002e-05, "loss": 0.1395, "step": 250 }, { "epoch": 0.5148514851485149, "grad_norm": 0.9761819243431091, "learning_rate": 2.5900000000000003e-05, "loss": 0.1267, "step": 260 }, { "epoch": 0.5346534653465347, "grad_norm": 0.9726797938346863, "learning_rate": 2.6900000000000003e-05, "loss": 0.1173, "step": 270 }, { "epoch": 0.5544554455445545, "grad_norm": 0.6862833499908447, "learning_rate": 2.7900000000000004e-05, "loss": 0.1224, "step": 280 }, { "epoch": 0.5742574257425742, "grad_norm": 0.6041882038116455, "learning_rate": 2.8899999999999998e-05, "loss": 0.116, "step": 290 }, { "epoch": 0.594059405940594, "grad_norm": 0.9466333389282227, "learning_rate": 2.9900000000000002e-05, "loss": 0.1228, "step": 300 }, { "epoch": 0.6138613861386139, "grad_norm": 0.6644469499588013, "learning_rate": 3.09e-05, "loss": 0.117, "step": 310 }, { "epoch": 0.6336633663366337, "grad_norm": 0.4309399425983429, "learning_rate": 3.19e-05, "loss": 0.1027, "step": 320 }, { "epoch": 0.6534653465346535, "grad_norm": 0.6838232278823853, "learning_rate": 3.29e-05, "loss": 0.1079, "step": 330 }, { "epoch": 0.6732673267326733, "grad_norm": 0.8602980971336365, "learning_rate": 3.3900000000000004e-05, "loss": 0.1171, "step": 340 }, { "epoch": 0.693069306930693, "grad_norm": 0.6315211653709412, "learning_rate": 3.49e-05, "loss": 0.1057, "step": 350 }, { "epoch": 0.7128712871287128, "grad_norm": 1.1917099952697754, "learning_rate": 3.59e-05, "loss": 0.1096, "step": 360 }, { "epoch": 0.7326732673267327, "grad_norm": 0.6452987790107727, "learning_rate": 3.69e-05, "loss": 0.0992, "step": 370 }, { "epoch": 0.7524752475247525, "grad_norm": 0.6533476114273071, "learning_rate": 3.79e-05, "loss": 0.0924, "step": 380 }, { "epoch": 0.7722772277227723, "grad_norm": 0.590823769569397, "learning_rate": 3.8900000000000004e-05, "loss": 0.1, "step": 390 }, { "epoch": 0.7920792079207921, "grad_norm": 0.6959227323532104, "learning_rate": 3.99e-05, "loss": 0.0991, "step": 400 }, { "epoch": 0.8118811881188119, "grad_norm": 0.7061798572540283, "learning_rate": 4.09e-05, "loss": 0.1004, "step": 410 }, { "epoch": 0.8316831683168316, "grad_norm": 0.691524088382721, "learning_rate": 4.19e-05, "loss": 0.0914, "step": 420 }, { "epoch": 0.8514851485148515, "grad_norm": 0.31635311245918274, "learning_rate": 4.29e-05, "loss": 0.0999, "step": 430 }, { "epoch": 0.8712871287128713, "grad_norm": 0.5709571242332458, "learning_rate": 4.39e-05, "loss": 0.0908, "step": 440 }, { "epoch": 0.8910891089108911, "grad_norm": 0.5255236029624939, "learning_rate": 4.49e-05, "loss": 0.0867, "step": 450 }, { "epoch": 0.9108910891089109, "grad_norm": 0.5639820098876953, "learning_rate": 4.5900000000000004e-05, "loss": 0.0914, "step": 460 }, { "epoch": 0.9306930693069307, "grad_norm": 0.4842193126678467, "learning_rate": 4.69e-05, "loss": 0.0904, "step": 470 }, { "epoch": 0.9504950495049505, "grad_norm": 0.45829492807388306, "learning_rate": 4.79e-05, "loss": 0.0922, "step": 480 }, { "epoch": 0.9702970297029703, "grad_norm": 0.8468970060348511, "learning_rate": 4.89e-05, "loss": 0.0905, "step": 490 }, { "epoch": 0.9900990099009901, "grad_norm": 0.5916125178337097, "learning_rate": 4.99e-05, "loss": 0.0953, "step": 500 }, { "epoch": 1.00990099009901, "grad_norm": 0.44470512866973877, "learning_rate": 5.0900000000000004e-05, "loss": 0.0808, "step": 510 }, { "epoch": 1.0297029702970297, "grad_norm": 0.5363156795501709, "learning_rate": 5.19e-05, "loss": 0.0904, "step": 520 }, { "epoch": 1.0495049504950495, "grad_norm": 0.6078326106071472, "learning_rate": 5.2900000000000005e-05, "loss": 0.0869, "step": 530 }, { "epoch": 1.0693069306930694, "grad_norm": 0.5621580481529236, "learning_rate": 5.390000000000001e-05, "loss": 0.088, "step": 540 }, { "epoch": 1.0891089108910892, "grad_norm": 0.4893638491630554, "learning_rate": 5.4900000000000006e-05, "loss": 0.0848, "step": 550 }, { "epoch": 1.108910891089109, "grad_norm": 0.5454689860343933, "learning_rate": 5.590000000000001e-05, "loss": 0.0872, "step": 560 }, { "epoch": 1.1287128712871288, "grad_norm": 0.3772514760494232, "learning_rate": 5.69e-05, "loss": 0.0913, "step": 570 }, { "epoch": 1.1485148514851484, "grad_norm": 0.4108765125274658, "learning_rate": 5.79e-05, "loss": 0.0837, "step": 580 }, { "epoch": 1.1683168316831682, "grad_norm": 0.35126355290412903, "learning_rate": 5.89e-05, "loss": 0.08, "step": 590 }, { "epoch": 1.188118811881188, "grad_norm": 0.5463754534721375, "learning_rate": 5.99e-05, "loss": 0.0845, "step": 600 }, { "epoch": 1.2079207920792079, "grad_norm": 0.3758898377418518, "learning_rate": 6.09e-05, "loss": 0.083, "step": 610 }, { "epoch": 1.2277227722772277, "grad_norm": 0.5758973360061646, "learning_rate": 6.19e-05, "loss": 0.0846, "step": 620 }, { "epoch": 1.2475247524752475, "grad_norm": 0.5831725597381592, "learning_rate": 6.29e-05, "loss": 0.088, "step": 630 }, { "epoch": 1.2673267326732673, "grad_norm": 0.6602601408958435, "learning_rate": 6.390000000000001e-05, "loss": 0.0844, "step": 640 }, { "epoch": 1.2871287128712872, "grad_norm": 0.6927556395530701, "learning_rate": 6.49e-05, "loss": 0.0853, "step": 650 }, { "epoch": 1.306930693069307, "grad_norm": 0.3769790232181549, "learning_rate": 6.59e-05, "loss": 0.0791, "step": 660 }, { "epoch": 1.3267326732673268, "grad_norm": 0.537031352519989, "learning_rate": 6.690000000000001e-05, "loss": 0.0784, "step": 670 }, { "epoch": 1.3465346534653464, "grad_norm": 0.3678572177886963, "learning_rate": 6.790000000000001e-05, "loss": 0.0802, "step": 680 }, { "epoch": 1.3663366336633662, "grad_norm": 0.45221948623657227, "learning_rate": 6.89e-05, "loss": 0.0821, "step": 690 }, { "epoch": 1.386138613861386, "grad_norm": 0.29710426926612854, "learning_rate": 6.99e-05, "loss": 0.077, "step": 700 }, { "epoch": 1.4059405940594059, "grad_norm": 0.3668842017650604, "learning_rate": 7.09e-05, "loss": 0.0756, "step": 710 }, { "epoch": 1.4257425742574257, "grad_norm": 0.5576313734054565, "learning_rate": 7.19e-05, "loss": 0.0716, "step": 720 }, { "epoch": 1.4455445544554455, "grad_norm": 0.4288281798362732, "learning_rate": 7.29e-05, "loss": 0.079, "step": 730 }, { "epoch": 1.4653465346534653, "grad_norm": 0.5531351566314697, "learning_rate": 7.390000000000001e-05, "loss": 0.0802, "step": 740 }, { "epoch": 1.4851485148514851, "grad_norm": 0.5030956864356995, "learning_rate": 7.49e-05, "loss": 0.0723, "step": 750 }, { "epoch": 1.504950495049505, "grad_norm": 0.6398656964302063, "learning_rate": 7.59e-05, "loss": 0.0797, "step": 760 }, { "epoch": 1.5247524752475248, "grad_norm": 0.530530571937561, "learning_rate": 7.69e-05, "loss": 0.0787, "step": 770 }, { "epoch": 1.5445544554455446, "grad_norm": 0.7378870844841003, "learning_rate": 7.790000000000001e-05, "loss": 0.0765, "step": 780 }, { "epoch": 1.5643564356435644, "grad_norm": 0.7795289754867554, "learning_rate": 7.890000000000001e-05, "loss": 0.069, "step": 790 }, { "epoch": 1.5841584158415842, "grad_norm": 0.569450318813324, "learning_rate": 7.99e-05, "loss": 0.071, "step": 800 }, { "epoch": 1.603960396039604, "grad_norm": 0.47426697611808777, "learning_rate": 8.090000000000001e-05, "loss": 0.0615, "step": 810 }, { "epoch": 1.6237623762376239, "grad_norm": 0.5630972981452942, "learning_rate": 8.19e-05, "loss": 0.0592, "step": 820 }, { "epoch": 1.6435643564356437, "grad_norm": 0.9659773707389832, "learning_rate": 8.29e-05, "loss": 0.0593, "step": 830 }, { "epoch": 1.6633663366336635, "grad_norm": 0.7206599712371826, "learning_rate": 8.39e-05, "loss": 0.057, "step": 840 }, { "epoch": 1.6831683168316833, "grad_norm": 0.7285612225532532, "learning_rate": 8.49e-05, "loss": 0.0513, "step": 850 }, { "epoch": 1.702970297029703, "grad_norm": 0.6530052423477173, "learning_rate": 8.59e-05, "loss": 0.0535, "step": 860 }, { "epoch": 1.7227722772277227, "grad_norm": 1.0796089172363281, "learning_rate": 8.69e-05, "loss": 0.0559, "step": 870 }, { "epoch": 1.7425742574257426, "grad_norm": 0.5483952164649963, "learning_rate": 8.790000000000001e-05, "loss": 0.0525, "step": 880 }, { "epoch": 1.7623762376237624, "grad_norm": 0.595950186252594, "learning_rate": 8.89e-05, "loss": 0.0482, "step": 890 }, { "epoch": 1.7821782178217822, "grad_norm": 0.6223919987678528, "learning_rate": 8.99e-05, "loss": 0.0525, "step": 900 }, { "epoch": 1.801980198019802, "grad_norm": 0.8920643329620361, "learning_rate": 9.090000000000001e-05, "loss": 0.0477, "step": 910 }, { "epoch": 1.8217821782178216, "grad_norm": 0.7896047830581665, "learning_rate": 9.190000000000001e-05, "loss": 0.0481, "step": 920 }, { "epoch": 1.8415841584158414, "grad_norm": 0.9497529864311218, "learning_rate": 9.290000000000001e-05, "loss": 0.0508, "step": 930 }, { "epoch": 1.8613861386138613, "grad_norm": 0.7815911173820496, "learning_rate": 9.39e-05, "loss": 0.045, "step": 940 }, { "epoch": 1.881188118811881, "grad_norm": 0.673361599445343, "learning_rate": 9.49e-05, "loss": 0.0465, "step": 950 }, { "epoch": 1.900990099009901, "grad_norm": 0.6864124536514282, "learning_rate": 9.59e-05, "loss": 0.0471, "step": 960 }, { "epoch": 1.9207920792079207, "grad_norm": 0.7551929354667664, "learning_rate": 9.69e-05, "loss": 0.0464, "step": 970 }, { "epoch": 1.9405940594059405, "grad_norm": 0.8724513053894043, "learning_rate": 9.790000000000001e-05, "loss": 0.0455, "step": 980 }, { "epoch": 1.9603960396039604, "grad_norm": 0.7006964683532715, "learning_rate": 9.89e-05, "loss": 0.0472, "step": 990 }, { "epoch": 1.9801980198019802, "grad_norm": 0.679084837436676, "learning_rate": 9.99e-05, "loss": 0.0449, "step": 1000 }, { "epoch": 2.0, "grad_norm": 1.5011944770812988, "learning_rate": 9.999994463727085e-05, "loss": 0.0433, "step": 1010 }, { "epoch": 2.01980198019802, "grad_norm": 0.6156640648841858, "learning_rate": 9.999975326009292e-05, "loss": 0.0491, "step": 1020 }, { "epoch": 2.0396039603960396, "grad_norm": 0.6705284714698792, "learning_rate": 9.999942518549879e-05, "loss": 0.0454, "step": 1030 }, { "epoch": 2.0594059405940595, "grad_norm": 0.6902824640274048, "learning_rate": 9.999896041438544e-05, "loss": 0.0417, "step": 1040 }, { "epoch": 2.0792079207920793, "grad_norm": 0.5809803605079651, "learning_rate": 9.999835894802353e-05, "loss": 0.0374, "step": 1050 }, { "epoch": 2.099009900990099, "grad_norm": 0.639614999294281, "learning_rate": 9.999762078805743e-05, "loss": 0.0364, "step": 1060 }, { "epoch": 2.118811881188119, "grad_norm": 0.7430437207221985, "learning_rate": 9.999674593650526e-05, "loss": 0.0394, "step": 1070 }, { "epoch": 2.1386138613861387, "grad_norm": 0.7876464128494263, "learning_rate": 9.99957343957588e-05, "loss": 0.0391, "step": 1080 }, { "epoch": 2.1584158415841586, "grad_norm": 0.4455740451812744, "learning_rate": 9.99945861685836e-05, "loss": 0.036, "step": 1090 }, { "epoch": 2.1782178217821784, "grad_norm": 0.6331169009208679, "learning_rate": 9.999330125811884e-05, "loss": 0.0372, "step": 1100 }, { "epoch": 2.198019801980198, "grad_norm": 0.7102445960044861, "learning_rate": 9.999187966787744e-05, "loss": 0.0325, "step": 1110 }, { "epoch": 2.217821782178218, "grad_norm": 0.7829434871673584, "learning_rate": 9.999032140174595e-05, "loss": 0.0373, "step": 1120 }, { "epoch": 2.237623762376238, "grad_norm": 0.7277643084526062, "learning_rate": 9.998862646398464e-05, "loss": 0.0397, "step": 1130 }, { "epoch": 2.2574257425742577, "grad_norm": 0.5535169243812561, "learning_rate": 9.998679485922739e-05, "loss": 0.039, "step": 1140 }, { "epoch": 2.2772277227722775, "grad_norm": 0.9868266582489014, "learning_rate": 9.998482659248174e-05, "loss": 0.0378, "step": 1150 }, { "epoch": 2.297029702970297, "grad_norm": 0.5889109969139099, "learning_rate": 9.998272166912883e-05, "loss": 0.0368, "step": 1160 }, { "epoch": 2.3168316831683167, "grad_norm": 0.38154739141464233, "learning_rate": 9.998048009492347e-05, "loss": 0.0394, "step": 1170 }, { "epoch": 2.3366336633663365, "grad_norm": 0.8264123201370239, "learning_rate": 9.997810187599403e-05, "loss": 0.0378, "step": 1180 }, { "epoch": 2.3564356435643563, "grad_norm": 1.089124083518982, "learning_rate": 9.997558701884249e-05, "loss": 0.0352, "step": 1190 }, { "epoch": 2.376237623762376, "grad_norm": 0.5834736824035645, "learning_rate": 9.997293553034433e-05, "loss": 0.0338, "step": 1200 }, { "epoch": 2.396039603960396, "grad_norm": 0.6849591732025146, "learning_rate": 9.997014741774866e-05, "loss": 0.0327, "step": 1210 }, { "epoch": 2.4158415841584158, "grad_norm": 0.701276957988739, "learning_rate": 9.996722268867803e-05, "loss": 0.0329, "step": 1220 }, { "epoch": 2.4356435643564356, "grad_norm": 0.6653477549552917, "learning_rate": 9.996416135112858e-05, "loss": 0.0339, "step": 1230 }, { "epoch": 2.4554455445544554, "grad_norm": 0.6447290778160095, "learning_rate": 9.996096341346988e-05, "loss": 0.0325, "step": 1240 }, { "epoch": 2.4752475247524752, "grad_norm": 0.8344826102256775, "learning_rate": 9.995762888444495e-05, "loss": 0.0316, "step": 1250 }, { "epoch": 2.495049504950495, "grad_norm": 0.5559496283531189, "learning_rate": 9.995415777317027e-05, "loss": 0.0346, "step": 1260 }, { "epoch": 2.514851485148515, "grad_norm": 0.6316512227058411, "learning_rate": 9.995055008913574e-05, "loss": 0.0327, "step": 1270 }, { "epoch": 2.5346534653465347, "grad_norm": 0.516606330871582, "learning_rate": 9.994680584220463e-05, "loss": 0.0333, "step": 1280 }, { "epoch": 2.5544554455445545, "grad_norm": 0.626565158367157, "learning_rate": 9.994292504261355e-05, "loss": 0.0336, "step": 1290 }, { "epoch": 2.5742574257425743, "grad_norm": 0.6147440671920776, "learning_rate": 9.993890770097247e-05, "loss": 0.0307, "step": 1300 }, { "epoch": 2.594059405940594, "grad_norm": 0.7798984050750732, "learning_rate": 9.993475382826467e-05, "loss": 0.0362, "step": 1310 }, { "epoch": 2.613861386138614, "grad_norm": 0.5636182427406311, "learning_rate": 9.993046343584664e-05, "loss": 0.0336, "step": 1320 }, { "epoch": 2.633663366336634, "grad_norm": 0.5937321782112122, "learning_rate": 9.992603653544816e-05, "loss": 0.0364, "step": 1330 }, { "epoch": 2.6534653465346536, "grad_norm": 0.6108596324920654, "learning_rate": 9.992147313917222e-05, "loss": 0.0343, "step": 1340 }, { "epoch": 2.6732673267326734, "grad_norm": 0.47968563437461853, "learning_rate": 9.991677325949497e-05, "loss": 0.0312, "step": 1350 }, { "epoch": 2.693069306930693, "grad_norm": 0.6273333430290222, "learning_rate": 9.991193690926568e-05, "loss": 0.0313, "step": 1360 }, { "epoch": 2.7128712871287126, "grad_norm": 0.7696301341056824, "learning_rate": 9.990696410170678e-05, "loss": 0.0304, "step": 1370 }, { "epoch": 2.7326732673267324, "grad_norm": 0.5537895560264587, "learning_rate": 9.990185485041371e-05, "loss": 0.0274, "step": 1380 }, { "epoch": 2.7524752475247523, "grad_norm": 0.5345724821090698, "learning_rate": 9.989660916935498e-05, "loss": 0.0274, "step": 1390 }, { "epoch": 2.772277227722772, "grad_norm": 0.6972618103027344, "learning_rate": 9.989122707287208e-05, "loss": 0.0292, "step": 1400 }, { "epoch": 2.792079207920792, "grad_norm": 0.37028950452804565, "learning_rate": 9.988570857567945e-05, "loss": 0.0283, "step": 1410 }, { "epoch": 2.8118811881188117, "grad_norm": 0.5381317138671875, "learning_rate": 9.988005369286446e-05, "loss": 0.0309, "step": 1420 }, { "epoch": 2.8316831683168315, "grad_norm": 0.5244536995887756, "learning_rate": 9.987426243988734e-05, "loss": 0.0273, "step": 1430 }, { "epoch": 2.8514851485148514, "grad_norm": 0.4199411869049072, "learning_rate": 9.986833483258114e-05, "loss": 0.0254, "step": 1440 }, { "epoch": 2.871287128712871, "grad_norm": 0.5240030884742737, "learning_rate": 9.986227088715173e-05, "loss": 0.0265, "step": 1450 }, { "epoch": 2.891089108910891, "grad_norm": 0.5243639349937439, "learning_rate": 9.98560706201777e-05, "loss": 0.0307, "step": 1460 }, { "epoch": 2.910891089108911, "grad_norm": 0.45854103565216064, "learning_rate": 9.984973404861036e-05, "loss": 0.0276, "step": 1470 }, { "epoch": 2.9306930693069306, "grad_norm": 0.4221343696117401, "learning_rate": 9.984326118977361e-05, "loss": 0.0296, "step": 1480 }, { "epoch": 2.9504950495049505, "grad_norm": 0.5276791453361511, "learning_rate": 9.983665206136406e-05, "loss": 0.0295, "step": 1490 }, { "epoch": 2.9702970297029703, "grad_norm": 0.46650955080986023, "learning_rate": 9.982990668145075e-05, "loss": 0.0255, "step": 1500 }, { "epoch": 2.99009900990099, "grad_norm": 0.6352214813232422, "learning_rate": 9.982302506847534e-05, "loss": 0.0285, "step": 1510 }, { "epoch": 3.00990099009901, "grad_norm": 0.49326667189598083, "learning_rate": 9.981600724125189e-05, "loss": 0.0305, "step": 1520 }, { "epoch": 3.0297029702970297, "grad_norm": 0.42429423332214355, "learning_rate": 9.980885321896685e-05, "loss": 0.0299, "step": 1530 }, { "epoch": 3.0495049504950495, "grad_norm": 0.3040918707847595, "learning_rate": 9.980156302117905e-05, "loss": 0.0276, "step": 1540 }, { "epoch": 3.0693069306930694, "grad_norm": 0.5348612070083618, "learning_rate": 9.979413666781963e-05, "loss": 0.0256, "step": 1550 }, { "epoch": 3.089108910891089, "grad_norm": 0.684425413608551, "learning_rate": 9.978657417919193e-05, "loss": 0.0285, "step": 1560 }, { "epoch": 3.108910891089109, "grad_norm": 0.3684428930282593, "learning_rate": 9.977887557597153e-05, "loss": 0.0277, "step": 1570 }, { "epoch": 3.128712871287129, "grad_norm": 0.4213053286075592, "learning_rate": 9.97710408792061e-05, "loss": 0.025, "step": 1580 }, { "epoch": 3.1485148514851486, "grad_norm": 0.880452036857605, "learning_rate": 9.976307011031542e-05, "loss": 0.0274, "step": 1590 }, { "epoch": 3.1683168316831685, "grad_norm": 1.2218737602233887, "learning_rate": 9.975496329109126e-05, "loss": 0.0397, "step": 1600 }, { "epoch": 3.1881188118811883, "grad_norm": 0.6180049777030945, "learning_rate": 9.974672044369732e-05, "loss": 0.0379, "step": 1610 }, { "epoch": 3.207920792079208, "grad_norm": 0.6267650723457336, "learning_rate": 9.97383415906693e-05, "loss": 0.0333, "step": 1620 }, { "epoch": 3.227722772277228, "grad_norm": 0.4574815630912781, "learning_rate": 9.97298267549146e-05, "loss": 0.0315, "step": 1630 }, { "epoch": 3.2475247524752477, "grad_norm": 0.5249430537223816, "learning_rate": 9.972117595971249e-05, "loss": 0.0352, "step": 1640 }, { "epoch": 3.2673267326732676, "grad_norm": 0.5119317770004272, "learning_rate": 9.971238922871391e-05, "loss": 0.0293, "step": 1650 }, { "epoch": 3.287128712871287, "grad_norm": 0.443644255399704, "learning_rate": 9.970346658594142e-05, "loss": 0.0286, "step": 1660 }, { "epoch": 3.3069306930693068, "grad_norm": 1.7989439964294434, "learning_rate": 9.969440805578923e-05, "loss": 0.033, "step": 1670 }, { "epoch": 3.3267326732673266, "grad_norm": 0.5163241624832153, "learning_rate": 9.968521366302298e-05, "loss": 0.0353, "step": 1680 }, { "epoch": 3.3465346534653464, "grad_norm": 0.5094070434570312, "learning_rate": 9.967588343277981e-05, "loss": 0.0333, "step": 1690 }, { "epoch": 3.366336633663366, "grad_norm": 0.5692246556282043, "learning_rate": 9.966641739056818e-05, "loss": 0.0308, "step": 1700 }, { "epoch": 3.386138613861386, "grad_norm": 0.4787304699420929, "learning_rate": 9.965681556226793e-05, "loss": 0.0304, "step": 1710 }, { "epoch": 3.405940594059406, "grad_norm": 0.5281928181648254, "learning_rate": 9.964707797413006e-05, "loss": 0.0271, "step": 1720 }, { "epoch": 3.4257425742574257, "grad_norm": 0.3603343665599823, "learning_rate": 9.963720465277679e-05, "loss": 0.0272, "step": 1730 }, { "epoch": 3.4455445544554455, "grad_norm": 0.5722606778144836, "learning_rate": 9.96271956252014e-05, "loss": 0.0252, "step": 1740 }, { "epoch": 3.4653465346534653, "grad_norm": 0.5469124913215637, "learning_rate": 9.961705091876816e-05, "loss": 0.0269, "step": 1750 }, { "epoch": 3.485148514851485, "grad_norm": 0.4726136326789856, "learning_rate": 9.960677056121235e-05, "loss": 0.0263, "step": 1760 }, { "epoch": 3.504950495049505, "grad_norm": 0.4829319715499878, "learning_rate": 9.959635458064005e-05, "loss": 0.0264, "step": 1770 }, { "epoch": 3.5247524752475248, "grad_norm": 0.640622079372406, "learning_rate": 9.958580300552815e-05, "loss": 0.0284, "step": 1780 }, { "epoch": 3.5445544554455446, "grad_norm": 0.4923085570335388, "learning_rate": 9.957511586472426e-05, "loss": 0.0256, "step": 1790 }, { "epoch": 3.5643564356435644, "grad_norm": 0.4447120130062103, "learning_rate": 9.956429318744662e-05, "loss": 0.0251, "step": 1800 }, { "epoch": 3.5841584158415842, "grad_norm": 0.46116968989372253, "learning_rate": 9.955333500328404e-05, "loss": 0.026, "step": 1810 }, { "epoch": 3.603960396039604, "grad_norm": 0.4842219352722168, "learning_rate": 9.95422413421957e-05, "loss": 0.0269, "step": 1820 }, { "epoch": 3.623762376237624, "grad_norm": 0.4033548831939697, "learning_rate": 9.953101223451133e-05, "loss": 0.0254, "step": 1830 }, { "epoch": 3.6435643564356437, "grad_norm": 0.495199054479599, "learning_rate": 9.951964771093085e-05, "loss": 0.0234, "step": 1840 }, { "epoch": 3.6633663366336635, "grad_norm": 0.3622370958328247, "learning_rate": 9.950814780252442e-05, "loss": 0.0262, "step": 1850 }, { "epoch": 3.6831683168316833, "grad_norm": 3.1002018451690674, "learning_rate": 9.949651254073236e-05, "loss": 0.0277, "step": 1860 }, { "epoch": 3.7029702970297027, "grad_norm": 0.4746827781200409, "learning_rate": 9.948474195736504e-05, "loss": 0.0252, "step": 1870 }, { "epoch": 3.7227722772277225, "grad_norm": 0.4771999418735504, "learning_rate": 9.947283608460277e-05, "loss": 0.0262, "step": 1880 }, { "epoch": 3.7425742574257423, "grad_norm": 0.5135788917541504, "learning_rate": 9.946079495499577e-05, "loss": 0.0276, "step": 1890 }, { "epoch": 3.762376237623762, "grad_norm": 0.36102426052093506, "learning_rate": 9.944861860146401e-05, "loss": 0.0277, "step": 1900 }, { "epoch": 3.782178217821782, "grad_norm": 0.5482873320579529, "learning_rate": 9.943630705729719e-05, "loss": 0.0298, "step": 1910 }, { "epoch": 3.801980198019802, "grad_norm": 0.6050424575805664, "learning_rate": 9.942386035615459e-05, "loss": 0.0305, "step": 1920 }, { "epoch": 3.8217821782178216, "grad_norm": 0.6736830472946167, "learning_rate": 9.941127853206503e-05, "loss": 0.0282, "step": 1930 }, { "epoch": 3.8415841584158414, "grad_norm": 0.6724772453308105, "learning_rate": 9.939856161942673e-05, "loss": 0.035, "step": 1940 }, { "epoch": 3.8613861386138613, "grad_norm": 0.6590871214866638, "learning_rate": 9.938570965300724e-05, "loss": 0.0329, "step": 1950 }, { "epoch": 3.881188118811881, "grad_norm": 0.5874097347259521, "learning_rate": 9.937272266794335e-05, "loss": 0.0307, "step": 1960 }, { "epoch": 3.900990099009901, "grad_norm": 0.532797634601593, "learning_rate": 9.935960069974096e-05, "loss": 0.0283, "step": 1970 }, { "epoch": 3.9207920792079207, "grad_norm": 0.5887763500213623, "learning_rate": 9.934634378427506e-05, "loss": 0.0257, "step": 1980 }, { "epoch": 3.9405940594059405, "grad_norm": 0.4116886854171753, "learning_rate": 9.933295195778954e-05, "loss": 0.025, "step": 1990 }, { "epoch": 3.9603960396039604, "grad_norm": 0.3874164819717407, "learning_rate": 9.931942525689715e-05, "loss": 0.0231, "step": 2000 }, { "epoch": 3.98019801980198, "grad_norm": 0.5164503455162048, "learning_rate": 9.930576371857936e-05, "loss": 0.0242, "step": 2010 }, { "epoch": 4.0, "grad_norm": 0.5156210064888, "learning_rate": 9.929196738018629e-05, "loss": 0.0234, "step": 2020 }, { "epoch": 4.01980198019802, "grad_norm": 0.3770348131656647, "learning_rate": 9.927803627943662e-05, "loss": 0.0244, "step": 2030 }, { "epoch": 4.03960396039604, "grad_norm": 0.5380315184593201, "learning_rate": 9.926397045441744e-05, "loss": 0.0253, "step": 2040 }, { "epoch": 4.0594059405940595, "grad_norm": 0.39466458559036255, "learning_rate": 9.924976994358417e-05, "loss": 0.0243, "step": 2050 }, { "epoch": 4.079207920792079, "grad_norm": 0.3864162564277649, "learning_rate": 9.923543478576048e-05, "loss": 0.0232, "step": 2060 }, { "epoch": 4.099009900990099, "grad_norm": 0.343925416469574, "learning_rate": 9.922096502013813e-05, "loss": 0.0231, "step": 2070 }, { "epoch": 4.118811881188119, "grad_norm": 0.5396806597709656, "learning_rate": 9.92063606862769e-05, "loss": 0.0226, "step": 2080 }, { "epoch": 4.138613861386139, "grad_norm": 0.3676890730857849, "learning_rate": 9.919162182410453e-05, "loss": 0.024, "step": 2090 }, { "epoch": 4.158415841584159, "grad_norm": 0.39978456497192383, "learning_rate": 9.917674847391645e-05, "loss": 0.0239, "step": 2100 }, { "epoch": 4.178217821782178, "grad_norm": 0.4424649178981781, "learning_rate": 9.916174067637584e-05, "loss": 0.0197, "step": 2110 }, { "epoch": 4.198019801980198, "grad_norm": 0.36216285824775696, "learning_rate": 9.914659847251348e-05, "loss": 0.0207, "step": 2120 }, { "epoch": 4.217821782178218, "grad_norm": 0.2973497807979584, "learning_rate": 9.913132190372753e-05, "loss": 0.0229, "step": 2130 }, { "epoch": 4.237623762376238, "grad_norm": 0.31579095125198364, "learning_rate": 9.911591101178359e-05, "loss": 0.0204, "step": 2140 }, { "epoch": 4.257425742574258, "grad_norm": 0.48529312014579773, "learning_rate": 9.910036583881443e-05, "loss": 0.0219, "step": 2150 }, { "epoch": 4.2772277227722775, "grad_norm": 0.42801210284233093, "learning_rate": 9.908468642731995e-05, "loss": 0.0228, "step": 2160 }, { "epoch": 4.297029702970297, "grad_norm": 0.49879559874534607, "learning_rate": 9.906887282016707e-05, "loss": 0.0205, "step": 2170 }, { "epoch": 4.316831683168317, "grad_norm": 0.37642356753349304, "learning_rate": 9.90529250605896e-05, "loss": 0.0216, "step": 2180 }, { "epoch": 4.336633663366337, "grad_norm": 0.41889533400535583, "learning_rate": 9.903684319218809e-05, "loss": 0.0199, "step": 2190 }, { "epoch": 4.356435643564357, "grad_norm": 0.34421610832214355, "learning_rate": 9.902062725892976e-05, "loss": 0.0237, "step": 2200 }, { "epoch": 4.376237623762377, "grad_norm": 0.3852355480194092, "learning_rate": 9.900427730514834e-05, "loss": 0.0219, "step": 2210 }, { "epoch": 4.396039603960396, "grad_norm": 0.3332788348197937, "learning_rate": 9.8987793375544e-05, "loss": 0.0244, "step": 2220 }, { "epoch": 4.415841584158416, "grad_norm": 0.4230865240097046, "learning_rate": 9.897117551518318e-05, "loss": 0.0244, "step": 2230 }, { "epoch": 4.435643564356436, "grad_norm": 0.3202756643295288, "learning_rate": 9.895442376949844e-05, "loss": 0.0219, "step": 2240 }, { "epoch": 4.455445544554456, "grad_norm": 0.3089510202407837, "learning_rate": 9.893753818428845e-05, "loss": 0.0213, "step": 2250 }, { "epoch": 4.475247524752476, "grad_norm": 0.27605995535850525, "learning_rate": 9.892051880571773e-05, "loss": 0.0237, "step": 2260 }, { "epoch": 4.4950495049504955, "grad_norm": 0.4935011863708496, "learning_rate": 9.890336568031663e-05, "loss": 0.0254, "step": 2270 }, { "epoch": 4.514851485148515, "grad_norm": 0.4826439321041107, "learning_rate": 9.888607885498113e-05, "loss": 0.024, "step": 2280 }, { "epoch": 4.534653465346535, "grad_norm": 0.35315781831741333, "learning_rate": 9.886865837697275e-05, "loss": 0.025, "step": 2290 }, { "epoch": 4.554455445544555, "grad_norm": 0.3606739342212677, "learning_rate": 9.88511042939184e-05, "loss": 0.0225, "step": 2300 }, { "epoch": 4.574257425742574, "grad_norm": 0.3545808494091034, "learning_rate": 9.883341665381028e-05, "loss": 0.0231, "step": 2310 }, { "epoch": 4.594059405940594, "grad_norm": 0.2959376871585846, "learning_rate": 9.881559550500575e-05, "loss": 0.0214, "step": 2320 }, { "epoch": 4.6138613861386135, "grad_norm": 0.35507145524024963, "learning_rate": 9.879764089622712e-05, "loss": 0.0213, "step": 2330 }, { "epoch": 4.633663366336633, "grad_norm": 0.3066723346710205, "learning_rate": 9.87795528765616e-05, "loss": 0.0238, "step": 2340 }, { "epoch": 4.653465346534653, "grad_norm": 0.4484430253505707, "learning_rate": 9.876133149546118e-05, "loss": 0.0237, "step": 2350 }, { "epoch": 4.673267326732673, "grad_norm": 0.4734068214893341, "learning_rate": 9.874297680274238e-05, "loss": 0.0228, "step": 2360 }, { "epoch": 4.693069306930693, "grad_norm": 0.3711143434047699, "learning_rate": 9.872448884858624e-05, "loss": 0.0217, "step": 2370 }, { "epoch": 4.712871287128713, "grad_norm": 0.360148161649704, "learning_rate": 9.870586768353815e-05, "loss": 0.0199, "step": 2380 }, { "epoch": 4.732673267326732, "grad_norm": 0.3019266128540039, "learning_rate": 9.868711335850764e-05, "loss": 0.0193, "step": 2390 }, { "epoch": 4.752475247524752, "grad_norm": 0.35108819603919983, "learning_rate": 9.866822592476833e-05, "loss": 0.0182, "step": 2400 }, { "epoch": 4.772277227722772, "grad_norm": 0.4206360876560211, "learning_rate": 9.86492054339577e-05, "loss": 0.0205, "step": 2410 }, { "epoch": 4.792079207920792, "grad_norm": 0.375120609998703, "learning_rate": 9.863005193807711e-05, "loss": 0.0228, "step": 2420 }, { "epoch": 4.811881188118812, "grad_norm": 0.35369354486465454, "learning_rate": 9.861076548949143e-05, "loss": 0.0227, "step": 2430 }, { "epoch": 4.8316831683168315, "grad_norm": 0.447580486536026, "learning_rate": 9.859134614092912e-05, "loss": 0.0224, "step": 2440 }, { "epoch": 4.851485148514851, "grad_norm": 0.3736158311367035, "learning_rate": 9.857179394548191e-05, "loss": 0.0212, "step": 2450 }, { "epoch": 4.871287128712871, "grad_norm": 0.3133954703807831, "learning_rate": 9.855210895660477e-05, "loss": 0.0201, "step": 2460 }, { "epoch": 4.891089108910891, "grad_norm": 0.2543289363384247, "learning_rate": 9.853229122811568e-05, "loss": 0.0195, "step": 2470 }, { "epoch": 4.910891089108911, "grad_norm": 0.30530208349227905, "learning_rate": 9.851234081419559e-05, "loss": 0.0206, "step": 2480 }, { "epoch": 4.930693069306931, "grad_norm": 0.40204182267189026, "learning_rate": 9.849225776938814e-05, "loss": 0.0222, "step": 2490 }, { "epoch": 4.9504950495049505, "grad_norm": 0.29214873909950256, "learning_rate": 9.847204214859964e-05, "loss": 0.0226, "step": 2500 }, { "epoch": 4.97029702970297, "grad_norm": 0.24915005266666412, "learning_rate": 9.845169400709879e-05, "loss": 0.0225, "step": 2510 }, { "epoch": 4.99009900990099, "grad_norm": 0.3315647840499878, "learning_rate": 9.843121340051664e-05, "loss": 0.0197, "step": 2520 }, { "epoch": 5.00990099009901, "grad_norm": 0.47070738673210144, "learning_rate": 9.841060038484641e-05, "loss": 0.0227, "step": 2530 }, { "epoch": 5.02970297029703, "grad_norm": 0.5289052128791809, "learning_rate": 9.838985501644328e-05, "loss": 0.0237, "step": 2540 }, { "epoch": 5.0495049504950495, "grad_norm": 0.34559372067451477, "learning_rate": 9.83689773520243e-05, "loss": 0.0237, "step": 2550 }, { "epoch": 5.069306930693069, "grad_norm": 0.43140825629234314, "learning_rate": 9.834796744866819e-05, "loss": 0.0233, "step": 2560 }, { "epoch": 5.089108910891089, "grad_norm": 0.3761429190635681, "learning_rate": 9.832682536381525e-05, "loss": 0.023, "step": 2570 }, { "epoch": 5.108910891089109, "grad_norm": 0.3882358968257904, "learning_rate": 9.830555115526711e-05, "loss": 0.02, "step": 2580 }, { "epoch": 5.128712871287129, "grad_norm": 0.3988865315914154, "learning_rate": 9.828414488118667e-05, "loss": 0.0184, "step": 2590 }, { "epoch": 5.148514851485149, "grad_norm": 0.4496070146560669, "learning_rate": 9.826260660009785e-05, "loss": 0.0209, "step": 2600 }, { "epoch": 5.1683168316831685, "grad_norm": 0.37955424189567566, "learning_rate": 9.824093637088547e-05, "loss": 0.0211, "step": 2610 }, { "epoch": 5.188118811881188, "grad_norm": 0.31307458877563477, "learning_rate": 9.821913425279514e-05, "loss": 0.0192, "step": 2620 }, { "epoch": 5.207920792079208, "grad_norm": 0.3439253568649292, "learning_rate": 9.8197200305433e-05, "loss": 0.0184, "step": 2630 }, { "epoch": 5.227722772277228, "grad_norm": 0.29320356249809265, "learning_rate": 9.817513458876564e-05, "loss": 0.019, "step": 2640 }, { "epoch": 5.247524752475248, "grad_norm": 0.2600882947444916, "learning_rate": 9.815293716311987e-05, "loss": 0.0178, "step": 2650 }, { "epoch": 5.267326732673268, "grad_norm": 0.29619163274765015, "learning_rate": 9.813060808918262e-05, "loss": 0.0173, "step": 2660 }, { "epoch": 5.287128712871287, "grad_norm": 0.34883660078048706, "learning_rate": 9.810814742800069e-05, "loss": 0.0172, "step": 2670 }, { "epoch": 5.306930693069307, "grad_norm": 0.2672446072101593, "learning_rate": 9.808555524098074e-05, "loss": 0.0185, "step": 2680 }, { "epoch": 5.326732673267327, "grad_norm": 0.39667657017707825, "learning_rate": 9.806283158988887e-05, "loss": 0.0215, "step": 2690 }, { "epoch": 5.346534653465347, "grad_norm": 0.47337856888771057, "learning_rate": 9.803997653685072e-05, "loss": 0.0182, "step": 2700 }, { "epoch": 5.366336633663367, "grad_norm": 0.3918495178222656, "learning_rate": 9.801699014435112e-05, "loss": 0.0186, "step": 2710 }, { "epoch": 5.3861386138613865, "grad_norm": 0.31006425619125366, "learning_rate": 9.799387247523398e-05, "loss": 0.0183, "step": 2720 }, { "epoch": 5.405940594059406, "grad_norm": 0.22273986041545868, "learning_rate": 9.797062359270215e-05, "loss": 0.0207, "step": 2730 }, { "epoch": 5.425742574257426, "grad_norm": 0.46588271856307983, "learning_rate": 9.794724356031715e-05, "loss": 0.0195, "step": 2740 }, { "epoch": 5.445544554455446, "grad_norm": 0.3831281065940857, "learning_rate": 9.792373244199913e-05, "loss": 0.0196, "step": 2750 }, { "epoch": 5.465346534653466, "grad_norm": 0.3359587490558624, "learning_rate": 9.790009030202658e-05, "loss": 0.0215, "step": 2760 }, { "epoch": 5.485148514851485, "grad_norm": 0.3482320010662079, "learning_rate": 9.78763172050362e-05, "loss": 0.0227, "step": 2770 }, { "epoch": 5.5049504950495045, "grad_norm": 0.4453575015068054, "learning_rate": 9.785241321602274e-05, "loss": 0.0189, "step": 2780 }, { "epoch": 5.524752475247524, "grad_norm": 0.3708284795284271, "learning_rate": 9.782837840033879e-05, "loss": 0.0182, "step": 2790 }, { "epoch": 5.544554455445544, "grad_norm": 0.35548657178878784, "learning_rate": 9.780421282369461e-05, "loss": 0.02, "step": 2800 }, { "epoch": 5.564356435643564, "grad_norm": 0.3473094403743744, "learning_rate": 9.777991655215797e-05, "loss": 0.0213, "step": 2810 }, { "epoch": 5.584158415841584, "grad_norm": 0.3799901604652405, "learning_rate": 9.775548965215394e-05, "loss": 0.0194, "step": 2820 }, { "epoch": 5.603960396039604, "grad_norm": 0.35067564249038696, "learning_rate": 9.773093219046474e-05, "loss": 0.0181, "step": 2830 }, { "epoch": 5.623762376237623, "grad_norm": 0.378802627325058, "learning_rate": 9.770624423422954e-05, "loss": 0.0185, "step": 2840 }, { "epoch": 5.643564356435643, "grad_norm": 0.37550902366638184, "learning_rate": 9.768142585094426e-05, "loss": 0.0185, "step": 2850 }, { "epoch": 5.663366336633663, "grad_norm": 0.29022786021232605, "learning_rate": 9.765647710846142e-05, "loss": 0.0169, "step": 2860 }, { "epoch": 5.683168316831683, "grad_norm": 0.2995775640010834, "learning_rate": 9.763139807498991e-05, "loss": 0.0215, "step": 2870 }, { "epoch": 5.702970297029703, "grad_norm": 0.3938986361026764, "learning_rate": 9.760618881909487e-05, "loss": 0.0181, "step": 2880 }, { "epoch": 5.7227722772277225, "grad_norm": 0.3290078938007355, "learning_rate": 9.758084940969744e-05, "loss": 0.0174, "step": 2890 }, { "epoch": 5.742574257425742, "grad_norm": 0.3669437766075134, "learning_rate": 9.755537991607459e-05, "loss": 0.0199, "step": 2900 }, { "epoch": 5.762376237623762, "grad_norm": 0.40888556838035583, "learning_rate": 9.752978040785895e-05, "loss": 0.018, "step": 2910 }, { "epoch": 5.782178217821782, "grad_norm": 0.3771969974040985, "learning_rate": 9.750405095503859e-05, "loss": 0.0199, "step": 2920 }, { "epoch": 5.801980198019802, "grad_norm": 0.48157572746276855, "learning_rate": 9.747819162795686e-05, "loss": 0.0216, "step": 2930 }, { "epoch": 5.821782178217822, "grad_norm": 0.3724214434623718, "learning_rate": 9.745220249731217e-05, "loss": 0.0182, "step": 2940 }, { "epoch": 5.841584158415841, "grad_norm": 0.3304087519645691, "learning_rate": 9.742608363415781e-05, "loss": 0.02, "step": 2950 }, { "epoch": 5.861386138613861, "grad_norm": 0.3134087324142456, "learning_rate": 9.739983510990176e-05, "loss": 0.0197, "step": 2960 }, { "epoch": 5.881188118811881, "grad_norm": 0.24684718251228333, "learning_rate": 9.737345699630647e-05, "loss": 0.0189, "step": 2970 }, { "epoch": 5.900990099009901, "grad_norm": 0.3885471224784851, "learning_rate": 9.734694936548869e-05, "loss": 0.0177, "step": 2980 }, { "epoch": 5.920792079207921, "grad_norm": 0.37604406476020813, "learning_rate": 9.732031228991932e-05, "loss": 0.0168, "step": 2990 }, { "epoch": 5.9405940594059405, "grad_norm": 0.3319747745990753, "learning_rate": 9.729354584242302e-05, "loss": 0.0189, "step": 3000 }, { "epoch": 5.96039603960396, "grad_norm": 0.393080472946167, "learning_rate": 9.726665009617832e-05, "loss": 0.0167, "step": 3010 }, { "epoch": 5.98019801980198, "grad_norm": 0.3286392092704773, "learning_rate": 9.723962512471714e-05, "loss": 0.0165, "step": 3020 }, { "epoch": 6.0, "grad_norm": 0.6086994409561157, "learning_rate": 9.72124710019247e-05, "loss": 0.0153, "step": 3030 }, { "epoch": 6.01980198019802, "grad_norm": 0.36388668417930603, "learning_rate": 9.718518780203934e-05, "loss": 0.0184, "step": 3040 }, { "epoch": 6.03960396039604, "grad_norm": 0.5109174847602844, "learning_rate": 9.715777559965228e-05, "loss": 0.0177, "step": 3050 }, { "epoch": 6.0594059405940595, "grad_norm": 0.3146904706954956, "learning_rate": 9.713023446970746e-05, "loss": 0.0163, "step": 3060 }, { "epoch": 6.079207920792079, "grad_norm": 0.29438161849975586, "learning_rate": 9.710256448750126e-05, "loss": 0.0166, "step": 3070 }, { "epoch": 6.099009900990099, "grad_norm": 0.2945617437362671, "learning_rate": 9.707476572868235e-05, "loss": 0.0165, "step": 3080 }, { "epoch": 6.118811881188119, "grad_norm": 0.3678109347820282, "learning_rate": 9.704683826925149e-05, "loss": 0.0187, "step": 3090 }, { "epoch": 6.138613861386139, "grad_norm": 0.45136383175849915, "learning_rate": 9.701878218556129e-05, "loss": 0.0161, "step": 3100 }, { "epoch": 6.158415841584159, "grad_norm": 0.4209136962890625, "learning_rate": 9.699059755431598e-05, "loss": 0.0177, "step": 3110 }, { "epoch": 6.178217821782178, "grad_norm": 0.40420234203338623, "learning_rate": 9.696228445257132e-05, "loss": 0.0185, "step": 3120 }, { "epoch": 6.198019801980198, "grad_norm": 0.39920854568481445, "learning_rate": 9.693384295773419e-05, "loss": 0.0193, "step": 3130 }, { "epoch": 6.217821782178218, "grad_norm": 0.2842671871185303, "learning_rate": 9.690527314756259e-05, "loss": 0.0194, "step": 3140 }, { "epoch": 6.237623762376238, "grad_norm": 0.3912985026836395, "learning_rate": 9.687657510016527e-05, "loss": 0.0208, "step": 3150 }, { "epoch": 6.257425742574258, "grad_norm": 0.5948848724365234, "learning_rate": 9.684774889400161e-05, "loss": 0.0187, "step": 3160 }, { "epoch": 6.2772277227722775, "grad_norm": 0.5797039866447449, "learning_rate": 9.681879460788135e-05, "loss": 0.0175, "step": 3170 }, { "epoch": 6.297029702970297, "grad_norm": 0.40737342834472656, "learning_rate": 9.67897123209644e-05, "loss": 0.0179, "step": 3180 }, { "epoch": 6.316831683168317, "grad_norm": 0.34485602378845215, "learning_rate": 9.676050211276062e-05, "loss": 0.0184, "step": 3190 }, { "epoch": 6.336633663366337, "grad_norm": 0.3097567558288574, "learning_rate": 9.673116406312962e-05, "loss": 0.0185, "step": 3200 }, { "epoch": 6.356435643564357, "grad_norm": 0.33055680990219116, "learning_rate": 9.67016982522805e-05, "loss": 0.0181, "step": 3210 }, { "epoch": 6.376237623762377, "grad_norm": 0.2838935852050781, "learning_rate": 9.667210476077164e-05, "loss": 0.0168, "step": 3220 }, { "epoch": 6.396039603960396, "grad_norm": 0.32559776306152344, "learning_rate": 9.664238366951055e-05, "loss": 0.0158, "step": 3230 }, { "epoch": 6.415841584158416, "grad_norm": 0.3504292964935303, "learning_rate": 9.661253505975355e-05, "loss": 0.0173, "step": 3240 }, { "epoch": 6.435643564356436, "grad_norm": 0.3245116174221039, "learning_rate": 9.658255901310557e-05, "loss": 0.0157, "step": 3250 }, { "epoch": 6.455445544554456, "grad_norm": 0.2947891354560852, "learning_rate": 9.655245561152e-05, "loss": 0.0159, "step": 3260 }, { "epoch": 6.475247524752476, "grad_norm": 0.378795325756073, "learning_rate": 9.65222249372984e-05, "loss": 0.0194, "step": 3270 }, { "epoch": 6.4950495049504955, "grad_norm": 0.3340989053249359, "learning_rate": 9.649186707309026e-05, "loss": 0.0185, "step": 3280 }, { "epoch": 6.514851485148515, "grad_norm": 0.5714468359947205, "learning_rate": 9.646138210189283e-05, "loss": 0.0202, "step": 3290 }, { "epoch": 6.534653465346535, "grad_norm": 0.33598747849464417, "learning_rate": 9.643077010705087e-05, "loss": 0.0185, "step": 3300 }, { "epoch": 6.554455445544555, "grad_norm": 0.33504021167755127, "learning_rate": 9.640003117225637e-05, "loss": 0.0167, "step": 3310 }, { "epoch": 6.574257425742574, "grad_norm": 0.4158162474632263, "learning_rate": 9.636916538154846e-05, "loss": 0.0196, "step": 3320 }, { "epoch": 6.594059405940594, "grad_norm": 0.298481822013855, "learning_rate": 9.633817281931296e-05, "loss": 0.0179, "step": 3330 }, { "epoch": 6.6138613861386135, "grad_norm": 0.30949610471725464, "learning_rate": 9.630705357028242e-05, "loss": 0.0174, "step": 3340 }, { "epoch": 6.633663366336633, "grad_norm": 0.43686333298683167, "learning_rate": 9.627580771953563e-05, "loss": 0.0194, "step": 3350 }, { "epoch": 6.653465346534653, "grad_norm": 0.27362263202667236, "learning_rate": 9.624443535249759e-05, "loss": 0.0183, "step": 3360 }, { "epoch": 6.673267326732673, "grad_norm": 0.2846762537956238, "learning_rate": 9.621293655493913e-05, "loss": 0.0179, "step": 3370 }, { "epoch": 6.693069306930693, "grad_norm": 0.37414300441741943, "learning_rate": 9.618131141297675e-05, "loss": 0.0191, "step": 3380 }, { "epoch": 6.712871287128713, "grad_norm": 0.3773373067378998, "learning_rate": 9.614956001307242e-05, "loss": 0.0155, "step": 3390 }, { "epoch": 6.732673267326732, "grad_norm": 0.2997821271419525, "learning_rate": 9.611768244203321e-05, "loss": 0.0173, "step": 3400 }, { "epoch": 6.752475247524752, "grad_norm": 0.296236127614975, "learning_rate": 9.60856787870112e-05, "loss": 0.0166, "step": 3410 }, { "epoch": 6.772277227722772, "grad_norm": 0.33548247814178467, "learning_rate": 9.605354913550318e-05, "loss": 0.0161, "step": 3420 }, { "epoch": 6.792079207920792, "grad_norm": 0.3519626557826996, "learning_rate": 9.602129357535037e-05, "loss": 0.0162, "step": 3430 }, { "epoch": 6.811881188118812, "grad_norm": 0.27583590149879456, "learning_rate": 9.598891219473825e-05, "loss": 0.0168, "step": 3440 }, { "epoch": 6.8316831683168315, "grad_norm": 0.2783600687980652, "learning_rate": 9.595640508219625e-05, "loss": 0.017, "step": 3450 }, { "epoch": 6.851485148514851, "grad_norm": 0.26526907086372375, "learning_rate": 9.592377232659761e-05, "loss": 0.0162, "step": 3460 }, { "epoch": 6.871287128712871, "grad_norm": 0.2630561590194702, "learning_rate": 9.589101401715904e-05, "loss": 0.0164, "step": 3470 }, { "epoch": 6.891089108910891, "grad_norm": 0.34386876225471497, "learning_rate": 9.585813024344045e-05, "loss": 0.0155, "step": 3480 }, { "epoch": 6.910891089108911, "grad_norm": 0.33793166279792786, "learning_rate": 9.58251210953449e-05, "loss": 0.0156, "step": 3490 }, { "epoch": 6.930693069306931, "grad_norm": 0.2659393846988678, "learning_rate": 9.579198666311809e-05, "loss": 0.0183, "step": 3500 }, { "epoch": 6.9504950495049505, "grad_norm": 0.34683603048324585, "learning_rate": 9.575872703734832e-05, "loss": 0.0179, "step": 3510 }, { "epoch": 6.97029702970297, "grad_norm": 0.3190460205078125, "learning_rate": 9.572534230896611e-05, "loss": 0.0153, "step": 3520 }, { "epoch": 6.99009900990099, "grad_norm": 0.3160490393638611, "learning_rate": 9.569183256924403e-05, "loss": 0.0204, "step": 3530 }, { "epoch": 7.00990099009901, "grad_norm": 0.35039374232292175, "learning_rate": 9.565819790979646e-05, "loss": 0.0195, "step": 3540 }, { "epoch": 7.02970297029703, "grad_norm": 0.32915234565734863, "learning_rate": 9.562443842257925e-05, "loss": 0.018, "step": 3550 }, { "epoch": 7.0495049504950495, "grad_norm": 0.2950669527053833, "learning_rate": 9.559055419988956e-05, "loss": 0.0176, "step": 3560 }, { "epoch": 7.069306930693069, "grad_norm": 0.3674241006374359, "learning_rate": 9.555654533436557e-05, "loss": 0.0162, "step": 3570 }, { "epoch": 7.089108910891089, "grad_norm": 0.2508985102176666, "learning_rate": 9.552241191898621e-05, "loss": 0.0211, "step": 3580 }, { "epoch": 7.108910891089109, "grad_norm": 0.6606206893920898, "learning_rate": 9.548815404707092e-05, "loss": 0.0177, "step": 3590 }, { "epoch": 7.128712871287129, "grad_norm": 0.4140847325325012, "learning_rate": 9.545377181227942e-05, "loss": 0.0209, "step": 3600 }, { "epoch": 7.148514851485149, "grad_norm": 0.3429228961467743, "learning_rate": 9.541926530861145e-05, "loss": 0.0182, "step": 3610 }, { "epoch": 7.1683168316831685, "grad_norm": 0.289007306098938, "learning_rate": 9.538463463040645e-05, "loss": 0.0183, "step": 3620 }, { "epoch": 7.188118811881188, "grad_norm": 0.3084986209869385, "learning_rate": 9.534987987234337e-05, "loss": 0.0161, "step": 3630 }, { "epoch": 7.207920792079208, "grad_norm": 0.3828168213367462, "learning_rate": 9.53150011294404e-05, "loss": 0.0168, "step": 3640 }, { "epoch": 7.227722772277228, "grad_norm": 0.30748867988586426, "learning_rate": 9.527999849705471e-05, "loss": 0.0159, "step": 3650 }, { "epoch": 7.247524752475248, "grad_norm": 0.6020660996437073, "learning_rate": 9.524487207088213e-05, "loss": 0.0184, "step": 3660 }, { "epoch": 7.267326732673268, "grad_norm": 0.34988123178482056, "learning_rate": 9.520962194695698e-05, "loss": 0.0169, "step": 3670 }, { "epoch": 7.287128712871287, "grad_norm": 0.33550670742988586, "learning_rate": 9.517424822165175e-05, "loss": 0.0168, "step": 3680 }, { "epoch": 7.306930693069307, "grad_norm": 0.381260484457016, "learning_rate": 9.513875099167685e-05, "loss": 0.0153, "step": 3690 }, { "epoch": 7.326732673267327, "grad_norm": 0.18554243445396423, "learning_rate": 9.510313035408035e-05, "loss": 0.0173, "step": 3700 }, { "epoch": 7.346534653465347, "grad_norm": 0.2811446189880371, "learning_rate": 9.506738640624775e-05, "loss": 0.0156, "step": 3710 }, { "epoch": 7.366336633663367, "grad_norm": 0.3494660258293152, "learning_rate": 9.50315192459016e-05, "loss": 0.0157, "step": 3720 }, { "epoch": 7.3861386138613865, "grad_norm": 0.3090815246105194, "learning_rate": 9.499552897110136e-05, "loss": 0.0173, "step": 3730 }, { "epoch": 7.405940594059406, "grad_norm": 0.2659648060798645, "learning_rate": 9.495941568024304e-05, "loss": 0.0173, "step": 3740 }, { "epoch": 7.425742574257426, "grad_norm": 0.3601891100406647, "learning_rate": 9.492317947205904e-05, "loss": 0.0187, "step": 3750 }, { "epoch": 7.445544554455446, "grad_norm": 0.3029676079750061, "learning_rate": 9.488682044561775e-05, "loss": 0.015, "step": 3760 }, { "epoch": 7.465346534653466, "grad_norm": 0.2730563282966614, "learning_rate": 9.485033870032335e-05, "loss": 0.0165, "step": 3770 }, { "epoch": 7.485148514851485, "grad_norm": 0.8519704937934875, "learning_rate": 9.481373433591556e-05, "loss": 0.0218, "step": 3780 }, { "epoch": 7.5049504950495045, "grad_norm": 1.2323354482650757, "learning_rate": 9.47770074524693e-05, "loss": 0.0336, "step": 3790 }, { "epoch": 7.524752475247524, "grad_norm": 0.4604546129703522, "learning_rate": 9.474015815039446e-05, "loss": 0.0311, "step": 3800 }, { "epoch": 7.544554455445544, "grad_norm": 0.502858579158783, "learning_rate": 9.470318653043565e-05, "loss": 0.0261, "step": 3810 }, { "epoch": 7.564356435643564, "grad_norm": 0.4469369053840637, "learning_rate": 9.466609269367185e-05, "loss": 0.0219, "step": 3820 }, { "epoch": 7.584158415841584, "grad_norm": 0.43039217591285706, "learning_rate": 9.46288767415162e-05, "loss": 0.0223, "step": 3830 }, { "epoch": 7.603960396039604, "grad_norm": 0.2668784558773041, "learning_rate": 9.459153877571567e-05, "loss": 0.0183, "step": 3840 }, { "epoch": 7.623762376237623, "grad_norm": 0.3393080234527588, "learning_rate": 9.455407889835087e-05, "loss": 0.0171, "step": 3850 }, { "epoch": 7.643564356435643, "grad_norm": 0.4221245050430298, "learning_rate": 9.451649721183564e-05, "loss": 0.0175, "step": 3860 }, { "epoch": 7.663366336633663, "grad_norm": 0.2936381697654724, "learning_rate": 9.447879381891692e-05, "loss": 0.0169, "step": 3870 }, { "epoch": 7.683168316831683, "grad_norm": 0.3651542067527771, "learning_rate": 9.444096882267428e-05, "loss": 0.0154, "step": 3880 }, { "epoch": 7.702970297029703, "grad_norm": 0.26084455847740173, "learning_rate": 9.440302232651988e-05, "loss": 0.0144, "step": 3890 }, { "epoch": 7.7227722772277225, "grad_norm": 0.26804202795028687, "learning_rate": 9.436495443419795e-05, "loss": 0.0167, "step": 3900 }, { "epoch": 7.742574257425742, "grad_norm": 0.3153522312641144, "learning_rate": 9.432676524978466e-05, "loss": 0.0178, "step": 3910 }, { "epoch": 7.762376237623762, "grad_norm": 0.40966054797172546, "learning_rate": 9.42884548776878e-05, "loss": 0.0228, "step": 3920 }, { "epoch": 7.782178217821782, "grad_norm": 0.45177891850471497, "learning_rate": 9.425002342264646e-05, "loss": 0.0189, "step": 3930 }, { "epoch": 7.801980198019802, "grad_norm": 0.3874051570892334, "learning_rate": 9.421147098973077e-05, "loss": 0.0159, "step": 3940 }, { "epoch": 7.821782178217822, "grad_norm": 0.26989370584487915, "learning_rate": 9.41727976843416e-05, "loss": 0.0164, "step": 3950 }, { "epoch": 7.841584158415841, "grad_norm": 0.42965760827064514, "learning_rate": 9.413400361221029e-05, "loss": 0.0158, "step": 3960 }, { "epoch": 7.861386138613861, "grad_norm": 0.30895447731018066, "learning_rate": 9.409508887939835e-05, "loss": 0.0161, "step": 3970 }, { "epoch": 7.881188118811881, "grad_norm": 0.2625492215156555, "learning_rate": 9.40560535922972e-05, "loss": 0.0187, "step": 3980 }, { "epoch": 7.900990099009901, "grad_norm": 0.2631179988384247, "learning_rate": 9.40168978576278e-05, "loss": 0.0169, "step": 3990 }, { "epoch": 7.920792079207921, "grad_norm": 0.3105683922767639, "learning_rate": 9.397762178244043e-05, "loss": 0.0192, "step": 4000 }, { "epoch": 7.9405940594059405, "grad_norm": 0.3329387307167053, "learning_rate": 9.393822547411439e-05, "loss": 0.0171, "step": 4010 }, { "epoch": 7.96039603960396, "grad_norm": 0.42155033349990845, "learning_rate": 9.389870904035769e-05, "loss": 0.0165, "step": 4020 }, { "epoch": 7.98019801980198, "grad_norm": 0.31094634532928467, "learning_rate": 9.385907258920672e-05, "loss": 0.0135, "step": 4030 }, { "epoch": 8.0, "grad_norm": 0.5723320841789246, "learning_rate": 9.381931622902607e-05, "loss": 0.0166, "step": 4040 }, { "epoch": 8.01980198019802, "grad_norm": 0.34304580092430115, "learning_rate": 9.377944006850807e-05, "loss": 0.0169, "step": 4050 }, { "epoch": 8.03960396039604, "grad_norm": 0.2420373558998108, "learning_rate": 9.373944421667265e-05, "loss": 0.0152, "step": 4060 }, { "epoch": 8.05940594059406, "grad_norm": 0.3722658157348633, "learning_rate": 9.369932878286691e-05, "loss": 0.0154, "step": 4070 }, { "epoch": 8.07920792079208, "grad_norm": 0.33852115273475647, "learning_rate": 9.365909387676494e-05, "loss": 0.0161, "step": 4080 }, { "epoch": 8.099009900990099, "grad_norm": 0.35614699125289917, "learning_rate": 9.361873960836744e-05, "loss": 0.0153, "step": 4090 }, { "epoch": 8.118811881188119, "grad_norm": 0.2849820852279663, "learning_rate": 9.357826608800142e-05, "loss": 0.0167, "step": 4100 }, { "epoch": 8.138613861386139, "grad_norm": 0.2672055661678314, "learning_rate": 9.353767342631994e-05, "loss": 0.014, "step": 4110 }, { "epoch": 8.158415841584159, "grad_norm": 0.493130624294281, "learning_rate": 9.34969617343018e-05, "loss": 0.0169, "step": 4120 }, { "epoch": 8.178217821782178, "grad_norm": 0.31148281693458557, "learning_rate": 9.345613112325122e-05, "loss": 0.0147, "step": 4130 }, { "epoch": 8.198019801980198, "grad_norm": 0.3248274624347687, "learning_rate": 9.34151817047975e-05, "loss": 0.0151, "step": 4140 }, { "epoch": 8.217821782178218, "grad_norm": 0.38024768233299255, "learning_rate": 9.33741135908948e-05, "loss": 0.0177, "step": 4150 }, { "epoch": 8.237623762376238, "grad_norm": 0.26777511835098267, "learning_rate": 9.33329268938218e-05, "loss": 0.0141, "step": 4160 }, { "epoch": 8.257425742574258, "grad_norm": 0.4392537772655487, "learning_rate": 9.329162172618132e-05, "loss": 0.0179, "step": 4170 }, { "epoch": 8.277227722772277, "grad_norm": 0.3067944347858429, "learning_rate": 9.325019820090013e-05, "loss": 0.0162, "step": 4180 }, { "epoch": 8.297029702970297, "grad_norm": 0.5044247508049011, "learning_rate": 9.320865643122855e-05, "loss": 0.019, "step": 4190 }, { "epoch": 8.316831683168317, "grad_norm": 0.45858651399612427, "learning_rate": 9.316699653074023e-05, "loss": 0.0217, "step": 4200 }, { "epoch": 8.336633663366337, "grad_norm": 0.5565308332443237, "learning_rate": 9.312521861333172e-05, "loss": 0.0184, "step": 4210 }, { "epoch": 8.356435643564357, "grad_norm": 0.36273929476737976, "learning_rate": 9.308332279322224e-05, "loss": 0.0185, "step": 4220 }, { "epoch": 8.376237623762377, "grad_norm": 0.2770717740058899, "learning_rate": 9.304130918495338e-05, "loss": 0.0184, "step": 4230 }, { "epoch": 8.396039603960396, "grad_norm": 0.27835768461227417, "learning_rate": 9.299917790338874e-05, "loss": 0.0175, "step": 4240 }, { "epoch": 8.415841584158416, "grad_norm": 0.30944201350212097, "learning_rate": 9.295692906371363e-05, "loss": 0.0183, "step": 4250 }, { "epoch": 8.435643564356436, "grad_norm": 0.28890740871429443, "learning_rate": 9.291456278143476e-05, "loss": 0.0168, "step": 4260 }, { "epoch": 8.455445544554456, "grad_norm": 0.24193021655082703, "learning_rate": 9.287207917237994e-05, "loss": 0.0152, "step": 4270 }, { "epoch": 8.475247524752476, "grad_norm": 0.2187400609254837, "learning_rate": 9.282947835269773e-05, "loss": 0.0144, "step": 4280 }, { "epoch": 8.495049504950495, "grad_norm": 0.2747240364551544, "learning_rate": 9.278676043885715e-05, "loss": 0.0144, "step": 4290 }, { "epoch": 8.514851485148515, "grad_norm": 0.3518552780151367, "learning_rate": 9.274392554764733e-05, "loss": 0.0144, "step": 4300 }, { "epoch": 8.534653465346535, "grad_norm": 0.3000265061855316, "learning_rate": 9.270097379617723e-05, "loss": 0.0156, "step": 4310 }, { "epoch": 8.554455445544555, "grad_norm": 0.21851317584514618, "learning_rate": 9.26579053018753e-05, "loss": 0.0162, "step": 4320 }, { "epoch": 8.574257425742575, "grad_norm": 0.25009363889694214, "learning_rate": 9.261472018248918e-05, "loss": 0.0153, "step": 4330 }, { "epoch": 8.594059405940595, "grad_norm": 0.2397993803024292, "learning_rate": 9.25714185560853e-05, "loss": 0.0143, "step": 4340 }, { "epoch": 8.613861386138614, "grad_norm": 0.3199898898601532, "learning_rate": 9.252800054104868e-05, "loss": 0.0162, "step": 4350 }, { "epoch": 8.633663366336634, "grad_norm": 0.29683277010917664, "learning_rate": 9.248446625608252e-05, "loss": 0.016, "step": 4360 }, { "epoch": 8.653465346534654, "grad_norm": 0.26599258184432983, "learning_rate": 9.244081582020789e-05, "loss": 0.0139, "step": 4370 }, { "epoch": 8.673267326732674, "grad_norm": 0.3746398091316223, "learning_rate": 9.239704935276339e-05, "loss": 0.0139, "step": 4380 }, { "epoch": 8.693069306930694, "grad_norm": 0.4132077097892761, "learning_rate": 9.235316697340489e-05, "loss": 0.017, "step": 4390 }, { "epoch": 8.712871287128714, "grad_norm": 0.29888296127319336, "learning_rate": 9.230916880210512e-05, "loss": 0.0167, "step": 4400 }, { "epoch": 8.732673267326733, "grad_norm": 0.29621779918670654, "learning_rate": 9.226505495915342e-05, "loss": 0.0158, "step": 4410 }, { "epoch": 8.752475247524753, "grad_norm": 0.1700131595134735, "learning_rate": 9.222082556515536e-05, "loss": 0.015, "step": 4420 }, { "epoch": 8.772277227722773, "grad_norm": 0.2691226899623871, "learning_rate": 9.217648074103242e-05, "loss": 0.0135, "step": 4430 }, { "epoch": 8.792079207920793, "grad_norm": 0.27838805317878723, "learning_rate": 9.213202060802161e-05, "loss": 0.0157, "step": 4440 }, { "epoch": 8.811881188118813, "grad_norm": 0.23582610487937927, "learning_rate": 9.208744528767528e-05, "loss": 0.0139, "step": 4450 }, { "epoch": 8.831683168316832, "grad_norm": 0.24311210215091705, "learning_rate": 9.204275490186064e-05, "loss": 0.015, "step": 4460 }, { "epoch": 8.851485148514852, "grad_norm": 0.2909806966781616, "learning_rate": 9.199794957275949e-05, "loss": 0.014, "step": 4470 }, { "epoch": 8.871287128712872, "grad_norm": 0.3193492591381073, "learning_rate": 9.19530294228679e-05, "loss": 0.0143, "step": 4480 }, { "epoch": 8.891089108910892, "grad_norm": 0.39091432094573975, "learning_rate": 9.190799457499583e-05, "loss": 0.0161, "step": 4490 }, { "epoch": 8.910891089108912, "grad_norm": 0.3201586604118347, "learning_rate": 9.186284515226686e-05, "loss": 0.0167, "step": 4500 }, { "epoch": 8.930693069306932, "grad_norm": 0.19157624244689941, "learning_rate": 9.181758127811777e-05, "loss": 0.0156, "step": 4510 }, { "epoch": 8.950495049504951, "grad_norm": 0.2585284113883972, "learning_rate": 9.177220307629825e-05, "loss": 0.0142, "step": 4520 }, { "epoch": 8.97029702970297, "grad_norm": 0.24680490791797638, "learning_rate": 9.172671067087059e-05, "loss": 0.012, "step": 4530 }, { "epoch": 8.990099009900991, "grad_norm": 0.2011060118675232, "learning_rate": 9.16811041862093e-05, "loss": 0.0133, "step": 4540 }, { "epoch": 9.009900990099009, "grad_norm": 0.2787906527519226, "learning_rate": 9.163538374700076e-05, "loss": 0.0143, "step": 4550 }, { "epoch": 9.029702970297029, "grad_norm": 0.2392808198928833, "learning_rate": 9.158954947824287e-05, "loss": 0.0144, "step": 4560 }, { "epoch": 9.049504950495049, "grad_norm": 0.350919246673584, "learning_rate": 9.154360150524482e-05, "loss": 0.016, "step": 4570 }, { "epoch": 9.069306930693068, "grad_norm": 0.38697656989097595, "learning_rate": 9.14975399536266e-05, "loss": 0.0145, "step": 4580 }, { "epoch": 9.089108910891088, "grad_norm": 0.3013128638267517, "learning_rate": 9.14513649493187e-05, "loss": 0.0171, "step": 4590 }, { "epoch": 9.108910891089108, "grad_norm": 0.2971257269382477, "learning_rate": 9.140507661856187e-05, "loss": 0.0177, "step": 4600 }, { "epoch": 9.128712871287128, "grad_norm": 0.3113596737384796, "learning_rate": 9.135867508790661e-05, "loss": 0.0158, "step": 4610 }, { "epoch": 9.148514851485148, "grad_norm": 0.3254695236682892, "learning_rate": 9.131216048421291e-05, "loss": 0.0159, "step": 4620 }, { "epoch": 9.168316831683168, "grad_norm": 0.29940325021743774, "learning_rate": 9.126553293464998e-05, "loss": 0.0154, "step": 4630 }, { "epoch": 9.188118811881187, "grad_norm": 0.3793588876724243, "learning_rate": 9.121879256669572e-05, "loss": 0.016, "step": 4640 }, { "epoch": 9.207920792079207, "grad_norm": 0.3003334105014801, "learning_rate": 9.117193950813652e-05, "loss": 0.0154, "step": 4650 }, { "epoch": 9.227722772277227, "grad_norm": 0.26004883646965027, "learning_rate": 9.112497388706685e-05, "loss": 0.0153, "step": 4660 }, { "epoch": 9.247524752475247, "grad_norm": 0.464182049036026, "learning_rate": 9.10778958318889e-05, "loss": 0.0146, "step": 4670 }, { "epoch": 9.267326732673267, "grad_norm": 0.24090123176574707, "learning_rate": 9.103070547131232e-05, "loss": 0.0135, "step": 4680 }, { "epoch": 9.287128712871286, "grad_norm": 0.2919778525829315, "learning_rate": 9.098340293435375e-05, "loss": 0.0159, "step": 4690 }, { "epoch": 9.306930693069306, "grad_norm": 0.26097938418388367, "learning_rate": 9.093598835033649e-05, "loss": 0.0154, "step": 4700 }, { "epoch": 9.326732673267326, "grad_norm": 0.3009501099586487, "learning_rate": 9.088846184889021e-05, "loss": 0.0138, "step": 4710 }, { "epoch": 9.346534653465346, "grad_norm": 0.2385077178478241, "learning_rate": 9.084082355995057e-05, "loss": 0.0144, "step": 4720 }, { "epoch": 9.366336633663366, "grad_norm": 0.35728076100349426, "learning_rate": 9.079307361375882e-05, "loss": 0.0156, "step": 4730 }, { "epoch": 9.386138613861386, "grad_norm": 0.4230426251888275, "learning_rate": 9.074521214086149e-05, "loss": 0.014, "step": 4740 }, { "epoch": 9.405940594059405, "grad_norm": 0.30497249960899353, "learning_rate": 9.069723927211001e-05, "loss": 0.0148, "step": 4750 }, { "epoch": 9.425742574257425, "grad_norm": 0.21952101588249207, "learning_rate": 9.064915513866037e-05, "loss": 0.016, "step": 4760 }, { "epoch": 9.445544554455445, "grad_norm": 0.2934461236000061, "learning_rate": 9.060095987197279e-05, "loss": 0.0139, "step": 4770 }, { "epoch": 9.465346534653465, "grad_norm": 0.3350074887275696, "learning_rate": 9.055265360381126e-05, "loss": 0.0158, "step": 4780 }, { "epoch": 9.485148514851485, "grad_norm": 0.28827229142189026, "learning_rate": 9.050423646624326e-05, "loss": 0.0154, "step": 4790 }, { "epoch": 9.504950495049505, "grad_norm": 0.2641935646533966, "learning_rate": 9.045570859163943e-05, "loss": 0.0155, "step": 4800 }, { "epoch": 9.524752475247524, "grad_norm": 0.2305571287870407, "learning_rate": 9.04070701126731e-05, "loss": 0.0138, "step": 4810 }, { "epoch": 9.544554455445544, "grad_norm": 0.25295478105545044, "learning_rate": 9.035832116232001e-05, "loss": 0.0171, "step": 4820 }, { "epoch": 9.564356435643564, "grad_norm": 0.28064045310020447, "learning_rate": 9.030946187385796e-05, "loss": 0.0145, "step": 4830 }, { "epoch": 9.584158415841584, "grad_norm": 0.2530624568462372, "learning_rate": 9.026049238086635e-05, "loss": 0.0144, "step": 4840 }, { "epoch": 9.603960396039604, "grad_norm": 0.3729717433452606, "learning_rate": 9.021141281722591e-05, "loss": 0.0146, "step": 4850 }, { "epoch": 9.623762376237623, "grad_norm": 0.36881178617477417, "learning_rate": 9.01622233171183e-05, "loss": 0.0143, "step": 4860 }, { "epoch": 9.643564356435643, "grad_norm": 0.37539559602737427, "learning_rate": 9.011292401502574e-05, "loss": 0.0157, "step": 4870 }, { "epoch": 9.663366336633663, "grad_norm": 0.5528184175491333, "learning_rate": 9.006351504573063e-05, "loss": 0.0137, "step": 4880 }, { "epoch": 9.683168316831683, "grad_norm": 0.24870911240577698, "learning_rate": 9.001399654431519e-05, "loss": 0.0137, "step": 4890 }, { "epoch": 9.702970297029703, "grad_norm": 0.2415791004896164, "learning_rate": 8.996436864616116e-05, "loss": 0.0141, "step": 4900 }, { "epoch": 9.722772277227723, "grad_norm": 0.3227170705795288, "learning_rate": 8.991463148694925e-05, "loss": 0.0162, "step": 4910 }, { "epoch": 9.742574257425742, "grad_norm": 0.3353762924671173, "learning_rate": 8.986478520265902e-05, "loss": 0.0151, "step": 4920 }, { "epoch": 9.762376237623762, "grad_norm": 0.22502224147319794, "learning_rate": 8.981482992956827e-05, "loss": 0.0146, "step": 4930 }, { "epoch": 9.782178217821782, "grad_norm": 0.20760492980480194, "learning_rate": 8.976476580425282e-05, "loss": 0.0155, "step": 4940 }, { "epoch": 9.801980198019802, "grad_norm": 0.3610151708126068, "learning_rate": 8.971459296358606e-05, "loss": 0.015, "step": 4950 }, { "epoch": 9.821782178217822, "grad_norm": 0.22285965085029602, "learning_rate": 8.966431154473864e-05, "loss": 0.0141, "step": 4960 }, { "epoch": 9.841584158415841, "grad_norm": 0.4091605544090271, "learning_rate": 8.961392168517803e-05, "loss": 0.0149, "step": 4970 }, { "epoch": 9.861386138613861, "grad_norm": 0.26743969321250916, "learning_rate": 8.956342352266821e-05, "loss": 0.0141, "step": 4980 }, { "epoch": 9.881188118811881, "grad_norm": 0.335641086101532, "learning_rate": 8.95128171952692e-05, "loss": 0.0146, "step": 4990 }, { "epoch": 9.900990099009901, "grad_norm": 0.26174721121788025, "learning_rate": 8.946210284133676e-05, "loss": 0.0143, "step": 5000 }, { "epoch": 9.92079207920792, "grad_norm": 0.229100301861763, "learning_rate": 8.941128059952201e-05, "loss": 0.0138, "step": 5010 }, { "epoch": 9.94059405940594, "grad_norm": 0.25398868322372437, "learning_rate": 8.936035060877102e-05, "loss": 0.0148, "step": 5020 }, { "epoch": 9.96039603960396, "grad_norm": 0.28825855255126953, "learning_rate": 8.930931300832443e-05, "loss": 0.0141, "step": 5030 }, { "epoch": 9.98019801980198, "grad_norm": 0.22167867422103882, "learning_rate": 8.925816793771711e-05, "loss": 0.013, "step": 5040 }, { "epoch": 10.0, "grad_norm": 0.3530784547328949, "learning_rate": 8.92069155367777e-05, "loss": 0.0141, "step": 5050 }, { "epoch": 10.01980198019802, "grad_norm": 0.24841484427452087, "learning_rate": 8.915555594562834e-05, "loss": 0.0128, "step": 5060 }, { "epoch": 10.03960396039604, "grad_norm": 0.40721410512924194, "learning_rate": 8.910408930468416e-05, "loss": 0.013, "step": 5070 }, { "epoch": 10.05940594059406, "grad_norm": 0.3263900876045227, "learning_rate": 8.905251575465303e-05, "loss": 0.0117, "step": 5080 }, { "epoch": 10.07920792079208, "grad_norm": 0.23267436027526855, "learning_rate": 8.900083543653502e-05, "loss": 0.0127, "step": 5090 }, { "epoch": 10.099009900990099, "grad_norm": 0.23549872636795044, "learning_rate": 8.894904849162218e-05, "loss": 0.0148, "step": 5100 }, { "epoch": 10.118811881188119, "grad_norm": 0.43103325366973877, "learning_rate": 8.889715506149802e-05, "loss": 0.0137, "step": 5110 }, { "epoch": 10.138613861386139, "grad_norm": 0.21758505702018738, "learning_rate": 8.884515528803722e-05, "loss": 0.0129, "step": 5120 }, { "epoch": 10.158415841584159, "grad_norm": 0.4262648820877075, "learning_rate": 8.879304931340517e-05, "loss": 0.0146, "step": 5130 }, { "epoch": 10.178217821782178, "grad_norm": 0.2211531400680542, "learning_rate": 8.874083728005759e-05, "loss": 0.0155, "step": 5140 }, { "epoch": 10.198019801980198, "grad_norm": 0.27015557885169983, "learning_rate": 8.868851933074021e-05, "loss": 0.0151, "step": 5150 }, { "epoch": 10.217821782178218, "grad_norm": 0.24982856214046478, "learning_rate": 8.863609560848829e-05, "loss": 0.0135, "step": 5160 }, { "epoch": 10.237623762376238, "grad_norm": 0.22625210881233215, "learning_rate": 8.85835662566263e-05, "loss": 0.0146, "step": 5170 }, { "epoch": 10.257425742574258, "grad_norm": 0.23804067075252533, "learning_rate": 8.853093141876747e-05, "loss": 0.0129, "step": 5180 }, { "epoch": 10.277227722772277, "grad_norm": 0.23650681972503662, "learning_rate": 8.847819123881343e-05, "loss": 0.0125, "step": 5190 }, { "epoch": 10.297029702970297, "grad_norm": 0.22880245745182037, "learning_rate": 8.842534586095383e-05, "loss": 0.0147, "step": 5200 }, { "epoch": 10.316831683168317, "grad_norm": 0.19834314286708832, "learning_rate": 8.837239542966593e-05, "loss": 0.013, "step": 5210 }, { "epoch": 10.336633663366337, "grad_norm": 0.27958470582962036, "learning_rate": 8.831934008971417e-05, "loss": 0.0125, "step": 5220 }, { "epoch": 10.356435643564357, "grad_norm": 0.276777982711792, "learning_rate": 8.826617998614982e-05, "loss": 0.0133, "step": 5230 }, { "epoch": 10.376237623762377, "grad_norm": 0.219083771109581, "learning_rate": 8.821291526431056e-05, "loss": 0.0131, "step": 5240 }, { "epoch": 10.396039603960396, "grad_norm": 0.20928841829299927, "learning_rate": 8.815954606982015e-05, "loss": 0.0124, "step": 5250 }, { "epoch": 10.415841584158416, "grad_norm": 0.24949289858341217, "learning_rate": 8.810607254858789e-05, "loss": 0.0123, "step": 5260 }, { "epoch": 10.435643564356436, "grad_norm": 0.2274172008037567, "learning_rate": 8.805249484680838e-05, "loss": 0.0159, "step": 5270 }, { "epoch": 10.455445544554456, "grad_norm": 0.3297211229801178, "learning_rate": 8.799881311096096e-05, "loss": 0.014, "step": 5280 }, { "epoch": 10.475247524752476, "grad_norm": 0.297308087348938, "learning_rate": 8.794502748780949e-05, "loss": 0.0127, "step": 5290 }, { "epoch": 10.495049504950495, "grad_norm": 0.23850926756858826, "learning_rate": 8.78911381244018e-05, "loss": 0.0127, "step": 5300 }, { "epoch": 10.514851485148515, "grad_norm": 0.2866947054862976, "learning_rate": 8.783714516806933e-05, "loss": 0.014, "step": 5310 }, { "epoch": 10.534653465346535, "grad_norm": 0.29530850052833557, "learning_rate": 8.77830487664268e-05, "loss": 0.0136, "step": 5320 }, { "epoch": 10.554455445544555, "grad_norm": 0.277731716632843, "learning_rate": 8.772884906737167e-05, "loss": 0.0121, "step": 5330 }, { "epoch": 10.574257425742575, "grad_norm": 0.3088735044002533, "learning_rate": 8.767454621908387e-05, "loss": 0.0131, "step": 5340 }, { "epoch": 10.594059405940595, "grad_norm": 0.2947111129760742, "learning_rate": 8.76201403700253e-05, "loss": 0.0144, "step": 5350 }, { "epoch": 10.613861386138614, "grad_norm": 0.29140740633010864, "learning_rate": 8.756563166893949e-05, "loss": 0.0132, "step": 5360 }, { "epoch": 10.633663366336634, "grad_norm": 0.23209413886070251, "learning_rate": 8.751102026485113e-05, "loss": 0.0138, "step": 5370 }, { "epoch": 10.653465346534654, "grad_norm": 0.26890984177589417, "learning_rate": 8.745630630706571e-05, "loss": 0.0136, "step": 5380 }, { "epoch": 10.673267326732674, "grad_norm": 0.257487416267395, "learning_rate": 8.740148994516912e-05, "loss": 0.0136, "step": 5390 }, { "epoch": 10.693069306930694, "grad_norm": 0.3181030750274658, "learning_rate": 8.73465713290272e-05, "loss": 0.0129, "step": 5400 }, { "epoch": 10.712871287128714, "grad_norm": 0.29198747873306274, "learning_rate": 8.729155060878533e-05, "loss": 0.0136, "step": 5410 }, { "epoch": 10.732673267326733, "grad_norm": 0.24527506530284882, "learning_rate": 8.723642793486809e-05, "loss": 0.0128, "step": 5420 }, { "epoch": 10.752475247524753, "grad_norm": 0.23791024088859558, "learning_rate": 8.718120345797873e-05, "loss": 0.0142, "step": 5430 }, { "epoch": 10.772277227722773, "grad_norm": 1.0206315517425537, "learning_rate": 8.712587732909889e-05, "loss": 0.0118, "step": 5440 }, { "epoch": 10.792079207920793, "grad_norm": 0.32817158102989197, "learning_rate": 8.707044969948806e-05, "loss": 0.0134, "step": 5450 }, { "epoch": 10.811881188118813, "grad_norm": 0.22793397307395935, "learning_rate": 8.701492072068329e-05, "loss": 0.0142, "step": 5460 }, { "epoch": 10.831683168316832, "grad_norm": 0.4862902760505676, "learning_rate": 8.695929054449869e-05, "loss": 0.0149, "step": 5470 }, { "epoch": 10.851485148514852, "grad_norm": 0.5702824592590332, "learning_rate": 8.690355932302501e-05, "loss": 0.0155, "step": 5480 }, { "epoch": 10.871287128712872, "grad_norm": 0.25776103138923645, "learning_rate": 8.684772720862931e-05, "loss": 0.0142, "step": 5490 }, { "epoch": 10.891089108910892, "grad_norm": 0.308340847492218, "learning_rate": 8.679179435395446e-05, "loss": 0.0139, "step": 5500 }, { "epoch": 10.910891089108912, "grad_norm": 0.24192814528942108, "learning_rate": 8.673576091191874e-05, "loss": 0.0159, "step": 5510 }, { "epoch": 10.930693069306932, "grad_norm": 0.2719208300113678, "learning_rate": 8.667962703571541e-05, "loss": 0.0124, "step": 5520 }, { "epoch": 10.950495049504951, "grad_norm": 0.18349619209766388, "learning_rate": 8.662339287881238e-05, "loss": 0.012, "step": 5530 }, { "epoch": 10.97029702970297, "grad_norm": 0.3547595143318176, "learning_rate": 8.656705859495169e-05, "loss": 0.0131, "step": 5540 }, { "epoch": 10.990099009900991, "grad_norm": 0.23801152408123016, "learning_rate": 8.651062433814912e-05, "loss": 0.0126, "step": 5550 }, { "epoch": 11.009900990099009, "grad_norm": 0.22795484960079193, "learning_rate": 8.645409026269375e-05, "loss": 0.0136, "step": 5560 }, { "epoch": 11.029702970297029, "grad_norm": 0.5350431203842163, "learning_rate": 8.639745652314759e-05, "loss": 0.0171, "step": 5570 }, { "epoch": 11.049504950495049, "grad_norm": 0.348154753446579, "learning_rate": 8.634072327434515e-05, "loss": 0.0162, "step": 5580 }, { "epoch": 11.069306930693068, "grad_norm": 0.3100927472114563, "learning_rate": 8.628389067139294e-05, "loss": 0.0179, "step": 5590 }, { "epoch": 11.089108910891088, "grad_norm": 0.233198881149292, "learning_rate": 8.622695886966911e-05, "loss": 0.0132, "step": 5600 }, { "epoch": 11.108910891089108, "grad_norm": 0.18409600853919983, "learning_rate": 8.616992802482308e-05, "loss": 0.0157, "step": 5610 }, { "epoch": 11.128712871287128, "grad_norm": 0.2728559970855713, "learning_rate": 8.611279829277496e-05, "loss": 0.0138, "step": 5620 }, { "epoch": 11.148514851485148, "grad_norm": 0.26488369703292847, "learning_rate": 8.605556982971528e-05, "loss": 0.0145, "step": 5630 }, { "epoch": 11.168316831683168, "grad_norm": 0.22121191024780273, "learning_rate": 8.599824279210447e-05, "loss": 0.0156, "step": 5640 }, { "epoch": 11.188118811881187, "grad_norm": 0.21147963404655457, "learning_rate": 8.594081733667243e-05, "loss": 0.0125, "step": 5650 }, { "epoch": 11.207920792079207, "grad_norm": 0.16566269099712372, "learning_rate": 8.58832936204182e-05, "loss": 0.015, "step": 5660 }, { "epoch": 11.227722772277227, "grad_norm": 0.3043392300605774, "learning_rate": 8.582567180060942e-05, "loss": 0.0133, "step": 5670 }, { "epoch": 11.247524752475247, "grad_norm": 0.2762334644794464, "learning_rate": 8.576795203478194e-05, "loss": 0.0137, "step": 5680 }, { "epoch": 11.267326732673267, "grad_norm": 0.2571558952331543, "learning_rate": 8.571013448073939e-05, "loss": 0.0123, "step": 5690 }, { "epoch": 11.287128712871286, "grad_norm": 0.24326954782009125, "learning_rate": 8.565221929655275e-05, "loss": 0.0143, "step": 5700 }, { "epoch": 11.306930693069306, "grad_norm": 0.21434998512268066, "learning_rate": 8.559420664055992e-05, "loss": 0.0117, "step": 5710 }, { "epoch": 11.326732673267326, "grad_norm": 0.20057740807533264, "learning_rate": 8.553609667136532e-05, "loss": 0.0132, "step": 5720 }, { "epoch": 11.346534653465346, "grad_norm": 0.24349017441272736, "learning_rate": 8.547788954783936e-05, "loss": 0.0129, "step": 5730 }, { "epoch": 11.366336633663366, "grad_norm": 0.22026139497756958, "learning_rate": 8.541958542911808e-05, "loss": 0.0148, "step": 5740 }, { "epoch": 11.386138613861386, "grad_norm": 0.23504893481731415, "learning_rate": 8.536118447460275e-05, "loss": 0.0148, "step": 5750 }, { "epoch": 11.405940594059405, "grad_norm": 0.2152322679758072, "learning_rate": 8.530268684395932e-05, "loss": 0.0136, "step": 5760 }, { "epoch": 11.425742574257425, "grad_norm": 0.17333871126174927, "learning_rate": 8.524409269711807e-05, "loss": 0.0136, "step": 5770 }, { "epoch": 11.445544554455445, "grad_norm": 0.24790695309638977, "learning_rate": 8.51854021942732e-05, "loss": 0.0136, "step": 5780 }, { "epoch": 11.465346534653465, "grad_norm": 0.2502416968345642, "learning_rate": 8.512661549588227e-05, "loss": 0.0142, "step": 5790 }, { "epoch": 11.485148514851485, "grad_norm": 0.22821183502674103, "learning_rate": 8.506773276266588e-05, "loss": 0.0124, "step": 5800 }, { "epoch": 11.504950495049505, "grad_norm": 0.25715675950050354, "learning_rate": 8.500875415560721e-05, "loss": 0.014, "step": 5810 }, { "epoch": 11.524752475247524, "grad_norm": 0.23591601848602295, "learning_rate": 8.494967983595144e-05, "loss": 0.013, "step": 5820 }, { "epoch": 11.544554455445544, "grad_norm": 0.218551903963089, "learning_rate": 8.489050996520558e-05, "loss": 0.0133, "step": 5830 }, { "epoch": 11.564356435643564, "grad_norm": 0.2778048515319824, "learning_rate": 8.483124470513775e-05, "loss": 0.012, "step": 5840 }, { "epoch": 11.584158415841584, "grad_norm": 0.30020150542259216, "learning_rate": 8.477188421777692e-05, "loss": 0.0124, "step": 5850 }, { "epoch": 11.603960396039604, "grad_norm": 0.4553483724594116, "learning_rate": 8.47124286654124e-05, "loss": 0.0129, "step": 5860 }, { "epoch": 11.623762376237623, "grad_norm": 0.1940346509218216, "learning_rate": 8.465287821059341e-05, "loss": 0.0129, "step": 5870 }, { "epoch": 11.643564356435643, "grad_norm": 0.24346499145030975, "learning_rate": 8.45932330161286e-05, "loss": 0.0109, "step": 5880 }, { "epoch": 11.663366336633663, "grad_norm": 0.24329276382923126, "learning_rate": 8.453349324508567e-05, "loss": 0.0108, "step": 5890 }, { "epoch": 11.683168316831683, "grad_norm": 0.3456171452999115, "learning_rate": 8.447365906079088e-05, "loss": 0.01, "step": 5900 }, { "epoch": 11.702970297029703, "grad_norm": 0.43261101841926575, "learning_rate": 8.441373062682856e-05, "loss": 0.0116, "step": 5910 }, { "epoch": 11.722772277227723, "grad_norm": 0.16980113089084625, "learning_rate": 8.43537081070408e-05, "loss": 0.0115, "step": 5920 }, { "epoch": 11.742574257425742, "grad_norm": 0.3877338469028473, "learning_rate": 8.429359166552689e-05, "loss": 0.0107, "step": 5930 }, { "epoch": 11.762376237623762, "grad_norm": 0.2082972526550293, "learning_rate": 8.423338146664284e-05, "loss": 0.0141, "step": 5940 }, { "epoch": 11.782178217821782, "grad_norm": 0.21298649907112122, "learning_rate": 8.417307767500107e-05, "loss": 0.0127, "step": 5950 }, { "epoch": 11.801980198019802, "grad_norm": 0.2187129408121109, "learning_rate": 8.411268045546983e-05, "loss": 0.0136, "step": 5960 }, { "epoch": 11.821782178217822, "grad_norm": 0.25349071621894836, "learning_rate": 8.405218997317281e-05, "loss": 0.0124, "step": 5970 }, { "epoch": 11.841584158415841, "grad_norm": 0.299495667219162, "learning_rate": 8.399160639348869e-05, "loss": 0.0114, "step": 5980 }, { "epoch": 11.861386138613861, "grad_norm": 0.2479686588048935, "learning_rate": 8.393092988205065e-05, "loss": 0.0116, "step": 5990 }, { "epoch": 11.881188118811881, "grad_norm": 0.29965847730636597, "learning_rate": 8.387016060474597e-05, "loss": 0.0133, "step": 6000 }, { "epoch": 11.900990099009901, "grad_norm": 0.2075260877609253, "learning_rate": 8.380929872771551e-05, "loss": 0.0114, "step": 6010 }, { "epoch": 11.92079207920792, "grad_norm": 0.3552871346473694, "learning_rate": 8.374834441735335e-05, "loss": 0.0128, "step": 6020 }, { "epoch": 11.94059405940594, "grad_norm": 0.2796480357646942, "learning_rate": 8.368729784030622e-05, "loss": 0.0118, "step": 6030 }, { "epoch": 11.96039603960396, "grad_norm": 0.239824578166008, "learning_rate": 8.362615916347315e-05, "loss": 0.0129, "step": 6040 }, { "epoch": 11.98019801980198, "grad_norm": 0.23167861998081207, "learning_rate": 8.356492855400493e-05, "loss": 0.0123, "step": 6050 }, { "epoch": 12.0, "grad_norm": 0.23870816826820374, "learning_rate": 8.350360617930371e-05, "loss": 0.0112, "step": 6060 }, { "epoch": 12.01980198019802, "grad_norm": 0.2629878520965576, "learning_rate": 8.344219220702255e-05, "loss": 0.0121, "step": 6070 }, { "epoch": 12.03960396039604, "grad_norm": 0.2327660471200943, "learning_rate": 8.338068680506485e-05, "loss": 0.0118, "step": 6080 }, { "epoch": 12.05940594059406, "grad_norm": 0.2417145073413849, "learning_rate": 8.33190901415841e-05, "loss": 0.0149, "step": 6090 }, { "epoch": 12.07920792079208, "grad_norm": 0.2162279635667801, "learning_rate": 8.325740238498317e-05, "loss": 0.0128, "step": 6100 }, { "epoch": 12.099009900990099, "grad_norm": 0.15675456821918488, "learning_rate": 8.319562370391406e-05, "loss": 0.0106, "step": 6110 }, { "epoch": 12.118811881188119, "grad_norm": 0.22149385511875153, "learning_rate": 8.31337542672773e-05, "loss": 0.0124, "step": 6120 }, { "epoch": 12.138613861386139, "grad_norm": 0.16836456954479218, "learning_rate": 8.307179424422158e-05, "loss": 0.0128, "step": 6130 }, { "epoch": 12.158415841584159, "grad_norm": 0.1731242537498474, "learning_rate": 8.300974380414327e-05, "loss": 0.0124, "step": 6140 }, { "epoch": 12.178217821782178, "grad_norm": 0.17010290920734406, "learning_rate": 8.294760311668586e-05, "loss": 0.0113, "step": 6150 }, { "epoch": 12.198019801980198, "grad_norm": 0.1834890991449356, "learning_rate": 8.288537235173961e-05, "loss": 0.0111, "step": 6160 }, { "epoch": 12.217821782178218, "grad_norm": 0.20945511758327484, "learning_rate": 8.282305167944108e-05, "loss": 0.0112, "step": 6170 }, { "epoch": 12.237623762376238, "grad_norm": 0.3750075697898865, "learning_rate": 8.276064127017262e-05, "loss": 0.0105, "step": 6180 }, { "epoch": 12.257425742574258, "grad_norm": 0.25690194964408875, "learning_rate": 8.269814129456189e-05, "loss": 0.0115, "step": 6190 }, { "epoch": 12.277227722772277, "grad_norm": 0.2113027721643448, "learning_rate": 8.263555192348143e-05, "loss": 0.0115, "step": 6200 }, { "epoch": 12.297029702970297, "grad_norm": 0.1899217814207077, "learning_rate": 8.257287332804819e-05, "loss": 0.0116, "step": 6210 }, { "epoch": 12.316831683168317, "grad_norm": 0.19912298023700714, "learning_rate": 8.251010567962307e-05, "loss": 0.0121, "step": 6220 }, { "epoch": 12.336633663366337, "grad_norm": 0.23123537003993988, "learning_rate": 8.244724914981041e-05, "loss": 0.0125, "step": 6230 }, { "epoch": 12.356435643564357, "grad_norm": 0.297870397567749, "learning_rate": 8.238430391045757e-05, "loss": 0.0117, "step": 6240 }, { "epoch": 12.376237623762377, "grad_norm": 0.19720222055912018, "learning_rate": 8.232127013365445e-05, "loss": 0.0127, "step": 6250 }, { "epoch": 12.396039603960396, "grad_norm": 0.16797412931919098, "learning_rate": 8.225814799173295e-05, "loss": 0.0109, "step": 6260 }, { "epoch": 12.415841584158416, "grad_norm": 0.21797575056552887, "learning_rate": 8.219493765726663e-05, "loss": 0.0107, "step": 6270 }, { "epoch": 12.435643564356436, "grad_norm": 0.21759164333343506, "learning_rate": 8.21316393030701e-05, "loss": 0.0103, "step": 6280 }, { "epoch": 12.455445544554456, "grad_norm": 0.21420404314994812, "learning_rate": 8.206825310219865e-05, "loss": 0.0112, "step": 6290 }, { "epoch": 12.475247524752476, "grad_norm": 0.36851629614830017, "learning_rate": 8.200477922794776e-05, "loss": 0.012, "step": 6300 }, { "epoch": 12.495049504950495, "grad_norm": 0.19582794606685638, "learning_rate": 8.194121785385256e-05, "loss": 0.0107, "step": 6310 }, { "epoch": 12.514851485148515, "grad_norm": 0.22732238471508026, "learning_rate": 8.187756915368741e-05, "loss": 0.0104, "step": 6320 }, { "epoch": 12.534653465346535, "grad_norm": 0.3295450508594513, "learning_rate": 8.181383330146544e-05, "loss": 0.0117, "step": 6330 }, { "epoch": 12.554455445544555, "grad_norm": 0.29424139857292175, "learning_rate": 8.175001047143804e-05, "loss": 0.0113, "step": 6340 }, { "epoch": 12.574257425742575, "grad_norm": 0.19414767622947693, "learning_rate": 8.168610083809438e-05, "loss": 0.0119, "step": 6350 }, { "epoch": 12.594059405940595, "grad_norm": 0.22992756962776184, "learning_rate": 8.162210457616095e-05, "loss": 0.0131, "step": 6360 }, { "epoch": 12.613861386138614, "grad_norm": 0.25204548239707947, "learning_rate": 8.155802186060109e-05, "loss": 0.0119, "step": 6370 }, { "epoch": 12.633663366336634, "grad_norm": 0.12040138989686966, "learning_rate": 8.149385286661453e-05, "loss": 0.0107, "step": 6380 }, { "epoch": 12.653465346534654, "grad_norm": 0.15956874191761017, "learning_rate": 8.14295977696368e-05, "loss": 0.011, "step": 6390 }, { "epoch": 12.673267326732674, "grad_norm": 0.4986914098262787, "learning_rate": 8.13652567453389e-05, "loss": 0.0138, "step": 6400 }, { "epoch": 12.693069306930694, "grad_norm": 0.26661914587020874, "learning_rate": 8.130082996962676e-05, "loss": 0.0139, "step": 6410 }, { "epoch": 12.712871287128714, "grad_norm": 0.19006909430027008, "learning_rate": 8.123631761864068e-05, "loss": 0.0135, "step": 6420 }, { "epoch": 12.732673267326733, "grad_norm": 0.2721673846244812, "learning_rate": 8.1171719868755e-05, "loss": 0.0118, "step": 6430 }, { "epoch": 12.752475247524753, "grad_norm": 0.23649105429649353, "learning_rate": 8.110703689657748e-05, "loss": 0.0105, "step": 6440 }, { "epoch": 12.772277227722773, "grad_norm": 0.2284175157546997, "learning_rate": 8.104226887894892e-05, "loss": 0.0124, "step": 6450 }, { "epoch": 12.792079207920793, "grad_norm": 0.1816590428352356, "learning_rate": 8.097741599294257e-05, "loss": 0.0108, "step": 6460 }, { "epoch": 12.811881188118813, "grad_norm": 0.26563265919685364, "learning_rate": 8.091247841586378e-05, "loss": 0.011, "step": 6470 }, { "epoch": 12.831683168316832, "grad_norm": 0.6437601447105408, "learning_rate": 8.084745632524939e-05, "loss": 0.0117, "step": 6480 }, { "epoch": 12.851485148514852, "grad_norm": 0.2212037295103073, "learning_rate": 8.07823498988673e-05, "loss": 0.0107, "step": 6490 }, { "epoch": 12.871287128712872, "grad_norm": 0.25284838676452637, "learning_rate": 8.071715931471602e-05, "loss": 0.0132, "step": 6500 }, { "epoch": 12.891089108910892, "grad_norm": 0.3124699592590332, "learning_rate": 8.06518847510241e-05, "loss": 0.0122, "step": 6510 }, { "epoch": 12.910891089108912, "grad_norm": 0.20998884737491608, "learning_rate": 8.058652638624971e-05, "loss": 0.0145, "step": 6520 }, { "epoch": 12.930693069306932, "grad_norm": 0.2207033932209015, "learning_rate": 8.052108439908013e-05, "loss": 0.0124, "step": 6530 }, { "epoch": 12.950495049504951, "grad_norm": 0.22854827344417572, "learning_rate": 8.045555896843125e-05, "loss": 0.0131, "step": 6540 }, { "epoch": 12.97029702970297, "grad_norm": 0.2500267028808594, "learning_rate": 8.03899502734471e-05, "loss": 0.0104, "step": 6550 }, { "epoch": 12.990099009900991, "grad_norm": 0.2160782516002655, "learning_rate": 8.032425849349931e-05, "loss": 0.0129, "step": 6560 }, { "epoch": 13.009900990099009, "grad_norm": 0.29270634055137634, "learning_rate": 8.025848380818674e-05, "loss": 0.0124, "step": 6570 }, { "epoch": 13.029702970297029, "grad_norm": 0.21005335450172424, "learning_rate": 8.019262639733487e-05, "loss": 0.0135, "step": 6580 }, { "epoch": 13.049504950495049, "grad_norm": 0.20583274960517883, "learning_rate": 8.012668644099531e-05, "loss": 0.0106, "step": 6590 }, { "epoch": 13.069306930693068, "grad_norm": 0.3405713438987732, "learning_rate": 8.006066411944542e-05, "loss": 0.0117, "step": 6600 }, { "epoch": 13.089108910891088, "grad_norm": 0.25548723340034485, "learning_rate": 7.999455961318769e-05, "loss": 0.0107, "step": 6610 }, { "epoch": 13.108910891089108, "grad_norm": 0.2546164095401764, "learning_rate": 7.992837310294932e-05, "loss": 0.0129, "step": 6620 }, { "epoch": 13.128712871287128, "grad_norm": 0.23357543349266052, "learning_rate": 7.986210476968167e-05, "loss": 0.0137, "step": 6630 }, { "epoch": 13.148514851485148, "grad_norm": 0.2579389810562134, "learning_rate": 7.97957547945599e-05, "loss": 0.0128, "step": 6640 }, { "epoch": 13.168316831683168, "grad_norm": 0.3065495491027832, "learning_rate": 7.972932335898226e-05, "loss": 0.0139, "step": 6650 }, { "epoch": 13.188118811881187, "grad_norm": 0.2702319025993347, "learning_rate": 7.966281064456975e-05, "loss": 0.0142, "step": 6660 }, { "epoch": 13.207920792079207, "grad_norm": 0.28214165568351746, "learning_rate": 7.959621683316563e-05, "loss": 0.0113, "step": 6670 }, { "epoch": 13.227722772277227, "grad_norm": 0.2662579119205475, "learning_rate": 7.952954210683481e-05, "loss": 0.0138, "step": 6680 }, { "epoch": 13.247524752475247, "grad_norm": 0.33687835931777954, "learning_rate": 7.946278664786345e-05, "loss": 0.0128, "step": 6690 }, { "epoch": 13.267326732673267, "grad_norm": 0.22283540666103363, "learning_rate": 7.939595063875842e-05, "loss": 0.0126, "step": 6700 }, { "epoch": 13.287128712871286, "grad_norm": 0.42111968994140625, "learning_rate": 7.932903426224683e-05, "loss": 0.0143, "step": 6710 }, { "epoch": 13.306930693069306, "grad_norm": 0.23351502418518066, "learning_rate": 7.926203770127552e-05, "loss": 0.0131, "step": 6720 }, { "epoch": 13.326732673267326, "grad_norm": 0.23020583391189575, "learning_rate": 7.919496113901046e-05, "loss": 0.0138, "step": 6730 }, { "epoch": 13.346534653465346, "grad_norm": 0.23387965559959412, "learning_rate": 7.912780475883649e-05, "loss": 0.0142, "step": 6740 }, { "epoch": 13.366336633663366, "grad_norm": 0.2174062579870224, "learning_rate": 7.906056874435652e-05, "loss": 0.0143, "step": 6750 }, { "epoch": 13.386138613861386, "grad_norm": 0.26152050495147705, "learning_rate": 7.899325327939131e-05, "loss": 0.016, "step": 6760 }, { "epoch": 13.405940594059405, "grad_norm": 0.26735180616378784, "learning_rate": 7.892585854797872e-05, "loss": 0.0135, "step": 6770 }, { "epoch": 13.425742574257425, "grad_norm": 0.1835986226797104, "learning_rate": 7.88583847343734e-05, "loss": 0.0141, "step": 6780 }, { "epoch": 13.445544554455445, "grad_norm": 0.1875249296426773, "learning_rate": 7.879083202304616e-05, "loss": 0.0151, "step": 6790 }, { "epoch": 13.465346534653465, "grad_norm": 0.21644644439220428, "learning_rate": 7.872320059868355e-05, "loss": 0.0143, "step": 6800 }, { "epoch": 13.485148514851485, "grad_norm": 0.263836532831192, "learning_rate": 7.865549064618729e-05, "loss": 0.0147, "step": 6810 }, { "epoch": 13.504950495049505, "grad_norm": 0.238608255982399, "learning_rate": 7.858770235067381e-05, "loss": 0.0135, "step": 6820 }, { "epoch": 13.524752475247524, "grad_norm": 0.4344395101070404, "learning_rate": 7.851983589747374e-05, "loss": 0.012, "step": 6830 }, { "epoch": 13.544554455445544, "grad_norm": 0.34218543767929077, "learning_rate": 7.845189147213133e-05, "loss": 0.0133, "step": 6840 }, { "epoch": 13.564356435643564, "grad_norm": 0.15599778294563293, "learning_rate": 7.838386926040407e-05, "loss": 0.0117, "step": 6850 }, { "epoch": 13.584158415841584, "grad_norm": 0.2906804382801056, "learning_rate": 7.83157694482621e-05, "loss": 0.0122, "step": 6860 }, { "epoch": 13.603960396039604, "grad_norm": 0.2471197098493576, "learning_rate": 7.824759222188768e-05, "loss": 0.0119, "step": 6870 }, { "epoch": 13.623762376237623, "grad_norm": 0.3254447877407074, "learning_rate": 7.817933776767478e-05, "loss": 0.0118, "step": 6880 }, { "epoch": 13.643564356435643, "grad_norm": 0.32774725556373596, "learning_rate": 7.811100627222842e-05, "loss": 0.0138, "step": 6890 }, { "epoch": 13.663366336633663, "grad_norm": 0.2427709847688675, "learning_rate": 7.804259792236435e-05, "loss": 0.0112, "step": 6900 }, { "epoch": 13.683168316831683, "grad_norm": 0.1937781125307083, "learning_rate": 7.797411290510835e-05, "loss": 0.0115, "step": 6910 }, { "epoch": 13.702970297029703, "grad_norm": 0.29669246077537537, "learning_rate": 7.790555140769586e-05, "loss": 0.0145, "step": 6920 }, { "epoch": 13.722772277227723, "grad_norm": 0.24872153997421265, "learning_rate": 7.78369136175714e-05, "loss": 0.0129, "step": 6930 }, { "epoch": 13.742574257425742, "grad_norm": 0.1797112226486206, "learning_rate": 7.776819972238806e-05, "loss": 0.012, "step": 6940 }, { "epoch": 13.762376237623762, "grad_norm": 0.20943115651607513, "learning_rate": 7.7699409910007e-05, "loss": 0.0111, "step": 6950 }, { "epoch": 13.782178217821782, "grad_norm": 0.20351198315620422, "learning_rate": 7.763054436849694e-05, "loss": 0.0111, "step": 6960 }, { "epoch": 13.801980198019802, "grad_norm": 0.2201082557439804, "learning_rate": 7.756160328613364e-05, "loss": 0.0119, "step": 6970 }, { "epoch": 13.821782178217822, "grad_norm": 0.23532699048519135, "learning_rate": 7.749258685139942e-05, "loss": 0.0121, "step": 6980 }, { "epoch": 13.841584158415841, "grad_norm": 0.26032164692878723, "learning_rate": 7.742349525298253e-05, "loss": 0.011, "step": 6990 }, { "epoch": 13.861386138613861, "grad_norm": 0.35262420773506165, "learning_rate": 7.735432867977679e-05, "loss": 0.0101, "step": 7000 }, { "epoch": 13.881188118811881, "grad_norm": 0.24596558511257172, "learning_rate": 7.728508732088096e-05, "loss": 0.0105, "step": 7010 }, { "epoch": 13.900990099009901, "grad_norm": 0.15614156424999237, "learning_rate": 7.721577136559825e-05, "loss": 0.0106, "step": 7020 }, { "epoch": 13.92079207920792, "grad_norm": 0.18521460890769958, "learning_rate": 7.714638100343588e-05, "loss": 0.0111, "step": 7030 }, { "epoch": 13.94059405940594, "grad_norm": 0.20374846458435059, "learning_rate": 7.707691642410444e-05, "loss": 0.0097, "step": 7040 }, { "epoch": 13.96039603960396, "grad_norm": 0.19179049134254456, "learning_rate": 7.70073778175174e-05, "loss": 0.0107, "step": 7050 }, { "epoch": 13.98019801980198, "grad_norm": 0.17555810511112213, "learning_rate": 7.69377653737907e-05, "loss": 0.0096, "step": 7060 }, { "epoch": 14.0, "grad_norm": 0.3304188549518585, "learning_rate": 7.686807928324209e-05, "loss": 0.009, "step": 7070 }, { "epoch": 14.01980198019802, "grad_norm": 0.1693524420261383, "learning_rate": 7.679831973639065e-05, "loss": 0.0107, "step": 7080 }, { "epoch": 14.03960396039604, "grad_norm": 0.23313014209270477, "learning_rate": 7.672848692395637e-05, "loss": 0.0111, "step": 7090 }, { "epoch": 14.05940594059406, "grad_norm": 0.18295225501060486, "learning_rate": 7.665858103685944e-05, "loss": 0.0108, "step": 7100 }, { "epoch": 14.07920792079208, "grad_norm": 0.18685941398143768, "learning_rate": 7.658860226621991e-05, "loss": 0.0108, "step": 7110 }, { "epoch": 14.099009900990099, "grad_norm": 0.18878231942653656, "learning_rate": 7.651855080335708e-05, "loss": 0.0106, "step": 7120 }, { "epoch": 14.118811881188119, "grad_norm": 0.198191300034523, "learning_rate": 7.644842683978896e-05, "loss": 0.0102, "step": 7130 }, { "epoch": 14.138613861386139, "grad_norm": 0.30171942710876465, "learning_rate": 7.63782305672318e-05, "loss": 0.012, "step": 7140 }, { "epoch": 14.158415841584159, "grad_norm": 0.24142993986606598, "learning_rate": 7.63079621775995e-05, "loss": 0.0096, "step": 7150 }, { "epoch": 14.178217821782178, "grad_norm": 0.18350423872470856, "learning_rate": 7.623762186300319e-05, "loss": 0.0097, "step": 7160 }, { "epoch": 14.198019801980198, "grad_norm": 0.17738915979862213, "learning_rate": 7.616720981575057e-05, "loss": 0.0105, "step": 7170 }, { "epoch": 14.217821782178218, "grad_norm": 0.22379957139492035, "learning_rate": 7.609672622834552e-05, "loss": 0.0124, "step": 7180 }, { "epoch": 14.237623762376238, "grad_norm": 0.23368655145168304, "learning_rate": 7.602617129348747e-05, "loss": 0.0107, "step": 7190 }, { "epoch": 14.257425742574258, "grad_norm": 0.1884390115737915, "learning_rate": 7.595554520407088e-05, "loss": 0.0121, "step": 7200 }, { "epoch": 14.277227722772277, "grad_norm": 0.19684326648712158, "learning_rate": 7.588484815318484e-05, "loss": 0.0115, "step": 7210 }, { "epoch": 14.297029702970297, "grad_norm": 0.311718225479126, "learning_rate": 7.581408033411234e-05, "loss": 0.0125, "step": 7220 }, { "epoch": 14.316831683168317, "grad_norm": 0.22057801485061646, "learning_rate": 7.574324194032995e-05, "loss": 0.0095, "step": 7230 }, { "epoch": 14.336633663366337, "grad_norm": 0.18872341513633728, "learning_rate": 7.567233316550705e-05, "loss": 0.0137, "step": 7240 }, { "epoch": 14.356435643564357, "grad_norm": 0.23562487959861755, "learning_rate": 7.560135420350562e-05, "loss": 0.0124, "step": 7250 }, { "epoch": 14.376237623762377, "grad_norm": 0.1655789464712143, "learning_rate": 7.553030524837935e-05, "loss": 0.0123, "step": 7260 }, { "epoch": 14.396039603960396, "grad_norm": 0.1875065416097641, "learning_rate": 7.545918649437341e-05, "loss": 0.01, "step": 7270 }, { "epoch": 14.415841584158416, "grad_norm": 0.27464863657951355, "learning_rate": 7.538799813592377e-05, "loss": 0.0116, "step": 7280 }, { "epoch": 14.435643564356436, "grad_norm": 0.3102843463420868, "learning_rate": 7.531674036765662e-05, "loss": 0.0124, "step": 7290 }, { "epoch": 14.455445544554456, "grad_norm": 0.20326678454875946, "learning_rate": 7.524541338438807e-05, "loss": 0.0107, "step": 7300 }, { "epoch": 14.475247524752476, "grad_norm": 0.27935683727264404, "learning_rate": 7.517401738112328e-05, "loss": 0.0107, "step": 7310 }, { "epoch": 14.495049504950495, "grad_norm": 0.22154977917671204, "learning_rate": 7.510255255305628e-05, "loss": 0.0107, "step": 7320 }, { "epoch": 14.514851485148515, "grad_norm": 0.26573193073272705, "learning_rate": 7.503101909556911e-05, "loss": 0.013, "step": 7330 }, { "epoch": 14.534653465346535, "grad_norm": 0.27580296993255615, "learning_rate": 7.495941720423154e-05, "loss": 0.012, "step": 7340 }, { "epoch": 14.554455445544555, "grad_norm": 0.30780431628227234, "learning_rate": 7.488774707480042e-05, "loss": 0.0107, "step": 7350 }, { "epoch": 14.574257425742575, "grad_norm": 0.25496143102645874, "learning_rate": 7.481600890321911e-05, "loss": 0.0114, "step": 7360 }, { "epoch": 14.594059405940595, "grad_norm": 0.23181608319282532, "learning_rate": 7.474420288561708e-05, "loss": 0.014, "step": 7370 }, { "epoch": 14.613861386138614, "grad_norm": 0.27790629863739014, "learning_rate": 7.467232921830921e-05, "loss": 0.0123, "step": 7380 }, { "epoch": 14.633663366336634, "grad_norm": 0.29614245891571045, "learning_rate": 7.460038809779537e-05, "loss": 0.0117, "step": 7390 }, { "epoch": 14.653465346534654, "grad_norm": 0.24413028359413147, "learning_rate": 7.452837972075983e-05, "loss": 0.012, "step": 7400 }, { "epoch": 14.673267326732674, "grad_norm": 0.1658088117837906, "learning_rate": 7.445630428407074e-05, "loss": 0.0106, "step": 7410 }, { "epoch": 14.693069306930694, "grad_norm": 0.16552011668682098, "learning_rate": 7.43841619847796e-05, "loss": 0.0098, "step": 7420 }, { "epoch": 14.712871287128714, "grad_norm": 0.3081233501434326, "learning_rate": 7.431195302012072e-05, "loss": 0.0103, "step": 7430 }, { "epoch": 14.732673267326733, "grad_norm": 0.26699939370155334, "learning_rate": 7.423967758751061e-05, "loss": 0.0102, "step": 7440 }, { "epoch": 14.752475247524753, "grad_norm": 0.14852634072303772, "learning_rate": 7.416733588454758e-05, "loss": 0.0096, "step": 7450 }, { "epoch": 14.772277227722773, "grad_norm": 0.15867866575717926, "learning_rate": 7.409492810901106e-05, "loss": 0.0111, "step": 7460 }, { "epoch": 14.792079207920793, "grad_norm": 0.27476492524147034, "learning_rate": 7.402245445886116e-05, "loss": 0.0096, "step": 7470 }, { "epoch": 14.811881188118813, "grad_norm": 0.2551340162754059, "learning_rate": 7.394991513223806e-05, "loss": 0.0109, "step": 7480 }, { "epoch": 14.831683168316832, "grad_norm": 0.2889328896999359, "learning_rate": 7.38773103274615e-05, "loss": 0.0105, "step": 7490 }, { "epoch": 14.851485148514852, "grad_norm": 0.4670482575893402, "learning_rate": 7.380464024303028e-05, "loss": 0.009, "step": 7500 }, { "epoch": 14.871287128712872, "grad_norm": 0.22646751999855042, "learning_rate": 7.373190507762162e-05, "loss": 0.0109, "step": 7510 }, { "epoch": 14.891089108910892, "grad_norm": 0.2021467685699463, "learning_rate": 7.365910503009066e-05, "loss": 0.0105, "step": 7520 }, { "epoch": 14.910891089108912, "grad_norm": 0.21203374862670898, "learning_rate": 7.358624029946996e-05, "loss": 0.0113, "step": 7530 }, { "epoch": 14.930693069306932, "grad_norm": 0.24363644421100616, "learning_rate": 7.351331108496893e-05, "loss": 0.0105, "step": 7540 }, { "epoch": 14.950495049504951, "grad_norm": 0.32484498620033264, "learning_rate": 7.344031758597325e-05, "loss": 0.0108, "step": 7550 }, { "epoch": 14.97029702970297, "grad_norm": 0.32126012444496155, "learning_rate": 7.336726000204435e-05, "loss": 0.0117, "step": 7560 }, { "epoch": 14.990099009900991, "grad_norm": 0.2513757050037384, "learning_rate": 7.32941385329189e-05, "loss": 0.013, "step": 7570 }, { "epoch": 15.009900990099009, "grad_norm": 0.19551318883895874, "learning_rate": 7.322095337850816e-05, "loss": 0.0116, "step": 7580 }, { "epoch": 15.029702970297029, "grad_norm": 0.24349740147590637, "learning_rate": 7.314770473889758e-05, "loss": 0.0136, "step": 7590 }, { "epoch": 15.049504950495049, "grad_norm": 0.13463565707206726, "learning_rate": 7.307439281434615e-05, "loss": 0.0113, "step": 7600 }, { "epoch": 15.069306930693068, "grad_norm": 0.2078399807214737, "learning_rate": 7.300101780528585e-05, "loss": 0.0118, "step": 7610 }, { "epoch": 15.089108910891088, "grad_norm": 0.22751915454864502, "learning_rate": 7.292757991232117e-05, "loss": 0.0112, "step": 7620 }, { "epoch": 15.108910891089108, "grad_norm": 0.17030200362205505, "learning_rate": 7.285407933622848e-05, "loss": 0.0111, "step": 7630 }, { "epoch": 15.128712871287128, "grad_norm": 0.22684140503406525, "learning_rate": 7.278051627795557e-05, "loss": 0.012, "step": 7640 }, { "epoch": 15.148514851485148, "grad_norm": 0.17248670756816864, "learning_rate": 7.270689093862105e-05, "loss": 0.0108, "step": 7650 }, { "epoch": 15.168316831683168, "grad_norm": 0.14869092404842377, "learning_rate": 7.263320351951374e-05, "loss": 0.0106, "step": 7660 }, { "epoch": 15.188118811881187, "grad_norm": 0.23264428973197937, "learning_rate": 7.255945422209227e-05, "loss": 0.0104, "step": 7670 }, { "epoch": 15.207920792079207, "grad_norm": 0.21513259410858154, "learning_rate": 7.248564324798437e-05, "loss": 0.0114, "step": 7680 }, { "epoch": 15.227722772277227, "grad_norm": 0.19899888336658478, "learning_rate": 7.241177079898644e-05, "loss": 0.0099, "step": 7690 }, { "epoch": 15.247524752475247, "grad_norm": 0.24870173633098602, "learning_rate": 7.233783707706295e-05, "loss": 0.0096, "step": 7700 }, { "epoch": 15.267326732673267, "grad_norm": 0.26383036375045776, "learning_rate": 7.226384228434586e-05, "loss": 0.0115, "step": 7710 }, { "epoch": 15.287128712871286, "grad_norm": 0.2211935818195343, "learning_rate": 7.21897866231341e-05, "loss": 0.0107, "step": 7720 }, { "epoch": 15.306930693069306, "grad_norm": 0.20200009644031525, "learning_rate": 7.211567029589303e-05, "loss": 0.0099, "step": 7730 }, { "epoch": 15.326732673267326, "grad_norm": 0.23284216225147247, "learning_rate": 7.204149350525387e-05, "loss": 0.0093, "step": 7740 }, { "epoch": 15.346534653465346, "grad_norm": 0.29052281379699707, "learning_rate": 7.196725645401309e-05, "loss": 0.0115, "step": 7750 }, { "epoch": 15.366336633663366, "grad_norm": 0.25089380145072937, "learning_rate": 7.1892959345132e-05, "loss": 0.0108, "step": 7760 }, { "epoch": 15.386138613861386, "grad_norm": 0.22715115547180176, "learning_rate": 7.181860238173605e-05, "loss": 0.0099, "step": 7770 }, { "epoch": 15.405940594059405, "grad_norm": 0.25397464632987976, "learning_rate": 7.174418576711432e-05, "loss": 0.0129, "step": 7780 }, { "epoch": 15.425742574257425, "grad_norm": 0.17592357099056244, "learning_rate": 7.1669709704719e-05, "loss": 0.0114, "step": 7790 }, { "epoch": 15.445544554455445, "grad_norm": 0.22582948207855225, "learning_rate": 7.159517439816481e-05, "loss": 0.0095, "step": 7800 }, { "epoch": 15.465346534653465, "grad_norm": 0.17772196233272552, "learning_rate": 7.152058005122842e-05, "loss": 0.0102, "step": 7810 }, { "epoch": 15.485148514851485, "grad_norm": 0.2062048763036728, "learning_rate": 7.144592686784793e-05, "loss": 0.0097, "step": 7820 }, { "epoch": 15.504950495049505, "grad_norm": 0.18263675272464752, "learning_rate": 7.137121505212229e-05, "loss": 0.0086, "step": 7830 }, { "epoch": 15.524752475247524, "grad_norm": 0.3096776306629181, "learning_rate": 7.129644480831077e-05, "loss": 0.0106, "step": 7840 }, { "epoch": 15.544554455445544, "grad_norm": 0.21245868504047394, "learning_rate": 7.122161634083234e-05, "loss": 0.0109, "step": 7850 }, { "epoch": 15.564356435643564, "grad_norm": 0.22532548010349274, "learning_rate": 7.114672985426516e-05, "loss": 0.0097, "step": 7860 }, { "epoch": 15.584158415841584, "grad_norm": 0.28283873200416565, "learning_rate": 7.107178555334606e-05, "loss": 0.0097, "step": 7870 }, { "epoch": 15.603960396039604, "grad_norm": 0.17164696753025055, "learning_rate": 7.099678364296989e-05, "loss": 0.0089, "step": 7880 }, { "epoch": 15.623762376237623, "grad_norm": 0.192380890250206, "learning_rate": 7.0921724328189e-05, "loss": 0.0096, "step": 7890 }, { "epoch": 15.643564356435643, "grad_norm": 0.1681312620639801, "learning_rate": 7.084660781421268e-05, "loss": 0.01, "step": 7900 }, { "epoch": 15.663366336633663, "grad_norm": 0.23007680475711823, "learning_rate": 7.077143430640662e-05, "loss": 0.0108, "step": 7910 }, { "epoch": 15.683168316831683, "grad_norm": 0.18256177008152008, "learning_rate": 7.069620401029232e-05, "loss": 0.0094, "step": 7920 }, { "epoch": 15.702970297029703, "grad_norm": 0.19984260201454163, "learning_rate": 7.062091713154655e-05, "loss": 0.0105, "step": 7930 }, { "epoch": 15.722772277227723, "grad_norm": 0.3185005486011505, "learning_rate": 7.054557387600075e-05, "loss": 0.0104, "step": 7940 }, { "epoch": 15.742574257425742, "grad_norm": 0.20442378520965576, "learning_rate": 7.04701744496405e-05, "loss": 0.0096, "step": 7950 }, { "epoch": 15.762376237623762, "grad_norm": 0.22411030530929565, "learning_rate": 7.039471905860495e-05, "loss": 0.0107, "step": 7960 }, { "epoch": 15.782178217821782, "grad_norm": 0.16568946838378906, "learning_rate": 7.031920790918628e-05, "loss": 0.0107, "step": 7970 }, { "epoch": 15.801980198019802, "grad_norm": 0.22677020728588104, "learning_rate": 7.024364120782906e-05, "loss": 0.0122, "step": 7980 }, { "epoch": 15.821782178217822, "grad_norm": 0.18990078568458557, "learning_rate": 7.016801916112978e-05, "loss": 0.0098, "step": 7990 }, { "epoch": 15.841584158415841, "grad_norm": 0.25340116024017334, "learning_rate": 7.009234197583623e-05, "loss": 0.0104, "step": 8000 }, { "epoch": 15.861386138613861, "grad_norm": 0.22137422859668732, "learning_rate": 7.001660985884692e-05, "loss": 0.0091, "step": 8010 }, { "epoch": 15.881188118811881, "grad_norm": 0.26442253589630127, "learning_rate": 6.994082301721063e-05, "loss": 0.009, "step": 8020 }, { "epoch": 15.900990099009901, "grad_norm": 0.1322186291217804, "learning_rate": 6.986498165812563e-05, "loss": 0.0096, "step": 8030 }, { "epoch": 15.92079207920792, "grad_norm": 0.19783268868923187, "learning_rate": 6.978908598893932e-05, "loss": 0.0099, "step": 8040 }, { "epoch": 15.94059405940594, "grad_norm": 0.42534226179122925, "learning_rate": 6.971313621714756e-05, "loss": 0.0108, "step": 8050 }, { "epoch": 15.96039603960396, "grad_norm": 0.37249135971069336, "learning_rate": 6.96371325503941e-05, "loss": 0.0112, "step": 8060 }, { "epoch": 15.98019801980198, "grad_norm": 0.2563101649284363, "learning_rate": 6.956107519647014e-05, "loss": 0.0119, "step": 8070 }, { "epoch": 16.0, "grad_norm": 0.529987633228302, "learning_rate": 6.94849643633135e-05, "loss": 0.0119, "step": 8080 }, { "epoch": 16.019801980198018, "grad_norm": 0.23660051822662354, "learning_rate": 6.940880025900834e-05, "loss": 0.0126, "step": 8090 }, { "epoch": 16.03960396039604, "grad_norm": 0.3192079961299896, "learning_rate": 6.933258309178438e-05, "loss": 0.0135, "step": 8100 }, { "epoch": 16.059405940594058, "grad_norm": 0.26338422298431396, "learning_rate": 6.925631307001646e-05, "loss": 0.0141, "step": 8110 }, { "epoch": 16.07920792079208, "grad_norm": 0.2774793803691864, "learning_rate": 6.91799904022239e-05, "loss": 0.0131, "step": 8120 }, { "epoch": 16.099009900990097, "grad_norm": 0.23682624101638794, "learning_rate": 6.910361529706997e-05, "loss": 0.0121, "step": 8130 }, { "epoch": 16.11881188118812, "grad_norm": 0.3061884939670563, "learning_rate": 6.902718796336131e-05, "loss": 0.0112, "step": 8140 }, { "epoch": 16.138613861386137, "grad_norm": 0.3166359066963196, "learning_rate": 6.895070861004729e-05, "loss": 0.0114, "step": 8150 }, { "epoch": 16.15841584158416, "grad_norm": 0.25321608781814575, "learning_rate": 6.887417744621956e-05, "loss": 0.0113, "step": 8160 }, { "epoch": 16.178217821782177, "grad_norm": 0.15376923978328705, "learning_rate": 6.87975946811114e-05, "loss": 0.0104, "step": 8170 }, { "epoch": 16.198019801980198, "grad_norm": 0.21338653564453125, "learning_rate": 6.872096052409718e-05, "loss": 0.0097, "step": 8180 }, { "epoch": 16.217821782178216, "grad_norm": 0.1560230702161789, "learning_rate": 6.864427518469174e-05, "loss": 0.0093, "step": 8190 }, { "epoch": 16.237623762376238, "grad_norm": 0.2059134989976883, "learning_rate": 6.856753887254986e-05, "loss": 0.01, "step": 8200 }, { "epoch": 16.257425742574256, "grad_norm": 0.13234414160251617, "learning_rate": 6.849075179746572e-05, "loss": 0.0097, "step": 8210 }, { "epoch": 16.277227722772277, "grad_norm": 0.2226133644580841, "learning_rate": 6.841391416937221e-05, "loss": 0.0102, "step": 8220 }, { "epoch": 16.297029702970296, "grad_norm": 0.14034296572208405, "learning_rate": 6.833702619834053e-05, "loss": 0.0078, "step": 8230 }, { "epoch": 16.316831683168317, "grad_norm": 0.17853978276252747, "learning_rate": 6.82600880945794e-05, "loss": 0.0078, "step": 8240 }, { "epoch": 16.336633663366335, "grad_norm": 0.22148948907852173, "learning_rate": 6.818310006843468e-05, "loss": 0.0088, "step": 8250 }, { "epoch": 16.356435643564357, "grad_norm": 0.2042260617017746, "learning_rate": 6.810606233038868e-05, "loss": 0.0104, "step": 8260 }, { "epoch": 16.376237623762375, "grad_norm": 0.16738636791706085, "learning_rate": 6.802897509105966e-05, "loss": 0.0091, "step": 8270 }, { "epoch": 16.396039603960396, "grad_norm": 0.16254325211048126, "learning_rate": 6.79518385612012e-05, "loss": 0.0093, "step": 8280 }, { "epoch": 16.415841584158414, "grad_norm": 0.19427767395973206, "learning_rate": 6.787465295170157e-05, "loss": 0.0096, "step": 8290 }, { "epoch": 16.435643564356436, "grad_norm": 0.15728606283664703, "learning_rate": 6.779741847358332e-05, "loss": 0.009, "step": 8300 }, { "epoch": 16.455445544554454, "grad_norm": 0.19666752219200134, "learning_rate": 6.772013533800256e-05, "loss": 0.0095, "step": 8310 }, { "epoch": 16.475247524752476, "grad_norm": 0.21026422083377838, "learning_rate": 6.764280375624843e-05, "loss": 0.0101, "step": 8320 }, { "epoch": 16.495049504950494, "grad_norm": 0.17693039774894714, "learning_rate": 6.756542393974252e-05, "loss": 0.0093, "step": 8330 }, { "epoch": 16.514851485148515, "grad_norm": 0.2008696049451828, "learning_rate": 6.748799610003828e-05, "loss": 0.0099, "step": 8340 }, { "epoch": 16.534653465346533, "grad_norm": 0.20574437081813812, "learning_rate": 6.741052044882048e-05, "loss": 0.0092, "step": 8350 }, { "epoch": 16.554455445544555, "grad_norm": 0.1645479053258896, "learning_rate": 6.73329971979046e-05, "loss": 0.0088, "step": 8360 }, { "epoch": 16.574257425742573, "grad_norm": 0.15794390439987183, "learning_rate": 6.725542655923625e-05, "loss": 0.0095, "step": 8370 }, { "epoch": 16.594059405940595, "grad_norm": 0.22251921892166138, "learning_rate": 6.717780874489057e-05, "loss": 0.0096, "step": 8380 }, { "epoch": 16.613861386138613, "grad_norm": 0.21368074417114258, "learning_rate": 6.710014396707172e-05, "loss": 0.0081, "step": 8390 }, { "epoch": 16.633663366336634, "grad_norm": 0.24377241730690002, "learning_rate": 6.702243243811221e-05, "loss": 0.0113, "step": 8400 }, { "epoch": 16.653465346534652, "grad_norm": 0.2374519556760788, "learning_rate": 6.694467437047244e-05, "loss": 0.0094, "step": 8410 }, { "epoch": 16.673267326732674, "grad_norm": 0.23884069919586182, "learning_rate": 6.686686997673997e-05, "loss": 0.0111, "step": 8420 }, { "epoch": 16.693069306930692, "grad_norm": 0.19424720108509064, "learning_rate": 6.678901946962903e-05, "loss": 0.0093, "step": 8430 }, { "epoch": 16.712871287128714, "grad_norm": 0.2666163146495819, "learning_rate": 6.671112306197996e-05, "loss": 0.0099, "step": 8440 }, { "epoch": 16.73267326732673, "grad_norm": 0.1983659714460373, "learning_rate": 6.663318096675854e-05, "loss": 0.0092, "step": 8450 }, { "epoch": 16.752475247524753, "grad_norm": 0.2002350389957428, "learning_rate": 6.655519339705552e-05, "loss": 0.011, "step": 8460 }, { "epoch": 16.77227722772277, "grad_norm": 0.2524837553501129, "learning_rate": 6.647716056608588e-05, "loss": 0.0119, "step": 8470 }, { "epoch": 16.792079207920793, "grad_norm": 0.37005963921546936, "learning_rate": 6.639908268718843e-05, "loss": 0.009, "step": 8480 }, { "epoch": 16.81188118811881, "grad_norm": 0.22124344110488892, "learning_rate": 6.632095997382514e-05, "loss": 0.0083, "step": 8490 }, { "epoch": 16.831683168316832, "grad_norm": 0.3052222430706024, "learning_rate": 6.624279263958047e-05, "loss": 0.0104, "step": 8500 }, { "epoch": 16.85148514851485, "grad_norm": 0.1759086400270462, "learning_rate": 6.616458089816097e-05, "loss": 0.0097, "step": 8510 }, { "epoch": 16.871287128712872, "grad_norm": 0.17480289936065674, "learning_rate": 6.608632496339454e-05, "loss": 0.0097, "step": 8520 }, { "epoch": 16.89108910891089, "grad_norm": 0.17170780897140503, "learning_rate": 6.600802504922988e-05, "loss": 0.0099, "step": 8530 }, { "epoch": 16.91089108910891, "grad_norm": 0.20651361346244812, "learning_rate": 6.592968136973604e-05, "loss": 0.0094, "step": 8540 }, { "epoch": 16.93069306930693, "grad_norm": 0.19722086191177368, "learning_rate": 6.585129413910159e-05, "loss": 0.0102, "step": 8550 }, { "epoch": 16.95049504950495, "grad_norm": 0.14216971397399902, "learning_rate": 6.577286357163424e-05, "loss": 0.0087, "step": 8560 }, { "epoch": 16.97029702970297, "grad_norm": 0.23216353356838226, "learning_rate": 6.569438988176018e-05, "loss": 0.0104, "step": 8570 }, { "epoch": 16.99009900990099, "grad_norm": 0.22564396262168884, "learning_rate": 6.561587328402347e-05, "loss": 0.0115, "step": 8580 }, { "epoch": 17.00990099009901, "grad_norm": 0.2875814139842987, "learning_rate": 6.553731399308549e-05, "loss": 0.0121, "step": 8590 }, { "epoch": 17.02970297029703, "grad_norm": 0.24947169423103333, "learning_rate": 6.545871222372436e-05, "loss": 0.0111, "step": 8600 }, { "epoch": 17.04950495049505, "grad_norm": 0.3035160005092621, "learning_rate": 6.538006819083426e-05, "loss": 0.0125, "step": 8610 }, { "epoch": 17.06930693069307, "grad_norm": 0.23474198579788208, "learning_rate": 6.530138210942505e-05, "loss": 0.0123, "step": 8620 }, { "epoch": 17.08910891089109, "grad_norm": 0.24250160157680511, "learning_rate": 6.522265419462141e-05, "loss": 0.0104, "step": 8630 }, { "epoch": 17.10891089108911, "grad_norm": 0.24698425829410553, "learning_rate": 6.514388466166248e-05, "loss": 0.012, "step": 8640 }, { "epoch": 17.128712871287128, "grad_norm": 0.28064265847206116, "learning_rate": 6.506507372590119e-05, "loss": 0.0105, "step": 8650 }, { "epoch": 17.14851485148515, "grad_norm": 0.35751840472221375, "learning_rate": 6.498622160280355e-05, "loss": 0.0103, "step": 8660 }, { "epoch": 17.168316831683168, "grad_norm": 0.27020135521888733, "learning_rate": 6.490732850794832e-05, "loss": 0.0103, "step": 8670 }, { "epoch": 17.18811881188119, "grad_norm": 0.2236364781856537, "learning_rate": 6.482839465702616e-05, "loss": 0.0101, "step": 8680 }, { "epoch": 17.207920792079207, "grad_norm": 0.22509904205799103, "learning_rate": 6.474942026583923e-05, "loss": 0.0119, "step": 8690 }, { "epoch": 17.22772277227723, "grad_norm": 0.24839846789836884, "learning_rate": 6.467040555030052e-05, "loss": 0.0092, "step": 8700 }, { "epoch": 17.247524752475247, "grad_norm": 0.17933028936386108, "learning_rate": 6.459135072643321e-05, "loss": 0.0088, "step": 8710 }, { "epoch": 17.26732673267327, "grad_norm": 0.19964472949504852, "learning_rate": 6.451225601037019e-05, "loss": 0.0096, "step": 8720 }, { "epoch": 17.287128712871286, "grad_norm": 0.20776866376399994, "learning_rate": 6.443312161835338e-05, "loss": 0.012, "step": 8730 }, { "epoch": 17.306930693069308, "grad_norm": 0.18409863114356995, "learning_rate": 6.43539477667332e-05, "loss": 0.0098, "step": 8740 }, { "epoch": 17.326732673267326, "grad_norm": 0.2983393669128418, "learning_rate": 6.427473467196793e-05, "loss": 0.0088, "step": 8750 }, { "epoch": 17.346534653465348, "grad_norm": 0.22706469893455505, "learning_rate": 6.419548255062315e-05, "loss": 0.0104, "step": 8760 }, { "epoch": 17.366336633663366, "grad_norm": 0.34341397881507874, "learning_rate": 6.411619161937112e-05, "loss": 0.0088, "step": 8770 }, { "epoch": 17.386138613861387, "grad_norm": 0.19944937527179718, "learning_rate": 6.403686209499022e-05, "loss": 0.0096, "step": 8780 }, { "epoch": 17.405940594059405, "grad_norm": 0.20025433599948883, "learning_rate": 6.395749419436437e-05, "loss": 0.0126, "step": 8790 }, { "epoch": 17.425742574257427, "grad_norm": 0.17424410581588745, "learning_rate": 6.387808813448234e-05, "loss": 0.0102, "step": 8800 }, { "epoch": 17.445544554455445, "grad_norm": 0.22727754712104797, "learning_rate": 6.37986441324373e-05, "loss": 0.0093, "step": 8810 }, { "epoch": 17.465346534653467, "grad_norm": 0.29664111137390137, "learning_rate": 6.37191624054261e-05, "loss": 0.0093, "step": 8820 }, { "epoch": 17.485148514851485, "grad_norm": 0.33922573924064636, "learning_rate": 6.363964317074872e-05, "loss": 0.0088, "step": 8830 }, { "epoch": 17.504950495049506, "grad_norm": 0.20484396815299988, "learning_rate": 6.356008664580776e-05, "loss": 0.0091, "step": 8840 }, { "epoch": 17.524752475247524, "grad_norm": 0.2183717042207718, "learning_rate": 6.348049304810771e-05, "loss": 0.0111, "step": 8850 }, { "epoch": 17.544554455445546, "grad_norm": 0.1880095899105072, "learning_rate": 6.340086259525442e-05, "loss": 0.0088, "step": 8860 }, { "epoch": 17.564356435643564, "grad_norm": 0.2122930884361267, "learning_rate": 6.332119550495448e-05, "loss": 0.011, "step": 8870 }, { "epoch": 17.584158415841586, "grad_norm": 0.21541570127010345, "learning_rate": 6.324149199501473e-05, "loss": 0.0096, "step": 8880 }, { "epoch": 17.603960396039604, "grad_norm": 0.18243514001369476, "learning_rate": 6.316175228334146e-05, "loss": 0.0094, "step": 8890 }, { "epoch": 17.623762376237625, "grad_norm": 0.16934598982334137, "learning_rate": 6.308197658794003e-05, "loss": 0.0103, "step": 8900 }, { "epoch": 17.643564356435643, "grad_norm": 0.1513604372739792, "learning_rate": 6.300216512691417e-05, "loss": 0.0091, "step": 8910 }, { "epoch": 17.663366336633665, "grad_norm": 0.24457184970378876, "learning_rate": 6.292231811846532e-05, "loss": 0.01, "step": 8920 }, { "epoch": 17.683168316831683, "grad_norm": 0.2765200734138489, "learning_rate": 6.284243578089217e-05, "loss": 0.0094, "step": 8930 }, { "epoch": 17.702970297029704, "grad_norm": 0.2188812643289566, "learning_rate": 6.276251833258999e-05, "loss": 0.0126, "step": 8940 }, { "epoch": 17.722772277227723, "grad_norm": 0.21763142943382263, "learning_rate": 6.268256599205003e-05, "loss": 0.0099, "step": 8950 }, { "epoch": 17.742574257425744, "grad_norm": 0.24788948893547058, "learning_rate": 6.260257897785892e-05, "loss": 0.01, "step": 8960 }, { "epoch": 17.762376237623762, "grad_norm": 0.18700334429740906, "learning_rate": 6.252255750869811e-05, "loss": 0.0091, "step": 8970 }, { "epoch": 17.782178217821784, "grad_norm": 0.2876586318016052, "learning_rate": 6.244250180334325e-05, "loss": 0.0113, "step": 8980 }, { "epoch": 17.801980198019802, "grad_norm": 0.2031906247138977, "learning_rate": 6.236241208066356e-05, "loss": 0.0099, "step": 8990 }, { "epoch": 17.821782178217823, "grad_norm": 0.3201976716518402, "learning_rate": 6.228228855962133e-05, "loss": 0.0111, "step": 9000 }, { "epoch": 17.84158415841584, "grad_norm": 0.18545329570770264, "learning_rate": 6.220213145927115e-05, "loss": 0.0101, "step": 9010 }, { "epoch": 17.861386138613863, "grad_norm": 0.15168237686157227, "learning_rate": 6.212194099875951e-05, "loss": 0.012, "step": 9020 }, { "epoch": 17.88118811881188, "grad_norm": 0.1707606166601181, "learning_rate": 6.204171739732405e-05, "loss": 0.0115, "step": 9030 }, { "epoch": 17.900990099009903, "grad_norm": 0.337006539106369, "learning_rate": 6.196146087429303e-05, "loss": 0.0112, "step": 9040 }, { "epoch": 17.92079207920792, "grad_norm": 0.16644635796546936, "learning_rate": 6.188117164908474e-05, "loss": 0.0098, "step": 9050 }, { "epoch": 17.94059405940594, "grad_norm": 0.15781202912330627, "learning_rate": 6.180084994120684e-05, "loss": 0.0078, "step": 9060 }, { "epoch": 17.96039603960396, "grad_norm": 0.19033648073673248, "learning_rate": 6.17204959702558e-05, "loss": 0.0094, "step": 9070 }, { "epoch": 17.980198019801982, "grad_norm": 0.16141574084758759, "learning_rate": 6.164010995591635e-05, "loss": 0.0079, "step": 9080 }, { "epoch": 18.0, "grad_norm": 0.5123599767684937, "learning_rate": 6.155969211796076e-05, "loss": 0.01, "step": 9090 }, { "epoch": 18.019801980198018, "grad_norm": 0.17091454565525055, "learning_rate": 6.147924267624829e-05, "loss": 0.0113, "step": 9100 }, { "epoch": 18.03960396039604, "grad_norm": 0.20390327274799347, "learning_rate": 6.13987618507247e-05, "loss": 0.0122, "step": 9110 }, { "epoch": 18.059405940594058, "grad_norm": 0.29800206422805786, "learning_rate": 6.131824986142147e-05, "loss": 0.0106, "step": 9120 }, { "epoch": 18.07920792079208, "grad_norm": 0.2312414050102234, "learning_rate": 6.123770692845529e-05, "loss": 0.0115, "step": 9130 }, { "epoch": 18.099009900990097, "grad_norm": 0.22353772819042206, "learning_rate": 6.11571332720275e-05, "loss": 0.0124, "step": 9140 }, { "epoch": 18.11881188118812, "grad_norm": 0.14474064111709595, "learning_rate": 6.107652911242336e-05, "loss": 0.0108, "step": 9150 }, { "epoch": 18.138613861386137, "grad_norm": 0.1993708312511444, "learning_rate": 6.0995894670011586e-05, "loss": 0.0087, "step": 9160 }, { "epoch": 18.15841584158416, "grad_norm": 0.2130400389432907, "learning_rate": 6.091523016524368e-05, "loss": 0.0103, "step": 9170 }, { "epoch": 18.178217821782177, "grad_norm": 0.21704746782779694, "learning_rate": 6.083453581865328e-05, "loss": 0.0084, "step": 9180 }, { "epoch": 18.198019801980198, "grad_norm": 0.16248515248298645, "learning_rate": 6.075381185085568e-05, "loss": 0.0098, "step": 9190 }, { "epoch": 18.217821782178216, "grad_norm": 0.21830058097839355, "learning_rate": 6.067305848254709e-05, "loss": 0.0094, "step": 9200 }, { "epoch": 18.237623762376238, "grad_norm": 0.29070040583610535, "learning_rate": 6.059227593450418e-05, "loss": 0.0109, "step": 9210 }, { "epoch": 18.257425742574256, "grad_norm": 0.21899686753749847, "learning_rate": 6.051146442758333e-05, "loss": 0.0115, "step": 9220 }, { "epoch": 18.277227722772277, "grad_norm": 0.21369437873363495, "learning_rate": 6.043062418272012e-05, "loss": 0.0096, "step": 9230 }, { "epoch": 18.297029702970296, "grad_norm": 0.22077088057994843, "learning_rate": 6.0349755420928666e-05, "loss": 0.0089, "step": 9240 }, { "epoch": 18.316831683168317, "grad_norm": 0.1880016028881073, "learning_rate": 6.0268858363301105e-05, "loss": 0.0087, "step": 9250 }, { "epoch": 18.336633663366335, "grad_norm": 0.1464831531047821, "learning_rate": 6.018793323100689e-05, "loss": 0.0086, "step": 9260 }, { "epoch": 18.356435643564357, "grad_norm": 0.16914163529872894, "learning_rate": 6.0106980245292255e-05, "loss": 0.0092, "step": 9270 }, { "epoch": 18.376237623762375, "grad_norm": 0.20864853262901306, "learning_rate": 6.002599962747957e-05, "loss": 0.0102, "step": 9280 }, { "epoch": 18.396039603960396, "grad_norm": 0.20573905110359192, "learning_rate": 5.994499159896673e-05, "loss": 0.0104, "step": 9290 }, { "epoch": 18.415841584158414, "grad_norm": 0.2628158628940582, "learning_rate": 5.9863956381226607e-05, "loss": 0.0089, "step": 9300 }, { "epoch": 18.435643564356436, "grad_norm": 0.15654148161411285, "learning_rate": 5.9782894195806394e-05, "loss": 0.0083, "step": 9310 }, { "epoch": 18.455445544554454, "grad_norm": 0.19462373852729797, "learning_rate": 5.9701805264327004e-05, "loss": 0.0096, "step": 9320 }, { "epoch": 18.475247524752476, "grad_norm": 0.19258834421634674, "learning_rate": 5.96206898084825e-05, "loss": 0.0085, "step": 9330 }, { "epoch": 18.495049504950494, "grad_norm": 0.22835570573806763, "learning_rate": 5.953954805003942e-05, "loss": 0.0079, "step": 9340 }, { "epoch": 18.514851485148515, "grad_norm": 0.14755375683307648, "learning_rate": 5.945838021083623e-05, "loss": 0.0085, "step": 9350 }, { "epoch": 18.534653465346533, "grad_norm": 0.16294123232364655, "learning_rate": 5.9377186512782714e-05, "loss": 0.0118, "step": 9360 }, { "epoch": 18.554455445544555, "grad_norm": 0.19986487925052643, "learning_rate": 5.929596717785935e-05, "loss": 0.0096, "step": 9370 }, { "epoch": 18.574257425742573, "grad_norm": 0.13137373328208923, "learning_rate": 5.921472242811668e-05, "loss": 0.0071, "step": 9380 }, { "epoch": 18.594059405940595, "grad_norm": 0.18024645745754242, "learning_rate": 5.913345248567475e-05, "loss": 0.0088, "step": 9390 }, { "epoch": 18.613861386138613, "grad_norm": 0.22270818054676056, "learning_rate": 5.905215757272248e-05, "loss": 0.0086, "step": 9400 }, { "epoch": 18.633663366336634, "grad_norm": 0.24049246311187744, "learning_rate": 5.897083791151706e-05, "loss": 0.0094, "step": 9410 }, { "epoch": 18.653465346534652, "grad_norm": 0.22350510954856873, "learning_rate": 5.888949372438336e-05, "loss": 0.0114, "step": 9420 }, { "epoch": 18.673267326732674, "grad_norm": 0.1902153640985489, "learning_rate": 5.8808125233713255e-05, "loss": 0.0091, "step": 9430 }, { "epoch": 18.693069306930692, "grad_norm": 0.19268903136253357, "learning_rate": 5.872673266196509e-05, "loss": 0.0079, "step": 9440 }, { "epoch": 18.712871287128714, "grad_norm": 0.1986839324235916, "learning_rate": 5.864531623166305e-05, "loss": 0.0082, "step": 9450 }, { "epoch": 18.73267326732673, "grad_norm": 0.22280143201351166, "learning_rate": 5.856387616539656e-05, "loss": 0.0084, "step": 9460 }, { "epoch": 18.752475247524753, "grad_norm": 0.203788623213768, "learning_rate": 5.848241268581967e-05, "loss": 0.0089, "step": 9470 }, { "epoch": 18.77227722772277, "grad_norm": 0.2207307368516922, "learning_rate": 5.840092601565037e-05, "loss": 0.0097, "step": 9480 }, { "epoch": 18.792079207920793, "grad_norm": 0.21375532448291779, "learning_rate": 5.8319416377670144e-05, "loss": 0.0076, "step": 9490 }, { "epoch": 18.81188118811881, "grad_norm": 0.23642539978027344, "learning_rate": 5.82378839947232e-05, "loss": 0.0083, "step": 9500 }, { "epoch": 18.831683168316832, "grad_norm": 0.20721617341041565, "learning_rate": 5.815632908971599e-05, "loss": 0.0068, "step": 9510 }, { "epoch": 18.85148514851485, "grad_norm": 0.23804277181625366, "learning_rate": 5.80747518856165e-05, "loss": 0.0066, "step": 9520 }, { "epoch": 18.871287128712872, "grad_norm": 0.19407323002815247, "learning_rate": 5.799315260545367e-05, "loss": 0.0089, "step": 9530 }, { "epoch": 18.89108910891089, "grad_norm": 0.19753941893577576, "learning_rate": 5.791153147231686e-05, "loss": 0.0076, "step": 9540 }, { "epoch": 18.91089108910891, "grad_norm": 0.2705848515033722, "learning_rate": 5.782988870935509e-05, "loss": 0.0077, "step": 9550 }, { "epoch": 18.93069306930693, "grad_norm": 0.2095915824174881, "learning_rate": 5.774822453977657e-05, "loss": 0.0081, "step": 9560 }, { "epoch": 18.95049504950495, "grad_norm": 0.28822603821754456, "learning_rate": 5.7666539186848036e-05, "loss": 0.0079, "step": 9570 }, { "epoch": 18.97029702970297, "grad_norm": 0.20028874278068542, "learning_rate": 5.758483287389411e-05, "loss": 0.0082, "step": 9580 }, { "epoch": 18.99009900990099, "grad_norm": 0.17685984075069427, "learning_rate": 5.7503105824296735e-05, "loss": 0.0081, "step": 9590 }, { "epoch": 19.00990099009901, "grad_norm": 0.24346457421779633, "learning_rate": 5.742135826149453e-05, "loss": 0.0101, "step": 9600 }, { "epoch": 19.02970297029703, "grad_norm": 0.26254090666770935, "learning_rate": 5.7339590408982223e-05, "loss": 0.0113, "step": 9610 }, { "epoch": 19.04950495049505, "grad_norm": 0.20072953402996063, "learning_rate": 5.725780249031e-05, "loss": 0.0095, "step": 9620 }, { "epoch": 19.06930693069307, "grad_norm": 0.20435158908367157, "learning_rate": 5.717599472908292e-05, "loss": 0.0103, "step": 9630 }, { "epoch": 19.08910891089109, "grad_norm": 0.17674684524536133, "learning_rate": 5.7094167348960237e-05, "loss": 0.0095, "step": 9640 }, { "epoch": 19.10891089108911, "grad_norm": 0.19221636652946472, "learning_rate": 5.7012320573654945e-05, "loss": 0.0085, "step": 9650 }, { "epoch": 19.128712871287128, "grad_norm": 0.20269884169101715, "learning_rate": 5.693045462693295e-05, "loss": 0.0097, "step": 9660 }, { "epoch": 19.14851485148515, "grad_norm": 0.20831069350242615, "learning_rate": 5.684856973261266e-05, "loss": 0.0086, "step": 9670 }, { "epoch": 19.168316831683168, "grad_norm": 0.16066406667232513, "learning_rate": 5.6766666114564215e-05, "loss": 0.0073, "step": 9680 }, { "epoch": 19.18811881188119, "grad_norm": 0.184458926320076, "learning_rate": 5.668474399670899e-05, "loss": 0.0083, "step": 9690 }, { "epoch": 19.207920792079207, "grad_norm": 0.1397104412317276, "learning_rate": 5.660280360301896e-05, "loss": 0.0084, "step": 9700 }, { "epoch": 19.22772277227723, "grad_norm": 0.1471136212348938, "learning_rate": 5.652084515751599e-05, "loss": 0.0074, "step": 9710 }, { "epoch": 19.247524752475247, "grad_norm": 0.3085004985332489, "learning_rate": 5.643886888427137e-05, "loss": 0.0082, "step": 9720 }, { "epoch": 19.26732673267327, "grad_norm": 0.16153781116008759, "learning_rate": 5.6356875007405074e-05, "loss": 0.0089, "step": 9730 }, { "epoch": 19.287128712871286, "grad_norm": 0.20335553586483002, "learning_rate": 5.627486375108525e-05, "loss": 0.0088, "step": 9740 }, { "epoch": 19.306930693069308, "grad_norm": 0.15481232106685638, "learning_rate": 5.619283533952754e-05, "loss": 0.0071, "step": 9750 }, { "epoch": 19.326732673267326, "grad_norm": 0.1782633662223816, "learning_rate": 5.6110789996994474e-05, "loss": 0.0107, "step": 9760 }, { "epoch": 19.346534653465348, "grad_norm": 0.167184516787529, "learning_rate": 5.602872794779491e-05, "loss": 0.0076, "step": 9770 }, { "epoch": 19.366336633663366, "grad_norm": 0.15242727100849152, "learning_rate": 5.594664941628334e-05, "loss": 0.0081, "step": 9780 }, { "epoch": 19.386138613861387, "grad_norm": 0.1767595410346985, "learning_rate": 5.5864554626859324e-05, "loss": 0.0075, "step": 9790 }, { "epoch": 19.405940594059405, "grad_norm": 0.13468272984027863, "learning_rate": 5.578244380396691e-05, "loss": 0.0109, "step": 9800 }, { "epoch": 19.425742574257427, "grad_norm": 0.11766193062067032, "learning_rate": 5.570031717209394e-05, "loss": 0.0079, "step": 9810 }, { "epoch": 19.445544554455445, "grad_norm": 0.1355472207069397, "learning_rate": 5.561817495577147e-05, "loss": 0.0068, "step": 9820 }, { "epoch": 19.465346534653467, "grad_norm": 0.1856883317232132, "learning_rate": 5.5536017379573215e-05, "loss": 0.0069, "step": 9830 }, { "epoch": 19.485148514851485, "grad_norm": 0.21311727166175842, "learning_rate": 5.545384466811483e-05, "loss": 0.0065, "step": 9840 }, { "epoch": 19.504950495049506, "grad_norm": 0.20642852783203125, "learning_rate": 5.5371657046053384e-05, "loss": 0.0081, "step": 9850 }, { "epoch": 19.524752475247524, "grad_norm": 0.31878241896629333, "learning_rate": 5.528945473808669e-05, "loss": 0.0071, "step": 9860 }, { "epoch": 19.544554455445546, "grad_norm": 0.15214094519615173, "learning_rate": 5.520723796895272e-05, "loss": 0.0106, "step": 9870 }, { "epoch": 19.564356435643564, "grad_norm": 0.2222938984632492, "learning_rate": 5.512500696342897e-05, "loss": 0.0083, "step": 9880 }, { "epoch": 19.584158415841586, "grad_norm": 0.2051517367362976, "learning_rate": 5.504276194633188e-05, "loss": 0.0098, "step": 9890 }, { "epoch": 19.603960396039604, "grad_norm": 0.12387961149215698, "learning_rate": 5.49605031425162e-05, "loss": 0.0088, "step": 9900 }, { "epoch": 19.623762376237625, "grad_norm": 0.25097838044166565, "learning_rate": 5.487823077687434e-05, "loss": 0.0091, "step": 9910 }, { "epoch": 19.643564356435643, "grad_norm": 0.21087101101875305, "learning_rate": 5.4795945074335806e-05, "loss": 0.0093, "step": 9920 }, { "epoch": 19.663366336633665, "grad_norm": 0.21140897274017334, "learning_rate": 5.471364625986657e-05, "loss": 0.0088, "step": 9930 }, { "epoch": 19.683168316831683, "grad_norm": 0.2335343062877655, "learning_rate": 5.463133455846845e-05, "loss": 0.0084, "step": 9940 }, { "epoch": 19.702970297029704, "grad_norm": 0.2136065512895584, "learning_rate": 5.4549010195178505e-05, "loss": 0.0102, "step": 9950 }, { "epoch": 19.722772277227723, "grad_norm": 0.20356595516204834, "learning_rate": 5.446667339506838e-05, "loss": 0.0081, "step": 9960 }, { "epoch": 19.742574257425744, "grad_norm": 0.12421521544456482, "learning_rate": 5.4384324383243756e-05, "loss": 0.0076, "step": 9970 }, { "epoch": 19.762376237623762, "grad_norm": 0.14886407554149628, "learning_rate": 5.430196338484368e-05, "loss": 0.0076, "step": 9980 }, { "epoch": 19.782178217821784, "grad_norm": 0.15691490471363068, "learning_rate": 5.4219590625039975e-05, "loss": 0.0072, "step": 9990 }, { "epoch": 19.801980198019802, "grad_norm": 0.22589290142059326, "learning_rate": 5.413720632903664e-05, "loss": 0.0082, "step": 10000 }, { "epoch": 19.821782178217823, "grad_norm": 0.27543458342552185, "learning_rate": 5.405481072206917e-05, "loss": 0.0084, "step": 10010 }, { "epoch": 19.84158415841584, "grad_norm": 0.18665051460266113, "learning_rate": 5.397240402940402e-05, "loss": 0.0079, "step": 10020 }, { "epoch": 19.861386138613863, "grad_norm": 0.23419269919395447, "learning_rate": 5.388998647633794e-05, "loss": 0.0077, "step": 10030 }, { "epoch": 19.88118811881188, "grad_norm": 0.13763131201267242, "learning_rate": 5.380755828819737e-05, "loss": 0.007, "step": 10040 }, { "epoch": 19.900990099009903, "grad_norm": 0.2252253144979477, "learning_rate": 5.3725119690337846e-05, "loss": 0.008, "step": 10050 }, { "epoch": 19.92079207920792, "grad_norm": 0.892955482006073, "learning_rate": 5.3642670908143324e-05, "loss": 0.0076, "step": 10060 }, { "epoch": 19.94059405940594, "grad_norm": 0.12461807578802109, "learning_rate": 5.356021216702562e-05, "loss": 0.0087, "step": 10070 }, { "epoch": 19.96039603960396, "grad_norm": 0.9183124303817749, "learning_rate": 5.347774369242381e-05, "loss": 0.0078, "step": 10080 }, { "epoch": 19.980198019801982, "grad_norm": 0.2640547752380371, "learning_rate": 5.3395265709803545e-05, "loss": 0.0105, "step": 10090 }, { "epoch": 20.0, "grad_norm": 0.16990622878074646, "learning_rate": 5.331277844465647e-05, "loss": 0.0101, "step": 10100 }, { "epoch": 20.019801980198018, "grad_norm": 0.2840176522731781, "learning_rate": 5.323028212249963e-05, "loss": 0.0115, "step": 10110 }, { "epoch": 20.03960396039604, "grad_norm": 0.17458824813365936, "learning_rate": 5.314777696887481e-05, "loss": 0.0115, "step": 10120 }, { "epoch": 20.059405940594058, "grad_norm": 0.2286110520362854, "learning_rate": 5.306526320934796e-05, "loss": 0.0092, "step": 10130 }, { "epoch": 20.07920792079208, "grad_norm": 0.2882753908634186, "learning_rate": 5.298274106950854e-05, "loss": 0.0083, "step": 10140 }, { "epoch": 20.099009900990097, "grad_norm": 0.20839571952819824, "learning_rate": 5.290021077496893e-05, "loss": 0.0088, "step": 10150 }, { "epoch": 20.11881188118812, "grad_norm": 0.20488013327121735, "learning_rate": 5.2817672551363816e-05, "loss": 0.0076, "step": 10160 }, { "epoch": 20.138613861386137, "grad_norm": 0.26098737120628357, "learning_rate": 5.273512662434952e-05, "loss": 0.0076, "step": 10170 }, { "epoch": 20.15841584158416, "grad_norm": 0.27684423327445984, "learning_rate": 5.265257321960349e-05, "loss": 0.0075, "step": 10180 }, { "epoch": 20.178217821782177, "grad_norm": 0.21769677102565765, "learning_rate": 5.257001256282357e-05, "loss": 0.0097, "step": 10190 }, { "epoch": 20.198019801980198, "grad_norm": 0.21925298869609833, "learning_rate": 5.248744487972742e-05, "loss": 0.0074, "step": 10200 }, { "epoch": 20.217821782178216, "grad_norm": 0.16500043869018555, "learning_rate": 5.240487039605196e-05, "loss": 0.007, "step": 10210 }, { "epoch": 20.237623762376238, "grad_norm": 0.15940546989440918, "learning_rate": 5.232228933755267e-05, "loss": 0.0073, "step": 10220 }, { "epoch": 20.257425742574256, "grad_norm": 0.15058648586273193, "learning_rate": 5.2239701930003006e-05, "loss": 0.0081, "step": 10230 }, { "epoch": 20.277227722772277, "grad_norm": 0.19026702642440796, "learning_rate": 5.215710839919379e-05, "loss": 0.0074, "step": 10240 }, { "epoch": 20.297029702970296, "grad_norm": 0.20770955085754395, "learning_rate": 5.207450897093257e-05, "loss": 0.0076, "step": 10250 }, { "epoch": 20.316831683168317, "grad_norm": 0.36166009306907654, "learning_rate": 5.1991903871043046e-05, "loss": 0.007, "step": 10260 }, { "epoch": 20.336633663366335, "grad_norm": 0.1704798936843872, "learning_rate": 5.190929332536439e-05, "loss": 0.0068, "step": 10270 }, { "epoch": 20.356435643564357, "grad_norm": 0.24602599442005157, "learning_rate": 5.182667755975071e-05, "loss": 0.0099, "step": 10280 }, { "epoch": 20.376237623762375, "grad_norm": 0.16353248059749603, "learning_rate": 5.1744056800070315e-05, "loss": 0.0081, "step": 10290 }, { "epoch": 20.396039603960396, "grad_norm": 0.18493378162384033, "learning_rate": 5.166143127220524e-05, "loss": 0.0086, "step": 10300 }, { "epoch": 20.415841584158414, "grad_norm": 0.18771639466285706, "learning_rate": 5.1578801202050485e-05, "loss": 0.0084, "step": 10310 }, { "epoch": 20.435643564356436, "grad_norm": 0.2499607354402542, "learning_rate": 5.149616681551355e-05, "loss": 0.0089, "step": 10320 }, { "epoch": 20.455445544554454, "grad_norm": 0.12777353823184967, "learning_rate": 5.141352833851367e-05, "loss": 0.0065, "step": 10330 }, { "epoch": 20.475247524752476, "grad_norm": 0.14223378896713257, "learning_rate": 5.1330885996981285e-05, "loss": 0.0064, "step": 10340 }, { "epoch": 20.495049504950494, "grad_norm": 0.2890561819076538, "learning_rate": 5.124824001685741e-05, "loss": 0.0093, "step": 10350 }, { "epoch": 20.514851485148515, "grad_norm": 0.16494347155094147, "learning_rate": 5.116559062409298e-05, "loss": 0.0066, "step": 10360 }, { "epoch": 20.534653465346533, "grad_norm": 0.2971169948577881, "learning_rate": 5.10829380446483e-05, "loss": 0.0072, "step": 10370 }, { "epoch": 20.554455445544555, "grad_norm": 0.2064463049173355, "learning_rate": 5.100028250449235e-05, "loss": 0.0069, "step": 10380 }, { "epoch": 20.574257425742573, "grad_norm": 0.20225441455841064, "learning_rate": 5.0917624229602234e-05, "loss": 0.0077, "step": 10390 }, { "epoch": 20.594059405940595, "grad_norm": 0.23786911368370056, "learning_rate": 5.0834963445962524e-05, "loss": 0.0064, "step": 10400 }, { "epoch": 20.613861386138613, "grad_norm": 0.1751769632101059, "learning_rate": 5.075230037956461e-05, "loss": 0.0078, "step": 10410 }, { "epoch": 20.633663366336634, "grad_norm": 0.18395425379276276, "learning_rate": 5.0669635256406213e-05, "loss": 0.008, "step": 10420 }, { "epoch": 20.653465346534652, "grad_norm": 0.19671641290187836, "learning_rate": 5.058696830249058e-05, "loss": 0.0071, "step": 10430 }, { "epoch": 20.673267326732674, "grad_norm": 0.19024300575256348, "learning_rate": 5.050429974382602e-05, "loss": 0.0075, "step": 10440 }, { "epoch": 20.693069306930692, "grad_norm": 0.18405002355575562, "learning_rate": 5.042162980642523e-05, "loss": 0.006, "step": 10450 }, { "epoch": 20.712871287128714, "grad_norm": 0.18050868809223175, "learning_rate": 5.033895871630462e-05, "loss": 0.0063, "step": 10460 }, { "epoch": 20.73267326732673, "grad_norm": 0.12201157212257385, "learning_rate": 5.025628669948386e-05, "loss": 0.0064, "step": 10470 }, { "epoch": 20.752475247524753, "grad_norm": 0.17355872690677643, "learning_rate": 5.017361398198502e-05, "loss": 0.0075, "step": 10480 }, { "epoch": 20.77227722772277, "grad_norm": 0.1515287458896637, "learning_rate": 5.009094078983221e-05, "loss": 0.0067, "step": 10490 }, { "epoch": 20.792079207920793, "grad_norm": 0.1434178203344345, "learning_rate": 5.000826734905073e-05, "loss": 0.0073, "step": 10500 }, { "epoch": 20.81188118811881, "grad_norm": 0.19585421681404114, "learning_rate": 4.9925593885666645e-05, "loss": 0.008, "step": 10510 }, { "epoch": 20.831683168316832, "grad_norm": 0.23987513780593872, "learning_rate": 4.984292062570602e-05, "loss": 0.0095, "step": 10520 }, { "epoch": 20.85148514851485, "grad_norm": 0.1637810319662094, "learning_rate": 4.976024779519442e-05, "loss": 0.0073, "step": 10530 }, { "epoch": 20.871287128712872, "grad_norm": 0.22645717859268188, "learning_rate": 4.9677575620156194e-05, "loss": 0.0065, "step": 10540 }, { "epoch": 20.89108910891089, "grad_norm": 0.1841379851102829, "learning_rate": 4.959490432661391e-05, "loss": 0.0078, "step": 10550 }, { "epoch": 20.91089108910891, "grad_norm": 0.2077997922897339, "learning_rate": 4.9512234140587726e-05, "loss": 0.0094, "step": 10560 }, { "epoch": 20.93069306930693, "grad_norm": 0.14512518048286438, "learning_rate": 4.942956528809477e-05, "loss": 0.0074, "step": 10570 }, { "epoch": 20.95049504950495, "grad_norm": 0.199898362159729, "learning_rate": 4.934689799514854e-05, "loss": 0.0073, "step": 10580 }, { "epoch": 20.97029702970297, "grad_norm": 0.21913664042949677, "learning_rate": 4.926423248775827e-05, "loss": 0.0084, "step": 10590 }, { "epoch": 20.99009900990099, "grad_norm": 0.19025568664073944, "learning_rate": 4.918156899192826e-05, "loss": 0.0075, "step": 10600 }, { "epoch": 21.00990099009901, "grad_norm": 0.11334829032421112, "learning_rate": 4.909890773365738e-05, "loss": 0.0069, "step": 10610 }, { "epoch": 21.02970297029703, "grad_norm": 0.20126935839653015, "learning_rate": 4.9016248938938344e-05, "loss": 0.0073, "step": 10620 }, { "epoch": 21.04950495049505, "grad_norm": 0.18374565243721008, "learning_rate": 4.8933592833757156e-05, "loss": 0.0067, "step": 10630 }, { "epoch": 21.06930693069307, "grad_norm": 0.1920863687992096, "learning_rate": 4.8850939644092435e-05, "loss": 0.0076, "step": 10640 }, { "epoch": 21.08910891089109, "grad_norm": 0.23863206803798676, "learning_rate": 4.876828959591485e-05, "loss": 0.0082, "step": 10650 }, { "epoch": 21.10891089108911, "grad_norm": 0.15368357300758362, "learning_rate": 4.8685642915186474e-05, "loss": 0.0088, "step": 10660 }, { "epoch": 21.128712871287128, "grad_norm": 0.11116620898246765, "learning_rate": 4.860299982786018e-05, "loss": 0.0084, "step": 10670 }, { "epoch": 21.14851485148515, "grad_norm": 0.1394735723733902, "learning_rate": 4.852036055987901e-05, "loss": 0.0062, "step": 10680 }, { "epoch": 21.168316831683168, "grad_norm": 0.17203332483768463, "learning_rate": 4.843772533717558e-05, "loss": 0.0067, "step": 10690 }, { "epoch": 21.18811881188119, "grad_norm": 0.17226840555667877, "learning_rate": 4.835509438567142e-05, "loss": 0.0087, "step": 10700 }, { "epoch": 21.207920792079207, "grad_norm": 0.19217072427272797, "learning_rate": 4.827246793127639e-05, "loss": 0.0079, "step": 10710 }, { "epoch": 21.22772277227723, "grad_norm": 0.1834883689880371, "learning_rate": 4.818984619988807e-05, "loss": 0.0077, "step": 10720 }, { "epoch": 21.247524752475247, "grad_norm": 0.15354396402835846, "learning_rate": 4.810722941739115e-05, "loss": 0.0075, "step": 10730 }, { "epoch": 21.26732673267327, "grad_norm": 0.14652736485004425, "learning_rate": 4.8024617809656684e-05, "loss": 0.0084, "step": 10740 }, { "epoch": 21.287128712871286, "grad_norm": 0.1668914258480072, "learning_rate": 4.794201160254171e-05, "loss": 0.008, "step": 10750 }, { "epoch": 21.306930693069308, "grad_norm": 0.24598723649978638, "learning_rate": 4.785941102188844e-05, "loss": 0.0079, "step": 10760 }, { "epoch": 21.326732673267326, "grad_norm": 0.18509556353092194, "learning_rate": 4.7776816293523686e-05, "loss": 0.0076, "step": 10770 }, { "epoch": 21.346534653465348, "grad_norm": 0.12979893386363983, "learning_rate": 4.769422764325832e-05, "loss": 0.0075, "step": 10780 }, { "epoch": 21.366336633663366, "grad_norm": 0.17714282870292664, "learning_rate": 4.76116452968865e-05, "loss": 0.0074, "step": 10790 }, { "epoch": 21.386138613861387, "grad_norm": 0.1264171302318573, "learning_rate": 4.752906948018525e-05, "loss": 0.0059, "step": 10800 }, { "epoch": 21.405940594059405, "grad_norm": 0.23090994358062744, "learning_rate": 4.7446500418913684e-05, "loss": 0.0075, "step": 10810 }, { "epoch": 21.425742574257427, "grad_norm": 0.18138669431209564, "learning_rate": 4.736393833881247e-05, "loss": 0.0082, "step": 10820 }, { "epoch": 21.445544554455445, "grad_norm": 0.22340402007102966, "learning_rate": 4.7281383465603194e-05, "loss": 0.0081, "step": 10830 }, { "epoch": 21.465346534653467, "grad_norm": 0.23322872817516327, "learning_rate": 4.71988360249877e-05, "loss": 0.008, "step": 10840 }, { "epoch": 21.485148514851485, "grad_norm": 0.17409467697143555, "learning_rate": 4.7116296242647554e-05, "loss": 0.0069, "step": 10850 }, { "epoch": 21.504950495049506, "grad_norm": 0.3394359052181244, "learning_rate": 4.703376434424336e-05, "loss": 0.0072, "step": 10860 }, { "epoch": 21.524752475247524, "grad_norm": 0.19695058465003967, "learning_rate": 4.695124055541421e-05, "loss": 0.0087, "step": 10870 }, { "epoch": 21.544554455445546, "grad_norm": 0.14545448124408722, "learning_rate": 4.6868725101776934e-05, "loss": 0.0068, "step": 10880 }, { "epoch": 21.564356435643564, "grad_norm": 0.2089635282754898, "learning_rate": 4.678621820892567e-05, "loss": 0.0074, "step": 10890 }, { "epoch": 21.584158415841586, "grad_norm": 0.14257267117500305, "learning_rate": 4.670372010243111e-05, "loss": 0.0081, "step": 10900 }, { "epoch": 21.603960396039604, "grad_norm": 0.19708164036273956, "learning_rate": 4.662123100783992e-05, "loss": 0.0074, "step": 10910 }, { "epoch": 21.623762376237625, "grad_norm": 0.24770651757717133, "learning_rate": 4.653875115067415e-05, "loss": 0.0075, "step": 10920 }, { "epoch": 21.643564356435643, "grad_norm": 0.1955225020647049, "learning_rate": 4.6456280756430545e-05, "loss": 0.0077, "step": 10930 }, { "epoch": 21.663366336633665, "grad_norm": 0.15522104501724243, "learning_rate": 4.637382005058004e-05, "loss": 0.0072, "step": 10940 }, { "epoch": 21.683168316831683, "grad_norm": 0.11586793512105942, "learning_rate": 4.629136925856705e-05, "loss": 0.0068, "step": 10950 }, { "epoch": 21.702970297029704, "grad_norm": 0.12964990735054016, "learning_rate": 4.6208928605808895e-05, "loss": 0.0071, "step": 10960 }, { "epoch": 21.722772277227723, "grad_norm": 0.15370619297027588, "learning_rate": 4.612649831769519e-05, "loss": 0.0072, "step": 10970 }, { "epoch": 21.742574257425744, "grad_norm": 0.16773360967636108, "learning_rate": 4.604407861958715e-05, "loss": 0.0068, "step": 10980 }, { "epoch": 21.762376237623762, "grad_norm": 0.2035830020904541, "learning_rate": 4.5961669736817114e-05, "loss": 0.0079, "step": 10990 }, { "epoch": 21.782178217821784, "grad_norm": 0.16520550847053528, "learning_rate": 4.5879271894687814e-05, "loss": 0.0066, "step": 11000 }, { "epoch": 21.801980198019802, "grad_norm": 0.30234014987945557, "learning_rate": 4.5796885318471826e-05, "loss": 0.0085, "step": 11010 }, { "epoch": 21.821782178217823, "grad_norm": 0.2624816596508026, "learning_rate": 4.571451023341086e-05, "loss": 0.0078, "step": 11020 }, { "epoch": 21.84158415841584, "grad_norm": 0.12384509295225143, "learning_rate": 4.563214686471527e-05, "loss": 0.0076, "step": 11030 }, { "epoch": 21.861386138613863, "grad_norm": 0.19219860434532166, "learning_rate": 4.5549795437563365e-05, "loss": 0.0072, "step": 11040 }, { "epoch": 21.88118811881188, "grad_norm": 0.23642732203006744, "learning_rate": 4.546745617710081e-05, "loss": 0.0067, "step": 11050 }, { "epoch": 21.900990099009903, "grad_norm": 0.15131977200508118, "learning_rate": 4.5385129308440014e-05, "loss": 0.0073, "step": 11060 }, { "epoch": 21.92079207920792, "grad_norm": 0.2477278858423233, "learning_rate": 4.530281505665944e-05, "loss": 0.0074, "step": 11070 }, { "epoch": 21.94059405940594, "grad_norm": 0.1681228131055832, "learning_rate": 4.5220513646803134e-05, "loss": 0.0084, "step": 11080 }, { "epoch": 21.96039603960396, "grad_norm": 0.16905257105827332, "learning_rate": 4.513822530388003e-05, "loss": 0.0067, "step": 11090 }, { "epoch": 21.980198019801982, "grad_norm": 0.11963207274675369, "learning_rate": 4.5055950252863296e-05, "loss": 0.0078, "step": 11100 }, { "epoch": 22.0, "grad_norm": 0.4790559709072113, "learning_rate": 4.4973688718689803e-05, "loss": 0.0074, "step": 11110 }, { "epoch": 22.019801980198018, "grad_norm": 0.17933227121829987, "learning_rate": 4.4891440926259406e-05, "loss": 0.0089, "step": 11120 }, { "epoch": 22.03960396039604, "grad_norm": 0.1762050986289978, "learning_rate": 4.480920710043443e-05, "loss": 0.0084, "step": 11130 }, { "epoch": 22.059405940594058, "grad_norm": 0.13652606308460236, "learning_rate": 4.4726987466039044e-05, "loss": 0.0082, "step": 11140 }, { "epoch": 22.07920792079208, "grad_norm": 0.20271748304367065, "learning_rate": 4.46447822478586e-05, "loss": 0.0082, "step": 11150 }, { "epoch": 22.099009900990097, "grad_norm": 0.161631777882576, "learning_rate": 4.4562591670638974e-05, "loss": 0.0063, "step": 11160 }, { "epoch": 22.11881188118812, "grad_norm": 0.11932705342769623, "learning_rate": 4.4480415959086105e-05, "loss": 0.0063, "step": 11170 }, { "epoch": 22.138613861386137, "grad_norm": 0.12963499128818512, "learning_rate": 4.439825533786522e-05, "loss": 0.007, "step": 11180 }, { "epoch": 22.15841584158416, "grad_norm": 0.1468386948108673, "learning_rate": 4.431611003160035e-05, "loss": 0.0077, "step": 11190 }, { "epoch": 22.178217821782177, "grad_norm": 0.15211112797260284, "learning_rate": 4.4233980264873636e-05, "loss": 0.0065, "step": 11200 }, { "epoch": 22.198019801980198, "grad_norm": 0.16036947071552277, "learning_rate": 4.4151866262224684e-05, "loss": 0.0079, "step": 11210 }, { "epoch": 22.217821782178216, "grad_norm": 0.12080317735671997, "learning_rate": 4.406976824815006e-05, "loss": 0.0079, "step": 11220 }, { "epoch": 22.237623762376238, "grad_norm": 0.1970706582069397, "learning_rate": 4.3987686447102595e-05, "loss": 0.0077, "step": 11230 }, { "epoch": 22.257425742574256, "grad_norm": 0.13815592229366302, "learning_rate": 4.3905621083490804e-05, "loss": 0.0061, "step": 11240 }, { "epoch": 22.277227722772277, "grad_norm": 0.17532068490982056, "learning_rate": 4.3823572381678286e-05, "loss": 0.0105, "step": 11250 }, { "epoch": 22.297029702970296, "grad_norm": 0.18134231865406036, "learning_rate": 4.374154056598301e-05, "loss": 0.0064, "step": 11260 }, { "epoch": 22.316831683168317, "grad_norm": 0.1580265313386917, "learning_rate": 4.3659525860676845e-05, "loss": 0.0072, "step": 11270 }, { "epoch": 22.336633663366335, "grad_norm": 0.1696138232946396, "learning_rate": 4.3577528489984854e-05, "loss": 0.0067, "step": 11280 }, { "epoch": 22.356435643564357, "grad_norm": 0.129084050655365, "learning_rate": 4.349554867808476e-05, "loss": 0.0062, "step": 11290 }, { "epoch": 22.376237623762375, "grad_norm": 0.12025167047977448, "learning_rate": 4.34135866491062e-05, "loss": 0.0058, "step": 11300 }, { "epoch": 22.396039603960396, "grad_norm": 0.22547964751720428, "learning_rate": 4.333164262713022e-05, "loss": 0.0069, "step": 11310 }, { "epoch": 22.415841584158414, "grad_norm": 0.14577405154705048, "learning_rate": 4.324971683618868e-05, "loss": 0.0065, "step": 11320 }, { "epoch": 22.435643564356436, "grad_norm": 0.11384018510580063, "learning_rate": 4.316780950026354e-05, "loss": 0.0067, "step": 11330 }, { "epoch": 22.455445544554454, "grad_norm": 0.18661735951900482, "learning_rate": 4.308592084328637e-05, "loss": 0.0061, "step": 11340 }, { "epoch": 22.475247524752476, "grad_norm": 0.24358205497264862, "learning_rate": 4.3004051089137576e-05, "loss": 0.0064, "step": 11350 }, { "epoch": 22.495049504950494, "grad_norm": 0.19240298867225647, "learning_rate": 4.292220046164597e-05, "loss": 0.0078, "step": 11360 }, { "epoch": 22.514851485148515, "grad_norm": 0.16971921920776367, "learning_rate": 4.2840369184588035e-05, "loss": 0.0073, "step": 11370 }, { "epoch": 22.534653465346533, "grad_norm": 0.19905249774456024, "learning_rate": 4.2758557481687345e-05, "loss": 0.0073, "step": 11380 }, { "epoch": 22.554455445544555, "grad_norm": 0.12982311844825745, "learning_rate": 4.267676557661403e-05, "loss": 0.008, "step": 11390 }, { "epoch": 22.574257425742573, "grad_norm": 0.16717073321342468, "learning_rate": 4.2594993692983955e-05, "loss": 0.0062, "step": 11400 }, { "epoch": 22.594059405940595, "grad_norm": 0.17949193716049194, "learning_rate": 4.251324205435837e-05, "loss": 0.0066, "step": 11410 }, { "epoch": 22.613861386138613, "grad_norm": 0.1294238269329071, "learning_rate": 4.243151088424312e-05, "loss": 0.0075, "step": 11420 }, { "epoch": 22.633663366336634, "grad_norm": 0.18585935235023499, "learning_rate": 4.234980040608813e-05, "loss": 0.008, "step": 11430 }, { "epoch": 22.653465346534652, "grad_norm": 0.1304997354745865, "learning_rate": 4.22681108432867e-05, "loss": 0.0065, "step": 11440 }, { "epoch": 22.673267326732674, "grad_norm": 0.16581542789936066, "learning_rate": 4.2186442419174984e-05, "loss": 0.0073, "step": 11450 }, { "epoch": 22.693069306930692, "grad_norm": 0.15744513273239136, "learning_rate": 4.210479535703133e-05, "loss": 0.0064, "step": 11460 }, { "epoch": 22.712871287128714, "grad_norm": 0.13860785961151123, "learning_rate": 4.202316988007567e-05, "loss": 0.0069, "step": 11470 }, { "epoch": 22.73267326732673, "grad_norm": 0.14386476576328278, "learning_rate": 4.194156621146901e-05, "loss": 0.0071, "step": 11480 }, { "epoch": 22.752475247524753, "grad_norm": 0.15325337648391724, "learning_rate": 4.1859984574312596e-05, "loss": 0.0064, "step": 11490 }, { "epoch": 22.77227722772277, "grad_norm": 0.16054347157478333, "learning_rate": 4.177842519164752e-05, "loss": 0.0063, "step": 11500 }, { "epoch": 22.792079207920793, "grad_norm": 0.18454602360725403, "learning_rate": 4.169688828645404e-05, "loss": 0.0068, "step": 11510 }, { "epoch": 22.81188118811881, "grad_norm": 0.18787617981433868, "learning_rate": 4.161537408165092e-05, "loss": 0.0061, "step": 11520 }, { "epoch": 22.831683168316832, "grad_norm": 0.17682453989982605, "learning_rate": 4.1533882800094924e-05, "loss": 0.007, "step": 11530 }, { "epoch": 22.85148514851485, "grad_norm": 0.15636955201625824, "learning_rate": 4.145241466458005e-05, "loss": 0.0076, "step": 11540 }, { "epoch": 22.871287128712872, "grad_norm": 0.17930614948272705, "learning_rate": 4.13709698978371e-05, "loss": 0.0085, "step": 11550 }, { "epoch": 22.89108910891089, "grad_norm": 0.14316824078559875, "learning_rate": 4.1289548722532944e-05, "loss": 0.0079, "step": 11560 }, { "epoch": 22.91089108910891, "grad_norm": 0.15827716886997223, "learning_rate": 4.120815136126999e-05, "loss": 0.007, "step": 11570 }, { "epoch": 22.93069306930693, "grad_norm": 0.14933271706104279, "learning_rate": 4.112677803658548e-05, "loss": 0.007, "step": 11580 }, { "epoch": 22.95049504950495, "grad_norm": 0.12505200505256653, "learning_rate": 4.1045428970951e-05, "loss": 0.0075, "step": 11590 }, { "epoch": 22.97029702970297, "grad_norm": 0.0964285284280777, "learning_rate": 4.0964104386771785e-05, "loss": 0.0083, "step": 11600 }, { "epoch": 22.99009900990099, "grad_norm": 0.1386091113090515, "learning_rate": 4.0882804506386144e-05, "loss": 0.0078, "step": 11610 }, { "epoch": 23.00990099009901, "grad_norm": 0.2303241491317749, "learning_rate": 4.080152955206485e-05, "loss": 0.0063, "step": 11620 }, { "epoch": 23.02970297029703, "grad_norm": 0.18534381687641144, "learning_rate": 4.0720279746010505e-05, "loss": 0.0088, "step": 11630 }, { "epoch": 23.04950495049505, "grad_norm": 0.14851334691047668, "learning_rate": 4.063905531035699e-05, "loss": 0.0093, "step": 11640 }, { "epoch": 23.06930693069307, "grad_norm": 0.175076425075531, "learning_rate": 4.055785646716882e-05, "loss": 0.0079, "step": 11650 }, { "epoch": 23.08910891089109, "grad_norm": 0.19484800100326538, "learning_rate": 4.047668343844051e-05, "loss": 0.0073, "step": 11660 }, { "epoch": 23.10891089108911, "grad_norm": 0.16997140645980835, "learning_rate": 4.039553644609604e-05, "loss": 0.0076, "step": 11670 }, { "epoch": 23.128712871287128, "grad_norm": 0.1560663878917694, "learning_rate": 4.0314415711988176e-05, "loss": 0.0061, "step": 11680 }, { "epoch": 23.14851485148515, "grad_norm": 0.1400129795074463, "learning_rate": 4.023332145789792e-05, "loss": 0.0069, "step": 11690 }, { "epoch": 23.168316831683168, "grad_norm": 0.1408177614212036, "learning_rate": 4.015225390553385e-05, "loss": 0.0068, "step": 11700 }, { "epoch": 23.18811881188119, "grad_norm": 0.22773446142673492, "learning_rate": 4.007121327653158e-05, "loss": 0.0061, "step": 11710 }, { "epoch": 23.207920792079207, "grad_norm": 0.1500692367553711, "learning_rate": 3.9990199792453064e-05, "loss": 0.008, "step": 11720 }, { "epoch": 23.22772277227723, "grad_norm": 0.1254241168498993, "learning_rate": 3.9909213674786103e-05, "loss": 0.0068, "step": 11730 }, { "epoch": 23.247524752475247, "grad_norm": 0.11419210582971573, "learning_rate": 3.982825514494363e-05, "loss": 0.0066, "step": 11740 }, { "epoch": 23.26732673267327, "grad_norm": 0.1602715253829956, "learning_rate": 3.974732442426319e-05, "loss": 0.0071, "step": 11750 }, { "epoch": 23.287128712871286, "grad_norm": 0.15899060666561127, "learning_rate": 3.966642173400629e-05, "loss": 0.0076, "step": 11760 }, { "epoch": 23.306930693069308, "grad_norm": 0.12463992834091187, "learning_rate": 3.9585547295357764e-05, "loss": 0.0066, "step": 11770 }, { "epoch": 23.326732673267326, "grad_norm": 0.1583748161792755, "learning_rate": 3.950470132942526e-05, "loss": 0.0064, "step": 11780 }, { "epoch": 23.346534653465348, "grad_norm": 0.22705981135368347, "learning_rate": 3.942388405723856e-05, "loss": 0.0065, "step": 11790 }, { "epoch": 23.366336633663366, "grad_norm": 0.16754236817359924, "learning_rate": 3.9343095699749e-05, "loss": 0.0057, "step": 11800 }, { "epoch": 23.386138613861387, "grad_norm": 0.15740607678890228, "learning_rate": 3.9262336477828874e-05, "loss": 0.0066, "step": 11810 }, { "epoch": 23.405940594059405, "grad_norm": 0.17730812728405, "learning_rate": 3.9181606612270794e-05, "loss": 0.0066, "step": 11820 }, { "epoch": 23.425742574257427, "grad_norm": 0.16125166416168213, "learning_rate": 3.910090632378713e-05, "loss": 0.006, "step": 11830 }, { "epoch": 23.445544554455445, "grad_norm": 0.1177867129445076, "learning_rate": 3.90202358330094e-05, "loss": 0.0065, "step": 11840 }, { "epoch": 23.465346534653467, "grad_norm": 0.18130022287368774, "learning_rate": 3.8939595360487656e-05, "loss": 0.0083, "step": 11850 }, { "epoch": 23.485148514851485, "grad_norm": 0.15023447573184967, "learning_rate": 3.885898512668984e-05, "loss": 0.0066, "step": 11860 }, { "epoch": 23.504950495049506, "grad_norm": 0.15929058194160461, "learning_rate": 3.877840535200127e-05, "loss": 0.0071, "step": 11870 }, { "epoch": 23.524752475247524, "grad_norm": 0.1817755252122879, "learning_rate": 3.869785625672397e-05, "loss": 0.0066, "step": 11880 }, { "epoch": 23.544554455445546, "grad_norm": 0.16960617899894714, "learning_rate": 3.8617338061076094e-05, "loss": 0.0088, "step": 11890 }, { "epoch": 23.564356435643564, "grad_norm": 0.099460668861866, "learning_rate": 3.853685098519132e-05, "loss": 0.0069, "step": 11900 }, { "epoch": 23.584158415841586, "grad_norm": 0.13255485892295837, "learning_rate": 3.845639524911823e-05, "loss": 0.0061, "step": 11910 }, { "epoch": 23.603960396039604, "grad_norm": 0.16225755214691162, "learning_rate": 3.837597107281974e-05, "loss": 0.0065, "step": 11920 }, { "epoch": 23.623762376237625, "grad_norm": 0.1612943559885025, "learning_rate": 3.829557867617247e-05, "loss": 0.0067, "step": 11930 }, { "epoch": 23.643564356435643, "grad_norm": 0.16609947383403778, "learning_rate": 3.821521827896618e-05, "loss": 0.0065, "step": 11940 }, { "epoch": 23.663366336633665, "grad_norm": 0.15718679130077362, "learning_rate": 3.81348901009031e-05, "loss": 0.0092, "step": 11950 }, { "epoch": 23.683168316831683, "grad_norm": 0.2105565220117569, "learning_rate": 3.805459436159741e-05, "loss": 0.0075, "step": 11960 }, { "epoch": 23.702970297029704, "grad_norm": 0.1899179071187973, "learning_rate": 3.797433128057461e-05, "loss": 0.0077, "step": 11970 }, { "epoch": 23.722772277227723, "grad_norm": 0.11070194840431213, "learning_rate": 3.789410107727089e-05, "loss": 0.0069, "step": 11980 }, { "epoch": 23.742574257425744, "grad_norm": 0.1368037313222885, "learning_rate": 3.781390397103257e-05, "loss": 0.0074, "step": 11990 }, { "epoch": 23.762376237623762, "grad_norm": 0.1745167225599289, "learning_rate": 3.7733740181115455e-05, "loss": 0.0071, "step": 12000 }, { "epoch": 23.782178217821784, "grad_norm": 0.15016211569309235, "learning_rate": 3.7653609926684306e-05, "loss": 0.0068, "step": 12010 }, { "epoch": 23.801980198019802, "grad_norm": 0.1620381772518158, "learning_rate": 3.757351342681217e-05, "loss": 0.0074, "step": 12020 }, { "epoch": 23.821782178217823, "grad_norm": 0.14677686989307404, "learning_rate": 3.749345090047982e-05, "loss": 0.0075, "step": 12030 }, { "epoch": 23.84158415841584, "grad_norm": 0.15476076304912567, "learning_rate": 3.741342256657515e-05, "loss": 0.0081, "step": 12040 }, { "epoch": 23.861386138613863, "grad_norm": 0.13465818762779236, "learning_rate": 3.7333428643892567e-05, "loss": 0.0062, "step": 12050 }, { "epoch": 23.88118811881188, "grad_norm": 0.15191146731376648, "learning_rate": 3.725346935113239e-05, "loss": 0.0062, "step": 12060 }, { "epoch": 23.900990099009903, "grad_norm": 0.1487332135438919, "learning_rate": 3.717354490690029e-05, "loss": 0.0065, "step": 12070 }, { "epoch": 23.92079207920792, "grad_norm": 0.21045652031898499, "learning_rate": 3.709365552970664e-05, "loss": 0.0064, "step": 12080 }, { "epoch": 23.94059405940594, "grad_norm": 0.18648217618465424, "learning_rate": 3.7013801437965945e-05, "loss": 0.006, "step": 12090 }, { "epoch": 23.96039603960396, "grad_norm": 0.28906023502349854, "learning_rate": 3.693398284999623e-05, "loss": 0.0071, "step": 12100 }, { "epoch": 23.980198019801982, "grad_norm": 0.17742140591144562, "learning_rate": 3.6854199984018484e-05, "loss": 0.006, "step": 12110 }, { "epoch": 24.0, "grad_norm": 0.22536055743694305, "learning_rate": 3.677445305815601e-05, "loss": 0.006, "step": 12120 }, { "epoch": 24.019801980198018, "grad_norm": 0.17006538808345795, "learning_rate": 3.669474229043387e-05, "loss": 0.0065, "step": 12130 }, { "epoch": 24.03960396039604, "grad_norm": 0.17406661808490753, "learning_rate": 3.6615067898778235e-05, "loss": 0.0059, "step": 12140 }, { "epoch": 24.059405940594058, "grad_norm": 0.2687278985977173, "learning_rate": 3.6535430101015866e-05, "loss": 0.0066, "step": 12150 }, { "epoch": 24.07920792079208, "grad_norm": 0.23536403477191925, "learning_rate": 3.645582911487345e-05, "loss": 0.0076, "step": 12160 }, { "epoch": 24.099009900990097, "grad_norm": 0.1743956357240677, "learning_rate": 3.637626515797706e-05, "loss": 0.0067, "step": 12170 }, { "epoch": 24.11881188118812, "grad_norm": 0.22201132774353027, "learning_rate": 3.629673844785152e-05, "loss": 0.0064, "step": 12180 }, { "epoch": 24.138613861386137, "grad_norm": 0.12168415635824203, "learning_rate": 3.621724920191979e-05, "loss": 0.0064, "step": 12190 }, { "epoch": 24.15841584158416, "grad_norm": 0.14426793158054352, "learning_rate": 3.6137797637502444e-05, "loss": 0.0063, "step": 12200 }, { "epoch": 24.178217821782177, "grad_norm": 0.1569751352071762, "learning_rate": 3.6058383971817035e-05, "loss": 0.0055, "step": 12210 }, { "epoch": 24.198019801980198, "grad_norm": 0.31936243176460266, "learning_rate": 3.59790084219775e-05, "loss": 0.0077, "step": 12220 }, { "epoch": 24.217821782178216, "grad_norm": 0.12240996956825256, "learning_rate": 3.589967120499353e-05, "loss": 0.0068, "step": 12230 }, { "epoch": 24.237623762376238, "grad_norm": 0.4123775362968445, "learning_rate": 3.5820372537770075e-05, "loss": 0.006, "step": 12240 }, { "epoch": 24.257425742574256, "grad_norm": 0.13621363043785095, "learning_rate": 3.5741112637106655e-05, "loss": 0.0058, "step": 12250 }, { "epoch": 24.277227722772277, "grad_norm": 0.11612989008426666, "learning_rate": 3.5661891719696804e-05, "loss": 0.0066, "step": 12260 }, { "epoch": 24.297029702970296, "grad_norm": 0.11658249795436859, "learning_rate": 3.5582710002127504e-05, "loss": 0.0054, "step": 12270 }, { "epoch": 24.316831683168317, "grad_norm": 0.12684740126132965, "learning_rate": 3.550356770087853e-05, "loss": 0.0054, "step": 12280 }, { "epoch": 24.336633663366335, "grad_norm": 0.16334962844848633, "learning_rate": 3.5424465032321914e-05, "loss": 0.0065, "step": 12290 }, { "epoch": 24.356435643564357, "grad_norm": 0.16481266915798187, "learning_rate": 3.5345402212721335e-05, "loss": 0.0067, "step": 12300 }, { "epoch": 24.376237623762375, "grad_norm": 0.38881030678749084, "learning_rate": 3.526637945823152e-05, "loss": 0.0065, "step": 12310 }, { "epoch": 24.396039603960396, "grad_norm": 0.1759631186723709, "learning_rate": 3.518739698489767e-05, "loss": 0.0075, "step": 12320 }, { "epoch": 24.415841584158414, "grad_norm": 0.3127826750278473, "learning_rate": 3.510845500865485e-05, "loss": 0.0073, "step": 12330 }, { "epoch": 24.435643564356436, "grad_norm": 0.1930539608001709, "learning_rate": 3.502955374532739e-05, "loss": 0.007, "step": 12340 }, { "epoch": 24.455445544554454, "grad_norm": 0.17123377323150635, "learning_rate": 3.495069341062836e-05, "loss": 0.0058, "step": 12350 }, { "epoch": 24.475247524752476, "grad_norm": 0.2951042652130127, "learning_rate": 3.4871874220158896e-05, "loss": 0.0065, "step": 12360 }, { "epoch": 24.495049504950494, "grad_norm": 0.18910938501358032, "learning_rate": 3.479309638940762e-05, "loss": 0.0077, "step": 12370 }, { "epoch": 24.514851485148515, "grad_norm": 0.13780899345874786, "learning_rate": 3.4714360133750146e-05, "loss": 0.0061, "step": 12380 }, { "epoch": 24.534653465346533, "grad_norm": 0.14857962727546692, "learning_rate": 3.463566566844839e-05, "loss": 0.0053, "step": 12390 }, { "epoch": 24.554455445544555, "grad_norm": 0.1745629757642746, "learning_rate": 3.4557013208650016e-05, "loss": 0.0055, "step": 12400 }, { "epoch": 24.574257425742573, "grad_norm": 0.15617060661315918, "learning_rate": 3.4478402969387857e-05, "loss": 0.0058, "step": 12410 }, { "epoch": 24.594059405940595, "grad_norm": 0.19306699931621552, "learning_rate": 3.4399835165579266e-05, "loss": 0.0071, "step": 12420 }, { "epoch": 24.613861386138613, "grad_norm": 0.15403521060943604, "learning_rate": 3.4321310012025645e-05, "loss": 0.0054, "step": 12430 }, { "epoch": 24.633663366336634, "grad_norm": 0.15768331289291382, "learning_rate": 3.424282772341176e-05, "loss": 0.006, "step": 12440 }, { "epoch": 24.653465346534652, "grad_norm": 0.21602573990821838, "learning_rate": 3.416438851430519e-05, "loss": 0.0067, "step": 12450 }, { "epoch": 24.673267326732674, "grad_norm": 0.18981517851352692, "learning_rate": 3.408599259915577e-05, "loss": 0.0056, "step": 12460 }, { "epoch": 24.693069306930692, "grad_norm": 0.16132952272891998, "learning_rate": 3.400764019229487e-05, "loss": 0.0053, "step": 12470 }, { "epoch": 24.712871287128714, "grad_norm": 1.149375319480896, "learning_rate": 3.3929331507935035e-05, "loss": 0.0048, "step": 12480 }, { "epoch": 24.73267326732673, "grad_norm": 0.16272978484630585, "learning_rate": 3.3851066760169196e-05, "loss": 0.0074, "step": 12490 }, { "epoch": 24.752475247524753, "grad_norm": 0.14534766972064972, "learning_rate": 3.377284616297021e-05, "loss": 0.0067, "step": 12500 }, { "epoch": 24.77227722772277, "grad_norm": 0.1345452219247818, "learning_rate": 3.3694669930190166e-05, "loss": 0.0083, "step": 12510 }, { "epoch": 24.792079207920793, "grad_norm": 0.10287529230117798, "learning_rate": 3.36165382755599e-05, "loss": 0.0054, "step": 12520 }, { "epoch": 24.81188118811881, "grad_norm": 0.1302444040775299, "learning_rate": 3.35384514126884e-05, "loss": 0.0073, "step": 12530 }, { "epoch": 24.831683168316832, "grad_norm": 0.11365585774183273, "learning_rate": 3.3460409555062154e-05, "loss": 0.0057, "step": 12540 }, { "epoch": 24.85148514851485, "grad_norm": 0.1700490266084671, "learning_rate": 3.3382412916044645e-05, "loss": 0.0053, "step": 12550 }, { "epoch": 24.871287128712872, "grad_norm": 0.16629038751125336, "learning_rate": 3.330446170887566e-05, "loss": 0.006, "step": 12560 }, { "epoch": 24.89108910891089, "grad_norm": 0.1661778837442398, "learning_rate": 3.3226556146670834e-05, "loss": 0.0058, "step": 12570 }, { "epoch": 24.91089108910891, "grad_norm": 0.21004174649715424, "learning_rate": 3.314869644242102e-05, "loss": 0.0075, "step": 12580 }, { "epoch": 24.93069306930693, "grad_norm": 0.1742498129606247, "learning_rate": 3.3070882808991674e-05, "loss": 0.0062, "step": 12590 }, { "epoch": 24.95049504950495, "grad_norm": 0.18244266510009766, "learning_rate": 3.2993115459122305e-05, "loss": 0.0081, "step": 12600 }, { "epoch": 24.97029702970297, "grad_norm": 0.22398591041564941, "learning_rate": 3.2915394605425835e-05, "loss": 0.0064, "step": 12610 }, { "epoch": 24.99009900990099, "grad_norm": 0.13917437195777893, "learning_rate": 3.283772046038816e-05, "loss": 0.0055, "step": 12620 }, { "epoch": 25.00990099009901, "grad_norm": 0.2566840350627899, "learning_rate": 3.276009323636739e-05, "loss": 0.0053, "step": 12630 }, { "epoch": 25.02970297029703, "grad_norm": 0.1794641762971878, "learning_rate": 3.268251314559344e-05, "loss": 0.0057, "step": 12640 }, { "epoch": 25.04950495049505, "grad_norm": 0.13524918258190155, "learning_rate": 3.2604980400167254e-05, "loss": 0.0067, "step": 12650 }, { "epoch": 25.06930693069307, "grad_norm": 0.15082649886608124, "learning_rate": 3.252749521206042e-05, "loss": 0.0067, "step": 12660 }, { "epoch": 25.08910891089109, "grad_norm": 0.2124025821685791, "learning_rate": 3.2450057793114494e-05, "loss": 0.0054, "step": 12670 }, { "epoch": 25.10891089108911, "grad_norm": 0.12361261993646622, "learning_rate": 3.2372668355040435e-05, "loss": 0.0061, "step": 12680 }, { "epoch": 25.128712871287128, "grad_norm": 0.10879666358232498, "learning_rate": 3.2295327109418005e-05, "loss": 0.0062, "step": 12690 }, { "epoch": 25.14851485148515, "grad_norm": 0.12398306280374527, "learning_rate": 3.221803426769518e-05, "loss": 0.0061, "step": 12700 }, { "epoch": 25.168316831683168, "grad_norm": 0.11016777157783508, "learning_rate": 3.214079004118768e-05, "loss": 0.0063, "step": 12710 }, { "epoch": 25.18811881188119, "grad_norm": 0.1183968335390091, "learning_rate": 3.2063594641078234e-05, "loss": 0.006, "step": 12720 }, { "epoch": 25.207920792079207, "grad_norm": 0.11351604759693146, "learning_rate": 3.198644827841616e-05, "loss": 0.0054, "step": 12730 }, { "epoch": 25.22772277227723, "grad_norm": 0.15915505588054657, "learning_rate": 3.1909351164116654e-05, "loss": 0.0063, "step": 12740 }, { "epoch": 25.247524752475247, "grad_norm": 0.17053726315498352, "learning_rate": 3.183230350896026e-05, "loss": 0.0074, "step": 12750 }, { "epoch": 25.26732673267327, "grad_norm": 0.1899987757205963, "learning_rate": 3.1755305523592337e-05, "loss": 0.0056, "step": 12760 }, { "epoch": 25.287128712871286, "grad_norm": 0.13941740989685059, "learning_rate": 3.167835741852245e-05, "loss": 0.0047, "step": 12770 }, { "epoch": 25.306930693069308, "grad_norm": 0.17486196756362915, "learning_rate": 3.160145940412378e-05, "loss": 0.0073, "step": 12780 }, { "epoch": 25.326732673267326, "grad_norm": 0.17095574736595154, "learning_rate": 3.1524611690632545e-05, "loss": 0.0071, "step": 12790 }, { "epoch": 25.346534653465348, "grad_norm": 0.20573543012142181, "learning_rate": 3.144781448814746e-05, "loss": 0.0058, "step": 12800 }, { "epoch": 25.366336633663366, "grad_norm": 0.1404886096715927, "learning_rate": 3.1371068006629145e-05, "loss": 0.0064, "step": 12810 }, { "epoch": 25.386138613861387, "grad_norm": 0.14523302018642426, "learning_rate": 3.129437245589956e-05, "loss": 0.0056, "step": 12820 }, { "epoch": 25.405940594059405, "grad_norm": 0.2074955701828003, "learning_rate": 3.121772804564143e-05, "loss": 0.0053, "step": 12830 }, { "epoch": 25.425742574257427, "grad_norm": 0.18090908229351044, "learning_rate": 3.11411349853976e-05, "loss": 0.0064, "step": 12840 }, { "epoch": 25.445544554455445, "grad_norm": 0.12879477441310883, "learning_rate": 3.10645934845706e-05, "loss": 0.0058, "step": 12850 }, { "epoch": 25.465346534653467, "grad_norm": 0.09176535904407501, "learning_rate": 3.098810375242196e-05, "loss": 0.0046, "step": 12860 }, { "epoch": 25.485148514851485, "grad_norm": 0.15078312158584595, "learning_rate": 3.0911665998071704e-05, "loss": 0.0065, "step": 12870 }, { "epoch": 25.504950495049506, "grad_norm": 0.22962869703769684, "learning_rate": 3.083528043049774e-05, "loss": 0.006, "step": 12880 }, { "epoch": 25.524752475247524, "grad_norm": 0.148224875330925, "learning_rate": 3.0758947258535255e-05, "loss": 0.0062, "step": 12890 }, { "epoch": 25.544554455445546, "grad_norm": 0.17134210467338562, "learning_rate": 3.068266669087625e-05, "loss": 0.0054, "step": 12900 }, { "epoch": 25.564356435643564, "grad_norm": 0.1475493162870407, "learning_rate": 3.060643893606887e-05, "loss": 0.0049, "step": 12910 }, { "epoch": 25.584158415841586, "grad_norm": 0.11003249883651733, "learning_rate": 3.053026420251693e-05, "loss": 0.0062, "step": 12920 }, { "epoch": 25.603960396039604, "grad_norm": 0.20548729598522186, "learning_rate": 3.0454142698479183e-05, "loss": 0.0045, "step": 12930 }, { "epoch": 25.623762376237625, "grad_norm": 0.138779878616333, "learning_rate": 3.0378074632068954e-05, "loss": 0.0051, "step": 12940 }, { "epoch": 25.643564356435643, "grad_norm": 0.11109545081853867, "learning_rate": 3.0302060211253408e-05, "loss": 0.0067, "step": 12950 }, { "epoch": 25.663366336633665, "grad_norm": 0.12814074754714966, "learning_rate": 3.0226099643853073e-05, "loss": 0.0053, "step": 12960 }, { "epoch": 25.683168316831683, "grad_norm": 0.16520704329013824, "learning_rate": 3.0150193137541283e-05, "loss": 0.0057, "step": 12970 }, { "epoch": 25.702970297029704, "grad_norm": 0.10198646783828735, "learning_rate": 3.0074340899843467e-05, "loss": 0.0056, "step": 12980 }, { "epoch": 25.722772277227723, "grad_norm": 0.16061265766620636, "learning_rate": 2.999854313813677e-05, "loss": 0.0044, "step": 12990 }, { "epoch": 25.742574257425744, "grad_norm": 0.13255640864372253, "learning_rate": 2.9922800059649382e-05, "loss": 0.0058, "step": 13000 }, { "epoch": 25.762376237623762, "grad_norm": 0.1827789843082428, "learning_rate": 2.9847111871459976e-05, "loss": 0.0053, "step": 13010 }, { "epoch": 25.782178217821784, "grad_norm": 0.11735762655735016, "learning_rate": 2.977147878049721e-05, "loss": 0.0058, "step": 13020 }, { "epoch": 25.801980198019802, "grad_norm": 0.1276397556066513, "learning_rate": 2.9695900993539006e-05, "loss": 0.0053, "step": 13030 }, { "epoch": 25.821782178217823, "grad_norm": 0.1460658460855484, "learning_rate": 2.9620378717212183e-05, "loss": 0.0062, "step": 13040 }, { "epoch": 25.84158415841584, "grad_norm": 0.11331414431333542, "learning_rate": 2.9544912157991745e-05, "loss": 0.0049, "step": 13050 }, { "epoch": 25.861386138613863, "grad_norm": 0.22658425569534302, "learning_rate": 2.9469501522200405e-05, "loss": 0.0058, "step": 13060 }, { "epoch": 25.88118811881188, "grad_norm": 0.13156534731388092, "learning_rate": 2.9394147016007946e-05, "loss": 0.006, "step": 13070 }, { "epoch": 25.900990099009903, "grad_norm": 0.18851321935653687, "learning_rate": 2.9318848845430702e-05, "loss": 0.0076, "step": 13080 }, { "epoch": 25.92079207920792, "grad_norm": 0.1291010081768036, "learning_rate": 2.9243607216331013e-05, "loss": 0.0054, "step": 13090 }, { "epoch": 25.94059405940594, "grad_norm": 0.1467839628458023, "learning_rate": 2.916842233441661e-05, "loss": 0.0057, "step": 13100 }, { "epoch": 25.96039603960396, "grad_norm": 0.18030935525894165, "learning_rate": 2.90932944052401e-05, "loss": 0.0062, "step": 13110 }, { "epoch": 25.980198019801982, "grad_norm": 0.1592559963464737, "learning_rate": 2.9018223634198354e-05, "loss": 0.0051, "step": 13120 }, { "epoch": 26.0, "grad_norm": 0.5750719308853149, "learning_rate": 2.8943210226532025e-05, "loss": 0.0057, "step": 13130 }, { "epoch": 26.019801980198018, "grad_norm": 0.11389387398958206, "learning_rate": 2.8868254387324857e-05, "loss": 0.0059, "step": 13140 }, { "epoch": 26.03960396039604, "grad_norm": 0.16423340141773224, "learning_rate": 2.8793356321503306e-05, "loss": 0.0053, "step": 13150 }, { "epoch": 26.059405940594058, "grad_norm": 0.20214618742465973, "learning_rate": 2.87185162338358e-05, "loss": 0.0057, "step": 13160 }, { "epoch": 26.07920792079208, "grad_norm": 0.1513577550649643, "learning_rate": 2.8643734328932253e-05, "loss": 0.0056, "step": 13170 }, { "epoch": 26.099009900990097, "grad_norm": 0.09861622750759125, "learning_rate": 2.856901081124359e-05, "loss": 0.0055, "step": 13180 }, { "epoch": 26.11881188118812, "grad_norm": 0.15588930249214172, "learning_rate": 2.8494345885061002e-05, "loss": 0.0049, "step": 13190 }, { "epoch": 26.138613861386137, "grad_norm": 0.12460043281316757, "learning_rate": 2.8419739754515616e-05, "loss": 0.0053, "step": 13200 }, { "epoch": 26.15841584158416, "grad_norm": 0.1762942373752594, "learning_rate": 2.8345192623577666e-05, "loss": 0.0059, "step": 13210 }, { "epoch": 26.178217821782177, "grad_norm": 0.24057206511497498, "learning_rate": 2.8270704696056193e-05, "loss": 0.0053, "step": 13220 }, { "epoch": 26.198019801980198, "grad_norm": 0.11624093353748322, "learning_rate": 2.8196276175598367e-05, "loss": 0.0057, "step": 13230 }, { "epoch": 26.217821782178216, "grad_norm": 0.4302012026309967, "learning_rate": 2.8121907265688884e-05, "loss": 0.0052, "step": 13240 }, { "epoch": 26.237623762376238, "grad_norm": 0.16424134373664856, "learning_rate": 2.804759816964957e-05, "loss": 0.0073, "step": 13250 }, { "epoch": 26.257425742574256, "grad_norm": 0.12663665413856506, "learning_rate": 2.797334909063857e-05, "loss": 0.0047, "step": 13260 }, { "epoch": 26.277227722772277, "grad_norm": 0.1361866593360901, "learning_rate": 2.7899160231650056e-05, "loss": 0.0046, "step": 13270 }, { "epoch": 26.297029702970296, "grad_norm": 0.12450657039880753, "learning_rate": 2.7825031795513585e-05, "loss": 0.0056, "step": 13280 }, { "epoch": 26.316831683168317, "grad_norm": 0.17182613909244537, "learning_rate": 2.775096398489341e-05, "loss": 0.0059, "step": 13290 }, { "epoch": 26.336633663366335, "grad_norm": 0.10902509838342667, "learning_rate": 2.7676957002288163e-05, "loss": 0.0057, "step": 13300 }, { "epoch": 26.356435643564357, "grad_norm": 0.11557909846305847, "learning_rate": 2.760301105003003e-05, "loss": 0.0044, "step": 13310 }, { "epoch": 26.376237623762375, "grad_norm": 0.14303244650363922, "learning_rate": 2.752912633028446e-05, "loss": 0.0063, "step": 13320 }, { "epoch": 26.396039603960396, "grad_norm": 0.1332441121339798, "learning_rate": 2.7455303045049474e-05, "loss": 0.0052, "step": 13330 }, { "epoch": 26.415841584158414, "grad_norm": 0.13410109281539917, "learning_rate": 2.7381541396155098e-05, "loss": 0.0054, "step": 13340 }, { "epoch": 26.435643564356436, "grad_norm": 0.11536426842212677, "learning_rate": 2.730784158526286e-05, "loss": 0.0087, "step": 13350 }, { "epoch": 26.455445544554454, "grad_norm": 0.11024891585111618, "learning_rate": 2.723420381386521e-05, "loss": 0.0051, "step": 13360 }, { "epoch": 26.475247524752476, "grad_norm": 0.1726273000240326, "learning_rate": 2.7160628283285018e-05, "loss": 0.0064, "step": 13370 }, { "epoch": 26.495049504950494, "grad_norm": 0.2093508094549179, "learning_rate": 2.7087115194675007e-05, "loss": 0.0063, "step": 13380 }, { "epoch": 26.514851485148515, "grad_norm": 0.15597717463970184, "learning_rate": 2.701366474901712e-05, "loss": 0.0063, "step": 13390 }, { "epoch": 26.534653465346533, "grad_norm": 0.08909502625465393, "learning_rate": 2.6940277147122085e-05, "loss": 0.005, "step": 13400 }, { "epoch": 26.554455445544555, "grad_norm": 0.09270431846380234, "learning_rate": 2.686695258962878e-05, "loss": 0.0049, "step": 13410 }, { "epoch": 26.574257425742573, "grad_norm": 0.11993030458688736, "learning_rate": 2.679369127700375e-05, "loss": 0.0068, "step": 13420 }, { "epoch": 26.594059405940595, "grad_norm": 0.13107821345329285, "learning_rate": 2.672049340954067e-05, "loss": 0.0059, "step": 13430 }, { "epoch": 26.613861386138613, "grad_norm": 0.199144646525383, "learning_rate": 2.6647359187359676e-05, "loss": 0.0057, "step": 13440 }, { "epoch": 26.633663366336634, "grad_norm": 0.18967321515083313, "learning_rate": 2.6574288810406946e-05, "loss": 0.0066, "step": 13450 }, { "epoch": 26.653465346534652, "grad_norm": 0.17040850222110748, "learning_rate": 2.6501282478454083e-05, "loss": 0.0068, "step": 13460 }, { "epoch": 26.673267326732674, "grad_norm": 0.10494083166122437, "learning_rate": 2.6428340391097618e-05, "loss": 0.005, "step": 13470 }, { "epoch": 26.693069306930692, "grad_norm": 0.1414257287979126, "learning_rate": 2.6355462747758485e-05, "loss": 0.006, "step": 13480 }, { "epoch": 26.712871287128714, "grad_norm": 0.14809712767601013, "learning_rate": 2.6282649747681304e-05, "loss": 0.0055, "step": 13490 }, { "epoch": 26.73267326732673, "grad_norm": 0.1544862538576126, "learning_rate": 2.620990158993406e-05, "loss": 0.0065, "step": 13500 }, { "epoch": 26.752475247524753, "grad_norm": 0.15681050717830658, "learning_rate": 2.6137218473407477e-05, "loss": 0.0052, "step": 13510 }, { "epoch": 26.77227722772277, "grad_norm": 0.13483715057373047, "learning_rate": 2.606460059681436e-05, "loss": 0.0066, "step": 13520 }, { "epoch": 26.792079207920793, "grad_norm": 0.1328570395708084, "learning_rate": 2.599204815868928e-05, "loss": 0.0057, "step": 13530 }, { "epoch": 26.81188118811881, "grad_norm": 0.13639812171459198, "learning_rate": 2.5919561357387756e-05, "loss": 0.0049, "step": 13540 }, { "epoch": 26.831683168316832, "grad_norm": 0.10641159862279892, "learning_rate": 2.5847140391085972e-05, "loss": 0.0046, "step": 13550 }, { "epoch": 26.85148514851485, "grad_norm": 0.1418219953775406, "learning_rate": 2.5774785457780103e-05, "loss": 0.0057, "step": 13560 }, { "epoch": 26.871287128712872, "grad_norm": 0.09620150178670883, "learning_rate": 2.5702496755285753e-05, "loss": 0.0063, "step": 13570 }, { "epoch": 26.89108910891089, "grad_norm": 0.1394530087709427, "learning_rate": 2.5630274481237483e-05, "loss": 0.0062, "step": 13580 }, { "epoch": 26.91089108910891, "grad_norm": 0.13036321103572845, "learning_rate": 2.5558118833088197e-05, "loss": 0.0052, "step": 13590 }, { "epoch": 26.93069306930693, "grad_norm": 0.13525576889514923, "learning_rate": 2.548603000810872e-05, "loss": 0.0057, "step": 13600 }, { "epoch": 26.95049504950495, "grad_norm": 0.11534959822893143, "learning_rate": 2.5414008203387152e-05, "loss": 0.0045, "step": 13610 }, { "epoch": 26.97029702970297, "grad_norm": 0.1215607225894928, "learning_rate": 2.534205361582834e-05, "loss": 0.0076, "step": 13620 }, { "epoch": 26.99009900990099, "grad_norm": 0.13764551281929016, "learning_rate": 2.527016644215338e-05, "loss": 0.0047, "step": 13630 }, { "epoch": 27.00990099009901, "grad_norm": 0.16493026912212372, "learning_rate": 2.519834687889905e-05, "loss": 0.0045, "step": 13640 }, { "epoch": 27.02970297029703, "grad_norm": 0.14375363290309906, "learning_rate": 2.5126595122417295e-05, "loss": 0.005, "step": 13650 }, { "epoch": 27.04950495049505, "grad_norm": 0.20669151842594147, "learning_rate": 2.5054911368874713e-05, "loss": 0.0055, "step": 13660 }, { "epoch": 27.06930693069307, "grad_norm": 0.19104622304439545, "learning_rate": 2.4983295814251916e-05, "loss": 0.0059, "step": 13670 }, { "epoch": 27.08910891089109, "grad_norm": 0.156061053276062, "learning_rate": 2.4911748654343105e-05, "loss": 0.0055, "step": 13680 }, { "epoch": 27.10891089108911, "grad_norm": 0.14646807312965393, "learning_rate": 2.4840270084755463e-05, "loss": 0.0051, "step": 13690 }, { "epoch": 27.128712871287128, "grad_norm": 0.10923798382282257, "learning_rate": 2.4768860300908685e-05, "loss": 0.0052, "step": 13700 }, { "epoch": 27.14851485148515, "grad_norm": 0.16276030242443085, "learning_rate": 2.469751949803443e-05, "loss": 0.0063, "step": 13710 }, { "epoch": 27.168316831683168, "grad_norm": 0.12101855129003525, "learning_rate": 2.4626247871175666e-05, "loss": 0.0049, "step": 13720 }, { "epoch": 27.18811881188119, "grad_norm": 0.14663085341453552, "learning_rate": 2.4555045615186346e-05, "loss": 0.0058, "step": 13730 }, { "epoch": 27.207920792079207, "grad_norm": 0.12020029872655869, "learning_rate": 2.4483912924730677e-05, "loss": 0.0058, "step": 13740 }, { "epoch": 27.22772277227723, "grad_norm": 0.12942253053188324, "learning_rate": 2.4412849994282742e-05, "loss": 0.0051, "step": 13750 }, { "epoch": 27.247524752475247, "grad_norm": 0.3446511924266815, "learning_rate": 2.434185701812592e-05, "loss": 0.007, "step": 13760 }, { "epoch": 27.26732673267327, "grad_norm": 0.09455665946006775, "learning_rate": 2.4270934190352218e-05, "loss": 0.0057, "step": 13770 }, { "epoch": 27.287128712871286, "grad_norm": 0.11263646930456161, "learning_rate": 2.4200081704861998e-05, "loss": 0.0052, "step": 13780 }, { "epoch": 27.306930693069308, "grad_norm": 0.14906109869480133, "learning_rate": 2.412929975536321e-05, "loss": 0.0047, "step": 13790 }, { "epoch": 27.326732673267326, "grad_norm": 0.12720923125743866, "learning_rate": 2.4058588535371017e-05, "loss": 0.0044, "step": 13800 }, { "epoch": 27.346534653465348, "grad_norm": 0.15930083394050598, "learning_rate": 2.3987948238207243e-05, "loss": 0.0055, "step": 13810 }, { "epoch": 27.366336633663366, "grad_norm": 0.1044587716460228, "learning_rate": 2.3917379056999678e-05, "loss": 0.0046, "step": 13820 }, { "epoch": 27.386138613861387, "grad_norm": 0.172451913356781, "learning_rate": 2.3846881184681824e-05, "loss": 0.0056, "step": 13830 }, { "epoch": 27.405940594059405, "grad_norm": 0.11896778643131256, "learning_rate": 2.377645481399214e-05, "loss": 0.0047, "step": 13840 }, { "epoch": 27.425742574257427, "grad_norm": 0.15571995079517365, "learning_rate": 2.3706100137473667e-05, "loss": 0.0064, "step": 13850 }, { "epoch": 27.445544554455445, "grad_norm": 0.13487105071544647, "learning_rate": 2.3635817347473394e-05, "loss": 0.0052, "step": 13860 }, { "epoch": 27.465346534653467, "grad_norm": 0.15631882846355438, "learning_rate": 2.3565606636141757e-05, "loss": 0.0046, "step": 13870 }, { "epoch": 27.485148514851485, "grad_norm": 0.14464789628982544, "learning_rate": 2.3495468195432203e-05, "loss": 0.0047, "step": 13880 }, { "epoch": 27.504950495049506, "grad_norm": 0.164018452167511, "learning_rate": 2.3425402217100507e-05, "loss": 0.0052, "step": 13890 }, { "epoch": 27.524752475247524, "grad_norm": 0.10736750811338425, "learning_rate": 2.3355408892704424e-05, "loss": 0.0052, "step": 13900 }, { "epoch": 27.544554455445546, "grad_norm": 0.11932003498077393, "learning_rate": 2.3285488413603003e-05, "loss": 0.0044, "step": 13910 }, { "epoch": 27.564356435643564, "grad_norm": 0.14560720324516296, "learning_rate": 2.321564097095615e-05, "loss": 0.0047, "step": 13920 }, { "epoch": 27.584158415841586, "grad_norm": 0.09830281138420105, "learning_rate": 2.3145866755724142e-05, "loss": 0.0054, "step": 13930 }, { "epoch": 27.603960396039604, "grad_norm": 0.15827935934066772, "learning_rate": 2.307616595866699e-05, "loss": 0.0049, "step": 13940 }, { "epoch": 27.623762376237625, "grad_norm": 0.10405430197715759, "learning_rate": 2.3006538770344032e-05, "loss": 0.0051, "step": 13950 }, { "epoch": 27.643564356435643, "grad_norm": 0.2073926478624344, "learning_rate": 2.293698538111334e-05, "loss": 0.0044, "step": 13960 }, { "epoch": 27.663366336633665, "grad_norm": 0.09724073112010956, "learning_rate": 2.28675059811312e-05, "loss": 0.0047, "step": 13970 }, { "epoch": 27.683168316831683, "grad_norm": 0.11926750838756561, "learning_rate": 2.279810076035167e-05, "loss": 0.0052, "step": 13980 }, { "epoch": 27.702970297029704, "grad_norm": 0.12129216641187668, "learning_rate": 2.272876990852596e-05, "loss": 0.0061, "step": 13990 }, { "epoch": 27.722772277227723, "grad_norm": 0.12027972936630249, "learning_rate": 2.265951361520195e-05, "loss": 0.0056, "step": 14000 }, { "epoch": 27.742574257425744, "grad_norm": 0.24053533375263214, "learning_rate": 2.2590332069723748e-05, "loss": 0.0048, "step": 14010 }, { "epoch": 27.762376237623762, "grad_norm": 0.1423746645450592, "learning_rate": 2.2521225461231004e-05, "loss": 0.0045, "step": 14020 }, { "epoch": 27.782178217821784, "grad_norm": 0.2438199669122696, "learning_rate": 2.2452193978658597e-05, "loss": 0.0043, "step": 14030 }, { "epoch": 27.801980198019802, "grad_norm": 0.12863273918628693, "learning_rate": 2.238323781073594e-05, "loss": 0.0069, "step": 14040 }, { "epoch": 27.821782178217823, "grad_norm": 0.09665673226118088, "learning_rate": 2.2314357145986552e-05, "loss": 0.0051, "step": 14050 }, { "epoch": 27.84158415841584, "grad_norm": 0.13755489885807037, "learning_rate": 2.224555217272757e-05, "loss": 0.0044, "step": 14060 }, { "epoch": 27.861386138613863, "grad_norm": 0.23338203132152557, "learning_rate": 2.2176823079069127e-05, "loss": 0.005, "step": 14070 }, { "epoch": 27.88118811881188, "grad_norm": 0.13428297638893127, "learning_rate": 2.210817005291398e-05, "loss": 0.0056, "step": 14080 }, { "epoch": 27.900990099009903, "grad_norm": 0.22653833031654358, "learning_rate": 2.203959328195686e-05, "loss": 0.0045, "step": 14090 }, { "epoch": 27.92079207920792, "grad_norm": 0.1050722673535347, "learning_rate": 2.1971092953684026e-05, "loss": 0.0043, "step": 14100 }, { "epoch": 27.94059405940594, "grad_norm": 0.15742871165275574, "learning_rate": 2.1902669255372788e-05, "loss": 0.0045, "step": 14110 }, { "epoch": 27.96039603960396, "grad_norm": 0.12327434867620468, "learning_rate": 2.1834322374090897e-05, "loss": 0.004, "step": 14120 }, { "epoch": 27.980198019801982, "grad_norm": 0.12029647827148438, "learning_rate": 2.1766052496696153e-05, "loss": 0.0052, "step": 14130 }, { "epoch": 28.0, "grad_norm": 0.3019558787345886, "learning_rate": 2.169785980983577e-05, "loss": 0.0053, "step": 14140 }, { "epoch": 28.019801980198018, "grad_norm": 0.14532847702503204, "learning_rate": 2.162974449994593e-05, "loss": 0.0057, "step": 14150 }, { "epoch": 28.03960396039604, "grad_norm": 0.1401582658290863, "learning_rate": 2.1561706753251337e-05, "loss": 0.005, "step": 14160 }, { "epoch": 28.059405940594058, "grad_norm": 0.18961220979690552, "learning_rate": 2.1493746755764544e-05, "loss": 0.0054, "step": 14170 }, { "epoch": 28.07920792079208, "grad_norm": 0.08711325377225876, "learning_rate": 2.1425864693285635e-05, "loss": 0.0041, "step": 14180 }, { "epoch": 28.099009900990097, "grad_norm": 0.12038716673851013, "learning_rate": 2.1358060751401547e-05, "loss": 0.0062, "step": 14190 }, { "epoch": 28.11881188118812, "grad_norm": 0.08505193889141083, "learning_rate": 2.129033511548566e-05, "loss": 0.0045, "step": 14200 }, { "epoch": 28.138613861386137, "grad_norm": 0.11792397499084473, "learning_rate": 2.1222687970697315e-05, "loss": 0.0042, "step": 14210 }, { "epoch": 28.15841584158416, "grad_norm": 0.14000387489795685, "learning_rate": 2.1155119501981173e-05, "loss": 0.0051, "step": 14220 }, { "epoch": 28.178217821782177, "grad_norm": 0.21676015853881836, "learning_rate": 2.1087629894066895e-05, "loss": 0.0054, "step": 14230 }, { "epoch": 28.198019801980198, "grad_norm": 0.12306522578001022, "learning_rate": 2.1020219331468473e-05, "loss": 0.0052, "step": 14240 }, { "epoch": 28.217821782178216, "grad_norm": 0.1080046072602272, "learning_rate": 2.095288799848379e-05, "loss": 0.0047, "step": 14250 }, { "epoch": 28.237623762376238, "grad_norm": 0.09628001600503922, "learning_rate": 2.088563607919417e-05, "loss": 0.0059, "step": 14260 }, { "epoch": 28.257425742574256, "grad_norm": 0.11858979612588882, "learning_rate": 2.0818463757463786e-05, "loss": 0.0051, "step": 14270 }, { "epoch": 28.277227722772277, "grad_norm": 0.13070692121982574, "learning_rate": 2.0751371216939175e-05, "loss": 0.0046, "step": 14280 }, { "epoch": 28.297029702970296, "grad_norm": 0.10648556798696518, "learning_rate": 2.068435864104882e-05, "loss": 0.0047, "step": 14290 }, { "epoch": 28.316831683168317, "grad_norm": 0.1842966377735138, "learning_rate": 2.0617426213002506e-05, "loss": 0.005, "step": 14300 }, { "epoch": 28.336633663366335, "grad_norm": 0.10069520026445389, "learning_rate": 2.055057411579097e-05, "loss": 0.0038, "step": 14310 }, { "epoch": 28.356435643564357, "grad_norm": 0.09501543641090393, "learning_rate": 2.0483802532185286e-05, "loss": 0.0043, "step": 14320 }, { "epoch": 28.376237623762375, "grad_norm": 0.31498295068740845, "learning_rate": 2.041711164473638e-05, "loss": 0.0044, "step": 14330 }, { "epoch": 28.396039603960396, "grad_norm": 0.1311095505952835, "learning_rate": 2.0350501635774637e-05, "loss": 0.0071, "step": 14340 }, { "epoch": 28.415841584158414, "grad_norm": 0.10612057149410248, "learning_rate": 2.0283972687409247e-05, "loss": 0.0046, "step": 14350 }, { "epoch": 28.435643564356436, "grad_norm": 0.13140656054019928, "learning_rate": 2.021752498152784e-05, "loss": 0.0042, "step": 14360 }, { "epoch": 28.455445544554454, "grad_norm": 0.10386335849761963, "learning_rate": 2.015115869979589e-05, "loss": 0.005, "step": 14370 }, { "epoch": 28.475247524752476, "grad_norm": 0.18082940578460693, "learning_rate": 2.0084874023656265e-05, "loss": 0.0045, "step": 14380 }, { "epoch": 28.495049504950494, "grad_norm": 0.1033976823091507, "learning_rate": 2.001867113432877e-05, "loss": 0.0048, "step": 14390 }, { "epoch": 28.514851485148515, "grad_norm": 0.16992606222629547, "learning_rate": 1.995255021280954e-05, "loss": 0.0042, "step": 14400 }, { "epoch": 28.534653465346533, "grad_norm": 0.13509373366832733, "learning_rate": 1.9886511439870688e-05, "loss": 0.0052, "step": 14410 }, { "epoch": 28.554455445544555, "grad_norm": 0.17203371226787567, "learning_rate": 1.9820554996059675e-05, "loss": 0.0051, "step": 14420 }, { "epoch": 28.574257425742573, "grad_norm": 0.11591371148824692, "learning_rate": 1.9754681061698893e-05, "loss": 0.0043, "step": 14430 }, { "epoch": 28.594059405940595, "grad_norm": 0.10557746887207031, "learning_rate": 1.9688889816885185e-05, "loss": 0.0039, "step": 14440 }, { "epoch": 28.613861386138613, "grad_norm": 0.13372908532619476, "learning_rate": 1.962318144148928e-05, "loss": 0.0046, "step": 14450 }, { "epoch": 28.633663366336634, "grad_norm": 0.1431443691253662, "learning_rate": 1.955755611515539e-05, "loss": 0.0055, "step": 14460 }, { "epoch": 28.653465346534652, "grad_norm": 0.10330212861299515, "learning_rate": 1.9492014017300642e-05, "loss": 0.0046, "step": 14470 }, { "epoch": 28.673267326732674, "grad_norm": 0.08603169769048691, "learning_rate": 1.942655532711461e-05, "loss": 0.0043, "step": 14480 }, { "epoch": 28.693069306930692, "grad_norm": 0.09774741530418396, "learning_rate": 1.9361180223558882e-05, "loss": 0.0054, "step": 14490 }, { "epoch": 28.712871287128714, "grad_norm": 0.11478214710950851, "learning_rate": 1.929588888536647e-05, "loss": 0.0037, "step": 14500 }, { "epoch": 28.73267326732673, "grad_norm": 0.13233812153339386, "learning_rate": 1.9230681491041425e-05, "loss": 0.0039, "step": 14510 }, { "epoch": 28.752475247524753, "grad_norm": 0.15097181499004364, "learning_rate": 1.9165558218858264e-05, "loss": 0.0052, "step": 14520 }, { "epoch": 28.77227722772277, "grad_norm": 0.11397737264633179, "learning_rate": 1.9100519246861505e-05, "loss": 0.0049, "step": 14530 }, { "epoch": 28.792079207920793, "grad_norm": 0.07853151112794876, "learning_rate": 1.9035564752865248e-05, "loss": 0.0038, "step": 14540 }, { "epoch": 28.81188118811881, "grad_norm": 0.11870231479406357, "learning_rate": 1.897069491445258e-05, "loss": 0.0045, "step": 14550 }, { "epoch": 28.831683168316832, "grad_norm": 0.08116481453180313, "learning_rate": 1.890590990897515e-05, "loss": 0.0045, "step": 14560 }, { "epoch": 28.85148514851485, "grad_norm": 0.11680148541927338, "learning_rate": 1.884120991355272e-05, "loss": 0.0055, "step": 14570 }, { "epoch": 28.871287128712872, "grad_norm": 0.10264234989881516, "learning_rate": 1.8776595105072576e-05, "loss": 0.0056, "step": 14580 }, { "epoch": 28.89108910891089, "grad_norm": 0.14260366559028625, "learning_rate": 1.8712065660189166e-05, "loss": 0.0042, "step": 14590 }, { "epoch": 28.91089108910891, "grad_norm": 0.10576102882623672, "learning_rate": 1.8647621755323513e-05, "loss": 0.0042, "step": 14600 }, { "epoch": 28.93069306930693, "grad_norm": 0.08846601098775864, "learning_rate": 1.858326356666278e-05, "loss": 0.0045, "step": 14610 }, { "epoch": 28.95049504950495, "grad_norm": 0.11092519015073776, "learning_rate": 1.851899127015983e-05, "loss": 0.0046, "step": 14620 }, { "epoch": 28.97029702970297, "grad_norm": 0.08788510411977768, "learning_rate": 1.8454805041532626e-05, "loss": 0.0047, "step": 14630 }, { "epoch": 28.99009900990099, "grad_norm": 0.10191840678453445, "learning_rate": 1.8390705056263906e-05, "loss": 0.0055, "step": 14640 }, { "epoch": 29.00990099009901, "grad_norm": 0.14260585606098175, "learning_rate": 1.832669148960057e-05, "loss": 0.0043, "step": 14650 }, { "epoch": 29.02970297029703, "grad_norm": 0.2154119461774826, "learning_rate": 1.8262764516553233e-05, "loss": 0.0053, "step": 14660 }, { "epoch": 29.04950495049505, "grad_norm": 0.3099201023578644, "learning_rate": 1.8198924311895843e-05, "loss": 0.0053, "step": 14670 }, { "epoch": 29.06930693069307, "grad_norm": 0.12906424701213837, "learning_rate": 1.813517105016505e-05, "loss": 0.0045, "step": 14680 }, { "epoch": 29.08910891089109, "grad_norm": 0.15516257286071777, "learning_rate": 1.8071504905659888e-05, "loss": 0.0046, "step": 14690 }, { "epoch": 29.10891089108911, "grad_norm": 0.09486714750528336, "learning_rate": 1.800792605244109e-05, "loss": 0.0045, "step": 14700 }, { "epoch": 29.128712871287128, "grad_norm": 0.11289254575967789, "learning_rate": 1.7944434664330844e-05, "loss": 0.0039, "step": 14710 }, { "epoch": 29.14851485148515, "grad_norm": 0.12177253514528275, "learning_rate": 1.7881030914912212e-05, "loss": 0.0049, "step": 14720 }, { "epoch": 29.168316831683168, "grad_norm": 0.0789734274148941, "learning_rate": 1.7817714977528577e-05, "loss": 0.0051, "step": 14730 }, { "epoch": 29.18811881188119, "grad_norm": 0.0927029624581337, "learning_rate": 1.7754487025283332e-05, "loss": 0.0039, "step": 14740 }, { "epoch": 29.207920792079207, "grad_norm": 0.13308937847614288, "learning_rate": 1.7691347231039275e-05, "loss": 0.004, "step": 14750 }, { "epoch": 29.22772277227723, "grad_norm": 0.09369685500860214, "learning_rate": 1.7628295767418164e-05, "loss": 0.0051, "step": 14760 }, { "epoch": 29.247524752475247, "grad_norm": 0.13907290995121002, "learning_rate": 1.7565332806800333e-05, "loss": 0.0045, "step": 14770 }, { "epoch": 29.26732673267327, "grad_norm": 0.13114067912101746, "learning_rate": 1.750245852132408e-05, "loss": 0.004, "step": 14780 }, { "epoch": 29.287128712871286, "grad_norm": 0.16108936071395874, "learning_rate": 1.7439673082885323e-05, "loss": 0.0053, "step": 14790 }, { "epoch": 29.306930693069308, "grad_norm": 0.11749403178691864, "learning_rate": 1.7376976663137047e-05, "loss": 0.0035, "step": 14800 }, { "epoch": 29.326732673267326, "grad_norm": 0.16137364506721497, "learning_rate": 1.7314369433488853e-05, "loss": 0.0045, "step": 14810 }, { "epoch": 29.346534653465348, "grad_norm": 0.14846117794513702, "learning_rate": 1.7251851565106548e-05, "loss": 0.0057, "step": 14820 }, { "epoch": 29.366336633663366, "grad_norm": 0.11941759288311005, "learning_rate": 1.7189423228911574e-05, "loss": 0.0045, "step": 14830 }, { "epoch": 29.386138613861387, "grad_norm": 0.0966564267873764, "learning_rate": 1.7127084595580606e-05, "loss": 0.0038, "step": 14840 }, { "epoch": 29.405940594059405, "grad_norm": 0.14711296558380127, "learning_rate": 1.706483583554513e-05, "loss": 0.0043, "step": 14850 }, { "epoch": 29.425742574257427, "grad_norm": 0.1500229686498642, "learning_rate": 1.700267711899083e-05, "loss": 0.0045, "step": 14860 }, { "epoch": 29.445544554455445, "grad_norm": 0.11229144781827927, "learning_rate": 1.69406086158573e-05, "loss": 0.004, "step": 14870 }, { "epoch": 29.465346534653467, "grad_norm": 0.14421184360980988, "learning_rate": 1.6878630495837455e-05, "loss": 0.0041, "step": 14880 }, { "epoch": 29.485148514851485, "grad_norm": 0.1793167144060135, "learning_rate": 1.681674292837707e-05, "loss": 0.0044, "step": 14890 }, { "epoch": 29.504950495049506, "grad_norm": 0.12927864491939545, "learning_rate": 1.6754946082674444e-05, "loss": 0.0049, "step": 14900 }, { "epoch": 29.524752475247524, "grad_norm": 0.13179895281791687, "learning_rate": 1.6693240127679748e-05, "loss": 0.0044, "step": 14910 }, { "epoch": 29.544554455445546, "grad_norm": 0.13211245834827423, "learning_rate": 1.663162523209475e-05, "loss": 0.0048, "step": 14920 }, { "epoch": 29.564356435643564, "grad_norm": 0.13843494653701782, "learning_rate": 1.6570101564372193e-05, "loss": 0.005, "step": 14930 }, { "epoch": 29.584158415841586, "grad_norm": 0.11851595342159271, "learning_rate": 1.650866929271543e-05, "loss": 0.0044, "step": 14940 }, { "epoch": 29.603960396039604, "grad_norm": 0.13270384073257446, "learning_rate": 1.644732858507797e-05, "loss": 0.0058, "step": 14950 }, { "epoch": 29.623762376237625, "grad_norm": 0.1090867891907692, "learning_rate": 1.6386079609162943e-05, "loss": 0.0058, "step": 14960 }, { "epoch": 29.643564356435643, "grad_norm": 0.1334017664194107, "learning_rate": 1.6324922532422742e-05, "loss": 0.0042, "step": 14970 }, { "epoch": 29.663366336633665, "grad_norm": 0.08444960415363312, "learning_rate": 1.6263857522058434e-05, "loss": 0.0042, "step": 14980 }, { "epoch": 29.683168316831683, "grad_norm": 0.09659381210803986, "learning_rate": 1.6202884745019443e-05, "loss": 0.0048, "step": 14990 }, { "epoch": 29.702970297029704, "grad_norm": 0.1296713799238205, "learning_rate": 1.614200436800304e-05, "loss": 0.0044, "step": 15000 }, { "epoch": 29.722772277227723, "grad_norm": 0.0997026115655899, "learning_rate": 1.6081216557453814e-05, "loss": 0.0042, "step": 15010 }, { "epoch": 29.742574257425744, "grad_norm": 0.14791589975357056, "learning_rate": 1.6020521479563367e-05, "loss": 0.0041, "step": 15020 }, { "epoch": 29.762376237623762, "grad_norm": 0.14091883599758148, "learning_rate": 1.5959919300269654e-05, "loss": 0.0066, "step": 15030 }, { "epoch": 29.782178217821784, "grad_norm": 0.12294190376996994, "learning_rate": 1.5899410185256764e-05, "loss": 0.0044, "step": 15040 }, { "epoch": 29.801980198019802, "grad_norm": 0.15378659963607788, "learning_rate": 1.583899429995431e-05, "loss": 0.0044, "step": 15050 }, { "epoch": 29.821782178217823, "grad_norm": 0.11030273884534836, "learning_rate": 1.5778671809536993e-05, "loss": 0.0041, "step": 15060 }, { "epoch": 29.84158415841584, "grad_norm": 0.11617165803909302, "learning_rate": 1.5718442878924246e-05, "loss": 0.0047, "step": 15070 }, { "epoch": 29.861386138613863, "grad_norm": 0.12027459591627121, "learning_rate": 1.5658307672779593e-05, "loss": 0.0051, "step": 15080 }, { "epoch": 29.88118811881188, "grad_norm": 0.10249722003936768, "learning_rate": 1.5598266355510427e-05, "loss": 0.0049, "step": 15090 }, { "epoch": 29.900990099009903, "grad_norm": 0.11343812942504883, "learning_rate": 1.553831909126744e-05, "loss": 0.0065, "step": 15100 }, { "epoch": 29.92079207920792, "grad_norm": 0.08953933417797089, "learning_rate": 1.5478466043944135e-05, "loss": 0.0056, "step": 15110 }, { "epoch": 29.94059405940594, "grad_norm": 0.12805773317813873, "learning_rate": 1.5418707377176468e-05, "loss": 0.0057, "step": 15120 }, { "epoch": 29.96039603960396, "grad_norm": 0.10706136375665665, "learning_rate": 1.535904325434233e-05, "loss": 0.0045, "step": 15130 }, { "epoch": 29.980198019801982, "grad_norm": 0.11298420280218124, "learning_rate": 1.529947383856118e-05, "loss": 0.0056, "step": 15140 }, { "epoch": 30.0, "grad_norm": 0.1668487787246704, "learning_rate": 1.5239999292693524e-05, "loss": 0.0034, "step": 15150 }, { "epoch": 30.019801980198018, "grad_norm": 0.11464525759220123, "learning_rate": 1.5180619779340505e-05, "loss": 0.0045, "step": 15160 }, { "epoch": 30.03960396039604, "grad_norm": 0.1614699810743332, "learning_rate": 1.5121335460843428e-05, "loss": 0.0045, "step": 15170 }, { "epoch": 30.059405940594058, "grad_norm": 0.12651227414608002, "learning_rate": 1.5062146499283347e-05, "loss": 0.0041, "step": 15180 }, { "epoch": 30.07920792079208, "grad_norm": 0.12641343474388123, "learning_rate": 1.5003053056480643e-05, "loss": 0.0043, "step": 15190 }, { "epoch": 30.099009900990097, "grad_norm": 0.1315588355064392, "learning_rate": 1.4944055293994551e-05, "loss": 0.0042, "step": 15200 }, { "epoch": 30.11881188118812, "grad_norm": 0.10051623731851578, "learning_rate": 1.4885153373122656e-05, "loss": 0.0042, "step": 15210 }, { "epoch": 30.138613861386137, "grad_norm": 0.085488460958004, "learning_rate": 1.482634745490059e-05, "loss": 0.0056, "step": 15220 }, { "epoch": 30.15841584158416, "grad_norm": 0.07477537542581558, "learning_rate": 1.4767637700101466e-05, "loss": 0.0044, "step": 15230 }, { "epoch": 30.178217821782177, "grad_norm": 0.15616054832935333, "learning_rate": 1.4709024269235528e-05, "loss": 0.0043, "step": 15240 }, { "epoch": 30.198019801980198, "grad_norm": 0.09089242666959763, "learning_rate": 1.4650507322549684e-05, "loss": 0.0039, "step": 15250 }, { "epoch": 30.217821782178216, "grad_norm": 0.1565282940864563, "learning_rate": 1.4592087020026972e-05, "loss": 0.0058, "step": 15260 }, { "epoch": 30.237623762376238, "grad_norm": 0.11577914655208588, "learning_rate": 1.4533763521386318e-05, "loss": 0.0043, "step": 15270 }, { "epoch": 30.257425742574256, "grad_norm": 0.11809254437685013, "learning_rate": 1.44755369860819e-05, "loss": 0.0048, "step": 15280 }, { "epoch": 30.277227722772277, "grad_norm": 0.1617242991924286, "learning_rate": 1.441740757330287e-05, "loss": 0.0048, "step": 15290 }, { "epoch": 30.297029702970296, "grad_norm": 0.12032796442508698, "learning_rate": 1.4359375441972844e-05, "loss": 0.0049, "step": 15300 }, { "epoch": 30.316831683168317, "grad_norm": 0.1285766065120697, "learning_rate": 1.4301440750749395e-05, "loss": 0.0046, "step": 15310 }, { "epoch": 30.336633663366335, "grad_norm": 0.12151230126619339, "learning_rate": 1.4243603658023808e-05, "loss": 0.0056, "step": 15320 }, { "epoch": 30.356435643564357, "grad_norm": 0.14393889904022217, "learning_rate": 1.4185864321920444e-05, "loss": 0.0069, "step": 15330 }, { "epoch": 30.376237623762375, "grad_norm": 0.10421860963106155, "learning_rate": 1.4128222900296485e-05, "loss": 0.0039, "step": 15340 }, { "epoch": 30.396039603960396, "grad_norm": 0.12880125641822815, "learning_rate": 1.407067955074135e-05, "loss": 0.0033, "step": 15350 }, { "epoch": 30.415841584158414, "grad_norm": 0.10973641276359558, "learning_rate": 1.4013234430576356e-05, "loss": 0.0046, "step": 15360 }, { "epoch": 30.435643564356436, "grad_norm": 0.22624921798706055, "learning_rate": 1.3955887696854286e-05, "loss": 0.0039, "step": 15370 }, { "epoch": 30.455445544554454, "grad_norm": 0.10363422334194183, "learning_rate": 1.38986395063589e-05, "loss": 0.0045, "step": 15380 }, { "epoch": 30.475247524752476, "grad_norm": 0.10899445414543152, "learning_rate": 1.3841490015604597e-05, "loss": 0.0061, "step": 15390 }, { "epoch": 30.495049504950494, "grad_norm": 0.11774276942014694, "learning_rate": 1.3784439380835879e-05, "loss": 0.0043, "step": 15400 }, { "epoch": 30.514851485148515, "grad_norm": 0.08955593407154083, "learning_rate": 1.3727487758026986e-05, "loss": 0.0042, "step": 15410 }, { "epoch": 30.534653465346533, "grad_norm": 0.08602274954319, "learning_rate": 1.3670635302881525e-05, "loss": 0.0046, "step": 15420 }, { "epoch": 30.554455445544555, "grad_norm": 0.12987931072711945, "learning_rate": 1.3613882170831888e-05, "loss": 0.0054, "step": 15430 }, { "epoch": 30.574257425742573, "grad_norm": 0.10410989075899124, "learning_rate": 1.355722851703901e-05, "loss": 0.004, "step": 15440 }, { "epoch": 30.594059405940595, "grad_norm": 0.1009332537651062, "learning_rate": 1.3500674496391814e-05, "loss": 0.0055, "step": 15450 }, { "epoch": 30.613861386138613, "grad_norm": 0.10756419599056244, "learning_rate": 1.3444220263506795e-05, "loss": 0.0035, "step": 15460 }, { "epoch": 30.633663366336634, "grad_norm": 0.11976262181997299, "learning_rate": 1.3387865972727714e-05, "loss": 0.0055, "step": 15470 }, { "epoch": 30.653465346534652, "grad_norm": 0.11273160576820374, "learning_rate": 1.3331611778125036e-05, "loss": 0.0037, "step": 15480 }, { "epoch": 30.673267326732674, "grad_norm": 0.14224915206432343, "learning_rate": 1.3275457833495564e-05, "loss": 0.0038, "step": 15490 }, { "epoch": 30.693069306930692, "grad_norm": 0.10217823833227158, "learning_rate": 1.3219404292362065e-05, "loss": 0.0049, "step": 15500 }, { "epoch": 30.712871287128714, "grad_norm": 0.11599840968847275, "learning_rate": 1.3163451307972751e-05, "loss": 0.0045, "step": 15510 }, { "epoch": 30.73267326732673, "grad_norm": 0.0999867245554924, "learning_rate": 1.3107599033300977e-05, "loss": 0.0055, "step": 15520 }, { "epoch": 30.752475247524753, "grad_norm": 0.11616326123476028, "learning_rate": 1.305184762104471e-05, "loss": 0.0054, "step": 15530 }, { "epoch": 30.77227722772277, "grad_norm": 0.09148378670215607, "learning_rate": 1.2996197223626178e-05, "loss": 0.0043, "step": 15540 }, { "epoch": 30.792079207920793, "grad_norm": 0.12086798995733261, "learning_rate": 1.2940647993191457e-05, "loss": 0.0054, "step": 15550 }, { "epoch": 30.81188118811881, "grad_norm": 0.13996422290802002, "learning_rate": 1.2885200081610005e-05, "loss": 0.0045, "step": 15560 }, { "epoch": 30.831683168316832, "grad_norm": 0.10163717716932297, "learning_rate": 1.2829853640474316e-05, "loss": 0.0048, "step": 15570 }, { "epoch": 30.85148514851485, "grad_norm": 0.1962335854768753, "learning_rate": 1.2774608821099438e-05, "loss": 0.0055, "step": 15580 }, { "epoch": 30.871287128712872, "grad_norm": 0.15199147164821625, "learning_rate": 1.2719465774522577e-05, "loss": 0.0045, "step": 15590 }, { "epoch": 30.89108910891089, "grad_norm": 0.1543012112379074, "learning_rate": 1.2664424651502755e-05, "loss": 0.0047, "step": 15600 }, { "epoch": 30.91089108910891, "grad_norm": 0.09822174906730652, "learning_rate": 1.260948560252026e-05, "loss": 0.0049, "step": 15610 }, { "epoch": 30.93069306930693, "grad_norm": 0.1670466661453247, "learning_rate": 1.2554648777776396e-05, "loss": 0.0037, "step": 15620 }, { "epoch": 30.95049504950495, "grad_norm": 0.10448116809129715, "learning_rate": 1.2499914327192919e-05, "loss": 0.0038, "step": 15630 }, { "epoch": 30.97029702970297, "grad_norm": 0.08853847533464432, "learning_rate": 1.2445282400411722e-05, "loss": 0.004, "step": 15640 }, { "epoch": 30.99009900990099, "grad_norm": 0.15134872496128082, "learning_rate": 1.2390753146794437e-05, "loss": 0.0042, "step": 15650 }, { "epoch": 31.00990099009901, "grad_norm": 0.14662720263004303, "learning_rate": 1.2336326715421925e-05, "loss": 0.0037, "step": 15660 }, { "epoch": 31.02970297029703, "grad_norm": 0.12129093706607819, "learning_rate": 1.2282003255094005e-05, "loss": 0.0038, "step": 15670 }, { "epoch": 31.04950495049505, "grad_norm": 0.13815173506736755, "learning_rate": 1.2227782914328928e-05, "loss": 0.0041, "step": 15680 }, { "epoch": 31.06930693069307, "grad_norm": 0.15066812932491302, "learning_rate": 1.2173665841363018e-05, "loss": 0.0043, "step": 15690 }, { "epoch": 31.08910891089109, "grad_norm": 0.15727625787258148, "learning_rate": 1.211965218415032e-05, "loss": 0.0038, "step": 15700 }, { "epoch": 31.10891089108911, "grad_norm": 0.1247967928647995, "learning_rate": 1.2065742090362082e-05, "loss": 0.0042, "step": 15710 }, { "epoch": 31.128712871287128, "grad_norm": 0.1185615211725235, "learning_rate": 1.2011935707386457e-05, "loss": 0.0039, "step": 15720 }, { "epoch": 31.14851485148515, "grad_norm": 0.11477718502283096, "learning_rate": 1.1958233182328044e-05, "loss": 0.0037, "step": 15730 }, { "epoch": 31.168316831683168, "grad_norm": 0.08394448459148407, "learning_rate": 1.1904634662007474e-05, "loss": 0.0049, "step": 15740 }, { "epoch": 31.18811881188119, "grad_norm": 0.16872242093086243, "learning_rate": 1.1851140292961088e-05, "loss": 0.0034, "step": 15750 }, { "epoch": 31.207920792079207, "grad_norm": 0.09852652251720428, "learning_rate": 1.1797750221440424e-05, "loss": 0.0055, "step": 15760 }, { "epoch": 31.22772277227723, "grad_norm": 0.09218394011259079, "learning_rate": 1.1744464593411897e-05, "loss": 0.0038, "step": 15770 }, { "epoch": 31.247524752475247, "grad_norm": 0.09641974419355392, "learning_rate": 1.1691283554556399e-05, "loss": 0.0048, "step": 15780 }, { "epoch": 31.26732673267327, "grad_norm": 0.10548311471939087, "learning_rate": 1.1638207250268834e-05, "loss": 0.0038, "step": 15790 }, { "epoch": 31.287128712871286, "grad_norm": 0.08839485794305801, "learning_rate": 1.158523582565782e-05, "loss": 0.0047, "step": 15800 }, { "epoch": 31.306930693069308, "grad_norm": 0.12031460553407669, "learning_rate": 1.1532369425545192e-05, "loss": 0.004, "step": 15810 }, { "epoch": 31.326732673267326, "grad_norm": 0.09571447968482971, "learning_rate": 1.1479608194465662e-05, "loss": 0.0045, "step": 15820 }, { "epoch": 31.346534653465348, "grad_norm": 0.10157231986522675, "learning_rate": 1.1426952276666442e-05, "loss": 0.0051, "step": 15830 }, { "epoch": 31.366336633663366, "grad_norm": 0.10015366971492767, "learning_rate": 1.1374401816106778e-05, "loss": 0.0042, "step": 15840 }, { "epoch": 31.386138613861387, "grad_norm": 0.07624445110559464, "learning_rate": 1.1321956956457646e-05, "loss": 0.0041, "step": 15850 }, { "epoch": 31.405940594059405, "grad_norm": 0.15804558992385864, "learning_rate": 1.1269617841101277e-05, "loss": 0.0047, "step": 15860 }, { "epoch": 31.425742574257427, "grad_norm": 0.13456542789936066, "learning_rate": 1.1217384613130804e-05, "loss": 0.0046, "step": 15870 }, { "epoch": 31.445544554455445, "grad_norm": 0.2612585425376892, "learning_rate": 1.11652574153499e-05, "loss": 0.0039, "step": 15880 }, { "epoch": 31.465346534653467, "grad_norm": 0.12965081632137299, "learning_rate": 1.1113236390272303e-05, "loss": 0.0058, "step": 15890 }, { "epoch": 31.485148514851485, "grad_norm": 0.10995545983314514, "learning_rate": 1.106132168012155e-05, "loss": 0.004, "step": 15900 }, { "epoch": 31.504950495049506, "grad_norm": 0.14803509414196014, "learning_rate": 1.1009513426830448e-05, "loss": 0.0049, "step": 15910 }, { "epoch": 31.524752475247524, "grad_norm": 0.09720242768526077, "learning_rate": 1.0957811772040777e-05, "loss": 0.005, "step": 15920 }, { "epoch": 31.544554455445546, "grad_norm": 0.08300759643316269, "learning_rate": 1.0906216857102913e-05, "loss": 0.004, "step": 15930 }, { "epoch": 31.564356435643564, "grad_norm": 0.09871240705251694, "learning_rate": 1.0854728823075355e-05, "loss": 0.004, "step": 15940 }, { "epoch": 31.584158415841586, "grad_norm": 0.10582385957241058, "learning_rate": 1.0803347810724452e-05, "loss": 0.004, "step": 15950 }, { "epoch": 31.603960396039604, "grad_norm": 0.12664195895195007, "learning_rate": 1.0752073960523911e-05, "loss": 0.005, "step": 15960 }, { "epoch": 31.623762376237625, "grad_norm": 0.10025006532669067, "learning_rate": 1.070090741265447e-05, "loss": 0.0033, "step": 15970 }, { "epoch": 31.643564356435643, "grad_norm": 0.06954266130924225, "learning_rate": 1.0649848307003547e-05, "loss": 0.0046, "step": 15980 }, { "epoch": 31.663366336633665, "grad_norm": 0.07739519327878952, "learning_rate": 1.0598896783164757e-05, "loss": 0.0036, "step": 15990 }, { "epoch": 31.683168316831683, "grad_norm": 0.11847846955060959, "learning_rate": 1.0548052980437645e-05, "loss": 0.0045, "step": 16000 }, { "epoch": 31.702970297029704, "grad_norm": 0.1065559983253479, "learning_rate": 1.049731703782722e-05, "loss": 0.0043, "step": 16010 }, { "epoch": 31.722772277227723, "grad_norm": 0.07475829869508743, "learning_rate": 1.0446689094043587e-05, "loss": 0.0041, "step": 16020 }, { "epoch": 31.742574257425744, "grad_norm": 0.10684454441070557, "learning_rate": 1.039616928750165e-05, "loss": 0.0044, "step": 16030 }, { "epoch": 31.762376237623762, "grad_norm": 0.23737673461437225, "learning_rate": 1.0345757756320612e-05, "loss": 0.004, "step": 16040 }, { "epoch": 31.782178217821784, "grad_norm": 0.11955942213535309, "learning_rate": 1.0295454638323666e-05, "loss": 0.0038, "step": 16050 }, { "epoch": 31.801980198019802, "grad_norm": 0.09081730991601944, "learning_rate": 1.0245260071037632e-05, "loss": 0.0037, "step": 16060 }, { "epoch": 31.821782178217823, "grad_norm": 0.08748668432235718, "learning_rate": 1.0195174191692518e-05, "loss": 0.0039, "step": 16070 }, { "epoch": 31.84158415841584, "grad_norm": 0.0759679451584816, "learning_rate": 1.014519713722124e-05, "loss": 0.004, "step": 16080 }, { "epoch": 31.861386138613863, "grad_norm": 0.08194688707590103, "learning_rate": 1.0095329044259132e-05, "loss": 0.0042, "step": 16090 }, { "epoch": 31.88118811881188, "grad_norm": 0.09125176072120667, "learning_rate": 1.004557004914365e-05, "loss": 0.0034, "step": 16100 }, { "epoch": 31.900990099009903, "grad_norm": 0.10992760211229324, "learning_rate": 9.995920287914007e-06, "loss": 0.0048, "step": 16110 }, { "epoch": 31.92079207920792, "grad_norm": 0.10198777168989182, "learning_rate": 9.946379896310737e-06, "loss": 0.0052, "step": 16120 }, { "epoch": 31.94059405940594, "grad_norm": 0.10493754595518112, "learning_rate": 9.896949009775396e-06, "loss": 0.0031, "step": 16130 }, { "epoch": 31.96039603960396, "grad_norm": 0.14871063828468323, "learning_rate": 9.847627763450134e-06, "loss": 0.0035, "step": 16140 }, { "epoch": 31.980198019801982, "grad_norm": 0.22123983502388, "learning_rate": 9.798416292177337e-06, "loss": 0.005, "step": 16150 }, { "epoch": 32.0, "grad_norm": 0.15723849833011627, "learning_rate": 9.74931473049932e-06, "loss": 0.0039, "step": 16160 }, { "epoch": 32.01980198019802, "grad_norm": 0.1023796945810318, "learning_rate": 9.700323212657847e-06, "loss": 0.0039, "step": 16170 }, { "epoch": 32.039603960396036, "grad_norm": 0.08733395487070084, "learning_rate": 9.65144187259388e-06, "loss": 0.0039, "step": 16180 }, { "epoch": 32.05940594059406, "grad_norm": 0.09351811558008194, "learning_rate": 9.602670843947132e-06, "loss": 0.0041, "step": 16190 }, { "epoch": 32.07920792079208, "grad_norm": 0.10343118011951447, "learning_rate": 9.554010260055713e-06, "loss": 0.0039, "step": 16200 }, { "epoch": 32.0990099009901, "grad_norm": 0.10616207867860794, "learning_rate": 9.505460253955834e-06, "loss": 0.0037, "step": 16210 }, { "epoch": 32.118811881188115, "grad_norm": 0.0985577255487442, "learning_rate": 9.457020958381324e-06, "loss": 0.004, "step": 16220 }, { "epoch": 32.13861386138614, "grad_norm": 0.09537248313426971, "learning_rate": 9.408692505763395e-06, "loss": 0.0052, "step": 16230 }, { "epoch": 32.15841584158416, "grad_norm": 0.11737038195133209, "learning_rate": 9.360475028230181e-06, "loss": 0.0038, "step": 16240 }, { "epoch": 32.17821782178218, "grad_norm": 0.15223947167396545, "learning_rate": 9.312368657606412e-06, "loss": 0.0048, "step": 16250 }, { "epoch": 32.198019801980195, "grad_norm": 0.08002904057502747, "learning_rate": 9.264373525413096e-06, "loss": 0.0036, "step": 16260 }, { "epoch": 32.21782178217822, "grad_norm": 0.09509250521659851, "learning_rate": 9.216489762867058e-06, "loss": 0.003, "step": 16270 }, { "epoch": 32.23762376237624, "grad_norm": 0.07337366044521332, "learning_rate": 9.168717500880708e-06, "loss": 0.0043, "step": 16280 }, { "epoch": 32.257425742574256, "grad_norm": 0.09502510726451874, "learning_rate": 9.121056870061574e-06, "loss": 0.0041, "step": 16290 }, { "epoch": 32.277227722772274, "grad_norm": 0.12347479164600372, "learning_rate": 9.073508000711983e-06, "loss": 0.0045, "step": 16300 }, { "epoch": 32.2970297029703, "grad_norm": 0.09901005774736404, "learning_rate": 9.026071022828758e-06, "loss": 0.0041, "step": 16310 }, { "epoch": 32.31683168316832, "grad_norm": 0.10298341512680054, "learning_rate": 8.978746066102771e-06, "loss": 0.0059, "step": 16320 }, { "epoch": 32.336633663366335, "grad_norm": 0.1565735638141632, "learning_rate": 8.931533259918634e-06, "loss": 0.0043, "step": 16330 }, { "epoch": 32.35643564356435, "grad_norm": 0.1433139145374298, "learning_rate": 8.884432733354382e-06, "loss": 0.0034, "step": 16340 }, { "epoch": 32.37623762376238, "grad_norm": 0.1165536567568779, "learning_rate": 8.837444615181029e-06, "loss": 0.004, "step": 16350 }, { "epoch": 32.396039603960396, "grad_norm": 0.07101791352033615, "learning_rate": 8.790569033862323e-06, "loss": 0.0048, "step": 16360 }, { "epoch": 32.415841584158414, "grad_norm": 0.1307142823934555, "learning_rate": 8.7438061175543e-06, "loss": 0.0041, "step": 16370 }, { "epoch": 32.43564356435643, "grad_norm": 0.09397759288549423, "learning_rate": 8.697155994104978e-06, "loss": 0.0034, "step": 16380 }, { "epoch": 32.45544554455446, "grad_norm": 0.10446245223283768, "learning_rate": 8.650618791054033e-06, "loss": 0.0042, "step": 16390 }, { "epoch": 32.475247524752476, "grad_norm": 0.14336974918842316, "learning_rate": 8.604194635632373e-06, "loss": 0.0035, "step": 16400 }, { "epoch": 32.495049504950494, "grad_norm": 0.09371241182088852, "learning_rate": 8.557883654761906e-06, "loss": 0.0036, "step": 16410 }, { "epoch": 32.51485148514851, "grad_norm": 0.10043253004550934, "learning_rate": 8.511685975055061e-06, "loss": 0.0056, "step": 16420 }, { "epoch": 32.53465346534654, "grad_norm": 0.09371709078550339, "learning_rate": 8.46560172281452e-06, "loss": 0.0037, "step": 16430 }, { "epoch": 32.554455445544555, "grad_norm": 0.11165273189544678, "learning_rate": 8.419631024032893e-06, "loss": 0.0033, "step": 16440 }, { "epoch": 32.57425742574257, "grad_norm": 0.096503846347332, "learning_rate": 8.373774004392293e-06, "loss": 0.0037, "step": 16450 }, { "epoch": 32.59405940594059, "grad_norm": 0.1161908209323883, "learning_rate": 8.32803078926409e-06, "loss": 0.0044, "step": 16460 }, { "epoch": 32.613861386138616, "grad_norm": 0.11583921313285828, "learning_rate": 8.282401503708454e-06, "loss": 0.0038, "step": 16470 }, { "epoch": 32.633663366336634, "grad_norm": 0.08680763095617294, "learning_rate": 8.23688627247412e-06, "loss": 0.004, "step": 16480 }, { "epoch": 32.65346534653465, "grad_norm": 0.38933953642845154, "learning_rate": 8.191485219998007e-06, "loss": 0.0032, "step": 16490 }, { "epoch": 32.67326732673267, "grad_norm": 0.1026013121008873, "learning_rate": 8.146198470404843e-06, "loss": 0.0034, "step": 16500 }, { "epoch": 32.693069306930695, "grad_norm": 0.08894722163677216, "learning_rate": 8.101026147506897e-06, "loss": 0.0048, "step": 16510 }, { "epoch": 32.71287128712871, "grad_norm": 0.12128002196550369, "learning_rate": 8.05596837480353e-06, "loss": 0.0048, "step": 16520 }, { "epoch": 32.73267326732673, "grad_norm": 0.06692855060100555, "learning_rate": 8.011025275480998e-06, "loss": 0.003, "step": 16530 }, { "epoch": 32.75247524752475, "grad_norm": 0.12302170693874359, "learning_rate": 7.966196972412027e-06, "loss": 0.0035, "step": 16540 }, { "epoch": 32.772277227722775, "grad_norm": 0.14103776216506958, "learning_rate": 7.92148358815547e-06, "loss": 0.0037, "step": 16550 }, { "epoch": 32.79207920792079, "grad_norm": 0.08288449794054031, "learning_rate": 7.87688524495604e-06, "loss": 0.004, "step": 16560 }, { "epoch": 32.81188118811881, "grad_norm": 0.0740550309419632, "learning_rate": 7.83240206474386e-06, "loss": 0.0033, "step": 16570 }, { "epoch": 32.83168316831683, "grad_norm": 0.18911093473434448, "learning_rate": 7.788034169134272e-06, "loss": 0.0039, "step": 16580 }, { "epoch": 32.851485148514854, "grad_norm": 0.06954523175954819, "learning_rate": 7.743781679427414e-06, "loss": 0.0034, "step": 16590 }, { "epoch": 32.87128712871287, "grad_norm": 0.12382980436086655, "learning_rate": 7.699644716607895e-06, "loss": 0.0036, "step": 16600 }, { "epoch": 32.89108910891089, "grad_norm": 0.12857143580913544, "learning_rate": 7.655623401344486e-06, "loss": 0.0044, "step": 16610 }, { "epoch": 32.91089108910891, "grad_norm": 0.11216636747121811, "learning_rate": 7.611717853989775e-06, "loss": 0.004, "step": 16620 }, { "epoch": 32.93069306930693, "grad_norm": 0.12324737012386322, "learning_rate": 7.567928194579854e-06, "loss": 0.0051, "step": 16630 }, { "epoch": 32.95049504950495, "grad_norm": 0.29276108741760254, "learning_rate": 7.524254542833997e-06, "loss": 0.0035, "step": 16640 }, { "epoch": 32.97029702970297, "grad_norm": 0.09512652456760406, "learning_rate": 7.480697018154286e-06, "loss": 0.0035, "step": 16650 }, { "epoch": 32.99009900990099, "grad_norm": 0.09349993616342545, "learning_rate": 7.437255739625332e-06, "loss": 0.0043, "step": 16660 }, { "epoch": 33.00990099009901, "grad_norm": 0.07107502967119217, "learning_rate": 7.393930826013923e-06, "loss": 0.0049, "step": 16670 }, { "epoch": 33.02970297029703, "grad_norm": 0.3250347077846527, "learning_rate": 7.350722395768722e-06, "loss": 0.0031, "step": 16680 }, { "epoch": 33.04950495049505, "grad_norm": 0.09976992756128311, "learning_rate": 7.307630567019963e-06, "loss": 0.0043, "step": 16690 }, { "epoch": 33.06930693069307, "grad_norm": 0.059842586517333984, "learning_rate": 7.264655457579e-06, "loss": 0.0042, "step": 16700 }, { "epoch": 33.08910891089109, "grad_norm": 0.10338236391544342, "learning_rate": 7.221797184938184e-06, "loss": 0.004, "step": 16710 }, { "epoch": 33.10891089108911, "grad_norm": 0.08777491748332977, "learning_rate": 7.179055866270373e-06, "loss": 0.0055, "step": 16720 }, { "epoch": 33.12871287128713, "grad_norm": 0.12139301747083664, "learning_rate": 7.136431618428707e-06, "loss": 0.0043, "step": 16730 }, { "epoch": 33.148514851485146, "grad_norm": 0.10714062303304672, "learning_rate": 7.09392455794628e-06, "loss": 0.0032, "step": 16740 }, { "epoch": 33.16831683168317, "grad_norm": 0.15763378143310547, "learning_rate": 7.051534801035725e-06, "loss": 0.0035, "step": 16750 }, { "epoch": 33.18811881188119, "grad_norm": 0.08492615073919296, "learning_rate": 7.00926246358905e-06, "loss": 0.0033, "step": 16760 }, { "epoch": 33.20792079207921, "grad_norm": 0.14975133538246155, "learning_rate": 6.967107661177191e-06, "loss": 0.0033, "step": 16770 }, { "epoch": 33.227722772277225, "grad_norm": 0.0991109237074852, "learning_rate": 6.925070509049786e-06, "loss": 0.0046, "step": 16780 }, { "epoch": 33.24752475247525, "grad_norm": 0.07235179096460342, "learning_rate": 6.883151122134812e-06, "loss": 0.0035, "step": 16790 }, { "epoch": 33.26732673267327, "grad_norm": 0.12929275631904602, "learning_rate": 6.8413496150382394e-06, "loss": 0.0039, "step": 16800 }, { "epoch": 33.28712871287129, "grad_norm": 0.10423611104488373, "learning_rate": 6.7996661020438165e-06, "loss": 0.0034, "step": 16810 }, { "epoch": 33.306930693069305, "grad_norm": 0.09395293146371841, "learning_rate": 6.758100697112662e-06, "loss": 0.0039, "step": 16820 }, { "epoch": 33.32673267326733, "grad_norm": 0.10043716430664062, "learning_rate": 6.716653513883026e-06, "loss": 0.0045, "step": 16830 }, { "epoch": 33.34653465346535, "grad_norm": 0.11164390295743942, "learning_rate": 6.675324665669913e-06, "loss": 0.0038, "step": 16840 }, { "epoch": 33.366336633663366, "grad_norm": 0.0667092353105545, "learning_rate": 6.634114265464803e-06, "loss": 0.0052, "step": 16850 }, { "epoch": 33.386138613861384, "grad_norm": 0.09320297092199326, "learning_rate": 6.59302242593538e-06, "loss": 0.0058, "step": 16860 }, { "epoch": 33.40594059405941, "grad_norm": 0.10602817684412003, "learning_rate": 6.552049259425141e-06, "loss": 0.0035, "step": 16870 }, { "epoch": 33.42574257425743, "grad_norm": 0.13047127425670624, "learning_rate": 6.511194877953181e-06, "loss": 0.0035, "step": 16880 }, { "epoch": 33.445544554455445, "grad_norm": 0.207218736410141, "learning_rate": 6.470459393213813e-06, "loss": 0.0032, "step": 16890 }, { "epoch": 33.46534653465346, "grad_norm": 0.10365287959575653, "learning_rate": 6.429842916576279e-06, "loss": 0.0032, "step": 16900 }, { "epoch": 33.48514851485149, "grad_norm": 0.12158408015966415, "learning_rate": 6.389345559084503e-06, "loss": 0.005, "step": 16910 }, { "epoch": 33.504950495049506, "grad_norm": 0.12235778570175171, "learning_rate": 6.348967431456682e-06, "loss": 0.0036, "step": 16920 }, { "epoch": 33.524752475247524, "grad_norm": 0.08666735142469406, "learning_rate": 6.30870864408511e-06, "loss": 0.0041, "step": 16930 }, { "epoch": 33.54455445544554, "grad_norm": 0.10731364041566849, "learning_rate": 6.268569307035754e-06, "loss": 0.0037, "step": 16940 }, { "epoch": 33.56435643564357, "grad_norm": 0.07096957415342331, "learning_rate": 6.228549530048022e-06, "loss": 0.0035, "step": 16950 }, { "epoch": 33.584158415841586, "grad_norm": 0.16632528603076935, "learning_rate": 6.1886494225344814e-06, "loss": 0.0032, "step": 16960 }, { "epoch": 33.603960396039604, "grad_norm": 0.42877593636512756, "learning_rate": 6.148869093580479e-06, "loss": 0.0032, "step": 16970 }, { "epoch": 33.62376237623762, "grad_norm": 0.09848944842815399, "learning_rate": 6.109208651943921e-06, "loss": 0.0032, "step": 16980 }, { "epoch": 33.64356435643565, "grad_norm": 0.10931384563446045, "learning_rate": 6.069668206054946e-06, "loss": 0.0041, "step": 16990 }, { "epoch": 33.663366336633665, "grad_norm": 0.12530742585659027, "learning_rate": 6.0302478640156145e-06, "loss": 0.0028, "step": 17000 }, { "epoch": 33.68316831683168, "grad_norm": 0.12193293869495392, "learning_rate": 5.990947733599644e-06, "loss": 0.0042, "step": 17010 }, { "epoch": 33.7029702970297, "grad_norm": 0.10111350566148758, "learning_rate": 5.951767922252105e-06, "loss": 0.005, "step": 17020 }, { "epoch": 33.722772277227726, "grad_norm": 0.08656706660985947, "learning_rate": 5.912708537089068e-06, "loss": 0.0042, "step": 17030 }, { "epoch": 33.742574257425744, "grad_norm": 0.18087439239025116, "learning_rate": 5.873769684897434e-06, "loss": 0.0042, "step": 17040 }, { "epoch": 33.76237623762376, "grad_norm": 0.10899296402931213, "learning_rate": 5.834951472134514e-06, "loss": 0.0036, "step": 17050 }, { "epoch": 33.78217821782178, "grad_norm": 0.09016386419534683, "learning_rate": 5.796254004927832e-06, "loss": 0.004, "step": 17060 }, { "epoch": 33.801980198019805, "grad_norm": 0.09436963498592377, "learning_rate": 5.757677389074806e-06, "loss": 0.0041, "step": 17070 }, { "epoch": 33.82178217821782, "grad_norm": 0.12839943170547485, "learning_rate": 5.719221730042385e-06, "loss": 0.0031, "step": 17080 }, { "epoch": 33.84158415841584, "grad_norm": 0.09439165145158768, "learning_rate": 5.680887132966911e-06, "loss": 0.0039, "step": 17090 }, { "epoch": 33.86138613861386, "grad_norm": 0.11368861049413681, "learning_rate": 5.642673702653683e-06, "loss": 0.003, "step": 17100 }, { "epoch": 33.881188118811885, "grad_norm": 0.10824242979288101, "learning_rate": 5.604581543576781e-06, "loss": 0.0039, "step": 17110 }, { "epoch": 33.9009900990099, "grad_norm": 0.353681743144989, "learning_rate": 5.566610759878704e-06, "loss": 0.0041, "step": 17120 }, { "epoch": 33.92079207920792, "grad_norm": 0.07451719790697098, "learning_rate": 5.528761455370119e-06, "loss": 0.0032, "step": 17130 }, { "epoch": 33.94059405940594, "grad_norm": 0.08310411125421524, "learning_rate": 5.491033733529594e-06, "loss": 0.0042, "step": 17140 }, { "epoch": 33.960396039603964, "grad_norm": 0.08229774981737137, "learning_rate": 5.453427697503255e-06, "loss": 0.0031, "step": 17150 }, { "epoch": 33.98019801980198, "grad_norm": 0.1215285211801529, "learning_rate": 5.415943450104599e-06, "loss": 0.0035, "step": 17160 }, { "epoch": 34.0, "grad_norm": 0.13819679617881775, "learning_rate": 5.378581093814111e-06, "loss": 0.0036, "step": 17170 }, { "epoch": 34.01980198019802, "grad_norm": 0.09488338977098465, "learning_rate": 5.3413407307790375e-06, "loss": 0.0038, "step": 17180 }, { "epoch": 34.039603960396036, "grad_norm": 0.087505042552948, "learning_rate": 5.30422246281313e-06, "loss": 0.0048, "step": 17190 }, { "epoch": 34.05940594059406, "grad_norm": 0.12778253853321075, "learning_rate": 5.267226391396296e-06, "loss": 0.0031, "step": 17200 }, { "epoch": 34.07920792079208, "grad_norm": 0.08946322649717331, "learning_rate": 5.2303526176744e-06, "loss": 0.0043, "step": 17210 }, { "epoch": 34.0990099009901, "grad_norm": 0.12675169110298157, "learning_rate": 5.193601242458929e-06, "loss": 0.0034, "step": 17220 }, { "epoch": 34.118811881188115, "grad_norm": 0.0962766706943512, "learning_rate": 5.156972366226714e-06, "loss": 0.0035, "step": 17230 }, { "epoch": 34.13861386138614, "grad_norm": 0.08664361387491226, "learning_rate": 5.120466089119735e-06, "loss": 0.0038, "step": 17240 }, { "epoch": 34.15841584158416, "grad_norm": 0.07018247246742249, "learning_rate": 5.084082510944749e-06, "loss": 0.0047, "step": 17250 }, { "epoch": 34.17821782178218, "grad_norm": 0.13072647154331207, "learning_rate": 5.047821731173058e-06, "loss": 0.0042, "step": 17260 }, { "epoch": 34.198019801980195, "grad_norm": 0.09480246901512146, "learning_rate": 5.011683848940274e-06, "loss": 0.0033, "step": 17270 }, { "epoch": 34.21782178217822, "grad_norm": 0.12551602721214294, "learning_rate": 4.975668963045954e-06, "loss": 0.0042, "step": 17280 }, { "epoch": 34.23762376237624, "grad_norm": 0.10916922986507416, "learning_rate": 4.9397771719534525e-06, "loss": 0.0034, "step": 17290 }, { "epoch": 34.257425742574256, "grad_norm": 0.11533031612634659, "learning_rate": 4.904008573789548e-06, "loss": 0.0031, "step": 17300 }, { "epoch": 34.277227722772274, "grad_norm": 0.10334401577711105, "learning_rate": 4.8683632663442005e-06, "loss": 0.0033, "step": 17310 }, { "epoch": 34.2970297029703, "grad_norm": 0.1203056052327156, "learning_rate": 4.832841347070343e-06, "loss": 0.0028, "step": 17320 }, { "epoch": 34.31683168316832, "grad_norm": 0.0944933146238327, "learning_rate": 4.797442913083539e-06, "loss": 0.0042, "step": 17330 }, { "epoch": 34.336633663366335, "grad_norm": 0.1266891062259674, "learning_rate": 4.7621680611617596e-06, "loss": 0.0049, "step": 17340 }, { "epoch": 34.35643564356435, "grad_norm": 0.13733252882957458, "learning_rate": 4.727016887745095e-06, "loss": 0.0038, "step": 17350 }, { "epoch": 34.37623762376238, "grad_norm": 0.11400517076253891, "learning_rate": 4.691989488935511e-06, "loss": 0.0036, "step": 17360 }, { "epoch": 34.396039603960396, "grad_norm": 0.10516846179962158, "learning_rate": 4.657085960496588e-06, "loss": 0.0034, "step": 17370 }, { "epoch": 34.415841584158414, "grad_norm": 0.0883408859372139, "learning_rate": 4.6223063978532265e-06, "loss": 0.0034, "step": 17380 }, { "epoch": 34.43564356435643, "grad_norm": 0.14377763867378235, "learning_rate": 4.587650896091439e-06, "loss": 0.0028, "step": 17390 }, { "epoch": 34.45544554455446, "grad_norm": 0.08714721351861954, "learning_rate": 4.553119549958035e-06, "loss": 0.0046, "step": 17400 }, { "epoch": 34.475247524752476, "grad_norm": 0.07962765544652939, "learning_rate": 4.518712453860385e-06, "loss": 0.004, "step": 17410 }, { "epoch": 34.495049504950494, "grad_norm": 0.09564808756113052, "learning_rate": 4.484429701866205e-06, "loss": 0.0045, "step": 17420 }, { "epoch": 34.51485148514851, "grad_norm": 0.07479019463062286, "learning_rate": 4.4502713877031975e-06, "loss": 0.0057, "step": 17430 }, { "epoch": 34.53465346534654, "grad_norm": 0.05900731682777405, "learning_rate": 4.416237604758911e-06, "loss": 0.0033, "step": 17440 }, { "epoch": 34.554455445544555, "grad_norm": 0.12131176143884659, "learning_rate": 4.3823284460804025e-06, "loss": 0.0035, "step": 17450 }, { "epoch": 34.57425742574257, "grad_norm": 0.07614139467477798, "learning_rate": 4.348544004374011e-06, "loss": 0.0038, "step": 17460 }, { "epoch": 34.59405940594059, "grad_norm": 0.08762862533330917, "learning_rate": 4.314884372005123e-06, "loss": 0.0025, "step": 17470 }, { "epoch": 34.613861386138616, "grad_norm": 0.10560333728790283, "learning_rate": 4.281349640997867e-06, "loss": 0.0032, "step": 17480 }, { "epoch": 34.633663366336634, "grad_norm": 0.12848791480064392, "learning_rate": 4.247939903034942e-06, "loss": 0.0037, "step": 17490 }, { "epoch": 34.65346534653465, "grad_norm": 0.12451065331697464, "learning_rate": 4.214655249457284e-06, "loss": 0.0036, "step": 17500 }, { "epoch": 34.67326732673267, "grad_norm": 0.07982125878334045, "learning_rate": 4.181495771263855e-06, "loss": 0.0037, "step": 17510 }, { "epoch": 34.693069306930695, "grad_norm": 0.10185917466878891, "learning_rate": 4.148461559111427e-06, "loss": 0.0048, "step": 17520 }, { "epoch": 34.71287128712871, "grad_norm": 0.15535041689872742, "learning_rate": 4.115552703314252e-06, "loss": 0.0029, "step": 17530 }, { "epoch": 34.73267326732673, "grad_norm": 0.07757784426212311, "learning_rate": 4.082769293843886e-06, "loss": 0.0026, "step": 17540 }, { "epoch": 34.75247524752475, "grad_norm": 0.09479980170726776, "learning_rate": 4.050111420328939e-06, "loss": 0.0043, "step": 17550 }, { "epoch": 34.772277227722775, "grad_norm": 0.07889006286859512, "learning_rate": 4.017579172054764e-06, "loss": 0.0046, "step": 17560 }, { "epoch": 34.79207920792079, "grad_norm": 0.09559512138366699, "learning_rate": 3.985172637963308e-06, "loss": 0.0038, "step": 17570 }, { "epoch": 34.81188118811881, "grad_norm": 0.08545385301113129, "learning_rate": 3.952891906652784e-06, "loss": 0.0033, "step": 17580 }, { "epoch": 34.83168316831683, "grad_norm": 0.09839265793561935, "learning_rate": 3.920737066377478e-06, "loss": 0.0036, "step": 17590 }, { "epoch": 34.851485148514854, "grad_norm": 0.1206684336066246, "learning_rate": 3.888708205047509e-06, "loss": 0.0059, "step": 17600 }, { "epoch": 34.87128712871287, "grad_norm": 0.06853261590003967, "learning_rate": 3.856805410228542e-06, "loss": 0.0039, "step": 17610 }, { "epoch": 34.89108910891089, "grad_norm": 0.09649675339460373, "learning_rate": 3.82502876914162e-06, "loss": 0.0034, "step": 17620 }, { "epoch": 34.91089108910891, "grad_norm": 0.11544796824455261, "learning_rate": 3.7933783686628586e-06, "loss": 0.0043, "step": 17630 }, { "epoch": 34.93069306930693, "grad_norm": 0.07522424310445786, "learning_rate": 3.7618542953232306e-06, "loss": 0.0034, "step": 17640 }, { "epoch": 34.95049504950495, "grad_norm": 0.06960292905569077, "learning_rate": 3.7304566353083658e-06, "loss": 0.0025, "step": 17650 }, { "epoch": 34.97029702970297, "grad_norm": 0.19875530898571014, "learning_rate": 3.6991854744582555e-06, "loss": 0.0034, "step": 17660 }, { "epoch": 34.99009900990099, "grad_norm": 0.10460120439529419, "learning_rate": 3.6680408982670777e-06, "loss": 0.0043, "step": 17670 }, { "epoch": 35.00990099009901, "grad_norm": 0.10051169246435165, "learning_rate": 3.637022991882899e-06, "loss": 0.0043, "step": 17680 }, { "epoch": 35.02970297029703, "grad_norm": 0.10045125335454941, "learning_rate": 3.606131840107485e-06, "loss": 0.0042, "step": 17690 }, { "epoch": 35.04950495049505, "grad_norm": 0.11237995326519012, "learning_rate": 3.575367527396084e-06, "loss": 0.0029, "step": 17700 }, { "epoch": 35.06930693069307, "grad_norm": 0.07639443129301071, "learning_rate": 3.5447301378571386e-06, "loss": 0.0033, "step": 17710 }, { "epoch": 35.08910891089109, "grad_norm": 0.06664007902145386, "learning_rate": 3.514219755252113e-06, "loss": 0.0049, "step": 17720 }, { "epoch": 35.10891089108911, "grad_norm": 0.12641526758670807, "learning_rate": 3.4838364629952213e-06, "loss": 0.0042, "step": 17730 }, { "epoch": 35.12871287128713, "grad_norm": 0.08074381947517395, "learning_rate": 3.4535803441532123e-06, "loss": 0.0028, "step": 17740 }, { "epoch": 35.148514851485146, "grad_norm": 0.1328432410955429, "learning_rate": 3.4234514814451836e-06, "loss": 0.0032, "step": 17750 }, { "epoch": 35.16831683168317, "grad_norm": 0.08454656600952148, "learning_rate": 3.393449957242273e-06, "loss": 0.0036, "step": 17760 }, { "epoch": 35.18811881188119, "grad_norm": 0.08047834783792496, "learning_rate": 3.363575853567524e-06, "loss": 0.0031, "step": 17770 }, { "epoch": 35.20792079207921, "grad_norm": 0.16320696473121643, "learning_rate": 3.3338292520955826e-06, "loss": 0.0033, "step": 17780 }, { "epoch": 35.227722772277225, "grad_norm": 0.0938926488161087, "learning_rate": 3.304210234152516e-06, "loss": 0.0032, "step": 17790 }, { "epoch": 35.24752475247525, "grad_norm": 0.06067925691604614, "learning_rate": 3.2747188807155993e-06, "loss": 0.0029, "step": 17800 }, { "epoch": 35.26732673267327, "grad_norm": 0.11046380549669266, "learning_rate": 3.2453552724130643e-06, "loss": 0.0033, "step": 17810 }, { "epoch": 35.28712871287129, "grad_norm": 0.10326196253299713, "learning_rate": 3.216119489523889e-06, "loss": 0.0033, "step": 17820 }, { "epoch": 35.306930693069305, "grad_norm": 0.15758758783340454, "learning_rate": 3.1870116119775917e-06, "loss": 0.0036, "step": 17830 }, { "epoch": 35.32673267326733, "grad_norm": 0.07022589445114136, "learning_rate": 3.158031719353999e-06, "loss": 0.0045, "step": 17840 }, { "epoch": 35.34653465346535, "grad_norm": 0.07760326564311981, "learning_rate": 3.1291798908830273e-06, "loss": 0.0028, "step": 17850 }, { "epoch": 35.366336633663366, "grad_norm": 0.09564215689897537, "learning_rate": 3.1004562054444853e-06, "loss": 0.0039, "step": 17860 }, { "epoch": 35.386138613861384, "grad_norm": 0.08802928775548935, "learning_rate": 3.071860741567806e-06, "loss": 0.0034, "step": 17870 }, { "epoch": 35.40594059405941, "grad_norm": 0.12692126631736755, "learning_rate": 3.04339357743193e-06, "loss": 0.0033, "step": 17880 }, { "epoch": 35.42574257425743, "grad_norm": 0.08101876080036163, "learning_rate": 3.0150547908649628e-06, "loss": 0.0036, "step": 17890 }, { "epoch": 35.445544554455445, "grad_norm": 0.07895857095718384, "learning_rate": 2.9868444593440957e-06, "loss": 0.0041, "step": 17900 }, { "epoch": 35.46534653465346, "grad_norm": 0.13766685128211975, "learning_rate": 2.9587626599952846e-06, "loss": 0.0029, "step": 17910 }, { "epoch": 35.48514851485149, "grad_norm": 0.0973987877368927, "learning_rate": 2.930809469593082e-06, "loss": 0.0034, "step": 17920 }, { "epoch": 35.504950495049506, "grad_norm": 0.08576986193656921, "learning_rate": 2.9029849645604733e-06, "loss": 0.0026, "step": 17930 }, { "epoch": 35.524752475247524, "grad_norm": 0.08131563663482666, "learning_rate": 2.8752892209685632e-06, "loss": 0.0035, "step": 17940 }, { "epoch": 35.54455445544554, "grad_norm": 0.13403970003128052, "learning_rate": 2.847722314536483e-06, "loss": 0.0036, "step": 17950 }, { "epoch": 35.56435643564357, "grad_norm": 0.08651141822338104, "learning_rate": 2.820284320631078e-06, "loss": 0.0034, "step": 17960 }, { "epoch": 35.584158415841586, "grad_norm": 0.12421918660402298, "learning_rate": 2.792975314266788e-06, "loss": 0.003, "step": 17970 }, { "epoch": 35.603960396039604, "grad_norm": 0.06776434928178787, "learning_rate": 2.7657953701054007e-06, "loss": 0.0029, "step": 17980 }, { "epoch": 35.62376237623762, "grad_norm": 0.15708424150943756, "learning_rate": 2.7387445624558306e-06, "loss": 0.0035, "step": 17990 }, { "epoch": 35.64356435643565, "grad_norm": 0.10055988281965256, "learning_rate": 2.7118229652739747e-06, "loss": 0.0035, "step": 18000 }, { "epoch": 35.663366336633665, "grad_norm": 0.08272873610258102, "learning_rate": 2.6850306521624236e-06, "loss": 0.0029, "step": 18010 }, { "epoch": 35.68316831683168, "grad_norm": 0.0876065194606781, "learning_rate": 2.6583676963703507e-06, "loss": 0.0026, "step": 18020 }, { "epoch": 35.7029702970297, "grad_norm": 0.06873156875371933, "learning_rate": 2.631834170793268e-06, "loss": 0.003, "step": 18030 }, { "epoch": 35.722772277227726, "grad_norm": 0.08681660145521164, "learning_rate": 2.6054301479728036e-06, "loss": 0.0032, "step": 18040 }, { "epoch": 35.742574257425744, "grad_norm": 0.08988898992538452, "learning_rate": 2.579155700096575e-06, "loss": 0.004, "step": 18050 }, { "epoch": 35.76237623762376, "grad_norm": 0.08358919620513916, "learning_rate": 2.5530108989978873e-06, "loss": 0.0031, "step": 18060 }, { "epoch": 35.78217821782178, "grad_norm": 0.1016467958688736, "learning_rate": 2.5269958161556416e-06, "loss": 0.0037, "step": 18070 }, { "epoch": 35.801980198019805, "grad_norm": 0.060684118419885635, "learning_rate": 2.5011105226940888e-06, "loss": 0.0044, "step": 18080 }, { "epoch": 35.82178217821782, "grad_norm": 0.10176170617341995, "learning_rate": 2.4753550893826248e-06, "loss": 0.003, "step": 18090 }, { "epoch": 35.84158415841584, "grad_norm": 0.07403777539730072, "learning_rate": 2.4497295866356296e-06, "loss": 0.0036, "step": 18100 }, { "epoch": 35.86138613861386, "grad_norm": 0.08445281535387039, "learning_rate": 2.424234084512228e-06, "loss": 0.003, "step": 18110 }, { "epoch": 35.881188118811885, "grad_norm": 0.07241027057170868, "learning_rate": 2.3988686527161687e-06, "loss": 0.0034, "step": 18120 }, { "epoch": 35.9009900990099, "grad_norm": 0.0793612077832222, "learning_rate": 2.373633360595573e-06, "loss": 0.0037, "step": 18130 }, { "epoch": 35.92079207920792, "grad_norm": 0.09805171936750412, "learning_rate": 2.3485282771427585e-06, "loss": 0.0029, "step": 18140 }, { "epoch": 35.94059405940594, "grad_norm": 0.08679939806461334, "learning_rate": 2.3235534709940665e-06, "loss": 0.0032, "step": 18150 }, { "epoch": 35.960396039603964, "grad_norm": 0.07956482470035553, "learning_rate": 2.2987090104296617e-06, "loss": 0.0027, "step": 18160 }, { "epoch": 35.98019801980198, "grad_norm": 0.06199061870574951, "learning_rate": 2.273994963373355e-06, "loss": 0.0042, "step": 18170 }, { "epoch": 36.0, "grad_norm": 0.1715719848871231, "learning_rate": 2.249411397392409e-06, "loss": 0.0035, "step": 18180 }, { "epoch": 36.01980198019802, "grad_norm": 0.13919757306575775, "learning_rate": 2.2249583796973506e-06, "loss": 0.003, "step": 18190 }, { "epoch": 36.039603960396036, "grad_norm": 0.07975203543901443, "learning_rate": 2.200635977141796e-06, "loss": 0.0061, "step": 18200 }, { "epoch": 36.05940594059406, "grad_norm": 0.07281020283699036, "learning_rate": 2.17644425622226e-06, "loss": 0.0035, "step": 18210 }, { "epoch": 36.07920792079208, "grad_norm": 0.06285537034273148, "learning_rate": 2.152383283077991e-06, "loss": 0.0035, "step": 18220 }, { "epoch": 36.0990099009901, "grad_norm": 0.10281231999397278, "learning_rate": 2.128453123490781e-06, "loss": 0.003, "step": 18230 }, { "epoch": 36.118811881188115, "grad_norm": 0.05404623970389366, "learning_rate": 2.1046538428847462e-06, "loss": 0.0031, "step": 18240 }, { "epoch": 36.13861386138614, "grad_norm": 0.06724295765161514, "learning_rate": 2.0809855063262273e-06, "loss": 0.0043, "step": 18250 }, { "epoch": 36.15841584158416, "grad_norm": 0.08123020827770233, "learning_rate": 2.057448178523558e-06, "loss": 0.0041, "step": 18260 }, { "epoch": 36.17821782178218, "grad_norm": 0.06296538561582565, "learning_rate": 2.034041923826885e-06, "loss": 0.0033, "step": 18270 }, { "epoch": 36.198019801980195, "grad_norm": 0.10159935802221298, "learning_rate": 2.0107668062280204e-06, "loss": 0.0039, "step": 18280 }, { "epoch": 36.21782178217822, "grad_norm": 0.07073197513818741, "learning_rate": 1.9876228893602357e-06, "loss": 0.0032, "step": 18290 }, { "epoch": 36.23762376237624, "grad_norm": 0.09778368473052979, "learning_rate": 1.9646102364981266e-06, "loss": 0.0037, "step": 18300 }, { "epoch": 36.257425742574256, "grad_norm": 0.11057127267122269, "learning_rate": 1.9417289105574053e-06, "loss": 0.0046, "step": 18310 }, { "epoch": 36.277227722772274, "grad_norm": 0.0757347121834755, "learning_rate": 1.9189789740947427e-06, "loss": 0.003, "step": 18320 }, { "epoch": 36.2970297029703, "grad_norm": 0.0633690282702446, "learning_rate": 1.896360489307597e-06, "loss": 0.0036, "step": 18330 }, { "epoch": 36.31683168316832, "grad_norm": 0.08957216143608093, "learning_rate": 1.8738735180340362e-06, "loss": 0.0027, "step": 18340 }, { "epoch": 36.336633663366335, "grad_norm": 0.09313087165355682, "learning_rate": 1.8515181217525824e-06, "loss": 0.0038, "step": 18350 }, { "epoch": 36.35643564356435, "grad_norm": 0.07103991508483887, "learning_rate": 1.8292943615820457e-06, "loss": 0.0043, "step": 18360 }, { "epoch": 36.37623762376238, "grad_norm": 0.08272033929824829, "learning_rate": 1.8072022982813296e-06, "loss": 0.0044, "step": 18370 }, { "epoch": 36.396039603960396, "grad_norm": 0.110234834253788, "learning_rate": 1.7852419922492925e-06, "loss": 0.0029, "step": 18380 }, { "epoch": 36.415841584158414, "grad_norm": 0.07498534023761749, "learning_rate": 1.763413503524569e-06, "loss": 0.0032, "step": 18390 }, { "epoch": 36.43564356435643, "grad_norm": 0.08735746145248413, "learning_rate": 1.7417168917854165e-06, "loss": 0.003, "step": 18400 }, { "epoch": 36.45544554455446, "grad_norm": 0.07570893317461014, "learning_rate": 1.720152216349552e-06, "loss": 0.0043, "step": 18410 }, { "epoch": 36.475247524752476, "grad_norm": 0.07637068629264832, "learning_rate": 1.6987195361739595e-06, "loss": 0.0037, "step": 18420 }, { "epoch": 36.495049504950494, "grad_norm": 0.08607878535985947, "learning_rate": 1.6774189098547832e-06, "loss": 0.0031, "step": 18430 }, { "epoch": 36.51485148514851, "grad_norm": 0.07991831749677658, "learning_rate": 1.6562503956271069e-06, "loss": 0.0048, "step": 18440 }, { "epoch": 36.53465346534654, "grad_norm": 0.09281142801046371, "learning_rate": 1.6352140513648417e-06, "loss": 0.0039, "step": 18450 }, { "epoch": 36.554455445544555, "grad_norm": 0.07477410137653351, "learning_rate": 1.6143099345805712e-06, "loss": 0.0042, "step": 18460 }, { "epoch": 36.57425742574257, "grad_norm": 0.11860659718513489, "learning_rate": 1.5935381024253293e-06, "loss": 0.0034, "step": 18470 }, { "epoch": 36.59405940594059, "grad_norm": 0.07419423758983612, "learning_rate": 1.572898611688517e-06, "loss": 0.0037, "step": 18480 }, { "epoch": 36.613861386138616, "grad_norm": 0.07210385799407959, "learning_rate": 1.5523915187977133e-06, "loss": 0.0034, "step": 18490 }, { "epoch": 36.633663366336634, "grad_norm": 0.0886564627289772, "learning_rate": 1.532016879818532e-06, "loss": 0.0039, "step": 18500 }, { "epoch": 36.65346534653465, "grad_norm": 0.06269262731075287, "learning_rate": 1.51177475045447e-06, "loss": 0.0029, "step": 18510 }, { "epoch": 36.67326732673267, "grad_norm": 0.091458760201931, "learning_rate": 1.4916651860467035e-06, "loss": 0.0031, "step": 18520 }, { "epoch": 36.693069306930695, "grad_norm": 0.08740977942943573, "learning_rate": 1.471688241574043e-06, "loss": 0.0033, "step": 18530 }, { "epoch": 36.71287128712871, "grad_norm": 0.10957670211791992, "learning_rate": 1.451843971652672e-06, "loss": 0.0036, "step": 18540 }, { "epoch": 36.73267326732673, "grad_norm": 0.09682370722293854, "learning_rate": 1.432132430536076e-06, "loss": 0.004, "step": 18550 }, { "epoch": 36.75247524752475, "grad_norm": 0.07413970679044724, "learning_rate": 1.412553672114869e-06, "loss": 0.0036, "step": 18560 }, { "epoch": 36.772277227722775, "grad_norm": 0.05547945201396942, "learning_rate": 1.3931077499166056e-06, "loss": 0.0035, "step": 18570 }, { "epoch": 36.79207920792079, "grad_norm": 0.07108374685049057, "learning_rate": 1.3737947171057085e-06, "loss": 0.0036, "step": 18580 }, { "epoch": 36.81188118811881, "grad_norm": 0.18211926519870758, "learning_rate": 1.3546146264832582e-06, "loss": 0.0038, "step": 18590 }, { "epoch": 36.83168316831683, "grad_norm": 0.06867118924856186, "learning_rate": 1.3355675304869086e-06, "loss": 0.0036, "step": 18600 }, { "epoch": 36.851485148514854, "grad_norm": 0.0728214830160141, "learning_rate": 1.3166534811906827e-06, "loss": 0.005, "step": 18610 }, { "epoch": 36.87128712871287, "grad_norm": 0.08580135554075241, "learning_rate": 1.2978725303048666e-06, "loss": 0.0027, "step": 18620 }, { "epoch": 36.89108910891089, "grad_norm": 0.06674764305353165, "learning_rate": 1.2792247291758762e-06, "loss": 0.0036, "step": 18630 }, { "epoch": 36.91089108910891, "grad_norm": 0.05617116019129753, "learning_rate": 1.2607101287860635e-06, "loss": 0.0044, "step": 18640 }, { "epoch": 36.93069306930693, "grad_norm": 0.1844770461320877, "learning_rate": 1.2423287797536654e-06, "loss": 0.003, "step": 18650 }, { "epoch": 36.95049504950495, "grad_norm": 0.04959391430020332, "learning_rate": 1.2240807323325776e-06, "loss": 0.0031, "step": 18660 }, { "epoch": 36.97029702970297, "grad_norm": 0.07914312183856964, "learning_rate": 1.205966036412254e-06, "loss": 0.0039, "step": 18670 }, { "epoch": 36.99009900990099, "grad_norm": 0.12778861820697784, "learning_rate": 1.1879847415175949e-06, "loss": 0.0024, "step": 18680 }, { "epoch": 37.00990099009901, "grad_norm": 0.1037282943725586, "learning_rate": 1.1701368968087712e-06, "loss": 0.0034, "step": 18690 }, { "epoch": 37.02970297029703, "grad_norm": 0.09589175879955292, "learning_rate": 1.1524225510811116e-06, "loss": 0.0025, "step": 18700 }, { "epoch": 37.04950495049505, "grad_norm": 0.08655616641044617, "learning_rate": 1.1348417527649535e-06, "loss": 0.0032, "step": 18710 }, { "epoch": 37.06930693069307, "grad_norm": 0.05283910036087036, "learning_rate": 1.1173945499255268e-06, "loss": 0.0028, "step": 18720 }, { "epoch": 37.08910891089109, "grad_norm": 0.06970123946666718, "learning_rate": 1.1000809902628307e-06, "loss": 0.0034, "step": 18730 }, { "epoch": 37.10891089108911, "grad_norm": 0.06557361781597137, "learning_rate": 1.082901121111468e-06, "loss": 0.003, "step": 18740 }, { "epoch": 37.12871287128713, "grad_norm": 0.06459652632474899, "learning_rate": 1.0658549894405456e-06, "loss": 0.0033, "step": 18750 }, { "epoch": 37.148514851485146, "grad_norm": 0.09647754579782486, "learning_rate": 1.0489426418535342e-06, "loss": 0.0029, "step": 18760 }, { "epoch": 37.16831683168317, "grad_norm": 0.09600921720266342, "learning_rate": 1.0321641245881474e-06, "loss": 0.0045, "step": 18770 }, { "epoch": 37.18811881188119, "grad_norm": 0.07674586772918701, "learning_rate": 1.015519483516214e-06, "loss": 0.0036, "step": 18780 }, { "epoch": 37.20792079207921, "grad_norm": 0.11543366312980652, "learning_rate": 9.990087641435443e-07, "loss": 0.0034, "step": 18790 }, { "epoch": 37.227722772277225, "grad_norm": 0.066350057721138, "learning_rate": 9.826320116098132e-07, "loss": 0.0031, "step": 18800 }, { "epoch": 37.24752475247525, "grad_norm": 0.07722385972738266, "learning_rate": 9.663892706884447e-07, "loss": 0.0032, "step": 18810 }, { "epoch": 37.26732673267327, "grad_norm": 0.07065491378307343, "learning_rate": 9.502805857864616e-07, "loss": 0.004, "step": 18820 }, { "epoch": 37.28712871287129, "grad_norm": 0.06879852712154388, "learning_rate": 9.34306000944396e-07, "loss": 0.0032, "step": 18830 }, { "epoch": 37.306930693069305, "grad_norm": 0.05707300454378128, "learning_rate": 9.184655598361624e-07, "loss": 0.0048, "step": 18840 }, { "epoch": 37.32673267326733, "grad_norm": 0.0738983154296875, "learning_rate": 9.027593057689076e-07, "loss": 0.0034, "step": 18850 }, { "epoch": 37.34653465346535, "grad_norm": 0.06453447043895721, "learning_rate": 8.871872816829441e-07, "loss": 0.0035, "step": 18860 }, { "epoch": 37.366336633663366, "grad_norm": 0.07070840150117874, "learning_rate": 8.717495301515777e-07, "loss": 0.0031, "step": 18870 }, { "epoch": 37.386138613861384, "grad_norm": 0.08017843216657639, "learning_rate": 8.564460933810415e-07, "loss": 0.0033, "step": 18880 }, { "epoch": 37.40594059405941, "grad_norm": 0.11353304237127304, "learning_rate": 8.412770132103453e-07, "loss": 0.004, "step": 18890 }, { "epoch": 37.42574257425743, "grad_norm": 0.061830103397369385, "learning_rate": 8.262423311111711e-07, "loss": 0.0032, "step": 18900 }, { "epoch": 37.445544554455445, "grad_norm": 0.12132392823696136, "learning_rate": 8.113420881877665e-07, "loss": 0.0031, "step": 18910 }, { "epoch": 37.46534653465346, "grad_norm": 0.0956854447722435, "learning_rate": 7.965763251768288e-07, "loss": 0.0032, "step": 18920 }, { "epoch": 37.48514851485149, "grad_norm": 0.09288842231035233, "learning_rate": 7.819450824473995e-07, "loss": 0.0031, "step": 18930 }, { "epoch": 37.504950495049506, "grad_norm": 0.08361069858074188, "learning_rate": 7.674484000007198e-07, "loss": 0.0025, "step": 18940 }, { "epoch": 37.524752475247524, "grad_norm": 0.12988193333148956, "learning_rate": 7.530863174701752e-07, "loss": 0.0033, "step": 18950 }, { "epoch": 37.54455445544554, "grad_norm": 0.09103484451770782, "learning_rate": 7.38858874121151e-07, "loss": 0.0047, "step": 18960 }, { "epoch": 37.56435643564357, "grad_norm": 0.10136369615793228, "learning_rate": 7.247661088509328e-07, "loss": 0.0032, "step": 18970 }, { "epoch": 37.584158415841586, "grad_norm": 0.0699278935790062, "learning_rate": 7.108080601886002e-07, "loss": 0.0029, "step": 18980 }, { "epoch": 37.603960396039604, "grad_norm": 0.10937847197055817, "learning_rate": 6.969847662949336e-07, "loss": 0.0035, "step": 18990 }, { "epoch": 37.62376237623762, "grad_norm": 0.07546510547399521, "learning_rate": 6.832962649622798e-07, "loss": 0.0038, "step": 19000 }, { "epoch": 37.64356435643565, "grad_norm": 0.1554333120584488, "learning_rate": 6.697425936144863e-07, "loss": 0.0042, "step": 19010 }, { "epoch": 37.663366336633665, "grad_norm": 0.05972469225525856, "learning_rate": 6.563237893067731e-07, "loss": 0.0038, "step": 19020 }, { "epoch": 37.68316831683168, "grad_norm": 0.08999968320131302, "learning_rate": 6.430398887256328e-07, "loss": 0.0038, "step": 19030 }, { "epoch": 37.7029702970297, "grad_norm": 0.05801737681031227, "learning_rate": 6.298909281887478e-07, "loss": 0.0028, "step": 19040 }, { "epoch": 37.722772277227726, "grad_norm": 0.06012775003910065, "learning_rate": 6.168769436448673e-07, "loss": 0.0025, "step": 19050 }, { "epoch": 37.742574257425744, "grad_norm": 0.0653783529996872, "learning_rate": 6.03997970673742e-07, "loss": 0.003, "step": 19060 }, { "epoch": 37.76237623762376, "grad_norm": 0.08053107559680939, "learning_rate": 5.912540444859782e-07, "loss": 0.0041, "step": 19070 }, { "epoch": 37.78217821782178, "grad_norm": 0.09589800238609314, "learning_rate": 5.786451999229837e-07, "loss": 0.0048, "step": 19080 }, { "epoch": 37.801980198019805, "grad_norm": 0.15838636457920074, "learning_rate": 5.661714714568722e-07, "loss": 0.0039, "step": 19090 }, { "epoch": 37.82178217821782, "grad_norm": 0.15103557705879211, "learning_rate": 5.538328931903259e-07, "loss": 0.0055, "step": 19100 }, { "epoch": 37.84158415841584, "grad_norm": 0.07228780537843704, "learning_rate": 5.416294988565551e-07, "loss": 0.0046, "step": 19110 }, { "epoch": 37.86138613861386, "grad_norm": 0.061356645077466965, "learning_rate": 5.29561321819172e-07, "loss": 0.0032, "step": 19120 }, { "epoch": 37.881188118811885, "grad_norm": 0.08987829834222794, "learning_rate": 5.176283950721061e-07, "loss": 0.0048, "step": 19130 }, { "epoch": 37.9009900990099, "grad_norm": 0.10829996317625046, "learning_rate": 5.058307512395332e-07, "loss": 0.0059, "step": 19140 }, { "epoch": 37.92079207920792, "grad_norm": 0.09810294210910797, "learning_rate": 4.941684225757526e-07, "loss": 0.004, "step": 19150 }, { "epoch": 37.94059405940594, "grad_norm": 0.08465871214866638, "learning_rate": 4.826414409651314e-07, "loss": 0.0034, "step": 19160 }, { "epoch": 37.960396039603964, "grad_norm": 0.08666183799505234, "learning_rate": 4.712498379219943e-07, "loss": 0.0038, "step": 19170 }, { "epoch": 37.98019801980198, "grad_norm": 0.08979610353708267, "learning_rate": 4.599936445905506e-07, "loss": 0.0048, "step": 19180 }, { "epoch": 38.0, "grad_norm": 0.2221834510564804, "learning_rate": 4.4887289174480594e-07, "loss": 0.0031, "step": 19190 }, { "epoch": 38.01980198019802, "grad_norm": 0.05905792862176895, "learning_rate": 4.378876097884621e-07, "loss": 0.0025, "step": 19200 }, { "epoch": 38.039603960396036, "grad_norm": 0.10588892549276352, "learning_rate": 4.2703782875487264e-07, "loss": 0.0032, "step": 19210 }, { "epoch": 38.05940594059406, "grad_norm": 0.09269928187131882, "learning_rate": 4.163235783069208e-07, "loss": 0.0031, "step": 19220 }, { "epoch": 38.07920792079208, "grad_norm": 0.06352471560239792, "learning_rate": 4.057448877369585e-07, "loss": 0.0041, "step": 19230 }, { "epoch": 38.0990099009901, "grad_norm": 0.06330405175685883, "learning_rate": 3.9530178596672295e-07, "loss": 0.0034, "step": 19240 }, { "epoch": 38.118811881188115, "grad_norm": 0.09997168928384781, "learning_rate": 3.849943015472479e-07, "loss": 0.0036, "step": 19250 }, { "epoch": 38.13861386138614, "grad_norm": 0.10886431485414505, "learning_rate": 3.748224626588137e-07, "loss": 0.0031, "step": 19260 }, { "epoch": 38.15841584158416, "grad_norm": 0.07031849771738052, "learning_rate": 3.647862971108307e-07, "loss": 0.0029, "step": 19270 }, { "epoch": 38.17821782178218, "grad_norm": 0.06360998749732971, "learning_rate": 3.5488583234179473e-07, "loss": 0.003, "step": 19280 }, { "epoch": 38.198019801980195, "grad_norm": 0.07677295804023743, "learning_rate": 3.4512109541920413e-07, "loss": 0.0045, "step": 19290 }, { "epoch": 38.21782178217822, "grad_norm": 0.12924718856811523, "learning_rate": 3.354921130394706e-07, "loss": 0.0034, "step": 19300 }, { "epoch": 38.23762376237624, "grad_norm": 0.06167233735322952, "learning_rate": 3.259989115278639e-07, "loss": 0.0031, "step": 19310 }, { "epoch": 38.257425742574256, "grad_norm": 0.08331756293773651, "learning_rate": 3.1664151683843403e-07, "loss": 0.004, "step": 19320 }, { "epoch": 38.277227722772274, "grad_norm": 0.0656290203332901, "learning_rate": 3.074199545539447e-07, "loss": 0.0032, "step": 19330 }, { "epoch": 38.2970297029703, "grad_norm": 0.061017755419015884, "learning_rate": 2.983342498857955e-07, "loss": 0.0034, "step": 19340 }, { "epoch": 38.31683168316832, "grad_norm": 0.07890347391366959, "learning_rate": 2.893844276739499e-07, "loss": 0.0035, "step": 19350 }, { "epoch": 38.336633663366335, "grad_norm": 0.08436411619186401, "learning_rate": 2.8057051238688514e-07, "loss": 0.0044, "step": 19360 }, { "epoch": 38.35643564356435, "grad_norm": 0.050976481288671494, "learning_rate": 2.71892528121509e-07, "loss": 0.0029, "step": 19370 }, { "epoch": 38.37623762376238, "grad_norm": 0.06301739066839218, "learning_rate": 2.633504986030988e-07, "loss": 0.003, "step": 19380 }, { "epoch": 38.396039603960396, "grad_norm": 0.1382066309452057, "learning_rate": 2.549444471852347e-07, "loss": 0.0032, "step": 19390 }, { "epoch": 38.415841584158414, "grad_norm": 0.06707846373319626, "learning_rate": 2.4667439684974423e-07, "loss": 0.0026, "step": 19400 }, { "epoch": 38.43564356435643, "grad_norm": 0.06805866956710815, "learning_rate": 2.3854037020662467e-07, "loss": 0.0047, "step": 19410 }, { "epoch": 38.45544554455446, "grad_norm": 0.06395847350358963, "learning_rate": 2.3054238949399288e-07, "loss": 0.0033, "step": 19420 }, { "epoch": 38.475247524752476, "grad_norm": 0.07167374342679977, "learning_rate": 2.2268047657802993e-07, "loss": 0.0059, "step": 19430 }, { "epoch": 38.495049504950494, "grad_norm": 0.1685115545988083, "learning_rate": 2.149546529529034e-07, "loss": 0.0038, "step": 19440 }, { "epoch": 38.51485148514851, "grad_norm": 0.09654707461595535, "learning_rate": 2.0736493974071736e-07, "loss": 0.0032, "step": 19450 }, { "epoch": 38.53465346534654, "grad_norm": 0.060527779161930084, "learning_rate": 1.9991135769145686e-07, "loss": 0.0034, "step": 19460 }, { "epoch": 38.554455445544555, "grad_norm": 0.08695706725120544, "learning_rate": 1.9259392718293245e-07, "loss": 0.0049, "step": 19470 }, { "epoch": 38.57425742574257, "grad_norm": 0.06632280349731445, "learning_rate": 1.8541266822072467e-07, "loss": 0.0032, "step": 19480 }, { "epoch": 38.59405940594059, "grad_norm": 0.05977049097418785, "learning_rate": 1.7836760043811184e-07, "loss": 0.0028, "step": 19490 }, { "epoch": 38.613861386138616, "grad_norm": 0.07864651829004288, "learning_rate": 1.7145874309604792e-07, "loss": 0.0032, "step": 19500 }, { "epoch": 38.633663366336634, "grad_norm": 0.09718827903270721, "learning_rate": 1.6468611508308474e-07, "loss": 0.0028, "step": 19510 }, { "epoch": 38.65346534653465, "grad_norm": 0.07957422733306885, "learning_rate": 1.5804973491532204e-07, "loss": 0.0028, "step": 19520 }, { "epoch": 38.67326732673267, "grad_norm": 0.07704255729913712, "learning_rate": 1.5154962073637424e-07, "loss": 0.0027, "step": 19530 }, { "epoch": 38.693069306930695, "grad_norm": 0.14303703606128693, "learning_rate": 1.4518579031730372e-07, "loss": 0.0034, "step": 19540 }, { "epoch": 38.71287128712871, "grad_norm": 0.0684218555688858, "learning_rate": 1.389582610565876e-07, "loss": 0.0044, "step": 19550 }, { "epoch": 38.73267326732673, "grad_norm": 0.11056157946586609, "learning_rate": 1.3286704998003995e-07, "loss": 0.0031, "step": 19560 }, { "epoch": 38.75247524752475, "grad_norm": 0.05554186552762985, "learning_rate": 1.2691217374080632e-07, "loss": 0.0028, "step": 19570 }, { "epoch": 38.772277227722775, "grad_norm": 0.11263319849967957, "learning_rate": 1.2109364861929705e-07, "loss": 0.0029, "step": 19580 }, { "epoch": 38.79207920792079, "grad_norm": 0.07246825098991394, "learning_rate": 1.1541149052312628e-07, "loss": 0.0033, "step": 19590 }, { "epoch": 38.81188118811881, "grad_norm": 0.07978416979312897, "learning_rate": 1.0986571498710074e-07, "loss": 0.0031, "step": 19600 }, { "epoch": 38.83168316831683, "grad_norm": 0.0690901055932045, "learning_rate": 1.0445633717316438e-07, "loss": 0.0045, "step": 19610 }, { "epoch": 38.851485148514854, "grad_norm": 0.08683246374130249, "learning_rate": 9.918337187034277e-08, "loss": 0.0029, "step": 19620 }, { "epoch": 38.87128712871287, "grad_norm": 0.10963574051856995, "learning_rate": 9.404683349472643e-08, "loss": 0.0028, "step": 19630 }, { "epoch": 38.89108910891089, "grad_norm": 0.07607074081897736, "learning_rate": 8.904673608940983e-08, "loss": 0.003, "step": 19640 }, { "epoch": 38.91089108910891, "grad_norm": 0.06882975995540619, "learning_rate": 8.418309332447471e-08, "loss": 0.0034, "step": 19650 }, { "epoch": 38.93069306930693, "grad_norm": 0.06352885067462921, "learning_rate": 7.945591849692902e-08, "loss": 0.0045, "step": 19660 }, { "epoch": 38.95049504950495, "grad_norm": 0.07140235602855682, "learning_rate": 7.486522453069578e-08, "loss": 0.0026, "step": 19670 }, { "epoch": 38.97029702970297, "grad_norm": 0.16476468741893768, "learning_rate": 7.041102397655208e-08, "loss": 0.0035, "step": 19680 }, { "epoch": 38.99009900990099, "grad_norm": 0.07033969461917877, "learning_rate": 6.609332901210685e-08, "loss": 0.0039, "step": 19690 }, { "epoch": 39.00990099009901, "grad_norm": 0.054452911019325256, "learning_rate": 6.191215144178419e-08, "loss": 0.0042, "step": 19700 }, { "epoch": 39.02970297029703, "grad_norm": 0.053264640271663666, "learning_rate": 5.786750269675678e-08, "loss": 0.0036, "step": 19710 }, { "epoch": 39.04950495049505, "grad_norm": 0.07240825146436691, "learning_rate": 5.395939383494031e-08, "loss": 0.0026, "step": 19720 }, { "epoch": 39.06930693069307, "grad_norm": 0.055415697395801544, "learning_rate": 5.018783554095463e-08, "loss": 0.0033, "step": 19730 }, { "epoch": 39.08910891089109, "grad_norm": 0.06653116643428802, "learning_rate": 4.655283812610156e-08, "loss": 0.003, "step": 19740 }, { "epoch": 39.10891089108911, "grad_norm": 0.07280904054641724, "learning_rate": 4.305441152831491e-08, "loss": 0.0026, "step": 19750 }, { "epoch": 39.12871287128713, "grad_norm": 0.0687834694981575, "learning_rate": 3.9692565312171584e-08, "loss": 0.0031, "step": 19760 }, { "epoch": 39.148514851485146, "grad_norm": 0.06945894658565521, "learning_rate": 3.6467308668824975e-08, "loss": 0.0027, "step": 19770 }, { "epoch": 39.16831683168317, "grad_norm": 0.11281078308820724, "learning_rate": 3.3378650416004964e-08, "loss": 0.0036, "step": 19780 }, { "epoch": 39.18811881188119, "grad_norm": 0.09210415184497833, "learning_rate": 3.042659899797906e-08, "loss": 0.0036, "step": 19790 }, { "epoch": 39.20792079207921, "grad_norm": 0.07662764191627502, "learning_rate": 2.76111624855524e-08, "loss": 0.0047, "step": 19800 }, { "epoch": 39.227722772277225, "grad_norm": 0.06346912682056427, "learning_rate": 2.4932348576017784e-08, "loss": 0.0032, "step": 19810 }, { "epoch": 39.24752475247525, "grad_norm": 0.0886249840259552, "learning_rate": 2.239016459314458e-08, "loss": 0.0028, "step": 19820 }, { "epoch": 39.26732673267327, "grad_norm": 0.07013731449842453, "learning_rate": 1.9984617487173174e-08, "loss": 0.0043, "step": 19830 }, { "epoch": 39.28712871287129, "grad_norm": 0.16708339750766754, "learning_rate": 1.7715713834776105e-08, "loss": 0.0032, "step": 19840 }, { "epoch": 39.306930693069305, "grad_norm": 0.0640217512845993, "learning_rate": 1.5583459839046964e-08, "loss": 0.0031, "step": 19850 }, { "epoch": 39.32673267326733, "grad_norm": 0.10677800327539444, "learning_rate": 1.3587861329489304e-08, "loss": 0.0038, "step": 19860 }, { "epoch": 39.34653465346535, "grad_norm": 0.07920550554990768, "learning_rate": 1.1728923761994415e-08, "loss": 0.0033, "step": 19870 }, { "epoch": 39.366336633663366, "grad_norm": 0.08473733812570572, "learning_rate": 1.0006652218819135e-08, "loss": 0.0031, "step": 19880 }, { "epoch": 39.386138613861384, "grad_norm": 0.0958934873342514, "learning_rate": 8.421051408596947e-09, "loss": 0.0034, "step": 19890 }, { "epoch": 39.40594059405941, "grad_norm": 0.09162214398384094, "learning_rate": 6.972125666299123e-09, "loss": 0.0035, "step": 19900 }, { "epoch": 39.42574257425743, "grad_norm": 0.07072219252586365, "learning_rate": 5.659878953229169e-09, "loss": 0.0032, "step": 19910 }, { "epoch": 39.445544554455445, "grad_norm": 0.07218711078166962, "learning_rate": 4.48431485701728e-09, "loss": 0.0032, "step": 19920 }, { "epoch": 39.46534653465346, "grad_norm": 0.06791788339614868, "learning_rate": 3.4454365916203322e-09, "loss": 0.0033, "step": 19930 }, { "epoch": 39.48514851485149, "grad_norm": 0.0781869888305664, "learning_rate": 2.5432469972830332e-09, "loss": 0.0034, "step": 19940 }, { "epoch": 39.504950495049506, "grad_norm": 0.06842334568500519, "learning_rate": 1.7777485405601203e-09, "loss": 0.0038, "step": 19950 }, { "epoch": 39.524752475247524, "grad_norm": 0.09868868440389633, "learning_rate": 1.1489433142941597e-09, "loss": 0.0034, "step": 19960 }, { "epoch": 39.54455445544554, "grad_norm": 0.06989936530590057, "learning_rate": 6.568330376210963e-10, "loss": 0.0043, "step": 19970 }, { "epoch": 39.56435643564357, "grad_norm": 0.10134881734848022, "learning_rate": 3.0141905594249787e-10, "loss": 0.0033, "step": 19980 }, { "epoch": 39.584158415841586, "grad_norm": 0.07529815286397934, "learning_rate": 8.270234094776008e-11, "loss": 0.0045, "step": 19990 }, { "epoch": 39.603960396039604, "grad_norm": 0.05843590199947357, "learning_rate": 6.834906085551041e-13, "loss": 0.0038, "step": 20000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }