diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,4317 +10,4317 @@ "log_history": [ { "epoch": 0.016260162601626018, - "grad_norm": 16.525604248046875, + "grad_norm": 3704.4365234375, "learning_rate": 6.493506493506493e-06, - "loss": 1.7328, + "loss": 11280.4625, "step": 10 }, { "epoch": 0.032520325203252036, - "grad_norm": 7.380761623382568, + "grad_norm": 3407.870361328125, "learning_rate": 1.2987012987012986e-05, - "loss": 1.2612, + "loss": 11149.0953, "step": 20 }, { "epoch": 0.04878048780487805, - "grad_norm": 3.019463062286377, + "grad_norm": 3003.387939453125, "learning_rate": 1.9480519480519483e-05, - "loss": 0.5146, + "loss": 11173.7734, "step": 30 }, { "epoch": 0.06504065040650407, - "grad_norm": 2.5373520851135254, + "grad_norm": 2449.536376953125, "learning_rate": 2.5974025974025972e-05, - "loss": 0.3545, + "loss": 11021.1336, "step": 40 }, { "epoch": 0.08130081300813008, - "grad_norm": 2.312335252761841, + "grad_norm": 2389.97509765625, "learning_rate": 3.246753246753247e-05, - "loss": 0.244, + "loss": 11002.3586, "step": 50 }, { "epoch": 0.0975609756097561, - "grad_norm": 2.7523088455200195, + "grad_norm": 2766.29443359375, "learning_rate": 3.8961038961038966e-05, - "loss": 0.2233, + "loss": 10353.3125, "step": 60 }, { "epoch": 0.11382113821138211, - "grad_norm": 1.5593374967575073, + "grad_norm": 4483.15087890625, "learning_rate": 4.545454545454546e-05, - "loss": 0.1671, + "loss": 9267.8453, "step": 70 }, { "epoch": 0.13008130081300814, - "grad_norm": 1.4652494192123413, + "grad_norm": 10965.46484375, "learning_rate": 5.1948051948051944e-05, - "loss": 0.1475, + "loss": 7955.3172, "step": 80 }, { "epoch": 0.14634146341463414, - "grad_norm": 1.656570315361023, + "grad_norm": 12316.47265625, "learning_rate": 5.844155844155844e-05, - "loss": 0.1403, + "loss": 3923.734, "step": 90 }, { "epoch": 0.16260162601626016, - "grad_norm": 1.5120561122894287, + "grad_norm": 154421.5, "learning_rate": 6.493506493506494e-05, - "loss": 0.1405, + "loss": 2865.5645, "step": 100 }, { "epoch": 0.17886178861788618, - "grad_norm": 0.974445104598999, + "grad_norm": 6532.34765625, "learning_rate": 7.142857142857143e-05, - "loss": 0.1223, + "loss": 2196.2996, "step": 110 }, { "epoch": 0.1951219512195122, - "grad_norm": 0.841881275177002, + "grad_norm": 4513.40087890625, "learning_rate": 7.792207792207793e-05, - "loss": 0.1321, + "loss": 1111.7941, "step": 120 }, { "epoch": 0.21138211382113822, - "grad_norm": 0.8575102090835571, + "grad_norm": 4285.0390625, "learning_rate": 8.441558441558442e-05, - "loss": 0.1012, + "loss": 945.8594, "step": 130 }, { "epoch": 0.22764227642276422, - "grad_norm": 0.9641493558883667, + "grad_norm": 2602.03369140625, "learning_rate": 9.090909090909092e-05, - "loss": 0.1163, + "loss": 593.7912, "step": 140 }, { "epoch": 0.24390243902439024, - "grad_norm": 1.6963988542556763, + "grad_norm": 3598.093017578125, "learning_rate": 9.74025974025974e-05, - "loss": 0.111, + "loss": 355.9361, "step": 150 }, { "epoch": 0.2601626016260163, - "grad_norm": 0.7563762664794922, + "grad_norm": 40596.5078125, "learning_rate": 0.00010389610389610389, - "loss": 0.1094, + "loss": 263.4985, "step": 160 }, { "epoch": 0.2764227642276423, - "grad_norm": 0.9374352693557739, + "grad_norm": 3959.72265625, "learning_rate": 0.0001103896103896104, - "loss": 0.1051, + "loss": 200.0281, "step": 170 }, { "epoch": 0.2926829268292683, - "grad_norm": 0.900098979473114, + "grad_norm": 277750.1875, "learning_rate": 0.00011688311688311689, - "loss": 0.088, + "loss": 176.2805, "step": 180 }, { "epoch": 0.3089430894308943, - "grad_norm": 0.7044101357460022, + "grad_norm": 114203.1015625, "learning_rate": 0.0001233766233766234, - "loss": 0.1136, + "loss": 192.7327, "step": 190 }, { "epoch": 0.3252032520325203, - "grad_norm": 1.2954487800598145, + "grad_norm": 2108.614013671875, "learning_rate": 0.00012987012987012987, - "loss": 0.11, + "loss": 154.6821, "step": 200 }, { "epoch": 0.34146341463414637, - "grad_norm": 0.6169984936714172, + "grad_norm": 6306.46484375, "learning_rate": 0.00013636363636363637, - "loss": 0.1039, + "loss": 94.0863, "step": 210 }, { "epoch": 0.35772357723577236, - "grad_norm": 1.195844292640686, + "grad_norm": 2991.08935546875, "learning_rate": 0.00014285714285714287, - "loss": 0.12, + "loss": 120.9749, "step": 220 }, { "epoch": 0.37398373983739835, - "grad_norm": 0.8411315679550171, + "grad_norm": 17456.123046875, "learning_rate": 0.00014935064935064934, - "loss": 0.1012, + "loss": 150.8287, "step": 230 }, { "epoch": 0.3902439024390244, - "grad_norm": 0.9592096209526062, + "grad_norm": 3997.399658203125, "learning_rate": 0.00015584415584415587, - "loss": 0.0973, + "loss": 127.9284, "step": 240 }, { "epoch": 0.4065040650406504, - "grad_norm": 1.0399092435836792, + "grad_norm": 3142.8544921875, "learning_rate": 0.00016233766233766234, - "loss": 0.0906, + "loss": 99.4487, "step": 250 }, { "epoch": 0.42276422764227645, - "grad_norm": 0.9300332069396973, + "grad_norm": 4303.7421875, "learning_rate": 0.00016883116883116884, - "loss": 0.0946, + "loss": 111.1226, "step": 260 }, { "epoch": 0.43902439024390244, - "grad_norm": 0.7730377912521362, + "grad_norm": 9494.5283203125, "learning_rate": 0.00017532467532467534, - "loss": 0.0901, + "loss": 148.5725, "step": 270 }, { "epoch": 0.45528455284552843, - "grad_norm": 1.2400496006011963, + "grad_norm": 12805.1005859375, "learning_rate": 0.00018181818181818183, - "loss": 0.1034, + "loss": 89.0703, "step": 280 }, { "epoch": 0.4715447154471545, - "grad_norm": 0.8693737387657166, + "grad_norm": 5651.734375, "learning_rate": 0.00018831168831168833, - "loss": 0.081, + "loss": 113.0061, "step": 290 }, { "epoch": 0.4878048780487805, - "grad_norm": 0.8496430516242981, + "grad_norm": 3500.915283203125, "learning_rate": 0.0001948051948051948, - "loss": 0.0916, + "loss": 98.2204, "step": 300 }, { "epoch": 0.5040650406504065, - "grad_norm": 0.5803017020225525, + "grad_norm": 76347.09375, "learning_rate": 0.00019999994218268405, - "loss": 0.0863, + "loss": 99.3199, "step": 310 }, { "epoch": 0.5203252032520326, - "grad_norm": 0.8963378667831421, + "grad_norm": 5863.58642578125, "learning_rate": 0.00019999791858364572, - "loss": 0.1005, + "loss": 145.732, "step": 320 }, { "epoch": 0.5365853658536586, - "grad_norm": 0.9483084678649902, + "grad_norm": 6211.58203125, "learning_rate": 0.00019999300418566636, - "loss": 0.103, + "loss": 99.9764, "step": 330 }, { "epoch": 0.5528455284552846, - "grad_norm": 0.640961766242981, + "grad_norm": 1611.767333984375, "learning_rate": 0.00019998519913081423, - "loss": 0.0788, + "loss": 130.0497, "step": 340 }, { "epoch": 0.5691056910569106, - "grad_norm": 0.7454540729522705, + "grad_norm": 6738.830078125, "learning_rate": 0.0001999745036447225, - "loss": 0.0928, + "loss": 132.5203, "step": 350 }, { "epoch": 0.5853658536585366, - "grad_norm": 1.048614740371704, + "grad_norm": 1651.7647705078125, "learning_rate": 0.00019996091803658263, - "loss": 0.0855, + "loss": 93.679, "step": 360 }, { "epoch": 0.6016260162601627, - "grad_norm": 0.6536673903465271, + "grad_norm": 1451.311279296875, "learning_rate": 0.00019994444269913535, - "loss": 0.0758, + "loss": 130.961, "step": 370 }, { "epoch": 0.6178861788617886, - "grad_norm": 1.11087167263031, + "grad_norm": 3547.41015625, "learning_rate": 0.00019992507810865954, - "loss": 0.0904, + "loss": 89.0317, "step": 380 }, { "epoch": 0.6341463414634146, - "grad_norm": 0.8868140578269958, + "grad_norm": 3523.5322265625, "learning_rate": 0.00019990282482495816, - "loss": 0.0991, + "loss": 92.9305, "step": 390 }, { "epoch": 0.6504065040650406, - "grad_norm": 1.0413252115249634, + "grad_norm": 5402.7509765625, "learning_rate": 0.00019987768349134227, - "loss": 0.0829, + "loss": 124.9789, "step": 400 }, { "epoch": 0.6666666666666666, - "grad_norm": 0.6953838467597961, + "grad_norm": 2866.8330078125, "learning_rate": 0.0001998496548346125, - "loss": 0.08, + "loss": 85.8321, "step": 410 }, { "epoch": 0.6829268292682927, - "grad_norm": 0.7871291637420654, + "grad_norm": 2670.57275390625, "learning_rate": 0.00019981873966503773, - "loss": 0.0861, + "loss": 143.1263, "step": 420 }, { "epoch": 0.6991869918699187, - "grad_norm": 0.8782192468643188, + "grad_norm": 3444.505126953125, "learning_rate": 0.000199784938876332, - "loss": 0.0786, + "loss": 117.812, "step": 430 }, { "epoch": 0.7154471544715447, - "grad_norm": 0.8923195600509644, + "grad_norm": 1545.001708984375, "learning_rate": 0.0001997482534456285, - "loss": 0.0861, + "loss": 100.9372, "step": 440 }, { "epoch": 0.7317073170731707, - "grad_norm": 0.7249306440353394, + "grad_norm": 839.1185913085938, "learning_rate": 0.00019970868443345134, - "loss": 0.0701, + "loss": 92.1672, "step": 450 }, { "epoch": 0.7479674796747967, - "grad_norm": 0.8845471143722534, + "grad_norm": 17015.447265625, "learning_rate": 0.0001996662329836849, - "loss": 0.0801, + "loss": 96.7714, "step": 460 }, { "epoch": 0.7642276422764228, - "grad_norm": 0.7328776121139526, + "grad_norm": 3150.693359375, "learning_rate": 0.0001996209003235408, - "loss": 0.0759, + "loss": 90.8617, "step": 470 }, { "epoch": 0.7804878048780488, - "grad_norm": 0.7246416211128235, + "grad_norm": 519.7723999023438, "learning_rate": 0.00019957268776352234, - "loss": 0.0663, + "loss": 113.3078, "step": 480 }, { "epoch": 0.7967479674796748, - "grad_norm": 0.5354925394058228, + "grad_norm": 4185.5126953125, "learning_rate": 0.00019952159669738674, - "loss": 0.0673, + "loss": 105.7553, "step": 490 }, { "epoch": 0.8130081300813008, - "grad_norm": 0.6777483820915222, + "grad_norm": 2410.567138671875, "learning_rate": 0.00019946762860210471, - "loss": 0.0667, + "loss": 78.1075, "step": 500 }, { "epoch": 0.8292682926829268, - "grad_norm": 0.9891756772994995, + "grad_norm": 2119.968505859375, "learning_rate": 0.00019941078503781792, - "loss": 0.0726, + "loss": 83.322, "step": 510 }, { "epoch": 0.8455284552845529, - "grad_norm": 0.5575820207595825, + "grad_norm": 10644.458984375, "learning_rate": 0.00019935106764779365, - "loss": 0.0612, + "loss": 79.2555, "step": 520 }, { "epoch": 0.8617886178861789, - "grad_norm": 0.722955584526062, + "grad_norm": 3944.619873046875, "learning_rate": 0.00019928847815837758, - "loss": 0.0802, + "loss": 103.8101, "step": 530 }, { "epoch": 0.8780487804878049, - "grad_norm": 0.7638558149337769, + "grad_norm": 1694.9683837890625, "learning_rate": 0.00019922301837894358, - "loss": 0.0679, + "loss": 96.7458, "step": 540 }, { "epoch": 0.8943089430894309, - "grad_norm": 0.4031972289085388, + "grad_norm": 3317.81640625, "learning_rate": 0.0001991546902018417, - "loss": 0.0586, + "loss": 160.2423, "step": 550 }, { "epoch": 0.9105691056910569, - "grad_norm": 0.44369038939476013, + "grad_norm": 7013.18359375, "learning_rate": 0.0001990834956023433, - "loss": 0.0614, + "loss": 122.6204, "step": 560 }, { "epoch": 0.926829268292683, - "grad_norm": 0.9656615257263184, + "grad_norm": 3094.7744140625, "learning_rate": 0.00019900943663858387, - "loss": 0.083, + "loss": 96.8247, "step": 570 }, { "epoch": 0.943089430894309, - "grad_norm": 0.693723738193512, + "grad_norm": 6648.25048828125, "learning_rate": 0.0001989325154515038, - "loss": 0.0764, + "loss": 116.6589, "step": 580 }, { "epoch": 0.959349593495935, - "grad_norm": 0.7485199570655823, + "grad_norm": 15371.361328125, "learning_rate": 0.0001988527342647862, - "loss": 0.06, + "loss": 88.9712, "step": 590 }, { "epoch": 0.975609756097561, - "grad_norm": 0.9115013480186462, + "grad_norm": 2130.667724609375, "learning_rate": 0.00019877009538479275, - "loss": 0.0668, + "loss": 75.6254, "step": 600 }, { "epoch": 0.991869918699187, - "grad_norm": 0.6610664129257202, + "grad_norm": 3430.82763671875, "learning_rate": 0.00019868460120049704, - "loss": 0.0756, + "loss": 118.3028, "step": 610 }, { "epoch": 1.008130081300813, - "grad_norm": 0.7623885273933411, + "grad_norm": 1396.5372314453125, "learning_rate": 0.00019859625418341557, - "loss": 0.0703, + "loss": 78.8569, "step": 620 }, { "epoch": 1.024390243902439, - "grad_norm": 0.6736379265785217, + "grad_norm": 7597.01904296875, "learning_rate": 0.00019850505688753602, - "loss": 0.0664, + "loss": 100.3299, "step": 630 }, { "epoch": 1.040650406504065, - "grad_norm": 0.40982815623283386, + "grad_norm": 2552.638916015625, "learning_rate": 0.0001984110119492438, - "loss": 0.0624, + "loss": 73.0117, "step": 640 }, { "epoch": 1.056910569105691, - "grad_norm": 0.5779235363006592, + "grad_norm": 1387.00439453125, "learning_rate": 0.00019831412208724556, - "loss": 0.0702, + "loss": 107.2604, "step": 650 }, { "epoch": 1.0731707317073171, - "grad_norm": 0.37329673767089844, + "grad_norm": 1579.257080078125, "learning_rate": 0.0001982143901024907, - "loss": 0.062, + "loss": 64.988, "step": 660 }, { "epoch": 1.089430894308943, - "grad_norm": 0.6696743965148926, + "grad_norm": 1369.64501953125, "learning_rate": 0.0001981118188780904, - "loss": 0.0484, + "loss": 110.6651, "step": 670 }, { "epoch": 1.1056910569105691, - "grad_norm": 0.4657931625843048, + "grad_norm": 3883.478271484375, "learning_rate": 0.00019800641137923423, - "loss": 0.0691, + "loss": 110.6604, "step": 680 }, { "epoch": 1.1219512195121952, - "grad_norm": 0.6538046598434448, + "grad_norm": 2725.116943359375, "learning_rate": 0.00019789817065310448, - "loss": 0.0504, + "loss": 97.7683, "step": 690 }, { "epoch": 1.1382113821138211, - "grad_norm": 0.5810462832450867, + "grad_norm": 2270.0986328125, "learning_rate": 0.00019778709982878805, - "loss": 0.0545, + "loss": 133.6088, "step": 700 }, { "epoch": 1.1544715447154472, - "grad_norm": 0.7634444236755371, + "grad_norm": 3066.498046875, "learning_rate": 0.000197673202117186, - "loss": 0.0658, + "loss": 83.8171, "step": 710 }, { "epoch": 1.170731707317073, - "grad_norm": 1.0801016092300415, + "grad_norm": 5128.125, "learning_rate": 0.00019755648081092066, - "loss": 0.0593, + "loss": 169.6488, "step": 720 }, { "epoch": 1.1869918699186992, - "grad_norm": 0.5045628547668457, + "grad_norm": 1368.7137451171875, "learning_rate": 0.00019743693928424058, - "loss": 0.0666, + "loss": 78.2656, "step": 730 }, { "epoch": 1.203252032520325, - "grad_norm": 0.8225274682044983, + "grad_norm": 3027.226318359375, "learning_rate": 0.00019731458099292288, - "loss": 0.059, + "loss": 132.4441, "step": 740 }, { "epoch": 1.2195121951219512, - "grad_norm": 0.6356520056724548, + "grad_norm": 7759.80810546875, "learning_rate": 0.00019718940947417336, - "loss": 0.0601, + "loss": 130.1133, "step": 750 }, { "epoch": 1.2357723577235773, - "grad_norm": 0.5118418335914612, + "grad_norm": 1686.7059326171875, "learning_rate": 0.00019706142834652427, - "loss": 0.0601, + "loss": 111.4778, "step": 760 }, { "epoch": 1.2520325203252032, - "grad_norm": 0.36911386251449585, + "grad_norm": 9301.548828125, "learning_rate": 0.00019693064130972974, - "loss": 0.0569, + "loss": 88.9655, "step": 770 }, { "epoch": 1.2682926829268293, - "grad_norm": 0.6248646378517151, + "grad_norm": 1258.872802734375, "learning_rate": 0.0001967970521446587, - "loss": 0.061, + "loss": 69.8348, "step": 780 }, { "epoch": 1.2845528455284554, - "grad_norm": 0.6961585879325867, + "grad_norm": 1352.4385986328125, "learning_rate": 0.00019666066471318568, - "loss": 0.0493, + "loss": 77.0263, "step": 790 }, { "epoch": 1.3008130081300813, - "grad_norm": 0.4376440644264221, + "grad_norm": 855.2029418945312, "learning_rate": 0.00019652148295807922, - "loss": 0.0517, + "loss": 85.511, "step": 800 }, { "epoch": 1.3170731707317074, - "grad_norm": 0.5526062250137329, + "grad_norm": 1946.88330078125, "learning_rate": 0.00019637951090288778, - "loss": 0.0553, + "loss": 59.645, "step": 810 }, { "epoch": 1.3333333333333333, - "grad_norm": 0.5190712213516235, + "grad_norm": 2297.47216796875, "learning_rate": 0.00019623475265182337, - "loss": 0.0655, + "loss": 67.3651, "step": 820 }, { "epoch": 1.3495934959349594, - "grad_norm": 0.6455714702606201, + "grad_norm": 11286.927734375, "learning_rate": 0.00019608721238964318, - "loss": 0.0557, + "loss": 128.2699, "step": 830 }, { "epoch": 1.3658536585365852, - "grad_norm": 0.6400585174560547, + "grad_norm": 2499.033447265625, "learning_rate": 0.00019593689438152827, - "loss": 0.0563, + "loss": 69.4611, "step": 840 }, { "epoch": 1.3821138211382114, - "grad_norm": 0.5363293886184692, + "grad_norm": 10106.341796875, "learning_rate": 0.0001957838029729605, - "loss": 0.0618, + "loss": 93.8524, "step": 850 }, { "epoch": 1.3983739837398375, - "grad_norm": 0.42193731665611267, + "grad_norm": 2966.48779296875, "learning_rate": 0.00019562794258959674, - "loss": 0.0615, + "loss": 108.8285, "step": 860 }, { "epoch": 1.4146341463414633, - "grad_norm": 0.30361121892929077, + "grad_norm": 7656.32275390625, "learning_rate": 0.00019546931773714116, - "loss": 0.0566, + "loss": 70.237, "step": 870 }, { "epoch": 1.4308943089430894, - "grad_norm": 0.9000810980796814, + "grad_norm": 4307.1708984375, "learning_rate": 0.00019530793300121473, - "loss": 0.0608, + "loss": 125.8694, "step": 880 }, { "epoch": 1.4471544715447155, - "grad_norm": 0.40272057056427, + "grad_norm": 2789.88916015625, "learning_rate": 0.0001951437930472228, - "loss": 0.0529, + "loss": 108.8423, "step": 890 }, { "epoch": 1.4634146341463414, - "grad_norm": 0.414357990026474, + "grad_norm": 5194.333984375, "learning_rate": 0.00019497690262022018, - "loss": 0.0532, + "loss": 162.3557, "step": 900 }, { "epoch": 1.4796747967479675, - "grad_norm": 0.5152111053466797, + "grad_norm": 2407.015380859375, "learning_rate": 0.00019480726654477398, - "loss": 0.0572, + "loss": 98.5685, "step": 910 }, { "epoch": 1.4959349593495934, - "grad_norm": 0.5087205171585083, + "grad_norm": 7854.9638671875, "learning_rate": 0.00019463488972482418, - "loss": 0.0626, + "loss": 60.0693, "step": 920 }, { "epoch": 1.5121951219512195, - "grad_norm": 0.6234679818153381, + "grad_norm": 1800.740478515625, "learning_rate": 0.00019445977714354173, - "loss": 0.0701, + "loss": 60.3849, "step": 930 }, { "epoch": 1.5284552845528454, - "grad_norm": 0.7005413770675659, + "grad_norm": 2736.665283203125, "learning_rate": 0.00019428193386318468, - "loss": 0.0441, + "loss": 66.8596, "step": 940 }, { "epoch": 1.5447154471544715, - "grad_norm": 0.7384408712387085, + "grad_norm": 15203.984375, "learning_rate": 0.0001941013650249517, - "loss": 0.0485, + "loss": 95.6272, "step": 950 }, { "epoch": 1.5609756097560976, - "grad_norm": 0.5566347241401672, + "grad_norm": 3157.21337890625, "learning_rate": 0.0001939180758488335, - "loss": 0.0641, + "loss": 71.3239, "step": 960 }, { "epoch": 1.5772357723577235, - "grad_norm": 0.6487844586372375, + "grad_norm": 4594.89501953125, "learning_rate": 0.00019373207163346192, - "loss": 0.0586, + "loss": 82.758, "step": 970 }, { "epoch": 1.5934959349593496, - "grad_norm": 0.4212978780269623, + "grad_norm": 2293.903564453125, "learning_rate": 0.0001935433577559568, - "loss": 0.0501, + "loss": 67.9693, "step": 980 }, { "epoch": 1.6097560975609757, - "grad_norm": 0.5692740678787231, + "grad_norm": 2138.247802734375, "learning_rate": 0.0001933519396717704, - "loss": 0.0525, + "loss": 75.4409, "step": 990 }, { "epoch": 1.6260162601626016, - "grad_norm": 0.5937200784683228, + "grad_norm": 781.8675537109375, "learning_rate": 0.0001931578229145299, - "loss": 0.0536, + "loss": 77.4897, "step": 1000 }, { "epoch": 1.6422764227642277, - "grad_norm": 0.45667609572410583, + "grad_norm": 2182.60107421875, "learning_rate": 0.00019296101309587726, - "loss": 0.0697, + "loss": 54.7864, "step": 1010 }, { "epoch": 1.6585365853658538, - "grad_norm": 0.5607714653015137, + "grad_norm": 26183.85546875, "learning_rate": 0.00019276151590530703, - "loss": 0.0464, + "loss": 89.1371, "step": 1020 }, { "epoch": 1.6747967479674797, - "grad_norm": 0.5573679804801941, + "grad_norm": 1233.78857421875, "learning_rate": 0.000192559337110002, - "loss": 0.049, + "loss": 51.9562, "step": 1030 }, { "epoch": 1.6910569105691056, - "grad_norm": 0.5698865056037903, + "grad_norm": 4076.354248046875, "learning_rate": 0.00019235448255466617, - "loss": 0.0457, + "loss": 77.1311, "step": 1040 }, { "epoch": 1.7073170731707317, - "grad_norm": 0.38402828574180603, + "grad_norm": 1355.48095703125, "learning_rate": 0.0001921469581613562, - "loss": 0.0493, + "loss": 70.7184, "step": 1050 }, { "epoch": 1.7235772357723578, - "grad_norm": 0.5972709655761719, + "grad_norm": 4424.4345703125, "learning_rate": 0.00019193676992930992, - "loss": 0.0599, + "loss": 82.3314, "step": 1060 }, { "epoch": 1.7398373983739837, - "grad_norm": 0.3473765552043915, + "grad_norm": 38555.8359375, "learning_rate": 0.00019172392393477296, - "loss": 0.0511, + "loss": 78.6395, "step": 1070 }, { "epoch": 1.7560975609756098, - "grad_norm": 0.44477471709251404, + "grad_norm": 8763.5234375, "learning_rate": 0.0001915084263308232, - "loss": 0.0479, + "loss": 110.0452, "step": 1080 }, { "epoch": 1.7723577235772359, - "grad_norm": 0.4405505657196045, + "grad_norm": 3243.281005859375, "learning_rate": 0.0001912902833471927, - "loss": 0.0455, + "loss": 121.6475, "step": 1090 }, { "epoch": 1.7886178861788617, - "grad_norm": 0.21757575869560242, + "grad_norm": 9277.51953125, "learning_rate": 0.0001910695012900878, - "loss": 0.0435, + "loss": 113.4883, "step": 1100 }, { "epoch": 1.8048780487804879, - "grad_norm": 0.4405602514743805, + "grad_norm": 1323.01904296875, "learning_rate": 0.0001908460865420067, - "loss": 0.0442, + "loss": 82.9752, "step": 1110 }, { "epoch": 1.821138211382114, - "grad_norm": 0.3303125202655792, + "grad_norm": 1779.7681884765625, "learning_rate": 0.00019062004556155506, - "loss": 0.038, + "loss": 89.6342, "step": 1120 }, { "epoch": 1.8373983739837398, - "grad_norm": 0.4111814796924591, + "grad_norm": 4294.21044921875, "learning_rate": 0.00019039138488325912, - "loss": 0.0459, + "loss": 95.4384, "step": 1130 }, { "epoch": 1.8536585365853657, - "grad_norm": 0.5819851160049438, + "grad_norm": 1751.0389404296875, "learning_rate": 0.0001901601111173769, - "loss": 0.0378, + "loss": 94.7895, "step": 1140 }, { "epoch": 1.8699186991869918, - "grad_norm": 0.37830594182014465, + "grad_norm": 1074.5364990234375, "learning_rate": 0.00018992623094970718, - "loss": 0.0571, + "loss": 52.8511, "step": 1150 }, { "epoch": 1.886178861788618, - "grad_norm": 0.35503852367401123, + "grad_norm": 1331.533935546875, "learning_rate": 0.0001896897511413961, - "loss": 0.0468, + "loss": 94.5019, "step": 1160 }, { "epoch": 1.9024390243902438, - "grad_norm": 0.4923391342163086, + "grad_norm": 3847.0712890625, "learning_rate": 0.0001894506785287417, - "loss": 0.0556, + "loss": 74.2541, "step": 1170 }, { "epoch": 1.91869918699187, - "grad_norm": 0.45425254106521606, + "grad_norm": 2032.809326171875, "learning_rate": 0.00018920902002299644, - "loss": 0.0507, + "loss": 139.9438, "step": 1180 }, { "epoch": 1.934959349593496, - "grad_norm": 0.28873878717422485, + "grad_norm": 2137.700439453125, "learning_rate": 0.00018896478261016725, - "loss": 0.0488, + "loss": 111.8997, "step": 1190 }, { "epoch": 1.951219512195122, - "grad_norm": 0.5140803456306458, + "grad_norm": 2548.987548828125, "learning_rate": 0.0001887179733508136, - "loss": 0.0625, + "loss": 76.8431, "step": 1200 }, { "epoch": 1.967479674796748, - "grad_norm": 0.4635219871997833, + "grad_norm": 1745.5999755859375, "learning_rate": 0.00018846859937984346, - "loss": 0.0607, + "loss": 67.4039, "step": 1210 }, { "epoch": 1.9837398373983741, - "grad_norm": 0.4263623058795929, + "grad_norm": 2157.826904296875, "learning_rate": 0.000188216667906307, - "loss": 0.0529, + "loss": 103.4683, "step": 1220 }, { "epoch": 2.0, - "grad_norm": 0.3493933081626892, + "grad_norm": 1239.8826904296875, "learning_rate": 0.00018796218621318822, - "loss": 0.0436, + "loss": 98.9879, "step": 1230 }, { "epoch": 2.016260162601626, - "grad_norm": 0.47562670707702637, + "grad_norm": 1551.313720703125, "learning_rate": 0.00018770516165719423, - "loss": 0.0522, + "loss": 58.3172, "step": 1240 }, { "epoch": 2.032520325203252, - "grad_norm": 0.5135197043418884, + "grad_norm": 6539.54736328125, "learning_rate": 0.00018744560166854296, - "loss": 0.0438, + "loss": 72.3266, "step": 1250 }, { "epoch": 2.048780487804878, - "grad_norm": 0.36958932876586914, + "grad_norm": 920.8938598632812, "learning_rate": 0.00018718351375074786, - "loss": 0.0354, + "loss": 71.1883, "step": 1260 }, { "epoch": 2.065040650406504, - "grad_norm": 0.2944124639034271, + "grad_norm": 2663.365234375, "learning_rate": 0.00018691890548040146, - "loss": 0.0395, + "loss": 100.6873, "step": 1270 }, { "epoch": 2.08130081300813, - "grad_norm": 0.49656942486763, + "grad_norm": 5801.7314453125, "learning_rate": 0.00018665178450695606, - "loss": 0.055, + "loss": 51.0893, "step": 1280 }, { "epoch": 2.097560975609756, - "grad_norm": 0.41679883003234863, + "grad_norm": 1768.61083984375, "learning_rate": 0.00018638215855250263, - "loss": 0.0506, + "loss": 46.9602, "step": 1290 }, { "epoch": 2.113821138211382, - "grad_norm": 0.4237174689769745, + "grad_norm": 74955.7421875, "learning_rate": 0.00018611003541154766, - "loss": 0.0432, + "loss": 69.618, "step": 1300 }, { "epoch": 2.130081300813008, - "grad_norm": 0.3991101384162903, + "grad_norm": 16715.201171875, "learning_rate": 0.00018583542295078775, - "loss": 0.0464, + "loss": 76.9604, "step": 1310 }, { "epoch": 2.1463414634146343, - "grad_norm": 0.505883514881134, + "grad_norm": 490.9708557128906, "learning_rate": 0.0001855583291088822, - "loss": 0.0427, + "loss": 61.4616, "step": 1320 }, { "epoch": 2.16260162601626, - "grad_norm": 0.4554952383041382, + "grad_norm": 2168.93896484375, "learning_rate": 0.00018527876189622372, - "loss": 0.0538, + "loss": 69.4417, "step": 1330 }, { "epoch": 2.178861788617886, - "grad_norm": 0.4448312819004059, + "grad_norm": 1728.7271728515625, "learning_rate": 0.00018499672939470646, - "loss": 0.0608, + "loss": 41.3895, "step": 1340 }, { "epoch": 2.1951219512195124, - "grad_norm": 0.5928354263305664, + "grad_norm": 13797.6259765625, "learning_rate": 0.00018471223975749266, - "loss": 0.0445, + "loss": 86.5364, "step": 1350 }, { "epoch": 2.2113821138211383, - "grad_norm": 0.5058962106704712, + "grad_norm": 1238.1878662109375, "learning_rate": 0.000184425301208777, - "loss": 0.057, + "loss": 60.4841, "step": 1360 }, { "epoch": 2.227642276422764, - "grad_norm": 0.4662698209285736, + "grad_norm": 1721.18505859375, "learning_rate": 0.00018413592204354857, - "loss": 0.0449, + "loss": 63.7924, "step": 1370 }, { "epoch": 2.2439024390243905, - "grad_norm": 0.47586390376091003, + "grad_norm": 1503.65234375, "learning_rate": 0.00018384411062735142, - "loss": 0.042, + "loss": 72.9356, "step": 1380 }, { "epoch": 2.2601626016260163, - "grad_norm": 0.37652483582496643, + "grad_norm": 2268.296630859375, "learning_rate": 0.00018354987539604244, - "loss": 0.0476, + "loss": 64.837, "step": 1390 }, { "epoch": 2.2764227642276422, - "grad_norm": 0.5162194967269897, + "grad_norm": 770.135986328125, "learning_rate": 0.0001832532248555476, - "loss": 0.0469, + "loss": 46.5948, "step": 1400 }, { "epoch": 2.292682926829268, - "grad_norm": 0.7164818644523621, + "grad_norm": 2809.25146484375, "learning_rate": 0.00018295416758161607, - "loss": 0.0611, + "loss": 72.0357, "step": 1410 }, { "epoch": 2.3089430894308944, - "grad_norm": 0.3522421419620514, + "grad_norm": 5348.63330078125, "learning_rate": 0.00018265271221957235, - "loss": 0.0456, + "loss": 64.2022, "step": 1420 }, { "epoch": 2.3252032520325203, - "grad_norm": 0.5284357070922852, + "grad_norm": 1522.7744140625, "learning_rate": 0.00018234886748406623, - "loss": 0.0487, + "loss": 87.9972, "step": 1430 }, { "epoch": 2.341463414634146, - "grad_norm": 0.34699952602386475, + "grad_norm": 2168.956298828125, "learning_rate": 0.00018204264215882093, - "loss": 0.043, + "loss": 77.3112, "step": 1440 }, { "epoch": 2.3577235772357725, - "grad_norm": 0.4560791552066803, + "grad_norm": 1201.1219482421875, "learning_rate": 0.00018173404509637912, - "loss": 0.044, + "loss": 77.9051, "step": 1450 }, { "epoch": 2.3739837398373984, - "grad_norm": 0.34067708253860474, + "grad_norm": 1378.28515625, "learning_rate": 0.00018142308521784716, - "loss": 0.0427, + "loss": 113.3623, "step": 1460 }, { "epoch": 2.3902439024390243, - "grad_norm": 0.4294307827949524, + "grad_norm": 3121.484375, "learning_rate": 0.00018110977151263702, - "loss": 0.0306, + "loss": 68.1337, "step": 1470 }, { "epoch": 2.40650406504065, - "grad_norm": 0.37671607732772827, + "grad_norm": 4526.3203125, "learning_rate": 0.00018079411303820647, - "loss": 0.0428, + "loss": 76.719, "step": 1480 }, { "epoch": 2.4227642276422765, - "grad_norm": 0.5851466655731201, + "grad_norm": 1512.4857177734375, "learning_rate": 0.00018047611891979732, - "loss": 0.05, + "loss": 53.3857, "step": 1490 }, { "epoch": 2.4390243902439024, - "grad_norm": 0.3729349970817566, + "grad_norm": 775.2145385742188, "learning_rate": 0.00018015579835017147, - "loss": 0.0457, + "loss": 59.4552, "step": 1500 }, { "epoch": 2.4552845528455283, - "grad_norm": 0.45499187707901, + "grad_norm": 1543.6497802734375, "learning_rate": 0.00017983316058934533, - "loss": 0.0349, + "loss": 79.715, "step": 1510 }, { "epoch": 2.4715447154471546, - "grad_norm": 0.3735388517379761, + "grad_norm": 3052.429931640625, "learning_rate": 0.00017950821496432202, - "loss": 0.044, + "loss": 68.2702, "step": 1520 }, { "epoch": 2.4878048780487805, - "grad_norm": 0.29665642976760864, + "grad_norm": 1861.0294189453125, "learning_rate": 0.00017918097086882167, - "loss": 0.0383, + "loss": 70.8437, "step": 1530 }, { "epoch": 2.5040650406504064, - "grad_norm": 0.39778468012809753, + "grad_norm": 1316.6455078125, "learning_rate": 0.00017885143776301017, - "loss": 0.0365, + "loss": 48.8773, "step": 1540 }, { "epoch": 2.5203252032520327, - "grad_norm": 0.625698447227478, + "grad_norm": 1434.713623046875, "learning_rate": 0.0001785196251732252, - "loss": 0.0499, + "loss": 50.5964, "step": 1550 }, { "epoch": 2.5365853658536586, - "grad_norm": 0.22902359068393707, + "grad_norm": 2314.07373046875, "learning_rate": 0.0001781855426917013, - "loss": 0.0333, + "loss": 49.6357, "step": 1560 }, { "epoch": 2.5528455284552845, - "grad_norm": 0.34528470039367676, + "grad_norm": 27705.951171875, "learning_rate": 0.00017784919997629236, - "loss": 0.0364, + "loss": 60.1384, "step": 1570 }, { "epoch": 2.569105691056911, - "grad_norm": 0.38487961888313293, + "grad_norm": 100750.6953125, "learning_rate": 0.00017751060675019235, - "loss": 0.042, + "loss": 78.1081, "step": 1580 }, { "epoch": 2.5853658536585367, - "grad_norm": 0.3221438229084015, + "grad_norm": 5099.37548828125, "learning_rate": 0.00017716977280165445, - "loss": 0.0427, + "loss": 107.401, "step": 1590 }, { "epoch": 2.6016260162601625, - "grad_norm": 0.5269684195518494, + "grad_norm": 16017.0224609375, "learning_rate": 0.00017682670798370792, - "loss": 0.0391, + "loss": 109.425, "step": 1600 }, { "epoch": 2.617886178861789, - "grad_norm": 0.38913485407829285, + "grad_norm": 1565.2376708984375, "learning_rate": 0.00017648142221387325, - "loss": 0.0433, + "loss": 66.7137, "step": 1610 }, { "epoch": 2.6341463414634148, - "grad_norm": 0.38666921854019165, + "grad_norm": 1883.359619140625, "learning_rate": 0.00017613392547387565, - "loss": 0.0382, + "loss": 63.5428, "step": 1620 }, { "epoch": 2.6504065040650406, - "grad_norm": 0.5571984648704529, + "grad_norm": 4678.5400390625, "learning_rate": 0.00017578422780935624, - "loss": 0.0411, + "loss": 62.324, "step": 1630 }, { "epoch": 2.6666666666666665, - "grad_norm": 0.5609903931617737, + "grad_norm": 1467.29150390625, "learning_rate": 0.00017543233932958185, - "loss": 0.0379, + "loss": 42.7399, "step": 1640 }, { "epoch": 2.682926829268293, - "grad_norm": 0.4976758062839508, + "grad_norm": 17443.28125, "learning_rate": 0.00017507827020715267, - "loss": 0.037, + "loss": 76.8691, "step": 1650 }, { "epoch": 2.6991869918699187, - "grad_norm": 0.6068280339241028, + "grad_norm": 1430.255615234375, "learning_rate": 0.00017472203067770816, - "loss": 0.0543, + "loss": 45.8614, "step": 1660 }, { "epoch": 2.7154471544715446, - "grad_norm": 0.587230384349823, + "grad_norm": 973.7998657226562, "learning_rate": 0.0001743636310396312, - "loss": 0.0376, + "loss": 36.7464, "step": 1670 }, { "epoch": 2.7317073170731705, - "grad_norm": 0.4266080856323242, + "grad_norm": 2293.49658203125, "learning_rate": 0.00017400308165375043, - "loss": 0.0422, + "loss": 104.4038, "step": 1680 }, { "epoch": 2.747967479674797, - "grad_norm": 0.68971186876297, + "grad_norm": 1044.43115234375, "learning_rate": 0.00017364039294304063, - "loss": 0.0601, + "loss": 61.9649, "step": 1690 }, { "epoch": 2.7642276422764227, - "grad_norm": 0.508395254611969, + "grad_norm": 2085.281982421875, "learning_rate": 0.00017327557539232138, - "loss": 0.0423, + "loss": 51.97, "step": 1700 }, { "epoch": 2.7804878048780486, - "grad_norm": 0.48645734786987305, + "grad_norm": 1864.0758056640625, "learning_rate": 0.00017290863954795414, - "loss": 0.0408, + "loss": 56.1968, "step": 1710 }, { "epoch": 2.796747967479675, - "grad_norm": 0.4463174045085907, + "grad_norm": 5055.72216796875, "learning_rate": 0.00017253959601753715, - "loss": 0.0445, + "loss": 49.4941, "step": 1720 }, { "epoch": 2.813008130081301, - "grad_norm": 0.3336637616157532, + "grad_norm": 2442.3779296875, "learning_rate": 0.00017216845546959904, - "loss": 0.0336, + "loss": 85.7186, "step": 1730 }, { "epoch": 2.8292682926829267, - "grad_norm": 0.41125941276550293, + "grad_norm": 1286.6806640625, "learning_rate": 0.00017179522863329004, - "loss": 0.0463, + "loss": 57.1273, "step": 1740 }, { "epoch": 2.845528455284553, - "grad_norm": 0.35591065883636475, + "grad_norm": 1548.7122802734375, "learning_rate": 0.0001714199262980722, - "loss": 0.0352, + "loss": 50.7149, "step": 1750 }, { "epoch": 2.861788617886179, - "grad_norm": 0.352740615606308, + "grad_norm": 1237.375732421875, "learning_rate": 0.00017104255931340732, - "loss": 0.0394, + "loss": 80.6716, "step": 1760 }, { "epoch": 2.8780487804878048, - "grad_norm": 0.28329795598983765, + "grad_norm": 271203.3125, "learning_rate": 0.00017066313858844317, - "loss": 0.0402, + "loss": 79.4793, "step": 1770 }, { "epoch": 2.894308943089431, - "grad_norm": 0.35925358533859253, + "grad_norm": 2990.47998046875, "learning_rate": 0.00017028167509169846, - "loss": 0.0335, + "loss": 63.7313, "step": 1780 }, { "epoch": 2.910569105691057, - "grad_norm": 0.2933971583843231, + "grad_norm": 2197.031494140625, "learning_rate": 0.00016989817985074533, - "loss": 0.0409, + "loss": 66.6744, "step": 1790 }, { "epoch": 2.926829268292683, - "grad_norm": 0.4342270493507385, + "grad_norm": 2398.322509765625, "learning_rate": 0.00016951266395189097, - "loss": 0.0395, + "loss": 119.2331, "step": 1800 }, { "epoch": 2.943089430894309, - "grad_norm": 0.43399593234062195, + "grad_norm": 1132.4508056640625, "learning_rate": 0.00016912513853985686, - "loss": 0.0381, + "loss": 66.5857, "step": 1810 }, { "epoch": 2.959349593495935, - "grad_norm": 0.5376604795455933, + "grad_norm": 1172.097412109375, "learning_rate": 0.00016873561481745667, - "loss": 0.0385, + "loss": 69.8449, "step": 1820 }, { "epoch": 2.975609756097561, - "grad_norm": 0.5233227014541626, + "grad_norm": 1260.872314453125, "learning_rate": 0.0001683441040452724, - "loss": 0.0456, + "loss": 65.4089, "step": 1830 }, { "epoch": 2.991869918699187, - "grad_norm": 0.5964445471763611, + "grad_norm": 3771.443603515625, "learning_rate": 0.00016795061754132896, - "loss": 0.049, + "loss": 59.9783, "step": 1840 }, { "epoch": 3.008130081300813, - "grad_norm": 0.4924629330635071, + "grad_norm": 44377.31640625, "learning_rate": 0.00016755516668076674, - "loss": 0.0391, + "loss": 77.3272, "step": 1850 }, { "epoch": 3.024390243902439, - "grad_norm": 0.2987900674343109, + "grad_norm": 1505.83984375, "learning_rate": 0.00016715776289551296, - "loss": 0.0407, + "loss": 53.3784, "step": 1860 }, { "epoch": 3.040650406504065, - "grad_norm": 0.30709150433540344, + "grad_norm": 615.7579956054688, "learning_rate": 0.0001667584176739512, - "loss": 0.0319, + "loss": 50.9411, "step": 1870 }, { "epoch": 3.0569105691056913, - "grad_norm": 0.2775282859802246, + "grad_norm": 38362.62890625, "learning_rate": 0.00016635714256058915, - "loss": 0.037, + "loss": 118.019, "step": 1880 }, { "epoch": 3.073170731707317, - "grad_norm": 0.2877950072288513, + "grad_norm": 1028.602783203125, "learning_rate": 0.00016595394915572506, - "loss": 0.0403, + "loss": 69.6284, "step": 1890 }, { "epoch": 3.089430894308943, - "grad_norm": 0.32036092877388, + "grad_norm": 5944.29248046875, "learning_rate": 0.00016554884911511213, - "loss": 0.0431, + "loss": 64.6018, "step": 1900 }, { "epoch": 3.105691056910569, - "grad_norm": 0.5187320113182068, + "grad_norm": 2787.141845703125, "learning_rate": 0.00016514185414962182, - "loss": 0.0379, + "loss": 68.6644, "step": 1910 }, { "epoch": 3.1219512195121952, - "grad_norm": 0.3565807044506073, + "grad_norm": 2354.9130859375, "learning_rate": 0.0001647329760249052, - "loss": 0.032, + "loss": 81.7822, "step": 1920 }, { "epoch": 3.138211382113821, - "grad_norm": 0.509468674659729, + "grad_norm": 2922.60009765625, "learning_rate": 0.00016432222656105277, - "loss": 0.0363, + "loss": 113.863, "step": 1930 }, { "epoch": 3.154471544715447, - "grad_norm": 0.37406641244888306, + "grad_norm": 4188.85107421875, "learning_rate": 0.0001639096176322528, - "loss": 0.0372, + "loss": 79.855, "step": 1940 }, { "epoch": 3.1707317073170733, - "grad_norm": 0.345146119594574, + "grad_norm": 1911.2069091796875, "learning_rate": 0.0001634951611664482, - "loss": 0.0419, + "loss": 69.1627, "step": 1950 }, { "epoch": 3.186991869918699, - "grad_norm": 0.3773280084133148, + "grad_norm": 1192.2657470703125, "learning_rate": 0.0001630788691449914, - "loss": 0.0417, + "loss": 55.1678, "step": 1960 }, { "epoch": 3.203252032520325, - "grad_norm": 0.3879764974117279, + "grad_norm": 10476.7724609375, "learning_rate": 0.00016266075360229823, - "loss": 0.0389, + "loss": 88.3594, "step": 1970 }, { "epoch": 3.2195121951219514, - "grad_norm": 0.33022749423980713, + "grad_norm": 746.9041748046875, "learning_rate": 0.00016224082662550003, - "loss": 0.0383, + "loss": 109.0398, "step": 1980 }, { "epoch": 3.2357723577235773, - "grad_norm": 0.27165573835372925, + "grad_norm": 2032.73779296875, "learning_rate": 0.000161819100354094, - "loss": 0.0354, + "loss": 44.7227, "step": 1990 }, { "epoch": 3.252032520325203, - "grad_norm": 0.5045759081840515, + "grad_norm": 1000.6553955078125, "learning_rate": 0.0001613955869795925, - "loss": 0.0357, + "loss": 73.6318, "step": 2000 }, { "epoch": 3.2682926829268295, - "grad_norm": 0.3169061839580536, + "grad_norm": 877.0646362304688, "learning_rate": 0.00016097029874517053, - "loss": 0.0423, + "loss": 65.1961, "step": 2010 }, { "epoch": 3.2845528455284554, - "grad_norm": 0.295289009809494, + "grad_norm": 20667.6640625, "learning_rate": 0.0001605432479453117, - "loss": 0.0327, + "loss": 131.7637, "step": 2020 }, { "epoch": 3.3008130081300813, - "grad_norm": 0.3346075117588043, + "grad_norm": 6932.1630859375, "learning_rate": 0.0001601144469254531, - "loss": 0.0341, + "loss": 63.2276, "step": 2030 }, { "epoch": 3.317073170731707, - "grad_norm": 0.2813117206096649, + "grad_norm": 2701.05029296875, "learning_rate": 0.00015968390808162797, - "loss": 0.0341, + "loss": 93.1463, "step": 2040 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.2963223159313202, + "grad_norm": 2700.706298828125, "learning_rate": 0.0001592516438601077, - "loss": 0.0335, + "loss": 63.6073, "step": 2050 }, { "epoch": 3.3495934959349594, - "grad_norm": 0.3074122965335846, + "grad_norm": 9397.724609375, "learning_rate": 0.00015881766675704203, - "loss": 0.037, + "loss": 74.2051, "step": 2060 }, { "epoch": 3.3658536585365852, - "grad_norm": 0.25252121686935425, + "grad_norm": 919.5447998046875, "learning_rate": 0.00015838198931809747, - "loss": 0.037, + "loss": 55.599, "step": 2070 }, { "epoch": 3.3821138211382116, - "grad_norm": 0.33035480976104736, + "grad_norm": 4705.94287109375, "learning_rate": 0.00015794462413809503, - "loss": 0.0368, + "loss": 54.821, "step": 2080 }, { "epoch": 3.3983739837398375, - "grad_norm": 0.24505755305290222, + "grad_norm": 80140.5, "learning_rate": 0.00015750558386064584, - "loss": 0.0341, + "loss": 132.3792, "step": 2090 }, { "epoch": 3.4146341463414633, - "grad_norm": 0.2401554435491562, + "grad_norm": 17313.400390625, "learning_rate": 0.0001570648811777858, - "loss": 0.032, + "loss": 73.4562, "step": 2100 }, { "epoch": 3.430894308943089, - "grad_norm": 0.33875253796577454, + "grad_norm": 62464.19140625, "learning_rate": 0.00015662252882960855, - "loss": 0.0362, + "loss": 123.1144, "step": 2110 }, { "epoch": 3.4471544715447155, - "grad_norm": 0.5542377233505249, + "grad_norm": 10362.189453125, "learning_rate": 0.00015617853960389724, - "loss": 0.0432, + "loss": 60.7324, "step": 2120 }, { "epoch": 3.4634146341463414, - "grad_norm": 0.3801787793636322, + "grad_norm": 8119.03662109375, "learning_rate": 0.00015573292633575488, - "loss": 0.0406, + "loss": 47.9465, "step": 2130 }, { "epoch": 3.4796747967479673, - "grad_norm": 0.27756837010383606, + "grad_norm": 65353.2890625, "learning_rate": 0.00015528570190723325, - "loss": 0.0347, + "loss": 38.784, "step": 2140 }, { "epoch": 3.4959349593495936, - "grad_norm": 0.3375602066516876, + "grad_norm": 946.7526245117188, "learning_rate": 0.00015483687924696047, - "loss": 0.0307, + "loss": 45.439, "step": 2150 }, { "epoch": 3.5121951219512195, - "grad_norm": 0.3185490369796753, + "grad_norm": 8941.34375, "learning_rate": 0.0001543864713297673, - "loss": 0.0395, + "loss": 62.3894, "step": 2160 }, { "epoch": 3.5284552845528454, - "grad_norm": 0.2786973714828491, + "grad_norm": 169778.421875, "learning_rate": 0.00015393449117631205, - "loss": 0.0288, + "loss": 71.317, "step": 2170 }, { "epoch": 3.5447154471544717, - "grad_norm": 0.31171637773513794, + "grad_norm": 1309.4539794921875, "learning_rate": 0.0001534809518527042, - "loss": 0.0234, + "loss": 59.1676, "step": 2180 }, { "epoch": 3.5609756097560976, - "grad_norm": 0.32175740599632263, + "grad_norm": 159682.328125, "learning_rate": 0.0001530258664701266, - "loss": 0.0287, + "loss": 74.9109, "step": 2190 }, { "epoch": 3.5772357723577235, - "grad_norm": 0.19202867150306702, + "grad_norm": 5231.26611328125, "learning_rate": 0.00015256924818445652, - "loss": 0.0305, + "loss": 50.8158, "step": 2200 }, { "epoch": 3.59349593495935, - "grad_norm": 0.324100136756897, + "grad_norm": 840.7651977539062, "learning_rate": 0.0001521111101958852, - "loss": 0.0351, + "loss": 53.3685, "step": 2210 }, { "epoch": 3.6097560975609757, - "grad_norm": 0.28740307688713074, + "grad_norm": 1039.3839111328125, "learning_rate": 0.00015165146574853651, - "loss": 0.0318, + "loss": 51.3367, "step": 2220 }, { "epoch": 3.6260162601626016, - "grad_norm": 0.34469175338745117, + "grad_norm": 2042.122802734375, "learning_rate": 0.00015119032813008384, - "loss": 0.0382, + "loss": 63.4835, "step": 2230 }, { "epoch": 3.642276422764228, - "grad_norm": 0.23193226754665375, + "grad_norm": 1014.0968017578125, "learning_rate": 0.00015072771067136602, - "loss": 0.0324, + "loss": 121.3831, "step": 2240 }, { "epoch": 3.658536585365854, - "grad_norm": 0.4241219162940979, + "grad_norm": 2085.046875, "learning_rate": 0.00015026362674600197, - "loss": 0.0329, + "loss": 86.4089, "step": 2250 }, { "epoch": 3.6747967479674797, - "grad_norm": 0.4813792407512665, + "grad_norm": 1501.3868408203125, "learning_rate": 0.00014979808977000423, - "loss": 0.0458, + "loss": 87.4238, "step": 2260 }, { "epoch": 3.6910569105691056, - "grad_norm": 0.53413325548172, + "grad_norm": 3143.670166015625, "learning_rate": 0.0001493311132013908, - "loss": 0.0465, + "loss": 47.4117, "step": 2270 }, { "epoch": 3.7073170731707314, - "grad_norm": 0.35125425457954407, + "grad_norm": 3601.27197265625, "learning_rate": 0.00014886271053979642, - "loss": 0.0347, + "loss": 47.0386, "step": 2280 }, { "epoch": 3.7235772357723578, - "grad_norm": 0.30809614062309265, + "grad_norm": 1050.021484375, "learning_rate": 0.00014839289532608208, - "loss": 0.0413, + "loss": 50.3757, "step": 2290 }, { "epoch": 3.7398373983739837, - "grad_norm": 0.3970271050930023, + "grad_norm": 1158.14453125, "learning_rate": 0.0001479216811419437, - "loss": 0.0337, + "loss": 53.1059, "step": 2300 }, { "epoch": 3.7560975609756095, - "grad_norm": 0.4460907578468323, + "grad_norm": 1679.3118896484375, "learning_rate": 0.00014744908160951948, - "loss": 0.036, + "loss": 81.2242, "step": 2310 }, { "epoch": 3.772357723577236, - "grad_norm": 0.33100569248199463, + "grad_norm": 1483.0025634765625, "learning_rate": 0.00014697511039099602, - "loss": 0.0416, + "loss": 65.0123, "step": 2320 }, { "epoch": 3.7886178861788617, - "grad_norm": 0.46431633830070496, + "grad_norm": 1206.0103759765625, "learning_rate": 0.00014649978118821356, - "loss": 0.0376, + "loss": 112.2168, "step": 2330 }, { "epoch": 3.8048780487804876, - "grad_norm": 0.43561309576034546, + "grad_norm": 6336.48828125, "learning_rate": 0.00014602310774226957, - "loss": 0.0304, + "loss": 98.5093, "step": 2340 }, { "epoch": 3.821138211382114, - "grad_norm": 0.3034389317035675, + "grad_norm": 659.5859985351562, "learning_rate": 0.00014554510383312189, - "loss": 0.0303, + "loss": 65.6266, "step": 2350 }, { "epoch": 3.83739837398374, - "grad_norm": 0.2471175640821457, + "grad_norm": 1136.7991943359375, "learning_rate": 0.00014506578327919, - "loss": 0.0315, + "loss": 51.189, "step": 2360 }, { "epoch": 3.8536585365853657, - "grad_norm": 0.2999189794063568, + "grad_norm": 6465.4130859375, "learning_rate": 0.00014458515993695585, - "loss": 0.0361, + "loss": 69.188, "step": 2370 }, { "epoch": 3.869918699186992, - "grad_norm": 0.24256114661693573, + "grad_norm": 5106.58642578125, "learning_rate": 0.00014410324770056313, - "loss": 0.0303, + "loss": 96.6794, "step": 2380 }, { "epoch": 3.886178861788618, - "grad_norm": 0.24523112177848816, + "grad_norm": 3519.845703125, "learning_rate": 0.00014362006050141563, - "loss": 0.04, + "loss": 55.2195, "step": 2390 }, { "epoch": 3.902439024390244, - "grad_norm": 0.28290221095085144, + "grad_norm": 20824.455078125, "learning_rate": 0.00014313561230777452, - "loss": 0.0296, + "loss": 47.6591, "step": 2400 }, { "epoch": 3.91869918699187, - "grad_norm": 0.409028559923172, + "grad_norm": 2973.600830078125, "learning_rate": 0.00014264991712435452, - "loss": 0.0355, + "loss": 66.8287, "step": 2410 }, { "epoch": 3.934959349593496, - "grad_norm": 0.36809080839157104, + "grad_norm": 1502.51025390625, "learning_rate": 0.00014216298899191916, - "loss": 0.029, + "loss": 47.0916, "step": 2420 }, { "epoch": 3.951219512195122, - "grad_norm": 0.4081156849861145, + "grad_norm": 13010.16796875, "learning_rate": 0.0001416748419868747, - "loss": 0.0263, + "loss": 61.0954, "step": 2430 }, { "epoch": 3.9674796747967482, - "grad_norm": 0.4881688952445984, + "grad_norm": 953.6785278320312, "learning_rate": 0.0001411854902208633, - "loss": 0.0355, + "loss": 47.334, "step": 2440 }, { "epoch": 3.983739837398374, - "grad_norm": 0.5361936092376709, + "grad_norm": 2903.397216796875, "learning_rate": 0.00014069494784035505, - "loss": 0.0354, + "loss": 67.0245, "step": 2450 }, { "epoch": 4.0, - "grad_norm": 0.4904439449310303, + "grad_norm": 1550.0595703125, "learning_rate": 0.0001402032290262391, - "loss": 0.0348, + "loss": 51.0681, "step": 2460 }, { "epoch": 4.016260162601626, - "grad_norm": 0.3554152548313141, + "grad_norm": 58333.4921875, "learning_rate": 0.00013971034799341355, - "loss": 0.0288, + "loss": 62.1808, "step": 2470 }, { "epoch": 4.032520325203252, - "grad_norm": 0.3490947484970093, + "grad_norm": 1227.8946533203125, "learning_rate": 0.0001392163189903747, - "loss": 0.0392, + "loss": 72.5005, "step": 2480 }, { "epoch": 4.048780487804878, - "grad_norm": 0.3183273375034332, + "grad_norm": 2188.923828125, "learning_rate": 0.00013872115629880497, - "loss": 0.0398, + "loss": 47.0166, "step": 2490 }, { "epoch": 4.065040650406504, - "grad_norm": 0.20899447798728943, + "grad_norm": 1214.519775390625, "learning_rate": 0.0001382248742331602, - "loss": 0.034, + "loss": 40.6225, "step": 2500 }, { "epoch": 4.08130081300813, - "grad_norm": 0.4122081398963928, + "grad_norm": 952.546875, "learning_rate": 0.0001377274871402556, - "loss": 0.0338, + "loss": 43.3264, "step": 2510 }, { "epoch": 4.097560975609756, - "grad_norm": 0.30182841420173645, + "grad_norm": 753.4329833984375, "learning_rate": 0.00013722900939885132, - "loss": 0.0278, + "loss": 51.3909, "step": 2520 }, { "epoch": 4.1138211382113825, - "grad_norm": 0.18697448074817657, + "grad_norm": 1024.9317626953125, "learning_rate": 0.0001367294554192366, - "loss": 0.0307, + "loss": 42.0499, "step": 2530 }, { "epoch": 4.130081300813008, - "grad_norm": 0.33489131927490234, + "grad_norm": 546.87841796875, "learning_rate": 0.00013622883964281316, - "loss": 0.039, + "loss": 36.1083, "step": 2540 }, { "epoch": 4.146341463414634, - "grad_norm": 0.35084104537963867, + "grad_norm": 893.5374755859375, "learning_rate": 0.00013572717654167777, - "loss": 0.03, + "loss": 39.7196, "step": 2550 }, { "epoch": 4.16260162601626, - "grad_norm": 0.45599302649497986, + "grad_norm": 1298.6865234375, "learning_rate": 0.00013522448061820393, - "loss": 0.0367, + "loss": 43.8941, "step": 2560 }, { "epoch": 4.178861788617886, - "grad_norm": 0.2615354657173157, + "grad_norm": 1751.4395751953125, "learning_rate": 0.00013472076640462248, - "loss": 0.0369, + "loss": 48.5067, "step": 2570 }, { "epoch": 4.195121951219512, - "grad_norm": 0.5199306011199951, + "grad_norm": 4070.478759765625, "learning_rate": 0.00013421604846260173, - "loss": 0.0376, + "loss": 69.5999, "step": 2580 }, { "epoch": 4.211382113821138, - "grad_norm": 0.3855922222137451, + "grad_norm": 1715.4664306640625, "learning_rate": 0.0001337103413828263, - "loss": 0.0291, + "loss": 55.5755, "step": 2590 }, { "epoch": 4.227642276422764, - "grad_norm": 0.24133704602718353, + "grad_norm": 1144.9033203125, "learning_rate": 0.00013320365978457534, - "loss": 0.0315, + "loss": 44.6062, "step": 2600 }, { "epoch": 4.2439024390243905, - "grad_norm": 0.3246220648288727, + "grad_norm": 1374.0616455078125, "learning_rate": 0.00013269601831530003, - "loss": 0.0331, + "loss": 100.0019, "step": 2610 }, { "epoch": 4.260162601626016, - "grad_norm": 0.39405912160873413, + "grad_norm": 649.107666015625, "learning_rate": 0.0001321874316502, - "loss": 0.029, + "loss": 45.9766, "step": 2620 }, { "epoch": 4.276422764227642, - "grad_norm": 0.4733191430568695, + "grad_norm": 1265.823486328125, "learning_rate": 0.00013167791449179928, - "loss": 0.0358, + "loss": 36.6327, "step": 2630 }, { "epoch": 4.2926829268292686, - "grad_norm": 0.42187246680259705, + "grad_norm": 1065.16943359375, "learning_rate": 0.00013116748156952098, - "loss": 0.032, + "loss": 36.6221, "step": 2640 }, { "epoch": 4.308943089430894, - "grad_norm": 0.32308024168014526, + "grad_norm": 7990.9853515625, "learning_rate": 0.00013065614763926184, - "loss": 0.0308, + "loss": 47.2748, "step": 2650 }, { "epoch": 4.32520325203252, - "grad_norm": 0.3186604082584381, + "grad_norm": 3891.1884765625, "learning_rate": 0.00013014392748296528, - "loss": 0.0298, + "loss": 60.2811, "step": 2660 }, { "epoch": 4.341463414634147, - "grad_norm": 0.2509617507457733, + "grad_norm": 1250.55859375, "learning_rate": 0.00012963083590819443, - "loss": 0.0385, + "loss": 59.3533, "step": 2670 }, { "epoch": 4.357723577235772, - "grad_norm": 0.2872649133205414, + "grad_norm": 452.96368408203125, "learning_rate": 0.00012911688774770377, - "loss": 0.0293, + "loss": 39.7551, "step": 2680 }, { "epoch": 4.373983739837398, - "grad_norm": 0.3669278025627136, + "grad_norm": 1382.8927001953125, "learning_rate": 0.0001286020978590106, - "loss": 0.0276, + "loss": 56.9612, "step": 2690 }, { "epoch": 4.390243902439025, - "grad_norm": 0.31471124291419983, + "grad_norm": 2779.33642578125, "learning_rate": 0.0001280864811239652, - "loss": 0.0263, + "loss": 76.6694, "step": 2700 }, { "epoch": 4.40650406504065, - "grad_norm": 0.5951998233795166, + "grad_norm": 1720.7236328125, "learning_rate": 0.00012757005244832113, - "loss": 0.0263, + "loss": 54.5705, "step": 2710 }, { "epoch": 4.4227642276422765, - "grad_norm": 0.38588541746139526, + "grad_norm": 530.7537231445312, "learning_rate": 0.00012705282676130368, - "loss": 0.0344, + "loss": 43.2596, "step": 2720 }, { "epoch": 4.439024390243903, - "grad_norm": 0.2506534457206726, + "grad_norm": 1741.5948486328125, "learning_rate": 0.00012653481901517876, - "loss": 0.0218, + "loss": 44.5357, "step": 2730 }, { "epoch": 4.455284552845528, - "grad_norm": 0.43242403864860535, + "grad_norm": 545.766357421875, "learning_rate": 0.00012601604418482052, - "loss": 0.0243, + "loss": 64.0609, "step": 2740 }, { "epoch": 4.471544715447155, - "grad_norm": 0.27420303225517273, + "grad_norm": 760.1073608398438, "learning_rate": 0.00012549651726727841, - "loss": 0.0263, + "loss": 33.9295, "step": 2750 }, { "epoch": 4.487804878048781, - "grad_norm": 0.2911824882030487, + "grad_norm": 3076.673583984375, "learning_rate": 0.0001249762532813437, - "loss": 0.0285, + "loss": 53.2542, "step": 2760 }, { "epoch": 4.504065040650406, - "grad_norm": 0.4180115759372711, + "grad_norm": 613.498779296875, "learning_rate": 0.0001244552672671152, - "loss": 0.0255, + "loss": 42.9754, "step": 2770 }, { "epoch": 4.520325203252033, - "grad_norm": 0.34034252166748047, + "grad_norm": 633.474365234375, "learning_rate": 0.0001239335742855645, - "loss": 0.0313, + "loss": 79.9076, "step": 2780 }, { "epoch": 4.536585365853659, - "grad_norm": 0.4372043013572693, + "grad_norm": 534.7109375, "learning_rate": 0.00012341118941810086, - "loss": 0.0349, + "loss": 56.3449, "step": 2790 }, { "epoch": 4.5528455284552845, - "grad_norm": 0.3891702890396118, + "grad_norm": 988.2083740234375, "learning_rate": 0.00012288812776613467, - "loss": 0.0299, + "loss": 60.076, "step": 2800 }, { "epoch": 4.569105691056911, - "grad_norm": 0.3251037895679474, + "grad_norm": 987.4862670898438, "learning_rate": 0.00012236440445064146, - "loss": 0.0231, + "loss": 44.6687, "step": 2810 }, { "epoch": 4.585365853658536, - "grad_norm": 0.24190375208854675, + "grad_norm": 1020.8764038085938, "learning_rate": 0.00012184003461172437, - "loss": 0.0219, + "loss": 54.9522, "step": 2820 }, { "epoch": 4.6016260162601625, - "grad_norm": 0.35838401317596436, + "grad_norm": 861.468505859375, "learning_rate": 0.00012131503340817663, - "loss": 0.033, + "loss": 72.5806, "step": 2830 }, { "epoch": 4.617886178861789, - "grad_norm": 0.24510283768177032, + "grad_norm": 1153.2725830078125, "learning_rate": 0.00012078941601704343, - "loss": 0.0279, + "loss": 44.8851, "step": 2840 }, { "epoch": 4.634146341463414, - "grad_norm": 0.259705513715744, + "grad_norm": 7982.6865234375, "learning_rate": 0.00012026319763318301, - "loss": 0.0256, + "loss": 49.9482, "step": 2850 }, { "epoch": 4.650406504065041, - "grad_norm": 0.19180011749267578, + "grad_norm": 1476.1536865234375, "learning_rate": 0.00011973639346882746, - "loss": 0.0293, + "loss": 47.223, "step": 2860 }, { "epoch": 4.666666666666667, - "grad_norm": 0.3907632529735565, + "grad_norm": 1169.1434326171875, "learning_rate": 0.00011920901875314295, - "loss": 0.0302, + "loss": 51.8643, "step": 2870 }, { "epoch": 4.682926829268292, - "grad_norm": 0.30425527691841125, + "grad_norm": 1330.784912109375, "learning_rate": 0.00011868108873178949, - "loss": 0.0237, + "loss": 43.6427, "step": 2880 }, { "epoch": 4.699186991869919, - "grad_norm": 0.4089062511920929, + "grad_norm": 631.0576171875, "learning_rate": 0.00011815261866648026, - "loss": 0.0264, + "loss": 56.523, "step": 2890 }, { "epoch": 4.715447154471545, - "grad_norm": 0.22791796922683716, + "grad_norm": 1804.2171630859375, "learning_rate": 0.00011762362383454024, - "loss": 0.0293, + "loss": 49.6038, "step": 2900 }, { "epoch": 4.7317073170731705, - "grad_norm": 0.2575097680091858, + "grad_norm": 2007.8486328125, "learning_rate": 0.00011709411952846479, - "loss": 0.0302, + "loss": 56.3543, "step": 2910 }, { "epoch": 4.747967479674797, - "grad_norm": 0.23536550998687744, + "grad_norm": 1846.902099609375, "learning_rate": 0.00011656412105547733, - "loss": 0.0287, + "loss": 40.9638, "step": 2920 }, { "epoch": 4.764227642276423, - "grad_norm": 0.220611110329628, + "grad_norm": 854.6354370117188, "learning_rate": 0.00011603364373708702, - "loss": 0.0273, + "loss": 47.7196, "step": 2930 }, { "epoch": 4.780487804878049, - "grad_norm": 0.2931613028049469, + "grad_norm": 2663.093017578125, "learning_rate": 0.00011550270290864582, - "loss": 0.0389, + "loss": 88.7795, "step": 2940 }, { "epoch": 4.796747967479675, - "grad_norm": 0.26123061776161194, + "grad_norm": 2370.38720703125, "learning_rate": 0.00011497131391890498, - "loss": 0.0356, + "loss": 65.2372, "step": 2950 }, { "epoch": 4.8130081300813, - "grad_norm": 0.35643813014030457, + "grad_norm": 1494.7568359375, "learning_rate": 0.00011443949212957154, - "loss": 0.0288, + "loss": 68.4685, "step": 2960 }, { "epoch": 4.829268292682927, - "grad_norm": 0.3364145755767822, + "grad_norm": 1287.447021484375, "learning_rate": 0.00011390725291486419, - "loss": 0.0266, + "loss": 51.913, "step": 2970 }, { "epoch": 4.845528455284553, - "grad_norm": 0.24207085371017456, + "grad_norm": 1271.5274658203125, "learning_rate": 0.00011337461166106871, - "loss": 0.0244, + "loss": 53.7021, "step": 2980 }, { "epoch": 4.861788617886178, - "grad_norm": 0.24333341419696808, + "grad_norm": 1231.7939453125, "learning_rate": 0.00011284158376609333, - "loss": 0.0271, + "loss": 31.6516, "step": 2990 }, { "epoch": 4.878048780487805, - "grad_norm": 0.24428123235702515, + "grad_norm": 1916.57421875, "learning_rate": 0.00011230818463902358, - "loss": 0.0413, + "loss": 69.1733, "step": 3000 }, { "epoch": 4.894308943089431, - "grad_norm": 0.20461352169513702, + "grad_norm": 2691.4208984375, "learning_rate": 0.00011177442969967668, - "loss": 0.0249, + "loss": 55.0878, "step": 3010 }, { "epoch": 4.9105691056910565, - "grad_norm": 0.2744583487510681, + "grad_norm": 1314.462646484375, "learning_rate": 0.00011124033437815593, - "loss": 0.0254, + "loss": 40.0013, "step": 3020 }, { "epoch": 4.926829268292683, - "grad_norm": 0.2665221095085144, + "grad_norm": 1857.048095703125, "learning_rate": 0.00011070591411440459, - "loss": 0.0288, + "loss": 46.5445, "step": 3030 }, { "epoch": 4.943089430894309, - "grad_norm": 0.27404850721359253, + "grad_norm": 1580.3558349609375, "learning_rate": 0.00011017118435775957, - "loss": 0.0271, + "loss": 38.4451, "step": 3040 }, { "epoch": 4.959349593495935, - "grad_norm": 0.402006596326828, + "grad_norm": 1501.5589599609375, "learning_rate": 0.00010963616056650476, - "loss": 0.0369, + "loss": 34.3078, "step": 3050 }, { "epoch": 4.975609756097561, - "grad_norm": 0.29626989364624023, + "grad_norm": 3925.81591796875, "learning_rate": 0.00010910085820742419, - "loss": 0.0254, + "loss": 58.2388, "step": 3060 }, { "epoch": 4.991869918699187, - "grad_norm": 0.3598923683166504, + "grad_norm": 828.7344360351562, "learning_rate": 0.00010856529275535487, - "loss": 0.0238, + "loss": 77.3652, "step": 3070 }, { "epoch": 5.008130081300813, - "grad_norm": 0.30296790599823, + "grad_norm": 850.0521240234375, "learning_rate": 0.00010802947969273946, - "loss": 0.0396, + "loss": 32.5409, "step": 3080 }, { "epoch": 5.024390243902439, - "grad_norm": 0.24314256012439728, + "grad_norm": 315.0628967285156, "learning_rate": 0.00010749343450917873, - "loss": 0.0271, + "loss": 49.1381, "step": 3090 }, { "epoch": 5.040650406504065, - "grad_norm": 0.31943249702453613, + "grad_norm": 805.5790405273438, "learning_rate": 0.0001069571727009837, - "loss": 0.0339, + "loss": 44.4946, "step": 3100 }, { "epoch": 5.056910569105691, - "grad_norm": 0.2527000904083252, + "grad_norm": 2954.944091796875, "learning_rate": 0.0001064207097707277, - "loss": 0.0251, + "loss": 56.0899, "step": 3110 }, { "epoch": 5.073170731707317, - "grad_norm": 0.4308377504348755, + "grad_norm": 1296.76025390625, "learning_rate": 0.00010588406122679825, - "loss": 0.0252, + "loss": 32.3572, "step": 3120 }, { "epoch": 5.0894308943089435, - "grad_norm": 0.2500210404396057, + "grad_norm": 682.7062377929688, "learning_rate": 0.00010534724258294868, - "loss": 0.0253, + "loss": 41.241, "step": 3130 }, { "epoch": 5.105691056910569, - "grad_norm": 0.3363681733608246, + "grad_norm": 586.6185302734375, "learning_rate": 0.00010481026935784967, - "loss": 0.0238, + "loss": 46.9862, "step": 3140 }, { "epoch": 5.121951219512195, - "grad_norm": 0.28005245327949524, + "grad_norm": 494.31768798828125, "learning_rate": 0.0001042731570746406, - "loss": 0.0269, + "loss": 39.867, "step": 3150 }, { "epoch": 5.138211382113822, - "grad_norm": 0.27843043208122253, + "grad_norm": 1095.9088134765625, "learning_rate": 0.00010373592126048093, - "loss": 0.0275, + "loss": 33.0041, "step": 3160 }, { "epoch": 5.154471544715447, - "grad_norm": 0.21730771660804749, + "grad_norm": 1172.2149658203125, "learning_rate": 0.00010319857744610106, - "loss": 0.0257, + "loss": 84.7379, "step": 3170 }, { "epoch": 5.170731707317073, - "grad_norm": 0.33479946851730347, + "grad_norm": 7211.0283203125, "learning_rate": 0.00010266114116535362, - "loss": 0.0251, + "loss": 48.8282, "step": 3180 }, { "epoch": 5.186991869918699, - "grad_norm": 0.3060964345932007, + "grad_norm": 1418.6943359375, "learning_rate": 0.00010212362795476432, - "loss": 0.0227, + "loss": 46.3707, "step": 3190 }, { "epoch": 5.203252032520325, - "grad_norm": 0.3518486022949219, + "grad_norm": 3661.55126953125, "learning_rate": 0.0001015860533530828, - "loss": 0.0314, + "loss": 93.9867, "step": 3200 }, { "epoch": 5.219512195121951, - "grad_norm": 0.2479204535484314, + "grad_norm": 1076.226806640625, "learning_rate": 0.00010104843290083341, - "loss": 0.0234, + "loss": 68.2097, "step": 3210 }, { "epoch": 5.235772357723577, - "grad_norm": 0.278024286031723, + "grad_norm": 4902.42138671875, "learning_rate": 0.00010051078213986597, - "loss": 0.0278, + "loss": 36.9465, "step": 3220 }, { "epoch": 5.252032520325203, - "grad_norm": 0.25407204031944275, + "grad_norm": 2610.93212890625, "learning_rate": 9.997311661290648e-05, - "loss": 0.0223, + "loss": 56.646, "step": 3230 }, { "epoch": 5.2682926829268295, - "grad_norm": 0.21007820963859558, + "grad_norm": 3272.592529296875, "learning_rate": 9.943545186310787e-05, - "loss": 0.0221, + "loss": 42.065, "step": 3240 }, { "epoch": 5.284552845528455, - "grad_norm": 0.31850218772888184, + "grad_norm": 1224.6219482421875, "learning_rate": 9.889780343360049e-05, - "loss": 0.0211, + "loss": 60.0324, "step": 3250 }, { "epoch": 5.300813008130081, - "grad_norm": 0.3158598244190216, + "grad_norm": 1191.6717529296875, "learning_rate": 9.836018686704298e-05, - "loss": 0.0176, + "loss": 49.1736, "step": 3260 }, { "epoch": 5.317073170731708, - "grad_norm": 0.30192384123802185, + "grad_norm": 1531.7381591796875, "learning_rate": 9.782261770517289e-05, - "loss": 0.0291, + "loss": 29.3415, "step": 3270 }, { "epoch": 5.333333333333333, - "grad_norm": 0.3068801462650299, + "grad_norm": 1613.154296875, "learning_rate": 9.72851114883572e-05, - "loss": 0.0227, + "loss": 71.2164, "step": 3280 }, { "epoch": 5.349593495934959, - "grad_norm": 0.19755934178829193, + "grad_norm": 1089.3868408203125, "learning_rate": 9.674768375514347e-05, - "loss": 0.025, + "loss": 41.1068, "step": 3290 }, { "epoch": 5.365853658536586, - "grad_norm": 0.21400178968906403, + "grad_norm": 425.6622314453125, "learning_rate": 9.621035004181022e-05, - "loss": 0.0313, + "loss": 29.7313, "step": 3300 }, { "epoch": 5.382113821138211, - "grad_norm": 0.21456244587898254, + "grad_norm": 4809.2626953125, "learning_rate": 9.56731258819181e-05, - "loss": 0.0206, + "loss": 59.21, "step": 3310 }, { "epoch": 5.3983739837398375, - "grad_norm": 0.28082045912742615, + "grad_norm": 768.4491577148438, "learning_rate": 9.51360268058607e-05, - "loss": 0.0221, + "loss": 65.3515, "step": 3320 }, { "epoch": 5.414634146341464, - "grad_norm": 0.3279547691345215, + "grad_norm": 1334.3365478515625, "learning_rate": 9.459906834041558e-05, - "loss": 0.0272, + "loss": 44.464, "step": 3330 }, { "epoch": 5.430894308943089, - "grad_norm": 0.28901779651641846, + "grad_norm": 1523.654296875, "learning_rate": 9.406226600829545e-05, - "loss": 0.0217, + "loss": 61.8839, "step": 3340 }, { "epoch": 5.4471544715447155, - "grad_norm": 0.26895517110824585, + "grad_norm": 1562.5716552734375, "learning_rate": 9.352563532769949e-05, - "loss": 0.0215, + "loss": 51.7122, "step": 3350 }, { "epoch": 5.463414634146342, - "grad_norm": 0.26811909675598145, + "grad_norm": 1880.090087890625, "learning_rate": 9.298919181186458e-05, - "loss": 0.0303, + "loss": 41.961, "step": 3360 }, { "epoch": 5.479674796747967, - "grad_norm": 0.25498056411743164, + "grad_norm": 1722.7073974609375, "learning_rate": 9.245295096861698e-05, - "loss": 0.033, + "loss": 46.5965, "step": 3370 }, { "epoch": 5.495934959349594, - "grad_norm": 0.20725968480110168, + "grad_norm": 925.80126953125, "learning_rate": 9.191692829992401e-05, - "loss": 0.019, + "loss": 48.4384, "step": 3380 }, { "epoch": 5.512195121951219, - "grad_norm": 0.24128729104995728, + "grad_norm": 1489.31982421875, "learning_rate": 9.138113930144578e-05, - "loss": 0.0256, + "loss": 59.3866, "step": 3390 }, { "epoch": 5.528455284552845, - "grad_norm": 0.3082524836063385, + "grad_norm": 707.712890625, "learning_rate": 9.084559946208739e-05, - "loss": 0.0224, + "loss": 42.5858, "step": 3400 }, { "epoch": 5.544715447154472, - "grad_norm": 0.20113001763820648, + "grad_norm": 2299.88720703125, "learning_rate": 9.031032426355106e-05, - "loss": 0.0182, + "loss": 36.6626, "step": 3410 }, { "epoch": 5.560975609756097, - "grad_norm": 0.34511125087738037, + "grad_norm": 4950.97998046875, "learning_rate": 8.977532917988871e-05, - "loss": 0.0249, + "loss": 37.762, "step": 3420 }, { "epoch": 5.5772357723577235, - "grad_norm": 0.1838812530040741, + "grad_norm": 891.8377075195312, "learning_rate": 8.924062967705443e-05, - "loss": 0.0219, + "loss": 50.5158, "step": 3430 }, { "epoch": 5.59349593495935, - "grad_norm": 0.31571686267852783, + "grad_norm": 996.9815673828125, "learning_rate": 8.870624121245748e-05, - "loss": 0.0307, + "loss": 56.7966, "step": 3440 }, { "epoch": 5.609756097560975, - "grad_norm": 0.26014745235443115, + "grad_norm": 814.5260009765625, "learning_rate": 8.817217923451554e-05, - "loss": 0.0253, + "loss": 61.8741, "step": 3450 }, { "epoch": 5.626016260162602, - "grad_norm": 0.3773570954799652, + "grad_norm": 1282.3272705078125, "learning_rate": 8.763845918220793e-05, - "loss": 0.0235, + "loss": 28.1619, "step": 3460 }, { "epoch": 5.642276422764228, - "grad_norm": 0.2535437345504761, + "grad_norm": 1114.01513671875, "learning_rate": 8.71050964846294e-05, - "loss": 0.0227, + "loss": 34.5723, "step": 3470 }, { "epoch": 5.658536585365853, - "grad_norm": 0.3311566710472107, + "grad_norm": 768.8634033203125, "learning_rate": 8.657210656054413e-05, - "loss": 0.018, + "loss": 40.1524, "step": 3480 }, { "epoch": 5.67479674796748, - "grad_norm": 0.29373443126678467, + "grad_norm": 640.5523681640625, "learning_rate": 8.60395048179399e-05, - "loss": 0.0307, + "loss": 59.3767, "step": 3490 }, { "epoch": 5.691056910569106, - "grad_norm": 0.37950748205184937, + "grad_norm": 976.6678466796875, "learning_rate": 8.550730665358266e-05, - "loss": 0.0226, + "loss": 46.2076, "step": 3500 }, { "epoch": 5.7073170731707314, - "grad_norm": 0.3670765459537506, + "grad_norm": 904.607666015625, "learning_rate": 8.497552745257157e-05, - "loss": 0.0277, + "loss": 44.8267, "step": 3510 }, { "epoch": 5.723577235772358, - "grad_norm": 0.22482118010520935, + "grad_norm": 18157.951171875, "learning_rate": 8.444418258789418e-05, - "loss": 0.0269, + "loss": 46.1126, "step": 3520 }, { "epoch": 5.739837398373984, - "grad_norm": 0.2423231601715088, + "grad_norm": 702.4590454101562, "learning_rate": 8.391328741998187e-05, - "loss": 0.0183, + "loss": 62.335, "step": 3530 }, { "epoch": 5.7560975609756095, - "grad_norm": 0.2830420732498169, + "grad_norm": 906.1786499023438, "learning_rate": 8.338285729626595e-05, - "loss": 0.0315, + "loss": 65.6418, "step": 3540 }, { "epoch": 5.772357723577236, - "grad_norm": 0.30432966351509094, + "grad_norm": 1011.940185546875, "learning_rate": 8.285290755073405e-05, - "loss": 0.0206, + "loss": 41.4294, "step": 3550 }, { "epoch": 5.788617886178862, - "grad_norm": 0.18228772282600403, + "grad_norm": 2783.18798828125, "learning_rate": 8.23234535034866e-05, - "loss": 0.0198, + "loss": 73.9544, "step": 3560 }, { "epoch": 5.804878048780488, - "grad_norm": 0.3448275625705719, + "grad_norm": 1077.9619140625, "learning_rate": 8.179451046029424e-05, - "loss": 0.0229, + "loss": 36.2339, "step": 3570 }, { "epoch": 5.821138211382114, - "grad_norm": 0.29271072149276733, + "grad_norm": 1024.14453125, "learning_rate": 8.12660937121551e-05, - "loss": 0.0301, + "loss": 40.021, "step": 3580 }, { "epoch": 5.83739837398374, - "grad_norm": 0.29591134190559387, + "grad_norm": 1014.1956787109375, "learning_rate": 8.073821853485288e-05, - "loss": 0.0201, + "loss": 73.2346, "step": 3590 }, { "epoch": 5.853658536585366, - "grad_norm": 0.28770920634269714, + "grad_norm": 869.21875, "learning_rate": 8.021090018851526e-05, - "loss": 0.0209, + "loss": 34.6341, "step": 3600 }, { "epoch": 5.869918699186992, - "grad_norm": 0.31899213790893555, + "grad_norm": 1306.168212890625, "learning_rate": 7.968415391717271e-05, - "loss": 0.0218, + "loss": 71.121, "step": 3610 }, { "epoch": 5.886178861788618, - "grad_norm": 0.26371410489082336, + "grad_norm": 1111.87890625, "learning_rate": 7.915799494831775e-05, - "loss": 0.021, + "loss": 33.9404, "step": 3620 }, { "epoch": 5.902439024390244, - "grad_norm": 0.3779837191104889, + "grad_norm": 759.7614135742188, "learning_rate": 7.863243849246494e-05, - "loss": 0.0233, + "loss": 50.714, "step": 3630 }, { "epoch": 5.91869918699187, - "grad_norm": 0.2993268370628357, + "grad_norm": 5193.80419921875, "learning_rate": 7.810749974271099e-05, - "loss": 0.0205, + "loss": 59.9144, "step": 3640 }, { "epoch": 5.934959349593496, - "grad_norm": 0.2081155925989151, + "grad_norm": 1484.0467529296875, "learning_rate": 7.758319387429553e-05, - "loss": 0.0266, + "loss": 58.3316, "step": 3650 }, { "epoch": 5.951219512195122, - "grad_norm": 0.31904008984565735, + "grad_norm": 1309.0003662109375, "learning_rate": 7.705953604416254e-05, - "loss": 0.0249, + "loss": 48.9651, "step": 3660 }, { "epoch": 5.967479674796748, - "grad_norm": 0.21781237423419952, + "grad_norm": 754.5973510742188, "learning_rate": 7.653654139052214e-05, - "loss": 0.0228, + "loss": 29.4624, "step": 3670 }, { "epoch": 5.983739837398374, - "grad_norm": 0.22414086759090424, + "grad_norm": 637.7557983398438, "learning_rate": 7.60142250324129e-05, - "loss": 0.0192, + "loss": 43.2339, "step": 3680 }, { "epoch": 6.0, - "grad_norm": 0.17804913222789764, + "grad_norm": 1177.0924072265625, "learning_rate": 7.549260206926486e-05, - "loss": 0.0191, + "loss": 47.2867, "step": 3690 }, { "epoch": 6.016260162601626, - "grad_norm": 0.19235257804393768, + "grad_norm": 1924.6392822265625, "learning_rate": 7.4971687580463e-05, - "loss": 0.0215, + "loss": 38.3521, "step": 3700 }, { "epoch": 6.032520325203252, - "grad_norm": 0.28361621499061584, + "grad_norm": 916.7091674804688, "learning_rate": 7.445149662491126e-05, - "loss": 0.0254, + "loss": 49.7392, "step": 3710 }, { "epoch": 6.048780487804878, - "grad_norm": 0.34580036997795105, + "grad_norm": 967.6969604492188, "learning_rate": 7.393204424059725e-05, - "loss": 0.0238, + "loss": 38.2029, "step": 3720 }, { "epoch": 6.065040650406504, - "grad_norm": 0.19561924040317535, + "grad_norm": 840.0963745117188, "learning_rate": 7.341334544415761e-05, - "loss": 0.0194, + "loss": 77.827, "step": 3730 }, { "epoch": 6.08130081300813, - "grad_norm": 0.2833673357963562, + "grad_norm": 1400.66064453125, "learning_rate": 7.289541523044376e-05, - "loss": 0.0191, + "loss": 66.4577, "step": 3740 }, { "epoch": 6.097560975609756, - "grad_norm": 0.18665507435798645, + "grad_norm": 767.639892578125, "learning_rate": 7.237826857208847e-05, - "loss": 0.0207, + "loss": 30.1595, "step": 3750 }, { "epoch": 6.1138211382113825, - "grad_norm": 0.16092735528945923, + "grad_norm": 728.1867065429688, "learning_rate": 7.186192041907298e-05, - "loss": 0.0153, + "loss": 48.2639, "step": 3760 }, { "epoch": 6.130081300813008, - "grad_norm": 0.23183247447013855, + "grad_norm": 1045.18798828125, "learning_rate": 7.134638569829499e-05, - "loss": 0.0166, + "loss": 54.2319, "step": 3770 }, { "epoch": 6.146341463414634, - "grad_norm": 0.26079216599464417, + "grad_norm": 1185.36474609375, "learning_rate": 7.083167931313692e-05, - "loss": 0.0203, + "loss": 37.9882, "step": 3780 }, { "epoch": 6.16260162601626, - "grad_norm": 0.19080810248851776, + "grad_norm": 723.2171020507812, "learning_rate": 7.031781614303519e-05, - "loss": 0.024, + "loss": 41.0285, "step": 3790 }, { "epoch": 6.178861788617886, - "grad_norm": 0.32448557019233704, + "grad_norm": 1335.1109619140625, "learning_rate": 6.980481104305013e-05, - "loss": 0.0236, + "loss": 33.8187, "step": 3800 }, { "epoch": 6.195121951219512, - "grad_norm": 0.24346303939819336, + "grad_norm": 651.626708984375, "learning_rate": 6.929267884343634e-05, - "loss": 0.0228, + "loss": 65.5501, "step": 3810 }, { "epoch": 6.211382113821138, - "grad_norm": 0.40463072061538696, + "grad_norm": 595.5252075195312, "learning_rate": 6.87814343492142e-05, - "loss": 0.0174, + "loss": 43.2794, "step": 3820 }, { "epoch": 6.227642276422764, - "grad_norm": 0.1949874609708786, + "grad_norm": 1277.5653076171875, "learning_rate": 6.827109233974178e-05, - "loss": 0.0163, + "loss": 42.5897, "step": 3830 }, { "epoch": 6.2439024390243905, - "grad_norm": 0.23320889472961426, + "grad_norm": 950.2879028320312, "learning_rate": 6.776166756828759e-05, - "loss": 0.0222, + "loss": 59.1106, "step": 3840 }, { "epoch": 6.260162601626016, - "grad_norm": 0.28618958592414856, + "grad_norm": 862.7484741210938, "learning_rate": 6.7253174761604e-05, - "loss": 0.0174, + "loss": 51.2283, "step": 3850 }, { "epoch": 6.276422764227642, - "grad_norm": 0.2637457251548767, + "grad_norm": 346.978759765625, "learning_rate": 6.674562861950167e-05, - "loss": 0.0224, + "loss": 22.1792, "step": 3860 }, { "epoch": 6.2926829268292686, - "grad_norm": 0.21399089694023132, + "grad_norm": 2020.3907470703125, "learning_rate": 6.62390438144245e-05, - "loss": 0.0198, + "loss": 34.9443, "step": 3870 }, { "epoch": 6.308943089430894, - "grad_norm": 0.19568473100662231, + "grad_norm": 1247.765869140625, "learning_rate": 6.573343499102545e-05, - "loss": 0.0211, + "loss": 89.5246, "step": 3880 }, { "epoch": 6.32520325203252, - "grad_norm": 0.23971660435199738, + "grad_norm": 1061.9462890625, "learning_rate": 6.52288167657433e-05, - "loss": 0.0227, + "loss": 57.1117, "step": 3890 }, { "epoch": 6.341463414634147, - "grad_norm": 0.19574925303459167, + "grad_norm": 740.0230712890625, "learning_rate": 6.472520372637999e-05, - "loss": 0.0182, + "loss": 41.9892, "step": 3900 }, { "epoch": 6.357723577235772, - "grad_norm": 0.29720553755760193, + "grad_norm": 437.2298583984375, "learning_rate": 6.422261043167893e-05, - "loss": 0.0159, + "loss": 41.5301, "step": 3910 }, { "epoch": 6.373983739837398, - "grad_norm": 0.19235315918922424, + "grad_norm": 707.180908203125, "learning_rate": 6.372105141090417e-05, - "loss": 0.0159, + "loss": 61.3545, "step": 3920 }, { "epoch": 6.390243902439025, - "grad_norm": 0.2920894920825958, + "grad_norm": 533.357177734375, "learning_rate": 6.322054116342044e-05, - "loss": 0.0207, + "loss": 40.3018, "step": 3930 }, { "epoch": 6.40650406504065, - "grad_norm": 0.33090823888778687, + "grad_norm": 423.275634765625, "learning_rate": 6.272109415827379e-05, - "loss": 0.0223, + "loss": 31.2483, "step": 3940 }, { "epoch": 6.4227642276422765, - "grad_norm": 0.23652423918247223, + "grad_norm": 535.2537231445312, "learning_rate": 6.222272483377345e-05, - "loss": 0.0242, + "loss": 61.084, "step": 3950 }, { "epoch": 6.439024390243903, - "grad_norm": 0.26307007670402527, + "grad_norm": 654.32470703125, "learning_rate": 6.172544759707449e-05, - "loss": 0.0284, + "loss": 69.6351, "step": 3960 }, { "epoch": 6.455284552845528, - "grad_norm": 0.29087966680526733, + "grad_norm": 827.914794921875, "learning_rate": 6.122927682376119e-05, - "loss": 0.0203, + "loss": 34.8883, "step": 3970 }, { "epoch": 6.471544715447155, - "grad_norm": 0.2642574906349182, + "grad_norm": 364.55615234375, "learning_rate": 6.0734226857431554e-05, - "loss": 0.0153, + "loss": 32.2486, "step": 3980 }, { "epoch": 6.487804878048781, - "grad_norm": 0.28669440746307373, + "grad_norm": 383.2949523925781, "learning_rate": 6.0240312009282674e-05, - "loss": 0.0172, + "loss": 27.0549, "step": 3990 }, { "epoch": 6.504065040650406, - "grad_norm": 0.24338223040103912, + "grad_norm": 666.8985595703125, "learning_rate": 5.9747546557696924e-05, - "loss": 0.0157, + "loss": 30.6733, "step": 4000 }, { "epoch": 6.520325203252033, - "grad_norm": 0.24392348527908325, + "grad_norm": 322.81890869140625, "learning_rate": 5.925594474782925e-05, - "loss": 0.0161, + "loss": 41.4183, "step": 4010 }, { "epoch": 6.536585365853659, - "grad_norm": 0.17414595186710358, + "grad_norm": 1725.4873046875, "learning_rate": 5.876552079119536e-05, - "loss": 0.0163, + "loss": 56.3451, "step": 4020 }, { "epoch": 6.5528455284552845, - "grad_norm": 0.20468956232070923, + "grad_norm": 417.5548095703125, "learning_rate": 5.827628886526093e-05, - "loss": 0.0169, + "loss": 46.2162, "step": 4030 }, { "epoch": 6.569105691056911, - "grad_norm": 0.3492712676525116, + "grad_norm": 626.910400390625, "learning_rate": 5.778826311303169e-05, - "loss": 0.024, + "loss": 29.055, "step": 4040 }, { "epoch": 6.585365853658536, - "grad_norm": 0.36739641427993774, + "grad_norm": 661.1826171875, "learning_rate": 5.730145764264448e-05, - "loss": 0.0226, + "loss": 27.6717, "step": 4050 }, { "epoch": 6.6016260162601625, - "grad_norm": 0.22074246406555176, + "grad_norm": 595.2796020507812, "learning_rate": 5.681588652695966e-05, - "loss": 0.0181, + "loss": 50.871, "step": 4060 }, { "epoch": 6.617886178861789, - "grad_norm": 0.22845806181430817, + "grad_norm": 1768.0650634765625, "learning_rate": 5.6331563803154086e-05, - "loss": 0.0284, + "loss": 31.054, "step": 4070 }, { "epoch": 6.634146341463414, - "grad_norm": 0.3320392966270447, + "grad_norm": 1227.727783203125, "learning_rate": 5.584850347231528e-05, - "loss": 0.0199, + "loss": 36.9891, "step": 4080 }, { "epoch": 6.650406504065041, - "grad_norm": 0.23301807045936584, + "grad_norm": 1646.6304931640625, "learning_rate": 5.536671949903689e-05, - "loss": 0.0166, + "loss": 33.9344, "step": 4090 }, { "epoch": 6.666666666666667, - "grad_norm": 0.46796318888664246, + "grad_norm": 1407.2939453125, "learning_rate": 5.4886225811014814e-05, - "loss": 0.0172, + "loss": 51.3101, "step": 4100 }, { "epoch": 6.682926829268292, - "grad_norm": 0.30953124165534973, + "grad_norm": 1124.4527587890625, "learning_rate": 5.440703629864454e-05, - "loss": 0.0149, + "loss": 49.1819, "step": 4110 }, { "epoch": 6.699186991869919, - "grad_norm": 0.18632985651493073, + "grad_norm": 689.7494506835938, "learning_rate": 5.392916481461983e-05, - "loss": 0.0166, + "loss": 36.6202, "step": 4120 }, { "epoch": 6.715447154471545, - "grad_norm": 0.2245371788740158, + "grad_norm": 714.1576538085938, "learning_rate": 5.3452625173531964e-05, - "loss": 0.0146, + "loss": 32.2473, "step": 4130 }, { "epoch": 6.7317073170731705, - "grad_norm": 0.27866289019584656, + "grad_norm": 479.4760437011719, "learning_rate": 5.297743115147062e-05, - "loss": 0.0184, + "loss": 35.0904, "step": 4140 }, { "epoch": 6.747967479674797, - "grad_norm": 0.17456312477588654, + "grad_norm": 362.479736328125, "learning_rate": 5.250359648562551e-05, - "loss": 0.017, + "loss": 43.3301, "step": 4150 }, { "epoch": 6.764227642276423, - "grad_norm": 0.1838310807943344, + "grad_norm": 668.361572265625, "learning_rate": 5.203113487388917e-05, - "loss": 0.0121, + "loss": 50.1241, "step": 4160 }, { "epoch": 6.780487804878049, - "grad_norm": 0.15984876453876495, + "grad_norm": 1105.221923828125, "learning_rate": 5.156005997446118e-05, - "loss": 0.0106, + "loss": 36.7327, "step": 4170 }, { "epoch": 6.796747967479675, - "grad_norm": 0.1516880989074707, + "grad_norm": 528.5939331054688, "learning_rate": 5.109038540545326e-05, - "loss": 0.0151, + "loss": 45.8215, "step": 4180 }, { "epoch": 6.8130081300813, - "grad_norm": 0.3025248348712921, + "grad_norm": 635.588134765625, "learning_rate": 5.062212474449537e-05, - "loss": 0.0168, + "loss": 68.0413, "step": 4190 }, { "epoch": 6.829268292682927, - "grad_norm": 0.17823679745197296, + "grad_norm": 629.8543701171875, "learning_rate": 5.0155291528343577e-05, - "loss": 0.0114, + "loss": 89.9357, "step": 4200 }, { "epoch": 6.845528455284553, - "grad_norm": 0.3626069724559784, + "grad_norm": 511.0000915527344, "learning_rate": 4.96898992524884e-05, - "loss": 0.0133, + "loss": 39.3891, "step": 4210 }, { "epoch": 6.861788617886178, - "grad_norm": 0.1827721744775772, + "grad_norm": 331.4763488769531, "learning_rate": 4.922596137076493e-05, - "loss": 0.0136, + "loss": 32.5439, "step": 4220 }, { "epoch": 6.878048780487805, - "grad_norm": 0.17440979182720184, + "grad_norm": 433.0771484375, "learning_rate": 4.876349129496355e-05, - "loss": 0.0129, + "loss": 64.7455, "step": 4230 }, { "epoch": 6.894308943089431, - "grad_norm": 0.24478399753570557, + "grad_norm": 456.54644775390625, "learning_rate": 4.830250239444276e-05, - "loss": 0.012, + "loss": 44.152, "step": 4240 }, { "epoch": 6.9105691056910565, - "grad_norm": 0.2005622684955597, + "grad_norm": 1340.421142578125, "learning_rate": 4.7843007995742065e-05, - "loss": 0.0201, + "loss": 30.8355, "step": 4250 }, { "epoch": 6.926829268292683, - "grad_norm": 0.1362733095884323, + "grad_norm": 1253.5787353515625, "learning_rate": 4.7385021382197216e-05, - "loss": 0.0126, + "loss": 48.8547, "step": 4260 }, { "epoch": 6.943089430894309, - "grad_norm": 0.20147354900836945, + "grad_norm": 735.3323974609375, "learning_rate": 4.692855579355597e-05, - "loss": 0.0199, + "loss": 29.7913, "step": 4270 }, { "epoch": 6.959349593495935, - "grad_norm": 0.23760125041007996, + "grad_norm": 485.3312072753906, "learning_rate": 4.647362442559535e-05, - "loss": 0.0174, + "loss": 45.8068, "step": 4280 }, { "epoch": 6.975609756097561, - "grad_norm": 0.5984326004981995, + "grad_norm": 1383.2845458984375, "learning_rate": 4.602024042974027e-05, - "loss": 0.0253, + "loss": 38.6388, "step": 4290 }, { "epoch": 6.991869918699187, - "grad_norm": 0.17115150392055511, + "grad_norm": 491.0514831542969, "learning_rate": 4.556841691268333e-05, - "loss": 0.0159, + "loss": 36.584, "step": 4300 }, { "epoch": 7.008130081300813, - "grad_norm": 0.33204561471939087, + "grad_norm": 417.0002746582031, "learning_rate": 4.511816693600577e-05, - "loss": 0.0182, + "loss": 39.8136, "step": 4310 }, { "epoch": 7.024390243902439, - "grad_norm": 0.2889736592769623, + "grad_norm": 731.73828125, "learning_rate": 4.46695035158001e-05, - "loss": 0.0155, + "loss": 32.1251, "step": 4320 }, { "epoch": 7.040650406504065, - "grad_norm": 0.23228377103805542, + "grad_norm": 649.9963989257812, "learning_rate": 4.42224396222937e-05, - "loss": 0.0174, + "loss": 24.8058, "step": 4330 }, { "epoch": 7.056910569105691, - "grad_norm": 0.15126347541809082, + "grad_norm": 497.6392517089844, "learning_rate": 4.377698817947385e-05, - "loss": 0.0113, + "loss": 37.5999, "step": 4340 }, { "epoch": 7.073170731707317, - "grad_norm": 0.1488284170627594, + "grad_norm": 1092.6939697265625, "learning_rate": 4.333316206471418e-05, - "loss": 0.0138, + "loss": 34.9651, "step": 4350 }, { "epoch": 7.0894308943089435, - "grad_norm": 0.26601848006248474, + "grad_norm": 252.49484252929688, "learning_rate": 4.2890974108402425e-05, - "loss": 0.0237, + "loss": 64.3354, "step": 4360 }, { "epoch": 7.105691056910569, - "grad_norm": 0.15545906126499176, + "grad_norm": 704.4669799804688, "learning_rate": 4.2450437093569315e-05, - "loss": 0.0112, + "loss": 66.6694, "step": 4370 }, { "epoch": 7.121951219512195, - "grad_norm": 0.13300956785678864, + "grad_norm": 1412.200927734375, "learning_rate": 4.2011563755519326e-05, - "loss": 0.0148, + "loss": 34.0108, "step": 4380 }, { "epoch": 7.138211382113822, - "grad_norm": 0.2120588719844818, + "grad_norm": 513.7908935546875, "learning_rate": 4.157436678146238e-05, - "loss": 0.0205, + "loss": 23.0915, "step": 4390 }, { "epoch": 7.154471544715447, - "grad_norm": 0.15902888774871826, + "grad_norm": 429.260986328125, "learning_rate": 4.1138858810146965e-05, - "loss": 0.0118, + "loss": 21.7249, "step": 4400 }, { "epoch": 7.170731707317073, - "grad_norm": 0.21792960166931152, + "grad_norm": 282.83160400390625, "learning_rate": 4.0705052431494995e-05, - "loss": 0.0149, + "loss": 35.1431, "step": 4410 }, { "epoch": 7.186991869918699, - "grad_norm": 0.32575589418411255, + "grad_norm": 189.756591796875, "learning_rate": 4.027296018623772e-05, - "loss": 0.0153, + "loss": 30.4934, "step": 4420 }, { "epoch": 7.203252032520325, - "grad_norm": 0.2091379463672638, + "grad_norm": 484.0589904785156, "learning_rate": 3.9842594565553085e-05, - "loss": 0.0109, + "loss": 25.1109, "step": 4430 }, { "epoch": 7.219512195121951, - "grad_norm": 0.12458918243646622, + "grad_norm": 707.24560546875, "learning_rate": 3.9413968010704984e-05, - "loss": 0.015, + "loss": 49.4997, "step": 4440 }, { "epoch": 7.235772357723577, - "grad_norm": 0.15325990319252014, + "grad_norm": 321.16485595703125, "learning_rate": 3.898709291268313e-05, - "loss": 0.0171, + "loss": 50.0109, "step": 4450 }, { "epoch": 7.252032520325203, - "grad_norm": 0.2739748954772949, + "grad_norm": 468.12042236328125, "learning_rate": 3.8561981611845246e-05, - "loss": 0.0189, + "loss": 71.7242, "step": 4460 }, { "epoch": 7.2682926829268295, - "grad_norm": 0.3035985827445984, + "grad_norm": 628.5554809570312, "learning_rate": 3.813864639756007e-05, - "loss": 0.0168, + "loss": 31.7032, "step": 4470 }, { "epoch": 7.284552845528455, - "grad_norm": 0.1693941205739975, + "grad_norm": 597.160400390625, "learning_rate": 3.771709950785228e-05, - "loss": 0.0135, + "loss": 27.9663, "step": 4480 }, { "epoch": 7.300813008130081, - "grad_norm": 0.1758739948272705, + "grad_norm": 450.8225402832031, "learning_rate": 3.7297353129048476e-05, - "loss": 0.0141, + "loss": 21.0904, "step": 4490 }, { "epoch": 7.317073170731708, - "grad_norm": 0.2751999795436859, + "grad_norm": 615.4117431640625, "learning_rate": 3.687941939542513e-05, - "loss": 0.0248, + "loss": 32.9963, "step": 4500 }, { "epoch": 7.333333333333333, - "grad_norm": 0.3108856678009033, + "grad_norm": 751.5721435546875, "learning_rate": 3.646331038885768e-05, - "loss": 0.0127, + "loss": 33.0976, "step": 4510 }, { "epoch": 7.349593495934959, - "grad_norm": 0.1918034851551056, + "grad_norm": 13358.826171875, "learning_rate": 3.6049038138471215e-05, - "loss": 0.0209, + "loss": 48.3166, "step": 4520 }, { "epoch": 7.365853658536586, - "grad_norm": 0.10584542900323868, + "grad_norm": 5210.142578125, "learning_rate": 3.5636614620292854e-05, - "loss": 0.0163, + "loss": 42.6251, "step": 4530 }, { "epoch": 7.382113821138211, - "grad_norm": 0.1657392382621765, + "grad_norm": 1281.064453125, "learning_rate": 3.522605175690544e-05, - "loss": 0.0112, + "loss": 29.0492, "step": 4540 }, { "epoch": 7.3983739837398375, - "grad_norm": 0.24549029767513275, + "grad_norm": 357.83819580078125, "learning_rate": 3.481736141710293e-05, - "loss": 0.0183, + "loss": 35.3369, "step": 4550 }, { "epoch": 7.414634146341464, - "grad_norm": 0.22945013642311096, + "grad_norm": 173.05294799804688, "learning_rate": 3.4410555415547306e-05, - "loss": 0.0165, + "loss": 33.2367, "step": 4560 }, { "epoch": 7.430894308943089, - "grad_norm": 0.1069050207734108, + "grad_norm": 3365.111572265625, "learning_rate": 3.4005645512426834e-05, - "loss": 0.0128, + "loss": 29.4222, "step": 4570 }, { "epoch": 7.4471544715447155, - "grad_norm": 0.15123425424098969, + "grad_norm": 670.9901733398438, "learning_rate": 3.3602643413116386e-05, - "loss": 0.0136, + "loss": 44.8467, "step": 4580 }, { "epoch": 7.463414634146342, - "grad_norm": 0.12151239067316055, + "grad_norm": 454.53265380859375, "learning_rate": 3.320156076783891e-05, - "loss": 0.0105, + "loss": 32.9965, "step": 4590 }, { "epoch": 7.479674796747967, - "grad_norm": 0.14802220463752747, + "grad_norm": 1082.113525390625, "learning_rate": 3.280240917132853e-05, - "loss": 0.0114, + "loss": 37.7567, "step": 4600 }, { "epoch": 7.495934959349594, - "grad_norm": 0.2116687148809433, + "grad_norm": 21382.505859375, "learning_rate": 3.2405200162495586e-05, - "loss": 0.0223, + "loss": 27.9646, "step": 4610 }, { "epoch": 7.512195121951219, - "grad_norm": 0.13151954114437103, + "grad_norm": 391.889892578125, "learning_rate": 3.200994522409293e-05, - "loss": 0.0173, + "loss": 32.9818, "step": 4620 }, { "epoch": 7.528455284552845, - "grad_norm": 0.1715514063835144, + "grad_norm": 4713.3359375, "learning_rate": 3.1616655782383864e-05, - "loss": 0.0112, + "loss": 37.4087, "step": 4630 }, { "epoch": 7.544715447154472, - "grad_norm": 0.15202124416828156, + "grad_norm": 2711.176513671875, "learning_rate": 3.122534320681214e-05, - "loss": 0.0142, + "loss": 48.8535, "step": 4640 }, { "epoch": 7.560975609756097, - "grad_norm": 0.2114085704088211, + "grad_norm": 1700.7119140625, "learning_rate": 3.083601880967302e-05, - "loss": 0.0199, + "loss": 42.1752, "step": 4650 }, { "epoch": 7.5772357723577235, - "grad_norm": 0.4068102538585663, + "grad_norm": 420.5804443359375, "learning_rate": 3.0448693845786246e-05, - "loss": 0.0192, + "loss": 26.3437, "step": 4660 }, { "epoch": 7.59349593495935, - "grad_norm": 0.2130446583032608, + "grad_norm": 279.73455810546875, "learning_rate": 3.0063379512170852e-05, - "loss": 0.0137, + "loss": 26.54, "step": 4670 }, { "epoch": 7.609756097560975, - "grad_norm": 0.2692960798740387, + "grad_norm": 373.8387756347656, "learning_rate": 2.968008694772141e-05, - "loss": 0.0115, + "loss": 32.9037, "step": 4680 }, { "epoch": 7.626016260162602, - "grad_norm": 0.15790198743343353, + "grad_norm": 4132.44873046875, "learning_rate": 2.9298827232885863e-05, - "loss": 0.0197, + "loss": 30.5371, "step": 4690 }, { "epoch": 7.642276422764228, - "grad_norm": 0.19259366393089294, + "grad_norm": 448.18359375, "learning_rate": 2.8919611389345447e-05, - "loss": 0.0158, + "loss": 23.2553, "step": 4700 }, { "epoch": 7.658536585365853, - "grad_norm": 0.25127512216567993, + "grad_norm": 1203.708984375, "learning_rate": 2.8542450379695973e-05, - "loss": 0.016, + "loss": 48.5284, "step": 4710 }, { "epoch": 7.67479674796748, - "grad_norm": 0.25255757570266724, + "grad_norm": 234.6784210205078, "learning_rate": 2.8167355107130787e-05, - "loss": 0.0117, + "loss": 63.0278, "step": 4720 }, { "epoch": 7.691056910569106, - "grad_norm": 0.1875191479921341, + "grad_norm": 475.01544189453125, "learning_rate": 2.77943364151258e-05, - "loss": 0.0104, + "loss": 26.5827, "step": 4730 }, { "epoch": 7.7073170731707314, - "grad_norm": 0.21179762482643127, + "grad_norm": 2622.9150390625, "learning_rate": 2.7423405087125832e-05, - "loss": 0.0176, + "loss": 37.8167, "step": 4740 }, { "epoch": 7.723577235772358, - "grad_norm": 0.1246047094464302, + "grad_norm": 2133.2802734375, "learning_rate": 2.705457184623299e-05, - "loss": 0.0087, + "loss": 45.3475, "step": 4750 }, { "epoch": 7.739837398373984, - "grad_norm": 0.19497287273406982, + "grad_norm": 467.1634216308594, "learning_rate": 2.668784735489662e-05, - "loss": 0.0108, + "loss": 38.3572, "step": 4760 }, { "epoch": 7.7560975609756095, - "grad_norm": 0.2593243718147278, + "grad_norm": 2866.9052734375, "learning_rate": 2.632324221460515e-05, - "loss": 0.0164, + "loss": 49.7959, "step": 4770 }, { "epoch": 7.772357723577236, - "grad_norm": 0.20588083565235138, + "grad_norm": 5320.82470703125, "learning_rate": 2.5960766965579407e-05, - "loss": 0.0146, + "loss": 27.4925, "step": 4780 }, { "epoch": 7.788617886178862, - "grad_norm": 0.31863540410995483, + "grad_norm": 12207.2236328125, "learning_rate": 2.5600432086468207e-05, - "loss": 0.015, + "loss": 25.4184, "step": 4790 }, { "epoch": 7.804878048780488, - "grad_norm": 0.29355064034461975, + "grad_norm": 928.2150268554688, "learning_rate": 2.5242247994045255e-05, - "loss": 0.0137, + "loss": 38.9474, "step": 4800 }, { "epoch": 7.821138211382114, - "grad_norm": 0.16072151064872742, + "grad_norm": 666.2001342773438, "learning_rate": 2.4886225042907973e-05, - "loss": 0.0144, + "loss": 28.4315, "step": 4810 }, { "epoch": 7.83739837398374, - "grad_norm": 0.20395232737064362, + "grad_norm": 394.76727294921875, "learning_rate": 2.453237352517831e-05, - "loss": 0.0102, + "loss": 35.7413, "step": 4820 }, { "epoch": 7.853658536585366, - "grad_norm": 0.19141028821468353, + "grad_norm": 1564.347900390625, "learning_rate": 2.4180703670205108e-05, - "loss": 0.0152, + "loss": 49.657, "step": 4830 }, { "epoch": 7.869918699186992, - "grad_norm": 0.18786580860614777, + "grad_norm": 662.8395385742188, "learning_rate": 2.3831225644268416e-05, - "loss": 0.0134, + "loss": 23.6479, "step": 4840 }, { "epoch": 7.886178861788618, - "grad_norm": 0.13707751035690308, + "grad_norm": 448.2498474121094, "learning_rate": 2.348394955028561e-05, - "loss": 0.0107, + "loss": 30.4568, "step": 4850 }, { "epoch": 7.902439024390244, - "grad_norm": 0.1631462275981903, + "grad_norm": 738.3649291992188, "learning_rate": 2.3138885427519262e-05, - "loss": 0.011, + "loss": 48.6049, "step": 4860 }, { "epoch": 7.91869918699187, - "grad_norm": 0.3250775635242462, + "grad_norm": 600.122314453125, "learning_rate": 2.2796043251287002e-05, - "loss": 0.0146, + "loss": 24.3334, "step": 4870 }, { "epoch": 7.934959349593496, - "grad_norm": 0.11890354007482529, + "grad_norm": 604.3839111328125, "learning_rate": 2.2455432932673182e-05, - "loss": 0.0125, + "loss": 48.3579, "step": 4880 }, { "epoch": 7.951219512195122, - "grad_norm": 0.18371151387691498, + "grad_norm": 854.1920166015625, "learning_rate": 2.2117064318242154e-05, - "loss": 0.0129, + "loss": 50.2401, "step": 4890 }, { "epoch": 7.967479674796748, - "grad_norm": 0.20046193897724152, + "grad_norm": 8056.27490234375, "learning_rate": 2.1780947189753875e-05, - "loss": 0.0098, + "loss": 41.4174, "step": 4900 }, { "epoch": 7.983739837398374, - "grad_norm": 0.27011969685554504, + "grad_norm": 788.5985717773438, "learning_rate": 2.1447091263881014e-05, - "loss": 0.0138, + "loss": 41.0822, "step": 4910 }, { "epoch": 8.0, - "grad_norm": 0.18496853113174438, + "grad_norm": 194.98179626464844, "learning_rate": 2.111550619192797e-05, - "loss": 0.016, + "loss": 28.0501, "step": 4920 }, { "epoch": 8.016260162601625, - "grad_norm": 0.17147807776927948, + "grad_norm": 463.9582214355469, "learning_rate": 2.0786201559552022e-05, - "loss": 0.0112, + "loss": 38.9959, "step": 4930 }, { "epoch": 8.032520325203253, - "grad_norm": 0.20769961178302765, + "grad_norm": 361.2221374511719, "learning_rate": 2.045918688648616e-05, - "loss": 0.012, + "loss": 37.643, "step": 4940 }, { "epoch": 8.048780487804878, - "grad_norm": 0.17951570451259613, + "grad_norm": 3094.411376953125, "learning_rate": 2.013447162626384e-05, - "loss": 0.0102, + "loss": 23.8148, "step": 4950 }, { "epoch": 8.065040650406504, - "grad_norm": 0.31969377398490906, + "grad_norm": 618.3005981445312, "learning_rate": 1.981206516594576e-05, - "loss": 0.014, + "loss": 45.4684, "step": 4960 }, { "epoch": 8.08130081300813, - "grad_norm": 0.12813997268676758, + "grad_norm": 3658.843994140625, "learning_rate": 1.949197682584848e-05, - "loss": 0.0114, + "loss": 47.9616, "step": 4970 }, { "epoch": 8.097560975609756, - "grad_norm": 0.1808984875679016, + "grad_norm": 3654.126708984375, "learning_rate": 1.9174215859274892e-05, - "loss": 0.0081, + "loss": 39.6678, "step": 4980 }, { "epoch": 8.113821138211382, - "grad_norm": 0.16158317029476166, + "grad_norm": 3715.457763671875, "learning_rate": 1.885879145224688e-05, - "loss": 0.0139, + "loss": 28.395, "step": 4990 }, { "epoch": 8.130081300813009, - "grad_norm": 0.21064020693302155, + "grad_norm": 13629.64453125, "learning_rate": 1.8545712723239682e-05, - "loss": 0.0154, + "loss": 30.707, "step": 5000 }, { "epoch": 8.146341463414634, - "grad_norm": 0.1379614770412445, + "grad_norm": 1702.9984130859375, "learning_rate": 1.823498872291821e-05, - "loss": 0.0137, + "loss": 39.2062, "step": 5010 }, { "epoch": 8.16260162601626, - "grad_norm": 0.3506397604942322, + "grad_norm": 652.4723510742188, "learning_rate": 1.792662843387557e-05, - "loss": 0.0116, + "loss": 25.4401, "step": 5020 }, { "epoch": 8.178861788617887, - "grad_norm": 0.17526014149188995, + "grad_norm": 545.2056884765625, "learning_rate": 1.7620640770373286e-05, - "loss": 0.0162, + "loss": 65.776, "step": 5030 }, { "epoch": 8.195121951219512, - "grad_norm": 0.14359094202518463, + "grad_norm": 986.5762329101562, "learning_rate": 1.7317034578083547e-05, - "loss": 0.0091, + "loss": 27.4899, "step": 5040 }, { "epoch": 8.211382113821138, - "grad_norm": 0.226181760430336, + "grad_norm": 471.08343505859375, "learning_rate": 1.70158186338337e-05, - "loss": 0.0128, + "loss": 35.4397, "step": 5050 }, { "epoch": 8.227642276422765, - "grad_norm": 0.1992688775062561, + "grad_norm": 284.622802734375, "learning_rate": 1.6717001645352324e-05, - "loss": 0.0097, + "loss": 22.5494, "step": 5060 }, { "epoch": 8.24390243902439, - "grad_norm": 0.179975688457489, + "grad_norm": 22431.65625, "learning_rate": 1.6420592251017487e-05, - "loss": 0.0115, + "loss": 45.1601, "step": 5070 }, { "epoch": 8.260162601626016, - "grad_norm": 0.18070729076862335, + "grad_norm": 780.5162353515625, "learning_rate": 1.6126599019607223e-05, - "loss": 0.0149, + "loss": 33.0745, "step": 5080 }, { "epoch": 8.276422764227643, - "grad_norm": 0.19790923595428467, + "grad_norm": 961.0186767578125, "learning_rate": 1.5835030450051656e-05, - "loss": 0.0156, + "loss": 34.2111, "step": 5090 }, { "epoch": 8.292682926829269, - "grad_norm": 0.24671638011932373, + "grad_norm": 240.08079528808594, "learning_rate": 1.5545894971187303e-05, - "loss": 0.0157, + "loss": 25.9617, "step": 5100 }, { "epoch": 8.308943089430894, - "grad_norm": 0.11316593736410141, + "grad_norm": 2864.75, "learning_rate": 1.525920094151353e-05, - "loss": 0.0109, + "loss": 43.9031, "step": 5110 }, { "epoch": 8.32520325203252, - "grad_norm": 0.1019648090004921, + "grad_norm": 791.8621215820312, "learning_rate": 1.4974956648950845e-05, - "loss": 0.0114, + "loss": 37.113, "step": 5120 }, { "epoch": 8.341463414634147, - "grad_norm": 0.21767546236515045, + "grad_norm": 470.98736572265625, "learning_rate": 1.4693170310601212e-05, - "loss": 0.0106, + "loss": 34.8349, "step": 5130 }, { "epoch": 8.357723577235772, - "grad_norm": 0.14478851854801178, + "grad_norm": 840.1485595703125, "learning_rate": 1.4413850072510704e-05, - "loss": 0.0121, + "loss": 24.1196, "step": 5140 }, { "epoch": 8.373983739837398, - "grad_norm": 0.15735219419002533, + "grad_norm": 660.6499633789062, "learning_rate": 1.4137004009433885e-05, - "loss": 0.013, + "loss": 20.1648, "step": 5150 }, { "epoch": 8.390243902439025, - "grad_norm": 0.20376187562942505, + "grad_norm": 1366.75390625, "learning_rate": 1.386264012460039e-05, - "loss": 0.0145, + "loss": 29.1244, "step": 5160 }, { "epoch": 8.40650406504065, - "grad_norm": 0.15563973784446716, + "grad_norm": 270.5916442871094, "learning_rate": 1.3590766349483586e-05, - "loss": 0.0129, + "loss": 36.4448, "step": 5170 }, { "epoch": 8.422764227642276, - "grad_norm": 0.10939871519804001, + "grad_norm": 439.3215637207031, "learning_rate": 1.3321390543571266e-05, - "loss": 0.0138, + "loss": 33.3136, "step": 5180 }, { "epoch": 8.439024390243903, - "grad_norm": 0.17156806588172913, + "grad_norm": 37061.68359375, "learning_rate": 1.3054520494138445e-05, - "loss": 0.0112, + "loss": 64.5556, "step": 5190 }, { "epoch": 8.455284552845528, - "grad_norm": 0.21800781786441803, + "grad_norm": 316.3396911621094, "learning_rate": 1.2790163916022312e-05, - "loss": 0.0112, + "loss": 27.1406, "step": 5200 }, { "epoch": 8.471544715447154, - "grad_norm": 0.17452943325042725, + "grad_norm": 2111.4130859375, "learning_rate": 1.2528328451399041e-05, - "loss": 0.0127, + "loss": 22.3547, "step": 5210 }, { "epoch": 8.487804878048781, - "grad_norm": 0.3795112669467926, + "grad_norm": 489.82464599609375, "learning_rate": 1.2269021669563041e-05, - "loss": 0.0088, + "loss": 20.5392, "step": 5220 }, { "epoch": 8.504065040650406, - "grad_norm": 0.29412275552749634, + "grad_norm": 1655.57275390625, "learning_rate": 1.2012251066708035e-05, - "loss": 0.0122, + "loss": 25.9037, "step": 5230 }, { "epoch": 8.520325203252032, - "grad_norm": 0.10248192399740219, + "grad_norm": 1041.8621826171875, "learning_rate": 1.1758024065710404e-05, - "loss": 0.0114, + "loss": 26.4345, "step": 5240 }, { "epoch": 8.536585365853659, - "grad_norm": 0.2023720145225525, + "grad_norm": 1299.66650390625, "learning_rate": 1.150634801591457e-05, - "loss": 0.0086, + "loss": 42.8872, "step": 5250 }, { "epoch": 8.552845528455284, - "grad_norm": 0.11840803176164627, + "grad_norm": 435.3826904296875, "learning_rate": 1.1257230192920565e-05, - "loss": 0.0136, + "loss": 42.8848, "step": 5260 }, { "epoch": 8.56910569105691, - "grad_norm": 0.1444655954837799, + "grad_norm": 726.2322998046875, "learning_rate": 1.1010677798373625e-05, - "loss": 0.0094, + "loss": 25.041, "step": 5270 }, { "epoch": 8.585365853658537, - "grad_norm": 0.14335845410823822, + "grad_norm": 3022.15625, "learning_rate": 1.0766697959756166e-05, - "loss": 0.0055, + "loss": 68.7748, "step": 5280 }, { "epoch": 8.601626016260163, - "grad_norm": 0.11126456409692764, + "grad_norm": 4241.69580078125, "learning_rate": 1.0525297730181572e-05, - "loss": 0.0115, + "loss": 74.2972, "step": 5290 }, { "epoch": 8.617886178861788, - "grad_norm": 0.07953710854053497, + "grad_norm": 961.3088989257812, "learning_rate": 1.028648408819034e-05, - "loss": 0.0164, + "loss": 24.1545, "step": 5300 }, { "epoch": 8.634146341463415, - "grad_norm": 0.2995052933692932, + "grad_norm": 949.1688842773438, "learning_rate": 1.0050263937548433e-05, - "loss": 0.0231, + "loss": 49.1739, "step": 5310 }, { "epoch": 8.65040650406504, - "grad_norm": 0.21585552394390106, + "grad_norm": 470.57708740234375, "learning_rate": 9.816644107047613e-06, - "loss": 0.0132, + "loss": 32.3933, "step": 5320 }, { "epoch": 8.666666666666666, - "grad_norm": 0.14535541832447052, + "grad_norm": 717.5396728515625, "learning_rate": 9.585631350308e-06, - "loss": 0.0159, + "loss": 32.7468, "step": 5330 }, { "epoch": 8.682926829268293, - "grad_norm": 0.13804489374160767, + "grad_norm": 575.5538330078125, "learning_rate": 9.357232345582922e-06, - "loss": 0.0125, + "loss": 37.3175, "step": 5340 }, { "epoch": 8.699186991869919, - "grad_norm": 0.14459654688835144, + "grad_norm": 371.1407775878906, "learning_rate": 9.131453695565872e-06, - "loss": 0.0203, + "loss": 48.2922, "step": 5350 }, { "epoch": 8.715447154471544, - "grad_norm": 0.14034615457057953, + "grad_norm": 1407.066650390625, "learning_rate": 8.90830192719947e-06, - "loss": 0.0107, + "loss": 34.3162, "step": 5360 }, { "epoch": 8.731707317073171, - "grad_norm": 0.0734546035528183, + "grad_norm": 2786.113525390625, "learning_rate": 8.687783491486966e-06, - "loss": 0.0074, + "loss": 51.1913, "step": 5370 }, { "epoch": 8.747967479674797, - "grad_norm": 0.17336265742778778, + "grad_norm": 407.6085510253906, "learning_rate": 8.46990476330567e-06, - "loss": 0.0136, + "loss": 27.1041, "step": 5380 }, { "epoch": 8.764227642276422, - "grad_norm": 0.28956034779548645, + "grad_norm": 317.9125671386719, "learning_rate": 8.254672041222611e-06, - "loss": 0.0114, + "loss": 57.7832, "step": 5390 }, { "epoch": 8.78048780487805, - "grad_norm": 0.12940819561481476, + "grad_norm": 200.4461669921875, "learning_rate": 8.042091547312569e-06, - "loss": 0.0119, + "loss": 24.9711, "step": 5400 }, { "epoch": 8.796747967479675, - "grad_norm": 0.32707545161247253, + "grad_norm": 25919.078125, "learning_rate": 7.83216942697813e-06, - "loss": 0.0098, + "loss": 30.2866, "step": 5410 }, { "epoch": 8.8130081300813, - "grad_norm": 0.27290332317352295, + "grad_norm": 9640.9111328125, "learning_rate": 7.624911748772023e-06, - "loss": 0.0147, + "loss": 46.633, "step": 5420 }, { "epoch": 8.829268292682928, - "grad_norm": 0.12661409378051758, + "grad_norm": 339.77239990234375, "learning_rate": 7.420324504221721e-06, - "loss": 0.0078, + "loss": 49.0615, "step": 5430 }, { "epoch": 8.845528455284553, - "grad_norm": 0.15845227241516113, + "grad_norm": 360.1629638671875, "learning_rate": 7.218413607656227e-06, - "loss": 0.013, + "loss": 43.912, "step": 5440 }, { "epoch": 8.861788617886178, - "grad_norm": 0.1557973325252533, + "grad_norm": 357.3642578125, "learning_rate": 7.019184896035103e-06, - "loss": 0.0112, + "loss": 40.2426, "step": 5450 }, { "epoch": 8.878048780487806, - "grad_norm": 0.16300538182258606, + "grad_norm": 342.8908386230469, "learning_rate": 6.822644128779721e-06, - "loss": 0.0094, + "loss": 27.857, "step": 5460 }, { "epoch": 8.894308943089431, - "grad_norm": 0.3038705885410309, + "grad_norm": 1741.92333984375, "learning_rate": 6.628796987606722e-06, - "loss": 0.0168, + "loss": 22.8556, "step": 5470 }, { "epoch": 8.910569105691057, - "grad_norm": 0.18598264455795288, + "grad_norm": 817.4639282226562, "learning_rate": 6.437649076363883e-06, - "loss": 0.0072, + "loss": 25.4468, "step": 5480 }, { "epoch": 8.926829268292684, - "grad_norm": 0.14420990645885468, + "grad_norm": 418.2152404785156, "learning_rate": 6.249205920868018e-06, - "loss": 0.0088, + "loss": 30.6125, "step": 5490 }, { "epoch": 8.94308943089431, - "grad_norm": 0.33995798230171204, + "grad_norm": 345.6661071777344, "learning_rate": 6.063472968745221e-06, - "loss": 0.0106, + "loss": 24.8203, "step": 5500 }, { "epoch": 8.959349593495935, - "grad_norm": 0.20272549986839294, + "grad_norm": 311.8279113769531, "learning_rate": 5.880455589273481e-06, - "loss": 0.0101, + "loss": 28.5219, "step": 5510 }, { "epoch": 8.975609756097562, - "grad_norm": 0.14233702421188354, + "grad_norm": 398.0353698730469, "learning_rate": 5.7001590732273955e-06, - "loss": 0.0097, + "loss": 38.751, "step": 5520 }, { "epoch": 8.991869918699187, - "grad_norm": 0.1719483584165573, + "grad_norm": 4006.41796875, "learning_rate": 5.522588632725245e-06, - "loss": 0.0099, + "loss": 48.2014, "step": 5530 }, { "epoch": 9.008130081300813, - "grad_norm": 0.1671096682548523, + "grad_norm": 863.8807983398438, "learning_rate": 5.34774940107825e-06, - "loss": 0.0088, + "loss": 42.1497, "step": 5540 }, { "epoch": 9.024390243902438, - "grad_norm": 0.3211604952812195, + "grad_norm": 6790.38232421875, "learning_rate": 5.175646432642278e-06, - "loss": 0.0098, + "loss": 31.0566, "step": 5550 }, { "epoch": 9.040650406504065, - "grad_norm": 0.2213525027036667, + "grad_norm": 772.9898681640625, "learning_rate": 5.006284702671693e-06, - "loss": 0.0115, + "loss": 36.8164, "step": 5560 }, { "epoch": 9.05691056910569, - "grad_norm": 0.1039019227027893, + "grad_norm": 4930.9443359375, "learning_rate": 4.839669107175493e-06, - "loss": 0.0116, + "loss": 42.4926, "step": 5570 }, { "epoch": 9.073170731707316, - "grad_norm": 0.2064022570848465, + "grad_norm": 192.48233032226562, "learning_rate": 4.675804462775801e-06, - "loss": 0.0065, + "loss": 39.5624, "step": 5580 }, { "epoch": 9.089430894308943, - "grad_norm": 0.18150761723518372, + "grad_norm": 886.0300903320312, "learning_rate": 4.5146955065686e-06, - "loss": 0.0088, + "loss": 32.467, "step": 5590 }, { "epoch": 9.105691056910569, - "grad_norm": 0.1533227115869522, + "grad_norm": 271.0351257324219, "learning_rate": 4.3563468959868515e-06, - "loss": 0.0098, + "loss": 29.2705, "step": 5600 }, { "epoch": 9.121951219512194, - "grad_norm": 0.14361286163330078, + "grad_norm": 651.6824340820312, "learning_rate": 4.2007632086658035e-06, - "loss": 0.0092, + "loss": 40.7806, "step": 5610 }, { "epoch": 9.138211382113822, - "grad_norm": 0.16205288469791412, + "grad_norm": 153.58518981933594, "learning_rate": 4.047948942310631e-06, - "loss": 0.015, + "loss": 32.8395, "step": 5620 }, { "epoch": 9.154471544715447, - "grad_norm": 0.08183801919221878, + "grad_norm": 771.262939453125, "learning_rate": 3.897908514566484e-06, - "loss": 0.0052, + "loss": 59.9376, "step": 5630 }, { "epoch": 9.170731707317072, - "grad_norm": 0.3060095012187958, + "grad_norm": 2750.450439453125, "learning_rate": 3.750646262890767e-06, - "loss": 0.0127, + "loss": 26.9996, "step": 5640 }, { "epoch": 9.1869918699187, - "grad_norm": 0.13393723964691162, + "grad_norm": 361.48516845703125, "learning_rate": 3.60616644442765e-06, - "loss": 0.0101, + "loss": 30.9447, "step": 5650 }, { "epoch": 9.203252032520325, - "grad_norm": 0.22637413442134857, + "grad_norm": 1025.7686767578125, "learning_rate": 3.4644732358851685e-06, - "loss": 0.0084, + "loss": 27.8333, "step": 5660 }, { "epoch": 9.21951219512195, - "grad_norm": 0.17154087126255035, + "grad_norm": 301.7310485839844, "learning_rate": 3.3255707334143516e-06, - "loss": 0.0087, + "loss": 50.7049, "step": 5670 }, { "epoch": 9.235772357723578, - "grad_norm": 0.08137061446905136, + "grad_norm": 282.4934997558594, "learning_rate": 3.1894629524908293e-06, - "loss": 0.0137, + "loss": 58.6614, "step": 5680 }, { "epoch": 9.252032520325203, - "grad_norm": 0.2149609923362732, + "grad_norm": 2989.5283203125, "learning_rate": 3.056153827798791e-06, - "loss": 0.0081, + "loss": 65.7686, "step": 5690 }, { "epoch": 9.268292682926829, - "grad_norm": 0.28451022505760193, + "grad_norm": 145.37416076660156, "learning_rate": 2.9256472131172442e-06, - "loss": 0.0132, + "loss": 24.332, "step": 5700 }, { "epoch": 9.284552845528456, - "grad_norm": 0.13171927630901337, + "grad_norm": 245.1734619140625, "learning_rate": 2.797946881208513e-06, - "loss": 0.0109, + "loss": 62.6, "step": 5710 }, { "epoch": 9.300813008130081, - "grad_norm": 0.2204708606004715, + "grad_norm": 842.1190795898438, "learning_rate": 2.673056523709294e-06, - "loss": 0.0073, + "loss": 33.1712, "step": 5720 }, { "epoch": 9.317073170731707, - "grad_norm": 0.33869749307632446, + "grad_norm": 205.359130859375, "learning_rate": 2.550979751023885e-06, - "loss": 0.0131, + "loss": 24.7365, "step": 5730 }, { "epoch": 9.333333333333334, - "grad_norm": 0.1560915857553482, + "grad_norm": 189.57533264160156, "learning_rate": 2.431720092219758e-06, - "loss": 0.0063, + "loss": 28.2499, "step": 5740 }, { "epoch": 9.34959349593496, - "grad_norm": 0.11412607133388519, + "grad_norm": 311.52374267578125, "learning_rate": 2.3152809949256503e-06, - "loss": 0.0088, + "loss": 21.5204, "step": 5750 }, { "epoch": 9.365853658536585, - "grad_norm": 0.0961686298251152, + "grad_norm": 2237.07958984375, "learning_rate": 2.2016658252318025e-06, - "loss": 0.0093, + "loss": 26.6137, "step": 5760 }, { "epoch": 9.382113821138212, - "grad_norm": 0.18859520554542542, + "grad_norm": 623.1047973632812, "learning_rate": 2.0908778675927e-06, - "loss": 0.0075, + "loss": 24.8671, "step": 5770 }, { "epoch": 9.398373983739837, - "grad_norm": 0.1648787409067154, + "grad_norm": 292.36285400390625, "learning_rate": 1.9829203247321293e-06, - "loss": 0.0101, + "loss": 23.2705, "step": 5780 }, { "epoch": 9.414634146341463, - "grad_norm": 0.1102316826581955, + "grad_norm": 139.58456420898438, "learning_rate": 1.8777963175505398e-06, - "loss": 0.01, + "loss": 34.1858, "step": 5790 }, { "epoch": 9.43089430894309, - "grad_norm": 0.22981742024421692, + "grad_norm": 5472.58349609375, "learning_rate": 1.7755088850348822e-06, - "loss": 0.0104, + "loss": 23.8006, "step": 5800 }, { "epoch": 9.447154471544716, - "grad_norm": 0.20991994440555573, + "grad_norm": 1327.946533203125, "learning_rate": 1.676060984170702e-06, - "loss": 0.016, + "loss": 27.9731, "step": 5810 }, { "epoch": 9.463414634146341, - "grad_norm": 0.1747724562883377, + "grad_norm": 156.09629821777344, "learning_rate": 1.5794554898567182e-06, - "loss": 0.0149, + "loss": 24.1258, "step": 5820 }, { "epoch": 9.479674796747968, - "grad_norm": 0.15112029016017914, + "grad_norm": 485.4151306152344, "learning_rate": 1.4856951948216569e-06, - "loss": 0.0106, + "loss": 28.9193, "step": 5830 }, { "epoch": 9.495934959349594, - "grad_norm": 0.19349244236946106, + "grad_norm": 354.6837158203125, "learning_rate": 1.39478280954356e-06, - "loss": 0.0087, + "loss": 33.2445, "step": 5840 }, { "epoch": 9.512195121951219, - "grad_norm": 0.08975008875131607, + "grad_norm": 503.53289794921875, "learning_rate": 1.3067209621713928e-06, - "loss": 0.0084, + "loss": 25.0091, "step": 5850 }, { "epoch": 9.528455284552846, - "grad_norm": 0.369566947221756, + "grad_norm": 329.1166687011719, "learning_rate": 1.221512198449093e-06, - "loss": 0.0108, + "loss": 35.9692, "step": 5860 }, { "epoch": 9.544715447154472, - "grad_norm": 0.18167252838611603, + "grad_norm": 374.5758361816406, "learning_rate": 1.1391589816419968e-06, - "loss": 0.0073, + "loss": 25.7447, "step": 5870 }, { "epoch": 9.560975609756097, - "grad_norm": 0.1581115871667862, + "grad_norm": 257.5137939453125, "learning_rate": 1.059663692465529e-06, - "loss": 0.0147, + "loss": 37.0374, "step": 5880 }, { "epoch": 9.577235772357724, - "grad_norm": 0.19970975816249847, + "grad_norm": 284.5126037597656, "learning_rate": 9.830286290165357e-07, - "loss": 0.0106, + "loss": 23.4132, "step": 5890 }, { "epoch": 9.59349593495935, - "grad_norm": 0.20190675556659698, + "grad_norm": 689.851806640625, "learning_rate": 9.092560067067268e-07, - "loss": 0.01, + "loss": 47.7638, "step": 5900 }, { "epoch": 9.609756097560975, - "grad_norm": 0.15383341908454895, + "grad_norm": 1487.80859375, "learning_rate": 8.383479581986597e-07, - "loss": 0.012, + "loss": 22.3418, "step": 5910 }, { "epoch": 9.6260162601626, - "grad_norm": 0.17703844606876373, + "grad_norm": 1127.08837890625, "learning_rate": 7.70306533344134e-07, - "loss": 0.0123, + "loss": 24.0052, "step": 5920 }, { "epoch": 9.642276422764228, - "grad_norm": 0.17949138581752777, + "grad_norm": 6250.7666015625, "learning_rate": 7.051336991248714e-07, - "loss": 0.0124, + "loss": 31.2493, "step": 5930 }, { "epoch": 9.658536585365853, - "grad_norm": 0.18274930119514465, + "grad_norm": 565.5596923828125, "learning_rate": 6.428313395956953e-07, - "loss": 0.0127, + "loss": 20.2709, "step": 5940 }, { "epoch": 9.67479674796748, - "grad_norm": 0.15016835927963257, + "grad_norm": 142.4834442138672, "learning_rate": 5.834012558300295e-07, - "loss": 0.0076, + "loss": 27.2821, "step": 5950 }, { "epoch": 9.691056910569106, - "grad_norm": 0.10980150103569031, + "grad_norm": 559.2692260742188, "learning_rate": 5.26845165867873e-07, - "loss": 0.0109, + "loss": 56.2713, "step": 5960 }, { "epoch": 9.707317073170731, - "grad_norm": 0.09684355556964874, + "grad_norm": 170.9761199951172, "learning_rate": 4.7316470466611804e-07, - "loss": 0.0083, + "loss": 25.9403, "step": 5970 }, { "epoch": 9.723577235772357, - "grad_norm": 0.15286968648433685, + "grad_norm": 577.9078369140625, "learning_rate": 4.22361424051243e-07, - "loss": 0.0076, + "loss": 27.2287, "step": 5980 }, { "epoch": 9.739837398373984, - "grad_norm": 0.20796345174312592, + "grad_norm": 203.03167724609375, "learning_rate": 3.7443679267453735e-07, - "loss": 0.0086, + "loss": 33.0212, "step": 5990 }, { "epoch": 9.75609756097561, - "grad_norm": 0.17535723745822906, + "grad_norm": 1709.7088623046875, "learning_rate": 3.2939219596956895e-07, - "loss": 0.0126, + "loss": 30.0687, "step": 6000 }, { "epoch": 9.772357723577235, - "grad_norm": 0.1464337259531021, + "grad_norm": 226.99795532226562, "learning_rate": 2.872289361121605e-07, - "loss": 0.0091, + "loss": 36.0599, "step": 6010 }, { "epoch": 9.788617886178862, - "grad_norm": 0.21644367277622223, + "grad_norm": 805.7896728515625, "learning_rate": 2.4794823198275307e-07, - "loss": 0.0104, + "loss": 48.3908, "step": 6020 }, { "epoch": 9.804878048780488, - "grad_norm": 0.1423000991344452, + "grad_norm": 21221.19921875, "learning_rate": 2.115512191311564e-07, - "loss": 0.0111, + "loss": 55.056, "step": 6030 }, { "epoch": 9.821138211382113, - "grad_norm": 0.20385445654392242, + "grad_norm": 1422.177001953125, "learning_rate": 1.780389497437418e-07, - "loss": 0.0149, + "loss": 20.1985, "step": 6040 }, { "epoch": 9.83739837398374, - "grad_norm": 0.13462184369564056, + "grad_norm": 182.74656677246094, "learning_rate": 1.4741239261299998e-07, - "loss": 0.0094, + "loss": 36.4601, "step": 6050 }, { "epoch": 9.853658536585366, - "grad_norm": 0.3199392855167389, + "grad_norm": 427.26385498046875, "learning_rate": 1.1967243310955222e-07, - "loss": 0.0098, + "loss": 49.9752, "step": 6060 }, { "epoch": 9.869918699186991, - "grad_norm": 0.12754817306995392, + "grad_norm": 463.0358581542969, "learning_rate": 9.481987315653751e-08, - "loss": 0.0086, + "loss": 38.0783, "step": 6070 }, { "epoch": 9.886178861788618, - "grad_norm": 0.0839032307267189, + "grad_norm": 381.15008544921875, "learning_rate": 7.285543120645332e-08, - "loss": 0.0084, + "loss": 40.3717, "step": 6080 }, { "epoch": 9.902439024390244, - "grad_norm": 0.1982775628566742, + "grad_norm": 414.7477111816406, "learning_rate": 5.377974222036119e-08, - "loss": 0.011, + "loss": 23.7009, "step": 6090 }, { "epoch": 9.91869918699187, - "grad_norm": 0.13023720681667328, + "grad_norm": 2649.5400390625, "learning_rate": 3.7593357649579055e-08, - "loss": 0.0137, + "loss": 39.1989, "step": 6100 }, { "epoch": 9.934959349593496, - "grad_norm": 0.08501988649368286, + "grad_norm": 1547.17236328125, "learning_rate": 2.429674541966076e-08, - "loss": 0.0132, + "loss": 45.793, "step": 6110 }, { "epoch": 9.951219512195122, - "grad_norm": 0.27699193358421326, + "grad_norm": 394.08685302734375, "learning_rate": 1.3890289916929089e-08, - "loss": 0.0109, + "loss": 26.9755, "step": 6120 }, { "epoch": 9.967479674796747, - "grad_norm": 0.1899338662624359, + "grad_norm": 6701.4306640625, "learning_rate": 6.37429197736239e-09, - "loss": 0.0136, + "loss": 26.3901, "step": 6130 }, { "epoch": 9.983739837398375, - "grad_norm": 0.22311294078826904, + "grad_norm": 231.67611694335938, "learning_rate": 1.7489688778793424e-09, - "loss": 0.0087, + "loss": 22.5137, "step": 6140 }, { "epoch": 10.0, - "grad_norm": 0.2033189833164215, + "grad_norm": 158.60951232910156, "learning_rate": 1.4454330032886986e-11, - "loss": 0.0087, + "loss": 39.3726, "step": 6150 }, { "epoch": 10.001626016260163, "step": 6151, "total_flos": 2.157115506118272e+17, - "train_loss": 0.03987043799002434, - "train_runtime": 2798.8997, - "train_samples_per_second": 35.162, - "train_steps_per_second": 2.198 + "train_loss": 212.35847260424458, + "train_runtime": 2807.1103, + "train_samples_per_second": 35.06, + "train_steps_per_second": 2.191 } ], "logging_steps": 10,