{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.001626016260163, |
|
"eval_steps": 500, |
|
"global_step": 6151, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016260162601626018, |
|
"grad_norm": 3704.4365234375, |
|
"learning_rate": 6.493506493506493e-06, |
|
"loss": 11280.4625, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.032520325203252036, |
|
"grad_norm": 3407.870361328125, |
|
"learning_rate": 1.2987012987012986e-05, |
|
"loss": 11149.0953, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04878048780487805, |
|
"grad_norm": 3003.387939453125, |
|
"learning_rate": 1.9480519480519483e-05, |
|
"loss": 11173.7734, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06504065040650407, |
|
"grad_norm": 2449.536376953125, |
|
"learning_rate": 2.5974025974025972e-05, |
|
"loss": 11021.1336, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08130081300813008, |
|
"grad_norm": 2389.97509765625, |
|
"learning_rate": 3.246753246753247e-05, |
|
"loss": 11002.3586, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0975609756097561, |
|
"grad_norm": 2766.29443359375, |
|
"learning_rate": 3.8961038961038966e-05, |
|
"loss": 10353.3125, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11382113821138211, |
|
"grad_norm": 4483.15087890625, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 9267.8453, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.13008130081300814, |
|
"grad_norm": 10965.46484375, |
|
"learning_rate": 5.1948051948051944e-05, |
|
"loss": 7955.3172, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.14634146341463414, |
|
"grad_norm": 12316.47265625, |
|
"learning_rate": 5.844155844155844e-05, |
|
"loss": 3923.734, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16260162601626016, |
|
"grad_norm": 154421.5, |
|
"learning_rate": 6.493506493506494e-05, |
|
"loss": 2865.5645, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.17886178861788618, |
|
"grad_norm": 6532.34765625, |
|
"learning_rate": 7.142857142857143e-05, |
|
"loss": 2196.2996, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1951219512195122, |
|
"grad_norm": 4513.40087890625, |
|
"learning_rate": 7.792207792207793e-05, |
|
"loss": 1111.7941, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.21138211382113822, |
|
"grad_norm": 4285.0390625, |
|
"learning_rate": 8.441558441558442e-05, |
|
"loss": 945.8594, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.22764227642276422, |
|
"grad_norm": 2602.03369140625, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 593.7912, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.24390243902439024, |
|
"grad_norm": 3598.093017578125, |
|
"learning_rate": 9.74025974025974e-05, |
|
"loss": 355.9361, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2601626016260163, |
|
"grad_norm": 40596.5078125, |
|
"learning_rate": 0.00010389610389610389, |
|
"loss": 263.4985, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2764227642276423, |
|
"grad_norm": 3959.72265625, |
|
"learning_rate": 0.0001103896103896104, |
|
"loss": 200.0281, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2926829268292683, |
|
"grad_norm": 277750.1875, |
|
"learning_rate": 0.00011688311688311689, |
|
"loss": 176.2805, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3089430894308943, |
|
"grad_norm": 114203.1015625, |
|
"learning_rate": 0.0001233766233766234, |
|
"loss": 192.7327, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3252032520325203, |
|
"grad_norm": 2108.614013671875, |
|
"learning_rate": 0.00012987012987012987, |
|
"loss": 154.6821, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.34146341463414637, |
|
"grad_norm": 6306.46484375, |
|
"learning_rate": 0.00013636363636363637, |
|
"loss": 94.0863, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.35772357723577236, |
|
"grad_norm": 2991.08935546875, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 120.9749, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.37398373983739835, |
|
"grad_norm": 17456.123046875, |
|
"learning_rate": 0.00014935064935064934, |
|
"loss": 150.8287, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.3902439024390244, |
|
"grad_norm": 3997.399658203125, |
|
"learning_rate": 0.00015584415584415587, |
|
"loss": 127.9284, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4065040650406504, |
|
"grad_norm": 3142.8544921875, |
|
"learning_rate": 0.00016233766233766234, |
|
"loss": 99.4487, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.42276422764227645, |
|
"grad_norm": 4303.7421875, |
|
"learning_rate": 0.00016883116883116884, |
|
"loss": 111.1226, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.43902439024390244, |
|
"grad_norm": 9494.5283203125, |
|
"learning_rate": 0.00017532467532467534, |
|
"loss": 148.5725, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.45528455284552843, |
|
"grad_norm": 12805.1005859375, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 89.0703, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4715447154471545, |
|
"grad_norm": 5651.734375, |
|
"learning_rate": 0.00018831168831168833, |
|
"loss": 113.0061, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4878048780487805, |
|
"grad_norm": 3500.915283203125, |
|
"learning_rate": 0.0001948051948051948, |
|
"loss": 98.2204, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5040650406504065, |
|
"grad_norm": 76347.09375, |
|
"learning_rate": 0.00019999994218268405, |
|
"loss": 99.3199, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5203252032520326, |
|
"grad_norm": 5863.58642578125, |
|
"learning_rate": 0.00019999791858364572, |
|
"loss": 145.732, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5365853658536586, |
|
"grad_norm": 6211.58203125, |
|
"learning_rate": 0.00019999300418566636, |
|
"loss": 99.9764, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5528455284552846, |
|
"grad_norm": 1611.767333984375, |
|
"learning_rate": 0.00019998519913081423, |
|
"loss": 130.0497, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5691056910569106, |
|
"grad_norm": 6738.830078125, |
|
"learning_rate": 0.0001999745036447225, |
|
"loss": 132.5203, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5853658536585366, |
|
"grad_norm": 1651.7647705078125, |
|
"learning_rate": 0.00019996091803658263, |
|
"loss": 93.679, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6016260162601627, |
|
"grad_norm": 1451.311279296875, |
|
"learning_rate": 0.00019994444269913535, |
|
"loss": 130.961, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6178861788617886, |
|
"grad_norm": 3547.41015625, |
|
"learning_rate": 0.00019992507810865954, |
|
"loss": 89.0317, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6341463414634146, |
|
"grad_norm": 3523.5322265625, |
|
"learning_rate": 0.00019990282482495816, |
|
"loss": 92.9305, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6504065040650406, |
|
"grad_norm": 5402.7509765625, |
|
"learning_rate": 0.00019987768349134227, |
|
"loss": 124.9789, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 2866.8330078125, |
|
"learning_rate": 0.0001998496548346125, |
|
"loss": 85.8321, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6829268292682927, |
|
"grad_norm": 2670.57275390625, |
|
"learning_rate": 0.00019981873966503773, |
|
"loss": 143.1263, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6991869918699187, |
|
"grad_norm": 3444.505126953125, |
|
"learning_rate": 0.000199784938876332, |
|
"loss": 117.812, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.7154471544715447, |
|
"grad_norm": 1545.001708984375, |
|
"learning_rate": 0.0001997482534456285, |
|
"loss": 100.9372, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.7317073170731707, |
|
"grad_norm": 839.1185913085938, |
|
"learning_rate": 0.00019970868443345134, |
|
"loss": 92.1672, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7479674796747967, |
|
"grad_norm": 17015.447265625, |
|
"learning_rate": 0.0001996662329836849, |
|
"loss": 96.7714, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7642276422764228, |
|
"grad_norm": 3150.693359375, |
|
"learning_rate": 0.0001996209003235408, |
|
"loss": 90.8617, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7804878048780488, |
|
"grad_norm": 519.7723999023438, |
|
"learning_rate": 0.00019957268776352234, |
|
"loss": 113.3078, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7967479674796748, |
|
"grad_norm": 4185.5126953125, |
|
"learning_rate": 0.00019952159669738674, |
|
"loss": 105.7553, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8130081300813008, |
|
"grad_norm": 2410.567138671875, |
|
"learning_rate": 0.00019946762860210471, |
|
"loss": 78.1075, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8292682926829268, |
|
"grad_norm": 2119.968505859375, |
|
"learning_rate": 0.00019941078503781792, |
|
"loss": 83.322, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.8455284552845529, |
|
"grad_norm": 10644.458984375, |
|
"learning_rate": 0.00019935106764779365, |
|
"loss": 79.2555, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.8617886178861789, |
|
"grad_norm": 3944.619873046875, |
|
"learning_rate": 0.00019928847815837758, |
|
"loss": 103.8101, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8780487804878049, |
|
"grad_norm": 1694.9683837890625, |
|
"learning_rate": 0.00019922301837894358, |
|
"loss": 96.7458, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8943089430894309, |
|
"grad_norm": 3317.81640625, |
|
"learning_rate": 0.0001991546902018417, |
|
"loss": 160.2423, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.9105691056910569, |
|
"grad_norm": 7013.18359375, |
|
"learning_rate": 0.0001990834956023433, |
|
"loss": 122.6204, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.926829268292683, |
|
"grad_norm": 3094.7744140625, |
|
"learning_rate": 0.00019900943663858387, |
|
"loss": 96.8247, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.943089430894309, |
|
"grad_norm": 6648.25048828125, |
|
"learning_rate": 0.0001989325154515038, |
|
"loss": 116.6589, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.959349593495935, |
|
"grad_norm": 15371.361328125, |
|
"learning_rate": 0.0001988527342647862, |
|
"loss": 88.9712, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.975609756097561, |
|
"grad_norm": 2130.667724609375, |
|
"learning_rate": 0.00019877009538479275, |
|
"loss": 75.6254, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.991869918699187, |
|
"grad_norm": 3430.82763671875, |
|
"learning_rate": 0.00019868460120049704, |
|
"loss": 118.3028, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.008130081300813, |
|
"grad_norm": 1396.5372314453125, |
|
"learning_rate": 0.00019859625418341557, |
|
"loss": 78.8569, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.024390243902439, |
|
"grad_norm": 7597.01904296875, |
|
"learning_rate": 0.00019850505688753602, |
|
"loss": 100.3299, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.040650406504065, |
|
"grad_norm": 2552.638916015625, |
|
"learning_rate": 0.0001984110119492438, |
|
"loss": 73.0117, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.056910569105691, |
|
"grad_norm": 1387.00439453125, |
|
"learning_rate": 0.00019831412208724556, |
|
"loss": 107.2604, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.0731707317073171, |
|
"grad_norm": 1579.257080078125, |
|
"learning_rate": 0.0001982143901024907, |
|
"loss": 64.988, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.089430894308943, |
|
"grad_norm": 1369.64501953125, |
|
"learning_rate": 0.0001981118188780904, |
|
"loss": 110.6651, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.1056910569105691, |
|
"grad_norm": 3883.478271484375, |
|
"learning_rate": 0.00019800641137923423, |
|
"loss": 110.6604, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.1219512195121952, |
|
"grad_norm": 2725.116943359375, |
|
"learning_rate": 0.00019789817065310448, |
|
"loss": 97.7683, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.1382113821138211, |
|
"grad_norm": 2270.0986328125, |
|
"learning_rate": 0.00019778709982878805, |
|
"loss": 133.6088, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.1544715447154472, |
|
"grad_norm": 3066.498046875, |
|
"learning_rate": 0.000197673202117186, |
|
"loss": 83.8171, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.170731707317073, |
|
"grad_norm": 5128.125, |
|
"learning_rate": 0.00019755648081092066, |
|
"loss": 169.6488, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.1869918699186992, |
|
"grad_norm": 1368.7137451171875, |
|
"learning_rate": 0.00019743693928424058, |
|
"loss": 78.2656, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.203252032520325, |
|
"grad_norm": 3027.226318359375, |
|
"learning_rate": 0.00019731458099292288, |
|
"loss": 132.4441, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.2195121951219512, |
|
"grad_norm": 7759.80810546875, |
|
"learning_rate": 0.00019718940947417336, |
|
"loss": 130.1133, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.2357723577235773, |
|
"grad_norm": 1686.7059326171875, |
|
"learning_rate": 0.00019706142834652427, |
|
"loss": 111.4778, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.2520325203252032, |
|
"grad_norm": 9301.548828125, |
|
"learning_rate": 0.00019693064130972974, |
|
"loss": 88.9655, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.2682926829268293, |
|
"grad_norm": 1258.872802734375, |
|
"learning_rate": 0.0001967970521446587, |
|
"loss": 69.8348, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.2845528455284554, |
|
"grad_norm": 1352.4385986328125, |
|
"learning_rate": 0.00019666066471318568, |
|
"loss": 77.0263, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.3008130081300813, |
|
"grad_norm": 855.2029418945312, |
|
"learning_rate": 0.00019652148295807922, |
|
"loss": 85.511, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.3170731707317074, |
|
"grad_norm": 1946.88330078125, |
|
"learning_rate": 0.00019637951090288778, |
|
"loss": 59.645, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 2297.47216796875, |
|
"learning_rate": 0.00019623475265182337, |
|
"loss": 67.3651, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.3495934959349594, |
|
"grad_norm": 11286.927734375, |
|
"learning_rate": 0.00019608721238964318, |
|
"loss": 128.2699, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.3658536585365852, |
|
"grad_norm": 2499.033447265625, |
|
"learning_rate": 0.00019593689438152827, |
|
"loss": 69.4611, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.3821138211382114, |
|
"grad_norm": 10106.341796875, |
|
"learning_rate": 0.0001957838029729605, |
|
"loss": 93.8524, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.3983739837398375, |
|
"grad_norm": 2966.48779296875, |
|
"learning_rate": 0.00019562794258959674, |
|
"loss": 108.8285, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.4146341463414633, |
|
"grad_norm": 7656.32275390625, |
|
"learning_rate": 0.00019546931773714116, |
|
"loss": 70.237, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.4308943089430894, |
|
"grad_norm": 4307.1708984375, |
|
"learning_rate": 0.00019530793300121473, |
|
"loss": 125.8694, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.4471544715447155, |
|
"grad_norm": 2789.88916015625, |
|
"learning_rate": 0.0001951437930472228, |
|
"loss": 108.8423, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.4634146341463414, |
|
"grad_norm": 5194.333984375, |
|
"learning_rate": 0.00019497690262022018, |
|
"loss": 162.3557, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.4796747967479675, |
|
"grad_norm": 2407.015380859375, |
|
"learning_rate": 0.00019480726654477398, |
|
"loss": 98.5685, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.4959349593495934, |
|
"grad_norm": 7854.9638671875, |
|
"learning_rate": 0.00019463488972482418, |
|
"loss": 60.0693, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.5121951219512195, |
|
"grad_norm": 1800.740478515625, |
|
"learning_rate": 0.00019445977714354173, |
|
"loss": 60.3849, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.5284552845528454, |
|
"grad_norm": 2736.665283203125, |
|
"learning_rate": 0.00019428193386318468, |
|
"loss": 66.8596, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.5447154471544715, |
|
"grad_norm": 15203.984375, |
|
"learning_rate": 0.0001941013650249517, |
|
"loss": 95.6272, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.5609756097560976, |
|
"grad_norm": 3157.21337890625, |
|
"learning_rate": 0.0001939180758488335, |
|
"loss": 71.3239, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.5772357723577235, |
|
"grad_norm": 4594.89501953125, |
|
"learning_rate": 0.00019373207163346192, |
|
"loss": 82.758, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.5934959349593496, |
|
"grad_norm": 2293.903564453125, |
|
"learning_rate": 0.0001935433577559568, |
|
"loss": 67.9693, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.6097560975609757, |
|
"grad_norm": 2138.247802734375, |
|
"learning_rate": 0.0001933519396717704, |
|
"loss": 75.4409, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.6260162601626016, |
|
"grad_norm": 781.8675537109375, |
|
"learning_rate": 0.0001931578229145299, |
|
"loss": 77.4897, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.6422764227642277, |
|
"grad_norm": 2182.60107421875, |
|
"learning_rate": 0.00019296101309587726, |
|
"loss": 54.7864, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.6585365853658538, |
|
"grad_norm": 26183.85546875, |
|
"learning_rate": 0.00019276151590530703, |
|
"loss": 89.1371, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.6747967479674797, |
|
"grad_norm": 1233.78857421875, |
|
"learning_rate": 0.000192559337110002, |
|
"loss": 51.9562, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.6910569105691056, |
|
"grad_norm": 4076.354248046875, |
|
"learning_rate": 0.00019235448255466617, |
|
"loss": 77.1311, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.7073170731707317, |
|
"grad_norm": 1355.48095703125, |
|
"learning_rate": 0.0001921469581613562, |
|
"loss": 70.7184, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.7235772357723578, |
|
"grad_norm": 4424.4345703125, |
|
"learning_rate": 0.00019193676992930992, |
|
"loss": 82.3314, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.7398373983739837, |
|
"grad_norm": 38555.8359375, |
|
"learning_rate": 0.00019172392393477296, |
|
"loss": 78.6395, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.7560975609756098, |
|
"grad_norm": 8763.5234375, |
|
"learning_rate": 0.0001915084263308232, |
|
"loss": 110.0452, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.7723577235772359, |
|
"grad_norm": 3243.281005859375, |
|
"learning_rate": 0.0001912902833471927, |
|
"loss": 121.6475, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.7886178861788617, |
|
"grad_norm": 9277.51953125, |
|
"learning_rate": 0.0001910695012900878, |
|
"loss": 113.4883, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.8048780487804879, |
|
"grad_norm": 1323.01904296875, |
|
"learning_rate": 0.0001908460865420067, |
|
"loss": 82.9752, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.821138211382114, |
|
"grad_norm": 1779.7681884765625, |
|
"learning_rate": 0.00019062004556155506, |
|
"loss": 89.6342, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.8373983739837398, |
|
"grad_norm": 4294.21044921875, |
|
"learning_rate": 0.00019039138488325912, |
|
"loss": 95.4384, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.8536585365853657, |
|
"grad_norm": 1751.0389404296875, |
|
"learning_rate": 0.0001901601111173769, |
|
"loss": 94.7895, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.8699186991869918, |
|
"grad_norm": 1074.5364990234375, |
|
"learning_rate": 0.00018992623094970718, |
|
"loss": 52.8511, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.886178861788618, |
|
"grad_norm": 1331.533935546875, |
|
"learning_rate": 0.0001896897511413961, |
|
"loss": 94.5019, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.9024390243902438, |
|
"grad_norm": 3847.0712890625, |
|
"learning_rate": 0.0001894506785287417, |
|
"loss": 74.2541, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.91869918699187, |
|
"grad_norm": 2032.809326171875, |
|
"learning_rate": 0.00018920902002299644, |
|
"loss": 139.9438, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.934959349593496, |
|
"grad_norm": 2137.700439453125, |
|
"learning_rate": 0.00018896478261016725, |
|
"loss": 111.8997, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.951219512195122, |
|
"grad_norm": 2548.987548828125, |
|
"learning_rate": 0.0001887179733508136, |
|
"loss": 76.8431, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.967479674796748, |
|
"grad_norm": 1745.5999755859375, |
|
"learning_rate": 0.00018846859937984346, |
|
"loss": 67.4039, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.9837398373983741, |
|
"grad_norm": 2157.826904296875, |
|
"learning_rate": 0.000188216667906307, |
|
"loss": 103.4683, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1239.8826904296875, |
|
"learning_rate": 0.00018796218621318822, |
|
"loss": 98.9879, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.016260162601626, |
|
"grad_norm": 1551.313720703125, |
|
"learning_rate": 0.00018770516165719423, |
|
"loss": 58.3172, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.032520325203252, |
|
"grad_norm": 6539.54736328125, |
|
"learning_rate": 0.00018744560166854296, |
|
"loss": 72.3266, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.048780487804878, |
|
"grad_norm": 920.8938598632812, |
|
"learning_rate": 0.00018718351375074786, |
|
"loss": 71.1883, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.065040650406504, |
|
"grad_norm": 2663.365234375, |
|
"learning_rate": 0.00018691890548040146, |
|
"loss": 100.6873, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.08130081300813, |
|
"grad_norm": 5801.7314453125, |
|
"learning_rate": 0.00018665178450695606, |
|
"loss": 51.0893, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.097560975609756, |
|
"grad_norm": 1768.61083984375, |
|
"learning_rate": 0.00018638215855250263, |
|
"loss": 46.9602, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.113821138211382, |
|
"grad_norm": 74955.7421875, |
|
"learning_rate": 0.00018611003541154766, |
|
"loss": 69.618, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.130081300813008, |
|
"grad_norm": 16715.201171875, |
|
"learning_rate": 0.00018583542295078775, |
|
"loss": 76.9604, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.1463414634146343, |
|
"grad_norm": 490.9708557128906, |
|
"learning_rate": 0.0001855583291088822, |
|
"loss": 61.4616, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.16260162601626, |
|
"grad_norm": 2168.93896484375, |
|
"learning_rate": 0.00018527876189622372, |
|
"loss": 69.4417, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.178861788617886, |
|
"grad_norm": 1728.7271728515625, |
|
"learning_rate": 0.00018499672939470646, |
|
"loss": 41.3895, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.1951219512195124, |
|
"grad_norm": 13797.6259765625, |
|
"learning_rate": 0.00018471223975749266, |
|
"loss": 86.5364, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.2113821138211383, |
|
"grad_norm": 1238.1878662109375, |
|
"learning_rate": 0.000184425301208777, |
|
"loss": 60.4841, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.227642276422764, |
|
"grad_norm": 1721.18505859375, |
|
"learning_rate": 0.00018413592204354857, |
|
"loss": 63.7924, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.2439024390243905, |
|
"grad_norm": 1503.65234375, |
|
"learning_rate": 0.00018384411062735142, |
|
"loss": 72.9356, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.2601626016260163, |
|
"grad_norm": 2268.296630859375, |
|
"learning_rate": 0.00018354987539604244, |
|
"loss": 64.837, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.2764227642276422, |
|
"grad_norm": 770.135986328125, |
|
"learning_rate": 0.0001832532248555476, |
|
"loss": 46.5948, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.292682926829268, |
|
"grad_norm": 2809.25146484375, |
|
"learning_rate": 0.00018295416758161607, |
|
"loss": 72.0357, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.3089430894308944, |
|
"grad_norm": 5348.63330078125, |
|
"learning_rate": 0.00018265271221957235, |
|
"loss": 64.2022, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.3252032520325203, |
|
"grad_norm": 1522.7744140625, |
|
"learning_rate": 0.00018234886748406623, |
|
"loss": 87.9972, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.341463414634146, |
|
"grad_norm": 2168.956298828125, |
|
"learning_rate": 0.00018204264215882093, |
|
"loss": 77.3112, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.3577235772357725, |
|
"grad_norm": 1201.1219482421875, |
|
"learning_rate": 0.00018173404509637912, |
|
"loss": 77.9051, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.3739837398373984, |
|
"grad_norm": 1378.28515625, |
|
"learning_rate": 0.00018142308521784716, |
|
"loss": 113.3623, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.3902439024390243, |
|
"grad_norm": 3121.484375, |
|
"learning_rate": 0.00018110977151263702, |
|
"loss": 68.1337, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.40650406504065, |
|
"grad_norm": 4526.3203125, |
|
"learning_rate": 0.00018079411303820647, |
|
"loss": 76.719, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.4227642276422765, |
|
"grad_norm": 1512.4857177734375, |
|
"learning_rate": 0.00018047611891979732, |
|
"loss": 53.3857, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.4390243902439024, |
|
"grad_norm": 775.2145385742188, |
|
"learning_rate": 0.00018015579835017147, |
|
"loss": 59.4552, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.4552845528455283, |
|
"grad_norm": 1543.6497802734375, |
|
"learning_rate": 0.00017983316058934533, |
|
"loss": 79.715, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.4715447154471546, |
|
"grad_norm": 3052.429931640625, |
|
"learning_rate": 0.00017950821496432202, |
|
"loss": 68.2702, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.4878048780487805, |
|
"grad_norm": 1861.0294189453125, |
|
"learning_rate": 0.00017918097086882167, |
|
"loss": 70.8437, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.5040650406504064, |
|
"grad_norm": 1316.6455078125, |
|
"learning_rate": 0.00017885143776301017, |
|
"loss": 48.8773, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.5203252032520327, |
|
"grad_norm": 1434.713623046875, |
|
"learning_rate": 0.0001785196251732252, |
|
"loss": 50.5964, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.5365853658536586, |
|
"grad_norm": 2314.07373046875, |
|
"learning_rate": 0.0001781855426917013, |
|
"loss": 49.6357, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.5528455284552845, |
|
"grad_norm": 27705.951171875, |
|
"learning_rate": 0.00017784919997629236, |
|
"loss": 60.1384, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.569105691056911, |
|
"grad_norm": 100750.6953125, |
|
"learning_rate": 0.00017751060675019235, |
|
"loss": 78.1081, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.5853658536585367, |
|
"grad_norm": 5099.37548828125, |
|
"learning_rate": 0.00017716977280165445, |
|
"loss": 107.401, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.6016260162601625, |
|
"grad_norm": 16017.0224609375, |
|
"learning_rate": 0.00017682670798370792, |
|
"loss": 109.425, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.617886178861789, |
|
"grad_norm": 1565.2376708984375, |
|
"learning_rate": 0.00017648142221387325, |
|
"loss": 66.7137, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.6341463414634148, |
|
"grad_norm": 1883.359619140625, |
|
"learning_rate": 0.00017613392547387565, |
|
"loss": 63.5428, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.6504065040650406, |
|
"grad_norm": 4678.5400390625, |
|
"learning_rate": 0.00017578422780935624, |
|
"loss": 62.324, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 1467.29150390625, |
|
"learning_rate": 0.00017543233932958185, |
|
"loss": 42.7399, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.682926829268293, |
|
"grad_norm": 17443.28125, |
|
"learning_rate": 0.00017507827020715267, |
|
"loss": 76.8691, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.6991869918699187, |
|
"grad_norm": 1430.255615234375, |
|
"learning_rate": 0.00017472203067770816, |
|
"loss": 45.8614, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.7154471544715446, |
|
"grad_norm": 973.7998657226562, |
|
"learning_rate": 0.0001743636310396312, |
|
"loss": 36.7464, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.7317073170731705, |
|
"grad_norm": 2293.49658203125, |
|
"learning_rate": 0.00017400308165375043, |
|
"loss": 104.4038, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.747967479674797, |
|
"grad_norm": 1044.43115234375, |
|
"learning_rate": 0.00017364039294304063, |
|
"loss": 61.9649, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.7642276422764227, |
|
"grad_norm": 2085.281982421875, |
|
"learning_rate": 0.00017327557539232138, |
|
"loss": 51.97, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.7804878048780486, |
|
"grad_norm": 1864.0758056640625, |
|
"learning_rate": 0.00017290863954795414, |
|
"loss": 56.1968, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.796747967479675, |
|
"grad_norm": 5055.72216796875, |
|
"learning_rate": 0.00017253959601753715, |
|
"loss": 49.4941, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.813008130081301, |
|
"grad_norm": 2442.3779296875, |
|
"learning_rate": 0.00017216845546959904, |
|
"loss": 85.7186, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.8292682926829267, |
|
"grad_norm": 1286.6806640625, |
|
"learning_rate": 0.00017179522863329004, |
|
"loss": 57.1273, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.845528455284553, |
|
"grad_norm": 1548.7122802734375, |
|
"learning_rate": 0.0001714199262980722, |
|
"loss": 50.7149, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.861788617886179, |
|
"grad_norm": 1237.375732421875, |
|
"learning_rate": 0.00017104255931340732, |
|
"loss": 80.6716, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.8780487804878048, |
|
"grad_norm": 271203.3125, |
|
"learning_rate": 0.00017066313858844317, |
|
"loss": 79.4793, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.894308943089431, |
|
"grad_norm": 2990.47998046875, |
|
"learning_rate": 0.00017028167509169846, |
|
"loss": 63.7313, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.910569105691057, |
|
"grad_norm": 2197.031494140625, |
|
"learning_rate": 0.00016989817985074533, |
|
"loss": 66.6744, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.926829268292683, |
|
"grad_norm": 2398.322509765625, |
|
"learning_rate": 0.00016951266395189097, |
|
"loss": 119.2331, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.943089430894309, |
|
"grad_norm": 1132.4508056640625, |
|
"learning_rate": 0.00016912513853985686, |
|
"loss": 66.5857, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.959349593495935, |
|
"grad_norm": 1172.097412109375, |
|
"learning_rate": 0.00016873561481745667, |
|
"loss": 69.8449, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.975609756097561, |
|
"grad_norm": 1260.872314453125, |
|
"learning_rate": 0.0001683441040452724, |
|
"loss": 65.4089, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.991869918699187, |
|
"grad_norm": 3771.443603515625, |
|
"learning_rate": 0.00016795061754132896, |
|
"loss": 59.9783, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 3.008130081300813, |
|
"grad_norm": 44377.31640625, |
|
"learning_rate": 0.00016755516668076674, |
|
"loss": 77.3272, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.024390243902439, |
|
"grad_norm": 1505.83984375, |
|
"learning_rate": 0.00016715776289551296, |
|
"loss": 53.3784, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.040650406504065, |
|
"grad_norm": 615.7579956054688, |
|
"learning_rate": 0.0001667584176739512, |
|
"loss": 50.9411, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.0569105691056913, |
|
"grad_norm": 38362.62890625, |
|
"learning_rate": 0.00016635714256058915, |
|
"loss": 118.019, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 3.073170731707317, |
|
"grad_norm": 1028.602783203125, |
|
"learning_rate": 0.00016595394915572506, |
|
"loss": 69.6284, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 3.089430894308943, |
|
"grad_norm": 5944.29248046875, |
|
"learning_rate": 0.00016554884911511213, |
|
"loss": 64.6018, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.105691056910569, |
|
"grad_norm": 2787.141845703125, |
|
"learning_rate": 0.00016514185414962182, |
|
"loss": 68.6644, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 3.1219512195121952, |
|
"grad_norm": 2354.9130859375, |
|
"learning_rate": 0.0001647329760249052, |
|
"loss": 81.7822, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.138211382113821, |
|
"grad_norm": 2922.60009765625, |
|
"learning_rate": 0.00016432222656105277, |
|
"loss": 113.863, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 3.154471544715447, |
|
"grad_norm": 4188.85107421875, |
|
"learning_rate": 0.0001639096176322528, |
|
"loss": 79.855, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.1707317073170733, |
|
"grad_norm": 1911.2069091796875, |
|
"learning_rate": 0.0001634951611664482, |
|
"loss": 69.1627, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.186991869918699, |
|
"grad_norm": 1192.2657470703125, |
|
"learning_rate": 0.0001630788691449914, |
|
"loss": 55.1678, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.203252032520325, |
|
"grad_norm": 10476.7724609375, |
|
"learning_rate": 0.00016266075360229823, |
|
"loss": 88.3594, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.2195121951219514, |
|
"grad_norm": 746.9041748046875, |
|
"learning_rate": 0.00016224082662550003, |
|
"loss": 109.0398, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.2357723577235773, |
|
"grad_norm": 2032.73779296875, |
|
"learning_rate": 0.000161819100354094, |
|
"loss": 44.7227, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.252032520325203, |
|
"grad_norm": 1000.6553955078125, |
|
"learning_rate": 0.0001613955869795925, |
|
"loss": 73.6318, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.2682926829268295, |
|
"grad_norm": 877.0646362304688, |
|
"learning_rate": 0.00016097029874517053, |
|
"loss": 65.1961, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.2845528455284554, |
|
"grad_norm": 20667.6640625, |
|
"learning_rate": 0.0001605432479453117, |
|
"loss": 131.7637, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.3008130081300813, |
|
"grad_norm": 6932.1630859375, |
|
"learning_rate": 0.0001601144469254531, |
|
"loss": 63.2276, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.317073170731707, |
|
"grad_norm": 2701.05029296875, |
|
"learning_rate": 0.00015968390808162797, |
|
"loss": 93.1463, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 2700.706298828125, |
|
"learning_rate": 0.0001592516438601077, |
|
"loss": 63.6073, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.3495934959349594, |
|
"grad_norm": 9397.724609375, |
|
"learning_rate": 0.00015881766675704203, |
|
"loss": 74.2051, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.3658536585365852, |
|
"grad_norm": 919.5447998046875, |
|
"learning_rate": 0.00015838198931809747, |
|
"loss": 55.599, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.3821138211382116, |
|
"grad_norm": 4705.94287109375, |
|
"learning_rate": 0.00015794462413809503, |
|
"loss": 54.821, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.3983739837398375, |
|
"grad_norm": 80140.5, |
|
"learning_rate": 0.00015750558386064584, |
|
"loss": 132.3792, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.4146341463414633, |
|
"grad_norm": 17313.400390625, |
|
"learning_rate": 0.0001570648811777858, |
|
"loss": 73.4562, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.430894308943089, |
|
"grad_norm": 62464.19140625, |
|
"learning_rate": 0.00015662252882960855, |
|
"loss": 123.1144, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.4471544715447155, |
|
"grad_norm": 10362.189453125, |
|
"learning_rate": 0.00015617853960389724, |
|
"loss": 60.7324, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.4634146341463414, |
|
"grad_norm": 8119.03662109375, |
|
"learning_rate": 0.00015573292633575488, |
|
"loss": 47.9465, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.4796747967479673, |
|
"grad_norm": 65353.2890625, |
|
"learning_rate": 0.00015528570190723325, |
|
"loss": 38.784, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.4959349593495936, |
|
"grad_norm": 946.7526245117188, |
|
"learning_rate": 0.00015483687924696047, |
|
"loss": 45.439, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.5121951219512195, |
|
"grad_norm": 8941.34375, |
|
"learning_rate": 0.0001543864713297673, |
|
"loss": 62.3894, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.5284552845528454, |
|
"grad_norm": 169778.421875, |
|
"learning_rate": 0.00015393449117631205, |
|
"loss": 71.317, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 3.5447154471544717, |
|
"grad_norm": 1309.4539794921875, |
|
"learning_rate": 0.0001534809518527042, |
|
"loss": 59.1676, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.5609756097560976, |
|
"grad_norm": 159682.328125, |
|
"learning_rate": 0.0001530258664701266, |
|
"loss": 74.9109, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 3.5772357723577235, |
|
"grad_norm": 5231.26611328125, |
|
"learning_rate": 0.00015256924818445652, |
|
"loss": 50.8158, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.59349593495935, |
|
"grad_norm": 840.7651977539062, |
|
"learning_rate": 0.0001521111101958852, |
|
"loss": 53.3685, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 3.6097560975609757, |
|
"grad_norm": 1039.3839111328125, |
|
"learning_rate": 0.00015165146574853651, |
|
"loss": 51.3367, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.6260162601626016, |
|
"grad_norm": 2042.122802734375, |
|
"learning_rate": 0.00015119032813008384, |
|
"loss": 63.4835, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 3.642276422764228, |
|
"grad_norm": 1014.0968017578125, |
|
"learning_rate": 0.00015072771067136602, |
|
"loss": 121.3831, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.658536585365854, |
|
"grad_norm": 2085.046875, |
|
"learning_rate": 0.00015026362674600197, |
|
"loss": 86.4089, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.6747967479674797, |
|
"grad_norm": 1501.3868408203125, |
|
"learning_rate": 0.00014979808977000423, |
|
"loss": 87.4238, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 3.6910569105691056, |
|
"grad_norm": 3143.670166015625, |
|
"learning_rate": 0.0001493311132013908, |
|
"loss": 47.4117, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 3.7073170731707314, |
|
"grad_norm": 3601.27197265625, |
|
"learning_rate": 0.00014886271053979642, |
|
"loss": 47.0386, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.7235772357723578, |
|
"grad_norm": 1050.021484375, |
|
"learning_rate": 0.00014839289532608208, |
|
"loss": 50.3757, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 3.7398373983739837, |
|
"grad_norm": 1158.14453125, |
|
"learning_rate": 0.0001479216811419437, |
|
"loss": 53.1059, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.7560975609756095, |
|
"grad_norm": 1679.3118896484375, |
|
"learning_rate": 0.00014744908160951948, |
|
"loss": 81.2242, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 3.772357723577236, |
|
"grad_norm": 1483.0025634765625, |
|
"learning_rate": 0.00014697511039099602, |
|
"loss": 65.0123, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 3.7886178861788617, |
|
"grad_norm": 1206.0103759765625, |
|
"learning_rate": 0.00014649978118821356, |
|
"loss": 112.2168, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 3.8048780487804876, |
|
"grad_norm": 6336.48828125, |
|
"learning_rate": 0.00014602310774226957, |
|
"loss": 98.5093, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 3.821138211382114, |
|
"grad_norm": 659.5859985351562, |
|
"learning_rate": 0.00014554510383312189, |
|
"loss": 65.6266, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.83739837398374, |
|
"grad_norm": 1136.7991943359375, |
|
"learning_rate": 0.00014506578327919, |
|
"loss": 51.189, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 3.8536585365853657, |
|
"grad_norm": 6465.4130859375, |
|
"learning_rate": 0.00014458515993695585, |
|
"loss": 69.188, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 3.869918699186992, |
|
"grad_norm": 5106.58642578125, |
|
"learning_rate": 0.00014410324770056313, |
|
"loss": 96.6794, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 3.886178861788618, |
|
"grad_norm": 3519.845703125, |
|
"learning_rate": 0.00014362006050141563, |
|
"loss": 55.2195, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 3.902439024390244, |
|
"grad_norm": 20824.455078125, |
|
"learning_rate": 0.00014313561230777452, |
|
"loss": 47.6591, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.91869918699187, |
|
"grad_norm": 2973.600830078125, |
|
"learning_rate": 0.00014264991712435452, |
|
"loss": 66.8287, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 3.934959349593496, |
|
"grad_norm": 1502.51025390625, |
|
"learning_rate": 0.00014216298899191916, |
|
"loss": 47.0916, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 3.951219512195122, |
|
"grad_norm": 13010.16796875, |
|
"learning_rate": 0.0001416748419868747, |
|
"loss": 61.0954, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 3.9674796747967482, |
|
"grad_norm": 953.6785278320312, |
|
"learning_rate": 0.0001411854902208633, |
|
"loss": 47.334, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 3.983739837398374, |
|
"grad_norm": 2903.397216796875, |
|
"learning_rate": 0.00014069494784035505, |
|
"loss": 67.0245, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1550.0595703125, |
|
"learning_rate": 0.0001402032290262391, |
|
"loss": 51.0681, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 4.016260162601626, |
|
"grad_norm": 58333.4921875, |
|
"learning_rate": 0.00013971034799341355, |
|
"loss": 62.1808, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 4.032520325203252, |
|
"grad_norm": 1227.8946533203125, |
|
"learning_rate": 0.0001392163189903747, |
|
"loss": 72.5005, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 4.048780487804878, |
|
"grad_norm": 2188.923828125, |
|
"learning_rate": 0.00013872115629880497, |
|
"loss": 47.0166, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 4.065040650406504, |
|
"grad_norm": 1214.519775390625, |
|
"learning_rate": 0.0001382248742331602, |
|
"loss": 40.6225, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.08130081300813, |
|
"grad_norm": 952.546875, |
|
"learning_rate": 0.0001377274871402556, |
|
"loss": 43.3264, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 4.097560975609756, |
|
"grad_norm": 753.4329833984375, |
|
"learning_rate": 0.00013722900939885132, |
|
"loss": 51.3909, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 4.1138211382113825, |
|
"grad_norm": 1024.9317626953125, |
|
"learning_rate": 0.0001367294554192366, |
|
"loss": 42.0499, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 4.130081300813008, |
|
"grad_norm": 546.87841796875, |
|
"learning_rate": 0.00013622883964281316, |
|
"loss": 36.1083, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 4.146341463414634, |
|
"grad_norm": 893.5374755859375, |
|
"learning_rate": 0.00013572717654167777, |
|
"loss": 39.7196, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 4.16260162601626, |
|
"grad_norm": 1298.6865234375, |
|
"learning_rate": 0.00013522448061820393, |
|
"loss": 43.8941, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 4.178861788617886, |
|
"grad_norm": 1751.4395751953125, |
|
"learning_rate": 0.00013472076640462248, |
|
"loss": 48.5067, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 4.195121951219512, |
|
"grad_norm": 4070.478759765625, |
|
"learning_rate": 0.00013421604846260173, |
|
"loss": 69.5999, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 4.211382113821138, |
|
"grad_norm": 1715.4664306640625, |
|
"learning_rate": 0.0001337103413828263, |
|
"loss": 55.5755, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 4.227642276422764, |
|
"grad_norm": 1144.9033203125, |
|
"learning_rate": 0.00013320365978457534, |
|
"loss": 44.6062, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.2439024390243905, |
|
"grad_norm": 1374.0616455078125, |
|
"learning_rate": 0.00013269601831530003, |
|
"loss": 100.0019, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 4.260162601626016, |
|
"grad_norm": 649.107666015625, |
|
"learning_rate": 0.0001321874316502, |
|
"loss": 45.9766, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 4.276422764227642, |
|
"grad_norm": 1265.823486328125, |
|
"learning_rate": 0.00013167791449179928, |
|
"loss": 36.6327, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 4.2926829268292686, |
|
"grad_norm": 1065.16943359375, |
|
"learning_rate": 0.00013116748156952098, |
|
"loss": 36.6221, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 4.308943089430894, |
|
"grad_norm": 7990.9853515625, |
|
"learning_rate": 0.00013065614763926184, |
|
"loss": 47.2748, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 4.32520325203252, |
|
"grad_norm": 3891.1884765625, |
|
"learning_rate": 0.00013014392748296528, |
|
"loss": 60.2811, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 4.341463414634147, |
|
"grad_norm": 1250.55859375, |
|
"learning_rate": 0.00012963083590819443, |
|
"loss": 59.3533, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 4.357723577235772, |
|
"grad_norm": 452.96368408203125, |
|
"learning_rate": 0.00012911688774770377, |
|
"loss": 39.7551, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 4.373983739837398, |
|
"grad_norm": 1382.8927001953125, |
|
"learning_rate": 0.0001286020978590106, |
|
"loss": 56.9612, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 4.390243902439025, |
|
"grad_norm": 2779.33642578125, |
|
"learning_rate": 0.0001280864811239652, |
|
"loss": 76.6694, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.40650406504065, |
|
"grad_norm": 1720.7236328125, |
|
"learning_rate": 0.00012757005244832113, |
|
"loss": 54.5705, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 4.4227642276422765, |
|
"grad_norm": 530.7537231445312, |
|
"learning_rate": 0.00012705282676130368, |
|
"loss": 43.2596, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.439024390243903, |
|
"grad_norm": 1741.5948486328125, |
|
"learning_rate": 0.00012653481901517876, |
|
"loss": 44.5357, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 4.455284552845528, |
|
"grad_norm": 545.766357421875, |
|
"learning_rate": 0.00012601604418482052, |
|
"loss": 64.0609, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.471544715447155, |
|
"grad_norm": 760.1073608398438, |
|
"learning_rate": 0.00012549651726727841, |
|
"loss": 33.9295, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.487804878048781, |
|
"grad_norm": 3076.673583984375, |
|
"learning_rate": 0.0001249762532813437, |
|
"loss": 53.2542, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 4.504065040650406, |
|
"grad_norm": 613.498779296875, |
|
"learning_rate": 0.0001244552672671152, |
|
"loss": 42.9754, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 4.520325203252033, |
|
"grad_norm": 633.474365234375, |
|
"learning_rate": 0.0001239335742855645, |
|
"loss": 79.9076, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 4.536585365853659, |
|
"grad_norm": 534.7109375, |
|
"learning_rate": 0.00012341118941810086, |
|
"loss": 56.3449, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 4.5528455284552845, |
|
"grad_norm": 988.2083740234375, |
|
"learning_rate": 0.00012288812776613467, |
|
"loss": 60.076, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.569105691056911, |
|
"grad_norm": 987.4862670898438, |
|
"learning_rate": 0.00012236440445064146, |
|
"loss": 44.6687, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 4.585365853658536, |
|
"grad_norm": 1020.8764038085938, |
|
"learning_rate": 0.00012184003461172437, |
|
"loss": 54.9522, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 4.6016260162601625, |
|
"grad_norm": 861.468505859375, |
|
"learning_rate": 0.00012131503340817663, |
|
"loss": 72.5806, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 4.617886178861789, |
|
"grad_norm": 1153.2725830078125, |
|
"learning_rate": 0.00012078941601704343, |
|
"loss": 44.8851, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 4.634146341463414, |
|
"grad_norm": 7982.6865234375, |
|
"learning_rate": 0.00012026319763318301, |
|
"loss": 49.9482, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 4.650406504065041, |
|
"grad_norm": 1476.1536865234375, |
|
"learning_rate": 0.00011973639346882746, |
|
"loss": 47.223, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 4.666666666666667, |
|
"grad_norm": 1169.1434326171875, |
|
"learning_rate": 0.00011920901875314295, |
|
"loss": 51.8643, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 4.682926829268292, |
|
"grad_norm": 1330.784912109375, |
|
"learning_rate": 0.00011868108873178949, |
|
"loss": 43.6427, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 4.699186991869919, |
|
"grad_norm": 631.0576171875, |
|
"learning_rate": 0.00011815261866648026, |
|
"loss": 56.523, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 4.715447154471545, |
|
"grad_norm": 1804.2171630859375, |
|
"learning_rate": 0.00011762362383454024, |
|
"loss": 49.6038, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 4.7317073170731705, |
|
"grad_norm": 2007.8486328125, |
|
"learning_rate": 0.00011709411952846479, |
|
"loss": 56.3543, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 4.747967479674797, |
|
"grad_norm": 1846.902099609375, |
|
"learning_rate": 0.00011656412105547733, |
|
"loss": 40.9638, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 4.764227642276423, |
|
"grad_norm": 854.6354370117188, |
|
"learning_rate": 0.00011603364373708702, |
|
"loss": 47.7196, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 4.780487804878049, |
|
"grad_norm": 2663.093017578125, |
|
"learning_rate": 0.00011550270290864582, |
|
"loss": 88.7795, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 4.796747967479675, |
|
"grad_norm": 2370.38720703125, |
|
"learning_rate": 0.00011497131391890498, |
|
"loss": 65.2372, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 4.8130081300813, |
|
"grad_norm": 1494.7568359375, |
|
"learning_rate": 0.00011443949212957154, |
|
"loss": 68.4685, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 4.829268292682927, |
|
"grad_norm": 1287.447021484375, |
|
"learning_rate": 0.00011390725291486419, |
|
"loss": 51.913, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 4.845528455284553, |
|
"grad_norm": 1271.5274658203125, |
|
"learning_rate": 0.00011337461166106871, |
|
"loss": 53.7021, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 4.861788617886178, |
|
"grad_norm": 1231.7939453125, |
|
"learning_rate": 0.00011284158376609333, |
|
"loss": 31.6516, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 4.878048780487805, |
|
"grad_norm": 1916.57421875, |
|
"learning_rate": 0.00011230818463902358, |
|
"loss": 69.1733, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.894308943089431, |
|
"grad_norm": 2691.4208984375, |
|
"learning_rate": 0.00011177442969967668, |
|
"loss": 55.0878, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 4.9105691056910565, |
|
"grad_norm": 1314.462646484375, |
|
"learning_rate": 0.00011124033437815593, |
|
"loss": 40.0013, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 4.926829268292683, |
|
"grad_norm": 1857.048095703125, |
|
"learning_rate": 0.00011070591411440459, |
|
"loss": 46.5445, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 4.943089430894309, |
|
"grad_norm": 1580.3558349609375, |
|
"learning_rate": 0.00011017118435775957, |
|
"loss": 38.4451, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 4.959349593495935, |
|
"grad_norm": 1501.5589599609375, |
|
"learning_rate": 0.00010963616056650476, |
|
"loss": 34.3078, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 4.975609756097561, |
|
"grad_norm": 3925.81591796875, |
|
"learning_rate": 0.00010910085820742419, |
|
"loss": 58.2388, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 4.991869918699187, |
|
"grad_norm": 828.7344360351562, |
|
"learning_rate": 0.00010856529275535487, |
|
"loss": 77.3652, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 5.008130081300813, |
|
"grad_norm": 850.0521240234375, |
|
"learning_rate": 0.00010802947969273946, |
|
"loss": 32.5409, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 5.024390243902439, |
|
"grad_norm": 315.0628967285156, |
|
"learning_rate": 0.00010749343450917873, |
|
"loss": 49.1381, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 5.040650406504065, |
|
"grad_norm": 805.5790405273438, |
|
"learning_rate": 0.0001069571727009837, |
|
"loss": 44.4946, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 5.056910569105691, |
|
"grad_norm": 2954.944091796875, |
|
"learning_rate": 0.0001064207097707277, |
|
"loss": 56.0899, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 5.073170731707317, |
|
"grad_norm": 1296.76025390625, |
|
"learning_rate": 0.00010588406122679825, |
|
"loss": 32.3572, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 5.0894308943089435, |
|
"grad_norm": 682.7062377929688, |
|
"learning_rate": 0.00010534724258294868, |
|
"loss": 41.241, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 5.105691056910569, |
|
"grad_norm": 586.6185302734375, |
|
"learning_rate": 0.00010481026935784967, |
|
"loss": 46.9862, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 5.121951219512195, |
|
"grad_norm": 494.31768798828125, |
|
"learning_rate": 0.0001042731570746406, |
|
"loss": 39.867, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 5.138211382113822, |
|
"grad_norm": 1095.9088134765625, |
|
"learning_rate": 0.00010373592126048093, |
|
"loss": 33.0041, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 5.154471544715447, |
|
"grad_norm": 1172.2149658203125, |
|
"learning_rate": 0.00010319857744610106, |
|
"loss": 84.7379, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 5.170731707317073, |
|
"grad_norm": 7211.0283203125, |
|
"learning_rate": 0.00010266114116535362, |
|
"loss": 48.8282, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 5.186991869918699, |
|
"grad_norm": 1418.6943359375, |
|
"learning_rate": 0.00010212362795476432, |
|
"loss": 46.3707, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 5.203252032520325, |
|
"grad_norm": 3661.55126953125, |
|
"learning_rate": 0.0001015860533530828, |
|
"loss": 93.9867, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 5.219512195121951, |
|
"grad_norm": 1076.226806640625, |
|
"learning_rate": 0.00010104843290083341, |
|
"loss": 68.2097, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 5.235772357723577, |
|
"grad_norm": 4902.42138671875, |
|
"learning_rate": 0.00010051078213986597, |
|
"loss": 36.9465, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 5.252032520325203, |
|
"grad_norm": 2610.93212890625, |
|
"learning_rate": 9.997311661290648e-05, |
|
"loss": 56.646, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 5.2682926829268295, |
|
"grad_norm": 3272.592529296875, |
|
"learning_rate": 9.943545186310787e-05, |
|
"loss": 42.065, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 5.284552845528455, |
|
"grad_norm": 1224.6219482421875, |
|
"learning_rate": 9.889780343360049e-05, |
|
"loss": 60.0324, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 5.300813008130081, |
|
"grad_norm": 1191.6717529296875, |
|
"learning_rate": 9.836018686704298e-05, |
|
"loss": 49.1736, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 5.317073170731708, |
|
"grad_norm": 1531.7381591796875, |
|
"learning_rate": 9.782261770517289e-05, |
|
"loss": 29.3415, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 5.333333333333333, |
|
"grad_norm": 1613.154296875, |
|
"learning_rate": 9.72851114883572e-05, |
|
"loss": 71.2164, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 5.349593495934959, |
|
"grad_norm": 1089.3868408203125, |
|
"learning_rate": 9.674768375514347e-05, |
|
"loss": 41.1068, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 5.365853658536586, |
|
"grad_norm": 425.6622314453125, |
|
"learning_rate": 9.621035004181022e-05, |
|
"loss": 29.7313, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 5.382113821138211, |
|
"grad_norm": 4809.2626953125, |
|
"learning_rate": 9.56731258819181e-05, |
|
"loss": 59.21, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 5.3983739837398375, |
|
"grad_norm": 768.4491577148438, |
|
"learning_rate": 9.51360268058607e-05, |
|
"loss": 65.3515, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 5.414634146341464, |
|
"grad_norm": 1334.3365478515625, |
|
"learning_rate": 9.459906834041558e-05, |
|
"loss": 44.464, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 5.430894308943089, |
|
"grad_norm": 1523.654296875, |
|
"learning_rate": 9.406226600829545e-05, |
|
"loss": 61.8839, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 5.4471544715447155, |
|
"grad_norm": 1562.5716552734375, |
|
"learning_rate": 9.352563532769949e-05, |
|
"loss": 51.7122, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 5.463414634146342, |
|
"grad_norm": 1880.090087890625, |
|
"learning_rate": 9.298919181186458e-05, |
|
"loss": 41.961, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 5.479674796747967, |
|
"grad_norm": 1722.7073974609375, |
|
"learning_rate": 9.245295096861698e-05, |
|
"loss": 46.5965, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 5.495934959349594, |
|
"grad_norm": 925.80126953125, |
|
"learning_rate": 9.191692829992401e-05, |
|
"loss": 48.4384, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 5.512195121951219, |
|
"grad_norm": 1489.31982421875, |
|
"learning_rate": 9.138113930144578e-05, |
|
"loss": 59.3866, |
|
"step": 3390 |
|
}, |
|
{
"epoch": 5.528455284552845,
"grad_norm": 707.712890625,
"learning_rate": 9.084559946208739e-05,
"loss": 42.5858,
"step": 3400
},
{
"epoch": 5.544715447154472,
"grad_norm": 2299.88720703125,
"learning_rate": 9.031032426355106e-05,
"loss": 36.6626,
"step": 3410
},
{
"epoch": 5.560975609756097,
"grad_norm": 4950.97998046875,
"learning_rate": 8.977532917988871e-05,
"loss": 37.762,
"step": 3420
},
{
"epoch": 5.5772357723577235,
"grad_norm": 891.8377075195312,
"learning_rate": 8.924062967705443e-05,
"loss": 50.5158,
"step": 3430
},
{
"epoch": 5.59349593495935,
"grad_norm": 996.9815673828125,
"learning_rate": 8.870624121245748e-05,
"loss": 56.7966,
"step": 3440
},
{
"epoch": 5.609756097560975,
"grad_norm": 814.5260009765625,
"learning_rate": 8.817217923451554e-05,
"loss": 61.8741,
"step": 3450
},
{
"epoch": 5.626016260162602,
"grad_norm": 1282.3272705078125,
"learning_rate": 8.763845918220793e-05,
"loss": 28.1619,
"step": 3460
},
{
"epoch": 5.642276422764228,
"grad_norm": 1114.01513671875,
"learning_rate": 8.71050964846294e-05,
"loss": 34.5723,
"step": 3470
},
{
"epoch": 5.658536585365853,
"grad_norm": 768.8634033203125,
"learning_rate": 8.657210656054413e-05,
"loss": 40.1524,
"step": 3480
},
{
"epoch": 5.67479674796748,
"grad_norm": 640.5523681640625,
"learning_rate": 8.60395048179399e-05,
"loss": 59.3767,
"step": 3490
},
{
"epoch": 5.691056910569106,
"grad_norm": 976.6678466796875,
"learning_rate": 8.550730665358266e-05,
"loss": 46.2076,
"step": 3500
},
{
"epoch": 5.7073170731707314,
"grad_norm": 904.607666015625,
"learning_rate": 8.497552745257157e-05,
"loss": 44.8267,
"step": 3510
},
{
"epoch": 5.723577235772358,
"grad_norm": 18157.951171875,
"learning_rate": 8.444418258789418e-05,
"loss": 46.1126,
"step": 3520
},
{
"epoch": 5.739837398373984,
"grad_norm": 702.4590454101562,
"learning_rate": 8.391328741998187e-05,
"loss": 62.335,
"step": 3530
},
{
"epoch": 5.7560975609756095,
"grad_norm": 906.1786499023438,
"learning_rate": 8.338285729626595e-05,
"loss": 65.6418,
"step": 3540
},
{
"epoch": 5.772357723577236,
"grad_norm": 1011.940185546875,
"learning_rate": 8.285290755073405e-05,
"loss": 41.4294,
"step": 3550
},
{
"epoch": 5.788617886178862,
"grad_norm": 2783.18798828125,
"learning_rate": 8.23234535034866e-05,
"loss": 73.9544,
"step": 3560
},
{
"epoch": 5.804878048780488,
"grad_norm": 1077.9619140625,
"learning_rate": 8.179451046029424e-05,
"loss": 36.2339,
"step": 3570
},
{
"epoch": 5.821138211382114,
"grad_norm": 1024.14453125,
"learning_rate": 8.12660937121551e-05,
"loss": 40.021,
"step": 3580
},
{
"epoch": 5.83739837398374,
"grad_norm": 1014.1956787109375,
"learning_rate": 8.073821853485288e-05,
"loss": 73.2346,
"step": 3590
},
{
"epoch": 5.853658536585366,
"grad_norm": 869.21875,
"learning_rate": 8.021090018851526e-05,
"loss": 34.6341,
"step": 3600
},
{
"epoch": 5.869918699186992,
"grad_norm": 1306.168212890625,
"learning_rate": 7.968415391717271e-05,
"loss": 71.121,
"step": 3610
},
{
"epoch": 5.886178861788618,
"grad_norm": 1111.87890625,
"learning_rate": 7.915799494831775e-05,
"loss": 33.9404,
"step": 3620
},
{
"epoch": 5.902439024390244,
"grad_norm": 759.7614135742188,
"learning_rate": 7.863243849246494e-05,
"loss": 50.714,
"step": 3630
},
{
"epoch": 5.91869918699187,
"grad_norm": 5193.80419921875,
"learning_rate": 7.810749974271099e-05,
"loss": 59.9144,
"step": 3640
},
{
"epoch": 5.934959349593496,
"grad_norm": 1484.0467529296875,
"learning_rate": 7.758319387429553e-05,
"loss": 58.3316,
"step": 3650
},
{
"epoch": 5.951219512195122,
"grad_norm": 1309.0003662109375,
"learning_rate": 7.705953604416254e-05,
"loss": 48.9651,
"step": 3660
},
{
"epoch": 5.967479674796748,
"grad_norm": 754.5973510742188,
"learning_rate": 7.653654139052214e-05,
"loss": 29.4624,
"step": 3670
},
{
"epoch": 5.983739837398374,
"grad_norm": 637.7557983398438,
"learning_rate": 7.60142250324129e-05,
"loss": 43.2339,
"step": 3680
},
{
"epoch": 6.0,
"grad_norm": 1177.0924072265625,
"learning_rate": 7.549260206926486e-05,
"loss": 47.2867,
"step": 3690
},
{
"epoch": 6.016260162601626,
"grad_norm": 1924.6392822265625,
"learning_rate": 7.4971687580463e-05,
"loss": 38.3521,
"step": 3700
},
{
"epoch": 6.032520325203252,
"grad_norm": 916.7091674804688,
"learning_rate": 7.445149662491126e-05,
"loss": 49.7392,
"step": 3710
},
{
"epoch": 6.048780487804878,
"grad_norm": 967.6969604492188,
"learning_rate": 7.393204424059725e-05,
"loss": 38.2029,
"step": 3720
},
{
"epoch": 6.065040650406504,
"grad_norm": 840.0963745117188,
"learning_rate": 7.341334544415761e-05,
"loss": 77.827,
"step": 3730
},
{
"epoch": 6.08130081300813,
"grad_norm": 1400.66064453125,
"learning_rate": 7.289541523044376e-05,
"loss": 66.4577,
"step": 3740
},
{
"epoch": 6.097560975609756,
"grad_norm": 767.639892578125,
"learning_rate": 7.237826857208847e-05,
"loss": 30.1595,
"step": 3750
},
{
"epoch": 6.1138211382113825,
"grad_norm": 728.1867065429688,
"learning_rate": 7.186192041907298e-05,
"loss": 48.2639,
"step": 3760
},
{
"epoch": 6.130081300813008,
"grad_norm": 1045.18798828125,
"learning_rate": 7.134638569829499e-05,
"loss": 54.2319,
"step": 3770
},
{
"epoch": 6.146341463414634,
"grad_norm": 1185.36474609375,
"learning_rate": 7.083167931313692e-05,
"loss": 37.9882,
"step": 3780
},
{
"epoch": 6.16260162601626,
"grad_norm": 723.2171020507812,
"learning_rate": 7.031781614303519e-05,
"loss": 41.0285,
"step": 3790
},
{
"epoch": 6.178861788617886,
"grad_norm": 1335.1109619140625,
"learning_rate": 6.980481104305013e-05,
"loss": 33.8187,
"step": 3800
},
{
"epoch": 6.195121951219512,
"grad_norm": 651.626708984375,
"learning_rate": 6.929267884343634e-05,
"loss": 65.5501,
"step": 3810
},
{
"epoch": 6.211382113821138,
"grad_norm": 595.5252075195312,
"learning_rate": 6.87814343492142e-05,
"loss": 43.2794,
"step": 3820
},
{
"epoch": 6.227642276422764,
"grad_norm": 1277.5653076171875,
"learning_rate": 6.827109233974178e-05,
"loss": 42.5897,
"step": 3830
},
{
"epoch": 6.2439024390243905,
"grad_norm": 950.2879028320312,
"learning_rate": 6.776166756828759e-05,
"loss": 59.1106,
"step": 3840
},
{
"epoch": 6.260162601626016,
"grad_norm": 862.7484741210938,
"learning_rate": 6.7253174761604e-05,
"loss": 51.2283,
"step": 3850
},
{
"epoch": 6.276422764227642,
"grad_norm": 346.978759765625,
"learning_rate": 6.674562861950167e-05,
"loss": 22.1792,
"step": 3860
},
{
"epoch": 6.2926829268292686,
"grad_norm": 2020.3907470703125,
"learning_rate": 6.62390438144245e-05,
"loss": 34.9443,
"step": 3870
},
{
"epoch": 6.308943089430894,
"grad_norm": 1247.765869140625,
"learning_rate": 6.573343499102545e-05,
"loss": 89.5246,
"step": 3880
},
{
"epoch": 6.32520325203252,
"grad_norm": 1061.9462890625,
"learning_rate": 6.52288167657433e-05,
"loss": 57.1117,
"step": 3890
},
{
"epoch": 6.341463414634147,
"grad_norm": 740.0230712890625,
"learning_rate": 6.472520372637999e-05,
"loss": 41.9892,
"step": 3900
},
{
"epoch": 6.357723577235772,
"grad_norm": 437.2298583984375,
"learning_rate": 6.422261043167893e-05,
"loss": 41.5301,
"step": 3910
},
{
"epoch": 6.373983739837398,
"grad_norm": 707.180908203125,
"learning_rate": 6.372105141090417e-05,
"loss": 61.3545,
"step": 3920
},
{
"epoch": 6.390243902439025,
"grad_norm": 533.357177734375,
"learning_rate": 6.322054116342044e-05,
"loss": 40.3018,
"step": 3930
},
{
"epoch": 6.40650406504065,
"grad_norm": 423.275634765625,
"learning_rate": 6.272109415827379e-05,
"loss": 31.2483,
"step": 3940
},
{
"epoch": 6.4227642276422765,
"grad_norm": 535.2537231445312,
"learning_rate": 6.222272483377345e-05,
"loss": 61.084,
"step": 3950
},
{
"epoch": 6.439024390243903,
"grad_norm": 654.32470703125,
"learning_rate": 6.172544759707449e-05,
"loss": 69.6351,
"step": 3960
},
{
"epoch": 6.455284552845528,
"grad_norm": 827.914794921875,
"learning_rate": 6.122927682376119e-05,
"loss": 34.8883,
"step": 3970
},
{
"epoch": 6.471544715447155,
"grad_norm": 364.55615234375,
"learning_rate": 6.0734226857431554e-05,
"loss": 32.2486,
"step": 3980
},
{
"epoch": 6.487804878048781,
"grad_norm": 383.2949523925781,
"learning_rate": 6.0240312009282674e-05,
"loss": 27.0549,
"step": 3990
},
{
"epoch": 6.504065040650406,
"grad_norm": 666.8985595703125,
"learning_rate": 5.9747546557696924e-05,
"loss": 30.6733,
"step": 4000
},
{
"epoch": 6.520325203252033,
"grad_norm": 322.81890869140625,
"learning_rate": 5.925594474782925e-05,
"loss": 41.4183,
"step": 4010
},
{
"epoch": 6.536585365853659,
"grad_norm": 1725.4873046875,
"learning_rate": 5.876552079119536e-05,
"loss": 56.3451,
"step": 4020
},
{
"epoch": 6.5528455284552845,
"grad_norm": 417.5548095703125,
"learning_rate": 5.827628886526093e-05,
"loss": 46.2162,
"step": 4030
},
{
"epoch": 6.569105691056911,
"grad_norm": 626.910400390625,
"learning_rate": 5.778826311303169e-05,
"loss": 29.055,
"step": 4040
},
{
"epoch": 6.585365853658536,
"grad_norm": 661.1826171875,
"learning_rate": 5.730145764264448e-05,
"loss": 27.6717,
"step": 4050
},
{
"epoch": 6.6016260162601625,
"grad_norm": 595.2796020507812,
"learning_rate": 5.681588652695966e-05,
"loss": 50.871,
"step": 4060
},
{
"epoch": 6.617886178861789,
"grad_norm": 1768.0650634765625,
"learning_rate": 5.6331563803154086e-05,
"loss": 31.054,
"step": 4070
},
{
"epoch": 6.634146341463414,
"grad_norm": 1227.727783203125,
"learning_rate": 5.584850347231528e-05,
"loss": 36.9891,
"step": 4080
},
{
"epoch": 6.650406504065041,
"grad_norm": 1646.6304931640625,
"learning_rate": 5.536671949903689e-05,
"loss": 33.9344,
"step": 4090
},
{
"epoch": 6.666666666666667,
"grad_norm": 1407.2939453125,
"learning_rate": 5.4886225811014814e-05,
"loss": 51.3101,
"step": 4100
},
{
"epoch": 6.682926829268292,
"grad_norm": 1124.4527587890625,
"learning_rate": 5.440703629864454e-05,
"loss": 49.1819,
"step": 4110
},
{
"epoch": 6.699186991869919,
"grad_norm": 689.7494506835938,
"learning_rate": 5.392916481461983e-05,
"loss": 36.6202,
"step": 4120
},
{
"epoch": 6.715447154471545,
"grad_norm": 714.1576538085938,
"learning_rate": 5.3452625173531964e-05,
"loss": 32.2473,
"step": 4130
},
{
"epoch": 6.7317073170731705,
"grad_norm": 479.4760437011719,
"learning_rate": 5.297743115147062e-05,
"loss": 35.0904,
"step": 4140
},
{
"epoch": 6.747967479674797,
"grad_norm": 362.479736328125,
"learning_rate": 5.250359648562551e-05,
"loss": 43.3301,
"step": 4150
},
{
"epoch": 6.764227642276423,
"grad_norm": 668.361572265625,
"learning_rate": 5.203113487388917e-05,
"loss": 50.1241,
"step": 4160
},
{
"epoch": 6.780487804878049,
"grad_norm": 1105.221923828125,
"learning_rate": 5.156005997446118e-05,
"loss": 36.7327,
"step": 4170
},
{
"epoch": 6.796747967479675,
"grad_norm": 528.5939331054688,
"learning_rate": 5.109038540545326e-05,
"loss": 45.8215,
"step": 4180
},
{
"epoch": 6.8130081300813,
"grad_norm": 635.588134765625,
"learning_rate": 5.062212474449537e-05,
"loss": 68.0413,
"step": 4190
},
{
"epoch": 6.829268292682927,
"grad_norm": 629.8543701171875,
"learning_rate": 5.0155291528343577e-05,
"loss": 89.9357,
"step": 4200
},
{
"epoch": 6.845528455284553,
"grad_norm": 511.0000915527344,
"learning_rate": 4.96898992524884e-05,
"loss": 39.3891,
"step": 4210
},
{
"epoch": 6.861788617886178,
"grad_norm": 331.4763488769531,
"learning_rate": 4.922596137076493e-05,
"loss": 32.5439,
"step": 4220
},
{
"epoch": 6.878048780487805,
"grad_norm": 433.0771484375,
"learning_rate": 4.876349129496355e-05,
"loss": 64.7455,
"step": 4230
},
{
"epoch": 6.894308943089431,
"grad_norm": 456.54644775390625,
"learning_rate": 4.830250239444276e-05,
"loss": 44.152,
"step": 4240
},
{
"epoch": 6.9105691056910565,
"grad_norm": 1340.421142578125,
"learning_rate": 4.7843007995742065e-05,
"loss": 30.8355,
"step": 4250
},
{
"epoch": 6.926829268292683,
"grad_norm": 1253.5787353515625,
"learning_rate": 4.7385021382197216e-05,
"loss": 48.8547,
"step": 4260
},
{
"epoch": 6.943089430894309,
"grad_norm": 735.3323974609375,
"learning_rate": 4.692855579355597e-05,
"loss": 29.7913,
"step": 4270
},
{
"epoch": 6.959349593495935,
"grad_norm": 485.3312072753906,
"learning_rate": 4.647362442559535e-05,
"loss": 45.8068,
"step": 4280
},
{
"epoch": 6.975609756097561,
"grad_norm": 1383.2845458984375,
"learning_rate": 4.602024042974027e-05,
"loss": 38.6388,
"step": 4290
},
{
"epoch": 6.991869918699187,
"grad_norm": 491.0514831542969,
"learning_rate": 4.556841691268333e-05,
"loss": 36.584,
"step": 4300
},
{
"epoch": 7.008130081300813,
"grad_norm": 417.0002746582031,
"learning_rate": 4.511816693600577e-05,
"loss": 39.8136,
"step": 4310
},
{
"epoch": 7.024390243902439,
"grad_norm": 731.73828125,
"learning_rate": 4.46695035158001e-05,
"loss": 32.1251,
"step": 4320
},
{
"epoch": 7.040650406504065,
"grad_norm": 649.9963989257812,
"learning_rate": 4.42224396222937e-05,
"loss": 24.8058,
"step": 4330
},
{
"epoch": 7.056910569105691,
"grad_norm": 497.6392517089844,
"learning_rate": 4.377698817947385e-05,
"loss": 37.5999,
"step": 4340
},
{
"epoch": 7.073170731707317,
"grad_norm": 1092.6939697265625,
"learning_rate": 4.333316206471418e-05,
"loss": 34.9651,
"step": 4350
},
{
"epoch": 7.0894308943089435,
"grad_norm": 252.49484252929688,
"learning_rate": 4.2890974108402425e-05,
"loss": 64.3354,
"step": 4360
},
{
"epoch": 7.105691056910569,
"grad_norm": 704.4669799804688,
"learning_rate": 4.2450437093569315e-05,
"loss": 66.6694,
"step": 4370
},
{
"epoch": 7.121951219512195,
"grad_norm": 1412.200927734375,
"learning_rate": 4.2011563755519326e-05,
"loss": 34.0108,
"step": 4380
},
{
"epoch": 7.138211382113822,
"grad_norm": 513.7908935546875,
"learning_rate": 4.157436678146238e-05,
"loss": 23.0915,
"step": 4390
},
{
"epoch": 7.154471544715447,
"grad_norm": 429.260986328125,
"learning_rate": 4.1138858810146965e-05,
"loss": 21.7249,
"step": 4400
},
{
"epoch": 7.170731707317073,
"grad_norm": 282.83160400390625,
"learning_rate": 4.0705052431494995e-05,
"loss": 35.1431,
"step": 4410
},
{
"epoch": 7.186991869918699,
"grad_norm": 189.756591796875,
"learning_rate": 4.027296018623772e-05,
"loss": 30.4934,
"step": 4420
},
{
"epoch": 7.203252032520325,
"grad_norm": 484.0589904785156,
"learning_rate": 3.9842594565553085e-05,
"loss": 25.1109,
"step": 4430
},
{
"epoch": 7.219512195121951,
"grad_norm": 707.24560546875,
"learning_rate": 3.9413968010704984e-05,
"loss": 49.4997,
"step": 4440
},
{
"epoch": 7.235772357723577,
"grad_norm": 321.16485595703125,
"learning_rate": 3.898709291268313e-05,
"loss": 50.0109,
"step": 4450
},
{
"epoch": 7.252032520325203,
"grad_norm": 468.12042236328125,
"learning_rate": 3.8561981611845246e-05,
"loss": 71.7242,
"step": 4460
},
{
"epoch": 7.2682926829268295,
"grad_norm": 628.5554809570312,
"learning_rate": 3.813864639756007e-05,
"loss": 31.7032,
"step": 4470
},
{
"epoch": 7.284552845528455,
"grad_norm": 597.160400390625,
"learning_rate": 3.771709950785228e-05,
"loss": 27.9663,
"step": 4480
},
{
"epoch": 7.300813008130081,
"grad_norm": 450.8225402832031,
"learning_rate": 3.7297353129048476e-05,
"loss": 21.0904,
"step": 4490
},
{
"epoch": 7.317073170731708,
"grad_norm": 615.4117431640625,
"learning_rate": 3.687941939542513e-05,
"loss": 32.9963,
"step": 4500
},
{
"epoch": 7.333333333333333,
"grad_norm": 751.5721435546875,
"learning_rate": 3.646331038885768e-05,
"loss": 33.0976,
"step": 4510
},
{
"epoch": 7.349593495934959,
"grad_norm": 13358.826171875,
"learning_rate": 3.6049038138471215e-05,
"loss": 48.3166,
"step": 4520
},
{
"epoch": 7.365853658536586,
"grad_norm": 5210.142578125,
"learning_rate": 3.5636614620292854e-05,
"loss": 42.6251,
"step": 4530
},
{
"epoch": 7.382113821138211,
"grad_norm": 1281.064453125,
"learning_rate": 3.522605175690544e-05,
"loss": 29.0492,
"step": 4540
},
{
"epoch": 7.3983739837398375,
"grad_norm": 357.83819580078125,
"learning_rate": 3.481736141710293e-05,
"loss": 35.3369,
"step": 4550
},
{
"epoch": 7.414634146341464,
"grad_norm": 173.05294799804688,
"learning_rate": 3.4410555415547306e-05,
"loss": 33.2367,
"step": 4560
},
{
"epoch": 7.430894308943089,
"grad_norm": 3365.111572265625,
"learning_rate": 3.4005645512426834e-05,
"loss": 29.4222,
"step": 4570
},
{
"epoch": 7.4471544715447155,
"grad_norm": 670.9901733398438,
"learning_rate": 3.3602643413116386e-05,
"loss": 44.8467,
"step": 4580
},
{
"epoch": 7.463414634146342,
"grad_norm": 454.53265380859375,
"learning_rate": 3.320156076783891e-05,
"loss": 32.9965,
"step": 4590
},
{
"epoch": 7.479674796747967,
"grad_norm": 1082.113525390625,
"learning_rate": 3.280240917132853e-05,
"loss": 37.7567,
"step": 4600
},
{
"epoch": 7.495934959349594,
"grad_norm": 21382.505859375,
"learning_rate": 3.2405200162495586e-05,
"loss": 27.9646,
"step": 4610
},
{
"epoch": 7.512195121951219,
"grad_norm": 391.889892578125,
"learning_rate": 3.200994522409293e-05,
"loss": 32.9818,
"step": 4620
},
{
"epoch": 7.528455284552845,
"grad_norm": 4713.3359375,
"learning_rate": 3.1616655782383864e-05,
"loss": 37.4087,
"step": 4630
},
{
"epoch": 7.544715447154472,
"grad_norm": 2711.176513671875,
"learning_rate": 3.122534320681214e-05,
"loss": 48.8535,
"step": 4640
},
{
"epoch": 7.560975609756097,
"grad_norm": 1700.7119140625,
"learning_rate": 3.083601880967302e-05,
"loss": 42.1752,
"step": 4650
},
{
"epoch": 7.5772357723577235,
"grad_norm": 420.5804443359375,
"learning_rate": 3.0448693845786246e-05,
"loss": 26.3437,
"step": 4660
},
{
"epoch": 7.59349593495935,
"grad_norm": 279.73455810546875,
"learning_rate": 3.0063379512170852e-05,
"loss": 26.54,
"step": 4670
},
{
"epoch": 7.609756097560975,
"grad_norm": 373.8387756347656,
"learning_rate": 2.968008694772141e-05,
"loss": 32.9037,
"step": 4680
},
{
"epoch": 7.626016260162602,
"grad_norm": 4132.44873046875,
"learning_rate": 2.9298827232885863e-05,
"loss": 30.5371,
"step": 4690
},
{
"epoch": 7.642276422764228,
"grad_norm": 448.18359375,
"learning_rate": 2.8919611389345447e-05,
"loss": 23.2553,
"step": 4700
},
{
"epoch": 7.658536585365853,
"grad_norm": 1203.708984375,
"learning_rate": 2.8542450379695973e-05,
"loss": 48.5284,
"step": 4710
},
{
"epoch": 7.67479674796748,
"grad_norm": 234.6784210205078,
"learning_rate": 2.8167355107130787e-05,
"loss": 63.0278,
"step": 4720
},
{
"epoch": 7.691056910569106,
"grad_norm": 475.01544189453125,
"learning_rate": 2.77943364151258e-05,
"loss": 26.5827,
"step": 4730
},
{
"epoch": 7.7073170731707314,
"grad_norm": 2622.9150390625,
"learning_rate": 2.7423405087125832e-05,
"loss": 37.8167,
"step": 4740
},
{
"epoch": 7.723577235772358,
"grad_norm": 2133.2802734375,
"learning_rate": 2.705457184623299e-05,
"loss": 45.3475,
"step": 4750
},
{
"epoch": 7.739837398373984,
"grad_norm": 467.1634216308594,
"learning_rate": 2.668784735489662e-05,
"loss": 38.3572,
"step": 4760
},
{
"epoch": 7.7560975609756095,
"grad_norm": 2866.9052734375,
"learning_rate": 2.632324221460515e-05,
"loss": 49.7959,
"step": 4770
},
{
"epoch": 7.772357723577236,
"grad_norm": 5320.82470703125,
"learning_rate": 2.5960766965579407e-05,
"loss": 27.4925,
"step": 4780
},
{
"epoch": 7.788617886178862,
"grad_norm": 12207.2236328125,
"learning_rate": 2.5600432086468207e-05,
"loss": 25.4184,
"step": 4790
},
{
"epoch": 7.804878048780488,
"grad_norm": 928.2150268554688,
"learning_rate": 2.5242247994045255e-05,
"loss": 38.9474,
"step": 4800
},
{
"epoch": 7.821138211382114,
"grad_norm": 666.2001342773438,
"learning_rate": 2.4886225042907973e-05,
"loss": 28.4315,
"step": 4810
},
{
"epoch": 7.83739837398374,
"grad_norm": 394.76727294921875,
"learning_rate": 2.453237352517831e-05,
"loss": 35.7413,
"step": 4820
},
{
"epoch": 7.853658536585366,
"grad_norm": 1564.347900390625,
"learning_rate": 2.4180703670205108e-05,
"loss": 49.657,
"step": 4830
},
{
"epoch": 7.869918699186992,
"grad_norm": 662.8395385742188,
"learning_rate": 2.3831225644268416e-05,
"loss": 23.6479,
"step": 4840
},
{
"epoch": 7.886178861788618,
"grad_norm": 448.2498474121094,
"learning_rate": 2.348394955028561e-05,
"loss": 30.4568,
"step": 4850
},
{
"epoch": 7.902439024390244,
"grad_norm": 738.3649291992188,
"learning_rate": 2.3138885427519262e-05,
"loss": 48.6049,
"step": 4860
},
{
"epoch": 7.91869918699187,
"grad_norm": 600.122314453125,
"learning_rate": 2.2796043251287002e-05,
"loss": 24.3334,
"step": 4870
},
{
"epoch": 7.934959349593496,
"grad_norm": 604.3839111328125,
"learning_rate": 2.2455432932673182e-05,
"loss": 48.3579,
"step": 4880
},
{
"epoch": 7.951219512195122,
"grad_norm": 854.1920166015625,
"learning_rate": 2.2117064318242154e-05,
"loss": 50.2401,
"step": 4890
},
{
"epoch": 7.967479674796748,
"grad_norm": 8056.27490234375,
"learning_rate": 2.1780947189753875e-05,
"loss": 41.4174,
"step": 4900
},
{
"epoch": 7.983739837398374,
"grad_norm": 788.5985717773438,
"learning_rate": 2.1447091263881014e-05,
"loss": 41.0822,
"step": 4910
},
{
"epoch": 8.0,
"grad_norm": 194.98179626464844,
"learning_rate": 2.111550619192797e-05,
"loss": 28.0501,
"step": 4920
},
{
"epoch": 8.016260162601625,
"grad_norm": 463.9582214355469,
"learning_rate": 2.0786201559552022e-05,
"loss": 38.9959,
"step": 4930
},
{
"epoch": 8.032520325203253,
"grad_norm": 361.2221374511719,
"learning_rate": 2.045918688648616e-05,
"loss": 37.643,
"step": 4940
},
{
"epoch": 8.048780487804878,
"grad_norm": 3094.411376953125,
"learning_rate": 2.013447162626384e-05,
"loss": 23.8148,
"step": 4950
},
{
"epoch": 8.065040650406504,
"grad_norm": 618.3005981445312,
"learning_rate": 1.981206516594576e-05,
"loss": 45.4684,
"step": 4960
},
{
"epoch": 8.08130081300813,
"grad_norm": 3658.843994140625,
"learning_rate": 1.949197682584848e-05,
"loss": 47.9616,
"step": 4970
},
{
"epoch": 8.097560975609756,
"grad_norm": 3654.126708984375,
"learning_rate": 1.9174215859274892e-05,
"loss": 39.6678,
"step": 4980
},
{
"epoch": 8.113821138211382,
"grad_norm": 3715.457763671875,
"learning_rate": 1.885879145224688e-05,
"loss": 28.395,
"step": 4990
},
{
"epoch": 8.130081300813009,
"grad_norm": 13629.64453125,
"learning_rate": 1.8545712723239682e-05,
"loss": 30.707,
"step": 5000
},
{
"epoch": 8.146341463414634,
"grad_norm": 1702.9984130859375,
"learning_rate": 1.823498872291821e-05,
"loss": 39.2062,
"step": 5010
},
{
"epoch": 8.16260162601626,
"grad_norm": 652.4723510742188,
"learning_rate": 1.792662843387557e-05,
"loss": 25.4401,
"step": 5020
},
{
"epoch": 8.178861788617887,
"grad_norm": 545.2056884765625,
"learning_rate": 1.7620640770373286e-05,
"loss": 65.776,
"step": 5030
},
{
"epoch": 8.195121951219512,
"grad_norm": 986.5762329101562,
"learning_rate": 1.7317034578083547e-05,
"loss": 27.4899,
"step": 5040
},
{
"epoch": 8.211382113821138,
"grad_norm": 471.08343505859375,
"learning_rate": 1.70158186338337e-05,
"loss": 35.4397,
"step": 5050
},
{
"epoch": 8.227642276422765,
"grad_norm": 284.622802734375,
"learning_rate": 1.6717001645352324e-05,
"loss": 22.5494,
"step": 5060
},
{
"epoch": 8.24390243902439,
"grad_norm": 22431.65625,
"learning_rate": 1.6420592251017487e-05,
"loss": 45.1601,
"step": 5070
},
{
"epoch": 8.260162601626016,
"grad_norm": 780.5162353515625,
"learning_rate": 1.6126599019607223e-05,
"loss": 33.0745,
"step": 5080
},
{
"epoch": 8.276422764227643,
"grad_norm": 961.0186767578125,
"learning_rate": 1.5835030450051656e-05,
"loss": 34.2111,
"step": 5090
},
{
"epoch": 8.292682926829269,
"grad_norm": 240.08079528808594,
"learning_rate": 1.5545894971187303e-05,
"loss": 25.9617,
"step": 5100
},
{
"epoch": 8.308943089430894,
"grad_norm": 2864.75,
"learning_rate": 1.525920094151353e-05,
"loss": 43.9031,
"step": 5110
},
{
"epoch": 8.32520325203252,
"grad_norm": 791.8621215820312,
"learning_rate": 1.4974956648950845e-05,
"loss": 37.113,
"step": 5120
},
{
"epoch": 8.341463414634147,
"grad_norm": 470.98736572265625,
"learning_rate": 1.4693170310601212e-05,
"loss": 34.8349,
"step": 5130
},
{
"epoch": 8.357723577235772,
"grad_norm": 840.1485595703125,
"learning_rate": 1.4413850072510704e-05,
"loss": 24.1196,
"step": 5140
},
{
"epoch": 8.373983739837398,
"grad_norm": 660.6499633789062,
"learning_rate": 1.4137004009433885e-05,
"loss": 20.1648,
"step": 5150
},
{
"epoch": 8.390243902439025,
"grad_norm": 1366.75390625,
"learning_rate": 1.386264012460039e-05,
"loss": 29.1244,
"step": 5160
},
{
"epoch": 8.40650406504065,
"grad_norm": 270.5916442871094,
"learning_rate": 1.3590766349483586e-05,
"loss": 36.4448,
"step": 5170
},
{
"epoch": 8.422764227642276,
"grad_norm": 439.3215637207031,
"learning_rate": 1.3321390543571266e-05,
"loss": 33.3136,
"step": 5180
},
{
"epoch": 8.439024390243903,
"grad_norm": 37061.68359375,
"learning_rate": 1.3054520494138445e-05,
"loss": 64.5556,
"step": 5190
},
{
"epoch": 8.455284552845528,
"grad_norm": 316.3396911621094,
"learning_rate": 1.2790163916022312e-05,
"loss": 27.1406,
"step": 5200
},
{
"epoch": 8.471544715447154,
"grad_norm": 2111.4130859375,
"learning_rate": 1.2528328451399041e-05,
"loss": 22.3547,
"step": 5210
},
{
"epoch": 8.487804878048781,
"grad_norm": 489.82464599609375,
"learning_rate": 1.2269021669563041e-05,
"loss": 20.5392,
"step": 5220
},
{
"epoch": 8.504065040650406,
"grad_norm": 1655.57275390625,
"learning_rate": 1.2012251066708035e-05,
"loss": 25.9037,
"step": 5230
},
{
"epoch": 8.520325203252032,
"grad_norm": 1041.8621826171875,
"learning_rate": 1.1758024065710404e-05,
"loss": 26.4345,
"step": 5240
},
{
"epoch": 8.536585365853659,
"grad_norm": 1299.66650390625,
"learning_rate": 1.150634801591457e-05,
"loss": 42.8872,
"step": 5250
},
{
"epoch": 8.552845528455284,
"grad_norm": 435.3826904296875,
"learning_rate": 1.1257230192920565e-05,
"loss": 42.8848,
"step": 5260
},
{
"epoch": 8.56910569105691,
"grad_norm": 726.2322998046875,
"learning_rate": 1.1010677798373625e-05,
"loss": 25.041,
"step": 5270
},
{
"epoch": 8.585365853658537,
"grad_norm": 3022.15625,
"learning_rate": 1.0766697959756166e-05,
"loss": 68.7748,
"step": 5280
},
{
"epoch": 8.601626016260163,
"grad_norm": 4241.69580078125,
"learning_rate": 1.0525297730181572e-05,
"loss": 74.2972,
"step": 5290
},
{
"epoch": 8.617886178861788,
"grad_norm": 961.3088989257812,
"learning_rate": 1.028648408819034e-05,
"loss": 24.1545,
"step": 5300
},
{
"epoch": 8.634146341463415,
"grad_norm": 949.1688842773438,
"learning_rate": 1.0050263937548433e-05,
"loss": 49.1739,
"step": 5310
},
{
"epoch": 8.65040650406504,
"grad_norm": 470.57708740234375,
"learning_rate": 9.816644107047613e-06,
"loss": 32.3933,
"step": 5320
},
{
"epoch": 8.666666666666666,
"grad_norm": 717.5396728515625,
"learning_rate": 9.585631350308e-06,
"loss": 32.7468,
"step": 5330
},
{
"epoch": 8.682926829268293,
"grad_norm": 575.5538330078125,
"learning_rate": 9.357232345582922e-06,
"loss": 37.3175,
"step": 5340
},
{
"epoch": 8.699186991869919,
"grad_norm": 371.1407775878906,
"learning_rate": 9.131453695565872e-06,
"loss": 48.2922,
"step": 5350
},
{
"epoch": 8.715447154471544,
"grad_norm": 1407.066650390625,
"learning_rate": 8.90830192719947e-06,
"loss": 34.3162,
"step": 5360
},
{
"epoch": 8.731707317073171,
"grad_norm": 2786.113525390625,
"learning_rate": 8.687783491486966e-06,
"loss": 51.1913,
"step": 5370
},
{
"epoch": 8.747967479674797,
"grad_norm": 407.6085510253906,
"learning_rate": 8.46990476330567e-06,
"loss": 27.1041,
"step": 5380
},
{
"epoch": 8.764227642276422,
"grad_norm": 317.9125671386719,
"learning_rate": 8.254672041222611e-06,
"loss": 57.7832,
"step": 5390
},
{
"epoch": 8.78048780487805,
"grad_norm": 200.4461669921875,
"learning_rate": 8.042091547312569e-06,
"loss": 24.9711,
"step": 5400
},
{
"epoch": 8.796747967479675,
"grad_norm": 25919.078125,
"learning_rate": 7.83216942697813e-06,
"loss": 30.2866,
"step": 5410
},
{
"epoch": 8.8130081300813,
"grad_norm": 9640.9111328125,
"learning_rate": 7.624911748772023e-06,
"loss": 46.633,
"step": 5420
},
{
"epoch": 8.829268292682928,
"grad_norm": 339.77239990234375,
"learning_rate": 7.420324504221721e-06,
"loss": 49.0615,
"step": 5430
},
{
"epoch": 8.845528455284553,
"grad_norm": 360.1629638671875,
"learning_rate": 7.218413607656227e-06,
"loss": 43.912,
"step": 5440
},
{
"epoch": 8.861788617886178,
"grad_norm": 357.3642578125,
"learning_rate": 7.019184896035103e-06,
"loss": 40.2426,
"step": 5450
},
{
"epoch": 8.878048780487806,
"grad_norm": 342.8908386230469,
"learning_rate": 6.822644128779721e-06,
"loss": 27.857,
"step": 5460
},
{
"epoch": 8.894308943089431,
"grad_norm": 1741.92333984375,
"learning_rate": 6.628796987606722e-06,
"loss": 22.8556,
"step": 5470
},
{
"epoch": 8.910569105691057,
"grad_norm": 817.4639282226562,
"learning_rate": 6.437649076363883e-06,
"loss": 25.4468,
"step": 5480
},
{
"epoch": 8.926829268292684,
"grad_norm": 418.2152404785156,
"learning_rate": 6.249205920868018e-06,
"loss": 30.6125,
"step": 5490
},
{
"epoch": 8.94308943089431,
"grad_norm": 345.6661071777344,
"learning_rate": 6.063472968745221e-06,
"loss": 24.8203,
"step": 5500
},
{
"epoch": 8.959349593495935,
"grad_norm": 311.8279113769531,
"learning_rate": 5.880455589273481e-06,
"loss": 28.5219,
"step": 5510
},
{
"epoch": 8.975609756097562,
"grad_norm": 398.0353698730469,
"learning_rate": 5.7001590732273955e-06,
"loss": 38.751,
"step": 5520
},
{
"epoch": 8.991869918699187,
"grad_norm": 4006.41796875,
"learning_rate": 5.522588632725245e-06,
"loss": 48.2014,
"step": 5530
},
{
"epoch": 9.008130081300813,
"grad_norm": 863.8807983398438,
"learning_rate": 5.34774940107825e-06,
"loss": 42.1497,
"step": 5540
},
{
"epoch": 9.024390243902438,
"grad_norm": 6790.38232421875,
"learning_rate": 5.175646432642278e-06,
"loss": 31.0566,
"step": 5550
},
{
"epoch": 9.040650406504065,
"grad_norm": 772.9898681640625,
"learning_rate": 5.006284702671693e-06,
"loss": 36.8164,
"step": 5560
},
{
"epoch": 9.05691056910569,
"grad_norm": 4930.9443359375,
"learning_rate": 4.839669107175493e-06,
"loss": 42.4926,
"step": 5570
},
{
"epoch": 9.073170731707316,
"grad_norm": 192.48233032226562,
"learning_rate": 4.675804462775801e-06,
"loss": 39.5624,
"step": 5580
},
{
"epoch": 9.089430894308943,
"grad_norm": 886.0300903320312,
"learning_rate": 4.5146955065686e-06,
"loss": 32.467,
"step": 5590
},
{
"epoch": 9.105691056910569,
"grad_norm": 271.0351257324219,
"learning_rate": 4.3563468959868515e-06,
"loss": 29.2705,
"step": 5600
},
{
"epoch": 9.121951219512194,
"grad_norm": 651.6824340820312,
"learning_rate": 4.2007632086658035e-06,
"loss": 40.7806,
"step": 5610
},
{
"epoch": 9.138211382113822,
"grad_norm": 153.58518981933594,
"learning_rate": 4.047948942310631e-06,
"loss": 32.8395,
"step": 5620
},
{
"epoch": 9.154471544715447,
"grad_norm": 771.262939453125,
"learning_rate": 3.897908514566484e-06,
"loss": 59.9376,
"step": 5630
},
{
"epoch": 9.170731707317072,
"grad_norm": 2750.450439453125,
"learning_rate": 3.750646262890767e-06,
"loss": 26.9996,
"step": 5640
},
{
"epoch": 9.1869918699187,
"grad_norm": 361.48516845703125,
"learning_rate": 3.60616644442765e-06,
"loss": 30.9447,
"step": 5650
},
{
"epoch": 9.203252032520325,
"grad_norm": 1025.7686767578125,
"learning_rate": 3.4644732358851685e-06,
"loss": 27.8333,
"step": 5660
},
{
"epoch": 9.21951219512195,
"grad_norm": 301.7310485839844,
"learning_rate": 3.3255707334143516e-06,
"loss": 50.7049,
"step": 5670
},
{
"epoch": 9.235772357723578,
"grad_norm": 282.4934997558594,
"learning_rate": 3.1894629524908293e-06,
"loss": 58.6614,
"step": 5680
},
{
"epoch": 9.252032520325203,
"grad_norm": 2989.5283203125,
"learning_rate": 3.056153827798791e-06,
"loss": 65.7686,
"step": 5690
},
{
"epoch": 9.268292682926829,
"grad_norm": 145.37416076660156,
"learning_rate": 2.9256472131172442e-06,
"loss": 24.332,
"step": 5700
},
{
"epoch": 9.284552845528456,
"grad_norm": 245.1734619140625,
"learning_rate": 2.797946881208513e-06,
"loss": 62.6,
"step": 5710
},
{
"epoch": 9.300813008130081,
"grad_norm": 842.1190795898438,
"learning_rate": 2.673056523709294e-06,
"loss": 33.1712,
"step": 5720
},
{
"epoch": 9.317073170731707,
"grad_norm": 205.359130859375,
"learning_rate": 2.550979751023885e-06,
"loss": 24.7365,
"step": 5730
},
{
"epoch": 9.333333333333334,
"grad_norm": 189.57533264160156,
"learning_rate": 2.431720092219758e-06,
"loss": 28.2499,
"step": 5740
},
{
"epoch": 9.34959349593496,
"grad_norm": 311.52374267578125,
"learning_rate": 2.3152809949256503e-06,
"loss": 21.5204,
"step": 5750
},
{
"epoch": 9.365853658536585,
"grad_norm": 2237.07958984375,
"learning_rate": 2.2016658252318025e-06,
"loss": 26.6137,
"step": 5760
},
{
"epoch": 9.382113821138212,
"grad_norm": 623.1047973632812,
"learning_rate": 2.0908778675927e-06,
"loss": 24.8671,
"step": 5770
},
{
"epoch": 9.398373983739837,
"grad_norm": 292.36285400390625,
"learning_rate": 1.9829203247321293e-06,
"loss": 23.2705,
"step": 5780
},
{
"epoch": 9.414634146341463,
"grad_norm": 139.58456420898438,
"learning_rate": 1.8777963175505398e-06,
"loss": 34.1858,
"step": 5790
},
{
"epoch": 9.43089430894309,
"grad_norm": 5472.58349609375,
"learning_rate": 1.7755088850348822e-06,
"loss": 23.8006,
"step": 5800
},
{
"epoch": 9.447154471544716,
"grad_norm": 1327.946533203125,
"learning_rate": 1.676060984170702e-06,
"loss": 27.9731,
"step": 5810
},
{
"epoch": 9.463414634146341,
"grad_norm": 156.09629821777344,
"learning_rate": 1.5794554898567182e-06,
"loss": 24.1258,
"step": 5820
},
{
"epoch": 9.479674796747968,
"grad_norm": 485.4151306152344,
"learning_rate": 1.4856951948216569e-06,
"loss": 28.9193,
"step": 5830
},
{
"epoch": 9.495934959349594,
"grad_norm": 354.6837158203125,
"learning_rate": 1.39478280954356e-06,
"loss": 33.2445,
"step": 5840
},
{
"epoch": 9.512195121951219,
"grad_norm": 503.53289794921875,
"learning_rate": 1.3067209621713928e-06,
"loss": 25.0091,
"step": 5850
},
{
"epoch": 9.528455284552846,
"grad_norm": 329.1166687011719,
"learning_rate": 1.221512198449093e-06,
"loss": 35.9692,
"step": 5860
},
{
"epoch": 9.544715447154472,
"grad_norm": 374.5758361816406,
"learning_rate": 1.1391589816419968e-06,
"loss": 25.7447,
"step": 5870
},
{
"epoch": 9.560975609756097,
"grad_norm": 257.5137939453125,
"learning_rate": 1.059663692465529e-06,
"loss": 37.0374,
"step": 5880
},
{
"epoch": 9.577235772357724,
"grad_norm": 284.5126037597656,
"learning_rate": 9.830286290165357e-07,
"loss": 23.4132,
"step": 5890
},
{
"epoch": 9.59349593495935,
"grad_norm": 689.851806640625,
"learning_rate": 9.092560067067268e-07,
"loss": 47.7638,
"step": 5900
},
{
"epoch": 9.609756097560975,
"grad_norm": 1487.80859375,
"learning_rate": 8.383479581986597e-07,
"loss": 22.3418,
"step": 5910
},
{
"epoch": 9.6260162601626,
"grad_norm": 1127.08837890625,
"learning_rate": 7.70306533344134e-07,
"loss": 24.0052,
"step": 5920
},
{
"epoch": 9.642276422764228,
"grad_norm": 6250.7666015625,
"learning_rate": 7.051336991248714e-07,
"loss": 31.2493,
"step": 5930
},
{
"epoch": 9.658536585365853,
"grad_norm": 565.5596923828125,
"learning_rate": 6.428313395956953e-07,
"loss": 20.2709,
"step": 5940
},
{
"epoch": 9.67479674796748,
"grad_norm": 142.4834442138672,
"learning_rate": 5.834012558300295e-07,
"loss": 27.2821,
"step": 5950
},
{
"epoch": 9.691056910569106,
"grad_norm": 559.2692260742188,
"learning_rate": 5.26845165867873e-07,
"loss": 56.2713,
"step": 5960
},
{
"epoch": 9.707317073170731,
"grad_norm": 170.9761199951172,
"learning_rate": 4.7316470466611804e-07,
"loss": 25.9403,
"step": 5970
},
{
"epoch": 9.723577235772357,
"grad_norm": 577.9078369140625,
"learning_rate": 4.22361424051243e-07,
"loss": 27.2287,
"step": 5980
},
{
"epoch": 9.739837398373984,
"grad_norm": 203.03167724609375,
"learning_rate": 3.7443679267453735e-07,
"loss": 33.0212,
"step": 5990
},
{
"epoch": 9.75609756097561,
"grad_norm": 1709.7088623046875,
"learning_rate": 3.2939219596956895e-07,
"loss": 30.0687,
"step": 6000
},
{
"epoch": 9.772357723577235,
"grad_norm": 226.99795532226562,
"learning_rate": 2.872289361121605e-07,
"loss": 36.0599,
"step": 6010
},
{
"epoch": 9.788617886178862,
"grad_norm": 805.7896728515625,
"learning_rate": 2.4794823198275307e-07,
"loss": 48.3908,
"step": 6020
},
{
"epoch": 9.804878048780488,
"grad_norm": 21221.19921875,
"learning_rate": 2.115512191311564e-07,
"loss": 55.056,
"step": 6030
},
{
"epoch": 9.821138211382113,
"grad_norm": 1422.177001953125,
"learning_rate": 1.780389497437418e-07,
"loss": 20.1985,
"step": 6040
},
{
"epoch": 9.83739837398374,
"grad_norm": 182.74656677246094,
"learning_rate": 1.4741239261299998e-07,
"loss": 36.4601,
"step": 6050
},
{
"epoch": 9.853658536585366,
"grad_norm": 427.26385498046875,
"learning_rate": 1.1967243310955222e-07,
"loss": 49.9752,
"step": 6060
},
{
"epoch": 9.869918699186991,
"grad_norm": 463.0358581542969,
"learning_rate": 9.481987315653751e-08,
"loss": 38.0783,
"step": 6070
},
{
"epoch": 9.886178861788618,
"grad_norm": 381.15008544921875,
"learning_rate": 7.285543120645332e-08,
"loss": 40.3717,
"step": 6080
},
{
"epoch": 9.902439024390244,
"grad_norm": 414.7477111816406,
"learning_rate": 5.377974222036119e-08,
"loss": 23.7009,
"step": 6090
},
{
"epoch": 9.91869918699187,
"grad_norm": 2649.5400390625,
"learning_rate": 3.7593357649579055e-08,
"loss": 39.1989,
"step": 6100
},
{
"epoch": 9.934959349593496,
"grad_norm": 1547.17236328125,
"learning_rate": 2.429674541966076e-08,
"loss": 45.793,
"step": 6110
},
{
"epoch": 9.951219512195122,
"grad_norm": 394.08685302734375,
"learning_rate": 1.3890289916929089e-08,
"loss": 26.9755,
"step": 6120
},
{
"epoch": 9.967479674796747,
"grad_norm": 6701.4306640625,
"learning_rate": 6.37429197736239e-09,
"loss": 26.3901,
"step": 6130
},
{
"epoch": 9.983739837398375,
"grad_norm": 231.67611694335938,
"learning_rate": 1.7489688778793424e-09,
"loss": 22.5137,
"step": 6140
},
{
"epoch": 10.0,
"grad_norm": 158.60951232910156,
"learning_rate": 1.4454330032886986e-11,
"loss": 39.3726,
"step": 6150
},
{
"epoch": 10.001626016260163,
"step": 6151,
"total_flos": 2.157115506118272e+17,
"train_loss": 212.35847260424458,
"train_runtime": 2807.1103,
"train_samples_per_second": 35.06,
"train_steps_per_second": 2.191
}
],
"logging_steps": 10,
"max_steps": 6151,
"num_input_tokens_seen": 0,
"num_train_epochs": 11,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.157115506118272e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}