{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.001626016260163,
"eval_steps": 500,
"global_step": 6151,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016260162601626018,
"grad_norm": 3704.4365234375,
"learning_rate": 6.493506493506493e-06,
"loss": 11280.4625,
"step": 10
},
{
"epoch": 0.032520325203252036,
"grad_norm": 3407.870361328125,
"learning_rate": 1.2987012987012986e-05,
"loss": 11149.0953,
"step": 20
},
{
"epoch": 0.04878048780487805,
"grad_norm": 3003.387939453125,
"learning_rate": 1.9480519480519483e-05,
"loss": 11173.7734,
"step": 30
},
{
"epoch": 0.06504065040650407,
"grad_norm": 2449.536376953125,
"learning_rate": 2.5974025974025972e-05,
"loss": 11021.1336,
"step": 40
},
{
"epoch": 0.08130081300813008,
"grad_norm": 2389.97509765625,
"learning_rate": 3.246753246753247e-05,
"loss": 11002.3586,
"step": 50
},
{
"epoch": 0.0975609756097561,
"grad_norm": 2766.29443359375,
"learning_rate": 3.8961038961038966e-05,
"loss": 10353.3125,
"step": 60
},
{
"epoch": 0.11382113821138211,
"grad_norm": 4483.15087890625,
"learning_rate": 4.545454545454546e-05,
"loss": 9267.8453,
"step": 70
},
{
"epoch": 0.13008130081300814,
"grad_norm": 10965.46484375,
"learning_rate": 5.1948051948051944e-05,
"loss": 7955.3172,
"step": 80
},
{
"epoch": 0.14634146341463414,
"grad_norm": 12316.47265625,
"learning_rate": 5.844155844155844e-05,
"loss": 3923.734,
"step": 90
},
{
"epoch": 0.16260162601626016,
"grad_norm": 154421.5,
"learning_rate": 6.493506493506494e-05,
"loss": 2865.5645,
"step": 100
},
{
"epoch": 0.17886178861788618,
"grad_norm": 6532.34765625,
"learning_rate": 7.142857142857143e-05,
"loss": 2196.2996,
"step": 110
},
{
"epoch": 0.1951219512195122,
"grad_norm": 4513.40087890625,
"learning_rate": 7.792207792207793e-05,
"loss": 1111.7941,
"step": 120
},
{
"epoch": 0.21138211382113822,
"grad_norm": 4285.0390625,
"learning_rate": 8.441558441558442e-05,
"loss": 945.8594,
"step": 130
},
{
"epoch": 0.22764227642276422,
"grad_norm": 2602.03369140625,
"learning_rate": 9.090909090909092e-05,
"loss": 593.7912,
"step": 140
},
{
"epoch": 0.24390243902439024,
"grad_norm": 3598.093017578125,
"learning_rate": 9.74025974025974e-05,
"loss": 355.9361,
"step": 150
},
{
"epoch": 0.2601626016260163,
"grad_norm": 40596.5078125,
"learning_rate": 0.00010389610389610389,
"loss": 263.4985,
"step": 160
},
{
"epoch": 0.2764227642276423,
"grad_norm": 3959.72265625,
"learning_rate": 0.0001103896103896104,
"loss": 200.0281,
"step": 170
},
{
"epoch": 0.2926829268292683,
"grad_norm": 277750.1875,
"learning_rate": 0.00011688311688311689,
"loss": 176.2805,
"step": 180
},
{
"epoch": 0.3089430894308943,
"grad_norm": 114203.1015625,
"learning_rate": 0.0001233766233766234,
"loss": 192.7327,
"step": 190
},
{
"epoch": 0.3252032520325203,
"grad_norm": 2108.614013671875,
"learning_rate": 0.00012987012987012987,
"loss": 154.6821,
"step": 200
},
{
"epoch": 0.34146341463414637,
"grad_norm": 6306.46484375,
"learning_rate": 0.00013636363636363637,
"loss": 94.0863,
"step": 210
},
{
"epoch": 0.35772357723577236,
"grad_norm": 2991.08935546875,
"learning_rate": 0.00014285714285714287,
"loss": 120.9749,
"step": 220
},
{
"epoch": 0.37398373983739835,
"grad_norm": 17456.123046875,
"learning_rate": 0.00014935064935064934,
"loss": 150.8287,
"step": 230
},
{
"epoch": 0.3902439024390244,
"grad_norm": 3997.399658203125,
"learning_rate": 0.00015584415584415587,
"loss": 127.9284,
"step": 240
},
{
"epoch": 0.4065040650406504,
"grad_norm": 3142.8544921875,
"learning_rate": 0.00016233766233766234,
"loss": 99.4487,
"step": 250
},
{
"epoch": 0.42276422764227645,
"grad_norm": 4303.7421875,
"learning_rate": 0.00016883116883116884,
"loss": 111.1226,
"step": 260
},
{
"epoch": 0.43902439024390244,
"grad_norm": 9494.5283203125,
"learning_rate": 0.00017532467532467534,
"loss": 148.5725,
"step": 270
},
{
"epoch": 0.45528455284552843,
"grad_norm": 12805.1005859375,
"learning_rate": 0.00018181818181818183,
"loss": 89.0703,
"step": 280
},
{
"epoch": 0.4715447154471545,
"grad_norm": 5651.734375,
"learning_rate": 0.00018831168831168833,
"loss": 113.0061,
"step": 290
},
{
"epoch": 0.4878048780487805,
"grad_norm": 3500.915283203125,
"learning_rate": 0.0001948051948051948,
"loss": 98.2204,
"step": 300
},
{
"epoch": 0.5040650406504065,
"grad_norm": 76347.09375,
"learning_rate": 0.00019999994218268405,
"loss": 99.3199,
"step": 310
},
{
"epoch": 0.5203252032520326,
"grad_norm": 5863.58642578125,
"learning_rate": 0.00019999791858364572,
"loss": 145.732,
"step": 320
},
{
"epoch": 0.5365853658536586,
"grad_norm": 6211.58203125,
"learning_rate": 0.00019999300418566636,
"loss": 99.9764,
"step": 330
},
{
"epoch": 0.5528455284552846,
"grad_norm": 1611.767333984375,
"learning_rate": 0.00019998519913081423,
"loss": 130.0497,
"step": 340
},
{
"epoch": 0.5691056910569106,
"grad_norm": 6738.830078125,
"learning_rate": 0.0001999745036447225,
"loss": 132.5203,
"step": 350
},
{
"epoch": 0.5853658536585366,
"grad_norm": 1651.7647705078125,
"learning_rate": 0.00019996091803658263,
"loss": 93.679,
"step": 360
},
{
"epoch": 0.6016260162601627,
"grad_norm": 1451.311279296875,
"learning_rate": 0.00019994444269913535,
"loss": 130.961,
"step": 370
},
{
"epoch": 0.6178861788617886,
"grad_norm": 3547.41015625,
"learning_rate": 0.00019992507810865954,
"loss": 89.0317,
"step": 380
},
{
"epoch": 0.6341463414634146,
"grad_norm": 3523.5322265625,
"learning_rate": 0.00019990282482495816,
"loss": 92.9305,
"step": 390
},
{
"epoch": 0.6504065040650406,
"grad_norm": 5402.7509765625,
"learning_rate": 0.00019987768349134227,
"loss": 124.9789,
"step": 400
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2866.8330078125,
"learning_rate": 0.0001998496548346125,
"loss": 85.8321,
"step": 410
},
{
"epoch": 0.6829268292682927,
"grad_norm": 2670.57275390625,
"learning_rate": 0.00019981873966503773,
"loss": 143.1263,
"step": 420
},
{
"epoch": 0.6991869918699187,
"grad_norm": 3444.505126953125,
"learning_rate": 0.000199784938876332,
"loss": 117.812,
"step": 430
},
{
"epoch": 0.7154471544715447,
"grad_norm": 1545.001708984375,
"learning_rate": 0.0001997482534456285,
"loss": 100.9372,
"step": 440
},
{
"epoch": 0.7317073170731707,
"grad_norm": 839.1185913085938,
"learning_rate": 0.00019970868443345134,
"loss": 92.1672,
"step": 450
},
{
"epoch": 0.7479674796747967,
"grad_norm": 17015.447265625,
"learning_rate": 0.0001996662329836849,
"loss": 96.7714,
"step": 460
},
{
"epoch": 0.7642276422764228,
"grad_norm": 3150.693359375,
"learning_rate": 0.0001996209003235408,
"loss": 90.8617,
"step": 470
},
{
"epoch": 0.7804878048780488,
"grad_norm": 519.7723999023438,
"learning_rate": 0.00019957268776352234,
"loss": 113.3078,
"step": 480
},
{
"epoch": 0.7967479674796748,
"grad_norm": 4185.5126953125,
"learning_rate": 0.00019952159669738674,
"loss": 105.7553,
"step": 490
},
{
"epoch": 0.8130081300813008,
"grad_norm": 2410.567138671875,
"learning_rate": 0.00019946762860210471,
"loss": 78.1075,
"step": 500
},
{
"epoch": 0.8292682926829268,
"grad_norm": 2119.968505859375,
"learning_rate": 0.00019941078503781792,
"loss": 83.322,
"step": 510
},
{
"epoch": 0.8455284552845529,
"grad_norm": 10644.458984375,
"learning_rate": 0.00019935106764779365,
"loss": 79.2555,
"step": 520
},
{
"epoch": 0.8617886178861789,
"grad_norm": 3944.619873046875,
"learning_rate": 0.00019928847815837758,
"loss": 103.8101,
"step": 530
},
{
"epoch": 0.8780487804878049,
"grad_norm": 1694.9683837890625,
"learning_rate": 0.00019922301837894358,
"loss": 96.7458,
"step": 540
},
{
"epoch": 0.8943089430894309,
"grad_norm": 3317.81640625,
"learning_rate": 0.0001991546902018417,
"loss": 160.2423,
"step": 550
},
{
"epoch": 0.9105691056910569,
"grad_norm": 7013.18359375,
"learning_rate": 0.0001990834956023433,
"loss": 122.6204,
"step": 560
},
{
"epoch": 0.926829268292683,
"grad_norm": 3094.7744140625,
"learning_rate": 0.00019900943663858387,
"loss": 96.8247,
"step": 570
},
{
"epoch": 0.943089430894309,
"grad_norm": 6648.25048828125,
"learning_rate": 0.0001989325154515038,
"loss": 116.6589,
"step": 580
},
{
"epoch": 0.959349593495935,
"grad_norm": 15371.361328125,
"learning_rate": 0.0001988527342647862,
"loss": 88.9712,
"step": 590
},
{
"epoch": 0.975609756097561,
"grad_norm": 2130.667724609375,
"learning_rate": 0.00019877009538479275,
"loss": 75.6254,
"step": 600
},
{
"epoch": 0.991869918699187,
"grad_norm": 3430.82763671875,
"learning_rate": 0.00019868460120049704,
"loss": 118.3028,
"step": 610
},
{
"epoch": 1.008130081300813,
"grad_norm": 1396.5372314453125,
"learning_rate": 0.00019859625418341557,
"loss": 78.8569,
"step": 620
},
{
"epoch": 1.024390243902439,
"grad_norm": 7597.01904296875,
"learning_rate": 0.00019850505688753602,
"loss": 100.3299,
"step": 630
},
{
"epoch": 1.040650406504065,
"grad_norm": 2552.638916015625,
"learning_rate": 0.0001984110119492438,
"loss": 73.0117,
"step": 640
},
{
"epoch": 1.056910569105691,
"grad_norm": 1387.00439453125,
"learning_rate": 0.00019831412208724556,
"loss": 107.2604,
"step": 650
},
{
"epoch": 1.0731707317073171,
"grad_norm": 1579.257080078125,
"learning_rate": 0.0001982143901024907,
"loss": 64.988,
"step": 660
},
{
"epoch": 1.089430894308943,
"grad_norm": 1369.64501953125,
"learning_rate": 0.0001981118188780904,
"loss": 110.6651,
"step": 670
},
{
"epoch": 1.1056910569105691,
"grad_norm": 3883.478271484375,
"learning_rate": 0.00019800641137923423,
"loss": 110.6604,
"step": 680
},
{
"epoch": 1.1219512195121952,
"grad_norm": 2725.116943359375,
"learning_rate": 0.00019789817065310448,
"loss": 97.7683,
"step": 690
},
{
"epoch": 1.1382113821138211,
"grad_norm": 2270.0986328125,
"learning_rate": 0.00019778709982878805,
"loss": 133.6088,
"step": 700
},
{
"epoch": 1.1544715447154472,
"grad_norm": 3066.498046875,
"learning_rate": 0.000197673202117186,
"loss": 83.8171,
"step": 710
},
{
"epoch": 1.170731707317073,
"grad_norm": 5128.125,
"learning_rate": 0.00019755648081092066,
"loss": 169.6488,
"step": 720
},
{
"epoch": 1.1869918699186992,
"grad_norm": 1368.7137451171875,
"learning_rate": 0.00019743693928424058,
"loss": 78.2656,
"step": 730
},
{
"epoch": 1.203252032520325,
"grad_norm": 3027.226318359375,
"learning_rate": 0.00019731458099292288,
"loss": 132.4441,
"step": 740
},
{
"epoch": 1.2195121951219512,
"grad_norm": 7759.80810546875,
"learning_rate": 0.00019718940947417336,
"loss": 130.1133,
"step": 750
},
{
"epoch": 1.2357723577235773,
"grad_norm": 1686.7059326171875,
"learning_rate": 0.00019706142834652427,
"loss": 111.4778,
"step": 760
},
{
"epoch": 1.2520325203252032,
"grad_norm": 9301.548828125,
"learning_rate": 0.00019693064130972974,
"loss": 88.9655,
"step": 770
},
{
"epoch": 1.2682926829268293,
"grad_norm": 1258.872802734375,
"learning_rate": 0.0001967970521446587,
"loss": 69.8348,
"step": 780
},
{
"epoch": 1.2845528455284554,
"grad_norm": 1352.4385986328125,
"learning_rate": 0.00019666066471318568,
"loss": 77.0263,
"step": 790
},
{
"epoch": 1.3008130081300813,
"grad_norm": 855.2029418945312,
"learning_rate": 0.00019652148295807922,
"loss": 85.511,
"step": 800
},
{
"epoch": 1.3170731707317074,
"grad_norm": 1946.88330078125,
"learning_rate": 0.00019637951090288778,
"loss": 59.645,
"step": 810
},
{
"epoch": 1.3333333333333333,
"grad_norm": 2297.47216796875,
"learning_rate": 0.00019623475265182337,
"loss": 67.3651,
"step": 820
},
{
"epoch": 1.3495934959349594,
"grad_norm": 11286.927734375,
"learning_rate": 0.00019608721238964318,
"loss": 128.2699,
"step": 830
},
{
"epoch": 1.3658536585365852,
"grad_norm": 2499.033447265625,
"learning_rate": 0.00019593689438152827,
"loss": 69.4611,
"step": 840
},
{
"epoch": 1.3821138211382114,
"grad_norm": 10106.341796875,
"learning_rate": 0.0001957838029729605,
"loss": 93.8524,
"step": 850
},
{
"epoch": 1.3983739837398375,
"grad_norm": 2966.48779296875,
"learning_rate": 0.00019562794258959674,
"loss": 108.8285,
"step": 860
},
{
"epoch": 1.4146341463414633,
"grad_norm": 7656.32275390625,
"learning_rate": 0.00019546931773714116,
"loss": 70.237,
"step": 870
},
{
"epoch": 1.4308943089430894,
"grad_norm": 4307.1708984375,
"learning_rate": 0.00019530793300121473,
"loss": 125.8694,
"step": 880
},
{
"epoch": 1.4471544715447155,
"grad_norm": 2789.88916015625,
"learning_rate": 0.0001951437930472228,
"loss": 108.8423,
"step": 890
},
{
"epoch": 1.4634146341463414,
"grad_norm": 5194.333984375,
"learning_rate": 0.00019497690262022018,
"loss": 162.3557,
"step": 900
},
{
"epoch": 1.4796747967479675,
"grad_norm": 2407.015380859375,
"learning_rate": 0.00019480726654477398,
"loss": 98.5685,
"step": 910
},
{
"epoch": 1.4959349593495934,
"grad_norm": 7854.9638671875,
"learning_rate": 0.00019463488972482418,
"loss": 60.0693,
"step": 920
},
{
"epoch": 1.5121951219512195,
"grad_norm": 1800.740478515625,
"learning_rate": 0.00019445977714354173,
"loss": 60.3849,
"step": 930
},
{
"epoch": 1.5284552845528454,
"grad_norm": 2736.665283203125,
"learning_rate": 0.00019428193386318468,
"loss": 66.8596,
"step": 940
},
{
"epoch": 1.5447154471544715,
"grad_norm": 15203.984375,
"learning_rate": 0.0001941013650249517,
"loss": 95.6272,
"step": 950
},
{
"epoch": 1.5609756097560976,
"grad_norm": 3157.21337890625,
"learning_rate": 0.0001939180758488335,
"loss": 71.3239,
"step": 960
},
{
"epoch": 1.5772357723577235,
"grad_norm": 4594.89501953125,
"learning_rate": 0.00019373207163346192,
"loss": 82.758,
"step": 970
},
{
"epoch": 1.5934959349593496,
"grad_norm": 2293.903564453125,
"learning_rate": 0.0001935433577559568,
"loss": 67.9693,
"step": 980
},
{
"epoch": 1.6097560975609757,
"grad_norm": 2138.247802734375,
"learning_rate": 0.0001933519396717704,
"loss": 75.4409,
"step": 990
},
{
"epoch": 1.6260162601626016,
"grad_norm": 781.8675537109375,
"learning_rate": 0.0001931578229145299,
"loss": 77.4897,
"step": 1000
},
{
"epoch": 1.6422764227642277,
"grad_norm": 2182.60107421875,
"learning_rate": 0.00019296101309587726,
"loss": 54.7864,
"step": 1010
},
{
"epoch": 1.6585365853658538,
"grad_norm": 26183.85546875,
"learning_rate": 0.00019276151590530703,
"loss": 89.1371,
"step": 1020
},
{
"epoch": 1.6747967479674797,
"grad_norm": 1233.78857421875,
"learning_rate": 0.000192559337110002,
"loss": 51.9562,
"step": 1030
},
{
"epoch": 1.6910569105691056,
"grad_norm": 4076.354248046875,
"learning_rate": 0.00019235448255466617,
"loss": 77.1311,
"step": 1040
},
{
"epoch": 1.7073170731707317,
"grad_norm": 1355.48095703125,
"learning_rate": 0.0001921469581613562,
"loss": 70.7184,
"step": 1050
},
{
"epoch": 1.7235772357723578,
"grad_norm": 4424.4345703125,
"learning_rate": 0.00019193676992930992,
"loss": 82.3314,
"step": 1060
},
{
"epoch": 1.7398373983739837,
"grad_norm": 38555.8359375,
"learning_rate": 0.00019172392393477296,
"loss": 78.6395,
"step": 1070
},
{
"epoch": 1.7560975609756098,
"grad_norm": 8763.5234375,
"learning_rate": 0.0001915084263308232,
"loss": 110.0452,
"step": 1080
},
{
"epoch": 1.7723577235772359,
"grad_norm": 3243.281005859375,
"learning_rate": 0.0001912902833471927,
"loss": 121.6475,
"step": 1090
},
{
"epoch": 1.7886178861788617,
"grad_norm": 9277.51953125,
"learning_rate": 0.0001910695012900878,
"loss": 113.4883,
"step": 1100
},
{
"epoch": 1.8048780487804879,
"grad_norm": 1323.01904296875,
"learning_rate": 0.0001908460865420067,
"loss": 82.9752,
"step": 1110
},
{
"epoch": 1.821138211382114,
"grad_norm": 1779.7681884765625,
"learning_rate": 0.00019062004556155506,
"loss": 89.6342,
"step": 1120
},
{
"epoch": 1.8373983739837398,
"grad_norm": 4294.21044921875,
"learning_rate": 0.00019039138488325912,
"loss": 95.4384,
"step": 1130
},
{
"epoch": 1.8536585365853657,
"grad_norm": 1751.0389404296875,
"learning_rate": 0.0001901601111173769,
"loss": 94.7895,
"step": 1140
},
{
"epoch": 1.8699186991869918,
"grad_norm": 1074.5364990234375,
"learning_rate": 0.00018992623094970718,
"loss": 52.8511,
"step": 1150
},
{
"epoch": 1.886178861788618,
"grad_norm": 1331.533935546875,
"learning_rate": 0.0001896897511413961,
"loss": 94.5019,
"step": 1160
},
{
"epoch": 1.9024390243902438,
"grad_norm": 3847.0712890625,
"learning_rate": 0.0001894506785287417,
"loss": 74.2541,
"step": 1170
},
{
"epoch": 1.91869918699187,
"grad_norm": 2032.809326171875,
"learning_rate": 0.00018920902002299644,
"loss": 139.9438,
"step": 1180
},
{
"epoch": 1.934959349593496,
"grad_norm": 2137.700439453125,
"learning_rate": 0.00018896478261016725,
"loss": 111.8997,
"step": 1190
},
{
"epoch": 1.951219512195122,
"grad_norm": 2548.987548828125,
"learning_rate": 0.0001887179733508136,
"loss": 76.8431,
"step": 1200
},
{
"epoch": 1.967479674796748,
"grad_norm": 1745.5999755859375,
"learning_rate": 0.00018846859937984346,
"loss": 67.4039,
"step": 1210
},
{
"epoch": 1.9837398373983741,
"grad_norm": 2157.826904296875,
"learning_rate": 0.000188216667906307,
"loss": 103.4683,
"step": 1220
},
{
"epoch": 2.0,
"grad_norm": 1239.8826904296875,
"learning_rate": 0.00018796218621318822,
"loss": 98.9879,
"step": 1230
},
{
"epoch": 2.016260162601626,
"grad_norm": 1551.313720703125,
"learning_rate": 0.00018770516165719423,
"loss": 58.3172,
"step": 1240
},
{
"epoch": 2.032520325203252,
"grad_norm": 6539.54736328125,
"learning_rate": 0.00018744560166854296,
"loss": 72.3266,
"step": 1250
},
{
"epoch": 2.048780487804878,
"grad_norm": 920.8938598632812,
"learning_rate": 0.00018718351375074786,
"loss": 71.1883,
"step": 1260
},
{
"epoch": 2.065040650406504,
"grad_norm": 2663.365234375,
"learning_rate": 0.00018691890548040146,
"loss": 100.6873,
"step": 1270
},
{
"epoch": 2.08130081300813,
"grad_norm": 5801.7314453125,
"learning_rate": 0.00018665178450695606,
"loss": 51.0893,
"step": 1280
},
{
"epoch": 2.097560975609756,
"grad_norm": 1768.61083984375,
"learning_rate": 0.00018638215855250263,
"loss": 46.9602,
"step": 1290
},
{
"epoch": 2.113821138211382,
"grad_norm": 74955.7421875,
"learning_rate": 0.00018611003541154766,
"loss": 69.618,
"step": 1300
},
{
"epoch": 2.130081300813008,
"grad_norm": 16715.201171875,
"learning_rate": 0.00018583542295078775,
"loss": 76.9604,
"step": 1310
},
{
"epoch": 2.1463414634146343,
"grad_norm": 490.9708557128906,
"learning_rate": 0.0001855583291088822,
"loss": 61.4616,
"step": 1320
},
{
"epoch": 2.16260162601626,
"grad_norm": 2168.93896484375,
"learning_rate": 0.00018527876189622372,
"loss": 69.4417,
"step": 1330
},
{
"epoch": 2.178861788617886,
"grad_norm": 1728.7271728515625,
"learning_rate": 0.00018499672939470646,
"loss": 41.3895,
"step": 1340
},
{
"epoch": 2.1951219512195124,
"grad_norm": 13797.6259765625,
"learning_rate": 0.00018471223975749266,
"loss": 86.5364,
"step": 1350
},
{
"epoch": 2.2113821138211383,
"grad_norm": 1238.1878662109375,
"learning_rate": 0.000184425301208777,
"loss": 60.4841,
"step": 1360
},
{
"epoch": 2.227642276422764,
"grad_norm": 1721.18505859375,
"learning_rate": 0.00018413592204354857,
"loss": 63.7924,
"step": 1370
},
{
"epoch": 2.2439024390243905,
"grad_norm": 1503.65234375,
"learning_rate": 0.00018384411062735142,
"loss": 72.9356,
"step": 1380
},
{
"epoch": 2.2601626016260163,
"grad_norm": 2268.296630859375,
"learning_rate": 0.00018354987539604244,
"loss": 64.837,
"step": 1390
},
{
"epoch": 2.2764227642276422,
"grad_norm": 770.135986328125,
"learning_rate": 0.0001832532248555476,
"loss": 46.5948,
"step": 1400
},
{
"epoch": 2.292682926829268,
"grad_norm": 2809.25146484375,
"learning_rate": 0.00018295416758161607,
"loss": 72.0357,
"step": 1410
},
{
"epoch": 2.3089430894308944,
"grad_norm": 5348.63330078125,
"learning_rate": 0.00018265271221957235,
"loss": 64.2022,
"step": 1420
},
{
"epoch": 2.3252032520325203,
"grad_norm": 1522.7744140625,
"learning_rate": 0.00018234886748406623,
"loss": 87.9972,
"step": 1430
},
{
"epoch": 2.341463414634146,
"grad_norm": 2168.956298828125,
"learning_rate": 0.00018204264215882093,
"loss": 77.3112,
"step": 1440
},
{
"epoch": 2.3577235772357725,
"grad_norm": 1201.1219482421875,
"learning_rate": 0.00018173404509637912,
"loss": 77.9051,
"step": 1450
},
{
"epoch": 2.3739837398373984,
"grad_norm": 1378.28515625,
"learning_rate": 0.00018142308521784716,
"loss": 113.3623,
"step": 1460
},
{
"epoch": 2.3902439024390243,
"grad_norm": 3121.484375,
"learning_rate": 0.00018110977151263702,
"loss": 68.1337,
"step": 1470
},
{
"epoch": 2.40650406504065,
"grad_norm": 4526.3203125,
"learning_rate": 0.00018079411303820647,
"loss": 76.719,
"step": 1480
},
{
"epoch": 2.4227642276422765,
"grad_norm": 1512.4857177734375,
"learning_rate": 0.00018047611891979732,
"loss": 53.3857,
"step": 1490
},
{
"epoch": 2.4390243902439024,
"grad_norm": 775.2145385742188,
"learning_rate": 0.00018015579835017147,
"loss": 59.4552,
"step": 1500
},
{
"epoch": 2.4552845528455283,
"grad_norm": 1543.6497802734375,
"learning_rate": 0.00017983316058934533,
"loss": 79.715,
"step": 1510
},
{
"epoch": 2.4715447154471546,
"grad_norm": 3052.429931640625,
"learning_rate": 0.00017950821496432202,
"loss": 68.2702,
"step": 1520
},
{
"epoch": 2.4878048780487805,
"grad_norm": 1861.0294189453125,
"learning_rate": 0.00017918097086882167,
"loss": 70.8437,
"step": 1530
},
{
"epoch": 2.5040650406504064,
"grad_norm": 1316.6455078125,
"learning_rate": 0.00017885143776301017,
"loss": 48.8773,
"step": 1540
},
{
"epoch": 2.5203252032520327,
"grad_norm": 1434.713623046875,
"learning_rate": 0.0001785196251732252,
"loss": 50.5964,
"step": 1550
},
{
"epoch": 2.5365853658536586,
"grad_norm": 2314.07373046875,
"learning_rate": 0.0001781855426917013,
"loss": 49.6357,
"step": 1560
},
{
"epoch": 2.5528455284552845,
"grad_norm": 27705.951171875,
"learning_rate": 0.00017784919997629236,
"loss": 60.1384,
"step": 1570
},
{
"epoch": 2.569105691056911,
"grad_norm": 100750.6953125,
"learning_rate": 0.00017751060675019235,
"loss": 78.1081,
"step": 1580
},
{
"epoch": 2.5853658536585367,
"grad_norm": 5099.37548828125,
"learning_rate": 0.00017716977280165445,
"loss": 107.401,
"step": 1590
},
{
"epoch": 2.6016260162601625,
"grad_norm": 16017.0224609375,
"learning_rate": 0.00017682670798370792,
"loss": 109.425,
"step": 1600
},
{
"epoch": 2.617886178861789,
"grad_norm": 1565.2376708984375,
"learning_rate": 0.00017648142221387325,
"loss": 66.7137,
"step": 1610
},
{
"epoch": 2.6341463414634148,
"grad_norm": 1883.359619140625,
"learning_rate": 0.00017613392547387565,
"loss": 63.5428,
"step": 1620
},
{
"epoch": 2.6504065040650406,
"grad_norm": 4678.5400390625,
"learning_rate": 0.00017578422780935624,
"loss": 62.324,
"step": 1630
},
{
"epoch": 2.6666666666666665,
"grad_norm": 1467.29150390625,
"learning_rate": 0.00017543233932958185,
"loss": 42.7399,
"step": 1640
},
{
"epoch": 2.682926829268293,
"grad_norm": 17443.28125,
"learning_rate": 0.00017507827020715267,
"loss": 76.8691,
"step": 1650
},
{
"epoch": 2.6991869918699187,
"grad_norm": 1430.255615234375,
"learning_rate": 0.00017472203067770816,
"loss": 45.8614,
"step": 1660
},
{
"epoch": 2.7154471544715446,
"grad_norm": 973.7998657226562,
"learning_rate": 0.0001743636310396312,
"loss": 36.7464,
"step": 1670
},
{
"epoch": 2.7317073170731705,
"grad_norm": 2293.49658203125,
"learning_rate": 0.00017400308165375043,
"loss": 104.4038,
"step": 1680
},
{
"epoch": 2.747967479674797,
"grad_norm": 1044.43115234375,
"learning_rate": 0.00017364039294304063,
"loss": 61.9649,
"step": 1690
},
{
"epoch": 2.7642276422764227,
"grad_norm": 2085.281982421875,
"learning_rate": 0.00017327557539232138,
"loss": 51.97,
"step": 1700
},
{
"epoch": 2.7804878048780486,
"grad_norm": 1864.0758056640625,
"learning_rate": 0.00017290863954795414,
"loss": 56.1968,
"step": 1710
},
{
"epoch": 2.796747967479675,
"grad_norm": 5055.72216796875,
"learning_rate": 0.00017253959601753715,
"loss": 49.4941,
"step": 1720
},
{
"epoch": 2.813008130081301,
"grad_norm": 2442.3779296875,
"learning_rate": 0.00017216845546959904,
"loss": 85.7186,
"step": 1730
},
{
"epoch": 2.8292682926829267,
"grad_norm": 1286.6806640625,
"learning_rate": 0.00017179522863329004,
"loss": 57.1273,
"step": 1740
},
{
"epoch": 2.845528455284553,
"grad_norm": 1548.7122802734375,
"learning_rate": 0.0001714199262980722,
"loss": 50.7149,
"step": 1750
},
{
"epoch": 2.861788617886179,
"grad_norm": 1237.375732421875,
"learning_rate": 0.00017104255931340732,
"loss": 80.6716,
"step": 1760
},
{
"epoch": 2.8780487804878048,
"grad_norm": 271203.3125,
"learning_rate": 0.00017066313858844317,
"loss": 79.4793,
"step": 1770
},
{
"epoch": 2.894308943089431,
"grad_norm": 2990.47998046875,
"learning_rate": 0.00017028167509169846,
"loss": 63.7313,
"step": 1780
},
{
"epoch": 2.910569105691057,
"grad_norm": 2197.031494140625,
"learning_rate": 0.00016989817985074533,
"loss": 66.6744,
"step": 1790
},
{
"epoch": 2.926829268292683,
"grad_norm": 2398.322509765625,
"learning_rate": 0.00016951266395189097,
"loss": 119.2331,
"step": 1800
},
{
"epoch": 2.943089430894309,
"grad_norm": 1132.4508056640625,
"learning_rate": 0.00016912513853985686,
"loss": 66.5857,
"step": 1810
},
{
"epoch": 2.959349593495935,
"grad_norm": 1172.097412109375,
"learning_rate": 0.00016873561481745667,
"loss": 69.8449,
"step": 1820
},
{
"epoch": 2.975609756097561,
"grad_norm": 1260.872314453125,
"learning_rate": 0.0001683441040452724,
"loss": 65.4089,
"step": 1830
},
{
"epoch": 2.991869918699187,
"grad_norm": 3771.443603515625,
"learning_rate": 0.00016795061754132896,
"loss": 59.9783,
"step": 1840
},
{
"epoch": 3.008130081300813,
"grad_norm": 44377.31640625,
"learning_rate": 0.00016755516668076674,
"loss": 77.3272,
"step": 1850
},
{
"epoch": 3.024390243902439,
"grad_norm": 1505.83984375,
"learning_rate": 0.00016715776289551296,
"loss": 53.3784,
"step": 1860
},
{
"epoch": 3.040650406504065,
"grad_norm": 615.7579956054688,
"learning_rate": 0.0001667584176739512,
"loss": 50.9411,
"step": 1870
},
{
"epoch": 3.0569105691056913,
"grad_norm": 38362.62890625,
"learning_rate": 0.00016635714256058915,
"loss": 118.019,
"step": 1880
},
{
"epoch": 3.073170731707317,
"grad_norm": 1028.602783203125,
"learning_rate": 0.00016595394915572506,
"loss": 69.6284,
"step": 1890
},
{
"epoch": 3.089430894308943,
"grad_norm": 5944.29248046875,
"learning_rate": 0.00016554884911511213,
"loss": 64.6018,
"step": 1900
},
{
"epoch": 3.105691056910569,
"grad_norm": 2787.141845703125,
"learning_rate": 0.00016514185414962182,
"loss": 68.6644,
"step": 1910
},
{
"epoch": 3.1219512195121952,
"grad_norm": 2354.9130859375,
"learning_rate": 0.0001647329760249052,
"loss": 81.7822,
"step": 1920
},
{
"epoch": 3.138211382113821,
"grad_norm": 2922.60009765625,
"learning_rate": 0.00016432222656105277,
"loss": 113.863,
"step": 1930
},
{
"epoch": 3.154471544715447,
"grad_norm": 4188.85107421875,
"learning_rate": 0.0001639096176322528,
"loss": 79.855,
"step": 1940
},
{
"epoch": 3.1707317073170733,
"grad_norm": 1911.2069091796875,
"learning_rate": 0.0001634951611664482,
"loss": 69.1627,
"step": 1950
},
{
"epoch": 3.186991869918699,
"grad_norm": 1192.2657470703125,
"learning_rate": 0.0001630788691449914,
"loss": 55.1678,
"step": 1960
},
{
"epoch": 3.203252032520325,
"grad_norm": 10476.7724609375,
"learning_rate": 0.00016266075360229823,
"loss": 88.3594,
"step": 1970
},
{
"epoch": 3.2195121951219514,
"grad_norm": 746.9041748046875,
"learning_rate": 0.00016224082662550003,
"loss": 109.0398,
"step": 1980
},
{
"epoch": 3.2357723577235773,
"grad_norm": 2032.73779296875,
"learning_rate": 0.000161819100354094,
"loss": 44.7227,
"step": 1990
},
{
"epoch": 3.252032520325203,
"grad_norm": 1000.6553955078125,
"learning_rate": 0.0001613955869795925,
"loss": 73.6318,
"step": 2000
},
{
"epoch": 3.2682926829268295,
"grad_norm": 877.0646362304688,
"learning_rate": 0.00016097029874517053,
"loss": 65.1961,
"step": 2010
},
{
"epoch": 3.2845528455284554,
"grad_norm": 20667.6640625,
"learning_rate": 0.0001605432479453117,
"loss": 131.7637,
"step": 2020
},
{
"epoch": 3.3008130081300813,
"grad_norm": 6932.1630859375,
"learning_rate": 0.0001601144469254531,
"loss": 63.2276,
"step": 2030
},
{
"epoch": 3.317073170731707,
"grad_norm": 2701.05029296875,
"learning_rate": 0.00015968390808162797,
"loss": 93.1463,
"step": 2040
},
{
"epoch": 3.3333333333333335,
"grad_norm": 2700.706298828125,
"learning_rate": 0.0001592516438601077,
"loss": 63.6073,
"step": 2050
},
{
"epoch": 3.3495934959349594,
"grad_norm": 9397.724609375,
"learning_rate": 0.00015881766675704203,
"loss": 74.2051,
"step": 2060
},
{
"epoch": 3.3658536585365852,
"grad_norm": 919.5447998046875,
"learning_rate": 0.00015838198931809747,
"loss": 55.599,
"step": 2070
},
{
"epoch": 3.3821138211382116,
"grad_norm": 4705.94287109375,
"learning_rate": 0.00015794462413809503,
"loss": 54.821,
"step": 2080
},
{
"epoch": 3.3983739837398375,
"grad_norm": 80140.5,
"learning_rate": 0.00015750558386064584,
"loss": 132.3792,
"step": 2090
},
{
"epoch": 3.4146341463414633,
"grad_norm": 17313.400390625,
"learning_rate": 0.0001570648811777858,
"loss": 73.4562,
"step": 2100
},
{
"epoch": 3.430894308943089,
"grad_norm": 62464.19140625,
"learning_rate": 0.00015662252882960855,
"loss": 123.1144,
"step": 2110
},
{
"epoch": 3.4471544715447155,
"grad_norm": 10362.189453125,
"learning_rate": 0.00015617853960389724,
"loss": 60.7324,
"step": 2120
},
{
"epoch": 3.4634146341463414,
"grad_norm": 8119.03662109375,
"learning_rate": 0.00015573292633575488,
"loss": 47.9465,
"step": 2130
},
{
"epoch": 3.4796747967479673,
"grad_norm": 65353.2890625,
"learning_rate": 0.00015528570190723325,
"loss": 38.784,
"step": 2140
},
{
"epoch": 3.4959349593495936,
"grad_norm": 946.7526245117188,
"learning_rate": 0.00015483687924696047,
"loss": 45.439,
"step": 2150
},
{
"epoch": 3.5121951219512195,
"grad_norm": 8941.34375,
"learning_rate": 0.0001543864713297673,
"loss": 62.3894,
"step": 2160
},
{
"epoch": 3.5284552845528454,
"grad_norm": 169778.421875,
"learning_rate": 0.00015393449117631205,
"loss": 71.317,
"step": 2170
},
{
"epoch": 3.5447154471544717,
"grad_norm": 1309.4539794921875,
"learning_rate": 0.0001534809518527042,
"loss": 59.1676,
"step": 2180
},
{
"epoch": 3.5609756097560976,
"grad_norm": 159682.328125,
"learning_rate": 0.0001530258664701266,
"loss": 74.9109,
"step": 2190
},
{
"epoch": 3.5772357723577235,
"grad_norm": 5231.26611328125,
"learning_rate": 0.00015256924818445652,
"loss": 50.8158,
"step": 2200
},
{
"epoch": 3.59349593495935,
"grad_norm": 840.7651977539062,
"learning_rate": 0.0001521111101958852,
"loss": 53.3685,
"step": 2210
},
{
"epoch": 3.6097560975609757,
"grad_norm": 1039.3839111328125,
"learning_rate": 0.00015165146574853651,
"loss": 51.3367,
"step": 2220
},
{
"epoch": 3.6260162601626016,
"grad_norm": 2042.122802734375,
"learning_rate": 0.00015119032813008384,
"loss": 63.4835,
"step": 2230
},
{
"epoch": 3.642276422764228,
"grad_norm": 1014.0968017578125,
"learning_rate": 0.00015072771067136602,
"loss": 121.3831,
"step": 2240
},
{
"epoch": 3.658536585365854,
"grad_norm": 2085.046875,
"learning_rate": 0.00015026362674600197,
"loss": 86.4089,
"step": 2250
},
{
"epoch": 3.6747967479674797,
"grad_norm": 1501.3868408203125,
"learning_rate": 0.00014979808977000423,
"loss": 87.4238,
"step": 2260
},
{
"epoch": 3.6910569105691056,
"grad_norm": 3143.670166015625,
"learning_rate": 0.0001493311132013908,
"loss": 47.4117,
"step": 2270
},
{
"epoch": 3.7073170731707314,
"grad_norm": 3601.27197265625,
"learning_rate": 0.00014886271053979642,
"loss": 47.0386,
"step": 2280
},
{
"epoch": 3.7235772357723578,
"grad_norm": 1050.021484375,
"learning_rate": 0.00014839289532608208,
"loss": 50.3757,
"step": 2290
},
{
"epoch": 3.7398373983739837,
"grad_norm": 1158.14453125,
"learning_rate": 0.0001479216811419437,
"loss": 53.1059,
"step": 2300
},
{
"epoch": 3.7560975609756095,
"grad_norm": 1679.3118896484375,
"learning_rate": 0.00014744908160951948,
"loss": 81.2242,
"step": 2310
},
{
"epoch": 3.772357723577236,
"grad_norm": 1483.0025634765625,
"learning_rate": 0.00014697511039099602,
"loss": 65.0123,
"step": 2320
},
{
"epoch": 3.7886178861788617,
"grad_norm": 1206.0103759765625,
"learning_rate": 0.00014649978118821356,
"loss": 112.2168,
"step": 2330
},
{
"epoch": 3.8048780487804876,
"grad_norm": 6336.48828125,
"learning_rate": 0.00014602310774226957,
"loss": 98.5093,
"step": 2340
},
{
"epoch": 3.821138211382114,
"grad_norm": 659.5859985351562,
"learning_rate": 0.00014554510383312189,
"loss": 65.6266,
"step": 2350
},
{
"epoch": 3.83739837398374,
"grad_norm": 1136.7991943359375,
"learning_rate": 0.00014506578327919,
"loss": 51.189,
"step": 2360
},
{
"epoch": 3.8536585365853657,
"grad_norm": 6465.4130859375,
"learning_rate": 0.00014458515993695585,
"loss": 69.188,
"step": 2370
},
{
"epoch": 3.869918699186992,
"grad_norm": 5106.58642578125,
"learning_rate": 0.00014410324770056313,
"loss": 96.6794,
"step": 2380
},
{
"epoch": 3.886178861788618,
"grad_norm": 3519.845703125,
"learning_rate": 0.00014362006050141563,
"loss": 55.2195,
"step": 2390
},
{
"epoch": 3.902439024390244,
"grad_norm": 20824.455078125,
"learning_rate": 0.00014313561230777452,
"loss": 47.6591,
"step": 2400
},
{
"epoch": 3.91869918699187,
"grad_norm": 2973.600830078125,
"learning_rate": 0.00014264991712435452,
"loss": 66.8287,
"step": 2410
},
{
"epoch": 3.934959349593496,
"grad_norm": 1502.51025390625,
"learning_rate": 0.00014216298899191916,
"loss": 47.0916,
"step": 2420
},
{
"epoch": 3.951219512195122,
"grad_norm": 13010.16796875,
"learning_rate": 0.0001416748419868747,
"loss": 61.0954,
"step": 2430
},
{
"epoch": 3.9674796747967482,
"grad_norm": 953.6785278320312,
"learning_rate": 0.0001411854902208633,
"loss": 47.334,
"step": 2440
},
{
"epoch": 3.983739837398374,
"grad_norm": 2903.397216796875,
"learning_rate": 0.00014069494784035505,
"loss": 67.0245,
"step": 2450
},
{
"epoch": 4.0,
"grad_norm": 1550.0595703125,
"learning_rate": 0.0001402032290262391,
"loss": 51.0681,
"step": 2460
},
{
"epoch": 4.016260162601626,
"grad_norm": 58333.4921875,
"learning_rate": 0.00013971034799341355,
"loss": 62.1808,
"step": 2470
},
{
"epoch": 4.032520325203252,
"grad_norm": 1227.8946533203125,
"learning_rate": 0.0001392163189903747,
"loss": 72.5005,
"step": 2480
},
{
"epoch": 4.048780487804878,
"grad_norm": 2188.923828125,
"learning_rate": 0.00013872115629880497,
"loss": 47.0166,
"step": 2490
},
{
"epoch": 4.065040650406504,
"grad_norm": 1214.519775390625,
"learning_rate": 0.0001382248742331602,
"loss": 40.6225,
"step": 2500
},
{
"epoch": 4.08130081300813,
"grad_norm": 952.546875,
"learning_rate": 0.0001377274871402556,
"loss": 43.3264,
"step": 2510
},
{
"epoch": 4.097560975609756,
"grad_norm": 753.4329833984375,
"learning_rate": 0.00013722900939885132,
"loss": 51.3909,
"step": 2520
},
{
"epoch": 4.1138211382113825,
"grad_norm": 1024.9317626953125,
"learning_rate": 0.0001367294554192366,
"loss": 42.0499,
"step": 2530
},
{
"epoch": 4.130081300813008,
"grad_norm": 546.87841796875,
"learning_rate": 0.00013622883964281316,
"loss": 36.1083,
"step": 2540
},
{
"epoch": 4.146341463414634,
"grad_norm": 893.5374755859375,
"learning_rate": 0.00013572717654167777,
"loss": 39.7196,
"step": 2550
},
{
"epoch": 4.16260162601626,
"grad_norm": 1298.6865234375,
"learning_rate": 0.00013522448061820393,
"loss": 43.8941,
"step": 2560
},
{
"epoch": 4.178861788617886,
"grad_norm": 1751.4395751953125,
"learning_rate": 0.00013472076640462248,
"loss": 48.5067,
"step": 2570
},
{
"epoch": 4.195121951219512,
"grad_norm": 4070.478759765625,
"learning_rate": 0.00013421604846260173,
"loss": 69.5999,
"step": 2580
},
{
"epoch": 4.211382113821138,
"grad_norm": 1715.4664306640625,
"learning_rate": 0.0001337103413828263,
"loss": 55.5755,
"step": 2590
},
{
"epoch": 4.227642276422764,
"grad_norm": 1144.9033203125,
"learning_rate": 0.00013320365978457534,
"loss": 44.6062,
"step": 2600
},
{
"epoch": 4.2439024390243905,
"grad_norm": 1374.0616455078125,
"learning_rate": 0.00013269601831530003,
"loss": 100.0019,
"step": 2610
},
{
"epoch": 4.260162601626016,
"grad_norm": 649.107666015625,
"learning_rate": 0.0001321874316502,
"loss": 45.9766,
"step": 2620
},
{
"epoch": 4.276422764227642,
"grad_norm": 1265.823486328125,
"learning_rate": 0.00013167791449179928,
"loss": 36.6327,
"step": 2630
},
{
"epoch": 4.2926829268292686,
"grad_norm": 1065.16943359375,
"learning_rate": 0.00013116748156952098,
"loss": 36.6221,
"step": 2640
},
{
"epoch": 4.308943089430894,
"grad_norm": 7990.9853515625,
"learning_rate": 0.00013065614763926184,
"loss": 47.2748,
"step": 2650
},
{
"epoch": 4.32520325203252,
"grad_norm": 3891.1884765625,
"learning_rate": 0.00013014392748296528,
"loss": 60.2811,
"step": 2660
},
{
"epoch": 4.341463414634147,
"grad_norm": 1250.55859375,
"learning_rate": 0.00012963083590819443,
"loss": 59.3533,
"step": 2670
},
{
"epoch": 4.357723577235772,
"grad_norm": 452.96368408203125,
"learning_rate": 0.00012911688774770377,
"loss": 39.7551,
"step": 2680
},
{
"epoch": 4.373983739837398,
"grad_norm": 1382.8927001953125,
"learning_rate": 0.0001286020978590106,
"loss": 56.9612,
"step": 2690
},
{
"epoch": 4.390243902439025,
"grad_norm": 2779.33642578125,
"learning_rate": 0.0001280864811239652,
"loss": 76.6694,
"step": 2700
},
{
"epoch": 4.40650406504065,
"grad_norm": 1720.7236328125,
"learning_rate": 0.00012757005244832113,
"loss": 54.5705,
"step": 2710
},
{
"epoch": 4.4227642276422765,
"grad_norm": 530.7537231445312,
"learning_rate": 0.00012705282676130368,
"loss": 43.2596,
"step": 2720
},
{
"epoch": 4.439024390243903,
"grad_norm": 1741.5948486328125,
"learning_rate": 0.00012653481901517876,
"loss": 44.5357,
"step": 2730
},
{
"epoch": 4.455284552845528,
"grad_norm": 545.766357421875,
"learning_rate": 0.00012601604418482052,
"loss": 64.0609,
"step": 2740
},
{
"epoch": 4.471544715447155,
"grad_norm": 760.1073608398438,
"learning_rate": 0.00012549651726727841,
"loss": 33.9295,
"step": 2750
},
{
"epoch": 4.487804878048781,
"grad_norm": 3076.673583984375,
"learning_rate": 0.0001249762532813437,
"loss": 53.2542,
"step": 2760
},
{
"epoch": 4.504065040650406,
"grad_norm": 613.498779296875,
"learning_rate": 0.0001244552672671152,
"loss": 42.9754,
"step": 2770
},
{
"epoch": 4.520325203252033,
"grad_norm": 633.474365234375,
"learning_rate": 0.0001239335742855645,
"loss": 79.9076,
"step": 2780
},
{
"epoch": 4.536585365853659,
"grad_norm": 534.7109375,
"learning_rate": 0.00012341118941810086,
"loss": 56.3449,
"step": 2790
},
{
"epoch": 4.5528455284552845,
"grad_norm": 988.2083740234375,
"learning_rate": 0.00012288812776613467,
"loss": 60.076,
"step": 2800
},
{
"epoch": 4.569105691056911,
"grad_norm": 987.4862670898438,
"learning_rate": 0.00012236440445064146,
"loss": 44.6687,
"step": 2810
},
{
"epoch": 4.585365853658536,
"grad_norm": 1020.8764038085938,
"learning_rate": 0.00012184003461172437,
"loss": 54.9522,
"step": 2820
},
{
"epoch": 4.6016260162601625,
"grad_norm": 861.468505859375,
"learning_rate": 0.00012131503340817663,
"loss": 72.5806,
"step": 2830
},
{
"epoch": 4.617886178861789,
"grad_norm": 1153.2725830078125,
"learning_rate": 0.00012078941601704343,
"loss": 44.8851,
"step": 2840
},
{
"epoch": 4.634146341463414,
"grad_norm": 7982.6865234375,
"learning_rate": 0.00012026319763318301,
"loss": 49.9482,
"step": 2850
},
{
"epoch": 4.650406504065041,
"grad_norm": 1476.1536865234375,
"learning_rate": 0.00011973639346882746,
"loss": 47.223,
"step": 2860
},
{
"epoch": 4.666666666666667,
"grad_norm": 1169.1434326171875,
"learning_rate": 0.00011920901875314295,
"loss": 51.8643,
"step": 2870
},
{
"epoch": 4.682926829268292,
"grad_norm": 1330.784912109375,
"learning_rate": 0.00011868108873178949,
"loss": 43.6427,
"step": 2880
},
{
"epoch": 4.699186991869919,
"grad_norm": 631.0576171875,
"learning_rate": 0.00011815261866648026,
"loss": 56.523,
"step": 2890
},
{
"epoch": 4.715447154471545,
"grad_norm": 1804.2171630859375,
"learning_rate": 0.00011762362383454024,
"loss": 49.6038,
"step": 2900
},
{
"epoch": 4.7317073170731705,
"grad_norm": 2007.8486328125,
"learning_rate": 0.00011709411952846479,
"loss": 56.3543,
"step": 2910
},
{
"epoch": 4.747967479674797,
"grad_norm": 1846.902099609375,
"learning_rate": 0.00011656412105547733,
"loss": 40.9638,
"step": 2920
},
{
"epoch": 4.764227642276423,
"grad_norm": 854.6354370117188,
"learning_rate": 0.00011603364373708702,
"loss": 47.7196,
"step": 2930
},
{
"epoch": 4.780487804878049,
"grad_norm": 2663.093017578125,
"learning_rate": 0.00011550270290864582,
"loss": 88.7795,
"step": 2940
},
{
"epoch": 4.796747967479675,
"grad_norm": 2370.38720703125,
"learning_rate": 0.00011497131391890498,
"loss": 65.2372,
"step": 2950
},
{
"epoch": 4.8130081300813,
"grad_norm": 1494.7568359375,
"learning_rate": 0.00011443949212957154,
"loss": 68.4685,
"step": 2960
},
{
"epoch": 4.829268292682927,
"grad_norm": 1287.447021484375,
"learning_rate": 0.00011390725291486419,
"loss": 51.913,
"step": 2970
},
{
"epoch": 4.845528455284553,
"grad_norm": 1271.5274658203125,
"learning_rate": 0.00011337461166106871,
"loss": 53.7021,
"step": 2980
},
{
"epoch": 4.861788617886178,
"grad_norm": 1231.7939453125,
"learning_rate": 0.00011284158376609333,
"loss": 31.6516,
"step": 2990
},
{
"epoch": 4.878048780487805,
"grad_norm": 1916.57421875,
"learning_rate": 0.00011230818463902358,
"loss": 69.1733,
"step": 3000
},
{
"epoch": 4.894308943089431,
"grad_norm": 2691.4208984375,
"learning_rate": 0.00011177442969967668,
"loss": 55.0878,
"step": 3010
},
{
"epoch": 4.9105691056910565,
"grad_norm": 1314.462646484375,
"learning_rate": 0.00011124033437815593,
"loss": 40.0013,
"step": 3020
},
{
"epoch": 4.926829268292683,
"grad_norm": 1857.048095703125,
"learning_rate": 0.00011070591411440459,
"loss": 46.5445,
"step": 3030
},
{
"epoch": 4.943089430894309,
"grad_norm": 1580.3558349609375,
"learning_rate": 0.00011017118435775957,
"loss": 38.4451,
"step": 3040
},
{
"epoch": 4.959349593495935,
"grad_norm": 1501.5589599609375,
"learning_rate": 0.00010963616056650476,
"loss": 34.3078,
"step": 3050
},
{
"epoch": 4.975609756097561,
"grad_norm": 3925.81591796875,
"learning_rate": 0.00010910085820742419,
"loss": 58.2388,
"step": 3060
},
{
"epoch": 4.991869918699187,
"grad_norm": 828.7344360351562,
"learning_rate": 0.00010856529275535487,
"loss": 77.3652,
"step": 3070
},
{
"epoch": 5.008130081300813,
"grad_norm": 850.0521240234375,
"learning_rate": 0.00010802947969273946,
"loss": 32.5409,
"step": 3080
},
{
"epoch": 5.024390243902439,
"grad_norm": 315.0628967285156,
"learning_rate": 0.00010749343450917873,
"loss": 49.1381,
"step": 3090
},
{
"epoch": 5.040650406504065,
"grad_norm": 805.5790405273438,
"learning_rate": 0.0001069571727009837,
"loss": 44.4946,
"step": 3100
},
{
"epoch": 5.056910569105691,
"grad_norm": 2954.944091796875,
"learning_rate": 0.0001064207097707277,
"loss": 56.0899,
"step": 3110
},
{
"epoch": 5.073170731707317,
"grad_norm": 1296.76025390625,
"learning_rate": 0.00010588406122679825,
"loss": 32.3572,
"step": 3120
},
{
"epoch": 5.0894308943089435,
"grad_norm": 682.7062377929688,
"learning_rate": 0.00010534724258294868,
"loss": 41.241,
"step": 3130
},
{
"epoch": 5.105691056910569,
"grad_norm": 586.6185302734375,
"learning_rate": 0.00010481026935784967,
"loss": 46.9862,
"step": 3140
},
{
"epoch": 5.121951219512195,
"grad_norm": 494.31768798828125,
"learning_rate": 0.0001042731570746406,
"loss": 39.867,
"step": 3150
},
{
"epoch": 5.138211382113822,
"grad_norm": 1095.9088134765625,
"learning_rate": 0.00010373592126048093,
"loss": 33.0041,
"step": 3160
},
{
"epoch": 5.154471544715447,
"grad_norm": 1172.2149658203125,
"learning_rate": 0.00010319857744610106,
"loss": 84.7379,
"step": 3170
},
{
"epoch": 5.170731707317073,
"grad_norm": 7211.0283203125,
"learning_rate": 0.00010266114116535362,
"loss": 48.8282,
"step": 3180
},
{
"epoch": 5.186991869918699,
"grad_norm": 1418.6943359375,
"learning_rate": 0.00010212362795476432,
"loss": 46.3707,
"step": 3190
},
{
"epoch": 5.203252032520325,
"grad_norm": 3661.55126953125,
"learning_rate": 0.0001015860533530828,
"loss": 93.9867,
"step": 3200
},
{
"epoch": 5.219512195121951,
"grad_norm": 1076.226806640625,
"learning_rate": 0.00010104843290083341,
"loss": 68.2097,
"step": 3210
},
{
"epoch": 5.235772357723577,
"grad_norm": 4902.42138671875,
"learning_rate": 0.00010051078213986597,
"loss": 36.9465,
"step": 3220
},
{
"epoch": 5.252032520325203,
"grad_norm": 2610.93212890625,
"learning_rate": 9.997311661290648e-05,
"loss": 56.646,
"step": 3230
},
{
"epoch": 5.2682926829268295,
"grad_norm": 3272.592529296875,
"learning_rate": 9.943545186310787e-05,
"loss": 42.065,
"step": 3240
},
{
"epoch": 5.284552845528455,
"grad_norm": 1224.6219482421875,
"learning_rate": 9.889780343360049e-05,
"loss": 60.0324,
"step": 3250
},
{
"epoch": 5.300813008130081,
"grad_norm": 1191.6717529296875,
"learning_rate": 9.836018686704298e-05,
"loss": 49.1736,
"step": 3260
},
{
"epoch": 5.317073170731708,
"grad_norm": 1531.7381591796875,
"learning_rate": 9.782261770517289e-05,
"loss": 29.3415,
"step": 3270
},
{
"epoch": 5.333333333333333,
"grad_norm": 1613.154296875,
"learning_rate": 9.72851114883572e-05,
"loss": 71.2164,
"step": 3280
},
{
"epoch": 5.349593495934959,
"grad_norm": 1089.3868408203125,
"learning_rate": 9.674768375514347e-05,
"loss": 41.1068,
"step": 3290
},
{
"epoch": 5.365853658536586,
"grad_norm": 425.6622314453125,
"learning_rate": 9.621035004181022e-05,
"loss": 29.7313,
"step": 3300
},
{
"epoch": 5.382113821138211,
"grad_norm": 4809.2626953125,
"learning_rate": 9.56731258819181e-05,
"loss": 59.21,
"step": 3310
},
{
"epoch": 5.3983739837398375,
"grad_norm": 768.4491577148438,
"learning_rate": 9.51360268058607e-05,
"loss": 65.3515,
"step": 3320
},
{
"epoch": 5.414634146341464,
"grad_norm": 1334.3365478515625,
"learning_rate": 9.459906834041558e-05,
"loss": 44.464,
"step": 3330
},
{
"epoch": 5.430894308943089,
"grad_norm": 1523.654296875,
"learning_rate": 9.406226600829545e-05,
"loss": 61.8839,
"step": 3340
},
{
"epoch": 5.4471544715447155,
"grad_norm": 1562.5716552734375,
"learning_rate": 9.352563532769949e-05,
"loss": 51.7122,
"step": 3350
},
{
"epoch": 5.463414634146342,
"grad_norm": 1880.090087890625,
"learning_rate": 9.298919181186458e-05,
"loss": 41.961,
"step": 3360
},
{
"epoch": 5.479674796747967,
"grad_norm": 1722.7073974609375,
"learning_rate": 9.245295096861698e-05,
"loss": 46.5965,
"step": 3370
},
{
"epoch": 5.495934959349594,
"grad_norm": 925.80126953125,
"learning_rate": 9.191692829992401e-05,
"loss": 48.4384,
"step": 3380
},
{
"epoch": 5.512195121951219,
"grad_norm": 1489.31982421875,
"learning_rate": 9.138113930144578e-05,
"loss": 59.3866,
"step": 3390
},
{
"epoch": 5.528455284552845,
"grad_norm": 707.712890625,
"learning_rate": 9.084559946208739e-05,
"loss": 42.5858,
"step": 3400
},
{
"epoch": 5.544715447154472,
"grad_norm": 2299.88720703125,
"learning_rate": 9.031032426355106e-05,
"loss": 36.6626,
"step": 3410
},
{
"epoch": 5.560975609756097,
"grad_norm": 4950.97998046875,
"learning_rate": 8.977532917988871e-05,
"loss": 37.762,
"step": 3420
},
{
"epoch": 5.5772357723577235,
"grad_norm": 891.8377075195312,
"learning_rate": 8.924062967705443e-05,
"loss": 50.5158,
"step": 3430
},
{
"epoch": 5.59349593495935,
"grad_norm": 996.9815673828125,
"learning_rate": 8.870624121245748e-05,
"loss": 56.7966,
"step": 3440
},
{
"epoch": 5.609756097560975,
"grad_norm": 814.5260009765625,
"learning_rate": 8.817217923451554e-05,
"loss": 61.8741,
"step": 3450
},
{
"epoch": 5.626016260162602,
"grad_norm": 1282.3272705078125,
"learning_rate": 8.763845918220793e-05,
"loss": 28.1619,
"step": 3460
},
{
"epoch": 5.642276422764228,
"grad_norm": 1114.01513671875,
"learning_rate": 8.71050964846294e-05,
"loss": 34.5723,
"step": 3470
},
{
"epoch": 5.658536585365853,
"grad_norm": 768.8634033203125,
"learning_rate": 8.657210656054413e-05,
"loss": 40.1524,
"step": 3480
},
{
"epoch": 5.67479674796748,
"grad_norm": 640.5523681640625,
"learning_rate": 8.60395048179399e-05,
"loss": 59.3767,
"step": 3490
},
{
"epoch": 5.691056910569106,
"grad_norm": 976.6678466796875,
"learning_rate": 8.550730665358266e-05,
"loss": 46.2076,
"step": 3500
},
{
"epoch": 5.7073170731707314,
"grad_norm": 904.607666015625,
"learning_rate": 8.497552745257157e-05,
"loss": 44.8267,
"step": 3510
},
{
"epoch": 5.723577235772358,
"grad_norm": 18157.951171875,
"learning_rate": 8.444418258789418e-05,
"loss": 46.1126,
"step": 3520
},
{
"epoch": 5.739837398373984,
"grad_norm": 702.4590454101562,
"learning_rate": 8.391328741998187e-05,
"loss": 62.335,
"step": 3530
},
{
"epoch": 5.7560975609756095,
"grad_norm": 906.1786499023438,
"learning_rate": 8.338285729626595e-05,
"loss": 65.6418,
"step": 3540
},
{
"epoch": 5.772357723577236,
"grad_norm": 1011.940185546875,
"learning_rate": 8.285290755073405e-05,
"loss": 41.4294,
"step": 3550
},
{
"epoch": 5.788617886178862,
"grad_norm": 2783.18798828125,
"learning_rate": 8.23234535034866e-05,
"loss": 73.9544,
"step": 3560
},
{
"epoch": 5.804878048780488,
"grad_norm": 1077.9619140625,
"learning_rate": 8.179451046029424e-05,
"loss": 36.2339,
"step": 3570
},
{
"epoch": 5.821138211382114,
"grad_norm": 1024.14453125,
"learning_rate": 8.12660937121551e-05,
"loss": 40.021,
"step": 3580
},
{
"epoch": 5.83739837398374,
"grad_norm": 1014.1956787109375,
"learning_rate": 8.073821853485288e-05,
"loss": 73.2346,
"step": 3590
},
{
"epoch": 5.853658536585366,
"grad_norm": 869.21875,
"learning_rate": 8.021090018851526e-05,
"loss": 34.6341,
"step": 3600
},
{
"epoch": 5.869918699186992,
"grad_norm": 1306.168212890625,
"learning_rate": 7.968415391717271e-05,
"loss": 71.121,
"step": 3610
},
{
"epoch": 5.886178861788618,
"grad_norm": 1111.87890625,
"learning_rate": 7.915799494831775e-05,
"loss": 33.9404,
"step": 3620
},
{
"epoch": 5.902439024390244,
"grad_norm": 759.7614135742188,
"learning_rate": 7.863243849246494e-05,
"loss": 50.714,
"step": 3630
},
{
"epoch": 5.91869918699187,
"grad_norm": 5193.80419921875,
"learning_rate": 7.810749974271099e-05,
"loss": 59.9144,
"step": 3640
},
{
"epoch": 5.934959349593496,
"grad_norm": 1484.0467529296875,
"learning_rate": 7.758319387429553e-05,
"loss": 58.3316,
"step": 3650
},
{
"epoch": 5.951219512195122,
"grad_norm": 1309.0003662109375,
"learning_rate": 7.705953604416254e-05,
"loss": 48.9651,
"step": 3660
},
{
"epoch": 5.967479674796748,
"grad_norm": 754.5973510742188,
"learning_rate": 7.653654139052214e-05,
"loss": 29.4624,
"step": 3670
},
{
"epoch": 5.983739837398374,
"grad_norm": 637.7557983398438,
"learning_rate": 7.60142250324129e-05,
"loss": 43.2339,
"step": 3680
},
{
"epoch": 6.0,
"grad_norm": 1177.0924072265625,
"learning_rate": 7.549260206926486e-05,
"loss": 47.2867,
"step": 3690
},
{
"epoch": 6.016260162601626,
"grad_norm": 1924.6392822265625,
"learning_rate": 7.4971687580463e-05,
"loss": 38.3521,
"step": 3700
},
{
"epoch": 6.032520325203252,
"grad_norm": 916.7091674804688,
"learning_rate": 7.445149662491126e-05,
"loss": 49.7392,
"step": 3710
},
{
"epoch": 6.048780487804878,
"grad_norm": 967.6969604492188,
"learning_rate": 7.393204424059725e-05,
"loss": 38.2029,
"step": 3720
},
{
"epoch": 6.065040650406504,
"grad_norm": 840.0963745117188,
"learning_rate": 7.341334544415761e-05,
"loss": 77.827,
"step": 3730
},
{
"epoch": 6.08130081300813,
"grad_norm": 1400.66064453125,
"learning_rate": 7.289541523044376e-05,
"loss": 66.4577,
"step": 3740
},
{
"epoch": 6.097560975609756,
"grad_norm": 767.639892578125,
"learning_rate": 7.237826857208847e-05,
"loss": 30.1595,
"step": 3750
},
{
"epoch": 6.1138211382113825,
"grad_norm": 728.1867065429688,
"learning_rate": 7.186192041907298e-05,
"loss": 48.2639,
"step": 3760
},
{
"epoch": 6.130081300813008,
"grad_norm": 1045.18798828125,
"learning_rate": 7.134638569829499e-05,
"loss": 54.2319,
"step": 3770
},
{
"epoch": 6.146341463414634,
"grad_norm": 1185.36474609375,
"learning_rate": 7.083167931313692e-05,
"loss": 37.9882,
"step": 3780
},
{
"epoch": 6.16260162601626,
"grad_norm": 723.2171020507812,
"learning_rate": 7.031781614303519e-05,
"loss": 41.0285,
"step": 3790
},
{
"epoch": 6.178861788617886,
"grad_norm": 1335.1109619140625,
"learning_rate": 6.980481104305013e-05,
"loss": 33.8187,
"step": 3800
},
{
"epoch": 6.195121951219512,
"grad_norm": 651.626708984375,
"learning_rate": 6.929267884343634e-05,
"loss": 65.5501,
"step": 3810
},
{
"epoch": 6.211382113821138,
"grad_norm": 595.5252075195312,
"learning_rate": 6.87814343492142e-05,
"loss": 43.2794,
"step": 3820
},
{
"epoch": 6.227642276422764,
"grad_norm": 1277.5653076171875,
"learning_rate": 6.827109233974178e-05,
"loss": 42.5897,
"step": 3830
},
{
"epoch": 6.2439024390243905,
"grad_norm": 950.2879028320312,
"learning_rate": 6.776166756828759e-05,
"loss": 59.1106,
"step": 3840
},
{
"epoch": 6.260162601626016,
"grad_norm": 862.7484741210938,
"learning_rate": 6.7253174761604e-05,
"loss": 51.2283,
"step": 3850
},
{
"epoch": 6.276422764227642,
"grad_norm": 346.978759765625,
"learning_rate": 6.674562861950167e-05,
"loss": 22.1792,
"step": 3860
},
{
"epoch": 6.2926829268292686,
"grad_norm": 2020.3907470703125,
"learning_rate": 6.62390438144245e-05,
"loss": 34.9443,
"step": 3870
},
{
"epoch": 6.308943089430894,
"grad_norm": 1247.765869140625,
"learning_rate": 6.573343499102545e-05,
"loss": 89.5246,
"step": 3880
},
{
"epoch": 6.32520325203252,
"grad_norm": 1061.9462890625,
"learning_rate": 6.52288167657433e-05,
"loss": 57.1117,
"step": 3890
},
{
"epoch": 6.341463414634147,
"grad_norm": 740.0230712890625,
"learning_rate": 6.472520372637999e-05,
"loss": 41.9892,
"step": 3900
},
{
"epoch": 6.357723577235772,
"grad_norm": 437.2298583984375,
"learning_rate": 6.422261043167893e-05,
"loss": 41.5301,
"step": 3910
},
{
"epoch": 6.373983739837398,
"grad_norm": 707.180908203125,
"learning_rate": 6.372105141090417e-05,
"loss": 61.3545,
"step": 3920
},
{
"epoch": 6.390243902439025,
"grad_norm": 533.357177734375,
"learning_rate": 6.322054116342044e-05,
"loss": 40.3018,
"step": 3930
},
{
"epoch": 6.40650406504065,
"grad_norm": 423.275634765625,
"learning_rate": 6.272109415827379e-05,
"loss": 31.2483,
"step": 3940
},
{
"epoch": 6.4227642276422765,
"grad_norm": 535.2537231445312,
"learning_rate": 6.222272483377345e-05,
"loss": 61.084,
"step": 3950
},
{
"epoch": 6.439024390243903,
"grad_norm": 654.32470703125,
"learning_rate": 6.172544759707449e-05,
"loss": 69.6351,
"step": 3960
},
{
"epoch": 6.455284552845528,
"grad_norm": 827.914794921875,
"learning_rate": 6.122927682376119e-05,
"loss": 34.8883,
"step": 3970
},
{
"epoch": 6.471544715447155,
"grad_norm": 364.55615234375,
"learning_rate": 6.0734226857431554e-05,
"loss": 32.2486,
"step": 3980
},
{
"epoch": 6.487804878048781,
"grad_norm": 383.2949523925781,
"learning_rate": 6.0240312009282674e-05,
"loss": 27.0549,
"step": 3990
},
{
"epoch": 6.504065040650406,
"grad_norm": 666.8985595703125,
"learning_rate": 5.9747546557696924e-05,
"loss": 30.6733,
"step": 4000
},
{
"epoch": 6.520325203252033,
"grad_norm": 322.81890869140625,
"learning_rate": 5.925594474782925e-05,
"loss": 41.4183,
"step": 4010
},
{
"epoch": 6.536585365853659,
"grad_norm": 1725.4873046875,
"learning_rate": 5.876552079119536e-05,
"loss": 56.3451,
"step": 4020
},
{
"epoch": 6.5528455284552845,
"grad_norm": 417.5548095703125,
"learning_rate": 5.827628886526093e-05,
"loss": 46.2162,
"step": 4030
},
{
"epoch": 6.569105691056911,
"grad_norm": 626.910400390625,
"learning_rate": 5.778826311303169e-05,
"loss": 29.055,
"step": 4040
},
{
"epoch": 6.585365853658536,
"grad_norm": 661.1826171875,
"learning_rate": 5.730145764264448e-05,
"loss": 27.6717,
"step": 4050
},
{
"epoch": 6.6016260162601625,
"grad_norm": 595.2796020507812,
"learning_rate": 5.681588652695966e-05,
"loss": 50.871,
"step": 4060
},
{
"epoch": 6.617886178861789,
"grad_norm": 1768.0650634765625,
"learning_rate": 5.6331563803154086e-05,
"loss": 31.054,
"step": 4070
},
{
"epoch": 6.634146341463414,
"grad_norm": 1227.727783203125,
"learning_rate": 5.584850347231528e-05,
"loss": 36.9891,
"step": 4080
},
{
"epoch": 6.650406504065041,
"grad_norm": 1646.6304931640625,
"learning_rate": 5.536671949903689e-05,
"loss": 33.9344,
"step": 4090
},
{
"epoch": 6.666666666666667,
"grad_norm": 1407.2939453125,
"learning_rate": 5.4886225811014814e-05,
"loss": 51.3101,
"step": 4100
},
{
"epoch": 6.682926829268292,
"grad_norm": 1124.4527587890625,
"learning_rate": 5.440703629864454e-05,
"loss": 49.1819,
"step": 4110
},
{
"epoch": 6.699186991869919,
"grad_norm": 689.7494506835938,
"learning_rate": 5.392916481461983e-05,
"loss": 36.6202,
"step": 4120
},
{
"epoch": 6.715447154471545,
"grad_norm": 714.1576538085938,
"learning_rate": 5.3452625173531964e-05,
"loss": 32.2473,
"step": 4130
},
{
"epoch": 6.7317073170731705,
"grad_norm": 479.4760437011719,
"learning_rate": 5.297743115147062e-05,
"loss": 35.0904,
"step": 4140
},
{
"epoch": 6.747967479674797,
"grad_norm": 362.479736328125,
"learning_rate": 5.250359648562551e-05,
"loss": 43.3301,
"step": 4150
},
{
"epoch": 6.764227642276423,
"grad_norm": 668.361572265625,
"learning_rate": 5.203113487388917e-05,
"loss": 50.1241,
"step": 4160
},
{
"epoch": 6.780487804878049,
"grad_norm": 1105.221923828125,
"learning_rate": 5.156005997446118e-05,
"loss": 36.7327,
"step": 4170
},
{
"epoch": 6.796747967479675,
"grad_norm": 528.5939331054688,
"learning_rate": 5.109038540545326e-05,
"loss": 45.8215,
"step": 4180
},
{
"epoch": 6.8130081300813,
"grad_norm": 635.588134765625,
"learning_rate": 5.062212474449537e-05,
"loss": 68.0413,
"step": 4190
},
{
"epoch": 6.829268292682927,
"grad_norm": 629.8543701171875,
"learning_rate": 5.0155291528343577e-05,
"loss": 89.9357,
"step": 4200
},
{
"epoch": 6.845528455284553,
"grad_norm": 511.0000915527344,
"learning_rate": 4.96898992524884e-05,
"loss": 39.3891,
"step": 4210
},
{
"epoch": 6.861788617886178,
"grad_norm": 331.4763488769531,
"learning_rate": 4.922596137076493e-05,
"loss": 32.5439,
"step": 4220
},
{
"epoch": 6.878048780487805,
"grad_norm": 433.0771484375,
"learning_rate": 4.876349129496355e-05,
"loss": 64.7455,
"step": 4230
},
{
"epoch": 6.894308943089431,
"grad_norm": 456.54644775390625,
"learning_rate": 4.830250239444276e-05,
"loss": 44.152,
"step": 4240
},
{
"epoch": 6.9105691056910565,
"grad_norm": 1340.421142578125,
"learning_rate": 4.7843007995742065e-05,
"loss": 30.8355,
"step": 4250
},
{
"epoch": 6.926829268292683,
"grad_norm": 1253.5787353515625,
"learning_rate": 4.7385021382197216e-05,
"loss": 48.8547,
"step": 4260
},
{
"epoch": 6.943089430894309,
"grad_norm": 735.3323974609375,
"learning_rate": 4.692855579355597e-05,
"loss": 29.7913,
"step": 4270
},
{
"epoch": 6.959349593495935,
"grad_norm": 485.3312072753906,
"learning_rate": 4.647362442559535e-05,
"loss": 45.8068,
"step": 4280
},
{
"epoch": 6.975609756097561,
"grad_norm": 1383.2845458984375,
"learning_rate": 4.602024042974027e-05,
"loss": 38.6388,
"step": 4290
},
{
"epoch": 6.991869918699187,
"grad_norm": 491.0514831542969,
"learning_rate": 4.556841691268333e-05,
"loss": 36.584,
"step": 4300
},
{
"epoch": 7.008130081300813,
"grad_norm": 417.0002746582031,
"learning_rate": 4.511816693600577e-05,
"loss": 39.8136,
"step": 4310
},
{
"epoch": 7.024390243902439,
"grad_norm": 731.73828125,
"learning_rate": 4.46695035158001e-05,
"loss": 32.1251,
"step": 4320
},
{
"epoch": 7.040650406504065,
"grad_norm": 649.9963989257812,
"learning_rate": 4.42224396222937e-05,
"loss": 24.8058,
"step": 4330
},
{
"epoch": 7.056910569105691,
"grad_norm": 497.6392517089844,
"learning_rate": 4.377698817947385e-05,
"loss": 37.5999,
"step": 4340
},
{
"epoch": 7.073170731707317,
"grad_norm": 1092.6939697265625,
"learning_rate": 4.333316206471418e-05,
"loss": 34.9651,
"step": 4350
},
{
"epoch": 7.0894308943089435,
"grad_norm": 252.49484252929688,
"learning_rate": 4.2890974108402425e-05,
"loss": 64.3354,
"step": 4360
},
{
"epoch": 7.105691056910569,
"grad_norm": 704.4669799804688,
"learning_rate": 4.2450437093569315e-05,
"loss": 66.6694,
"step": 4370
},
{
"epoch": 7.121951219512195,
"grad_norm": 1412.200927734375,
"learning_rate": 4.2011563755519326e-05,
"loss": 34.0108,
"step": 4380
},
{
"epoch": 7.138211382113822,
"grad_norm": 513.7908935546875,
"learning_rate": 4.157436678146238e-05,
"loss": 23.0915,
"step": 4390
},
{
"epoch": 7.154471544715447,
"grad_norm": 429.260986328125,
"learning_rate": 4.1138858810146965e-05,
"loss": 21.7249,
"step": 4400
},
{
"epoch": 7.170731707317073,
"grad_norm": 282.83160400390625,
"learning_rate": 4.0705052431494995e-05,
"loss": 35.1431,
"step": 4410
},
{
"epoch": 7.186991869918699,
"grad_norm": 189.756591796875,
"learning_rate": 4.027296018623772e-05,
"loss": 30.4934,
"step": 4420
},
{
"epoch": 7.203252032520325,
"grad_norm": 484.0589904785156,
"learning_rate": 3.9842594565553085e-05,
"loss": 25.1109,
"step": 4430
},
{
"epoch": 7.219512195121951,
"grad_norm": 707.24560546875,
"learning_rate": 3.9413968010704984e-05,
"loss": 49.4997,
"step": 4440
},
{
"epoch": 7.235772357723577,
"grad_norm": 321.16485595703125,
"learning_rate": 3.898709291268313e-05,
"loss": 50.0109,
"step": 4450
},
{
"epoch": 7.252032520325203,
"grad_norm": 468.12042236328125,
"learning_rate": 3.8561981611845246e-05,
"loss": 71.7242,
"step": 4460
},
{
"epoch": 7.2682926829268295,
"grad_norm": 628.5554809570312,
"learning_rate": 3.813864639756007e-05,
"loss": 31.7032,
"step": 4470
},
{
"epoch": 7.284552845528455,
"grad_norm": 597.160400390625,
"learning_rate": 3.771709950785228e-05,
"loss": 27.9663,
"step": 4480
},
{
"epoch": 7.300813008130081,
"grad_norm": 450.8225402832031,
"learning_rate": 3.7297353129048476e-05,
"loss": 21.0904,
"step": 4490
},
{
"epoch": 7.317073170731708,
"grad_norm": 615.4117431640625,
"learning_rate": 3.687941939542513e-05,
"loss": 32.9963,
"step": 4500
},
{
"epoch": 7.333333333333333,
"grad_norm": 751.5721435546875,
"learning_rate": 3.646331038885768e-05,
"loss": 33.0976,
"step": 4510
},
{
"epoch": 7.349593495934959,
"grad_norm": 13358.826171875,
"learning_rate": 3.6049038138471215e-05,
"loss": 48.3166,
"step": 4520
},
{
"epoch": 7.365853658536586,
"grad_norm": 5210.142578125,
"learning_rate": 3.5636614620292854e-05,
"loss": 42.6251,
"step": 4530
},
{
"epoch": 7.382113821138211,
"grad_norm": 1281.064453125,
"learning_rate": 3.522605175690544e-05,
"loss": 29.0492,
"step": 4540
},
{
"epoch": 7.3983739837398375,
"grad_norm": 357.83819580078125,
"learning_rate": 3.481736141710293e-05,
"loss": 35.3369,
"step": 4550
},
{
"epoch": 7.414634146341464,
"grad_norm": 173.05294799804688,
"learning_rate": 3.4410555415547306e-05,
"loss": 33.2367,
"step": 4560
},
{
"epoch": 7.430894308943089,
"grad_norm": 3365.111572265625,
"learning_rate": 3.4005645512426834e-05,
"loss": 29.4222,
"step": 4570
},
{
"epoch": 7.4471544715447155,
"grad_norm": 670.9901733398438,
"learning_rate": 3.3602643413116386e-05,
"loss": 44.8467,
"step": 4580
},
{
"epoch": 7.463414634146342,
"grad_norm": 454.53265380859375,
"learning_rate": 3.320156076783891e-05,
"loss": 32.9965,
"step": 4590
},
{
"epoch": 7.479674796747967,
"grad_norm": 1082.113525390625,
"learning_rate": 3.280240917132853e-05,
"loss": 37.7567,
"step": 4600
},
{
"epoch": 7.495934959349594,
"grad_norm": 21382.505859375,
"learning_rate": 3.2405200162495586e-05,
"loss": 27.9646,
"step": 4610
},
{
"epoch": 7.512195121951219,
"grad_norm": 391.889892578125,
"learning_rate": 3.200994522409293e-05,
"loss": 32.9818,
"step": 4620
},
{
"epoch": 7.528455284552845,
"grad_norm": 4713.3359375,
"learning_rate": 3.1616655782383864e-05,
"loss": 37.4087,
"step": 4630
},
{
"epoch": 7.544715447154472,
"grad_norm": 2711.176513671875,
"learning_rate": 3.122534320681214e-05,
"loss": 48.8535,
"step": 4640
},
{
"epoch": 7.560975609756097,
"grad_norm": 1700.7119140625,
"learning_rate": 3.083601880967302e-05,
"loss": 42.1752,
"step": 4650
},
{
"epoch": 7.5772357723577235,
"grad_norm": 420.5804443359375,
"learning_rate": 3.0448693845786246e-05,
"loss": 26.3437,
"step": 4660
},
{
"epoch": 7.59349593495935,
"grad_norm": 279.73455810546875,
"learning_rate": 3.0063379512170852e-05,
"loss": 26.54,
"step": 4670
},
{
"epoch": 7.609756097560975,
"grad_norm": 373.8387756347656,
"learning_rate": 2.968008694772141e-05,
"loss": 32.9037,
"step": 4680
},
{
"epoch": 7.626016260162602,
"grad_norm": 4132.44873046875,
"learning_rate": 2.9298827232885863e-05,
"loss": 30.5371,
"step": 4690
},
{
"epoch": 7.642276422764228,
"grad_norm": 448.18359375,
"learning_rate": 2.8919611389345447e-05,
"loss": 23.2553,
"step": 4700
},
{
"epoch": 7.658536585365853,
"grad_norm": 1203.708984375,
"learning_rate": 2.8542450379695973e-05,
"loss": 48.5284,
"step": 4710
},
{
"epoch": 7.67479674796748,
"grad_norm": 234.6784210205078,
"learning_rate": 2.8167355107130787e-05,
"loss": 63.0278,
"step": 4720
},
{
"epoch": 7.691056910569106,
"grad_norm": 475.01544189453125,
"learning_rate": 2.77943364151258e-05,
"loss": 26.5827,
"step": 4730
},
{
"epoch": 7.7073170731707314,
"grad_norm": 2622.9150390625,
"learning_rate": 2.7423405087125832e-05,
"loss": 37.8167,
"step": 4740
},
{
"epoch": 7.723577235772358,
"grad_norm": 2133.2802734375,
"learning_rate": 2.705457184623299e-05,
"loss": 45.3475,
"step": 4750
},
{
"epoch": 7.739837398373984,
"grad_norm": 467.1634216308594,
"learning_rate": 2.668784735489662e-05,
"loss": 38.3572,
"step": 4760
},
{
"epoch": 7.7560975609756095,
"grad_norm": 2866.9052734375,
"learning_rate": 2.632324221460515e-05,
"loss": 49.7959,
"step": 4770
},
{
"epoch": 7.772357723577236,
"grad_norm": 5320.82470703125,
"learning_rate": 2.5960766965579407e-05,
"loss": 27.4925,
"step": 4780
},
{
"epoch": 7.788617886178862,
"grad_norm": 12207.2236328125,
"learning_rate": 2.5600432086468207e-05,
"loss": 25.4184,
"step": 4790
},
{
"epoch": 7.804878048780488,
"grad_norm": 928.2150268554688,
"learning_rate": 2.5242247994045255e-05,
"loss": 38.9474,
"step": 4800
},
{
"epoch": 7.821138211382114,
"grad_norm": 666.2001342773438,
"learning_rate": 2.4886225042907973e-05,
"loss": 28.4315,
"step": 4810
},
{
"epoch": 7.83739837398374,
"grad_norm": 394.76727294921875,
"learning_rate": 2.453237352517831e-05,
"loss": 35.7413,
"step": 4820
},
{
"epoch": 7.853658536585366,
"grad_norm": 1564.347900390625,
"learning_rate": 2.4180703670205108e-05,
"loss": 49.657,
"step": 4830
},
{
"epoch": 7.869918699186992,
"grad_norm": 662.8395385742188,
"learning_rate": 2.3831225644268416e-05,
"loss": 23.6479,
"step": 4840
},
{
"epoch": 7.886178861788618,
"grad_norm": 448.2498474121094,
"learning_rate": 2.348394955028561e-05,
"loss": 30.4568,
"step": 4850
},
{
"epoch": 7.902439024390244,
"grad_norm": 738.3649291992188,
"learning_rate": 2.3138885427519262e-05,
"loss": 48.6049,
"step": 4860
},
{
"epoch": 7.91869918699187,
"grad_norm": 600.122314453125,
"learning_rate": 2.2796043251287002e-05,
"loss": 24.3334,
"step": 4870
},
{
"epoch": 7.934959349593496,
"grad_norm": 604.3839111328125,
"learning_rate": 2.2455432932673182e-05,
"loss": 48.3579,
"step": 4880
},
{
"epoch": 7.951219512195122,
"grad_norm": 854.1920166015625,
"learning_rate": 2.2117064318242154e-05,
"loss": 50.2401,
"step": 4890
},
{
"epoch": 7.967479674796748,
"grad_norm": 8056.27490234375,
"learning_rate": 2.1780947189753875e-05,
"loss": 41.4174,
"step": 4900
},
{
"epoch": 7.983739837398374,
"grad_norm": 788.5985717773438,
"learning_rate": 2.1447091263881014e-05,
"loss": 41.0822,
"step": 4910
},
{
"epoch": 8.0,
"grad_norm": 194.98179626464844,
"learning_rate": 2.111550619192797e-05,
"loss": 28.0501,
"step": 4920
},
{
"epoch": 8.016260162601625,
"grad_norm": 463.9582214355469,
"learning_rate": 2.0786201559552022e-05,
"loss": 38.9959,
"step": 4930
},
{
"epoch": 8.032520325203253,
"grad_norm": 361.2221374511719,
"learning_rate": 2.045918688648616e-05,
"loss": 37.643,
"step": 4940
},
{
"epoch": 8.048780487804878,
"grad_norm": 3094.411376953125,
"learning_rate": 2.013447162626384e-05,
"loss": 23.8148,
"step": 4950
},
{
"epoch": 8.065040650406504,
"grad_norm": 618.3005981445312,
"learning_rate": 1.981206516594576e-05,
"loss": 45.4684,
"step": 4960
},
{
"epoch": 8.08130081300813,
"grad_norm": 3658.843994140625,
"learning_rate": 1.949197682584848e-05,
"loss": 47.9616,
"step": 4970
},
{
"epoch": 8.097560975609756,
"grad_norm": 3654.126708984375,
"learning_rate": 1.9174215859274892e-05,
"loss": 39.6678,
"step": 4980
},
{
"epoch": 8.113821138211382,
"grad_norm": 3715.457763671875,
"learning_rate": 1.885879145224688e-05,
"loss": 28.395,
"step": 4990
},
{
"epoch": 8.130081300813009,
"grad_norm": 13629.64453125,
"learning_rate": 1.8545712723239682e-05,
"loss": 30.707,
"step": 5000
},
{
"epoch": 8.146341463414634,
"grad_norm": 1702.9984130859375,
"learning_rate": 1.823498872291821e-05,
"loss": 39.2062,
"step": 5010
},
{
"epoch": 8.16260162601626,
"grad_norm": 652.4723510742188,
"learning_rate": 1.792662843387557e-05,
"loss": 25.4401,
"step": 5020
},
{
"epoch": 8.178861788617887,
"grad_norm": 545.2056884765625,
"learning_rate": 1.7620640770373286e-05,
"loss": 65.776,
"step": 5030
},
{
"epoch": 8.195121951219512,
"grad_norm": 986.5762329101562,
"learning_rate": 1.7317034578083547e-05,
"loss": 27.4899,
"step": 5040
},
{
"epoch": 8.211382113821138,
"grad_norm": 471.08343505859375,
"learning_rate": 1.70158186338337e-05,
"loss": 35.4397,
"step": 5050
},
{
"epoch": 8.227642276422765,
"grad_norm": 284.622802734375,
"learning_rate": 1.6717001645352324e-05,
"loss": 22.5494,
"step": 5060
},
{
"epoch": 8.24390243902439,
"grad_norm": 22431.65625,
"learning_rate": 1.6420592251017487e-05,
"loss": 45.1601,
"step": 5070
},
{
"epoch": 8.260162601626016,
"grad_norm": 780.5162353515625,
"learning_rate": 1.6126599019607223e-05,
"loss": 33.0745,
"step": 5080
},
{
"epoch": 8.276422764227643,
"grad_norm": 961.0186767578125,
"learning_rate": 1.5835030450051656e-05,
"loss": 34.2111,
"step": 5090
},
{
"epoch": 8.292682926829269,
"grad_norm": 240.08079528808594,
"learning_rate": 1.5545894971187303e-05,
"loss": 25.9617,
"step": 5100
},
{
"epoch": 8.308943089430894,
"grad_norm": 2864.75,
"learning_rate": 1.525920094151353e-05,
"loss": 43.9031,
"step": 5110
},
{
"epoch": 8.32520325203252,
"grad_norm": 791.8621215820312,
"learning_rate": 1.4974956648950845e-05,
"loss": 37.113,
"step": 5120
},
{
"epoch": 8.341463414634147,
"grad_norm": 470.98736572265625,
"learning_rate": 1.4693170310601212e-05,
"loss": 34.8349,
"step": 5130
},
{
"epoch": 8.357723577235772,
"grad_norm": 840.1485595703125,
"learning_rate": 1.4413850072510704e-05,
"loss": 24.1196,
"step": 5140
},
{
"epoch": 8.373983739837398,
"grad_norm": 660.6499633789062,
"learning_rate": 1.4137004009433885e-05,
"loss": 20.1648,
"step": 5150
},
{
"epoch": 8.390243902439025,
"grad_norm": 1366.75390625,
"learning_rate": 1.386264012460039e-05,
"loss": 29.1244,
"step": 5160
},
{
"epoch": 8.40650406504065,
"grad_norm": 270.5916442871094,
"learning_rate": 1.3590766349483586e-05,
"loss": 36.4448,
"step": 5170
},
{
"epoch": 8.422764227642276,
"grad_norm": 439.3215637207031,
"learning_rate": 1.3321390543571266e-05,
"loss": 33.3136,
"step": 5180
},
{
"epoch": 8.439024390243903,
"grad_norm": 37061.68359375,
"learning_rate": 1.3054520494138445e-05,
"loss": 64.5556,
"step": 5190
},
{
"epoch": 8.455284552845528,
"grad_norm": 316.3396911621094,
"learning_rate": 1.2790163916022312e-05,
"loss": 27.1406,
"step": 5200
},
{
"epoch": 8.471544715447154,
"grad_norm": 2111.4130859375,
"learning_rate": 1.2528328451399041e-05,
"loss": 22.3547,
"step": 5210
},
{
"epoch": 8.487804878048781,
"grad_norm": 489.82464599609375,
"learning_rate": 1.2269021669563041e-05,
"loss": 20.5392,
"step": 5220
},
{
"epoch": 8.504065040650406,
"grad_norm": 1655.57275390625,
"learning_rate": 1.2012251066708035e-05,
"loss": 25.9037,
"step": 5230
},
{
"epoch": 8.520325203252032,
"grad_norm": 1041.8621826171875,
"learning_rate": 1.1758024065710404e-05,
"loss": 26.4345,
"step": 5240
},
{
"epoch": 8.536585365853659,
"grad_norm": 1299.66650390625,
"learning_rate": 1.150634801591457e-05,
"loss": 42.8872,
"step": 5250
},
{
"epoch": 8.552845528455284,
"grad_norm": 435.3826904296875,
"learning_rate": 1.1257230192920565e-05,
"loss": 42.8848,
"step": 5260
},
{
"epoch": 8.56910569105691,
"grad_norm": 726.2322998046875,
"learning_rate": 1.1010677798373625e-05,
"loss": 25.041,
"step": 5270
},
{
"epoch": 8.585365853658537,
"grad_norm": 3022.15625,
"learning_rate": 1.0766697959756166e-05,
"loss": 68.7748,
"step": 5280
},
{
"epoch": 8.601626016260163,
"grad_norm": 4241.69580078125,
"learning_rate": 1.0525297730181572e-05,
"loss": 74.2972,
"step": 5290
},
{
"epoch": 8.617886178861788,
"grad_norm": 961.3088989257812,
"learning_rate": 1.028648408819034e-05,
"loss": 24.1545,
"step": 5300
},
{
"epoch": 8.634146341463415,
"grad_norm": 949.1688842773438,
"learning_rate": 1.0050263937548433e-05,
"loss": 49.1739,
"step": 5310
},
{
"epoch": 8.65040650406504,
"grad_norm": 470.57708740234375,
"learning_rate": 9.816644107047613e-06,
"loss": 32.3933,
"step": 5320
},
{
"epoch": 8.666666666666666,
"grad_norm": 717.5396728515625,
"learning_rate": 9.585631350308e-06,
"loss": 32.7468,
"step": 5330
},
{
"epoch": 8.682926829268293,
"grad_norm": 575.5538330078125,
"learning_rate": 9.357232345582922e-06,
"loss": 37.3175,
"step": 5340
},
{
"epoch": 8.699186991869919,
"grad_norm": 371.1407775878906,
"learning_rate": 9.131453695565872e-06,
"loss": 48.2922,
"step": 5350
},
{
"epoch": 8.715447154471544,
"grad_norm": 1407.066650390625,
"learning_rate": 8.90830192719947e-06,
"loss": 34.3162,
"step": 5360
},
{
"epoch": 8.731707317073171,
"grad_norm": 2786.113525390625,
"learning_rate": 8.687783491486966e-06,
"loss": 51.1913,
"step": 5370
},
{
"epoch": 8.747967479674797,
"grad_norm": 407.6085510253906,
"learning_rate": 8.46990476330567e-06,
"loss": 27.1041,
"step": 5380
},
{
"epoch": 8.764227642276422,
"grad_norm": 317.9125671386719,
"learning_rate": 8.254672041222611e-06,
"loss": 57.7832,
"step": 5390
},
{
"epoch": 8.78048780487805,
"grad_norm": 200.4461669921875,
"learning_rate": 8.042091547312569e-06,
"loss": 24.9711,
"step": 5400
},
{
"epoch": 8.796747967479675,
"grad_norm": 25919.078125,
"learning_rate": 7.83216942697813e-06,
"loss": 30.2866,
"step": 5410
},
{
"epoch": 8.8130081300813,
"grad_norm": 9640.9111328125,
"learning_rate": 7.624911748772023e-06,
"loss": 46.633,
"step": 5420
},
{
"epoch": 8.829268292682928,
"grad_norm": 339.77239990234375,
"learning_rate": 7.420324504221721e-06,
"loss": 49.0615,
"step": 5430
},
{
"epoch": 8.845528455284553,
"grad_norm": 360.1629638671875,
"learning_rate": 7.218413607656227e-06,
"loss": 43.912,
"step": 5440
},
{
"epoch": 8.861788617886178,
"grad_norm": 357.3642578125,
"learning_rate": 7.019184896035103e-06,
"loss": 40.2426,
"step": 5450
},
{
"epoch": 8.878048780487806,
"grad_norm": 342.8908386230469,
"learning_rate": 6.822644128779721e-06,
"loss": 27.857,
"step": 5460
},
{
"epoch": 8.894308943089431,
"grad_norm": 1741.92333984375,
"learning_rate": 6.628796987606722e-06,
"loss": 22.8556,
"step": 5470
},
{
"epoch": 8.910569105691057,
"grad_norm": 817.4639282226562,
"learning_rate": 6.437649076363883e-06,
"loss": 25.4468,
"step": 5480
},
{
"epoch": 8.926829268292684,
"grad_norm": 418.2152404785156,
"learning_rate": 6.249205920868018e-06,
"loss": 30.6125,
"step": 5490
},
{
"epoch": 8.94308943089431,
"grad_norm": 345.6661071777344,
"learning_rate": 6.063472968745221e-06,
"loss": 24.8203,
"step": 5500
},
{
"epoch": 8.959349593495935,
"grad_norm": 311.8279113769531,
"learning_rate": 5.880455589273481e-06,
"loss": 28.5219,
"step": 5510
},
{
"epoch": 8.975609756097562,
"grad_norm": 398.0353698730469,
"learning_rate": 5.7001590732273955e-06,
"loss": 38.751,
"step": 5520
},
{
"epoch": 8.991869918699187,
"grad_norm": 4006.41796875,
"learning_rate": 5.522588632725245e-06,
"loss": 48.2014,
"step": 5530
},
{
"epoch": 9.008130081300813,
"grad_norm": 863.8807983398438,
"learning_rate": 5.34774940107825e-06,
"loss": 42.1497,
"step": 5540
},
{
"epoch": 9.024390243902438,
"grad_norm": 6790.38232421875,
"learning_rate": 5.175646432642278e-06,
"loss": 31.0566,
"step": 5550
},
{
"epoch": 9.040650406504065,
"grad_norm": 772.9898681640625,
"learning_rate": 5.006284702671693e-06,
"loss": 36.8164,
"step": 5560
},
{
"epoch": 9.05691056910569,
"grad_norm": 4930.9443359375,
"learning_rate": 4.839669107175493e-06,
"loss": 42.4926,
"step": 5570
},
{
"epoch": 9.073170731707316,
"grad_norm": 192.48233032226562,
"learning_rate": 4.675804462775801e-06,
"loss": 39.5624,
"step": 5580
},
{
"epoch": 9.089430894308943,
"grad_norm": 886.0300903320312,
"learning_rate": 4.5146955065686e-06,
"loss": 32.467,
"step": 5590
},
{
"epoch": 9.105691056910569,
"grad_norm": 271.0351257324219,
"learning_rate": 4.3563468959868515e-06,
"loss": 29.2705,
"step": 5600
},
{
"epoch": 9.121951219512194,
"grad_norm": 651.6824340820312,
"learning_rate": 4.2007632086658035e-06,
"loss": 40.7806,
"step": 5610
},
{
"epoch": 9.138211382113822,
"grad_norm": 153.58518981933594,
"learning_rate": 4.047948942310631e-06,
"loss": 32.8395,
"step": 5620
},
{
"epoch": 9.154471544715447,
"grad_norm": 771.262939453125,
"learning_rate": 3.897908514566484e-06,
"loss": 59.9376,
"step": 5630
},
{
"epoch": 9.170731707317072,
"grad_norm": 2750.450439453125,
"learning_rate": 3.750646262890767e-06,
"loss": 26.9996,
"step": 5640
},
{
"epoch": 9.1869918699187,
"grad_norm": 361.48516845703125,
"learning_rate": 3.60616644442765e-06,
"loss": 30.9447,
"step": 5650
},
{
"epoch": 9.203252032520325,
"grad_norm": 1025.7686767578125,
"learning_rate": 3.4644732358851685e-06,
"loss": 27.8333,
"step": 5660
},
{
"epoch": 9.21951219512195,
"grad_norm": 301.7310485839844,
"learning_rate": 3.3255707334143516e-06,
"loss": 50.7049,
"step": 5670
},
{
"epoch": 9.235772357723578,
"grad_norm": 282.4934997558594,
"learning_rate": 3.1894629524908293e-06,
"loss": 58.6614,
"step": 5680
},
{
"epoch": 9.252032520325203,
"grad_norm": 2989.5283203125,
"learning_rate": 3.056153827798791e-06,
"loss": 65.7686,
"step": 5690
},
{
"epoch": 9.268292682926829,
"grad_norm": 145.37416076660156,
"learning_rate": 2.9256472131172442e-06,
"loss": 24.332,
"step": 5700
},
{
"epoch": 9.284552845528456,
"grad_norm": 245.1734619140625,
"learning_rate": 2.797946881208513e-06,
"loss": 62.6,
"step": 5710
},
{
"epoch": 9.300813008130081,
"grad_norm": 842.1190795898438,
"learning_rate": 2.673056523709294e-06,
"loss": 33.1712,
"step": 5720
},
{
"epoch": 9.317073170731707,
"grad_norm": 205.359130859375,
"learning_rate": 2.550979751023885e-06,
"loss": 24.7365,
"step": 5730
},
{
"epoch": 9.333333333333334,
"grad_norm": 189.57533264160156,
"learning_rate": 2.431720092219758e-06,
"loss": 28.2499,
"step": 5740
},
{
"epoch": 9.34959349593496,
"grad_norm": 311.52374267578125,
"learning_rate": 2.3152809949256503e-06,
"loss": 21.5204,
"step": 5750
},
{
"epoch": 9.365853658536585,
"grad_norm": 2237.07958984375,
"learning_rate": 2.2016658252318025e-06,
"loss": 26.6137,
"step": 5760
},
{
"epoch": 9.382113821138212,
"grad_norm": 623.1047973632812,
"learning_rate": 2.0908778675927e-06,
"loss": 24.8671,
"step": 5770
},
{
"epoch": 9.398373983739837,
"grad_norm": 292.36285400390625,
"learning_rate": 1.9829203247321293e-06,
"loss": 23.2705,
"step": 5780
},
{
"epoch": 9.414634146341463,
"grad_norm": 139.58456420898438,
"learning_rate": 1.8777963175505398e-06,
"loss": 34.1858,
"step": 5790
},
{
"epoch": 9.43089430894309,
"grad_norm": 5472.58349609375,
"learning_rate": 1.7755088850348822e-06,
"loss": 23.8006,
"step": 5800
},
{
"epoch": 9.447154471544716,
"grad_norm": 1327.946533203125,
"learning_rate": 1.676060984170702e-06,
"loss": 27.9731,
"step": 5810
},
{
"epoch": 9.463414634146341,
"grad_norm": 156.09629821777344,
"learning_rate": 1.5794554898567182e-06,
"loss": 24.1258,
"step": 5820
},
{
"epoch": 9.479674796747968,
"grad_norm": 485.4151306152344,
"learning_rate": 1.4856951948216569e-06,
"loss": 28.9193,
"step": 5830
},
{
"epoch": 9.495934959349594,
"grad_norm": 354.6837158203125,
"learning_rate": 1.39478280954356e-06,
"loss": 33.2445,
"step": 5840
},
{
"epoch": 9.512195121951219,
"grad_norm": 503.53289794921875,
"learning_rate": 1.3067209621713928e-06,
"loss": 25.0091,
"step": 5850
},
{
"epoch": 9.528455284552846,
"grad_norm": 329.1166687011719,
"learning_rate": 1.221512198449093e-06,
"loss": 35.9692,
"step": 5860
},
{
"epoch": 9.544715447154472,
"grad_norm": 374.5758361816406,
"learning_rate": 1.1391589816419968e-06,
"loss": 25.7447,
"step": 5870
},
{
"epoch": 9.560975609756097,
"grad_norm": 257.5137939453125,
"learning_rate": 1.059663692465529e-06,
"loss": 37.0374,
"step": 5880
},
{
"epoch": 9.577235772357724,
"grad_norm": 284.5126037597656,
"learning_rate": 9.830286290165357e-07,
"loss": 23.4132,
"step": 5890
},
{
"epoch": 9.59349593495935,
"grad_norm": 689.851806640625,
"learning_rate": 9.092560067067268e-07,
"loss": 47.7638,
"step": 5900
},
{
"epoch": 9.609756097560975,
"grad_norm": 1487.80859375,
"learning_rate": 8.383479581986597e-07,
"loss": 22.3418,
"step": 5910
},
{
"epoch": 9.6260162601626,
"grad_norm": 1127.08837890625,
"learning_rate": 7.70306533344134e-07,
"loss": 24.0052,
"step": 5920
},
{
"epoch": 9.642276422764228,
"grad_norm": 6250.7666015625,
"learning_rate": 7.051336991248714e-07,
"loss": 31.2493,
"step": 5930
},
{
"epoch": 9.658536585365853,
"grad_norm": 565.5596923828125,
"learning_rate": 6.428313395956953e-07,
"loss": 20.2709,
"step": 5940
},
{
"epoch": 9.67479674796748,
"grad_norm": 142.4834442138672,
"learning_rate": 5.834012558300295e-07,
"loss": 27.2821,
"step": 5950
},
{
"epoch": 9.691056910569106,
"grad_norm": 559.2692260742188,
"learning_rate": 5.26845165867873e-07,
"loss": 56.2713,
"step": 5960
},
{
"epoch": 9.707317073170731,
"grad_norm": 170.9761199951172,
"learning_rate": 4.7316470466611804e-07,
"loss": 25.9403,
"step": 5970
},
{
"epoch": 9.723577235772357,
"grad_norm": 577.9078369140625,
"learning_rate": 4.22361424051243e-07,
"loss": 27.2287,
"step": 5980
},
{
"epoch": 9.739837398373984,
"grad_norm": 203.03167724609375,
"learning_rate": 3.7443679267453735e-07,
"loss": 33.0212,
"step": 5990
},
{
"epoch": 9.75609756097561,
"grad_norm": 1709.7088623046875,
"learning_rate": 3.2939219596956895e-07,
"loss": 30.0687,
"step": 6000
},
{
"epoch": 9.772357723577235,
"grad_norm": 226.99795532226562,
"learning_rate": 2.872289361121605e-07,
"loss": 36.0599,
"step": 6010
},
{
"epoch": 9.788617886178862,
"grad_norm": 805.7896728515625,
"learning_rate": 2.4794823198275307e-07,
"loss": 48.3908,
"step": 6020
},
{
"epoch": 9.804878048780488,
"grad_norm": 21221.19921875,
"learning_rate": 2.115512191311564e-07,
"loss": 55.056,
"step": 6030
},
{
"epoch": 9.821138211382113,
"grad_norm": 1422.177001953125,
"learning_rate": 1.780389497437418e-07,
"loss": 20.1985,
"step": 6040
},
{
"epoch": 9.83739837398374,
"grad_norm": 182.74656677246094,
"learning_rate": 1.4741239261299998e-07,
"loss": 36.4601,
"step": 6050
},
{
"epoch": 9.853658536585366,
"grad_norm": 427.26385498046875,
"learning_rate": 1.1967243310955222e-07,
"loss": 49.9752,
"step": 6060
},
{
"epoch": 9.869918699186991,
"grad_norm": 463.0358581542969,
"learning_rate": 9.481987315653751e-08,
"loss": 38.0783,
"step": 6070
},
{
"epoch": 9.886178861788618,
"grad_norm": 381.15008544921875,
"learning_rate": 7.285543120645332e-08,
"loss": 40.3717,
"step": 6080
},
{
"epoch": 9.902439024390244,
"grad_norm": 414.7477111816406,
"learning_rate": 5.377974222036119e-08,
"loss": 23.7009,
"step": 6090
},
{
"epoch": 9.91869918699187,
"grad_norm": 2649.5400390625,
"learning_rate": 3.7593357649579055e-08,
"loss": 39.1989,
"step": 6100
},
{
"epoch": 9.934959349593496,
"grad_norm": 1547.17236328125,
"learning_rate": 2.429674541966076e-08,
"loss": 45.793,
"step": 6110
},
{
"epoch": 9.951219512195122,
"grad_norm": 394.08685302734375,
"learning_rate": 1.3890289916929089e-08,
"loss": 26.9755,
"step": 6120
},
{
"epoch": 9.967479674796747,
"grad_norm": 6701.4306640625,
"learning_rate": 6.37429197736239e-09,
"loss": 26.3901,
"step": 6130
},
{
"epoch": 9.983739837398375,
"grad_norm": 231.67611694335938,
"learning_rate": 1.7489688778793424e-09,
"loss": 22.5137,
"step": 6140
},
{
"epoch": 10.0,
"grad_norm": 158.60951232910156,
"learning_rate": 1.4454330032886986e-11,
"loss": 39.3726,
"step": 6150
},
{
"epoch": 10.001626016260163,
"step": 6151,
"total_flos": 2.157115506118272e+17,
"train_loss": 212.35847260424458,
"train_runtime": 2807.1103,
"train_samples_per_second": 35.06,
"train_steps_per_second": 2.191
}
],
"logging_steps": 10,
"max_steps": 6151,
"num_input_tokens_seen": 0,
"num_train_epochs": 11,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.157115506118272e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}