gr1 / trainer_state.json
LegrandFrederic's picture
Upload trainer_state.json with huggingface_hub
0c95b57 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.498581560283688,
"eval_steps": 500,
"global_step": 10573,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0070921985815602835,
"grad_norm": 5.856144428253174,
"learning_rate": 3.780718336483932e-06,
"loss": 0.8655,
"step": 10
},
{
"epoch": 0.014184397163120567,
"grad_norm": 5.8862433433532715,
"learning_rate": 7.561436672967864e-06,
"loss": 0.7361,
"step": 20
},
{
"epoch": 0.02127659574468085,
"grad_norm": 3.5518908500671387,
"learning_rate": 1.1342155009451797e-05,
"loss": 0.551,
"step": 30
},
{
"epoch": 0.028368794326241134,
"grad_norm": 3.55409836769104,
"learning_rate": 1.5122873345935728e-05,
"loss": 0.3709,
"step": 40
},
{
"epoch": 0.03546099290780142,
"grad_norm": 2.7126119136810303,
"learning_rate": 1.890359168241966e-05,
"loss": 0.3237,
"step": 50
},
{
"epoch": 0.0425531914893617,
"grad_norm": 2.9090919494628906,
"learning_rate": 2.2684310018903593e-05,
"loss": 0.2899,
"step": 60
},
{
"epoch": 0.04964539007092199,
"grad_norm": 1.7755730152130127,
"learning_rate": 2.646502835538752e-05,
"loss": 0.1869,
"step": 70
},
{
"epoch": 0.05673758865248227,
"grad_norm": 1.6960084438323975,
"learning_rate": 3.0245746691871456e-05,
"loss": 0.1864,
"step": 80
},
{
"epoch": 0.06382978723404255,
"grad_norm": 1.7639371156692505,
"learning_rate": 3.4026465028355385e-05,
"loss": 0.1408,
"step": 90
},
{
"epoch": 0.07092198581560284,
"grad_norm": 2.5512263774871826,
"learning_rate": 3.780718336483932e-05,
"loss": 0.1363,
"step": 100
},
{
"epoch": 0.07801418439716312,
"grad_norm": 1.9627578258514404,
"learning_rate": 4.158790170132325e-05,
"loss": 0.129,
"step": 110
},
{
"epoch": 0.0851063829787234,
"grad_norm": 0.9527886509895325,
"learning_rate": 4.5368620037807186e-05,
"loss": 0.1181,
"step": 120
},
{
"epoch": 0.09219858156028368,
"grad_norm": 3.0496978759765625,
"learning_rate": 4.914933837429112e-05,
"loss": 0.0903,
"step": 130
},
{
"epoch": 0.09929078014184398,
"grad_norm": 1.1979912519454956,
"learning_rate": 5.293005671077504e-05,
"loss": 0.0906,
"step": 140
},
{
"epoch": 0.10638297872340426,
"grad_norm": 1.2007324695587158,
"learning_rate": 5.671077504725898e-05,
"loss": 0.0997,
"step": 150
},
{
"epoch": 0.11347517730496454,
"grad_norm": 2.112391948699951,
"learning_rate": 6.049149338374291e-05,
"loss": 0.0903,
"step": 160
},
{
"epoch": 0.12056737588652482,
"grad_norm": 1.144476056098938,
"learning_rate": 6.427221172022685e-05,
"loss": 0.0853,
"step": 170
},
{
"epoch": 0.1276595744680851,
"grad_norm": 0.9079101085662842,
"learning_rate": 6.805293005671077e-05,
"loss": 0.0813,
"step": 180
},
{
"epoch": 0.1347517730496454,
"grad_norm": 2.0803258419036865,
"learning_rate": 7.183364839319471e-05,
"loss": 0.0929,
"step": 190
},
{
"epoch": 0.14184397163120568,
"grad_norm": 1.2291367053985596,
"learning_rate": 7.561436672967865e-05,
"loss": 0.0875,
"step": 200
},
{
"epoch": 0.14893617021276595,
"grad_norm": 1.3166685104370117,
"learning_rate": 7.939508506616258e-05,
"loss": 0.0903,
"step": 210
},
{
"epoch": 0.15602836879432624,
"grad_norm": 1.6771767139434814,
"learning_rate": 8.31758034026465e-05,
"loss": 0.0837,
"step": 220
},
{
"epoch": 0.16312056737588654,
"grad_norm": 1.185477375984192,
"learning_rate": 8.695652173913044e-05,
"loss": 0.0774,
"step": 230
},
{
"epoch": 0.1702127659574468,
"grad_norm": 0.8530003428459167,
"learning_rate": 9.073724007561437e-05,
"loss": 0.0767,
"step": 240
},
{
"epoch": 0.1773049645390071,
"grad_norm": 0.8007742762565613,
"learning_rate": 9.45179584120983e-05,
"loss": 0.0733,
"step": 250
},
{
"epoch": 0.18439716312056736,
"grad_norm": 0.5508751273155212,
"learning_rate": 9.829867674858224e-05,
"loss": 0.0853,
"step": 260
},
{
"epoch": 0.19148936170212766,
"grad_norm": 0.8425294756889343,
"learning_rate": 0.00010207939508506617,
"loss": 0.0725,
"step": 270
},
{
"epoch": 0.19858156028368795,
"grad_norm": 1.2945622205734253,
"learning_rate": 0.00010586011342155009,
"loss": 0.0879,
"step": 280
},
{
"epoch": 0.20567375886524822,
"grad_norm": 0.6478763818740845,
"learning_rate": 0.00010964083175803403,
"loss": 0.0553,
"step": 290
},
{
"epoch": 0.2127659574468085,
"grad_norm": 0.9865133166313171,
"learning_rate": 0.00011342155009451796,
"loss": 0.0793,
"step": 300
},
{
"epoch": 0.2198581560283688,
"grad_norm": 1.046968936920166,
"learning_rate": 0.00011720226843100191,
"loss": 0.0825,
"step": 310
},
{
"epoch": 0.22695035460992907,
"grad_norm": 0.9418226480484009,
"learning_rate": 0.00012098298676748583,
"loss": 0.0793,
"step": 320
},
{
"epoch": 0.23404255319148937,
"grad_norm": 1.2901511192321777,
"learning_rate": 0.00012476370510396974,
"loss": 0.0753,
"step": 330
},
{
"epoch": 0.24113475177304963,
"grad_norm": 1.3087291717529297,
"learning_rate": 0.0001285444234404537,
"loss": 0.0615,
"step": 340
},
{
"epoch": 0.24822695035460993,
"grad_norm": 0.9991538524627686,
"learning_rate": 0.00013232514177693763,
"loss": 0.0626,
"step": 350
},
{
"epoch": 0.2553191489361702,
"grad_norm": 0.6831763386726379,
"learning_rate": 0.00013610586011342154,
"loss": 0.0626,
"step": 360
},
{
"epoch": 0.2624113475177305,
"grad_norm": 0.7626124024391174,
"learning_rate": 0.0001398865784499055,
"loss": 0.0622,
"step": 370
},
{
"epoch": 0.2695035460992908,
"grad_norm": 0.6531655192375183,
"learning_rate": 0.00014366729678638943,
"loss": 0.0607,
"step": 380
},
{
"epoch": 0.2765957446808511,
"grad_norm": 0.8742074966430664,
"learning_rate": 0.00014744801512287336,
"loss": 0.0768,
"step": 390
},
{
"epoch": 0.28368794326241137,
"grad_norm": 0.8710255026817322,
"learning_rate": 0.0001512287334593573,
"loss": 0.0576,
"step": 400
},
{
"epoch": 0.2907801418439716,
"grad_norm": 0.8089184761047363,
"learning_rate": 0.0001550094517958412,
"loss": 0.0659,
"step": 410
},
{
"epoch": 0.2978723404255319,
"grad_norm": 1.0766539573669434,
"learning_rate": 0.00015879017013232515,
"loss": 0.0748,
"step": 420
},
{
"epoch": 0.3049645390070922,
"grad_norm": 0.9432766437530518,
"learning_rate": 0.0001625708884688091,
"loss": 0.0659,
"step": 430
},
{
"epoch": 0.3120567375886525,
"grad_norm": 0.7996474504470825,
"learning_rate": 0.000166351606805293,
"loss": 0.0659,
"step": 440
},
{
"epoch": 0.3191489361702128,
"grad_norm": 1.3181546926498413,
"learning_rate": 0.00017013232514177695,
"loss": 0.0658,
"step": 450
},
{
"epoch": 0.3262411347517731,
"grad_norm": 0.8984364867210388,
"learning_rate": 0.00017391304347826088,
"loss": 0.0596,
"step": 460
},
{
"epoch": 0.3333333333333333,
"grad_norm": 1.0370538234710693,
"learning_rate": 0.0001776937618147448,
"loss": 0.066,
"step": 470
},
{
"epoch": 0.3404255319148936,
"grad_norm": 1.0649698972702026,
"learning_rate": 0.00018147448015122874,
"loss": 0.0597,
"step": 480
},
{
"epoch": 0.3475177304964539,
"grad_norm": 0.5405861735343933,
"learning_rate": 0.00018525519848771268,
"loss": 0.0603,
"step": 490
},
{
"epoch": 0.3546099290780142,
"grad_norm": 0.8146863579750061,
"learning_rate": 0.0001890359168241966,
"loss": 0.0524,
"step": 500
},
{
"epoch": 0.3617021276595745,
"grad_norm": 0.6537788510322571,
"learning_rate": 0.00019281663516068054,
"loss": 0.0564,
"step": 510
},
{
"epoch": 0.36879432624113473,
"grad_norm": 0.8714485764503479,
"learning_rate": 0.00019659735349716447,
"loss": 0.0525,
"step": 520
},
{
"epoch": 0.375886524822695,
"grad_norm": 0.5386486649513245,
"learning_rate": 0.00019999999510833915,
"loss": 0.0557,
"step": 530
},
{
"epoch": 0.3829787234042553,
"grad_norm": 0.6375821828842163,
"learning_rate": 0.00019999940810961714,
"loss": 0.0614,
"step": 540
},
{
"epoch": 0.3900709219858156,
"grad_norm": 0.6789309978485107,
"learning_rate": 0.00019999784278530695,
"loss": 0.0537,
"step": 550
},
{
"epoch": 0.3971631205673759,
"grad_norm": 0.8208333253860474,
"learning_rate": 0.00019999529915072262,
"loss": 0.0668,
"step": 560
},
{
"epoch": 0.40425531914893614,
"grad_norm": 0.6849876642227173,
"learning_rate": 0.00019999177723074935,
"loss": 0.0612,
"step": 570
},
{
"epoch": 0.41134751773049644,
"grad_norm": 0.690582811832428,
"learning_rate": 0.00019998727705984316,
"loss": 0.0652,
"step": 580
},
{
"epoch": 0.41843971631205673,
"grad_norm": 0.5250919461250305,
"learning_rate": 0.00019998179868203068,
"loss": 0.0596,
"step": 590
},
{
"epoch": 0.425531914893617,
"grad_norm": 0.6307488679885864,
"learning_rate": 0.00019997534215090857,
"loss": 0.057,
"step": 600
},
{
"epoch": 0.4326241134751773,
"grad_norm": 0.630332350730896,
"learning_rate": 0.00019996790752964305,
"loss": 0.066,
"step": 610
},
{
"epoch": 0.4397163120567376,
"grad_norm": 0.40226250886917114,
"learning_rate": 0.00019995949489096945,
"loss": 0.0555,
"step": 620
},
{
"epoch": 0.44680851063829785,
"grad_norm": 0.5462756752967834,
"learning_rate": 0.00019995010431719118,
"loss": 0.0507,
"step": 630
},
{
"epoch": 0.45390070921985815,
"grad_norm": 0.5090711116790771,
"learning_rate": 0.00019993973590017922,
"loss": 0.0458,
"step": 640
},
{
"epoch": 0.46099290780141844,
"grad_norm": 0.47854822874069214,
"learning_rate": 0.00019992838974137103,
"loss": 0.0459,
"step": 650
},
{
"epoch": 0.46808510638297873,
"grad_norm": 0.7866285443305969,
"learning_rate": 0.00019991606595176964,
"loss": 0.0585,
"step": 660
},
{
"epoch": 0.475177304964539,
"grad_norm": 0.8126336932182312,
"learning_rate": 0.0001999027646519425,
"loss": 0.0448,
"step": 670
},
{
"epoch": 0.48226950354609927,
"grad_norm": 0.5472203493118286,
"learning_rate": 0.0001998884859720205,
"loss": 0.0514,
"step": 680
},
{
"epoch": 0.48936170212765956,
"grad_norm": 0.6094168424606323,
"learning_rate": 0.00019987323005169638,
"loss": 0.0459,
"step": 690
},
{
"epoch": 0.49645390070921985,
"grad_norm": 0.5688044428825378,
"learning_rate": 0.00019985699704022357,
"loss": 0.053,
"step": 700
},
{
"epoch": 0.5035460992907801,
"grad_norm": 0.469110906124115,
"learning_rate": 0.00019983978709641481,
"loss": 0.0524,
"step": 710
},
{
"epoch": 0.5106382978723404,
"grad_norm": 0.8332406282424927,
"learning_rate": 0.00019982160038864032,
"loss": 0.0507,
"step": 720
},
{
"epoch": 0.5177304964539007,
"grad_norm": 0.8524491190910339,
"learning_rate": 0.00019980243709482633,
"loss": 0.0573,
"step": 730
},
{
"epoch": 0.524822695035461,
"grad_norm": 0.7200371026992798,
"learning_rate": 0.00019978229740245343,
"loss": 0.0502,
"step": 740
},
{
"epoch": 0.5319148936170213,
"grad_norm": 0.4582567811012268,
"learning_rate": 0.0001997611815085545,
"loss": 0.0503,
"step": 750
},
{
"epoch": 0.5390070921985816,
"grad_norm": 0.5496141910552979,
"learning_rate": 0.000199739089619713,
"loss": 0.0493,
"step": 760
},
{
"epoch": 0.5460992907801419,
"grad_norm": 0.8712863326072693,
"learning_rate": 0.0001997160219520608,
"loss": 0.0469,
"step": 770
},
{
"epoch": 0.5531914893617021,
"grad_norm": 0.7600995302200317,
"learning_rate": 0.0001996919787312761,
"loss": 0.0544,
"step": 780
},
{
"epoch": 0.5602836879432624,
"grad_norm": 0.6321051716804504,
"learning_rate": 0.00019966696019258127,
"loss": 0.0418,
"step": 790
},
{
"epoch": 0.5673758865248227,
"grad_norm": 0.5661709904670715,
"learning_rate": 0.00019964096658074056,
"loss": 0.0437,
"step": 800
},
{
"epoch": 0.574468085106383,
"grad_norm": 0.322308748960495,
"learning_rate": 0.00019961399815005763,
"loss": 0.0379,
"step": 810
},
{
"epoch": 0.5815602836879432,
"grad_norm": 0.5047584176063538,
"learning_rate": 0.00019958605516437307,
"loss": 0.0628,
"step": 820
},
{
"epoch": 0.5886524822695035,
"grad_norm": 0.45054513216018677,
"learning_rate": 0.0001995571378970619,
"loss": 0.0475,
"step": 830
},
{
"epoch": 0.5957446808510638,
"grad_norm": 0.7547653913497925,
"learning_rate": 0.00019952724663103083,
"loss": 0.0413,
"step": 840
},
{
"epoch": 0.6028368794326241,
"grad_norm": 0.3875257670879364,
"learning_rate": 0.00019949638165871547,
"loss": 0.039,
"step": 850
},
{
"epoch": 0.6099290780141844,
"grad_norm": 0.6647422313690186,
"learning_rate": 0.00019946454328207753,
"loss": 0.0559,
"step": 860
},
{
"epoch": 0.6170212765957447,
"grad_norm": 0.3902786076068878,
"learning_rate": 0.00019943173181260186,
"loss": 0.0407,
"step": 870
},
{
"epoch": 0.624113475177305,
"grad_norm": 0.5156275033950806,
"learning_rate": 0.00019939794757129332,
"loss": 0.0443,
"step": 880
},
{
"epoch": 0.6312056737588653,
"grad_norm": 0.5711575746536255,
"learning_rate": 0.0001993631908886738,
"loss": 0.0407,
"step": 890
},
{
"epoch": 0.6382978723404256,
"grad_norm": 0.33694812655448914,
"learning_rate": 0.0001993274621047788,
"loss": 0.0402,
"step": 900
},
{
"epoch": 0.6453900709219859,
"grad_norm": 0.869691014289856,
"learning_rate": 0.00019929076156915425,
"loss": 0.0506,
"step": 910
},
{
"epoch": 0.6524822695035462,
"grad_norm": 0.5810511112213135,
"learning_rate": 0.00019925308964085297,
"loss": 0.0537,
"step": 920
},
{
"epoch": 0.6595744680851063,
"grad_norm": 0.666899561882019,
"learning_rate": 0.00019921444668843125,
"loss": 0.0574,
"step": 930
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.7992278337478638,
"learning_rate": 0.00019917483308994527,
"loss": 0.0385,
"step": 940
},
{
"epoch": 0.6737588652482269,
"grad_norm": 0.4858379662036896,
"learning_rate": 0.00019913424923294722,
"loss": 0.0473,
"step": 950
},
{
"epoch": 0.6808510638297872,
"grad_norm": 0.5866559147834778,
"learning_rate": 0.0001990926955144818,
"loss": 0.0465,
"step": 960
},
{
"epoch": 0.6879432624113475,
"grad_norm": 0.6053078770637512,
"learning_rate": 0.000199050172341082,
"loss": 0.0498,
"step": 970
},
{
"epoch": 0.6950354609929078,
"grad_norm": 0.5185685157775879,
"learning_rate": 0.00019900668012876543,
"loss": 0.0444,
"step": 980
},
{
"epoch": 0.7021276595744681,
"grad_norm": 0.6573126316070557,
"learning_rate": 0.00019896221930303,
"loss": 0.046,
"step": 990
},
{
"epoch": 0.7092198581560284,
"grad_norm": 0.6348971724510193,
"learning_rate": 0.00019891679029884993,
"loss": 0.0439,
"step": 1000
},
{
"epoch": 0.7163120567375887,
"grad_norm": 0.6452783346176147,
"learning_rate": 0.00019887039356067146,
"loss": 0.0607,
"step": 1010
},
{
"epoch": 0.723404255319149,
"grad_norm": 0.6160244941711426,
"learning_rate": 0.00019882302954240836,
"loss": 0.0397,
"step": 1020
},
{
"epoch": 0.7304964539007093,
"grad_norm": 0.5438401699066162,
"learning_rate": 0.00019877469870743778,
"loss": 0.0417,
"step": 1030
},
{
"epoch": 0.7375886524822695,
"grad_norm": 0.5617598295211792,
"learning_rate": 0.00019872540152859536,
"loss": 0.0405,
"step": 1040
},
{
"epoch": 0.7446808510638298,
"grad_norm": 0.4101731479167938,
"learning_rate": 0.00019867513848817093,
"loss": 0.0392,
"step": 1050
},
{
"epoch": 0.75177304964539,
"grad_norm": 0.4860725998878479,
"learning_rate": 0.00019862391007790354,
"loss": 0.049,
"step": 1060
},
{
"epoch": 0.7588652482269503,
"grad_norm": 0.3138566315174103,
"learning_rate": 0.00019857171679897687,
"loss": 0.0372,
"step": 1070
},
{
"epoch": 0.7659574468085106,
"grad_norm": 0.5368508100509644,
"learning_rate": 0.00019851855916201404,
"loss": 0.0466,
"step": 1080
},
{
"epoch": 0.7730496453900709,
"grad_norm": 0.5401434898376465,
"learning_rate": 0.000198464437687073,
"loss": 0.0489,
"step": 1090
},
{
"epoch": 0.7801418439716312,
"grad_norm": 0.3905884623527527,
"learning_rate": 0.00019840935290364105,
"loss": 0.0389,
"step": 1100
},
{
"epoch": 0.7872340425531915,
"grad_norm": 0.3689773976802826,
"learning_rate": 0.00019835330535062994,
"loss": 0.0432,
"step": 1110
},
{
"epoch": 0.7943262411347518,
"grad_norm": 0.5504758954048157,
"learning_rate": 0.0001982962955763705,
"loss": 0.04,
"step": 1120
},
{
"epoch": 0.8014184397163121,
"grad_norm": 0.3219192326068878,
"learning_rate": 0.00019823832413860714,
"loss": 0.0373,
"step": 1130
},
{
"epoch": 0.8085106382978723,
"grad_norm": 0.5545524954795837,
"learning_rate": 0.00019817939160449272,
"loss": 0.0367,
"step": 1140
},
{
"epoch": 0.8156028368794326,
"grad_norm": 0.5093657374382019,
"learning_rate": 0.0001981194985505827,
"loss": 0.0426,
"step": 1150
},
{
"epoch": 0.8226950354609929,
"grad_norm": 0.4452705979347229,
"learning_rate": 0.00019805864556282957,
"loss": 0.0357,
"step": 1160
},
{
"epoch": 0.8297872340425532,
"grad_norm": 0.5040962100028992,
"learning_rate": 0.00019799683323657726,
"loss": 0.0552,
"step": 1170
},
{
"epoch": 0.8368794326241135,
"grad_norm": 0.5125325322151184,
"learning_rate": 0.00019793406217655517,
"loss": 0.0455,
"step": 1180
},
{
"epoch": 0.8439716312056738,
"grad_norm": 0.5588168501853943,
"learning_rate": 0.0001978703329968722,
"loss": 0.0432,
"step": 1190
},
{
"epoch": 0.851063829787234,
"grad_norm": 0.5096802711486816,
"learning_rate": 0.00019780564632101096,
"loss": 0.0488,
"step": 1200
},
{
"epoch": 0.8581560283687943,
"grad_norm": 0.2617908716201782,
"learning_rate": 0.00019774000278182147,
"loss": 0.0454,
"step": 1210
},
{
"epoch": 0.8652482269503546,
"grad_norm": 0.5790386199951172,
"learning_rate": 0.00019767340302151513,
"loss": 0.039,
"step": 1220
},
{
"epoch": 0.8723404255319149,
"grad_norm": 0.9014220237731934,
"learning_rate": 0.00019760584769165824,
"loss": 0.0325,
"step": 1230
},
{
"epoch": 0.8794326241134752,
"grad_norm": 0.41618219017982483,
"learning_rate": 0.0001975373374531658,
"loss": 0.0442,
"step": 1240
},
{
"epoch": 0.8865248226950354,
"grad_norm": 0.4697210192680359,
"learning_rate": 0.00019746787297629496,
"loss": 0.0436,
"step": 1250
},
{
"epoch": 0.8936170212765957,
"grad_norm": 0.4056944251060486,
"learning_rate": 0.00019739745494063855,
"loss": 0.0375,
"step": 1260
},
{
"epoch": 0.900709219858156,
"grad_norm": 0.6047598123550415,
"learning_rate": 0.00019732608403511822,
"loss": 0.0323,
"step": 1270
},
{
"epoch": 0.9078014184397163,
"grad_norm": 0.411379873752594,
"learning_rate": 0.00019725376095797804,
"loss": 0.0483,
"step": 1280
},
{
"epoch": 0.9148936170212766,
"grad_norm": 0.2770962119102478,
"learning_rate": 0.00019718048641677728,
"loss": 0.0379,
"step": 1290
},
{
"epoch": 0.9219858156028369,
"grad_norm": 0.6620250940322876,
"learning_rate": 0.00019710626112838382,
"loss": 0.042,
"step": 1300
},
{
"epoch": 0.9290780141843972,
"grad_norm": 0.4887118339538574,
"learning_rate": 0.0001970310858189669,
"loss": 0.0419,
"step": 1310
},
{
"epoch": 0.9361702127659575,
"grad_norm": 0.4877367615699768,
"learning_rate": 0.0001969549612239902,
"loss": 0.0386,
"step": 1320
},
{
"epoch": 0.9432624113475178,
"grad_norm": 0.6139522790908813,
"learning_rate": 0.00019687788808820452,
"loss": 0.0411,
"step": 1330
},
{
"epoch": 0.950354609929078,
"grad_norm": 0.628237247467041,
"learning_rate": 0.0001967998671656405,
"loss": 0.0532,
"step": 1340
},
{
"epoch": 0.9574468085106383,
"grad_norm": 0.5345218777656555,
"learning_rate": 0.00019672089921960137,
"loss": 0.0414,
"step": 1350
},
{
"epoch": 0.9645390070921985,
"grad_norm": 0.35509511828422546,
"learning_rate": 0.00019664098502265525,
"loss": 0.0464,
"step": 1360
},
{
"epoch": 0.9716312056737588,
"grad_norm": 0.4468687176704407,
"learning_rate": 0.00019656012535662786,
"loss": 0.0395,
"step": 1370
},
{
"epoch": 0.9787234042553191,
"grad_norm": 0.5955891013145447,
"learning_rate": 0.0001964783210125946,
"loss": 0.045,
"step": 1380
},
{
"epoch": 0.9858156028368794,
"grad_norm": 0.48954424262046814,
"learning_rate": 0.0001963955727908732,
"loss": 0.0471,
"step": 1390
},
{
"epoch": 0.9929078014184397,
"grad_norm": 0.450183242559433,
"learning_rate": 0.00019631188150101534,
"loss": 0.0366,
"step": 1400
},
{
"epoch": 1.0,
"grad_norm": 0.41711631417274475,
"learning_rate": 0.0001962272479617992,
"loss": 0.0528,
"step": 1410
},
{
"epoch": 1.0070921985815602,
"grad_norm": 0.3645707666873932,
"learning_rate": 0.00019614167300122126,
"loss": 0.043,
"step": 1420
},
{
"epoch": 1.0141843971631206,
"grad_norm": 0.42011067271232605,
"learning_rate": 0.00019605515745648822,
"loss": 0.0402,
"step": 1430
},
{
"epoch": 1.0212765957446808,
"grad_norm": 0.4155680537223816,
"learning_rate": 0.0001959677021740088,
"loss": 0.0392,
"step": 1440
},
{
"epoch": 1.0283687943262412,
"grad_norm": 0.49862140417099,
"learning_rate": 0.00019587930800938545,
"loss": 0.0484,
"step": 1450
},
{
"epoch": 1.0354609929078014,
"grad_norm": 0.42772340774536133,
"learning_rate": 0.00019578997582740603,
"loss": 0.0349,
"step": 1460
},
{
"epoch": 1.0425531914893618,
"grad_norm": 0.40276241302490234,
"learning_rate": 0.00019569970650203534,
"loss": 0.0335,
"step": 1470
},
{
"epoch": 1.049645390070922,
"grad_norm": 0.369478315114975,
"learning_rate": 0.00019560850091640647,
"loss": 0.0402,
"step": 1480
},
{
"epoch": 1.0567375886524824,
"grad_norm": 0.7682146430015564,
"learning_rate": 0.00019551635996281231,
"loss": 0.0392,
"step": 1490
},
{
"epoch": 1.0638297872340425,
"grad_norm": 0.619755744934082,
"learning_rate": 0.0001954232845426967,
"loss": 0.0426,
"step": 1500
},
{
"epoch": 1.070921985815603,
"grad_norm": 0.3280376195907593,
"learning_rate": 0.00019532927556664573,
"loss": 0.0311,
"step": 1510
},
{
"epoch": 1.0780141843971631,
"grad_norm": 0.42268431186676025,
"learning_rate": 0.00019523433395437866,
"loss": 0.0354,
"step": 1520
},
{
"epoch": 1.0851063829787233,
"grad_norm": 0.34740039706230164,
"learning_rate": 0.00019513846063473907,
"loss": 0.0374,
"step": 1530
},
{
"epoch": 1.0921985815602837,
"grad_norm": 0.36237838864326477,
"learning_rate": 0.00019504165654568576,
"loss": 0.028,
"step": 1540
},
{
"epoch": 1.099290780141844,
"grad_norm": 0.4167526364326477,
"learning_rate": 0.00019494392263428353,
"loss": 0.0447,
"step": 1550
},
{
"epoch": 1.1063829787234043,
"grad_norm": 0.5500767827033997,
"learning_rate": 0.00019484525985669383,
"loss": 0.0305,
"step": 1560
},
{
"epoch": 1.1134751773049645,
"grad_norm": 0.4666343927383423,
"learning_rate": 0.00019474566917816565,
"loss": 0.0323,
"step": 1570
},
{
"epoch": 1.1205673758865249,
"grad_norm": 0.43880197405815125,
"learning_rate": 0.0001946451515730258,
"loss": 0.0305,
"step": 1580
},
{
"epoch": 1.127659574468085,
"grad_norm": 0.4338850677013397,
"learning_rate": 0.00019454370802466953,
"loss": 0.035,
"step": 1590
},
{
"epoch": 1.1347517730496455,
"grad_norm": 0.4450472295284271,
"learning_rate": 0.00019444133952555096,
"loss": 0.0371,
"step": 1600
},
{
"epoch": 1.1418439716312057,
"grad_norm": 0.38660725951194763,
"learning_rate": 0.00019433804707717328,
"loss": 0.0388,
"step": 1610
},
{
"epoch": 1.148936170212766,
"grad_norm": 0.5273442268371582,
"learning_rate": 0.0001942338316900788,
"loss": 0.0392,
"step": 1620
},
{
"epoch": 1.1560283687943262,
"grad_norm": 0.5786083340644836,
"learning_rate": 0.00019412869438383945,
"loss": 0.0409,
"step": 1630
},
{
"epoch": 1.1631205673758864,
"grad_norm": 0.3235854506492615,
"learning_rate": 0.00019402263618704642,
"loss": 0.0325,
"step": 1640
},
{
"epoch": 1.1702127659574468,
"grad_norm": 0.4242108166217804,
"learning_rate": 0.0001939156581373004,
"loss": 0.0295,
"step": 1650
},
{
"epoch": 1.177304964539007,
"grad_norm": 0.518750011920929,
"learning_rate": 0.00019380776128120116,
"loss": 0.0412,
"step": 1660
},
{
"epoch": 1.1843971631205674,
"grad_norm": 0.3119601607322693,
"learning_rate": 0.00019369894667433754,
"loss": 0.027,
"step": 1670
},
{
"epoch": 1.1914893617021276,
"grad_norm": 0.3405968248844147,
"learning_rate": 0.00019358921538127697,
"loss": 0.036,
"step": 1680
},
{
"epoch": 1.198581560283688,
"grad_norm": 0.62176913022995,
"learning_rate": 0.00019347856847555512,
"loss": 0.0527,
"step": 1690
},
{
"epoch": 1.2056737588652482,
"grad_norm": 0.432858943939209,
"learning_rate": 0.00019336700703966538,
"loss": 0.0381,
"step": 1700
},
{
"epoch": 1.2127659574468086,
"grad_norm": 0.23901493847370148,
"learning_rate": 0.0001932545321650483,
"loss": 0.0295,
"step": 1710
},
{
"epoch": 1.2198581560283688,
"grad_norm": 0.3481459319591522,
"learning_rate": 0.00019314114495208086,
"loss": 0.0418,
"step": 1720
},
{
"epoch": 1.226950354609929,
"grad_norm": 0.3281635344028473,
"learning_rate": 0.00019302684651006574,
"loss": 0.0282,
"step": 1730
},
{
"epoch": 1.2340425531914894,
"grad_norm": 0.3726864755153656,
"learning_rate": 0.00019291163795722048,
"loss": 0.0384,
"step": 1740
},
{
"epoch": 1.2411347517730495,
"grad_norm": 0.35322147607803345,
"learning_rate": 0.00019279552042066652,
"loss": 0.0267,
"step": 1750
},
{
"epoch": 1.24822695035461,
"grad_norm": 0.3819480538368225,
"learning_rate": 0.0001926784950364181,
"loss": 0.0317,
"step": 1760
},
{
"epoch": 1.2553191489361701,
"grad_norm": 0.5167778134346008,
"learning_rate": 0.00019256056294937132,
"loss": 0.0365,
"step": 1770
},
{
"epoch": 1.2624113475177305,
"grad_norm": 0.2764042615890503,
"learning_rate": 0.00019244172531329278,
"loss": 0.03,
"step": 1780
},
{
"epoch": 1.2695035460992907,
"grad_norm": 0.568465530872345,
"learning_rate": 0.00019232198329080836,
"loss": 0.042,
"step": 1790
},
{
"epoch": 1.2765957446808511,
"grad_norm": 0.47287610173225403,
"learning_rate": 0.00019220133805339184,
"loss": 0.0431,
"step": 1800
},
{
"epoch": 1.2836879432624113,
"grad_norm": 0.5068013072013855,
"learning_rate": 0.00019207979078135346,
"loss": 0.0326,
"step": 1810
},
{
"epoch": 1.2907801418439715,
"grad_norm": 0.4016683101654053,
"learning_rate": 0.00019195734266382828,
"loss": 0.0327,
"step": 1820
},
{
"epoch": 1.297872340425532,
"grad_norm": 0.21608802676200867,
"learning_rate": 0.00019183399489876467,
"loss": 0.0332,
"step": 1830
},
{
"epoch": 1.3049645390070923,
"grad_norm": 0.5429771542549133,
"learning_rate": 0.00019170974869291255,
"loss": 0.0349,
"step": 1840
},
{
"epoch": 1.3120567375886525,
"grad_norm": 0.3801655173301697,
"learning_rate": 0.00019158460526181152,
"loss": 0.0377,
"step": 1850
},
{
"epoch": 1.3191489361702127,
"grad_norm": 0.3311924934387207,
"learning_rate": 0.00019145856582977904,
"loss": 0.0346,
"step": 1860
},
{
"epoch": 1.326241134751773,
"grad_norm": 0.5218601226806641,
"learning_rate": 0.0001913316316298984,
"loss": 0.0302,
"step": 1870
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.37110060453414917,
"learning_rate": 0.0001912038039040067,
"loss": 0.0387,
"step": 1880
},
{
"epoch": 1.3404255319148937,
"grad_norm": 0.46655604243278503,
"learning_rate": 0.00019107508390268276,
"loss": 0.0337,
"step": 1890
},
{
"epoch": 1.3475177304964538,
"grad_norm": 0.33353927731513977,
"learning_rate": 0.00019094547288523467,
"loss": 0.0466,
"step": 1900
},
{
"epoch": 1.3546099290780143,
"grad_norm": 0.44526827335357666,
"learning_rate": 0.00019081497211968773,
"loss": 0.0352,
"step": 1910
},
{
"epoch": 1.3617021276595744,
"grad_norm": 0.4011070430278778,
"learning_rate": 0.00019068358288277187,
"loss": 0.0294,
"step": 1920
},
{
"epoch": 1.3687943262411348,
"grad_norm": 0.3325769007205963,
"learning_rate": 0.0001905513064599092,
"loss": 0.0361,
"step": 1930
},
{
"epoch": 1.375886524822695,
"grad_norm": 0.2585453987121582,
"learning_rate": 0.0001904181441452015,
"loss": 0.0302,
"step": 1940
},
{
"epoch": 1.3829787234042552,
"grad_norm": 0.441824734210968,
"learning_rate": 0.00019028409724141746,
"loss": 0.0418,
"step": 1950
},
{
"epoch": 1.3900709219858156,
"grad_norm": 0.41769495606422424,
"learning_rate": 0.00019014916705998002,
"loss": 0.0301,
"step": 1960
},
{
"epoch": 1.397163120567376,
"grad_norm": 0.24308504164218903,
"learning_rate": 0.00019001335492095347,
"loss": 0.0353,
"step": 1970
},
{
"epoch": 1.4042553191489362,
"grad_norm": 0.3190462589263916,
"learning_rate": 0.00018987666215303058,
"loss": 0.0385,
"step": 1980
},
{
"epoch": 1.4113475177304964,
"grad_norm": 0.452863484621048,
"learning_rate": 0.0001897390900935196,
"loss": 0.0367,
"step": 1990
},
{
"epoch": 1.4184397163120568,
"grad_norm": 0.3386947214603424,
"learning_rate": 0.00018960064008833116,
"loss": 0.0372,
"step": 2000
},
{
"epoch": 1.425531914893617,
"grad_norm": 0.40042638778686523,
"learning_rate": 0.0001894613134919651,
"loss": 0.0354,
"step": 2010
},
{
"epoch": 1.4326241134751774,
"grad_norm": 0.3207855820655823,
"learning_rate": 0.00018932111166749724,
"loss": 0.0304,
"step": 2020
},
{
"epoch": 1.4397163120567376,
"grad_norm": 0.277885377407074,
"learning_rate": 0.000189180035986566,
"loss": 0.0303,
"step": 2030
},
{
"epoch": 1.4468085106382977,
"grad_norm": 0.4140087366104126,
"learning_rate": 0.00018903808782935904,
"loss": 0.0344,
"step": 2040
},
{
"epoch": 1.4539007092198581,
"grad_norm": 0.3237980306148529,
"learning_rate": 0.00018889526858459975,
"loss": 0.0317,
"step": 2050
},
{
"epoch": 1.4609929078014185,
"grad_norm": 0.38879090547561646,
"learning_rate": 0.00018875157964953358,
"loss": 0.0344,
"step": 2060
},
{
"epoch": 1.4680851063829787,
"grad_norm": 0.39974260330200195,
"learning_rate": 0.0001886070224299145,
"loss": 0.0339,
"step": 2070
},
{
"epoch": 1.475177304964539,
"grad_norm": 0.3247739374637604,
"learning_rate": 0.00018846159833999114,
"loss": 0.0317,
"step": 2080
},
{
"epoch": 1.4822695035460993,
"grad_norm": 0.5183299779891968,
"learning_rate": 0.000188315308802493,
"loss": 0.0262,
"step": 2090
},
{
"epoch": 1.4893617021276595,
"grad_norm": 0.44669198989868164,
"learning_rate": 0.00018816815524861654,
"loss": 0.0323,
"step": 2100
},
{
"epoch": 1.49645390070922,
"grad_norm": 0.5130909085273743,
"learning_rate": 0.00018802013911801112,
"loss": 0.0361,
"step": 2110
},
{
"epoch": 1.50354609929078,
"grad_norm": 0.47592759132385254,
"learning_rate": 0.00018787126185876502,
"loss": 0.0423,
"step": 2120
},
{
"epoch": 1.5106382978723403,
"grad_norm": 0.27714022994041443,
"learning_rate": 0.0001877215249273912,
"loss": 0.0398,
"step": 2130
},
{
"epoch": 1.5177304964539007,
"grad_norm": 0.37813687324523926,
"learning_rate": 0.00018757092978881302,
"loss": 0.0285,
"step": 2140
},
{
"epoch": 1.524822695035461,
"grad_norm": 0.2609386742115021,
"learning_rate": 0.00018741947791634994,
"loss": 0.0303,
"step": 2150
},
{
"epoch": 1.5319148936170213,
"grad_norm": 0.36917203664779663,
"learning_rate": 0.00018726717079170323,
"loss": 0.0473,
"step": 2160
},
{
"epoch": 1.5390070921985815,
"grad_norm": 0.18295632302761078,
"learning_rate": 0.00018711400990494123,
"loss": 0.0246,
"step": 2170
},
{
"epoch": 1.5460992907801419,
"grad_norm": 0.2882087826728821,
"learning_rate": 0.00018695999675448496,
"loss": 0.0224,
"step": 2180
},
{
"epoch": 1.5531914893617023,
"grad_norm": 0.35880571603775024,
"learning_rate": 0.00018680513284709344,
"loss": 0.0299,
"step": 2190
},
{
"epoch": 1.5602836879432624,
"grad_norm": 0.4506542384624481,
"learning_rate": 0.00018664941969784882,
"loss": 0.0312,
"step": 2200
},
{
"epoch": 1.5673758865248226,
"grad_norm": 0.3101454973220825,
"learning_rate": 0.00018649285883014173,
"loss": 0.036,
"step": 2210
},
{
"epoch": 1.574468085106383,
"grad_norm": 0.3249278664588928,
"learning_rate": 0.00018633545177565623,
"loss": 0.0357,
"step": 2220
},
{
"epoch": 1.5815602836879432,
"grad_norm": 0.23713338375091553,
"learning_rate": 0.00018617720007435497,
"loss": 0.0346,
"step": 2230
},
{
"epoch": 1.5886524822695036,
"grad_norm": 0.2550758421421051,
"learning_rate": 0.00018601810527446398,
"loss": 0.0265,
"step": 2240
},
{
"epoch": 1.5957446808510638,
"grad_norm": 0.41524937748908997,
"learning_rate": 0.00018585816893245763,
"loss": 0.0299,
"step": 2250
},
{
"epoch": 1.602836879432624,
"grad_norm": 0.3899802565574646,
"learning_rate": 0.00018569739261304328,
"loss": 0.0361,
"step": 2260
},
{
"epoch": 1.6099290780141844,
"grad_norm": 0.4563800096511841,
"learning_rate": 0.00018553577788914618,
"loss": 0.0358,
"step": 2270
},
{
"epoch": 1.6170212765957448,
"grad_norm": 0.8882343769073486,
"learning_rate": 0.00018537332634189384,
"loss": 0.0419,
"step": 2280
},
{
"epoch": 1.624113475177305,
"grad_norm": 0.5693446397781372,
"learning_rate": 0.00018521003956060078,
"loss": 0.0401,
"step": 2290
},
{
"epoch": 1.6312056737588652,
"grad_norm": 0.3942001461982727,
"learning_rate": 0.00018504591914275274,
"loss": 0.035,
"step": 2300
},
{
"epoch": 1.6382978723404256,
"grad_norm": 0.40895143151283264,
"learning_rate": 0.00018488096669399133,
"loss": 0.0292,
"step": 2310
},
{
"epoch": 1.645390070921986,
"grad_norm": 0.43449512124061584,
"learning_rate": 0.0001847151838280981,
"loss": 0.0367,
"step": 2320
},
{
"epoch": 1.6524822695035462,
"grad_norm": 0.4020148515701294,
"learning_rate": 0.00018454857216697882,
"loss": 0.0288,
"step": 2330
},
{
"epoch": 1.6595744680851063,
"grad_norm": 0.4451272785663605,
"learning_rate": 0.0001843811333406477,
"loss": 0.0375,
"step": 2340
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.26918983459472656,
"learning_rate": 0.00018421286898721127,
"loss": 0.0362,
"step": 2350
},
{
"epoch": 1.673758865248227,
"grad_norm": 0.3901691436767578,
"learning_rate": 0.0001840437807528525,
"loss": 0.0285,
"step": 2360
},
{
"epoch": 1.6808510638297873,
"grad_norm": 0.3481754660606384,
"learning_rate": 0.00018387387029181472,
"loss": 0.0292,
"step": 2370
},
{
"epoch": 1.6879432624113475,
"grad_norm": 0.40919333696365356,
"learning_rate": 0.00018370313926638522,
"loss": 0.0353,
"step": 2380
},
{
"epoch": 1.6950354609929077,
"grad_norm": 0.24539948999881744,
"learning_rate": 0.0001835315893468792,
"loss": 0.0407,
"step": 2390
},
{
"epoch": 1.702127659574468,
"grad_norm": 0.25068360567092896,
"learning_rate": 0.00018335922221162336,
"loss": 0.0313,
"step": 2400
},
{
"epoch": 1.7092198581560285,
"grad_norm": 0.45453083515167236,
"learning_rate": 0.00018318603954693948,
"loss": 0.0328,
"step": 2410
},
{
"epoch": 1.7163120567375887,
"grad_norm": 0.5006433129310608,
"learning_rate": 0.0001830120430471279,
"loss": 0.03,
"step": 2420
},
{
"epoch": 1.7234042553191489,
"grad_norm": 0.3591817021369934,
"learning_rate": 0.00018283723441445097,
"loss": 0.0325,
"step": 2430
},
{
"epoch": 1.7304964539007093,
"grad_norm": 0.6227660775184631,
"learning_rate": 0.00018266161535911642,
"loss": 0.032,
"step": 2440
},
{
"epoch": 1.7375886524822695,
"grad_norm": 0.24666891992092133,
"learning_rate": 0.00018248518759926053,
"loss": 0.0388,
"step": 2450
},
{
"epoch": 1.7446808510638299,
"grad_norm": 0.34065425395965576,
"learning_rate": 0.0001823079528609315,
"loss": 0.0331,
"step": 2460
},
{
"epoch": 1.75177304964539,
"grad_norm": 0.340526819229126,
"learning_rate": 0.00018212991287807232,
"loss": 0.0297,
"step": 2470
},
{
"epoch": 1.7588652482269502,
"grad_norm": 0.3198091983795166,
"learning_rate": 0.00018195106939250408,
"loss": 0.0337,
"step": 2480
},
{
"epoch": 1.7659574468085106,
"grad_norm": 0.3750438988208771,
"learning_rate": 0.00018177142415390867,
"loss": 0.0341,
"step": 2490
},
{
"epoch": 1.773049645390071,
"grad_norm": 0.3867601454257965,
"learning_rate": 0.00018159097891981186,
"loss": 0.0318,
"step": 2500
},
{
"epoch": 1.7801418439716312,
"grad_norm": 0.37601733207702637,
"learning_rate": 0.00018140973545556594,
"loss": 0.0349,
"step": 2510
},
{
"epoch": 1.7872340425531914,
"grad_norm": 0.3913877606391907,
"learning_rate": 0.00018122769553433266,
"loss": 0.0257,
"step": 2520
},
{
"epoch": 1.7943262411347518,
"grad_norm": 0.30526429414749146,
"learning_rate": 0.00018104486093706567,
"loss": 0.0275,
"step": 2530
},
{
"epoch": 1.8014184397163122,
"grad_norm": 0.6147098541259766,
"learning_rate": 0.0001808612334524932,
"loss": 0.0408,
"step": 2540
},
{
"epoch": 1.8085106382978724,
"grad_norm": 0.3850766718387604,
"learning_rate": 0.00018067681487710053,
"loss": 0.0291,
"step": 2550
},
{
"epoch": 1.8156028368794326,
"grad_norm": 0.4822274148464203,
"learning_rate": 0.00018049160701511248,
"loss": 0.0441,
"step": 2560
},
{
"epoch": 1.8226950354609928,
"grad_norm": 0.3193504810333252,
"learning_rate": 0.00018030561167847568,
"loss": 0.0349,
"step": 2570
},
{
"epoch": 1.8297872340425532,
"grad_norm": 0.2928631901741028,
"learning_rate": 0.00018011883068684085,
"loss": 0.0401,
"step": 2580
},
{
"epoch": 1.8368794326241136,
"grad_norm": 0.3406616151332855,
"learning_rate": 0.00017993126586754508,
"loss": 0.031,
"step": 2590
},
{
"epoch": 1.8439716312056738,
"grad_norm": 0.33067846298217773,
"learning_rate": 0.00017974291905559382,
"loss": 0.043,
"step": 2600
},
{
"epoch": 1.851063829787234,
"grad_norm": 0.39099758863449097,
"learning_rate": 0.00017955379209364303,
"loss": 0.0315,
"step": 2610
},
{
"epoch": 1.8581560283687943,
"grad_norm": 0.34113165736198425,
"learning_rate": 0.00017936388683198112,
"loss": 0.0328,
"step": 2620
},
{
"epoch": 1.8652482269503547,
"grad_norm": 0.5652561783790588,
"learning_rate": 0.0001791732051285109,
"loss": 0.0356,
"step": 2630
},
{
"epoch": 1.872340425531915,
"grad_norm": 0.38378405570983887,
"learning_rate": 0.0001789817488487313,
"loss": 0.0324,
"step": 2640
},
{
"epoch": 1.8794326241134751,
"grad_norm": 0.34848251938819885,
"learning_rate": 0.00017878951986571913,
"loss": 0.0361,
"step": 2650
},
{
"epoch": 1.8865248226950353,
"grad_norm": 0.34506094455718994,
"learning_rate": 0.00017859652006011088,
"loss": 0.0254,
"step": 2660
},
{
"epoch": 1.8936170212765957,
"grad_norm": 0.26408687233924866,
"learning_rate": 0.00017840275132008422,
"loss": 0.0316,
"step": 2670
},
{
"epoch": 1.900709219858156,
"grad_norm": 0.4670010805130005,
"learning_rate": 0.0001782082155413395,
"loss": 0.0274,
"step": 2680
},
{
"epoch": 1.9078014184397163,
"grad_norm": 0.2338956594467163,
"learning_rate": 0.00017801291462708134,
"loss": 0.0227,
"step": 2690
},
{
"epoch": 1.9148936170212765,
"grad_norm": 0.3896683156490326,
"learning_rate": 0.00017781685048799984,
"loss": 0.0311,
"step": 2700
},
{
"epoch": 1.9219858156028369,
"grad_norm": 0.3847581446170807,
"learning_rate": 0.000177620025042252,
"loss": 0.0265,
"step": 2710
},
{
"epoch": 1.9290780141843973,
"grad_norm": 0.47572168707847595,
"learning_rate": 0.00017742244021544293,
"loss": 0.0318,
"step": 2720
},
{
"epoch": 1.9361702127659575,
"grad_norm": 0.256161630153656,
"learning_rate": 0.00017722409794060693,
"loss": 0.0228,
"step": 2730
},
{
"epoch": 1.9432624113475176,
"grad_norm": 0.4607986807823181,
"learning_rate": 0.00017702500015818876,
"loss": 0.0289,
"step": 2740
},
{
"epoch": 1.950354609929078,
"grad_norm": 0.26722726225852966,
"learning_rate": 0.0001768251488160245,
"loss": 0.0256,
"step": 2750
},
{
"epoch": 1.9574468085106385,
"grad_norm": 0.3193138539791107,
"learning_rate": 0.00017662454586932254,
"loss": 0.0277,
"step": 2760
},
{
"epoch": 1.9645390070921986,
"grad_norm": 0.3791826665401459,
"learning_rate": 0.00017642319328064446,
"loss": 0.029,
"step": 2770
},
{
"epoch": 1.9716312056737588,
"grad_norm": 0.24767844378948212,
"learning_rate": 0.0001762210930198858,
"loss": 0.0243,
"step": 2780
},
{
"epoch": 1.978723404255319,
"grad_norm": 0.2745870053768158,
"learning_rate": 0.00017601824706425684,
"loss": 0.0343,
"step": 2790
},
{
"epoch": 1.9858156028368794,
"grad_norm": 0.3385297656059265,
"learning_rate": 0.0001758146573982632,
"loss": 0.0382,
"step": 2800
},
{
"epoch": 1.9929078014184398,
"grad_norm": 0.5617783665657043,
"learning_rate": 0.0001756103260136865,
"loss": 0.0285,
"step": 2810
},
{
"epoch": 2.0,
"grad_norm": 0.40970340371131897,
"learning_rate": 0.0001754052549095648,
"loss": 0.0259,
"step": 2820
},
{
"epoch": 2.00709219858156,
"grad_norm": 0.27524593472480774,
"learning_rate": 0.00017519944609217295,
"loss": 0.0315,
"step": 2830
},
{
"epoch": 2.0141843971631204,
"grad_norm": 0.42446693778038025,
"learning_rate": 0.00017499290157500333,
"loss": 0.0284,
"step": 2840
},
{
"epoch": 2.021276595744681,
"grad_norm": 0.42036786675453186,
"learning_rate": 0.00017478562337874568,
"loss": 0.0337,
"step": 2850
},
{
"epoch": 2.028368794326241,
"grad_norm": 0.5692403316497803,
"learning_rate": 0.00017457761353126765,
"loss": 0.0298,
"step": 2860
},
{
"epoch": 2.0354609929078014,
"grad_norm": 0.470859557390213,
"learning_rate": 0.00017436887406759488,
"loss": 0.0343,
"step": 2870
},
{
"epoch": 2.0425531914893615,
"grad_norm": 0.36999377608299255,
"learning_rate": 0.00017415940702989103,
"loss": 0.0305,
"step": 2880
},
{
"epoch": 2.049645390070922,
"grad_norm": 0.4802422523498535,
"learning_rate": 0.00017394921446743783,
"loss": 0.0326,
"step": 2890
},
{
"epoch": 2.0567375886524824,
"grad_norm": 0.26723712682724,
"learning_rate": 0.0001737382984366151,
"loss": 0.0294,
"step": 2900
},
{
"epoch": 2.0638297872340425,
"grad_norm": 0.5008073449134827,
"learning_rate": 0.00017352666100088051,
"loss": 0.0364,
"step": 2910
},
{
"epoch": 2.0709219858156027,
"grad_norm": 0.35637590289115906,
"learning_rate": 0.0001733143042307496,
"loss": 0.0272,
"step": 2920
},
{
"epoch": 2.078014184397163,
"grad_norm": 0.35349592566490173,
"learning_rate": 0.00017310123020377517,
"loss": 0.0284,
"step": 2930
},
{
"epoch": 2.0851063829787235,
"grad_norm": 0.401716023683548,
"learning_rate": 0.00017288744100452737,
"loss": 0.0334,
"step": 2940
},
{
"epoch": 2.0921985815602837,
"grad_norm": 0.4555324912071228,
"learning_rate": 0.000172672938724573,
"loss": 0.0377,
"step": 2950
},
{
"epoch": 2.099290780141844,
"grad_norm": 0.349065363407135,
"learning_rate": 0.00017245772546245518,
"loss": 0.0326,
"step": 2960
},
{
"epoch": 2.106382978723404,
"grad_norm": 0.42119070887565613,
"learning_rate": 0.00017224180332367275,
"loss": 0.0325,
"step": 2970
},
{
"epoch": 2.1134751773049647,
"grad_norm": 0.4668595492839813,
"learning_rate": 0.00017202517442065974,
"loss": 0.0275,
"step": 2980
},
{
"epoch": 2.120567375886525,
"grad_norm": 0.29753541946411133,
"learning_rate": 0.00017180784087276476,
"loss": 0.0347,
"step": 2990
},
{
"epoch": 2.127659574468085,
"grad_norm": 0.34885773062705994,
"learning_rate": 0.00017158980480623003,
"loss": 0.0427,
"step": 3000
},
{
"epoch": 2.1347517730496453,
"grad_norm": 0.3699316680431366,
"learning_rate": 0.00017137106835417084,
"loss": 0.0302,
"step": 3010
},
{
"epoch": 2.141843971631206,
"grad_norm": 0.40523409843444824,
"learning_rate": 0.00017115163365655456,
"loss": 0.0322,
"step": 3020
},
{
"epoch": 2.148936170212766,
"grad_norm": 0.24978692829608917,
"learning_rate": 0.00017093150286017964,
"loss": 0.0321,
"step": 3030
},
{
"epoch": 2.1560283687943262,
"grad_norm": 0.2739085853099823,
"learning_rate": 0.00017071067811865476,
"loss": 0.0302,
"step": 3040
},
{
"epoch": 2.1631205673758864,
"grad_norm": 0.3967369794845581,
"learning_rate": 0.00017048916159237768,
"loss": 0.0328,
"step": 3050
},
{
"epoch": 2.1702127659574466,
"grad_norm": 0.2717509865760803,
"learning_rate": 0.00017026695544851403,
"loss": 0.0253,
"step": 3060
},
{
"epoch": 2.1773049645390072,
"grad_norm": 0.46482571959495544,
"learning_rate": 0.0001700440618609763,
"loss": 0.0265,
"step": 3070
},
{
"epoch": 2.1843971631205674,
"grad_norm": 0.42749258875846863,
"learning_rate": 0.00016982048301040237,
"loss": 0.0362,
"step": 3080
},
{
"epoch": 2.1914893617021276,
"grad_norm": 0.49205732345581055,
"learning_rate": 0.00016959622108413428,
"loss": 0.0332,
"step": 3090
},
{
"epoch": 2.198581560283688,
"grad_norm": 0.3003978729248047,
"learning_rate": 0.00016937127827619685,
"loss": 0.0338,
"step": 3100
},
{
"epoch": 2.2056737588652484,
"grad_norm": 0.41363534331321716,
"learning_rate": 0.00016914565678727617,
"loss": 0.0317,
"step": 3110
},
{
"epoch": 2.2127659574468086,
"grad_norm": 0.25217387080192566,
"learning_rate": 0.000168919358824698,
"loss": 0.0292,
"step": 3120
},
{
"epoch": 2.219858156028369,
"grad_norm": 0.26034829020500183,
"learning_rate": 0.00016869238660240638,
"loss": 0.0286,
"step": 3130
},
{
"epoch": 2.226950354609929,
"grad_norm": 0.333636999130249,
"learning_rate": 0.00016846474234094176,
"loss": 0.0305,
"step": 3140
},
{
"epoch": 2.2340425531914896,
"grad_norm": 0.45352891087532043,
"learning_rate": 0.00016823642826741938,
"loss": 0.0315,
"step": 3150
},
{
"epoch": 2.2411347517730498,
"grad_norm": 0.2589890658855438,
"learning_rate": 0.00016800744661550745,
"loss": 0.0259,
"step": 3160
},
{
"epoch": 2.24822695035461,
"grad_norm": 0.39509129524230957,
"learning_rate": 0.00016777779962540534,
"loss": 0.0331,
"step": 3170
},
{
"epoch": 2.25531914893617,
"grad_norm": 0.42908716201782227,
"learning_rate": 0.00016754748954382165,
"loss": 0.0342,
"step": 3180
},
{
"epoch": 2.2624113475177303,
"grad_norm": 0.28686532378196716,
"learning_rate": 0.0001673165186239521,
"loss": 0.0288,
"step": 3190
},
{
"epoch": 2.269503546099291,
"grad_norm": 0.5497453808784485,
"learning_rate": 0.0001670848891254577,
"loss": 0.0361,
"step": 3200
},
{
"epoch": 2.276595744680851,
"grad_norm": 0.4589519798755646,
"learning_rate": 0.00016685260331444253,
"loss": 0.0266,
"step": 3210
},
{
"epoch": 2.2836879432624113,
"grad_norm": 0.4259999990463257,
"learning_rate": 0.0001666196634634316,
"loss": 0.0273,
"step": 3220
},
{
"epoch": 2.2907801418439715,
"grad_norm": 0.4129974842071533,
"learning_rate": 0.00016638607185134852,
"loss": 0.029,
"step": 3230
},
{
"epoch": 2.297872340425532,
"grad_norm": 0.3736423850059509,
"learning_rate": 0.00016615183076349336,
"loss": 0.0255,
"step": 3240
},
{
"epoch": 2.3049645390070923,
"grad_norm": 0.38349997997283936,
"learning_rate": 0.00016591694249152013,
"loss": 0.026,
"step": 3250
},
{
"epoch": 2.3120567375886525,
"grad_norm": 0.4713188409805298,
"learning_rate": 0.0001656814093334146,
"loss": 0.031,
"step": 3260
},
{
"epoch": 2.3191489361702127,
"grad_norm": 0.5241472721099854,
"learning_rate": 0.00016544523359347143,
"loss": 0.0298,
"step": 3270
},
{
"epoch": 2.326241134751773,
"grad_norm": 0.25673630833625793,
"learning_rate": 0.0001652084175822721,
"loss": 0.0277,
"step": 3280
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.2788539528846741,
"learning_rate": 0.0001649709636166619,
"loss": 0.0293,
"step": 3290
},
{
"epoch": 2.3404255319148937,
"grad_norm": 0.33457013964653015,
"learning_rate": 0.00016473287401972756,
"loss": 0.0331,
"step": 3300
},
{
"epoch": 2.347517730496454,
"grad_norm": 0.240670844912529,
"learning_rate": 0.0001644941511207742,
"loss": 0.0223,
"step": 3310
},
{
"epoch": 2.354609929078014,
"grad_norm": 0.20768260955810547,
"learning_rate": 0.00016425479725530292,
"loss": 0.0239,
"step": 3320
},
{
"epoch": 2.3617021276595747,
"grad_norm": 0.4587162733078003,
"learning_rate": 0.00016401481476498772,
"loss": 0.0278,
"step": 3330
},
{
"epoch": 2.368794326241135,
"grad_norm": 0.4795977473258972,
"learning_rate": 0.00016377420599765255,
"loss": 0.0263,
"step": 3340
},
{
"epoch": 2.375886524822695,
"grad_norm": 0.20740842819213867,
"learning_rate": 0.0001635329733072485,
"loss": 0.0273,
"step": 3350
},
{
"epoch": 2.382978723404255,
"grad_norm": 0.28621232509613037,
"learning_rate": 0.0001632911190538307,
"loss": 0.0289,
"step": 3360
},
{
"epoch": 2.3900709219858154,
"grad_norm": 0.4048430621623993,
"learning_rate": 0.00016304864560353518,
"loss": 0.0337,
"step": 3370
},
{
"epoch": 2.397163120567376,
"grad_norm": 0.3549022674560547,
"learning_rate": 0.00016280555532855576,
"loss": 0.021,
"step": 3380
},
{
"epoch": 2.404255319148936,
"grad_norm": 0.3781268000602722,
"learning_rate": 0.00016256185060712093,
"loss": 0.0278,
"step": 3390
},
{
"epoch": 2.4113475177304964,
"grad_norm": 0.372190922498703,
"learning_rate": 0.00016231753382347047,
"loss": 0.0255,
"step": 3400
},
{
"epoch": 2.4184397163120566,
"grad_norm": 0.20026937127113342,
"learning_rate": 0.00016207260736783203,
"loss": 0.0261,
"step": 3410
},
{
"epoch": 2.425531914893617,
"grad_norm": 0.24735766649246216,
"learning_rate": 0.00016182707363639808,
"loss": 0.0252,
"step": 3420
},
{
"epoch": 2.4326241134751774,
"grad_norm": 0.36811962723731995,
"learning_rate": 0.00016158093503130215,
"loss": 0.0302,
"step": 3430
},
{
"epoch": 2.4397163120567376,
"grad_norm": 0.41325879096984863,
"learning_rate": 0.0001613341939605954,
"loss": 0.0402,
"step": 3440
},
{
"epoch": 2.4468085106382977,
"grad_norm": 0.3262059688568115,
"learning_rate": 0.00016108685283822317,
"loss": 0.027,
"step": 3450
},
{
"epoch": 2.453900709219858,
"grad_norm": 0.34455767273902893,
"learning_rate": 0.0001608389140840013,
"loss": 0.0296,
"step": 3460
},
{
"epoch": 2.4609929078014185,
"grad_norm": 0.42494380474090576,
"learning_rate": 0.0001605903801235924,
"loss": 0.0307,
"step": 3470
},
{
"epoch": 2.4680851063829787,
"grad_norm": 0.289614200592041,
"learning_rate": 0.00016034125338848222,
"loss": 0.0274,
"step": 3480
},
{
"epoch": 2.475177304964539,
"grad_norm": 0.5783960223197937,
"learning_rate": 0.0001600915363159557,
"loss": 0.0355,
"step": 3490
},
{
"epoch": 2.482269503546099,
"grad_norm": 0.28593072295188904,
"learning_rate": 0.00015984123134907345,
"loss": 0.0291,
"step": 3500
},
{
"epoch": 2.4893617021276597,
"grad_norm": 0.2484228014945984,
"learning_rate": 0.00015959034093664738,
"loss": 0.0324,
"step": 3510
},
{
"epoch": 2.49645390070922,
"grad_norm": 0.35120323300361633,
"learning_rate": 0.00015933886753321722,
"loss": 0.0324,
"step": 3520
},
{
"epoch": 2.50354609929078,
"grad_norm": 0.2902463376522064,
"learning_rate": 0.0001590868135990261,
"loss": 0.0265,
"step": 3530
},
{
"epoch": 2.5106382978723403,
"grad_norm": 0.3429517149925232,
"learning_rate": 0.0001588341815999968,
"loss": 0.0376,
"step": 3540
},
{
"epoch": 2.5177304964539005,
"grad_norm": 0.3964468240737915,
"learning_rate": 0.0001585809740077074,
"loss": 0.0263,
"step": 3550
},
{
"epoch": 2.524822695035461,
"grad_norm": 0.1953706592321396,
"learning_rate": 0.0001583271932993673,
"loss": 0.0314,
"step": 3560
},
{
"epoch": 2.5319148936170213,
"grad_norm": 0.35029205679893494,
"learning_rate": 0.00015807284195779272,
"loss": 0.0347,
"step": 3570
},
{
"epoch": 2.5390070921985815,
"grad_norm": 0.229562446475029,
"learning_rate": 0.0001578179224713827,
"loss": 0.0281,
"step": 3580
},
{
"epoch": 2.546099290780142,
"grad_norm": 0.379069447517395,
"learning_rate": 0.00015756243733409456,
"loss": 0.0296,
"step": 3590
},
{
"epoch": 2.5531914893617023,
"grad_norm": 0.17396724224090576,
"learning_rate": 0.00015730638904541957,
"loss": 0.0252,
"step": 3600
},
{
"epoch": 2.5602836879432624,
"grad_norm": 0.29691052436828613,
"learning_rate": 0.00015704978011035845,
"loss": 0.0292,
"step": 3610
},
{
"epoch": 2.5673758865248226,
"grad_norm": 0.39194944500923157,
"learning_rate": 0.000156792613039397,
"loss": 0.0274,
"step": 3620
},
{
"epoch": 2.574468085106383,
"grad_norm": 0.1824718415737152,
"learning_rate": 0.00015653489034848125,
"loss": 0.0252,
"step": 3630
},
{
"epoch": 2.581560283687943,
"grad_norm": 0.22263765335083008,
"learning_rate": 0.00015627661455899327,
"loss": 0.0208,
"step": 3640
},
{
"epoch": 2.5886524822695036,
"grad_norm": 0.2892332077026367,
"learning_rate": 0.00015601778819772613,
"loss": 0.027,
"step": 3650
},
{
"epoch": 2.595744680851064,
"grad_norm": 0.2670251429080963,
"learning_rate": 0.00015575841379685928,
"loss": 0.023,
"step": 3660
},
{
"epoch": 2.602836879432624,
"grad_norm": 0.2950701415538788,
"learning_rate": 0.00015549849389393395,
"loss": 0.0257,
"step": 3670
},
{
"epoch": 2.6099290780141846,
"grad_norm": 0.3107167184352875,
"learning_rate": 0.00015523803103182805,
"loss": 0.0244,
"step": 3680
},
{
"epoch": 2.617021276595745,
"grad_norm": 0.35476645827293396,
"learning_rate": 0.00015497702775873156,
"loss": 0.0229,
"step": 3690
},
{
"epoch": 2.624113475177305,
"grad_norm": 0.37949270009994507,
"learning_rate": 0.00015471548662812133,
"loss": 0.029,
"step": 3700
},
{
"epoch": 2.631205673758865,
"grad_norm": 0.39569422602653503,
"learning_rate": 0.00015445341019873634,
"loss": 0.0312,
"step": 3710
},
{
"epoch": 2.6382978723404253,
"grad_norm": 0.45384731888771057,
"learning_rate": 0.0001541908010345525,
"loss": 0.0293,
"step": 3720
},
{
"epoch": 2.645390070921986,
"grad_norm": 0.3378404676914215,
"learning_rate": 0.0001539276617047577,
"loss": 0.029,
"step": 3730
},
{
"epoch": 2.652482269503546,
"grad_norm": 0.4775332510471344,
"learning_rate": 0.00015366399478372662,
"loss": 0.0294,
"step": 3740
},
{
"epoch": 2.6595744680851063,
"grad_norm": 0.3015674352645874,
"learning_rate": 0.0001533998028509954,
"loss": 0.0315,
"step": 3750
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.27968233823776245,
"learning_rate": 0.00015313508849123668,
"loss": 0.0273,
"step": 3760
},
{
"epoch": 2.673758865248227,
"grad_norm": 0.40850046277046204,
"learning_rate": 0.00015286985429423404,
"loss": 0.0247,
"step": 3770
},
{
"epoch": 2.6808510638297873,
"grad_norm": 0.2494577169418335,
"learning_rate": 0.00015260410285485693,
"loss": 0.0235,
"step": 3780
},
{
"epoch": 2.6879432624113475,
"grad_norm": 0.24964429438114166,
"learning_rate": 0.00015233783677303498,
"loss": 0.0259,
"step": 3790
},
{
"epoch": 2.6950354609929077,
"grad_norm": 0.28433412313461304,
"learning_rate": 0.00015207105865373295,
"loss": 0.0372,
"step": 3800
},
{
"epoch": 2.702127659574468,
"grad_norm": 0.2023301124572754,
"learning_rate": 0.0001518037711069248,
"loss": 0.0275,
"step": 3810
},
{
"epoch": 2.7092198581560285,
"grad_norm": 0.26986822485923767,
"learning_rate": 0.0001515359767475685,
"loss": 0.0284,
"step": 3820
},
{
"epoch": 2.7163120567375887,
"grad_norm": 0.2430671602487564,
"learning_rate": 0.00015126767819558022,
"loss": 0.0241,
"step": 3830
},
{
"epoch": 2.723404255319149,
"grad_norm": 0.2965194582939148,
"learning_rate": 0.00015099887807580904,
"loss": 0.029,
"step": 3840
},
{
"epoch": 2.7304964539007095,
"grad_norm": 0.2736563980579376,
"learning_rate": 0.00015072957901801076,
"loss": 0.0264,
"step": 3850
},
{
"epoch": 2.7375886524822697,
"grad_norm": 0.233638733625412,
"learning_rate": 0.00015045978365682257,
"loss": 0.0298,
"step": 3860
},
{
"epoch": 2.74468085106383,
"grad_norm": 0.350392609834671,
"learning_rate": 0.0001501894946317372,
"loss": 0.0278,
"step": 3870
},
{
"epoch": 2.75177304964539,
"grad_norm": 0.2875712215900421,
"learning_rate": 0.00014991871458707698,
"loss": 0.0302,
"step": 3880
},
{
"epoch": 2.7588652482269502,
"grad_norm": 0.3458658754825592,
"learning_rate": 0.000149647446171968,
"loss": 0.0316,
"step": 3890
},
{
"epoch": 2.7659574468085104,
"grad_norm": 0.2638990879058838,
"learning_rate": 0.00014937569204031436,
"loss": 0.0248,
"step": 3900
},
{
"epoch": 2.773049645390071,
"grad_norm": 0.34576040506362915,
"learning_rate": 0.00014910345485077197,
"loss": 0.0211,
"step": 3910
},
{
"epoch": 2.780141843971631,
"grad_norm": 0.3976755738258362,
"learning_rate": 0.00014883073726672269,
"loss": 0.0225,
"step": 3920
},
{
"epoch": 2.7872340425531914,
"grad_norm": 0.36923620104789734,
"learning_rate": 0.00014855754195624822,
"loss": 0.0283,
"step": 3930
},
{
"epoch": 2.794326241134752,
"grad_norm": 0.3352384865283966,
"learning_rate": 0.00014828387159210397,
"loss": 0.0334,
"step": 3940
},
{
"epoch": 2.801418439716312,
"grad_norm": 0.30438244342803955,
"learning_rate": 0.00014800972885169303,
"loss": 0.0322,
"step": 3950
},
{
"epoch": 2.8085106382978724,
"grad_norm": 0.3594920337200165,
"learning_rate": 0.00014773511641703987,
"loss": 0.0399,
"step": 3960
},
{
"epoch": 2.8156028368794326,
"grad_norm": 0.31048035621643066,
"learning_rate": 0.00014746003697476404,
"loss": 0.0273,
"step": 3970
},
{
"epoch": 2.8226950354609928,
"grad_norm": 0.40405237674713135,
"learning_rate": 0.0001471844932160541,
"loss": 0.0271,
"step": 3980
},
{
"epoch": 2.829787234042553,
"grad_norm": 0.2698522210121155,
"learning_rate": 0.00014690848783664108,
"loss": 0.0241,
"step": 3990
},
{
"epoch": 2.8368794326241136,
"grad_norm": 0.3636997640132904,
"learning_rate": 0.00014663202353677222,
"loss": 0.0233,
"step": 4000
},
{
"epoch": 2.8439716312056738,
"grad_norm": 0.24219240248203278,
"learning_rate": 0.00014635510302118452,
"loss": 0.0228,
"step": 4010
},
{
"epoch": 2.851063829787234,
"grad_norm": 0.309758722782135,
"learning_rate": 0.00014607772899907824,
"loss": 0.0217,
"step": 4020
},
{
"epoch": 2.8581560283687946,
"grad_norm": 0.33416473865509033,
"learning_rate": 0.0001457999041840906,
"loss": 0.0236,
"step": 4030
},
{
"epoch": 2.8652482269503547,
"grad_norm": 0.30040785670280457,
"learning_rate": 0.00014552163129426875,
"loss": 0.0273,
"step": 4040
},
{
"epoch": 2.872340425531915,
"grad_norm": 0.35119491815567017,
"learning_rate": 0.00014524291305204382,
"loss": 0.0286,
"step": 4050
},
{
"epoch": 2.879432624113475,
"grad_norm": 0.31673938035964966,
"learning_rate": 0.00014496375218420383,
"loss": 0.0292,
"step": 4060
},
{
"epoch": 2.8865248226950353,
"grad_norm": 0.20967337489128113,
"learning_rate": 0.00014468415142186708,
"loss": 0.0298,
"step": 4070
},
{
"epoch": 2.8936170212765955,
"grad_norm": 0.5264660120010376,
"learning_rate": 0.0001444041135004556,
"loss": 0.0216,
"step": 4080
},
{
"epoch": 2.900709219858156,
"grad_norm": 0.3425130248069763,
"learning_rate": 0.0001441236411596683,
"loss": 0.0239,
"step": 4090
},
{
"epoch": 2.9078014184397163,
"grad_norm": 0.2979462742805481,
"learning_rate": 0.00014384273714345403,
"loss": 0.0243,
"step": 4100
},
{
"epoch": 2.9148936170212765,
"grad_norm": 0.2593551576137543,
"learning_rate": 0.00014356140419998493,
"loss": 0.0236,
"step": 4110
},
{
"epoch": 2.921985815602837,
"grad_norm": 0.41503509879112244,
"learning_rate": 0.0001432796450816295,
"loss": 0.0235,
"step": 4120
},
{
"epoch": 2.9290780141843973,
"grad_norm": 0.30138614773750305,
"learning_rate": 0.00014299746254492555,
"loss": 0.0209,
"step": 4130
},
{
"epoch": 2.9361702127659575,
"grad_norm": 0.34066081047058105,
"learning_rate": 0.00014271485935055347,
"loss": 0.022,
"step": 4140
},
{
"epoch": 2.9432624113475176,
"grad_norm": 0.45455053448677063,
"learning_rate": 0.00014243183826330894,
"loss": 0.0303,
"step": 4150
},
{
"epoch": 2.950354609929078,
"grad_norm": 0.19702021777629852,
"learning_rate": 0.00014214840205207605,
"loss": 0.0226,
"step": 4160
},
{
"epoch": 2.9574468085106385,
"grad_norm": 0.3477088212966919,
"learning_rate": 0.0001418645534898002,
"loss": 0.0233,
"step": 4170
},
{
"epoch": 2.9645390070921986,
"grad_norm": 0.349435955286026,
"learning_rate": 0.00014158029535346096,
"loss": 0.0252,
"step": 4180
},
{
"epoch": 2.971631205673759,
"grad_norm": 0.1625533252954483,
"learning_rate": 0.00014129563042404483,
"loss": 0.0217,
"step": 4190
},
{
"epoch": 2.978723404255319,
"grad_norm": 0.19161160290241241,
"learning_rate": 0.00014101056148651823,
"loss": 0.0173,
"step": 4200
},
{
"epoch": 2.9858156028368796,
"grad_norm": 0.22051677107810974,
"learning_rate": 0.00014072509132979994,
"loss": 0.0263,
"step": 4210
},
{
"epoch": 2.99290780141844,
"grad_norm": 0.3433932960033417,
"learning_rate": 0.0001404392227467341,
"loss": 0.02,
"step": 4220
},
{
"epoch": 3.0,
"grad_norm": 0.5353679060935974,
"learning_rate": 0.0001401529585340628,
"loss": 0.0218,
"step": 4230
},
{
"epoch": 3.00709219858156,
"grad_norm": 0.2597915232181549,
"learning_rate": 0.0001398663014923986,
"loss": 0.0175,
"step": 4240
},
{
"epoch": 3.0141843971631204,
"grad_norm": 0.38695409893989563,
"learning_rate": 0.00013957925442619737,
"loss": 0.0281,
"step": 4250
},
{
"epoch": 3.021276595744681,
"grad_norm": 0.33511126041412354,
"learning_rate": 0.00013929182014373054,
"loss": 0.0211,
"step": 4260
},
{
"epoch": 3.028368794326241,
"grad_norm": 0.2662104368209839,
"learning_rate": 0.00013900400145705794,
"loss": 0.0252,
"step": 4270
},
{
"epoch": 3.0354609929078014,
"grad_norm": 0.4052877128124237,
"learning_rate": 0.00013871580118200006,
"loss": 0.0245,
"step": 4280
},
{
"epoch": 3.0425531914893615,
"grad_norm": 0.428830623626709,
"learning_rate": 0.0001384272221381107,
"loss": 0.0396,
"step": 4290
},
{
"epoch": 3.049645390070922,
"grad_norm": 0.20592275261878967,
"learning_rate": 0.0001381382671486491,
"loss": 0.0195,
"step": 4300
},
{
"epoch": 3.0567375886524824,
"grad_norm": 0.29792851209640503,
"learning_rate": 0.00013784893904055266,
"loss": 0.0221,
"step": 4310
},
{
"epoch": 3.0638297872340425,
"grad_norm": 0.3490282893180847,
"learning_rate": 0.00013755924064440904,
"loss": 0.0255,
"step": 4320
},
{
"epoch": 3.0709219858156027,
"grad_norm": 0.245464026927948,
"learning_rate": 0.00013726917479442855,
"loss": 0.0239,
"step": 4330
},
{
"epoch": 3.078014184397163,
"grad_norm": 0.28532829880714417,
"learning_rate": 0.00013697874432841637,
"loss": 0.0206,
"step": 4340
},
{
"epoch": 3.0851063829787235,
"grad_norm": 0.45370057225227356,
"learning_rate": 0.00013668795208774496,
"loss": 0.0261,
"step": 4350
},
{
"epoch": 3.0921985815602837,
"grad_norm": 0.39404305815696716,
"learning_rate": 0.00013639680091732603,
"loss": 0.0244,
"step": 4360
},
{
"epoch": 3.099290780141844,
"grad_norm": 0.3023253083229065,
"learning_rate": 0.00013610529366558282,
"loss": 0.0237,
"step": 4370
},
{
"epoch": 3.106382978723404,
"grad_norm": 0.22088029980659485,
"learning_rate": 0.00013581343318442226,
"loss": 0.0185,
"step": 4380
},
{
"epoch": 3.1134751773049647,
"grad_norm": 0.28878822922706604,
"learning_rate": 0.00013552122232920707,
"loss": 0.0264,
"step": 4390
},
{
"epoch": 3.120567375886525,
"grad_norm": 0.2687053084373474,
"learning_rate": 0.00013522866395872758,
"loss": 0.0206,
"step": 4400
},
{
"epoch": 3.127659574468085,
"grad_norm": 0.32471123337745667,
"learning_rate": 0.00013493576093517434,
"loss": 0.0211,
"step": 4410
},
{
"epoch": 3.1347517730496453,
"grad_norm": 0.19945155084133148,
"learning_rate": 0.00013464251612410936,
"loss": 0.0225,
"step": 4420
},
{
"epoch": 3.141843971631206,
"grad_norm": 0.203238844871521,
"learning_rate": 0.00013434893239443877,
"loss": 0.019,
"step": 4430
},
{
"epoch": 3.148936170212766,
"grad_norm": 0.2069423794746399,
"learning_rate": 0.00013405501261838423,
"loss": 0.0246,
"step": 4440
},
{
"epoch": 3.1560283687943262,
"grad_norm": 0.3187784254550934,
"learning_rate": 0.00013376075967145524,
"loss": 0.0222,
"step": 4450
},
{
"epoch": 3.1631205673758864,
"grad_norm": 0.31714534759521484,
"learning_rate": 0.00013346617643242062,
"loss": 0.0246,
"step": 4460
},
{
"epoch": 3.1702127659574466,
"grad_norm": 0.4024612605571747,
"learning_rate": 0.00013317126578328065,
"loss": 0.0282,
"step": 4470
},
{
"epoch": 3.1773049645390072,
"grad_norm": 0.2583388388156891,
"learning_rate": 0.00013287603060923876,
"loss": 0.0184,
"step": 4480
},
{
"epoch": 3.1843971631205674,
"grad_norm": 0.28229662775993347,
"learning_rate": 0.00013258047379867334,
"loss": 0.0259,
"step": 4490
},
{
"epoch": 3.1914893617021276,
"grad_norm": 0.2962813973426819,
"learning_rate": 0.00013228459824310936,
"loss": 0.0327,
"step": 4500
},
{
"epoch": 3.198581560283688,
"grad_norm": 0.2835753858089447,
"learning_rate": 0.00013198840683719022,
"loss": 0.0174,
"step": 4510
},
{
"epoch": 3.2056737588652484,
"grad_norm": 0.3171481788158417,
"learning_rate": 0.00013169190247864943,
"loss": 0.0315,
"step": 4520
},
{
"epoch": 3.2127659574468086,
"grad_norm": 0.3521289527416229,
"learning_rate": 0.0001313950880682821,
"loss": 0.0209,
"step": 4530
},
{
"epoch": 3.219858156028369,
"grad_norm": 0.3271763324737549,
"learning_rate": 0.00013109796650991683,
"loss": 0.0183,
"step": 4540
},
{
"epoch": 3.226950354609929,
"grad_norm": 0.29443103075027466,
"learning_rate": 0.00013080054071038698,
"loss": 0.0285,
"step": 4550
},
{
"epoch": 3.2340425531914896,
"grad_norm": 0.35706964135169983,
"learning_rate": 0.00013050281357950255,
"loss": 0.026,
"step": 4560
},
{
"epoch": 3.2411347517730498,
"grad_norm": 0.3586270809173584,
"learning_rate": 0.00013020478803002142,
"loss": 0.025,
"step": 4570
},
{
"epoch": 3.24822695035461,
"grad_norm": 0.2771762013435364,
"learning_rate": 0.00012990646697762107,
"loss": 0.0222,
"step": 4580
},
{
"epoch": 3.25531914893617,
"grad_norm": 0.22596819698810577,
"learning_rate": 0.00012960785334087,
"loss": 0.0211,
"step": 4590
},
{
"epoch": 3.2624113475177303,
"grad_norm": 0.32333022356033325,
"learning_rate": 0.00012930895004119907,
"loss": 0.0189,
"step": 4600
},
{
"epoch": 3.269503546099291,
"grad_norm": 0.35000088810920715,
"learning_rate": 0.00012900976000287313,
"loss": 0.0284,
"step": 4610
},
{
"epoch": 3.276595744680851,
"grad_norm": 0.3429652154445648,
"learning_rate": 0.00012871028615296212,
"loss": 0.0307,
"step": 4620
},
{
"epoch": 3.2836879432624113,
"grad_norm": 0.2919664978981018,
"learning_rate": 0.00012841053142131272,
"loss": 0.0226,
"step": 4630
},
{
"epoch": 3.2907801418439715,
"grad_norm": 0.30846384167671204,
"learning_rate": 0.00012811049874051955,
"loss": 0.0199,
"step": 4640
},
{
"epoch": 3.297872340425532,
"grad_norm": 0.4330957531929016,
"learning_rate": 0.00012781019104589645,
"loss": 0.0283,
"step": 4650
},
{
"epoch": 3.3049645390070923,
"grad_norm": 0.2327142059803009,
"learning_rate": 0.0001275096112754478,
"loss": 0.0295,
"step": 4660
},
{
"epoch": 3.3120567375886525,
"grad_norm": 0.32682859897613525,
"learning_rate": 0.00012720876236983988,
"loss": 0.0364,
"step": 4670
},
{
"epoch": 3.3191489361702127,
"grad_norm": 0.25689056515693665,
"learning_rate": 0.00012690764727237193,
"loss": 0.02,
"step": 4680
},
{
"epoch": 3.326241134751773,
"grad_norm": 0.275511234998703,
"learning_rate": 0.0001266062689289474,
"loss": 0.0307,
"step": 4690
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.3178229331970215,
"learning_rate": 0.00012630463028804513,
"loss": 0.0269,
"step": 4700
},
{
"epoch": 3.3404255319148937,
"grad_norm": 0.2833107113838196,
"learning_rate": 0.00012600273430069073,
"loss": 0.0247,
"step": 4710
},
{
"epoch": 3.347517730496454,
"grad_norm": 0.3253328800201416,
"learning_rate": 0.0001257005839204273,
"loss": 0.0217,
"step": 4720
},
{
"epoch": 3.354609929078014,
"grad_norm": 0.26066187024116516,
"learning_rate": 0.00012539818210328683,
"loss": 0.0201,
"step": 4730
},
{
"epoch": 3.3617021276595747,
"grad_norm": 0.24645408987998962,
"learning_rate": 0.0001250955318077612,
"loss": 0.0315,
"step": 4740
},
{
"epoch": 3.368794326241135,
"grad_norm": 0.3252389132976532,
"learning_rate": 0.00012479263599477318,
"loss": 0.0203,
"step": 4750
},
{
"epoch": 3.375886524822695,
"grad_norm": 0.21474260091781616,
"learning_rate": 0.00012448949762764762,
"loss": 0.0207,
"step": 4760
},
{
"epoch": 3.382978723404255,
"grad_norm": 0.22793646156787872,
"learning_rate": 0.00012418611967208223,
"loss": 0.0211,
"step": 4770
},
{
"epoch": 3.3900709219858154,
"grad_norm": 0.2615390419960022,
"learning_rate": 0.00012388250509611876,
"loss": 0.0344,
"step": 4780
},
{
"epoch": 3.397163120567376,
"grad_norm": 0.4156443476676941,
"learning_rate": 0.00012357865687011389,
"loss": 0.0299,
"step": 4790
},
{
"epoch": 3.404255319148936,
"grad_norm": 0.49900200963020325,
"learning_rate": 0.00012327457796671015,
"loss": 0.0245,
"step": 4800
},
{
"epoch": 3.4113475177304964,
"grad_norm": 0.37122806906700134,
"learning_rate": 0.00012297027136080687,
"loss": 0.0276,
"step": 4810
},
{
"epoch": 3.4184397163120566,
"grad_norm": 0.28465649485588074,
"learning_rate": 0.00012266574002953108,
"loss": 0.0196,
"step": 4820
},
{
"epoch": 3.425531914893617,
"grad_norm": 0.24766407907009125,
"learning_rate": 0.00012236098695220831,
"loss": 0.0256,
"step": 4830
},
{
"epoch": 3.4326241134751774,
"grad_norm": 0.27211466431617737,
"learning_rate": 0.0001220560151103336,
"loss": 0.0284,
"step": 4840
},
{
"epoch": 3.4397163120567376,
"grad_norm": 0.2607908546924591,
"learning_rate": 0.00012175082748754212,
"loss": 0.0213,
"step": 4850
},
{
"epoch": 3.4468085106382977,
"grad_norm": 0.22450798749923706,
"learning_rate": 0.0001214454270695802,
"loss": 0.0243,
"step": 4860
},
{
"epoch": 3.453900709219858,
"grad_norm": 0.2559250295162201,
"learning_rate": 0.00012113981684427591,
"loss": 0.0302,
"step": 4870
},
{
"epoch": 3.4609929078014185,
"grad_norm": 0.4311963617801666,
"learning_rate": 0.00012083399980151,
"loss": 0.0275,
"step": 4880
},
{
"epoch": 3.4680851063829787,
"grad_norm": 0.24473054707050323,
"learning_rate": 0.00012052797893318657,
"loss": 0.0251,
"step": 4890
},
{
"epoch": 3.475177304964539,
"grad_norm": 0.24619214236736298,
"learning_rate": 0.00012022175723320381,
"loss": 0.0198,
"step": 4900
},
{
"epoch": 3.482269503546099,
"grad_norm": 0.3668628931045532,
"learning_rate": 0.00011991533769742469,
"loss": 0.0313,
"step": 4910
},
{
"epoch": 3.4893617021276597,
"grad_norm": 0.4206676483154297,
"learning_rate": 0.00011960872332364765,
"loss": 0.0296,
"step": 4920
},
{
"epoch": 3.49645390070922,
"grad_norm": 0.28001976013183594,
"learning_rate": 0.00011930191711157737,
"loss": 0.0243,
"step": 4930
},
{
"epoch": 3.50354609929078,
"grad_norm": 0.2904788553714752,
"learning_rate": 0.00011899492206279524,
"loss": 0.0215,
"step": 4940
},
{
"epoch": 3.5106382978723403,
"grad_norm": 0.3145068883895874,
"learning_rate": 0.0001186877411807302,
"loss": 0.0254,
"step": 4950
},
{
"epoch": 3.5177304964539005,
"grad_norm": 0.42452743649482727,
"learning_rate": 0.0001183803774706292,
"loss": 0.0249,
"step": 4960
},
{
"epoch": 3.524822695035461,
"grad_norm": 0.4683021903038025,
"learning_rate": 0.00011807283393952786,
"loss": 0.0218,
"step": 4970
},
{
"epoch": 3.5319148936170213,
"grad_norm": 0.20090143382549286,
"learning_rate": 0.00011776511359622105,
"loss": 0.0187,
"step": 4980
},
{
"epoch": 3.5390070921985815,
"grad_norm": 0.20158180594444275,
"learning_rate": 0.00011745721945123343,
"loss": 0.0263,
"step": 4990
},
{
"epoch": 3.546099290780142,
"grad_norm": 0.2160181701183319,
"learning_rate": 0.00011714915451679003,
"loss": 0.0253,
"step": 5000
},
{
"epoch": 3.5531914893617023,
"grad_norm": 0.2958919405937195,
"learning_rate": 0.00011684092180678683,
"loss": 0.0276,
"step": 5010
},
{
"epoch": 3.5602836879432624,
"grad_norm": 0.3748587667942047,
"learning_rate": 0.00011653252433676108,
"loss": 0.0244,
"step": 5020
},
{
"epoch": 3.5673758865248226,
"grad_norm": 0.1788649708032608,
"learning_rate": 0.00011622396512386202,
"loss": 0.0217,
"step": 5030
},
{
"epoch": 3.574468085106383,
"grad_norm": 0.21001924574375153,
"learning_rate": 0.00011591524718682127,
"loss": 0.019,
"step": 5040
},
{
"epoch": 3.581560283687943,
"grad_norm": 0.22885674238204956,
"learning_rate": 0.00011560637354592332,
"loss": 0.0185,
"step": 5050
},
{
"epoch": 3.5886524822695036,
"grad_norm": 0.29409468173980713,
"learning_rate": 0.0001152973472229758,
"loss": 0.0167,
"step": 5060
},
{
"epoch": 3.595744680851064,
"grad_norm": 0.30863863229751587,
"learning_rate": 0.00011498817124128032,
"loss": 0.0254,
"step": 5070
},
{
"epoch": 3.602836879432624,
"grad_norm": 0.18340665102005005,
"learning_rate": 0.00011467884862560245,
"loss": 0.0255,
"step": 5080
},
{
"epoch": 3.6099290780141846,
"grad_norm": 0.28949764370918274,
"learning_rate": 0.00011436938240214241,
"loss": 0.0303,
"step": 5090
},
{
"epoch": 3.617021276595745,
"grad_norm": 0.19286899268627167,
"learning_rate": 0.0001140597755985054,
"loss": 0.0284,
"step": 5100
},
{
"epoch": 3.624113475177305,
"grad_norm": 0.4090544879436493,
"learning_rate": 0.00011375003124367192,
"loss": 0.0218,
"step": 5110
},
{
"epoch": 3.631205673758865,
"grad_norm": 0.32162442803382874,
"learning_rate": 0.00011344015236796822,
"loss": 0.0253,
"step": 5120
},
{
"epoch": 3.6382978723404253,
"grad_norm": 0.2677665054798126,
"learning_rate": 0.00011313014200303647,
"loss": 0.0169,
"step": 5130
},
{
"epoch": 3.645390070921986,
"grad_norm": 0.4189298152923584,
"learning_rate": 0.00011282000318180545,
"loss": 0.0205,
"step": 5140
},
{
"epoch": 3.652482269503546,
"grad_norm": 0.2897457778453827,
"learning_rate": 0.00011250973893846055,
"loss": 0.0207,
"step": 5150
},
{
"epoch": 3.6595744680851063,
"grad_norm": 0.21248039603233337,
"learning_rate": 0.00011219935230841421,
"loss": 0.0221,
"step": 5160
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.23816858232021332,
"learning_rate": 0.00011188884632827619,
"loss": 0.0209,
"step": 5170
},
{
"epoch": 3.673758865248227,
"grad_norm": 0.24563542008399963,
"learning_rate": 0.00011157822403582399,
"loss": 0.0233,
"step": 5180
},
{
"epoch": 3.6808510638297873,
"grad_norm": 0.23103229701519012,
"learning_rate": 0.0001112674884699729,
"loss": 0.0157,
"step": 5190
},
{
"epoch": 3.6879432624113475,
"grad_norm": 0.27050310373306274,
"learning_rate": 0.00011095664267074655,
"loss": 0.0214,
"step": 5200
},
{
"epoch": 3.6950354609929077,
"grad_norm": 0.16521809995174408,
"learning_rate": 0.00011064568967924683,
"loss": 0.0224,
"step": 5210
},
{
"epoch": 3.702127659574468,
"grad_norm": 0.19685564935207367,
"learning_rate": 0.00011033463253762452,
"loss": 0.0157,
"step": 5220
},
{
"epoch": 3.7092198581560285,
"grad_norm": 0.2071635127067566,
"learning_rate": 0.0001100234742890492,
"loss": 0.0196,
"step": 5230
},
{
"epoch": 3.7163120567375887,
"grad_norm": 0.19163092970848083,
"learning_rate": 0.00010971221797767966,
"loss": 0.0183,
"step": 5240
},
{
"epoch": 3.723404255319149,
"grad_norm": 0.43572723865509033,
"learning_rate": 0.00010940086664863404,
"loss": 0.0189,
"step": 5250
},
{
"epoch": 3.7304964539007095,
"grad_norm": 0.19704580307006836,
"learning_rate": 0.00010908942334796015,
"loss": 0.0213,
"step": 5260
},
{
"epoch": 3.7375886524822697,
"grad_norm": 0.30172115564346313,
"learning_rate": 0.00010877789112260551,
"loss": 0.0242,
"step": 5270
},
{
"epoch": 3.74468085106383,
"grad_norm": 0.2991447150707245,
"learning_rate": 0.00010846627302038756,
"loss": 0.0163,
"step": 5280
},
{
"epoch": 3.75177304964539,
"grad_norm": 0.2086688131093979,
"learning_rate": 0.00010815457208996407,
"loss": 0.0162,
"step": 5290
},
{
"epoch": 3.7588652482269502,
"grad_norm": 0.25282007455825806,
"learning_rate": 0.000107842791380803,
"loss": 0.0292,
"step": 5300
},
{
"epoch": 3.7659574468085104,
"grad_norm": 0.21109668910503387,
"learning_rate": 0.0001075309339431529,
"loss": 0.0283,
"step": 5310
},
{
"epoch": 3.773049645390071,
"grad_norm": 0.212016299366951,
"learning_rate": 0.00010721900282801287,
"loss": 0.0248,
"step": 5320
},
{
"epoch": 3.780141843971631,
"grad_norm": 0.19006933271884918,
"learning_rate": 0.00010690700108710297,
"loss": 0.0247,
"step": 5330
},
{
"epoch": 3.7872340425531914,
"grad_norm": 0.28409719467163086,
"learning_rate": 0.00010659493177283408,
"loss": 0.0236,
"step": 5340
},
{
"epoch": 3.794326241134752,
"grad_norm": 0.3740707337856293,
"learning_rate": 0.00010628279793827825,
"loss": 0.0244,
"step": 5350
},
{
"epoch": 3.801418439716312,
"grad_norm": 0.20624110102653503,
"learning_rate": 0.00010597060263713872,
"loss": 0.0213,
"step": 5360
},
{
"epoch": 3.8085106382978724,
"grad_norm": 0.2841550409793854,
"learning_rate": 0.0001056583489237201,
"loss": 0.0156,
"step": 5370
},
{
"epoch": 3.8156028368794326,
"grad_norm": 0.3967061936855316,
"learning_rate": 0.00010534603985289844,
"loss": 0.0233,
"step": 5380
},
{
"epoch": 3.8226950354609928,
"grad_norm": 0.33291056752204895,
"learning_rate": 0.00010503367848009133,
"loss": 0.0214,
"step": 5390
},
{
"epoch": 3.829787234042553,
"grad_norm": 0.3669842779636383,
"learning_rate": 0.00010472126786122818,
"loss": 0.0255,
"step": 5400
},
{
"epoch": 3.8368794326241136,
"grad_norm": 0.2933257520198822,
"learning_rate": 0.00010440881105272007,
"loss": 0.021,
"step": 5410
},
{
"epoch": 3.8439716312056738,
"grad_norm": 0.1543368250131607,
"learning_rate": 0.00010409631111142997,
"loss": 0.0206,
"step": 5420
},
{
"epoch": 3.851063829787234,
"grad_norm": 0.2014048546552658,
"learning_rate": 0.0001037837710946429,
"loss": 0.0198,
"step": 5430
},
{
"epoch": 3.8581560283687946,
"grad_norm": 0.2781049907207489,
"learning_rate": 0.00010347119406003592,
"loss": 0.0261,
"step": 5440
},
{
"epoch": 3.8652482269503547,
"grad_norm": 0.26720258593559265,
"learning_rate": 0.0001031585830656482,
"loss": 0.0216,
"step": 5450
},
{
"epoch": 3.872340425531915,
"grad_norm": 0.23284177482128143,
"learning_rate": 0.00010284594116985125,
"loss": 0.0195,
"step": 5460
},
{
"epoch": 3.879432624113475,
"grad_norm": 0.36926499009132385,
"learning_rate": 0.00010253327143131879,
"loss": 0.0226,
"step": 5470
},
{
"epoch": 3.8865248226950353,
"grad_norm": 0.3114331066608429,
"learning_rate": 0.00010222057690899705,
"loss": 0.0242,
"step": 5480
},
{
"epoch": 3.8936170212765955,
"grad_norm": 0.2598218023777008,
"learning_rate": 0.00010190786066207458,
"loss": 0.0174,
"step": 5490
},
{
"epoch": 3.900709219858156,
"grad_norm": 0.21172010898590088,
"learning_rate": 0.00010159512574995258,
"loss": 0.0164,
"step": 5500
},
{
"epoch": 3.9078014184397163,
"grad_norm": 0.23221197724342346,
"learning_rate": 0.00010128237523221487,
"loss": 0.0174,
"step": 5510
},
{
"epoch": 3.9148936170212765,
"grad_norm": 0.21310311555862427,
"learning_rate": 0.00010096961216859787,
"loss": 0.0186,
"step": 5520
},
{
"epoch": 3.921985815602837,
"grad_norm": 0.22581326961517334,
"learning_rate": 0.00010065683961896074,
"loss": 0.0201,
"step": 5530
},
{
"epoch": 3.9290780141843973,
"grad_norm": 0.36404818296432495,
"learning_rate": 0.00010034406064325553,
"loss": 0.0188,
"step": 5540
},
{
"epoch": 3.9361702127659575,
"grad_norm": 0.2163950651884079,
"learning_rate": 0.00010003127830149706,
"loss": 0.0159,
"step": 5550
},
{
"epoch": 3.9432624113475176,
"grad_norm": 0.5740591883659363,
"learning_rate": 9.971849565373317e-05,
"loss": 0.0293,
"step": 5560
},
{
"epoch": 3.950354609929078,
"grad_norm": 0.31515783071517944,
"learning_rate": 9.940571576001465e-05,
"loss": 0.0214,
"step": 5570
},
{
"epoch": 3.9574468085106385,
"grad_norm": 0.3176766335964203,
"learning_rate": 9.909294168036531e-05,
"loss": 0.018,
"step": 5580
},
{
"epoch": 3.9645390070921986,
"grad_norm": 0.2528095245361328,
"learning_rate": 9.87801764747521e-05,
"loss": 0.0221,
"step": 5590
},
{
"epoch": 3.971631205673759,
"grad_norm": 0.30647730827331543,
"learning_rate": 9.846742320305527e-05,
"loss": 0.0261,
"step": 5600
},
{
"epoch": 3.978723404255319,
"grad_norm": 0.2308386266231537,
"learning_rate": 9.815468492503812e-05,
"loss": 0.017,
"step": 5610
},
{
"epoch": 3.9858156028368796,
"grad_norm": 0.21092812716960907,
"learning_rate": 9.78419647003174e-05,
"loss": 0.023,
"step": 5620
},
{
"epoch": 3.99290780141844,
"grad_norm": 0.3230001628398895,
"learning_rate": 9.752926558833317e-05,
"loss": 0.0174,
"step": 5630
},
{
"epoch": 4.0,
"grad_norm": 0.3302082419395447,
"learning_rate": 9.721659064831895e-05,
"loss": 0.0206,
"step": 5640
},
{
"epoch": 4.00709219858156,
"grad_norm": 0.2825748026371002,
"learning_rate": 9.690394293927189e-05,
"loss": 0.0203,
"step": 5650
},
{
"epoch": 4.01418439716312,
"grad_norm": 0.2544749081134796,
"learning_rate": 9.659132551992248e-05,
"loss": 0.0191,
"step": 5660
},
{
"epoch": 4.0212765957446805,
"grad_norm": 0.19003655016422272,
"learning_rate": 9.627874144870514e-05,
"loss": 0.0167,
"step": 5670
},
{
"epoch": 4.028368794326241,
"grad_norm": 0.18228965997695923,
"learning_rate": 9.596619378372794e-05,
"loss": 0.0286,
"step": 5680
},
{
"epoch": 4.035460992907802,
"grad_norm": 0.27924972772598267,
"learning_rate": 9.565368558274266e-05,
"loss": 0.0126,
"step": 5690
},
{
"epoch": 4.042553191489362,
"grad_norm": 0.1970324069261551,
"learning_rate": 9.534121990311515e-05,
"loss": 0.0192,
"step": 5700
},
{
"epoch": 4.049645390070922,
"grad_norm": 0.37134411931037903,
"learning_rate": 9.502879980179525e-05,
"loss": 0.0233,
"step": 5710
},
{
"epoch": 4.056737588652482,
"grad_norm": 0.20801763236522675,
"learning_rate": 9.471642833528673e-05,
"loss": 0.0176,
"step": 5720
},
{
"epoch": 4.0638297872340425,
"grad_norm": 0.19383108615875244,
"learning_rate": 9.440410855961776e-05,
"loss": 0.0228,
"step": 5730
},
{
"epoch": 4.070921985815603,
"grad_norm": 0.21508440375328064,
"learning_rate": 9.409184353031068e-05,
"loss": 0.0163,
"step": 5740
},
{
"epoch": 4.078014184397163,
"grad_norm": 0.2944611608982086,
"learning_rate": 9.377963630235225e-05,
"loss": 0.0219,
"step": 5750
},
{
"epoch": 4.085106382978723,
"grad_norm": 0.23027314245700836,
"learning_rate": 9.346748993016377e-05,
"loss": 0.0177,
"step": 5760
},
{
"epoch": 4.092198581560283,
"grad_norm": 0.17456182837486267,
"learning_rate": 9.315540746757108e-05,
"loss": 0.0157,
"step": 5770
},
{
"epoch": 4.099290780141844,
"grad_norm": 0.34717416763305664,
"learning_rate": 9.284339196777491e-05,
"loss": 0.0244,
"step": 5780
},
{
"epoch": 4.1063829787234045,
"grad_norm": 0.38114133477211,
"learning_rate": 9.25314464833208e-05,
"loss": 0.0189,
"step": 5790
},
{
"epoch": 4.113475177304965,
"grad_norm": 0.3224876821041107,
"learning_rate": 9.221957406606926e-05,
"loss": 0.0196,
"step": 5800
},
{
"epoch": 4.120567375886525,
"grad_norm": 0.38748404383659363,
"learning_rate": 9.190777776716606e-05,
"loss": 0.0271,
"step": 5810
},
{
"epoch": 4.127659574468085,
"grad_norm": 0.3015083074569702,
"learning_rate": 9.159606063701221e-05,
"loss": 0.0263,
"step": 5820
},
{
"epoch": 4.134751773049645,
"grad_norm": 0.23368023335933685,
"learning_rate": 9.128442572523417e-05,
"loss": 0.0199,
"step": 5830
},
{
"epoch": 4.141843971631205,
"grad_norm": 0.209278866648674,
"learning_rate": 9.097287608065414e-05,
"loss": 0.0157,
"step": 5840
},
{
"epoch": 4.148936170212766,
"grad_norm": 0.21174615621566772,
"learning_rate": 9.066141475126003e-05,
"loss": 0.0161,
"step": 5850
},
{
"epoch": 4.156028368794326,
"grad_norm": 0.22363576292991638,
"learning_rate": 9.035004478417573e-05,
"loss": 0.0202,
"step": 5860
},
{
"epoch": 4.163120567375887,
"grad_norm": 0.22810956835746765,
"learning_rate": 9.003876922563137e-05,
"loss": 0.0179,
"step": 5870
},
{
"epoch": 4.170212765957447,
"grad_norm": 0.1800043135881424,
"learning_rate": 8.972759112093336e-05,
"loss": 0.0235,
"step": 5880
},
{
"epoch": 4.177304964539007,
"grad_norm": 0.2977979779243469,
"learning_rate": 8.941651351443476e-05,
"loss": 0.0167,
"step": 5890
},
{
"epoch": 4.184397163120567,
"grad_norm": 0.1779855191707611,
"learning_rate": 8.910553944950549e-05,
"loss": 0.0156,
"step": 5900
},
{
"epoch": 4.191489361702128,
"grad_norm": 0.20011846721172333,
"learning_rate": 8.879467196850229e-05,
"loss": 0.0252,
"step": 5910
},
{
"epoch": 4.198581560283688,
"grad_norm": 0.12876836955547333,
"learning_rate": 8.848391411273933e-05,
"loss": 0.0191,
"step": 5920
},
{
"epoch": 4.205673758865248,
"grad_norm": 0.24333477020263672,
"learning_rate": 8.817326892245825e-05,
"loss": 0.0196,
"step": 5930
},
{
"epoch": 4.212765957446808,
"grad_norm": 0.1953645497560501,
"learning_rate": 8.786273943679835e-05,
"loss": 0.0225,
"step": 5940
},
{
"epoch": 4.219858156028369,
"grad_norm": 0.25837603211402893,
"learning_rate": 8.755232869376706e-05,
"loss": 0.0155,
"step": 5950
},
{
"epoch": 4.226950354609929,
"grad_norm": 0.24299311637878418,
"learning_rate": 8.724203973021015e-05,
"loss": 0.0168,
"step": 5960
},
{
"epoch": 4.23404255319149,
"grad_norm": 0.27927613258361816,
"learning_rate": 8.693187558178181e-05,
"loss": 0.0174,
"step": 5970
},
{
"epoch": 4.24113475177305,
"grad_norm": 0.23098881542682648,
"learning_rate": 8.662183928291532e-05,
"loss": 0.0119,
"step": 5980
},
{
"epoch": 4.24822695035461,
"grad_norm": 0.187413290143013,
"learning_rate": 8.631193386679301e-05,
"loss": 0.017,
"step": 5990
},
{
"epoch": 4.25531914893617,
"grad_norm": 0.36277079582214355,
"learning_rate": 8.600216236531682e-05,
"loss": 0.0249,
"step": 6000
},
{
"epoch": 4.26241134751773,
"grad_norm": 0.2662680745124817,
"learning_rate": 8.569252780907862e-05,
"loss": 0.0165,
"step": 6010
},
{
"epoch": 4.2695035460992905,
"grad_norm": 0.12048438936471939,
"learning_rate": 8.538303322733032e-05,
"loss": 0.0155,
"step": 6020
},
{
"epoch": 4.276595744680851,
"grad_norm": 0.2885691523551941,
"learning_rate": 8.507368164795462e-05,
"loss": 0.0259,
"step": 6030
},
{
"epoch": 4.283687943262412,
"grad_norm": 0.2707173526287079,
"learning_rate": 8.476447609743508e-05,
"loss": 0.0206,
"step": 6040
},
{
"epoch": 4.290780141843972,
"grad_norm": 0.32506418228149414,
"learning_rate": 8.44554196008266e-05,
"loss": 0.0167,
"step": 6050
},
{
"epoch": 4.297872340425532,
"grad_norm": 0.2805072069168091,
"learning_rate": 8.414651518172583e-05,
"loss": 0.0192,
"step": 6060
},
{
"epoch": 4.304964539007092,
"grad_norm": 0.34258946776390076,
"learning_rate": 8.383776586224175e-05,
"loss": 0.0187,
"step": 6070
},
{
"epoch": 4.3120567375886525,
"grad_norm": 0.21396967768669128,
"learning_rate": 8.35291746629657e-05,
"loss": 0.0139,
"step": 6080
},
{
"epoch": 4.319148936170213,
"grad_norm": 0.2685137987136841,
"learning_rate": 8.322074460294231e-05,
"loss": 0.0153,
"step": 6090
},
{
"epoch": 4.326241134751773,
"grad_norm": 0.22136232256889343,
"learning_rate": 8.291247869963959e-05,
"loss": 0.0256,
"step": 6100
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.2527240514755249,
"learning_rate": 8.26043799689196e-05,
"loss": 0.0219,
"step": 6110
},
{
"epoch": 4.340425531914893,
"grad_norm": 0.12039966136217117,
"learning_rate": 8.229645142500897e-05,
"loss": 0.0169,
"step": 6120
},
{
"epoch": 4.347517730496454,
"grad_norm": 0.2619229853153229,
"learning_rate": 8.198869608046915e-05,
"loss": 0.0245,
"step": 6130
},
{
"epoch": 4.3546099290780145,
"grad_norm": 0.3446950316429138,
"learning_rate": 8.168111694616733e-05,
"loss": 0.0196,
"step": 6140
},
{
"epoch": 4.361702127659575,
"grad_norm": 0.18836474418640137,
"learning_rate": 8.137371703124671e-05,
"loss": 0.0154,
"step": 6150
},
{
"epoch": 4.368794326241135,
"grad_norm": 0.2591850757598877,
"learning_rate": 8.106649934309706e-05,
"loss": 0.0214,
"step": 6160
},
{
"epoch": 4.375886524822695,
"grad_norm": 0.2989495098590851,
"learning_rate": 8.075946688732545e-05,
"loss": 0.0169,
"step": 6170
},
{
"epoch": 4.382978723404255,
"grad_norm": 0.33893871307373047,
"learning_rate": 8.045262266772675e-05,
"loss": 0.0234,
"step": 6180
},
{
"epoch": 4.390070921985815,
"grad_norm": 0.3758526146411896,
"learning_rate": 8.01459696862542e-05,
"loss": 0.0184,
"step": 6190
},
{
"epoch": 4.397163120567376,
"grad_norm": 0.26383623480796814,
"learning_rate": 7.983951094299022e-05,
"loss": 0.0221,
"step": 6200
},
{
"epoch": 4.404255319148936,
"grad_norm": 0.44497719407081604,
"learning_rate": 7.953324943611677e-05,
"loss": 0.024,
"step": 6210
},
{
"epoch": 4.411347517730497,
"grad_norm": 0.3149394392967224,
"learning_rate": 7.92271881618863e-05,
"loss": 0.0266,
"step": 6220
},
{
"epoch": 4.418439716312057,
"grad_norm": 0.19326764345169067,
"learning_rate": 7.892133011459237e-05,
"loss": 0.0179,
"step": 6230
},
{
"epoch": 4.425531914893617,
"grad_norm": 0.2152886539697647,
"learning_rate": 7.861567828654013e-05,
"loss": 0.0213,
"step": 6240
},
{
"epoch": 4.432624113475177,
"grad_norm": 0.22995711863040924,
"learning_rate": 7.831023566801734e-05,
"loss": 0.0152,
"step": 6250
},
{
"epoch": 4.439716312056738,
"grad_norm": 0.28582632541656494,
"learning_rate": 7.800500524726505e-05,
"loss": 0.0237,
"step": 6260
},
{
"epoch": 4.446808510638298,
"grad_norm": 0.2682250142097473,
"learning_rate": 7.769999001044818e-05,
"loss": 0.0198,
"step": 6270
},
{
"epoch": 4.453900709219858,
"grad_norm": 0.3235504627227783,
"learning_rate": 7.739519294162652e-05,
"loss": 0.0186,
"step": 6280
},
{
"epoch": 4.460992907801418,
"grad_norm": 0.3280167281627655,
"learning_rate": 7.709061702272546e-05,
"loss": 0.0168,
"step": 6290
},
{
"epoch": 4.468085106382979,
"grad_norm": 0.2376112937927246,
"learning_rate": 7.678626523350674e-05,
"loss": 0.0208,
"step": 6300
},
{
"epoch": 4.475177304964539,
"grad_norm": 0.38887420296669006,
"learning_rate": 7.648214055153946e-05,
"loss": 0.0146,
"step": 6310
},
{
"epoch": 4.4822695035460995,
"grad_norm": 0.24783270061016083,
"learning_rate": 7.617824595217074e-05,
"loss": 0.0172,
"step": 6320
},
{
"epoch": 4.48936170212766,
"grad_norm": 0.22741125524044037,
"learning_rate": 7.587458440849691e-05,
"loss": 0.0202,
"step": 6330
},
{
"epoch": 4.49645390070922,
"grad_norm": 0.3385200798511505,
"learning_rate": 7.557115889133408e-05,
"loss": 0.0232,
"step": 6340
},
{
"epoch": 4.50354609929078,
"grad_norm": 0.2820025384426117,
"learning_rate": 7.526797236918929e-05,
"loss": 0.0148,
"step": 6350
},
{
"epoch": 4.51063829787234,
"grad_norm": 0.27540770173072815,
"learning_rate": 7.496502780823141e-05,
"loss": 0.0173,
"step": 6360
},
{
"epoch": 4.5177304964539005,
"grad_norm": 0.23689982295036316,
"learning_rate": 7.466232817226224e-05,
"loss": 0.0192,
"step": 6370
},
{
"epoch": 4.524822695035461,
"grad_norm": 0.31511813402175903,
"learning_rate": 7.435987642268715e-05,
"loss": 0.019,
"step": 6380
},
{
"epoch": 4.531914893617021,
"grad_norm": 0.2491617202758789,
"learning_rate": 7.405767551848662e-05,
"loss": 0.0233,
"step": 6390
},
{
"epoch": 4.539007092198582,
"grad_norm": 0.3146982192993164,
"learning_rate": 7.37557284161869e-05,
"loss": 0.02,
"step": 6400
},
{
"epoch": 4.546099290780142,
"grad_norm": 0.1645408272743225,
"learning_rate": 7.345403806983121e-05,
"loss": 0.0195,
"step": 6410
},
{
"epoch": 4.553191489361702,
"grad_norm": 0.23353220522403717,
"learning_rate": 7.31526074309509e-05,
"loss": 0.0197,
"step": 6420
},
{
"epoch": 4.560283687943262,
"grad_norm": 0.2860185205936432,
"learning_rate": 7.285143944853652e-05,
"loss": 0.0199,
"step": 6430
},
{
"epoch": 4.567375886524823,
"grad_norm": 0.23552881181240082,
"learning_rate": 7.255053706900887e-05,
"loss": 0.0145,
"step": 6440
},
{
"epoch": 4.574468085106383,
"grad_norm": 0.41338714957237244,
"learning_rate": 7.224990323619044e-05,
"loss": 0.0194,
"step": 6450
},
{
"epoch": 4.581560283687943,
"grad_norm": 0.23148952424526215,
"learning_rate": 7.194954089127628e-05,
"loss": 0.0166,
"step": 6460
},
{
"epoch": 4.588652482269503,
"grad_norm": 0.26471659541130066,
"learning_rate": 7.16494529728055e-05,
"loss": 0.0147,
"step": 6470
},
{
"epoch": 4.595744680851064,
"grad_norm": 0.22270764410495758,
"learning_rate": 7.134964241663237e-05,
"loss": 0.0132,
"step": 6480
},
{
"epoch": 4.602836879432624,
"grad_norm": 0.1745089590549469,
"learning_rate": 7.105011215589759e-05,
"loss": 0.0122,
"step": 6490
},
{
"epoch": 4.609929078014185,
"grad_norm": 0.12301220744848251,
"learning_rate": 7.075086512099973e-05,
"loss": 0.0143,
"step": 6500
},
{
"epoch": 4.617021276595745,
"grad_norm": 0.29022157192230225,
"learning_rate": 7.045190423956646e-05,
"loss": 0.0279,
"step": 6510
},
{
"epoch": 4.624113475177305,
"grad_norm": 0.2177857905626297,
"learning_rate": 7.015323243642584e-05,
"loss": 0.0216,
"step": 6520
},
{
"epoch": 4.631205673758865,
"grad_norm": 0.31943878531455994,
"learning_rate": 6.985485263357785e-05,
"loss": 0.016,
"step": 6530
},
{
"epoch": 4.638297872340425,
"grad_norm": 0.2381797432899475,
"learning_rate": 6.955676775016579e-05,
"loss": 0.0205,
"step": 6540
},
{
"epoch": 4.6453900709219855,
"grad_norm": 0.26750218868255615,
"learning_rate": 6.925898070244752e-05,
"loss": 0.0231,
"step": 6550
},
{
"epoch": 4.652482269503546,
"grad_norm": 0.2351475954055786,
"learning_rate": 6.896149440376725e-05,
"loss": 0.0128,
"step": 6560
},
{
"epoch": 4.659574468085106,
"grad_norm": 0.2950522005558014,
"learning_rate": 6.86643117645267e-05,
"loss": 0.0223,
"step": 6570
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.24356043338775635,
"learning_rate": 6.836743569215696e-05,
"loss": 0.0151,
"step": 6580
},
{
"epoch": 4.673758865248227,
"grad_norm": 0.20342320203781128,
"learning_rate": 6.807086909108978e-05,
"loss": 0.015,
"step": 6590
},
{
"epoch": 4.680851063829787,
"grad_norm": 0.22289365530014038,
"learning_rate": 6.777461486272925e-05,
"loss": 0.0134,
"step": 6600
},
{
"epoch": 4.6879432624113475,
"grad_norm": 0.17356853187084198,
"learning_rate": 6.747867590542345e-05,
"loss": 0.0111,
"step": 6610
},
{
"epoch": 4.695035460992908,
"grad_norm": 0.1519446223974228,
"learning_rate": 6.718305511443612e-05,
"loss": 0.0172,
"step": 6620
},
{
"epoch": 4.702127659574468,
"grad_norm": 0.22093936800956726,
"learning_rate": 6.688775538191816e-05,
"loss": 0.0144,
"step": 6630
},
{
"epoch": 4.709219858156028,
"grad_norm": 0.29076847434043884,
"learning_rate": 6.659277959687954e-05,
"loss": 0.0147,
"step": 6640
},
{
"epoch": 4.716312056737589,
"grad_norm": 0.4279550611972809,
"learning_rate": 6.629813064516094e-05,
"loss": 0.015,
"step": 6650
},
{
"epoch": 4.723404255319149,
"grad_norm": 0.2095872014760971,
"learning_rate": 6.600381140940544e-05,
"loss": 0.017,
"step": 6660
},
{
"epoch": 4.7304964539007095,
"grad_norm": 0.19063495099544525,
"learning_rate": 6.570982476903061e-05,
"loss": 0.0218,
"step": 6670
},
{
"epoch": 4.73758865248227,
"grad_norm": 0.3333429992198944,
"learning_rate": 6.541617360019985e-05,
"loss": 0.0175,
"step": 6680
},
{
"epoch": 4.74468085106383,
"grad_norm": 0.20596669614315033,
"learning_rate": 6.512286077579478e-05,
"loss": 0.0143,
"step": 6690
},
{
"epoch": 4.75177304964539,
"grad_norm": 0.36356234550476074,
"learning_rate": 6.48298891653868e-05,
"loss": 0.0218,
"step": 6700
},
{
"epoch": 4.75886524822695,
"grad_norm": 0.18473972380161285,
"learning_rate": 6.453726163520906e-05,
"loss": 0.0133,
"step": 6710
},
{
"epoch": 4.76595744680851,
"grad_norm": 0.7571324110031128,
"learning_rate": 6.424498104812852e-05,
"loss": 0.016,
"step": 6720
},
{
"epoch": 4.773049645390071,
"grad_norm": 0.24869713187217712,
"learning_rate": 6.395305026361795e-05,
"loss": 0.0212,
"step": 6730
},
{
"epoch": 4.780141843971631,
"grad_norm": 0.1801852136850357,
"learning_rate": 6.366147213772772e-05,
"loss": 0.022,
"step": 6740
},
{
"epoch": 4.787234042553192,
"grad_norm": 0.31297555565834045,
"learning_rate": 6.337024952305819e-05,
"loss": 0.0178,
"step": 6750
},
{
"epoch": 4.794326241134752,
"grad_norm": 0.1869523674249649,
"learning_rate": 6.307938526873157e-05,
"loss": 0.02,
"step": 6760
},
{
"epoch": 4.801418439716312,
"grad_norm": 0.10907835513353348,
"learning_rate": 6.278888222036411e-05,
"loss": 0.0205,
"step": 6770
},
{
"epoch": 4.808510638297872,
"grad_norm": 0.23566560447216034,
"learning_rate": 6.249874322003833e-05,
"loss": 0.0164,
"step": 6780
},
{
"epoch": 4.815602836879433,
"grad_norm": 0.2034124732017517,
"learning_rate": 6.220897110627504e-05,
"loss": 0.014,
"step": 6790
},
{
"epoch": 4.822695035460993,
"grad_norm": 0.2759285271167755,
"learning_rate": 6.191956871400582e-05,
"loss": 0.0257,
"step": 6800
},
{
"epoch": 4.829787234042553,
"grad_norm": 0.36838316917419434,
"learning_rate": 6.163053887454509e-05,
"loss": 0.0189,
"step": 6810
},
{
"epoch": 4.836879432624113,
"grad_norm": 0.21057933568954468,
"learning_rate": 6.134188441556241e-05,
"loss": 0.0168,
"step": 6820
},
{
"epoch": 4.843971631205674,
"grad_norm": 0.22402699291706085,
"learning_rate": 6.105360816105498e-05,
"loss": 0.0191,
"step": 6830
},
{
"epoch": 4.851063829787234,
"grad_norm": 0.20568257570266724,
"learning_rate": 6.0765712931319826e-05,
"loss": 0.02,
"step": 6840
},
{
"epoch": 4.858156028368795,
"grad_norm": 0.1438162624835968,
"learning_rate": 6.0478201542926316e-05,
"loss": 0.0135,
"step": 6850
},
{
"epoch": 4.865248226950355,
"grad_norm": 0.18207921087741852,
"learning_rate": 6.019107680868859e-05,
"loss": 0.0154,
"step": 6860
},
{
"epoch": 4.872340425531915,
"grad_norm": 0.3172309398651123,
"learning_rate": 5.990434153763804e-05,
"loss": 0.0143,
"step": 6870
},
{
"epoch": 4.879432624113475,
"grad_norm": 0.2266601026058197,
"learning_rate": 5.9617998534995766e-05,
"loss": 0.0212,
"step": 6880
},
{
"epoch": 4.886524822695035,
"grad_norm": 0.20986783504486084,
"learning_rate": 5.933205060214525e-05,
"loss": 0.016,
"step": 6890
},
{
"epoch": 4.8936170212765955,
"grad_norm": 0.19340595602989197,
"learning_rate": 5.9046500536604796e-05,
"loss": 0.0161,
"step": 6900
},
{
"epoch": 4.900709219858156,
"grad_norm": 0.2367888242006302,
"learning_rate": 5.8761351132000295e-05,
"loss": 0.0196,
"step": 6910
},
{
"epoch": 4.907801418439716,
"grad_norm": 0.30044275522232056,
"learning_rate": 5.8476605178037925e-05,
"loss": 0.0176,
"step": 6920
},
{
"epoch": 4.914893617021277,
"grad_norm": 0.3268476128578186,
"learning_rate": 5.819226546047667e-05,
"loss": 0.018,
"step": 6930
},
{
"epoch": 4.921985815602837,
"grad_norm": 0.30031996965408325,
"learning_rate": 5.790833476110113e-05,
"loss": 0.0155,
"step": 6940
},
{
"epoch": 4.929078014184397,
"grad_norm": 0.2088995724916458,
"learning_rate": 5.762481585769455e-05,
"loss": 0.013,
"step": 6950
},
{
"epoch": 4.9361702127659575,
"grad_norm": 0.23445673286914825,
"learning_rate": 5.7341711524011224e-05,
"loss": 0.019,
"step": 6960
},
{
"epoch": 4.943262411347518,
"grad_norm": 0.1619795709848404,
"learning_rate": 5.705902452974978e-05,
"loss": 0.0147,
"step": 6970
},
{
"epoch": 4.950354609929078,
"grad_norm": 0.16455408930778503,
"learning_rate": 5.6776757640525736e-05,
"loss": 0.015,
"step": 6980
},
{
"epoch": 4.957446808510638,
"grad_norm": 0.1724604368209839,
"learning_rate": 5.6494913617844604e-05,
"loss": 0.0255,
"step": 6990
},
{
"epoch": 4.964539007092198,
"grad_norm": 0.2437286525964737,
"learning_rate": 5.6213495219074975e-05,
"loss": 0.0194,
"step": 7000
},
{
"epoch": 4.971631205673759,
"grad_norm": 0.20035170018672943,
"learning_rate": 5.593250519742127e-05,
"loss": 0.0197,
"step": 7010
},
{
"epoch": 4.9787234042553195,
"grad_norm": 0.2261771708726883,
"learning_rate": 5.5651946301897126e-05,
"loss": 0.0173,
"step": 7020
},
{
"epoch": 4.98581560283688,
"grad_norm": 0.24441353976726532,
"learning_rate": 5.537182127729822e-05,
"loss": 0.0154,
"step": 7030
},
{
"epoch": 4.99290780141844,
"grad_norm": 0.2184004932641983,
"learning_rate": 5.509213286417551e-05,
"loss": 0.0145,
"step": 7040
},
{
"epoch": 5.0,
"grad_norm": 0.3220185339450836,
"learning_rate": 5.481288379880857e-05,
"loss": 0.0204,
"step": 7050
},
{
"epoch": 5.00709219858156,
"grad_norm": 0.37926825881004333,
"learning_rate": 5.453407681317868e-05,
"loss": 0.0158,
"step": 7060
},
{
"epoch": 5.01418439716312,
"grad_norm": 0.37592944502830505,
"learning_rate": 5.4255714634941936e-05,
"loss": 0.0203,
"step": 7070
},
{
"epoch": 5.0212765957446805,
"grad_norm": 0.32261180877685547,
"learning_rate": 5.397779998740293e-05,
"loss": 0.0187,
"step": 7080
},
{
"epoch": 5.028368794326241,
"grad_norm": 0.24909910559654236,
"learning_rate": 5.3700335589487925e-05,
"loss": 0.0182,
"step": 7090
},
{
"epoch": 5.035460992907802,
"grad_norm": 0.2592754065990448,
"learning_rate": 5.3423324155718144e-05,
"loss": 0.0145,
"step": 7100
},
{
"epoch": 5.042553191489362,
"grad_norm": 0.30807894468307495,
"learning_rate": 5.314676839618332e-05,
"loss": 0.0134,
"step": 7110
},
{
"epoch": 5.049645390070922,
"grad_norm": 0.1335049420595169,
"learning_rate": 5.287067101651533e-05,
"loss": 0.015,
"step": 7120
},
{
"epoch": 5.056737588652482,
"grad_norm": 0.3497454822063446,
"learning_rate": 5.259503471786136e-05,
"loss": 0.0204,
"step": 7130
},
{
"epoch": 5.0638297872340425,
"grad_norm": 0.19706270098686218,
"learning_rate": 5.2319862196857914e-05,
"loss": 0.017,
"step": 7140
},
{
"epoch": 5.070921985815603,
"grad_norm": 0.31677910685539246,
"learning_rate": 5.204515614560407e-05,
"loss": 0.015,
"step": 7150
},
{
"epoch": 5.078014184397163,
"grad_norm": 0.24632439017295837,
"learning_rate": 5.177091925163529e-05,
"loss": 0.0257,
"step": 7160
},
{
"epoch": 5.085106382978723,
"grad_norm": 0.26360684633255005,
"learning_rate": 5.149715419789723e-05,
"loss": 0.0119,
"step": 7170
},
{
"epoch": 5.092198581560283,
"grad_norm": 0.23850224912166595,
"learning_rate": 5.122386366271923e-05,
"loss": 0.0196,
"step": 7180
},
{
"epoch": 5.099290780141844,
"grad_norm": 0.3354696035385132,
"learning_rate": 5.0951050319788444e-05,
"loss": 0.0138,
"step": 7190
},
{
"epoch": 5.1063829787234045,
"grad_norm": 0.18239453434944153,
"learning_rate": 5.067871683812338e-05,
"loss": 0.0206,
"step": 7200
},
{
"epoch": 5.113475177304965,
"grad_norm": 0.18862655758857727,
"learning_rate": 5.0406865882047884e-05,
"loss": 0.011,
"step": 7210
},
{
"epoch": 5.120567375886525,
"grad_norm": 0.1917518526315689,
"learning_rate": 5.0135500111165215e-05,
"loss": 0.0139,
"step": 7220
},
{
"epoch": 5.127659574468085,
"grad_norm": 0.19659163057804108,
"learning_rate": 4.986462218033192e-05,
"loss": 0.0161,
"step": 7230
},
{
"epoch": 5.134751773049645,
"grad_norm": 0.23764602839946747,
"learning_rate": 4.959423473963167e-05,
"loss": 0.017,
"step": 7240
},
{
"epoch": 5.141843971631205,
"grad_norm": 0.18340301513671875,
"learning_rate": 4.932434043434975e-05,
"loss": 0.0163,
"step": 7250
},
{
"epoch": 5.148936170212766,
"grad_norm": 0.2168426811695099,
"learning_rate": 4.905494190494674e-05,
"loss": 0.0141,
"step": 7260
},
{
"epoch": 5.156028368794326,
"grad_norm": 0.2772137522697449,
"learning_rate": 4.878604178703308e-05,
"loss": 0.0186,
"step": 7270
},
{
"epoch": 5.163120567375887,
"grad_norm": 0.17947837710380554,
"learning_rate": 4.851764271134296e-05,
"loss": 0.0141,
"step": 7280
},
{
"epoch": 5.170212765957447,
"grad_norm": 0.2101341038942337,
"learning_rate": 4.824974730370871e-05,
"loss": 0.0129,
"step": 7290
},
{
"epoch": 5.177304964539007,
"grad_norm": 0.28040558099746704,
"learning_rate": 4.798235818503522e-05,
"loss": 0.0218,
"step": 7300
},
{
"epoch": 5.184397163120567,
"grad_norm": 0.15831856429576874,
"learning_rate": 4.771547797127418e-05,
"loss": 0.0114,
"step": 7310
},
{
"epoch": 5.191489361702128,
"grad_norm": 0.08963089436292648,
"learning_rate": 4.744910927339842e-05,
"loss": 0.0113,
"step": 7320
},
{
"epoch": 5.198581560283688,
"grad_norm": 0.24577274918556213,
"learning_rate": 4.7183254697376456e-05,
"loss": 0.0145,
"step": 7330
},
{
"epoch": 5.205673758865248,
"grad_norm": 0.31053343415260315,
"learning_rate": 4.69179168441471e-05,
"loss": 0.0133,
"step": 7340
},
{
"epoch": 5.212765957446808,
"grad_norm": 0.1162974014878273,
"learning_rate": 4.665309830959377e-05,
"loss": 0.0167,
"step": 7350
},
{
"epoch": 5.219858156028369,
"grad_norm": 0.30117878317832947,
"learning_rate": 4.638880168451938e-05,
"loss": 0.022,
"step": 7360
},
{
"epoch": 5.226950354609929,
"grad_norm": 0.316582590341568,
"learning_rate": 4.61250295546206e-05,
"loss": 0.0186,
"step": 7370
},
{
"epoch": 5.23404255319149,
"grad_norm": 0.3584196865558624,
"learning_rate": 4.586178450046303e-05,
"loss": 0.0182,
"step": 7380
},
{
"epoch": 5.24113475177305,
"grad_norm": 0.2816001772880554,
"learning_rate": 4.559906909745567e-05,
"loss": 0.0175,
"step": 7390
},
{
"epoch": 5.24822695035461,
"grad_norm": 0.29749003052711487,
"learning_rate": 4.533688591582571e-05,
"loss": 0.0132,
"step": 7400
},
{
"epoch": 5.25531914893617,
"grad_norm": 0.15811972320079803,
"learning_rate": 4.5075237520593435e-05,
"loss": 0.0151,
"step": 7410
},
{
"epoch": 5.26241134751773,
"grad_norm": 0.22752103209495544,
"learning_rate": 4.4814126471547293e-05,
"loss": 0.0272,
"step": 7420
},
{
"epoch": 5.2695035460992905,
"grad_norm": 0.2540184259414673,
"learning_rate": 4.455355532321852e-05,
"loss": 0.0201,
"step": 7430
},
{
"epoch": 5.276595744680851,
"grad_norm": 0.24097682535648346,
"learning_rate": 4.429352662485652e-05,
"loss": 0.0139,
"step": 7440
},
{
"epoch": 5.283687943262412,
"grad_norm": 0.17041516304016113,
"learning_rate": 4.403404292040357e-05,
"loss": 0.014,
"step": 7450
},
{
"epoch": 5.290780141843972,
"grad_norm": 0.2444879710674286,
"learning_rate": 4.377510674847017e-05,
"loss": 0.0128,
"step": 7460
},
{
"epoch": 5.297872340425532,
"grad_norm": 0.17711535096168518,
"learning_rate": 4.3516720642310204e-05,
"loss": 0.0163,
"step": 7470
},
{
"epoch": 5.304964539007092,
"grad_norm": 0.14002487063407898,
"learning_rate": 4.3258887129795945e-05,
"loss": 0.0164,
"step": 7480
},
{
"epoch": 5.3120567375886525,
"grad_norm": 0.10432898253202438,
"learning_rate": 4.300160873339364e-05,
"loss": 0.0172,
"step": 7490
},
{
"epoch": 5.319148936170213,
"grad_norm": 0.16279327869415283,
"learning_rate": 4.2744887970138516e-05,
"loss": 0.0226,
"step": 7500
},
{
"epoch": 5.326241134751773,
"grad_norm": 0.13522747159004211,
"learning_rate": 4.2488727351610335e-05,
"loss": 0.0121,
"step": 7510
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.23896059393882751,
"learning_rate": 4.2233129383908874e-05,
"loss": 0.0193,
"step": 7520
},
{
"epoch": 5.340425531914893,
"grad_norm": 0.20990808308124542,
"learning_rate": 4.197809656762922e-05,
"loss": 0.022,
"step": 7530
},
{
"epoch": 5.347517730496454,
"grad_norm": 0.25492557883262634,
"learning_rate": 4.1723631397837416e-05,
"loss": 0.0138,
"step": 7540
},
{
"epoch": 5.3546099290780145,
"grad_norm": 0.30456793308258057,
"learning_rate": 4.1469736364046086e-05,
"loss": 0.0174,
"step": 7550
},
{
"epoch": 5.361702127659575,
"grad_norm": 0.18763889372348785,
"learning_rate": 4.121641395019006e-05,
"loss": 0.0136,
"step": 7560
},
{
"epoch": 5.368794326241135,
"grad_norm": 0.17302776873111725,
"learning_rate": 4.096366663460195e-05,
"loss": 0.012,
"step": 7570
},
{
"epoch": 5.375886524822695,
"grad_norm": 0.20956365764141083,
"learning_rate": 4.0711496889988076e-05,
"loss": 0.0147,
"step": 7580
},
{
"epoch": 5.382978723404255,
"grad_norm": 0.24955029785633087,
"learning_rate": 4.0459907183404135e-05,
"loss": 0.0195,
"step": 7590
},
{
"epoch": 5.390070921985815,
"grad_norm": 0.16770239174365997,
"learning_rate": 4.02088999762312e-05,
"loss": 0.0152,
"step": 7600
},
{
"epoch": 5.397163120567376,
"grad_norm": 0.17319859564304352,
"learning_rate": 3.995847772415159e-05,
"loss": 0.0127,
"step": 7610
},
{
"epoch": 5.404255319148936,
"grad_norm": 0.13932038843631744,
"learning_rate": 3.9708642877124724e-05,
"loss": 0.0121,
"step": 7620
},
{
"epoch": 5.411347517730497,
"grad_norm": 0.2718459665775299,
"learning_rate": 3.945939787936329e-05,
"loss": 0.0244,
"step": 7630
},
{
"epoch": 5.418439716312057,
"grad_norm": 0.15254083275794983,
"learning_rate": 3.9210745169309374e-05,
"loss": 0.0147,
"step": 7640
},
{
"epoch": 5.425531914893617,
"grad_norm": 0.3398546576499939,
"learning_rate": 3.896268717961041e-05,
"loss": 0.0175,
"step": 7650
},
{
"epoch": 5.432624113475177,
"grad_norm": 0.1195686087012291,
"learning_rate": 3.871522633709555e-05,
"loss": 0.018,
"step": 7660
},
{
"epoch": 5.439716312056738,
"grad_norm": 0.22691002488136292,
"learning_rate": 3.84683650627519e-05,
"loss": 0.0126,
"step": 7670
},
{
"epoch": 5.446808510638298,
"grad_norm": 0.27147871255874634,
"learning_rate": 3.8222105771700725e-05,
"loss": 0.0162,
"step": 7680
},
{
"epoch": 5.453900709219858,
"grad_norm": 0.18269909918308258,
"learning_rate": 3.7976450873174005e-05,
"loss": 0.0134,
"step": 7690
},
{
"epoch": 5.460992907801418,
"grad_norm": 0.1839030236005783,
"learning_rate": 3.7731402770490654e-05,
"loss": 0.0122,
"step": 7700
},
{
"epoch": 5.468085106382979,
"grad_norm": 0.20004524290561676,
"learning_rate": 3.748696386103313e-05,
"loss": 0.0137,
"step": 7710
},
{
"epoch": 5.475177304964539,
"grad_norm": 0.19872575998306274,
"learning_rate": 3.724313653622404e-05,
"loss": 0.0191,
"step": 7720
},
{
"epoch": 5.4822695035460995,
"grad_norm": 0.23203954100608826,
"learning_rate": 3.699992318150256e-05,
"loss": 0.0146,
"step": 7730
},
{
"epoch": 5.48936170212766,
"grad_norm": 0.3198186159133911,
"learning_rate": 3.675732617630132e-05,
"loss": 0.011,
"step": 7740
},
{
"epoch": 5.49645390070922,
"grad_norm": 0.195682555437088,
"learning_rate": 3.6515347894022914e-05,
"loss": 0.0166,
"step": 7750
},
{
"epoch": 5.50354609929078,
"grad_norm": 0.25143593549728394,
"learning_rate": 3.627399070201676e-05,
"loss": 0.0155,
"step": 7760
},
{
"epoch": 5.51063829787234,
"grad_norm": 0.19740962982177734,
"learning_rate": 3.603325696155605e-05,
"loss": 0.0107,
"step": 7770
},
{
"epoch": 5.5177304964539005,
"grad_norm": 0.1158263236284256,
"learning_rate": 3.579314902781458e-05,
"loss": 0.0162,
"step": 7780
},
{
"epoch": 5.524822695035461,
"grad_norm": 0.29272812604904175,
"learning_rate": 3.555366924984346e-05,
"loss": 0.0199,
"step": 7790
},
{
"epoch": 5.531914893617021,
"grad_norm": 0.13803668320178986,
"learning_rate": 3.531481997054861e-05,
"loss": 0.0105,
"step": 7800
},
{
"epoch": 5.539007092198582,
"grad_norm": 0.16399915516376495,
"learning_rate": 3.5076603526667404e-05,
"loss": 0.0115,
"step": 7810
},
{
"epoch": 5.546099290780142,
"grad_norm": 0.23237183690071106,
"learning_rate": 3.4839022248746136e-05,
"loss": 0.0152,
"step": 7820
},
{
"epoch": 5.553191489361702,
"grad_norm": 0.22134317457675934,
"learning_rate": 3.460207846111697e-05,
"loss": 0.0128,
"step": 7830
},
{
"epoch": 5.560283687943262,
"grad_norm": 0.14723555743694305,
"learning_rate": 3.436577448187529e-05,
"loss": 0.0126,
"step": 7840
},
{
"epoch": 5.567375886524823,
"grad_norm": 0.17128515243530273,
"learning_rate": 3.41301126228571e-05,
"loss": 0.0145,
"step": 7850
},
{
"epoch": 5.574468085106383,
"grad_norm": 0.20742635428905487,
"learning_rate": 3.389509518961637e-05,
"loss": 0.0151,
"step": 7860
},
{
"epoch": 5.581560283687943,
"grad_norm": 0.16497284173965454,
"learning_rate": 3.3660724481402326e-05,
"loss": 0.0151,
"step": 7870
},
{
"epoch": 5.588652482269503,
"grad_norm": 0.24318110942840576,
"learning_rate": 3.3427002791137164e-05,
"loss": 0.0159,
"step": 7880
},
{
"epoch": 5.595744680851064,
"grad_norm": 0.18855370581150055,
"learning_rate": 3.319393240539355e-05,
"loss": 0.015,
"step": 7890
},
{
"epoch": 5.602836879432624,
"grad_norm": 0.14175598323345184,
"learning_rate": 3.296151560437214e-05,
"loss": 0.0181,
"step": 7900
},
{
"epoch": 5.609929078014185,
"grad_norm": 0.27144867181777954,
"learning_rate": 3.272975466187951e-05,
"loss": 0.0148,
"step": 7910
},
{
"epoch": 5.617021276595745,
"grad_norm": 0.2221544086933136,
"learning_rate": 3.249865184530563e-05,
"loss": 0.0129,
"step": 7920
},
{
"epoch": 5.624113475177305,
"grad_norm": 0.2543604075908661,
"learning_rate": 3.226820941560186e-05,
"loss": 0.014,
"step": 7930
},
{
"epoch": 5.631205673758865,
"grad_norm": 0.10194625705480576,
"learning_rate": 3.2038429627258845e-05,
"loss": 0.0138,
"step": 7940
},
{
"epoch": 5.638297872340425,
"grad_norm": 0.18883180618286133,
"learning_rate": 3.180931472828435e-05,
"loss": 0.0143,
"step": 7950
},
{
"epoch": 5.6453900709219855,
"grad_norm": 0.1743205338716507,
"learning_rate": 3.158086696018126e-05,
"loss": 0.0128,
"step": 7960
},
{
"epoch": 5.652482269503546,
"grad_norm": 0.2825423777103424,
"learning_rate": 3.135308855792587e-05,
"loss": 0.015,
"step": 7970
},
{
"epoch": 5.659574468085106,
"grad_norm": 0.3754797577857971,
"learning_rate": 3.1125981749945686e-05,
"loss": 0.0117,
"step": 7980
},
{
"epoch": 5.666666666666667,
"grad_norm": 0.19825135171413422,
"learning_rate": 3.089954875809794e-05,
"loss": 0.0129,
"step": 7990
},
{
"epoch": 5.673758865248227,
"grad_norm": 0.33323025703430176,
"learning_rate": 3.06737917976476e-05,
"loss": 0.0155,
"step": 8000
},
{
"epoch": 5.680851063829787,
"grad_norm": 0.09978597611188889,
"learning_rate": 3.0448713077245838e-05,
"loss": 0.0136,
"step": 8010
},
{
"epoch": 5.6879432624113475,
"grad_norm": 0.14637133479118347,
"learning_rate": 3.0224314798908414e-05,
"loss": 0.0164,
"step": 8020
},
{
"epoch": 5.695035460992908,
"grad_norm": 0.11906511336565018,
"learning_rate": 3.0000599157994148e-05,
"loss": 0.0134,
"step": 8030
},
{
"epoch": 5.702127659574468,
"grad_norm": 0.21006911993026733,
"learning_rate": 2.9777568343183303e-05,
"loss": 0.0128,
"step": 8040
},
{
"epoch": 5.709219858156028,
"grad_norm": 0.24388642609119415,
"learning_rate": 2.955522453645635e-05,
"loss": 0.0148,
"step": 8050
},
{
"epoch": 5.716312056737589,
"grad_norm": 0.40704119205474854,
"learning_rate": 2.9333569913072466e-05,
"loss": 0.0142,
"step": 8060
},
{
"epoch": 5.723404255319149,
"grad_norm": 0.24392291903495789,
"learning_rate": 2.9112606641548436e-05,
"loss": 0.0133,
"step": 8070
},
{
"epoch": 5.7304964539007095,
"grad_norm": 0.22407054901123047,
"learning_rate": 2.8892336883637327e-05,
"loss": 0.0133,
"step": 8080
},
{
"epoch": 5.73758865248227,
"grad_norm": 0.20735864341259003,
"learning_rate": 2.8672762794307173e-05,
"loss": 0.0108,
"step": 8090
},
{
"epoch": 5.74468085106383,
"grad_norm": 0.20651739835739136,
"learning_rate": 2.8453886521720264e-05,
"loss": 0.0155,
"step": 8100
},
{
"epoch": 5.75177304964539,
"grad_norm": 0.1309831440448761,
"learning_rate": 2.8235710207211874e-05,
"loss": 0.0171,
"step": 8110
},
{
"epoch": 5.75886524822695,
"grad_norm": 0.29270994663238525,
"learning_rate": 2.8018235985269325e-05,
"loss": 0.0179,
"step": 8120
},
{
"epoch": 5.76595744680851,
"grad_norm": 0.3431891202926636,
"learning_rate": 2.7801465983511143e-05,
"loss": 0.0156,
"step": 8130
},
{
"epoch": 5.773049645390071,
"grad_norm": 0.12922121584415436,
"learning_rate": 2.7585402322666333e-05,
"loss": 0.0196,
"step": 8140
},
{
"epoch": 5.780141843971631,
"grad_norm": 0.19691234827041626,
"learning_rate": 2.737004711655342e-05,
"loss": 0.0116,
"step": 8150
},
{
"epoch": 5.787234042553192,
"grad_norm": 0.17019961774349213,
"learning_rate": 2.7155402472060043e-05,
"loss": 0.0145,
"step": 8160
},
{
"epoch": 5.794326241134752,
"grad_norm": 0.18791693449020386,
"learning_rate": 2.6941470489122056e-05,
"loss": 0.0166,
"step": 8170
},
{
"epoch": 5.801418439716312,
"grad_norm": 0.1223282665014267,
"learning_rate": 2.6728253260703163e-05,
"loss": 0.0117,
"step": 8180
},
{
"epoch": 5.808510638297872,
"grad_norm": 0.16401457786560059,
"learning_rate": 2.6515752872774458e-05,
"loss": 0.0147,
"step": 8190
},
{
"epoch": 5.815602836879433,
"grad_norm": 0.13671791553497314,
"learning_rate": 2.6303971404293882e-05,
"loss": 0.0128,
"step": 8200
},
{
"epoch": 5.822695035460993,
"grad_norm": 0.21030209958553314,
"learning_rate": 2.609291092718604e-05,
"loss": 0.0157,
"step": 8210
},
{
"epoch": 5.829787234042553,
"grad_norm": 0.19939203560352325,
"learning_rate": 2.5882573506321772e-05,
"loss": 0.0139,
"step": 8220
},
{
"epoch": 5.836879432624113,
"grad_norm": 0.26017311215400696,
"learning_rate": 2.5672961199498058e-05,
"loss": 0.0133,
"step": 8230
},
{
"epoch": 5.843971631205674,
"grad_norm": 0.29861557483673096,
"learning_rate": 2.5464076057417883e-05,
"loss": 0.0154,
"step": 8240
},
{
"epoch": 5.851063829787234,
"grad_norm": 0.17012879252433777,
"learning_rate": 2.5255920123670196e-05,
"loss": 0.0163,
"step": 8250
},
{
"epoch": 5.858156028368795,
"grad_norm": 0.20148979127407074,
"learning_rate": 2.5048495434709708e-05,
"loss": 0.0117,
"step": 8260
},
{
"epoch": 5.865248226950355,
"grad_norm": 0.2007923573255539,
"learning_rate": 2.4841804019837323e-05,
"loss": 0.0146,
"step": 8270
},
{
"epoch": 5.872340425531915,
"grad_norm": 0.1671822965145111,
"learning_rate": 2.4635847901179932e-05,
"loss": 0.0174,
"step": 8280
},
{
"epoch": 5.879432624113475,
"grad_norm": 0.22692982852458954,
"learning_rate": 2.4430629093670963e-05,
"loss": 0.0173,
"step": 8290
},
{
"epoch": 5.886524822695035,
"grad_norm": 0.142043337225914,
"learning_rate": 2.4226149605030344e-05,
"loss": 0.0099,
"step": 8300
},
{
"epoch": 5.8936170212765955,
"grad_norm": 0.1621280163526535,
"learning_rate": 2.4022411435745074e-05,
"loss": 0.0111,
"step": 8310
},
{
"epoch": 5.900709219858156,
"grad_norm": 0.21993504464626312,
"learning_rate": 2.3819416579049603e-05,
"loss": 0.01,
"step": 8320
},
{
"epoch": 5.907801418439716,
"grad_norm": 0.2549281716346741,
"learning_rate": 2.361716702090634e-05,
"loss": 0.0156,
"step": 8330
},
{
"epoch": 5.914893617021277,
"grad_norm": 0.13125896453857422,
"learning_rate": 2.3415664739986165e-05,
"loss": 0.0179,
"step": 8340
},
{
"epoch": 5.921985815602837,
"grad_norm": 0.2061365842819214,
"learning_rate": 2.321491170764908e-05,
"loss": 0.0149,
"step": 8350
},
{
"epoch": 5.929078014184397,
"grad_norm": 0.2736816108226776,
"learning_rate": 2.3014909887925042e-05,
"loss": 0.0186,
"step": 8360
},
{
"epoch": 5.9361702127659575,
"grad_norm": 0.1882363259792328,
"learning_rate": 2.281566123749458e-05,
"loss": 0.0211,
"step": 8370
},
{
"epoch": 5.943262411347518,
"grad_norm": 0.2734488844871521,
"learning_rate": 2.2617167705669827e-05,
"loss": 0.0132,
"step": 8380
},
{
"epoch": 5.950354609929078,
"grad_norm": 0.20115582644939423,
"learning_rate": 2.2419431234375178e-05,
"loss": 0.0121,
"step": 8390
},
{
"epoch": 5.957446808510638,
"grad_norm": 0.1607801914215088,
"learning_rate": 2.2222453758128648e-05,
"loss": 0.0128,
"step": 8400
},
{
"epoch": 5.964539007092198,
"grad_norm": 0.20927660167217255,
"learning_rate": 2.2026237204022716e-05,
"loss": 0.0097,
"step": 8410
},
{
"epoch": 5.971631205673759,
"grad_norm": 0.15127459168434143,
"learning_rate": 2.1830783491705477e-05,
"loss": 0.0096,
"step": 8420
},
{
"epoch": 5.9787234042553195,
"grad_norm": 0.25664299726486206,
"learning_rate": 2.1636094533361896e-05,
"loss": 0.0135,
"step": 8430
},
{
"epoch": 5.98581560283688,
"grad_norm": 0.1857176572084427,
"learning_rate": 2.14421722336952e-05,
"loss": 0.0131,
"step": 8440
},
{
"epoch": 5.99290780141844,
"grad_norm": 0.11627791076898575,
"learning_rate": 2.1249018489908056e-05,
"loss": 0.0101,
"step": 8450
},
{
"epoch": 6.0,
"grad_norm": 0.42432013154029846,
"learning_rate": 2.1056635191684183e-05,
"loss": 0.0128,
"step": 8460
},
{
"epoch": 6.00709219858156,
"grad_norm": 0.21808061003684998,
"learning_rate": 2.086502422116974e-05,
"loss": 0.0136,
"step": 8470
},
{
"epoch": 6.01418439716312,
"grad_norm": 0.2082509845495224,
"learning_rate": 2.067418745295494e-05,
"loss": 0.0171,
"step": 8480
},
{
"epoch": 6.0212765957446805,
"grad_norm": 0.1480574756860733,
"learning_rate": 2.0484126754055842e-05,
"loss": 0.0125,
"step": 8490
},
{
"epoch": 6.028368794326241,
"grad_norm": 0.21675336360931396,
"learning_rate": 2.0294843983895828e-05,
"loss": 0.0148,
"step": 8500
},
{
"epoch": 6.035460992907802,
"grad_norm": 0.22429972887039185,
"learning_rate": 2.0106340994287698e-05,
"loss": 0.018,
"step": 8510
},
{
"epoch": 6.042553191489362,
"grad_norm": 0.18754935264587402,
"learning_rate": 1.9918619629415314e-05,
"loss": 0.0116,
"step": 8520
},
{
"epoch": 6.049645390070922,
"grad_norm": 0.1608024388551712,
"learning_rate": 1.9731681725815676e-05,
"loss": 0.0083,
"step": 8530
},
{
"epoch": 6.056737588652482,
"grad_norm": 0.19191201031208038,
"learning_rate": 1.9545529112361005e-05,
"loss": 0.0173,
"step": 8540
},
{
"epoch": 6.0638297872340425,
"grad_norm": 0.18507297337055206,
"learning_rate": 1.93601636102407e-05,
"loss": 0.0095,
"step": 8550
},
{
"epoch": 6.070921985815603,
"grad_norm": 0.18908648192882538,
"learning_rate": 1.917558703294361e-05,
"loss": 0.0129,
"step": 8560
},
{
"epoch": 6.078014184397163,
"grad_norm": 0.23118513822555542,
"learning_rate": 1.8991801186240342e-05,
"loss": 0.0089,
"step": 8570
},
{
"epoch": 6.085106382978723,
"grad_norm": 0.24339988827705383,
"learning_rate": 1.8808807868165512e-05,
"loss": 0.0116,
"step": 8580
},
{
"epoch": 6.092198581560283,
"grad_norm": 0.26911652088165283,
"learning_rate": 1.862660886900016e-05,
"loss": 0.0145,
"step": 8590
},
{
"epoch": 6.099290780141844,
"grad_norm": 0.15160147845745087,
"learning_rate": 1.8445205971254243e-05,
"loss": 0.0101,
"step": 8600
},
{
"epoch": 6.1063829787234045,
"grad_norm": 0.1248091533780098,
"learning_rate": 1.826460094964928e-05,
"loss": 0.0143,
"step": 8610
},
{
"epoch": 6.113475177304965,
"grad_norm": 0.19379976391792297,
"learning_rate": 1.808479557110081e-05,
"loss": 0.0112,
"step": 8620
},
{
"epoch": 6.120567375886525,
"grad_norm": 0.16450917720794678,
"learning_rate": 1.7905791594701337e-05,
"loss": 0.0202,
"step": 8630
},
{
"epoch": 6.127659574468085,
"grad_norm": 0.3321121335029602,
"learning_rate": 1.7727590771702894e-05,
"loss": 0.0107,
"step": 8640
},
{
"epoch": 6.134751773049645,
"grad_norm": 0.1483653485774994,
"learning_rate": 1.7550194845500025e-05,
"loss": 0.0123,
"step": 8650
},
{
"epoch": 6.141843971631205,
"grad_norm": 0.17642395198345184,
"learning_rate": 1.7373605551612805e-05,
"loss": 0.009,
"step": 8660
},
{
"epoch": 6.148936170212766,
"grad_norm": 0.24330726265907288,
"learning_rate": 1.7197824617669655e-05,
"loss": 0.0113,
"step": 8670
},
{
"epoch": 6.156028368794326,
"grad_norm": 0.2407854199409485,
"learning_rate": 1.7022853763390623e-05,
"loss": 0.013,
"step": 8680
},
{
"epoch": 6.163120567375887,
"grad_norm": 0.237819641828537,
"learning_rate": 1.68486947005705e-05,
"loss": 0.013,
"step": 8690
},
{
"epoch": 6.170212765957447,
"grad_norm": 0.16382509469985962,
"learning_rate": 1.6675349133062e-05,
"loss": 0.0128,
"step": 8700
},
{
"epoch": 6.177304964539007,
"grad_norm": 0.2901313602924347,
"learning_rate": 1.6502818756759276e-05,
"loss": 0.0085,
"step": 8710
},
{
"epoch": 6.184397163120567,
"grad_norm": 0.23529794812202454,
"learning_rate": 1.633110525958108e-05,
"loss": 0.009,
"step": 8720
},
{
"epoch": 6.191489361702128,
"grad_norm": 0.21968472003936768,
"learning_rate": 1.616021032145444e-05,
"loss": 0.0158,
"step": 8730
},
{
"epoch": 6.198581560283688,
"grad_norm": 0.21688121557235718,
"learning_rate": 1.5990135614298184e-05,
"loss": 0.0114,
"step": 8740
},
{
"epoch": 6.205673758865248,
"grad_norm": 0.19126644730567932,
"learning_rate": 1.582088280200652e-05,
"loss": 0.0111,
"step": 8750
},
{
"epoch": 6.212765957446808,
"grad_norm": 0.2854389548301697,
"learning_rate": 1.5652453540432856e-05,
"loss": 0.0124,
"step": 8760
},
{
"epoch": 6.219858156028369,
"grad_norm": 0.23691484332084656,
"learning_rate": 1.5484849477373463e-05,
"loss": 0.0163,
"step": 8770
},
{
"epoch": 6.226950354609929,
"grad_norm": 0.10352014750242233,
"learning_rate": 1.5318072252551498e-05,
"loss": 0.0121,
"step": 8780
},
{
"epoch": 6.23404255319149,
"grad_norm": 0.1918143928050995,
"learning_rate": 1.5152123497600879e-05,
"loss": 0.0166,
"step": 8790
},
{
"epoch": 6.24113475177305,
"grad_norm": 0.27419474720954895,
"learning_rate": 1.49870048360504e-05,
"loss": 0.0176,
"step": 8800
},
{
"epoch": 6.24822695035461,
"grad_norm": 0.1279487907886505,
"learning_rate": 1.4822717883307658e-05,
"loss": 0.0165,
"step": 8810
},
{
"epoch": 6.25531914893617,
"grad_norm": 0.20851808786392212,
"learning_rate": 1.46592642466435e-05,
"loss": 0.0111,
"step": 8820
},
{
"epoch": 6.26241134751773,
"grad_norm": 0.20219635963439941,
"learning_rate": 1.4496645525176166e-05,
"loss": 0.0105,
"step": 8830
},
{
"epoch": 6.2695035460992905,
"grad_norm": 0.17594772577285767,
"learning_rate": 1.4334863309855617e-05,
"loss": 0.02,
"step": 8840
},
{
"epoch": 6.276595744680851,
"grad_norm": 0.1454489380121231,
"learning_rate": 1.4173919183448026e-05,
"loss": 0.0147,
"step": 8850
},
{
"epoch": 6.283687943262412,
"grad_norm": 0.36037492752075195,
"learning_rate": 1.4013814720520258e-05,
"loss": 0.0104,
"step": 8860
},
{
"epoch": 6.290780141843972,
"grad_norm": 0.29365283250808716,
"learning_rate": 1.385455148742455e-05,
"loss": 0.0169,
"step": 8870
},
{
"epoch": 6.297872340425532,
"grad_norm": 0.2135290801525116,
"learning_rate": 1.36961310422831e-05,
"loss": 0.0183,
"step": 8880
},
{
"epoch": 6.304964539007092,
"grad_norm": 0.27943718433380127,
"learning_rate": 1.3538554934972813e-05,
"loss": 0.0148,
"step": 8890
},
{
"epoch": 6.3120567375886525,
"grad_norm": 0.18066227436065674,
"learning_rate": 1.3381824707110157e-05,
"loss": 0.0115,
"step": 8900
},
{
"epoch": 6.319148936170213,
"grad_norm": 0.087078757584095,
"learning_rate": 1.3225941892036198e-05,
"loss": 0.0121,
"step": 8910
},
{
"epoch": 6.326241134751773,
"grad_norm": 0.26353609561920166,
"learning_rate": 1.3070908014801375e-05,
"loss": 0.0087,
"step": 8920
},
{
"epoch": 6.333333333333333,
"grad_norm": 0.11882209032773972,
"learning_rate": 1.2916724592150798e-05,
"loss": 0.0088,
"step": 8930
},
{
"epoch": 6.340425531914893,
"grad_norm": 0.17999006807804108,
"learning_rate": 1.276339313250925e-05,
"loss": 0.0094,
"step": 8940
},
{
"epoch": 6.347517730496454,
"grad_norm": 0.16146190464496613,
"learning_rate": 1.2610915135966495e-05,
"loss": 0.0112,
"step": 8950
},
{
"epoch": 6.3546099290780145,
"grad_norm": 0.26943373680114746,
"learning_rate": 1.2459292094262664e-05,
"loss": 0.014,
"step": 8960
},
{
"epoch": 6.361702127659575,
"grad_norm": 0.22631198167800903,
"learning_rate": 1.2308525490773526e-05,
"loss": 0.0103,
"step": 8970
},
{
"epoch": 6.368794326241135,
"grad_norm": 0.25705012679100037,
"learning_rate": 1.2158616800496059e-05,
"loss": 0.0152,
"step": 8980
},
{
"epoch": 6.375886524822695,
"grad_norm": 0.23096348345279694,
"learning_rate": 1.2009567490034046e-05,
"loss": 0.0128,
"step": 8990
},
{
"epoch": 6.382978723404255,
"grad_norm": 0.19161191582679749,
"learning_rate": 1.186137901758364e-05,
"loss": 0.0142,
"step": 9000
},
{
"epoch": 6.390070921985815,
"grad_norm": 0.17679370939731598,
"learning_rate": 1.1714052832919187e-05,
"loss": 0.0163,
"step": 9010
},
{
"epoch": 6.397163120567376,
"grad_norm": 0.1845289170742035,
"learning_rate": 1.1567590377378979e-05,
"loss": 0.0137,
"step": 9020
},
{
"epoch": 6.404255319148936,
"grad_norm": 0.10619324445724487,
"learning_rate": 1.1421993083851145e-05,
"loss": 0.0119,
"step": 9030
},
{
"epoch": 6.411347517730497,
"grad_norm": 0.17025884985923767,
"learning_rate": 1.1277262376759712e-05,
"loss": 0.0147,
"step": 9040
},
{
"epoch": 6.418439716312057,
"grad_norm": 0.24711932241916656,
"learning_rate": 1.1133399672050638e-05,
"loss": 0.0128,
"step": 9050
},
{
"epoch": 6.425531914893617,
"grad_norm": 0.24352525174617767,
"learning_rate": 1.0990406377177865e-05,
"loss": 0.0163,
"step": 9060
},
{
"epoch": 6.432624113475177,
"grad_norm": 0.11696803569793701,
"learning_rate": 1.0848283891089683e-05,
"loss": 0.0108,
"step": 9070
},
{
"epoch": 6.439716312056738,
"grad_norm": 0.14711810648441315,
"learning_rate": 1.0707033604214944e-05,
"loss": 0.0091,
"step": 9080
},
{
"epoch": 6.446808510638298,
"grad_norm": 0.08239645510911942,
"learning_rate": 1.0566656898449546e-05,
"loss": 0.0119,
"step": 9090
},
{
"epoch": 6.453900709219858,
"grad_norm": 0.1815529316663742,
"learning_rate": 1.0427155147142887e-05,
"loss": 0.0122,
"step": 9100
},
{
"epoch": 6.460992907801418,
"grad_norm": 0.16181941330432892,
"learning_rate": 1.0288529715084293e-05,
"loss": 0.0091,
"step": 9110
},
{
"epoch": 6.468085106382979,
"grad_norm": 0.1356610655784607,
"learning_rate": 1.0150781958489919e-05,
"loss": 0.0115,
"step": 9120
},
{
"epoch": 6.475177304964539,
"grad_norm": 0.10379713028669357,
"learning_rate": 1.0013913224989303e-05,
"loss": 0.0152,
"step": 9130
},
{
"epoch": 6.4822695035460995,
"grad_norm": 0.1019928902387619,
"learning_rate": 9.877924853612186e-06,
"loss": 0.011,
"step": 9140
},
{
"epoch": 6.48936170212766,
"grad_norm": 0.2234257161617279,
"learning_rate": 9.74281817477547e-06,
"loss": 0.0108,
"step": 9150
},
{
"epoch": 6.49645390070922,
"grad_norm": 0.2257377654314041,
"learning_rate": 9.608594510270218e-06,
"loss": 0.0111,
"step": 9160
},
{
"epoch": 6.50354609929078,
"grad_norm": 0.1945466548204422,
"learning_rate": 9.47525517324862e-06,
"loss": 0.0106,
"step": 9170
},
{
"epoch": 6.51063829787234,
"grad_norm": 0.39022570848464966,
"learning_rate": 9.342801468211283e-06,
"loss": 0.0112,
"step": 9180
},
{
"epoch": 6.5177304964539005,
"grad_norm": 0.16999700665473938,
"learning_rate": 9.211234690994364e-06,
"loss": 0.0161,
"step": 9190
},
{
"epoch": 6.524822695035461,
"grad_norm": 0.38921093940734863,
"learning_rate": 9.080556128756901e-06,
"loss": 0.0107,
"step": 9200
},
{
"epoch": 6.531914893617021,
"grad_norm": 0.26619699597358704,
"learning_rate": 8.950767059968302e-06,
"loss": 0.0164,
"step": 9210
},
{
"epoch": 6.539007092198582,
"grad_norm": 0.19151495397090912,
"learning_rate": 8.821868754395734e-06,
"loss": 0.0111,
"step": 9220
},
{
"epoch": 6.546099290780142,
"grad_norm": 0.09225843101739883,
"learning_rate": 8.693862473091785e-06,
"loss": 0.0113,
"step": 9230
},
{
"epoch": 6.553191489361702,
"grad_norm": 0.2490764856338501,
"learning_rate": 8.566749468382074e-06,
"loss": 0.0163,
"step": 9240
},
{
"epoch": 6.560283687943262,
"grad_norm": 0.2187177836894989,
"learning_rate": 8.440530983852978e-06,
"loss": 0.0132,
"step": 9250
},
{
"epoch": 6.567375886524823,
"grad_norm": 0.12911982834339142,
"learning_rate": 8.315208254339557e-06,
"loss": 0.011,
"step": 9260
},
{
"epoch": 6.574468085106383,
"grad_norm": 0.20145867764949799,
"learning_rate": 8.190782505913442e-06,
"loss": 0.0134,
"step": 9270
},
{
"epoch": 6.581560283687943,
"grad_norm": 0.2898944914340973,
"learning_rate": 8.067254955870707e-06,
"loss": 0.017,
"step": 9280
},
{
"epoch": 6.588652482269503,
"grad_norm": 0.08676121383905411,
"learning_rate": 7.944626812720169e-06,
"loss": 0.0096,
"step": 9290
},
{
"epoch": 6.595744680851064,
"grad_norm": 0.2594836950302124,
"learning_rate": 7.822899276171403e-06,
"loss": 0.0156,
"step": 9300
},
{
"epoch": 6.602836879432624,
"grad_norm": 0.15736845135688782,
"learning_rate": 7.702073537123145e-06,
"loss": 0.0109,
"step": 9310
},
{
"epoch": 6.609929078014185,
"grad_norm": 0.25035277009010315,
"learning_rate": 7.5821507776514866e-06,
"loss": 0.0261,
"step": 9320
},
{
"epoch": 6.617021276595745,
"grad_norm": 0.361659973859787,
"learning_rate": 7.463132170998388e-06,
"loss": 0.0117,
"step": 9330
},
{
"epoch": 6.624113475177305,
"grad_norm": 0.21234659850597382,
"learning_rate": 7.345018881560251e-06,
"loss": 0.0114,
"step": 9340
},
{
"epoch": 6.631205673758865,
"grad_norm": 0.1619425266981125,
"learning_rate": 7.227812064876471e-06,
"loss": 0.0095,
"step": 9350
},
{
"epoch": 6.638297872340425,
"grad_norm": 0.15260903537273407,
"learning_rate": 7.1115128676180975e-06,
"loss": 0.0129,
"step": 9360
},
{
"epoch": 6.6453900709219855,
"grad_norm": 0.13201937079429626,
"learning_rate": 6.996122427576635e-06,
"loss": 0.0216,
"step": 9370
},
{
"epoch": 6.652482269503546,
"grad_norm": 0.17256666719913483,
"learning_rate": 6.881641873653022e-06,
"loss": 0.0079,
"step": 9380
},
{
"epoch": 6.659574468085106,
"grad_norm": 0.3167516589164734,
"learning_rate": 6.768072325846387e-06,
"loss": 0.0187,
"step": 9390
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.17999926209449768,
"learning_rate": 6.655414895243306e-06,
"loss": 0.0135,
"step": 9400
},
{
"epoch": 6.673758865248227,
"grad_norm": 0.20833925902843475,
"learning_rate": 6.543670684006742e-06,
"loss": 0.0117,
"step": 9410
},
{
"epoch": 6.680851063829787,
"grad_norm": 0.19959613680839539,
"learning_rate": 6.432840785365368e-06,
"loss": 0.0113,
"step": 9420
},
{
"epoch": 6.6879432624113475,
"grad_norm": 0.16003111004829407,
"learning_rate": 6.3229262836028924e-06,
"loss": 0.013,
"step": 9430
},
{
"epoch": 6.695035460992908,
"grad_norm": 0.295195609331131,
"learning_rate": 6.213928254047352e-06,
"loss": 0.0137,
"step": 9440
},
{
"epoch": 6.702127659574468,
"grad_norm": 0.26150402426719666,
"learning_rate": 6.105847763060668e-06,
"loss": 0.0152,
"step": 9450
},
{
"epoch": 6.709219858156028,
"grad_norm": 0.10643389075994492,
"learning_rate": 5.998685868028231e-06,
"loss": 0.0076,
"step": 9460
},
{
"epoch": 6.716312056737589,
"grad_norm": 0.1366465985774994,
"learning_rate": 5.892443617348431e-06,
"loss": 0.0107,
"step": 9470
},
{
"epoch": 6.723404255319149,
"grad_norm": 0.22469396889209747,
"learning_rate": 5.7871220504226e-06,
"loss": 0.0099,
"step": 9480
},
{
"epoch": 6.7304964539007095,
"grad_norm": 0.17117497324943542,
"learning_rate": 5.682722197644652e-06,
"loss": 0.0108,
"step": 9490
},
{
"epoch": 6.73758865248227,
"grad_norm": 0.21108581125736237,
"learning_rate": 5.579245080391094e-06,
"loss": 0.0083,
"step": 9500
},
{
"epoch": 6.74468085106383,
"grad_norm": 0.1274946630001068,
"learning_rate": 5.47669171101105e-06,
"loss": 0.0141,
"step": 9510
},
{
"epoch": 6.75177304964539,
"grad_norm": 0.22684040665626526,
"learning_rate": 5.375063092816313e-06,
"loss": 0.0107,
"step": 9520
},
{
"epoch": 6.75886524822695,
"grad_norm": 0.12946170568466187,
"learning_rate": 5.2743602200715505e-06,
"loss": 0.0124,
"step": 9530
},
{
"epoch": 6.76595744680851,
"grad_norm": 0.19293436408042908,
"learning_rate": 5.1745840779845455e-06,
"loss": 0.0103,
"step": 9540
},
{
"epoch": 6.773049645390071,
"grad_norm": 0.17891868948936462,
"learning_rate": 5.075735642696611e-06,
"loss": 0.0172,
"step": 9550
},
{
"epoch": 6.780141843971631,
"grad_norm": 0.15858317911624908,
"learning_rate": 4.977815881273018e-06,
"loss": 0.0094,
"step": 9560
},
{
"epoch": 6.787234042553192,
"grad_norm": 0.1298726350069046,
"learning_rate": 4.880825751693518e-06,
"loss": 0.0129,
"step": 9570
},
{
"epoch": 6.794326241134752,
"grad_norm": 0.14093485474586487,
"learning_rate": 4.784766202842961e-06,
"loss": 0.0063,
"step": 9580
},
{
"epoch": 6.801418439716312,
"grad_norm": 0.3153584599494934,
"learning_rate": 4.689638174502076e-06,
"loss": 0.0099,
"step": 9590
},
{
"epoch": 6.808510638297872,
"grad_norm": 0.16759419441223145,
"learning_rate": 4.595442597338217e-06,
"loss": 0.0163,
"step": 9600
},
{
"epoch": 6.815602836879433,
"grad_norm": 0.2294328510761261,
"learning_rate": 4.502180392896272e-06,
"loss": 0.0084,
"step": 9610
},
{
"epoch": 6.822695035460993,
"grad_norm": 0.25540658831596375,
"learning_rate": 4.409852473589626e-06,
"loss": 0.0109,
"step": 9620
},
{
"epoch": 6.829787234042553,
"grad_norm": 0.16674602031707764,
"learning_rate": 4.318459742691316e-06,
"loss": 0.0098,
"step": 9630
},
{
"epoch": 6.836879432624113,
"grad_norm": 0.18922989070415497,
"learning_rate": 4.228003094325084e-06,
"loss": 0.0077,
"step": 9640
},
{
"epoch": 6.843971631205674,
"grad_norm": 0.12138693034648895,
"learning_rate": 4.13848341345674e-06,
"loss": 0.012,
"step": 9650
},
{
"epoch": 6.851063829787234,
"grad_norm": 0.2116478681564331,
"learning_rate": 4.049901575885373e-06,
"loss": 0.0114,
"step": 9660
},
{
"epoch": 6.858156028368795,
"grad_norm": 0.22467593848705292,
"learning_rate": 3.962258448234912e-06,
"loss": 0.0211,
"step": 9670
},
{
"epoch": 6.865248226950355,
"grad_norm": 0.13237184286117554,
"learning_rate": 3.875554887945576e-06,
"loss": 0.0195,
"step": 9680
},
{
"epoch": 6.872340425531915,
"grad_norm": 0.15521298348903656,
"learning_rate": 3.789791743265503e-06,
"loss": 0.0107,
"step": 9690
},
{
"epoch": 6.879432624113475,
"grad_norm": 0.23321086168289185,
"learning_rate": 3.704969853242446e-06,
"loss": 0.0177,
"step": 9700
},
{
"epoch": 6.886524822695035,
"grad_norm": 0.1563076227903366,
"learning_rate": 3.6210900477155696e-06,
"loss": 0.0094,
"step": 9710
},
{
"epoch": 6.8936170212765955,
"grad_norm": 0.12247644364833832,
"learning_rate": 3.5381531473073326e-06,
"loss": 0.016,
"step": 9720
},
{
"epoch": 6.900709219858156,
"grad_norm": 0.19630227982997894,
"learning_rate": 3.456159963415473e-06,
"loss": 0.0108,
"step": 9730
},
{
"epoch": 6.907801418439716,
"grad_norm": 0.25300222635269165,
"learning_rate": 3.3751112982050135e-06,
"loss": 0.0112,
"step": 9740
},
{
"epoch": 6.914893617021277,
"grad_norm": 0.17951223254203796,
"learning_rate": 3.295007944600481e-06,
"loss": 0.0158,
"step": 9750
},
{
"epoch": 6.921985815602837,
"grad_norm": 0.09103976935148239,
"learning_rate": 3.215850686278132e-06,
"loss": 0.0127,
"step": 9760
},
{
"epoch": 6.929078014184397,
"grad_norm": 0.13495738804340363,
"learning_rate": 3.1376402976582507e-06,
"loss": 0.008,
"step": 9770
},
{
"epoch": 6.9361702127659575,
"grad_norm": 0.12648430466651917,
"learning_rate": 3.060377543897619e-06,
"loss": 0.0106,
"step": 9780
},
{
"epoch": 6.943262411347518,
"grad_norm": 0.22089631855487823,
"learning_rate": 2.984063180882013e-06,
"loss": 0.0137,
"step": 9790
},
{
"epoch": 6.950354609929078,
"grad_norm": 0.12765297293663025,
"learning_rate": 2.908697955218753e-06,
"loss": 0.0088,
"step": 9800
},
{
"epoch": 6.957446808510638,
"grad_norm": 0.33271655440330505,
"learning_rate": 2.834282604229521e-06,
"loss": 0.0111,
"step": 9810
},
{
"epoch": 6.964539007092198,
"grad_norm": 0.2782209515571594,
"learning_rate": 2.7608178559430653e-06,
"loss": 0.0102,
"step": 9820
},
{
"epoch": 6.971631205673759,
"grad_norm": 0.21345843374729156,
"learning_rate": 2.6883044290880178e-06,
"loss": 0.0105,
"step": 9830
},
{
"epoch": 6.9787234042553195,
"grad_norm": 0.10163545608520508,
"learning_rate": 2.616743033086022e-06,
"loss": 0.0081,
"step": 9840
},
{
"epoch": 6.98581560283688,
"grad_norm": 0.13964958488941193,
"learning_rate": 2.5461343680446727e-06,
"loss": 0.0121,
"step": 9850
},
{
"epoch": 6.99290780141844,
"grad_norm": 0.13182148337364197,
"learning_rate": 2.476479124750697e-06,
"loss": 0.0128,
"step": 9860
},
{
"epoch": 7.0,
"grad_norm": 0.24027769267559052,
"learning_rate": 2.4077779846631732e-06,
"loss": 0.0111,
"step": 9870
},
{
"epoch": 7.00709219858156,
"grad_norm": 0.3481847941875458,
"learning_rate": 2.3400316199069238e-06,
"loss": 0.0103,
"step": 9880
},
{
"epoch": 7.01418439716312,
"grad_norm": 0.19290268421173096,
"learning_rate": 2.273240693265899e-06,
"loss": 0.014,
"step": 9890
},
{
"epoch": 7.0212765957446805,
"grad_norm": 0.059760428965091705,
"learning_rate": 2.207405858176692e-06,
"loss": 0.0111,
"step": 9900
},
{
"epoch": 7.028368794326241,
"grad_norm": 0.2962135374546051,
"learning_rate": 2.142527758722157e-06,
"loss": 0.0125,
"step": 9910
},
{
"epoch": 7.035460992907802,
"grad_norm": 0.06278011202812195,
"learning_rate": 2.0786070296250793e-06,
"loss": 0.0076,
"step": 9920
},
{
"epoch": 7.042553191489362,
"grad_norm": 0.06869203597307205,
"learning_rate": 2.0156442962420252e-06,
"loss": 0.0101,
"step": 9930
},
{
"epoch": 7.049645390070922,
"grad_norm": 0.250283420085907,
"learning_rate": 1.95364017455717e-06,
"loss": 0.0195,
"step": 9940
},
{
"epoch": 7.056737588652482,
"grad_norm": 0.1912689059972763,
"learning_rate": 1.8925952711763006e-06,
"loss": 0.0138,
"step": 9950
},
{
"epoch": 7.0638297872340425,
"grad_norm": 0.120729461312294,
"learning_rate": 1.8325101833208457e-06,
"loss": 0.0158,
"step": 9960
},
{
"epoch": 7.070921985815603,
"grad_norm": 0.2449929565191269,
"learning_rate": 1.7733854988220778e-06,
"loss": 0.0111,
"step": 9970
},
{
"epoch": 7.078014184397163,
"grad_norm": 0.11768271028995514,
"learning_rate": 1.7152217961153405e-06,
"loss": 0.011,
"step": 9980
},
{
"epoch": 7.085106382978723,
"grad_norm": 0.09861616790294647,
"learning_rate": 1.6580196442343987e-06,
"loss": 0.0097,
"step": 9990
},
{
"epoch": 7.092198581560283,
"grad_norm": 0.12583206593990326,
"learning_rate": 1.601779602805842e-06,
"loss": 0.0089,
"step": 10000
},
{
"epoch": 7.099290780141844,
"grad_norm": 0.13632884621620178,
"learning_rate": 1.5465022220436442e-06,
"loss": 0.01,
"step": 10010
},
{
"epoch": 7.1063829787234045,
"grad_norm": 0.19436778128147125,
"learning_rate": 1.4921880427437584e-06,
"loss": 0.0166,
"step": 10020
},
{
"epoch": 7.113475177304965,
"grad_norm": 0.10597945749759674,
"learning_rate": 1.4388375962788637e-06,
"loss": 0.0091,
"step": 10030
},
{
"epoch": 7.120567375886525,
"grad_norm": 0.08205860108137131,
"learning_rate": 1.3864514045931032e-06,
"loss": 0.0105,
"step": 10040
},
{
"epoch": 7.127659574468085,
"grad_norm": 0.19386780261993408,
"learning_rate": 1.3350299801970335e-06,
"loss": 0.0149,
"step": 10050
},
{
"epoch": 7.134751773049645,
"grad_norm": 0.22453084588050842,
"learning_rate": 1.2845738261625828e-06,
"loss": 0.0116,
"step": 10060
},
{
"epoch": 7.141843971631205,
"grad_norm": 0.1868603378534317,
"learning_rate": 1.235083436118145e-06,
"loss": 0.0089,
"step": 10070
},
{
"epoch": 7.148936170212766,
"grad_norm": 0.12683051824569702,
"learning_rate": 1.1865592942437275e-06,
"loss": 0.0102,
"step": 10080
},
{
"epoch": 7.156028368794326,
"grad_norm": 0.17270110547542572,
"learning_rate": 1.1390018752662436e-06,
"loss": 0.0141,
"step": 10090
},
{
"epoch": 7.163120567375887,
"grad_norm": 0.3085339069366455,
"learning_rate": 1.0924116444548383e-06,
"loss": 0.0101,
"step": 10100
},
{
"epoch": 7.170212765957447,
"grad_norm": 0.12679530680179596,
"learning_rate": 1.0467890576163707e-06,
"loss": 0.0111,
"step": 10110
},
{
"epoch": 7.177304964539007,
"grad_norm": 0.23530429601669312,
"learning_rate": 1.0021345610909171e-06,
"loss": 0.0136,
"step": 10120
},
{
"epoch": 7.184397163120567,
"grad_norm": 0.06431034952402115,
"learning_rate": 9.584485917474185e-07,
"loss": 0.0089,
"step": 10130
},
{
"epoch": 7.191489361702128,
"grad_norm": 0.15055204927921295,
"learning_rate": 9.157315769794284e-07,
"loss": 0.009,
"step": 10140
},
{
"epoch": 7.198581560283688,
"grad_norm": 0.1731652468442917,
"learning_rate": 8.739839347009171e-07,
"loss": 0.0135,
"step": 10150
},
{
"epoch": 7.205673758865248,
"grad_norm": 0.1978446990251541,
"learning_rate": 8.332060733421631e-07,
"loss": 0.0116,
"step": 10160
},
{
"epoch": 7.212765957446808,
"grad_norm": 0.29635027050971985,
"learning_rate": 7.933983918457677e-07,
"loss": 0.0117,
"step": 10170
},
{
"epoch": 7.219858156028369,
"grad_norm": 0.23973168432712555,
"learning_rate": 7.54561279662791e-07,
"loss": 0.0146,
"step": 10180
},
{
"epoch": 7.226950354609929,
"grad_norm": 0.12211582064628601,
"learning_rate": 7.166951167488667e-07,
"loss": 0.0076,
"step": 10190
},
{
"epoch": 7.23404255319149,
"grad_norm": 0.1157701313495636,
"learning_rate": 6.798002735605602e-07,
"loss": 0.0166,
"step": 10200
},
{
"epoch": 7.24113475177305,
"grad_norm": 0.08518693596124649,
"learning_rate": 6.43877111051705e-07,
"loss": 0.0116,
"step": 10210
},
{
"epoch": 7.24822695035461,
"grad_norm": 0.3765665888786316,
"learning_rate": 6.089259806698611e-07,
"loss": 0.0174,
"step": 10220
},
{
"epoch": 7.25531914893617,
"grad_norm": 0.315969854593277,
"learning_rate": 5.749472243529064e-07,
"loss": 0.0179,
"step": 10230
},
{
"epoch": 7.26241134751773,
"grad_norm": 0.1261627972126007,
"learning_rate": 5.419411745256841e-07,
"loss": 0.0145,
"step": 10240
},
{
"epoch": 7.2695035460992905,
"grad_norm": 0.14078722894191742,
"learning_rate": 5.099081540967277e-07,
"loss": 0.0082,
"step": 10250
},
{
"epoch": 7.276595744680851,
"grad_norm": 0.19919142127037048,
"learning_rate": 4.788484764551293e-07,
"loss": 0.0096,
"step": 10260
},
{
"epoch": 7.283687943262412,
"grad_norm": 0.1621677577495575,
"learning_rate": 4.487624454674544e-07,
"loss": 0.0101,
"step": 10270
},
{
"epoch": 7.290780141843972,
"grad_norm": 0.07020825147628784,
"learning_rate": 4.196503554747988e-07,
"loss": 0.0081,
"step": 10280
},
{
"epoch": 7.297872340425532,
"grad_norm": 0.4086223244667053,
"learning_rate": 3.9151249128988043e-07,
"loss": 0.0108,
"step": 10290
},
{
"epoch": 7.304964539007092,
"grad_norm": 0.19677507877349854,
"learning_rate": 3.643491281942302e-07,
"loss": 0.0088,
"step": 10300
},
{
"epoch": 7.3120567375886525,
"grad_norm": 0.18708615005016327,
"learning_rate": 3.3816053193556073e-07,
"loss": 0.0137,
"step": 10310
},
{
"epoch": 7.319148936170213,
"grad_norm": 0.21218359470367432,
"learning_rate": 3.129469587251466e-07,
"loss": 0.0122,
"step": 10320
},
{
"epoch": 7.326241134751773,
"grad_norm": 0.057501133531332016,
"learning_rate": 2.8870865523525915e-07,
"loss": 0.0073,
"step": 10330
},
{
"epoch": 7.333333333333333,
"grad_norm": 0.09904036670923233,
"learning_rate": 2.6544585859683556e-07,
"loss": 0.0098,
"step": 10340
},
{
"epoch": 7.340425531914893,
"grad_norm": 0.2506350874900818,
"learning_rate": 2.431587963971138e-07,
"loss": 0.0082,
"step": 10350
},
{
"epoch": 7.347517730496454,
"grad_norm": 0.16516119241714478,
"learning_rate": 2.218476866774344e-07,
"loss": 0.0116,
"step": 10360
},
{
"epoch": 7.3546099290780145,
"grad_norm": 0.18242092430591583,
"learning_rate": 2.015127379310422e-07,
"loss": 0.0145,
"step": 10370
},
{
"epoch": 7.361702127659575,
"grad_norm": 0.1638125777244568,
"learning_rate": 1.821541491011547e-07,
"loss": 0.0089,
"step": 10380
},
{
"epoch": 7.368794326241135,
"grad_norm": 0.192805677652359,
"learning_rate": 1.6377210957888579e-07,
"loss": 0.0138,
"step": 10390
},
{
"epoch": 7.375886524822695,
"grad_norm": 0.16740106046199799,
"learning_rate": 1.4636679920152496e-07,
"loss": 0.0127,
"step": 10400
},
{
"epoch": 7.382978723404255,
"grad_norm": 0.20716732740402222,
"learning_rate": 1.2993838825066107e-07,
"loss": 0.0104,
"step": 10410
},
{
"epoch": 7.390070921985815,
"grad_norm": 0.2054859846830368,
"learning_rate": 1.1448703745061684e-07,
"loss": 0.0121,
"step": 10420
},
{
"epoch": 7.397163120567376,
"grad_norm": 0.19400155544281006,
"learning_rate": 1.0001289796678359e-07,
"loss": 0.0158,
"step": 10430
},
{
"epoch": 7.404255319148936,
"grad_norm": 0.257386714220047,
"learning_rate": 8.651611140423344e-08,
"loss": 0.0125,
"step": 10440
},
{
"epoch": 7.411347517730497,
"grad_norm": 0.1650981456041336,
"learning_rate": 7.399680980624268e-08,
"loss": 0.0131,
"step": 10450
},
{
"epoch": 7.418439716312057,
"grad_norm": 0.07822942733764648,
"learning_rate": 6.24551156530817e-08,
"loss": 0.0107,
"step": 10460
},
{
"epoch": 7.425531914893617,
"grad_norm": 0.2906341850757599,
"learning_rate": 5.1891141860760387e-08,
"loss": 0.0074,
"step": 10470
},
{
"epoch": 7.432624113475177,
"grad_norm": 0.10687454789876938,
"learning_rate": 4.230499177994007e-08,
"loss": 0.0099,
"step": 10480
},
{
"epoch": 7.439716312056738,
"grad_norm": 0.22731392085552216,
"learning_rate": 3.369675919495663e-08,
"loss": 0.012,
"step": 10490
},
{
"epoch": 7.446808510638298,
"grad_norm": 0.0800376906991005,
"learning_rate": 2.6066528322832294e-08,
"loss": 0.0108,
"step": 10500
},
{
"epoch": 7.453900709219858,
"grad_norm": 0.26024994254112244,
"learning_rate": 1.9414373812509655e-08,
"loss": 0.0159,
"step": 10510
},
{
"epoch": 7.460992907801418,
"grad_norm": 0.14578887820243835,
"learning_rate": 1.3740360744118886e-08,
"loss": 0.0163,
"step": 10520
},
{
"epoch": 7.468085106382979,
"grad_norm": 0.12513095140457153,
"learning_rate": 9.04454462830051e-09,
"loss": 0.0114,
"step": 10530
},
{
"epoch": 7.475177304964539,
"grad_norm": 0.20727184414863586,
"learning_rate": 5.326971405694714e-09,
"loss": 0.0126,
"step": 10540
},
{
"epoch": 7.4822695035460995,
"grad_norm": 0.22372283041477203,
"learning_rate": 2.5876774464972387e-09,
"loss": 0.0101,
"step": 10550
},
{
"epoch": 7.48936170212766,
"grad_norm": 0.26353222131729126,
"learning_rate": 8.266895500708138e-10,
"loss": 0.011,
"step": 10560
},
{
"epoch": 7.49645390070922,
"grad_norm": 0.1274196356534958,
"learning_rate": 4.402494471200669e-11,
"loss": 0.0129,
"step": 10570
},
{
"epoch": 7.498581560283688,
"step": 10573,
"total_flos": 3.707114868479735e+17,
"train_loss": 0.02894638727507956,
"train_runtime": 4663.694,
"train_samples_per_second": 36.273,
"train_steps_per_second": 2.267
}
],
"logging_steps": 10,
"max_steps": 10573,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.707114868479735e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}