{
"best_metric": 0.3866276741027832,
"best_model_checkpoint": "./output/checkpoint-3000",
"epoch": 3.8537549407114624,
"eval_steps": 150,
"global_step": 3900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009881422924901186,
"grad_norm": 2.4057295322418213,
"learning_rate": 9.999999999999999e-06,
"loss": 0.527,
"step": 10
},
{
"epoch": 0.019762845849802372,
"grad_norm": 2.7988038063049316,
"learning_rate": 1.9999999999999998e-05,
"loss": 0.4789,
"step": 20
},
{
"epoch": 0.029644268774703556,
"grad_norm": 3.1874520778656006,
"learning_rate": 2.999999999999999e-05,
"loss": 0.5495,
"step": 30
},
{
"epoch": 0.039525691699604744,
"grad_norm": 3.206881046295166,
"learning_rate": 3.9999999999999996e-05,
"loss": 0.5418,
"step": 40
},
{
"epoch": 0.04940711462450593,
"grad_norm": 2.863534927368164,
"learning_rate": 4.999999999999999e-05,
"loss": 0.5065,
"step": 50
},
{
"epoch": 0.05928853754940711,
"grad_norm": 2.8917012214660645,
"learning_rate": 5.999999999999998e-05,
"loss": 0.5262,
"step": 60
},
{
"epoch": 0.0691699604743083,
"grad_norm": 2.8896090984344482,
"learning_rate": 6.999999999999998e-05,
"loss": 0.5294,
"step": 70
},
{
"epoch": 0.07905138339920949,
"grad_norm": 3.138671875,
"learning_rate": 7.999999999999999e-05,
"loss": 0.4787,
"step": 80
},
{
"epoch": 0.08893280632411067,
"grad_norm": 2.513195753097534,
"learning_rate": 8.999999999999998e-05,
"loss": 0.4873,
"step": 90
},
{
"epoch": 0.09881422924901186,
"grad_norm": 2.762770414352417,
"learning_rate": 9.999999999999998e-05,
"loss": 0.5069,
"step": 100
},
{
"epoch": 0.10869565217391304,
"grad_norm": 2.255554437637329,
"learning_rate": 9.999897234791827e-05,
"loss": 0.4713,
"step": 110
},
{
"epoch": 0.11857707509881422,
"grad_norm": 2.8342161178588867,
"learning_rate": 9.999588943391594e-05,
"loss": 0.5146,
"step": 120
},
{
"epoch": 0.12845849802371542,
"grad_norm": 2.1564338207244873,
"learning_rate": 9.999075138471948e-05,
"loss": 0.4976,
"step": 130
},
{
"epoch": 0.1383399209486166,
"grad_norm": 2.606574773788452,
"learning_rate": 9.998355841153397e-05,
"loss": 0.5166,
"step": 140
},
{
"epoch": 0.1482213438735178,
"grad_norm": 2.285940647125244,
"learning_rate": 9.997431081003437e-05,
"loss": 0.5137,
"step": 150
},
{
"epoch": 0.1482213438735178,
"eval_loss": 0.5139818787574768,
"eval_runtime": 35.3761,
"eval_samples_per_second": 14.134,
"eval_steps_per_second": 14.134,
"step": 150
},
{
"epoch": 0.15810276679841898,
"grad_norm": 2.5471882820129395,
"learning_rate": 9.996300896035337e-05,
"loss": 0.5392,
"step": 160
},
{
"epoch": 0.16798418972332016,
"grad_norm": 3.4715616703033447,
"learning_rate": 9.994965332706571e-05,
"loss": 0.517,
"step": 170
},
{
"epoch": 0.17786561264822134,
"grad_norm": 2.321061134338379,
"learning_rate": 9.99342444591692e-05,
"loss": 0.4852,
"step": 180
},
{
"epoch": 0.18774703557312253,
"grad_norm": 2.590848922729492,
"learning_rate": 9.991678299006203e-05,
"loss": 0.5101,
"step": 190
},
{
"epoch": 0.1976284584980237,
"grad_norm": 2.412264823913574,
"learning_rate": 9.989726963751679e-05,
"loss": 0.5099,
"step": 200
},
{
"epoch": 0.2075098814229249,
"grad_norm": 2.491581678390503,
"learning_rate": 9.987570520365101e-05,
"loss": 0.5057,
"step": 210
},
{
"epoch": 0.21739130434782608,
"grad_norm": 2.79880690574646,
"learning_rate": 9.985209057489407e-05,
"loss": 0.4955,
"step": 220
},
{
"epoch": 0.22727272727272727,
"grad_norm": 3.0157032012939453,
"learning_rate": 9.98264267219509e-05,
"loss": 0.4766,
"step": 230
},
{
"epoch": 0.23715415019762845,
"grad_norm": 2.266268491744995,
"learning_rate": 9.979871469976193e-05,
"loss": 0.4975,
"step": 240
},
{
"epoch": 0.24703557312252963,
"grad_norm": 2.4801745414733887,
"learning_rate": 9.976895564745989e-05,
"loss": 0.5068,
"step": 250
},
{
"epoch": 0.25691699604743085,
"grad_norm": 2.9422342777252197,
"learning_rate": 9.973715078832285e-05,
"loss": 0.4792,
"step": 260
},
{
"epoch": 0.26679841897233203,
"grad_norm": 2.364716053009033,
"learning_rate": 9.970330142972399e-05,
"loss": 0.475,
"step": 270
},
{
"epoch": 0.2766798418972332,
"grad_norm": 2.0679702758789062,
"learning_rate": 9.966740896307789e-05,
"loss": 0.4818,
"step": 280
},
{
"epoch": 0.2865612648221344,
"grad_norm": 2.8333821296691895,
"learning_rate": 9.962947486378323e-05,
"loss": 0.489,
"step": 290
},
{
"epoch": 0.2964426877470356,
"grad_norm": 2.5552380084991455,
"learning_rate": 9.958950069116228e-05,
"loss": 0.4921,
"step": 300
},
{
"epoch": 0.2964426877470356,
"eval_loss": 0.4803454577922821,
"eval_runtime": 35.2266,
"eval_samples_per_second": 14.194,
"eval_steps_per_second": 14.194,
"step": 300
},
{
"epoch": 0.30632411067193677,
"grad_norm": 2.617976188659668,
"learning_rate": 9.954748808839671e-05,
"loss": 0.5139,
"step": 310
},
{
"epoch": 0.31620553359683795,
"grad_norm": 2.61501407623291,
"learning_rate": 9.950343878246007e-05,
"loss": 0.4642,
"step": 320
},
{
"epoch": 0.32608695652173914,
"grad_norm": 2.3816354274749756,
"learning_rate": 9.945735458404678e-05,
"loss": 0.522,
"step": 330
},
{
"epoch": 0.3359683794466403,
"grad_norm": 2.5774052143096924,
"learning_rate": 9.940923738749776e-05,
"loss": 0.505,
"step": 340
},
{
"epoch": 0.3458498023715415,
"grad_norm": 2.245945692062378,
"learning_rate": 9.935908917072249e-05,
"loss": 0.4801,
"step": 350
},
{
"epoch": 0.3557312252964427,
"grad_norm": 1.9169942140579224,
"learning_rate": 9.930691199511772e-05,
"loss": 0.4414,
"step": 360
},
{
"epoch": 0.36561264822134387,
"grad_norm": 2.1389174461364746,
"learning_rate": 9.925270800548282e-05,
"loss": 0.4659,
"step": 370
},
{
"epoch": 0.37549407114624506,
"grad_norm": 2.4838740825653076,
"learning_rate": 9.919647942993145e-05,
"loss": 0.4594,
"step": 380
},
{
"epoch": 0.38537549407114624,
"grad_norm": 2.629211187362671,
"learning_rate": 9.913822857980017e-05,
"loss": 0.492,
"step": 390
},
{
"epoch": 0.3952569169960474,
"grad_norm": 2.387241840362549,
"learning_rate": 9.907795784955324e-05,
"loss": 0.4556,
"step": 400
},
{
"epoch": 0.4051383399209486,
"grad_norm": 2.450533390045166,
"learning_rate": 9.901566971668434e-05,
"loss": 0.4653,
"step": 410
},
{
"epoch": 0.4150197628458498,
"grad_norm": 2.464107036590576,
"learning_rate": 9.895136674161462e-05,
"loss": 0.4955,
"step": 420
},
{
"epoch": 0.424901185770751,
"grad_norm": 2.31066632270813,
"learning_rate": 9.888505156758756e-05,
"loss": 0.4623,
"step": 430
},
{
"epoch": 0.43478260869565216,
"grad_norm": 2.158092975616455,
"learning_rate": 9.881672692056019e-05,
"loss": 0.4692,
"step": 440
},
{
"epoch": 0.44466403162055335,
"grad_norm": 1.851882815361023,
"learning_rate": 9.874639560909115e-05,
"loss": 0.487,
"step": 450
},
{
"epoch": 0.44466403162055335,
"eval_loss": 0.46247246861457825,
"eval_runtime": 34.6043,
"eval_samples_per_second": 14.449,
"eval_steps_per_second": 14.449,
"step": 450
},
{
"epoch": 0.45454545454545453,
"grad_norm": 2.608232021331787,
"learning_rate": 9.867406052422521e-05,
"loss": 0.4742,
"step": 460
},
{
"epoch": 0.4644268774703557,
"grad_norm": 2.2287890911102295,
"learning_rate": 9.859972463937438e-05,
"loss": 0.4611,
"step": 470
},
{
"epoch": 0.4743083003952569,
"grad_norm": 2.5854885578155518,
"learning_rate": 9.852339101019572e-05,
"loss": 0.4576,
"step": 480
},
{
"epoch": 0.4841897233201581,
"grad_norm": 2.134244203567505,
"learning_rate": 9.844506277446574e-05,
"loss": 0.4824,
"step": 490
},
{
"epoch": 0.49407114624505927,
"grad_norm": 2.4558589458465576,
"learning_rate": 9.836474315195144e-05,
"loss": 0.4497,
"step": 500
},
{
"epoch": 0.5039525691699605,
"grad_norm": 2.9862284660339355,
"learning_rate": 9.828243544427793e-05,
"loss": 0.4632,
"step": 510
},
{
"epoch": 0.5138339920948617,
"grad_norm": 2.5558395385742188,
"learning_rate": 9.819814303479264e-05,
"loss": 0.4486,
"step": 520
},
{
"epoch": 0.5237154150197628,
"grad_norm": 1.770102620124817,
"learning_rate": 9.811186938842643e-05,
"loss": 0.4476,
"step": 530
},
{
"epoch": 0.5335968379446641,
"grad_norm": 3.525803565979004,
"learning_rate": 9.802361805155094e-05,
"loss": 0.4643,
"step": 540
},
{
"epoch": 0.5434782608695652,
"grad_norm": 2.2628469467163086,
"learning_rate": 9.7933392651833e-05,
"loss": 0.4731,
"step": 550
},
{
"epoch": 0.5533596837944664,
"grad_norm": 3.0324666500091553,
"learning_rate": 9.784119689808542e-05,
"loss": 0.4478,
"step": 560
},
{
"epoch": 0.5632411067193676,
"grad_norm": 2.263258218765259,
"learning_rate": 9.77470345801145e-05,
"loss": 0.4497,
"step": 570
},
{
"epoch": 0.5731225296442688,
"grad_norm": 2.645112991333008,
"learning_rate": 9.765090956856434e-05,
"loss": 0.4666,
"step": 580
},
{
"epoch": 0.5830039525691699,
"grad_norm": 2.0175938606262207,
"learning_rate": 9.755282581475766e-05,
"loss": 0.47,
"step": 590
},
{
"epoch": 0.5928853754940712,
"grad_norm": 2.175175666809082,
"learning_rate": 9.74527873505334e-05,
"loss": 0.4592,
"step": 600
},
{
"epoch": 0.5928853754940712,
"eval_loss": 0.44894906878471375,
"eval_runtime": 34.7044,
"eval_samples_per_second": 14.407,
"eval_steps_per_second": 14.407,
"step": 600
},
{
"epoch": 0.6027667984189723,
"grad_norm": 2.4829459190368652,
"learning_rate": 9.735079828808105e-05,
"loss": 0.4453,
"step": 610
},
{
"epoch": 0.6126482213438735,
"grad_norm": 2.2683026790618896,
"learning_rate": 9.724686281977144e-05,
"loss": 0.4701,
"step": 620
},
{
"epoch": 0.6225296442687747,
"grad_norm": 2.6205978393554688,
"learning_rate": 9.714098521798462e-05,
"loss": 0.4581,
"step": 630
},
{
"epoch": 0.6324110671936759,
"grad_norm": 2.5154802799224854,
"learning_rate": 9.703316983493411e-05,
"loss": 0.4372,
"step": 640
},
{
"epoch": 0.642292490118577,
"grad_norm": 2.478700637817383,
"learning_rate": 9.6923421102488e-05,
"loss": 0.4554,
"step": 650
},
{
"epoch": 0.6521739130434783,
"grad_norm": 2.286890745162964,
"learning_rate": 9.681174353198684e-05,
"loss": 0.474,
"step": 660
},
{
"epoch": 0.6620553359683794,
"grad_norm": 2.0446231365203857,
"learning_rate": 9.669814171405813e-05,
"loss": 0.4347,
"step": 670
},
{
"epoch": 0.6719367588932806,
"grad_norm": 2.571877956390381,
"learning_rate": 9.658262031842768e-05,
"loss": 0.4355,
"step": 680
},
{
"epoch": 0.6818181818181818,
"grad_norm": 2.160186290740967,
"learning_rate": 9.646518409372757e-05,
"loss": 0.4354,
"step": 690
},
{
"epoch": 0.691699604743083,
"grad_norm": 2.0123462677001953,
"learning_rate": 9.634583786730107e-05,
"loss": 0.4619,
"step": 700
},
{
"epoch": 0.7015810276679841,
"grad_norm": 2.4318065643310547,
"learning_rate": 9.622458654500406e-05,
"loss": 0.4795,
"step": 710
},
{
"epoch": 0.7114624505928854,
"grad_norm": 2.1146297454833984,
"learning_rate": 9.610143511100351e-05,
"loss": 0.442,
"step": 720
},
{
"epoch": 0.7213438735177866,
"grad_norm": 2.312072992324829,
"learning_rate": 9.597638862757252e-05,
"loss": 0.4339,
"step": 730
},
{
"epoch": 0.7312252964426877,
"grad_norm": 2.0367119312286377,
"learning_rate": 9.584945223488224e-05,
"loss": 0.4519,
"step": 740
},
{
"epoch": 0.741106719367589,
"grad_norm": 2.8992955684661865,
"learning_rate": 9.57206311507906e-05,
"loss": 0.4431,
"step": 750
},
{
"epoch": 0.741106719367589,
"eval_loss": 0.4378005266189575,
"eval_runtime": 34.0956,
"eval_samples_per_second": 14.665,
"eval_steps_per_second": 14.665,
"step": 750
},
{
"epoch": 0.7509881422924901,
"grad_norm": 2.3162925243377686,
"learning_rate": 9.558993067062783e-05,
"loss": 0.4426,
"step": 760
},
{
"epoch": 0.7608695652173914,
"grad_norm": 1.9839439392089844,
"learning_rate": 9.545735616697873e-05,
"loss": 0.46,
"step": 770
},
{
"epoch": 0.7707509881422925,
"grad_norm": 2.18251633644104,
"learning_rate": 9.532291308946188e-05,
"loss": 0.4254,
"step": 780
},
{
"epoch": 0.7806324110671937,
"grad_norm": 2.241259813308716,
"learning_rate": 9.518660696450565e-05,
"loss": 0.4246,
"step": 790
},
{
"epoch": 0.7905138339920948,
"grad_norm": 2.357609272003174,
"learning_rate": 9.504844339512093e-05,
"loss": 0.4497,
"step": 800
},
{
"epoch": 0.8003952569169961,
"grad_norm": 2.2541675567626953,
"learning_rate": 9.490842806067093e-05,
"loss": 0.4605,
"step": 810
},
{
"epoch": 0.8102766798418972,
"grad_norm": 2.1015920639038086,
"learning_rate": 9.476656671663764e-05,
"loss": 0.4227,
"step": 820
},
{
"epoch": 0.8201581027667985,
"grad_norm": 2.2886059284210205,
"learning_rate": 9.462286519438528e-05,
"loss": 0.4385,
"step": 830
},
{
"epoch": 0.8300395256916996,
"grad_norm": 2.2543296813964844,
"learning_rate": 9.447732940092057e-05,
"loss": 0.433,
"step": 840
},
{
"epoch": 0.8399209486166008,
"grad_norm": 2.2577757835388184,
"learning_rate": 9.432996531864999e-05,
"loss": 0.4634,
"step": 850
},
{
"epoch": 0.849802371541502,
"grad_norm": 1.795832872390747,
"learning_rate": 9.418077900513374e-05,
"loss": 0.4068,
"step": 860
},
{
"epoch": 0.8596837944664032,
"grad_norm": 1.9893933534622192,
"learning_rate": 9.402977659283688e-05,
"loss": 0.4527,
"step": 870
},
{
"epoch": 0.8695652173913043,
"grad_norm": 2.5203518867492676,
"learning_rate": 9.387696428887713e-05,
"loss": 0.4591,
"step": 880
},
{
"epoch": 0.8794466403162056,
"grad_norm": 2.42069673538208,
"learning_rate": 9.372234837476975e-05,
"loss": 0.4597,
"step": 890
},
{
"epoch": 0.8893280632411067,
"grad_norm": 2.087778329849243,
"learning_rate": 9.356593520616945e-05,
"loss": 0.4226,
"step": 900
},
{
"epoch": 0.8893280632411067,
"eval_loss": 0.42734819650650024,
"eval_runtime": 34.3301,
"eval_samples_per_second": 14.564,
"eval_steps_per_second": 14.564,
"step": 900
},
{
"epoch": 0.8992094861660079,
"grad_norm": 2.4652795791625977,
"learning_rate": 9.34077312126089e-05,
"loss": 0.4261,
"step": 910
},
{
"epoch": 0.9090909090909091,
"grad_norm": 2.0327532291412354,
"learning_rate": 9.324774289723465e-05,
"loss": 0.4674,
"step": 920
},
{
"epoch": 0.9189723320158103,
"grad_norm": 2.3021750450134277,
"learning_rate": 9.308597683653974e-05,
"loss": 0.4521,
"step": 930
},
{
"epoch": 0.9288537549407114,
"grad_norm": 6.520279884338379,
"learning_rate": 9.292243968009328e-05,
"loss": 0.4443,
"step": 940
},
{
"epoch": 0.9387351778656127,
"grad_norm": 2.0640597343444824,
"learning_rate": 9.27571381502673e-05,
"loss": 0.416,
"step": 950
},
{
"epoch": 0.9486166007905138,
"grad_norm": 2.280644416809082,
"learning_rate": 9.25900790419602e-05,
"loss": 0.4331,
"step": 960
},
{
"epoch": 0.958498023715415,
"grad_norm": 2.3445639610290527,
"learning_rate": 9.24212692223176e-05,
"loss": 0.4512,
"step": 970
},
{
"epoch": 0.9683794466403162,
"grad_norm": 2.072683334350586,
"learning_rate": 9.225071563045005e-05,
"loss": 0.3967,
"step": 980
},
{
"epoch": 0.9782608695652174,
"grad_norm": 1.9060055017471313,
"learning_rate": 9.207842527714764e-05,
"loss": 0.4102,
"step": 990
},
{
"epoch": 0.9881422924901185,
"grad_norm": 2.248657464981079,
"learning_rate": 9.1904405244592e-05,
"loss": 0.4505,
"step": 1000
},
{
"epoch": 0.9980237154150198,
"grad_norm": 2.048110008239746,
"learning_rate": 9.172866268606511e-05,
"loss": 0.4102,
"step": 1010
},
{
"epoch": 1.007905138339921,
"grad_norm": 1.9891077280044556,
"learning_rate": 9.155120482565518e-05,
"loss": 0.3866,
"step": 1020
},
{
"epoch": 1.017786561264822,
"grad_norm": 2.499363422393799,
"learning_rate": 9.13720389579598e-05,
"loss": 0.3584,
"step": 1030
},
{
"epoch": 1.0276679841897234,
"grad_norm": 2.4077465534210205,
"learning_rate": 9.119117244778605e-05,
"loss": 0.3736,
"step": 1040
},
{
"epoch": 1.0375494071146245,
"grad_norm": 2.0941267013549805,
"learning_rate": 9.100861272984777e-05,
"loss": 0.3769,
"step": 1050
},
{
"epoch": 1.0375494071146245,
"eval_loss": 0.4222135841846466,
"eval_runtime": 34.1626,
"eval_samples_per_second": 14.636,
"eval_steps_per_second": 14.636,
"step": 1050
},
{
"epoch": 1.0474308300395256,
"grad_norm": 2.29099702835083,
"learning_rate": 9.082436730845992e-05,
"loss": 0.3545,
"step": 1060
},
{
"epoch": 1.0573122529644268,
"grad_norm": 2.5847902297973633,
"learning_rate": 9.063844375723012e-05,
"loss": 0.3658,
"step": 1070
},
{
"epoch": 1.0671936758893281,
"grad_norm": 1.9889037609100342,
"learning_rate": 9.045084971874735e-05,
"loss": 0.3662,
"step": 1080
},
{
"epoch": 1.0770750988142292,
"grad_norm": 2.0356063842773438,
"learning_rate": 9.026159290426779e-05,
"loss": 0.3952,
"step": 1090
},
{
"epoch": 1.0869565217391304,
"grad_norm": 1.95900559425354,
"learning_rate": 9.007068109339781e-05,
"loss": 0.3624,
"step": 1100
},
{
"epoch": 1.0968379446640317,
"grad_norm": 2.0315041542053223,
"learning_rate": 8.987812213377421e-05,
"loss": 0.355,
"step": 1110
},
{
"epoch": 1.1067193675889329,
"grad_norm": 1.9098906517028809,
"learning_rate": 8.968392394074161e-05,
"loss": 0.3396,
"step": 1120
},
{
"epoch": 1.116600790513834,
"grad_norm": 2.3436784744262695,
"learning_rate": 8.94880944970271e-05,
"loss": 0.3433,
"step": 1130
},
{
"epoch": 1.1264822134387351,
"grad_norm": 2.013385534286499,
"learning_rate": 8.92906418524121e-05,
"loss": 0.3815,
"step": 1140
},
{
"epoch": 1.1363636363636362,
"grad_norm": 2.3570964336395264,
"learning_rate": 8.909157412340148e-05,
"loss": 0.3825,
"step": 1150
},
{
"epoch": 1.1462450592885376,
"grad_norm": 2.0097525119781494,
"learning_rate": 8.889089949288984e-05,
"loss": 0.3788,
"step": 1160
},
{
"epoch": 1.1561264822134387,
"grad_norm": 1.8614075183868408,
"learning_rate": 8.868862620982532e-05,
"loss": 0.3434,
"step": 1170
},
{
"epoch": 1.1660079051383399,
"grad_norm": 2.3193359375,
"learning_rate": 8.848476258887028e-05,
"loss": 0.3652,
"step": 1180
},
{
"epoch": 1.1758893280632412,
"grad_norm": 2.1564888954162598,
"learning_rate": 8.827931701005971e-05,
"loss": 0.3604,
"step": 1190
},
{
"epoch": 1.1857707509881423,
"grad_norm": 2.278334856033325,
"learning_rate": 8.80722979184567e-05,
"loss": 0.351,
"step": 1200
},
{
"epoch": 1.1857707509881423,
"eval_loss": 0.4201338589191437,
"eval_runtime": 34.2095,
"eval_samples_per_second": 14.616,
"eval_steps_per_second": 14.616,
"step": 1200
},
{
"epoch": 1.1956521739130435,
"grad_norm": 2.3817718029022217,
"learning_rate": 8.786371382380525e-05,
"loss": 0.3681,
"step": 1210
},
{
"epoch": 1.2055335968379446,
"grad_norm": 2.221449613571167,
"learning_rate": 8.765357330018053e-05,
"loss": 0.396,
"step": 1220
},
{
"epoch": 1.215415019762846,
"grad_norm": 1.9129923582077026,
"learning_rate": 8.744188498563639e-05,
"loss": 0.3861,
"step": 1230
},
{
"epoch": 1.225296442687747,
"grad_norm": 2.0991668701171875,
"learning_rate": 8.722865758185034e-05,
"loss": 0.373,
"step": 1240
},
{
"epoch": 1.2351778656126482,
"grad_norm": 1.9412460327148438,
"learning_rate": 8.701389985376575e-05,
"loss": 0.3592,
"step": 1250
},
{
"epoch": 1.2450592885375493,
"grad_norm": 2.6546976566314697,
"learning_rate": 8.679762062923174e-05,
"loss": 0.3871,
"step": 1260
},
{
"epoch": 1.2549407114624507,
"grad_norm": 2.3372902870178223,
"learning_rate": 8.657982879864005e-05,
"loss": 0.3776,
"step": 1270
},
{
"epoch": 1.2648221343873518,
"grad_norm": 1.9796963930130005,
"learning_rate": 8.636053331455984e-05,
"loss": 0.377,
"step": 1280
},
{
"epoch": 1.274703557312253,
"grad_norm": 2.1785104274749756,
"learning_rate": 8.613974319136955e-05,
"loss": 0.3942,
"step": 1290
},
{
"epoch": 1.2845849802371543,
"grad_norm": 1.8092831373214722,
"learning_rate": 8.591746750488636e-05,
"loss": 0.367,
"step": 1300
},
{
"epoch": 1.2944664031620554,
"grad_norm": 2.4670629501342773,
"learning_rate": 8.569371539199313e-05,
"loss": 0.3744,
"step": 1310
},
{
"epoch": 1.3043478260869565,
"grad_norm": 2.104426383972168,
"learning_rate": 8.546849605026287e-05,
"loss": 0.3801,
"step": 1320
},
{
"epoch": 1.3142292490118577,
"grad_norm": 2.5602879524230957,
"learning_rate": 8.524181873758057e-05,
"loss": 0.352,
"step": 1330
},
{
"epoch": 1.3241106719367588,
"grad_norm": 2.211514949798584,
"learning_rate": 8.501369277176273e-05,
"loss": 0.3643,
"step": 1340
},
{
"epoch": 1.3339920948616601,
"grad_norm": 2.312812566757202,
"learning_rate": 8.478412753017431e-05,
"loss": 0.3681,
"step": 1350
},
{
"epoch": 1.3339920948616601,
"eval_loss": 0.417085200548172,
"eval_runtime": 34.1379,
"eval_samples_per_second": 14.646,
"eval_steps_per_second": 14.646,
"step": 1350
},
{
"epoch": 1.3438735177865613,
"grad_norm": 2.7497165203094482,
"learning_rate": 8.455313244934322e-05,
"loss": 0.3739,
"step": 1360
},
{
"epoch": 1.3537549407114624,
"grad_norm": 2.3112716674804688,
"learning_rate": 8.432071702457251e-05,
"loss": 0.367,
"step": 1370
},
{
"epoch": 1.3636363636363638,
"grad_norm": 2.03934383392334,
"learning_rate": 8.408689080954995e-05,
"loss": 0.3506,
"step": 1380
},
{
"epoch": 1.3735177865612649,
"grad_norm": 1.942353367805481,
"learning_rate": 8.385166341595547e-05,
"loss": 0.3651,
"step": 1390
},
{
"epoch": 1.383399209486166,
"grad_norm": 1.985518455505371,
"learning_rate": 8.361504451306582e-05,
"loss": 0.3593,
"step": 1400
},
{
"epoch": 1.3932806324110671,
"grad_norm": 2.244945526123047,
"learning_rate": 8.337704382735738e-05,
"loss": 0.3616,
"step": 1410
},
{
"epoch": 1.4031620553359683,
"grad_norm": 2.084362268447876,
"learning_rate": 8.313767114210614e-05,
"loss": 0.3725,
"step": 1420
},
{
"epoch": 1.4130434782608696,
"grad_norm": 2.0909502506256104,
"learning_rate": 8.289693629698562e-05,
"loss": 0.3708,
"step": 1430
},
{
"epoch": 1.4229249011857708,
"grad_norm": 2.526142120361328,
"learning_rate": 8.265484918766241e-05,
"loss": 0.3688,
"step": 1440
},
{
"epoch": 1.4328063241106719,
"grad_norm": 1.7779805660247803,
"learning_rate": 8.241141976538941e-05,
"loss": 0.3628,
"step": 1450
},
{
"epoch": 1.4426877470355732,
"grad_norm": 2.2417075634002686,
"learning_rate": 8.216665803659669e-05,
"loss": 0.3539,
"step": 1460
},
{
"epoch": 1.4525691699604744,
"grad_norm": 2.3137755393981934,
"learning_rate": 8.192057406248027e-05,
"loss": 0.3526,
"step": 1470
},
{
"epoch": 1.4624505928853755,
"grad_norm": 2.308361768722534,
"learning_rate": 8.167317795858849e-05,
"loss": 0.364,
"step": 1480
},
{
"epoch": 1.4723320158102766,
"grad_norm": 2.1137235164642334,
"learning_rate": 8.142447989440615e-05,
"loss": 0.3725,
"step": 1490
},
{
"epoch": 1.4822134387351777,
"grad_norm": 2.206882953643799,
"learning_rate": 8.117449009293666e-05,
"loss": 0.38,
"step": 1500
},
{
"epoch": 1.4822134387351777,
"eval_loss": 0.41128015518188477,
"eval_runtime": 34.0412,
"eval_samples_per_second": 14.688,
"eval_steps_per_second": 14.688,
"step": 1500
},
{
"epoch": 1.492094861660079,
"grad_norm": 2.540431499481201,
"learning_rate": 8.092321883028156e-05,
"loss": 0.3703,
"step": 1510
},
{
"epoch": 1.5019762845849802,
"grad_norm": 2.1057121753692627,
"learning_rate": 8.067067643521832e-05,
"loss": 0.3797,
"step": 1520
},
{
"epoch": 1.5118577075098814,
"grad_norm": 2.375397205352783,
"learning_rate": 8.041687328877564e-05,
"loss": 0.3657,
"step": 1530
},
{
"epoch": 1.5217391304347827,
"grad_norm": 2.403914451599121,
"learning_rate": 8.016181982380679e-05,
"loss": 0.3807,
"step": 1540
},
{
"epoch": 1.5316205533596838,
"grad_norm": 2.3958826065063477,
"learning_rate": 7.990552652456078e-05,
"loss": 0.3622,
"step": 1550
},
{
"epoch": 1.541501976284585,
"grad_norm": 2.351919651031494,
"learning_rate": 7.964800392625127e-05,
"loss": 0.3762,
"step": 1560
},
{
"epoch": 1.5513833992094863,
"grad_norm": 2.015793800354004,
"learning_rate": 7.938926261462365e-05,
"loss": 0.3536,
"step": 1570
},
{
"epoch": 1.5612648221343872,
"grad_norm": 2.037121295928955,
"learning_rate": 7.912931322551979e-05,
"loss": 0.3718,
"step": 1580
},
{
"epoch": 1.5711462450592886,
"grad_norm": 2.1762428283691406,
"learning_rate": 7.886816644444096e-05,
"loss": 0.3504,
"step": 1590
},
{
"epoch": 1.5810276679841897,
"grad_norm": 1.8388617038726807,
"learning_rate": 7.860583300610847e-05,
"loss": 0.3431,
"step": 1600
},
{
"epoch": 1.5909090909090908,
"grad_norm": 1.9121774435043335,
"learning_rate": 7.834232369402248e-05,
"loss": 0.3769,
"step": 1610
},
{
"epoch": 1.6007905138339922,
"grad_norm": 2.1485304832458496,
"learning_rate": 7.807764934001872e-05,
"loss": 0.3361,
"step": 1620
},
{
"epoch": 1.6106719367588933,
"grad_norm": 2.162116289138794,
"learning_rate": 7.781182082382322e-05,
"loss": 0.3747,
"step": 1630
},
{
"epoch": 1.6205533596837944,
"grad_norm": 2.514573335647583,
"learning_rate": 7.754484907260511e-05,
"loss": 0.3857,
"step": 1640
},
{
"epoch": 1.6304347826086958,
"grad_norm": 2.3473386764526367,
"learning_rate": 7.727674506052742e-05,
"loss": 0.3269,
"step": 1650
},
{
"epoch": 1.6304347826086958,
"eval_loss": 0.4064118564128876,
"eval_runtime": 34.0417,
"eval_samples_per_second": 14.688,
"eval_steps_per_second": 14.688,
"step": 1650
},
{
"epoch": 1.6403162055335967,
"grad_norm": 2.0117132663726807,
"learning_rate": 7.700751980829599e-05,
"loss": 0.3662,
"step": 1660
},
{
"epoch": 1.650197628458498,
"grad_norm": 2.5356202125549316,
"learning_rate": 7.673718438270646e-05,
"loss": 0.3671,
"step": 1670
},
{
"epoch": 1.6600790513833992,
"grad_norm": 2.1220240592956543,
"learning_rate": 7.646574989618936e-05,
"loss": 0.3655,
"step": 1680
},
{
"epoch": 1.6699604743083003,
"grad_norm": 2.049267292022705,
"learning_rate": 7.619322750635325e-05,
"loss": 0.3916,
"step": 1690
},
{
"epoch": 1.6798418972332017,
"grad_norm": 2.0539910793304443,
"learning_rate": 7.591962841552624e-05,
"loss": 0.4168,
"step": 1700
},
{
"epoch": 1.6897233201581028,
"grad_norm": 2.229034662246704,
"learning_rate": 7.56449638702953e-05,
"loss": 0.369,
"step": 1710
},
{
"epoch": 1.699604743083004,
"grad_norm": 2.280418634414673,
"learning_rate": 7.536924516104408e-05,
"loss": 0.375,
"step": 1720
},
{
"epoch": 1.7094861660079053,
"grad_norm": 1.9317281246185303,
"learning_rate": 7.509248362148886e-05,
"loss": 0.3602,
"step": 1730
},
{
"epoch": 1.7193675889328062,
"grad_norm": 2.0074923038482666,
"learning_rate": 7.481469062821249e-05,
"loss": 0.3763,
"step": 1740
},
{
"epoch": 1.7292490118577075,
"grad_norm": 2.6529626846313477,
"learning_rate": 7.453587760019688e-05,
"loss": 0.3867,
"step": 1750
},
{
"epoch": 1.7391304347826086,
"grad_norm": 2.64829421043396,
"learning_rate": 7.425605599835358e-05,
"loss": 0.3459,
"step": 1760
},
{
"epoch": 1.7490118577075098,
"grad_norm": 2.139469861984253,
"learning_rate": 7.397523732505269e-05,
"loss": 0.3763,
"step": 1770
},
{
"epoch": 1.7588932806324111,
"grad_norm": 2.043088674545288,
"learning_rate": 7.369343312364992e-05,
"loss": 0.3313,
"step": 1780
},
{
"epoch": 1.7687747035573123,
"grad_norm": 2.4256412982940674,
"learning_rate": 7.341065497801227e-05,
"loss": 0.3607,
"step": 1790
},
{
"epoch": 1.7786561264822134,
"grad_norm": 2.4966022968292236,
"learning_rate": 7.312691451204175e-05,
"loss": 0.3413,
"step": 1800
},
{
"epoch": 1.7786561264822134,
"eval_loss": 0.39886847138404846,
"eval_runtime": 34.0089,
"eval_samples_per_second": 14.702,
"eval_steps_per_second": 14.702,
"step": 1800
},
{
"epoch": 1.7885375494071147,
"grad_norm": 2.126098394393921,
"learning_rate": 7.284222338919757e-05,
"loss": 0.3505,
"step": 1810
},
{
"epoch": 1.7984189723320159,
"grad_norm": 2.32716965675354,
"learning_rate": 7.25565933120167e-05,
"loss": 0.3706,
"step": 1820
},
{
"epoch": 1.808300395256917,
"grad_norm": 2.4196839332580566,
"learning_rate": 7.227003602163294e-05,
"loss": 0.3672,
"step": 1830
},
{
"epoch": 1.8181818181818183,
"grad_norm": 2.1417181491851807,
"learning_rate": 7.19825632972941e-05,
"loss": 0.3467,
"step": 1840
},
{
"epoch": 1.8280632411067192,
"grad_norm": 1.9946470260620117,
"learning_rate": 7.169418695587788e-05,
"loss": 0.3639,
"step": 1850
},
{
"epoch": 1.8379446640316206,
"grad_norm": 2.3900909423828125,
"learning_rate": 7.140491885140627e-05,
"loss": 0.354,
"step": 1860
},
{
"epoch": 1.8478260869565217,
"grad_norm": 2.3250668048858643,
"learning_rate": 7.111477087455798e-05,
"loss": 0.3829,
"step": 1870
},
{
"epoch": 1.8577075098814229,
"grad_norm": 2.3011209964752197,
"learning_rate": 7.082375495217994e-05,
"loss": 0.3567,
"step": 1880
},
{
"epoch": 1.8675889328063242,
"grad_norm": 2.4620919227600098,
"learning_rate": 7.053188304679689e-05,
"loss": 0.3729,
"step": 1890
},
{
"epoch": 1.8774703557312253,
"grad_norm": 1.9825767278671265,
"learning_rate": 7.023916715611966e-05,
"loss": 0.367,
"step": 1900
},
{
"epoch": 1.8873517786561265,
"grad_norm": 2.1703319549560547,
"learning_rate": 6.994561931255207e-05,
"loss": 0.3818,
"step": 1910
},
{
"epoch": 1.8972332015810278,
"grad_norm": 1.79076087474823,
"learning_rate": 6.965125158269616e-05,
"loss": 0.3553,
"step": 1920
},
{
"epoch": 1.9071146245059287,
"grad_norm": 2.1293234825134277,
"learning_rate": 6.935607606685639e-05,
"loss": 0.3665,
"step": 1930
},
{
"epoch": 1.91699604743083,
"grad_norm": 2.227125883102417,
"learning_rate": 6.906010489854208e-05,
"loss": 0.3753,
"step": 1940
},
{
"epoch": 1.9268774703557312,
"grad_norm": 1.7864975929260254,
"learning_rate": 6.87633502439687e-05,
"loss": 0.3534,
"step": 1950
},
{
"epoch": 1.9268774703557312,
"eval_loss": 0.3949296772480011,
"eval_runtime": 34.1382,
"eval_samples_per_second": 14.646,
"eval_steps_per_second": 14.646,
"step": 1950
},
{
"epoch": 1.9367588932806323,
"grad_norm": 1.8437656164169312,
"learning_rate": 6.84658243015578e-05,
"loss": 0.3605,
"step": 1960
},
{
"epoch": 1.9466403162055337,
"grad_norm": 2.3836982250213623,
"learning_rate": 6.816753930143555e-05,
"loss": 0.3686,
"step": 1970
},
{
"epoch": 1.9565217391304348,
"grad_norm": 2.2831881046295166,
"learning_rate": 6.786850750493004e-05,
"loss": 0.3655,
"step": 1980
},
{
"epoch": 1.966403162055336,
"grad_norm": 2.3932294845581055,
"learning_rate": 6.756874120406713e-05,
"loss": 0.3802,
"step": 1990
},
{
"epoch": 1.9762845849802373,
"grad_norm": 1.7772880792617798,
"learning_rate": 6.726825272106537e-05,
"loss": 0.3454,
"step": 2000
},
{
"epoch": 1.9861660079051382,
"grad_norm": 1.9510533809661865,
"learning_rate": 6.696705440782937e-05,
"loss": 0.3789,
"step": 2010
},
{
"epoch": 1.9960474308300395,
"grad_norm": 2.113067150115967,
"learning_rate": 6.666515864544208e-05,
"loss": 0.3718,
"step": 2020
},
{
"epoch": 2.005928853754941,
"grad_norm": 2.007193088531494,
"learning_rate": 6.636257784365583e-05,
"loss": 0.2785,
"step": 2030
},
{
"epoch": 2.015810276679842,
"grad_norm": 1.9907631874084473,
"learning_rate": 6.605932444038227e-05,
"loss": 0.284,
"step": 2040
},
{
"epoch": 2.025691699604743,
"grad_norm": 2.3482143878936768,
"learning_rate": 6.575541090118102e-05,
"loss": 0.2744,
"step": 2050
},
{
"epoch": 2.035573122529644,
"grad_norm": 2.2984330654144287,
"learning_rate": 6.545084971874736e-05,
"loss": 0.2763,
"step": 2060
},
{
"epoch": 2.0454545454545454,
"grad_norm": 2.089308261871338,
"learning_rate": 6.51456534123986e-05,
"loss": 0.2993,
"step": 2070
},
{
"epoch": 2.0553359683794468,
"grad_norm": 1.9980093240737915,
"learning_rate": 6.483983452755952e-05,
"loss": 0.295,
"step": 2080
},
{
"epoch": 2.0652173913043477,
"grad_norm": 2.138206958770752,
"learning_rate": 6.453340563524668e-05,
"loss": 0.302,
"step": 2090
},
{
"epoch": 2.075098814229249,
"grad_norm": 2.199354887008667,
"learning_rate": 6.422637933155161e-05,
"loss": 0.2791,
"step": 2100
},
{
"epoch": 2.075098814229249,
"eval_loss": 0.4022028148174286,
"eval_runtime": 34.24,
"eval_samples_per_second": 14.603,
"eval_steps_per_second": 14.603,
"step": 2100
},
{
"epoch": 2.0849802371541504,
"grad_norm": 1.7849321365356445,
"learning_rate": 6.391876823712316e-05,
"loss": 0.2882,
"step": 2110
},
{
"epoch": 2.0948616600790513,
"grad_norm": 2.316427230834961,
"learning_rate": 6.361058499664854e-05,
"loss": 0.2893,
"step": 2120
},
{
"epoch": 2.1047430830039526,
"grad_norm": 2.092482328414917,
"learning_rate": 6.330184227833374e-05,
"loss": 0.2851,
"step": 2130
},
{
"epoch": 2.1146245059288535,
"grad_norm": 2.0520172119140625,
"learning_rate": 6.299255277338263e-05,
"loss": 0.2893,
"step": 2140
},
{
"epoch": 2.124505928853755,
"grad_norm": 2.0501887798309326,
"learning_rate": 6.268272919547534e-05,
"loss": 0.2877,
"step": 2150
},
{
"epoch": 2.1343873517786562,
"grad_norm": 1.967375636100769,
"learning_rate": 6.23723842802457e-05,
"loss": 0.2838,
"step": 2160
},
{
"epoch": 2.144268774703557,
"grad_norm": 2.331676959991455,
"learning_rate": 6.20615307847576e-05,
"loss": 0.3169,
"step": 2170
},
{
"epoch": 2.1541501976284585,
"grad_norm": 2.251298666000366,
"learning_rate": 6.175018148698074e-05,
"loss": 0.2882,
"step": 2180
},
{
"epoch": 2.16403162055336,
"grad_norm": 2.0839710235595703,
"learning_rate": 6.143834918526526e-05,
"loss": 0.2862,
"step": 2190
},
{
"epoch": 2.1739130434782608,
"grad_norm": 2.633404493331909,
"learning_rate": 6.112604669781571e-05,
"loss": 0.2815,
"step": 2200
},
{
"epoch": 2.183794466403162,
"grad_norm": 2.089813709259033,
"learning_rate": 6.081328686216416e-05,
"loss": 0.3046,
"step": 2210
},
{
"epoch": 2.1936758893280635,
"grad_norm": 2.1596994400024414,
"learning_rate": 6.050008253464245e-05,
"loss": 0.2834,
"step": 2220
},
{
"epoch": 2.2035573122529644,
"grad_norm": 1.879882574081421,
"learning_rate": 6.018644658985377e-05,
"loss": 0.2797,
"step": 2230
},
{
"epoch": 2.2134387351778657,
"grad_norm": 2.6231043338775635,
"learning_rate": 5.987239192014334e-05,
"loss": 0.2761,
"step": 2240
},
{
"epoch": 2.2233201581027666,
"grad_norm": 2.381791114807129,
"learning_rate": 5.9557931435068606e-05,
"loss": 0.3281,
"step": 2250
},
{
"epoch": 2.2233201581027666,
"eval_loss": 0.40134918689727783,
"eval_runtime": 34.1432,
"eval_samples_per_second": 14.644,
"eval_steps_per_second": 14.644,
"step": 2250
},
{
"epoch": 2.233201581027668,
"grad_norm": 2.037327527999878,
"learning_rate": 5.9243078060868426e-05,
"loss": 0.2772,
"step": 2260
},
{
"epoch": 2.2430830039525693,
"grad_norm": 2.2997586727142334,
"learning_rate": 5.892784473993182e-05,
"loss": 0.2905,
"step": 2270
},
{
"epoch": 2.2529644268774702,
"grad_norm": 1.9744611978530884,
"learning_rate": 5.861224443026593e-05,
"loss": 0.2868,
"step": 2280
},
{
"epoch": 2.2628458498023716,
"grad_norm": 2.116672992706299,
"learning_rate": 5.8296290104963387e-05,
"loss": 0.2858,
"step": 2290
},
{
"epoch": 2.2727272727272725,
"grad_norm": 2.145845890045166,
"learning_rate": 5.797999475166895e-05,
"loss": 0.2752,
"step": 2300
},
{
"epoch": 2.282608695652174,
"grad_norm": 2.2736833095550537,
"learning_rate": 5.766337137204578e-05,
"loss": 0.2984,
"step": 2310
},
{
"epoch": 2.292490118577075,
"grad_norm": 2.218451499938965,
"learning_rate": 5.734643298124089e-05,
"loss": 0.2912,
"step": 2320
},
{
"epoch": 2.302371541501976,
"grad_norm": 1.9524636268615723,
"learning_rate": 5.702919260735013e-05,
"loss": 0.2919,
"step": 2330
},
{
"epoch": 2.3122529644268774,
"grad_norm": 2.35251784324646,
"learning_rate": 5.671166329088276e-05,
"loss": 0.3256,
"step": 2340
},
{
"epoch": 2.322134387351779,
"grad_norm": 2.4238321781158447,
"learning_rate": 5.639385808422529e-05,
"loss": 0.3056,
"step": 2350
},
{
"epoch": 2.3320158102766797,
"grad_norm": 2.408384084701538,
"learning_rate": 5.607579005110501e-05,
"loss": 0.2833,
"step": 2360
},
{
"epoch": 2.341897233201581,
"grad_norm": 2.345621109008789,
"learning_rate": 5.575747226605297e-05,
"loss": 0.2961,
"step": 2370
},
{
"epoch": 2.3517786561264824,
"grad_norm": 2.226508140563965,
"learning_rate": 5.543891781386654e-05,
"loss": 0.3138,
"step": 2380
},
{
"epoch": 2.3616600790513833,
"grad_norm": 2.230583429336548,
"learning_rate": 5.5120139789071554e-05,
"loss": 0.2837,
"step": 2390
},
{
"epoch": 2.3715415019762847,
"grad_norm": 2.449136972427368,
"learning_rate": 5.480115129538408e-05,
"loss": 0.2763,
"step": 2400
},
{
"epoch": 2.3715415019762847,
"eval_loss": 0.399911105632782,
"eval_runtime": 34.0722,
"eval_samples_per_second": 14.675,
"eval_steps_per_second": 14.675,
"step": 2400
},
{
"epoch": 2.3814229249011856,
"grad_norm": 2.04034161567688,
"learning_rate": 5.4481965445171666e-05,
"loss": 0.2889,
"step": 2410
},
{
"epoch": 2.391304347826087,
"grad_norm": 2.589332103729248,
"learning_rate": 5.416259535891445e-05,
"loss": 0.2882,
"step": 2420
},
{
"epoch": 2.4011857707509883,
"grad_norm": 1.8965773582458496,
"learning_rate": 5.384305416466583e-05,
"loss": 0.2664,
"step": 2430
},
{
"epoch": 2.411067193675889,
"grad_norm": 1.9925569295883179,
"learning_rate": 5.3523354997512684e-05,
"loss": 0.3079,
"step": 2440
},
{
"epoch": 2.4209486166007905,
"grad_norm": 1.982309341430664,
"learning_rate": 5.320351099903564e-05,
"loss": 0.2893,
"step": 2450
},
{
"epoch": 2.430830039525692,
"grad_norm": 2.4800615310668945,
"learning_rate": 5.288353531676871e-05,
"loss": 0.2576,
"step": 2460
},
{
"epoch": 2.440711462450593,
"grad_norm": 2.2494561672210693,
"learning_rate": 5.256344110365895e-05,
"loss": 0.292,
"step": 2470
},
{
"epoch": 2.450592885375494,
"grad_norm": 1.8020946979522705,
"learning_rate": 5.224324151752574e-05,
"loss": 0.2863,
"step": 2480
},
{
"epoch": 2.4604743083003955,
"grad_norm": 2.187232494354248,
"learning_rate": 5.192294972051991e-05,
"loss": 0.2753,
"step": 2490
},
{
"epoch": 2.4703557312252964,
"grad_norm": 2.2467732429504395,
"learning_rate": 5.160257887858276e-05,
"loss": 0.3188,
"step": 2500
},
{
"epoch": 2.4802371541501977,
"grad_norm": 1.9890021085739136,
"learning_rate": 5.128214216090477e-05,
"loss": 0.2838,
"step": 2510
},
{
"epoch": 2.4901185770750986,
"grad_norm": 1.962117075920105,
"learning_rate": 5.096165273938434e-05,
"loss": 0.2858,
"step": 2520
},
{
"epoch": 2.5,
"grad_norm": 1.8473106622695923,
"learning_rate": 5.064112378808635e-05,
"loss": 0.2692,
"step": 2530
},
{
"epoch": 2.5098814229249014,
"grad_norm": 1.9031504392623901,
"learning_rate": 5.032056848270054e-05,
"loss": 0.2993,
"step": 2540
},
{
"epoch": 2.5197628458498023,
"grad_norm": 2.2502200603485107,
"learning_rate": 4.999999999999999e-05,
"loss": 0.2696,
"step": 2550
},
{
"epoch": 2.5197628458498023,
"eval_loss": 0.39579418301582336,
"eval_runtime": 34.1604,
"eval_samples_per_second": 14.637,
"eval_steps_per_second": 14.637,
"step": 2550
},
{
"epoch": 2.5296442687747036,
"grad_norm": 2.3031277656555176,
"learning_rate": 4.9679431517299435e-05,
"loss": 0.3062,
"step": 2560
},
{
"epoch": 2.5395256916996045,
"grad_norm": 2.183401107788086,
"learning_rate": 4.9358876211913624e-05,
"loss": 0.325,
"step": 2570
},
{
"epoch": 2.549407114624506,
"grad_norm": 1.9418132305145264,
"learning_rate": 4.9038347260615636e-05,
"loss": 0.2874,
"step": 2580
},
{
"epoch": 2.559288537549407,
"grad_norm": 2.340853214263916,
"learning_rate": 4.871785783909522e-05,
"loss": 0.2914,
"step": 2590
},
{
"epoch": 2.5691699604743086,
"grad_norm": 1.8216912746429443,
"learning_rate": 4.839742112141723e-05,
"loss": 0.2935,
"step": 2600
},
{
"epoch": 2.5790513833992095,
"grad_norm": 2.1227974891662598,
"learning_rate": 4.807705027948006e-05,
"loss": 0.2903,
"step": 2610
},
{
"epoch": 2.588932806324111,
"grad_norm": 2.1689720153808594,
"learning_rate": 4.775675848247426e-05,
"loss": 0.2919,
"step": 2620
},
{
"epoch": 2.5988142292490117,
"grad_norm": 2.3520572185516357,
"learning_rate": 4.7436558896341037e-05,
"loss": 0.2947,
"step": 2630
},
{
"epoch": 2.608695652173913,
"grad_norm": 2.0316853523254395,
"learning_rate": 4.711646468323127e-05,
"loss": 0.2921,
"step": 2640
},
{
"epoch": 2.6185770750988144,
"grad_norm": 2.334075450897217,
"learning_rate": 4.6796489000964345e-05,
"loss": 0.3109,
"step": 2650
},
{
"epoch": 2.6284584980237153,
"grad_norm": 2.4072225093841553,
"learning_rate": 4.6476645002487286e-05,
"loss": 0.2886,
"step": 2660
},
{
"epoch": 2.6383399209486167,
"grad_norm": 2.3423006534576416,
"learning_rate": 4.615694583533417e-05,
"loss": 0.3002,
"step": 2670
},
{
"epoch": 2.6482213438735176,
"grad_norm": 2.290945291519165,
"learning_rate": 4.5837404641085526e-05,
"loss": 0.3013,
"step": 2680
},
{
"epoch": 2.658102766798419,
"grad_norm": 2.3166189193725586,
"learning_rate": 4.551803455482832e-05,
"loss": 0.2855,
"step": 2690
},
{
"epoch": 2.6679841897233203,
"grad_norm": 2.2672386169433594,
"learning_rate": 4.51988487046159e-05,
"loss": 0.2732,
"step": 2700
},
{
"epoch": 2.6679841897233203,
"eval_loss": 0.3915008008480072,
"eval_runtime": 34.1864,
"eval_samples_per_second": 14.626,
"eval_steps_per_second": 14.626,
"step": 2700
},
{
"epoch": 2.677865612648221,
"grad_norm": 2.359931468963623,
"learning_rate": 4.487986021092842e-05,
"loss": 0.3326,
"step": 2710
},
{
"epoch": 2.6877470355731226,
"grad_norm": 1.9921296834945679,
"learning_rate": 4.456108218613345e-05,
"loss": 0.28,
"step": 2720
},
{
"epoch": 2.6976284584980235,
"grad_norm": 2.083142042160034,
"learning_rate": 4.4242527733947024e-05,
"loss": 0.2936,
"step": 2730
},
{
"epoch": 2.707509881422925,
"grad_norm": 2.4226627349853516,
"learning_rate": 4.3924209948894975e-05,
"loss": 0.3011,
"step": 2740
},
{
"epoch": 2.717391304347826,
"grad_norm": 2.4604616165161133,
"learning_rate": 4.360614191577469e-05,
"loss": 0.2702,
"step": 2750
},
{
"epoch": 2.7272727272727275,
"grad_norm": 1.890394926071167,
"learning_rate": 4.3288336709117236e-05,
"loss": 0.2793,
"step": 2760
},
{
"epoch": 2.7371541501976284,
"grad_norm": 2.458721160888672,
"learning_rate": 4.297080739264986e-05,
"loss": 0.2882,
"step": 2770
},
{
"epoch": 2.7470355731225298,
"grad_norm": 2.1499905586242676,
"learning_rate": 4.2653567018759094e-05,
"loss": 0.2955,
"step": 2780
},
{
"epoch": 2.7569169960474307,
"grad_norm": 2.4274818897247314,
"learning_rate": 4.233662862795419e-05,
"loss": 0.302,
"step": 2790
},
{
"epoch": 2.766798418972332,
"grad_norm": 2.062570571899414,
"learning_rate": 4.202000524833104e-05,
"loss": 0.2715,
"step": 2800
},
{
"epoch": 2.7766798418972334,
"grad_norm": 2.0354299545288086,
"learning_rate": 4.170370989503661e-05,
"loss": 0.3038,
"step": 2810
},
{
"epoch": 2.7865612648221343,
"grad_norm": 2.3393170833587646,
"learning_rate": 4.1387755569734046e-05,
"loss": 0.2905,
"step": 2820
},
{
"epoch": 2.7964426877470356,
"grad_norm": 2.224705934524536,
"learning_rate": 4.1072155260068164e-05,
"loss": 0.2989,
"step": 2830
},
{
"epoch": 2.8063241106719365,
"grad_norm": 1.9835401773452759,
"learning_rate": 4.075692193913155e-05,
"loss": 0.2879,
"step": 2840
},
{
"epoch": 2.816205533596838,
"grad_norm": 2.378260850906372,
"learning_rate": 4.0442068564931385e-05,
"loss": 0.3009,
"step": 2850
},
{
"epoch": 2.816205533596838,
"eval_loss": 0.38761380314826965,
"eval_runtime": 34.3575,
"eval_samples_per_second": 14.553,
"eval_steps_per_second": 14.553,
"step": 2850
},
{
"epoch": 2.8260869565217392,
"grad_norm": 2.5100574493408203,
"learning_rate": 4.012760807985664e-05,
"loss": 0.2982,
"step": 2860
},
{
"epoch": 2.83596837944664,
"grad_norm": 1.639036774635315,
"learning_rate": 3.9813553410146214e-05,
"loss": 0.3087,
"step": 2870
},
{
"epoch": 2.8458498023715415,
"grad_norm": 2.490206241607666,
"learning_rate": 3.949991746535752e-05,
"loss": 0.2989,
"step": 2880
},
{
"epoch": 2.8557312252964424,
"grad_norm": 2.3250222206115723,
"learning_rate": 3.918671313783582e-05,
"loss": 0.3034,
"step": 2890
},
{
"epoch": 2.8656126482213438,
"grad_norm": 2.325777292251587,
"learning_rate": 3.8873953302184275e-05,
"loss": 0.2747,
"step": 2900
},
{
"epoch": 2.875494071146245,
"grad_norm": 2.2305283546447754,
"learning_rate": 3.856165081473473e-05,
"loss": 0.2856,
"step": 2910
},
{
"epoch": 2.8853754940711465,
"grad_norm": 2.2349300384521484,
"learning_rate": 3.824981851301923e-05,
"loss": 0.2716,
"step": 2920
},
{
"epoch": 2.8952569169960474,
"grad_norm": 1.9070847034454346,
"learning_rate": 3.793846921524236e-05,
"loss": 0.3053,
"step": 2930
},
{
"epoch": 2.9051383399209487,
"grad_norm": 2.1934654712677,
"learning_rate": 3.7627615719754287e-05,
"loss": 0.3143,
"step": 2940
},
{
"epoch": 2.9150197628458496,
"grad_norm": 1.9646544456481934,
"learning_rate": 3.7317270804524626e-05,
"loss": 0.2865,
"step": 2950
},
{
"epoch": 2.924901185770751,
"grad_norm": 2.232283353805542,
"learning_rate": 3.700744722661735e-05,
"loss": 0.2958,
"step": 2960
},
{
"epoch": 2.9347826086956523,
"grad_norm": 2.3013436794281006,
"learning_rate": 3.669815772166624e-05,
"loss": 0.2834,
"step": 2970
},
{
"epoch": 2.9446640316205532,
"grad_norm": 2.4497499465942383,
"learning_rate": 3.6389415003351434e-05,
"loss": 0.2978,
"step": 2980
},
{
"epoch": 2.9545454545454546,
"grad_norm": 1.9342460632324219,
"learning_rate": 3.608123176287684e-05,
"loss": 0.2826,
"step": 2990
},
{
"epoch": 2.9644268774703555,
"grad_norm": 2.4852263927459717,
"learning_rate": 3.577362066844837e-05,
"loss": 0.2766,
"step": 3000
},
{
"epoch": 2.9644268774703555,
"eval_loss": 0.3866276741027832,
"eval_runtime": 34.3485,
"eval_samples_per_second": 14.557,
"eval_steps_per_second": 14.557,
"step": 3000
},
{
"epoch": 2.974308300395257,
"grad_norm": 1.9710100889205933,
"learning_rate": 3.546659436475331e-05,
"loss": 0.282,
"step": 3010
},
{
"epoch": 2.984189723320158,
"grad_norm": 2.269618511199951,
"learning_rate": 3.516016547244046e-05,
"loss": 0.2974,
"step": 3020
},
{
"epoch": 2.9940711462450595,
"grad_norm": 2.038463830947876,
"learning_rate": 3.485434658760139e-05,
"loss": 0.2931,
"step": 3030
},
{
"epoch": 3.0039525691699605,
"grad_norm": 1.9927372932434082,
"learning_rate": 3.454915028125262e-05,
"loss": 0.2778,
"step": 3040
},
{
"epoch": 3.013833992094862,
"grad_norm": 2.1115493774414062,
"learning_rate": 3.424458909881896e-05,
"loss": 0.228,
"step": 3050
},
{
"epoch": 3.0237154150197627,
"grad_norm": 2.1515655517578125,
"learning_rate": 3.394067555961772e-05,
"loss": 0.2172,
"step": 3060
},
{
"epoch": 3.033596837944664,
"grad_norm": 2.6619913578033447,
"learning_rate": 3.3637422156344146e-05,
"loss": 0.2158,
"step": 3070
},
{
"epoch": 3.0434782608695654,
"grad_norm": 2.441153049468994,
"learning_rate": 3.333484135455791e-05,
"loss": 0.2305,
"step": 3080
},
{
"epoch": 3.0533596837944663,
"grad_norm": 1.8996953964233398,
"learning_rate": 3.3032945592170616e-05,
"loss": 0.2436,
"step": 3090
},
{
"epoch": 3.0632411067193677,
"grad_norm": 2.410764217376709,
"learning_rate": 3.2731747278934616e-05,
"loss": 0.2198,
"step": 3100
},
{
"epoch": 3.0731225296442686,
"grad_norm": 1.9897364377975464,
"learning_rate": 3.243125879593285e-05,
"loss": 0.2309,
"step": 3110
},
{
"epoch": 3.08300395256917,
"grad_norm": 2.573441982269287,
"learning_rate": 3.213149249506996e-05,
"loss": 0.213,
"step": 3120
},
{
"epoch": 3.0928853754940713,
"grad_norm": 2.5531418323516846,
"learning_rate": 3.1832460698564424e-05,
"loss": 0.234,
"step": 3130
},
{
"epoch": 3.102766798418972,
"grad_norm": 2.101729154586792,
"learning_rate": 3.1534175698442184e-05,
"loss": 0.239,
"step": 3140
},
{
"epoch": 3.1126482213438735,
"grad_norm": 1.9485621452331543,
"learning_rate": 3.123664975603129e-05,
"loss": 0.2145,
"step": 3150
},
{
"epoch": 3.1126482213438735,
"eval_loss": 0.40044403076171875,
"eval_runtime": 34.1191,
"eval_samples_per_second": 14.655,
"eval_steps_per_second": 14.655,
"step": 3150
},
{
"epoch": 3.122529644268775,
"grad_norm": 2.287511110305786,
"learning_rate": 3.093989510145791e-05,
"loss": 0.2225,
"step": 3160
},
{
"epoch": 3.132411067193676,
"grad_norm": 2.047910451889038,
"learning_rate": 3.064392393314359e-05,
"loss": 0.2178,
"step": 3170
},
{
"epoch": 3.142292490118577,
"grad_norm": 1.6208367347717285,
"learning_rate": 3.0348748417303817e-05,
"loss": 0.222,
"step": 3180
},
{
"epoch": 3.1521739130434785,
"grad_norm": 2.4111440181732178,
"learning_rate": 3.005438068744791e-05,
"loss": 0.2177,
"step": 3190
},
{
"epoch": 3.1620553359683794,
"grad_norm": 2.368447780609131,
"learning_rate": 2.9760832843880303e-05,
"loss": 0.2012,
"step": 3200
},
{
"epoch": 3.1719367588932808,
"grad_norm": 2.473605155944824,
"learning_rate": 2.94681169532031e-05,
"loss": 0.2277,
"step": 3210
},
{
"epoch": 3.1818181818181817,
"grad_norm": 2.02504301071167,
"learning_rate": 2.9176245047820055e-05,
"loss": 0.2193,
"step": 3220
},
{
"epoch": 3.191699604743083,
"grad_norm": 2.4214389324188232,
"learning_rate": 2.8885229125442014e-05,
"loss": 0.2196,
"step": 3230
},
{
"epoch": 3.2015810276679844,
"grad_norm": 2.1690824031829834,
"learning_rate": 2.859508114859373e-05,
"loss": 0.2267,
"step": 3240
},
{
"epoch": 3.2114624505928853,
"grad_norm": 1.960707426071167,
"learning_rate": 2.830581304412209e-05,
"loss": 0.2409,
"step": 3250
},
{
"epoch": 3.2213438735177866,
"grad_norm": 2.320850133895874,
"learning_rate": 2.8017436702705894e-05,
"loss": 0.2245,
"step": 3260
},
{
"epoch": 3.2312252964426875,
"grad_norm": 2.268925428390503,
"learning_rate": 2.7729963978367035e-05,
"loss": 0.2373,
"step": 3270
},
{
"epoch": 3.241106719367589,
"grad_norm": 2.374448776245117,
"learning_rate": 2.7443406687983255e-05,
"loss": 0.2188,
"step": 3280
},
{
"epoch": 3.2509881422924902,
"grad_norm": 1.9990499019622803,
"learning_rate": 2.7157776610802408e-05,
"loss": 0.253,
"step": 3290
},
{
"epoch": 3.260869565217391,
"grad_norm": 2.6509578227996826,
"learning_rate": 2.6873085487958243e-05,
"loss": 0.2471,
"step": 3300
},
{
"epoch": 3.260869565217391,
"eval_loss": 0.4012451171875,
"eval_runtime": 34.3577,
"eval_samples_per_second": 14.553,
"eval_steps_per_second": 14.553,
"step": 3300
},
{
"epoch": 3.2707509881422925,
"grad_norm": 2.4065935611724854,
"learning_rate": 2.6589345021987714e-05,
"loss": 0.2455,
"step": 3310
},
{
"epoch": 3.280632411067194,
"grad_norm": 2.315992593765259,
"learning_rate": 2.6306566876350062e-05,
"loss": 0.2137,
"step": 3320
},
{
"epoch": 3.2905138339920947,
"grad_norm": 2.162156820297241,
"learning_rate": 2.6024762674947306e-05,
"loss": 0.2075,
"step": 3330
},
{
"epoch": 3.300395256916996,
"grad_norm": 2.3874671459198,
"learning_rate": 2.5743944001646384e-05,
"loss": 0.2452,
"step": 3340
},
{
"epoch": 3.3102766798418974,
"grad_norm": 2.3413357734680176,
"learning_rate": 2.5464122399803118e-05,
"loss": 0.2491,
"step": 3350
},
{
"epoch": 3.3201581027667983,
"grad_norm": 2.0517518520355225,
"learning_rate": 2.5185309371787506e-05,
"loss": 0.2304,
"step": 3360
},
{
"epoch": 3.3300395256916997,
"grad_norm": 2.5444791316986084,
"learning_rate": 2.490751637851113e-05,
"loss": 0.2252,
"step": 3370
},
{
"epoch": 3.3399209486166006,
"grad_norm": 3.271428108215332,
"learning_rate": 2.4630754838955894e-05,
"loss": 0.2221,
"step": 3380
},
{
"epoch": 3.349802371541502,
"grad_norm": 2.159346103668213,
"learning_rate": 2.4355036129704693e-05,
"loss": 0.2355,
"step": 3390
},
{
"epoch": 3.3596837944664033,
"grad_norm": 1.997672438621521,
"learning_rate": 2.408037158447374e-05,
"loss": 0.2128,
"step": 3400
},
{
"epoch": 3.369565217391304,
"grad_norm": 1.8913171291351318,
"learning_rate": 2.3806772493646716e-05,
"loss": 0.2133,
"step": 3410
},
{
"epoch": 3.3794466403162056,
"grad_norm": 2.4527885913848877,
"learning_rate": 2.3534250103810622e-05,
"loss": 0.2171,
"step": 3420
},
{
"epoch": 3.3893280632411065,
"grad_norm": 2.4104301929473877,
"learning_rate": 2.326281561729351e-05,
"loss": 0.2268,
"step": 3430
},
{
"epoch": 3.399209486166008,
"grad_norm": 1.9531340599060059,
"learning_rate": 2.2992480191703996e-05,
"loss": 0.2158,
"step": 3440
},
{
"epoch": 3.409090909090909,
"grad_norm": 1.6948319673538208,
"learning_rate": 2.2723254939472564e-05,
"loss": 0.2223,
"step": 3450
},
{
"epoch": 3.409090909090909,
"eval_loss": 0.40440815687179565,
"eval_runtime": 34.4399,
"eval_samples_per_second": 14.518,
"eval_steps_per_second": 14.518,
"step": 3450
},
{
"epoch": 3.4189723320158105,
"grad_norm": 2.2849342823028564,
"learning_rate": 2.2455150927394874e-05,
"loss": 0.2354,
"step": 3460
},
{
"epoch": 3.4288537549407114,
"grad_norm": 2.416132688522339,
"learning_rate": 2.218817917617676e-05,
"loss": 0.2322,
"step": 3470
},
{
"epoch": 3.438735177865613,
"grad_norm": 2.2576396465301514,
"learning_rate": 2.1922350659981254e-05,
"loss": 0.2366,
"step": 3480
},
{
"epoch": 3.4486166007905137,
"grad_norm": 2.38684344291687,
"learning_rate": 2.1657676305977515e-05,
"loss": 0.2322,
"step": 3490
},
{
"epoch": 3.458498023715415,
"grad_norm": 2.6015143394470215,
"learning_rate": 2.1394166993891523e-05,
"loss": 0.2412,
"step": 3500
},
{
"epoch": 3.4683794466403164,
"grad_norm": 2.134091854095459,
"learning_rate": 2.1131833555559034e-05,
"loss": 0.2176,
"step": 3510
},
{
"epoch": 3.4782608695652173,
"grad_norm": 2.0862109661102295,
"learning_rate": 2.0870686774480193e-05,
"loss": 0.212,
"step": 3520
},
{
"epoch": 3.4881422924901186,
"grad_norm": 2.139421224594116,
"learning_rate": 2.0610737385376345e-05,
"loss": 0.2398,
"step": 3530
},
{
"epoch": 3.4980237154150196,
"grad_norm": 2.5852270126342773,
"learning_rate": 2.035199607374871e-05,
"loss": 0.2417,
"step": 3540
},
{
"epoch": 3.507905138339921,
"grad_norm": 2.2502431869506836,
"learning_rate": 2.0094473475439195e-05,
"loss": 0.2277,
"step": 3550
},
{
"epoch": 3.5177865612648223,
"grad_norm": 1.9478706121444702,
"learning_rate": 1.983818017619317e-05,
"loss": 0.247,
"step": 3560
},
{
"epoch": 3.527667984189723,
"grad_norm": 2.4664876461029053,
"learning_rate": 1.9583126711224336e-05,
"loss": 0.2398,
"step": 3570
},
{
"epoch": 3.5375494071146245,
"grad_norm": 2.0479726791381836,
"learning_rate": 1.9329323564781675e-05,
"loss": 0.2291,
"step": 3580
},
{
"epoch": 3.5474308300395254,
"grad_norm": 1.9972285032272339,
"learning_rate": 1.907678116971842e-05,
"loss": 0.2287,
"step": 3590
},
{
"epoch": 3.5573122529644268,
"grad_norm": 2.7339351177215576,
"learning_rate": 1.882550990706332e-05,
"loss": 0.2329,
"step": 3600
},
{
"epoch": 3.5573122529644268,
"eval_loss": 0.39887359738349915,
"eval_runtime": 34.1894,
"eval_samples_per_second": 14.624,
"eval_steps_per_second": 14.624,
"step": 3600
},
{
"epoch": 3.567193675889328,
"grad_norm": 2.279087781906128,
"learning_rate": 1.8575520105593814e-05,
"loss": 0.2165,
"step": 3610
},
{
"epoch": 3.5770750988142295,
"grad_norm": 2.1736888885498047,
"learning_rate": 1.8326822041411518e-05,
"loss": 0.2408,
"step": 3620
},
{
"epoch": 3.5869565217391304,
"grad_norm": 2.6511847972869873,
"learning_rate": 1.8079425937519722e-05,
"loss": 0.2075,
"step": 3630
},
{
"epoch": 3.5968379446640317,
"grad_norm": 2.688720226287842,
"learning_rate": 1.7833341963403307e-05,
"loss": 0.2257,
"step": 3640
},
{
"epoch": 3.6067193675889326,
"grad_norm": 2.398272752761841,
"learning_rate": 1.7588580234610588e-05,
"loss": 0.2248,
"step": 3650
},
{
"epoch": 3.616600790513834,
"grad_norm": 2.060293197631836,
"learning_rate": 1.7345150812337557e-05,
"loss": 0.2491,
"step": 3660
},
{
"epoch": 3.6264822134387353,
"grad_norm": 2.143730401992798,
"learning_rate": 1.7103063703014366e-05,
"loss": 0.2194,
"step": 3670
},
{
"epoch": 3.6363636363636362,
"grad_norm": 2.20381760597229,
"learning_rate": 1.686232885789385e-05,
"loss": 0.2299,
"step": 3680
},
{
"epoch": 3.6462450592885376,
"grad_norm": 2.0883138179779053,
"learning_rate": 1.6622956172642597e-05,
"loss": 0.2377,
"step": 3690
},
{
"epoch": 3.6561264822134385,
"grad_norm": 2.045193672180176,
"learning_rate": 1.6384955486934152e-05,
"loss": 0.2105,
"step": 3700
},
{
"epoch": 3.66600790513834,
"grad_norm": 1.9251530170440674,
"learning_rate": 1.6148336584044533e-05,
"loss": 0.2149,
"step": 3710
},
{
"epoch": 3.675889328063241,
"grad_norm": 2.162121534347534,
"learning_rate": 1.591310919045003e-05,
"loss": 0.2359,
"step": 3720
},
{
"epoch": 3.6857707509881426,
"grad_norm": 2.423417329788208,
"learning_rate": 1.5679282975427484e-05,
"loss": 0.2305,
"step": 3730
},
{
"epoch": 3.6956521739130435,
"grad_norm": 2.2101898193359375,
"learning_rate": 1.5446867550656765e-05,
"loss": 0.2174,
"step": 3740
},
{
"epoch": 3.705533596837945,
"grad_norm": 2.565885066986084,
"learning_rate": 1.5215872469825677e-05,
"loss": 0.2184,
"step": 3750
},
{
"epoch": 3.705533596837945,
"eval_loss": 0.39941611886024475,
"eval_runtime": 34.6521,
"eval_samples_per_second": 14.429,
"eval_steps_per_second": 14.429,
"step": 3750
},
{
"epoch": 3.7154150197628457,
"grad_norm": 2.2909016609191895,
"learning_rate": 1.4986307228237263e-05,
"loss": 0.2328,
"step": 3760
},
{
"epoch": 3.725296442687747,
"grad_norm": 2.3509652614593506,
"learning_rate": 1.475818126241942e-05,
"loss": 0.2126,
"step": 3770
},
{
"epoch": 3.7351778656126484,
"grad_norm": 2.112107515335083,
"learning_rate": 1.4531503949737103e-05,
"loss": 0.2093,
"step": 3780
},
{
"epoch": 3.7450592885375493,
"grad_norm": 2.65423846244812,
"learning_rate": 1.4306284608006833e-05,
"loss": 0.2398,
"step": 3790
},
{
"epoch": 3.7549407114624507,
"grad_norm": 2.521843194961548,
"learning_rate": 1.4082532495113623e-05,
"loss": 0.2215,
"step": 3800
},
{
"epoch": 3.7648221343873516,
"grad_norm": 1.9202762842178345,
"learning_rate": 1.3860256808630425e-05,
"loss": 0.2201,
"step": 3810
},
{
"epoch": 3.774703557312253,
"grad_norm": 2.7013628482818604,
"learning_rate": 1.3639466685440129e-05,
"loss": 0.2446,
"step": 3820
},
{
"epoch": 3.7845849802371543,
"grad_norm": 1.9625245332717896,
"learning_rate": 1.3420171201359928e-05,
"loss": 0.2197,
"step": 3830
},
{
"epoch": 3.794466403162055,
"grad_norm": 2.7485172748565674,
"learning_rate": 1.3202379370768249e-05,
"loss": 0.222,
"step": 3840
},
{
"epoch": 3.8043478260869565,
"grad_norm": 2.577033281326294,
"learning_rate": 1.2986100146234227e-05,
"loss": 0.2217,
"step": 3850
},
{
"epoch": 3.8142292490118574,
"grad_norm": 2.4234845638275146,
"learning_rate": 1.2771342418149653e-05,
"loss": 0.2205,
"step": 3860
},
{
"epoch": 3.824110671936759,
"grad_norm": 2.127350330352783,
"learning_rate": 1.2558115014363589e-05,
"loss": 0.2076,
"step": 3870
},
{
"epoch": 3.83399209486166,
"grad_norm": 2.254338502883911,
"learning_rate": 1.2346426699819455e-05,
"loss": 0.2408,
"step": 3880
},
{
"epoch": 3.8438735177865615,
"grad_norm": 2.1729793548583984,
"learning_rate": 1.2136286176194741e-05,
"loss": 0.2253,
"step": 3890
},
{
"epoch": 3.8537549407114624,
"grad_norm": 2.07226824760437,
"learning_rate": 1.1927702081543275e-05,
"loss": 0.2332,
"step": 3900
},
{
"epoch": 3.8537549407114624,
"eval_loss": 0.39704427123069763,
"eval_runtime": 34.4753,
"eval_samples_per_second": 14.503,
"eval_steps_per_second": 14.503,
"step": 3900
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.436859427356672e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}