{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.985172981878089,
"eval_steps": 500,
"global_step": 453,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006589785831960461,
"grad_norm": 57166.33984375,
"learning_rate": 0.0,
"loss": 0.9208,
"step": 1
},
{
"epoch": 0.013179571663920923,
"grad_norm": 24538.927734375,
"learning_rate": 1.0869565217391306e-06,
"loss": 0.7281,
"step": 2
},
{
"epoch": 0.019769357495881382,
"grad_norm": 15162.59375,
"learning_rate": 2.173913043478261e-06,
"loss": 0.7891,
"step": 3
},
{
"epoch": 0.026359143327841845,
"grad_norm": 19609.22265625,
"learning_rate": 3.2608695652173914e-06,
"loss": 0.7988,
"step": 4
},
{
"epoch": 0.032948929159802305,
"grad_norm": 36443.21484375,
"learning_rate": 4.347826086956522e-06,
"loss": 0.7955,
"step": 5
},
{
"epoch": 0.039538714991762765,
"grad_norm": 5708.90673828125,
"learning_rate": 5.4347826086956525e-06,
"loss": 0.756,
"step": 6
},
{
"epoch": 0.04612850082372323,
"grad_norm": 16939.138671875,
"learning_rate": 6.521739130434783e-06,
"loss": 0.8382,
"step": 7
},
{
"epoch": 0.05271828665568369,
"grad_norm": 9426.6240234375,
"learning_rate": 7.608695652173914e-06,
"loss": 0.8287,
"step": 8
},
{
"epoch": 0.05930807248764415,
"grad_norm": 35329.375,
"learning_rate": 8.695652173913044e-06,
"loss": 0.7193,
"step": 9
},
{
"epoch": 0.06589785831960461,
"grad_norm": 25405.27734375,
"learning_rate": 9.782608695652175e-06,
"loss": 0.7123,
"step": 10
},
{
"epoch": 0.07248764415156507,
"grad_norm": 39777.97265625,
"learning_rate": 1.0869565217391305e-05,
"loss": 0.8391,
"step": 11
},
{
"epoch": 0.07907742998352553,
"grad_norm": 32360.591796875,
"learning_rate": 1.1956521739130435e-05,
"loss": 0.7638,
"step": 12
},
{
"epoch": 0.085667215815486,
"grad_norm": 20279.001953125,
"learning_rate": 1.3043478260869566e-05,
"loss": 0.8044,
"step": 13
},
{
"epoch": 0.09225700164744646,
"grad_norm": 21474.5,
"learning_rate": 1.4130434782608694e-05,
"loss": 0.6984,
"step": 14
},
{
"epoch": 0.09884678747940692,
"grad_norm": 24501.21484375,
"learning_rate": 1.5217391304347828e-05,
"loss": 0.7501,
"step": 15
},
{
"epoch": 0.10543657331136738,
"grad_norm": 16850.5625,
"learning_rate": 1.630434782608696e-05,
"loss": 0.7073,
"step": 16
},
{
"epoch": 0.11202635914332784,
"grad_norm": 18344.18359375,
"learning_rate": 1.739130434782609e-05,
"loss": 0.7623,
"step": 17
},
{
"epoch": 0.1186161449752883,
"grad_norm": 7098.125,
"learning_rate": 1.8478260869565216e-05,
"loss": 0.6343,
"step": 18
},
{
"epoch": 0.12520593080724876,
"grad_norm": 15517.8115234375,
"learning_rate": 1.956521739130435e-05,
"loss": 0.6598,
"step": 19
},
{
"epoch": 0.13179571663920922,
"grad_norm": 111118.3046875,
"learning_rate": 2.065217391304348e-05,
"loss": 0.6546,
"step": 20
},
{
"epoch": 0.13838550247116968,
"grad_norm": 22035.267578125,
"learning_rate": 2.173913043478261e-05,
"loss": 0.6148,
"step": 21
},
{
"epoch": 0.14497528830313014,
"grad_norm": 21264.662109375,
"learning_rate": 2.282608695652174e-05,
"loss": 0.6115,
"step": 22
},
{
"epoch": 0.1515650741350906,
"grad_norm": 10420.123046875,
"learning_rate": 2.391304347826087e-05,
"loss": 0.6716,
"step": 23
},
{
"epoch": 0.15815485996705106,
"grad_norm": 31856.8828125,
"learning_rate": 2.5e-05,
"loss": 0.6439,
"step": 24
},
{
"epoch": 0.16474464579901152,
"grad_norm": 33552.83984375,
"learning_rate": 2.608695652173913e-05,
"loss": 0.6781,
"step": 25
},
{
"epoch": 0.171334431630972,
"grad_norm": 8155.12451171875,
"learning_rate": 2.7173913043478262e-05,
"loss": 0.6042,
"step": 26
},
{
"epoch": 0.17792421746293247,
"grad_norm": 11283.130859375,
"learning_rate": 2.826086956521739e-05,
"loss": 0.6639,
"step": 27
},
{
"epoch": 0.18451400329489293,
"grad_norm": 56492.08984375,
"learning_rate": 2.9347826086956526e-05,
"loss": 0.6198,
"step": 28
},
{
"epoch": 0.19110378912685339,
"grad_norm": 36276.30859375,
"learning_rate": 3.0434782608695656e-05,
"loss": 0.7941,
"step": 29
},
{
"epoch": 0.19769357495881384,
"grad_norm": 16123.35546875,
"learning_rate": 3.152173913043479e-05,
"loss": 0.6976,
"step": 30
},
{
"epoch": 0.2042833607907743,
"grad_norm": 3790.1572265625,
"learning_rate": 3.260869565217392e-05,
"loss": 0.6401,
"step": 31
},
{
"epoch": 0.21087314662273476,
"grad_norm": 20178.978515625,
"learning_rate": 3.369565217391305e-05,
"loss": 0.7029,
"step": 32
},
{
"epoch": 0.21746293245469522,
"grad_norm": 12479.48046875,
"learning_rate": 3.478260869565218e-05,
"loss": 0.7024,
"step": 33
},
{
"epoch": 0.22405271828665568,
"grad_norm": 17013.83984375,
"learning_rate": 3.58695652173913e-05,
"loss": 0.7124,
"step": 34
},
{
"epoch": 0.23064250411861614,
"grad_norm": 8563.2705078125,
"learning_rate": 3.695652173913043e-05,
"loss": 0.6505,
"step": 35
},
{
"epoch": 0.2372322899505766,
"grad_norm": 8254.0205078125,
"learning_rate": 3.804347826086957e-05,
"loss": 0.6654,
"step": 36
},
{
"epoch": 0.24382207578253706,
"grad_norm": 49902.2890625,
"learning_rate": 3.91304347826087e-05,
"loss": 0.6191,
"step": 37
},
{
"epoch": 0.2504118616144975,
"grad_norm": 18592.572265625,
"learning_rate": 4.021739130434783e-05,
"loss": 0.6082,
"step": 38
},
{
"epoch": 0.257001647446458,
"grad_norm": 25148.34375,
"learning_rate": 4.130434782608696e-05,
"loss": 0.7318,
"step": 39
},
{
"epoch": 0.26359143327841844,
"grad_norm": 11293.0634765625,
"learning_rate": 4.239130434782609e-05,
"loss": 0.6075,
"step": 40
},
{
"epoch": 0.2701812191103789,
"grad_norm": 10365.61328125,
"learning_rate": 4.347826086956522e-05,
"loss": 0.5871,
"step": 41
},
{
"epoch": 0.27677100494233936,
"grad_norm": 6548.3740234375,
"learning_rate": 4.456521739130435e-05,
"loss": 0.5931,
"step": 42
},
{
"epoch": 0.2833607907742998,
"grad_norm": 17649.12109375,
"learning_rate": 4.565217391304348e-05,
"loss": 0.7389,
"step": 43
},
{
"epoch": 0.2899505766062603,
"grad_norm": 14737.7197265625,
"learning_rate": 4.673913043478261e-05,
"loss": 0.5972,
"step": 44
},
{
"epoch": 0.29654036243822074,
"grad_norm": 9264.072265625,
"learning_rate": 4.782608695652174e-05,
"loss": 0.6256,
"step": 45
},
{
"epoch": 0.3031301482701812,
"grad_norm": 86530.90625,
"learning_rate": 4.891304347826087e-05,
"loss": 0.6694,
"step": 46
},
{
"epoch": 0.30971993410214166,
"grad_norm": 10428.66015625,
"learning_rate": 5e-05,
"loss": 0.6411,
"step": 47
},
{
"epoch": 0.3163097199341021,
"grad_norm": 35924.46484375,
"learning_rate": 4.987714987714988e-05,
"loss": 0.7293,
"step": 48
},
{
"epoch": 0.3228995057660626,
"grad_norm": 9777.244140625,
"learning_rate": 4.9754299754299756e-05,
"loss": 0.6293,
"step": 49
},
{
"epoch": 0.32948929159802304,
"grad_norm": 19732.7421875,
"learning_rate": 4.963144963144963e-05,
"loss": 0.686,
"step": 50
},
{
"epoch": 0.33607907742998355,
"grad_norm": 35202.5234375,
"learning_rate": 4.950859950859951e-05,
"loss": 0.6859,
"step": 51
},
{
"epoch": 0.342668863261944,
"grad_norm": 12520.201171875,
"learning_rate": 4.9385749385749387e-05,
"loss": 0.6202,
"step": 52
},
{
"epoch": 0.34925864909390447,
"grad_norm": 34133.8671875,
"learning_rate": 4.926289926289926e-05,
"loss": 0.6566,
"step": 53
},
{
"epoch": 0.35584843492586493,
"grad_norm": 30663.423828125,
"learning_rate": 4.914004914004915e-05,
"loss": 0.607,
"step": 54
},
{
"epoch": 0.3624382207578254,
"grad_norm": 33234.12109375,
"learning_rate": 4.901719901719902e-05,
"loss": 0.5683,
"step": 55
},
{
"epoch": 0.36902800658978585,
"grad_norm": 11093.591796875,
"learning_rate": 4.8894348894348894e-05,
"loss": 0.6449,
"step": 56
},
{
"epoch": 0.3756177924217463,
"grad_norm": 10510.125,
"learning_rate": 4.877149877149878e-05,
"loss": 0.6932,
"step": 57
},
{
"epoch": 0.38220757825370677,
"grad_norm": 76218.2109375,
"learning_rate": 4.8648648648648654e-05,
"loss": 0.681,
"step": 58
},
{
"epoch": 0.38879736408566723,
"grad_norm": 25486.68359375,
"learning_rate": 4.8525798525798524e-05,
"loss": 0.577,
"step": 59
},
{
"epoch": 0.3953871499176277,
"grad_norm": 39751.8671875,
"learning_rate": 4.840294840294841e-05,
"loss": 0.6199,
"step": 60
},
{
"epoch": 0.40197693574958815,
"grad_norm": 14526.541015625,
"learning_rate": 4.8280098280098285e-05,
"loss": 0.5628,
"step": 61
},
{
"epoch": 0.4085667215815486,
"grad_norm": 33324.5625,
"learning_rate": 4.8157248157248155e-05,
"loss": 0.7594,
"step": 62
},
{
"epoch": 0.41515650741350907,
"grad_norm": 13729.7373046875,
"learning_rate": 4.803439803439804e-05,
"loss": 0.6227,
"step": 63
},
{
"epoch": 0.42174629324546953,
"grad_norm": 34946.58203125,
"learning_rate": 4.7911547911547915e-05,
"loss": 0.6377,
"step": 64
},
{
"epoch": 0.42833607907743,
"grad_norm": 7339.2587890625,
"learning_rate": 4.778869778869779e-05,
"loss": 0.7474,
"step": 65
},
{
"epoch": 0.43492586490939045,
"grad_norm": 61978.32421875,
"learning_rate": 4.766584766584767e-05,
"loss": 0.7136,
"step": 66
},
{
"epoch": 0.4415156507413509,
"grad_norm": 18481.29296875,
"learning_rate": 4.7542997542997546e-05,
"loss": 0.64,
"step": 67
},
{
"epoch": 0.44810543657331137,
"grad_norm": 11941.44140625,
"learning_rate": 4.742014742014742e-05,
"loss": 0.6273,
"step": 68
},
{
"epoch": 0.4546952224052718,
"grad_norm": 70506.828125,
"learning_rate": 4.72972972972973e-05,
"loss": 0.7494,
"step": 69
},
{
"epoch": 0.4612850082372323,
"grad_norm": 8327.736328125,
"learning_rate": 4.7174447174447176e-05,
"loss": 0.6674,
"step": 70
},
{
"epoch": 0.46787479406919275,
"grad_norm": 13985.94140625,
"learning_rate": 4.705159705159705e-05,
"loss": 0.6474,
"step": 71
},
{
"epoch": 0.4744645799011532,
"grad_norm": 9354.6904296875,
"learning_rate": 4.692874692874693e-05,
"loss": 0.663,
"step": 72
},
{
"epoch": 0.48105436573311366,
"grad_norm": 30927.59765625,
"learning_rate": 4.680589680589681e-05,
"loss": 0.6372,
"step": 73
},
{
"epoch": 0.4876441515650741,
"grad_norm": 32827.7109375,
"learning_rate": 4.6683046683046684e-05,
"loss": 0.623,
"step": 74
},
{
"epoch": 0.4942339373970346,
"grad_norm": 30631.48046875,
"learning_rate": 4.656019656019656e-05,
"loss": 0.6329,
"step": 75
},
{
"epoch": 0.500823723228995,
"grad_norm": 23105.517578125,
"learning_rate": 4.6437346437346444e-05,
"loss": 0.719,
"step": 76
},
{
"epoch": 0.5074135090609555,
"grad_norm": 21652.029296875,
"learning_rate": 4.6314496314496314e-05,
"loss": 0.7059,
"step": 77
},
{
"epoch": 0.514003294892916,
"grad_norm": 15986.599609375,
"learning_rate": 4.619164619164619e-05,
"loss": 0.6683,
"step": 78
},
{
"epoch": 0.5205930807248764,
"grad_norm": 20643.82421875,
"learning_rate": 4.6068796068796074e-05,
"loss": 0.7123,
"step": 79
},
{
"epoch": 0.5271828665568369,
"grad_norm": 24529.80078125,
"learning_rate": 4.594594594594595e-05,
"loss": 0.6836,
"step": 80
},
{
"epoch": 0.5337726523887973,
"grad_norm": 19665.833984375,
"learning_rate": 4.582309582309582e-05,
"loss": 0.6292,
"step": 81
},
{
"epoch": 0.5403624382207578,
"grad_norm": 35243.45703125,
"learning_rate": 4.5700245700245705e-05,
"loss": 0.6891,
"step": 82
},
{
"epoch": 0.5469522240527183,
"grad_norm": 15323.2197265625,
"learning_rate": 4.557739557739558e-05,
"loss": 0.6287,
"step": 83
},
{
"epoch": 0.5535420098846787,
"grad_norm": 21972.83203125,
"learning_rate": 4.545454545454546e-05,
"loss": 0.676,
"step": 84
},
{
"epoch": 0.5601317957166392,
"grad_norm": 38221.46484375,
"learning_rate": 4.5331695331695335e-05,
"loss": 0.7015,
"step": 85
},
{
"epoch": 0.5667215815485996,
"grad_norm": 17371.90625,
"learning_rate": 4.520884520884521e-05,
"loss": 0.6665,
"step": 86
},
{
"epoch": 0.5733113673805601,
"grad_norm": 23913.32421875,
"learning_rate": 4.508599508599509e-05,
"loss": 0.6641,
"step": 87
},
{
"epoch": 0.5799011532125206,
"grad_norm": 14188.1494140625,
"learning_rate": 4.4963144963144966e-05,
"loss": 0.6153,
"step": 88
},
{
"epoch": 0.586490939044481,
"grad_norm": 15268.400390625,
"learning_rate": 4.484029484029484e-05,
"loss": 0.7959,
"step": 89
},
{
"epoch": 0.5930807248764415,
"grad_norm": 85963.09375,
"learning_rate": 4.471744471744472e-05,
"loss": 0.7464,
"step": 90
},
{
"epoch": 0.5996705107084019,
"grad_norm": 48107.12890625,
"learning_rate": 4.4594594594594596e-05,
"loss": 0.6248,
"step": 91
},
{
"epoch": 0.6062602965403624,
"grad_norm": 32619.958984375,
"learning_rate": 4.447174447174447e-05,
"loss": 0.6176,
"step": 92
},
{
"epoch": 0.6128500823723229,
"grad_norm": 13193.05078125,
"learning_rate": 4.434889434889435e-05,
"loss": 0.7016,
"step": 93
},
{
"epoch": 0.6194398682042833,
"grad_norm": 242728.328125,
"learning_rate": 4.422604422604423e-05,
"loss": 0.6053,
"step": 94
},
{
"epoch": 0.6260296540362438,
"grad_norm": 13395.2021484375,
"learning_rate": 4.4103194103194104e-05,
"loss": 0.6552,
"step": 95
},
{
"epoch": 0.6326194398682042,
"grad_norm": 26586.83984375,
"learning_rate": 4.398034398034398e-05,
"loss": 0.6451,
"step": 96
},
{
"epoch": 0.6392092257001647,
"grad_norm": 20530.763671875,
"learning_rate": 4.385749385749386e-05,
"loss": 0.6142,
"step": 97
},
{
"epoch": 0.6457990115321252,
"grad_norm": 120099.6875,
"learning_rate": 4.373464373464374e-05,
"loss": 0.7366,
"step": 98
},
{
"epoch": 0.6523887973640856,
"grad_norm": 32992.8359375,
"learning_rate": 4.361179361179362e-05,
"loss": 0.6341,
"step": 99
},
{
"epoch": 0.6589785831960461,
"grad_norm": 114159.0625,
"learning_rate": 4.348894348894349e-05,
"loss": 0.6375,
"step": 100
},
{
"epoch": 0.6655683690280065,
"grad_norm": 59862.4453125,
"learning_rate": 4.336609336609337e-05,
"loss": 0.642,
"step": 101
},
{
"epoch": 0.6721581548599671,
"grad_norm": 22114.5078125,
"learning_rate": 4.324324324324325e-05,
"loss": 0.6883,
"step": 102
},
{
"epoch": 0.6787479406919276,
"grad_norm": 63335.8203125,
"learning_rate": 4.312039312039312e-05,
"loss": 0.695,
"step": 103
},
{
"epoch": 0.685337726523888,
"grad_norm": 8014.02001953125,
"learning_rate": 4.2997542997543e-05,
"loss": 0.668,
"step": 104
},
{
"epoch": 0.6919275123558485,
"grad_norm": 35121.1640625,
"learning_rate": 4.287469287469288e-05,
"loss": 0.5207,
"step": 105
},
{
"epoch": 0.6985172981878089,
"grad_norm": 16844.70703125,
"learning_rate": 4.2751842751842756e-05,
"loss": 0.7609,
"step": 106
},
{
"epoch": 0.7051070840197694,
"grad_norm": 12384.4345703125,
"learning_rate": 4.262899262899263e-05,
"loss": 0.6501,
"step": 107
},
{
"epoch": 0.7116968698517299,
"grad_norm": 5228.11572265625,
"learning_rate": 4.250614250614251e-05,
"loss": 0.6624,
"step": 108
},
{
"epoch": 0.7182866556836903,
"grad_norm": 112343.34375,
"learning_rate": 4.2383292383292386e-05,
"loss": 0.6711,
"step": 109
},
{
"epoch": 0.7248764415156508,
"grad_norm": 11115.724609375,
"learning_rate": 4.226044226044226e-05,
"loss": 0.668,
"step": 110
},
{
"epoch": 0.7314662273476112,
"grad_norm": 91624.140625,
"learning_rate": 4.213759213759214e-05,
"loss": 0.6913,
"step": 111
},
{
"epoch": 0.7380560131795717,
"grad_norm": 6046.64794921875,
"learning_rate": 4.2014742014742017e-05,
"loss": 0.6699,
"step": 112
},
{
"epoch": 0.7446457990115322,
"grad_norm": 26374.94921875,
"learning_rate": 4.189189189189189e-05,
"loss": 0.7146,
"step": 113
},
{
"epoch": 0.7512355848434926,
"grad_norm": 27798.625,
"learning_rate": 4.176904176904177e-05,
"loss": 0.6188,
"step": 114
},
{
"epoch": 0.7578253706754531,
"grad_norm": 8118.2197265625,
"learning_rate": 4.164619164619165e-05,
"loss": 0.6177,
"step": 115
},
{
"epoch": 0.7644151565074135,
"grad_norm": 40166.359375,
"learning_rate": 4.1523341523341524e-05,
"loss": 0.66,
"step": 116
},
{
"epoch": 0.771004942339374,
"grad_norm": 16126.8427734375,
"learning_rate": 4.14004914004914e-05,
"loss": 0.7109,
"step": 117
},
{
"epoch": 0.7775947281713345,
"grad_norm": 18022.84765625,
"learning_rate": 4.127764127764128e-05,
"loss": 0.6224,
"step": 118
},
{
"epoch": 0.7841845140032949,
"grad_norm": 15072.46484375,
"learning_rate": 4.1154791154791154e-05,
"loss": 0.5862,
"step": 119
},
{
"epoch": 0.7907742998352554,
"grad_norm": 11486.779296875,
"learning_rate": 4.103194103194104e-05,
"loss": 0.591,
"step": 120
},
{
"epoch": 0.7973640856672158,
"grad_norm": 12171.77734375,
"learning_rate": 4.0909090909090915e-05,
"loss": 0.6598,
"step": 121
},
{
"epoch": 0.8039538714991763,
"grad_norm": 22686.841796875,
"learning_rate": 4.0786240786240785e-05,
"loss": 0.6059,
"step": 122
},
{
"epoch": 0.8105436573311368,
"grad_norm": 27070.658203125,
"learning_rate": 4.066339066339067e-05,
"loss": 0.5981,
"step": 123
},
{
"epoch": 0.8171334431630972,
"grad_norm": 15005.7265625,
"learning_rate": 4.0540540540540545e-05,
"loss": 0.6305,
"step": 124
},
{
"epoch": 0.8237232289950577,
"grad_norm": 11356.5380859375,
"learning_rate": 4.0417690417690415e-05,
"loss": 0.6736,
"step": 125
},
{
"epoch": 0.8303130148270181,
"grad_norm": 78888.9296875,
"learning_rate": 4.02948402948403e-05,
"loss": 0.6746,
"step": 126
},
{
"epoch": 0.8369028006589786,
"grad_norm": 8765.6591796875,
"learning_rate": 4.0171990171990176e-05,
"loss": 0.6088,
"step": 127
},
{
"epoch": 0.8434925864909391,
"grad_norm": 27692.36328125,
"learning_rate": 4.004914004914005e-05,
"loss": 0.7365,
"step": 128
},
{
"epoch": 0.8500823723228995,
"grad_norm": 27458.75,
"learning_rate": 3.992628992628993e-05,
"loss": 0.6885,
"step": 129
},
{
"epoch": 0.85667215815486,
"grad_norm": 32011.431640625,
"learning_rate": 3.9803439803439806e-05,
"loss": 0.6128,
"step": 130
},
{
"epoch": 0.8632619439868204,
"grad_norm": 8888.3623046875,
"learning_rate": 3.968058968058968e-05,
"loss": 0.8316,
"step": 131
},
{
"epoch": 0.8698517298187809,
"grad_norm": 31571.5,
"learning_rate": 3.955773955773956e-05,
"loss": 0.6086,
"step": 132
},
{
"epoch": 0.8764415156507414,
"grad_norm": 7211.0869140625,
"learning_rate": 3.943488943488944e-05,
"loss": 0.6343,
"step": 133
},
{
"epoch": 0.8830313014827018,
"grad_norm": 28377.615234375,
"learning_rate": 3.9312039312039314e-05,
"loss": 0.6234,
"step": 134
},
{
"epoch": 0.8896210873146623,
"grad_norm": 19913.7734375,
"learning_rate": 3.918918918918919e-05,
"loss": 0.7022,
"step": 135
},
{
"epoch": 0.8962108731466227,
"grad_norm": 38258.0546875,
"learning_rate": 3.906633906633907e-05,
"loss": 0.553,
"step": 136
},
{
"epoch": 0.9028006589785832,
"grad_norm": 18686.283203125,
"learning_rate": 3.8943488943488944e-05,
"loss": 0.6194,
"step": 137
},
{
"epoch": 0.9093904448105437,
"grad_norm": 25210.80078125,
"learning_rate": 3.882063882063882e-05,
"loss": 0.6567,
"step": 138
},
{
"epoch": 0.9159802306425041,
"grad_norm": 24367.556640625,
"learning_rate": 3.86977886977887e-05,
"loss": 0.686,
"step": 139
},
{
"epoch": 0.9225700164744646,
"grad_norm": 135211.953125,
"learning_rate": 3.857493857493858e-05,
"loss": 0.5756,
"step": 140
},
{
"epoch": 0.929159802306425,
"grad_norm": 18591.775390625,
"learning_rate": 3.845208845208845e-05,
"loss": 0.6784,
"step": 141
},
{
"epoch": 0.9357495881383855,
"grad_norm": 13147.021484375,
"learning_rate": 3.8329238329238335e-05,
"loss": 0.6022,
"step": 142
},
{
"epoch": 0.942339373970346,
"grad_norm": 19533.595703125,
"learning_rate": 3.820638820638821e-05,
"loss": 0.5408,
"step": 143
},
{
"epoch": 0.9489291598023064,
"grad_norm": 7753.4462890625,
"learning_rate": 3.808353808353808e-05,
"loss": 0.6628,
"step": 144
},
{
"epoch": 0.9555189456342669,
"grad_norm": 44759.55859375,
"learning_rate": 3.7960687960687965e-05,
"loss": 0.5822,
"step": 145
},
{
"epoch": 0.9621087314662273,
"grad_norm": 7180.6044921875,
"learning_rate": 3.783783783783784e-05,
"loss": 0.5473,
"step": 146
},
{
"epoch": 0.9686985172981878,
"grad_norm": 37214.40625,
"learning_rate": 3.771498771498771e-05,
"loss": 0.6619,
"step": 147
},
{
"epoch": 0.9752883031301482,
"grad_norm": 14762.1357421875,
"learning_rate": 3.7592137592137596e-05,
"loss": 0.6526,
"step": 148
},
{
"epoch": 0.9818780889621087,
"grad_norm": 67078.1875,
"learning_rate": 3.746928746928747e-05,
"loss": 0.72,
"step": 149
},
{
"epoch": 0.9884678747940692,
"grad_norm": 21195.07421875,
"learning_rate": 3.734643734643735e-05,
"loss": 0.6566,
"step": 150
},
{
"epoch": 0.9950576606260296,
"grad_norm": 22123.794921875,
"learning_rate": 3.7223587223587226e-05,
"loss": 0.647,
"step": 151
},
{
"epoch": 1.00164744645799,
"grad_norm": 9963.392578125,
"learning_rate": 3.71007371007371e-05,
"loss": 0.5923,
"step": 152
},
{
"epoch": 1.0082372322899507,
"grad_norm": 10004.744140625,
"learning_rate": 3.697788697788698e-05,
"loss": 0.5811,
"step": 153
},
{
"epoch": 1.014827018121911,
"grad_norm": 5818.58935546875,
"learning_rate": 3.685503685503686e-05,
"loss": 0.5744,
"step": 154
},
{
"epoch": 1.0214168039538716,
"grad_norm": 12603.640625,
"learning_rate": 3.6732186732186734e-05,
"loss": 0.6278,
"step": 155
},
{
"epoch": 1.028006589785832,
"grad_norm": 23099.1796875,
"learning_rate": 3.660933660933661e-05,
"loss": 0.6001,
"step": 156
},
{
"epoch": 1.0345963756177925,
"grad_norm": 19207.27734375,
"learning_rate": 3.648648648648649e-05,
"loss": 0.5701,
"step": 157
},
{
"epoch": 1.0411861614497528,
"grad_norm": 10040.673828125,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.6095,
"step": 158
},
{
"epoch": 1.0477759472817134,
"grad_norm": 13896.298828125,
"learning_rate": 3.624078624078625e-05,
"loss": 0.5673,
"step": 159
},
{
"epoch": 1.0543657331136738,
"grad_norm": 10502.177734375,
"learning_rate": 3.611793611793612e-05,
"loss": 0.5662,
"step": 160
},
{
"epoch": 1.0609555189456343,
"grad_norm": 6510.84033203125,
"learning_rate": 3.5995085995085995e-05,
"loss": 0.5095,
"step": 161
},
{
"epoch": 1.0675453047775947,
"grad_norm": 18709.1640625,
"learning_rate": 3.587223587223588e-05,
"loss": 0.5437,
"step": 162
},
{
"epoch": 1.0741350906095553,
"grad_norm": 6607.08935546875,
"learning_rate": 3.574938574938575e-05,
"loss": 0.5686,
"step": 163
},
{
"epoch": 1.0807248764415156,
"grad_norm": 11358.892578125,
"learning_rate": 3.562653562653563e-05,
"loss": 0.585,
"step": 164
},
{
"epoch": 1.0873146622734762,
"grad_norm": 24775.66796875,
"learning_rate": 3.550368550368551e-05,
"loss": 0.5248,
"step": 165
},
{
"epoch": 1.0939044481054365,
"grad_norm": 13074.2919921875,
"learning_rate": 3.538083538083538e-05,
"loss": 0.5206,
"step": 166
},
{
"epoch": 1.100494233937397,
"grad_norm": 10716.3486328125,
"learning_rate": 3.525798525798526e-05,
"loss": 0.6356,
"step": 167
},
{
"epoch": 1.1070840197693574,
"grad_norm": 19648.953125,
"learning_rate": 3.513513513513514e-05,
"loss": 0.5096,
"step": 168
},
{
"epoch": 1.113673805601318,
"grad_norm": 10499.521484375,
"learning_rate": 3.501228501228501e-05,
"loss": 0.6298,
"step": 169
},
{
"epoch": 1.1202635914332784,
"grad_norm": 38415.08984375,
"learning_rate": 3.488943488943489e-05,
"loss": 0.6272,
"step": 170
},
{
"epoch": 1.126853377265239,
"grad_norm": 18025.1328125,
"learning_rate": 3.476658476658477e-05,
"loss": 0.609,
"step": 171
},
{
"epoch": 1.1334431630971993,
"grad_norm": 17254.71484375,
"learning_rate": 3.4643734643734647e-05,
"loss": 0.5771,
"step": 172
},
{
"epoch": 1.1400329489291599,
"grad_norm": 56900.84375,
"learning_rate": 3.452088452088452e-05,
"loss": 0.4617,
"step": 173
},
{
"epoch": 1.1466227347611202,
"grad_norm": 94377.71875,
"learning_rate": 3.43980343980344e-05,
"loss": 0.5262,
"step": 174
},
{
"epoch": 1.1532125205930808,
"grad_norm": 26418.802734375,
"learning_rate": 3.427518427518428e-05,
"loss": 0.5388,
"step": 175
},
{
"epoch": 1.1598023064250411,
"grad_norm": 39558.36328125,
"learning_rate": 3.4152334152334154e-05,
"loss": 0.6323,
"step": 176
},
{
"epoch": 1.1663920922570017,
"grad_norm": 12363.81640625,
"learning_rate": 3.402948402948403e-05,
"loss": 0.4672,
"step": 177
},
{
"epoch": 1.172981878088962,
"grad_norm": 4902.41748046875,
"learning_rate": 3.390663390663391e-05,
"loss": 0.6218,
"step": 178
},
{
"epoch": 1.1795716639209226,
"grad_norm": 18641.564453125,
"learning_rate": 3.3783783783783784e-05,
"loss": 0.5324,
"step": 179
},
{
"epoch": 1.186161449752883,
"grad_norm": 20244.51171875,
"learning_rate": 3.366093366093366e-05,
"loss": 0.5641,
"step": 180
},
{
"epoch": 1.1927512355848435,
"grad_norm": 10752.59765625,
"learning_rate": 3.3538083538083545e-05,
"loss": 0.5537,
"step": 181
},
{
"epoch": 1.1993410214168039,
"grad_norm": 16433.798828125,
"learning_rate": 3.3415233415233415e-05,
"loss": 0.6038,
"step": 182
},
{
"epoch": 1.2059308072487644,
"grad_norm": 65011.72265625,
"learning_rate": 3.329238329238329e-05,
"loss": 0.71,
"step": 183
},
{
"epoch": 1.2125205930807248,
"grad_norm": 139034.671875,
"learning_rate": 3.3169533169533175e-05,
"loss": 0.5752,
"step": 184
},
{
"epoch": 1.2191103789126854,
"grad_norm": 6198.3046875,
"learning_rate": 3.3046683046683045e-05,
"loss": 0.6047,
"step": 185
},
{
"epoch": 1.2257001647446457,
"grad_norm": 32208.603515625,
"learning_rate": 3.292383292383293e-05,
"loss": 0.548,
"step": 186
},
{
"epoch": 1.2322899505766063,
"grad_norm": 83243.2578125,
"learning_rate": 3.2800982800982806e-05,
"loss": 0.6223,
"step": 187
},
{
"epoch": 1.2388797364085666,
"grad_norm": 11003.818359375,
"learning_rate": 3.2678132678132676e-05,
"loss": 0.5597,
"step": 188
},
{
"epoch": 1.2454695222405272,
"grad_norm": 30586.80078125,
"learning_rate": 3.255528255528256e-05,
"loss": 0.5653,
"step": 189
},
{
"epoch": 1.2520593080724876,
"grad_norm": 36942.6171875,
"learning_rate": 3.2432432432432436e-05,
"loss": 0.4991,
"step": 190
},
{
"epoch": 1.2586490939044481,
"grad_norm": 34296.30859375,
"learning_rate": 3.2309582309582306e-05,
"loss": 0.5868,
"step": 191
},
{
"epoch": 1.2652388797364087,
"grad_norm": 10427.1875,
"learning_rate": 3.218673218673219e-05,
"loss": 0.6163,
"step": 192
},
{
"epoch": 1.271828665568369,
"grad_norm": 50948.59375,
"learning_rate": 3.206388206388207e-05,
"loss": 0.5254,
"step": 193
},
{
"epoch": 1.2784184514003294,
"grad_norm": 5948.72802734375,
"learning_rate": 3.1941031941031943e-05,
"loss": 0.5176,
"step": 194
},
{
"epoch": 1.28500823723229,
"grad_norm": 24703.78515625,
"learning_rate": 3.181818181818182e-05,
"loss": 0.5382,
"step": 195
},
{
"epoch": 1.2915980230642505,
"grad_norm": 31584.517578125,
"learning_rate": 3.16953316953317e-05,
"loss": 0.5676,
"step": 196
},
{
"epoch": 1.2981878088962109,
"grad_norm": 15364.802734375,
"learning_rate": 3.1572481572481574e-05,
"loss": 0.5168,
"step": 197
},
{
"epoch": 1.3047775947281712,
"grad_norm": 20843.013671875,
"learning_rate": 3.144963144963145e-05,
"loss": 0.5424,
"step": 198
},
{
"epoch": 1.3113673805601318,
"grad_norm": 17939.484375,
"learning_rate": 3.132678132678133e-05,
"loss": 0.5653,
"step": 199
},
{
"epoch": 1.3179571663920924,
"grad_norm": 9113.7607421875,
"learning_rate": 3.120393120393121e-05,
"loss": 0.5648,
"step": 200
},
{
"epoch": 1.3245469522240527,
"grad_norm": 32489.458984375,
"learning_rate": 3.108108108108108e-05,
"loss": 0.5858,
"step": 201
},
{
"epoch": 1.331136738056013,
"grad_norm": 23361.166015625,
"learning_rate": 3.095823095823096e-05,
"loss": 0.5749,
"step": 202
},
{
"epoch": 1.3377265238879736,
"grad_norm": 12417.0478515625,
"learning_rate": 3.083538083538084e-05,
"loss": 0.5019,
"step": 203
},
{
"epoch": 1.3443163097199342,
"grad_norm": 24939.2265625,
"learning_rate": 3.071253071253071e-05,
"loss": 0.5861,
"step": 204
},
{
"epoch": 1.3509060955518946,
"grad_norm": 20826.5859375,
"learning_rate": 3.058968058968059e-05,
"loss": 0.591,
"step": 205
},
{
"epoch": 1.357495881383855,
"grad_norm": 13685.1572265625,
"learning_rate": 3.046683046683047e-05,
"loss": 0.6588,
"step": 206
},
{
"epoch": 1.3640856672158155,
"grad_norm": 26390.5625,
"learning_rate": 3.0343980343980342e-05,
"loss": 0.5532,
"step": 207
},
{
"epoch": 1.370675453047776,
"grad_norm": 13378.560546875,
"learning_rate": 3.0221130221130222e-05,
"loss": 0.5108,
"step": 208
},
{
"epoch": 1.3772652388797364,
"grad_norm": 8095.1689453125,
"learning_rate": 3.0098280098280103e-05,
"loss": 0.5522,
"step": 209
},
{
"epoch": 1.3838550247116967,
"grad_norm": 11471.375,
"learning_rate": 2.9975429975429976e-05,
"loss": 0.4777,
"step": 210
},
{
"epoch": 1.3904448105436573,
"grad_norm": 11452.013671875,
"learning_rate": 2.9852579852579853e-05,
"loss": 0.5945,
"step": 211
},
{
"epoch": 1.3970345963756179,
"grad_norm": 31670.03125,
"learning_rate": 2.9729729729729733e-05,
"loss": 0.6015,
"step": 212
},
{
"epoch": 1.4036243822075782,
"grad_norm": 22501.869140625,
"learning_rate": 2.9606879606879607e-05,
"loss": 0.4813,
"step": 213
},
{
"epoch": 1.4102141680395386,
"grad_norm": 102328.46875,
"learning_rate": 2.9484029484029483e-05,
"loss": 0.6696,
"step": 214
},
{
"epoch": 1.4168039538714992,
"grad_norm": 26580.23828125,
"learning_rate": 2.9361179361179364e-05,
"loss": 0.4692,
"step": 215
},
{
"epoch": 1.4233937397034597,
"grad_norm": 12255.455078125,
"learning_rate": 2.9238329238329237e-05,
"loss": 0.5184,
"step": 216
},
{
"epoch": 1.42998352553542,
"grad_norm": 13490.662109375,
"learning_rate": 2.9115479115479117e-05,
"loss": 0.5556,
"step": 217
},
{
"epoch": 1.4365733113673804,
"grad_norm": 14122.8046875,
"learning_rate": 2.8992628992628994e-05,
"loss": 0.5259,
"step": 218
},
{
"epoch": 1.443163097199341,
"grad_norm": 14578.7783203125,
"learning_rate": 2.8869778869778868e-05,
"loss": 0.5878,
"step": 219
},
{
"epoch": 1.4497528830313016,
"grad_norm": 36417.54296875,
"learning_rate": 2.8746928746928748e-05,
"loss": 0.6328,
"step": 220
},
{
"epoch": 1.456342668863262,
"grad_norm": 24717.400390625,
"learning_rate": 2.8624078624078625e-05,
"loss": 0.5597,
"step": 221
},
{
"epoch": 1.4629324546952225,
"grad_norm": 42796.0390625,
"learning_rate": 2.8501228501228505e-05,
"loss": 0.6242,
"step": 222
},
{
"epoch": 1.4695222405271828,
"grad_norm": 11775.6015625,
"learning_rate": 2.8378378378378378e-05,
"loss": 0.5561,
"step": 223
},
{
"epoch": 1.4761120263591434,
"grad_norm": 11294.82421875,
"learning_rate": 2.825552825552826e-05,
"loss": 0.6068,
"step": 224
},
{
"epoch": 1.4827018121911038,
"grad_norm": 164647.03125,
"learning_rate": 2.8132678132678135e-05,
"loss": 0.5618,
"step": 225
},
{
"epoch": 1.4892915980230643,
"grad_norm": 19815.3203125,
"learning_rate": 2.800982800982801e-05,
"loss": 0.5637,
"step": 226
},
{
"epoch": 1.4958813838550247,
"grad_norm": 87994.859375,
"learning_rate": 2.788697788697789e-05,
"loss": 0.6073,
"step": 227
},
{
"epoch": 1.5024711696869852,
"grad_norm": 9418.3603515625,
"learning_rate": 2.776412776412777e-05,
"loss": 0.5177,
"step": 228
},
{
"epoch": 1.5090609555189456,
"grad_norm": 293888.625,
"learning_rate": 2.764127764127764e-05,
"loss": 0.6276,
"step": 229
},
{
"epoch": 1.515650741350906,
"grad_norm": 10872.537109375,
"learning_rate": 2.751842751842752e-05,
"loss": 0.5746,
"step": 230
},
{
"epoch": 1.5222405271828665,
"grad_norm": 60597.2109375,
"learning_rate": 2.73955773955774e-05,
"loss": 0.5874,
"step": 231
},
{
"epoch": 1.528830313014827,
"grad_norm": 53855.8515625,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.5431,
"step": 232
},
{
"epoch": 1.5354200988467874,
"grad_norm": 17162.5859375,
"learning_rate": 2.714987714987715e-05,
"loss": 0.5703,
"step": 233
},
{
"epoch": 1.5420098846787478,
"grad_norm": 12093.095703125,
"learning_rate": 2.702702702702703e-05,
"loss": 0.5776,
"step": 234
},
{
"epoch": 1.5485996705107083,
"grad_norm": 33037.2421875,
"learning_rate": 2.6904176904176904e-05,
"loss": 0.5018,
"step": 235
},
{
"epoch": 1.555189456342669,
"grad_norm": 18464.041015625,
"learning_rate": 2.678132678132678e-05,
"loss": 0.5508,
"step": 236
},
{
"epoch": 1.5617792421746293,
"grad_norm": 33591.078125,
"learning_rate": 2.665847665847666e-05,
"loss": 0.5522,
"step": 237
},
{
"epoch": 1.5683690280065898,
"grad_norm": 13077.189453125,
"learning_rate": 2.6535626535626534e-05,
"loss": 0.4632,
"step": 238
},
{
"epoch": 1.5749588138385504,
"grad_norm": 8347.5576171875,
"learning_rate": 2.6412776412776414e-05,
"loss": 0.6278,
"step": 239
},
{
"epoch": 1.5815485996705108,
"grad_norm": 15981.6953125,
"learning_rate": 2.628992628992629e-05,
"loss": 0.496,
"step": 240
},
{
"epoch": 1.588138385502471,
"grad_norm": 8308.947265625,
"learning_rate": 2.616707616707617e-05,
"loss": 0.659,
"step": 241
},
{
"epoch": 1.5947281713344317,
"grad_norm": 16152.51171875,
"learning_rate": 2.6044226044226045e-05,
"loss": 0.5447,
"step": 242
},
{
"epoch": 1.6013179571663922,
"grad_norm": 10750.263671875,
"learning_rate": 2.5921375921375925e-05,
"loss": 0.6014,
"step": 243
},
{
"epoch": 1.6079077429983526,
"grad_norm": 16181.7021484375,
"learning_rate": 2.5798525798525802e-05,
"loss": 0.5084,
"step": 244
},
{
"epoch": 1.614497528830313,
"grad_norm": 17315.806640625,
"learning_rate": 2.5675675675675675e-05,
"loss": 0.6028,
"step": 245
},
{
"epoch": 1.6210873146622735,
"grad_norm": 35364.375,
"learning_rate": 2.5552825552825555e-05,
"loss": 0.565,
"step": 246
},
{
"epoch": 1.627677100494234,
"grad_norm": 7373.3349609375,
"learning_rate": 2.5429975429975432e-05,
"loss": 0.5849,
"step": 247
},
{
"epoch": 1.6342668863261944,
"grad_norm": 10823.8154296875,
"learning_rate": 2.5307125307125306e-05,
"loss": 0.4575,
"step": 248
},
{
"epoch": 1.6408566721581548,
"grad_norm": 15178.8916015625,
"learning_rate": 2.5184275184275186e-05,
"loss": 0.4679,
"step": 249
},
{
"epoch": 1.6474464579901154,
"grad_norm": 10164.9697265625,
"learning_rate": 2.5061425061425066e-05,
"loss": 0.4315,
"step": 250
},
{
"epoch": 1.654036243822076,
"grad_norm": 8609.7099609375,
"learning_rate": 2.493857493857494e-05,
"loss": 0.6248,
"step": 251
},
{
"epoch": 1.6606260296540363,
"grad_norm": 40045.09375,
"learning_rate": 2.4815724815724816e-05,
"loss": 0.5335,
"step": 252
},
{
"epoch": 1.6672158154859966,
"grad_norm": 17241.337890625,
"learning_rate": 2.4692874692874693e-05,
"loss": 0.6546,
"step": 253
},
{
"epoch": 1.6738056013179572,
"grad_norm": 23707.69921875,
"learning_rate": 2.4570024570024573e-05,
"loss": 0.5672,
"step": 254
},
{
"epoch": 1.6803953871499178,
"grad_norm": 9953.140625,
"learning_rate": 2.4447174447174447e-05,
"loss": 0.5867,
"step": 255
},
{
"epoch": 1.6869851729818781,
"grad_norm": 29007.3125,
"learning_rate": 2.4324324324324327e-05,
"loss": 0.5345,
"step": 256
},
{
"epoch": 1.6935749588138385,
"grad_norm": 9451.4765625,
"learning_rate": 2.4201474201474204e-05,
"loss": 0.6363,
"step": 257
},
{
"epoch": 1.700164744645799,
"grad_norm": 36570.16796875,
"learning_rate": 2.4078624078624077e-05,
"loss": 0.5335,
"step": 258
},
{
"epoch": 1.7067545304777596,
"grad_norm": 48013.99609375,
"learning_rate": 2.3955773955773958e-05,
"loss": 0.5234,
"step": 259
},
{
"epoch": 1.71334431630972,
"grad_norm": 43459.08203125,
"learning_rate": 2.3832923832923834e-05,
"loss": 0.5699,
"step": 260
},
{
"epoch": 1.7199341021416803,
"grad_norm": 27575.84375,
"learning_rate": 2.371007371007371e-05,
"loss": 0.5577,
"step": 261
},
{
"epoch": 1.7265238879736409,
"grad_norm": 6892.0205078125,
"learning_rate": 2.3587223587223588e-05,
"loss": 0.6098,
"step": 262
},
{
"epoch": 1.7331136738056014,
"grad_norm": 79846.03125,
"learning_rate": 2.3464373464373465e-05,
"loss": 0.5509,
"step": 263
},
{
"epoch": 1.7397034596375618,
"grad_norm": 13488.93359375,
"learning_rate": 2.3341523341523342e-05,
"loss": 0.5872,
"step": 264
},
{
"epoch": 1.7462932454695221,
"grad_norm": 20397.814453125,
"learning_rate": 2.3218673218673222e-05,
"loss": 0.5657,
"step": 265
},
{
"epoch": 1.7528830313014827,
"grad_norm": 14773.505859375,
"learning_rate": 2.3095823095823095e-05,
"loss": 0.5319,
"step": 266
},
{
"epoch": 1.7594728171334433,
"grad_norm": 59875.83203125,
"learning_rate": 2.2972972972972976e-05,
"loss": 0.6762,
"step": 267
},
{
"epoch": 1.7660626029654036,
"grad_norm": 29569.642578125,
"learning_rate": 2.2850122850122852e-05,
"loss": 0.7493,
"step": 268
},
{
"epoch": 1.772652388797364,
"grad_norm": 96513.0625,
"learning_rate": 2.272727272727273e-05,
"loss": 0.5796,
"step": 269
},
{
"epoch": 1.7792421746293245,
"grad_norm": 33220.875,
"learning_rate": 2.2604422604422606e-05,
"loss": 0.5513,
"step": 270
},
{
"epoch": 1.7858319604612851,
"grad_norm": 9378.62890625,
"learning_rate": 2.2481572481572483e-05,
"loss": 0.5444,
"step": 271
},
{
"epoch": 1.7924217462932455,
"grad_norm": 8672.4189453125,
"learning_rate": 2.235872235872236e-05,
"loss": 0.5997,
"step": 272
},
{
"epoch": 1.7990115321252058,
"grad_norm": 75445.5390625,
"learning_rate": 2.2235872235872237e-05,
"loss": 0.5383,
"step": 273
},
{
"epoch": 1.8056013179571664,
"grad_norm": 12022.205078125,
"learning_rate": 2.2113022113022113e-05,
"loss": 0.6333,
"step": 274
},
{
"epoch": 1.812191103789127,
"grad_norm": 28632.75390625,
"learning_rate": 2.199017199017199e-05,
"loss": 0.5491,
"step": 275
},
{
"epoch": 1.8187808896210873,
"grad_norm": 24272.568359375,
"learning_rate": 2.186732186732187e-05,
"loss": 0.5039,
"step": 276
},
{
"epoch": 1.8253706754530477,
"grad_norm": 9478.4189453125,
"learning_rate": 2.1744471744471744e-05,
"loss": 0.4395,
"step": 277
},
{
"epoch": 1.8319604612850082,
"grad_norm": 10953.66796875,
"learning_rate": 2.1621621621621624e-05,
"loss": 0.6367,
"step": 278
},
{
"epoch": 1.8385502471169688,
"grad_norm": 18094.830078125,
"learning_rate": 2.14987714987715e-05,
"loss": 0.5477,
"step": 279
},
{
"epoch": 1.8451400329489291,
"grad_norm": 18395.52734375,
"learning_rate": 2.1375921375921378e-05,
"loss": 0.5301,
"step": 280
},
{
"epoch": 1.8517298187808895,
"grad_norm": 104393.421875,
"learning_rate": 2.1253071253071255e-05,
"loss": 0.581,
"step": 281
},
{
"epoch": 1.85831960461285,
"grad_norm": 10463.978515625,
"learning_rate": 2.113022113022113e-05,
"loss": 0.5688,
"step": 282
},
{
"epoch": 1.8649093904448106,
"grad_norm": 7907.69189453125,
"learning_rate": 2.1007371007371008e-05,
"loss": 0.5534,
"step": 283
},
{
"epoch": 1.871499176276771,
"grad_norm": 12477.5712890625,
"learning_rate": 2.0884520884520885e-05,
"loss": 0.5412,
"step": 284
},
{
"epoch": 1.8780889621087313,
"grad_norm": 22077.701171875,
"learning_rate": 2.0761670761670762e-05,
"loss": 0.6015,
"step": 285
},
{
"epoch": 1.884678747940692,
"grad_norm": 29665.4296875,
"learning_rate": 2.063882063882064e-05,
"loss": 0.6349,
"step": 286
},
{
"epoch": 1.8912685337726525,
"grad_norm": 8550.171875,
"learning_rate": 2.051597051597052e-05,
"loss": 0.5846,
"step": 287
},
{
"epoch": 1.8978583196046128,
"grad_norm": 34534.80859375,
"learning_rate": 2.0393120393120392e-05,
"loss": 0.6063,
"step": 288
},
{
"epoch": 1.9044481054365732,
"grad_norm": 25177.533203125,
"learning_rate": 2.0270270270270273e-05,
"loss": 0.5608,
"step": 289
},
{
"epoch": 1.9110378912685337,
"grad_norm": 12976.5283203125,
"learning_rate": 2.014742014742015e-05,
"loss": 0.6424,
"step": 290
},
{
"epoch": 1.9176276771004943,
"grad_norm": 36851.8359375,
"learning_rate": 2.0024570024570026e-05,
"loss": 0.6051,
"step": 291
},
{
"epoch": 1.9242174629324547,
"grad_norm": 14700.5849609375,
"learning_rate": 1.9901719901719903e-05,
"loss": 0.6738,
"step": 292
},
{
"epoch": 1.930807248764415,
"grad_norm": 10047.1728515625,
"learning_rate": 1.977886977886978e-05,
"loss": 0.5817,
"step": 293
},
{
"epoch": 1.9373970345963756,
"grad_norm": 18526.845703125,
"learning_rate": 1.9656019656019657e-05,
"loss": 0.5037,
"step": 294
},
{
"epoch": 1.9439868204283361,
"grad_norm": 51555.40234375,
"learning_rate": 1.9533169533169534e-05,
"loss": 0.7556,
"step": 295
},
{
"epoch": 1.9505766062602965,
"grad_norm": 16909.0625,
"learning_rate": 1.941031941031941e-05,
"loss": 0.5487,
"step": 296
},
{
"epoch": 1.9571663920922568,
"grad_norm": 22686.32421875,
"learning_rate": 1.928746928746929e-05,
"loss": 0.617,
"step": 297
},
{
"epoch": 1.9637561779242174,
"grad_norm": 68786.3359375,
"learning_rate": 1.9164619164619167e-05,
"loss": 0.5581,
"step": 298
},
{
"epoch": 1.970345963756178,
"grad_norm": 44203.34375,
"learning_rate": 1.904176904176904e-05,
"loss": 0.5491,
"step": 299
},
{
"epoch": 1.9769357495881383,
"grad_norm": 39032.02734375,
"learning_rate": 1.891891891891892e-05,
"loss": 0.5876,
"step": 300
},
{
"epoch": 1.9835255354200987,
"grad_norm": 15117.4921875,
"learning_rate": 1.8796068796068798e-05,
"loss": 0.6322,
"step": 301
},
{
"epoch": 1.9901153212520593,
"grad_norm": 17359.947265625,
"learning_rate": 1.8673218673218675e-05,
"loss": 0.5526,
"step": 302
},
{
"epoch": 1.9967051070840198,
"grad_norm": 26160.140625,
"learning_rate": 1.855036855036855e-05,
"loss": 0.4606,
"step": 303
},
{
"epoch": 2.00329489291598,
"grad_norm": 27996.744140625,
"learning_rate": 1.842751842751843e-05,
"loss": 0.5775,
"step": 304
},
{
"epoch": 2.0098846787479405,
"grad_norm": 10826.375,
"learning_rate": 1.8304668304668305e-05,
"loss": 0.4791,
"step": 305
},
{
"epoch": 2.0164744645799013,
"grad_norm": 7867.39697265625,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.6314,
"step": 306
},
{
"epoch": 2.0230642504118617,
"grad_norm": 13956.0439453125,
"learning_rate": 1.805896805896806e-05,
"loss": 0.4655,
"step": 307
},
{
"epoch": 2.029654036243822,
"grad_norm": 14681.5576171875,
"learning_rate": 1.793611793611794e-05,
"loss": 0.536,
"step": 308
},
{
"epoch": 2.0362438220757824,
"grad_norm": 12553.2109375,
"learning_rate": 1.7813267813267816e-05,
"loss": 0.5533,
"step": 309
},
{
"epoch": 2.042833607907743,
"grad_norm": 27893.552734375,
"learning_rate": 1.769041769041769e-05,
"loss": 0.4929,
"step": 310
},
{
"epoch": 2.0494233937397035,
"grad_norm": 16818.6171875,
"learning_rate": 1.756756756756757e-05,
"loss": 0.5123,
"step": 311
},
{
"epoch": 2.056013179571664,
"grad_norm": 16560.30859375,
"learning_rate": 1.7444717444717446e-05,
"loss": 0.5537,
"step": 312
},
{
"epoch": 2.062602965403624,
"grad_norm": 10146.822265625,
"learning_rate": 1.7321867321867323e-05,
"loss": 0.5042,
"step": 313
},
{
"epoch": 2.069192751235585,
"grad_norm": 15292.8125,
"learning_rate": 1.71990171990172e-05,
"loss": 0.4717,
"step": 314
},
{
"epoch": 2.0757825370675453,
"grad_norm": 89878.0078125,
"learning_rate": 1.7076167076167077e-05,
"loss": 0.5159,
"step": 315
},
{
"epoch": 2.0823723228995057,
"grad_norm": 23665.125,
"learning_rate": 1.6953316953316954e-05,
"loss": 0.4446,
"step": 316
},
{
"epoch": 2.088962108731466,
"grad_norm": 15712.8779296875,
"learning_rate": 1.683046683046683e-05,
"loss": 0.594,
"step": 317
},
{
"epoch": 2.095551894563427,
"grad_norm": 50007.99609375,
"learning_rate": 1.6707616707616707e-05,
"loss": 0.5259,
"step": 318
},
{
"epoch": 2.102141680395387,
"grad_norm": 30487.50390625,
"learning_rate": 1.6584766584766588e-05,
"loss": 0.5166,
"step": 319
},
{
"epoch": 2.1087314662273475,
"grad_norm": 29152.314453125,
"learning_rate": 1.6461916461916464e-05,
"loss": 0.5055,
"step": 320
},
{
"epoch": 2.115321252059308,
"grad_norm": 22360.861328125,
"learning_rate": 1.6339066339066338e-05,
"loss": 0.4797,
"step": 321
},
{
"epoch": 2.1219110378912687,
"grad_norm": 45466.05078125,
"learning_rate": 1.6216216216216218e-05,
"loss": 0.4505,
"step": 322
},
{
"epoch": 2.128500823723229,
"grad_norm": 18740.75390625,
"learning_rate": 1.6093366093366095e-05,
"loss": 0.4849,
"step": 323
},
{
"epoch": 2.1350906095551894,
"grad_norm": 9168.376953125,
"learning_rate": 1.5970515970515972e-05,
"loss": 0.4991,
"step": 324
},
{
"epoch": 2.1416803953871497,
"grad_norm": 7032.57373046875,
"learning_rate": 1.584766584766585e-05,
"loss": 0.5332,
"step": 325
},
{
"epoch": 2.1482701812191105,
"grad_norm": 14946.6484375,
"learning_rate": 1.5724815724815725e-05,
"loss": 0.4797,
"step": 326
},
{
"epoch": 2.154859967051071,
"grad_norm": 10197.732421875,
"learning_rate": 1.5601965601965606e-05,
"loss": 0.4698,
"step": 327
},
{
"epoch": 2.161449752883031,
"grad_norm": 11579.7890625,
"learning_rate": 1.547911547911548e-05,
"loss": 0.4995,
"step": 328
},
{
"epoch": 2.168039538714992,
"grad_norm": 26515.12890625,
"learning_rate": 1.5356265356265356e-05,
"loss": 0.4931,
"step": 329
},
{
"epoch": 2.1746293245469523,
"grad_norm": 16076.119140625,
"learning_rate": 1.5233415233415234e-05,
"loss": 0.5051,
"step": 330
},
{
"epoch": 2.1812191103789127,
"grad_norm": 12420.37890625,
"learning_rate": 1.5110565110565111e-05,
"loss": 0.5227,
"step": 331
},
{
"epoch": 2.187808896210873,
"grad_norm": 17015.359375,
"learning_rate": 1.4987714987714988e-05,
"loss": 0.5115,
"step": 332
},
{
"epoch": 2.1943986820428334,
"grad_norm": 13335.0068359375,
"learning_rate": 1.4864864864864867e-05,
"loss": 0.5835,
"step": 333
},
{
"epoch": 2.200988467874794,
"grad_norm": 41513.1015625,
"learning_rate": 1.4742014742014742e-05,
"loss": 0.437,
"step": 334
},
{
"epoch": 2.2075782537067545,
"grad_norm": 11096.04296875,
"learning_rate": 1.4619164619164619e-05,
"loss": 0.4619,
"step": 335
},
{
"epoch": 2.214168039538715,
"grad_norm": 22811.95703125,
"learning_rate": 1.4496314496314497e-05,
"loss": 0.521,
"step": 336
},
{
"epoch": 2.2207578253706757,
"grad_norm": 96701.7109375,
"learning_rate": 1.4373464373464374e-05,
"loss": 0.5015,
"step": 337
},
{
"epoch": 2.227347611202636,
"grad_norm": 28764.791015625,
"learning_rate": 1.4250614250614252e-05,
"loss": 0.5669,
"step": 338
},
{
"epoch": 2.2339373970345964,
"grad_norm": 9551.7666015625,
"learning_rate": 1.412776412776413e-05,
"loss": 0.5331,
"step": 339
},
{
"epoch": 2.2405271828665567,
"grad_norm": 24150.939453125,
"learning_rate": 1.4004914004914004e-05,
"loss": 0.6009,
"step": 340
},
{
"epoch": 2.247116968698517,
"grad_norm": 6333.8935546875,
"learning_rate": 1.3882063882063885e-05,
"loss": 0.4918,
"step": 341
},
{
"epoch": 2.253706754530478,
"grad_norm": 12934.755859375,
"learning_rate": 1.375921375921376e-05,
"loss": 0.4723,
"step": 342
},
{
"epoch": 2.260296540362438,
"grad_norm": 35915.16015625,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.5468,
"step": 343
},
{
"epoch": 2.2668863261943986,
"grad_norm": 32075.701171875,
"learning_rate": 1.3513513513513515e-05,
"loss": 0.4188,
"step": 344
},
{
"epoch": 2.2734761120263594,
"grad_norm": 34175.6171875,
"learning_rate": 1.339066339066339e-05,
"loss": 0.5332,
"step": 345
},
{
"epoch": 2.2800658978583197,
"grad_norm": 27540.275390625,
"learning_rate": 1.3267813267813267e-05,
"loss": 0.4866,
"step": 346
},
{
"epoch": 2.28665568369028,
"grad_norm": 8475.587890625,
"learning_rate": 1.3144963144963146e-05,
"loss": 0.4882,
"step": 347
},
{
"epoch": 2.2932454695222404,
"grad_norm": 31548.888671875,
"learning_rate": 1.3022113022113022e-05,
"loss": 0.4984,
"step": 348
},
{
"epoch": 2.2998352553542007,
"grad_norm": 13258.5400390625,
"learning_rate": 1.2899262899262901e-05,
"loss": 0.4839,
"step": 349
},
{
"epoch": 2.3064250411861615,
"grad_norm": 38574.671875,
"learning_rate": 1.2776412776412778e-05,
"loss": 0.4749,
"step": 350
},
{
"epoch": 2.313014827018122,
"grad_norm": 11923.9541015625,
"learning_rate": 1.2653562653562653e-05,
"loss": 0.4684,
"step": 351
},
{
"epoch": 2.3196046128500822,
"grad_norm": 26359.25390625,
"learning_rate": 1.2530712530712533e-05,
"loss": 0.544,
"step": 352
},
{
"epoch": 2.326194398682043,
"grad_norm": 13086.6357421875,
"learning_rate": 1.2407862407862408e-05,
"loss": 0.4722,
"step": 353
},
{
"epoch": 2.3327841845140034,
"grad_norm": 15811.302734375,
"learning_rate": 1.2285012285012287e-05,
"loss": 0.4928,
"step": 354
},
{
"epoch": 2.3393739703459637,
"grad_norm": 6917.1416015625,
"learning_rate": 1.2162162162162164e-05,
"loss": 0.5691,
"step": 355
},
{
"epoch": 2.345963756177924,
"grad_norm": 41444.01171875,
"learning_rate": 1.2039312039312039e-05,
"loss": 0.5405,
"step": 356
},
{
"epoch": 2.352553542009885,
"grad_norm": 139438.609375,
"learning_rate": 1.1916461916461917e-05,
"loss": 0.5219,
"step": 357
},
{
"epoch": 2.359143327841845,
"grad_norm": 9378.14453125,
"learning_rate": 1.1793611793611794e-05,
"loss": 0.4549,
"step": 358
},
{
"epoch": 2.3657331136738056,
"grad_norm": 15612.345703125,
"learning_rate": 1.1670761670761671e-05,
"loss": 0.551,
"step": 359
},
{
"epoch": 2.372322899505766,
"grad_norm": 24843.34375,
"learning_rate": 1.1547911547911548e-05,
"loss": 0.4928,
"step": 360
},
{
"epoch": 2.3789126853377267,
"grad_norm": 27883.3359375,
"learning_rate": 1.1425061425061426e-05,
"loss": 0.5312,
"step": 361
},
{
"epoch": 2.385502471169687,
"grad_norm": 29997.705078125,
"learning_rate": 1.1302211302211303e-05,
"loss": 0.5811,
"step": 362
},
{
"epoch": 2.3920922570016474,
"grad_norm": 15760.669921875,
"learning_rate": 1.117936117936118e-05,
"loss": 0.5359,
"step": 363
},
{
"epoch": 2.3986820428336078,
"grad_norm": 75893.109375,
"learning_rate": 1.1056511056511057e-05,
"loss": 0.4696,
"step": 364
},
{
"epoch": 2.4052718286655685,
"grad_norm": 20869.072265625,
"learning_rate": 1.0933660933660935e-05,
"loss": 0.5342,
"step": 365
},
{
"epoch": 2.411861614497529,
"grad_norm": 15102.4658203125,
"learning_rate": 1.0810810810810812e-05,
"loss": 0.505,
"step": 366
},
{
"epoch": 2.4184514003294892,
"grad_norm": 24776.064453125,
"learning_rate": 1.0687960687960689e-05,
"loss": 0.5192,
"step": 367
},
{
"epoch": 2.4250411861614496,
"grad_norm": 17849.11328125,
"learning_rate": 1.0565110565110566e-05,
"loss": 0.5461,
"step": 368
},
{
"epoch": 2.4316309719934104,
"grad_norm": 55960.76953125,
"learning_rate": 1.0442260442260443e-05,
"loss": 0.5075,
"step": 369
},
{
"epoch": 2.4382207578253707,
"grad_norm": 19820.607421875,
"learning_rate": 1.031941031941032e-05,
"loss": 0.5661,
"step": 370
},
{
"epoch": 2.444810543657331,
"grad_norm": 14240.271484375,
"learning_rate": 1.0196560196560196e-05,
"loss": 0.508,
"step": 371
},
{
"epoch": 2.4514003294892914,
"grad_norm": 134865.75,
"learning_rate": 1.0073710073710075e-05,
"loss": 0.5145,
"step": 372
},
{
"epoch": 2.4579901153212522,
"grad_norm": 26314.51953125,
"learning_rate": 9.950859950859952e-06,
"loss": 0.5024,
"step": 373
},
{
"epoch": 2.4645799011532126,
"grad_norm": 8372.9169921875,
"learning_rate": 9.828009828009828e-06,
"loss": 0.6207,
"step": 374
},
{
"epoch": 2.471169686985173,
"grad_norm": 9625.1865234375,
"learning_rate": 9.705159705159705e-06,
"loss": 0.4774,
"step": 375
},
{
"epoch": 2.4777594728171333,
"grad_norm": 12379.4033203125,
"learning_rate": 9.582309582309584e-06,
"loss": 0.4996,
"step": 376
},
{
"epoch": 2.484349258649094,
"grad_norm": 7536.5390625,
"learning_rate": 9.45945945945946e-06,
"loss": 0.5236,
"step": 377
},
{
"epoch": 2.4909390444810544,
"grad_norm": 28774.794921875,
"learning_rate": 9.336609336609337e-06,
"loss": 0.4793,
"step": 378
},
{
"epoch": 2.4975288303130148,
"grad_norm": 26588.126953125,
"learning_rate": 9.213759213759214e-06,
"loss": 0.4341,
"step": 379
},
{
"epoch": 2.504118616144975,
"grad_norm": 20777.30078125,
"learning_rate": 9.090909090909091e-06,
"loss": 0.5086,
"step": 380
},
{
"epoch": 2.510708401976936,
"grad_norm": 11463.2158203125,
"learning_rate": 8.96805896805897e-06,
"loss": 0.5769,
"step": 381
},
{
"epoch": 2.5172981878088962,
"grad_norm": 39708.0390625,
"learning_rate": 8.845208845208845e-06,
"loss": 0.5116,
"step": 382
},
{
"epoch": 2.5238879736408566,
"grad_norm": 20928.3671875,
"learning_rate": 8.722358722358723e-06,
"loss": 0.4997,
"step": 383
},
{
"epoch": 2.5304777594728174,
"grad_norm": 16851.19140625,
"learning_rate": 8.5995085995086e-06,
"loss": 0.5821,
"step": 384
},
{
"epoch": 2.5370675453047777,
"grad_norm": 12078.3798828125,
"learning_rate": 8.476658476658477e-06,
"loss": 0.5221,
"step": 385
},
{
"epoch": 2.543657331136738,
"grad_norm": 66684.203125,
"learning_rate": 8.353808353808354e-06,
"loss": 0.5621,
"step": 386
},
{
"epoch": 2.5502471169686984,
"grad_norm": 5581.15234375,
"learning_rate": 8.230958230958232e-06,
"loss": 0.4489,
"step": 387
},
{
"epoch": 2.556836902800659,
"grad_norm": 23770.3359375,
"learning_rate": 8.108108108108109e-06,
"loss": 0.5072,
"step": 388
},
{
"epoch": 2.5634266886326196,
"grad_norm": 15222.48046875,
"learning_rate": 7.985257985257986e-06,
"loss": 0.4957,
"step": 389
},
{
"epoch": 2.57001647446458,
"grad_norm": 14970.568359375,
"learning_rate": 7.862407862407863e-06,
"loss": 0.4914,
"step": 390
},
{
"epoch": 2.5766062602965403,
"grad_norm": 6360.3046875,
"learning_rate": 7.73955773955774e-06,
"loss": 0.4492,
"step": 391
},
{
"epoch": 2.583196046128501,
"grad_norm": 15164.78515625,
"learning_rate": 7.616707616707617e-06,
"loss": 0.4749,
"step": 392
},
{
"epoch": 2.5897858319604614,
"grad_norm": 22485.28125,
"learning_rate": 7.493857493857494e-06,
"loss": 0.5383,
"step": 393
},
{
"epoch": 2.5963756177924218,
"grad_norm": 30391.03515625,
"learning_rate": 7.371007371007371e-06,
"loss": 0.4506,
"step": 394
},
{
"epoch": 2.602965403624382,
"grad_norm": 15713.5595703125,
"learning_rate": 7.2481572481572485e-06,
"loss": 0.5178,
"step": 395
},
{
"epoch": 2.6095551894563425,
"grad_norm": 24513.958984375,
"learning_rate": 7.125307125307126e-06,
"loss": 0.4911,
"step": 396
},
{
"epoch": 2.6161449752883033,
"grad_norm": 21254.240234375,
"learning_rate": 7.002457002457002e-06,
"loss": 0.4883,
"step": 397
},
{
"epoch": 2.6227347611202636,
"grad_norm": 25682.513671875,
"learning_rate": 6.87960687960688e-06,
"loss": 0.5677,
"step": 398
},
{
"epoch": 2.629324546952224,
"grad_norm": 10649.7880859375,
"learning_rate": 6.7567567567567575e-06,
"loss": 0.4163,
"step": 399
},
{
"epoch": 2.6359143327841847,
"grad_norm": 11895.4677734375,
"learning_rate": 6.6339066339066335e-06,
"loss": 0.4617,
"step": 400
},
{
"epoch": 2.642504118616145,
"grad_norm": 71847.7421875,
"learning_rate": 6.511056511056511e-06,
"loss": 0.4855,
"step": 401
},
{
"epoch": 2.6490939044481054,
"grad_norm": 12087.333984375,
"learning_rate": 6.388206388206389e-06,
"loss": 0.5488,
"step": 402
},
{
"epoch": 2.655683690280066,
"grad_norm": 22596.474609375,
"learning_rate": 6.2653562653562665e-06,
"loss": 0.4533,
"step": 403
},
{
"epoch": 2.662273476112026,
"grad_norm": 13353.603515625,
"learning_rate": 6.142506142506143e-06,
"loss": 0.4916,
"step": 404
},
{
"epoch": 2.668863261943987,
"grad_norm": 28952.79296875,
"learning_rate": 6.019656019656019e-06,
"loss": 0.4664,
"step": 405
},
{
"epoch": 2.6754530477759473,
"grad_norm": 18681.466796875,
"learning_rate": 5.896805896805897e-06,
"loss": 0.5339,
"step": 406
},
{
"epoch": 2.6820428336079076,
"grad_norm": 55370.97265625,
"learning_rate": 5.773955773955774e-06,
"loss": 0.5157,
"step": 407
},
{
"epoch": 2.6886326194398684,
"grad_norm": 18563.974609375,
"learning_rate": 5.6511056511056515e-06,
"loss": 0.4533,
"step": 408
},
{
"epoch": 2.6952224052718288,
"grad_norm": 17806.52734375,
"learning_rate": 5.528255528255528e-06,
"loss": 0.5011,
"step": 409
},
{
"epoch": 2.701812191103789,
"grad_norm": 192237.5625,
"learning_rate": 5.405405405405406e-06,
"loss": 0.4728,
"step": 410
},
{
"epoch": 2.7084019769357495,
"grad_norm": 7906.73388671875,
"learning_rate": 5.282555282555283e-06,
"loss": 0.504,
"step": 411
},
{
"epoch": 2.71499176276771,
"grad_norm": 22805.4296875,
"learning_rate": 5.15970515970516e-06,
"loss": 0.5509,
"step": 412
},
{
"epoch": 2.7215815485996706,
"grad_norm": 12200.3623046875,
"learning_rate": 5.036855036855037e-06,
"loss": 0.4984,
"step": 413
},
{
"epoch": 2.728171334431631,
"grad_norm": 14147.498046875,
"learning_rate": 4.914004914004914e-06,
"loss": 0.4063,
"step": 414
},
{
"epoch": 2.7347611202635913,
"grad_norm": 11302.486328125,
"learning_rate": 4.791154791154792e-06,
"loss": 0.5305,
"step": 415
},
{
"epoch": 2.741350906095552,
"grad_norm": 12493.9912109375,
"learning_rate": 4.668304668304669e-06,
"loss": 0.5437,
"step": 416
},
{
"epoch": 2.7479406919275124,
"grad_norm": 12336.576171875,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.61,
"step": 417
},
{
"epoch": 2.754530477759473,
"grad_norm": 131413.265625,
"learning_rate": 4.422604422604422e-06,
"loss": 0.4386,
"step": 418
},
{
"epoch": 2.761120263591433,
"grad_norm": 6280.30810546875,
"learning_rate": 4.2997542997543e-06,
"loss": 0.4827,
"step": 419
},
{
"epoch": 2.7677100494233935,
"grad_norm": 22697.05859375,
"learning_rate": 4.176904176904177e-06,
"loss": 0.5122,
"step": 420
},
{
"epoch": 2.7742998352553543,
"grad_norm": 24444.3203125,
"learning_rate": 4.0540540540540545e-06,
"loss": 0.483,
"step": 421
},
{
"epoch": 2.7808896210873146,
"grad_norm": 11774.201171875,
"learning_rate": 3.931203931203931e-06,
"loss": 0.5757,
"step": 422
},
{
"epoch": 2.787479406919275,
"grad_norm": 16244.38671875,
"learning_rate": 3.8083538083538086e-06,
"loss": 0.4689,
"step": 423
},
{
"epoch": 2.7940691927512358,
"grad_norm": 22801.099609375,
"learning_rate": 3.6855036855036854e-06,
"loss": 0.4832,
"step": 424
},
{
"epoch": 2.800658978583196,
"grad_norm": 12143.181640625,
"learning_rate": 3.562653562653563e-06,
"loss": 0.5305,
"step": 425
},
{
"epoch": 2.8072487644151565,
"grad_norm": 37724.21875,
"learning_rate": 3.43980343980344e-06,
"loss": 0.6104,
"step": 426
},
{
"epoch": 2.813838550247117,
"grad_norm": 15959.6953125,
"learning_rate": 3.3169533169533168e-06,
"loss": 0.4719,
"step": 427
},
{
"epoch": 2.820428336079077,
"grad_norm": 16792.8984375,
"learning_rate": 3.1941031941031944e-06,
"loss": 0.5533,
"step": 428
},
{
"epoch": 2.827018121911038,
"grad_norm": 11308.0615234375,
"learning_rate": 3.0712530712530717e-06,
"loss": 0.5628,
"step": 429
},
{
"epoch": 2.8336079077429983,
"grad_norm": 23545.369140625,
"learning_rate": 2.9484029484029485e-06,
"loss": 0.5278,
"step": 430
},
{
"epoch": 2.8401976935749587,
"grad_norm": 33122.05859375,
"learning_rate": 2.8255528255528258e-06,
"loss": 0.4413,
"step": 431
},
{
"epoch": 2.8467874794069195,
"grad_norm": 22283.62890625,
"learning_rate": 2.702702702702703e-06,
"loss": 0.5699,
"step": 432
},
{
"epoch": 2.85337726523888,
"grad_norm": 13194.3056640625,
"learning_rate": 2.57985257985258e-06,
"loss": 0.5302,
"step": 433
},
{
"epoch": 2.85996705107084,
"grad_norm": 20634.77734375,
"learning_rate": 2.457002457002457e-06,
"loss": 0.5518,
"step": 434
},
{
"epoch": 2.8665568369028005,
"grad_norm": 15800.359375,
"learning_rate": 2.3341523341523343e-06,
"loss": 0.5346,
"step": 435
},
{
"epoch": 2.873146622734761,
"grad_norm": 15633.765625,
"learning_rate": 2.211302211302211e-06,
"loss": 0.4884,
"step": 436
},
{
"epoch": 2.8797364085667216,
"grad_norm": 31339.26953125,
"learning_rate": 2.0884520884520884e-06,
"loss": 0.4925,
"step": 437
},
{
"epoch": 2.886326194398682,
"grad_norm": 43050.4765625,
"learning_rate": 1.9656019656019657e-06,
"loss": 0.5312,
"step": 438
},
{
"epoch": 2.892915980230643,
"grad_norm": 17257.248046875,
"learning_rate": 1.8427518427518427e-06,
"loss": 0.4499,
"step": 439
},
{
"epoch": 2.899505766062603,
"grad_norm": 53676.23046875,
"learning_rate": 1.71990171990172e-06,
"loss": 0.4375,
"step": 440
},
{
"epoch": 2.9060955518945635,
"grad_norm": 31634.296875,
"learning_rate": 1.5970515970515972e-06,
"loss": 0.4394,
"step": 441
},
{
"epoch": 2.912685337726524,
"grad_norm": 22569.1171875,
"learning_rate": 1.4742014742014743e-06,
"loss": 0.4388,
"step": 442
},
{
"epoch": 2.919275123558484,
"grad_norm": 66667.6796875,
"learning_rate": 1.3513513513513515e-06,
"loss": 0.4113,
"step": 443
},
{
"epoch": 2.925864909390445,
"grad_norm": 7823.2880859375,
"learning_rate": 1.2285012285012285e-06,
"loss": 0.4316,
"step": 444
},
{
"epoch": 2.9324546952224053,
"grad_norm": 8603.0859375,
"learning_rate": 1.1056511056511056e-06,
"loss": 0.4213,
"step": 445
},
{
"epoch": 2.9390444810543657,
"grad_norm": 26335.857421875,
"learning_rate": 9.828009828009828e-07,
"loss": 0.4523,
"step": 446
},
{
"epoch": 2.9456342668863265,
"grad_norm": 18206.806640625,
"learning_rate": 8.5995085995086e-07,
"loss": 0.4656,
"step": 447
},
{
"epoch": 2.952224052718287,
"grad_norm": 9823.25,
"learning_rate": 7.371007371007371e-07,
"loss": 0.5816,
"step": 448
},
{
"epoch": 2.958813838550247,
"grad_norm": 15623.380859375,
"learning_rate": 6.142506142506143e-07,
"loss": 0.4464,
"step": 449
},
{
"epoch": 2.9654036243822075,
"grad_norm": 22244.39453125,
"learning_rate": 4.914004914004914e-07,
"loss": 0.5091,
"step": 450
},
{
"epoch": 2.971993410214168,
"grad_norm": 7978.923828125,
"learning_rate": 3.6855036855036856e-07,
"loss": 0.554,
"step": 451
},
{
"epoch": 2.9785831960461286,
"grad_norm": 20999.830078125,
"learning_rate": 2.457002457002457e-07,
"loss": 0.5075,
"step": 452
},
{
"epoch": 2.985172981878089,
"grad_norm": 22284.568359375,
"learning_rate": 1.2285012285012285e-07,
"loss": 0.4835,
"step": 453
},
{
"epoch": 2.985172981878089,
"step": 453,
"total_flos": 9.99505283128492e+17,
"train_loss": 0.5814478738155323,
"train_runtime": 61267.4818,
"train_samples_per_second": 0.357,
"train_steps_per_second": 0.007
}
],
"logging_steps": 1.0,
"max_steps": 453,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.99505283128492e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}