{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.985172981878089,
"eval_steps": 500,
"global_step": 453,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006589785831960461,
"grad_norm": 1333.285400390625,
"learning_rate": 0.0,
"loss": 1.1178,
"step": 1
},
{
"epoch": 0.013179571663920923,
"grad_norm": 1591.2745361328125,
"learning_rate": 1.0869565217391306e-06,
"loss": 0.902,
"step": 2
},
{
"epoch": 0.019769357495881382,
"grad_norm": 574.3883056640625,
"learning_rate": 2.173913043478261e-06,
"loss": 0.9008,
"step": 3
},
{
"epoch": 0.026359143327841845,
"grad_norm": 787.6026611328125,
"learning_rate": 3.2608695652173914e-06,
"loss": 0.9099,
"step": 4
},
{
"epoch": 0.032948929159802305,
"grad_norm": 1140.8524169921875,
"learning_rate": 4.347826086956522e-06,
"loss": 0.8861,
"step": 5
},
{
"epoch": 0.039538714991762765,
"grad_norm": 572.358642578125,
"learning_rate": 5.4347826086956525e-06,
"loss": 0.8068,
"step": 6
},
{
"epoch": 0.04612850082372323,
"grad_norm": 1086.734130859375,
"learning_rate": 6.521739130434783e-06,
"loss": 0.863,
"step": 7
},
{
"epoch": 0.05271828665568369,
"grad_norm": 630.6577758789062,
"learning_rate": 7.608695652173914e-06,
"loss": 0.8086,
"step": 8
},
{
"epoch": 0.05930807248764415,
"grad_norm": 602.673828125,
"learning_rate": 8.695652173913044e-06,
"loss": 0.7029,
"step": 9
},
{
"epoch": 0.06589785831960461,
"grad_norm": 486.8932189941406,
"learning_rate": 9.782608695652175e-06,
"loss": 0.7151,
"step": 10
},
{
"epoch": 0.07248764415156507,
"grad_norm": 502.26641845703125,
"learning_rate": 1.0869565217391305e-05,
"loss": 0.9117,
"step": 11
},
{
"epoch": 0.07907742998352553,
"grad_norm": 510.6407470703125,
"learning_rate": 1.1956521739130435e-05,
"loss": 0.7578,
"step": 12
},
{
"epoch": 0.085667215815486,
"grad_norm": 280.19989013671875,
"learning_rate": 1.3043478260869566e-05,
"loss": 0.8804,
"step": 13
},
{
"epoch": 0.09225700164744646,
"grad_norm": 660.488525390625,
"learning_rate": 1.4130434782608694e-05,
"loss": 0.7351,
"step": 14
},
{
"epoch": 0.09884678747940692,
"grad_norm": 543.288330078125,
"learning_rate": 1.5217391304347828e-05,
"loss": 0.8505,
"step": 15
},
{
"epoch": 0.10543657331136738,
"grad_norm": 449.69940185546875,
"learning_rate": 1.630434782608696e-05,
"loss": 0.8146,
"step": 16
},
{
"epoch": 0.11202635914332784,
"grad_norm": 194.49293518066406,
"learning_rate": 1.739130434782609e-05,
"loss": 0.8442,
"step": 17
},
{
"epoch": 0.1186161449752883,
"grad_norm": 445.9173583984375,
"learning_rate": 1.8478260869565216e-05,
"loss": 0.7377,
"step": 18
},
{
"epoch": 0.12520593080724876,
"grad_norm": 215.03514099121094,
"learning_rate": 1.956521739130435e-05,
"loss": 0.7561,
"step": 19
},
{
"epoch": 0.13179571663920922,
"grad_norm": 389.9723815917969,
"learning_rate": 2.065217391304348e-05,
"loss": 0.7568,
"step": 20
},
{
"epoch": 0.13838550247116968,
"grad_norm": 613.5521850585938,
"learning_rate": 2.173913043478261e-05,
"loss": 0.7034,
"step": 21
},
{
"epoch": 0.14497528830313014,
"grad_norm": 405.9554748535156,
"learning_rate": 2.282608695652174e-05,
"loss": 0.7053,
"step": 22
},
{
"epoch": 0.1515650741350906,
"grad_norm": 215.52670288085938,
"learning_rate": 2.391304347826087e-05,
"loss": 0.7889,
"step": 23
},
{
"epoch": 0.15815485996705106,
"grad_norm": 205.6142120361328,
"learning_rate": 2.5e-05,
"loss": 0.7658,
"step": 24
},
{
"epoch": 0.16474464579901152,
"grad_norm": 987.6297607421875,
"learning_rate": 2.608695652173913e-05,
"loss": 0.788,
"step": 25
},
{
"epoch": 0.171334431630972,
"grad_norm": 222.69949340820312,
"learning_rate": 2.7173913043478262e-05,
"loss": 0.677,
"step": 26
},
{
"epoch": 0.17792421746293247,
"grad_norm": 345.14007568359375,
"learning_rate": 2.826086956521739e-05,
"loss": 0.781,
"step": 27
},
{
"epoch": 0.18451400329489293,
"grad_norm": 463.8091125488281,
"learning_rate": 2.9347826086956526e-05,
"loss": 0.719,
"step": 28
},
{
"epoch": 0.19110378912685339,
"grad_norm": 305.9866943359375,
"learning_rate": 3.0434782608695656e-05,
"loss": 0.913,
"step": 29
},
{
"epoch": 0.19769357495881384,
"grad_norm": 181.99391174316406,
"learning_rate": 3.152173913043479e-05,
"loss": 0.8097,
"step": 30
},
{
"epoch": 0.2042833607907743,
"grad_norm": 218.90301513671875,
"learning_rate": 3.260869565217392e-05,
"loss": 0.7213,
"step": 31
},
{
"epoch": 0.21087314662273476,
"grad_norm": 661.6412353515625,
"learning_rate": 3.369565217391305e-05,
"loss": 0.845,
"step": 32
},
{
"epoch": 0.21746293245469522,
"grad_norm": 454.7393493652344,
"learning_rate": 3.478260869565218e-05,
"loss": 0.8505,
"step": 33
},
{
"epoch": 0.22405271828665568,
"grad_norm": 410.3249816894531,
"learning_rate": 3.58695652173913e-05,
"loss": 0.8294,
"step": 34
},
{
"epoch": 0.23064250411861614,
"grad_norm": 152.13143920898438,
"learning_rate": 3.695652173913043e-05,
"loss": 0.8209,
"step": 35
},
{
"epoch": 0.2372322899505766,
"grad_norm": 271.0032653808594,
"learning_rate": 3.804347826086957e-05,
"loss": 0.8357,
"step": 36
},
{
"epoch": 0.24382207578253706,
"grad_norm": 635.7935791015625,
"learning_rate": 3.91304347826087e-05,
"loss": 0.7799,
"step": 37
},
{
"epoch": 0.2504118616144975,
"grad_norm": 461.5861511230469,
"learning_rate": 4.021739130434783e-05,
"loss": 0.765,
"step": 38
},
{
"epoch": 0.257001647446458,
"grad_norm": 279.9590148925781,
"learning_rate": 4.130434782608696e-05,
"loss": 0.8731,
"step": 39
},
{
"epoch": 0.26359143327841844,
"grad_norm": 765.4867553710938,
"learning_rate": 4.239130434782609e-05,
"loss": 0.7388,
"step": 40
},
{
"epoch": 0.2701812191103789,
"grad_norm": 575.533447265625,
"learning_rate": 4.347826086956522e-05,
"loss": 0.7368,
"step": 41
},
{
"epoch": 0.27677100494233936,
"grad_norm": 405.68023681640625,
"learning_rate": 4.456521739130435e-05,
"loss": 0.7146,
"step": 42
},
{
"epoch": 0.2833607907742998,
"grad_norm": 320.5788269042969,
"learning_rate": 4.565217391304348e-05,
"loss": 0.9472,
"step": 43
},
{
"epoch": 0.2899505766062603,
"grad_norm": 159.42025756835938,
"learning_rate": 4.673913043478261e-05,
"loss": 0.748,
"step": 44
},
{
"epoch": 0.29654036243822074,
"grad_norm": 343.9827575683594,
"learning_rate": 4.782608695652174e-05,
"loss": 0.7369,
"step": 45
},
{
"epoch": 0.3031301482701812,
"grad_norm": 2192.0439453125,
"learning_rate": 4.891304347826087e-05,
"loss": 0.8394,
"step": 46
},
{
"epoch": 0.30971993410214166,
"grad_norm": 379.729248046875,
"learning_rate": 5e-05,
"loss": 0.8388,
"step": 47
},
{
"epoch": 0.3163097199341021,
"grad_norm": 154.18643188476562,
"learning_rate": 4.987714987714988e-05,
"loss": 0.9252,
"step": 48
},
{
"epoch": 0.3228995057660626,
"grad_norm": 561.9174194335938,
"learning_rate": 4.9754299754299756e-05,
"loss": 0.8088,
"step": 49
},
{
"epoch": 0.32948929159802304,
"grad_norm": 434.27325439453125,
"learning_rate": 4.963144963144963e-05,
"loss": 0.9263,
"step": 50
},
{
"epoch": 0.33607907742998355,
"grad_norm": 122.2130126953125,
"learning_rate": 4.950859950859951e-05,
"loss": 0.8773,
"step": 51
},
{
"epoch": 0.342668863261944,
"grad_norm": 745.0607299804688,
"learning_rate": 4.9385749385749387e-05,
"loss": 0.7825,
"step": 52
},
{
"epoch": 0.34925864909390447,
"grad_norm": 328.8779602050781,
"learning_rate": 4.926289926289926e-05,
"loss": 0.8562,
"step": 53
},
{
"epoch": 0.35584843492586493,
"grad_norm": 192.5826873779297,
"learning_rate": 4.914004914004915e-05,
"loss": 0.8408,
"step": 54
},
{
"epoch": 0.3624382207578254,
"grad_norm": 290.76776123046875,
"learning_rate": 4.901719901719902e-05,
"loss": 0.769,
"step": 55
},
{
"epoch": 0.36902800658978585,
"grad_norm": 212.2420654296875,
"learning_rate": 4.8894348894348894e-05,
"loss": 0.7944,
"step": 56
},
{
"epoch": 0.3756177924217463,
"grad_norm": 141.33392333984375,
"learning_rate": 4.877149877149878e-05,
"loss": 0.9014,
"step": 57
},
{
"epoch": 0.38220757825370677,
"grad_norm": 210.45494079589844,
"learning_rate": 4.8648648648648654e-05,
"loss": 0.8414,
"step": 58
},
{
"epoch": 0.38879736408566723,
"grad_norm": 160.95689392089844,
"learning_rate": 4.8525798525798524e-05,
"loss": 0.7485,
"step": 59
},
{
"epoch": 0.3953871499176277,
"grad_norm": 303.22906494140625,
"learning_rate": 4.840294840294841e-05,
"loss": 0.8196,
"step": 60
},
{
"epoch": 0.40197693574958815,
"grad_norm": 327.06805419921875,
"learning_rate": 4.8280098280098285e-05,
"loss": 0.7513,
"step": 61
},
{
"epoch": 0.4085667215815486,
"grad_norm": 1190.357421875,
"learning_rate": 4.8157248157248155e-05,
"loss": 0.9952,
"step": 62
},
{
"epoch": 0.41515650741350907,
"grad_norm": 206.6424102783203,
"learning_rate": 4.803439803439804e-05,
"loss": 0.8309,
"step": 63
},
{
"epoch": 0.42174629324546953,
"grad_norm": 534.4395141601562,
"learning_rate": 4.7911547911547915e-05,
"loss": 0.8531,
"step": 64
},
{
"epoch": 0.42833607907743,
"grad_norm": 341.8865966796875,
"learning_rate": 4.778869778869779e-05,
"loss": 0.9542,
"step": 65
},
{
"epoch": 0.43492586490939045,
"grad_norm": 228.7908172607422,
"learning_rate": 4.766584766584767e-05,
"loss": 0.9572,
"step": 66
},
{
"epoch": 0.4415156507413509,
"grad_norm": 352.82086181640625,
"learning_rate": 4.7542997542997546e-05,
"loss": 0.8549,
"step": 67
},
{
"epoch": 0.44810543657331137,
"grad_norm": 105.19104766845703,
"learning_rate": 4.742014742014742e-05,
"loss": 0.8718,
"step": 68
},
{
"epoch": 0.4546952224052718,
"grad_norm": 99.13899230957031,
"learning_rate": 4.72972972972973e-05,
"loss": 0.9738,
"step": 69
},
{
"epoch": 0.4612850082372323,
"grad_norm": 228.2894287109375,
"learning_rate": 4.7174447174447176e-05,
"loss": 0.8689,
"step": 70
},
{
"epoch": 0.46787479406919275,
"grad_norm": 157.54298400878906,
"learning_rate": 4.705159705159705e-05,
"loss": 0.8553,
"step": 71
},
{
"epoch": 0.4744645799011532,
"grad_norm": 328.7658996582031,
"learning_rate": 4.692874692874693e-05,
"loss": 0.8788,
"step": 72
},
{
"epoch": 0.48105436573311366,
"grad_norm": 1948.38916015625,
"learning_rate": 4.680589680589681e-05,
"loss": 0.8151,
"step": 73
},
{
"epoch": 0.4876441515650741,
"grad_norm": 320.9216003417969,
"learning_rate": 4.6683046683046684e-05,
"loss": 0.8086,
"step": 74
},
{
"epoch": 0.4942339373970346,
"grad_norm": 1094.80517578125,
"learning_rate": 4.656019656019656e-05,
"loss": 0.881,
"step": 75
},
{
"epoch": 0.500823723228995,
"grad_norm": 262.46636962890625,
"learning_rate": 4.6437346437346444e-05,
"loss": 0.9475,
"step": 76
},
{
"epoch": 0.5074135090609555,
"grad_norm": 395.812744140625,
"learning_rate": 4.6314496314496314e-05,
"loss": 0.9287,
"step": 77
},
{
"epoch": 0.514003294892916,
"grad_norm": 312.8116149902344,
"learning_rate": 4.619164619164619e-05,
"loss": 0.898,
"step": 78
},
{
"epoch": 0.5205930807248764,
"grad_norm": 124.0872802734375,
"learning_rate": 4.6068796068796074e-05,
"loss": 0.9626,
"step": 79
},
{
"epoch": 0.5271828665568369,
"grad_norm": 180.38021850585938,
"learning_rate": 4.594594594594595e-05,
"loss": 0.8902,
"step": 80
},
{
"epoch": 0.5337726523887973,
"grad_norm": 190.2543182373047,
"learning_rate": 4.582309582309582e-05,
"loss": 0.8404,
"step": 81
},
{
"epoch": 0.5403624382207578,
"grad_norm": 119.05390167236328,
"learning_rate": 4.5700245700245705e-05,
"loss": 0.9087,
"step": 82
},
{
"epoch": 0.5469522240527183,
"grad_norm": 564.9111938476562,
"learning_rate": 4.557739557739558e-05,
"loss": 0.7668,
"step": 83
},
{
"epoch": 0.5535420098846787,
"grad_norm": 131.78086853027344,
"learning_rate": 4.545454545454546e-05,
"loss": 0.8434,
"step": 84
},
{
"epoch": 0.5601317957166392,
"grad_norm": 453.88775634765625,
"learning_rate": 4.5331695331695335e-05,
"loss": 0.8631,
"step": 85
},
{
"epoch": 0.5667215815485996,
"grad_norm": 192.94564819335938,
"learning_rate": 4.520884520884521e-05,
"loss": 0.8508,
"step": 86
},
{
"epoch": 0.5733113673805601,
"grad_norm": 178.88607788085938,
"learning_rate": 4.508599508599509e-05,
"loss": 0.8746,
"step": 87
},
{
"epoch": 0.5799011532125206,
"grad_norm": 355.49322509765625,
"learning_rate": 4.4963144963144966e-05,
"loss": 0.8221,
"step": 88
},
{
"epoch": 0.586490939044481,
"grad_norm": 727.7778930664062,
"learning_rate": 4.484029484029484e-05,
"loss": 1.0285,
"step": 89
},
{
"epoch": 0.5930807248764415,
"grad_norm": 1586.21337890625,
"learning_rate": 4.471744471744472e-05,
"loss": 0.9701,
"step": 90
},
{
"epoch": 0.5996705107084019,
"grad_norm": 558.633544921875,
"learning_rate": 4.4594594594594596e-05,
"loss": 0.8067,
"step": 91
},
{
"epoch": 0.6062602965403624,
"grad_norm": 969.2847900390625,
"learning_rate": 4.447174447174447e-05,
"loss": 0.7854,
"step": 92
},
{
"epoch": 0.6128500823723229,
"grad_norm": 437.51397705078125,
"learning_rate": 4.434889434889435e-05,
"loss": 0.9254,
"step": 93
},
{
"epoch": 0.6194398682042833,
"grad_norm": 436.55853271484375,
"learning_rate": 4.422604422604423e-05,
"loss": 0.8167,
"step": 94
},
{
"epoch": 0.6260296540362438,
"grad_norm": 213.31967163085938,
"learning_rate": 4.4103194103194104e-05,
"loss": 0.8767,
"step": 95
},
{
"epoch": 0.6326194398682042,
"grad_norm": 121.2298583984375,
"learning_rate": 4.398034398034398e-05,
"loss": 0.8475,
"step": 96
},
{
"epoch": 0.6392092257001647,
"grad_norm": 275.9543762207031,
"learning_rate": 4.385749385749386e-05,
"loss": 0.7616,
"step": 97
},
{
"epoch": 0.6457990115321252,
"grad_norm": 211.8274688720703,
"learning_rate": 4.373464373464374e-05,
"loss": 0.8941,
"step": 98
},
{
"epoch": 0.6523887973640856,
"grad_norm": 85.87938690185547,
"learning_rate": 4.361179361179362e-05,
"loss": 0.8319,
"step": 99
},
{
"epoch": 0.6589785831960461,
"grad_norm": 160.2537841796875,
"learning_rate": 4.348894348894349e-05,
"loss": 0.8626,
"step": 100
},
{
"epoch": 0.6655683690280065,
"grad_norm": 458.2138977050781,
"learning_rate": 4.336609336609337e-05,
"loss": 0.8062,
"step": 101
},
{
"epoch": 0.6721581548599671,
"grad_norm": 162.5004425048828,
"learning_rate": 4.324324324324325e-05,
"loss": 0.8782,
"step": 102
},
{
"epoch": 0.6787479406919276,
"grad_norm": 95.31204223632812,
"learning_rate": 4.312039312039312e-05,
"loss": 0.888,
"step": 103
},
{
"epoch": 0.685337726523888,
"grad_norm": 224.08078002929688,
"learning_rate": 4.2997542997543e-05,
"loss": 0.8597,
"step": 104
},
{
"epoch": 0.6919275123558485,
"grad_norm": 1092.45458984375,
"learning_rate": 4.287469287469288e-05,
"loss": 0.6954,
"step": 105
},
{
"epoch": 0.6985172981878089,
"grad_norm": 440.71038818359375,
"learning_rate": 4.2751842751842756e-05,
"loss": 0.9465,
"step": 106
},
{
"epoch": 0.7051070840197694,
"grad_norm": 530.1962280273438,
"learning_rate": 4.262899262899263e-05,
"loss": 0.8514,
"step": 107
},
{
"epoch": 0.7116968698517299,
"grad_norm": 177.9505157470703,
"learning_rate": 4.250614250614251e-05,
"loss": 0.9099,
"step": 108
},
{
"epoch": 0.7182866556836903,
"grad_norm": 445.9535217285156,
"learning_rate": 4.2383292383292386e-05,
"loss": 0.8832,
"step": 109
},
{
"epoch": 0.7248764415156508,
"grad_norm": 381.5651550292969,
"learning_rate": 4.226044226044226e-05,
"loss": 0.8822,
"step": 110
},
{
"epoch": 0.7314662273476112,
"grad_norm": 304.6574401855469,
"learning_rate": 4.213759213759214e-05,
"loss": 0.9179,
"step": 111
},
{
"epoch": 0.7380560131795717,
"grad_norm": 493.38702392578125,
"learning_rate": 4.2014742014742017e-05,
"loss": 0.8883,
"step": 112
},
{
"epoch": 0.7446457990115322,
"grad_norm": 308.3809814453125,
"learning_rate": 4.189189189189189e-05,
"loss": 0.9063,
"step": 113
},
{
"epoch": 0.7512355848434926,
"grad_norm": 566.2634887695312,
"learning_rate": 4.176904176904177e-05,
"loss": 0.8362,
"step": 114
},
{
"epoch": 0.7578253706754531,
"grad_norm": 351.1834716796875,
"learning_rate": 4.164619164619165e-05,
"loss": 0.7975,
"step": 115
},
{
"epoch": 0.7644151565074135,
"grad_norm": 431.1800842285156,
"learning_rate": 4.1523341523341524e-05,
"loss": 0.9101,
"step": 116
},
{
"epoch": 0.771004942339374,
"grad_norm": 162.2163848876953,
"learning_rate": 4.14004914004914e-05,
"loss": 0.9346,
"step": 117
},
{
"epoch": 0.7775947281713345,
"grad_norm": 380.3731994628906,
"learning_rate": 4.127764127764128e-05,
"loss": 0.8159,
"step": 118
},
{
"epoch": 0.7841845140032949,
"grad_norm": 162.47447204589844,
"learning_rate": 4.1154791154791154e-05,
"loss": 0.7163,
"step": 119
},
{
"epoch": 0.7907742998352554,
"grad_norm": 302.619873046875,
"learning_rate": 4.103194103194104e-05,
"loss": 0.7443,
"step": 120
},
{
"epoch": 0.7973640856672158,
"grad_norm": 263.76385498046875,
"learning_rate": 4.0909090909090915e-05,
"loss": 0.8107,
"step": 121
},
{
"epoch": 0.8039538714991763,
"grad_norm": 175.19789123535156,
"learning_rate": 4.0786240786240785e-05,
"loss": 0.7875,
"step": 122
},
{
"epoch": 0.8105436573311368,
"grad_norm": 201.381591796875,
"learning_rate": 4.066339066339067e-05,
"loss": 0.7453,
"step": 123
},
{
"epoch": 0.8171334431630972,
"grad_norm": 66.85758972167969,
"learning_rate": 4.0540540540540545e-05,
"loss": 0.7854,
"step": 124
},
{
"epoch": 0.8237232289950577,
"grad_norm": 497.2497253417969,
"learning_rate": 4.0417690417690415e-05,
"loss": 0.8332,
"step": 125
},
{
"epoch": 0.8303130148270181,
"grad_norm": 446.8744812011719,
"learning_rate": 4.02948402948403e-05,
"loss": 0.8435,
"step": 126
},
{
"epoch": 0.8369028006589786,
"grad_norm": 149.99560546875,
"learning_rate": 4.0171990171990176e-05,
"loss": 0.7904,
"step": 127
},
{
"epoch": 0.8434925864909391,
"grad_norm": 95.36408996582031,
"learning_rate": 4.004914004914005e-05,
"loss": 0.8921,
"step": 128
},
{
"epoch": 0.8500823723228995,
"grad_norm": 329.3395080566406,
"learning_rate": 3.992628992628993e-05,
"loss": 0.938,
"step": 129
},
{
"epoch": 0.85667215815486,
"grad_norm": 105.44376373291016,
"learning_rate": 3.9803439803439806e-05,
"loss": 0.7651,
"step": 130
},
{
"epoch": 0.8632619439868204,
"grad_norm": 365.7022399902344,
"learning_rate": 3.968058968058968e-05,
"loss": 0.9926,
"step": 131
},
{
"epoch": 0.8698517298187809,
"grad_norm": 157.09010314941406,
"learning_rate": 3.955773955773956e-05,
"loss": 0.8138,
"step": 132
},
{
"epoch": 0.8764415156507414,
"grad_norm": 199.70140075683594,
"learning_rate": 3.943488943488944e-05,
"loss": 0.8207,
"step": 133
},
{
"epoch": 0.8830313014827018,
"grad_norm": 170.55154418945312,
"learning_rate": 3.9312039312039314e-05,
"loss": 0.8112,
"step": 134
},
{
"epoch": 0.8896210873146623,
"grad_norm": 101.48678588867188,
"learning_rate": 3.918918918918919e-05,
"loss": 0.9179,
"step": 135
},
{
"epoch": 0.8962108731466227,
"grad_norm": 134.48329162597656,
"learning_rate": 3.906633906633907e-05,
"loss": 0.7232,
"step": 136
},
{
"epoch": 0.9028006589785832,
"grad_norm": 55.03281784057617,
"learning_rate": 3.8943488943488944e-05,
"loss": 0.7995,
"step": 137
},
{
"epoch": 0.9093904448105437,
"grad_norm": 150.32440185546875,
"learning_rate": 3.882063882063882e-05,
"loss": 0.809,
"step": 138
},
{
"epoch": 0.9159802306425041,
"grad_norm": 338.85614013671875,
"learning_rate": 3.86977886977887e-05,
"loss": 0.8687,
"step": 139
},
{
"epoch": 0.9225700164744646,
"grad_norm": 151.74453735351562,
"learning_rate": 3.857493857493858e-05,
"loss": 0.7813,
"step": 140
},
{
"epoch": 0.929159802306425,
"grad_norm": 138.03311157226562,
"learning_rate": 3.845208845208845e-05,
"loss": 0.8763,
"step": 141
},
{
"epoch": 0.9357495881383855,
"grad_norm": 391.97857666015625,
"learning_rate": 3.8329238329238335e-05,
"loss": 0.8187,
"step": 142
},
{
"epoch": 0.942339373970346,
"grad_norm": 266.4914245605469,
"learning_rate": 3.820638820638821e-05,
"loss": 0.6941,
"step": 143
},
{
"epoch": 0.9489291598023064,
"grad_norm": 79.93824005126953,
"learning_rate": 3.808353808353808e-05,
"loss": 0.8388,
"step": 144
},
{
"epoch": 0.9555189456342669,
"grad_norm": 596.604736328125,
"learning_rate": 3.7960687960687965e-05,
"loss": 0.7768,
"step": 145
},
{
"epoch": 0.9621087314662273,
"grad_norm": 70.91590118408203,
"learning_rate": 3.783783783783784e-05,
"loss": 0.7389,
"step": 146
},
{
"epoch": 0.9686985172981878,
"grad_norm": 305.1685485839844,
"learning_rate": 3.771498771498771e-05,
"loss": 0.9451,
"step": 147
},
{
"epoch": 0.9752883031301482,
"grad_norm": 96.85557556152344,
"learning_rate": 3.7592137592137596e-05,
"loss": 0.8449,
"step": 148
},
{
"epoch": 0.9818780889621087,
"grad_norm": 480.3149108886719,
"learning_rate": 3.746928746928747e-05,
"loss": 0.9493,
"step": 149
},
{
"epoch": 0.9884678747940692,
"grad_norm": 238.29258728027344,
"learning_rate": 3.734643734643735e-05,
"loss": 0.8697,
"step": 150
},
{
"epoch": 0.9950576606260296,
"grad_norm": 106.33365631103516,
"learning_rate": 3.7223587223587226e-05,
"loss": 0.8836,
"step": 151
},
{
"epoch": 1.00164744645799,
"grad_norm": 439.60009765625,
"learning_rate": 3.71007371007371e-05,
"loss": 0.7164,
"step": 152
},
{
"epoch": 1.0082372322899507,
"grad_norm": 109.26066589355469,
"learning_rate": 3.697788697788698e-05,
"loss": 0.632,
"step": 153
},
{
"epoch": 1.014827018121911,
"grad_norm": 415.2508239746094,
"learning_rate": 3.685503685503686e-05,
"loss": 0.6479,
"step": 154
},
{
"epoch": 1.0214168039538716,
"grad_norm": 260.052734375,
"learning_rate": 3.6732186732186734e-05,
"loss": 0.5934,
"step": 155
},
{
"epoch": 1.028006589785832,
"grad_norm": 1057.008056640625,
"learning_rate": 3.660933660933661e-05,
"loss": 0.5648,
"step": 156
},
{
"epoch": 1.0345963756177925,
"grad_norm": 106.66386413574219,
"learning_rate": 3.648648648648649e-05,
"loss": 0.5957,
"step": 157
},
{
"epoch": 1.0411861614497528,
"grad_norm": 130.90151977539062,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.5769,
"step": 158
},
{
"epoch": 1.0477759472817134,
"grad_norm": 262.9067077636719,
"learning_rate": 3.624078624078625e-05,
"loss": 0.6341,
"step": 159
},
{
"epoch": 1.0543657331136738,
"grad_norm": 59.18153762817383,
"learning_rate": 3.611793611793612e-05,
"loss": 0.5328,
"step": 160
},
{
"epoch": 1.0609555189456343,
"grad_norm": 122.3954849243164,
"learning_rate": 3.5995085995085995e-05,
"loss": 0.5212,
"step": 161
},
{
"epoch": 1.0675453047775947,
"grad_norm": 219.09283447265625,
"learning_rate": 3.587223587223588e-05,
"loss": 0.5569,
"step": 162
},
{
"epoch": 1.0741350906095553,
"grad_norm": 321.9674987792969,
"learning_rate": 3.574938574938575e-05,
"loss": 0.5809,
"step": 163
},
{
"epoch": 1.0807248764415156,
"grad_norm": 83.09851837158203,
"learning_rate": 3.562653562653563e-05,
"loss": 0.5576,
"step": 164
},
{
"epoch": 1.0873146622734762,
"grad_norm": 138.06068420410156,
"learning_rate": 3.550368550368551e-05,
"loss": 0.5051,
"step": 165
},
{
"epoch": 1.0939044481054365,
"grad_norm": 152.32656860351562,
"learning_rate": 3.538083538083538e-05,
"loss": 0.5291,
"step": 166
},
{
"epoch": 1.100494233937397,
"grad_norm": 327.38824462890625,
"learning_rate": 3.525798525798526e-05,
"loss": 0.6476,
"step": 167
},
{
"epoch": 1.1070840197693574,
"grad_norm": 121.95663452148438,
"learning_rate": 3.513513513513514e-05,
"loss": 0.5158,
"step": 168
},
{
"epoch": 1.113673805601318,
"grad_norm": 92.62237548828125,
"learning_rate": 3.501228501228501e-05,
"loss": 0.5702,
"step": 169
},
{
"epoch": 1.1202635914332784,
"grad_norm": 683.2556762695312,
"learning_rate": 3.488943488943489e-05,
"loss": 0.621,
"step": 170
},
{
"epoch": 1.126853377265239,
"grad_norm": 229.4330291748047,
"learning_rate": 3.476658476658477e-05,
"loss": 0.5686,
"step": 171
},
{
"epoch": 1.1334431630971993,
"grad_norm": 213.6857147216797,
"learning_rate": 3.4643734643734647e-05,
"loss": 0.5534,
"step": 172
},
{
"epoch": 1.1400329489291599,
"grad_norm": 150.42703247070312,
"learning_rate": 3.452088452088452e-05,
"loss": 0.4501,
"step": 173
},
{
"epoch": 1.1466227347611202,
"grad_norm": 405.1623840332031,
"learning_rate": 3.43980343980344e-05,
"loss": 0.4762,
"step": 174
},
{
"epoch": 1.1532125205930808,
"grad_norm": 213.4350128173828,
"learning_rate": 3.427518427518428e-05,
"loss": 0.5208,
"step": 175
},
{
"epoch": 1.1598023064250411,
"grad_norm": 527.8944702148438,
"learning_rate": 3.4152334152334154e-05,
"loss": 0.6235,
"step": 176
},
{
"epoch": 1.1663920922570017,
"grad_norm": 192.6360626220703,
"learning_rate": 3.402948402948403e-05,
"loss": 0.4193,
"step": 177
},
{
"epoch": 1.172981878088962,
"grad_norm": 203.14984130859375,
"learning_rate": 3.390663390663391e-05,
"loss": 0.6234,
"step": 178
},
{
"epoch": 1.1795716639209226,
"grad_norm": 206.81492614746094,
"learning_rate": 3.3783783783783784e-05,
"loss": 0.4987,
"step": 179
},
{
"epoch": 1.186161449752883,
"grad_norm": 243.38145446777344,
"learning_rate": 3.366093366093366e-05,
"loss": 0.5247,
"step": 180
},
{
"epoch": 1.1927512355848435,
"grad_norm": 161.61740112304688,
"learning_rate": 3.3538083538083545e-05,
"loss": 0.5509,
"step": 181
},
{
"epoch": 1.1993410214168039,
"grad_norm": 291.75469970703125,
"learning_rate": 3.3415233415233415e-05,
"loss": 0.5825,
"step": 182
},
{
"epoch": 1.2059308072487644,
"grad_norm": 133.4263458251953,
"learning_rate": 3.329238329238329e-05,
"loss": 0.6691,
"step": 183
},
{
"epoch": 1.2125205930807248,
"grad_norm": 1017.546875,
"learning_rate": 3.3169533169533175e-05,
"loss": 0.4931,
"step": 184
},
{
"epoch": 1.2191103789126854,
"grad_norm": 108.6457748413086,
"learning_rate": 3.3046683046683045e-05,
"loss": 0.5175,
"step": 185
},
{
"epoch": 1.2257001647446457,
"grad_norm": 146.6004638671875,
"learning_rate": 3.292383292383293e-05,
"loss": 0.4866,
"step": 186
},
{
"epoch": 1.2322899505766063,
"grad_norm": 178.45260620117188,
"learning_rate": 3.2800982800982806e-05,
"loss": 0.5496,
"step": 187
},
{
"epoch": 1.2388797364085666,
"grad_norm": 373.0599365234375,
"learning_rate": 3.2678132678132676e-05,
"loss": 0.5361,
"step": 188
},
{
"epoch": 1.2454695222405272,
"grad_norm": 146.2403106689453,
"learning_rate": 3.255528255528256e-05,
"loss": 0.5619,
"step": 189
},
{
"epoch": 1.2520593080724876,
"grad_norm": 49.14468002319336,
"learning_rate": 3.2432432432432436e-05,
"loss": 0.4598,
"step": 190
},
{
"epoch": 1.2586490939044481,
"grad_norm": 1165.1783447265625,
"learning_rate": 3.2309582309582306e-05,
"loss": 0.5666,
"step": 191
},
{
"epoch": 1.2652388797364087,
"grad_norm": 61.536949157714844,
"learning_rate": 3.218673218673219e-05,
"loss": 0.5534,
"step": 192
},
{
"epoch": 1.271828665568369,
"grad_norm": 595.9248046875,
"learning_rate": 3.206388206388207e-05,
"loss": 0.4841,
"step": 193
},
{
"epoch": 1.2784184514003294,
"grad_norm": 186.39930725097656,
"learning_rate": 3.1941031941031943e-05,
"loss": 0.4922,
"step": 194
},
{
"epoch": 1.28500823723229,
"grad_norm": 325.35980224609375,
"learning_rate": 3.181818181818182e-05,
"loss": 0.5436,
"step": 195
},
{
"epoch": 1.2915980230642505,
"grad_norm": 75.9457015991211,
"learning_rate": 3.16953316953317e-05,
"loss": 0.5036,
"step": 196
},
{
"epoch": 1.2981878088962109,
"grad_norm": 67.08493041992188,
"learning_rate": 3.1572481572481574e-05,
"loss": 0.5175,
"step": 197
},
{
"epoch": 1.3047775947281712,
"grad_norm": 74.11741638183594,
"learning_rate": 3.144963144963145e-05,
"loss": 0.518,
"step": 198
},
{
"epoch": 1.3113673805601318,
"grad_norm": 280.2093811035156,
"learning_rate": 3.132678132678133e-05,
"loss": 0.5865,
"step": 199
},
{
"epoch": 1.3179571663920924,
"grad_norm": 168.95388793945312,
"learning_rate": 3.120393120393121e-05,
"loss": 0.6006,
"step": 200
},
{
"epoch": 1.3245469522240527,
"grad_norm": 411.6920471191406,
"learning_rate": 3.108108108108108e-05,
"loss": 0.5912,
"step": 201
},
{
"epoch": 1.331136738056013,
"grad_norm": 208.7516632080078,
"learning_rate": 3.095823095823096e-05,
"loss": 0.4938,
"step": 202
},
{
"epoch": 1.3377265238879736,
"grad_norm": 49.95132827758789,
"learning_rate": 3.083538083538084e-05,
"loss": 0.5094,
"step": 203
},
{
"epoch": 1.3443163097199342,
"grad_norm": 73.55326080322266,
"learning_rate": 3.071253071253071e-05,
"loss": 0.584,
"step": 204
},
{
"epoch": 1.3509060955518946,
"grad_norm": 102.9446792602539,
"learning_rate": 3.058968058968059e-05,
"loss": 0.5473,
"step": 205
},
{
"epoch": 1.357495881383855,
"grad_norm": 457.49359130859375,
"learning_rate": 3.046683046683047e-05,
"loss": 0.6395,
"step": 206
},
{
"epoch": 1.3640856672158155,
"grad_norm": 236.29953002929688,
"learning_rate": 3.0343980343980342e-05,
"loss": 0.5853,
"step": 207
},
{
"epoch": 1.370675453047776,
"grad_norm": 39.58445358276367,
"learning_rate": 3.0221130221130222e-05,
"loss": 0.5631,
"step": 208
},
{
"epoch": 1.3772652388797364,
"grad_norm": 275.3215026855469,
"learning_rate": 3.0098280098280103e-05,
"loss": 0.5391,
"step": 209
},
{
"epoch": 1.3838550247116967,
"grad_norm": 231.88194274902344,
"learning_rate": 2.9975429975429976e-05,
"loss": 0.5002,
"step": 210
},
{
"epoch": 1.3904448105436573,
"grad_norm": 294.2489929199219,
"learning_rate": 2.9852579852579853e-05,
"loss": 0.5846,
"step": 211
},
{
"epoch": 1.3970345963756179,
"grad_norm": 203.10426330566406,
"learning_rate": 2.9729729729729733e-05,
"loss": 0.6386,
"step": 212
},
{
"epoch": 1.4036243822075782,
"grad_norm": 84.11065673828125,
"learning_rate": 2.9606879606879607e-05,
"loss": 0.4882,
"step": 213
},
{
"epoch": 1.4102141680395386,
"grad_norm": 220.28628540039062,
"learning_rate": 2.9484029484029483e-05,
"loss": 0.6756,
"step": 214
},
{
"epoch": 1.4168039538714992,
"grad_norm": 236.40895080566406,
"learning_rate": 2.9361179361179364e-05,
"loss": 0.5164,
"step": 215
},
{
"epoch": 1.4233937397034597,
"grad_norm": 229.29913330078125,
"learning_rate": 2.9238329238329237e-05,
"loss": 0.5916,
"step": 216
},
{
"epoch": 1.42998352553542,
"grad_norm": 137.1915740966797,
"learning_rate": 2.9115479115479117e-05,
"loss": 0.6021,
"step": 217
},
{
"epoch": 1.4365733113673804,
"grad_norm": 173.90122985839844,
"learning_rate": 2.8992628992628994e-05,
"loss": 0.6061,
"step": 218
},
{
"epoch": 1.443163097199341,
"grad_norm": 99.96955108642578,
"learning_rate": 2.8869778869778868e-05,
"loss": 0.5733,
"step": 219
},
{
"epoch": 1.4497528830313016,
"grad_norm": 56.602989196777344,
"learning_rate": 2.8746928746928748e-05,
"loss": 0.5788,
"step": 220
},
{
"epoch": 1.456342668863262,
"grad_norm": 69.04216003417969,
"learning_rate": 2.8624078624078625e-05,
"loss": 0.4894,
"step": 221
},
{
"epoch": 1.4629324546952225,
"grad_norm": 265.542724609375,
"learning_rate": 2.8501228501228505e-05,
"loss": 0.5877,
"step": 222
},
{
"epoch": 1.4695222405271828,
"grad_norm": 203.73353576660156,
"learning_rate": 2.8378378378378378e-05,
"loss": 0.5662,
"step": 223
},
{
"epoch": 1.4761120263591434,
"grad_norm": 174.82192993164062,
"learning_rate": 2.825552825552826e-05,
"loss": 0.567,
"step": 224
},
{
"epoch": 1.4827018121911038,
"grad_norm": 197.1634063720703,
"learning_rate": 2.8132678132678135e-05,
"loss": 0.5622,
"step": 225
},
{
"epoch": 1.4892915980230643,
"grad_norm": 313.1665954589844,
"learning_rate": 2.800982800982801e-05,
"loss": 0.5647,
"step": 226
},
{
"epoch": 1.4958813838550247,
"grad_norm": 337.4092102050781,
"learning_rate": 2.788697788697789e-05,
"loss": 0.5896,
"step": 227
},
{
"epoch": 1.5024711696869852,
"grad_norm": 335.1864318847656,
"learning_rate": 2.776412776412777e-05,
"loss": 0.5274,
"step": 228
},
{
"epoch": 1.5090609555189456,
"grad_norm": 149.53665161132812,
"learning_rate": 2.764127764127764e-05,
"loss": 0.6525,
"step": 229
},
{
"epoch": 1.515650741350906,
"grad_norm": 211.15191650390625,
"learning_rate": 2.751842751842752e-05,
"loss": 0.5824,
"step": 230
},
{
"epoch": 1.5222405271828665,
"grad_norm": 96.61034393310547,
"learning_rate": 2.73955773955774e-05,
"loss": 0.5774,
"step": 231
},
{
"epoch": 1.528830313014827,
"grad_norm": 158.73837280273438,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.5415,
"step": 232
},
{
"epoch": 1.5354200988467874,
"grad_norm": 50.02914810180664,
"learning_rate": 2.714987714987715e-05,
"loss": 0.5729,
"step": 233
},
{
"epoch": 1.5420098846787478,
"grad_norm": 57.91206359863281,
"learning_rate": 2.702702702702703e-05,
"loss": 0.6118,
"step": 234
},
{
"epoch": 1.5485996705107083,
"grad_norm": 134.28807067871094,
"learning_rate": 2.6904176904176904e-05,
"loss": 0.4875,
"step": 235
},
{
"epoch": 1.555189456342669,
"grad_norm": 111.96310424804688,
"learning_rate": 2.678132678132678e-05,
"loss": 0.5112,
"step": 236
},
{
"epoch": 1.5617792421746293,
"grad_norm": 210.6829376220703,
"learning_rate": 2.665847665847666e-05,
"loss": 0.5276,
"step": 237
},
{
"epoch": 1.5683690280065898,
"grad_norm": 160.88055419921875,
"learning_rate": 2.6535626535626534e-05,
"loss": 0.502,
"step": 238
},
{
"epoch": 1.5749588138385504,
"grad_norm": 230.14341735839844,
"learning_rate": 2.6412776412776414e-05,
"loss": 0.6196,
"step": 239
},
{
"epoch": 1.5815485996705108,
"grad_norm": 407.249267578125,
"learning_rate": 2.628992628992629e-05,
"loss": 0.4606,
"step": 240
},
{
"epoch": 1.588138385502471,
"grad_norm": 261.7560119628906,
"learning_rate": 2.616707616707617e-05,
"loss": 0.6383,
"step": 241
},
{
"epoch": 1.5947281713344317,
"grad_norm": 52.85226821899414,
"learning_rate": 2.6044226044226045e-05,
"loss": 0.5304,
"step": 242
},
{
"epoch": 1.6013179571663922,
"grad_norm": 123.97418975830078,
"learning_rate": 2.5921375921375925e-05,
"loss": 0.5707,
"step": 243
},
{
"epoch": 1.6079077429983526,
"grad_norm": 230.69007873535156,
"learning_rate": 2.5798525798525802e-05,
"loss": 0.4895,
"step": 244
},
{
"epoch": 1.614497528830313,
"grad_norm": 228.80450439453125,
"learning_rate": 2.5675675675675675e-05,
"loss": 0.6058,
"step": 245
},
{
"epoch": 1.6210873146622735,
"grad_norm": 94.60694122314453,
"learning_rate": 2.5552825552825555e-05,
"loss": 0.5854,
"step": 246
},
{
"epoch": 1.627677100494234,
"grad_norm": 170.16766357421875,
"learning_rate": 2.5429975429975432e-05,
"loss": 0.587,
"step": 247
},
{
"epoch": 1.6342668863261944,
"grad_norm": 374.7227783203125,
"learning_rate": 2.5307125307125306e-05,
"loss": 0.468,
"step": 248
},
{
"epoch": 1.6408566721581548,
"grad_norm": 304.4844665527344,
"learning_rate": 2.5184275184275186e-05,
"loss": 0.4362,
"step": 249
},
{
"epoch": 1.6474464579901154,
"grad_norm": 555.0403442382812,
"learning_rate": 2.5061425061425066e-05,
"loss": 0.4977,
"step": 250
},
{
"epoch": 1.654036243822076,
"grad_norm": 282.910888671875,
"learning_rate": 2.493857493857494e-05,
"loss": 0.6255,
"step": 251
},
{
"epoch": 1.6606260296540363,
"grad_norm": 141.1566925048828,
"learning_rate": 2.4815724815724816e-05,
"loss": 0.5125,
"step": 252
},
{
"epoch": 1.6672158154859966,
"grad_norm": 141.13299560546875,
"learning_rate": 2.4692874692874693e-05,
"loss": 0.563,
"step": 253
},
{
"epoch": 1.6738056013179572,
"grad_norm": 167.4251708984375,
"learning_rate": 2.4570024570024573e-05,
"loss": 0.4985,
"step": 254
},
{
"epoch": 1.6803953871499178,
"grad_norm": 214.25567626953125,
"learning_rate": 2.4447174447174447e-05,
"loss": 0.5863,
"step": 255
},
{
"epoch": 1.6869851729818781,
"grad_norm": 183.01986694335938,
"learning_rate": 2.4324324324324327e-05,
"loss": 0.4937,
"step": 256
},
{
"epoch": 1.6935749588138385,
"grad_norm": 131.54083251953125,
"learning_rate": 2.4201474201474204e-05,
"loss": 0.6094,
"step": 257
},
{
"epoch": 1.700164744645799,
"grad_norm": 443.8067626953125,
"learning_rate": 2.4078624078624077e-05,
"loss": 0.4898,
"step": 258
},
{
"epoch": 1.7067545304777596,
"grad_norm": 133.2246551513672,
"learning_rate": 2.3955773955773958e-05,
"loss": 0.4859,
"step": 259
},
{
"epoch": 1.71334431630972,
"grad_norm": 259.9535217285156,
"learning_rate": 2.3832923832923834e-05,
"loss": 0.5411,
"step": 260
},
{
"epoch": 1.7199341021416803,
"grad_norm": 380.90997314453125,
"learning_rate": 2.371007371007371e-05,
"loss": 0.5762,
"step": 261
},
{
"epoch": 1.7265238879736409,
"grad_norm": 423.51702880859375,
"learning_rate": 2.3587223587223588e-05,
"loss": 0.5791,
"step": 262
},
{
"epoch": 1.7331136738056014,
"grad_norm": 256.31378173828125,
"learning_rate": 2.3464373464373465e-05,
"loss": 0.5474,
"step": 263
},
{
"epoch": 1.7397034596375618,
"grad_norm": 352.33868408203125,
"learning_rate": 2.3341523341523342e-05,
"loss": 0.5265,
"step": 264
},
{
"epoch": 1.7462932454695221,
"grad_norm": 378.3638000488281,
"learning_rate": 2.3218673218673222e-05,
"loss": 0.5491,
"step": 265
},
{
"epoch": 1.7528830313014827,
"grad_norm": 209.05747985839844,
"learning_rate": 2.3095823095823095e-05,
"loss": 0.5141,
"step": 266
},
{
"epoch": 1.7594728171334433,
"grad_norm": 141.59524536132812,
"learning_rate": 2.2972972972972976e-05,
"loss": 0.6506,
"step": 267
},
{
"epoch": 1.7660626029654036,
"grad_norm": 219.4475555419922,
"learning_rate": 2.2850122850122852e-05,
"loss": 0.7009,
"step": 268
},
{
"epoch": 1.772652388797364,
"grad_norm": 80.54459381103516,
"learning_rate": 2.272727272727273e-05,
"loss": 0.5327,
"step": 269
},
{
"epoch": 1.7792421746293245,
"grad_norm": 699.1453247070312,
"learning_rate": 2.2604422604422606e-05,
"loss": 0.5236,
"step": 270
},
{
"epoch": 1.7858319604612851,
"grad_norm": 289.4430236816406,
"learning_rate": 2.2481572481572483e-05,
"loss": 0.4824,
"step": 271
},
{
"epoch": 1.7924217462932455,
"grad_norm": 182.82986450195312,
"learning_rate": 2.235872235872236e-05,
"loss": 0.5676,
"step": 272
},
{
"epoch": 1.7990115321252058,
"grad_norm": 225.81126403808594,
"learning_rate": 2.2235872235872237e-05,
"loss": 0.5194,
"step": 273
},
{
"epoch": 1.8056013179571664,
"grad_norm": 304.61309814453125,
"learning_rate": 2.2113022113022113e-05,
"loss": 0.626,
"step": 274
},
{
"epoch": 1.812191103789127,
"grad_norm": 154.47415161132812,
"learning_rate": 2.199017199017199e-05,
"loss": 0.5235,
"step": 275
},
{
"epoch": 1.8187808896210873,
"grad_norm": 530.3298950195312,
"learning_rate": 2.186732186732187e-05,
"loss": 0.466,
"step": 276
},
{
"epoch": 1.8253706754530477,
"grad_norm": 61.56108093261719,
"learning_rate": 2.1744471744471744e-05,
"loss": 0.4229,
"step": 277
},
{
"epoch": 1.8319604612850082,
"grad_norm": 427.62469482421875,
"learning_rate": 2.1621621621621624e-05,
"loss": 0.644,
"step": 278
},
{
"epoch": 1.8385502471169688,
"grad_norm": 95.06147003173828,
"learning_rate": 2.14987714987715e-05,
"loss": 0.5268,
"step": 279
},
{
"epoch": 1.8451400329489291,
"grad_norm": 85.69621276855469,
"learning_rate": 2.1375921375921378e-05,
"loss": 0.4855,
"step": 280
},
{
"epoch": 1.8517298187808895,
"grad_norm": 526.4759521484375,
"learning_rate": 2.1253071253071255e-05,
"loss": 0.5573,
"step": 281
},
{
"epoch": 1.85831960461285,
"grad_norm": 265.5906677246094,
"learning_rate": 2.113022113022113e-05,
"loss": 0.555,
"step": 282
},
{
"epoch": 1.8649093904448106,
"grad_norm": 414.3144226074219,
"learning_rate": 2.1007371007371008e-05,
"loss": 0.5584,
"step": 283
},
{
"epoch": 1.871499176276771,
"grad_norm": 304.21405029296875,
"learning_rate": 2.0884520884520885e-05,
"loss": 0.4671,
"step": 284
},
{
"epoch": 1.8780889621087313,
"grad_norm": 414.1387023925781,
"learning_rate": 2.0761670761670762e-05,
"loss": 0.6691,
"step": 285
},
{
"epoch": 1.884678747940692,
"grad_norm": 208.69493103027344,
"learning_rate": 2.063882063882064e-05,
"loss": 0.6271,
"step": 286
},
{
"epoch": 1.8912685337726525,
"grad_norm": 430.6809387207031,
"learning_rate": 2.051597051597052e-05,
"loss": 0.5414,
"step": 287
},
{
"epoch": 1.8978583196046128,
"grad_norm": 115.23016357421875,
"learning_rate": 2.0393120393120392e-05,
"loss": 0.5869,
"step": 288
},
{
"epoch": 1.9044481054365732,
"grad_norm": 242.86927795410156,
"learning_rate": 2.0270270270270273e-05,
"loss": 0.5224,
"step": 289
},
{
"epoch": 1.9110378912685337,
"grad_norm": 250.8336944580078,
"learning_rate": 2.014742014742015e-05,
"loss": 0.6022,
"step": 290
},
{
"epoch": 1.9176276771004943,
"grad_norm": 104.50414276123047,
"learning_rate": 2.0024570024570026e-05,
"loss": 0.6064,
"step": 291
},
{
"epoch": 1.9242174629324547,
"grad_norm": 466.93768310546875,
"learning_rate": 1.9901719901719903e-05,
"loss": 0.6291,
"step": 292
},
{
"epoch": 1.930807248764415,
"grad_norm": 138.8919219970703,
"learning_rate": 1.977886977886978e-05,
"loss": 0.514,
"step": 293
},
{
"epoch": 1.9373970345963756,
"grad_norm": 532.3485717773438,
"learning_rate": 1.9656019656019657e-05,
"loss": 0.4918,
"step": 294
},
{
"epoch": 1.9439868204283361,
"grad_norm": 116.6861572265625,
"learning_rate": 1.9533169533169534e-05,
"loss": 0.7242,
"step": 295
},
{
"epoch": 1.9505766062602965,
"grad_norm": 338.7384338378906,
"learning_rate": 1.941031941031941e-05,
"loss": 0.5006,
"step": 296
},
{
"epoch": 1.9571663920922568,
"grad_norm": 57.919403076171875,
"learning_rate": 1.928746928746929e-05,
"loss": 0.5343,
"step": 297
},
{
"epoch": 1.9637561779242174,
"grad_norm": 300.79095458984375,
"learning_rate": 1.9164619164619167e-05,
"loss": 0.4868,
"step": 298
},
{
"epoch": 1.970345963756178,
"grad_norm": 81.19691467285156,
"learning_rate": 1.904176904176904e-05,
"loss": 0.4897,
"step": 299
},
{
"epoch": 1.9769357495881383,
"grad_norm": 159.11351013183594,
"learning_rate": 1.891891891891892e-05,
"loss": 0.5667,
"step": 300
},
{
"epoch": 1.9835255354200987,
"grad_norm": 80.84410095214844,
"learning_rate": 1.8796068796068798e-05,
"loss": 0.5623,
"step": 301
},
{
"epoch": 1.9901153212520593,
"grad_norm": 185.26185607910156,
"learning_rate": 1.8673218673218675e-05,
"loss": 0.5102,
"step": 302
},
{
"epoch": 1.9967051070840198,
"grad_norm": 110.74467468261719,
"learning_rate": 1.855036855036855e-05,
"loss": 0.4176,
"step": 303
},
{
"epoch": 2.00329489291598,
"grad_norm": 175.4639892578125,
"learning_rate": 1.842751842751843e-05,
"loss": 0.4222,
"step": 304
},
{
"epoch": 2.0098846787479405,
"grad_norm": 123.6357192993164,
"learning_rate": 1.8304668304668305e-05,
"loss": 0.3096,
"step": 305
},
{
"epoch": 2.0164744645799013,
"grad_norm": 237.6382598876953,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.4244,
"step": 306
},
{
"epoch": 2.0230642504118617,
"grad_norm": 303.2618103027344,
"learning_rate": 1.805896805896806e-05,
"loss": 0.2701,
"step": 307
},
{
"epoch": 2.029654036243822,
"grad_norm": 338.3935241699219,
"learning_rate": 1.793611793611794e-05,
"loss": 0.3519,
"step": 308
},
{
"epoch": 2.0362438220757824,
"grad_norm": 246.98533630371094,
"learning_rate": 1.7813267813267816e-05,
"loss": 0.3403,
"step": 309
},
{
"epoch": 2.042833607907743,
"grad_norm": 376.6452941894531,
"learning_rate": 1.769041769041769e-05,
"loss": 0.336,
"step": 310
},
{
"epoch": 2.0494233937397035,
"grad_norm": 134.3882293701172,
"learning_rate": 1.756756756756757e-05,
"loss": 0.311,
"step": 311
},
{
"epoch": 2.056013179571664,
"grad_norm": 45.66189956665039,
"learning_rate": 1.7444717444717446e-05,
"loss": 0.3463,
"step": 312
},
{
"epoch": 2.062602965403624,
"grad_norm": 251.7926788330078,
"learning_rate": 1.7321867321867323e-05,
"loss": 0.2922,
"step": 313
},
{
"epoch": 2.069192751235585,
"grad_norm": 86.86029815673828,
"learning_rate": 1.71990171990172e-05,
"loss": 0.2848,
"step": 314
},
{
"epoch": 2.0757825370675453,
"grad_norm": 72.99238586425781,
"learning_rate": 1.7076167076167077e-05,
"loss": 0.2699,
"step": 315
},
{
"epoch": 2.0823723228995057,
"grad_norm": 348.0635681152344,
"learning_rate": 1.6953316953316954e-05,
"loss": 0.2766,
"step": 316
},
{
"epoch": 2.088962108731466,
"grad_norm": 307.6921691894531,
"learning_rate": 1.683046683046683e-05,
"loss": 0.3256,
"step": 317
},
{
"epoch": 2.095551894563427,
"grad_norm": 92.43419647216797,
"learning_rate": 1.6707616707616707e-05,
"loss": 0.312,
"step": 318
},
{
"epoch": 2.102141680395387,
"grad_norm": 365.3904113769531,
"learning_rate": 1.6584766584766588e-05,
"loss": 0.3238,
"step": 319
},
{
"epoch": 2.1087314662273475,
"grad_norm": 243.0485076904297,
"learning_rate": 1.6461916461916464e-05,
"loss": 0.3406,
"step": 320
},
{
"epoch": 2.115321252059308,
"grad_norm": 70.53246307373047,
"learning_rate": 1.6339066339066338e-05,
"loss": 0.2882,
"step": 321
},
{
"epoch": 2.1219110378912687,
"grad_norm": 271.1737060546875,
"learning_rate": 1.6216216216216218e-05,
"loss": 0.318,
"step": 322
},
{
"epoch": 2.128500823723229,
"grad_norm": 944.7637329101562,
"learning_rate": 1.6093366093366095e-05,
"loss": 0.3186,
"step": 323
},
{
"epoch": 2.1350906095551894,
"grad_norm": 286.0992736816406,
"learning_rate": 1.5970515970515972e-05,
"loss": 0.3693,
"step": 324
},
{
"epoch": 2.1416803953871497,
"grad_norm": 331.73931884765625,
"learning_rate": 1.584766584766585e-05,
"loss": 0.3566,
"step": 325
},
{
"epoch": 2.1482701812191105,
"grad_norm": 68.8238754272461,
"learning_rate": 1.5724815724815725e-05,
"loss": 0.3458,
"step": 326
},
{
"epoch": 2.154859967051071,
"grad_norm": 117.55406188964844,
"learning_rate": 1.5601965601965606e-05,
"loss": 0.2875,
"step": 327
},
{
"epoch": 2.161449752883031,
"grad_norm": 137.7025909423828,
"learning_rate": 1.547911547911548e-05,
"loss": 0.2866,
"step": 328
},
{
"epoch": 2.168039538714992,
"grad_norm": 49.644142150878906,
"learning_rate": 1.5356265356265356e-05,
"loss": 0.2936,
"step": 329
},
{
"epoch": 2.1746293245469523,
"grad_norm": 228.79408264160156,
"learning_rate": 1.5233415233415234e-05,
"loss": 0.3444,
"step": 330
},
{
"epoch": 2.1812191103789127,
"grad_norm": 197.12803649902344,
"learning_rate": 1.5110565110565111e-05,
"loss": 0.38,
"step": 331
},
{
"epoch": 2.187808896210873,
"grad_norm": 239.77589416503906,
"learning_rate": 1.4987714987714988e-05,
"loss": 0.3348,
"step": 332
},
{
"epoch": 2.1943986820428334,
"grad_norm": 52.3128547668457,
"learning_rate": 1.4864864864864867e-05,
"loss": 0.3723,
"step": 333
},
{
"epoch": 2.200988467874794,
"grad_norm": 136.6421661376953,
"learning_rate": 1.4742014742014742e-05,
"loss": 0.2879,
"step": 334
},
{
"epoch": 2.2075782537067545,
"grad_norm": 104.56753540039062,
"learning_rate": 1.4619164619164619e-05,
"loss": 0.2591,
"step": 335
},
{
"epoch": 2.214168039538715,
"grad_norm": 98.5406265258789,
"learning_rate": 1.4496314496314497e-05,
"loss": 0.3729,
"step": 336
},
{
"epoch": 2.2207578253706757,
"grad_norm": 200.8502960205078,
"learning_rate": 1.4373464373464374e-05,
"loss": 0.3363,
"step": 337
},
{
"epoch": 2.227347611202636,
"grad_norm": 66.05599212646484,
"learning_rate": 1.4250614250614252e-05,
"loss": 0.3238,
"step": 338
},
{
"epoch": 2.2339373970345964,
"grad_norm": 229.73007202148438,
"learning_rate": 1.412776412776413e-05,
"loss": 0.3587,
"step": 339
},
{
"epoch": 2.2405271828665567,
"grad_norm": 117.70530700683594,
"learning_rate": 1.4004914004914004e-05,
"loss": 0.4228,
"step": 340
},
{
"epoch": 2.247116968698517,
"grad_norm": 132.34347534179688,
"learning_rate": 1.3882063882063885e-05,
"loss": 0.2763,
"step": 341
},
{
"epoch": 2.253706754530478,
"grad_norm": 133.9874267578125,
"learning_rate": 1.375921375921376e-05,
"loss": 0.2553,
"step": 342
},
{
"epoch": 2.260296540362438,
"grad_norm": 106.14327239990234,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.3593,
"step": 343
},
{
"epoch": 2.2668863261943986,
"grad_norm": 188.0239715576172,
"learning_rate": 1.3513513513513515e-05,
"loss": 0.2971,
"step": 344
},
{
"epoch": 2.2734761120263594,
"grad_norm": 255.7284698486328,
"learning_rate": 1.339066339066339e-05,
"loss": 0.3132,
"step": 345
},
{
"epoch": 2.2800658978583197,
"grad_norm": 400.6213073730469,
"learning_rate": 1.3267813267813267e-05,
"loss": 0.3044,
"step": 346
},
{
"epoch": 2.28665568369028,
"grad_norm": 104.66104888916016,
"learning_rate": 1.3144963144963146e-05,
"loss": 0.3143,
"step": 347
},
{
"epoch": 2.2932454695222404,
"grad_norm": 49.61936569213867,
"learning_rate": 1.3022113022113022e-05,
"loss": 0.3229,
"step": 348
},
{
"epoch": 2.2998352553542007,
"grad_norm": 349.636962890625,
"learning_rate": 1.2899262899262901e-05,
"loss": 0.3035,
"step": 349
},
{
"epoch": 2.3064250411861615,
"grad_norm": 284.7281494140625,
"learning_rate": 1.2776412776412778e-05,
"loss": 0.3167,
"step": 350
},
{
"epoch": 2.313014827018122,
"grad_norm": 49.98039245605469,
"learning_rate": 1.2653562653562653e-05,
"loss": 0.278,
"step": 351
},
{
"epoch": 2.3196046128500822,
"grad_norm": 67.77862548828125,
"learning_rate": 1.2530712530712533e-05,
"loss": 0.3302,
"step": 352
},
{
"epoch": 2.326194398682043,
"grad_norm": 118.573486328125,
"learning_rate": 1.2407862407862408e-05,
"loss": 0.3006,
"step": 353
},
{
"epoch": 2.3327841845140034,
"grad_norm": 33.04819107055664,
"learning_rate": 1.2285012285012287e-05,
"loss": 0.3596,
"step": 354
},
{
"epoch": 2.3393739703459637,
"grad_norm": 161.4209747314453,
"learning_rate": 1.2162162162162164e-05,
"loss": 0.3984,
"step": 355
},
{
"epoch": 2.345963756177924,
"grad_norm": 44.77053451538086,
"learning_rate": 1.2039312039312039e-05,
"loss": 0.4111,
"step": 356
},
{
"epoch": 2.352553542009885,
"grad_norm": 343.490966796875,
"learning_rate": 1.1916461916461917e-05,
"loss": 0.3359,
"step": 357
},
{
"epoch": 2.359143327841845,
"grad_norm": 864.7401733398438,
"learning_rate": 1.1793611793611794e-05,
"loss": 0.2895,
"step": 358
},
{
"epoch": 2.3657331136738056,
"grad_norm": 148.0756378173828,
"learning_rate": 1.1670761670761671e-05,
"loss": 0.335,
"step": 359
},
{
"epoch": 2.372322899505766,
"grad_norm": 78.04084777832031,
"learning_rate": 1.1547911547911548e-05,
"loss": 0.2999,
"step": 360
},
{
"epoch": 2.3789126853377267,
"grad_norm": 77.24546813964844,
"learning_rate": 1.1425061425061426e-05,
"loss": 0.3043,
"step": 361
},
{
"epoch": 2.385502471169687,
"grad_norm": 60.59427261352539,
"learning_rate": 1.1302211302211303e-05,
"loss": 0.3838,
"step": 362
},
{
"epoch": 2.3920922570016474,
"grad_norm": 208.8542938232422,
"learning_rate": 1.117936117936118e-05,
"loss": 0.2956,
"step": 363
},
{
"epoch": 2.3986820428336078,
"grad_norm": 403.25823974609375,
"learning_rate": 1.1056511056511057e-05,
"loss": 0.2976,
"step": 364
},
{
"epoch": 2.4052718286655685,
"grad_norm": 152.59671020507812,
"learning_rate": 1.0933660933660935e-05,
"loss": 0.3533,
"step": 365
},
{
"epoch": 2.411861614497529,
"grad_norm": 533.1868286132812,
"learning_rate": 1.0810810810810812e-05,
"loss": 0.3418,
"step": 366
},
{
"epoch": 2.4184514003294892,
"grad_norm": 191.50588989257812,
"learning_rate": 1.0687960687960689e-05,
"loss": 0.3042,
"step": 367
},
{
"epoch": 2.4250411861614496,
"grad_norm": 81.26240539550781,
"learning_rate": 1.0565110565110566e-05,
"loss": 0.3436,
"step": 368
},
{
"epoch": 2.4316309719934104,
"grad_norm": 281.524169921875,
"learning_rate": 1.0442260442260443e-05,
"loss": 0.3453,
"step": 369
},
{
"epoch": 2.4382207578253707,
"grad_norm": 221.48391723632812,
"learning_rate": 1.031941031941032e-05,
"loss": 0.3288,
"step": 370
},
{
"epoch": 2.444810543657331,
"grad_norm": 89.54031372070312,
"learning_rate": 1.0196560196560196e-05,
"loss": 0.3415,
"step": 371
},
{
"epoch": 2.4514003294892914,
"grad_norm": 102.45128631591797,
"learning_rate": 1.0073710073710075e-05,
"loss": 0.3264,
"step": 372
},
{
"epoch": 2.4579901153212522,
"grad_norm": 93.46699523925781,
"learning_rate": 9.950859950859952e-06,
"loss": 0.347,
"step": 373
},
{
"epoch": 2.4645799011532126,
"grad_norm": 288.0792541503906,
"learning_rate": 9.828009828009828e-06,
"loss": 0.3834,
"step": 374
},
{
"epoch": 2.471169686985173,
"grad_norm": 119.23399353027344,
"learning_rate": 9.705159705159705e-06,
"loss": 0.2886,
"step": 375
},
{
"epoch": 2.4777594728171333,
"grad_norm": 1750.0697021484375,
"learning_rate": 9.582309582309584e-06,
"loss": 0.339,
"step": 376
},
{
"epoch": 2.484349258649094,
"grad_norm": 188.45376586914062,
"learning_rate": 9.45945945945946e-06,
"loss": 0.2951,
"step": 377
},
{
"epoch": 2.4909390444810544,
"grad_norm": 68.5443115234375,
"learning_rate": 9.336609336609337e-06,
"loss": 0.3369,
"step": 378
},
{
"epoch": 2.4975288303130148,
"grad_norm": 43.438167572021484,
"learning_rate": 9.213759213759214e-06,
"loss": 0.2915,
"step": 379
},
{
"epoch": 2.504118616144975,
"grad_norm": 70.3156967163086,
"learning_rate": 9.090909090909091e-06,
"loss": 0.3346,
"step": 380
},
{
"epoch": 2.510708401976936,
"grad_norm": 150.3292694091797,
"learning_rate": 8.96805896805897e-06,
"loss": 0.4153,
"step": 381
},
{
"epoch": 2.5172981878088962,
"grad_norm": 57.9390983581543,
"learning_rate": 8.845208845208845e-06,
"loss": 0.3337,
"step": 382
},
{
"epoch": 2.5238879736408566,
"grad_norm": 238.041259765625,
"learning_rate": 8.722358722358723e-06,
"loss": 0.3022,
"step": 383
},
{
"epoch": 2.5304777594728174,
"grad_norm": 181.7864227294922,
"learning_rate": 8.5995085995086e-06,
"loss": 0.3024,
"step": 384
},
{
"epoch": 2.5370675453047777,
"grad_norm": 170.37905883789062,
"learning_rate": 8.476658476658477e-06,
"loss": 0.3579,
"step": 385
},
{
"epoch": 2.543657331136738,
"grad_norm": 36.57583999633789,
"learning_rate": 8.353808353808354e-06,
"loss": 0.3481,
"step": 386
},
{
"epoch": 2.5502471169686984,
"grad_norm": 66.17654418945312,
"learning_rate": 8.230958230958232e-06,
"loss": 0.2793,
"step": 387
},
{
"epoch": 2.556836902800659,
"grad_norm": 156.1625518798828,
"learning_rate": 8.108108108108109e-06,
"loss": 0.3333,
"step": 388
},
{
"epoch": 2.5634266886326196,
"grad_norm": 79.08843994140625,
"learning_rate": 7.985257985257986e-06,
"loss": 0.3328,
"step": 389
},
{
"epoch": 2.57001647446458,
"grad_norm": 37.87118148803711,
"learning_rate": 7.862407862407863e-06,
"loss": 0.3145,
"step": 390
},
{
"epoch": 2.5766062602965403,
"grad_norm": 135.13316345214844,
"learning_rate": 7.73955773955774e-06,
"loss": 0.2399,
"step": 391
},
{
"epoch": 2.583196046128501,
"grad_norm": 150.52328491210938,
"learning_rate": 7.616707616707617e-06,
"loss": 0.2809,
"step": 392
},
{
"epoch": 2.5897858319604614,
"grad_norm": 76.37353515625,
"learning_rate": 7.493857493857494e-06,
"loss": 0.3387,
"step": 393
},
{
"epoch": 2.5963756177924218,
"grad_norm": 57.05943298339844,
"learning_rate": 7.371007371007371e-06,
"loss": 0.2754,
"step": 394
},
{
"epoch": 2.602965403624382,
"grad_norm": 92.42174530029297,
"learning_rate": 7.2481572481572485e-06,
"loss": 0.2883,
"step": 395
},
{
"epoch": 2.6095551894563425,
"grad_norm": 187.5482177734375,
"learning_rate": 7.125307125307126e-06,
"loss": 0.3647,
"step": 396
},
{
"epoch": 2.6161449752883033,
"grad_norm": 183.51123046875,
"learning_rate": 7.002457002457002e-06,
"loss": 0.294,
"step": 397
},
{
"epoch": 2.6227347611202636,
"grad_norm": 100.82892608642578,
"learning_rate": 6.87960687960688e-06,
"loss": 0.3716,
"step": 398
},
{
"epoch": 2.629324546952224,
"grad_norm": 328.86724853515625,
"learning_rate": 6.7567567567567575e-06,
"loss": 0.2346,
"step": 399
},
{
"epoch": 2.6359143327841847,
"grad_norm": 414.6925964355469,
"learning_rate": 6.6339066339066335e-06,
"loss": 0.3206,
"step": 400
},
{
"epoch": 2.642504118616145,
"grad_norm": 321.9985656738281,
"learning_rate": 6.511056511056511e-06,
"loss": 0.2855,
"step": 401
},
{
"epoch": 2.6490939044481054,
"grad_norm": 150.3809814453125,
"learning_rate": 6.388206388206389e-06,
"loss": 0.3489,
"step": 402
},
{
"epoch": 2.655683690280066,
"grad_norm": 139.02951049804688,
"learning_rate": 6.2653562653562665e-06,
"loss": 0.2419,
"step": 403
},
{
"epoch": 2.662273476112026,
"grad_norm": 151.592529296875,
"learning_rate": 6.142506142506143e-06,
"loss": 0.3109,
"step": 404
},
{
"epoch": 2.668863261943987,
"grad_norm": 47.01968765258789,
"learning_rate": 6.019656019656019e-06,
"loss": 0.2973,
"step": 405
},
{
"epoch": 2.6754530477759473,
"grad_norm": 300.16302490234375,
"learning_rate": 5.896805896805897e-06,
"loss": 0.3582,
"step": 406
},
{
"epoch": 2.6820428336079076,
"grad_norm": 380.8616027832031,
"learning_rate": 5.773955773955774e-06,
"loss": 0.2839,
"step": 407
},
{
"epoch": 2.6886326194398684,
"grad_norm": 119.63872528076172,
"learning_rate": 5.6511056511056515e-06,
"loss": 0.2836,
"step": 408
},
{
"epoch": 2.6952224052718288,
"grad_norm": 97.77668762207031,
"learning_rate": 5.528255528255528e-06,
"loss": 0.2978,
"step": 409
},
{
"epoch": 2.701812191103789,
"grad_norm": 32.76837921142578,
"learning_rate": 5.405405405405406e-06,
"loss": 0.3184,
"step": 410
},
{
"epoch": 2.7084019769357495,
"grad_norm": 148.2511749267578,
"learning_rate": 5.282555282555283e-06,
"loss": 0.3299,
"step": 411
},
{
"epoch": 2.71499176276771,
"grad_norm": 78.83771514892578,
"learning_rate": 5.15970515970516e-06,
"loss": 0.3379,
"step": 412
},
{
"epoch": 2.7215815485996706,
"grad_norm": 43.07817459106445,
"learning_rate": 5.036855036855037e-06,
"loss": 0.3325,
"step": 413
},
{
"epoch": 2.728171334431631,
"grad_norm": 34.94389343261719,
"learning_rate": 4.914004914004914e-06,
"loss": 0.2558,
"step": 414
},
{
"epoch": 2.7347611202635913,
"grad_norm": 241.07553100585938,
"learning_rate": 4.791154791154792e-06,
"loss": 0.3095,
"step": 415
},
{
"epoch": 2.741350906095552,
"grad_norm": 173.70211791992188,
"learning_rate": 4.668304668304669e-06,
"loss": 0.3556,
"step": 416
},
{
"epoch": 2.7479406919275124,
"grad_norm": 131.01210021972656,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.3794,
"step": 417
},
{
"epoch": 2.754530477759473,
"grad_norm": 80.76969146728516,
"learning_rate": 4.422604422604422e-06,
"loss": 0.2821,
"step": 418
},
{
"epoch": 2.761120263591433,
"grad_norm": 473.2821960449219,
"learning_rate": 4.2997542997543e-06,
"loss": 0.3323,
"step": 419
},
{
"epoch": 2.7677100494233935,
"grad_norm": 97.82337951660156,
"learning_rate": 4.176904176904177e-06,
"loss": 0.2755,
"step": 420
},
{
"epoch": 2.7742998352553543,
"grad_norm": 316.9011535644531,
"learning_rate": 4.0540540540540545e-06,
"loss": 0.2792,
"step": 421
},
{
"epoch": 2.7808896210873146,
"grad_norm": 260.79034423828125,
"learning_rate": 3.931203931203931e-06,
"loss": 0.3694,
"step": 422
},
{
"epoch": 2.787479406919275,
"grad_norm": 62.82155990600586,
"learning_rate": 3.8083538083538086e-06,
"loss": 0.2944,
"step": 423
},
{
"epoch": 2.7940691927512358,
"grad_norm": 157.30381774902344,
"learning_rate": 3.6855036855036854e-06,
"loss": 0.2965,
"step": 424
},
{
"epoch": 2.800658978583196,
"grad_norm": 170.7925567626953,
"learning_rate": 3.562653562653563e-06,
"loss": 0.3216,
"step": 425
},
{
"epoch": 2.8072487644151565,
"grad_norm": 78.40022277832031,
"learning_rate": 3.43980343980344e-06,
"loss": 0.3452,
"step": 426
},
{
"epoch": 2.813838550247117,
"grad_norm": 89.03524017333984,
"learning_rate": 3.3169533169533168e-06,
"loss": 0.2611,
"step": 427
},
{
"epoch": 2.820428336079077,
"grad_norm": 38.65609359741211,
"learning_rate": 3.1941031941031944e-06,
"loss": 0.3284,
"step": 428
},
{
"epoch": 2.827018121911038,
"grad_norm": 608.3867797851562,
"learning_rate": 3.0712530712530717e-06,
"loss": 0.3454,
"step": 429
},
{
"epoch": 2.8336079077429983,
"grad_norm": 76.90625762939453,
"learning_rate": 2.9484029484029485e-06,
"loss": 0.2576,
"step": 430
},
{
"epoch": 2.8401976935749587,
"grad_norm": 53.53964614868164,
"learning_rate": 2.8255528255528258e-06,
"loss": 0.3003,
"step": 431
},
{
"epoch": 2.8467874794069195,
"grad_norm": 70.24076843261719,
"learning_rate": 2.702702702702703e-06,
"loss": 0.3142,
"step": 432
},
{
"epoch": 2.85337726523888,
"grad_norm": 33.27001190185547,
"learning_rate": 2.57985257985258e-06,
"loss": 0.3152,
"step": 433
},
{
"epoch": 2.85996705107084,
"grad_norm": 163.56005859375,
"learning_rate": 2.457002457002457e-06,
"loss": 0.2981,
"step": 434
},
{
"epoch": 2.8665568369028005,
"grad_norm": 126.6738510131836,
"learning_rate": 2.3341523341523343e-06,
"loss": 0.3119,
"step": 435
},
{
"epoch": 2.873146622734761,
"grad_norm": 39.26594924926758,
"learning_rate": 2.211302211302211e-06,
"loss": 0.3173,
"step": 436
},
{
"epoch": 2.8797364085667216,
"grad_norm": 106.17019653320312,
"learning_rate": 2.0884520884520884e-06,
"loss": 0.342,
"step": 437
},
{
"epoch": 2.886326194398682,
"grad_norm": 119.90926361083984,
"learning_rate": 1.9656019656019657e-06,
"loss": 0.2775,
"step": 438
},
{
"epoch": 2.892915980230643,
"grad_norm": 278.5794677734375,
"learning_rate": 1.8427518427518427e-06,
"loss": 0.2704,
"step": 439
},
{
"epoch": 2.899505766062603,
"grad_norm": 95.96866607666016,
"learning_rate": 1.71990171990172e-06,
"loss": 0.2633,
"step": 440
},
{
"epoch": 2.9060955518945635,
"grad_norm": 94.9200210571289,
"learning_rate": 1.5970515970515972e-06,
"loss": 0.2662,
"step": 441
},
{
"epoch": 2.912685337726524,
"grad_norm": 331.02508544921875,
"learning_rate": 1.4742014742014743e-06,
"loss": 0.2685,
"step": 442
},
{
"epoch": 2.919275123558484,
"grad_norm": 474.9095764160156,
"learning_rate": 1.3513513513513515e-06,
"loss": 0.2664,
"step": 443
},
{
"epoch": 2.925864909390445,
"grad_norm": 146.21035766601562,
"learning_rate": 1.2285012285012285e-06,
"loss": 0.2524,
"step": 444
},
{
"epoch": 2.9324546952224053,
"grad_norm": 67.15187072753906,
"learning_rate": 1.1056511056511056e-06,
"loss": 0.2769,
"step": 445
},
{
"epoch": 2.9390444810543657,
"grad_norm": 68.6742172241211,
"learning_rate": 9.828009828009828e-07,
"loss": 0.2768,
"step": 446
},
{
"epoch": 2.9456342668863265,
"grad_norm": 82.91572570800781,
"learning_rate": 8.5995085995086e-07,
"loss": 0.2923,
"step": 447
},
{
"epoch": 2.952224052718287,
"grad_norm": 236.94644165039062,
"learning_rate": 7.371007371007371e-07,
"loss": 0.3512,
"step": 448
},
{
"epoch": 2.958813838550247,
"grad_norm": 84.61334991455078,
"learning_rate": 6.142506142506143e-07,
"loss": 0.254,
"step": 449
},
{
"epoch": 2.9654036243822075,
"grad_norm": 160.33612060546875,
"learning_rate": 4.914004914004914e-07,
"loss": 0.2808,
"step": 450
},
{
"epoch": 2.971993410214168,
"grad_norm": 96.69217681884766,
"learning_rate": 3.6855036855036856e-07,
"loss": 0.2926,
"step": 451
},
{
"epoch": 2.9785831960461286,
"grad_norm": 315.9173278808594,
"learning_rate": 2.457002457002457e-07,
"loss": 0.2809,
"step": 452
},
{
"epoch": 2.985172981878089,
"grad_norm": 46.96442413330078,
"learning_rate": 1.2285012285012285e-07,
"loss": 0.3087,
"step": 453
},
{
"epoch": 2.985172981878089,
"step": 453,
"total_flos": 1.0318088260361912e+18,
"train_loss": 0.5700862745509768,
"train_runtime": 62496.1524,
"train_samples_per_second": 0.35,
"train_steps_per_second": 0.007
}
],
"logging_steps": 1.0,
"max_steps": 453,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0318088260361912e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}