{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 924,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008658008658008658,
"grad_norm": 5.4204916523303837e-05,
"learning_rate": 4.255319148936171e-06,
"loss": 2.1959,
"step": 2
},
{
"epoch": 0.017316017316017316,
"grad_norm": 4.602531771524809e-05,
"learning_rate": 1.2765957446808511e-05,
"loss": 1.982,
"step": 4
},
{
"epoch": 0.025974025974025976,
"grad_norm": 3.824224040727131e-05,
"learning_rate": 2.1276595744680852e-05,
"loss": 2.2121,
"step": 6
},
{
"epoch": 0.03463203463203463,
"grad_norm": 2.3534847059636377e-05,
"learning_rate": 2.9787234042553192e-05,
"loss": 2.0893,
"step": 8
},
{
"epoch": 0.04329004329004329,
"grad_norm": 1.8666707546799444e-05,
"learning_rate": 3.829787234042553e-05,
"loss": 1.8373,
"step": 10
},
{
"epoch": 0.05194805194805195,
"grad_norm": 1.5847288523218594e-05,
"learning_rate": 4.680851063829788e-05,
"loss": 2.0273,
"step": 12
},
{
"epoch": 0.06060606060606061,
"grad_norm": 1.0995895536325406e-05,
"learning_rate": 5.531914893617022e-05,
"loss": 1.8151,
"step": 14
},
{
"epoch": 0.06926406926406926,
"grad_norm": 1.6404985217377543e-05,
"learning_rate": 6.382978723404256e-05,
"loss": 2.0606,
"step": 16
},
{
"epoch": 0.07792207792207792,
"grad_norm": 1.4904575436958112e-05,
"learning_rate": 7.23404255319149e-05,
"loss": 1.9488,
"step": 18
},
{
"epoch": 0.08658008658008658,
"grad_norm": 1.3000194485357497e-05,
"learning_rate": 8.085106382978723e-05,
"loss": 2.6669,
"step": 20
},
{
"epoch": 0.09523809523809523,
"grad_norm": 1.2479895303840749e-05,
"learning_rate": 8.936170212765958e-05,
"loss": 1.742,
"step": 22
},
{
"epoch": 0.1038961038961039,
"grad_norm": 1.0467254469403997e-05,
"learning_rate": 9.787234042553192e-05,
"loss": 1.7994,
"step": 24
},
{
"epoch": 0.11255411255411256,
"grad_norm": 1.3622248843603302e-05,
"learning_rate": 0.00010638297872340425,
"loss": 1.9219,
"step": 26
},
{
"epoch": 0.12121212121212122,
"grad_norm": 1.3718491572944913e-05,
"learning_rate": 0.00011489361702127661,
"loss": 1.8505,
"step": 28
},
{
"epoch": 0.12987012987012986,
"grad_norm": 1.3184439012547955e-05,
"learning_rate": 0.00012340425531914893,
"loss": 1.7647,
"step": 30
},
{
"epoch": 0.13852813852813853,
"grad_norm": 1.2686439731623977e-05,
"learning_rate": 0.00013191489361702127,
"loss": 1.8865,
"step": 32
},
{
"epoch": 0.1471861471861472,
"grad_norm": 3.454366378718987e-05,
"learning_rate": 0.00014042553191489363,
"loss": 1.8743,
"step": 34
},
{
"epoch": 0.15584415584415584,
"grad_norm": 1.81365103344433e-05,
"learning_rate": 0.00014893617021276596,
"loss": 1.9598,
"step": 36
},
{
"epoch": 0.1645021645021645,
"grad_norm": 1.4871564417262562e-05,
"learning_rate": 0.00015744680851063832,
"loss": 1.7956,
"step": 38
},
{
"epoch": 0.17316017316017315,
"grad_norm": 1.3106016922392882e-05,
"learning_rate": 0.00016595744680851065,
"loss": 1.6608,
"step": 40
},
{
"epoch": 0.18181818181818182,
"grad_norm": 1.4674634257971775e-05,
"learning_rate": 0.00017446808510638298,
"loss": 1.7878,
"step": 42
},
{
"epoch": 0.19047619047619047,
"grad_norm": 2.0664912881329656e-05,
"learning_rate": 0.00018297872340425532,
"loss": 1.8639,
"step": 44
},
{
"epoch": 0.19913419913419914,
"grad_norm": 1.741530650178902e-05,
"learning_rate": 0.00019148936170212768,
"loss": 1.5938,
"step": 46
},
{
"epoch": 0.2077922077922078,
"grad_norm": 1.3442709132505115e-05,
"learning_rate": 0.0002,
"loss": 1.6691,
"step": 48
},
{
"epoch": 0.21645021645021645,
"grad_norm": 2.1207506506470963e-05,
"learning_rate": 0.00019999743357429378,
"loss": 1.804,
"step": 50
},
{
"epoch": 0.22510822510822512,
"grad_norm": 3.287912477389909e-05,
"learning_rate": 0.00019998973442890598,
"loss": 1.8404,
"step": 52
},
{
"epoch": 0.23376623376623376,
"grad_norm": 2.17068190977443e-05,
"learning_rate": 0.00019997690295902226,
"loss": 1.7561,
"step": 54
},
{
"epoch": 0.24242424242424243,
"grad_norm": 1.806787622626871e-05,
"learning_rate": 0.00019995893982326286,
"loss": 1.8094,
"step": 56
},
{
"epoch": 0.2510822510822511,
"grad_norm": 1.9331200746819377e-05,
"learning_rate": 0.00019993584594364894,
"loss": 1.9647,
"step": 58
},
{
"epoch": 0.2597402597402597,
"grad_norm": 1.799209712771699e-05,
"learning_rate": 0.00019990762250555495,
"loss": 1.6961,
"step": 60
},
{
"epoch": 0.2683982683982684,
"grad_norm": 1.7485201169620268e-05,
"learning_rate": 0.0001998742709576481,
"loss": 1.8473,
"step": 62
},
{
"epoch": 0.27705627705627706,
"grad_norm": 1.571387838339433e-05,
"learning_rate": 0.00019983579301181373,
"loss": 1.5972,
"step": 64
},
{
"epoch": 0.2857142857142857,
"grad_norm": 1.9293982404633425e-05,
"learning_rate": 0.00019979219064306762,
"loss": 1.6437,
"step": 66
},
{
"epoch": 0.2943722943722944,
"grad_norm": 1.444847475795541e-05,
"learning_rate": 0.00019974346608945466,
"loss": 1.6366,
"step": 68
},
{
"epoch": 0.30303030303030304,
"grad_norm": 1.3742756891588215e-05,
"learning_rate": 0.00019968962185193365,
"loss": 1.5994,
"step": 70
},
{
"epoch": 0.3116883116883117,
"grad_norm": 8.885600254870951e-05,
"learning_rate": 0.00019963066069424943,
"loss": 1.7003,
"step": 72
},
{
"epoch": 0.3203463203463203,
"grad_norm": 2.5485469450359233e-05,
"learning_rate": 0.0001995665856427905,
"loss": 1.6381,
"step": 74
},
{
"epoch": 0.329004329004329,
"grad_norm": 3.399766137590632e-05,
"learning_rate": 0.00019949739998643414,
"loss": 1.6196,
"step": 76
},
{
"epoch": 0.33766233766233766,
"grad_norm": 3.7044861528556794e-05,
"learning_rate": 0.00019942310727637724,
"loss": 1.667,
"step": 78
},
{
"epoch": 0.3463203463203463,
"grad_norm": 3.5706671042134985e-05,
"learning_rate": 0.00019934371132595424,
"loss": 1.8312,
"step": 80
},
{
"epoch": 0.354978354978355,
"grad_norm": 2.0404797396622598e-05,
"learning_rate": 0.00019925921621044129,
"loss": 1.6482,
"step": 82
},
{
"epoch": 0.36363636363636365,
"grad_norm": 1.7347399989375845e-05,
"learning_rate": 0.00019916962626684707,
"loss": 1.6175,
"step": 84
},
{
"epoch": 0.3722943722943723,
"grad_norm": 3.3090138458646834e-05,
"learning_rate": 0.00019907494609369035,
"loss": 1.8201,
"step": 86
},
{
"epoch": 0.38095238095238093,
"grad_norm": 2.85672413156135e-05,
"learning_rate": 0.0001989751805507637,
"loss": 1.7632,
"step": 88
},
{
"epoch": 0.38961038961038963,
"grad_norm": 3.10399555019103e-05,
"learning_rate": 0.00019887033475888419,
"loss": 1.648,
"step": 90
},
{
"epoch": 0.39826839826839827,
"grad_norm": 2.8758275220752694e-05,
"learning_rate": 0.00019876041409963056,
"loss": 1.6615,
"step": 92
},
{
"epoch": 0.4069264069264069,
"grad_norm": 2.411049536021892e-05,
"learning_rate": 0.00019864542421506686,
"loss": 1.6684,
"step": 94
},
{
"epoch": 0.4155844155844156,
"grad_norm": 2.1932239178568125e-05,
"learning_rate": 0.00019852537100745307,
"loss": 1.5879,
"step": 96
},
{
"epoch": 0.42424242424242425,
"grad_norm": 2.419168049527798e-05,
"learning_rate": 0.00019840026063894193,
"loss": 1.6055,
"step": 98
},
{
"epoch": 0.4329004329004329,
"grad_norm": 3.5360833862796426e-05,
"learning_rate": 0.00019827009953126275,
"loss": 1.6233,
"step": 100
},
{
"epoch": 0.44155844155844154,
"grad_norm": 5.8022818848257884e-05,
"learning_rate": 0.0001981348943653918,
"loss": 1.5232,
"step": 102
},
{
"epoch": 0.45021645021645024,
"grad_norm": 3.986386946053244e-05,
"learning_rate": 0.0001979946520812093,
"loss": 1.7553,
"step": 104
},
{
"epoch": 0.4588744588744589,
"grad_norm": 5.403523027780466e-05,
"learning_rate": 0.00019784937987714333,
"loss": 1.6931,
"step": 106
},
{
"epoch": 0.4675324675324675,
"grad_norm": 2.1774114429717883e-05,
"learning_rate": 0.00019769908520980034,
"loss": 1.5427,
"step": 108
},
{
"epoch": 0.47619047619047616,
"grad_norm": 3.063875192310661e-05,
"learning_rate": 0.0001975437757935822,
"loss": 1.7336,
"step": 110
},
{
"epoch": 0.48484848484848486,
"grad_norm": 2.7840827897307463e-05,
"learning_rate": 0.0001973834596002905,
"loss": 1.6536,
"step": 112
},
{
"epoch": 0.4935064935064935,
"grad_norm": 2.68031708401395e-05,
"learning_rate": 0.00019721814485871726,
"loss": 1.7237,
"step": 114
},
{
"epoch": 0.5021645021645021,
"grad_norm": 4.067725967615843e-05,
"learning_rate": 0.0001970478400542225,
"loss": 1.6127,
"step": 116
},
{
"epoch": 0.5108225108225108,
"grad_norm": 3.361668132129125e-05,
"learning_rate": 0.00019687255392829877,
"loss": 1.5502,
"step": 118
},
{
"epoch": 0.5194805194805194,
"grad_norm": 3.693327744258568e-05,
"learning_rate": 0.00019669229547812249,
"loss": 1.6483,
"step": 120
},
{
"epoch": 0.5281385281385281,
"grad_norm": 3.008819476235658e-05,
"learning_rate": 0.00019650707395609204,
"loss": 1.6436,
"step": 122
},
{
"epoch": 0.5367965367965368,
"grad_norm": 3.0448116376646794e-05,
"learning_rate": 0.00019631689886935298,
"loss": 1.4893,
"step": 124
},
{
"epoch": 0.5454545454545454,
"grad_norm": 4.133440961595625e-05,
"learning_rate": 0.00019612177997930987,
"loss": 1.6093,
"step": 126
},
{
"epoch": 0.5541125541125541,
"grad_norm": 8.941220585256815e-05,
"learning_rate": 0.00019592172730112544,
"loss": 1.737,
"step": 128
},
{
"epoch": 0.5627705627705628,
"grad_norm": 4.473665831028484e-05,
"learning_rate": 0.00019571675110320643,
"loss": 1.6455,
"step": 130
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.00013181190297473222,
"learning_rate": 0.00019550686190667648,
"loss": 1.5996,
"step": 132
},
{
"epoch": 0.5800865800865801,
"grad_norm": 9.064975165529177e-05,
"learning_rate": 0.0001952920704848362,
"loss": 1.6947,
"step": 134
},
{
"epoch": 0.5887445887445888,
"grad_norm": 3.242671664338559e-05,
"learning_rate": 0.00019507238786261008,
"loss": 1.6403,
"step": 136
},
{
"epoch": 0.5974025974025974,
"grad_norm": 0.0001139045343734324,
"learning_rate": 0.00019484782531598073,
"loss": 1.6445,
"step": 138
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.0001039160051732324,
"learning_rate": 0.00019461839437141004,
"loss": 1.7483,
"step": 140
},
{
"epoch": 0.6147186147186147,
"grad_norm": 8.791654545348138e-05,
"learning_rate": 0.0001943841068052474,
"loss": 1.5357,
"step": 142
},
{
"epoch": 0.6233766233766234,
"grad_norm": 4.35874389950186e-05,
"learning_rate": 0.0001941449746431255,
"loss": 1.7572,
"step": 144
},
{
"epoch": 0.6320346320346321,
"grad_norm": 4.0783703298075125e-05,
"learning_rate": 0.0001939010101593429,
"loss": 1.7342,
"step": 146
},
{
"epoch": 0.6406926406926406,
"grad_norm": 0.00014383271627593786,
"learning_rate": 0.00019365222587623405,
"loss": 1.6868,
"step": 148
},
{
"epoch": 0.6493506493506493,
"grad_norm": 9.058301657205448e-05,
"learning_rate": 0.00019339863456352657,
"loss": 1.7322,
"step": 150
},
{
"epoch": 0.658008658008658,
"grad_norm": 4.600944521371275e-05,
"learning_rate": 0.0001931402492376857,
"loss": 1.7196,
"step": 152
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.00011446730059105903,
"learning_rate": 0.0001928770831612463,
"loss": 1.6174,
"step": 154
},
{
"epoch": 0.6753246753246753,
"grad_norm": 8.582652662880719e-05,
"learning_rate": 0.00019260914984213203,
"loss": 1.6256,
"step": 156
},
{
"epoch": 0.683982683982684,
"grad_norm": 4.044194065500051e-05,
"learning_rate": 0.00019233646303296205,
"loss": 1.6417,
"step": 158
},
{
"epoch": 0.6926406926406926,
"grad_norm": 4.7330380766652524e-05,
"learning_rate": 0.0001920590367303451,
"loss": 1.6737,
"step": 160
},
{
"epoch": 0.7012987012987013,
"grad_norm": 5.3459320042748004e-05,
"learning_rate": 0.00019177688517416105,
"loss": 1.7177,
"step": 162
},
{
"epoch": 0.70995670995671,
"grad_norm": 9.026160842040554e-05,
"learning_rate": 0.00019149002284683008,
"loss": 1.7326,
"step": 164
},
{
"epoch": 0.7186147186147186,
"grad_norm": 6.937263970030472e-05,
"learning_rate": 0.0001911984644725692,
"loss": 1.7971,
"step": 166
},
{
"epoch": 0.7272727272727273,
"grad_norm": 8.01292626420036e-05,
"learning_rate": 0.0001909022250166365,
"loss": 1.6087,
"step": 168
},
{
"epoch": 0.7359307359307359,
"grad_norm": 7.882928184699267e-05,
"learning_rate": 0.00019060131968456312,
"loss": 1.6477,
"step": 170
},
{
"epoch": 0.7445887445887446,
"grad_norm": 0.00010625456343404949,
"learning_rate": 0.00019029576392137263,
"loss": 1.7002,
"step": 172
},
{
"epoch": 0.7532467532467533,
"grad_norm": 0.00021590011601801962,
"learning_rate": 0.00018998557341078835,
"loss": 1.7764,
"step": 174
},
{
"epoch": 0.7619047619047619,
"grad_norm": 6.014715472701937e-05,
"learning_rate": 0.00018967076407442829,
"loss": 1.6852,
"step": 176
},
{
"epoch": 0.7705627705627706,
"grad_norm": 8.015201456146315e-05,
"learning_rate": 0.00018935135207098785,
"loss": 1.6184,
"step": 178
},
{
"epoch": 0.7792207792207793,
"grad_norm": 6.944081542314962e-05,
"learning_rate": 0.00018902735379541064,
"loss": 1.6294,
"step": 180
},
{
"epoch": 0.7878787878787878,
"grad_norm": 6.165856029838324e-05,
"learning_rate": 0.0001886987858780467,
"loss": 1.778,
"step": 182
},
{
"epoch": 0.7965367965367965,
"grad_norm": 8.64676694618538e-05,
"learning_rate": 0.000188365665183799,
"loss": 1.5531,
"step": 184
},
{
"epoch": 0.8051948051948052,
"grad_norm": 9.756162035046145e-05,
"learning_rate": 0.00018802800881125784,
"loss": 1.7175,
"step": 186
},
{
"epoch": 0.8138528138528138,
"grad_norm": 0.00021862791618332267,
"learning_rate": 0.00018768583409182305,
"loss": 1.8573,
"step": 188
},
{
"epoch": 0.8225108225108225,
"grad_norm": 0.0001369621604681015,
"learning_rate": 0.00018733915858881462,
"loss": 1.7662,
"step": 190
},
{
"epoch": 0.8311688311688312,
"grad_norm": 0.00031745131127536297,
"learning_rate": 0.00018698800009657094,
"loss": 2.0091,
"step": 192
},
{
"epoch": 0.8398268398268398,
"grad_norm": 0.00010971837764373049,
"learning_rate": 0.00018663237663953567,
"loss": 1.7376,
"step": 194
},
{
"epoch": 0.8484848484848485,
"grad_norm": 0.00029516848735511303,
"learning_rate": 0.0001862723064713324,
"loss": 1.7701,
"step": 196
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.00034297676756978035,
"learning_rate": 0.0001859078080738279,
"loss": 1.9447,
"step": 198
},
{
"epoch": 0.8658008658008658,
"grad_norm": 0.0002530592610128224,
"learning_rate": 0.00018553890015618333,
"loss": 1.8612,
"step": 200
},
{
"epoch": 0.8744588744588745,
"grad_norm": 0.00010490286513231695,
"learning_rate": 0.00018516560165389388,
"loss": 1.9922,
"step": 202
},
{
"epoch": 0.8831168831168831,
"grad_norm": 0.00020270211098250002,
"learning_rate": 0.00018478793172781708,
"loss": 1.9509,
"step": 204
},
{
"epoch": 0.8917748917748918,
"grad_norm": 9.309218876296654e-05,
"learning_rate": 0.000184405909763189,
"loss": 1.909,
"step": 206
},
{
"epoch": 0.9004329004329005,
"grad_norm": 7.871213892940432e-05,
"learning_rate": 0.00018401955536862948,
"loss": 2.0207,
"step": 208
},
{
"epoch": 0.9090909090909091,
"grad_norm": 8.371711010113358e-05,
"learning_rate": 0.00018362888837513547,
"loss": 2.0631,
"step": 210
},
{
"epoch": 0.9177489177489178,
"grad_norm": 0.00014524892321787775,
"learning_rate": 0.00018323392883506335,
"loss": 2.2097,
"step": 212
},
{
"epoch": 0.9264069264069265,
"grad_norm": 0.00010290888894814998,
"learning_rate": 0.00018283469702109936,
"loss": 2.1094,
"step": 214
},
{
"epoch": 0.935064935064935,
"grad_norm": 9.359593968838453e-05,
"learning_rate": 0.00018243121342521935,
"loss": 2.4721,
"step": 216
},
{
"epoch": 0.9437229437229437,
"grad_norm": 9.659567876951769e-05,
"learning_rate": 0.0001820234987576368,
"loss": 2.6673,
"step": 218
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.00016237075033131987,
"learning_rate": 0.0001816115739457397,
"loss": 2.9459,
"step": 220
},
{
"epoch": 0.961038961038961,
"grad_norm": 0.0001531827583676204,
"learning_rate": 0.00018119546013301664,
"loss": 3.2397,
"step": 222
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.0001731283700792119,
"learning_rate": 0.0001807751786779713,
"loss": 3.62,
"step": 224
},
{
"epoch": 0.9783549783549783,
"grad_norm": 8.82472813827917e-05,
"learning_rate": 0.00018035075115302633,
"loss": 3.9451,
"step": 226
},
{
"epoch": 0.987012987012987,
"grad_norm": 0.00010907312389463186,
"learning_rate": 0.0001799221993434159,
"loss": 4.6026,
"step": 228
},
{
"epoch": 0.9956709956709957,
"grad_norm": 0.00013694152585230768,
"learning_rate": 0.00017948954524606763,
"loss": 5.3581,
"step": 230
},
{
"epoch": 1.0043290043290043,
"grad_norm": 0.00015375320799648762,
"learning_rate": 0.00017905281106847344,
"loss": 5.3862,
"step": 232
},
{
"epoch": 1.0129870129870129,
"grad_norm": 0.000532141828443855,
"learning_rate": 0.00017861201922754979,
"loss": 6.0815,
"step": 234
},
{
"epoch": 1.0216450216450217,
"grad_norm": NaN,
"learning_rate": 0.00017839010874560733,
"loss": 0.0,
"step": 236
},
{
"epoch": 1.0303030303030303,
"grad_norm": NaN,
"learning_rate": 0.00017839010874560733,
"loss": 0.0,
"step": 238
},
{
"epoch": 1.0389610389610389,
"grad_norm": NaN,
"learning_rate": 0.00017839010874560733,
"loss": 0.0,
"step": 240
},
{
"epoch": 1.0476190476190477,
"grad_norm": NaN,
"learning_rate": 0.00017839010874560733,
"loss": 0.0,
"step": 242
},
{
"epoch": 1.0562770562770563,
"grad_norm": NaN,
"learning_rate": 0.00017839010874560733,
"loss": 0.0,
"step": 244
},
{
"epoch": 1.0649350649350648,
"grad_norm": 0.1457211822271347,
"learning_rate": 0.0001781671923484869,
"loss": 5.7907,
"step": 246
},
{
"epoch": 1.0735930735930737,
"grad_norm": 0.4493274390697479,
"learning_rate": 0.00017771835326358743,
"loss": 7.5356,
"step": 248
},
{
"epoch": 1.0822510822510822,
"grad_norm": 3.0114400386810303,
"learning_rate": 0.00017726552501109478,
"loss": 7.336,
"step": 250
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.23199492692947388,
"learning_rate": 0.0001768087308340103,
"loss": 8.1303,
"step": 252
},
{
"epoch": 1.0995670995670996,
"grad_norm": 0.23555096983909607,
"learning_rate": 0.00017634799417890035,
"loss": 9.2992,
"step": 254
},
{
"epoch": 1.1082251082251082,
"grad_norm": 0.17370502650737762,
"learning_rate": 0.0001758833386946928,
"loss": 9.3926,
"step": 256
},
{
"epoch": 1.1168831168831168,
"grad_norm": 0.25211918354034424,
"learning_rate": 0.00017541478823146327,
"loss": 9.3047,
"step": 258
},
{
"epoch": 1.1255411255411256,
"grad_norm": 0.18103384971618652,
"learning_rate": 0.00017494236683921084,
"loss": 9.1649,
"step": 260
},
{
"epoch": 1.1341991341991342,
"grad_norm": 0.05794016644358635,
"learning_rate": 0.00017446609876662356,
"loss": 8.9708,
"step": 262
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.08229361474514008,
"learning_rate": 0.000173986008459834,
"loss": 9.7143,
"step": 264
},
{
"epoch": 1.1515151515151516,
"grad_norm": 0.1594569981098175,
"learning_rate": 0.00017350212056116418,
"loss": 10.6715,
"step": 266
},
{
"epoch": 1.1601731601731602,
"grad_norm": 0.19106224179267883,
"learning_rate": 0.00017301445990786102,
"loss": 11.408,
"step": 268
},
{
"epoch": 1.1688311688311688,
"grad_norm": 0.2994789183139801,
"learning_rate": 0.00017252305153082114,
"loss": 12.2243,
"step": 270
},
{
"epoch": 1.1774891774891776,
"grad_norm": 0.21744585037231445,
"learning_rate": 0.00017202792065330646,
"loss": 12.9086,
"step": 272
},
{
"epoch": 1.1861471861471862,
"grad_norm": 0.16616225242614746,
"learning_rate": 0.00017152909268964916,
"loss": 13.5227,
"step": 274
},
{
"epoch": 1.1948051948051948,
"grad_norm": 0.15657858550548553,
"learning_rate": 0.00017102659324394747,
"loss": 13.9467,
"step": 276
},
{
"epoch": 1.2034632034632033,
"grad_norm": 0.16317234933376312,
"learning_rate": 0.00017052044810875126,
"loss": 14.6343,
"step": 278
},
{
"epoch": 1.2121212121212122,
"grad_norm": 0.21492412686347961,
"learning_rate": 0.00017001068326373827,
"loss": 15.0887,
"step": 280
},
{
"epoch": 1.2207792207792207,
"grad_norm": 0.1301400512456894,
"learning_rate": 0.00016949732487438047,
"loss": 15.6847,
"step": 282
},
{
"epoch": 1.2294372294372296,
"grad_norm": 0.20310911536216736,
"learning_rate": 0.00016898039929060129,
"loss": 16.6419,
"step": 284
},
{
"epoch": 1.2380952380952381,
"grad_norm": 0.05390779674053192,
"learning_rate": 0.00016845993304542283,
"loss": 15.3988,
"step": 286
},
{
"epoch": 1.2467532467532467,
"grad_norm": 0.16172261536121368,
"learning_rate": 0.0001679359528536041,
"loss": 15.9208,
"step": 288
},
{
"epoch": 1.2554112554112553,
"grad_norm": 0.13750512897968292,
"learning_rate": 0.0001674084856102698,
"loss": 16.2978,
"step": 290
},
{
"epoch": 1.2640692640692641,
"grad_norm": 0.07984600216150284,
"learning_rate": 0.00016687755838952972,
"loss": 15.957,
"step": 292
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.13223163783550262,
"learning_rate": 0.00016634319844308925,
"loss": 16.2106,
"step": 294
},
{
"epoch": 1.2813852813852815,
"grad_norm": 0.08291322737932205,
"learning_rate": 0.00016580543319885048,
"loss": 16.14,
"step": 296
},
{
"epoch": 1.29004329004329,
"grad_norm": 0.052721720188856125,
"learning_rate": 0.00016526429025950424,
"loss": 15.9756,
"step": 298
},
{
"epoch": 1.2987012987012987,
"grad_norm": 0.08016930520534515,
"learning_rate": 0.00016471979740111366,
"loss": 16.2873,
"step": 300
},
{
"epoch": 1.3073593073593073,
"grad_norm": 0.09582065045833588,
"learning_rate": 0.00016417198257168803,
"loss": 16.4316,
"step": 302
},
{
"epoch": 1.316017316017316,
"grad_norm": 0.07704820483922958,
"learning_rate": 0.00016362087388974863,
"loss": 16.7252,
"step": 304
},
{
"epoch": 1.3246753246753247,
"grad_norm": 0.06254423409700394,
"learning_rate": 0.00016306649964288516,
"loss": 16.1704,
"step": 306
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.07229200750589371,
"learning_rate": 0.000162508888286304,
"loss": 16.0079,
"step": 308
},
{
"epoch": 1.341991341991342,
"grad_norm": 0.052998676896095276,
"learning_rate": 0.00016194806844136754,
"loss": 16.0897,
"step": 310
},
{
"epoch": 1.3506493506493507,
"grad_norm": 0.10584064573049545,
"learning_rate": 0.00016138406889412512,
"loss": 15.995,
"step": 312
},
{
"epoch": 1.3593073593073592,
"grad_norm": 0.11045046895742416,
"learning_rate": 0.00016081691859383545,
"loss": 16.0464,
"step": 314
},
{
"epoch": 1.3679653679653678,
"grad_norm": 0.06835480034351349,
"learning_rate": 0.00016024664665148077,
"loss": 15.842,
"step": 316
},
{
"epoch": 1.3766233766233766,
"grad_norm": 0.05876036360859871,
"learning_rate": 0.00015967328233827249,
"loss": 15.5758,
"step": 318
},
{
"epoch": 1.3852813852813852,
"grad_norm": 0.05429469048976898,
"learning_rate": 0.00015909685508414884,
"loss": 15.6389,
"step": 320
},
{
"epoch": 1.393939393939394,
"grad_norm": 0.08564180880784988,
"learning_rate": 0.00015851739447626434,
"loss": 15.5351,
"step": 322
},
{
"epoch": 1.4025974025974026,
"grad_norm": 0.08055119216442108,
"learning_rate": 0.00015793493025747092,
"loss": 15.4569,
"step": 324
},
{
"epoch": 1.4112554112554112,
"grad_norm": 0.08411835134029388,
"learning_rate": 0.00015734949232479152,
"loss": 15.5636,
"step": 326
},
{
"epoch": 1.4199134199134198,
"grad_norm": 0.15642018616199493,
"learning_rate": 0.00015676111072788527,
"loss": 15.7577,
"step": 328
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.17614306509494781,
"learning_rate": 0.00015616981566750538,
"loss": 15.3891,
"step": 330
},
{
"epoch": 1.4372294372294372,
"grad_norm": 0.10892828553915024,
"learning_rate": 0.00015557563749394858,
"loss": 15.0548,
"step": 332
},
{
"epoch": 1.445887445887446,
"grad_norm": 0.09866315126419067,
"learning_rate": 0.00015497860670549772,
"loss": 14.8649,
"step": 334
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.1259247213602066,
"learning_rate": 0.00015437875394685606,
"loss": 14.9433,
"step": 336
},
{
"epoch": 1.4632034632034632,
"grad_norm": 0.1163773462176323,
"learning_rate": 0.0001537761100075744,
"loss": 15.0705,
"step": 338
},
{
"epoch": 1.4718614718614718,
"grad_norm": 0.10914402455091476,
"learning_rate": 0.00015317070582047065,
"loss": 15.2118,
"step": 340
},
{
"epoch": 1.4805194805194806,
"grad_norm": 0.15627121925354004,
"learning_rate": 0.00015256257246004217,
"loss": 15.6857,
"step": 342
},
{
"epoch": 1.4891774891774892,
"grad_norm": 0.12600019574165344,
"learning_rate": 0.00015195174114087078,
"loss": 15.2044,
"step": 344
},
{
"epoch": 1.497835497835498,
"grad_norm": 0.1444402039051056,
"learning_rate": 0.00015133824321602045,
"loss": 15.1016,
"step": 346
},
{
"epoch": 1.5064935064935066,
"grad_norm": 0.07496843487024307,
"learning_rate": 0.00015072211017542813,
"loss": 14.8805,
"step": 348
},
{
"epoch": 1.5151515151515151,
"grad_norm": 0.14399290084838867,
"learning_rate": 0.0001501033736442872,
"loss": 14.8005,
"step": 350
},
{
"epoch": 1.5238095238095237,
"grad_norm": 0.10512608289718628,
"learning_rate": 0.00014948206538142457,
"loss": 14.4348,
"step": 352
},
{
"epoch": 1.5324675324675323,
"grad_norm": 0.05682501569390297,
"learning_rate": 0.00014885821727767006,
"loss": 14.274,
"step": 354
},
{
"epoch": 1.5411255411255411,
"grad_norm": 0.09160462766885757,
"learning_rate": 0.00014823186135421994,
"loss": 14.1816,
"step": 356
},
{
"epoch": 1.54978354978355,
"grad_norm": 0.03531830757856369,
"learning_rate": 0.00014760302976099304,
"loss": 14.0882,
"step": 358
},
{
"epoch": 1.5584415584415585,
"grad_norm": 0.039006441831588745,
"learning_rate": 0.00014697175477498074,
"loss": 14.1806,
"step": 360
},
{
"epoch": 1.567099567099567,
"grad_norm": 0.07611044496297836,
"learning_rate": 0.00014633806879859,
"loss": 14.2338,
"step": 362
},
{
"epoch": 1.5757575757575757,
"grad_norm": 0.05836552381515503,
"learning_rate": 0.00014570200435798044,
"loss": 14.2683,
"step": 364
},
{
"epoch": 1.5844155844155843,
"grad_norm": 0.01268097199499607,
"learning_rate": 0.0001450635941013947,
"loss": 14.2289,
"step": 366
},
{
"epoch": 1.593073593073593,
"grad_norm": 0.07083582133054733,
"learning_rate": 0.00014442287079748263,
"loss": 14.3293,
"step": 368
},
{
"epoch": 1.601731601731602,
"grad_norm": 0.07286886125802994,
"learning_rate": 0.0001437798673336194,
"loss": 14.4253,
"step": 370
},
{
"epoch": 1.6103896103896105,
"grad_norm": 0.014986686408519745,
"learning_rate": 0.00014313461671421735,
"loss": 14.2353,
"step": 372
},
{
"epoch": 1.619047619047619,
"grad_norm": 0.06591885536909103,
"learning_rate": 0.00014248715205903204,
"loss": 14.1536,
"step": 374
},
{
"epoch": 1.6277056277056277,
"grad_norm": 0.07969338446855545,
"learning_rate": 0.0001418375066014622,
"loss": 14.6147,
"step": 376
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.06318385899066925,
"learning_rate": 0.00014118571368684383,
"loss": 14.345,
"step": 378
},
{
"epoch": 1.645021645021645,
"grad_norm": 0.05836130306124687,
"learning_rate": 0.00014053180677073876,
"loss": 14.2053,
"step": 380
},
{
"epoch": 1.6536796536796536,
"grad_norm": 0.06260073184967041,
"learning_rate": 0.0001398758194172174,
"loss": 14.1105,
"step": 382
},
{
"epoch": 1.6623376623376624,
"grad_norm": 0.060454513877630234,
"learning_rate": 0.00013921778529713582,
"loss": 14.0394,
"step": 384
},
{
"epoch": 1.670995670995671,
"grad_norm": 0.07355903834104538,
"learning_rate": 0.00013855773818640773,
"loss": 14.025,
"step": 386
},
{
"epoch": 1.6796536796536796,
"grad_norm": 0.066501684486866,
"learning_rate": 0.00013789571196427055,
"loss": 14.6727,
"step": 388
},
{
"epoch": 1.6883116883116882,
"grad_norm": 0.07008553296327591,
"learning_rate": 0.0001372317406115465,
"loss": 14.0043,
"step": 390
},
{
"epoch": 1.696969696969697,
"grad_norm": 0.010356806218624115,
"learning_rate": 0.00013656585820889867,
"loss": 14.0629,
"step": 392
},
{
"epoch": 1.7056277056277056,
"grad_norm": 0.06225749850273132,
"learning_rate": 0.00013589809893508128,
"loss": 13.9668,
"step": 394
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.07188686728477478,
"learning_rate": 0.00013522849706518566,
"loss": 13.8812,
"step": 396
},
{
"epoch": 1.722943722943723,
"grad_norm": 0.06185540184378624,
"learning_rate": 0.00013455708696888085,
"loss": 14.1862,
"step": 398
},
{
"epoch": 1.7316017316017316,
"grad_norm": 0.0615684911608696,
"learning_rate": 0.00013388390310864945,
"loss": 14.4132,
"step": 400
},
{
"epoch": 1.7402597402597402,
"grad_norm": 0.07077145576477051,
"learning_rate": 0.00013320898003801879,
"loss": 14.4692,
"step": 402
},
{
"epoch": 1.7489177489177488,
"grad_norm": 0.05472411587834358,
"learning_rate": 0.00013253235239978715,
"loss": 13.9146,
"step": 404
},
{
"epoch": 1.7575757575757576,
"grad_norm": 0.06190333142876625,
"learning_rate": 0.00013185405492424588,
"loss": 14.0298,
"step": 406
},
{
"epoch": 1.7662337662337664,
"grad_norm": 0.06982716172933578,
"learning_rate": 0.00013117412242739655,
"loss": 14.0501,
"step": 408
},
{
"epoch": 1.774891774891775,
"grad_norm": 0.05709117650985718,
"learning_rate": 0.00013049258980916387,
"loss": 14.0339,
"step": 410
},
{
"epoch": 1.7835497835497836,
"grad_norm": 0.1394786387681961,
"learning_rate": 0.00012980949205160448,
"loss": 14.2314,
"step": 412
},
{
"epoch": 1.7922077922077921,
"grad_norm": 0.09034628421068192,
"learning_rate": 0.00012912486421711128,
"loss": 14.1102,
"step": 414
},
{
"epoch": 1.8008658008658007,
"grad_norm": 0.06254860013723373,
"learning_rate": 0.00012843874144661372,
"loss": 14.0066,
"step": 416
},
{
"epoch": 1.8095238095238095,
"grad_norm": 0.09414557367563248,
"learning_rate": 0.00012775115895777417,
"loss": 14.0197,
"step": 418
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.021693430840969086,
"learning_rate": 0.00012706215204318007,
"loss": 14.9639,
"step": 420
},
{
"epoch": 1.826839826839827,
"grad_norm": 0.058936670422554016,
"learning_rate": 0.00012637175606853264,
"loss": 13.7909,
"step": 422
},
{
"epoch": 1.8354978354978355,
"grad_norm": 0.04964934289455414,
"learning_rate": 0.0001256800064708313,
"loss": 13.8304,
"step": 424
},
{
"epoch": 1.844155844155844,
"grad_norm": 0.04889138787984848,
"learning_rate": 0.00012498693875655516,
"loss": 13.9045,
"step": 426
},
{
"epoch": 1.8528138528138527,
"grad_norm": 0.015483302064239979,
"learning_rate": 0.00012429258849984014,
"loss": 13.9397,
"step": 428
},
{
"epoch": 1.8614718614718615,
"grad_norm": 0.009186900220811367,
"learning_rate": 0.00012359699134065314,
"loss": 13.7447,
"step": 430
},
{
"epoch": 1.87012987012987,
"grad_norm": 0.05520382896065712,
"learning_rate": 0.00012290018298296285,
"loss": 13.8049,
"step": 432
},
{
"epoch": 1.878787878787879,
"grad_norm": 0.05643709748983383,
"learning_rate": 0.00012220219919290687,
"loss": 14.3662,
"step": 434
},
{
"epoch": 1.8874458874458875,
"grad_norm": 0.05961998179554939,
"learning_rate": 0.00012150307579695601,
"loss": 13.9983,
"step": 436
},
{
"epoch": 1.896103896103896,
"grad_norm": 0.006795608904212713,
"learning_rate": 0.00012080284868007541,
"loss": 13.9398,
"step": 438
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.007627868093550205,
"learning_rate": 0.00012010155378388253,
"loss": 13.9321,
"step": 440
},
{
"epoch": 1.9134199134199135,
"grad_norm": 0.027596835047006607,
"learning_rate": 0.00011939922710480229,
"loss": 14.0445,
"step": 442
},
{
"epoch": 1.922077922077922,
"grad_norm": 0.0442458800971508,
"learning_rate": 0.00011869590469221965,
"loss": 13.999,
"step": 444
},
{
"epoch": 1.9307359307359309,
"grad_norm": 0.0031651495955884457,
"learning_rate": 0.0001179916226466289,
"loss": 14.3455,
"step": 446
},
{
"epoch": 1.9393939393939394,
"grad_norm": 0.0655621886253357,
"learning_rate": 0.00011728641711778103,
"loss": 14.0114,
"step": 448
},
{
"epoch": 1.948051948051948,
"grad_norm": 0.05245671793818474,
"learning_rate": 0.000116580324302828,
"loss": 14.1412,
"step": 450
},
{
"epoch": 1.9567099567099566,
"grad_norm": 0.017913660034537315,
"learning_rate": 0.00011587338044446476,
"loss": 14.0516,
"step": 452
},
{
"epoch": 1.9653679653679652,
"grad_norm": 0.005986363161355257,
"learning_rate": 0.00011516562182906922,
"loss": 14.0402,
"step": 454
},
{
"epoch": 1.974025974025974,
"grad_norm": 0.06156434491276741,
"learning_rate": 0.0001144570847848394,
"loss": 13.9566,
"step": 456
},
{
"epoch": 1.9826839826839828,
"grad_norm": 0.009826838970184326,
"learning_rate": 0.000113747805679929,
"loss": 13.9418,
"step": 458
},
{
"epoch": 1.9913419913419914,
"grad_norm": 0.009458528831601143,
"learning_rate": 0.00011303782092058061,
"loss": 13.8423,
"step": 460
},
{
"epoch": 2.0,
"grad_norm": 0.22517594695091248,
"learning_rate": 0.00011232716694925693,
"loss": 13.9573,
"step": 462
},
{
"epoch": 2.0086580086580086,
"grad_norm": 0.003495636396110058,
"learning_rate": 0.00011161588024277036,
"loss": 14.1671,
"step": 464
},
{
"epoch": 2.017316017316017,
"grad_norm": 0.013556540943682194,
"learning_rate": 0.00011090399731041072,
"loss": 14.503,
"step": 466
},
{
"epoch": 2.0259740259740258,
"grad_norm": 0.05908797308802605,
"learning_rate": 0.0001101915546920711,
"loss": 14.613,
"step": 468
},
{
"epoch": 2.034632034632035,
"grad_norm": 0.002222011098638177,
"learning_rate": 0.00010947858895637255,
"loss": 14.9046,
"step": 470
},
{
"epoch": 2.0432900432900434,
"grad_norm": 0.005948877427726984,
"learning_rate": 0.00010876513669878683,
"loss": 15.0299,
"step": 472
},
{
"epoch": 2.051948051948052,
"grad_norm": 0.0032331624533981085,
"learning_rate": 0.0001080512345397583,
"loss": 15.1622,
"step": 474
},
{
"epoch": 2.0606060606060606,
"grad_norm": 0.04379121959209442,
"learning_rate": 0.00010733691912282396,
"loss": 15.2752,
"step": 476
},
{
"epoch": 2.069264069264069,
"grad_norm": 0.003626056481152773,
"learning_rate": 0.00010662222711273279,
"loss": 15.4877,
"step": 478
},
{
"epoch": 2.0779220779220777,
"grad_norm": 0.0034651593305170536,
"learning_rate": 0.00010590719519356373,
"loss": 15.7603,
"step": 480
},
{
"epoch": 2.0865800865800868,
"grad_norm": 0.007096354383975267,
"learning_rate": 0.00010519186006684277,
"loss": 15.4634,
"step": 482
},
{
"epoch": 2.0952380952380953,
"grad_norm": 0.005978535860776901,
"learning_rate": 0.000104476258449659,
"loss": 15.6005,
"step": 484
},
{
"epoch": 2.103896103896104,
"grad_norm": 0.0036010430194437504,
"learning_rate": 0.0001037604270727802,
"loss": 15.6865,
"step": 486
},
{
"epoch": 2.1125541125541125,
"grad_norm": 0.005017112474888563,
"learning_rate": 0.00010304440267876727,
"loss": 15.8028,
"step": 488
},
{
"epoch": 2.121212121212121,
"grad_norm": 0.002776139648631215,
"learning_rate": 0.00010232822202008844,
"loss": 15.662,
"step": 490
},
{
"epoch": 2.1298701298701297,
"grad_norm": 0.0025852976832538843,
"learning_rate": 0.0001016119218572328,
"loss": 16.5597,
"step": 492
},
{
"epoch": 2.1385281385281387,
"grad_norm": 0.0041742087341845036,
"learning_rate": 0.0001008955389568233,
"loss": 15.6964,
"step": 494
},
{
"epoch": 2.1471861471861473,
"grad_norm": 0.003886697581037879,
"learning_rate": 0.00010017911008972982,
"loss": 15.6254,
"step": 496
},
{
"epoch": 2.155844155844156,
"grad_norm": 0.06273438781499863,
"learning_rate": 9.946267202918157e-05,
"loss": 15.7853,
"step": 498
},
{
"epoch": 2.1645021645021645,
"grad_norm": 0.0052159507758915424,
"learning_rate": 9.87462615488797e-05,
"loss": 15.7882,
"step": 500
},
{
"epoch": 2.173160173160173,
"grad_norm": 0.003929893020540476,
"learning_rate": 9.802991542110958e-05,
"loss": 16.0684,
"step": 502
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.004246190190315247,
"learning_rate": 9.731367041485359e-05,
"loss": 15.7668,
"step": 504
},
{
"epoch": 2.1904761904761907,
"grad_norm": 0.0031269013416022062,
"learning_rate": 9.659756329390367e-05,
"loss": 15.7715,
"step": 506
},
{
"epoch": 2.1991341991341993,
"grad_norm": 0.001676430692896247,
"learning_rate": 9.588163081497427e-05,
"loss": 15.6645,
"step": 508
},
{
"epoch": 2.207792207792208,
"grad_norm": 0.003888419596478343,
"learning_rate": 9.516590972581578e-05,
"loss": 15.6698,
"step": 510
},
{
"epoch": 2.2164502164502164,
"grad_norm": 0.0032913736067712307,
"learning_rate": 9.445043676332819e-05,
"loss": 15.7315,
"step": 512
},
{
"epoch": 2.225108225108225,
"grad_norm": 0.0013064603554084897,
"learning_rate": 9.373524865167555e-05,
"loss": 15.6309,
"step": 514
},
{
"epoch": 2.2337662337662336,
"grad_norm": 0.0033070454373955727,
"learning_rate": 9.302038210040099e-05,
"loss": 15.8009,
"step": 516
},
{
"epoch": 2.242424242424242,
"grad_norm": 0.002767252502962947,
"learning_rate": 9.230587380254237e-05,
"loss": 15.7214,
"step": 518
},
{
"epoch": 2.2510822510822512,
"grad_norm": 0.0022966829128563404,
"learning_rate": 9.159176043274895e-05,
"loss": 15.6413,
"step": 520
},
{
"epoch": 2.25974025974026,
"grad_norm": 0.002309858100488782,
"learning_rate": 9.087807864539897e-05,
"loss": 15.7846,
"step": 522
},
{
"epoch": 2.2683982683982684,
"grad_norm": 0.0039366851560771465,
"learning_rate": 9.016486507271803e-05,
"loss": 15.8136,
"step": 524
},
{
"epoch": 2.277056277056277,
"grad_norm": 0.004376592580229044,
"learning_rate": 8.945215632289912e-05,
"loss": 15.7964,
"step": 526
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.002474917098879814,
"learning_rate": 8.873998897822336e-05,
"loss": 15.9371,
"step": 528
},
{
"epoch": 2.2943722943722946,
"grad_norm": 0.0036089515779167414,
"learning_rate": 8.802839959318239e-05,
"loss": 15.7513,
"step": 530
},
{
"epoch": 2.303030303030303,
"grad_norm": 0.0045991260558366776,
"learning_rate": 8.731742469260201e-05,
"loss": 15.7613,
"step": 532
},
{
"epoch": 2.311688311688312,
"grad_norm": 0.0072856624610722065,
"learning_rate": 8.66071007697674e-05,
"loss": 16.4219,
"step": 534
},
{
"epoch": 2.3203463203463204,
"grad_norm": 0.0017560477135702968,
"learning_rate": 8.58974642845501e-05,
"loss": 16.0309,
"step": 536
},
{
"epoch": 2.329004329004329,
"grad_norm": 0.004301704466342926,
"learning_rate": 8.518855166153644e-05,
"loss": 27.1619,
"step": 538
},
{
"epoch": 2.3376623376623376,
"grad_norm": 0.059041913598775864,
"learning_rate": 8.448039928815804e-05,
"loss": 16.0355,
"step": 540
},
{
"epoch": 2.346320346320346,
"grad_norm": 0.002861848333850503,
"learning_rate": 8.377304351282399e-05,
"loss": 15.8526,
"step": 542
},
{
"epoch": 2.354978354978355,
"grad_norm": 0.003043128876015544,
"learning_rate": 8.306652064305517e-05,
"loss": 15.7983,
"step": 544
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.0002984661259688437,
"learning_rate": 8.23608669436207e-05,
"loss": 15.8203,
"step": 546
},
{
"epoch": 2.3722943722943723,
"grad_norm": 0.0006706724525429308,
"learning_rate": 8.165611863467644e-05,
"loss": 15.7032,
"step": 548
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.0023594857193529606,
"learning_rate": 8.095231188990597e-05,
"loss": 15.8033,
"step": 550
},
{
"epoch": 2.3896103896103895,
"grad_norm": 0.0,
"learning_rate": 8.024948283466367e-05,
"loss": 15.722,
"step": 552
},
{
"epoch": 2.398268398268398,
"grad_norm": 0.0003026532067451626,
"learning_rate": 7.954766754412066e-05,
"loss": 15.7503,
"step": 554
},
{
"epoch": 2.4069264069264067,
"grad_norm": 0.0020434176549315453,
"learning_rate": 7.884690204141298e-05,
"loss": 15.8143,
"step": 556
},
{
"epoch": 2.4155844155844157,
"grad_norm": 0.03126376122236252,
"learning_rate": 7.814722229579264e-05,
"loss": 16.6621,
"step": 558
},
{
"epoch": 2.4242424242424243,
"grad_norm": 0.0025495837908238173,
"learning_rate": 7.744866422078133e-05,
"loss": 15.757,
"step": 560
},
{
"epoch": 2.432900432900433,
"grad_norm": 0.0023321521002799273,
"learning_rate": 7.67512636723271e-05,
"loss": 15.7909,
"step": 562
},
{
"epoch": 2.4415584415584415,
"grad_norm": 0.0014553000219166279,
"learning_rate": 7.605505644696387e-05,
"loss": 15.7724,
"step": 564
},
{
"epoch": 2.45021645021645,
"grad_norm": 0.0026701318565756083,
"learning_rate": 7.536007827997397e-05,
"loss": 15.7491,
"step": 566
},
{
"epoch": 2.458874458874459,
"grad_norm": 0.0020363188814371824,
"learning_rate": 7.46663648435541e-05,
"loss": 15.7404,
"step": 568
},
{
"epoch": 2.4675324675324677,
"grad_norm": 0.005381477996706963,
"learning_rate": 7.397395174498417e-05,
"loss": 15.7682,
"step": 570
},
{
"epoch": 2.4761904761904763,
"grad_norm": 0.0010892607970163226,
"learning_rate": 7.328287452479968e-05,
"loss": 15.7591,
"step": 572
},
{
"epoch": 2.484848484848485,
"grad_norm": 0.0046081384643912315,
"learning_rate": 7.259316865496757e-05,
"loss": 16.2354,
"step": 574
},
{
"epoch": 2.4935064935064934,
"grad_norm": 0.0023402958177030087,
"learning_rate": 7.19048695370652e-05,
"loss": 15.8337,
"step": 576
},
{
"epoch": 2.502164502164502,
"grad_norm": 0.0027834863867610693,
"learning_rate": 7.121801250046363e-05,
"loss": 15.9034,
"step": 578
},
{
"epoch": 2.5108225108225106,
"grad_norm": 0.004065465647727251,
"learning_rate": 7.053263280051394e-05,
"loss": 15.717,
"step": 580
},
{
"epoch": 2.5194805194805197,
"grad_norm": 0.0,
"learning_rate": 6.984876561673776e-05,
"loss": 15.7805,
"step": 582
},
{
"epoch": 2.5281385281385282,
"grad_norm": 0.002813364379107952,
"learning_rate": 6.91664460510215e-05,
"loss": 15.7205,
"step": 584
},
{
"epoch": 2.536796536796537,
"grad_norm": 0.0021334670018404722,
"learning_rate": 6.848570912581463e-05,
"loss": 15.8153,
"step": 586
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.003558355150744319,
"learning_rate": 6.780658978233199e-05,
"loss": 15.7313,
"step": 588
},
{
"epoch": 2.554112554112554,
"grad_norm": 0.001838353113271296,
"learning_rate": 6.71291228787604e-05,
"loss": 15.6268,
"step": 590
},
{
"epoch": 2.562770562770563,
"grad_norm": 0.003100910922512412,
"learning_rate": 6.64533431884694e-05,
"loss": 15.7013,
"step": 592
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.0012829442275688052,
"learning_rate": 6.57792853982264e-05,
"loss": 15.832,
"step": 594
},
{
"epoch": 2.58008658008658,
"grad_norm": 0.001535665593110025,
"learning_rate": 6.51069841064162e-05,
"loss": 15.7034,
"step": 596
},
{
"epoch": 2.588744588744589,
"grad_norm": 0.010908816941082478,
"learning_rate": 6.443647382126509e-05,
"loss": 15.8574,
"step": 598
},
{
"epoch": 2.5974025974025974,
"grad_norm": 0.0028520359192043543,
"learning_rate": 6.376778895906976e-05,
"loss": 15.8502,
"step": 600
},
{
"epoch": 2.606060606060606,
"grad_norm": 0.011398903094232082,
"learning_rate": 6.310096384243061e-05,
"loss": 15.9701,
"step": 602
},
{
"epoch": 2.6147186147186146,
"grad_norm": 0.0021338535007089376,
"learning_rate": 6.243603269849003e-05,
"loss": 15.7824,
"step": 604
},
{
"epoch": 2.6233766233766236,
"grad_norm": 0.004288196098059416,
"learning_rate": 6.177302965717566e-05,
"loss": 15.7025,
"step": 606
},
{
"epoch": 2.632034632034632,
"grad_norm": 0.0022099835332483053,
"learning_rate": 6.111198874944845e-05,
"loss": 15.8339,
"step": 608
},
{
"epoch": 2.6406926406926408,
"grad_norm": 0.0,
"learning_rate": 6.045294390555598e-05,
"loss": 15.6495,
"step": 610
},
{
"epoch": 2.6493506493506493,
"grad_norm": 0.06401393562555313,
"learning_rate": 5.979592895329085e-05,
"loss": 15.9235,
"step": 612
},
{
"epoch": 2.658008658008658,
"grad_norm": 0.0,
"learning_rate": 5.914097761625428e-05,
"loss": 15.7719,
"step": 614
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.0014950314071029425,
"learning_rate": 5.848812351212522e-05,
"loss": 15.8033,
"step": 616
},
{
"epoch": 2.675324675324675,
"grad_norm": 0.003303313162177801,
"learning_rate": 5.783740015093484e-05,
"loss": 15.8201,
"step": 618
},
{
"epoch": 2.683982683982684,
"grad_norm": 0.0021313403267413378,
"learning_rate": 5.718884093334627e-05,
"loss": 15.6961,
"step": 620
},
{
"epoch": 2.6926406926406927,
"grad_norm": 0.004075351171195507,
"learning_rate": 5.654247914894058e-05,
"loss": 15.8514,
"step": 622
},
{
"epoch": 2.7012987012987013,
"grad_norm": 0.0010667102178558707,
"learning_rate": 5.589834797450764e-05,
"loss": 15.9098,
"step": 624
},
{
"epoch": 2.70995670995671,
"grad_norm": 0.0020862577948719263,
"learning_rate": 5.525648047234364e-05,
"loss": 15.8231,
"step": 626
},
{
"epoch": 2.7186147186147185,
"grad_norm": 0.0027077968697994947,
"learning_rate": 5.4616909588553674e-05,
"loss": 15.7527,
"step": 628
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.0037543477956205606,
"learning_rate": 5.3979668151360905e-05,
"loss": 15.8286,
"step": 630
},
{
"epoch": 2.7359307359307357,
"grad_norm": 0.0034395060501992702,
"learning_rate": 5.33447888694214e-05,
"loss": 15.8517,
"step": 632
},
{
"epoch": 2.7445887445887447,
"grad_norm": 0.0016386689385399222,
"learning_rate": 5.271230433014542e-05,
"loss": 15.6296,
"step": 634
},
{
"epoch": 2.7532467532467533,
"grad_norm": 0.002882522065192461,
"learning_rate": 5.2082246998024485e-05,
"loss": 15.9626,
"step": 636
},
{
"epoch": 2.761904761904762,
"grad_norm": 0.0016715782694518566,
"learning_rate": 5.145464921296537e-05,
"loss": 16.1011,
"step": 638
},
{
"epoch": 2.7705627705627704,
"grad_norm": 0.003780187340453267,
"learning_rate": 5.082954318862978e-05,
"loss": 15.7561,
"step": 640
},
{
"epoch": 2.779220779220779,
"grad_norm": 0.0015209164703264832,
"learning_rate": 5.0206961010781085e-05,
"loss": 15.7211,
"step": 642
},
{
"epoch": 2.787878787878788,
"grad_norm": 0.003561074612662196,
"learning_rate": 4.958693463563748e-05,
"loss": 15.9192,
"step": 644
},
{
"epoch": 2.7965367965367967,
"grad_norm": 0.0025618516374379396,
"learning_rate": 4.8969495888231484e-05,
"loss": 15.959,
"step": 646
},
{
"epoch": 2.8051948051948052,
"grad_norm": 0.0,
"learning_rate": 4.835467646077656e-05,
"loss": 15.6299,
"step": 648
},
{
"epoch": 2.813852813852814,
"grad_norm": 0.0032659226562827826,
"learning_rate": 4.7742507911040325e-05,
"loss": 15.8226,
"step": 650
},
{
"epoch": 2.8225108225108224,
"grad_norm": 0.0029562402050942183,
"learning_rate": 4.713302166072492e-05,
"loss": 16.5945,
"step": 652
},
{
"epoch": 2.8311688311688314,
"grad_norm": 0.003705563023686409,
"learning_rate": 4.652624899385387e-05,
"loss": 15.8928,
"step": 654
},
{
"epoch": 2.8398268398268396,
"grad_norm": 0.002291543409228325,
"learning_rate": 4.5922221055166656e-05,
"loss": 15.6579,
"step": 656
},
{
"epoch": 2.8484848484848486,
"grad_norm": 0.00401789927855134,
"learning_rate": 4.532096884851978e-05,
"loss": 15.9641,
"step": 658
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.003257931210100651,
"learning_rate": 4.4722523235295745e-05,
"loss": 15.6643,
"step": 660
},
{
"epoch": 2.865800865800866,
"grad_norm": 0.0,
"learning_rate": 4.41269149328185e-05,
"loss": 15.7199,
"step": 662
},
{
"epoch": 2.8744588744588744,
"grad_norm": 0.0021371468901634216,
"learning_rate": 4.3534174512777324e-05,
"loss": 15.8272,
"step": 664
},
{
"epoch": 2.883116883116883,
"grad_norm": 0.0009620334021747112,
"learning_rate": 4.2944332399657184e-05,
"loss": 16.1842,
"step": 666
},
{
"epoch": 2.891774891774892,
"grad_norm": 0.0020075475331395864,
"learning_rate": 4.2357418869177354e-05,
"loss": 15.6193,
"step": 668
},
{
"epoch": 2.9004329004329006,
"grad_norm": 0.0,
"learning_rate": 4.1773464046737276e-05,
"loss": 15.6615,
"step": 670
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.000680964847560972,
"learning_rate": 4.1192497905870276e-05,
"loss": 15.7517,
"step": 672
},
{
"epoch": 2.9177489177489178,
"grad_norm": 0.004122753627598286,
"learning_rate": 4.061455026670509e-05,
"loss": 16.6327,
"step": 674
},
{
"epoch": 2.9264069264069263,
"grad_norm": 0.002312893746420741,
"learning_rate": 4.0039650794435344e-05,
"loss": 15.8981,
"step": 676
},
{
"epoch": 2.935064935064935,
"grad_norm": 0.024043424054980278,
"learning_rate": 3.946782899779667e-05,
"loss": 15.8999,
"step": 678
},
{
"epoch": 2.9437229437229435,
"grad_norm": 0.0012768743326887488,
"learning_rate": 3.889911422755231e-05,
"loss": 15.6874,
"step": 680
},
{
"epoch": 2.9523809523809526,
"grad_norm": 0.0023466376587748528,
"learning_rate": 3.8333535674986275e-05,
"loss": 15.7274,
"step": 682
},
{
"epoch": 2.961038961038961,
"grad_norm": 0.0034092010464519262,
"learning_rate": 3.777112237040537e-05,
"loss": 15.7464,
"step": 684
},
{
"epoch": 2.9696969696969697,
"grad_norm": 0.0017824557144194841,
"learning_rate": 3.721190318164877e-05,
"loss": 15.7556,
"step": 686
},
{
"epoch": 2.9783549783549783,
"grad_norm": 0.0014683044282719493,
"learning_rate": 3.665590681260658e-05,
"loss": 16.0187,
"step": 688
},
{
"epoch": 2.987012987012987,
"grad_norm": 0.00010432133422000334,
"learning_rate": 3.610316180174622e-05,
"loss": 16.4009,
"step": 690
},
{
"epoch": 2.995670995670996,
"grad_norm": 0.003945178352296352,
"learning_rate": 3.555369652064787e-05,
"loss": 15.7738,
"step": 692
},
{
"epoch": 3.0043290043290045,
"grad_norm": 0.0048090131022036076,
"learning_rate": 3.500753917254787e-05,
"loss": 15.9384,
"step": 694
},
{
"epoch": 3.012987012987013,
"grad_norm": 0.002933148993179202,
"learning_rate": 3.446471779089144e-05,
"loss": 15.8317,
"step": 696
},
{
"epoch": 3.0216450216450217,
"grad_norm": 0.003420398337766528,
"learning_rate": 3.392526023789349e-05,
"loss": 15.536,
"step": 698
},
{
"epoch": 3.0303030303030303,
"grad_norm": 0.0001099475848604925,
"learning_rate": 3.338919420310871e-05,
"loss": 15.5597,
"step": 700
},
{
"epoch": 3.038961038961039,
"grad_norm": 0.00021568694501183927,
"learning_rate": 3.28565472020101e-05,
"loss": 15.603,
"step": 702
},
{
"epoch": 3.0476190476190474,
"grad_norm": 0.003983495756983757,
"learning_rate": 3.2327346574576753e-05,
"loss": 15.5045,
"step": 704
},
{
"epoch": 3.0562770562770565,
"grad_norm": 0.00216678692959249,
"learning_rate": 3.180161948389062e-05,
"loss": 15.5247,
"step": 706
},
{
"epoch": 3.064935064935065,
"grad_norm": 0.031781021505594254,
"learning_rate": 3.1279392914742046e-05,
"loss": 15.4569,
"step": 708
},
{
"epoch": 3.0735930735930737,
"grad_norm": 0.00032434993772767484,
"learning_rate": 3.076069367224486e-05,
"loss": 16.0896,
"step": 710
},
{
"epoch": 3.0822510822510822,
"grad_norm": 0.0037430988159030676,
"learning_rate": 3.0245548380460486e-05,
"loss": 15.6038,
"step": 712
},
{
"epoch": 3.090909090909091,
"grad_norm": 0.0002255355502711609,
"learning_rate": 2.9733983481031302e-05,
"loss": 15.3307,
"step": 714
},
{
"epoch": 3.0995670995670994,
"grad_norm": 0.0028247262816876173,
"learning_rate": 2.922602523182344e-05,
"loss": 15.4397,
"step": 716
},
{
"epoch": 3.108225108225108,
"grad_norm": 0.018052997067570686,
"learning_rate": 2.872169970557913e-05,
"loss": 15.599,
"step": 718
},
{
"epoch": 3.116883116883117,
"grad_norm": 0.0025733679067343473,
"learning_rate": 2.8221032788578205e-05,
"loss": 15.4745,
"step": 720
},
{
"epoch": 3.1255411255411256,
"grad_norm": 0.0,
"learning_rate": 2.7724050179309646e-05,
"loss": 15.9485,
"step": 722
},
{
"epoch": 3.134199134199134,
"grad_norm": 0.0,
"learning_rate": 2.7230777387152296e-05,
"loss": 15.4227,
"step": 724
},
{
"epoch": 3.142857142857143,
"grad_norm": 0.000987619161605835,
"learning_rate": 2.6741239731065647e-05,
"loss": 15.5741,
"step": 726
},
{
"epoch": 3.1515151515151514,
"grad_norm": 0.0019787976052612066,
"learning_rate": 2.625546233829016e-05,
"loss": 16.6376,
"step": 728
},
{
"epoch": 3.16017316017316,
"grad_norm": 0.0037663017865270376,
"learning_rate": 2.5773470143057655e-05,
"loss": 15.5158,
"step": 730
},
{
"epoch": 3.168831168831169,
"grad_norm": 0.0,
"learning_rate": 2.529528788531128e-05,
"loss": 15.4871,
"step": 732
},
{
"epoch": 3.1774891774891776,
"grad_norm": 0.0023285530041903257,
"learning_rate": 2.4820940109435885e-05,
"loss": 15.4632,
"step": 734
},
{
"epoch": 3.186147186147186,
"grad_norm": 0.0020200940780341625,
"learning_rate": 2.4350451162997877e-05,
"loss": 15.5132,
"step": 736
},
{
"epoch": 3.1948051948051948,
"grad_norm": 0.003936352673918009,
"learning_rate": 2.3883845195495878e-05,
"loss": 15.5417,
"step": 738
},
{
"epoch": 3.2034632034632033,
"grad_norm": 0.0037546472158282995,
"learning_rate": 2.342114615712081e-05,
"loss": 15.9803,
"step": 740
},
{
"epoch": 3.212121212121212,
"grad_norm": 0.0005508167669177055,
"learning_rate": 2.296237779752687e-05,
"loss": 15.2634,
"step": 742
},
{
"epoch": 3.220779220779221,
"grad_norm": 0.0036611163523048162,
"learning_rate": 2.2507563664612252e-05,
"loss": 15.5582,
"step": 744
},
{
"epoch": 3.2294372294372296,
"grad_norm": 0.003752505173906684,
"learning_rate": 2.205672710331059e-05,
"loss": 15.5934,
"step": 746
},
{
"epoch": 3.238095238095238,
"grad_norm": 0.0008896394865587354,
"learning_rate": 2.1609891254392678e-05,
"loss": 15.4942,
"step": 748
},
{
"epoch": 3.2467532467532467,
"grad_norm": 0.0,
"learning_rate": 2.1167079053278737e-05,
"loss": 15.3308,
"step": 750
},
{
"epoch": 3.2554112554112553,
"grad_norm": 0.00010732792725320905,
"learning_rate": 2.072831322886105e-05,
"loss": 15.3753,
"step": 752
},
{
"epoch": 3.264069264069264,
"grad_norm": 0.003391178324818611,
"learning_rate": 2.029361630233747e-05,
"loss": 15.4389,
"step": 754
},
{
"epoch": 3.2727272727272725,
"grad_norm": 0.0003303180856164545,
"learning_rate": 1.986301058605531e-05,
"loss": 15.385,
"step": 756
},
{
"epoch": 3.2813852813852815,
"grad_norm": 0.0005520053091458976,
"learning_rate": 1.9436518182366158e-05,
"loss": 15.4051,
"step": 758
},
{
"epoch": 3.29004329004329,
"grad_norm": 0.003017255337908864,
"learning_rate": 1.901416098249136e-05,
"loss": 15.647,
"step": 760
},
{
"epoch": 3.2987012987012987,
"grad_norm": 0.021177958697080612,
"learning_rate": 1.8595960665398458e-05,
"loss": 15.6138,
"step": 762
},
{
"epoch": 3.3073593073593073,
"grad_norm": 0.0034116676542907953,
"learning_rate": 1.8181938696688296e-05,
"loss": 15.4569,
"step": 764
},
{
"epoch": 3.316017316017316,
"grad_norm": 0.00011423804244259372,
"learning_rate": 1.7772116327493372e-05,
"loss": 15.4194,
"step": 766
},
{
"epoch": 3.324675324675325,
"grad_norm": 0.0,
"learning_rate": 1.736651459338695e-05,
"loss": 15.4586,
"step": 768
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.001321481540799141,
"learning_rate": 1.6965154313303368e-05,
"loss": 15.613,
"step": 770
},
{
"epoch": 3.341991341991342,
"grad_norm": 0.0015613286523148417,
"learning_rate": 1.6568056088469387e-05,
"loss": 15.4395,
"step": 772
},
{
"epoch": 3.3506493506493507,
"grad_norm": 0.00010790528176585212,
"learning_rate": 1.6175240301346906e-05,
"loss": 15.4752,
"step": 774
},
{
"epoch": 3.3593073593073592,
"grad_norm": 0.003342802170664072,
"learning_rate": 1.5786727114586586e-05,
"loss": 15.4756,
"step": 776
},
{
"epoch": 3.367965367965368,
"grad_norm": 0.001664191484451294,
"learning_rate": 1.540253646999299e-05,
"loss": 15.4152,
"step": 778
},
{
"epoch": 3.3766233766233764,
"grad_norm": 0.0,
"learning_rate": 1.5022688087501092e-05,
"loss": 15.3462,
"step": 780
},
{
"epoch": 3.3852813852813854,
"grad_norm": 0.0,
"learning_rate": 1.4647201464163906e-05,
"loss": 15.4894,
"step": 782
},
{
"epoch": 3.393939393939394,
"grad_norm": 0.002181015908718109,
"learning_rate": 1.4276095873151952e-05,
"loss": 15.4537,
"step": 784
},
{
"epoch": 3.4025974025974026,
"grad_norm": 0.0,
"learning_rate": 1.3909390362763752e-05,
"loss": 15.4039,
"step": 786
},
{
"epoch": 3.411255411255411,
"grad_norm": 0.001633924781344831,
"learning_rate": 1.3547103755448287e-05,
"loss": 15.3927,
"step": 788
},
{
"epoch": 3.41991341991342,
"grad_norm": 0.0019662026315927505,
"learning_rate": 1.3189254646838767e-05,
"loss": 15.573,
"step": 790
},
{
"epoch": 3.4285714285714284,
"grad_norm": 0.0017789127305150032,
"learning_rate": 1.2835861404798265e-05,
"loss": 15.4386,
"step": 792
},
{
"epoch": 3.4372294372294374,
"grad_norm": 0.0037333201617002487,
"learning_rate": 1.2486942168476756e-05,
"loss": 15.4277,
"step": 794
},
{
"epoch": 3.445887445887446,
"grad_norm": 0.00033077617990784347,
"learning_rate": 1.2142514847380237e-05,
"loss": 15.43,
"step": 796
},
{
"epoch": 3.4545454545454546,
"grad_norm": 0.0,
"learning_rate": 1.1802597120451286e-05,
"loss": 15.5442,
"step": 798
},
{
"epoch": 3.463203463203463,
"grad_norm": 0.0014221479650586843,
"learning_rate": 1.146720643516177e-05,
"loss": 15.4446,
"step": 800
},
{
"epoch": 3.4718614718614718,
"grad_norm": 0.0004546408890746534,
"learning_rate": 1.1136360006617185e-05,
"loss": 16.2801,
"step": 802
},
{
"epoch": 3.4805194805194803,
"grad_norm": 0.0038407333195209503,
"learning_rate": 1.0810074816673154e-05,
"loss": 26.5808,
"step": 804
},
{
"epoch": 3.4891774891774894,
"grad_norm": 0.000989666790701449,
"learning_rate": 1.048836761306361e-05,
"loss": 15.379,
"step": 806
},
{
"epoch": 3.497835497835498,
"grad_norm": 0.0,
"learning_rate": 1.0171254908541372e-05,
"loss": 15.4584,
"step": 808
},
{
"epoch": 3.5064935064935066,
"grad_norm": 0.0,
"learning_rate": 9.858752980030295e-06,
"loss": 15.609,
"step": 810
},
{
"epoch": 3.515151515151515,
"grad_norm": 0.0013323862804099917,
"learning_rate": 9.550877867790065e-06,
"loss": 15.5316,
"step": 812
},
{
"epoch": 3.5238095238095237,
"grad_norm": 0.00010888870747294277,
"learning_rate": 9.247645374592717e-06,
"loss": 15.4492,
"step": 814
},
{
"epoch": 3.5324675324675323,
"grad_norm": 0.00010042625217465684,
"learning_rate": 8.949071064911585e-06,
"loss": 15.4255,
"step": 816
},
{
"epoch": 3.541125541125541,
"grad_norm": 0.0003212654555682093,
"learning_rate": 8.655170264122303e-06,
"loss": 15.9795,
"step": 818
},
{
"epoch": 3.54978354978355,
"grad_norm": 0.003979097120463848,
"learning_rate": 8.365958057716338e-06,
"loss": 15.5248,
"step": 820
},
{
"epoch": 3.5584415584415585,
"grad_norm": 0.0019642910920083523,
"learning_rate": 8.081449290526432e-06,
"loss": 15.4587,
"step": 822
},
{
"epoch": 3.567099567099567,
"grad_norm": 0.008268770761787891,
"learning_rate": 7.80165856596492e-06,
"loss": 15.7592,
"step": 824
},
{
"epoch": 3.5757575757575757,
"grad_norm": 0.0025506443344056606,
"learning_rate": 7.526600245273918e-06,
"loss": 15.8564,
"step": 826
},
{
"epoch": 3.5844155844155843,
"grad_norm": 0.0012161307968199253,
"learning_rate": 7.256288446788362e-06,
"loss": 15.3659,
"step": 828
},
{
"epoch": 3.5930735930735933,
"grad_norm": 0.004051461815834045,
"learning_rate": 6.9907370452112046e-06,
"loss": 15.9221,
"step": 830
},
{
"epoch": 3.601731601731602,
"grad_norm": 0.003108682343736291,
"learning_rate": 6.729959670901309e-06,
"loss": 15.5334,
"step": 832
},
{
"epoch": 3.6103896103896105,
"grad_norm": 0.0033409043680876493,
"learning_rate": 6.4739697091738e-06,
"loss": 15.426,
"step": 834
},
{
"epoch": 3.619047619047619,
"grad_norm": 0.0016489011468365788,
"learning_rate": 6.222780299613074e-06,
"loss": 15.3453,
"step": 836
},
{
"epoch": 3.6277056277056277,
"grad_norm": 0.00321365287527442,
"learning_rate": 5.976404335398256e-06,
"loss": 15.607,
"step": 838
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.00022275045921560377,
"learning_rate": 5.734854462641548e-06,
"loss": 15.4681,
"step": 840
},
{
"epoch": 3.645021645021645,
"grad_norm": 0.0018924333853647113,
"learning_rate": 5.498143079738971e-06,
"loss": 15.4828,
"step": 842
},
{
"epoch": 3.653679653679654,
"grad_norm": 0.0013136648340150714,
"learning_rate": 5.2662823367340855e-06,
"loss": 15.2643,
"step": 844
},
{
"epoch": 3.6623376623376624,
"grad_norm": 0.00011183915194123983,
"learning_rate": 5.039284134694333e-06,
"loss": 15.4385,
"step": 846
},
{
"epoch": 3.670995670995671,
"grad_norm": 0.0032093904446810484,
"learning_rate": 4.817160125100106e-06,
"loss": 15.4845,
"step": 848
},
{
"epoch": 3.6796536796536796,
"grad_norm": 0.001872088760137558,
"learning_rate": 4.599921709246812e-06,
"loss": 15.3745,
"step": 850
},
{
"epoch": 3.688311688311688,
"grad_norm": 0.002430541208013892,
"learning_rate": 4.3875800376595e-06,
"loss": 15.3891,
"step": 852
},
{
"epoch": 3.6969696969696972,
"grad_norm": 0.003143192734569311,
"learning_rate": 4.180146009520702e-06,
"loss": 15.6386,
"step": 854
},
{
"epoch": 3.7056277056277054,
"grad_norm": 0.003909118473529816,
"learning_rate": 3.977630272110811e-06,
"loss": 15.8103,
"step": 856
},
{
"epoch": 3.7142857142857144,
"grad_norm": 0.00011017678480129689,
"learning_rate": 3.780043220261764e-06,
"loss": 15.4262,
"step": 858
},
{
"epoch": 3.722943722943723,
"grad_norm": 0.002079217229038477,
"learning_rate": 3.587394995823301e-06,
"loss": 15.5448,
"step": 860
},
{
"epoch": 3.7316017316017316,
"grad_norm": 0.00011921657278435305,
"learning_rate": 3.3996954871425845e-06,
"loss": 15.3458,
"step": 862
},
{
"epoch": 3.74025974025974,
"grad_norm": 0.0003392777871340513,
"learning_rate": 3.216954328556443e-06,
"loss": 15.4115,
"step": 864
},
{
"epoch": 3.7489177489177488,
"grad_norm": 0.0011154324747622013,
"learning_rate": 3.039180899897043e-06,
"loss": 15.5333,
"step": 866
},
{
"epoch": 3.757575757575758,
"grad_norm": 0.0,
"learning_rate": 2.8663843260103074e-06,
"loss": 15.4002,
"step": 868
},
{
"epoch": 3.7662337662337664,
"grad_norm": 0.00021322118118405342,
"learning_rate": 2.698573476287658e-06,
"loss": 15.634,
"step": 870
},
{
"epoch": 3.774891774891775,
"grad_norm": 0.0,
"learning_rate": 2.535756964210634e-06,
"loss": 15.2512,
"step": 872
},
{
"epoch": 3.7835497835497836,
"grad_norm": 0.0,
"learning_rate": 2.37794314690889e-06,
"loss": 15.5646,
"step": 874
},
{
"epoch": 3.792207792207792,
"grad_norm": 0.0,
"learning_rate": 2.225140124731151e-06,
"loss": 15.436,
"step": 876
},
{
"epoch": 3.8008658008658007,
"grad_norm": 0.0018814082723110914,
"learning_rate": 2.0773557408295343e-06,
"loss": 15.5011,
"step": 878
},
{
"epoch": 3.8095238095238093,
"grad_norm": 0.0019670824985951185,
"learning_rate": 1.9345975807568474e-06,
"loss": 15.3244,
"step": 880
},
{
"epoch": 3.8181818181818183,
"grad_norm": 0.0035723568871617317,
"learning_rate": 1.7968729720773459e-06,
"loss": 15.6439,
"step": 882
},
{
"epoch": 3.826839826839827,
"grad_norm": 0.0,
"learning_rate": 1.6641889839905445e-06,
"loss": 15.3774,
"step": 884
},
{
"epoch": 3.8354978354978355,
"grad_norm": 0.0,
"learning_rate": 1.536552426968396e-06,
"loss": 15.3533,
"step": 886
},
{
"epoch": 3.844155844155844,
"grad_norm": 0.0,
"learning_rate": 1.4139698524057165e-06,
"loss": 16.2874,
"step": 888
},
{
"epoch": 3.8528138528138527,
"grad_norm": 0.00011134289525216445,
"learning_rate": 1.2964475522839304e-06,
"loss": 15.4719,
"step": 890
},
{
"epoch": 3.8614718614718617,
"grad_norm": 0.00011475420615170151,
"learning_rate": 1.1839915588480743e-06,
"loss": 15.2561,
"step": 892
},
{
"epoch": 3.87012987012987,
"grad_norm": 0.0,
"learning_rate": 1.0766076442971895e-06,
"loss": 15.4674,
"step": 894
},
{
"epoch": 3.878787878787879,
"grad_norm": 0.0022935173474252224,
"learning_rate": 9.74301320488058e-07,
"loss": 15.6186,
"step": 896
},
{
"epoch": 3.8874458874458875,
"grad_norm": 0.0,
"learning_rate": 8.770778386522627e-07,
"loss": 15.3455,
"step": 898
},
{
"epoch": 3.896103896103896,
"grad_norm": 0.0,
"learning_rate": 7.849421891266584e-07,
"loss": 15.4871,
"step": 900
},
{
"epoch": 3.9047619047619047,
"grad_norm": 0.0,
"learning_rate": 6.978991010972547e-07,
"loss": 16.203,
"step": 902
},
{
"epoch": 3.9134199134199132,
"grad_norm": 0.0,
"learning_rate": 6.159530423563986e-07,
"loss": 15.4409,
"step": 904
},
{
"epoch": 3.9220779220779223,
"grad_norm": 0.004353491589426994,
"learning_rate": 5.391082190735252e-07,
"loss": 15.4641,
"step": 906
},
{
"epoch": 3.930735930735931,
"grad_norm": 0.0022043841890990734,
"learning_rate": 4.6736857557925227e-07,
"loss": 15.527,
"step": 908
},
{
"epoch": 3.9393939393939394,
"grad_norm": 0.0,
"learning_rate": 4.007377941628754e-07,
"loss": 15.3882,
"step": 910
},
{
"epoch": 3.948051948051948,
"grad_norm": 0.0020064269192516804,
"learning_rate": 3.392192948833861e-07,
"loss": 15.5124,
"step": 912
},
{
"epoch": 3.9567099567099566,
"grad_norm": 0.00176339247263968,
"learning_rate": 2.828162353939678e-07,
"loss": 15.3776,
"step": 914
},
{
"epoch": 3.965367965367965,
"grad_norm": 0.0016750121721997857,
"learning_rate": 2.315315107798366e-07,
"loss": 15.4065,
"step": 916
},
{
"epoch": 3.974025974025974,
"grad_norm": 0.0027372916229069233,
"learning_rate": 1.8536775340970425e-07,
"loss": 15.5642,
"step": 918
},
{
"epoch": 3.982683982683983,
"grad_norm": 0.0,
"learning_rate": 1.4432733280065335e-07,
"loss": 15.3594,
"step": 920
},
{
"epoch": 3.9913419913419914,
"grad_norm": 0.0021978975273668766,
"learning_rate": 1.0841235549648999e-07,
"loss": 15.4857,
"step": 922
},
{
"epoch": 4.0,
"grad_norm": 0.25208330154418945,
"learning_rate": 7.762466495964127e-08,
"loss": 15.4448,
"step": 924
}
],
"logging_steps": 2,
"max_steps": 924,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3868198638796145e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}