huseinzol05's picture
Upload folder using huggingface_hub
d69615f verified
raw
history blame
210 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.18438844499078058,
"eval_steps": 500,
"global_step": 2400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00015365703749231714,
"grad_norm": 54.03105926513672,
"learning_rate": 1.9999385371850035e-05,
"loss": 5.8486,
"step": 2
},
{
"epoch": 0.00030731407498463427,
"grad_norm": 60.26078414916992,
"learning_rate": 1.999877074370006e-05,
"loss": 5.5476,
"step": 4
},
{
"epoch": 0.00046097111247695143,
"grad_norm": 172.71730041503906,
"learning_rate": 1.9998156115550094e-05,
"loss": 4.8213,
"step": 6
},
{
"epoch": 0.0006146281499692685,
"grad_norm": 80.93025970458984,
"learning_rate": 1.9997541487400124e-05,
"loss": 4.8849,
"step": 8
},
{
"epoch": 0.0007682851874615857,
"grad_norm": 118.46244049072266,
"learning_rate": 1.9996926859250153e-05,
"loss": 4.0143,
"step": 10
},
{
"epoch": 0.0009219422249539029,
"grad_norm": 58.34870910644531,
"learning_rate": 1.9996312231100187e-05,
"loss": 4.0411,
"step": 12
},
{
"epoch": 0.0010755992624462201,
"grad_norm": 35.50437927246094,
"learning_rate": 1.9995697602950216e-05,
"loss": 3.9182,
"step": 14
},
{
"epoch": 0.001229256299938537,
"grad_norm": 60.75965118408203,
"learning_rate": 1.999508297480025e-05,
"loss": 3.7527,
"step": 16
},
{
"epoch": 0.0013829133374308542,
"grad_norm": 53.328765869140625,
"learning_rate": 1.999446834665028e-05,
"loss": 3.5424,
"step": 18
},
{
"epoch": 0.0015365703749231714,
"grad_norm": 40.73623275756836,
"learning_rate": 1.999385371850031e-05,
"loss": 3.7179,
"step": 20
},
{
"epoch": 0.0016902274124154886,
"grad_norm": 15.511174201965332,
"learning_rate": 1.9993239090350342e-05,
"loss": 3.3177,
"step": 22
},
{
"epoch": 0.0018438844499078057,
"grad_norm": 62.24359130859375,
"learning_rate": 1.9992624462200368e-05,
"loss": 3.5207,
"step": 24
},
{
"epoch": 0.001997541487400123,
"grad_norm": 69.20399475097656,
"learning_rate": 1.99920098340504e-05,
"loss": 3.6051,
"step": 26
},
{
"epoch": 0.0021511985248924403,
"grad_norm": 39.98881530761719,
"learning_rate": 1.999139520590043e-05,
"loss": 3.3193,
"step": 28
},
{
"epoch": 0.0023048555623847574,
"grad_norm": 10.113142013549805,
"learning_rate": 1.999078057775046e-05,
"loss": 3.2187,
"step": 30
},
{
"epoch": 0.002458512599877074,
"grad_norm": 28.31175422668457,
"learning_rate": 1.9990165949600494e-05,
"loss": 3.4458,
"step": 32
},
{
"epoch": 0.0026121696373693913,
"grad_norm": 21.829612731933594,
"learning_rate": 1.9989551321450523e-05,
"loss": 3.5843,
"step": 34
},
{
"epoch": 0.0027658266748617085,
"grad_norm": 18.00796127319336,
"learning_rate": 1.9988936693300556e-05,
"loss": 3.2827,
"step": 36
},
{
"epoch": 0.0029194837123540257,
"grad_norm": 16.2840576171875,
"learning_rate": 1.9988322065150586e-05,
"loss": 3.2059,
"step": 38
},
{
"epoch": 0.003073140749846343,
"grad_norm": 11.987384796142578,
"learning_rate": 1.9987707437000616e-05,
"loss": 3.1313,
"step": 40
},
{
"epoch": 0.00322679778733866,
"grad_norm": 6.873617649078369,
"learning_rate": 1.998709280885065e-05,
"loss": 2.8568,
"step": 42
},
{
"epoch": 0.003380454824830977,
"grad_norm": 6.603146076202393,
"learning_rate": 1.998647818070068e-05,
"loss": 3.0781,
"step": 44
},
{
"epoch": 0.0035341118623232943,
"grad_norm": 25.308164596557617,
"learning_rate": 1.9985863552550708e-05,
"loss": 3.0764,
"step": 46
},
{
"epoch": 0.0036877688998156115,
"grad_norm": 15.176654815673828,
"learning_rate": 1.998524892440074e-05,
"loss": 3.0356,
"step": 48
},
{
"epoch": 0.0038414259373079286,
"grad_norm": 7.444390773773193,
"learning_rate": 1.9984634296250767e-05,
"loss": 2.9488,
"step": 50
},
{
"epoch": 0.003995082974800246,
"grad_norm": 18.565139770507812,
"learning_rate": 1.99840196681008e-05,
"loss": 2.7179,
"step": 52
},
{
"epoch": 0.004148740012292563,
"grad_norm": 10.658416748046875,
"learning_rate": 1.998340503995083e-05,
"loss": 2.716,
"step": 54
},
{
"epoch": 0.0043023970497848806,
"grad_norm": 9.682657241821289,
"learning_rate": 1.9982790411800863e-05,
"loss": 2.9189,
"step": 56
},
{
"epoch": 0.004456054087277198,
"grad_norm": 20.967639923095703,
"learning_rate": 1.9982175783650893e-05,
"loss": 2.8078,
"step": 58
},
{
"epoch": 0.004609711124769515,
"grad_norm": 16.931556701660156,
"learning_rate": 1.9981561155500923e-05,
"loss": 2.837,
"step": 60
},
{
"epoch": 0.004763368162261831,
"grad_norm": 12.055686950683594,
"learning_rate": 1.9980946527350956e-05,
"loss": 2.7392,
"step": 62
},
{
"epoch": 0.004917025199754148,
"grad_norm": 7.959167957305908,
"learning_rate": 1.9980331899200985e-05,
"loss": 2.8915,
"step": 64
},
{
"epoch": 0.0050706822372464655,
"grad_norm": 9.24318790435791,
"learning_rate": 1.9979717271051015e-05,
"loss": 2.5171,
"step": 66
},
{
"epoch": 0.005224339274738783,
"grad_norm": 20.02304458618164,
"learning_rate": 1.9979102642901048e-05,
"loss": 2.7947,
"step": 68
},
{
"epoch": 0.0053779963122311,
"grad_norm": 8.09688663482666,
"learning_rate": 1.9978488014751078e-05,
"loss": 2.7323,
"step": 70
},
{
"epoch": 0.005531653349723417,
"grad_norm": 8.636987686157227,
"learning_rate": 1.9977873386601108e-05,
"loss": 2.6309,
"step": 72
},
{
"epoch": 0.005685310387215734,
"grad_norm": 6.815808296203613,
"learning_rate": 1.997725875845114e-05,
"loss": 2.4741,
"step": 74
},
{
"epoch": 0.005838967424708051,
"grad_norm": 7.532662868499756,
"learning_rate": 1.997664413030117e-05,
"loss": 2.5875,
"step": 76
},
{
"epoch": 0.0059926244622003685,
"grad_norm": 6.733164310455322,
"learning_rate": 1.99760295021512e-05,
"loss": 2.6103,
"step": 78
},
{
"epoch": 0.006146281499692686,
"grad_norm": 6.442116737365723,
"learning_rate": 1.997541487400123e-05,
"loss": 2.6208,
"step": 80
},
{
"epoch": 0.006299938537185003,
"grad_norm": 6.882765769958496,
"learning_rate": 1.9974800245851263e-05,
"loss": 2.5554,
"step": 82
},
{
"epoch": 0.00645359557467732,
"grad_norm": 6.64527702331543,
"learning_rate": 1.9974185617701292e-05,
"loss": 2.5667,
"step": 84
},
{
"epoch": 0.006607252612169637,
"grad_norm": 7.69775390625,
"learning_rate": 1.9973570989551322e-05,
"loss": 2.6117,
"step": 86
},
{
"epoch": 0.006760909649661954,
"grad_norm": 7.077218532562256,
"learning_rate": 1.9972956361401355e-05,
"loss": 2.4501,
"step": 88
},
{
"epoch": 0.0069145666871542714,
"grad_norm": 5.539189338684082,
"learning_rate": 1.9972341733251385e-05,
"loss": 2.4775,
"step": 90
},
{
"epoch": 0.007068223724646589,
"grad_norm": 6.602914333343506,
"learning_rate": 1.9971727105101415e-05,
"loss": 2.3944,
"step": 92
},
{
"epoch": 0.007221880762138906,
"grad_norm": 5.995626449584961,
"learning_rate": 1.9971112476951448e-05,
"loss": 2.4826,
"step": 94
},
{
"epoch": 0.007375537799631223,
"grad_norm": 6.836587429046631,
"learning_rate": 1.9970497848801477e-05,
"loss": 2.468,
"step": 96
},
{
"epoch": 0.00752919483712354,
"grad_norm": 6.4697651863098145,
"learning_rate": 1.9969883220651507e-05,
"loss": 2.2833,
"step": 98
},
{
"epoch": 0.007682851874615857,
"grad_norm": 8.081903457641602,
"learning_rate": 1.996926859250154e-05,
"loss": 2.6544,
"step": 100
},
{
"epoch": 0.007836508912108174,
"grad_norm": 6.688724517822266,
"learning_rate": 1.996865396435157e-05,
"loss": 2.55,
"step": 102
},
{
"epoch": 0.007990165949600492,
"grad_norm": 6.878283977508545,
"learning_rate": 1.99680393362016e-05,
"loss": 2.2658,
"step": 104
},
{
"epoch": 0.008143822987092809,
"grad_norm": 7.079164505004883,
"learning_rate": 1.996742470805163e-05,
"loss": 2.4793,
"step": 106
},
{
"epoch": 0.008297480024585127,
"grad_norm": 6.391737461090088,
"learning_rate": 1.9966810079901662e-05,
"loss": 2.4154,
"step": 108
},
{
"epoch": 0.008451137062077443,
"grad_norm": 7.503854274749756,
"learning_rate": 1.9966195451751692e-05,
"loss": 2.3137,
"step": 110
},
{
"epoch": 0.008604794099569761,
"grad_norm": 6.10397481918335,
"learning_rate": 1.996558082360172e-05,
"loss": 2.4306,
"step": 112
},
{
"epoch": 0.008758451137062077,
"grad_norm": 6.1603264808654785,
"learning_rate": 1.9964966195451755e-05,
"loss": 2.2859,
"step": 114
},
{
"epoch": 0.008912108174554395,
"grad_norm": 7.389194011688232,
"learning_rate": 1.9964351567301784e-05,
"loss": 2.3876,
"step": 116
},
{
"epoch": 0.009065765212046712,
"grad_norm": 6.887446403503418,
"learning_rate": 1.9963736939151814e-05,
"loss": 2.5139,
"step": 118
},
{
"epoch": 0.00921942224953903,
"grad_norm": 7.2416768074035645,
"learning_rate": 1.9963122311001847e-05,
"loss": 2.5653,
"step": 120
},
{
"epoch": 0.009373079287031346,
"grad_norm": 7.454037189483643,
"learning_rate": 1.9962507682851877e-05,
"loss": 2.3707,
"step": 122
},
{
"epoch": 0.009526736324523662,
"grad_norm": 6.9176483154296875,
"learning_rate": 1.9961893054701906e-05,
"loss": 2.4158,
"step": 124
},
{
"epoch": 0.00968039336201598,
"grad_norm": 7.838490009307861,
"learning_rate": 1.9961278426551936e-05,
"loss": 2.45,
"step": 126
},
{
"epoch": 0.009834050399508297,
"grad_norm": 6.680061340332031,
"learning_rate": 1.996066379840197e-05,
"loss": 2.1975,
"step": 128
},
{
"epoch": 0.009987707437000615,
"grad_norm": 7.567671775817871,
"learning_rate": 1.9960049170252e-05,
"loss": 2.1892,
"step": 130
},
{
"epoch": 0.010141364474492931,
"grad_norm": 6.0987396240234375,
"learning_rate": 1.995943454210203e-05,
"loss": 2.2957,
"step": 132
},
{
"epoch": 0.010295021511985249,
"grad_norm": 6.579552173614502,
"learning_rate": 1.995881991395206e-05,
"loss": 2.4326,
"step": 134
},
{
"epoch": 0.010448678549477565,
"grad_norm": 7.131938934326172,
"learning_rate": 1.995820528580209e-05,
"loss": 2.4767,
"step": 136
},
{
"epoch": 0.010602335586969883,
"grad_norm": 6.883522033691406,
"learning_rate": 1.995759065765212e-05,
"loss": 2.3893,
"step": 138
},
{
"epoch": 0.0107559926244622,
"grad_norm": 5.52859354019165,
"learning_rate": 1.9956976029502154e-05,
"loss": 2.1851,
"step": 140
},
{
"epoch": 0.010909649661954518,
"grad_norm": 6.14478874206543,
"learning_rate": 1.9956361401352184e-05,
"loss": 2.2019,
"step": 142
},
{
"epoch": 0.011063306699446834,
"grad_norm": 6.477922439575195,
"learning_rate": 1.9955746773202213e-05,
"loss": 2.2746,
"step": 144
},
{
"epoch": 0.011216963736939152,
"grad_norm": 7.661022186279297,
"learning_rate": 1.9955132145052246e-05,
"loss": 2.3499,
"step": 146
},
{
"epoch": 0.011370620774431468,
"grad_norm": 7.439324378967285,
"learning_rate": 1.9954517516902276e-05,
"loss": 2.1848,
"step": 148
},
{
"epoch": 0.011524277811923786,
"grad_norm": 7.070183753967285,
"learning_rate": 1.9953902888752306e-05,
"loss": 2.2816,
"step": 150
},
{
"epoch": 0.011677934849416103,
"grad_norm": 5.912161350250244,
"learning_rate": 1.9953288260602336e-05,
"loss": 2.3688,
"step": 152
},
{
"epoch": 0.01183159188690842,
"grad_norm": 6.827462673187256,
"learning_rate": 1.995267363245237e-05,
"loss": 2.3945,
"step": 154
},
{
"epoch": 0.011985248924400737,
"grad_norm": 5.7712082862854,
"learning_rate": 1.9952059004302398e-05,
"loss": 2.1618,
"step": 156
},
{
"epoch": 0.012138905961893055,
"grad_norm": 5.9169020652771,
"learning_rate": 1.9951444376152428e-05,
"loss": 2.1781,
"step": 158
},
{
"epoch": 0.012292562999385371,
"grad_norm": 5.994232177734375,
"learning_rate": 1.995082974800246e-05,
"loss": 2.1474,
"step": 160
},
{
"epoch": 0.01244622003687769,
"grad_norm": 6.10550594329834,
"learning_rate": 1.995021511985249e-05,
"loss": 2.2227,
"step": 162
},
{
"epoch": 0.012599877074370006,
"grad_norm": 7.107779502868652,
"learning_rate": 1.994960049170252e-05,
"loss": 2.334,
"step": 164
},
{
"epoch": 0.012753534111862324,
"grad_norm": 4.990610122680664,
"learning_rate": 1.9948985863552553e-05,
"loss": 2.2313,
"step": 166
},
{
"epoch": 0.01290719114935464,
"grad_norm": 8.93641185760498,
"learning_rate": 1.9948371235402583e-05,
"loss": 2.1062,
"step": 168
},
{
"epoch": 0.013060848186846958,
"grad_norm": 5.389564037322998,
"learning_rate": 1.9947756607252613e-05,
"loss": 2.1729,
"step": 170
},
{
"epoch": 0.013214505224339274,
"grad_norm": 5.347591400146484,
"learning_rate": 1.9947141979102646e-05,
"loss": 2.0474,
"step": 172
},
{
"epoch": 0.013368162261831592,
"grad_norm": 6.475700378417969,
"learning_rate": 1.9946527350952676e-05,
"loss": 2.1939,
"step": 174
},
{
"epoch": 0.013521819299323909,
"grad_norm": 6.144668102264404,
"learning_rate": 1.9945912722802705e-05,
"loss": 2.217,
"step": 176
},
{
"epoch": 0.013675476336816227,
"grad_norm": 6.778875350952148,
"learning_rate": 1.9945298094652735e-05,
"loss": 2.139,
"step": 178
},
{
"epoch": 0.013829133374308543,
"grad_norm": 7.560453414916992,
"learning_rate": 1.9944683466502768e-05,
"loss": 2.1931,
"step": 180
},
{
"epoch": 0.013982790411800861,
"grad_norm": 5.251035690307617,
"learning_rate": 1.9944068838352798e-05,
"loss": 2.1596,
"step": 182
},
{
"epoch": 0.014136447449293177,
"grad_norm": 5.9772162437438965,
"learning_rate": 1.9943454210202827e-05,
"loss": 2.232,
"step": 184
},
{
"epoch": 0.014290104486785495,
"grad_norm": 7.088453769683838,
"learning_rate": 1.994283958205286e-05,
"loss": 2.3468,
"step": 186
},
{
"epoch": 0.014443761524277812,
"grad_norm": 6.209799289703369,
"learning_rate": 1.994222495390289e-05,
"loss": 2.3158,
"step": 188
},
{
"epoch": 0.01459741856177013,
"grad_norm": 6.048709392547607,
"learning_rate": 1.994161032575292e-05,
"loss": 1.9986,
"step": 190
},
{
"epoch": 0.014751075599262446,
"grad_norm": 5.292468070983887,
"learning_rate": 1.9940995697602953e-05,
"loss": 2.1564,
"step": 192
},
{
"epoch": 0.014904732636754764,
"grad_norm": 6.045801639556885,
"learning_rate": 1.9940381069452983e-05,
"loss": 2.2064,
"step": 194
},
{
"epoch": 0.01505838967424708,
"grad_norm": 6.204288482666016,
"learning_rate": 1.9939766441303012e-05,
"loss": 2.2869,
"step": 196
},
{
"epoch": 0.015212046711739398,
"grad_norm": 6.579591274261475,
"learning_rate": 1.9939151813153045e-05,
"loss": 2.1494,
"step": 198
},
{
"epoch": 0.015365703749231715,
"grad_norm": 6.20919942855835,
"learning_rate": 1.9938537185003075e-05,
"loss": 2.0245,
"step": 200
},
{
"epoch": 0.015519360786724033,
"grad_norm": 6.129773139953613,
"learning_rate": 1.9937922556853108e-05,
"loss": 2.0684,
"step": 202
},
{
"epoch": 0.01567301782421635,
"grad_norm": 7.500084400177002,
"learning_rate": 1.9937307928703134e-05,
"loss": 2.1818,
"step": 204
},
{
"epoch": 0.015826674861708665,
"grad_norm": 6.189898490905762,
"learning_rate": 1.9936693300553167e-05,
"loss": 2.1377,
"step": 206
},
{
"epoch": 0.015980331899200985,
"grad_norm": 5.788628101348877,
"learning_rate": 1.9936078672403197e-05,
"loss": 2.1195,
"step": 208
},
{
"epoch": 0.0161339889366933,
"grad_norm": 6.9061055183410645,
"learning_rate": 1.9935464044253227e-05,
"loss": 2.1815,
"step": 210
},
{
"epoch": 0.016287645974185617,
"grad_norm": 7.366201877593994,
"learning_rate": 1.993484941610326e-05,
"loss": 2.2884,
"step": 212
},
{
"epoch": 0.016441303011677934,
"grad_norm": 5.979190826416016,
"learning_rate": 1.993423478795329e-05,
"loss": 2.3251,
"step": 214
},
{
"epoch": 0.016594960049170254,
"grad_norm": 6.170030117034912,
"learning_rate": 1.993362015980332e-05,
"loss": 2.2108,
"step": 216
},
{
"epoch": 0.01674861708666257,
"grad_norm": 6.819857120513916,
"learning_rate": 1.9933005531653352e-05,
"loss": 2.2231,
"step": 218
},
{
"epoch": 0.016902274124154886,
"grad_norm": 7.386382579803467,
"learning_rate": 1.9932390903503382e-05,
"loss": 2.1647,
"step": 220
},
{
"epoch": 0.017055931161647202,
"grad_norm": 5.797331809997559,
"learning_rate": 1.9931776275353415e-05,
"loss": 2.2092,
"step": 222
},
{
"epoch": 0.017209588199139522,
"grad_norm": 5.605097770690918,
"learning_rate": 1.993116164720344e-05,
"loss": 2.2266,
"step": 224
},
{
"epoch": 0.01736324523663184,
"grad_norm": 5.865804672241211,
"learning_rate": 1.9930547019053474e-05,
"loss": 2.0874,
"step": 226
},
{
"epoch": 0.017516902274124155,
"grad_norm": 7.769106864929199,
"learning_rate": 1.9929932390903508e-05,
"loss": 2.1032,
"step": 228
},
{
"epoch": 0.01767055931161647,
"grad_norm": 6.673518180847168,
"learning_rate": 1.9929317762753534e-05,
"loss": 2.0957,
"step": 230
},
{
"epoch": 0.01782421634910879,
"grad_norm": 6.331215858459473,
"learning_rate": 1.9928703134603567e-05,
"loss": 2.143,
"step": 232
},
{
"epoch": 0.017977873386601107,
"grad_norm": 5.792760848999023,
"learning_rate": 1.9928088506453597e-05,
"loss": 2.0157,
"step": 234
},
{
"epoch": 0.018131530424093423,
"grad_norm": 6.460434436798096,
"learning_rate": 1.9927473878303626e-05,
"loss": 2.0018,
"step": 236
},
{
"epoch": 0.01828518746158574,
"grad_norm": 6.339091777801514,
"learning_rate": 1.992685925015366e-05,
"loss": 2.2635,
"step": 238
},
{
"epoch": 0.01843884449907806,
"grad_norm": 5.446582317352295,
"learning_rate": 1.992624462200369e-05,
"loss": 2.036,
"step": 240
},
{
"epoch": 0.018592501536570376,
"grad_norm": 6.4099273681640625,
"learning_rate": 1.9925629993853722e-05,
"loss": 2.0031,
"step": 242
},
{
"epoch": 0.018746158574062692,
"grad_norm": 7.307748794555664,
"learning_rate": 1.9925015365703752e-05,
"loss": 2.0746,
"step": 244
},
{
"epoch": 0.01889981561155501,
"grad_norm": 5.755754470825195,
"learning_rate": 1.992440073755378e-05,
"loss": 2.1756,
"step": 246
},
{
"epoch": 0.019053472649047325,
"grad_norm": 5.9470744132995605,
"learning_rate": 1.9923786109403815e-05,
"loss": 2.1579,
"step": 248
},
{
"epoch": 0.019207129686539644,
"grad_norm": 5.4200873374938965,
"learning_rate": 1.992317148125384e-05,
"loss": 2.1868,
"step": 250
},
{
"epoch": 0.01936078672403196,
"grad_norm": 6.8247175216674805,
"learning_rate": 1.9922556853103874e-05,
"loss": 2.1525,
"step": 252
},
{
"epoch": 0.019514443761524277,
"grad_norm": 6.334802627563477,
"learning_rate": 1.9921942224953904e-05,
"loss": 2.1261,
"step": 254
},
{
"epoch": 0.019668100799016593,
"grad_norm": 7.025927543640137,
"learning_rate": 1.9921327596803933e-05,
"loss": 2.2474,
"step": 256
},
{
"epoch": 0.019821757836508913,
"grad_norm": 6.594686508178711,
"learning_rate": 1.9920712968653966e-05,
"loss": 1.9885,
"step": 258
},
{
"epoch": 0.01997541487400123,
"grad_norm": 6.713582992553711,
"learning_rate": 1.9920098340503996e-05,
"loss": 2.3728,
"step": 260
},
{
"epoch": 0.020129071911493546,
"grad_norm": 5.78023099899292,
"learning_rate": 1.9919483712354026e-05,
"loss": 2.0887,
"step": 262
},
{
"epoch": 0.020282728948985862,
"grad_norm": 5.462549686431885,
"learning_rate": 1.991886908420406e-05,
"loss": 2.0673,
"step": 264
},
{
"epoch": 0.020436385986478182,
"grad_norm": 6.792922019958496,
"learning_rate": 1.991825445605409e-05,
"loss": 2.177,
"step": 266
},
{
"epoch": 0.020590043023970498,
"grad_norm": 6.281880855560303,
"learning_rate": 1.991763982790412e-05,
"loss": 1.9686,
"step": 268
},
{
"epoch": 0.020743700061462814,
"grad_norm": 5.745354175567627,
"learning_rate": 1.991702519975415e-05,
"loss": 2.1414,
"step": 270
},
{
"epoch": 0.02089735709895513,
"grad_norm": 6.046512126922607,
"learning_rate": 1.991641057160418e-05,
"loss": 2.1541,
"step": 272
},
{
"epoch": 0.02105101413644745,
"grad_norm": 7.513150691986084,
"learning_rate": 1.9915795943454214e-05,
"loss": 2.1383,
"step": 274
},
{
"epoch": 0.021204671173939767,
"grad_norm": 8.351797103881836,
"learning_rate": 1.991518131530424e-05,
"loss": 2.209,
"step": 276
},
{
"epoch": 0.021358328211432083,
"grad_norm": 6.781789302825928,
"learning_rate": 1.9914566687154273e-05,
"loss": 1.9494,
"step": 278
},
{
"epoch": 0.0215119852489244,
"grad_norm": 5.912288188934326,
"learning_rate": 1.9913952059004303e-05,
"loss": 1.9871,
"step": 280
},
{
"epoch": 0.02166564228641672,
"grad_norm": 5.441234111785889,
"learning_rate": 1.9913337430854333e-05,
"loss": 2.118,
"step": 282
},
{
"epoch": 0.021819299323909035,
"grad_norm": 6.041057109832764,
"learning_rate": 1.9912722802704366e-05,
"loss": 2.0064,
"step": 284
},
{
"epoch": 0.02197295636140135,
"grad_norm": 6.26601505279541,
"learning_rate": 1.9912108174554395e-05,
"loss": 1.9593,
"step": 286
},
{
"epoch": 0.022126613398893668,
"grad_norm": 6.992424488067627,
"learning_rate": 1.991149354640443e-05,
"loss": 2.1785,
"step": 288
},
{
"epoch": 0.022280270436385988,
"grad_norm": 7.048946857452393,
"learning_rate": 1.9910878918254458e-05,
"loss": 2.0809,
"step": 290
},
{
"epoch": 0.022433927473878304,
"grad_norm": 7.00367546081543,
"learning_rate": 1.9910264290104488e-05,
"loss": 2.0688,
"step": 292
},
{
"epoch": 0.02258758451137062,
"grad_norm": 6.326030731201172,
"learning_rate": 1.990964966195452e-05,
"loss": 2.1279,
"step": 294
},
{
"epoch": 0.022741241548862937,
"grad_norm": 5.886343002319336,
"learning_rate": 1.990903503380455e-05,
"loss": 1.9146,
"step": 296
},
{
"epoch": 0.022894898586355256,
"grad_norm": 6.407416820526123,
"learning_rate": 1.990842040565458e-05,
"loss": 2.073,
"step": 298
},
{
"epoch": 0.023048555623847573,
"grad_norm": 5.35817289352417,
"learning_rate": 1.9907805777504613e-05,
"loss": 2.064,
"step": 300
},
{
"epoch": 0.02320221266133989,
"grad_norm": 5.71148157119751,
"learning_rate": 1.990719114935464e-05,
"loss": 2.2207,
"step": 302
},
{
"epoch": 0.023355869698832205,
"grad_norm": 7.2422051429748535,
"learning_rate": 1.9906576521204673e-05,
"loss": 2.1518,
"step": 304
},
{
"epoch": 0.023509526736324525,
"grad_norm": 7.267468452453613,
"learning_rate": 1.9905961893054702e-05,
"loss": 2.0082,
"step": 306
},
{
"epoch": 0.02366318377381684,
"grad_norm": 6.504114627838135,
"learning_rate": 1.9905347264904736e-05,
"loss": 1.9722,
"step": 308
},
{
"epoch": 0.023816840811309158,
"grad_norm": 7.074812889099121,
"learning_rate": 1.9904732636754765e-05,
"loss": 2.1789,
"step": 310
},
{
"epoch": 0.023970497848801474,
"grad_norm": 6.774876117706299,
"learning_rate": 1.9904118008604795e-05,
"loss": 2.219,
"step": 312
},
{
"epoch": 0.024124154886293794,
"grad_norm": 5.666469097137451,
"learning_rate": 1.9903503380454828e-05,
"loss": 1.8294,
"step": 314
},
{
"epoch": 0.02427781192378611,
"grad_norm": 6.548127174377441,
"learning_rate": 1.9902888752304858e-05,
"loss": 2.0859,
"step": 316
},
{
"epoch": 0.024431468961278426,
"grad_norm": 5.174642562866211,
"learning_rate": 1.9902274124154887e-05,
"loss": 1.989,
"step": 318
},
{
"epoch": 0.024585125998770743,
"grad_norm": 5.891490936279297,
"learning_rate": 1.990165949600492e-05,
"loss": 2.0776,
"step": 320
},
{
"epoch": 0.024738783036263062,
"grad_norm": 5.7647504806518555,
"learning_rate": 1.9901044867854947e-05,
"loss": 1.9681,
"step": 322
},
{
"epoch": 0.02489244007375538,
"grad_norm": 5.61868143081665,
"learning_rate": 1.990043023970498e-05,
"loss": 1.8923,
"step": 324
},
{
"epoch": 0.025046097111247695,
"grad_norm": 7.358055114746094,
"learning_rate": 1.9899815611555013e-05,
"loss": 1.9859,
"step": 326
},
{
"epoch": 0.02519975414874001,
"grad_norm": 5.265814781188965,
"learning_rate": 1.9899200983405043e-05,
"loss": 1.939,
"step": 328
},
{
"epoch": 0.02535341118623233,
"grad_norm": 9.370257377624512,
"learning_rate": 1.9898586355255072e-05,
"loss": 1.9538,
"step": 330
},
{
"epoch": 0.025507068223724647,
"grad_norm": 7.504848003387451,
"learning_rate": 1.9897971727105102e-05,
"loss": 2.0802,
"step": 332
},
{
"epoch": 0.025660725261216964,
"grad_norm": 5.975841045379639,
"learning_rate": 1.9897357098955135e-05,
"loss": 1.853,
"step": 334
},
{
"epoch": 0.02581438229870928,
"grad_norm": 6.099985122680664,
"learning_rate": 1.9896742470805165e-05,
"loss": 2.0014,
"step": 336
},
{
"epoch": 0.0259680393362016,
"grad_norm": 6.825030326843262,
"learning_rate": 1.9896127842655194e-05,
"loss": 1.9608,
"step": 338
},
{
"epoch": 0.026121696373693916,
"grad_norm": 6.16441535949707,
"learning_rate": 1.9895513214505227e-05,
"loss": 2.0848,
"step": 340
},
{
"epoch": 0.026275353411186232,
"grad_norm": 6.392692565917969,
"learning_rate": 1.9894898586355257e-05,
"loss": 1.9651,
"step": 342
},
{
"epoch": 0.02642901044867855,
"grad_norm": 5.567882537841797,
"learning_rate": 1.9894283958205287e-05,
"loss": 2.1211,
"step": 344
},
{
"epoch": 0.026582667486170868,
"grad_norm": 10.182480812072754,
"learning_rate": 1.989366933005532e-05,
"loss": 1.9924,
"step": 346
},
{
"epoch": 0.026736324523663185,
"grad_norm": 5.608663558959961,
"learning_rate": 1.989305470190535e-05,
"loss": 1.936,
"step": 348
},
{
"epoch": 0.0268899815611555,
"grad_norm": 5.883683204650879,
"learning_rate": 1.989244007375538e-05,
"loss": 2.0998,
"step": 350
},
{
"epoch": 0.027043638598647817,
"grad_norm": 8.584614753723145,
"learning_rate": 1.989182544560541e-05,
"loss": 2.1266,
"step": 352
},
{
"epoch": 0.027197295636140137,
"grad_norm": 6.828667640686035,
"learning_rate": 1.9891210817455442e-05,
"loss": 1.8693,
"step": 354
},
{
"epoch": 0.027350952673632453,
"grad_norm": 7.0278449058532715,
"learning_rate": 1.989059618930547e-05,
"loss": 1.9258,
"step": 356
},
{
"epoch": 0.02750460971112477,
"grad_norm": 5.643075466156006,
"learning_rate": 1.98899815611555e-05,
"loss": 2.071,
"step": 358
},
{
"epoch": 0.027658266748617086,
"grad_norm": 6.685908794403076,
"learning_rate": 1.9889366933005534e-05,
"loss": 1.8658,
"step": 360
},
{
"epoch": 0.027811923786109402,
"grad_norm": 5.766722679138184,
"learning_rate": 1.9888752304855564e-05,
"loss": 2.0608,
"step": 362
},
{
"epoch": 0.027965580823601722,
"grad_norm": 6.229999542236328,
"learning_rate": 1.9888137676705594e-05,
"loss": 1.8478,
"step": 364
},
{
"epoch": 0.028119237861094038,
"grad_norm": 14.6449613571167,
"learning_rate": 1.9887523048555627e-05,
"loss": 2.0233,
"step": 366
},
{
"epoch": 0.028272894898586354,
"grad_norm": 5.458970069885254,
"learning_rate": 1.9886908420405657e-05,
"loss": 1.8742,
"step": 368
},
{
"epoch": 0.02842655193607867,
"grad_norm": 9.708429336547852,
"learning_rate": 1.9886293792255686e-05,
"loss": 2.0435,
"step": 370
},
{
"epoch": 0.02858020897357099,
"grad_norm": 8.345685958862305,
"learning_rate": 1.988567916410572e-05,
"loss": 1.9448,
"step": 372
},
{
"epoch": 0.028733866011063307,
"grad_norm": 5.213901519775391,
"learning_rate": 1.988506453595575e-05,
"loss": 1.8902,
"step": 374
},
{
"epoch": 0.028887523048555623,
"grad_norm": 6.842494964599609,
"learning_rate": 1.988444990780578e-05,
"loss": 2.0958,
"step": 376
},
{
"epoch": 0.02904118008604794,
"grad_norm": 6.533809185028076,
"learning_rate": 1.988383527965581e-05,
"loss": 2.0073,
"step": 378
},
{
"epoch": 0.02919483712354026,
"grad_norm": 5.832721710205078,
"learning_rate": 1.988322065150584e-05,
"loss": 1.9462,
"step": 380
},
{
"epoch": 0.029348494161032575,
"grad_norm": 6.040827751159668,
"learning_rate": 1.988260602335587e-05,
"loss": 2.0111,
"step": 382
},
{
"epoch": 0.02950215119852489,
"grad_norm": 6.082043647766113,
"learning_rate": 1.98819913952059e-05,
"loss": 2.0088,
"step": 384
},
{
"epoch": 0.029655808236017208,
"grad_norm": 4.5363383293151855,
"learning_rate": 1.9881376767055934e-05,
"loss": 1.9059,
"step": 386
},
{
"epoch": 0.029809465273509528,
"grad_norm": 4.769321918487549,
"learning_rate": 1.9880762138905964e-05,
"loss": 1.8781,
"step": 388
},
{
"epoch": 0.029963122311001844,
"grad_norm": 6.1424994468688965,
"learning_rate": 1.9880147510755993e-05,
"loss": 2.0232,
"step": 390
},
{
"epoch": 0.03011677934849416,
"grad_norm": 6.081544399261475,
"learning_rate": 1.9879532882606026e-05,
"loss": 1.8908,
"step": 392
},
{
"epoch": 0.030270436385986477,
"grad_norm": 6.146285057067871,
"learning_rate": 1.9878918254456056e-05,
"loss": 2.0144,
"step": 394
},
{
"epoch": 0.030424093423478796,
"grad_norm": 5.401834011077881,
"learning_rate": 1.9878303626306086e-05,
"loss": 1.8258,
"step": 396
},
{
"epoch": 0.030577750460971113,
"grad_norm": 6.835007667541504,
"learning_rate": 1.987768899815612e-05,
"loss": 2.0515,
"step": 398
},
{
"epoch": 0.03073140749846343,
"grad_norm": 7.031691551208496,
"learning_rate": 1.987707437000615e-05,
"loss": 2.0362,
"step": 400
},
{
"epoch": 0.030885064535955745,
"grad_norm": 5.733877182006836,
"learning_rate": 1.9876459741856178e-05,
"loss": 2.099,
"step": 402
},
{
"epoch": 0.031038721573448065,
"grad_norm": 6.152698516845703,
"learning_rate": 1.9875845113706208e-05,
"loss": 1.9393,
"step": 404
},
{
"epoch": 0.03119237861094038,
"grad_norm": 5.859741687774658,
"learning_rate": 1.987523048555624e-05,
"loss": 1.995,
"step": 406
},
{
"epoch": 0.0313460356484327,
"grad_norm": 6.834084510803223,
"learning_rate": 1.987461585740627e-05,
"loss": 2.0035,
"step": 408
},
{
"epoch": 0.031499692685925014,
"grad_norm": 6.169229030609131,
"learning_rate": 1.98740012292563e-05,
"loss": 2.1276,
"step": 410
},
{
"epoch": 0.03165334972341733,
"grad_norm": 5.270079135894775,
"learning_rate": 1.9873386601106333e-05,
"loss": 2.0073,
"step": 412
},
{
"epoch": 0.03180700676090965,
"grad_norm": 5.952144145965576,
"learning_rate": 1.9872771972956363e-05,
"loss": 1.8817,
"step": 414
},
{
"epoch": 0.03196066379840197,
"grad_norm": 6.3290019035339355,
"learning_rate": 1.9872157344806393e-05,
"loss": 1.9526,
"step": 416
},
{
"epoch": 0.032114320835894286,
"grad_norm": 5.712306499481201,
"learning_rate": 1.9871542716656426e-05,
"loss": 2.0793,
"step": 418
},
{
"epoch": 0.0322679778733866,
"grad_norm": 5.497166156768799,
"learning_rate": 1.9870928088506455e-05,
"loss": 1.9061,
"step": 420
},
{
"epoch": 0.03242163491087892,
"grad_norm": 6.435750484466553,
"learning_rate": 1.9870313460356485e-05,
"loss": 1.8971,
"step": 422
},
{
"epoch": 0.032575291948371235,
"grad_norm": 5.9519734382629395,
"learning_rate": 1.9869698832206518e-05,
"loss": 2.0295,
"step": 424
},
{
"epoch": 0.03272894898586355,
"grad_norm": 6.359841823577881,
"learning_rate": 1.9869084204056548e-05,
"loss": 1.9017,
"step": 426
},
{
"epoch": 0.03288260602335587,
"grad_norm": 6.195022106170654,
"learning_rate": 1.9868469575906578e-05,
"loss": 2.0663,
"step": 428
},
{
"epoch": 0.033036263060848184,
"grad_norm": 5.500522613525391,
"learning_rate": 1.9867854947756607e-05,
"loss": 1.9694,
"step": 430
},
{
"epoch": 0.03318992009834051,
"grad_norm": 7.16880464553833,
"learning_rate": 1.986724031960664e-05,
"loss": 1.918,
"step": 432
},
{
"epoch": 0.03334357713583282,
"grad_norm": 6.0987348556518555,
"learning_rate": 1.986662569145667e-05,
"loss": 1.8705,
"step": 434
},
{
"epoch": 0.03349723417332514,
"grad_norm": 6.8652753829956055,
"learning_rate": 1.98660110633067e-05,
"loss": 1.9383,
"step": 436
},
{
"epoch": 0.033650891210817456,
"grad_norm": 5.421166896820068,
"learning_rate": 1.9865396435156733e-05,
"loss": 1.879,
"step": 438
},
{
"epoch": 0.03380454824830977,
"grad_norm": 5.929842948913574,
"learning_rate": 1.9864781807006762e-05,
"loss": 1.7183,
"step": 440
},
{
"epoch": 0.03395820528580209,
"grad_norm": 5.500015735626221,
"learning_rate": 1.9864167178856792e-05,
"loss": 1.9168,
"step": 442
},
{
"epoch": 0.034111862323294405,
"grad_norm": 6.267481327056885,
"learning_rate": 1.9863552550706825e-05,
"loss": 1.8126,
"step": 444
},
{
"epoch": 0.03426551936078672,
"grad_norm": 6.300197124481201,
"learning_rate": 1.9862937922556855e-05,
"loss": 2.0519,
"step": 446
},
{
"epoch": 0.034419176398279044,
"grad_norm": 8.094818115234375,
"learning_rate": 1.9862323294406885e-05,
"loss": 1.8122,
"step": 448
},
{
"epoch": 0.03457283343577136,
"grad_norm": 5.738587379455566,
"learning_rate": 1.9861708666256914e-05,
"loss": 1.8155,
"step": 450
},
{
"epoch": 0.03472649047326368,
"grad_norm": 5.194686412811279,
"learning_rate": 1.9861094038106947e-05,
"loss": 1.9198,
"step": 452
},
{
"epoch": 0.03488014751075599,
"grad_norm": 4.97174072265625,
"learning_rate": 1.9860479409956977e-05,
"loss": 1.9955,
"step": 454
},
{
"epoch": 0.03503380454824831,
"grad_norm": 5.790378570556641,
"learning_rate": 1.9859864781807007e-05,
"loss": 1.8218,
"step": 456
},
{
"epoch": 0.035187461585740626,
"grad_norm": 5.287135124206543,
"learning_rate": 1.985925015365704e-05,
"loss": 1.9169,
"step": 458
},
{
"epoch": 0.03534111862323294,
"grad_norm": 8.098136901855469,
"learning_rate": 1.985863552550707e-05,
"loss": 1.9039,
"step": 460
},
{
"epoch": 0.03549477566072526,
"grad_norm": 6.957726955413818,
"learning_rate": 1.98580208973571e-05,
"loss": 2.0036,
"step": 462
},
{
"epoch": 0.03564843269821758,
"grad_norm": 4.368841171264648,
"learning_rate": 1.9857406269207132e-05,
"loss": 1.8883,
"step": 464
},
{
"epoch": 0.0358020897357099,
"grad_norm": 5.95673131942749,
"learning_rate": 1.9856791641057162e-05,
"loss": 1.8977,
"step": 466
},
{
"epoch": 0.035955746773202214,
"grad_norm": 7.365513324737549,
"learning_rate": 1.985617701290719e-05,
"loss": 1.9865,
"step": 468
},
{
"epoch": 0.03610940381069453,
"grad_norm": 5.386063098907471,
"learning_rate": 1.9855562384757225e-05,
"loss": 1.8164,
"step": 470
},
{
"epoch": 0.03626306084818685,
"grad_norm": 6.155988693237305,
"learning_rate": 1.9854947756607254e-05,
"loss": 2.0083,
"step": 472
},
{
"epoch": 0.03641671788567916,
"grad_norm": 6.110922336578369,
"learning_rate": 1.9854333128457287e-05,
"loss": 1.8688,
"step": 474
},
{
"epoch": 0.03657037492317148,
"grad_norm": 5.692699909210205,
"learning_rate": 1.9853718500307314e-05,
"loss": 1.8501,
"step": 476
},
{
"epoch": 0.036724031960663796,
"grad_norm": 6.044013977050781,
"learning_rate": 1.9853103872157347e-05,
"loss": 1.8486,
"step": 478
},
{
"epoch": 0.03687768899815612,
"grad_norm": 6.102372169494629,
"learning_rate": 1.9852489244007376e-05,
"loss": 2.0873,
"step": 480
},
{
"epoch": 0.037031346035648435,
"grad_norm": 5.4327239990234375,
"learning_rate": 1.9851874615857406e-05,
"loss": 1.8635,
"step": 482
},
{
"epoch": 0.03718500307314075,
"grad_norm": 5.779347896575928,
"learning_rate": 1.985125998770744e-05,
"loss": 2.0413,
"step": 484
},
{
"epoch": 0.03733866011063307,
"grad_norm": 5.000186920166016,
"learning_rate": 1.985064535955747e-05,
"loss": 2.0214,
"step": 486
},
{
"epoch": 0.037492317148125384,
"grad_norm": 6.581515312194824,
"learning_rate": 1.98500307314075e-05,
"loss": 1.9141,
"step": 488
},
{
"epoch": 0.0376459741856177,
"grad_norm": 6.037952423095703,
"learning_rate": 1.984941610325753e-05,
"loss": 1.9475,
"step": 490
},
{
"epoch": 0.03779963122311002,
"grad_norm": 4.99038553237915,
"learning_rate": 1.984880147510756e-05,
"loss": 1.8296,
"step": 492
},
{
"epoch": 0.03795328826060233,
"grad_norm": 5.351291656494141,
"learning_rate": 1.9848186846957594e-05,
"loss": 1.9845,
"step": 494
},
{
"epoch": 0.03810694529809465,
"grad_norm": 6.249404430389404,
"learning_rate": 1.9847572218807624e-05,
"loss": 1.8824,
"step": 496
},
{
"epoch": 0.03826060233558697,
"grad_norm": 5.460664749145508,
"learning_rate": 1.9846957590657654e-05,
"loss": 1.9348,
"step": 498
},
{
"epoch": 0.03841425937307929,
"grad_norm": 5.399702072143555,
"learning_rate": 1.9846342962507687e-05,
"loss": 1.8646,
"step": 500
},
{
"epoch": 0.038567916410571605,
"grad_norm": 6.00943660736084,
"learning_rate": 1.9845728334357713e-05,
"loss": 1.8804,
"step": 502
},
{
"epoch": 0.03872157344806392,
"grad_norm": 6.057244300842285,
"learning_rate": 1.9845113706207746e-05,
"loss": 1.8876,
"step": 504
},
{
"epoch": 0.03887523048555624,
"grad_norm": 5.178292274475098,
"learning_rate": 1.9844499078057776e-05,
"loss": 1.8163,
"step": 506
},
{
"epoch": 0.039028887523048554,
"grad_norm": 5.430099964141846,
"learning_rate": 1.9843884449907806e-05,
"loss": 1.9221,
"step": 508
},
{
"epoch": 0.03918254456054087,
"grad_norm": 5.2391791343688965,
"learning_rate": 1.984326982175784e-05,
"loss": 1.9671,
"step": 510
},
{
"epoch": 0.03933620159803319,
"grad_norm": 6.54328727722168,
"learning_rate": 1.9842655193607868e-05,
"loss": 1.9916,
"step": 512
},
{
"epoch": 0.03948985863552551,
"grad_norm": 5.6781134605407715,
"learning_rate": 1.98420405654579e-05,
"loss": 1.9,
"step": 514
},
{
"epoch": 0.039643515673017826,
"grad_norm": 5.34329891204834,
"learning_rate": 1.984142593730793e-05,
"loss": 1.7433,
"step": 516
},
{
"epoch": 0.03979717271051014,
"grad_norm": 6.142169952392578,
"learning_rate": 1.984081130915796e-05,
"loss": 1.8559,
"step": 518
},
{
"epoch": 0.03995082974800246,
"grad_norm": 5.825856685638428,
"learning_rate": 1.9840196681007994e-05,
"loss": 1.7434,
"step": 520
},
{
"epoch": 0.040104486785494775,
"grad_norm": 4.883429050445557,
"learning_rate": 1.9839582052858023e-05,
"loss": 1.8403,
"step": 522
},
{
"epoch": 0.04025814382298709,
"grad_norm": 5.759003162384033,
"learning_rate": 1.9838967424708053e-05,
"loss": 1.872,
"step": 524
},
{
"epoch": 0.04041180086047941,
"grad_norm": 5.845025539398193,
"learning_rate": 1.9838352796558086e-05,
"loss": 1.818,
"step": 526
},
{
"epoch": 0.040565457897971724,
"grad_norm": 6.238631248474121,
"learning_rate": 1.9837738168408113e-05,
"loss": 1.9553,
"step": 528
},
{
"epoch": 0.04071911493546405,
"grad_norm": 5.450825214385986,
"learning_rate": 1.9837123540258146e-05,
"loss": 1.9314,
"step": 530
},
{
"epoch": 0.040872771972956363,
"grad_norm": 5.4290385246276855,
"learning_rate": 1.9836508912108175e-05,
"loss": 1.8316,
"step": 532
},
{
"epoch": 0.04102642901044868,
"grad_norm": 6.243955612182617,
"learning_rate": 1.9835894283958205e-05,
"loss": 1.9605,
"step": 534
},
{
"epoch": 0.041180086047940996,
"grad_norm": 5.5207672119140625,
"learning_rate": 1.9835279655808238e-05,
"loss": 1.9377,
"step": 536
},
{
"epoch": 0.04133374308543331,
"grad_norm": 5.570779323577881,
"learning_rate": 1.9834665027658268e-05,
"loss": 1.9706,
"step": 538
},
{
"epoch": 0.04148740012292563,
"grad_norm": 4.921234130859375,
"learning_rate": 1.98340503995083e-05,
"loss": 1.8666,
"step": 540
},
{
"epoch": 0.041641057160417945,
"grad_norm": 6.029317855834961,
"learning_rate": 1.983343577135833e-05,
"loss": 1.8431,
"step": 542
},
{
"epoch": 0.04179471419791026,
"grad_norm": 5.6237664222717285,
"learning_rate": 1.983282114320836e-05,
"loss": 2.0265,
"step": 544
},
{
"epoch": 0.041948371235402585,
"grad_norm": 4.848809719085693,
"learning_rate": 1.9832206515058393e-05,
"loss": 1.851,
"step": 546
},
{
"epoch": 0.0421020282728949,
"grad_norm": 6.06104040145874,
"learning_rate": 1.983159188690842e-05,
"loss": 1.9252,
"step": 548
},
{
"epoch": 0.04225568531038722,
"grad_norm": 6.721662521362305,
"learning_rate": 1.9830977258758453e-05,
"loss": 1.9046,
"step": 550
},
{
"epoch": 0.04240934234787953,
"grad_norm": 5.039158821105957,
"learning_rate": 1.9830362630608482e-05,
"loss": 1.9457,
"step": 552
},
{
"epoch": 0.04256299938537185,
"grad_norm": 4.985758304595947,
"learning_rate": 1.9829748002458512e-05,
"loss": 1.7706,
"step": 554
},
{
"epoch": 0.042716656422864166,
"grad_norm": 5.59445858001709,
"learning_rate": 1.9829133374308545e-05,
"loss": 1.9232,
"step": 556
},
{
"epoch": 0.04287031346035648,
"grad_norm": 5.786518573760986,
"learning_rate": 1.9828518746158575e-05,
"loss": 1.9535,
"step": 558
},
{
"epoch": 0.0430239704978488,
"grad_norm": 5.362064838409424,
"learning_rate": 1.9827904118008608e-05,
"loss": 1.774,
"step": 560
},
{
"epoch": 0.04317762753534112,
"grad_norm": 6.807535171508789,
"learning_rate": 1.9827289489858637e-05,
"loss": 1.9963,
"step": 562
},
{
"epoch": 0.04333128457283344,
"grad_norm": 4.927182197570801,
"learning_rate": 1.9826674861708667e-05,
"loss": 1.8839,
"step": 564
},
{
"epoch": 0.043484941610325754,
"grad_norm": 7.077647686004639,
"learning_rate": 1.98260602335587e-05,
"loss": 1.8577,
"step": 566
},
{
"epoch": 0.04363859864781807,
"grad_norm": 4.930956840515137,
"learning_rate": 1.982544560540873e-05,
"loss": 1.9032,
"step": 568
},
{
"epoch": 0.04379225568531039,
"grad_norm": 5.537839889526367,
"learning_rate": 1.982483097725876e-05,
"loss": 1.8599,
"step": 570
},
{
"epoch": 0.0439459127228027,
"grad_norm": 4.91294527053833,
"learning_rate": 1.9824216349108793e-05,
"loss": 1.8962,
"step": 572
},
{
"epoch": 0.04409956976029502,
"grad_norm": 7.946929931640625,
"learning_rate": 1.982360172095882e-05,
"loss": 2.0401,
"step": 574
},
{
"epoch": 0.044253226797787336,
"grad_norm": 5.566417217254639,
"learning_rate": 1.9822987092808852e-05,
"loss": 1.7317,
"step": 576
},
{
"epoch": 0.04440688383527966,
"grad_norm": 6.196030616760254,
"learning_rate": 1.9822372464658882e-05,
"loss": 1.9818,
"step": 578
},
{
"epoch": 0.044560540872771975,
"grad_norm": 5.8990888595581055,
"learning_rate": 1.9821757836508915e-05,
"loss": 1.9209,
"step": 580
},
{
"epoch": 0.04471419791026429,
"grad_norm": 4.752439022064209,
"learning_rate": 1.9821143208358944e-05,
"loss": 1.8661,
"step": 582
},
{
"epoch": 0.04486785494775661,
"grad_norm": 5.3692121505737305,
"learning_rate": 1.9820528580208974e-05,
"loss": 1.8574,
"step": 584
},
{
"epoch": 0.045021511985248924,
"grad_norm": 4.94577169418335,
"learning_rate": 1.9819913952059007e-05,
"loss": 1.76,
"step": 586
},
{
"epoch": 0.04517516902274124,
"grad_norm": 5.1533708572387695,
"learning_rate": 1.9819299323909037e-05,
"loss": 1.8634,
"step": 588
},
{
"epoch": 0.04532882606023356,
"grad_norm": 5.460253715515137,
"learning_rate": 1.9818684695759067e-05,
"loss": 1.7615,
"step": 590
},
{
"epoch": 0.04548248309772587,
"grad_norm": 6.106910705566406,
"learning_rate": 1.98180700676091e-05,
"loss": 1.8658,
"step": 592
},
{
"epoch": 0.045636140135218196,
"grad_norm": 8.604896545410156,
"learning_rate": 1.981745543945913e-05,
"loss": 1.8234,
"step": 594
},
{
"epoch": 0.04578979717271051,
"grad_norm": 5.533381938934326,
"learning_rate": 1.981684081130916e-05,
"loss": 1.8133,
"step": 596
},
{
"epoch": 0.04594345421020283,
"grad_norm": 5.140172481536865,
"learning_rate": 1.9816226183159192e-05,
"loss": 1.7655,
"step": 598
},
{
"epoch": 0.046097111247695145,
"grad_norm": 5.633389472961426,
"learning_rate": 1.9815611555009222e-05,
"loss": 1.8804,
"step": 600
},
{
"epoch": 0.04625076828518746,
"grad_norm": 5.397654056549072,
"learning_rate": 1.981499692685925e-05,
"loss": 1.9422,
"step": 602
},
{
"epoch": 0.04640442532267978,
"grad_norm": 5.916885852813721,
"learning_rate": 1.981438229870928e-05,
"loss": 1.9222,
"step": 604
},
{
"epoch": 0.046558082360172094,
"grad_norm": 4.4198198318481445,
"learning_rate": 1.9813767670559314e-05,
"loss": 1.8088,
"step": 606
},
{
"epoch": 0.04671173939766441,
"grad_norm": 6.035666465759277,
"learning_rate": 1.9813153042409344e-05,
"loss": 1.9505,
"step": 608
},
{
"epoch": 0.04686539643515673,
"grad_norm": 5.293002605438232,
"learning_rate": 1.9812538414259374e-05,
"loss": 1.9354,
"step": 610
},
{
"epoch": 0.04701905347264905,
"grad_norm": 5.066743850708008,
"learning_rate": 1.9811923786109407e-05,
"loss": 2.001,
"step": 612
},
{
"epoch": 0.047172710510141366,
"grad_norm": 6.867171764373779,
"learning_rate": 1.9811309157959436e-05,
"loss": 1.86,
"step": 614
},
{
"epoch": 0.04732636754763368,
"grad_norm": 4.908615589141846,
"learning_rate": 1.9810694529809466e-05,
"loss": 1.8855,
"step": 616
},
{
"epoch": 0.047480024585126,
"grad_norm": 5.6588006019592285,
"learning_rate": 1.98100799016595e-05,
"loss": 1.8047,
"step": 618
},
{
"epoch": 0.047633681622618315,
"grad_norm": 5.6555304527282715,
"learning_rate": 1.980946527350953e-05,
"loss": 1.7656,
"step": 620
},
{
"epoch": 0.04778733866011063,
"grad_norm": 4.742602348327637,
"learning_rate": 1.980885064535956e-05,
"loss": 1.976,
"step": 622
},
{
"epoch": 0.04794099569760295,
"grad_norm": 5.0910868644714355,
"learning_rate": 1.980823601720959e-05,
"loss": 1.8894,
"step": 624
},
{
"epoch": 0.048094652735095264,
"grad_norm": 5.279669761657715,
"learning_rate": 1.980762138905962e-05,
"loss": 1.9323,
"step": 626
},
{
"epoch": 0.04824830977258759,
"grad_norm": 5.603051662445068,
"learning_rate": 1.980700676090965e-05,
"loss": 1.9327,
"step": 628
},
{
"epoch": 0.048401966810079904,
"grad_norm": 5.823456764221191,
"learning_rate": 1.980639213275968e-05,
"loss": 1.9087,
"step": 630
},
{
"epoch": 0.04855562384757222,
"grad_norm": 4.226296424865723,
"learning_rate": 1.9805777504609714e-05,
"loss": 1.7298,
"step": 632
},
{
"epoch": 0.048709280885064536,
"grad_norm": 4.537020683288574,
"learning_rate": 1.9805162876459743e-05,
"loss": 1.8588,
"step": 634
},
{
"epoch": 0.04886293792255685,
"grad_norm": 5.843430519104004,
"learning_rate": 1.9804548248309773e-05,
"loss": 1.8581,
"step": 636
},
{
"epoch": 0.04901659496004917,
"grad_norm": 5.234043598175049,
"learning_rate": 1.9803933620159806e-05,
"loss": 1.8016,
"step": 638
},
{
"epoch": 0.049170251997541485,
"grad_norm": 6.091218948364258,
"learning_rate": 1.9803318992009836e-05,
"loss": 1.8419,
"step": 640
},
{
"epoch": 0.0493239090350338,
"grad_norm": 5.473825454711914,
"learning_rate": 1.9802704363859865e-05,
"loss": 1.8742,
"step": 642
},
{
"epoch": 0.049477566072526125,
"grad_norm": 5.018134117126465,
"learning_rate": 1.98020897357099e-05,
"loss": 1.9246,
"step": 644
},
{
"epoch": 0.04963122311001844,
"grad_norm": 5.1250505447387695,
"learning_rate": 1.9801475107559928e-05,
"loss": 1.8988,
"step": 646
},
{
"epoch": 0.04978488014751076,
"grad_norm": 5.310157299041748,
"learning_rate": 1.9800860479409958e-05,
"loss": 1.9718,
"step": 648
},
{
"epoch": 0.049938537185003073,
"grad_norm": 5.5490570068359375,
"learning_rate": 1.980024585125999e-05,
"loss": 1.8779,
"step": 650
},
{
"epoch": 0.05009219422249539,
"grad_norm": 5.242208480834961,
"learning_rate": 1.979963122311002e-05,
"loss": 1.8712,
"step": 652
},
{
"epoch": 0.050245851259987706,
"grad_norm": 4.680446624755859,
"learning_rate": 1.979901659496005e-05,
"loss": 1.9475,
"step": 654
},
{
"epoch": 0.05039950829748002,
"grad_norm": 12.400496482849121,
"learning_rate": 1.979840196681008e-05,
"loss": 1.9387,
"step": 656
},
{
"epoch": 0.05055316533497234,
"grad_norm": 4.818700313568115,
"learning_rate": 1.9797787338660113e-05,
"loss": 1.7356,
"step": 658
},
{
"epoch": 0.05070682237246466,
"grad_norm": 4.733686923980713,
"learning_rate": 1.9797172710510143e-05,
"loss": 1.7161,
"step": 660
},
{
"epoch": 0.05086047940995698,
"grad_norm": 5.9219865798950195,
"learning_rate": 1.9796558082360172e-05,
"loss": 1.9821,
"step": 662
},
{
"epoch": 0.051014136447449294,
"grad_norm": 4.954675197601318,
"learning_rate": 1.9795943454210206e-05,
"loss": 1.9392,
"step": 664
},
{
"epoch": 0.05116779348494161,
"grad_norm": 4.482631206512451,
"learning_rate": 1.9795328826060235e-05,
"loss": 1.9687,
"step": 666
},
{
"epoch": 0.05132145052243393,
"grad_norm": 6.749068737030029,
"learning_rate": 1.9794714197910265e-05,
"loss": 1.8242,
"step": 668
},
{
"epoch": 0.05147510755992624,
"grad_norm": 4.532095909118652,
"learning_rate": 1.9794099569760298e-05,
"loss": 1.7596,
"step": 670
},
{
"epoch": 0.05162876459741856,
"grad_norm": 5.727676868438721,
"learning_rate": 1.9793484941610328e-05,
"loss": 1.94,
"step": 672
},
{
"epoch": 0.051782421634910876,
"grad_norm": 5.493950843811035,
"learning_rate": 1.9792870313460357e-05,
"loss": 1.9243,
"step": 674
},
{
"epoch": 0.0519360786724032,
"grad_norm": 5.48468017578125,
"learning_rate": 1.9792255685310387e-05,
"loss": 1.7862,
"step": 676
},
{
"epoch": 0.052089735709895515,
"grad_norm": 5.862773895263672,
"learning_rate": 1.979164105716042e-05,
"loss": 1.843,
"step": 678
},
{
"epoch": 0.05224339274738783,
"grad_norm": 5.505096912384033,
"learning_rate": 1.979102642901045e-05,
"loss": 1.8366,
"step": 680
},
{
"epoch": 0.05239704978488015,
"grad_norm": 5.697121620178223,
"learning_rate": 1.979041180086048e-05,
"loss": 1.9764,
"step": 682
},
{
"epoch": 0.052550706822372464,
"grad_norm": 4.900547027587891,
"learning_rate": 1.9789797172710513e-05,
"loss": 1.9252,
"step": 684
},
{
"epoch": 0.05270436385986478,
"grad_norm": 5.347836017608643,
"learning_rate": 1.9789182544560542e-05,
"loss": 1.8527,
"step": 686
},
{
"epoch": 0.0528580208973571,
"grad_norm": 5.393474102020264,
"learning_rate": 1.9788567916410572e-05,
"loss": 1.8422,
"step": 688
},
{
"epoch": 0.05301167793484941,
"grad_norm": 5.27833366394043,
"learning_rate": 1.9787953288260605e-05,
"loss": 1.8933,
"step": 690
},
{
"epoch": 0.053165334972341736,
"grad_norm": 5.38336181640625,
"learning_rate": 1.9787338660110635e-05,
"loss": 1.9456,
"step": 692
},
{
"epoch": 0.05331899200983405,
"grad_norm": 5.273176193237305,
"learning_rate": 1.9786724031960664e-05,
"loss": 1.8142,
"step": 694
},
{
"epoch": 0.05347264904732637,
"grad_norm": 5.413751125335693,
"learning_rate": 1.9786109403810697e-05,
"loss": 1.7652,
"step": 696
},
{
"epoch": 0.053626306084818685,
"grad_norm": 5.373195648193359,
"learning_rate": 1.9785494775660727e-05,
"loss": 1.883,
"step": 698
},
{
"epoch": 0.053779963122311,
"grad_norm": 4.942586421966553,
"learning_rate": 1.9784880147510757e-05,
"loss": 1.76,
"step": 700
},
{
"epoch": 0.05393362015980332,
"grad_norm": 5.6196980476379395,
"learning_rate": 1.9784265519360786e-05,
"loss": 1.7673,
"step": 702
},
{
"epoch": 0.054087277197295634,
"grad_norm": 5.702764987945557,
"learning_rate": 1.978365089121082e-05,
"loss": 1.8437,
"step": 704
},
{
"epoch": 0.05424093423478795,
"grad_norm": 4.99530553817749,
"learning_rate": 1.978303626306085e-05,
"loss": 1.8747,
"step": 706
},
{
"epoch": 0.054394591272280274,
"grad_norm": 5.105679035186768,
"learning_rate": 1.978242163491088e-05,
"loss": 1.6632,
"step": 708
},
{
"epoch": 0.05454824830977259,
"grad_norm": 4.710418701171875,
"learning_rate": 1.9781807006760912e-05,
"loss": 1.8736,
"step": 710
},
{
"epoch": 0.054701905347264906,
"grad_norm": 4.792379856109619,
"learning_rate": 1.978119237861094e-05,
"loss": 1.8016,
"step": 712
},
{
"epoch": 0.05485556238475722,
"grad_norm": 4.937024116516113,
"learning_rate": 1.978057775046097e-05,
"loss": 1.7436,
"step": 714
},
{
"epoch": 0.05500921942224954,
"grad_norm": 5.5544867515563965,
"learning_rate": 1.9779963122311004e-05,
"loss": 1.926,
"step": 716
},
{
"epoch": 0.055162876459741855,
"grad_norm": 6.484194278717041,
"learning_rate": 1.9779348494161034e-05,
"loss": 1.9102,
"step": 718
},
{
"epoch": 0.05531653349723417,
"grad_norm": 5.408361434936523,
"learning_rate": 1.9778733866011064e-05,
"loss": 1.7786,
"step": 720
},
{
"epoch": 0.05547019053472649,
"grad_norm": 5.705206394195557,
"learning_rate": 1.9778119237861097e-05,
"loss": 1.779,
"step": 722
},
{
"epoch": 0.055623847572218804,
"grad_norm": 6.138594627380371,
"learning_rate": 1.9777504609711127e-05,
"loss": 1.6926,
"step": 724
},
{
"epoch": 0.05577750460971113,
"grad_norm": 5.507882595062256,
"learning_rate": 1.977688998156116e-05,
"loss": 2.0119,
"step": 726
},
{
"epoch": 0.055931161647203444,
"grad_norm": 5.1471710205078125,
"learning_rate": 1.9776275353411186e-05,
"loss": 1.7674,
"step": 728
},
{
"epoch": 0.05608481868469576,
"grad_norm": 5.558322906494141,
"learning_rate": 1.977566072526122e-05,
"loss": 1.7262,
"step": 730
},
{
"epoch": 0.056238475722188076,
"grad_norm": 5.859812259674072,
"learning_rate": 1.977504609711125e-05,
"loss": 2.0273,
"step": 732
},
{
"epoch": 0.05639213275968039,
"grad_norm": 4.931456565856934,
"learning_rate": 1.977443146896128e-05,
"loss": 1.7008,
"step": 734
},
{
"epoch": 0.05654578979717271,
"grad_norm": 4.835200786590576,
"learning_rate": 1.977381684081131e-05,
"loss": 1.7615,
"step": 736
},
{
"epoch": 0.056699446834665025,
"grad_norm": 5.542105674743652,
"learning_rate": 1.977320221266134e-05,
"loss": 1.7991,
"step": 738
},
{
"epoch": 0.05685310387215734,
"grad_norm": 5.737773895263672,
"learning_rate": 1.977258758451137e-05,
"loss": 1.7067,
"step": 740
},
{
"epoch": 0.057006760909649665,
"grad_norm": 4.556394100189209,
"learning_rate": 1.9771972956361404e-05,
"loss": 1.8418,
"step": 742
},
{
"epoch": 0.05716041794714198,
"grad_norm": 4.682400226593018,
"learning_rate": 1.9771358328211434e-05,
"loss": 1.7565,
"step": 744
},
{
"epoch": 0.0573140749846343,
"grad_norm": 5.617753982543945,
"learning_rate": 1.9770743700061467e-05,
"loss": 1.8314,
"step": 746
},
{
"epoch": 0.057467732022126614,
"grad_norm": 4.796401500701904,
"learning_rate": 1.9770129071911496e-05,
"loss": 1.6892,
"step": 748
},
{
"epoch": 0.05762138905961893,
"grad_norm": 5.084446430206299,
"learning_rate": 1.9769514443761526e-05,
"loss": 1.6731,
"step": 750
},
{
"epoch": 0.057775046097111246,
"grad_norm": 5.344216823577881,
"learning_rate": 1.976889981561156e-05,
"loss": 1.8096,
"step": 752
},
{
"epoch": 0.05792870313460356,
"grad_norm": 4.87506103515625,
"learning_rate": 1.9768285187461585e-05,
"loss": 1.9015,
"step": 754
},
{
"epoch": 0.05808236017209588,
"grad_norm": 5.019058704376221,
"learning_rate": 1.976767055931162e-05,
"loss": 1.9467,
"step": 756
},
{
"epoch": 0.0582360172095882,
"grad_norm": 5.275008678436279,
"learning_rate": 1.9767055931161648e-05,
"loss": 1.6159,
"step": 758
},
{
"epoch": 0.05838967424708052,
"grad_norm": 5.17955207824707,
"learning_rate": 1.9766441303011678e-05,
"loss": 1.6819,
"step": 760
},
{
"epoch": 0.058543331284572835,
"grad_norm": 5.578658580780029,
"learning_rate": 1.976582667486171e-05,
"loss": 1.8369,
"step": 762
},
{
"epoch": 0.05869698832206515,
"grad_norm": 4.934607982635498,
"learning_rate": 1.976521204671174e-05,
"loss": 1.8909,
"step": 764
},
{
"epoch": 0.05885064535955747,
"grad_norm": 5.5896759033203125,
"learning_rate": 1.9764597418561774e-05,
"loss": 1.7238,
"step": 766
},
{
"epoch": 0.05900430239704978,
"grad_norm": 5.263469696044922,
"learning_rate": 1.9763982790411803e-05,
"loss": 1.7776,
"step": 768
},
{
"epoch": 0.0591579594345421,
"grad_norm": 4.459990978240967,
"learning_rate": 1.9763368162261833e-05,
"loss": 1.7082,
"step": 770
},
{
"epoch": 0.059311616472034416,
"grad_norm": 5.528759002685547,
"learning_rate": 1.9762753534111866e-05,
"loss": 1.9014,
"step": 772
},
{
"epoch": 0.05946527350952674,
"grad_norm": 5.372073650360107,
"learning_rate": 1.9762138905961892e-05,
"loss": 1.691,
"step": 774
},
{
"epoch": 0.059618930547019056,
"grad_norm": 5.765900135040283,
"learning_rate": 1.9761524277811925e-05,
"loss": 1.9243,
"step": 776
},
{
"epoch": 0.05977258758451137,
"grad_norm": 5.123989105224609,
"learning_rate": 1.9760909649661955e-05,
"loss": 2.0223,
"step": 778
},
{
"epoch": 0.05992624462200369,
"grad_norm": 5.149808406829834,
"learning_rate": 1.9760295021511985e-05,
"loss": 1.8063,
"step": 780
},
{
"epoch": 0.060079901659496004,
"grad_norm": 5.047703266143799,
"learning_rate": 1.9759680393362018e-05,
"loss": 1.9353,
"step": 782
},
{
"epoch": 0.06023355869698832,
"grad_norm": 5.555423259735107,
"learning_rate": 1.9759065765212048e-05,
"loss": 1.695,
"step": 784
},
{
"epoch": 0.06038721573448064,
"grad_norm": 5.100247859954834,
"learning_rate": 1.9758451137062077e-05,
"loss": 1.6758,
"step": 786
},
{
"epoch": 0.06054087277197295,
"grad_norm": 4.941176891326904,
"learning_rate": 1.975783650891211e-05,
"loss": 1.7391,
"step": 788
},
{
"epoch": 0.06069452980946528,
"grad_norm": 5.3119964599609375,
"learning_rate": 1.975722188076214e-05,
"loss": 2.0366,
"step": 790
},
{
"epoch": 0.06084818684695759,
"grad_norm": 6.0686235427856445,
"learning_rate": 1.9756607252612173e-05,
"loss": 1.7149,
"step": 792
},
{
"epoch": 0.06100184388444991,
"grad_norm": 6.141575336456299,
"learning_rate": 1.9755992624462203e-05,
"loss": 1.7808,
"step": 794
},
{
"epoch": 0.061155500921942225,
"grad_norm": 5.157688140869141,
"learning_rate": 1.9755377996312232e-05,
"loss": 1.8069,
"step": 796
},
{
"epoch": 0.06130915795943454,
"grad_norm": 5.358695983886719,
"learning_rate": 1.9754763368162266e-05,
"loss": 1.9632,
"step": 798
},
{
"epoch": 0.06146281499692686,
"grad_norm": 5.3423261642456055,
"learning_rate": 1.9754148740012292e-05,
"loss": 1.6711,
"step": 800
},
{
"epoch": 0.061616472034419174,
"grad_norm": 5.9911980628967285,
"learning_rate": 1.9753534111862325e-05,
"loss": 1.6384,
"step": 802
},
{
"epoch": 0.06177012907191149,
"grad_norm": 5.021694183349609,
"learning_rate": 1.9752919483712355e-05,
"loss": 1.7412,
"step": 804
},
{
"epoch": 0.061923786109403814,
"grad_norm": 5.38372802734375,
"learning_rate": 1.9752304855562384e-05,
"loss": 1.5895,
"step": 806
},
{
"epoch": 0.06207744314689613,
"grad_norm": 5.618641376495361,
"learning_rate": 1.9751690227412417e-05,
"loss": 1.8097,
"step": 808
},
{
"epoch": 0.062231100184388446,
"grad_norm": 5.081387519836426,
"learning_rate": 1.9751075599262447e-05,
"loss": 1.7789,
"step": 810
},
{
"epoch": 0.06238475722188076,
"grad_norm": 5.361464500427246,
"learning_rate": 1.975046097111248e-05,
"loss": 1.7821,
"step": 812
},
{
"epoch": 0.06253841425937308,
"grad_norm": 5.113397598266602,
"learning_rate": 1.974984634296251e-05,
"loss": 1.7292,
"step": 814
},
{
"epoch": 0.0626920712968654,
"grad_norm": 5.261277198791504,
"learning_rate": 1.974923171481254e-05,
"loss": 1.6956,
"step": 816
},
{
"epoch": 0.06284572833435771,
"grad_norm": 7.06874942779541,
"learning_rate": 1.9748617086662573e-05,
"loss": 1.865,
"step": 818
},
{
"epoch": 0.06299938537185003,
"grad_norm": 4.949324131011963,
"learning_rate": 1.9748002458512602e-05,
"loss": 1.7885,
"step": 820
},
{
"epoch": 0.06315304240934234,
"grad_norm": 6.291264533996582,
"learning_rate": 1.9747387830362632e-05,
"loss": 1.8685,
"step": 822
},
{
"epoch": 0.06330669944683466,
"grad_norm": 4.500913143157959,
"learning_rate": 1.9746773202212665e-05,
"loss": 1.6422,
"step": 824
},
{
"epoch": 0.06346035648432698,
"grad_norm": 5.313440322875977,
"learning_rate": 1.974615857406269e-05,
"loss": 1.8328,
"step": 826
},
{
"epoch": 0.0636140135218193,
"grad_norm": 5.809798240661621,
"learning_rate": 1.9745543945912724e-05,
"loss": 1.8912,
"step": 828
},
{
"epoch": 0.06376767055931162,
"grad_norm": 4.69051456451416,
"learning_rate": 1.9744929317762754e-05,
"loss": 1.6786,
"step": 830
},
{
"epoch": 0.06392132759680394,
"grad_norm": 5.292459487915039,
"learning_rate": 1.9744314689612787e-05,
"loss": 1.8497,
"step": 832
},
{
"epoch": 0.06407498463429626,
"grad_norm": 4.772144794464111,
"learning_rate": 1.9743700061462817e-05,
"loss": 1.6261,
"step": 834
},
{
"epoch": 0.06422864167178857,
"grad_norm": 4.984364032745361,
"learning_rate": 1.9743085433312846e-05,
"loss": 1.7354,
"step": 836
},
{
"epoch": 0.06438229870928089,
"grad_norm": 4.450577735900879,
"learning_rate": 1.974247080516288e-05,
"loss": 1.7349,
"step": 838
},
{
"epoch": 0.0645359557467732,
"grad_norm": 5.341747760772705,
"learning_rate": 1.974185617701291e-05,
"loss": 1.8332,
"step": 840
},
{
"epoch": 0.06468961278426552,
"grad_norm": 5.368303298950195,
"learning_rate": 1.974124154886294e-05,
"loss": 1.8219,
"step": 842
},
{
"epoch": 0.06484326982175784,
"grad_norm": 4.4096360206604,
"learning_rate": 1.9740626920712972e-05,
"loss": 1.8158,
"step": 844
},
{
"epoch": 0.06499692685925015,
"grad_norm": 6.098479270935059,
"learning_rate": 1.9740012292563e-05,
"loss": 1.7323,
"step": 846
},
{
"epoch": 0.06515058389674247,
"grad_norm": 4.606769561767578,
"learning_rate": 1.973939766441303e-05,
"loss": 1.7527,
"step": 848
},
{
"epoch": 0.06530424093423479,
"grad_norm": 6.082258701324463,
"learning_rate": 1.9738783036263064e-05,
"loss": 1.8471,
"step": 850
},
{
"epoch": 0.0654578979717271,
"grad_norm": 5.389958381652832,
"learning_rate": 1.9738168408113094e-05,
"loss": 1.8564,
"step": 852
},
{
"epoch": 0.06561155500921942,
"grad_norm": 5.574385643005371,
"learning_rate": 1.9737553779963124e-05,
"loss": 1.784,
"step": 854
},
{
"epoch": 0.06576521204671174,
"grad_norm": 5.1567487716674805,
"learning_rate": 1.9736939151813153e-05,
"loss": 1.713,
"step": 856
},
{
"epoch": 0.06591886908420405,
"grad_norm": 5.475706577301025,
"learning_rate": 1.9736324523663187e-05,
"loss": 1.7555,
"step": 858
},
{
"epoch": 0.06607252612169637,
"grad_norm": 4.831605434417725,
"learning_rate": 1.9735709895513216e-05,
"loss": 1.8695,
"step": 860
},
{
"epoch": 0.06622618315918868,
"grad_norm": 6.022873878479004,
"learning_rate": 1.9735095267363246e-05,
"loss": 1.7765,
"step": 862
},
{
"epoch": 0.06637984019668101,
"grad_norm": 4.874941825866699,
"learning_rate": 1.973448063921328e-05,
"loss": 1.757,
"step": 864
},
{
"epoch": 0.06653349723417333,
"grad_norm": 4.488655090332031,
"learning_rate": 1.973386601106331e-05,
"loss": 1.8837,
"step": 866
},
{
"epoch": 0.06668715427166565,
"grad_norm": 4.5713090896606445,
"learning_rate": 1.973325138291334e-05,
"loss": 1.6627,
"step": 868
},
{
"epoch": 0.06684081130915796,
"grad_norm": 5.312070846557617,
"learning_rate": 1.973263675476337e-05,
"loss": 1.7551,
"step": 870
},
{
"epoch": 0.06699446834665028,
"grad_norm": 5.104644775390625,
"learning_rate": 1.97320221266134e-05,
"loss": 1.8789,
"step": 872
},
{
"epoch": 0.0671481253841426,
"grad_norm": 4.595895290374756,
"learning_rate": 1.973140749846343e-05,
"loss": 1.8319,
"step": 874
},
{
"epoch": 0.06730178242163491,
"grad_norm": 5.223522186279297,
"learning_rate": 1.973079287031346e-05,
"loss": 1.6661,
"step": 876
},
{
"epoch": 0.06745543945912723,
"grad_norm": 4.466522693634033,
"learning_rate": 1.9730178242163494e-05,
"loss": 1.8338,
"step": 878
},
{
"epoch": 0.06760909649661954,
"grad_norm": 4.613927841186523,
"learning_rate": 1.9729563614013523e-05,
"loss": 1.7562,
"step": 880
},
{
"epoch": 0.06776275353411186,
"grad_norm": 5.868101119995117,
"learning_rate": 1.9728948985863553e-05,
"loss": 1.9181,
"step": 882
},
{
"epoch": 0.06791641057160418,
"grad_norm": 5.407005786895752,
"learning_rate": 1.9728334357713586e-05,
"loss": 1.8462,
"step": 884
},
{
"epoch": 0.0680700676090965,
"grad_norm": 6.075082778930664,
"learning_rate": 1.9727719729563616e-05,
"loss": 1.816,
"step": 886
},
{
"epoch": 0.06822372464658881,
"grad_norm": 5.164397716522217,
"learning_rate": 1.9727105101413645e-05,
"loss": 1.9236,
"step": 888
},
{
"epoch": 0.06837738168408113,
"grad_norm": 5.688714981079102,
"learning_rate": 1.972649047326368e-05,
"loss": 1.7171,
"step": 890
},
{
"epoch": 0.06853103872157344,
"grad_norm": 4.9518842697143555,
"learning_rate": 1.9725875845113708e-05,
"loss": 1.7829,
"step": 892
},
{
"epoch": 0.06868469575906576,
"grad_norm": 5.185763835906982,
"learning_rate": 1.9725261216963738e-05,
"loss": 1.7146,
"step": 894
},
{
"epoch": 0.06883835279655809,
"grad_norm": 5.043625354766846,
"learning_rate": 1.972464658881377e-05,
"loss": 1.8594,
"step": 896
},
{
"epoch": 0.0689920098340504,
"grad_norm": 4.783642292022705,
"learning_rate": 1.97240319606638e-05,
"loss": 1.7053,
"step": 898
},
{
"epoch": 0.06914566687154272,
"grad_norm": 4.887513637542725,
"learning_rate": 1.972341733251383e-05,
"loss": 1.7929,
"step": 900
},
{
"epoch": 0.06929932390903504,
"grad_norm": 4.4108123779296875,
"learning_rate": 1.972280270436386e-05,
"loss": 1.7296,
"step": 902
},
{
"epoch": 0.06945298094652735,
"grad_norm": 5.531246662139893,
"learning_rate": 1.9722188076213893e-05,
"loss": 1.7792,
"step": 904
},
{
"epoch": 0.06960663798401967,
"grad_norm": 4.462593078613281,
"learning_rate": 1.9721573448063923e-05,
"loss": 1.7121,
"step": 906
},
{
"epoch": 0.06976029502151199,
"grad_norm": 4.543118000030518,
"learning_rate": 1.9720958819913952e-05,
"loss": 1.7871,
"step": 908
},
{
"epoch": 0.0699139520590043,
"grad_norm": 5.9536638259887695,
"learning_rate": 1.9720344191763985e-05,
"loss": 1.7956,
"step": 910
},
{
"epoch": 0.07006760909649662,
"grad_norm": 4.735901832580566,
"learning_rate": 1.9719729563614015e-05,
"loss": 1.7511,
"step": 912
},
{
"epoch": 0.07022126613398894,
"grad_norm": 4.490820407867432,
"learning_rate": 1.9719114935464045e-05,
"loss": 1.7364,
"step": 914
},
{
"epoch": 0.07037492317148125,
"grad_norm": 4.837772846221924,
"learning_rate": 1.9718500307314078e-05,
"loss": 1.8204,
"step": 916
},
{
"epoch": 0.07052858020897357,
"grad_norm": 4.400464057922363,
"learning_rate": 1.9717885679164108e-05,
"loss": 1.8637,
"step": 918
},
{
"epoch": 0.07068223724646588,
"grad_norm": 4.991397857666016,
"learning_rate": 1.9717271051014137e-05,
"loss": 1.9175,
"step": 920
},
{
"epoch": 0.0708358942839582,
"grad_norm": 4.983181953430176,
"learning_rate": 1.971665642286417e-05,
"loss": 1.7065,
"step": 922
},
{
"epoch": 0.07098955132145052,
"grad_norm": 4.5055341720581055,
"learning_rate": 1.97160417947142e-05,
"loss": 1.7113,
"step": 924
},
{
"epoch": 0.07114320835894283,
"grad_norm": 5.021308422088623,
"learning_rate": 1.971542716656423e-05,
"loss": 1.8903,
"step": 926
},
{
"epoch": 0.07129686539643516,
"grad_norm": 6.712131023406982,
"learning_rate": 1.971481253841426e-05,
"loss": 1.7608,
"step": 928
},
{
"epoch": 0.07145052243392748,
"grad_norm": 5.711028575897217,
"learning_rate": 1.9714197910264292e-05,
"loss": 1.7826,
"step": 930
},
{
"epoch": 0.0716041794714198,
"grad_norm": 5.202549457550049,
"learning_rate": 1.9713583282114322e-05,
"loss": 1.7201,
"step": 932
},
{
"epoch": 0.07175783650891211,
"grad_norm": 4.809873580932617,
"learning_rate": 1.9712968653964352e-05,
"loss": 1.7824,
"step": 934
},
{
"epoch": 0.07191149354640443,
"grad_norm": 4.417870998382568,
"learning_rate": 1.9712354025814385e-05,
"loss": 1.8275,
"step": 936
},
{
"epoch": 0.07206515058389674,
"grad_norm": 4.823970794677734,
"learning_rate": 1.9711739397664415e-05,
"loss": 1.7293,
"step": 938
},
{
"epoch": 0.07221880762138906,
"grad_norm": 5.289034843444824,
"learning_rate": 1.9711124769514444e-05,
"loss": 1.7507,
"step": 940
},
{
"epoch": 0.07237246465888138,
"grad_norm": 4.538127422332764,
"learning_rate": 1.9710510141364477e-05,
"loss": 1.6391,
"step": 942
},
{
"epoch": 0.0725261216963737,
"grad_norm": 4.500412464141846,
"learning_rate": 1.9709895513214507e-05,
"loss": 1.6816,
"step": 944
},
{
"epoch": 0.07267977873386601,
"grad_norm": 5.149693012237549,
"learning_rate": 1.9709280885064537e-05,
"loss": 1.6366,
"step": 946
},
{
"epoch": 0.07283343577135833,
"grad_norm": 4.554830074310303,
"learning_rate": 1.970866625691457e-05,
"loss": 1.7712,
"step": 948
},
{
"epoch": 0.07298709280885064,
"grad_norm": 5.352977752685547,
"learning_rate": 1.97080516287646e-05,
"loss": 1.5819,
"step": 950
},
{
"epoch": 0.07314074984634296,
"grad_norm": 5.3114094734191895,
"learning_rate": 1.970743700061463e-05,
"loss": 1.802,
"step": 952
},
{
"epoch": 0.07329440688383528,
"grad_norm": 4.19597053527832,
"learning_rate": 1.970682237246466e-05,
"loss": 1.5855,
"step": 954
},
{
"epoch": 0.07344806392132759,
"grad_norm": 4.087234020233154,
"learning_rate": 1.9706207744314692e-05,
"loss": 1.7158,
"step": 956
},
{
"epoch": 0.07360172095881991,
"grad_norm": 5.064235210418701,
"learning_rate": 1.970559311616472e-05,
"loss": 1.7579,
"step": 958
},
{
"epoch": 0.07375537799631224,
"grad_norm": 5.033870697021484,
"learning_rate": 1.970497848801475e-05,
"loss": 1.8105,
"step": 960
},
{
"epoch": 0.07390903503380455,
"grad_norm": 4.546055793762207,
"learning_rate": 1.9704363859864784e-05,
"loss": 1.6964,
"step": 962
},
{
"epoch": 0.07406269207129687,
"grad_norm": 5.068551540374756,
"learning_rate": 1.9703749231714814e-05,
"loss": 1.6664,
"step": 964
},
{
"epoch": 0.07421634910878919,
"grad_norm": 5.1122050285339355,
"learning_rate": 1.9703134603564844e-05,
"loss": 1.819,
"step": 966
},
{
"epoch": 0.0743700061462815,
"grad_norm": 4.410829544067383,
"learning_rate": 1.9702519975414877e-05,
"loss": 1.7872,
"step": 968
},
{
"epoch": 0.07452366318377382,
"grad_norm": 4.5425190925598145,
"learning_rate": 1.9701905347264906e-05,
"loss": 1.6679,
"step": 970
},
{
"epoch": 0.07467732022126614,
"grad_norm": 4.571905612945557,
"learning_rate": 1.9701290719114936e-05,
"loss": 1.7135,
"step": 972
},
{
"epoch": 0.07483097725875845,
"grad_norm": 4.853597640991211,
"learning_rate": 1.970067609096497e-05,
"loss": 1.6162,
"step": 974
},
{
"epoch": 0.07498463429625077,
"grad_norm": 5.288270473480225,
"learning_rate": 1.9700061462815e-05,
"loss": 1.7473,
"step": 976
},
{
"epoch": 0.07513829133374308,
"grad_norm": 4.418172359466553,
"learning_rate": 1.9699446834665032e-05,
"loss": 1.6986,
"step": 978
},
{
"epoch": 0.0752919483712354,
"grad_norm": 4.6486496925354,
"learning_rate": 1.9698832206515058e-05,
"loss": 1.6914,
"step": 980
},
{
"epoch": 0.07544560540872772,
"grad_norm": 5.205509185791016,
"learning_rate": 1.969821757836509e-05,
"loss": 1.6129,
"step": 982
},
{
"epoch": 0.07559926244622003,
"grad_norm": 4.540061950683594,
"learning_rate": 1.969760295021512e-05,
"loss": 1.7026,
"step": 984
},
{
"epoch": 0.07575291948371235,
"grad_norm": 4.7810845375061035,
"learning_rate": 1.969698832206515e-05,
"loss": 1.686,
"step": 986
},
{
"epoch": 0.07590657652120467,
"grad_norm": 4.475192546844482,
"learning_rate": 1.9696373693915184e-05,
"loss": 1.739,
"step": 988
},
{
"epoch": 0.07606023355869698,
"grad_norm": 5.317092418670654,
"learning_rate": 1.9695759065765213e-05,
"loss": 2.0029,
"step": 990
},
{
"epoch": 0.0762138905961893,
"grad_norm": 5.178996562957764,
"learning_rate": 1.9695144437615243e-05,
"loss": 1.7278,
"step": 992
},
{
"epoch": 0.07636754763368163,
"grad_norm": 5.976894855499268,
"learning_rate": 1.9694529809465276e-05,
"loss": 1.753,
"step": 994
},
{
"epoch": 0.07652120467117395,
"grad_norm": 4.536045551300049,
"learning_rate": 1.9693915181315306e-05,
"loss": 1.6081,
"step": 996
},
{
"epoch": 0.07667486170866626,
"grad_norm": 4.751937389373779,
"learning_rate": 1.969330055316534e-05,
"loss": 1.6765,
"step": 998
},
{
"epoch": 0.07682851874615858,
"grad_norm": 5.145371437072754,
"learning_rate": 1.9692685925015365e-05,
"loss": 1.9088,
"step": 1000
},
{
"epoch": 0.0769821757836509,
"grad_norm": 5.149151802062988,
"learning_rate": 1.9692071296865398e-05,
"loss": 1.7855,
"step": 1002
},
{
"epoch": 0.07713583282114321,
"grad_norm": 4.331295490264893,
"learning_rate": 1.9691456668715428e-05,
"loss": 1.6398,
"step": 1004
},
{
"epoch": 0.07728948985863553,
"grad_norm": 5.288293361663818,
"learning_rate": 1.9690842040565458e-05,
"loss": 1.8451,
"step": 1006
},
{
"epoch": 0.07744314689612784,
"grad_norm": 4.709437370300293,
"learning_rate": 1.969022741241549e-05,
"loss": 1.8546,
"step": 1008
},
{
"epoch": 0.07759680393362016,
"grad_norm": 5.373138427734375,
"learning_rate": 1.968961278426552e-05,
"loss": 1.7561,
"step": 1010
},
{
"epoch": 0.07775046097111248,
"grad_norm": 6.308657169342041,
"learning_rate": 1.968899815611555e-05,
"loss": 1.8271,
"step": 1012
},
{
"epoch": 0.07790411800860479,
"grad_norm": 5.380505561828613,
"learning_rate": 1.9688383527965583e-05,
"loss": 1.7852,
"step": 1014
},
{
"epoch": 0.07805777504609711,
"grad_norm": 5.468803405761719,
"learning_rate": 1.9687768899815613e-05,
"loss": 1.7797,
"step": 1016
},
{
"epoch": 0.07821143208358942,
"grad_norm": 4.73374080657959,
"learning_rate": 1.9687154271665646e-05,
"loss": 1.6201,
"step": 1018
},
{
"epoch": 0.07836508912108174,
"grad_norm": 4.9570631980896,
"learning_rate": 1.9686539643515676e-05,
"loss": 1.7538,
"step": 1020
},
{
"epoch": 0.07851874615857406,
"grad_norm": 4.574160575866699,
"learning_rate": 1.9685925015365705e-05,
"loss": 1.7656,
"step": 1022
},
{
"epoch": 0.07867240319606637,
"grad_norm": 5.078851222991943,
"learning_rate": 1.968531038721574e-05,
"loss": 1.806,
"step": 1024
},
{
"epoch": 0.0788260602335587,
"grad_norm": 5.474549293518066,
"learning_rate": 1.9684695759065765e-05,
"loss": 1.8226,
"step": 1026
},
{
"epoch": 0.07897971727105102,
"grad_norm": 6.257920265197754,
"learning_rate": 1.9684081130915798e-05,
"loss": 1.7714,
"step": 1028
},
{
"epoch": 0.07913337430854334,
"grad_norm": 4.811653137207031,
"learning_rate": 1.9683466502765827e-05,
"loss": 1.7903,
"step": 1030
},
{
"epoch": 0.07928703134603565,
"grad_norm": 4.90382194519043,
"learning_rate": 1.9682851874615857e-05,
"loss": 1.9185,
"step": 1032
},
{
"epoch": 0.07944068838352797,
"grad_norm": 5.112819194793701,
"learning_rate": 1.968223724646589e-05,
"loss": 1.8098,
"step": 1034
},
{
"epoch": 0.07959434542102028,
"grad_norm": 4.832859039306641,
"learning_rate": 1.968162261831592e-05,
"loss": 1.7965,
"step": 1036
},
{
"epoch": 0.0797480024585126,
"grad_norm": 5.124858379364014,
"learning_rate": 1.9681007990165953e-05,
"loss": 1.6745,
"step": 1038
},
{
"epoch": 0.07990165949600492,
"grad_norm": 4.530187606811523,
"learning_rate": 1.9680393362015983e-05,
"loss": 1.7152,
"step": 1040
},
{
"epoch": 0.08005531653349723,
"grad_norm": 4.918298244476318,
"learning_rate": 1.9679778733866012e-05,
"loss": 1.7445,
"step": 1042
},
{
"epoch": 0.08020897357098955,
"grad_norm": 4.637542247772217,
"learning_rate": 1.9679164105716045e-05,
"loss": 1.6802,
"step": 1044
},
{
"epoch": 0.08036263060848187,
"grad_norm": 5.314078330993652,
"learning_rate": 1.9678549477566075e-05,
"loss": 1.7547,
"step": 1046
},
{
"epoch": 0.08051628764597418,
"grad_norm": 5.02266263961792,
"learning_rate": 1.9677934849416105e-05,
"loss": 1.7107,
"step": 1048
},
{
"epoch": 0.0806699446834665,
"grad_norm": 4.683084487915039,
"learning_rate": 1.9677320221266138e-05,
"loss": 1.7415,
"step": 1050
},
{
"epoch": 0.08082360172095882,
"grad_norm": 4.539281845092773,
"learning_rate": 1.9676705593116164e-05,
"loss": 1.9356,
"step": 1052
},
{
"epoch": 0.08097725875845113,
"grad_norm": 4.526500701904297,
"learning_rate": 1.9676090964966197e-05,
"loss": 1.6487,
"step": 1054
},
{
"epoch": 0.08113091579594345,
"grad_norm": 4.272531509399414,
"learning_rate": 1.9675476336816227e-05,
"loss": 1.6478,
"step": 1056
},
{
"epoch": 0.08128457283343578,
"grad_norm": 4.874919414520264,
"learning_rate": 1.9674861708666257e-05,
"loss": 1.715,
"step": 1058
},
{
"epoch": 0.0814382298709281,
"grad_norm": 5.863064765930176,
"learning_rate": 1.967424708051629e-05,
"loss": 1.8635,
"step": 1060
},
{
"epoch": 0.08159188690842041,
"grad_norm": 5.5292205810546875,
"learning_rate": 1.967363245236632e-05,
"loss": 1.7227,
"step": 1062
},
{
"epoch": 0.08174554394591273,
"grad_norm": 4.343425273895264,
"learning_rate": 1.9673017824216352e-05,
"loss": 1.8257,
"step": 1064
},
{
"epoch": 0.08189920098340504,
"grad_norm": 5.900411128997803,
"learning_rate": 1.9672403196066382e-05,
"loss": 1.8205,
"step": 1066
},
{
"epoch": 0.08205285802089736,
"grad_norm": 4.439347743988037,
"learning_rate": 1.9671788567916412e-05,
"loss": 1.6958,
"step": 1068
},
{
"epoch": 0.08220651505838968,
"grad_norm": 5.694681644439697,
"learning_rate": 1.9671173939766445e-05,
"loss": 1.8959,
"step": 1070
},
{
"epoch": 0.08236017209588199,
"grad_norm": 4.767448425292969,
"learning_rate": 1.9670559311616474e-05,
"loss": 1.6721,
"step": 1072
},
{
"epoch": 0.08251382913337431,
"grad_norm": 4.426461696624756,
"learning_rate": 1.9669944683466504e-05,
"loss": 1.607,
"step": 1074
},
{
"epoch": 0.08266748617086662,
"grad_norm": 4.794045925140381,
"learning_rate": 1.9669330055316537e-05,
"loss": 1.6254,
"step": 1076
},
{
"epoch": 0.08282114320835894,
"grad_norm": 4.544212341308594,
"learning_rate": 1.9668715427166564e-05,
"loss": 1.6082,
"step": 1078
},
{
"epoch": 0.08297480024585126,
"grad_norm": 4.256971836090088,
"learning_rate": 1.9668100799016597e-05,
"loss": 1.8001,
"step": 1080
},
{
"epoch": 0.08312845728334357,
"grad_norm": 6.058056831359863,
"learning_rate": 1.9667486170866626e-05,
"loss": 1.7258,
"step": 1082
},
{
"epoch": 0.08328211432083589,
"grad_norm": 4.815703868865967,
"learning_rate": 1.966687154271666e-05,
"loss": 1.7528,
"step": 1084
},
{
"epoch": 0.0834357713583282,
"grad_norm": 4.661309719085693,
"learning_rate": 1.966625691456669e-05,
"loss": 1.7316,
"step": 1086
},
{
"epoch": 0.08358942839582052,
"grad_norm": 4.863770961761475,
"learning_rate": 1.966564228641672e-05,
"loss": 1.8824,
"step": 1088
},
{
"epoch": 0.08374308543331284,
"grad_norm": 5.061347961425781,
"learning_rate": 1.9665027658266752e-05,
"loss": 1.6873,
"step": 1090
},
{
"epoch": 0.08389674247080517,
"grad_norm": 4.252581596374512,
"learning_rate": 1.966441303011678e-05,
"loss": 1.8093,
"step": 1092
},
{
"epoch": 0.08405039950829749,
"grad_norm": 5.112542152404785,
"learning_rate": 1.966379840196681e-05,
"loss": 1.7366,
"step": 1094
},
{
"epoch": 0.0842040565457898,
"grad_norm": 5.147494792938232,
"learning_rate": 1.9663183773816844e-05,
"loss": 1.8676,
"step": 1096
},
{
"epoch": 0.08435771358328212,
"grad_norm": 4.757107257843018,
"learning_rate": 1.966256914566687e-05,
"loss": 1.5888,
"step": 1098
},
{
"epoch": 0.08451137062077443,
"grad_norm": 5.5242600440979,
"learning_rate": 1.9661954517516904e-05,
"loss": 1.7018,
"step": 1100
},
{
"epoch": 0.08466502765826675,
"grad_norm": 5.5325117111206055,
"learning_rate": 1.9661339889366933e-05,
"loss": 1.6928,
"step": 1102
},
{
"epoch": 0.08481868469575907,
"grad_norm": 5.308017253875732,
"learning_rate": 1.9660725261216966e-05,
"loss": 1.7695,
"step": 1104
},
{
"epoch": 0.08497234173325138,
"grad_norm": 4.460916519165039,
"learning_rate": 1.9660110633066996e-05,
"loss": 1.649,
"step": 1106
},
{
"epoch": 0.0851259987707437,
"grad_norm": 5.222771644592285,
"learning_rate": 1.9659496004917026e-05,
"loss": 1.8017,
"step": 1108
},
{
"epoch": 0.08527965580823602,
"grad_norm": 4.484593391418457,
"learning_rate": 1.965888137676706e-05,
"loss": 1.697,
"step": 1110
},
{
"epoch": 0.08543331284572833,
"grad_norm": 4.95808219909668,
"learning_rate": 1.965826674861709e-05,
"loss": 1.6171,
"step": 1112
},
{
"epoch": 0.08558696988322065,
"grad_norm": 5.313704967498779,
"learning_rate": 1.9657652120467118e-05,
"loss": 1.665,
"step": 1114
},
{
"epoch": 0.08574062692071296,
"grad_norm": 4.555895805358887,
"learning_rate": 1.965703749231715e-05,
"loss": 1.7004,
"step": 1116
},
{
"epoch": 0.08589428395820528,
"grad_norm": 4.939544677734375,
"learning_rate": 1.965642286416718e-05,
"loss": 1.7204,
"step": 1118
},
{
"epoch": 0.0860479409956976,
"grad_norm": 4.291268348693848,
"learning_rate": 1.965580823601721e-05,
"loss": 1.6592,
"step": 1120
},
{
"epoch": 0.08620159803318991,
"grad_norm": 5.050233840942383,
"learning_rate": 1.9655193607867244e-05,
"loss": 1.7656,
"step": 1122
},
{
"epoch": 0.08635525507068224,
"grad_norm": 4.5748090744018555,
"learning_rate": 1.9654578979717273e-05,
"loss": 1.8491,
"step": 1124
},
{
"epoch": 0.08650891210817456,
"grad_norm": 4.3803391456604,
"learning_rate": 1.9653964351567303e-05,
"loss": 1.697,
"step": 1126
},
{
"epoch": 0.08666256914566688,
"grad_norm": 5.346717834472656,
"learning_rate": 1.9653349723417333e-05,
"loss": 1.817,
"step": 1128
},
{
"epoch": 0.08681622618315919,
"grad_norm": 7.8311944007873535,
"learning_rate": 1.9652735095267366e-05,
"loss": 1.7637,
"step": 1130
},
{
"epoch": 0.08696988322065151,
"grad_norm": 4.9822845458984375,
"learning_rate": 1.9652120467117395e-05,
"loss": 1.6673,
"step": 1132
},
{
"epoch": 0.08712354025814383,
"grad_norm": 5.492576599121094,
"learning_rate": 1.9651505838967425e-05,
"loss": 1.7831,
"step": 1134
},
{
"epoch": 0.08727719729563614,
"grad_norm": 5.493293285369873,
"learning_rate": 1.9650891210817458e-05,
"loss": 1.698,
"step": 1136
},
{
"epoch": 0.08743085433312846,
"grad_norm": 6.50536584854126,
"learning_rate": 1.9650276582667488e-05,
"loss": 1.7632,
"step": 1138
},
{
"epoch": 0.08758451137062077,
"grad_norm": 4.753702640533447,
"learning_rate": 1.9649661954517518e-05,
"loss": 1.64,
"step": 1140
},
{
"epoch": 0.08773816840811309,
"grad_norm": 4.77907657623291,
"learning_rate": 1.964904732636755e-05,
"loss": 1.9073,
"step": 1142
},
{
"epoch": 0.0878918254456054,
"grad_norm": 4.242752552032471,
"learning_rate": 1.964843269821758e-05,
"loss": 1.7625,
"step": 1144
},
{
"epoch": 0.08804548248309772,
"grad_norm": 5.127992630004883,
"learning_rate": 1.964781807006761e-05,
"loss": 1.6271,
"step": 1146
},
{
"epoch": 0.08819913952059004,
"grad_norm": 4.7297749519348145,
"learning_rate": 1.9647203441917643e-05,
"loss": 1.6252,
"step": 1148
},
{
"epoch": 0.08835279655808236,
"grad_norm": 5.08091926574707,
"learning_rate": 1.9646588813767673e-05,
"loss": 1.7145,
"step": 1150
},
{
"epoch": 0.08850645359557467,
"grad_norm": 4.986452102661133,
"learning_rate": 1.9645974185617702e-05,
"loss": 1.8065,
"step": 1152
},
{
"epoch": 0.08866011063306699,
"grad_norm": 5.4139204025268555,
"learning_rate": 1.9645359557467732e-05,
"loss": 1.6284,
"step": 1154
},
{
"epoch": 0.08881376767055932,
"grad_norm": 5.257991313934326,
"learning_rate": 1.9644744929317765e-05,
"loss": 1.8338,
"step": 1156
},
{
"epoch": 0.08896742470805163,
"grad_norm": 4.872262001037598,
"learning_rate": 1.9644130301167795e-05,
"loss": 1.8488,
"step": 1158
},
{
"epoch": 0.08912108174554395,
"grad_norm": 5.82914924621582,
"learning_rate": 1.9643515673017825e-05,
"loss": 1.7731,
"step": 1160
},
{
"epoch": 0.08927473878303627,
"grad_norm": 4.288515090942383,
"learning_rate": 1.9642901044867858e-05,
"loss": 1.6748,
"step": 1162
},
{
"epoch": 0.08942839582052858,
"grad_norm": 4.828827381134033,
"learning_rate": 1.9642286416717887e-05,
"loss": 1.7893,
"step": 1164
},
{
"epoch": 0.0895820528580209,
"grad_norm": 4.2209577560424805,
"learning_rate": 1.9641671788567917e-05,
"loss": 1.6139,
"step": 1166
},
{
"epoch": 0.08973570989551322,
"grad_norm": 5.542986869812012,
"learning_rate": 1.964105716041795e-05,
"loss": 1.5873,
"step": 1168
},
{
"epoch": 0.08988936693300553,
"grad_norm": 4.995078086853027,
"learning_rate": 1.964044253226798e-05,
"loss": 1.5976,
"step": 1170
},
{
"epoch": 0.09004302397049785,
"grad_norm": 4.493627071380615,
"learning_rate": 1.963982790411801e-05,
"loss": 1.6423,
"step": 1172
},
{
"epoch": 0.09019668100799016,
"grad_norm": 5.041021823883057,
"learning_rate": 1.9639213275968043e-05,
"loss": 1.6996,
"step": 1174
},
{
"epoch": 0.09035033804548248,
"grad_norm": 5.527616024017334,
"learning_rate": 1.9638598647818072e-05,
"loss": 1.7338,
"step": 1176
},
{
"epoch": 0.0905039950829748,
"grad_norm": 4.937674045562744,
"learning_rate": 1.9637984019668102e-05,
"loss": 1.8472,
"step": 1178
},
{
"epoch": 0.09065765212046711,
"grad_norm": 5.515735149383545,
"learning_rate": 1.963736939151813e-05,
"loss": 1.8828,
"step": 1180
},
{
"epoch": 0.09081130915795943,
"grad_norm": 5.273295879364014,
"learning_rate": 1.9636754763368165e-05,
"loss": 1.6819,
"step": 1182
},
{
"epoch": 0.09096496619545175,
"grad_norm": 4.745762825012207,
"learning_rate": 1.9636140135218194e-05,
"loss": 1.7752,
"step": 1184
},
{
"epoch": 0.09111862323294406,
"grad_norm": 4.8165130615234375,
"learning_rate": 1.9635525507068224e-05,
"loss": 1.6133,
"step": 1186
},
{
"epoch": 0.09127228027043639,
"grad_norm": 4.156392574310303,
"learning_rate": 1.9634910878918257e-05,
"loss": 1.7126,
"step": 1188
},
{
"epoch": 0.09142593730792871,
"grad_norm": 4.523599624633789,
"learning_rate": 1.9634296250768287e-05,
"loss": 1.616,
"step": 1190
},
{
"epoch": 0.09157959434542103,
"grad_norm": 4.08304500579834,
"learning_rate": 1.9633681622618316e-05,
"loss": 1.6602,
"step": 1192
},
{
"epoch": 0.09173325138291334,
"grad_norm": 4.6315202713012695,
"learning_rate": 1.963306699446835e-05,
"loss": 1.7004,
"step": 1194
},
{
"epoch": 0.09188690842040566,
"grad_norm": 4.613175868988037,
"learning_rate": 1.963245236631838e-05,
"loss": 1.7607,
"step": 1196
},
{
"epoch": 0.09204056545789797,
"grad_norm": 5.473268508911133,
"learning_rate": 1.963183773816841e-05,
"loss": 1.7788,
"step": 1198
},
{
"epoch": 0.09219422249539029,
"grad_norm": 4.436158657073975,
"learning_rate": 1.963122311001844e-05,
"loss": 1.507,
"step": 1200
},
{
"epoch": 0.0923478795328826,
"grad_norm": 4.081661701202393,
"learning_rate": 1.963060848186847e-05,
"loss": 1.6724,
"step": 1202
},
{
"epoch": 0.09250153657037492,
"grad_norm": 4.4669318199157715,
"learning_rate": 1.96299938537185e-05,
"loss": 1.6776,
"step": 1204
},
{
"epoch": 0.09265519360786724,
"grad_norm": 4.446565628051758,
"learning_rate": 1.962937922556853e-05,
"loss": 1.7225,
"step": 1206
},
{
"epoch": 0.09280885064535956,
"grad_norm": 4.1054816246032715,
"learning_rate": 1.9628764597418564e-05,
"loss": 1.5964,
"step": 1208
},
{
"epoch": 0.09296250768285187,
"grad_norm": 3.9810330867767334,
"learning_rate": 1.9628149969268594e-05,
"loss": 1.6933,
"step": 1210
},
{
"epoch": 0.09311616472034419,
"grad_norm": 5.808743476867676,
"learning_rate": 1.9627535341118623e-05,
"loss": 1.796,
"step": 1212
},
{
"epoch": 0.0932698217578365,
"grad_norm": 5.4757795333862305,
"learning_rate": 1.9626920712968657e-05,
"loss": 1.6707,
"step": 1214
},
{
"epoch": 0.09342347879532882,
"grad_norm": 4.4285430908203125,
"learning_rate": 1.9626306084818686e-05,
"loss": 1.6214,
"step": 1216
},
{
"epoch": 0.09357713583282114,
"grad_norm": 4.347642421722412,
"learning_rate": 1.9625691456668716e-05,
"loss": 1.7012,
"step": 1218
},
{
"epoch": 0.09373079287031345,
"grad_norm": 4.711369037628174,
"learning_rate": 1.962507682851875e-05,
"loss": 1.6307,
"step": 1220
},
{
"epoch": 0.09388444990780578,
"grad_norm": 4.427557945251465,
"learning_rate": 1.962446220036878e-05,
"loss": 1.7988,
"step": 1222
},
{
"epoch": 0.0940381069452981,
"grad_norm": 4.078681945800781,
"learning_rate": 1.962384757221881e-05,
"loss": 1.6596,
"step": 1224
},
{
"epoch": 0.09419176398279042,
"grad_norm": 4.403939247131348,
"learning_rate": 1.9623232944068838e-05,
"loss": 1.7761,
"step": 1226
},
{
"epoch": 0.09434542102028273,
"grad_norm": 4.222808361053467,
"learning_rate": 1.962261831591887e-05,
"loss": 1.7145,
"step": 1228
},
{
"epoch": 0.09449907805777505,
"grad_norm": 4.754458904266357,
"learning_rate": 1.96220036877689e-05,
"loss": 1.6256,
"step": 1230
},
{
"epoch": 0.09465273509526737,
"grad_norm": 4.7824201583862305,
"learning_rate": 1.962138905961893e-05,
"loss": 1.7846,
"step": 1232
},
{
"epoch": 0.09480639213275968,
"grad_norm": 4.751527786254883,
"learning_rate": 1.9620774431468964e-05,
"loss": 1.6602,
"step": 1234
},
{
"epoch": 0.094960049170252,
"grad_norm": 4.74616813659668,
"learning_rate": 1.9620159803318993e-05,
"loss": 1.6707,
"step": 1236
},
{
"epoch": 0.09511370620774431,
"grad_norm": 5.099863052368164,
"learning_rate": 1.9619545175169023e-05,
"loss": 1.7293,
"step": 1238
},
{
"epoch": 0.09526736324523663,
"grad_norm": 5.293537616729736,
"learning_rate": 1.9618930547019056e-05,
"loss": 1.7172,
"step": 1240
},
{
"epoch": 0.09542102028272895,
"grad_norm": 5.443637847900391,
"learning_rate": 1.9618315918869086e-05,
"loss": 1.7516,
"step": 1242
},
{
"epoch": 0.09557467732022126,
"grad_norm": 4.447843551635742,
"learning_rate": 1.9617701290719115e-05,
"loss": 1.7449,
"step": 1244
},
{
"epoch": 0.09572833435771358,
"grad_norm": 4.490113258361816,
"learning_rate": 1.961708666256915e-05,
"loss": 1.6253,
"step": 1246
},
{
"epoch": 0.0958819913952059,
"grad_norm": 4.979306221008301,
"learning_rate": 1.9616472034419178e-05,
"loss": 1.6743,
"step": 1248
},
{
"epoch": 0.09603564843269821,
"grad_norm": 4.146381855010986,
"learning_rate": 1.961585740626921e-05,
"loss": 1.7014,
"step": 1250
},
{
"epoch": 0.09618930547019053,
"grad_norm": 4.571809768676758,
"learning_rate": 1.9615242778119237e-05,
"loss": 1.7867,
"step": 1252
},
{
"epoch": 0.09634296250768286,
"grad_norm": 4.7382988929748535,
"learning_rate": 1.961462814996927e-05,
"loss": 1.7669,
"step": 1254
},
{
"epoch": 0.09649661954517517,
"grad_norm": 4.332629203796387,
"learning_rate": 1.96140135218193e-05,
"loss": 1.8072,
"step": 1256
},
{
"epoch": 0.09665027658266749,
"grad_norm": 4.376523494720459,
"learning_rate": 1.961339889366933e-05,
"loss": 1.7595,
"step": 1258
},
{
"epoch": 0.09680393362015981,
"grad_norm": 4.876426696777344,
"learning_rate": 1.9612784265519363e-05,
"loss": 1.5174,
"step": 1260
},
{
"epoch": 0.09695759065765212,
"grad_norm": 4.8033905029296875,
"learning_rate": 1.9612169637369393e-05,
"loss": 1.7284,
"step": 1262
},
{
"epoch": 0.09711124769514444,
"grad_norm": 4.221518039703369,
"learning_rate": 1.9611555009219422e-05,
"loss": 1.7409,
"step": 1264
},
{
"epoch": 0.09726490473263676,
"grad_norm": 4.445687294006348,
"learning_rate": 1.9610940381069455e-05,
"loss": 1.6211,
"step": 1266
},
{
"epoch": 0.09741856177012907,
"grad_norm": 4.590234279632568,
"learning_rate": 1.9610325752919485e-05,
"loss": 1.5695,
"step": 1268
},
{
"epoch": 0.09757221880762139,
"grad_norm": 5.60252571105957,
"learning_rate": 1.9609711124769518e-05,
"loss": 1.6369,
"step": 1270
},
{
"epoch": 0.0977258758451137,
"grad_norm": 4.938035011291504,
"learning_rate": 1.9609096496619548e-05,
"loss": 1.7329,
"step": 1272
},
{
"epoch": 0.09787953288260602,
"grad_norm": 5.1367106437683105,
"learning_rate": 1.9608481868469578e-05,
"loss": 1.8568,
"step": 1274
},
{
"epoch": 0.09803318992009834,
"grad_norm": 4.405098915100098,
"learning_rate": 1.960786724031961e-05,
"loss": 1.6226,
"step": 1276
},
{
"epoch": 0.09818684695759065,
"grad_norm": 5.822478771209717,
"learning_rate": 1.9607252612169637e-05,
"loss": 1.7561,
"step": 1278
},
{
"epoch": 0.09834050399508297,
"grad_norm": 4.770538806915283,
"learning_rate": 1.960663798401967e-05,
"loss": 1.6214,
"step": 1280
},
{
"epoch": 0.09849416103257529,
"grad_norm": 6.316437244415283,
"learning_rate": 1.96060233558697e-05,
"loss": 1.8268,
"step": 1282
},
{
"epoch": 0.0986478180700676,
"grad_norm": 4.640567302703857,
"learning_rate": 1.960540872771973e-05,
"loss": 1.7484,
"step": 1284
},
{
"epoch": 0.09880147510755993,
"grad_norm": 4.596543312072754,
"learning_rate": 1.9604794099569762e-05,
"loss": 1.6986,
"step": 1286
},
{
"epoch": 0.09895513214505225,
"grad_norm": 4.36724328994751,
"learning_rate": 1.9604179471419792e-05,
"loss": 1.6108,
"step": 1288
},
{
"epoch": 0.09910878918254457,
"grad_norm": 5.017337322235107,
"learning_rate": 1.9603564843269825e-05,
"loss": 1.7047,
"step": 1290
},
{
"epoch": 0.09926244622003688,
"grad_norm": 4.327188491821289,
"learning_rate": 1.9602950215119855e-05,
"loss": 1.6083,
"step": 1292
},
{
"epoch": 0.0994161032575292,
"grad_norm": 5.734022617340088,
"learning_rate": 1.9602335586969885e-05,
"loss": 1.7501,
"step": 1294
},
{
"epoch": 0.09956976029502151,
"grad_norm": 4.524082183837891,
"learning_rate": 1.9601720958819918e-05,
"loss": 1.7556,
"step": 1296
},
{
"epoch": 0.09972341733251383,
"grad_norm": 4.61295747756958,
"learning_rate": 1.9601106330669947e-05,
"loss": 1.6598,
"step": 1298
},
{
"epoch": 0.09987707437000615,
"grad_norm": 4.453684329986572,
"learning_rate": 1.9600491702519977e-05,
"loss": 1.6554,
"step": 1300
},
{
"epoch": 0.10003073140749846,
"grad_norm": 4.732148170471191,
"learning_rate": 1.959987707437001e-05,
"loss": 1.7181,
"step": 1302
},
{
"epoch": 0.10018438844499078,
"grad_norm": 4.715574741363525,
"learning_rate": 1.9599262446220036e-05,
"loss": 1.6849,
"step": 1304
},
{
"epoch": 0.1003380454824831,
"grad_norm": 4.356414318084717,
"learning_rate": 1.959864781807007e-05,
"loss": 1.687,
"step": 1306
},
{
"epoch": 0.10049170251997541,
"grad_norm": 4.813374996185303,
"learning_rate": 1.95980331899201e-05,
"loss": 1.5262,
"step": 1308
},
{
"epoch": 0.10064535955746773,
"grad_norm": 4.9926981925964355,
"learning_rate": 1.959741856177013e-05,
"loss": 1.8137,
"step": 1310
},
{
"epoch": 0.10079901659496004,
"grad_norm": 5.103787422180176,
"learning_rate": 1.9596803933620162e-05,
"loss": 1.64,
"step": 1312
},
{
"epoch": 0.10095267363245236,
"grad_norm": 4.895768165588379,
"learning_rate": 1.959618930547019e-05,
"loss": 1.693,
"step": 1314
},
{
"epoch": 0.10110633066994468,
"grad_norm": 4.513470649719238,
"learning_rate": 1.9595574677320225e-05,
"loss": 1.5745,
"step": 1316
},
{
"epoch": 0.10125998770743701,
"grad_norm": 5.475149154663086,
"learning_rate": 1.9594960049170254e-05,
"loss": 1.6189,
"step": 1318
},
{
"epoch": 0.10141364474492932,
"grad_norm": 4.828972339630127,
"learning_rate": 1.9594345421020284e-05,
"loss": 1.6547,
"step": 1320
},
{
"epoch": 0.10156730178242164,
"grad_norm": 5.218929290771484,
"learning_rate": 1.9593730792870317e-05,
"loss": 1.6494,
"step": 1322
},
{
"epoch": 0.10172095881991396,
"grad_norm": 4.358766078948975,
"learning_rate": 1.9593116164720343e-05,
"loss": 1.7283,
"step": 1324
},
{
"epoch": 0.10187461585740627,
"grad_norm": 4.14285135269165,
"learning_rate": 1.9592501536570376e-05,
"loss": 1.7769,
"step": 1326
},
{
"epoch": 0.10202827289489859,
"grad_norm": 4.319285869598389,
"learning_rate": 1.9591886908420406e-05,
"loss": 1.4707,
"step": 1328
},
{
"epoch": 0.1021819299323909,
"grad_norm": 5.230128288269043,
"learning_rate": 1.9591272280270436e-05,
"loss": 1.4906,
"step": 1330
},
{
"epoch": 0.10233558696988322,
"grad_norm": 5.243448257446289,
"learning_rate": 1.959065765212047e-05,
"loss": 1.8825,
"step": 1332
},
{
"epoch": 0.10248924400737554,
"grad_norm": 4.784072399139404,
"learning_rate": 1.95900430239705e-05,
"loss": 1.7553,
"step": 1334
},
{
"epoch": 0.10264290104486785,
"grad_norm": 5.595427513122559,
"learning_rate": 1.958942839582053e-05,
"loss": 1.6913,
"step": 1336
},
{
"epoch": 0.10279655808236017,
"grad_norm": 4.856276512145996,
"learning_rate": 1.958881376767056e-05,
"loss": 1.7659,
"step": 1338
},
{
"epoch": 0.10295021511985249,
"grad_norm": 5.188042640686035,
"learning_rate": 1.958819913952059e-05,
"loss": 1.8205,
"step": 1340
},
{
"epoch": 0.1031038721573448,
"grad_norm": 4.261306285858154,
"learning_rate": 1.9587584511370624e-05,
"loss": 1.5887,
"step": 1342
},
{
"epoch": 0.10325752919483712,
"grad_norm": 4.269975185394287,
"learning_rate": 1.9586969883220654e-05,
"loss": 1.5869,
"step": 1344
},
{
"epoch": 0.10341118623232944,
"grad_norm": 5.029308795928955,
"learning_rate": 1.9586355255070683e-05,
"loss": 1.8049,
"step": 1346
},
{
"epoch": 0.10356484326982175,
"grad_norm": 4.857789516448975,
"learning_rate": 1.9585740626920716e-05,
"loss": 1.5154,
"step": 1348
},
{
"epoch": 0.10371850030731407,
"grad_norm": 4.701939582824707,
"learning_rate": 1.9585125998770743e-05,
"loss": 1.6822,
"step": 1350
},
{
"epoch": 0.1038721573448064,
"grad_norm": 4.069787979125977,
"learning_rate": 1.9584511370620776e-05,
"loss": 1.7238,
"step": 1352
},
{
"epoch": 0.10402581438229871,
"grad_norm": 4.703420162200928,
"learning_rate": 1.9583896742470806e-05,
"loss": 1.6951,
"step": 1354
},
{
"epoch": 0.10417947141979103,
"grad_norm": 4.920733451843262,
"learning_rate": 1.958328211432084e-05,
"loss": 1.6335,
"step": 1356
},
{
"epoch": 0.10433312845728335,
"grad_norm": 4.38323974609375,
"learning_rate": 1.9582667486170868e-05,
"loss": 1.7094,
"step": 1358
},
{
"epoch": 0.10448678549477566,
"grad_norm": 4.646501541137695,
"learning_rate": 1.9582052858020898e-05,
"loss": 1.6878,
"step": 1360
},
{
"epoch": 0.10464044253226798,
"grad_norm": 4.569819450378418,
"learning_rate": 1.958143822987093e-05,
"loss": 1.7198,
"step": 1362
},
{
"epoch": 0.1047940995697603,
"grad_norm": 4.552595615386963,
"learning_rate": 1.958082360172096e-05,
"loss": 1.7335,
"step": 1364
},
{
"epoch": 0.10494775660725261,
"grad_norm": 3.9051506519317627,
"learning_rate": 1.958020897357099e-05,
"loss": 1.5694,
"step": 1366
},
{
"epoch": 0.10510141364474493,
"grad_norm": 3.9420411586761475,
"learning_rate": 1.9579594345421023e-05,
"loss": 1.7298,
"step": 1368
},
{
"epoch": 0.10525507068223725,
"grad_norm": 4.996294021606445,
"learning_rate": 1.9578979717271053e-05,
"loss": 1.7769,
"step": 1370
},
{
"epoch": 0.10540872771972956,
"grad_norm": 4.845794677734375,
"learning_rate": 1.9578365089121083e-05,
"loss": 1.694,
"step": 1372
},
{
"epoch": 0.10556238475722188,
"grad_norm": 4.156089782714844,
"learning_rate": 1.9577750460971116e-05,
"loss": 1.5621,
"step": 1374
},
{
"epoch": 0.1057160417947142,
"grad_norm": 5.298906326293945,
"learning_rate": 1.9577135832821146e-05,
"loss": 1.6016,
"step": 1376
},
{
"epoch": 0.10586969883220651,
"grad_norm": 4.974923610687256,
"learning_rate": 1.9576521204671175e-05,
"loss": 1.9024,
"step": 1378
},
{
"epoch": 0.10602335586969883,
"grad_norm": 4.5802998542785645,
"learning_rate": 1.9575906576521205e-05,
"loss": 1.8249,
"step": 1380
},
{
"epoch": 0.10617701290719114,
"grad_norm": 5.364488124847412,
"learning_rate": 1.9575291948371238e-05,
"loss": 1.6205,
"step": 1382
},
{
"epoch": 0.10633066994468347,
"grad_norm": 4.810891151428223,
"learning_rate": 1.9574677320221268e-05,
"loss": 1.6702,
"step": 1384
},
{
"epoch": 0.10648432698217579,
"grad_norm": 5.155327320098877,
"learning_rate": 1.9574062692071297e-05,
"loss": 1.6251,
"step": 1386
},
{
"epoch": 0.1066379840196681,
"grad_norm": 4.292688369750977,
"learning_rate": 1.957344806392133e-05,
"loss": 1.505,
"step": 1388
},
{
"epoch": 0.10679164105716042,
"grad_norm": 4.611319541931152,
"learning_rate": 1.957283343577136e-05,
"loss": 1.7301,
"step": 1390
},
{
"epoch": 0.10694529809465274,
"grad_norm": 4.324422359466553,
"learning_rate": 1.957221880762139e-05,
"loss": 1.6551,
"step": 1392
},
{
"epoch": 0.10709895513214505,
"grad_norm": 4.826112747192383,
"learning_rate": 1.9571604179471423e-05,
"loss": 1.607,
"step": 1394
},
{
"epoch": 0.10725261216963737,
"grad_norm": 4.303924560546875,
"learning_rate": 1.9570989551321453e-05,
"loss": 1.6128,
"step": 1396
},
{
"epoch": 0.10740626920712969,
"grad_norm": 5.093891620635986,
"learning_rate": 1.9570374923171482e-05,
"loss": 1.6747,
"step": 1398
},
{
"epoch": 0.107559926244622,
"grad_norm": 4.303253650665283,
"learning_rate": 1.9569760295021515e-05,
"loss": 1.5981,
"step": 1400
},
{
"epoch": 0.10771358328211432,
"grad_norm": 4.165480613708496,
"learning_rate": 1.9569145666871545e-05,
"loss": 1.5993,
"step": 1402
},
{
"epoch": 0.10786724031960664,
"grad_norm": 4.655346393585205,
"learning_rate": 1.9568531038721575e-05,
"loss": 1.6806,
"step": 1404
},
{
"epoch": 0.10802089735709895,
"grad_norm": 4.743736743927002,
"learning_rate": 1.9567916410571604e-05,
"loss": 1.6624,
"step": 1406
},
{
"epoch": 0.10817455439459127,
"grad_norm": 4.2791643142700195,
"learning_rate": 1.9567301782421637e-05,
"loss": 1.6776,
"step": 1408
},
{
"epoch": 0.10832821143208358,
"grad_norm": 5.005465030670166,
"learning_rate": 1.9566687154271667e-05,
"loss": 1.5962,
"step": 1410
},
{
"epoch": 0.1084818684695759,
"grad_norm": 4.345304012298584,
"learning_rate": 1.9566072526121697e-05,
"loss": 1.5865,
"step": 1412
},
{
"epoch": 0.10863552550706822,
"grad_norm": 3.8712103366851807,
"learning_rate": 1.956545789797173e-05,
"loss": 1.6583,
"step": 1414
},
{
"epoch": 0.10878918254456055,
"grad_norm": 4.381411075592041,
"learning_rate": 1.956484326982176e-05,
"loss": 1.6203,
"step": 1416
},
{
"epoch": 0.10894283958205286,
"grad_norm": 3.933609962463379,
"learning_rate": 1.956422864167179e-05,
"loss": 1.7631,
"step": 1418
},
{
"epoch": 0.10909649661954518,
"grad_norm": 5.570189952850342,
"learning_rate": 1.9563614013521822e-05,
"loss": 1.7173,
"step": 1420
},
{
"epoch": 0.1092501536570375,
"grad_norm": 4.816314220428467,
"learning_rate": 1.9562999385371852e-05,
"loss": 1.6941,
"step": 1422
},
{
"epoch": 0.10940381069452981,
"grad_norm": 4.110052585601807,
"learning_rate": 1.9562384757221882e-05,
"loss": 1.6782,
"step": 1424
},
{
"epoch": 0.10955746773202213,
"grad_norm": 4.069727420806885,
"learning_rate": 1.956177012907191e-05,
"loss": 1.657,
"step": 1426
},
{
"epoch": 0.10971112476951445,
"grad_norm": 5.244446277618408,
"learning_rate": 1.9561155500921944e-05,
"loss": 1.683,
"step": 1428
},
{
"epoch": 0.10986478180700676,
"grad_norm": 5.359142780303955,
"learning_rate": 1.9560540872771974e-05,
"loss": 1.6764,
"step": 1430
},
{
"epoch": 0.11001843884449908,
"grad_norm": 5.057417869567871,
"learning_rate": 1.9559926244622004e-05,
"loss": 1.787,
"step": 1432
},
{
"epoch": 0.1101720958819914,
"grad_norm": 4.59893274307251,
"learning_rate": 1.9559311616472037e-05,
"loss": 1.6119,
"step": 1434
},
{
"epoch": 0.11032575291948371,
"grad_norm": 4.8417744636535645,
"learning_rate": 1.9558696988322067e-05,
"loss": 1.7965,
"step": 1436
},
{
"epoch": 0.11047940995697603,
"grad_norm": 4.829365253448486,
"learning_rate": 1.9558082360172096e-05,
"loss": 1.6125,
"step": 1438
},
{
"epoch": 0.11063306699446834,
"grad_norm": 4.74966287612915,
"learning_rate": 1.955746773202213e-05,
"loss": 1.5761,
"step": 1440
},
{
"epoch": 0.11078672403196066,
"grad_norm": 4.8681535720825195,
"learning_rate": 1.955685310387216e-05,
"loss": 1.5977,
"step": 1442
},
{
"epoch": 0.11094038106945298,
"grad_norm": 4.576766014099121,
"learning_rate": 1.955623847572219e-05,
"loss": 1.6297,
"step": 1444
},
{
"epoch": 0.11109403810694529,
"grad_norm": 4.206700325012207,
"learning_rate": 1.9555623847572222e-05,
"loss": 1.6246,
"step": 1446
},
{
"epoch": 0.11124769514443761,
"grad_norm": 4.753570079803467,
"learning_rate": 1.955500921942225e-05,
"loss": 1.6627,
"step": 1448
},
{
"epoch": 0.11140135218192994,
"grad_norm": 4.992982864379883,
"learning_rate": 1.955439459127228e-05,
"loss": 1.7223,
"step": 1450
},
{
"epoch": 0.11155500921942225,
"grad_norm": 4.912965297698975,
"learning_rate": 1.955377996312231e-05,
"loss": 1.5616,
"step": 1452
},
{
"epoch": 0.11170866625691457,
"grad_norm": 4.4759840965271,
"learning_rate": 1.9553165334972344e-05,
"loss": 1.5403,
"step": 1454
},
{
"epoch": 0.11186232329440689,
"grad_norm": 5.181031703948975,
"learning_rate": 1.9552550706822374e-05,
"loss": 1.6365,
"step": 1456
},
{
"epoch": 0.1120159803318992,
"grad_norm": 4.845396518707275,
"learning_rate": 1.9551936078672403e-05,
"loss": 1.6279,
"step": 1458
},
{
"epoch": 0.11216963736939152,
"grad_norm": 4.756799221038818,
"learning_rate": 1.9551321450522436e-05,
"loss": 1.6919,
"step": 1460
},
{
"epoch": 0.11232329440688384,
"grad_norm": 5.1768107414245605,
"learning_rate": 1.9550706822372466e-05,
"loss": 1.5765,
"step": 1462
},
{
"epoch": 0.11247695144437615,
"grad_norm": 4.743069648742676,
"learning_rate": 1.9550092194222496e-05,
"loss": 1.8469,
"step": 1464
},
{
"epoch": 0.11263060848186847,
"grad_norm": 4.831038951873779,
"learning_rate": 1.954947756607253e-05,
"loss": 1.7746,
"step": 1466
},
{
"epoch": 0.11278426551936079,
"grad_norm": 4.309507846832275,
"learning_rate": 1.954886293792256e-05,
"loss": 1.5142,
"step": 1468
},
{
"epoch": 0.1129379225568531,
"grad_norm": 55.4850959777832,
"learning_rate": 1.9548248309772588e-05,
"loss": 1.683,
"step": 1470
},
{
"epoch": 0.11309157959434542,
"grad_norm": 4.364781856536865,
"learning_rate": 1.954763368162262e-05,
"loss": 1.8977,
"step": 1472
},
{
"epoch": 0.11324523663183773,
"grad_norm": 4.795863151550293,
"learning_rate": 1.954701905347265e-05,
"loss": 1.6436,
"step": 1474
},
{
"epoch": 0.11339889366933005,
"grad_norm": 4.47898530960083,
"learning_rate": 1.954640442532268e-05,
"loss": 1.7043,
"step": 1476
},
{
"epoch": 0.11355255070682237,
"grad_norm": 5.761251926422119,
"learning_rate": 1.954578979717271e-05,
"loss": 1.6943,
"step": 1478
},
{
"epoch": 0.11370620774431468,
"grad_norm": 5.274534225463867,
"learning_rate": 1.9545175169022743e-05,
"loss": 1.6005,
"step": 1480
},
{
"epoch": 0.11385986478180701,
"grad_norm": 4.412993431091309,
"learning_rate": 1.9544560540872773e-05,
"loss": 1.4977,
"step": 1482
},
{
"epoch": 0.11401352181929933,
"grad_norm": 4.08578634262085,
"learning_rate": 1.9543945912722803e-05,
"loss": 1.6459,
"step": 1484
},
{
"epoch": 0.11416717885679165,
"grad_norm": 3.7015979290008545,
"learning_rate": 1.9543331284572836e-05,
"loss": 1.5978,
"step": 1486
},
{
"epoch": 0.11432083589428396,
"grad_norm": 4.919078826904297,
"learning_rate": 1.9542716656422865e-05,
"loss": 1.5456,
"step": 1488
},
{
"epoch": 0.11447449293177628,
"grad_norm": 4.756066799163818,
"learning_rate": 1.9542102028272895e-05,
"loss": 1.7221,
"step": 1490
},
{
"epoch": 0.1146281499692686,
"grad_norm": 4.4432525634765625,
"learning_rate": 1.9541487400122928e-05,
"loss": 1.5918,
"step": 1492
},
{
"epoch": 0.11478180700676091,
"grad_norm": 5.371875286102295,
"learning_rate": 1.9540872771972958e-05,
"loss": 1.7009,
"step": 1494
},
{
"epoch": 0.11493546404425323,
"grad_norm": 3.5335211753845215,
"learning_rate": 1.9540258143822988e-05,
"loss": 1.6021,
"step": 1496
},
{
"epoch": 0.11508912108174554,
"grad_norm": 4.77205753326416,
"learning_rate": 1.953964351567302e-05,
"loss": 1.5721,
"step": 1498
},
{
"epoch": 0.11524277811923786,
"grad_norm": 5.020537376403809,
"learning_rate": 1.953902888752305e-05,
"loss": 1.6367,
"step": 1500
},
{
"epoch": 0.11539643515673018,
"grad_norm": 4.866142272949219,
"learning_rate": 1.9538414259373083e-05,
"loss": 1.6035,
"step": 1502
},
{
"epoch": 0.11555009219422249,
"grad_norm": 3.647397756576538,
"learning_rate": 1.953779963122311e-05,
"loss": 1.5274,
"step": 1504
},
{
"epoch": 0.11570374923171481,
"grad_norm": 3.999390125274658,
"learning_rate": 1.9537185003073143e-05,
"loss": 1.7818,
"step": 1506
},
{
"epoch": 0.11585740626920712,
"grad_norm": 4.787381172180176,
"learning_rate": 1.9536570374923172e-05,
"loss": 1.7033,
"step": 1508
},
{
"epoch": 0.11601106330669944,
"grad_norm": 4.415989398956299,
"learning_rate": 1.9535955746773202e-05,
"loss": 1.831,
"step": 1510
},
{
"epoch": 0.11616472034419176,
"grad_norm": 4.548354148864746,
"learning_rate": 1.9535341118623235e-05,
"loss": 1.5145,
"step": 1512
},
{
"epoch": 0.11631837738168409,
"grad_norm": 5.4493560791015625,
"learning_rate": 1.9534726490473265e-05,
"loss": 1.876,
"step": 1514
},
{
"epoch": 0.1164720344191764,
"grad_norm": 3.9988834857940674,
"learning_rate": 1.9534111862323295e-05,
"loss": 1.6325,
"step": 1516
},
{
"epoch": 0.11662569145666872,
"grad_norm": 4.861139297485352,
"learning_rate": 1.9533497234173328e-05,
"loss": 1.6743,
"step": 1518
},
{
"epoch": 0.11677934849416104,
"grad_norm": 5.388833522796631,
"learning_rate": 1.9532882606023357e-05,
"loss": 1.5854,
"step": 1520
},
{
"epoch": 0.11693300553165335,
"grad_norm": 4.772726058959961,
"learning_rate": 1.953226797787339e-05,
"loss": 1.6481,
"step": 1522
},
{
"epoch": 0.11708666256914567,
"grad_norm": 4.285337924957275,
"learning_rate": 1.9531653349723417e-05,
"loss": 1.7054,
"step": 1524
},
{
"epoch": 0.11724031960663799,
"grad_norm": 4.5872626304626465,
"learning_rate": 1.953103872157345e-05,
"loss": 1.6569,
"step": 1526
},
{
"epoch": 0.1173939766441303,
"grad_norm": 4.3280463218688965,
"learning_rate": 1.9530424093423483e-05,
"loss": 1.5277,
"step": 1528
},
{
"epoch": 0.11754763368162262,
"grad_norm": 4.480382919311523,
"learning_rate": 1.952980946527351e-05,
"loss": 1.7289,
"step": 1530
},
{
"epoch": 0.11770129071911493,
"grad_norm": 4.207196235656738,
"learning_rate": 1.9529194837123542e-05,
"loss": 1.803,
"step": 1532
},
{
"epoch": 0.11785494775660725,
"grad_norm": 4.125123023986816,
"learning_rate": 1.9528580208973572e-05,
"loss": 1.6289,
"step": 1534
},
{
"epoch": 0.11800860479409957,
"grad_norm": 6.329103469848633,
"learning_rate": 1.95279655808236e-05,
"loss": 1.6592,
"step": 1536
},
{
"epoch": 0.11816226183159188,
"grad_norm": 4.436602592468262,
"learning_rate": 1.9527350952673635e-05,
"loss": 1.5904,
"step": 1538
},
{
"epoch": 0.1183159188690842,
"grad_norm": 4.564888954162598,
"learning_rate": 1.9526736324523664e-05,
"loss": 1.7565,
"step": 1540
},
{
"epoch": 0.11846957590657652,
"grad_norm": 4.3771514892578125,
"learning_rate": 1.9526121696373697e-05,
"loss": 1.6746,
"step": 1542
},
{
"epoch": 0.11862323294406883,
"grad_norm": 4.449161529541016,
"learning_rate": 1.9525507068223727e-05,
"loss": 1.6377,
"step": 1544
},
{
"epoch": 0.11877688998156116,
"grad_norm": 4.770364761352539,
"learning_rate": 1.9524892440073757e-05,
"loss": 1.5953,
"step": 1546
},
{
"epoch": 0.11893054701905348,
"grad_norm": 4.0749640464782715,
"learning_rate": 1.952427781192379e-05,
"loss": 1.5511,
"step": 1548
},
{
"epoch": 0.1190842040565458,
"grad_norm": 4.361663341522217,
"learning_rate": 1.9523663183773816e-05,
"loss": 1.5562,
"step": 1550
},
{
"epoch": 0.11923786109403811,
"grad_norm": 4.269155025482178,
"learning_rate": 1.952304855562385e-05,
"loss": 1.5725,
"step": 1552
},
{
"epoch": 0.11939151813153043,
"grad_norm": 4.128551483154297,
"learning_rate": 1.952243392747388e-05,
"loss": 1.5,
"step": 1554
},
{
"epoch": 0.11954517516902274,
"grad_norm": 4.763240814208984,
"learning_rate": 1.952181929932391e-05,
"loss": 1.7572,
"step": 1556
},
{
"epoch": 0.11969883220651506,
"grad_norm": 4.871914386749268,
"learning_rate": 1.952120467117394e-05,
"loss": 1.609,
"step": 1558
},
{
"epoch": 0.11985248924400738,
"grad_norm": 4.267725467681885,
"learning_rate": 1.952059004302397e-05,
"loss": 1.5832,
"step": 1560
},
{
"epoch": 0.12000614628149969,
"grad_norm": 4.569482326507568,
"learning_rate": 1.9519975414874e-05,
"loss": 1.6262,
"step": 1562
},
{
"epoch": 0.12015980331899201,
"grad_norm": 4.285094261169434,
"learning_rate": 1.9519360786724034e-05,
"loss": 1.7479,
"step": 1564
},
{
"epoch": 0.12031346035648433,
"grad_norm": 4.529351234436035,
"learning_rate": 1.9518746158574064e-05,
"loss": 1.6297,
"step": 1566
},
{
"epoch": 0.12046711739397664,
"grad_norm": 4.966389179229736,
"learning_rate": 1.9518131530424097e-05,
"loss": 1.7544,
"step": 1568
},
{
"epoch": 0.12062077443146896,
"grad_norm": 4.608340263366699,
"learning_rate": 1.9517516902274127e-05,
"loss": 1.4635,
"step": 1570
},
{
"epoch": 0.12077443146896127,
"grad_norm": 3.8790552616119385,
"learning_rate": 1.9516902274124156e-05,
"loss": 1.6345,
"step": 1572
},
{
"epoch": 0.12092808850645359,
"grad_norm": 5.229369163513184,
"learning_rate": 1.951628764597419e-05,
"loss": 1.6829,
"step": 1574
},
{
"epoch": 0.1210817455439459,
"grad_norm": 4.269663333892822,
"learning_rate": 1.9515673017824216e-05,
"loss": 1.568,
"step": 1576
},
{
"epoch": 0.12123540258143822,
"grad_norm": 4.905238151550293,
"learning_rate": 1.951505838967425e-05,
"loss": 1.671,
"step": 1578
},
{
"epoch": 0.12138905961893055,
"grad_norm": 4.5513596534729,
"learning_rate": 1.951444376152428e-05,
"loss": 1.6636,
"step": 1580
},
{
"epoch": 0.12154271665642287,
"grad_norm": 4.586058616638184,
"learning_rate": 1.9513829133374308e-05,
"loss": 1.7669,
"step": 1582
},
{
"epoch": 0.12169637369391519,
"grad_norm": 5.4855170249938965,
"learning_rate": 1.951321450522434e-05,
"loss": 1.5033,
"step": 1584
},
{
"epoch": 0.1218500307314075,
"grad_norm": 4.668776035308838,
"learning_rate": 1.951259987707437e-05,
"loss": 1.5859,
"step": 1586
},
{
"epoch": 0.12200368776889982,
"grad_norm": 3.9210376739501953,
"learning_rate": 1.9511985248924404e-05,
"loss": 1.5757,
"step": 1588
},
{
"epoch": 0.12215734480639213,
"grad_norm": 4.558568000793457,
"learning_rate": 1.9511370620774434e-05,
"loss": 1.5945,
"step": 1590
},
{
"epoch": 0.12231100184388445,
"grad_norm": 4.247246265411377,
"learning_rate": 1.9510755992624463e-05,
"loss": 1.624,
"step": 1592
},
{
"epoch": 0.12246465888137677,
"grad_norm": 4.2471604347229,
"learning_rate": 1.9510141364474496e-05,
"loss": 1.5873,
"step": 1594
},
{
"epoch": 0.12261831591886908,
"grad_norm": 4.362886428833008,
"learning_rate": 1.9509526736324526e-05,
"loss": 1.7448,
"step": 1596
},
{
"epoch": 0.1227719729563614,
"grad_norm": 5.111678123474121,
"learning_rate": 1.9508912108174556e-05,
"loss": 1.8134,
"step": 1598
},
{
"epoch": 0.12292562999385372,
"grad_norm": 4.4582624435424805,
"learning_rate": 1.950829748002459e-05,
"loss": 1.7155,
"step": 1600
},
{
"epoch": 0.12307928703134603,
"grad_norm": 3.796780586242676,
"learning_rate": 1.9507682851874615e-05,
"loss": 1.5636,
"step": 1602
},
{
"epoch": 0.12323294406883835,
"grad_norm": 4.517824649810791,
"learning_rate": 1.9507068223724648e-05,
"loss": 1.6092,
"step": 1604
},
{
"epoch": 0.12338660110633067,
"grad_norm": 4.659684181213379,
"learning_rate": 1.9506453595574678e-05,
"loss": 1.641,
"step": 1606
},
{
"epoch": 0.12354025814382298,
"grad_norm": 4.470782279968262,
"learning_rate": 1.950583896742471e-05,
"loss": 1.618,
"step": 1608
},
{
"epoch": 0.1236939151813153,
"grad_norm": 4.486400604248047,
"learning_rate": 1.950522433927474e-05,
"loss": 1.6912,
"step": 1610
},
{
"epoch": 0.12384757221880763,
"grad_norm": 4.459258556365967,
"learning_rate": 1.950460971112477e-05,
"loss": 1.5627,
"step": 1612
},
{
"epoch": 0.12400122925629994,
"grad_norm": 4.486885070800781,
"learning_rate": 1.9503995082974803e-05,
"loss": 1.8642,
"step": 1614
},
{
"epoch": 0.12415488629379226,
"grad_norm": 4.576472282409668,
"learning_rate": 1.9503380454824833e-05,
"loss": 1.6411,
"step": 1616
},
{
"epoch": 0.12430854333128458,
"grad_norm": 4.349391460418701,
"learning_rate": 1.9502765826674863e-05,
"loss": 1.6382,
"step": 1618
},
{
"epoch": 0.12446220036877689,
"grad_norm": 4.264526844024658,
"learning_rate": 1.9502151198524896e-05,
"loss": 1.621,
"step": 1620
},
{
"epoch": 0.12461585740626921,
"grad_norm": 4.798770904541016,
"learning_rate": 1.9501536570374925e-05,
"loss": 1.8124,
"step": 1622
},
{
"epoch": 0.12476951444376153,
"grad_norm": 3.747992515563965,
"learning_rate": 1.9500921942224955e-05,
"loss": 1.516,
"step": 1624
},
{
"epoch": 0.12492317148125384,
"grad_norm": 4.410411834716797,
"learning_rate": 1.9500307314074988e-05,
"loss": 1.5645,
"step": 1626
},
{
"epoch": 0.12507682851874616,
"grad_norm": 4.139060020446777,
"learning_rate": 1.9499692685925018e-05,
"loss": 1.6217,
"step": 1628
},
{
"epoch": 0.12523048555623847,
"grad_norm": 4.380125045776367,
"learning_rate": 1.9499078057775048e-05,
"loss": 1.6909,
"step": 1630
},
{
"epoch": 0.1253841425937308,
"grad_norm": 4.449796676635742,
"learning_rate": 1.9498463429625077e-05,
"loss": 1.7215,
"step": 1632
},
{
"epoch": 0.1255377996312231,
"grad_norm": 4.043376922607422,
"learning_rate": 1.949784880147511e-05,
"loss": 1.6326,
"step": 1634
},
{
"epoch": 0.12569145666871542,
"grad_norm": 4.427875518798828,
"learning_rate": 1.949723417332514e-05,
"loss": 1.6962,
"step": 1636
},
{
"epoch": 0.12584511370620774,
"grad_norm": 4.617554187774658,
"learning_rate": 1.949661954517517e-05,
"loss": 1.5711,
"step": 1638
},
{
"epoch": 0.12599877074370006,
"grad_norm": 4.245482444763184,
"learning_rate": 1.9496004917025203e-05,
"loss": 1.6479,
"step": 1640
},
{
"epoch": 0.12615242778119237,
"grad_norm": 4.876771926879883,
"learning_rate": 1.9495390288875232e-05,
"loss": 1.7238,
"step": 1642
},
{
"epoch": 0.1263060848186847,
"grad_norm": 4.263737678527832,
"learning_rate": 1.9494775660725262e-05,
"loss": 1.5666,
"step": 1644
},
{
"epoch": 0.126459741856177,
"grad_norm": 6.202945232391357,
"learning_rate": 1.9494161032575295e-05,
"loss": 1.6217,
"step": 1646
},
{
"epoch": 0.12661339889366932,
"grad_norm": 4.307828426361084,
"learning_rate": 1.9493546404425325e-05,
"loss": 1.49,
"step": 1648
},
{
"epoch": 0.12676705593116164,
"grad_norm": 4.122886657714844,
"learning_rate": 1.9492931776275355e-05,
"loss": 1.7121,
"step": 1650
},
{
"epoch": 0.12692071296865395,
"grad_norm": 4.3632426261901855,
"learning_rate": 1.9492317148125384e-05,
"loss": 1.6835,
"step": 1652
},
{
"epoch": 0.12707437000614627,
"grad_norm": 4.4186625480651855,
"learning_rate": 1.9491702519975417e-05,
"loss": 1.7579,
"step": 1654
},
{
"epoch": 0.1272280270436386,
"grad_norm": 4.411682605743408,
"learning_rate": 1.9491087891825447e-05,
"loss": 1.5771,
"step": 1656
},
{
"epoch": 0.1273816840811309,
"grad_norm": 4.259854316711426,
"learning_rate": 1.9490473263675477e-05,
"loss": 1.6239,
"step": 1658
},
{
"epoch": 0.12753534111862325,
"grad_norm": 4.225386619567871,
"learning_rate": 1.948985863552551e-05,
"loss": 1.6777,
"step": 1660
},
{
"epoch": 0.12768899815611556,
"grad_norm": 4.977676868438721,
"learning_rate": 1.948924400737554e-05,
"loss": 1.6166,
"step": 1662
},
{
"epoch": 0.12784265519360788,
"grad_norm": 3.7306509017944336,
"learning_rate": 1.948862937922557e-05,
"loss": 1.5834,
"step": 1664
},
{
"epoch": 0.1279963122311002,
"grad_norm": 4.451853275299072,
"learning_rate": 1.9488014751075602e-05,
"loss": 1.6464,
"step": 1666
},
{
"epoch": 0.1281499692685925,
"grad_norm": 4.641234397888184,
"learning_rate": 1.9487400122925632e-05,
"loss": 1.6698,
"step": 1668
},
{
"epoch": 0.12830362630608483,
"grad_norm": 5.218206882476807,
"learning_rate": 1.948678549477566e-05,
"loss": 1.6614,
"step": 1670
},
{
"epoch": 0.12845728334357714,
"grad_norm": 4.623648166656494,
"learning_rate": 1.9486170866625695e-05,
"loss": 1.6586,
"step": 1672
},
{
"epoch": 0.12861094038106946,
"grad_norm": 5.1708478927612305,
"learning_rate": 1.9485556238475724e-05,
"loss": 1.6275,
"step": 1674
},
{
"epoch": 0.12876459741856178,
"grad_norm": 4.305856227874756,
"learning_rate": 1.9484941610325754e-05,
"loss": 1.6349,
"step": 1676
},
{
"epoch": 0.1289182544560541,
"grad_norm": 4.788485050201416,
"learning_rate": 1.9484326982175784e-05,
"loss": 1.4676,
"step": 1678
},
{
"epoch": 0.1290719114935464,
"grad_norm": 4.4581379890441895,
"learning_rate": 1.9483712354025817e-05,
"loss": 1.5062,
"step": 1680
},
{
"epoch": 0.12922556853103873,
"grad_norm": 3.9021549224853516,
"learning_rate": 1.9483097725875846e-05,
"loss": 1.7848,
"step": 1682
},
{
"epoch": 0.12937922556853104,
"grad_norm": 4.530584812164307,
"learning_rate": 1.9482483097725876e-05,
"loss": 1.6594,
"step": 1684
},
{
"epoch": 0.12953288260602336,
"grad_norm": 4.8017497062683105,
"learning_rate": 1.948186846957591e-05,
"loss": 1.6167,
"step": 1686
},
{
"epoch": 0.12968653964351567,
"grad_norm": 4.41823148727417,
"learning_rate": 1.948125384142594e-05,
"loss": 1.5293,
"step": 1688
},
{
"epoch": 0.129840196681008,
"grad_norm": 4.470682144165039,
"learning_rate": 1.948063921327597e-05,
"loss": 1.6036,
"step": 1690
},
{
"epoch": 0.1299938537185003,
"grad_norm": 3.947842597961426,
"learning_rate": 1.9480024585126e-05,
"loss": 1.7037,
"step": 1692
},
{
"epoch": 0.13014751075599262,
"grad_norm": 4.953098297119141,
"learning_rate": 1.947940995697603e-05,
"loss": 1.6436,
"step": 1694
},
{
"epoch": 0.13030116779348494,
"grad_norm": 4.112635135650635,
"learning_rate": 1.947879532882606e-05,
"loss": 1.4595,
"step": 1696
},
{
"epoch": 0.13045482483097726,
"grad_norm": 4.197033882141113,
"learning_rate": 1.9478180700676094e-05,
"loss": 1.657,
"step": 1698
},
{
"epoch": 0.13060848186846957,
"grad_norm": 4.02692985534668,
"learning_rate": 1.9477566072526124e-05,
"loss": 1.6321,
"step": 1700
},
{
"epoch": 0.1307621389059619,
"grad_norm": 4.7861809730529785,
"learning_rate": 1.9476951444376153e-05,
"loss": 1.5609,
"step": 1702
},
{
"epoch": 0.1309157959434542,
"grad_norm": 4.392903804779053,
"learning_rate": 1.9476336816226183e-05,
"loss": 1.7549,
"step": 1704
},
{
"epoch": 0.13106945298094652,
"grad_norm": 4.314429759979248,
"learning_rate": 1.9475722188076216e-05,
"loss": 1.5698,
"step": 1706
},
{
"epoch": 0.13122311001843884,
"grad_norm": 4.254858016967773,
"learning_rate": 1.9475107559926246e-05,
"loss": 1.6291,
"step": 1708
},
{
"epoch": 0.13137676705593115,
"grad_norm": 4.288058757781982,
"learning_rate": 1.9474492931776276e-05,
"loss": 1.822,
"step": 1710
},
{
"epoch": 0.13153042409342347,
"grad_norm": 4.206986904144287,
"learning_rate": 1.947387830362631e-05,
"loss": 1.6372,
"step": 1712
},
{
"epoch": 0.1316840811309158,
"grad_norm": 3.9056224822998047,
"learning_rate": 1.947326367547634e-05,
"loss": 1.6141,
"step": 1714
},
{
"epoch": 0.1318377381684081,
"grad_norm": 5.1152777671813965,
"learning_rate": 1.9472649047326368e-05,
"loss": 1.6361,
"step": 1716
},
{
"epoch": 0.13199139520590042,
"grad_norm": 4.0903120040893555,
"learning_rate": 1.94720344191764e-05,
"loss": 1.5559,
"step": 1718
},
{
"epoch": 0.13214505224339274,
"grad_norm": 4.825276851654053,
"learning_rate": 1.947141979102643e-05,
"loss": 1.732,
"step": 1720
},
{
"epoch": 0.13229870928088505,
"grad_norm": 4.649293899536133,
"learning_rate": 1.947080516287646e-05,
"loss": 1.5941,
"step": 1722
},
{
"epoch": 0.13245236631837737,
"grad_norm": 4.052992820739746,
"learning_rate": 1.9470190534726494e-05,
"loss": 1.5664,
"step": 1724
},
{
"epoch": 0.1326060233558697,
"grad_norm": 4.36129903793335,
"learning_rate": 1.9469575906576523e-05,
"loss": 1.7345,
"step": 1726
},
{
"epoch": 0.13275968039336203,
"grad_norm": 4.522770404815674,
"learning_rate": 1.9468961278426553e-05,
"loss": 1.6731,
"step": 1728
},
{
"epoch": 0.13291333743085434,
"grad_norm": 4.922299385070801,
"learning_rate": 1.9468346650276583e-05,
"loss": 1.8072,
"step": 1730
},
{
"epoch": 0.13306699446834666,
"grad_norm": 4.385134220123291,
"learning_rate": 1.9467732022126616e-05,
"loss": 1.5836,
"step": 1732
},
{
"epoch": 0.13322065150583898,
"grad_norm": 4.031277179718018,
"learning_rate": 1.9467117393976645e-05,
"loss": 1.465,
"step": 1734
},
{
"epoch": 0.1333743085433313,
"grad_norm": 4.437002182006836,
"learning_rate": 1.9466502765826675e-05,
"loss": 1.5624,
"step": 1736
},
{
"epoch": 0.1335279655808236,
"grad_norm": 3.754696846008301,
"learning_rate": 1.9465888137676708e-05,
"loss": 1.6044,
"step": 1738
},
{
"epoch": 0.13368162261831593,
"grad_norm": 3.967130661010742,
"learning_rate": 1.9465273509526738e-05,
"loss": 1.5832,
"step": 1740
},
{
"epoch": 0.13383527965580824,
"grad_norm": 3.958448648452759,
"learning_rate": 1.9464658881376767e-05,
"loss": 1.6812,
"step": 1742
},
{
"epoch": 0.13398893669330056,
"grad_norm": 5.2511982917785645,
"learning_rate": 1.94640442532268e-05,
"loss": 1.8565,
"step": 1744
},
{
"epoch": 0.13414259373079288,
"grad_norm": 4.229193210601807,
"learning_rate": 1.946342962507683e-05,
"loss": 1.6365,
"step": 1746
},
{
"epoch": 0.1342962507682852,
"grad_norm": 3.8518741130828857,
"learning_rate": 1.946281499692686e-05,
"loss": 1.571,
"step": 1748
},
{
"epoch": 0.1344499078057775,
"grad_norm": 4.383627414703369,
"learning_rate": 1.946220036877689e-05,
"loss": 1.5181,
"step": 1750
},
{
"epoch": 0.13460356484326982,
"grad_norm": 4.58341121673584,
"learning_rate": 1.9461585740626923e-05,
"loss": 1.6387,
"step": 1752
},
{
"epoch": 0.13475722188076214,
"grad_norm": 4.656858921051025,
"learning_rate": 1.9460971112476956e-05,
"loss": 1.6474,
"step": 1754
},
{
"epoch": 0.13491087891825446,
"grad_norm": 5.039700031280518,
"learning_rate": 1.9460356484326982e-05,
"loss": 1.6755,
"step": 1756
},
{
"epoch": 0.13506453595574677,
"grad_norm": 4.46349573135376,
"learning_rate": 1.9459741856177015e-05,
"loss": 1.5728,
"step": 1758
},
{
"epoch": 0.1352181929932391,
"grad_norm": 4.041154861450195,
"learning_rate": 1.9459127228027045e-05,
"loss": 1.5847,
"step": 1760
},
{
"epoch": 0.1353718500307314,
"grad_norm": 4.126910209655762,
"learning_rate": 1.9458512599877074e-05,
"loss": 1.6807,
"step": 1762
},
{
"epoch": 0.13552550706822372,
"grad_norm": 4.063604831695557,
"learning_rate": 1.9457897971727108e-05,
"loss": 1.5294,
"step": 1764
},
{
"epoch": 0.13567916410571604,
"grad_norm": 4.1347150802612305,
"learning_rate": 1.9457283343577137e-05,
"loss": 1.5728,
"step": 1766
},
{
"epoch": 0.13583282114320835,
"grad_norm": 4.593793869018555,
"learning_rate": 1.9456668715427167e-05,
"loss": 1.7155,
"step": 1768
},
{
"epoch": 0.13598647818070067,
"grad_norm": 4.340649127960205,
"learning_rate": 1.94560540872772e-05,
"loss": 1.6996,
"step": 1770
},
{
"epoch": 0.136140135218193,
"grad_norm": 4.278517246246338,
"learning_rate": 1.945543945912723e-05,
"loss": 1.6012,
"step": 1772
},
{
"epoch": 0.1362937922556853,
"grad_norm": 4.626030445098877,
"learning_rate": 1.9454824830977263e-05,
"loss": 1.6195,
"step": 1774
},
{
"epoch": 0.13644744929317762,
"grad_norm": 4.450915813446045,
"learning_rate": 1.945421020282729e-05,
"loss": 1.6398,
"step": 1776
},
{
"epoch": 0.13660110633066994,
"grad_norm": 4.265727996826172,
"learning_rate": 1.9453595574677322e-05,
"loss": 1.5958,
"step": 1778
},
{
"epoch": 0.13675476336816225,
"grad_norm": 4.036159038543701,
"learning_rate": 1.9452980946527352e-05,
"loss": 1.5647,
"step": 1780
},
{
"epoch": 0.13690842040565457,
"grad_norm": 4.2282257080078125,
"learning_rate": 1.945236631837738e-05,
"loss": 1.6079,
"step": 1782
},
{
"epoch": 0.13706207744314688,
"grad_norm": 4.005040645599365,
"learning_rate": 1.9451751690227415e-05,
"loss": 1.5044,
"step": 1784
},
{
"epoch": 0.1372157344806392,
"grad_norm": 4.676270484924316,
"learning_rate": 1.9451137062077444e-05,
"loss": 1.6304,
"step": 1786
},
{
"epoch": 0.13736939151813152,
"grad_norm": 4.598161697387695,
"learning_rate": 1.9450522433927474e-05,
"loss": 1.6887,
"step": 1788
},
{
"epoch": 0.13752304855562386,
"grad_norm": 5.0116448402404785,
"learning_rate": 1.9449907805777507e-05,
"loss": 1.6998,
"step": 1790
},
{
"epoch": 0.13767670559311618,
"grad_norm": 4.892838954925537,
"learning_rate": 1.9449293177627537e-05,
"loss": 1.6274,
"step": 1792
},
{
"epoch": 0.1378303626306085,
"grad_norm": 5.293637752532959,
"learning_rate": 1.944867854947757e-05,
"loss": 1.628,
"step": 1794
},
{
"epoch": 0.1379840196681008,
"grad_norm": 4.583549976348877,
"learning_rate": 1.94480639213276e-05,
"loss": 1.4792,
"step": 1796
},
{
"epoch": 0.13813767670559313,
"grad_norm": 3.773277759552002,
"learning_rate": 1.944744929317763e-05,
"loss": 1.5219,
"step": 1798
},
{
"epoch": 0.13829133374308544,
"grad_norm": 4.440420150756836,
"learning_rate": 1.9446834665027662e-05,
"loss": 1.6732,
"step": 1800
},
{
"epoch": 0.13844499078057776,
"grad_norm": 4.711763858795166,
"learning_rate": 1.944622003687769e-05,
"loss": 1.5463,
"step": 1802
},
{
"epoch": 0.13859864781807008,
"grad_norm": 5.035058498382568,
"learning_rate": 1.944560540872772e-05,
"loss": 1.6961,
"step": 1804
},
{
"epoch": 0.1387523048555624,
"grad_norm": 3.963282346725464,
"learning_rate": 1.944499078057775e-05,
"loss": 1.5568,
"step": 1806
},
{
"epoch": 0.1389059618930547,
"grad_norm": 4.577483654022217,
"learning_rate": 1.944437615242778e-05,
"loss": 1.5428,
"step": 1808
},
{
"epoch": 0.13905961893054702,
"grad_norm": 4.509146690368652,
"learning_rate": 1.9443761524277814e-05,
"loss": 1.6397,
"step": 1810
},
{
"epoch": 0.13921327596803934,
"grad_norm": 4.317050933837891,
"learning_rate": 1.9443146896127844e-05,
"loss": 1.7304,
"step": 1812
},
{
"epoch": 0.13936693300553166,
"grad_norm": 4.572277069091797,
"learning_rate": 1.9442532267977877e-05,
"loss": 1.6759,
"step": 1814
},
{
"epoch": 0.13952059004302397,
"grad_norm": 4.773606777191162,
"learning_rate": 1.9441917639827906e-05,
"loss": 1.6869,
"step": 1816
},
{
"epoch": 0.1396742470805163,
"grad_norm": 4.57815408706665,
"learning_rate": 1.9441303011677936e-05,
"loss": 1.647,
"step": 1818
},
{
"epoch": 0.1398279041180086,
"grad_norm": 4.822877407073975,
"learning_rate": 1.944068838352797e-05,
"loss": 1.6182,
"step": 1820
},
{
"epoch": 0.13998156115550092,
"grad_norm": 4.272431373596191,
"learning_rate": 1.9440073755378e-05,
"loss": 1.6215,
"step": 1822
},
{
"epoch": 0.14013521819299324,
"grad_norm": 4.476557731628418,
"learning_rate": 1.943945912722803e-05,
"loss": 1.6277,
"step": 1824
},
{
"epoch": 0.14028887523048555,
"grad_norm": 4.522927284240723,
"learning_rate": 1.943884449907806e-05,
"loss": 1.526,
"step": 1826
},
{
"epoch": 0.14044253226797787,
"grad_norm": 3.991070032119751,
"learning_rate": 1.9438229870928088e-05,
"loss": 1.6106,
"step": 1828
},
{
"epoch": 0.1405961893054702,
"grad_norm": 4.189483165740967,
"learning_rate": 1.943761524277812e-05,
"loss": 1.619,
"step": 1830
},
{
"epoch": 0.1407498463429625,
"grad_norm": 4.5693159103393555,
"learning_rate": 1.943700061462815e-05,
"loss": 1.7438,
"step": 1832
},
{
"epoch": 0.14090350338045482,
"grad_norm": 3.8766119480133057,
"learning_rate": 1.943638598647818e-05,
"loss": 1.4953,
"step": 1834
},
{
"epoch": 0.14105716041794714,
"grad_norm": 4.294021129608154,
"learning_rate": 1.9435771358328213e-05,
"loss": 1.5746,
"step": 1836
},
{
"epoch": 0.14121081745543945,
"grad_norm": 4.195743083953857,
"learning_rate": 1.9435156730178243e-05,
"loss": 1.5049,
"step": 1838
},
{
"epoch": 0.14136447449293177,
"grad_norm": 4.331358909606934,
"learning_rate": 1.9434542102028276e-05,
"loss": 1.8416,
"step": 1840
},
{
"epoch": 0.14151813153042408,
"grad_norm": 4.328099727630615,
"learning_rate": 1.9433927473878306e-05,
"loss": 1.7201,
"step": 1842
},
{
"epoch": 0.1416717885679164,
"grad_norm": 4.2462005615234375,
"learning_rate": 1.9433312845728336e-05,
"loss": 1.6308,
"step": 1844
},
{
"epoch": 0.14182544560540872,
"grad_norm": 4.253352165222168,
"learning_rate": 1.943269821757837e-05,
"loss": 1.6856,
"step": 1846
},
{
"epoch": 0.14197910264290103,
"grad_norm": 4.154186248779297,
"learning_rate": 1.9432083589428395e-05,
"loss": 1.6079,
"step": 1848
},
{
"epoch": 0.14213275968039335,
"grad_norm": 6.227648735046387,
"learning_rate": 1.9431468961278428e-05,
"loss": 1.73,
"step": 1850
},
{
"epoch": 0.14228641671788567,
"grad_norm": 4.038461208343506,
"learning_rate": 1.943085433312846e-05,
"loss": 1.7227,
"step": 1852
},
{
"epoch": 0.14244007375537798,
"grad_norm": 4.844911098480225,
"learning_rate": 1.9430239704978487e-05,
"loss": 1.5841,
"step": 1854
},
{
"epoch": 0.14259373079287033,
"grad_norm": 3.845120429992676,
"learning_rate": 1.942962507682852e-05,
"loss": 1.6545,
"step": 1856
},
{
"epoch": 0.14274738783036264,
"grad_norm": 4.25357723236084,
"learning_rate": 1.942901044867855e-05,
"loss": 1.6038,
"step": 1858
},
{
"epoch": 0.14290104486785496,
"grad_norm": 4.518612861633301,
"learning_rate": 1.9428395820528583e-05,
"loss": 1.703,
"step": 1860
},
{
"epoch": 0.14305470190534728,
"grad_norm": 4.541075229644775,
"learning_rate": 1.9427781192378613e-05,
"loss": 1.6203,
"step": 1862
},
{
"epoch": 0.1432083589428396,
"grad_norm": 4.06412935256958,
"learning_rate": 1.9427166564228643e-05,
"loss": 1.714,
"step": 1864
},
{
"epoch": 0.1433620159803319,
"grad_norm": 4.289870738983154,
"learning_rate": 1.9426551936078676e-05,
"loss": 1.5524,
"step": 1866
},
{
"epoch": 0.14351567301782422,
"grad_norm": 3.937469005584717,
"learning_rate": 1.9425937307928705e-05,
"loss": 1.5572,
"step": 1868
},
{
"epoch": 0.14366933005531654,
"grad_norm": 4.361362457275391,
"learning_rate": 1.9425322679778735e-05,
"loss": 1.493,
"step": 1870
},
{
"epoch": 0.14382298709280886,
"grad_norm": 3.9257559776306152,
"learning_rate": 1.9424708051628768e-05,
"loss": 1.502,
"step": 1872
},
{
"epoch": 0.14397664413030117,
"grad_norm": 4.2765655517578125,
"learning_rate": 1.9424093423478794e-05,
"loss": 1.6001,
"step": 1874
},
{
"epoch": 0.1441303011677935,
"grad_norm": 3.724155902862549,
"learning_rate": 1.9423478795328827e-05,
"loss": 1.5722,
"step": 1876
},
{
"epoch": 0.1442839582052858,
"grad_norm": 4.135402679443359,
"learning_rate": 1.9422864167178857e-05,
"loss": 1.7137,
"step": 1878
},
{
"epoch": 0.14443761524277812,
"grad_norm": 4.522433280944824,
"learning_rate": 1.942224953902889e-05,
"loss": 1.6161,
"step": 1880
},
{
"epoch": 0.14459127228027044,
"grad_norm": 4.247946262359619,
"learning_rate": 1.942163491087892e-05,
"loss": 1.67,
"step": 1882
},
{
"epoch": 0.14474492931776275,
"grad_norm": 4.322160243988037,
"learning_rate": 1.942102028272895e-05,
"loss": 1.5362,
"step": 1884
},
{
"epoch": 0.14489858635525507,
"grad_norm": 4.25150728225708,
"learning_rate": 1.9420405654578983e-05,
"loss": 1.6026,
"step": 1886
},
{
"epoch": 0.1450522433927474,
"grad_norm": 4.95831823348999,
"learning_rate": 1.9419791026429012e-05,
"loss": 1.6806,
"step": 1888
},
{
"epoch": 0.1452059004302397,
"grad_norm": 4.125936031341553,
"learning_rate": 1.9419176398279042e-05,
"loss": 1.634,
"step": 1890
},
{
"epoch": 0.14535955746773202,
"grad_norm": 3.6493210792541504,
"learning_rate": 1.9418561770129075e-05,
"loss": 1.4507,
"step": 1892
},
{
"epoch": 0.14551321450522434,
"grad_norm": 4.338488578796387,
"learning_rate": 1.9417947141979105e-05,
"loss": 1.5127,
"step": 1894
},
{
"epoch": 0.14566687154271665,
"grad_norm": 4.250901222229004,
"learning_rate": 1.9417332513829134e-05,
"loss": 1.4201,
"step": 1896
},
{
"epoch": 0.14582052858020897,
"grad_norm": 3.8600480556488037,
"learning_rate": 1.9416717885679167e-05,
"loss": 1.5595,
"step": 1898
},
{
"epoch": 0.14597418561770129,
"grad_norm": 4.317285537719727,
"learning_rate": 1.9416103257529197e-05,
"loss": 1.6094,
"step": 1900
},
{
"epoch": 0.1461278426551936,
"grad_norm": 4.718072891235352,
"learning_rate": 1.9415488629379227e-05,
"loss": 1.768,
"step": 1902
},
{
"epoch": 0.14628149969268592,
"grad_norm": 4.9370808601379395,
"learning_rate": 1.9414874001229257e-05,
"loss": 1.5801,
"step": 1904
},
{
"epoch": 0.14643515673017823,
"grad_norm": 4.436810493469238,
"learning_rate": 1.941425937307929e-05,
"loss": 1.5847,
"step": 1906
},
{
"epoch": 0.14658881376767055,
"grad_norm": 4.890700817108154,
"learning_rate": 1.941364474492932e-05,
"loss": 1.5092,
"step": 1908
},
{
"epoch": 0.14674247080516287,
"grad_norm": 3.926815986633301,
"learning_rate": 1.941303011677935e-05,
"loss": 1.6612,
"step": 1910
},
{
"epoch": 0.14689612784265518,
"grad_norm": 4.331315994262695,
"learning_rate": 1.9412415488629382e-05,
"loss": 1.7669,
"step": 1912
},
{
"epoch": 0.1470497848801475,
"grad_norm": 5.178247928619385,
"learning_rate": 1.9411800860479412e-05,
"loss": 1.6749,
"step": 1914
},
{
"epoch": 0.14720344191763982,
"grad_norm": 3.871377944946289,
"learning_rate": 1.941118623232944e-05,
"loss": 1.4508,
"step": 1916
},
{
"epoch": 0.14735709895513213,
"grad_norm": 4.062928676605225,
"learning_rate": 1.9410571604179474e-05,
"loss": 1.458,
"step": 1918
},
{
"epoch": 0.14751075599262448,
"grad_norm": 4.205310344696045,
"learning_rate": 1.9409956976029504e-05,
"loss": 1.6419,
"step": 1920
},
{
"epoch": 0.1476644130301168,
"grad_norm": 3.822014093399048,
"learning_rate": 1.9409342347879534e-05,
"loss": 1.558,
"step": 1922
},
{
"epoch": 0.1478180700676091,
"grad_norm": 4.9377593994140625,
"learning_rate": 1.9408727719729567e-05,
"loss": 1.687,
"step": 1924
},
{
"epoch": 0.14797172710510142,
"grad_norm": 4.531102657318115,
"learning_rate": 1.9408113091579597e-05,
"loss": 1.7183,
"step": 1926
},
{
"epoch": 0.14812538414259374,
"grad_norm": 4.2398881912231445,
"learning_rate": 1.9407498463429626e-05,
"loss": 1.5952,
"step": 1928
},
{
"epoch": 0.14827904118008606,
"grad_norm": 4.426001071929932,
"learning_rate": 1.9406883835279656e-05,
"loss": 1.7844,
"step": 1930
},
{
"epoch": 0.14843269821757837,
"grad_norm": 4.2123494148254395,
"learning_rate": 1.940626920712969e-05,
"loss": 1.6711,
"step": 1932
},
{
"epoch": 0.1485863552550707,
"grad_norm": 4.681150913238525,
"learning_rate": 1.940565457897972e-05,
"loss": 1.6319,
"step": 1934
},
{
"epoch": 0.148740012292563,
"grad_norm": 4.499131202697754,
"learning_rate": 1.940503995082975e-05,
"loss": 1.6108,
"step": 1936
},
{
"epoch": 0.14889366933005532,
"grad_norm": 5.171452522277832,
"learning_rate": 1.940442532267978e-05,
"loss": 1.7528,
"step": 1938
},
{
"epoch": 0.14904732636754764,
"grad_norm": 4.263631343841553,
"learning_rate": 1.940381069452981e-05,
"loss": 1.5191,
"step": 1940
},
{
"epoch": 0.14920098340503996,
"grad_norm": 5.029245853424072,
"learning_rate": 1.940319606637984e-05,
"loss": 1.5623,
"step": 1942
},
{
"epoch": 0.14935464044253227,
"grad_norm": 4.505579948425293,
"learning_rate": 1.9402581438229874e-05,
"loss": 1.4555,
"step": 1944
},
{
"epoch": 0.1495082974800246,
"grad_norm": 4.269020080566406,
"learning_rate": 1.9401966810079904e-05,
"loss": 1.5765,
"step": 1946
},
{
"epoch": 0.1496619545175169,
"grad_norm": 3.9689948558807373,
"learning_rate": 1.9401352181929933e-05,
"loss": 1.6277,
"step": 1948
},
{
"epoch": 0.14981561155500922,
"grad_norm": 3.684130907058716,
"learning_rate": 1.9400737553779966e-05,
"loss": 1.5172,
"step": 1950
},
{
"epoch": 0.14996926859250154,
"grad_norm": 3.713602066040039,
"learning_rate": 1.9400122925629996e-05,
"loss": 1.5211,
"step": 1952
},
{
"epoch": 0.15012292562999385,
"grad_norm": 4.927131175994873,
"learning_rate": 1.9399508297480026e-05,
"loss": 1.6324,
"step": 1954
},
{
"epoch": 0.15027658266748617,
"grad_norm": 4.4799299240112305,
"learning_rate": 1.9398893669330055e-05,
"loss": 1.5679,
"step": 1956
},
{
"epoch": 0.15043023970497849,
"grad_norm": 4.372833251953125,
"learning_rate": 1.939827904118009e-05,
"loss": 1.5638,
"step": 1958
},
{
"epoch": 0.1505838967424708,
"grad_norm": 4.321191787719727,
"learning_rate": 1.9397664413030118e-05,
"loss": 1.4329,
"step": 1960
},
{
"epoch": 0.15073755377996312,
"grad_norm": 4.75023078918457,
"learning_rate": 1.9397049784880148e-05,
"loss": 1.6682,
"step": 1962
},
{
"epoch": 0.15089121081745543,
"grad_norm": 4.172933101654053,
"learning_rate": 1.939643515673018e-05,
"loss": 1.7316,
"step": 1964
},
{
"epoch": 0.15104486785494775,
"grad_norm": 4.329415321350098,
"learning_rate": 1.939582052858021e-05,
"loss": 1.7017,
"step": 1966
},
{
"epoch": 0.15119852489244007,
"grad_norm": 4.249721527099609,
"learning_rate": 1.939520590043024e-05,
"loss": 1.5796,
"step": 1968
},
{
"epoch": 0.15135218192993238,
"grad_norm": 4.071712970733643,
"learning_rate": 1.9394591272280273e-05,
"loss": 1.4623,
"step": 1970
},
{
"epoch": 0.1515058389674247,
"grad_norm": 4.0507731437683105,
"learning_rate": 1.9393976644130303e-05,
"loss": 1.4821,
"step": 1972
},
{
"epoch": 0.15165949600491702,
"grad_norm": 4.356963634490967,
"learning_rate": 1.9393362015980333e-05,
"loss": 1.547,
"step": 1974
},
{
"epoch": 0.15181315304240933,
"grad_norm": 5.182737350463867,
"learning_rate": 1.9392747387830362e-05,
"loss": 1.6367,
"step": 1976
},
{
"epoch": 0.15196681007990165,
"grad_norm": 3.9492363929748535,
"learning_rate": 1.9392132759680395e-05,
"loss": 1.679,
"step": 1978
},
{
"epoch": 0.15212046711739396,
"grad_norm": 4.404160976409912,
"learning_rate": 1.9391518131530425e-05,
"loss": 1.5288,
"step": 1980
},
{
"epoch": 0.15227412415488628,
"grad_norm": 4.500973701477051,
"learning_rate": 1.9390903503380455e-05,
"loss": 1.7715,
"step": 1982
},
{
"epoch": 0.1524277811923786,
"grad_norm": 4.121068954467773,
"learning_rate": 1.9390288875230488e-05,
"loss": 1.5507,
"step": 1984
},
{
"epoch": 0.15258143822987094,
"grad_norm": 3.7095515727996826,
"learning_rate": 1.9389674247080518e-05,
"loss": 1.616,
"step": 1986
},
{
"epoch": 0.15273509526736326,
"grad_norm": 5.333407878875732,
"learning_rate": 1.9389059618930547e-05,
"loss": 1.7196,
"step": 1988
},
{
"epoch": 0.15288875230485557,
"grad_norm": 4.188971042633057,
"learning_rate": 1.938844499078058e-05,
"loss": 1.6256,
"step": 1990
},
{
"epoch": 0.1530424093423479,
"grad_norm": 4.126604080200195,
"learning_rate": 1.938783036263061e-05,
"loss": 1.5089,
"step": 1992
},
{
"epoch": 0.1531960663798402,
"grad_norm": 4.127197742462158,
"learning_rate": 1.938721573448064e-05,
"loss": 1.5598,
"step": 1994
},
{
"epoch": 0.15334972341733252,
"grad_norm": 4.481958389282227,
"learning_rate": 1.9386601106330673e-05,
"loss": 1.6238,
"step": 1996
},
{
"epoch": 0.15350338045482484,
"grad_norm": 4.15784215927124,
"learning_rate": 1.9385986478180702e-05,
"loss": 1.6611,
"step": 1998
},
{
"epoch": 0.15365703749231716,
"grad_norm": 4.322861194610596,
"learning_rate": 1.9385371850030732e-05,
"loss": 1.513,
"step": 2000
},
{
"epoch": 0.15381069452980947,
"grad_norm": 3.9926345348358154,
"learning_rate": 1.9384757221880762e-05,
"loss": 1.5764,
"step": 2002
},
{
"epoch": 0.1539643515673018,
"grad_norm": 5.112368583679199,
"learning_rate": 1.9384142593730795e-05,
"loss": 1.6181,
"step": 2004
},
{
"epoch": 0.1541180086047941,
"grad_norm": 3.6655466556549072,
"learning_rate": 1.9383527965580825e-05,
"loss": 1.3721,
"step": 2006
},
{
"epoch": 0.15427166564228642,
"grad_norm": 4.533908367156982,
"learning_rate": 1.9382913337430854e-05,
"loss": 1.5841,
"step": 2008
},
{
"epoch": 0.15442532267977874,
"grad_norm": 3.722304344177246,
"learning_rate": 1.9382298709280887e-05,
"loss": 1.5965,
"step": 2010
},
{
"epoch": 0.15457897971727105,
"grad_norm": 4.5336012840271,
"learning_rate": 1.9381684081130917e-05,
"loss": 1.5995,
"step": 2012
},
{
"epoch": 0.15473263675476337,
"grad_norm": 4.144698619842529,
"learning_rate": 1.9381069452980947e-05,
"loss": 1.6343,
"step": 2014
},
{
"epoch": 0.15488629379225569,
"grad_norm": 4.033304691314697,
"learning_rate": 1.938045482483098e-05,
"loss": 1.6888,
"step": 2016
},
{
"epoch": 0.155039950829748,
"grad_norm": 4.00151252746582,
"learning_rate": 1.937984019668101e-05,
"loss": 1.5246,
"step": 2018
},
{
"epoch": 0.15519360786724032,
"grad_norm": 4.766987323760986,
"learning_rate": 1.937922556853104e-05,
"loss": 1.7345,
"step": 2020
},
{
"epoch": 0.15534726490473263,
"grad_norm": 4.144290924072266,
"learning_rate": 1.9378610940381072e-05,
"loss": 1.5848,
"step": 2022
},
{
"epoch": 0.15550092194222495,
"grad_norm": 4.038874626159668,
"learning_rate": 1.9377996312231102e-05,
"loss": 1.6425,
"step": 2024
},
{
"epoch": 0.15565457897971727,
"grad_norm": 3.832429885864258,
"learning_rate": 1.9377381684081135e-05,
"loss": 1.5454,
"step": 2026
},
{
"epoch": 0.15580823601720958,
"grad_norm": 4.775087833404541,
"learning_rate": 1.937676705593116e-05,
"loss": 1.6767,
"step": 2028
},
{
"epoch": 0.1559618930547019,
"grad_norm": 3.997192144393921,
"learning_rate": 1.9376152427781194e-05,
"loss": 1.5539,
"step": 2030
},
{
"epoch": 0.15611555009219422,
"grad_norm": 3.764519453048706,
"learning_rate": 1.9375537799631224e-05,
"loss": 1.46,
"step": 2032
},
{
"epoch": 0.15626920712968653,
"grad_norm": 4.074234962463379,
"learning_rate": 1.9374923171481254e-05,
"loss": 1.5064,
"step": 2034
},
{
"epoch": 0.15642286416717885,
"grad_norm": 4.302229881286621,
"learning_rate": 1.9374308543331287e-05,
"loss": 1.6276,
"step": 2036
},
{
"epoch": 0.15657652120467117,
"grad_norm": 4.602327346801758,
"learning_rate": 1.9373693915181316e-05,
"loss": 1.5551,
"step": 2038
},
{
"epoch": 0.15673017824216348,
"grad_norm": 4.131155490875244,
"learning_rate": 1.9373079287031346e-05,
"loss": 1.4554,
"step": 2040
},
{
"epoch": 0.1568838352796558,
"grad_norm": 4.661727428436279,
"learning_rate": 1.937246465888138e-05,
"loss": 1.6566,
"step": 2042
},
{
"epoch": 0.15703749231714811,
"grad_norm": 4.23723030090332,
"learning_rate": 1.937185003073141e-05,
"loss": 1.6038,
"step": 2044
},
{
"epoch": 0.15719114935464043,
"grad_norm": 3.995077133178711,
"learning_rate": 1.9371235402581442e-05,
"loss": 1.7374,
"step": 2046
},
{
"epoch": 0.15734480639213275,
"grad_norm": 4.001912593841553,
"learning_rate": 1.937062077443147e-05,
"loss": 1.5991,
"step": 2048
},
{
"epoch": 0.15749846342962506,
"grad_norm": 4.558352470397949,
"learning_rate": 1.93700061462815e-05,
"loss": 1.6115,
"step": 2050
},
{
"epoch": 0.1576521204671174,
"grad_norm": 4.651041030883789,
"learning_rate": 1.9369391518131534e-05,
"loss": 1.5451,
"step": 2052
},
{
"epoch": 0.15780577750460972,
"grad_norm": 4.203875541687012,
"learning_rate": 1.936877688998156e-05,
"loss": 1.5506,
"step": 2054
},
{
"epoch": 0.15795943454210204,
"grad_norm": 4.153205394744873,
"learning_rate": 1.9368162261831594e-05,
"loss": 1.5209,
"step": 2056
},
{
"epoch": 0.15811309157959436,
"grad_norm": 4.187285900115967,
"learning_rate": 1.9367547633681623e-05,
"loss": 1.7677,
"step": 2058
},
{
"epoch": 0.15826674861708667,
"grad_norm": 4.654025554656982,
"learning_rate": 1.9366933005531653e-05,
"loss": 1.63,
"step": 2060
},
{
"epoch": 0.158420405654579,
"grad_norm": 4.145837306976318,
"learning_rate": 1.9366318377381686e-05,
"loss": 1.7693,
"step": 2062
},
{
"epoch": 0.1585740626920713,
"grad_norm": 4.076268196105957,
"learning_rate": 1.9365703749231716e-05,
"loss": 1.5436,
"step": 2064
},
{
"epoch": 0.15872771972956362,
"grad_norm": 3.9100687503814697,
"learning_rate": 1.936508912108175e-05,
"loss": 1.5738,
"step": 2066
},
{
"epoch": 0.15888137676705594,
"grad_norm": 4.173727989196777,
"learning_rate": 1.936447449293178e-05,
"loss": 1.6155,
"step": 2068
},
{
"epoch": 0.15903503380454825,
"grad_norm": 4.555833339691162,
"learning_rate": 1.936385986478181e-05,
"loss": 1.6879,
"step": 2070
},
{
"epoch": 0.15918869084204057,
"grad_norm": 4.008725166320801,
"learning_rate": 1.936324523663184e-05,
"loss": 1.5752,
"step": 2072
},
{
"epoch": 0.1593423478795329,
"grad_norm": 5.628279209136963,
"learning_rate": 1.9362630608481868e-05,
"loss": 1.6574,
"step": 2074
},
{
"epoch": 0.1594960049170252,
"grad_norm": 4.112339496612549,
"learning_rate": 1.93620159803319e-05,
"loss": 1.6789,
"step": 2076
},
{
"epoch": 0.15964966195451752,
"grad_norm": 4.378570556640625,
"learning_rate": 1.9361401352181934e-05,
"loss": 1.4837,
"step": 2078
},
{
"epoch": 0.15980331899200984,
"grad_norm": 3.819582223892212,
"learning_rate": 1.936078672403196e-05,
"loss": 1.573,
"step": 2080
},
{
"epoch": 0.15995697602950215,
"grad_norm": 4.303280830383301,
"learning_rate": 1.9360172095881993e-05,
"loss": 1.4278,
"step": 2082
},
{
"epoch": 0.16011063306699447,
"grad_norm": 3.9183764457702637,
"learning_rate": 1.9359557467732023e-05,
"loss": 1.4808,
"step": 2084
},
{
"epoch": 0.16026429010448678,
"grad_norm": 4.484659671783447,
"learning_rate": 1.9358942839582053e-05,
"loss": 1.557,
"step": 2086
},
{
"epoch": 0.1604179471419791,
"grad_norm": 4.600409984588623,
"learning_rate": 1.9358328211432086e-05,
"loss": 1.5848,
"step": 2088
},
{
"epoch": 0.16057160417947142,
"grad_norm": 3.4427108764648438,
"learning_rate": 1.9357713583282115e-05,
"loss": 1.5589,
"step": 2090
},
{
"epoch": 0.16072526121696373,
"grad_norm": 7.8328375816345215,
"learning_rate": 1.935709895513215e-05,
"loss": 1.5519,
"step": 2092
},
{
"epoch": 0.16087891825445605,
"grad_norm": 4.630655765533447,
"learning_rate": 1.9356484326982178e-05,
"loss": 1.6441,
"step": 2094
},
{
"epoch": 0.16103257529194837,
"grad_norm": 4.217165946960449,
"learning_rate": 1.9355869698832208e-05,
"loss": 1.5559,
"step": 2096
},
{
"epoch": 0.16118623232944068,
"grad_norm": 4.941442012786865,
"learning_rate": 1.935525507068224e-05,
"loss": 1.5065,
"step": 2098
},
{
"epoch": 0.161339889366933,
"grad_norm": 4.171699523925781,
"learning_rate": 1.9354640442532267e-05,
"loss": 1.4993,
"step": 2100
},
{
"epoch": 0.16149354640442531,
"grad_norm": 3.8235647678375244,
"learning_rate": 1.93540258143823e-05,
"loss": 1.6035,
"step": 2102
},
{
"epoch": 0.16164720344191763,
"grad_norm": 5.202590465545654,
"learning_rate": 1.935341118623233e-05,
"loss": 1.6397,
"step": 2104
},
{
"epoch": 0.16180086047940995,
"grad_norm": 3.8114516735076904,
"learning_rate": 1.935279655808236e-05,
"loss": 1.5103,
"step": 2106
},
{
"epoch": 0.16195451751690226,
"grad_norm": 4.666793346405029,
"learning_rate": 1.9352181929932393e-05,
"loss": 1.5363,
"step": 2108
},
{
"epoch": 0.16210817455439458,
"grad_norm": 4.18889856338501,
"learning_rate": 1.9351567301782422e-05,
"loss": 1.662,
"step": 2110
},
{
"epoch": 0.1622618315918869,
"grad_norm": 4.392826557159424,
"learning_rate": 1.9350952673632455e-05,
"loss": 1.5492,
"step": 2112
},
{
"epoch": 0.1624154886293792,
"grad_norm": 4.0653839111328125,
"learning_rate": 1.9350338045482485e-05,
"loss": 1.7512,
"step": 2114
},
{
"epoch": 0.16256914566687156,
"grad_norm": 4.020386219024658,
"learning_rate": 1.9349723417332515e-05,
"loss": 1.7604,
"step": 2116
},
{
"epoch": 0.16272280270436387,
"grad_norm": 4.062155723571777,
"learning_rate": 1.9349108789182548e-05,
"loss": 1.7181,
"step": 2118
},
{
"epoch": 0.1628764597418562,
"grad_norm": 4.4019999504089355,
"learning_rate": 1.9348494161032578e-05,
"loss": 1.639,
"step": 2120
},
{
"epoch": 0.1630301167793485,
"grad_norm": 3.761319875717163,
"learning_rate": 1.9347879532882607e-05,
"loss": 1.4084,
"step": 2122
},
{
"epoch": 0.16318377381684082,
"grad_norm": 4.566369533538818,
"learning_rate": 1.934726490473264e-05,
"loss": 1.5193,
"step": 2124
},
{
"epoch": 0.16333743085433314,
"grad_norm": 4.366701602935791,
"learning_rate": 1.9346650276582667e-05,
"loss": 1.5211,
"step": 2126
},
{
"epoch": 0.16349108789182545,
"grad_norm": 4.0437116622924805,
"learning_rate": 1.93460356484327e-05,
"loss": 1.5901,
"step": 2128
},
{
"epoch": 0.16364474492931777,
"grad_norm": 3.9914474487304688,
"learning_rate": 1.934542102028273e-05,
"loss": 1.4975,
"step": 2130
},
{
"epoch": 0.1637984019668101,
"grad_norm": 4.267744541168213,
"learning_rate": 1.9344806392132762e-05,
"loss": 1.6574,
"step": 2132
},
{
"epoch": 0.1639520590043024,
"grad_norm": 3.843414068222046,
"learning_rate": 1.9344191763982792e-05,
"loss": 1.4976,
"step": 2134
},
{
"epoch": 0.16410571604179472,
"grad_norm": 4.3155975341796875,
"learning_rate": 1.9343577135832822e-05,
"loss": 1.5333,
"step": 2136
},
{
"epoch": 0.16425937307928704,
"grad_norm": 4.159292697906494,
"learning_rate": 1.9342962507682855e-05,
"loss": 1.5053,
"step": 2138
},
{
"epoch": 0.16441303011677935,
"grad_norm": 4.296607971191406,
"learning_rate": 1.9342347879532885e-05,
"loss": 1.6112,
"step": 2140
},
{
"epoch": 0.16456668715427167,
"grad_norm": 3.9782469272613525,
"learning_rate": 1.9341733251382914e-05,
"loss": 1.5241,
"step": 2142
},
{
"epoch": 0.16472034419176398,
"grad_norm": 4.183950901031494,
"learning_rate": 1.9341118623232947e-05,
"loss": 1.5901,
"step": 2144
},
{
"epoch": 0.1648740012292563,
"grad_norm": 3.9188575744628906,
"learning_rate": 1.9340503995082977e-05,
"loss": 1.621,
"step": 2146
},
{
"epoch": 0.16502765826674862,
"grad_norm": 4.474676132202148,
"learning_rate": 1.9339889366933007e-05,
"loss": 1.487,
"step": 2148
},
{
"epoch": 0.16518131530424093,
"grad_norm": 4.6410017013549805,
"learning_rate": 1.933927473878304e-05,
"loss": 1.6997,
"step": 2150
},
{
"epoch": 0.16533497234173325,
"grad_norm": 10.933167457580566,
"learning_rate": 1.933866011063307e-05,
"loss": 1.4752,
"step": 2152
},
{
"epoch": 0.16548862937922557,
"grad_norm": 4.885360240936279,
"learning_rate": 1.93380454824831e-05,
"loss": 1.5604,
"step": 2154
},
{
"epoch": 0.16564228641671788,
"grad_norm": 3.9420063495635986,
"learning_rate": 1.933743085433313e-05,
"loss": 1.5909,
"step": 2156
},
{
"epoch": 0.1657959434542102,
"grad_norm": 3.5523521900177,
"learning_rate": 1.9336816226183162e-05,
"loss": 1.6274,
"step": 2158
},
{
"epoch": 0.16594960049170251,
"grad_norm": 3.6621317863464355,
"learning_rate": 1.933620159803319e-05,
"loss": 1.5111,
"step": 2160
},
{
"epoch": 0.16610325752919483,
"grad_norm": 4.57207727432251,
"learning_rate": 1.933558696988322e-05,
"loss": 1.6702,
"step": 2162
},
{
"epoch": 0.16625691456668715,
"grad_norm": 3.9489428997039795,
"learning_rate": 1.9334972341733254e-05,
"loss": 1.589,
"step": 2164
},
{
"epoch": 0.16641057160417946,
"grad_norm": 3.8509654998779297,
"learning_rate": 1.9334357713583284e-05,
"loss": 1.5522,
"step": 2166
},
{
"epoch": 0.16656422864167178,
"grad_norm": 3.6893975734710693,
"learning_rate": 1.9333743085433314e-05,
"loss": 1.4767,
"step": 2168
},
{
"epoch": 0.1667178856791641,
"grad_norm": 4.064876079559326,
"learning_rate": 1.9333128457283347e-05,
"loss": 1.5515,
"step": 2170
},
{
"epoch": 0.1668715427166564,
"grad_norm": 4.427343845367432,
"learning_rate": 1.9332513829133376e-05,
"loss": 1.7098,
"step": 2172
},
{
"epoch": 0.16702519975414873,
"grad_norm": 3.724740505218506,
"learning_rate": 1.9331899200983406e-05,
"loss": 1.5612,
"step": 2174
},
{
"epoch": 0.16717885679164105,
"grad_norm": 4.302028656005859,
"learning_rate": 1.933128457283344e-05,
"loss": 1.641,
"step": 2176
},
{
"epoch": 0.16733251382913336,
"grad_norm": 4.149264335632324,
"learning_rate": 1.933066994468347e-05,
"loss": 1.6923,
"step": 2178
},
{
"epoch": 0.16748617086662568,
"grad_norm": 4.091360092163086,
"learning_rate": 1.93300553165335e-05,
"loss": 1.4934,
"step": 2180
},
{
"epoch": 0.16763982790411802,
"grad_norm": 4.653087139129639,
"learning_rate": 1.9329440688383528e-05,
"loss": 1.6434,
"step": 2182
},
{
"epoch": 0.16779348494161034,
"grad_norm": 4.141650199890137,
"learning_rate": 1.932882606023356e-05,
"loss": 1.5228,
"step": 2184
},
{
"epoch": 0.16794714197910265,
"grad_norm": 4.257270336151123,
"learning_rate": 1.932821143208359e-05,
"loss": 1.7463,
"step": 2186
},
{
"epoch": 0.16810079901659497,
"grad_norm": 4.1163554191589355,
"learning_rate": 1.932759680393362e-05,
"loss": 1.4543,
"step": 2188
},
{
"epoch": 0.1682544560540873,
"grad_norm": 3.6683640480041504,
"learning_rate": 1.9326982175783654e-05,
"loss": 1.4778,
"step": 2190
},
{
"epoch": 0.1684081130915796,
"grad_norm": 3.734006881713867,
"learning_rate": 1.9326367547633683e-05,
"loss": 1.5738,
"step": 2192
},
{
"epoch": 0.16856177012907192,
"grad_norm": 4.454776287078857,
"learning_rate": 1.9325752919483713e-05,
"loss": 1.6569,
"step": 2194
},
{
"epoch": 0.16871542716656424,
"grad_norm": 4.1497883796691895,
"learning_rate": 1.9325138291333746e-05,
"loss": 1.5172,
"step": 2196
},
{
"epoch": 0.16886908420405655,
"grad_norm": 4.288064479827881,
"learning_rate": 1.9324523663183776e-05,
"loss": 1.6416,
"step": 2198
},
{
"epoch": 0.16902274124154887,
"grad_norm": 3.463115930557251,
"learning_rate": 1.9323909035033806e-05,
"loss": 1.4196,
"step": 2200
},
{
"epoch": 0.16917639827904118,
"grad_norm": 5.139834403991699,
"learning_rate": 1.9323294406883835e-05,
"loss": 1.6058,
"step": 2202
},
{
"epoch": 0.1693300553165335,
"grad_norm": 4.1170806884765625,
"learning_rate": 1.9322679778733868e-05,
"loss": 1.4493,
"step": 2204
},
{
"epoch": 0.16948371235402582,
"grad_norm": 3.8090291023254395,
"learning_rate": 1.9322065150583898e-05,
"loss": 1.6205,
"step": 2206
},
{
"epoch": 0.16963736939151813,
"grad_norm": 3.461530923843384,
"learning_rate": 1.9321450522433928e-05,
"loss": 1.4338,
"step": 2208
},
{
"epoch": 0.16979102642901045,
"grad_norm": 4.355661392211914,
"learning_rate": 1.932083589428396e-05,
"loss": 1.6044,
"step": 2210
},
{
"epoch": 0.16994468346650277,
"grad_norm": 3.4141671657562256,
"learning_rate": 1.932022126613399e-05,
"loss": 1.5096,
"step": 2212
},
{
"epoch": 0.17009834050399508,
"grad_norm": 4.202045917510986,
"learning_rate": 1.931960663798402e-05,
"loss": 1.7529,
"step": 2214
},
{
"epoch": 0.1702519975414874,
"grad_norm": 3.8714635372161865,
"learning_rate": 1.9318992009834053e-05,
"loss": 1.5085,
"step": 2216
},
{
"epoch": 0.17040565457897972,
"grad_norm": 5.047643184661865,
"learning_rate": 1.9318377381684083e-05,
"loss": 1.6937,
"step": 2218
},
{
"epoch": 0.17055931161647203,
"grad_norm": 4.094550132751465,
"learning_rate": 1.9317762753534113e-05,
"loss": 1.7053,
"step": 2220
},
{
"epoch": 0.17071296865396435,
"grad_norm": 3.65474796295166,
"learning_rate": 1.9317148125384146e-05,
"loss": 1.5531,
"step": 2222
},
{
"epoch": 0.17086662569145666,
"grad_norm": 4.38794469833374,
"learning_rate": 1.9316533497234175e-05,
"loss": 1.7151,
"step": 2224
},
{
"epoch": 0.17102028272894898,
"grad_norm": 4.336912631988525,
"learning_rate": 1.9315918869084205e-05,
"loss": 1.4871,
"step": 2226
},
{
"epoch": 0.1711739397664413,
"grad_norm": 3.9137933254241943,
"learning_rate": 1.9315304240934235e-05,
"loss": 1.3785,
"step": 2228
},
{
"epoch": 0.1713275968039336,
"grad_norm": 4.241963863372803,
"learning_rate": 1.9314689612784268e-05,
"loss": 1.482,
"step": 2230
},
{
"epoch": 0.17148125384142593,
"grad_norm": 4.047958850860596,
"learning_rate": 1.9314074984634297e-05,
"loss": 1.7098,
"step": 2232
},
{
"epoch": 0.17163491087891825,
"grad_norm": 4.688525676727295,
"learning_rate": 1.9313460356484327e-05,
"loss": 1.5618,
"step": 2234
},
{
"epoch": 0.17178856791641056,
"grad_norm": 4.020751953125,
"learning_rate": 1.931284572833436e-05,
"loss": 1.5526,
"step": 2236
},
{
"epoch": 0.17194222495390288,
"grad_norm": 3.93445086479187,
"learning_rate": 1.931223110018439e-05,
"loss": 1.6595,
"step": 2238
},
{
"epoch": 0.1720958819913952,
"grad_norm": 5.324620723724365,
"learning_rate": 1.931161647203442e-05,
"loss": 1.657,
"step": 2240
},
{
"epoch": 0.1722495390288875,
"grad_norm": 3.6193902492523193,
"learning_rate": 1.9311001843884453e-05,
"loss": 1.5,
"step": 2242
},
{
"epoch": 0.17240319606637983,
"grad_norm": 4.382716178894043,
"learning_rate": 1.9310387215734482e-05,
"loss": 1.6785,
"step": 2244
},
{
"epoch": 0.17255685310387217,
"grad_norm": 3.7359304428100586,
"learning_rate": 1.9309772587584512e-05,
"loss": 1.5003,
"step": 2246
},
{
"epoch": 0.1727105101413645,
"grad_norm": 4.570140838623047,
"learning_rate": 1.9309157959434545e-05,
"loss": 1.6353,
"step": 2248
},
{
"epoch": 0.1728641671788568,
"grad_norm": 4.809631824493408,
"learning_rate": 1.9308543331284575e-05,
"loss": 1.5898,
"step": 2250
},
{
"epoch": 0.17301782421634912,
"grad_norm": 4.994627475738525,
"learning_rate": 1.9307928703134604e-05,
"loss": 1.6614,
"step": 2252
},
{
"epoch": 0.17317148125384144,
"grad_norm": 4.121060371398926,
"learning_rate": 1.9307314074984634e-05,
"loss": 1.6817,
"step": 2254
},
{
"epoch": 0.17332513829133375,
"grad_norm": 4.009014129638672,
"learning_rate": 1.9306699446834667e-05,
"loss": 1.5645,
"step": 2256
},
{
"epoch": 0.17347879532882607,
"grad_norm": 4.27223539352417,
"learning_rate": 1.9306084818684697e-05,
"loss": 1.6764,
"step": 2258
},
{
"epoch": 0.17363245236631838,
"grad_norm": 4.074213027954102,
"learning_rate": 1.9305470190534727e-05,
"loss": 1.6095,
"step": 2260
},
{
"epoch": 0.1737861094038107,
"grad_norm": 3.6030173301696777,
"learning_rate": 1.930485556238476e-05,
"loss": 1.5907,
"step": 2262
},
{
"epoch": 0.17393976644130302,
"grad_norm": 4.34961462020874,
"learning_rate": 1.930424093423479e-05,
"loss": 1.611,
"step": 2264
},
{
"epoch": 0.17409342347879533,
"grad_norm": 3.9723429679870605,
"learning_rate": 1.930362630608482e-05,
"loss": 1.5256,
"step": 2266
},
{
"epoch": 0.17424708051628765,
"grad_norm": 3.7899746894836426,
"learning_rate": 1.9303011677934852e-05,
"loss": 1.6923,
"step": 2268
},
{
"epoch": 0.17440073755377997,
"grad_norm": 4.415828227996826,
"learning_rate": 1.9302397049784882e-05,
"loss": 1.4681,
"step": 2270
},
{
"epoch": 0.17455439459127228,
"grad_norm": 5.616640567779541,
"learning_rate": 1.930178242163491e-05,
"loss": 1.6252,
"step": 2272
},
{
"epoch": 0.1747080516287646,
"grad_norm": 3.5017950534820557,
"learning_rate": 1.9301167793484944e-05,
"loss": 1.4621,
"step": 2274
},
{
"epoch": 0.17486170866625692,
"grad_norm": 4.120169639587402,
"learning_rate": 1.9300553165334974e-05,
"loss": 1.5537,
"step": 2276
},
{
"epoch": 0.17501536570374923,
"grad_norm": 4.489522457122803,
"learning_rate": 1.9299938537185007e-05,
"loss": 1.6915,
"step": 2278
},
{
"epoch": 0.17516902274124155,
"grad_norm": 4.285830974578857,
"learning_rate": 1.9299323909035034e-05,
"loss": 1.5101,
"step": 2280
},
{
"epoch": 0.17532267977873386,
"grad_norm": 3.9038150310516357,
"learning_rate": 1.9298709280885067e-05,
"loss": 1.6067,
"step": 2282
},
{
"epoch": 0.17547633681622618,
"grad_norm": 3.8521156311035156,
"learning_rate": 1.9298094652735096e-05,
"loss": 1.4494,
"step": 2284
},
{
"epoch": 0.1756299938537185,
"grad_norm": 7.329223155975342,
"learning_rate": 1.9297480024585126e-05,
"loss": 1.5588,
"step": 2286
},
{
"epoch": 0.1757836508912108,
"grad_norm": 3.97939395904541,
"learning_rate": 1.929686539643516e-05,
"loss": 1.4802,
"step": 2288
},
{
"epoch": 0.17593730792870313,
"grad_norm": 3.464115858078003,
"learning_rate": 1.929625076828519e-05,
"loss": 1.4913,
"step": 2290
},
{
"epoch": 0.17609096496619545,
"grad_norm": 4.677506446838379,
"learning_rate": 1.929563614013522e-05,
"loss": 1.5254,
"step": 2292
},
{
"epoch": 0.17624462200368776,
"grad_norm": 3.7886929512023926,
"learning_rate": 1.929502151198525e-05,
"loss": 1.6297,
"step": 2294
},
{
"epoch": 0.17639827904118008,
"grad_norm": 3.5035488605499268,
"learning_rate": 1.929440688383528e-05,
"loss": 1.657,
"step": 2296
},
{
"epoch": 0.1765519360786724,
"grad_norm": 4.172173976898193,
"learning_rate": 1.9293792255685314e-05,
"loss": 1.5612,
"step": 2298
},
{
"epoch": 0.1767055931161647,
"grad_norm": 3.9481425285339355,
"learning_rate": 1.929317762753534e-05,
"loss": 1.4419,
"step": 2300
},
{
"epoch": 0.17685925015365703,
"grad_norm": 3.922159433364868,
"learning_rate": 1.9292562999385374e-05,
"loss": 1.5459,
"step": 2302
},
{
"epoch": 0.17701290719114934,
"grad_norm": 4.2247233390808105,
"learning_rate": 1.9291948371235403e-05,
"loss": 1.6869,
"step": 2304
},
{
"epoch": 0.17716656422864166,
"grad_norm": 4.960201740264893,
"learning_rate": 1.9291333743085433e-05,
"loss": 1.4055,
"step": 2306
},
{
"epoch": 0.17732022126613398,
"grad_norm": 4.675178527832031,
"learning_rate": 1.9290719114935466e-05,
"loss": 1.4635,
"step": 2308
},
{
"epoch": 0.1774738783036263,
"grad_norm": 4.3724589347839355,
"learning_rate": 1.9290104486785496e-05,
"loss": 1.4423,
"step": 2310
},
{
"epoch": 0.17762753534111864,
"grad_norm": 4.629543304443359,
"learning_rate": 1.9289489858635525e-05,
"loss": 1.6696,
"step": 2312
},
{
"epoch": 0.17778119237861095,
"grad_norm": 3.8183395862579346,
"learning_rate": 1.928887523048556e-05,
"loss": 1.5558,
"step": 2314
},
{
"epoch": 0.17793484941610327,
"grad_norm": 3.7984275817871094,
"learning_rate": 1.9288260602335588e-05,
"loss": 1.5884,
"step": 2316
},
{
"epoch": 0.17808850645359559,
"grad_norm": 3.9068145751953125,
"learning_rate": 1.928764597418562e-05,
"loss": 1.6217,
"step": 2318
},
{
"epoch": 0.1782421634910879,
"grad_norm": 4.159458160400391,
"learning_rate": 1.928703134603565e-05,
"loss": 1.5934,
"step": 2320
},
{
"epoch": 0.17839582052858022,
"grad_norm": 4.013321876525879,
"learning_rate": 1.928641671788568e-05,
"loss": 1.5542,
"step": 2322
},
{
"epoch": 0.17854947756607253,
"grad_norm": 4.504942893981934,
"learning_rate": 1.9285802089735714e-05,
"loss": 1.7811,
"step": 2324
},
{
"epoch": 0.17870313460356485,
"grad_norm": 4.721269130706787,
"learning_rate": 1.928518746158574e-05,
"loss": 1.4974,
"step": 2326
},
{
"epoch": 0.17885679164105717,
"grad_norm": 3.662440776824951,
"learning_rate": 1.9284572833435773e-05,
"loss": 1.6497,
"step": 2328
},
{
"epoch": 0.17901044867854948,
"grad_norm": 3.8075759410858154,
"learning_rate": 1.9283958205285803e-05,
"loss": 1.4787,
"step": 2330
},
{
"epoch": 0.1791641057160418,
"grad_norm": 4.013290882110596,
"learning_rate": 1.9283343577135832e-05,
"loss": 1.5371,
"step": 2332
},
{
"epoch": 0.17931776275353412,
"grad_norm": 4.095331192016602,
"learning_rate": 1.9282728948985865e-05,
"loss": 1.6089,
"step": 2334
},
{
"epoch": 0.17947141979102643,
"grad_norm": 4.137665748596191,
"learning_rate": 1.9282114320835895e-05,
"loss": 1.6971,
"step": 2336
},
{
"epoch": 0.17962507682851875,
"grad_norm": 4.847195625305176,
"learning_rate": 1.9281499692685928e-05,
"loss": 1.365,
"step": 2338
},
{
"epoch": 0.17977873386601106,
"grad_norm": 4.068114280700684,
"learning_rate": 1.9280885064535958e-05,
"loss": 1.7029,
"step": 2340
},
{
"epoch": 0.17993239090350338,
"grad_norm": 4.104188442230225,
"learning_rate": 1.9280270436385988e-05,
"loss": 1.636,
"step": 2342
},
{
"epoch": 0.1800860479409957,
"grad_norm": 4.033984661102295,
"learning_rate": 1.927965580823602e-05,
"loss": 1.5451,
"step": 2344
},
{
"epoch": 0.180239704978488,
"grad_norm": 4.00771951675415,
"learning_rate": 1.927904118008605e-05,
"loss": 1.5162,
"step": 2346
},
{
"epoch": 0.18039336201598033,
"grad_norm": 4.097219467163086,
"learning_rate": 1.927842655193608e-05,
"loss": 1.6558,
"step": 2348
},
{
"epoch": 0.18054701905347265,
"grad_norm": 4.354104042053223,
"learning_rate": 1.9277811923786113e-05,
"loss": 1.61,
"step": 2350
},
{
"epoch": 0.18070067609096496,
"grad_norm": 4.535645484924316,
"learning_rate": 1.927719729563614e-05,
"loss": 1.6289,
"step": 2352
},
{
"epoch": 0.18085433312845728,
"grad_norm": 4.078785419464111,
"learning_rate": 1.9276582667486172e-05,
"loss": 1.7284,
"step": 2354
},
{
"epoch": 0.1810079901659496,
"grad_norm": 4.275857448577881,
"learning_rate": 1.9275968039336202e-05,
"loss": 1.5461,
"step": 2356
},
{
"epoch": 0.1811616472034419,
"grad_norm": 4.156821250915527,
"learning_rate": 1.9275353411186232e-05,
"loss": 1.5956,
"step": 2358
},
{
"epoch": 0.18131530424093423,
"grad_norm": 3.6036367416381836,
"learning_rate": 1.9274738783036265e-05,
"loss": 1.5949,
"step": 2360
},
{
"epoch": 0.18146896127842654,
"grad_norm": 4.079240798950195,
"learning_rate": 1.9274124154886295e-05,
"loss": 1.5671,
"step": 2362
},
{
"epoch": 0.18162261831591886,
"grad_norm": 4.024125576019287,
"learning_rate": 1.9273509526736328e-05,
"loss": 1.4904,
"step": 2364
},
{
"epoch": 0.18177627535341118,
"grad_norm": 3.7651987075805664,
"learning_rate": 1.9272894898586357e-05,
"loss": 1.5179,
"step": 2366
},
{
"epoch": 0.1819299323909035,
"grad_norm": 3.8718831539154053,
"learning_rate": 1.9272280270436387e-05,
"loss": 1.4699,
"step": 2368
},
{
"epoch": 0.1820835894283958,
"grad_norm": 4.548869609832764,
"learning_rate": 1.927166564228642e-05,
"loss": 1.6476,
"step": 2370
},
{
"epoch": 0.18223724646588813,
"grad_norm": 4.528201580047607,
"learning_rate": 1.927105101413645e-05,
"loss": 1.6755,
"step": 2372
},
{
"epoch": 0.18239090350338044,
"grad_norm": 4.0802388191223145,
"learning_rate": 1.927043638598648e-05,
"loss": 1.5722,
"step": 2374
},
{
"epoch": 0.18254456054087279,
"grad_norm": 4.145775318145752,
"learning_rate": 1.9269821757836513e-05,
"loss": 1.5816,
"step": 2376
},
{
"epoch": 0.1826982175783651,
"grad_norm": 3.925696611404419,
"learning_rate": 1.926920712968654e-05,
"loss": 1.625,
"step": 2378
},
{
"epoch": 0.18285187461585742,
"grad_norm": 3.8499910831451416,
"learning_rate": 1.9268592501536572e-05,
"loss": 1.5522,
"step": 2380
},
{
"epoch": 0.18300553165334973,
"grad_norm": 4.174738883972168,
"learning_rate": 1.92679778733866e-05,
"loss": 1.561,
"step": 2382
},
{
"epoch": 0.18315918869084205,
"grad_norm": 3.801260232925415,
"learning_rate": 1.9267363245236635e-05,
"loss": 1.5159,
"step": 2384
},
{
"epoch": 0.18331284572833437,
"grad_norm": 4.0040202140808105,
"learning_rate": 1.9266748617086664e-05,
"loss": 1.6021,
"step": 2386
},
{
"epoch": 0.18346650276582668,
"grad_norm": 4.132852554321289,
"learning_rate": 1.9266133988936694e-05,
"loss": 1.6887,
"step": 2388
},
{
"epoch": 0.183620159803319,
"grad_norm": 3.7313075065612793,
"learning_rate": 1.9265519360786727e-05,
"loss": 1.4582,
"step": 2390
},
{
"epoch": 0.18377381684081132,
"grad_norm": 3.824453115463257,
"learning_rate": 1.9264904732636757e-05,
"loss": 1.4394,
"step": 2392
},
{
"epoch": 0.18392747387830363,
"grad_norm": 4.368152141571045,
"learning_rate": 1.9264290104486786e-05,
"loss": 1.5397,
"step": 2394
},
{
"epoch": 0.18408113091579595,
"grad_norm": 3.7525463104248047,
"learning_rate": 1.926367547633682e-05,
"loss": 1.6875,
"step": 2396
},
{
"epoch": 0.18423478795328826,
"grad_norm": 4.229045391082764,
"learning_rate": 1.9263060848186846e-05,
"loss": 1.6248,
"step": 2398
},
{
"epoch": 0.18438844499078058,
"grad_norm": 3.9596312046051025,
"learning_rate": 1.926244622003688e-05,
"loss": 1.5278,
"step": 2400
}
],
"logging_steps": 2,
"max_steps": 65080,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5320091651263693e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}