{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.18438844499078058, "eval_steps": 500, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015365703749231714, "grad_norm": 54.03105926513672, "learning_rate": 1.9999385371850035e-05, "loss": 5.8486, "step": 2 }, { "epoch": 0.00030731407498463427, "grad_norm": 60.26078414916992, "learning_rate": 1.999877074370006e-05, "loss": 5.5476, "step": 4 }, { "epoch": 0.00046097111247695143, "grad_norm": 172.71730041503906, "learning_rate": 1.9998156115550094e-05, "loss": 4.8213, "step": 6 }, { "epoch": 0.0006146281499692685, "grad_norm": 80.93025970458984, "learning_rate": 1.9997541487400124e-05, "loss": 4.8849, "step": 8 }, { "epoch": 0.0007682851874615857, "grad_norm": 118.46244049072266, "learning_rate": 1.9996926859250153e-05, "loss": 4.0143, "step": 10 }, { "epoch": 0.0009219422249539029, "grad_norm": 58.34870910644531, "learning_rate": 1.9996312231100187e-05, "loss": 4.0411, "step": 12 }, { "epoch": 0.0010755992624462201, "grad_norm": 35.50437927246094, "learning_rate": 1.9995697602950216e-05, "loss": 3.9182, "step": 14 }, { "epoch": 0.001229256299938537, "grad_norm": 60.75965118408203, "learning_rate": 1.999508297480025e-05, "loss": 3.7527, "step": 16 }, { "epoch": 0.0013829133374308542, "grad_norm": 53.328765869140625, "learning_rate": 1.999446834665028e-05, "loss": 3.5424, "step": 18 }, { "epoch": 0.0015365703749231714, "grad_norm": 40.73623275756836, "learning_rate": 1.999385371850031e-05, "loss": 3.7179, "step": 20 }, { "epoch": 0.0016902274124154886, "grad_norm": 15.511174201965332, "learning_rate": 1.9993239090350342e-05, "loss": 3.3177, "step": 22 }, { "epoch": 0.0018438844499078057, "grad_norm": 62.24359130859375, "learning_rate": 1.9992624462200368e-05, "loss": 3.5207, "step": 24 }, { "epoch": 0.001997541487400123, "grad_norm": 69.20399475097656, "learning_rate": 1.99920098340504e-05, "loss": 3.6051, "step": 26 }, { "epoch": 0.0021511985248924403, "grad_norm": 39.98881530761719, "learning_rate": 1.999139520590043e-05, "loss": 3.3193, "step": 28 }, { "epoch": 0.0023048555623847574, "grad_norm": 10.113142013549805, "learning_rate": 1.999078057775046e-05, "loss": 3.2187, "step": 30 }, { "epoch": 0.002458512599877074, "grad_norm": 28.31175422668457, "learning_rate": 1.9990165949600494e-05, "loss": 3.4458, "step": 32 }, { "epoch": 0.0026121696373693913, "grad_norm": 21.829612731933594, "learning_rate": 1.9989551321450523e-05, "loss": 3.5843, "step": 34 }, { "epoch": 0.0027658266748617085, "grad_norm": 18.00796127319336, "learning_rate": 1.9988936693300556e-05, "loss": 3.2827, "step": 36 }, { "epoch": 0.0029194837123540257, "grad_norm": 16.2840576171875, "learning_rate": 1.9988322065150586e-05, "loss": 3.2059, "step": 38 }, { "epoch": 0.003073140749846343, "grad_norm": 11.987384796142578, "learning_rate": 1.9987707437000616e-05, "loss": 3.1313, "step": 40 }, { "epoch": 0.00322679778733866, "grad_norm": 6.873617649078369, "learning_rate": 1.998709280885065e-05, "loss": 2.8568, "step": 42 }, { "epoch": 0.003380454824830977, "grad_norm": 6.603146076202393, "learning_rate": 1.998647818070068e-05, "loss": 3.0781, "step": 44 }, { "epoch": 0.0035341118623232943, "grad_norm": 25.308164596557617, "learning_rate": 1.9985863552550708e-05, "loss": 3.0764, "step": 46 }, { "epoch": 0.0036877688998156115, "grad_norm": 15.176654815673828, "learning_rate": 1.998524892440074e-05, "loss": 3.0356, "step": 48 }, { "epoch": 0.0038414259373079286, "grad_norm": 7.444390773773193, "learning_rate": 1.9984634296250767e-05, "loss": 2.9488, "step": 50 }, { "epoch": 0.003995082974800246, "grad_norm": 18.565139770507812, "learning_rate": 1.99840196681008e-05, "loss": 2.7179, "step": 52 }, { "epoch": 0.004148740012292563, "grad_norm": 10.658416748046875, "learning_rate": 1.998340503995083e-05, "loss": 2.716, "step": 54 }, { "epoch": 0.0043023970497848806, "grad_norm": 9.682657241821289, "learning_rate": 1.9982790411800863e-05, "loss": 2.9189, "step": 56 }, { "epoch": 0.004456054087277198, "grad_norm": 20.967639923095703, "learning_rate": 1.9982175783650893e-05, "loss": 2.8078, "step": 58 }, { "epoch": 0.004609711124769515, "grad_norm": 16.931556701660156, "learning_rate": 1.9981561155500923e-05, "loss": 2.837, "step": 60 }, { "epoch": 0.004763368162261831, "grad_norm": 12.055686950683594, "learning_rate": 1.9980946527350956e-05, "loss": 2.7392, "step": 62 }, { "epoch": 0.004917025199754148, "grad_norm": 7.959167957305908, "learning_rate": 1.9980331899200985e-05, "loss": 2.8915, "step": 64 }, { "epoch": 0.0050706822372464655, "grad_norm": 9.24318790435791, "learning_rate": 1.9979717271051015e-05, "loss": 2.5171, "step": 66 }, { "epoch": 0.005224339274738783, "grad_norm": 20.02304458618164, "learning_rate": 1.9979102642901048e-05, "loss": 2.7947, "step": 68 }, { "epoch": 0.0053779963122311, "grad_norm": 8.09688663482666, "learning_rate": 1.9978488014751078e-05, "loss": 2.7323, "step": 70 }, { "epoch": 0.005531653349723417, "grad_norm": 8.636987686157227, "learning_rate": 1.9977873386601108e-05, "loss": 2.6309, "step": 72 }, { "epoch": 0.005685310387215734, "grad_norm": 6.815808296203613, "learning_rate": 1.997725875845114e-05, "loss": 2.4741, "step": 74 }, { "epoch": 0.005838967424708051, "grad_norm": 7.532662868499756, "learning_rate": 1.997664413030117e-05, "loss": 2.5875, "step": 76 }, { "epoch": 0.0059926244622003685, "grad_norm": 6.733164310455322, "learning_rate": 1.99760295021512e-05, "loss": 2.6103, "step": 78 }, { "epoch": 0.006146281499692686, "grad_norm": 6.442116737365723, "learning_rate": 1.997541487400123e-05, "loss": 2.6208, "step": 80 }, { "epoch": 0.006299938537185003, "grad_norm": 6.882765769958496, "learning_rate": 1.9974800245851263e-05, "loss": 2.5554, "step": 82 }, { "epoch": 0.00645359557467732, "grad_norm": 6.64527702331543, "learning_rate": 1.9974185617701292e-05, "loss": 2.5667, "step": 84 }, { "epoch": 0.006607252612169637, "grad_norm": 7.69775390625, "learning_rate": 1.9973570989551322e-05, "loss": 2.6117, "step": 86 }, { "epoch": 0.006760909649661954, "grad_norm": 7.077218532562256, "learning_rate": 1.9972956361401355e-05, "loss": 2.4501, "step": 88 }, { "epoch": 0.0069145666871542714, "grad_norm": 5.539189338684082, "learning_rate": 1.9972341733251385e-05, "loss": 2.4775, "step": 90 }, { "epoch": 0.007068223724646589, "grad_norm": 6.602914333343506, "learning_rate": 1.9971727105101415e-05, "loss": 2.3944, "step": 92 }, { "epoch": 0.007221880762138906, "grad_norm": 5.995626449584961, "learning_rate": 1.9971112476951448e-05, "loss": 2.4826, "step": 94 }, { "epoch": 0.007375537799631223, "grad_norm": 6.836587429046631, "learning_rate": 1.9970497848801477e-05, "loss": 2.468, "step": 96 }, { "epoch": 0.00752919483712354, "grad_norm": 6.4697651863098145, "learning_rate": 1.9969883220651507e-05, "loss": 2.2833, "step": 98 }, { "epoch": 0.007682851874615857, "grad_norm": 8.081903457641602, "learning_rate": 1.996926859250154e-05, "loss": 2.6544, "step": 100 }, { "epoch": 0.007836508912108174, "grad_norm": 6.688724517822266, "learning_rate": 1.996865396435157e-05, "loss": 2.55, "step": 102 }, { "epoch": 0.007990165949600492, "grad_norm": 6.878283977508545, "learning_rate": 1.99680393362016e-05, "loss": 2.2658, "step": 104 }, { "epoch": 0.008143822987092809, "grad_norm": 7.079164505004883, "learning_rate": 1.996742470805163e-05, "loss": 2.4793, "step": 106 }, { "epoch": 0.008297480024585127, "grad_norm": 6.391737461090088, "learning_rate": 1.9966810079901662e-05, "loss": 2.4154, "step": 108 }, { "epoch": 0.008451137062077443, "grad_norm": 7.503854274749756, "learning_rate": 1.9966195451751692e-05, "loss": 2.3137, "step": 110 }, { "epoch": 0.008604794099569761, "grad_norm": 6.10397481918335, "learning_rate": 1.996558082360172e-05, "loss": 2.4306, "step": 112 }, { "epoch": 0.008758451137062077, "grad_norm": 6.1603264808654785, "learning_rate": 1.9964966195451755e-05, "loss": 2.2859, "step": 114 }, { "epoch": 0.008912108174554395, "grad_norm": 7.389194011688232, "learning_rate": 1.9964351567301784e-05, "loss": 2.3876, "step": 116 }, { "epoch": 0.009065765212046712, "grad_norm": 6.887446403503418, "learning_rate": 1.9963736939151814e-05, "loss": 2.5139, "step": 118 }, { "epoch": 0.00921942224953903, "grad_norm": 7.2416768074035645, "learning_rate": 1.9963122311001847e-05, "loss": 2.5653, "step": 120 }, { "epoch": 0.009373079287031346, "grad_norm": 7.454037189483643, "learning_rate": 1.9962507682851877e-05, "loss": 2.3707, "step": 122 }, { "epoch": 0.009526736324523662, "grad_norm": 6.9176483154296875, "learning_rate": 1.9961893054701906e-05, "loss": 2.4158, "step": 124 }, { "epoch": 0.00968039336201598, "grad_norm": 7.838490009307861, "learning_rate": 1.9961278426551936e-05, "loss": 2.45, "step": 126 }, { "epoch": 0.009834050399508297, "grad_norm": 6.680061340332031, "learning_rate": 1.996066379840197e-05, "loss": 2.1975, "step": 128 }, { "epoch": 0.009987707437000615, "grad_norm": 7.567671775817871, "learning_rate": 1.9960049170252e-05, "loss": 2.1892, "step": 130 }, { "epoch": 0.010141364474492931, "grad_norm": 6.0987396240234375, "learning_rate": 1.995943454210203e-05, "loss": 2.2957, "step": 132 }, { "epoch": 0.010295021511985249, "grad_norm": 6.579552173614502, "learning_rate": 1.995881991395206e-05, "loss": 2.4326, "step": 134 }, { "epoch": 0.010448678549477565, "grad_norm": 7.131938934326172, "learning_rate": 1.995820528580209e-05, "loss": 2.4767, "step": 136 }, { "epoch": 0.010602335586969883, "grad_norm": 6.883522033691406, "learning_rate": 1.995759065765212e-05, "loss": 2.3893, "step": 138 }, { "epoch": 0.0107559926244622, "grad_norm": 5.52859354019165, "learning_rate": 1.9956976029502154e-05, "loss": 2.1851, "step": 140 }, { "epoch": 0.010909649661954518, "grad_norm": 6.14478874206543, "learning_rate": 1.9956361401352184e-05, "loss": 2.2019, "step": 142 }, { "epoch": 0.011063306699446834, "grad_norm": 6.477922439575195, "learning_rate": 1.9955746773202213e-05, "loss": 2.2746, "step": 144 }, { "epoch": 0.011216963736939152, "grad_norm": 7.661022186279297, "learning_rate": 1.9955132145052246e-05, "loss": 2.3499, "step": 146 }, { "epoch": 0.011370620774431468, "grad_norm": 7.439324378967285, "learning_rate": 1.9954517516902276e-05, "loss": 2.1848, "step": 148 }, { "epoch": 0.011524277811923786, "grad_norm": 7.070183753967285, "learning_rate": 1.9953902888752306e-05, "loss": 2.2816, "step": 150 }, { "epoch": 0.011677934849416103, "grad_norm": 5.912161350250244, "learning_rate": 1.9953288260602336e-05, "loss": 2.3688, "step": 152 }, { "epoch": 0.01183159188690842, "grad_norm": 6.827462673187256, "learning_rate": 1.995267363245237e-05, "loss": 2.3945, "step": 154 }, { "epoch": 0.011985248924400737, "grad_norm": 5.7712082862854, "learning_rate": 1.9952059004302398e-05, "loss": 2.1618, "step": 156 }, { "epoch": 0.012138905961893055, "grad_norm": 5.9169020652771, "learning_rate": 1.9951444376152428e-05, "loss": 2.1781, "step": 158 }, { "epoch": 0.012292562999385371, "grad_norm": 5.994232177734375, "learning_rate": 1.995082974800246e-05, "loss": 2.1474, "step": 160 }, { "epoch": 0.01244622003687769, "grad_norm": 6.10550594329834, "learning_rate": 1.995021511985249e-05, "loss": 2.2227, "step": 162 }, { "epoch": 0.012599877074370006, "grad_norm": 7.107779502868652, "learning_rate": 1.994960049170252e-05, "loss": 2.334, "step": 164 }, { "epoch": 0.012753534111862324, "grad_norm": 4.990610122680664, "learning_rate": 1.9948985863552553e-05, "loss": 2.2313, "step": 166 }, { "epoch": 0.01290719114935464, "grad_norm": 8.93641185760498, "learning_rate": 1.9948371235402583e-05, "loss": 2.1062, "step": 168 }, { "epoch": 0.013060848186846958, "grad_norm": 5.389564037322998, "learning_rate": 1.9947756607252613e-05, "loss": 2.1729, "step": 170 }, { "epoch": 0.013214505224339274, "grad_norm": 5.347591400146484, "learning_rate": 1.9947141979102646e-05, "loss": 2.0474, "step": 172 }, { "epoch": 0.013368162261831592, "grad_norm": 6.475700378417969, "learning_rate": 1.9946527350952676e-05, "loss": 2.1939, "step": 174 }, { "epoch": 0.013521819299323909, "grad_norm": 6.144668102264404, "learning_rate": 1.9945912722802705e-05, "loss": 2.217, "step": 176 }, { "epoch": 0.013675476336816227, "grad_norm": 6.778875350952148, "learning_rate": 1.9945298094652735e-05, "loss": 2.139, "step": 178 }, { "epoch": 0.013829133374308543, "grad_norm": 7.560453414916992, "learning_rate": 1.9944683466502768e-05, "loss": 2.1931, "step": 180 }, { "epoch": 0.013982790411800861, "grad_norm": 5.251035690307617, "learning_rate": 1.9944068838352798e-05, "loss": 2.1596, "step": 182 }, { "epoch": 0.014136447449293177, "grad_norm": 5.9772162437438965, "learning_rate": 1.9943454210202827e-05, "loss": 2.232, "step": 184 }, { "epoch": 0.014290104486785495, "grad_norm": 7.088453769683838, "learning_rate": 1.994283958205286e-05, "loss": 2.3468, "step": 186 }, { "epoch": 0.014443761524277812, "grad_norm": 6.209799289703369, "learning_rate": 1.994222495390289e-05, "loss": 2.3158, "step": 188 }, { "epoch": 0.01459741856177013, "grad_norm": 6.048709392547607, "learning_rate": 1.994161032575292e-05, "loss": 1.9986, "step": 190 }, { "epoch": 0.014751075599262446, "grad_norm": 5.292468070983887, "learning_rate": 1.9940995697602953e-05, "loss": 2.1564, "step": 192 }, { "epoch": 0.014904732636754764, "grad_norm": 6.045801639556885, "learning_rate": 1.9940381069452983e-05, "loss": 2.2064, "step": 194 }, { "epoch": 0.01505838967424708, "grad_norm": 6.204288482666016, "learning_rate": 1.9939766441303012e-05, "loss": 2.2869, "step": 196 }, { "epoch": 0.015212046711739398, "grad_norm": 6.579591274261475, "learning_rate": 1.9939151813153045e-05, "loss": 2.1494, "step": 198 }, { "epoch": 0.015365703749231715, "grad_norm": 6.20919942855835, "learning_rate": 1.9938537185003075e-05, "loss": 2.0245, "step": 200 }, { "epoch": 0.015519360786724033, "grad_norm": 6.129773139953613, "learning_rate": 1.9937922556853108e-05, "loss": 2.0684, "step": 202 }, { "epoch": 0.01567301782421635, "grad_norm": 7.500084400177002, "learning_rate": 1.9937307928703134e-05, "loss": 2.1818, "step": 204 }, { "epoch": 0.015826674861708665, "grad_norm": 6.189898490905762, "learning_rate": 1.9936693300553167e-05, "loss": 2.1377, "step": 206 }, { "epoch": 0.015980331899200985, "grad_norm": 5.788628101348877, "learning_rate": 1.9936078672403197e-05, "loss": 2.1195, "step": 208 }, { "epoch": 0.0161339889366933, "grad_norm": 6.9061055183410645, "learning_rate": 1.9935464044253227e-05, "loss": 2.1815, "step": 210 }, { "epoch": 0.016287645974185617, "grad_norm": 7.366201877593994, "learning_rate": 1.993484941610326e-05, "loss": 2.2884, "step": 212 }, { "epoch": 0.016441303011677934, "grad_norm": 5.979190826416016, "learning_rate": 1.993423478795329e-05, "loss": 2.3251, "step": 214 }, { "epoch": 0.016594960049170254, "grad_norm": 6.170030117034912, "learning_rate": 1.993362015980332e-05, "loss": 2.2108, "step": 216 }, { "epoch": 0.01674861708666257, "grad_norm": 6.819857120513916, "learning_rate": 1.9933005531653352e-05, "loss": 2.2231, "step": 218 }, { "epoch": 0.016902274124154886, "grad_norm": 7.386382579803467, "learning_rate": 1.9932390903503382e-05, "loss": 2.1647, "step": 220 }, { "epoch": 0.017055931161647202, "grad_norm": 5.797331809997559, "learning_rate": 1.9931776275353415e-05, "loss": 2.2092, "step": 222 }, { "epoch": 0.017209588199139522, "grad_norm": 5.605097770690918, "learning_rate": 1.993116164720344e-05, "loss": 2.2266, "step": 224 }, { "epoch": 0.01736324523663184, "grad_norm": 5.865804672241211, "learning_rate": 1.9930547019053474e-05, "loss": 2.0874, "step": 226 }, { "epoch": 0.017516902274124155, "grad_norm": 7.769106864929199, "learning_rate": 1.9929932390903508e-05, "loss": 2.1032, "step": 228 }, { "epoch": 0.01767055931161647, "grad_norm": 6.673518180847168, "learning_rate": 1.9929317762753534e-05, "loss": 2.0957, "step": 230 }, { "epoch": 0.01782421634910879, "grad_norm": 6.331215858459473, "learning_rate": 1.9928703134603567e-05, "loss": 2.143, "step": 232 }, { "epoch": 0.017977873386601107, "grad_norm": 5.792760848999023, "learning_rate": 1.9928088506453597e-05, "loss": 2.0157, "step": 234 }, { "epoch": 0.018131530424093423, "grad_norm": 6.460434436798096, "learning_rate": 1.9927473878303626e-05, "loss": 2.0018, "step": 236 }, { "epoch": 0.01828518746158574, "grad_norm": 6.339091777801514, "learning_rate": 1.992685925015366e-05, "loss": 2.2635, "step": 238 }, { "epoch": 0.01843884449907806, "grad_norm": 5.446582317352295, "learning_rate": 1.992624462200369e-05, "loss": 2.036, "step": 240 }, { "epoch": 0.018592501536570376, "grad_norm": 6.4099273681640625, "learning_rate": 1.9925629993853722e-05, "loss": 2.0031, "step": 242 }, { "epoch": 0.018746158574062692, "grad_norm": 7.307748794555664, "learning_rate": 1.9925015365703752e-05, "loss": 2.0746, "step": 244 }, { "epoch": 0.01889981561155501, "grad_norm": 5.755754470825195, "learning_rate": 1.992440073755378e-05, "loss": 2.1756, "step": 246 }, { "epoch": 0.019053472649047325, "grad_norm": 5.9470744132995605, "learning_rate": 1.9923786109403815e-05, "loss": 2.1579, "step": 248 }, { "epoch": 0.019207129686539644, "grad_norm": 5.4200873374938965, "learning_rate": 1.992317148125384e-05, "loss": 2.1868, "step": 250 }, { "epoch": 0.01936078672403196, "grad_norm": 6.8247175216674805, "learning_rate": 1.9922556853103874e-05, "loss": 2.1525, "step": 252 }, { "epoch": 0.019514443761524277, "grad_norm": 6.334802627563477, "learning_rate": 1.9921942224953904e-05, "loss": 2.1261, "step": 254 }, { "epoch": 0.019668100799016593, "grad_norm": 7.025927543640137, "learning_rate": 1.9921327596803933e-05, "loss": 2.2474, "step": 256 }, { "epoch": 0.019821757836508913, "grad_norm": 6.594686508178711, "learning_rate": 1.9920712968653966e-05, "loss": 1.9885, "step": 258 }, { "epoch": 0.01997541487400123, "grad_norm": 6.713582992553711, "learning_rate": 1.9920098340503996e-05, "loss": 2.3728, "step": 260 }, { "epoch": 0.020129071911493546, "grad_norm": 5.78023099899292, "learning_rate": 1.9919483712354026e-05, "loss": 2.0887, "step": 262 }, { "epoch": 0.020282728948985862, "grad_norm": 5.462549686431885, "learning_rate": 1.991886908420406e-05, "loss": 2.0673, "step": 264 }, { "epoch": 0.020436385986478182, "grad_norm": 6.792922019958496, "learning_rate": 1.991825445605409e-05, "loss": 2.177, "step": 266 }, { "epoch": 0.020590043023970498, "grad_norm": 6.281880855560303, "learning_rate": 1.991763982790412e-05, "loss": 1.9686, "step": 268 }, { "epoch": 0.020743700061462814, "grad_norm": 5.745354175567627, "learning_rate": 1.991702519975415e-05, "loss": 2.1414, "step": 270 }, { "epoch": 0.02089735709895513, "grad_norm": 6.046512126922607, "learning_rate": 1.991641057160418e-05, "loss": 2.1541, "step": 272 }, { "epoch": 0.02105101413644745, "grad_norm": 7.513150691986084, "learning_rate": 1.9915795943454214e-05, "loss": 2.1383, "step": 274 }, { "epoch": 0.021204671173939767, "grad_norm": 8.351797103881836, "learning_rate": 1.991518131530424e-05, "loss": 2.209, "step": 276 }, { "epoch": 0.021358328211432083, "grad_norm": 6.781789302825928, "learning_rate": 1.9914566687154273e-05, "loss": 1.9494, "step": 278 }, { "epoch": 0.0215119852489244, "grad_norm": 5.912288188934326, "learning_rate": 1.9913952059004303e-05, "loss": 1.9871, "step": 280 }, { "epoch": 0.02166564228641672, "grad_norm": 5.441234111785889, "learning_rate": 1.9913337430854333e-05, "loss": 2.118, "step": 282 }, { "epoch": 0.021819299323909035, "grad_norm": 6.041057109832764, "learning_rate": 1.9912722802704366e-05, "loss": 2.0064, "step": 284 }, { "epoch": 0.02197295636140135, "grad_norm": 6.26601505279541, "learning_rate": 1.9912108174554395e-05, "loss": 1.9593, "step": 286 }, { "epoch": 0.022126613398893668, "grad_norm": 6.992424488067627, "learning_rate": 1.991149354640443e-05, "loss": 2.1785, "step": 288 }, { "epoch": 0.022280270436385988, "grad_norm": 7.048946857452393, "learning_rate": 1.9910878918254458e-05, "loss": 2.0809, "step": 290 }, { "epoch": 0.022433927473878304, "grad_norm": 7.00367546081543, "learning_rate": 1.9910264290104488e-05, "loss": 2.0688, "step": 292 }, { "epoch": 0.02258758451137062, "grad_norm": 6.326030731201172, "learning_rate": 1.990964966195452e-05, "loss": 2.1279, "step": 294 }, { "epoch": 0.022741241548862937, "grad_norm": 5.886343002319336, "learning_rate": 1.990903503380455e-05, "loss": 1.9146, "step": 296 }, { "epoch": 0.022894898586355256, "grad_norm": 6.407416820526123, "learning_rate": 1.990842040565458e-05, "loss": 2.073, "step": 298 }, { "epoch": 0.023048555623847573, "grad_norm": 5.35817289352417, "learning_rate": 1.9907805777504613e-05, "loss": 2.064, "step": 300 }, { "epoch": 0.02320221266133989, "grad_norm": 5.71148157119751, "learning_rate": 1.990719114935464e-05, "loss": 2.2207, "step": 302 }, { "epoch": 0.023355869698832205, "grad_norm": 7.2422051429748535, "learning_rate": 1.9906576521204673e-05, "loss": 2.1518, "step": 304 }, { "epoch": 0.023509526736324525, "grad_norm": 7.267468452453613, "learning_rate": 1.9905961893054702e-05, "loss": 2.0082, "step": 306 }, { "epoch": 0.02366318377381684, "grad_norm": 6.504114627838135, "learning_rate": 1.9905347264904736e-05, "loss": 1.9722, "step": 308 }, { "epoch": 0.023816840811309158, "grad_norm": 7.074812889099121, "learning_rate": 1.9904732636754765e-05, "loss": 2.1789, "step": 310 }, { "epoch": 0.023970497848801474, "grad_norm": 6.774876117706299, "learning_rate": 1.9904118008604795e-05, "loss": 2.219, "step": 312 }, { "epoch": 0.024124154886293794, "grad_norm": 5.666469097137451, "learning_rate": 1.9903503380454828e-05, "loss": 1.8294, "step": 314 }, { "epoch": 0.02427781192378611, "grad_norm": 6.548127174377441, "learning_rate": 1.9902888752304858e-05, "loss": 2.0859, "step": 316 }, { "epoch": 0.024431468961278426, "grad_norm": 5.174642562866211, "learning_rate": 1.9902274124154887e-05, "loss": 1.989, "step": 318 }, { "epoch": 0.024585125998770743, "grad_norm": 5.891490936279297, "learning_rate": 1.990165949600492e-05, "loss": 2.0776, "step": 320 }, { "epoch": 0.024738783036263062, "grad_norm": 5.7647504806518555, "learning_rate": 1.9901044867854947e-05, "loss": 1.9681, "step": 322 }, { "epoch": 0.02489244007375538, "grad_norm": 5.61868143081665, "learning_rate": 1.990043023970498e-05, "loss": 1.8923, "step": 324 }, { "epoch": 0.025046097111247695, "grad_norm": 7.358055114746094, "learning_rate": 1.9899815611555013e-05, "loss": 1.9859, "step": 326 }, { "epoch": 0.02519975414874001, "grad_norm": 5.265814781188965, "learning_rate": 1.9899200983405043e-05, "loss": 1.939, "step": 328 }, { "epoch": 0.02535341118623233, "grad_norm": 9.370257377624512, "learning_rate": 1.9898586355255072e-05, "loss": 1.9538, "step": 330 }, { "epoch": 0.025507068223724647, "grad_norm": 7.504848003387451, "learning_rate": 1.9897971727105102e-05, "loss": 2.0802, "step": 332 }, { "epoch": 0.025660725261216964, "grad_norm": 5.975841045379639, "learning_rate": 1.9897357098955135e-05, "loss": 1.853, "step": 334 }, { "epoch": 0.02581438229870928, "grad_norm": 6.099985122680664, "learning_rate": 1.9896742470805165e-05, "loss": 2.0014, "step": 336 }, { "epoch": 0.0259680393362016, "grad_norm": 6.825030326843262, "learning_rate": 1.9896127842655194e-05, "loss": 1.9608, "step": 338 }, { "epoch": 0.026121696373693916, "grad_norm": 6.16441535949707, "learning_rate": 1.9895513214505227e-05, "loss": 2.0848, "step": 340 }, { "epoch": 0.026275353411186232, "grad_norm": 6.392692565917969, "learning_rate": 1.9894898586355257e-05, "loss": 1.9651, "step": 342 }, { "epoch": 0.02642901044867855, "grad_norm": 5.567882537841797, "learning_rate": 1.9894283958205287e-05, "loss": 2.1211, "step": 344 }, { "epoch": 0.026582667486170868, "grad_norm": 10.182480812072754, "learning_rate": 1.989366933005532e-05, "loss": 1.9924, "step": 346 }, { "epoch": 0.026736324523663185, "grad_norm": 5.608663558959961, "learning_rate": 1.989305470190535e-05, "loss": 1.936, "step": 348 }, { "epoch": 0.0268899815611555, "grad_norm": 5.883683204650879, "learning_rate": 1.989244007375538e-05, "loss": 2.0998, "step": 350 }, { "epoch": 0.027043638598647817, "grad_norm": 8.584614753723145, "learning_rate": 1.989182544560541e-05, "loss": 2.1266, "step": 352 }, { "epoch": 0.027197295636140137, "grad_norm": 6.828667640686035, "learning_rate": 1.9891210817455442e-05, "loss": 1.8693, "step": 354 }, { "epoch": 0.027350952673632453, "grad_norm": 7.0278449058532715, "learning_rate": 1.989059618930547e-05, "loss": 1.9258, "step": 356 }, { "epoch": 0.02750460971112477, "grad_norm": 5.643075466156006, "learning_rate": 1.98899815611555e-05, "loss": 2.071, "step": 358 }, { "epoch": 0.027658266748617086, "grad_norm": 6.685908794403076, "learning_rate": 1.9889366933005534e-05, "loss": 1.8658, "step": 360 }, { "epoch": 0.027811923786109402, "grad_norm": 5.766722679138184, "learning_rate": 1.9888752304855564e-05, "loss": 2.0608, "step": 362 }, { "epoch": 0.027965580823601722, "grad_norm": 6.229999542236328, "learning_rate": 1.9888137676705594e-05, "loss": 1.8478, "step": 364 }, { "epoch": 0.028119237861094038, "grad_norm": 14.6449613571167, "learning_rate": 1.9887523048555627e-05, "loss": 2.0233, "step": 366 }, { "epoch": 0.028272894898586354, "grad_norm": 5.458970069885254, "learning_rate": 1.9886908420405657e-05, "loss": 1.8742, "step": 368 }, { "epoch": 0.02842655193607867, "grad_norm": 9.708429336547852, "learning_rate": 1.9886293792255686e-05, "loss": 2.0435, "step": 370 }, { "epoch": 0.02858020897357099, "grad_norm": 8.345685958862305, "learning_rate": 1.988567916410572e-05, "loss": 1.9448, "step": 372 }, { "epoch": 0.028733866011063307, "grad_norm": 5.213901519775391, "learning_rate": 1.988506453595575e-05, "loss": 1.8902, "step": 374 }, { "epoch": 0.028887523048555623, "grad_norm": 6.842494964599609, "learning_rate": 1.988444990780578e-05, "loss": 2.0958, "step": 376 }, { "epoch": 0.02904118008604794, "grad_norm": 6.533809185028076, "learning_rate": 1.988383527965581e-05, "loss": 2.0073, "step": 378 }, { "epoch": 0.02919483712354026, "grad_norm": 5.832721710205078, "learning_rate": 1.988322065150584e-05, "loss": 1.9462, "step": 380 }, { "epoch": 0.029348494161032575, "grad_norm": 6.040827751159668, "learning_rate": 1.988260602335587e-05, "loss": 2.0111, "step": 382 }, { "epoch": 0.02950215119852489, "grad_norm": 6.082043647766113, "learning_rate": 1.98819913952059e-05, "loss": 2.0088, "step": 384 }, { "epoch": 0.029655808236017208, "grad_norm": 4.5363383293151855, "learning_rate": 1.9881376767055934e-05, "loss": 1.9059, "step": 386 }, { "epoch": 0.029809465273509528, "grad_norm": 4.769321918487549, "learning_rate": 1.9880762138905964e-05, "loss": 1.8781, "step": 388 }, { "epoch": 0.029963122311001844, "grad_norm": 6.1424994468688965, "learning_rate": 1.9880147510755993e-05, "loss": 2.0232, "step": 390 }, { "epoch": 0.03011677934849416, "grad_norm": 6.081544399261475, "learning_rate": 1.9879532882606026e-05, "loss": 1.8908, "step": 392 }, { "epoch": 0.030270436385986477, "grad_norm": 6.146285057067871, "learning_rate": 1.9878918254456056e-05, "loss": 2.0144, "step": 394 }, { "epoch": 0.030424093423478796, "grad_norm": 5.401834011077881, "learning_rate": 1.9878303626306086e-05, "loss": 1.8258, "step": 396 }, { "epoch": 0.030577750460971113, "grad_norm": 6.835007667541504, "learning_rate": 1.987768899815612e-05, "loss": 2.0515, "step": 398 }, { "epoch": 0.03073140749846343, "grad_norm": 7.031691551208496, "learning_rate": 1.987707437000615e-05, "loss": 2.0362, "step": 400 }, { "epoch": 0.030885064535955745, "grad_norm": 5.733877182006836, "learning_rate": 1.9876459741856178e-05, "loss": 2.099, "step": 402 }, { "epoch": 0.031038721573448065, "grad_norm": 6.152698516845703, "learning_rate": 1.9875845113706208e-05, "loss": 1.9393, "step": 404 }, { "epoch": 0.03119237861094038, "grad_norm": 5.859741687774658, "learning_rate": 1.987523048555624e-05, "loss": 1.995, "step": 406 }, { "epoch": 0.0313460356484327, "grad_norm": 6.834084510803223, "learning_rate": 1.987461585740627e-05, "loss": 2.0035, "step": 408 }, { "epoch": 0.031499692685925014, "grad_norm": 6.169229030609131, "learning_rate": 1.98740012292563e-05, "loss": 2.1276, "step": 410 }, { "epoch": 0.03165334972341733, "grad_norm": 5.270079135894775, "learning_rate": 1.9873386601106333e-05, "loss": 2.0073, "step": 412 }, { "epoch": 0.03180700676090965, "grad_norm": 5.952144145965576, "learning_rate": 1.9872771972956363e-05, "loss": 1.8817, "step": 414 }, { "epoch": 0.03196066379840197, "grad_norm": 6.3290019035339355, "learning_rate": 1.9872157344806393e-05, "loss": 1.9526, "step": 416 }, { "epoch": 0.032114320835894286, "grad_norm": 5.712306499481201, "learning_rate": 1.9871542716656426e-05, "loss": 2.0793, "step": 418 }, { "epoch": 0.0322679778733866, "grad_norm": 5.497166156768799, "learning_rate": 1.9870928088506455e-05, "loss": 1.9061, "step": 420 }, { "epoch": 0.03242163491087892, "grad_norm": 6.435750484466553, "learning_rate": 1.9870313460356485e-05, "loss": 1.8971, "step": 422 }, { "epoch": 0.032575291948371235, "grad_norm": 5.9519734382629395, "learning_rate": 1.9869698832206518e-05, "loss": 2.0295, "step": 424 }, { "epoch": 0.03272894898586355, "grad_norm": 6.359841823577881, "learning_rate": 1.9869084204056548e-05, "loss": 1.9017, "step": 426 }, { "epoch": 0.03288260602335587, "grad_norm": 6.195022106170654, "learning_rate": 1.9868469575906578e-05, "loss": 2.0663, "step": 428 }, { "epoch": 0.033036263060848184, "grad_norm": 5.500522613525391, "learning_rate": 1.9867854947756607e-05, "loss": 1.9694, "step": 430 }, { "epoch": 0.03318992009834051, "grad_norm": 7.16880464553833, "learning_rate": 1.986724031960664e-05, "loss": 1.918, "step": 432 }, { "epoch": 0.03334357713583282, "grad_norm": 6.0987348556518555, "learning_rate": 1.986662569145667e-05, "loss": 1.8705, "step": 434 }, { "epoch": 0.03349723417332514, "grad_norm": 6.8652753829956055, "learning_rate": 1.98660110633067e-05, "loss": 1.9383, "step": 436 }, { "epoch": 0.033650891210817456, "grad_norm": 5.421166896820068, "learning_rate": 1.9865396435156733e-05, "loss": 1.879, "step": 438 }, { "epoch": 0.03380454824830977, "grad_norm": 5.929842948913574, "learning_rate": 1.9864781807006762e-05, "loss": 1.7183, "step": 440 }, { "epoch": 0.03395820528580209, "grad_norm": 5.500015735626221, "learning_rate": 1.9864167178856792e-05, "loss": 1.9168, "step": 442 }, { "epoch": 0.034111862323294405, "grad_norm": 6.267481327056885, "learning_rate": 1.9863552550706825e-05, "loss": 1.8126, "step": 444 }, { "epoch": 0.03426551936078672, "grad_norm": 6.300197124481201, "learning_rate": 1.9862937922556855e-05, "loss": 2.0519, "step": 446 }, { "epoch": 0.034419176398279044, "grad_norm": 8.094818115234375, "learning_rate": 1.9862323294406885e-05, "loss": 1.8122, "step": 448 }, { "epoch": 0.03457283343577136, "grad_norm": 5.738587379455566, "learning_rate": 1.9861708666256914e-05, "loss": 1.8155, "step": 450 }, { "epoch": 0.03472649047326368, "grad_norm": 5.194686412811279, "learning_rate": 1.9861094038106947e-05, "loss": 1.9198, "step": 452 }, { "epoch": 0.03488014751075599, "grad_norm": 4.97174072265625, "learning_rate": 1.9860479409956977e-05, "loss": 1.9955, "step": 454 }, { "epoch": 0.03503380454824831, "grad_norm": 5.790378570556641, "learning_rate": 1.9859864781807007e-05, "loss": 1.8218, "step": 456 }, { "epoch": 0.035187461585740626, "grad_norm": 5.287135124206543, "learning_rate": 1.985925015365704e-05, "loss": 1.9169, "step": 458 }, { "epoch": 0.03534111862323294, "grad_norm": 8.098136901855469, "learning_rate": 1.985863552550707e-05, "loss": 1.9039, "step": 460 }, { "epoch": 0.03549477566072526, "grad_norm": 6.957726955413818, "learning_rate": 1.98580208973571e-05, "loss": 2.0036, "step": 462 }, { "epoch": 0.03564843269821758, "grad_norm": 4.368841171264648, "learning_rate": 1.9857406269207132e-05, "loss": 1.8883, "step": 464 }, { "epoch": 0.0358020897357099, "grad_norm": 5.95673131942749, "learning_rate": 1.9856791641057162e-05, "loss": 1.8977, "step": 466 }, { "epoch": 0.035955746773202214, "grad_norm": 7.365513324737549, "learning_rate": 1.985617701290719e-05, "loss": 1.9865, "step": 468 }, { "epoch": 0.03610940381069453, "grad_norm": 5.386063098907471, "learning_rate": 1.9855562384757225e-05, "loss": 1.8164, "step": 470 }, { "epoch": 0.03626306084818685, "grad_norm": 6.155988693237305, "learning_rate": 1.9854947756607254e-05, "loss": 2.0083, "step": 472 }, { "epoch": 0.03641671788567916, "grad_norm": 6.110922336578369, "learning_rate": 1.9854333128457287e-05, "loss": 1.8688, "step": 474 }, { "epoch": 0.03657037492317148, "grad_norm": 5.692699909210205, "learning_rate": 1.9853718500307314e-05, "loss": 1.8501, "step": 476 }, { "epoch": 0.036724031960663796, "grad_norm": 6.044013977050781, "learning_rate": 1.9853103872157347e-05, "loss": 1.8486, "step": 478 }, { "epoch": 0.03687768899815612, "grad_norm": 6.102372169494629, "learning_rate": 1.9852489244007376e-05, "loss": 2.0873, "step": 480 }, { "epoch": 0.037031346035648435, "grad_norm": 5.4327239990234375, "learning_rate": 1.9851874615857406e-05, "loss": 1.8635, "step": 482 }, { "epoch": 0.03718500307314075, "grad_norm": 5.779347896575928, "learning_rate": 1.985125998770744e-05, "loss": 2.0413, "step": 484 }, { "epoch": 0.03733866011063307, "grad_norm": 5.000186920166016, "learning_rate": 1.985064535955747e-05, "loss": 2.0214, "step": 486 }, { "epoch": 0.037492317148125384, "grad_norm": 6.581515312194824, "learning_rate": 1.98500307314075e-05, "loss": 1.9141, "step": 488 }, { "epoch": 0.0376459741856177, "grad_norm": 6.037952423095703, "learning_rate": 1.984941610325753e-05, "loss": 1.9475, "step": 490 }, { "epoch": 0.03779963122311002, "grad_norm": 4.99038553237915, "learning_rate": 1.984880147510756e-05, "loss": 1.8296, "step": 492 }, { "epoch": 0.03795328826060233, "grad_norm": 5.351291656494141, "learning_rate": 1.9848186846957594e-05, "loss": 1.9845, "step": 494 }, { "epoch": 0.03810694529809465, "grad_norm": 6.249404430389404, "learning_rate": 1.9847572218807624e-05, "loss": 1.8824, "step": 496 }, { "epoch": 0.03826060233558697, "grad_norm": 5.460664749145508, "learning_rate": 1.9846957590657654e-05, "loss": 1.9348, "step": 498 }, { "epoch": 0.03841425937307929, "grad_norm": 5.399702072143555, "learning_rate": 1.9846342962507687e-05, "loss": 1.8646, "step": 500 }, { "epoch": 0.038567916410571605, "grad_norm": 6.00943660736084, "learning_rate": 1.9845728334357713e-05, "loss": 1.8804, "step": 502 }, { "epoch": 0.03872157344806392, "grad_norm": 6.057244300842285, "learning_rate": 1.9845113706207746e-05, "loss": 1.8876, "step": 504 }, { "epoch": 0.03887523048555624, "grad_norm": 5.178292274475098, "learning_rate": 1.9844499078057776e-05, "loss": 1.8163, "step": 506 }, { "epoch": 0.039028887523048554, "grad_norm": 5.430099964141846, "learning_rate": 1.9843884449907806e-05, "loss": 1.9221, "step": 508 }, { "epoch": 0.03918254456054087, "grad_norm": 5.2391791343688965, "learning_rate": 1.984326982175784e-05, "loss": 1.9671, "step": 510 }, { "epoch": 0.03933620159803319, "grad_norm": 6.54328727722168, "learning_rate": 1.9842655193607868e-05, "loss": 1.9916, "step": 512 }, { "epoch": 0.03948985863552551, "grad_norm": 5.6781134605407715, "learning_rate": 1.98420405654579e-05, "loss": 1.9, "step": 514 }, { "epoch": 0.039643515673017826, "grad_norm": 5.34329891204834, "learning_rate": 1.984142593730793e-05, "loss": 1.7433, "step": 516 }, { "epoch": 0.03979717271051014, "grad_norm": 6.142169952392578, "learning_rate": 1.984081130915796e-05, "loss": 1.8559, "step": 518 }, { "epoch": 0.03995082974800246, "grad_norm": 5.825856685638428, "learning_rate": 1.9840196681007994e-05, "loss": 1.7434, "step": 520 }, { "epoch": 0.040104486785494775, "grad_norm": 4.883429050445557, "learning_rate": 1.9839582052858023e-05, "loss": 1.8403, "step": 522 }, { "epoch": 0.04025814382298709, "grad_norm": 5.759003162384033, "learning_rate": 1.9838967424708053e-05, "loss": 1.872, "step": 524 }, { "epoch": 0.04041180086047941, "grad_norm": 5.845025539398193, "learning_rate": 1.9838352796558086e-05, "loss": 1.818, "step": 526 }, { "epoch": 0.040565457897971724, "grad_norm": 6.238631248474121, "learning_rate": 1.9837738168408113e-05, "loss": 1.9553, "step": 528 }, { "epoch": 0.04071911493546405, "grad_norm": 5.450825214385986, "learning_rate": 1.9837123540258146e-05, "loss": 1.9314, "step": 530 }, { "epoch": 0.040872771972956363, "grad_norm": 5.4290385246276855, "learning_rate": 1.9836508912108175e-05, "loss": 1.8316, "step": 532 }, { "epoch": 0.04102642901044868, "grad_norm": 6.243955612182617, "learning_rate": 1.9835894283958205e-05, "loss": 1.9605, "step": 534 }, { "epoch": 0.041180086047940996, "grad_norm": 5.5207672119140625, "learning_rate": 1.9835279655808238e-05, "loss": 1.9377, "step": 536 }, { "epoch": 0.04133374308543331, "grad_norm": 5.570779323577881, "learning_rate": 1.9834665027658268e-05, "loss": 1.9706, "step": 538 }, { "epoch": 0.04148740012292563, "grad_norm": 4.921234130859375, "learning_rate": 1.98340503995083e-05, "loss": 1.8666, "step": 540 }, { "epoch": 0.041641057160417945, "grad_norm": 6.029317855834961, "learning_rate": 1.983343577135833e-05, "loss": 1.8431, "step": 542 }, { "epoch": 0.04179471419791026, "grad_norm": 5.6237664222717285, "learning_rate": 1.983282114320836e-05, "loss": 2.0265, "step": 544 }, { "epoch": 0.041948371235402585, "grad_norm": 4.848809719085693, "learning_rate": 1.9832206515058393e-05, "loss": 1.851, "step": 546 }, { "epoch": 0.0421020282728949, "grad_norm": 6.06104040145874, "learning_rate": 1.983159188690842e-05, "loss": 1.9252, "step": 548 }, { "epoch": 0.04225568531038722, "grad_norm": 6.721662521362305, "learning_rate": 1.9830977258758453e-05, "loss": 1.9046, "step": 550 }, { "epoch": 0.04240934234787953, "grad_norm": 5.039158821105957, "learning_rate": 1.9830362630608482e-05, "loss": 1.9457, "step": 552 }, { "epoch": 0.04256299938537185, "grad_norm": 4.985758304595947, "learning_rate": 1.9829748002458512e-05, "loss": 1.7706, "step": 554 }, { "epoch": 0.042716656422864166, "grad_norm": 5.59445858001709, "learning_rate": 1.9829133374308545e-05, "loss": 1.9232, "step": 556 }, { "epoch": 0.04287031346035648, "grad_norm": 5.786518573760986, "learning_rate": 1.9828518746158575e-05, "loss": 1.9535, "step": 558 }, { "epoch": 0.0430239704978488, "grad_norm": 5.362064838409424, "learning_rate": 1.9827904118008608e-05, "loss": 1.774, "step": 560 }, { "epoch": 0.04317762753534112, "grad_norm": 6.807535171508789, "learning_rate": 1.9827289489858637e-05, "loss": 1.9963, "step": 562 }, { "epoch": 0.04333128457283344, "grad_norm": 4.927182197570801, "learning_rate": 1.9826674861708667e-05, "loss": 1.8839, "step": 564 }, { "epoch": 0.043484941610325754, "grad_norm": 7.077647686004639, "learning_rate": 1.98260602335587e-05, "loss": 1.8577, "step": 566 }, { "epoch": 0.04363859864781807, "grad_norm": 4.930956840515137, "learning_rate": 1.982544560540873e-05, "loss": 1.9032, "step": 568 }, { "epoch": 0.04379225568531039, "grad_norm": 5.537839889526367, "learning_rate": 1.982483097725876e-05, "loss": 1.8599, "step": 570 }, { "epoch": 0.0439459127228027, "grad_norm": 4.91294527053833, "learning_rate": 1.9824216349108793e-05, "loss": 1.8962, "step": 572 }, { "epoch": 0.04409956976029502, "grad_norm": 7.946929931640625, "learning_rate": 1.982360172095882e-05, "loss": 2.0401, "step": 574 }, { "epoch": 0.044253226797787336, "grad_norm": 5.566417217254639, "learning_rate": 1.9822987092808852e-05, "loss": 1.7317, "step": 576 }, { "epoch": 0.04440688383527966, "grad_norm": 6.196030616760254, "learning_rate": 1.9822372464658882e-05, "loss": 1.9818, "step": 578 }, { "epoch": 0.044560540872771975, "grad_norm": 5.8990888595581055, "learning_rate": 1.9821757836508915e-05, "loss": 1.9209, "step": 580 }, { "epoch": 0.04471419791026429, "grad_norm": 4.752439022064209, "learning_rate": 1.9821143208358944e-05, "loss": 1.8661, "step": 582 }, { "epoch": 0.04486785494775661, "grad_norm": 5.3692121505737305, "learning_rate": 1.9820528580208974e-05, "loss": 1.8574, "step": 584 }, { "epoch": 0.045021511985248924, "grad_norm": 4.94577169418335, "learning_rate": 1.9819913952059007e-05, "loss": 1.76, "step": 586 }, { "epoch": 0.04517516902274124, "grad_norm": 5.1533708572387695, "learning_rate": 1.9819299323909037e-05, "loss": 1.8634, "step": 588 }, { "epoch": 0.04532882606023356, "grad_norm": 5.460253715515137, "learning_rate": 1.9818684695759067e-05, "loss": 1.7615, "step": 590 }, { "epoch": 0.04548248309772587, "grad_norm": 6.106910705566406, "learning_rate": 1.98180700676091e-05, "loss": 1.8658, "step": 592 }, { "epoch": 0.045636140135218196, "grad_norm": 8.604896545410156, "learning_rate": 1.981745543945913e-05, "loss": 1.8234, "step": 594 }, { "epoch": 0.04578979717271051, "grad_norm": 5.533381938934326, "learning_rate": 1.981684081130916e-05, "loss": 1.8133, "step": 596 }, { "epoch": 0.04594345421020283, "grad_norm": 5.140172481536865, "learning_rate": 1.9816226183159192e-05, "loss": 1.7655, "step": 598 }, { "epoch": 0.046097111247695145, "grad_norm": 5.633389472961426, "learning_rate": 1.9815611555009222e-05, "loss": 1.8804, "step": 600 }, { "epoch": 0.04625076828518746, "grad_norm": 5.397654056549072, "learning_rate": 1.981499692685925e-05, "loss": 1.9422, "step": 602 }, { "epoch": 0.04640442532267978, "grad_norm": 5.916885852813721, "learning_rate": 1.981438229870928e-05, "loss": 1.9222, "step": 604 }, { "epoch": 0.046558082360172094, "grad_norm": 4.4198198318481445, "learning_rate": 1.9813767670559314e-05, "loss": 1.8088, "step": 606 }, { "epoch": 0.04671173939766441, "grad_norm": 6.035666465759277, "learning_rate": 1.9813153042409344e-05, "loss": 1.9505, "step": 608 }, { "epoch": 0.04686539643515673, "grad_norm": 5.293002605438232, "learning_rate": 1.9812538414259374e-05, "loss": 1.9354, "step": 610 }, { "epoch": 0.04701905347264905, "grad_norm": 5.066743850708008, "learning_rate": 1.9811923786109407e-05, "loss": 2.001, "step": 612 }, { "epoch": 0.047172710510141366, "grad_norm": 6.867171764373779, "learning_rate": 1.9811309157959436e-05, "loss": 1.86, "step": 614 }, { "epoch": 0.04732636754763368, "grad_norm": 4.908615589141846, "learning_rate": 1.9810694529809466e-05, "loss": 1.8855, "step": 616 }, { "epoch": 0.047480024585126, "grad_norm": 5.6588006019592285, "learning_rate": 1.98100799016595e-05, "loss": 1.8047, "step": 618 }, { "epoch": 0.047633681622618315, "grad_norm": 5.6555304527282715, "learning_rate": 1.980946527350953e-05, "loss": 1.7656, "step": 620 }, { "epoch": 0.04778733866011063, "grad_norm": 4.742602348327637, "learning_rate": 1.980885064535956e-05, "loss": 1.976, "step": 622 }, { "epoch": 0.04794099569760295, "grad_norm": 5.0910868644714355, "learning_rate": 1.980823601720959e-05, "loss": 1.8894, "step": 624 }, { "epoch": 0.048094652735095264, "grad_norm": 5.279669761657715, "learning_rate": 1.980762138905962e-05, "loss": 1.9323, "step": 626 }, { "epoch": 0.04824830977258759, "grad_norm": 5.603051662445068, "learning_rate": 1.980700676090965e-05, "loss": 1.9327, "step": 628 }, { "epoch": 0.048401966810079904, "grad_norm": 5.823456764221191, "learning_rate": 1.980639213275968e-05, "loss": 1.9087, "step": 630 }, { "epoch": 0.04855562384757222, "grad_norm": 4.226296424865723, "learning_rate": 1.9805777504609714e-05, "loss": 1.7298, "step": 632 }, { "epoch": 0.048709280885064536, "grad_norm": 4.537020683288574, "learning_rate": 1.9805162876459743e-05, "loss": 1.8588, "step": 634 }, { "epoch": 0.04886293792255685, "grad_norm": 5.843430519104004, "learning_rate": 1.9804548248309773e-05, "loss": 1.8581, "step": 636 }, { "epoch": 0.04901659496004917, "grad_norm": 5.234043598175049, "learning_rate": 1.9803933620159806e-05, "loss": 1.8016, "step": 638 }, { "epoch": 0.049170251997541485, "grad_norm": 6.091218948364258, "learning_rate": 1.9803318992009836e-05, "loss": 1.8419, "step": 640 }, { "epoch": 0.0493239090350338, "grad_norm": 5.473825454711914, "learning_rate": 1.9802704363859865e-05, "loss": 1.8742, "step": 642 }, { "epoch": 0.049477566072526125, "grad_norm": 5.018134117126465, "learning_rate": 1.98020897357099e-05, "loss": 1.9246, "step": 644 }, { "epoch": 0.04963122311001844, "grad_norm": 5.1250505447387695, "learning_rate": 1.9801475107559928e-05, "loss": 1.8988, "step": 646 }, { "epoch": 0.04978488014751076, "grad_norm": 5.310157299041748, "learning_rate": 1.9800860479409958e-05, "loss": 1.9718, "step": 648 }, { "epoch": 0.049938537185003073, "grad_norm": 5.5490570068359375, "learning_rate": 1.980024585125999e-05, "loss": 1.8779, "step": 650 }, { "epoch": 0.05009219422249539, "grad_norm": 5.242208480834961, "learning_rate": 1.979963122311002e-05, "loss": 1.8712, "step": 652 }, { "epoch": 0.050245851259987706, "grad_norm": 4.680446624755859, "learning_rate": 1.979901659496005e-05, "loss": 1.9475, "step": 654 }, { "epoch": 0.05039950829748002, "grad_norm": 12.400496482849121, "learning_rate": 1.979840196681008e-05, "loss": 1.9387, "step": 656 }, { "epoch": 0.05055316533497234, "grad_norm": 4.818700313568115, "learning_rate": 1.9797787338660113e-05, "loss": 1.7356, "step": 658 }, { "epoch": 0.05070682237246466, "grad_norm": 4.733686923980713, "learning_rate": 1.9797172710510143e-05, "loss": 1.7161, "step": 660 }, { "epoch": 0.05086047940995698, "grad_norm": 5.9219865798950195, "learning_rate": 1.9796558082360172e-05, "loss": 1.9821, "step": 662 }, { "epoch": 0.051014136447449294, "grad_norm": 4.954675197601318, "learning_rate": 1.9795943454210206e-05, "loss": 1.9392, "step": 664 }, { "epoch": 0.05116779348494161, "grad_norm": 4.482631206512451, "learning_rate": 1.9795328826060235e-05, "loss": 1.9687, "step": 666 }, { "epoch": 0.05132145052243393, "grad_norm": 6.749068737030029, "learning_rate": 1.9794714197910265e-05, "loss": 1.8242, "step": 668 }, { "epoch": 0.05147510755992624, "grad_norm": 4.532095909118652, "learning_rate": 1.9794099569760298e-05, "loss": 1.7596, "step": 670 }, { "epoch": 0.05162876459741856, "grad_norm": 5.727676868438721, "learning_rate": 1.9793484941610328e-05, "loss": 1.94, "step": 672 }, { "epoch": 0.051782421634910876, "grad_norm": 5.493950843811035, "learning_rate": 1.9792870313460357e-05, "loss": 1.9243, "step": 674 }, { "epoch": 0.0519360786724032, "grad_norm": 5.48468017578125, "learning_rate": 1.9792255685310387e-05, "loss": 1.7862, "step": 676 }, { "epoch": 0.052089735709895515, "grad_norm": 5.862773895263672, "learning_rate": 1.979164105716042e-05, "loss": 1.843, "step": 678 }, { "epoch": 0.05224339274738783, "grad_norm": 5.505096912384033, "learning_rate": 1.979102642901045e-05, "loss": 1.8366, "step": 680 }, { "epoch": 0.05239704978488015, "grad_norm": 5.697121620178223, "learning_rate": 1.979041180086048e-05, "loss": 1.9764, "step": 682 }, { "epoch": 0.052550706822372464, "grad_norm": 4.900547027587891, "learning_rate": 1.9789797172710513e-05, "loss": 1.9252, "step": 684 }, { "epoch": 0.05270436385986478, "grad_norm": 5.347836017608643, "learning_rate": 1.9789182544560542e-05, "loss": 1.8527, "step": 686 }, { "epoch": 0.0528580208973571, "grad_norm": 5.393474102020264, "learning_rate": 1.9788567916410572e-05, "loss": 1.8422, "step": 688 }, { "epoch": 0.05301167793484941, "grad_norm": 5.27833366394043, "learning_rate": 1.9787953288260605e-05, "loss": 1.8933, "step": 690 }, { "epoch": 0.053165334972341736, "grad_norm": 5.38336181640625, "learning_rate": 1.9787338660110635e-05, "loss": 1.9456, "step": 692 }, { "epoch": 0.05331899200983405, "grad_norm": 5.273176193237305, "learning_rate": 1.9786724031960664e-05, "loss": 1.8142, "step": 694 }, { "epoch": 0.05347264904732637, "grad_norm": 5.413751125335693, "learning_rate": 1.9786109403810697e-05, "loss": 1.7652, "step": 696 }, { "epoch": 0.053626306084818685, "grad_norm": 5.373195648193359, "learning_rate": 1.9785494775660727e-05, "loss": 1.883, "step": 698 }, { "epoch": 0.053779963122311, "grad_norm": 4.942586421966553, "learning_rate": 1.9784880147510757e-05, "loss": 1.76, "step": 700 }, { "epoch": 0.05393362015980332, "grad_norm": 5.6196980476379395, "learning_rate": 1.9784265519360786e-05, "loss": 1.7673, "step": 702 }, { "epoch": 0.054087277197295634, "grad_norm": 5.702764987945557, "learning_rate": 1.978365089121082e-05, "loss": 1.8437, "step": 704 }, { "epoch": 0.05424093423478795, "grad_norm": 4.99530553817749, "learning_rate": 1.978303626306085e-05, "loss": 1.8747, "step": 706 }, { "epoch": 0.054394591272280274, "grad_norm": 5.105679035186768, "learning_rate": 1.978242163491088e-05, "loss": 1.6632, "step": 708 }, { "epoch": 0.05454824830977259, "grad_norm": 4.710418701171875, "learning_rate": 1.9781807006760912e-05, "loss": 1.8736, "step": 710 }, { "epoch": 0.054701905347264906, "grad_norm": 4.792379856109619, "learning_rate": 1.978119237861094e-05, "loss": 1.8016, "step": 712 }, { "epoch": 0.05485556238475722, "grad_norm": 4.937024116516113, "learning_rate": 1.978057775046097e-05, "loss": 1.7436, "step": 714 }, { "epoch": 0.05500921942224954, "grad_norm": 5.5544867515563965, "learning_rate": 1.9779963122311004e-05, "loss": 1.926, "step": 716 }, { "epoch": 0.055162876459741855, "grad_norm": 6.484194278717041, "learning_rate": 1.9779348494161034e-05, "loss": 1.9102, "step": 718 }, { "epoch": 0.05531653349723417, "grad_norm": 5.408361434936523, "learning_rate": 1.9778733866011064e-05, "loss": 1.7786, "step": 720 }, { "epoch": 0.05547019053472649, "grad_norm": 5.705206394195557, "learning_rate": 1.9778119237861097e-05, "loss": 1.779, "step": 722 }, { "epoch": 0.055623847572218804, "grad_norm": 6.138594627380371, "learning_rate": 1.9777504609711127e-05, "loss": 1.6926, "step": 724 }, { "epoch": 0.05577750460971113, "grad_norm": 5.507882595062256, "learning_rate": 1.977688998156116e-05, "loss": 2.0119, "step": 726 }, { "epoch": 0.055931161647203444, "grad_norm": 5.1471710205078125, "learning_rate": 1.9776275353411186e-05, "loss": 1.7674, "step": 728 }, { "epoch": 0.05608481868469576, "grad_norm": 5.558322906494141, "learning_rate": 1.977566072526122e-05, "loss": 1.7262, "step": 730 }, { "epoch": 0.056238475722188076, "grad_norm": 5.859812259674072, "learning_rate": 1.977504609711125e-05, "loss": 2.0273, "step": 732 }, { "epoch": 0.05639213275968039, "grad_norm": 4.931456565856934, "learning_rate": 1.977443146896128e-05, "loss": 1.7008, "step": 734 }, { "epoch": 0.05654578979717271, "grad_norm": 4.835200786590576, "learning_rate": 1.977381684081131e-05, "loss": 1.7615, "step": 736 }, { "epoch": 0.056699446834665025, "grad_norm": 5.542105674743652, "learning_rate": 1.977320221266134e-05, "loss": 1.7991, "step": 738 }, { "epoch": 0.05685310387215734, "grad_norm": 5.737773895263672, "learning_rate": 1.977258758451137e-05, "loss": 1.7067, "step": 740 }, { "epoch": 0.057006760909649665, "grad_norm": 4.556394100189209, "learning_rate": 1.9771972956361404e-05, "loss": 1.8418, "step": 742 }, { "epoch": 0.05716041794714198, "grad_norm": 4.682400226593018, "learning_rate": 1.9771358328211434e-05, "loss": 1.7565, "step": 744 }, { "epoch": 0.0573140749846343, "grad_norm": 5.617753982543945, "learning_rate": 1.9770743700061467e-05, "loss": 1.8314, "step": 746 }, { "epoch": 0.057467732022126614, "grad_norm": 4.796401500701904, "learning_rate": 1.9770129071911496e-05, "loss": 1.6892, "step": 748 }, { "epoch": 0.05762138905961893, "grad_norm": 5.084446430206299, "learning_rate": 1.9769514443761526e-05, "loss": 1.6731, "step": 750 }, { "epoch": 0.057775046097111246, "grad_norm": 5.344216823577881, "learning_rate": 1.976889981561156e-05, "loss": 1.8096, "step": 752 }, { "epoch": 0.05792870313460356, "grad_norm": 4.87506103515625, "learning_rate": 1.9768285187461585e-05, "loss": 1.9015, "step": 754 }, { "epoch": 0.05808236017209588, "grad_norm": 5.019058704376221, "learning_rate": 1.976767055931162e-05, "loss": 1.9467, "step": 756 }, { "epoch": 0.0582360172095882, "grad_norm": 5.275008678436279, "learning_rate": 1.9767055931161648e-05, "loss": 1.6159, "step": 758 }, { "epoch": 0.05838967424708052, "grad_norm": 5.17955207824707, "learning_rate": 1.9766441303011678e-05, "loss": 1.6819, "step": 760 }, { "epoch": 0.058543331284572835, "grad_norm": 5.578658580780029, "learning_rate": 1.976582667486171e-05, "loss": 1.8369, "step": 762 }, { "epoch": 0.05869698832206515, "grad_norm": 4.934607982635498, "learning_rate": 1.976521204671174e-05, "loss": 1.8909, "step": 764 }, { "epoch": 0.05885064535955747, "grad_norm": 5.5896759033203125, "learning_rate": 1.9764597418561774e-05, "loss": 1.7238, "step": 766 }, { "epoch": 0.05900430239704978, "grad_norm": 5.263469696044922, "learning_rate": 1.9763982790411803e-05, "loss": 1.7776, "step": 768 }, { "epoch": 0.0591579594345421, "grad_norm": 4.459990978240967, "learning_rate": 1.9763368162261833e-05, "loss": 1.7082, "step": 770 }, { "epoch": 0.059311616472034416, "grad_norm": 5.528759002685547, "learning_rate": 1.9762753534111866e-05, "loss": 1.9014, "step": 772 }, { "epoch": 0.05946527350952674, "grad_norm": 5.372073650360107, "learning_rate": 1.9762138905961892e-05, "loss": 1.691, "step": 774 }, { "epoch": 0.059618930547019056, "grad_norm": 5.765900135040283, "learning_rate": 1.9761524277811925e-05, "loss": 1.9243, "step": 776 }, { "epoch": 0.05977258758451137, "grad_norm": 5.123989105224609, "learning_rate": 1.9760909649661955e-05, "loss": 2.0223, "step": 778 }, { "epoch": 0.05992624462200369, "grad_norm": 5.149808406829834, "learning_rate": 1.9760295021511985e-05, "loss": 1.8063, "step": 780 }, { "epoch": 0.060079901659496004, "grad_norm": 5.047703266143799, "learning_rate": 1.9759680393362018e-05, "loss": 1.9353, "step": 782 }, { "epoch": 0.06023355869698832, "grad_norm": 5.555423259735107, "learning_rate": 1.9759065765212048e-05, "loss": 1.695, "step": 784 }, { "epoch": 0.06038721573448064, "grad_norm": 5.100247859954834, "learning_rate": 1.9758451137062077e-05, "loss": 1.6758, "step": 786 }, { "epoch": 0.06054087277197295, "grad_norm": 4.941176891326904, "learning_rate": 1.975783650891211e-05, "loss": 1.7391, "step": 788 }, { "epoch": 0.06069452980946528, "grad_norm": 5.3119964599609375, "learning_rate": 1.975722188076214e-05, "loss": 2.0366, "step": 790 }, { "epoch": 0.06084818684695759, "grad_norm": 6.0686235427856445, "learning_rate": 1.9756607252612173e-05, "loss": 1.7149, "step": 792 }, { "epoch": 0.06100184388444991, "grad_norm": 6.141575336456299, "learning_rate": 1.9755992624462203e-05, "loss": 1.7808, "step": 794 }, { "epoch": 0.061155500921942225, "grad_norm": 5.157688140869141, "learning_rate": 1.9755377996312232e-05, "loss": 1.8069, "step": 796 }, { "epoch": 0.06130915795943454, "grad_norm": 5.358695983886719, "learning_rate": 1.9754763368162266e-05, "loss": 1.9632, "step": 798 }, { "epoch": 0.06146281499692686, "grad_norm": 5.3423261642456055, "learning_rate": 1.9754148740012292e-05, "loss": 1.6711, "step": 800 }, { "epoch": 0.061616472034419174, "grad_norm": 5.9911980628967285, "learning_rate": 1.9753534111862325e-05, "loss": 1.6384, "step": 802 }, { "epoch": 0.06177012907191149, "grad_norm": 5.021694183349609, "learning_rate": 1.9752919483712355e-05, "loss": 1.7412, "step": 804 }, { "epoch": 0.061923786109403814, "grad_norm": 5.38372802734375, "learning_rate": 1.9752304855562384e-05, "loss": 1.5895, "step": 806 }, { "epoch": 0.06207744314689613, "grad_norm": 5.618641376495361, "learning_rate": 1.9751690227412417e-05, "loss": 1.8097, "step": 808 }, { "epoch": 0.062231100184388446, "grad_norm": 5.081387519836426, "learning_rate": 1.9751075599262447e-05, "loss": 1.7789, "step": 810 }, { "epoch": 0.06238475722188076, "grad_norm": 5.361464500427246, "learning_rate": 1.975046097111248e-05, "loss": 1.7821, "step": 812 }, { "epoch": 0.06253841425937308, "grad_norm": 5.113397598266602, "learning_rate": 1.974984634296251e-05, "loss": 1.7292, "step": 814 }, { "epoch": 0.0626920712968654, "grad_norm": 5.261277198791504, "learning_rate": 1.974923171481254e-05, "loss": 1.6956, "step": 816 }, { "epoch": 0.06284572833435771, "grad_norm": 7.06874942779541, "learning_rate": 1.9748617086662573e-05, "loss": 1.865, "step": 818 }, { "epoch": 0.06299938537185003, "grad_norm": 4.949324131011963, "learning_rate": 1.9748002458512602e-05, "loss": 1.7885, "step": 820 }, { "epoch": 0.06315304240934234, "grad_norm": 6.291264533996582, "learning_rate": 1.9747387830362632e-05, "loss": 1.8685, "step": 822 }, { "epoch": 0.06330669944683466, "grad_norm": 4.500913143157959, "learning_rate": 1.9746773202212665e-05, "loss": 1.6422, "step": 824 }, { "epoch": 0.06346035648432698, "grad_norm": 5.313440322875977, "learning_rate": 1.974615857406269e-05, "loss": 1.8328, "step": 826 }, { "epoch": 0.0636140135218193, "grad_norm": 5.809798240661621, "learning_rate": 1.9745543945912724e-05, "loss": 1.8912, "step": 828 }, { "epoch": 0.06376767055931162, "grad_norm": 4.69051456451416, "learning_rate": 1.9744929317762754e-05, "loss": 1.6786, "step": 830 }, { "epoch": 0.06392132759680394, "grad_norm": 5.292459487915039, "learning_rate": 1.9744314689612787e-05, "loss": 1.8497, "step": 832 }, { "epoch": 0.06407498463429626, "grad_norm": 4.772144794464111, "learning_rate": 1.9743700061462817e-05, "loss": 1.6261, "step": 834 }, { "epoch": 0.06422864167178857, "grad_norm": 4.984364032745361, "learning_rate": 1.9743085433312846e-05, "loss": 1.7354, "step": 836 }, { "epoch": 0.06438229870928089, "grad_norm": 4.450577735900879, "learning_rate": 1.974247080516288e-05, "loss": 1.7349, "step": 838 }, { "epoch": 0.0645359557467732, "grad_norm": 5.341747760772705, "learning_rate": 1.974185617701291e-05, "loss": 1.8332, "step": 840 }, { "epoch": 0.06468961278426552, "grad_norm": 5.368303298950195, "learning_rate": 1.974124154886294e-05, "loss": 1.8219, "step": 842 }, { "epoch": 0.06484326982175784, "grad_norm": 4.4096360206604, "learning_rate": 1.9740626920712972e-05, "loss": 1.8158, "step": 844 }, { "epoch": 0.06499692685925015, "grad_norm": 6.098479270935059, "learning_rate": 1.9740012292563e-05, "loss": 1.7323, "step": 846 }, { "epoch": 0.06515058389674247, "grad_norm": 4.606769561767578, "learning_rate": 1.973939766441303e-05, "loss": 1.7527, "step": 848 }, { "epoch": 0.06530424093423479, "grad_norm": 6.082258701324463, "learning_rate": 1.9738783036263064e-05, "loss": 1.8471, "step": 850 }, { "epoch": 0.0654578979717271, "grad_norm": 5.389958381652832, "learning_rate": 1.9738168408113094e-05, "loss": 1.8564, "step": 852 }, { "epoch": 0.06561155500921942, "grad_norm": 5.574385643005371, "learning_rate": 1.9737553779963124e-05, "loss": 1.784, "step": 854 }, { "epoch": 0.06576521204671174, "grad_norm": 5.1567487716674805, "learning_rate": 1.9736939151813153e-05, "loss": 1.713, "step": 856 }, { "epoch": 0.06591886908420405, "grad_norm": 5.475706577301025, "learning_rate": 1.9736324523663187e-05, "loss": 1.7555, "step": 858 }, { "epoch": 0.06607252612169637, "grad_norm": 4.831605434417725, "learning_rate": 1.9735709895513216e-05, "loss": 1.8695, "step": 860 }, { "epoch": 0.06622618315918868, "grad_norm": 6.022873878479004, "learning_rate": 1.9735095267363246e-05, "loss": 1.7765, "step": 862 }, { "epoch": 0.06637984019668101, "grad_norm": 4.874941825866699, "learning_rate": 1.973448063921328e-05, "loss": 1.757, "step": 864 }, { "epoch": 0.06653349723417333, "grad_norm": 4.488655090332031, "learning_rate": 1.973386601106331e-05, "loss": 1.8837, "step": 866 }, { "epoch": 0.06668715427166565, "grad_norm": 4.5713090896606445, "learning_rate": 1.973325138291334e-05, "loss": 1.6627, "step": 868 }, { "epoch": 0.06684081130915796, "grad_norm": 5.312070846557617, "learning_rate": 1.973263675476337e-05, "loss": 1.7551, "step": 870 }, { "epoch": 0.06699446834665028, "grad_norm": 5.104644775390625, "learning_rate": 1.97320221266134e-05, "loss": 1.8789, "step": 872 }, { "epoch": 0.0671481253841426, "grad_norm": 4.595895290374756, "learning_rate": 1.973140749846343e-05, "loss": 1.8319, "step": 874 }, { "epoch": 0.06730178242163491, "grad_norm": 5.223522186279297, "learning_rate": 1.973079287031346e-05, "loss": 1.6661, "step": 876 }, { "epoch": 0.06745543945912723, "grad_norm": 4.466522693634033, "learning_rate": 1.9730178242163494e-05, "loss": 1.8338, "step": 878 }, { "epoch": 0.06760909649661954, "grad_norm": 4.613927841186523, "learning_rate": 1.9729563614013523e-05, "loss": 1.7562, "step": 880 }, { "epoch": 0.06776275353411186, "grad_norm": 5.868101119995117, "learning_rate": 1.9728948985863553e-05, "loss": 1.9181, "step": 882 }, { "epoch": 0.06791641057160418, "grad_norm": 5.407005786895752, "learning_rate": 1.9728334357713586e-05, "loss": 1.8462, "step": 884 }, { "epoch": 0.0680700676090965, "grad_norm": 6.075082778930664, "learning_rate": 1.9727719729563616e-05, "loss": 1.816, "step": 886 }, { "epoch": 0.06822372464658881, "grad_norm": 5.164397716522217, "learning_rate": 1.9727105101413645e-05, "loss": 1.9236, "step": 888 }, { "epoch": 0.06837738168408113, "grad_norm": 5.688714981079102, "learning_rate": 1.972649047326368e-05, "loss": 1.7171, "step": 890 }, { "epoch": 0.06853103872157344, "grad_norm": 4.9518842697143555, "learning_rate": 1.9725875845113708e-05, "loss": 1.7829, "step": 892 }, { "epoch": 0.06868469575906576, "grad_norm": 5.185763835906982, "learning_rate": 1.9725261216963738e-05, "loss": 1.7146, "step": 894 }, { "epoch": 0.06883835279655809, "grad_norm": 5.043625354766846, "learning_rate": 1.972464658881377e-05, "loss": 1.8594, "step": 896 }, { "epoch": 0.0689920098340504, "grad_norm": 4.783642292022705, "learning_rate": 1.97240319606638e-05, "loss": 1.7053, "step": 898 }, { "epoch": 0.06914566687154272, "grad_norm": 4.887513637542725, "learning_rate": 1.972341733251383e-05, "loss": 1.7929, "step": 900 }, { "epoch": 0.06929932390903504, "grad_norm": 4.4108123779296875, "learning_rate": 1.972280270436386e-05, "loss": 1.7296, "step": 902 }, { "epoch": 0.06945298094652735, "grad_norm": 5.531246662139893, "learning_rate": 1.9722188076213893e-05, "loss": 1.7792, "step": 904 }, { "epoch": 0.06960663798401967, "grad_norm": 4.462593078613281, "learning_rate": 1.9721573448063923e-05, "loss": 1.7121, "step": 906 }, { "epoch": 0.06976029502151199, "grad_norm": 4.543118000030518, "learning_rate": 1.9720958819913952e-05, "loss": 1.7871, "step": 908 }, { "epoch": 0.0699139520590043, "grad_norm": 5.9536638259887695, "learning_rate": 1.9720344191763985e-05, "loss": 1.7956, "step": 910 }, { "epoch": 0.07006760909649662, "grad_norm": 4.735901832580566, "learning_rate": 1.9719729563614015e-05, "loss": 1.7511, "step": 912 }, { "epoch": 0.07022126613398894, "grad_norm": 4.490820407867432, "learning_rate": 1.9719114935464045e-05, "loss": 1.7364, "step": 914 }, { "epoch": 0.07037492317148125, "grad_norm": 4.837772846221924, "learning_rate": 1.9718500307314078e-05, "loss": 1.8204, "step": 916 }, { "epoch": 0.07052858020897357, "grad_norm": 4.400464057922363, "learning_rate": 1.9717885679164108e-05, "loss": 1.8637, "step": 918 }, { "epoch": 0.07068223724646588, "grad_norm": 4.991397857666016, "learning_rate": 1.9717271051014137e-05, "loss": 1.9175, "step": 920 }, { "epoch": 0.0708358942839582, "grad_norm": 4.983181953430176, "learning_rate": 1.971665642286417e-05, "loss": 1.7065, "step": 922 }, { "epoch": 0.07098955132145052, "grad_norm": 4.5055341720581055, "learning_rate": 1.97160417947142e-05, "loss": 1.7113, "step": 924 }, { "epoch": 0.07114320835894283, "grad_norm": 5.021308422088623, "learning_rate": 1.971542716656423e-05, "loss": 1.8903, "step": 926 }, { "epoch": 0.07129686539643516, "grad_norm": 6.712131023406982, "learning_rate": 1.971481253841426e-05, "loss": 1.7608, "step": 928 }, { "epoch": 0.07145052243392748, "grad_norm": 5.711028575897217, "learning_rate": 1.9714197910264292e-05, "loss": 1.7826, "step": 930 }, { "epoch": 0.0716041794714198, "grad_norm": 5.202549457550049, "learning_rate": 1.9713583282114322e-05, "loss": 1.7201, "step": 932 }, { "epoch": 0.07175783650891211, "grad_norm": 4.809873580932617, "learning_rate": 1.9712968653964352e-05, "loss": 1.7824, "step": 934 }, { "epoch": 0.07191149354640443, "grad_norm": 4.417870998382568, "learning_rate": 1.9712354025814385e-05, "loss": 1.8275, "step": 936 }, { "epoch": 0.07206515058389674, "grad_norm": 4.823970794677734, "learning_rate": 1.9711739397664415e-05, "loss": 1.7293, "step": 938 }, { "epoch": 0.07221880762138906, "grad_norm": 5.289034843444824, "learning_rate": 1.9711124769514444e-05, "loss": 1.7507, "step": 940 }, { "epoch": 0.07237246465888138, "grad_norm": 4.538127422332764, "learning_rate": 1.9710510141364477e-05, "loss": 1.6391, "step": 942 }, { "epoch": 0.0725261216963737, "grad_norm": 4.500412464141846, "learning_rate": 1.9709895513214507e-05, "loss": 1.6816, "step": 944 }, { "epoch": 0.07267977873386601, "grad_norm": 5.149693012237549, "learning_rate": 1.9709280885064537e-05, "loss": 1.6366, "step": 946 }, { "epoch": 0.07283343577135833, "grad_norm": 4.554830074310303, "learning_rate": 1.970866625691457e-05, "loss": 1.7712, "step": 948 }, { "epoch": 0.07298709280885064, "grad_norm": 5.352977752685547, "learning_rate": 1.97080516287646e-05, "loss": 1.5819, "step": 950 }, { "epoch": 0.07314074984634296, "grad_norm": 5.3114094734191895, "learning_rate": 1.970743700061463e-05, "loss": 1.802, "step": 952 }, { "epoch": 0.07329440688383528, "grad_norm": 4.19597053527832, "learning_rate": 1.970682237246466e-05, "loss": 1.5855, "step": 954 }, { "epoch": 0.07344806392132759, "grad_norm": 4.087234020233154, "learning_rate": 1.9706207744314692e-05, "loss": 1.7158, "step": 956 }, { "epoch": 0.07360172095881991, "grad_norm": 5.064235210418701, "learning_rate": 1.970559311616472e-05, "loss": 1.7579, "step": 958 }, { "epoch": 0.07375537799631224, "grad_norm": 5.033870697021484, "learning_rate": 1.970497848801475e-05, "loss": 1.8105, "step": 960 }, { "epoch": 0.07390903503380455, "grad_norm": 4.546055793762207, "learning_rate": 1.9704363859864784e-05, "loss": 1.6964, "step": 962 }, { "epoch": 0.07406269207129687, "grad_norm": 5.068551540374756, "learning_rate": 1.9703749231714814e-05, "loss": 1.6664, "step": 964 }, { "epoch": 0.07421634910878919, "grad_norm": 5.1122050285339355, "learning_rate": 1.9703134603564844e-05, "loss": 1.819, "step": 966 }, { "epoch": 0.0743700061462815, "grad_norm": 4.410829544067383, "learning_rate": 1.9702519975414877e-05, "loss": 1.7872, "step": 968 }, { "epoch": 0.07452366318377382, "grad_norm": 4.5425190925598145, "learning_rate": 1.9701905347264906e-05, "loss": 1.6679, "step": 970 }, { "epoch": 0.07467732022126614, "grad_norm": 4.571905612945557, "learning_rate": 1.9701290719114936e-05, "loss": 1.7135, "step": 972 }, { "epoch": 0.07483097725875845, "grad_norm": 4.853597640991211, "learning_rate": 1.970067609096497e-05, "loss": 1.6162, "step": 974 }, { "epoch": 0.07498463429625077, "grad_norm": 5.288270473480225, "learning_rate": 1.9700061462815e-05, "loss": 1.7473, "step": 976 }, { "epoch": 0.07513829133374308, "grad_norm": 4.418172359466553, "learning_rate": 1.9699446834665032e-05, "loss": 1.6986, "step": 978 }, { "epoch": 0.0752919483712354, "grad_norm": 4.6486496925354, "learning_rate": 1.9698832206515058e-05, "loss": 1.6914, "step": 980 }, { "epoch": 0.07544560540872772, "grad_norm": 5.205509185791016, "learning_rate": 1.969821757836509e-05, "loss": 1.6129, "step": 982 }, { "epoch": 0.07559926244622003, "grad_norm": 4.540061950683594, "learning_rate": 1.969760295021512e-05, "loss": 1.7026, "step": 984 }, { "epoch": 0.07575291948371235, "grad_norm": 4.7810845375061035, "learning_rate": 1.969698832206515e-05, "loss": 1.686, "step": 986 }, { "epoch": 0.07590657652120467, "grad_norm": 4.475192546844482, "learning_rate": 1.9696373693915184e-05, "loss": 1.739, "step": 988 }, { "epoch": 0.07606023355869698, "grad_norm": 5.317092418670654, "learning_rate": 1.9695759065765213e-05, "loss": 2.0029, "step": 990 }, { "epoch": 0.0762138905961893, "grad_norm": 5.178996562957764, "learning_rate": 1.9695144437615243e-05, "loss": 1.7278, "step": 992 }, { "epoch": 0.07636754763368163, "grad_norm": 5.976894855499268, "learning_rate": 1.9694529809465276e-05, "loss": 1.753, "step": 994 }, { "epoch": 0.07652120467117395, "grad_norm": 4.536045551300049, "learning_rate": 1.9693915181315306e-05, "loss": 1.6081, "step": 996 }, { "epoch": 0.07667486170866626, "grad_norm": 4.751937389373779, "learning_rate": 1.969330055316534e-05, "loss": 1.6765, "step": 998 }, { "epoch": 0.07682851874615858, "grad_norm": 5.145371437072754, "learning_rate": 1.9692685925015365e-05, "loss": 1.9088, "step": 1000 }, { "epoch": 0.0769821757836509, "grad_norm": 5.149151802062988, "learning_rate": 1.9692071296865398e-05, "loss": 1.7855, "step": 1002 }, { "epoch": 0.07713583282114321, "grad_norm": 4.331295490264893, "learning_rate": 1.9691456668715428e-05, "loss": 1.6398, "step": 1004 }, { "epoch": 0.07728948985863553, "grad_norm": 5.288293361663818, "learning_rate": 1.9690842040565458e-05, "loss": 1.8451, "step": 1006 }, { "epoch": 0.07744314689612784, "grad_norm": 4.709437370300293, "learning_rate": 1.969022741241549e-05, "loss": 1.8546, "step": 1008 }, { "epoch": 0.07759680393362016, "grad_norm": 5.373138427734375, "learning_rate": 1.968961278426552e-05, "loss": 1.7561, "step": 1010 }, { "epoch": 0.07775046097111248, "grad_norm": 6.308657169342041, "learning_rate": 1.968899815611555e-05, "loss": 1.8271, "step": 1012 }, { "epoch": 0.07790411800860479, "grad_norm": 5.380505561828613, "learning_rate": 1.9688383527965583e-05, "loss": 1.7852, "step": 1014 }, { "epoch": 0.07805777504609711, "grad_norm": 5.468803405761719, "learning_rate": 1.9687768899815613e-05, "loss": 1.7797, "step": 1016 }, { "epoch": 0.07821143208358942, "grad_norm": 4.73374080657959, "learning_rate": 1.9687154271665646e-05, "loss": 1.6201, "step": 1018 }, { "epoch": 0.07836508912108174, "grad_norm": 4.9570631980896, "learning_rate": 1.9686539643515676e-05, "loss": 1.7538, "step": 1020 }, { "epoch": 0.07851874615857406, "grad_norm": 4.574160575866699, "learning_rate": 1.9685925015365705e-05, "loss": 1.7656, "step": 1022 }, { "epoch": 0.07867240319606637, "grad_norm": 5.078851222991943, "learning_rate": 1.968531038721574e-05, "loss": 1.806, "step": 1024 }, { "epoch": 0.0788260602335587, "grad_norm": 5.474549293518066, "learning_rate": 1.9684695759065765e-05, "loss": 1.8226, "step": 1026 }, { "epoch": 0.07897971727105102, "grad_norm": 6.257920265197754, "learning_rate": 1.9684081130915798e-05, "loss": 1.7714, "step": 1028 }, { "epoch": 0.07913337430854334, "grad_norm": 4.811653137207031, "learning_rate": 1.9683466502765827e-05, "loss": 1.7903, "step": 1030 }, { "epoch": 0.07928703134603565, "grad_norm": 4.90382194519043, "learning_rate": 1.9682851874615857e-05, "loss": 1.9185, "step": 1032 }, { "epoch": 0.07944068838352797, "grad_norm": 5.112819194793701, "learning_rate": 1.968223724646589e-05, "loss": 1.8098, "step": 1034 }, { "epoch": 0.07959434542102028, "grad_norm": 4.832859039306641, "learning_rate": 1.968162261831592e-05, "loss": 1.7965, "step": 1036 }, { "epoch": 0.0797480024585126, "grad_norm": 5.124858379364014, "learning_rate": 1.9681007990165953e-05, "loss": 1.6745, "step": 1038 }, { "epoch": 0.07990165949600492, "grad_norm": 4.530187606811523, "learning_rate": 1.9680393362015983e-05, "loss": 1.7152, "step": 1040 }, { "epoch": 0.08005531653349723, "grad_norm": 4.918298244476318, "learning_rate": 1.9679778733866012e-05, "loss": 1.7445, "step": 1042 }, { "epoch": 0.08020897357098955, "grad_norm": 4.637542247772217, "learning_rate": 1.9679164105716045e-05, "loss": 1.6802, "step": 1044 }, { "epoch": 0.08036263060848187, "grad_norm": 5.314078330993652, "learning_rate": 1.9678549477566075e-05, "loss": 1.7547, "step": 1046 }, { "epoch": 0.08051628764597418, "grad_norm": 5.02266263961792, "learning_rate": 1.9677934849416105e-05, "loss": 1.7107, "step": 1048 }, { "epoch": 0.0806699446834665, "grad_norm": 4.683084487915039, "learning_rate": 1.9677320221266138e-05, "loss": 1.7415, "step": 1050 }, { "epoch": 0.08082360172095882, "grad_norm": 4.539281845092773, "learning_rate": 1.9676705593116164e-05, "loss": 1.9356, "step": 1052 }, { "epoch": 0.08097725875845113, "grad_norm": 4.526500701904297, "learning_rate": 1.9676090964966197e-05, "loss": 1.6487, "step": 1054 }, { "epoch": 0.08113091579594345, "grad_norm": 4.272531509399414, "learning_rate": 1.9675476336816227e-05, "loss": 1.6478, "step": 1056 }, { "epoch": 0.08128457283343578, "grad_norm": 4.874919414520264, "learning_rate": 1.9674861708666257e-05, "loss": 1.715, "step": 1058 }, { "epoch": 0.0814382298709281, "grad_norm": 5.863064765930176, "learning_rate": 1.967424708051629e-05, "loss": 1.8635, "step": 1060 }, { "epoch": 0.08159188690842041, "grad_norm": 5.5292205810546875, "learning_rate": 1.967363245236632e-05, "loss": 1.7227, "step": 1062 }, { "epoch": 0.08174554394591273, "grad_norm": 4.343425273895264, "learning_rate": 1.9673017824216352e-05, "loss": 1.8257, "step": 1064 }, { "epoch": 0.08189920098340504, "grad_norm": 5.900411128997803, "learning_rate": 1.9672403196066382e-05, "loss": 1.8205, "step": 1066 }, { "epoch": 0.08205285802089736, "grad_norm": 4.439347743988037, "learning_rate": 1.9671788567916412e-05, "loss": 1.6958, "step": 1068 }, { "epoch": 0.08220651505838968, "grad_norm": 5.694681644439697, "learning_rate": 1.9671173939766445e-05, "loss": 1.8959, "step": 1070 }, { "epoch": 0.08236017209588199, "grad_norm": 4.767448425292969, "learning_rate": 1.9670559311616474e-05, "loss": 1.6721, "step": 1072 }, { "epoch": 0.08251382913337431, "grad_norm": 4.426461696624756, "learning_rate": 1.9669944683466504e-05, "loss": 1.607, "step": 1074 }, { "epoch": 0.08266748617086662, "grad_norm": 4.794045925140381, "learning_rate": 1.9669330055316537e-05, "loss": 1.6254, "step": 1076 }, { "epoch": 0.08282114320835894, "grad_norm": 4.544212341308594, "learning_rate": 1.9668715427166564e-05, "loss": 1.6082, "step": 1078 }, { "epoch": 0.08297480024585126, "grad_norm": 4.256971836090088, "learning_rate": 1.9668100799016597e-05, "loss": 1.8001, "step": 1080 }, { "epoch": 0.08312845728334357, "grad_norm": 6.058056831359863, "learning_rate": 1.9667486170866626e-05, "loss": 1.7258, "step": 1082 }, { "epoch": 0.08328211432083589, "grad_norm": 4.815703868865967, "learning_rate": 1.966687154271666e-05, "loss": 1.7528, "step": 1084 }, { "epoch": 0.0834357713583282, "grad_norm": 4.661309719085693, "learning_rate": 1.966625691456669e-05, "loss": 1.7316, "step": 1086 }, { "epoch": 0.08358942839582052, "grad_norm": 4.863770961761475, "learning_rate": 1.966564228641672e-05, "loss": 1.8824, "step": 1088 }, { "epoch": 0.08374308543331284, "grad_norm": 5.061347961425781, "learning_rate": 1.9665027658266752e-05, "loss": 1.6873, "step": 1090 }, { "epoch": 0.08389674247080517, "grad_norm": 4.252581596374512, "learning_rate": 1.966441303011678e-05, "loss": 1.8093, "step": 1092 }, { "epoch": 0.08405039950829749, "grad_norm": 5.112542152404785, "learning_rate": 1.966379840196681e-05, "loss": 1.7366, "step": 1094 }, { "epoch": 0.0842040565457898, "grad_norm": 5.147494792938232, "learning_rate": 1.9663183773816844e-05, "loss": 1.8676, "step": 1096 }, { "epoch": 0.08435771358328212, "grad_norm": 4.757107257843018, "learning_rate": 1.966256914566687e-05, "loss": 1.5888, "step": 1098 }, { "epoch": 0.08451137062077443, "grad_norm": 5.5242600440979, "learning_rate": 1.9661954517516904e-05, "loss": 1.7018, "step": 1100 }, { "epoch": 0.08466502765826675, "grad_norm": 5.5325117111206055, "learning_rate": 1.9661339889366933e-05, "loss": 1.6928, "step": 1102 }, { "epoch": 0.08481868469575907, "grad_norm": 5.308017253875732, "learning_rate": 1.9660725261216966e-05, "loss": 1.7695, "step": 1104 }, { "epoch": 0.08497234173325138, "grad_norm": 4.460916519165039, "learning_rate": 1.9660110633066996e-05, "loss": 1.649, "step": 1106 }, { "epoch": 0.0851259987707437, "grad_norm": 5.222771644592285, "learning_rate": 1.9659496004917026e-05, "loss": 1.8017, "step": 1108 }, { "epoch": 0.08527965580823602, "grad_norm": 4.484593391418457, "learning_rate": 1.965888137676706e-05, "loss": 1.697, "step": 1110 }, { "epoch": 0.08543331284572833, "grad_norm": 4.95808219909668, "learning_rate": 1.965826674861709e-05, "loss": 1.6171, "step": 1112 }, { "epoch": 0.08558696988322065, "grad_norm": 5.313704967498779, "learning_rate": 1.9657652120467118e-05, "loss": 1.665, "step": 1114 }, { "epoch": 0.08574062692071296, "grad_norm": 4.555895805358887, "learning_rate": 1.965703749231715e-05, "loss": 1.7004, "step": 1116 }, { "epoch": 0.08589428395820528, "grad_norm": 4.939544677734375, "learning_rate": 1.965642286416718e-05, "loss": 1.7204, "step": 1118 }, { "epoch": 0.0860479409956976, "grad_norm": 4.291268348693848, "learning_rate": 1.965580823601721e-05, "loss": 1.6592, "step": 1120 }, { "epoch": 0.08620159803318991, "grad_norm": 5.050233840942383, "learning_rate": 1.9655193607867244e-05, "loss": 1.7656, "step": 1122 }, { "epoch": 0.08635525507068224, "grad_norm": 4.5748090744018555, "learning_rate": 1.9654578979717273e-05, "loss": 1.8491, "step": 1124 }, { "epoch": 0.08650891210817456, "grad_norm": 4.3803391456604, "learning_rate": 1.9653964351567303e-05, "loss": 1.697, "step": 1126 }, { "epoch": 0.08666256914566688, "grad_norm": 5.346717834472656, "learning_rate": 1.9653349723417333e-05, "loss": 1.817, "step": 1128 }, { "epoch": 0.08681622618315919, "grad_norm": 7.8311944007873535, "learning_rate": 1.9652735095267366e-05, "loss": 1.7637, "step": 1130 }, { "epoch": 0.08696988322065151, "grad_norm": 4.9822845458984375, "learning_rate": 1.9652120467117395e-05, "loss": 1.6673, "step": 1132 }, { "epoch": 0.08712354025814383, "grad_norm": 5.492576599121094, "learning_rate": 1.9651505838967425e-05, "loss": 1.7831, "step": 1134 }, { "epoch": 0.08727719729563614, "grad_norm": 5.493293285369873, "learning_rate": 1.9650891210817458e-05, "loss": 1.698, "step": 1136 }, { "epoch": 0.08743085433312846, "grad_norm": 6.50536584854126, "learning_rate": 1.9650276582667488e-05, "loss": 1.7632, "step": 1138 }, { "epoch": 0.08758451137062077, "grad_norm": 4.753702640533447, "learning_rate": 1.9649661954517518e-05, "loss": 1.64, "step": 1140 }, { "epoch": 0.08773816840811309, "grad_norm": 4.77907657623291, "learning_rate": 1.964904732636755e-05, "loss": 1.9073, "step": 1142 }, { "epoch": 0.0878918254456054, "grad_norm": 4.242752552032471, "learning_rate": 1.964843269821758e-05, "loss": 1.7625, "step": 1144 }, { "epoch": 0.08804548248309772, "grad_norm": 5.127992630004883, "learning_rate": 1.964781807006761e-05, "loss": 1.6271, "step": 1146 }, { "epoch": 0.08819913952059004, "grad_norm": 4.7297749519348145, "learning_rate": 1.9647203441917643e-05, "loss": 1.6252, "step": 1148 }, { "epoch": 0.08835279655808236, "grad_norm": 5.08091926574707, "learning_rate": 1.9646588813767673e-05, "loss": 1.7145, "step": 1150 }, { "epoch": 0.08850645359557467, "grad_norm": 4.986452102661133, "learning_rate": 1.9645974185617702e-05, "loss": 1.8065, "step": 1152 }, { "epoch": 0.08866011063306699, "grad_norm": 5.4139204025268555, "learning_rate": 1.9645359557467732e-05, "loss": 1.6284, "step": 1154 }, { "epoch": 0.08881376767055932, "grad_norm": 5.257991313934326, "learning_rate": 1.9644744929317765e-05, "loss": 1.8338, "step": 1156 }, { "epoch": 0.08896742470805163, "grad_norm": 4.872262001037598, "learning_rate": 1.9644130301167795e-05, "loss": 1.8488, "step": 1158 }, { "epoch": 0.08912108174554395, "grad_norm": 5.82914924621582, "learning_rate": 1.9643515673017825e-05, "loss": 1.7731, "step": 1160 }, { "epoch": 0.08927473878303627, "grad_norm": 4.288515090942383, "learning_rate": 1.9642901044867858e-05, "loss": 1.6748, "step": 1162 }, { "epoch": 0.08942839582052858, "grad_norm": 4.828827381134033, "learning_rate": 1.9642286416717887e-05, "loss": 1.7893, "step": 1164 }, { "epoch": 0.0895820528580209, "grad_norm": 4.2209577560424805, "learning_rate": 1.9641671788567917e-05, "loss": 1.6139, "step": 1166 }, { "epoch": 0.08973570989551322, "grad_norm": 5.542986869812012, "learning_rate": 1.964105716041795e-05, "loss": 1.5873, "step": 1168 }, { "epoch": 0.08988936693300553, "grad_norm": 4.995078086853027, "learning_rate": 1.964044253226798e-05, "loss": 1.5976, "step": 1170 }, { "epoch": 0.09004302397049785, "grad_norm": 4.493627071380615, "learning_rate": 1.963982790411801e-05, "loss": 1.6423, "step": 1172 }, { "epoch": 0.09019668100799016, "grad_norm": 5.041021823883057, "learning_rate": 1.9639213275968043e-05, "loss": 1.6996, "step": 1174 }, { "epoch": 0.09035033804548248, "grad_norm": 5.527616024017334, "learning_rate": 1.9638598647818072e-05, "loss": 1.7338, "step": 1176 }, { "epoch": 0.0905039950829748, "grad_norm": 4.937674045562744, "learning_rate": 1.9637984019668102e-05, "loss": 1.8472, "step": 1178 }, { "epoch": 0.09065765212046711, "grad_norm": 5.515735149383545, "learning_rate": 1.963736939151813e-05, "loss": 1.8828, "step": 1180 }, { "epoch": 0.09081130915795943, "grad_norm": 5.273295879364014, "learning_rate": 1.9636754763368165e-05, "loss": 1.6819, "step": 1182 }, { "epoch": 0.09096496619545175, "grad_norm": 4.745762825012207, "learning_rate": 1.9636140135218194e-05, "loss": 1.7752, "step": 1184 }, { "epoch": 0.09111862323294406, "grad_norm": 4.8165130615234375, "learning_rate": 1.9635525507068224e-05, "loss": 1.6133, "step": 1186 }, { "epoch": 0.09127228027043639, "grad_norm": 4.156392574310303, "learning_rate": 1.9634910878918257e-05, "loss": 1.7126, "step": 1188 }, { "epoch": 0.09142593730792871, "grad_norm": 4.523599624633789, "learning_rate": 1.9634296250768287e-05, "loss": 1.616, "step": 1190 }, { "epoch": 0.09157959434542103, "grad_norm": 4.08304500579834, "learning_rate": 1.9633681622618316e-05, "loss": 1.6602, "step": 1192 }, { "epoch": 0.09173325138291334, "grad_norm": 4.6315202713012695, "learning_rate": 1.963306699446835e-05, "loss": 1.7004, "step": 1194 }, { "epoch": 0.09188690842040566, "grad_norm": 4.613175868988037, "learning_rate": 1.963245236631838e-05, "loss": 1.7607, "step": 1196 }, { "epoch": 0.09204056545789797, "grad_norm": 5.473268508911133, "learning_rate": 1.963183773816841e-05, "loss": 1.7788, "step": 1198 }, { "epoch": 0.09219422249539029, "grad_norm": 4.436158657073975, "learning_rate": 1.963122311001844e-05, "loss": 1.507, "step": 1200 }, { "epoch": 0.0923478795328826, "grad_norm": 4.081661701202393, "learning_rate": 1.963060848186847e-05, "loss": 1.6724, "step": 1202 }, { "epoch": 0.09250153657037492, "grad_norm": 4.4669318199157715, "learning_rate": 1.96299938537185e-05, "loss": 1.6776, "step": 1204 }, { "epoch": 0.09265519360786724, "grad_norm": 4.446565628051758, "learning_rate": 1.962937922556853e-05, "loss": 1.7225, "step": 1206 }, { "epoch": 0.09280885064535956, "grad_norm": 4.1054816246032715, "learning_rate": 1.9628764597418564e-05, "loss": 1.5964, "step": 1208 }, { "epoch": 0.09296250768285187, "grad_norm": 3.9810330867767334, "learning_rate": 1.9628149969268594e-05, "loss": 1.6933, "step": 1210 }, { "epoch": 0.09311616472034419, "grad_norm": 5.808743476867676, "learning_rate": 1.9627535341118623e-05, "loss": 1.796, "step": 1212 }, { "epoch": 0.0932698217578365, "grad_norm": 5.4757795333862305, "learning_rate": 1.9626920712968657e-05, "loss": 1.6707, "step": 1214 }, { "epoch": 0.09342347879532882, "grad_norm": 4.4285430908203125, "learning_rate": 1.9626306084818686e-05, "loss": 1.6214, "step": 1216 }, { "epoch": 0.09357713583282114, "grad_norm": 4.347642421722412, "learning_rate": 1.9625691456668716e-05, "loss": 1.7012, "step": 1218 }, { "epoch": 0.09373079287031345, "grad_norm": 4.711369037628174, "learning_rate": 1.962507682851875e-05, "loss": 1.6307, "step": 1220 }, { "epoch": 0.09388444990780578, "grad_norm": 4.427557945251465, "learning_rate": 1.962446220036878e-05, "loss": 1.7988, "step": 1222 }, { "epoch": 0.0940381069452981, "grad_norm": 4.078681945800781, "learning_rate": 1.962384757221881e-05, "loss": 1.6596, "step": 1224 }, { "epoch": 0.09419176398279042, "grad_norm": 4.403939247131348, "learning_rate": 1.9623232944068838e-05, "loss": 1.7761, "step": 1226 }, { "epoch": 0.09434542102028273, "grad_norm": 4.222808361053467, "learning_rate": 1.962261831591887e-05, "loss": 1.7145, "step": 1228 }, { "epoch": 0.09449907805777505, "grad_norm": 4.754458904266357, "learning_rate": 1.96220036877689e-05, "loss": 1.6256, "step": 1230 }, { "epoch": 0.09465273509526737, "grad_norm": 4.7824201583862305, "learning_rate": 1.962138905961893e-05, "loss": 1.7846, "step": 1232 }, { "epoch": 0.09480639213275968, "grad_norm": 4.751527786254883, "learning_rate": 1.9620774431468964e-05, "loss": 1.6602, "step": 1234 }, { "epoch": 0.094960049170252, "grad_norm": 4.74616813659668, "learning_rate": 1.9620159803318993e-05, "loss": 1.6707, "step": 1236 }, { "epoch": 0.09511370620774431, "grad_norm": 5.099863052368164, "learning_rate": 1.9619545175169023e-05, "loss": 1.7293, "step": 1238 }, { "epoch": 0.09526736324523663, "grad_norm": 5.293537616729736, "learning_rate": 1.9618930547019056e-05, "loss": 1.7172, "step": 1240 }, { "epoch": 0.09542102028272895, "grad_norm": 5.443637847900391, "learning_rate": 1.9618315918869086e-05, "loss": 1.7516, "step": 1242 }, { "epoch": 0.09557467732022126, "grad_norm": 4.447843551635742, "learning_rate": 1.9617701290719115e-05, "loss": 1.7449, "step": 1244 }, { "epoch": 0.09572833435771358, "grad_norm": 4.490113258361816, "learning_rate": 1.961708666256915e-05, "loss": 1.6253, "step": 1246 }, { "epoch": 0.0958819913952059, "grad_norm": 4.979306221008301, "learning_rate": 1.9616472034419178e-05, "loss": 1.6743, "step": 1248 }, { "epoch": 0.09603564843269821, "grad_norm": 4.146381855010986, "learning_rate": 1.961585740626921e-05, "loss": 1.7014, "step": 1250 }, { "epoch": 0.09618930547019053, "grad_norm": 4.571809768676758, "learning_rate": 1.9615242778119237e-05, "loss": 1.7867, "step": 1252 }, { "epoch": 0.09634296250768286, "grad_norm": 4.7382988929748535, "learning_rate": 1.961462814996927e-05, "loss": 1.7669, "step": 1254 }, { "epoch": 0.09649661954517517, "grad_norm": 4.332629203796387, "learning_rate": 1.96140135218193e-05, "loss": 1.8072, "step": 1256 }, { "epoch": 0.09665027658266749, "grad_norm": 4.376523494720459, "learning_rate": 1.961339889366933e-05, "loss": 1.7595, "step": 1258 }, { "epoch": 0.09680393362015981, "grad_norm": 4.876426696777344, "learning_rate": 1.9612784265519363e-05, "loss": 1.5174, "step": 1260 }, { "epoch": 0.09695759065765212, "grad_norm": 4.8033905029296875, "learning_rate": 1.9612169637369393e-05, "loss": 1.7284, "step": 1262 }, { "epoch": 0.09711124769514444, "grad_norm": 4.221518039703369, "learning_rate": 1.9611555009219422e-05, "loss": 1.7409, "step": 1264 }, { "epoch": 0.09726490473263676, "grad_norm": 4.445687294006348, "learning_rate": 1.9610940381069455e-05, "loss": 1.6211, "step": 1266 }, { "epoch": 0.09741856177012907, "grad_norm": 4.590234279632568, "learning_rate": 1.9610325752919485e-05, "loss": 1.5695, "step": 1268 }, { "epoch": 0.09757221880762139, "grad_norm": 5.60252571105957, "learning_rate": 1.9609711124769518e-05, "loss": 1.6369, "step": 1270 }, { "epoch": 0.0977258758451137, "grad_norm": 4.938035011291504, "learning_rate": 1.9609096496619548e-05, "loss": 1.7329, "step": 1272 }, { "epoch": 0.09787953288260602, "grad_norm": 5.1367106437683105, "learning_rate": 1.9608481868469578e-05, "loss": 1.8568, "step": 1274 }, { "epoch": 0.09803318992009834, "grad_norm": 4.405098915100098, "learning_rate": 1.960786724031961e-05, "loss": 1.6226, "step": 1276 }, { "epoch": 0.09818684695759065, "grad_norm": 5.822478771209717, "learning_rate": 1.9607252612169637e-05, "loss": 1.7561, "step": 1278 }, { "epoch": 0.09834050399508297, "grad_norm": 4.770538806915283, "learning_rate": 1.960663798401967e-05, "loss": 1.6214, "step": 1280 }, { "epoch": 0.09849416103257529, "grad_norm": 6.316437244415283, "learning_rate": 1.96060233558697e-05, "loss": 1.8268, "step": 1282 }, { "epoch": 0.0986478180700676, "grad_norm": 4.640567302703857, "learning_rate": 1.960540872771973e-05, "loss": 1.7484, "step": 1284 }, { "epoch": 0.09880147510755993, "grad_norm": 4.596543312072754, "learning_rate": 1.9604794099569762e-05, "loss": 1.6986, "step": 1286 }, { "epoch": 0.09895513214505225, "grad_norm": 4.36724328994751, "learning_rate": 1.9604179471419792e-05, "loss": 1.6108, "step": 1288 }, { "epoch": 0.09910878918254457, "grad_norm": 5.017337322235107, "learning_rate": 1.9603564843269825e-05, "loss": 1.7047, "step": 1290 }, { "epoch": 0.09926244622003688, "grad_norm": 4.327188491821289, "learning_rate": 1.9602950215119855e-05, "loss": 1.6083, "step": 1292 }, { "epoch": 0.0994161032575292, "grad_norm": 5.734022617340088, "learning_rate": 1.9602335586969885e-05, "loss": 1.7501, "step": 1294 }, { "epoch": 0.09956976029502151, "grad_norm": 4.524082183837891, "learning_rate": 1.9601720958819918e-05, "loss": 1.7556, "step": 1296 }, { "epoch": 0.09972341733251383, "grad_norm": 4.61295747756958, "learning_rate": 1.9601106330669947e-05, "loss": 1.6598, "step": 1298 }, { "epoch": 0.09987707437000615, "grad_norm": 4.453684329986572, "learning_rate": 1.9600491702519977e-05, "loss": 1.6554, "step": 1300 }, { "epoch": 0.10003073140749846, "grad_norm": 4.732148170471191, "learning_rate": 1.959987707437001e-05, "loss": 1.7181, "step": 1302 }, { "epoch": 0.10018438844499078, "grad_norm": 4.715574741363525, "learning_rate": 1.9599262446220036e-05, "loss": 1.6849, "step": 1304 }, { "epoch": 0.1003380454824831, "grad_norm": 4.356414318084717, "learning_rate": 1.959864781807007e-05, "loss": 1.687, "step": 1306 }, { "epoch": 0.10049170251997541, "grad_norm": 4.813374996185303, "learning_rate": 1.95980331899201e-05, "loss": 1.5262, "step": 1308 }, { "epoch": 0.10064535955746773, "grad_norm": 4.9926981925964355, "learning_rate": 1.959741856177013e-05, "loss": 1.8137, "step": 1310 }, { "epoch": 0.10079901659496004, "grad_norm": 5.103787422180176, "learning_rate": 1.9596803933620162e-05, "loss": 1.64, "step": 1312 }, { "epoch": 0.10095267363245236, "grad_norm": 4.895768165588379, "learning_rate": 1.959618930547019e-05, "loss": 1.693, "step": 1314 }, { "epoch": 0.10110633066994468, "grad_norm": 4.513470649719238, "learning_rate": 1.9595574677320225e-05, "loss": 1.5745, "step": 1316 }, { "epoch": 0.10125998770743701, "grad_norm": 5.475149154663086, "learning_rate": 1.9594960049170254e-05, "loss": 1.6189, "step": 1318 }, { "epoch": 0.10141364474492932, "grad_norm": 4.828972339630127, "learning_rate": 1.9594345421020284e-05, "loss": 1.6547, "step": 1320 }, { "epoch": 0.10156730178242164, "grad_norm": 5.218929290771484, "learning_rate": 1.9593730792870317e-05, "loss": 1.6494, "step": 1322 }, { "epoch": 0.10172095881991396, "grad_norm": 4.358766078948975, "learning_rate": 1.9593116164720343e-05, "loss": 1.7283, "step": 1324 }, { "epoch": 0.10187461585740627, "grad_norm": 4.14285135269165, "learning_rate": 1.9592501536570376e-05, "loss": 1.7769, "step": 1326 }, { "epoch": 0.10202827289489859, "grad_norm": 4.319285869598389, "learning_rate": 1.9591886908420406e-05, "loss": 1.4707, "step": 1328 }, { "epoch": 0.1021819299323909, "grad_norm": 5.230128288269043, "learning_rate": 1.9591272280270436e-05, "loss": 1.4906, "step": 1330 }, { "epoch": 0.10233558696988322, "grad_norm": 5.243448257446289, "learning_rate": 1.959065765212047e-05, "loss": 1.8825, "step": 1332 }, { "epoch": 0.10248924400737554, "grad_norm": 4.784072399139404, "learning_rate": 1.95900430239705e-05, "loss": 1.7553, "step": 1334 }, { "epoch": 0.10264290104486785, "grad_norm": 5.595427513122559, "learning_rate": 1.958942839582053e-05, "loss": 1.6913, "step": 1336 }, { "epoch": 0.10279655808236017, "grad_norm": 4.856276512145996, "learning_rate": 1.958881376767056e-05, "loss": 1.7659, "step": 1338 }, { "epoch": 0.10295021511985249, "grad_norm": 5.188042640686035, "learning_rate": 1.958819913952059e-05, "loss": 1.8205, "step": 1340 }, { "epoch": 0.1031038721573448, "grad_norm": 4.261306285858154, "learning_rate": 1.9587584511370624e-05, "loss": 1.5887, "step": 1342 }, { "epoch": 0.10325752919483712, "grad_norm": 4.269975185394287, "learning_rate": 1.9586969883220654e-05, "loss": 1.5869, "step": 1344 }, { "epoch": 0.10341118623232944, "grad_norm": 5.029308795928955, "learning_rate": 1.9586355255070683e-05, "loss": 1.8049, "step": 1346 }, { "epoch": 0.10356484326982175, "grad_norm": 4.857789516448975, "learning_rate": 1.9585740626920716e-05, "loss": 1.5154, "step": 1348 }, { "epoch": 0.10371850030731407, "grad_norm": 4.701939582824707, "learning_rate": 1.9585125998770743e-05, "loss": 1.6822, "step": 1350 }, { "epoch": 0.1038721573448064, "grad_norm": 4.069787979125977, "learning_rate": 1.9584511370620776e-05, "loss": 1.7238, "step": 1352 }, { "epoch": 0.10402581438229871, "grad_norm": 4.703420162200928, "learning_rate": 1.9583896742470806e-05, "loss": 1.6951, "step": 1354 }, { "epoch": 0.10417947141979103, "grad_norm": 4.920733451843262, "learning_rate": 1.958328211432084e-05, "loss": 1.6335, "step": 1356 }, { "epoch": 0.10433312845728335, "grad_norm": 4.38323974609375, "learning_rate": 1.9582667486170868e-05, "loss": 1.7094, "step": 1358 }, { "epoch": 0.10448678549477566, "grad_norm": 4.646501541137695, "learning_rate": 1.9582052858020898e-05, "loss": 1.6878, "step": 1360 }, { "epoch": 0.10464044253226798, "grad_norm": 4.569819450378418, "learning_rate": 1.958143822987093e-05, "loss": 1.7198, "step": 1362 }, { "epoch": 0.1047940995697603, "grad_norm": 4.552595615386963, "learning_rate": 1.958082360172096e-05, "loss": 1.7335, "step": 1364 }, { "epoch": 0.10494775660725261, "grad_norm": 3.9051506519317627, "learning_rate": 1.958020897357099e-05, "loss": 1.5694, "step": 1366 }, { "epoch": 0.10510141364474493, "grad_norm": 3.9420411586761475, "learning_rate": 1.9579594345421023e-05, "loss": 1.7298, "step": 1368 }, { "epoch": 0.10525507068223725, "grad_norm": 4.996294021606445, "learning_rate": 1.9578979717271053e-05, "loss": 1.7769, "step": 1370 }, { "epoch": 0.10540872771972956, "grad_norm": 4.845794677734375, "learning_rate": 1.9578365089121083e-05, "loss": 1.694, "step": 1372 }, { "epoch": 0.10556238475722188, "grad_norm": 4.156089782714844, "learning_rate": 1.9577750460971116e-05, "loss": 1.5621, "step": 1374 }, { "epoch": 0.1057160417947142, "grad_norm": 5.298906326293945, "learning_rate": 1.9577135832821146e-05, "loss": 1.6016, "step": 1376 }, { "epoch": 0.10586969883220651, "grad_norm": 4.974923610687256, "learning_rate": 1.9576521204671175e-05, "loss": 1.9024, "step": 1378 }, { "epoch": 0.10602335586969883, "grad_norm": 4.5802998542785645, "learning_rate": 1.9575906576521205e-05, "loss": 1.8249, "step": 1380 }, { "epoch": 0.10617701290719114, "grad_norm": 5.364488124847412, "learning_rate": 1.9575291948371238e-05, "loss": 1.6205, "step": 1382 }, { "epoch": 0.10633066994468347, "grad_norm": 4.810891151428223, "learning_rate": 1.9574677320221268e-05, "loss": 1.6702, "step": 1384 }, { "epoch": 0.10648432698217579, "grad_norm": 5.155327320098877, "learning_rate": 1.9574062692071297e-05, "loss": 1.6251, "step": 1386 }, { "epoch": 0.1066379840196681, "grad_norm": 4.292688369750977, "learning_rate": 1.957344806392133e-05, "loss": 1.505, "step": 1388 }, { "epoch": 0.10679164105716042, "grad_norm": 4.611319541931152, "learning_rate": 1.957283343577136e-05, "loss": 1.7301, "step": 1390 }, { "epoch": 0.10694529809465274, "grad_norm": 4.324422359466553, "learning_rate": 1.957221880762139e-05, "loss": 1.6551, "step": 1392 }, { "epoch": 0.10709895513214505, "grad_norm": 4.826112747192383, "learning_rate": 1.9571604179471423e-05, "loss": 1.607, "step": 1394 }, { "epoch": 0.10725261216963737, "grad_norm": 4.303924560546875, "learning_rate": 1.9570989551321453e-05, "loss": 1.6128, "step": 1396 }, { "epoch": 0.10740626920712969, "grad_norm": 5.093891620635986, "learning_rate": 1.9570374923171482e-05, "loss": 1.6747, "step": 1398 }, { "epoch": 0.107559926244622, "grad_norm": 4.303253650665283, "learning_rate": 1.9569760295021515e-05, "loss": 1.5981, "step": 1400 }, { "epoch": 0.10771358328211432, "grad_norm": 4.165480613708496, "learning_rate": 1.9569145666871545e-05, "loss": 1.5993, "step": 1402 }, { "epoch": 0.10786724031960664, "grad_norm": 4.655346393585205, "learning_rate": 1.9568531038721575e-05, "loss": 1.6806, "step": 1404 }, { "epoch": 0.10802089735709895, "grad_norm": 4.743736743927002, "learning_rate": 1.9567916410571604e-05, "loss": 1.6624, "step": 1406 }, { "epoch": 0.10817455439459127, "grad_norm": 4.2791643142700195, "learning_rate": 1.9567301782421637e-05, "loss": 1.6776, "step": 1408 }, { "epoch": 0.10832821143208358, "grad_norm": 5.005465030670166, "learning_rate": 1.9566687154271667e-05, "loss": 1.5962, "step": 1410 }, { "epoch": 0.1084818684695759, "grad_norm": 4.345304012298584, "learning_rate": 1.9566072526121697e-05, "loss": 1.5865, "step": 1412 }, { "epoch": 0.10863552550706822, "grad_norm": 3.8712103366851807, "learning_rate": 1.956545789797173e-05, "loss": 1.6583, "step": 1414 }, { "epoch": 0.10878918254456055, "grad_norm": 4.381411075592041, "learning_rate": 1.956484326982176e-05, "loss": 1.6203, "step": 1416 }, { "epoch": 0.10894283958205286, "grad_norm": 3.933609962463379, "learning_rate": 1.956422864167179e-05, "loss": 1.7631, "step": 1418 }, { "epoch": 0.10909649661954518, "grad_norm": 5.570189952850342, "learning_rate": 1.9563614013521822e-05, "loss": 1.7173, "step": 1420 }, { "epoch": 0.1092501536570375, "grad_norm": 4.816314220428467, "learning_rate": 1.9562999385371852e-05, "loss": 1.6941, "step": 1422 }, { "epoch": 0.10940381069452981, "grad_norm": 4.110052585601807, "learning_rate": 1.9562384757221882e-05, "loss": 1.6782, "step": 1424 }, { "epoch": 0.10955746773202213, "grad_norm": 4.069727420806885, "learning_rate": 1.956177012907191e-05, "loss": 1.657, "step": 1426 }, { "epoch": 0.10971112476951445, "grad_norm": 5.244446277618408, "learning_rate": 1.9561155500921944e-05, "loss": 1.683, "step": 1428 }, { "epoch": 0.10986478180700676, "grad_norm": 5.359142780303955, "learning_rate": 1.9560540872771974e-05, "loss": 1.6764, "step": 1430 }, { "epoch": 0.11001843884449908, "grad_norm": 5.057417869567871, "learning_rate": 1.9559926244622004e-05, "loss": 1.787, "step": 1432 }, { "epoch": 0.1101720958819914, "grad_norm": 4.59893274307251, "learning_rate": 1.9559311616472037e-05, "loss": 1.6119, "step": 1434 }, { "epoch": 0.11032575291948371, "grad_norm": 4.8417744636535645, "learning_rate": 1.9558696988322067e-05, "loss": 1.7965, "step": 1436 }, { "epoch": 0.11047940995697603, "grad_norm": 4.829365253448486, "learning_rate": 1.9558082360172096e-05, "loss": 1.6125, "step": 1438 }, { "epoch": 0.11063306699446834, "grad_norm": 4.74966287612915, "learning_rate": 1.955746773202213e-05, "loss": 1.5761, "step": 1440 }, { "epoch": 0.11078672403196066, "grad_norm": 4.8681535720825195, "learning_rate": 1.955685310387216e-05, "loss": 1.5977, "step": 1442 }, { "epoch": 0.11094038106945298, "grad_norm": 4.576766014099121, "learning_rate": 1.955623847572219e-05, "loss": 1.6297, "step": 1444 }, { "epoch": 0.11109403810694529, "grad_norm": 4.206700325012207, "learning_rate": 1.9555623847572222e-05, "loss": 1.6246, "step": 1446 }, { "epoch": 0.11124769514443761, "grad_norm": 4.753570079803467, "learning_rate": 1.955500921942225e-05, "loss": 1.6627, "step": 1448 }, { "epoch": 0.11140135218192994, "grad_norm": 4.992982864379883, "learning_rate": 1.955439459127228e-05, "loss": 1.7223, "step": 1450 }, { "epoch": 0.11155500921942225, "grad_norm": 4.912965297698975, "learning_rate": 1.955377996312231e-05, "loss": 1.5616, "step": 1452 }, { "epoch": 0.11170866625691457, "grad_norm": 4.4759840965271, "learning_rate": 1.9553165334972344e-05, "loss": 1.5403, "step": 1454 }, { "epoch": 0.11186232329440689, "grad_norm": 5.181031703948975, "learning_rate": 1.9552550706822374e-05, "loss": 1.6365, "step": 1456 }, { "epoch": 0.1120159803318992, "grad_norm": 4.845396518707275, "learning_rate": 1.9551936078672403e-05, "loss": 1.6279, "step": 1458 }, { "epoch": 0.11216963736939152, "grad_norm": 4.756799221038818, "learning_rate": 1.9551321450522436e-05, "loss": 1.6919, "step": 1460 }, { "epoch": 0.11232329440688384, "grad_norm": 5.1768107414245605, "learning_rate": 1.9550706822372466e-05, "loss": 1.5765, "step": 1462 }, { "epoch": 0.11247695144437615, "grad_norm": 4.743069648742676, "learning_rate": 1.9550092194222496e-05, "loss": 1.8469, "step": 1464 }, { "epoch": 0.11263060848186847, "grad_norm": 4.831038951873779, "learning_rate": 1.954947756607253e-05, "loss": 1.7746, "step": 1466 }, { "epoch": 0.11278426551936079, "grad_norm": 4.309507846832275, "learning_rate": 1.954886293792256e-05, "loss": 1.5142, "step": 1468 }, { "epoch": 0.1129379225568531, "grad_norm": 55.4850959777832, "learning_rate": 1.9548248309772588e-05, "loss": 1.683, "step": 1470 }, { "epoch": 0.11309157959434542, "grad_norm": 4.364781856536865, "learning_rate": 1.954763368162262e-05, "loss": 1.8977, "step": 1472 }, { "epoch": 0.11324523663183773, "grad_norm": 4.795863151550293, "learning_rate": 1.954701905347265e-05, "loss": 1.6436, "step": 1474 }, { "epoch": 0.11339889366933005, "grad_norm": 4.47898530960083, "learning_rate": 1.954640442532268e-05, "loss": 1.7043, "step": 1476 }, { "epoch": 0.11355255070682237, "grad_norm": 5.761251926422119, "learning_rate": 1.954578979717271e-05, "loss": 1.6943, "step": 1478 }, { "epoch": 0.11370620774431468, "grad_norm": 5.274534225463867, "learning_rate": 1.9545175169022743e-05, "loss": 1.6005, "step": 1480 }, { "epoch": 0.11385986478180701, "grad_norm": 4.412993431091309, "learning_rate": 1.9544560540872773e-05, "loss": 1.4977, "step": 1482 }, { "epoch": 0.11401352181929933, "grad_norm": 4.08578634262085, "learning_rate": 1.9543945912722803e-05, "loss": 1.6459, "step": 1484 }, { "epoch": 0.11416717885679165, "grad_norm": 3.7015979290008545, "learning_rate": 1.9543331284572836e-05, "loss": 1.5978, "step": 1486 }, { "epoch": 0.11432083589428396, "grad_norm": 4.919078826904297, "learning_rate": 1.9542716656422865e-05, "loss": 1.5456, "step": 1488 }, { "epoch": 0.11447449293177628, "grad_norm": 4.756066799163818, "learning_rate": 1.9542102028272895e-05, "loss": 1.7221, "step": 1490 }, { "epoch": 0.1146281499692686, "grad_norm": 4.4432525634765625, "learning_rate": 1.9541487400122928e-05, "loss": 1.5918, "step": 1492 }, { "epoch": 0.11478180700676091, "grad_norm": 5.371875286102295, "learning_rate": 1.9540872771972958e-05, "loss": 1.7009, "step": 1494 }, { "epoch": 0.11493546404425323, "grad_norm": 3.5335211753845215, "learning_rate": 1.9540258143822988e-05, "loss": 1.6021, "step": 1496 }, { "epoch": 0.11508912108174554, "grad_norm": 4.77205753326416, "learning_rate": 1.953964351567302e-05, "loss": 1.5721, "step": 1498 }, { "epoch": 0.11524277811923786, "grad_norm": 5.020537376403809, "learning_rate": 1.953902888752305e-05, "loss": 1.6367, "step": 1500 }, { "epoch": 0.11539643515673018, "grad_norm": 4.866142272949219, "learning_rate": 1.9538414259373083e-05, "loss": 1.6035, "step": 1502 }, { "epoch": 0.11555009219422249, "grad_norm": 3.647397756576538, "learning_rate": 1.953779963122311e-05, "loss": 1.5274, "step": 1504 }, { "epoch": 0.11570374923171481, "grad_norm": 3.999390125274658, "learning_rate": 1.9537185003073143e-05, "loss": 1.7818, "step": 1506 }, { "epoch": 0.11585740626920712, "grad_norm": 4.787381172180176, "learning_rate": 1.9536570374923172e-05, "loss": 1.7033, "step": 1508 }, { "epoch": 0.11601106330669944, "grad_norm": 4.415989398956299, "learning_rate": 1.9535955746773202e-05, "loss": 1.831, "step": 1510 }, { "epoch": 0.11616472034419176, "grad_norm": 4.548354148864746, "learning_rate": 1.9535341118623235e-05, "loss": 1.5145, "step": 1512 }, { "epoch": 0.11631837738168409, "grad_norm": 5.4493560791015625, "learning_rate": 1.9534726490473265e-05, "loss": 1.876, "step": 1514 }, { "epoch": 0.1164720344191764, "grad_norm": 3.9988834857940674, "learning_rate": 1.9534111862323295e-05, "loss": 1.6325, "step": 1516 }, { "epoch": 0.11662569145666872, "grad_norm": 4.861139297485352, "learning_rate": 1.9533497234173328e-05, "loss": 1.6743, "step": 1518 }, { "epoch": 0.11677934849416104, "grad_norm": 5.388833522796631, "learning_rate": 1.9532882606023357e-05, "loss": 1.5854, "step": 1520 }, { "epoch": 0.11693300553165335, "grad_norm": 4.772726058959961, "learning_rate": 1.953226797787339e-05, "loss": 1.6481, "step": 1522 }, { "epoch": 0.11708666256914567, "grad_norm": 4.285337924957275, "learning_rate": 1.9531653349723417e-05, "loss": 1.7054, "step": 1524 }, { "epoch": 0.11724031960663799, "grad_norm": 4.5872626304626465, "learning_rate": 1.953103872157345e-05, "loss": 1.6569, "step": 1526 }, { "epoch": 0.1173939766441303, "grad_norm": 4.3280463218688965, "learning_rate": 1.9530424093423483e-05, "loss": 1.5277, "step": 1528 }, { "epoch": 0.11754763368162262, "grad_norm": 4.480382919311523, "learning_rate": 1.952980946527351e-05, "loss": 1.7289, "step": 1530 }, { "epoch": 0.11770129071911493, "grad_norm": 4.207196235656738, "learning_rate": 1.9529194837123542e-05, "loss": 1.803, "step": 1532 }, { "epoch": 0.11785494775660725, "grad_norm": 4.125123023986816, "learning_rate": 1.9528580208973572e-05, "loss": 1.6289, "step": 1534 }, { "epoch": 0.11800860479409957, "grad_norm": 6.329103469848633, "learning_rate": 1.95279655808236e-05, "loss": 1.6592, "step": 1536 }, { "epoch": 0.11816226183159188, "grad_norm": 4.436602592468262, "learning_rate": 1.9527350952673635e-05, "loss": 1.5904, "step": 1538 }, { "epoch": 0.1183159188690842, "grad_norm": 4.564888954162598, "learning_rate": 1.9526736324523664e-05, "loss": 1.7565, "step": 1540 }, { "epoch": 0.11846957590657652, "grad_norm": 4.3771514892578125, "learning_rate": 1.9526121696373697e-05, "loss": 1.6746, "step": 1542 }, { "epoch": 0.11862323294406883, "grad_norm": 4.449161529541016, "learning_rate": 1.9525507068223727e-05, "loss": 1.6377, "step": 1544 }, { "epoch": 0.11877688998156116, "grad_norm": 4.770364761352539, "learning_rate": 1.9524892440073757e-05, "loss": 1.5953, "step": 1546 }, { "epoch": 0.11893054701905348, "grad_norm": 4.0749640464782715, "learning_rate": 1.952427781192379e-05, "loss": 1.5511, "step": 1548 }, { "epoch": 0.1190842040565458, "grad_norm": 4.361663341522217, "learning_rate": 1.9523663183773816e-05, "loss": 1.5562, "step": 1550 }, { "epoch": 0.11923786109403811, "grad_norm": 4.269155025482178, "learning_rate": 1.952304855562385e-05, "loss": 1.5725, "step": 1552 }, { "epoch": 0.11939151813153043, "grad_norm": 4.128551483154297, "learning_rate": 1.952243392747388e-05, "loss": 1.5, "step": 1554 }, { "epoch": 0.11954517516902274, "grad_norm": 4.763240814208984, "learning_rate": 1.952181929932391e-05, "loss": 1.7572, "step": 1556 }, { "epoch": 0.11969883220651506, "grad_norm": 4.871914386749268, "learning_rate": 1.952120467117394e-05, "loss": 1.609, "step": 1558 }, { "epoch": 0.11985248924400738, "grad_norm": 4.267725467681885, "learning_rate": 1.952059004302397e-05, "loss": 1.5832, "step": 1560 }, { "epoch": 0.12000614628149969, "grad_norm": 4.569482326507568, "learning_rate": 1.9519975414874e-05, "loss": 1.6262, "step": 1562 }, { "epoch": 0.12015980331899201, "grad_norm": 4.285094261169434, "learning_rate": 1.9519360786724034e-05, "loss": 1.7479, "step": 1564 }, { "epoch": 0.12031346035648433, "grad_norm": 4.529351234436035, "learning_rate": 1.9518746158574064e-05, "loss": 1.6297, "step": 1566 }, { "epoch": 0.12046711739397664, "grad_norm": 4.966389179229736, "learning_rate": 1.9518131530424097e-05, "loss": 1.7544, "step": 1568 }, { "epoch": 0.12062077443146896, "grad_norm": 4.608340263366699, "learning_rate": 1.9517516902274127e-05, "loss": 1.4635, "step": 1570 }, { "epoch": 0.12077443146896127, "grad_norm": 3.8790552616119385, "learning_rate": 1.9516902274124156e-05, "loss": 1.6345, "step": 1572 }, { "epoch": 0.12092808850645359, "grad_norm": 5.229369163513184, "learning_rate": 1.951628764597419e-05, "loss": 1.6829, "step": 1574 }, { "epoch": 0.1210817455439459, "grad_norm": 4.269663333892822, "learning_rate": 1.9515673017824216e-05, "loss": 1.568, "step": 1576 }, { "epoch": 0.12123540258143822, "grad_norm": 4.905238151550293, "learning_rate": 1.951505838967425e-05, "loss": 1.671, "step": 1578 }, { "epoch": 0.12138905961893055, "grad_norm": 4.5513596534729, "learning_rate": 1.951444376152428e-05, "loss": 1.6636, "step": 1580 }, { "epoch": 0.12154271665642287, "grad_norm": 4.586058616638184, "learning_rate": 1.9513829133374308e-05, "loss": 1.7669, "step": 1582 }, { "epoch": 0.12169637369391519, "grad_norm": 5.4855170249938965, "learning_rate": 1.951321450522434e-05, "loss": 1.5033, "step": 1584 }, { "epoch": 0.1218500307314075, "grad_norm": 4.668776035308838, "learning_rate": 1.951259987707437e-05, "loss": 1.5859, "step": 1586 }, { "epoch": 0.12200368776889982, "grad_norm": 3.9210376739501953, "learning_rate": 1.9511985248924404e-05, "loss": 1.5757, "step": 1588 }, { "epoch": 0.12215734480639213, "grad_norm": 4.558568000793457, "learning_rate": 1.9511370620774434e-05, "loss": 1.5945, "step": 1590 }, { "epoch": 0.12231100184388445, "grad_norm": 4.247246265411377, "learning_rate": 1.9510755992624463e-05, "loss": 1.624, "step": 1592 }, { "epoch": 0.12246465888137677, "grad_norm": 4.2471604347229, "learning_rate": 1.9510141364474496e-05, "loss": 1.5873, "step": 1594 }, { "epoch": 0.12261831591886908, "grad_norm": 4.362886428833008, "learning_rate": 1.9509526736324526e-05, "loss": 1.7448, "step": 1596 }, { "epoch": 0.1227719729563614, "grad_norm": 5.111678123474121, "learning_rate": 1.9508912108174556e-05, "loss": 1.8134, "step": 1598 }, { "epoch": 0.12292562999385372, "grad_norm": 4.4582624435424805, "learning_rate": 1.950829748002459e-05, "loss": 1.7155, "step": 1600 }, { "epoch": 0.12307928703134603, "grad_norm": 3.796780586242676, "learning_rate": 1.9507682851874615e-05, "loss": 1.5636, "step": 1602 }, { "epoch": 0.12323294406883835, "grad_norm": 4.517824649810791, "learning_rate": 1.9507068223724648e-05, "loss": 1.6092, "step": 1604 }, { "epoch": 0.12338660110633067, "grad_norm": 4.659684181213379, "learning_rate": 1.9506453595574678e-05, "loss": 1.641, "step": 1606 }, { "epoch": 0.12354025814382298, "grad_norm": 4.470782279968262, "learning_rate": 1.950583896742471e-05, "loss": 1.618, "step": 1608 }, { "epoch": 0.1236939151813153, "grad_norm": 4.486400604248047, "learning_rate": 1.950522433927474e-05, "loss": 1.6912, "step": 1610 }, { "epoch": 0.12384757221880763, "grad_norm": 4.459258556365967, "learning_rate": 1.950460971112477e-05, "loss": 1.5627, "step": 1612 }, { "epoch": 0.12400122925629994, "grad_norm": 4.486885070800781, "learning_rate": 1.9503995082974803e-05, "loss": 1.8642, "step": 1614 }, { "epoch": 0.12415488629379226, "grad_norm": 4.576472282409668, "learning_rate": 1.9503380454824833e-05, "loss": 1.6411, "step": 1616 }, { "epoch": 0.12430854333128458, "grad_norm": 4.349391460418701, "learning_rate": 1.9502765826674863e-05, "loss": 1.6382, "step": 1618 }, { "epoch": 0.12446220036877689, "grad_norm": 4.264526844024658, "learning_rate": 1.9502151198524896e-05, "loss": 1.621, "step": 1620 }, { "epoch": 0.12461585740626921, "grad_norm": 4.798770904541016, "learning_rate": 1.9501536570374925e-05, "loss": 1.8124, "step": 1622 }, { "epoch": 0.12476951444376153, "grad_norm": 3.747992515563965, "learning_rate": 1.9500921942224955e-05, "loss": 1.516, "step": 1624 }, { "epoch": 0.12492317148125384, "grad_norm": 4.410411834716797, "learning_rate": 1.9500307314074988e-05, "loss": 1.5645, "step": 1626 }, { "epoch": 0.12507682851874616, "grad_norm": 4.139060020446777, "learning_rate": 1.9499692685925018e-05, "loss": 1.6217, "step": 1628 }, { "epoch": 0.12523048555623847, "grad_norm": 4.380125045776367, "learning_rate": 1.9499078057775048e-05, "loss": 1.6909, "step": 1630 }, { "epoch": 0.1253841425937308, "grad_norm": 4.449796676635742, "learning_rate": 1.9498463429625077e-05, "loss": 1.7215, "step": 1632 }, { "epoch": 0.1255377996312231, "grad_norm": 4.043376922607422, "learning_rate": 1.949784880147511e-05, "loss": 1.6326, "step": 1634 }, { "epoch": 0.12569145666871542, "grad_norm": 4.427875518798828, "learning_rate": 1.949723417332514e-05, "loss": 1.6962, "step": 1636 }, { "epoch": 0.12584511370620774, "grad_norm": 4.617554187774658, "learning_rate": 1.949661954517517e-05, "loss": 1.5711, "step": 1638 }, { "epoch": 0.12599877074370006, "grad_norm": 4.245482444763184, "learning_rate": 1.9496004917025203e-05, "loss": 1.6479, "step": 1640 }, { "epoch": 0.12615242778119237, "grad_norm": 4.876771926879883, "learning_rate": 1.9495390288875232e-05, "loss": 1.7238, "step": 1642 }, { "epoch": 0.1263060848186847, "grad_norm": 4.263737678527832, "learning_rate": 1.9494775660725262e-05, "loss": 1.5666, "step": 1644 }, { "epoch": 0.126459741856177, "grad_norm": 6.202945232391357, "learning_rate": 1.9494161032575295e-05, "loss": 1.6217, "step": 1646 }, { "epoch": 0.12661339889366932, "grad_norm": 4.307828426361084, "learning_rate": 1.9493546404425325e-05, "loss": 1.49, "step": 1648 }, { "epoch": 0.12676705593116164, "grad_norm": 4.122886657714844, "learning_rate": 1.9492931776275355e-05, "loss": 1.7121, "step": 1650 }, { "epoch": 0.12692071296865395, "grad_norm": 4.3632426261901855, "learning_rate": 1.9492317148125384e-05, "loss": 1.6835, "step": 1652 }, { "epoch": 0.12707437000614627, "grad_norm": 4.4186625480651855, "learning_rate": 1.9491702519975417e-05, "loss": 1.7579, "step": 1654 }, { "epoch": 0.1272280270436386, "grad_norm": 4.411682605743408, "learning_rate": 1.9491087891825447e-05, "loss": 1.5771, "step": 1656 }, { "epoch": 0.1273816840811309, "grad_norm": 4.259854316711426, "learning_rate": 1.9490473263675477e-05, "loss": 1.6239, "step": 1658 }, { "epoch": 0.12753534111862325, "grad_norm": 4.225386619567871, "learning_rate": 1.948985863552551e-05, "loss": 1.6777, "step": 1660 }, { "epoch": 0.12768899815611556, "grad_norm": 4.977676868438721, "learning_rate": 1.948924400737554e-05, "loss": 1.6166, "step": 1662 }, { "epoch": 0.12784265519360788, "grad_norm": 3.7306509017944336, "learning_rate": 1.948862937922557e-05, "loss": 1.5834, "step": 1664 }, { "epoch": 0.1279963122311002, "grad_norm": 4.451853275299072, "learning_rate": 1.9488014751075602e-05, "loss": 1.6464, "step": 1666 }, { "epoch": 0.1281499692685925, "grad_norm": 4.641234397888184, "learning_rate": 1.9487400122925632e-05, "loss": 1.6698, "step": 1668 }, { "epoch": 0.12830362630608483, "grad_norm": 5.218206882476807, "learning_rate": 1.948678549477566e-05, "loss": 1.6614, "step": 1670 }, { "epoch": 0.12845728334357714, "grad_norm": 4.623648166656494, "learning_rate": 1.9486170866625695e-05, "loss": 1.6586, "step": 1672 }, { "epoch": 0.12861094038106946, "grad_norm": 5.1708478927612305, "learning_rate": 1.9485556238475724e-05, "loss": 1.6275, "step": 1674 }, { "epoch": 0.12876459741856178, "grad_norm": 4.305856227874756, "learning_rate": 1.9484941610325754e-05, "loss": 1.6349, "step": 1676 }, { "epoch": 0.1289182544560541, "grad_norm": 4.788485050201416, "learning_rate": 1.9484326982175784e-05, "loss": 1.4676, "step": 1678 }, { "epoch": 0.1290719114935464, "grad_norm": 4.4581379890441895, "learning_rate": 1.9483712354025817e-05, "loss": 1.5062, "step": 1680 }, { "epoch": 0.12922556853103873, "grad_norm": 3.9021549224853516, "learning_rate": 1.9483097725875846e-05, "loss": 1.7848, "step": 1682 }, { "epoch": 0.12937922556853104, "grad_norm": 4.530584812164307, "learning_rate": 1.9482483097725876e-05, "loss": 1.6594, "step": 1684 }, { "epoch": 0.12953288260602336, "grad_norm": 4.8017497062683105, "learning_rate": 1.948186846957591e-05, "loss": 1.6167, "step": 1686 }, { "epoch": 0.12968653964351567, "grad_norm": 4.41823148727417, "learning_rate": 1.948125384142594e-05, "loss": 1.5293, "step": 1688 }, { "epoch": 0.129840196681008, "grad_norm": 4.470682144165039, "learning_rate": 1.948063921327597e-05, "loss": 1.6036, "step": 1690 }, { "epoch": 0.1299938537185003, "grad_norm": 3.947842597961426, "learning_rate": 1.9480024585126e-05, "loss": 1.7037, "step": 1692 }, { "epoch": 0.13014751075599262, "grad_norm": 4.953098297119141, "learning_rate": 1.947940995697603e-05, "loss": 1.6436, "step": 1694 }, { "epoch": 0.13030116779348494, "grad_norm": 4.112635135650635, "learning_rate": 1.947879532882606e-05, "loss": 1.4595, "step": 1696 }, { "epoch": 0.13045482483097726, "grad_norm": 4.197033882141113, "learning_rate": 1.9478180700676094e-05, "loss": 1.657, "step": 1698 }, { "epoch": 0.13060848186846957, "grad_norm": 4.02692985534668, "learning_rate": 1.9477566072526124e-05, "loss": 1.6321, "step": 1700 }, { "epoch": 0.1307621389059619, "grad_norm": 4.7861809730529785, "learning_rate": 1.9476951444376153e-05, "loss": 1.5609, "step": 1702 }, { "epoch": 0.1309157959434542, "grad_norm": 4.392903804779053, "learning_rate": 1.9476336816226183e-05, "loss": 1.7549, "step": 1704 }, { "epoch": 0.13106945298094652, "grad_norm": 4.314429759979248, "learning_rate": 1.9475722188076216e-05, "loss": 1.5698, "step": 1706 }, { "epoch": 0.13122311001843884, "grad_norm": 4.254858016967773, "learning_rate": 1.9475107559926246e-05, "loss": 1.6291, "step": 1708 }, { "epoch": 0.13137676705593115, "grad_norm": 4.288058757781982, "learning_rate": 1.9474492931776276e-05, "loss": 1.822, "step": 1710 }, { "epoch": 0.13153042409342347, "grad_norm": 4.206986904144287, "learning_rate": 1.947387830362631e-05, "loss": 1.6372, "step": 1712 }, { "epoch": 0.1316840811309158, "grad_norm": 3.9056224822998047, "learning_rate": 1.947326367547634e-05, "loss": 1.6141, "step": 1714 }, { "epoch": 0.1318377381684081, "grad_norm": 5.1152777671813965, "learning_rate": 1.9472649047326368e-05, "loss": 1.6361, "step": 1716 }, { "epoch": 0.13199139520590042, "grad_norm": 4.0903120040893555, "learning_rate": 1.94720344191764e-05, "loss": 1.5559, "step": 1718 }, { "epoch": 0.13214505224339274, "grad_norm": 4.825276851654053, "learning_rate": 1.947141979102643e-05, "loss": 1.732, "step": 1720 }, { "epoch": 0.13229870928088505, "grad_norm": 4.649293899536133, "learning_rate": 1.947080516287646e-05, "loss": 1.5941, "step": 1722 }, { "epoch": 0.13245236631837737, "grad_norm": 4.052992820739746, "learning_rate": 1.9470190534726494e-05, "loss": 1.5664, "step": 1724 }, { "epoch": 0.1326060233558697, "grad_norm": 4.36129903793335, "learning_rate": 1.9469575906576523e-05, "loss": 1.7345, "step": 1726 }, { "epoch": 0.13275968039336203, "grad_norm": 4.522770404815674, "learning_rate": 1.9468961278426553e-05, "loss": 1.6731, "step": 1728 }, { "epoch": 0.13291333743085434, "grad_norm": 4.922299385070801, "learning_rate": 1.9468346650276583e-05, "loss": 1.8072, "step": 1730 }, { "epoch": 0.13306699446834666, "grad_norm": 4.385134220123291, "learning_rate": 1.9467732022126616e-05, "loss": 1.5836, "step": 1732 }, { "epoch": 0.13322065150583898, "grad_norm": 4.031277179718018, "learning_rate": 1.9467117393976645e-05, "loss": 1.465, "step": 1734 }, { "epoch": 0.1333743085433313, "grad_norm": 4.437002182006836, "learning_rate": 1.9466502765826675e-05, "loss": 1.5624, "step": 1736 }, { "epoch": 0.1335279655808236, "grad_norm": 3.754696846008301, "learning_rate": 1.9465888137676708e-05, "loss": 1.6044, "step": 1738 }, { "epoch": 0.13368162261831593, "grad_norm": 3.967130661010742, "learning_rate": 1.9465273509526738e-05, "loss": 1.5832, "step": 1740 }, { "epoch": 0.13383527965580824, "grad_norm": 3.958448648452759, "learning_rate": 1.9464658881376767e-05, "loss": 1.6812, "step": 1742 }, { "epoch": 0.13398893669330056, "grad_norm": 5.2511982917785645, "learning_rate": 1.94640442532268e-05, "loss": 1.8565, "step": 1744 }, { "epoch": 0.13414259373079288, "grad_norm": 4.229193210601807, "learning_rate": 1.946342962507683e-05, "loss": 1.6365, "step": 1746 }, { "epoch": 0.1342962507682852, "grad_norm": 3.8518741130828857, "learning_rate": 1.946281499692686e-05, "loss": 1.571, "step": 1748 }, { "epoch": 0.1344499078057775, "grad_norm": 4.383627414703369, "learning_rate": 1.946220036877689e-05, "loss": 1.5181, "step": 1750 }, { "epoch": 0.13460356484326982, "grad_norm": 4.58341121673584, "learning_rate": 1.9461585740626923e-05, "loss": 1.6387, "step": 1752 }, { "epoch": 0.13475722188076214, "grad_norm": 4.656858921051025, "learning_rate": 1.9460971112476956e-05, "loss": 1.6474, "step": 1754 }, { "epoch": 0.13491087891825446, "grad_norm": 5.039700031280518, "learning_rate": 1.9460356484326982e-05, "loss": 1.6755, "step": 1756 }, { "epoch": 0.13506453595574677, "grad_norm": 4.46349573135376, "learning_rate": 1.9459741856177015e-05, "loss": 1.5728, "step": 1758 }, { "epoch": 0.1352181929932391, "grad_norm": 4.041154861450195, "learning_rate": 1.9459127228027045e-05, "loss": 1.5847, "step": 1760 }, { "epoch": 0.1353718500307314, "grad_norm": 4.126910209655762, "learning_rate": 1.9458512599877074e-05, "loss": 1.6807, "step": 1762 }, { "epoch": 0.13552550706822372, "grad_norm": 4.063604831695557, "learning_rate": 1.9457897971727108e-05, "loss": 1.5294, "step": 1764 }, { "epoch": 0.13567916410571604, "grad_norm": 4.1347150802612305, "learning_rate": 1.9457283343577137e-05, "loss": 1.5728, "step": 1766 }, { "epoch": 0.13583282114320835, "grad_norm": 4.593793869018555, "learning_rate": 1.9456668715427167e-05, "loss": 1.7155, "step": 1768 }, { "epoch": 0.13598647818070067, "grad_norm": 4.340649127960205, "learning_rate": 1.94560540872772e-05, "loss": 1.6996, "step": 1770 }, { "epoch": 0.136140135218193, "grad_norm": 4.278517246246338, "learning_rate": 1.945543945912723e-05, "loss": 1.6012, "step": 1772 }, { "epoch": 0.1362937922556853, "grad_norm": 4.626030445098877, "learning_rate": 1.9454824830977263e-05, "loss": 1.6195, "step": 1774 }, { "epoch": 0.13644744929317762, "grad_norm": 4.450915813446045, "learning_rate": 1.945421020282729e-05, "loss": 1.6398, "step": 1776 }, { "epoch": 0.13660110633066994, "grad_norm": 4.265727996826172, "learning_rate": 1.9453595574677322e-05, "loss": 1.5958, "step": 1778 }, { "epoch": 0.13675476336816225, "grad_norm": 4.036159038543701, "learning_rate": 1.9452980946527352e-05, "loss": 1.5647, "step": 1780 }, { "epoch": 0.13690842040565457, "grad_norm": 4.2282257080078125, "learning_rate": 1.945236631837738e-05, "loss": 1.6079, "step": 1782 }, { "epoch": 0.13706207744314688, "grad_norm": 4.005040645599365, "learning_rate": 1.9451751690227415e-05, "loss": 1.5044, "step": 1784 }, { "epoch": 0.1372157344806392, "grad_norm": 4.676270484924316, "learning_rate": 1.9451137062077444e-05, "loss": 1.6304, "step": 1786 }, { "epoch": 0.13736939151813152, "grad_norm": 4.598161697387695, "learning_rate": 1.9450522433927474e-05, "loss": 1.6887, "step": 1788 }, { "epoch": 0.13752304855562386, "grad_norm": 5.0116448402404785, "learning_rate": 1.9449907805777507e-05, "loss": 1.6998, "step": 1790 }, { "epoch": 0.13767670559311618, "grad_norm": 4.892838954925537, "learning_rate": 1.9449293177627537e-05, "loss": 1.6274, "step": 1792 }, { "epoch": 0.1378303626306085, "grad_norm": 5.293637752532959, "learning_rate": 1.944867854947757e-05, "loss": 1.628, "step": 1794 }, { "epoch": 0.1379840196681008, "grad_norm": 4.583549976348877, "learning_rate": 1.94480639213276e-05, "loss": 1.4792, "step": 1796 }, { "epoch": 0.13813767670559313, "grad_norm": 3.773277759552002, "learning_rate": 1.944744929317763e-05, "loss": 1.5219, "step": 1798 }, { "epoch": 0.13829133374308544, "grad_norm": 4.440420150756836, "learning_rate": 1.9446834665027662e-05, "loss": 1.6732, "step": 1800 }, { "epoch": 0.13844499078057776, "grad_norm": 4.711763858795166, "learning_rate": 1.944622003687769e-05, "loss": 1.5463, "step": 1802 }, { "epoch": 0.13859864781807008, "grad_norm": 5.035058498382568, "learning_rate": 1.944560540872772e-05, "loss": 1.6961, "step": 1804 }, { "epoch": 0.1387523048555624, "grad_norm": 3.963282346725464, "learning_rate": 1.944499078057775e-05, "loss": 1.5568, "step": 1806 }, { "epoch": 0.1389059618930547, "grad_norm": 4.577483654022217, "learning_rate": 1.944437615242778e-05, "loss": 1.5428, "step": 1808 }, { "epoch": 0.13905961893054702, "grad_norm": 4.509146690368652, "learning_rate": 1.9443761524277814e-05, "loss": 1.6397, "step": 1810 }, { "epoch": 0.13921327596803934, "grad_norm": 4.317050933837891, "learning_rate": 1.9443146896127844e-05, "loss": 1.7304, "step": 1812 }, { "epoch": 0.13936693300553166, "grad_norm": 4.572277069091797, "learning_rate": 1.9442532267977877e-05, "loss": 1.6759, "step": 1814 }, { "epoch": 0.13952059004302397, "grad_norm": 4.773606777191162, "learning_rate": 1.9441917639827906e-05, "loss": 1.6869, "step": 1816 }, { "epoch": 0.1396742470805163, "grad_norm": 4.57815408706665, "learning_rate": 1.9441303011677936e-05, "loss": 1.647, "step": 1818 }, { "epoch": 0.1398279041180086, "grad_norm": 4.822877407073975, "learning_rate": 1.944068838352797e-05, "loss": 1.6182, "step": 1820 }, { "epoch": 0.13998156115550092, "grad_norm": 4.272431373596191, "learning_rate": 1.9440073755378e-05, "loss": 1.6215, "step": 1822 }, { "epoch": 0.14013521819299324, "grad_norm": 4.476557731628418, "learning_rate": 1.943945912722803e-05, "loss": 1.6277, "step": 1824 }, { "epoch": 0.14028887523048555, "grad_norm": 4.522927284240723, "learning_rate": 1.943884449907806e-05, "loss": 1.526, "step": 1826 }, { "epoch": 0.14044253226797787, "grad_norm": 3.991070032119751, "learning_rate": 1.9438229870928088e-05, "loss": 1.6106, "step": 1828 }, { "epoch": 0.1405961893054702, "grad_norm": 4.189483165740967, "learning_rate": 1.943761524277812e-05, "loss": 1.619, "step": 1830 }, { "epoch": 0.1407498463429625, "grad_norm": 4.5693159103393555, "learning_rate": 1.943700061462815e-05, "loss": 1.7438, "step": 1832 }, { "epoch": 0.14090350338045482, "grad_norm": 3.8766119480133057, "learning_rate": 1.943638598647818e-05, "loss": 1.4953, "step": 1834 }, { "epoch": 0.14105716041794714, "grad_norm": 4.294021129608154, "learning_rate": 1.9435771358328213e-05, "loss": 1.5746, "step": 1836 }, { "epoch": 0.14121081745543945, "grad_norm": 4.195743083953857, "learning_rate": 1.9435156730178243e-05, "loss": 1.5049, "step": 1838 }, { "epoch": 0.14136447449293177, "grad_norm": 4.331358909606934, "learning_rate": 1.9434542102028276e-05, "loss": 1.8416, "step": 1840 }, { "epoch": 0.14151813153042408, "grad_norm": 4.328099727630615, "learning_rate": 1.9433927473878306e-05, "loss": 1.7201, "step": 1842 }, { "epoch": 0.1416717885679164, "grad_norm": 4.2462005615234375, "learning_rate": 1.9433312845728336e-05, "loss": 1.6308, "step": 1844 }, { "epoch": 0.14182544560540872, "grad_norm": 4.253352165222168, "learning_rate": 1.943269821757837e-05, "loss": 1.6856, "step": 1846 }, { "epoch": 0.14197910264290103, "grad_norm": 4.154186248779297, "learning_rate": 1.9432083589428395e-05, "loss": 1.6079, "step": 1848 }, { "epoch": 0.14213275968039335, "grad_norm": 6.227648735046387, "learning_rate": 1.9431468961278428e-05, "loss": 1.73, "step": 1850 }, { "epoch": 0.14228641671788567, "grad_norm": 4.038461208343506, "learning_rate": 1.943085433312846e-05, "loss": 1.7227, "step": 1852 }, { "epoch": 0.14244007375537798, "grad_norm": 4.844911098480225, "learning_rate": 1.9430239704978487e-05, "loss": 1.5841, "step": 1854 }, { "epoch": 0.14259373079287033, "grad_norm": 3.845120429992676, "learning_rate": 1.942962507682852e-05, "loss": 1.6545, "step": 1856 }, { "epoch": 0.14274738783036264, "grad_norm": 4.25357723236084, "learning_rate": 1.942901044867855e-05, "loss": 1.6038, "step": 1858 }, { "epoch": 0.14290104486785496, "grad_norm": 4.518612861633301, "learning_rate": 1.9428395820528583e-05, "loss": 1.703, "step": 1860 }, { "epoch": 0.14305470190534728, "grad_norm": 4.541075229644775, "learning_rate": 1.9427781192378613e-05, "loss": 1.6203, "step": 1862 }, { "epoch": 0.1432083589428396, "grad_norm": 4.06412935256958, "learning_rate": 1.9427166564228643e-05, "loss": 1.714, "step": 1864 }, { "epoch": 0.1433620159803319, "grad_norm": 4.289870738983154, "learning_rate": 1.9426551936078676e-05, "loss": 1.5524, "step": 1866 }, { "epoch": 0.14351567301782422, "grad_norm": 3.937469005584717, "learning_rate": 1.9425937307928705e-05, "loss": 1.5572, "step": 1868 }, { "epoch": 0.14366933005531654, "grad_norm": 4.361362457275391, "learning_rate": 1.9425322679778735e-05, "loss": 1.493, "step": 1870 }, { "epoch": 0.14382298709280886, "grad_norm": 3.9257559776306152, "learning_rate": 1.9424708051628768e-05, "loss": 1.502, "step": 1872 }, { "epoch": 0.14397664413030117, "grad_norm": 4.2765655517578125, "learning_rate": 1.9424093423478794e-05, "loss": 1.6001, "step": 1874 }, { "epoch": 0.1441303011677935, "grad_norm": 3.724155902862549, "learning_rate": 1.9423478795328827e-05, "loss": 1.5722, "step": 1876 }, { "epoch": 0.1442839582052858, "grad_norm": 4.135402679443359, "learning_rate": 1.9422864167178857e-05, "loss": 1.7137, "step": 1878 }, { "epoch": 0.14443761524277812, "grad_norm": 4.522433280944824, "learning_rate": 1.942224953902889e-05, "loss": 1.6161, "step": 1880 }, { "epoch": 0.14459127228027044, "grad_norm": 4.247946262359619, "learning_rate": 1.942163491087892e-05, "loss": 1.67, "step": 1882 }, { "epoch": 0.14474492931776275, "grad_norm": 4.322160243988037, "learning_rate": 1.942102028272895e-05, "loss": 1.5362, "step": 1884 }, { "epoch": 0.14489858635525507, "grad_norm": 4.25150728225708, "learning_rate": 1.9420405654578983e-05, "loss": 1.6026, "step": 1886 }, { "epoch": 0.1450522433927474, "grad_norm": 4.95831823348999, "learning_rate": 1.9419791026429012e-05, "loss": 1.6806, "step": 1888 }, { "epoch": 0.1452059004302397, "grad_norm": 4.125936031341553, "learning_rate": 1.9419176398279042e-05, "loss": 1.634, "step": 1890 }, { "epoch": 0.14535955746773202, "grad_norm": 3.6493210792541504, "learning_rate": 1.9418561770129075e-05, "loss": 1.4507, "step": 1892 }, { "epoch": 0.14551321450522434, "grad_norm": 4.338488578796387, "learning_rate": 1.9417947141979105e-05, "loss": 1.5127, "step": 1894 }, { "epoch": 0.14566687154271665, "grad_norm": 4.250901222229004, "learning_rate": 1.9417332513829134e-05, "loss": 1.4201, "step": 1896 }, { "epoch": 0.14582052858020897, "grad_norm": 3.8600480556488037, "learning_rate": 1.9416717885679167e-05, "loss": 1.5595, "step": 1898 }, { "epoch": 0.14597418561770129, "grad_norm": 4.317285537719727, "learning_rate": 1.9416103257529197e-05, "loss": 1.6094, "step": 1900 }, { "epoch": 0.1461278426551936, "grad_norm": 4.718072891235352, "learning_rate": 1.9415488629379227e-05, "loss": 1.768, "step": 1902 }, { "epoch": 0.14628149969268592, "grad_norm": 4.9370808601379395, "learning_rate": 1.9414874001229257e-05, "loss": 1.5801, "step": 1904 }, { "epoch": 0.14643515673017823, "grad_norm": 4.436810493469238, "learning_rate": 1.941425937307929e-05, "loss": 1.5847, "step": 1906 }, { "epoch": 0.14658881376767055, "grad_norm": 4.890700817108154, "learning_rate": 1.941364474492932e-05, "loss": 1.5092, "step": 1908 }, { "epoch": 0.14674247080516287, "grad_norm": 3.926815986633301, "learning_rate": 1.941303011677935e-05, "loss": 1.6612, "step": 1910 }, { "epoch": 0.14689612784265518, "grad_norm": 4.331315994262695, "learning_rate": 1.9412415488629382e-05, "loss": 1.7669, "step": 1912 }, { "epoch": 0.1470497848801475, "grad_norm": 5.178247928619385, "learning_rate": 1.9411800860479412e-05, "loss": 1.6749, "step": 1914 }, { "epoch": 0.14720344191763982, "grad_norm": 3.871377944946289, "learning_rate": 1.941118623232944e-05, "loss": 1.4508, "step": 1916 }, { "epoch": 0.14735709895513213, "grad_norm": 4.062928676605225, "learning_rate": 1.9410571604179474e-05, "loss": 1.458, "step": 1918 }, { "epoch": 0.14751075599262448, "grad_norm": 4.205310344696045, "learning_rate": 1.9409956976029504e-05, "loss": 1.6419, "step": 1920 }, { "epoch": 0.1476644130301168, "grad_norm": 3.822014093399048, "learning_rate": 1.9409342347879534e-05, "loss": 1.558, "step": 1922 }, { "epoch": 0.1478180700676091, "grad_norm": 4.9377593994140625, "learning_rate": 1.9408727719729567e-05, "loss": 1.687, "step": 1924 }, { "epoch": 0.14797172710510142, "grad_norm": 4.531102657318115, "learning_rate": 1.9408113091579597e-05, "loss": 1.7183, "step": 1926 }, { "epoch": 0.14812538414259374, "grad_norm": 4.2398881912231445, "learning_rate": 1.9407498463429626e-05, "loss": 1.5952, "step": 1928 }, { "epoch": 0.14827904118008606, "grad_norm": 4.426001071929932, "learning_rate": 1.9406883835279656e-05, "loss": 1.7844, "step": 1930 }, { "epoch": 0.14843269821757837, "grad_norm": 4.2123494148254395, "learning_rate": 1.940626920712969e-05, "loss": 1.6711, "step": 1932 }, { "epoch": 0.1485863552550707, "grad_norm": 4.681150913238525, "learning_rate": 1.940565457897972e-05, "loss": 1.6319, "step": 1934 }, { "epoch": 0.148740012292563, "grad_norm": 4.499131202697754, "learning_rate": 1.940503995082975e-05, "loss": 1.6108, "step": 1936 }, { "epoch": 0.14889366933005532, "grad_norm": 5.171452522277832, "learning_rate": 1.940442532267978e-05, "loss": 1.7528, "step": 1938 }, { "epoch": 0.14904732636754764, "grad_norm": 4.263631343841553, "learning_rate": 1.940381069452981e-05, "loss": 1.5191, "step": 1940 }, { "epoch": 0.14920098340503996, "grad_norm": 5.029245853424072, "learning_rate": 1.940319606637984e-05, "loss": 1.5623, "step": 1942 }, { "epoch": 0.14935464044253227, "grad_norm": 4.505579948425293, "learning_rate": 1.9402581438229874e-05, "loss": 1.4555, "step": 1944 }, { "epoch": 0.1495082974800246, "grad_norm": 4.269020080566406, "learning_rate": 1.9401966810079904e-05, "loss": 1.5765, "step": 1946 }, { "epoch": 0.1496619545175169, "grad_norm": 3.9689948558807373, "learning_rate": 1.9401352181929933e-05, "loss": 1.6277, "step": 1948 }, { "epoch": 0.14981561155500922, "grad_norm": 3.684130907058716, "learning_rate": 1.9400737553779966e-05, "loss": 1.5172, "step": 1950 }, { "epoch": 0.14996926859250154, "grad_norm": 3.713602066040039, "learning_rate": 1.9400122925629996e-05, "loss": 1.5211, "step": 1952 }, { "epoch": 0.15012292562999385, "grad_norm": 4.927131175994873, "learning_rate": 1.9399508297480026e-05, "loss": 1.6324, "step": 1954 }, { "epoch": 0.15027658266748617, "grad_norm": 4.4799299240112305, "learning_rate": 1.9398893669330055e-05, "loss": 1.5679, "step": 1956 }, { "epoch": 0.15043023970497849, "grad_norm": 4.372833251953125, "learning_rate": 1.939827904118009e-05, "loss": 1.5638, "step": 1958 }, { "epoch": 0.1505838967424708, "grad_norm": 4.321191787719727, "learning_rate": 1.9397664413030118e-05, "loss": 1.4329, "step": 1960 }, { "epoch": 0.15073755377996312, "grad_norm": 4.75023078918457, "learning_rate": 1.9397049784880148e-05, "loss": 1.6682, "step": 1962 }, { "epoch": 0.15089121081745543, "grad_norm": 4.172933101654053, "learning_rate": 1.939643515673018e-05, "loss": 1.7316, "step": 1964 }, { "epoch": 0.15104486785494775, "grad_norm": 4.329415321350098, "learning_rate": 1.939582052858021e-05, "loss": 1.7017, "step": 1966 }, { "epoch": 0.15119852489244007, "grad_norm": 4.249721527099609, "learning_rate": 1.939520590043024e-05, "loss": 1.5796, "step": 1968 }, { "epoch": 0.15135218192993238, "grad_norm": 4.071712970733643, "learning_rate": 1.9394591272280273e-05, "loss": 1.4623, "step": 1970 }, { "epoch": 0.1515058389674247, "grad_norm": 4.0507731437683105, "learning_rate": 1.9393976644130303e-05, "loss": 1.4821, "step": 1972 }, { "epoch": 0.15165949600491702, "grad_norm": 4.356963634490967, "learning_rate": 1.9393362015980333e-05, "loss": 1.547, "step": 1974 }, { "epoch": 0.15181315304240933, "grad_norm": 5.182737350463867, "learning_rate": 1.9392747387830362e-05, "loss": 1.6367, "step": 1976 }, { "epoch": 0.15196681007990165, "grad_norm": 3.9492363929748535, "learning_rate": 1.9392132759680395e-05, "loss": 1.679, "step": 1978 }, { "epoch": 0.15212046711739396, "grad_norm": 4.404160976409912, "learning_rate": 1.9391518131530425e-05, "loss": 1.5288, "step": 1980 }, { "epoch": 0.15227412415488628, "grad_norm": 4.500973701477051, "learning_rate": 1.9390903503380455e-05, "loss": 1.7715, "step": 1982 }, { "epoch": 0.1524277811923786, "grad_norm": 4.121068954467773, "learning_rate": 1.9390288875230488e-05, "loss": 1.5507, "step": 1984 }, { "epoch": 0.15258143822987094, "grad_norm": 3.7095515727996826, "learning_rate": 1.9389674247080518e-05, "loss": 1.616, "step": 1986 }, { "epoch": 0.15273509526736326, "grad_norm": 5.333407878875732, "learning_rate": 1.9389059618930547e-05, "loss": 1.7196, "step": 1988 }, { "epoch": 0.15288875230485557, "grad_norm": 4.188971042633057, "learning_rate": 1.938844499078058e-05, "loss": 1.6256, "step": 1990 }, { "epoch": 0.1530424093423479, "grad_norm": 4.126604080200195, "learning_rate": 1.938783036263061e-05, "loss": 1.5089, "step": 1992 }, { "epoch": 0.1531960663798402, "grad_norm": 4.127197742462158, "learning_rate": 1.938721573448064e-05, "loss": 1.5598, "step": 1994 }, { "epoch": 0.15334972341733252, "grad_norm": 4.481958389282227, "learning_rate": 1.9386601106330673e-05, "loss": 1.6238, "step": 1996 }, { "epoch": 0.15350338045482484, "grad_norm": 4.15784215927124, "learning_rate": 1.9385986478180702e-05, "loss": 1.6611, "step": 1998 }, { "epoch": 0.15365703749231716, "grad_norm": 4.322861194610596, "learning_rate": 1.9385371850030732e-05, "loss": 1.513, "step": 2000 }, { "epoch": 0.15381069452980947, "grad_norm": 3.9926345348358154, "learning_rate": 1.9384757221880762e-05, "loss": 1.5764, "step": 2002 }, { "epoch": 0.1539643515673018, "grad_norm": 5.112368583679199, "learning_rate": 1.9384142593730795e-05, "loss": 1.6181, "step": 2004 }, { "epoch": 0.1541180086047941, "grad_norm": 3.6655466556549072, "learning_rate": 1.9383527965580825e-05, "loss": 1.3721, "step": 2006 }, { "epoch": 0.15427166564228642, "grad_norm": 4.533908367156982, "learning_rate": 1.9382913337430854e-05, "loss": 1.5841, "step": 2008 }, { "epoch": 0.15442532267977874, "grad_norm": 3.722304344177246, "learning_rate": 1.9382298709280887e-05, "loss": 1.5965, "step": 2010 }, { "epoch": 0.15457897971727105, "grad_norm": 4.5336012840271, "learning_rate": 1.9381684081130917e-05, "loss": 1.5995, "step": 2012 }, { "epoch": 0.15473263675476337, "grad_norm": 4.144698619842529, "learning_rate": 1.9381069452980947e-05, "loss": 1.6343, "step": 2014 }, { "epoch": 0.15488629379225569, "grad_norm": 4.033304691314697, "learning_rate": 1.938045482483098e-05, "loss": 1.6888, "step": 2016 }, { "epoch": 0.155039950829748, "grad_norm": 4.00151252746582, "learning_rate": 1.937984019668101e-05, "loss": 1.5246, "step": 2018 }, { "epoch": 0.15519360786724032, "grad_norm": 4.766987323760986, "learning_rate": 1.937922556853104e-05, "loss": 1.7345, "step": 2020 }, { "epoch": 0.15534726490473263, "grad_norm": 4.144290924072266, "learning_rate": 1.9378610940381072e-05, "loss": 1.5848, "step": 2022 }, { "epoch": 0.15550092194222495, "grad_norm": 4.038874626159668, "learning_rate": 1.9377996312231102e-05, "loss": 1.6425, "step": 2024 }, { "epoch": 0.15565457897971727, "grad_norm": 3.832429885864258, "learning_rate": 1.9377381684081135e-05, "loss": 1.5454, "step": 2026 }, { "epoch": 0.15580823601720958, "grad_norm": 4.775087833404541, "learning_rate": 1.937676705593116e-05, "loss": 1.6767, "step": 2028 }, { "epoch": 0.1559618930547019, "grad_norm": 3.997192144393921, "learning_rate": 1.9376152427781194e-05, "loss": 1.5539, "step": 2030 }, { "epoch": 0.15611555009219422, "grad_norm": 3.764519453048706, "learning_rate": 1.9375537799631224e-05, "loss": 1.46, "step": 2032 }, { "epoch": 0.15626920712968653, "grad_norm": 4.074234962463379, "learning_rate": 1.9374923171481254e-05, "loss": 1.5064, "step": 2034 }, { "epoch": 0.15642286416717885, "grad_norm": 4.302229881286621, "learning_rate": 1.9374308543331287e-05, "loss": 1.6276, "step": 2036 }, { "epoch": 0.15657652120467117, "grad_norm": 4.602327346801758, "learning_rate": 1.9373693915181316e-05, "loss": 1.5551, "step": 2038 }, { "epoch": 0.15673017824216348, "grad_norm": 4.131155490875244, "learning_rate": 1.9373079287031346e-05, "loss": 1.4554, "step": 2040 }, { "epoch": 0.1568838352796558, "grad_norm": 4.661727428436279, "learning_rate": 1.937246465888138e-05, "loss": 1.6566, "step": 2042 }, { "epoch": 0.15703749231714811, "grad_norm": 4.23723030090332, "learning_rate": 1.937185003073141e-05, "loss": 1.6038, "step": 2044 }, { "epoch": 0.15719114935464043, "grad_norm": 3.995077133178711, "learning_rate": 1.9371235402581442e-05, "loss": 1.7374, "step": 2046 }, { "epoch": 0.15734480639213275, "grad_norm": 4.001912593841553, "learning_rate": 1.937062077443147e-05, "loss": 1.5991, "step": 2048 }, { "epoch": 0.15749846342962506, "grad_norm": 4.558352470397949, "learning_rate": 1.93700061462815e-05, "loss": 1.6115, "step": 2050 }, { "epoch": 0.1576521204671174, "grad_norm": 4.651041030883789, "learning_rate": 1.9369391518131534e-05, "loss": 1.5451, "step": 2052 }, { "epoch": 0.15780577750460972, "grad_norm": 4.203875541687012, "learning_rate": 1.936877688998156e-05, "loss": 1.5506, "step": 2054 }, { "epoch": 0.15795943454210204, "grad_norm": 4.153205394744873, "learning_rate": 1.9368162261831594e-05, "loss": 1.5209, "step": 2056 }, { "epoch": 0.15811309157959436, "grad_norm": 4.187285900115967, "learning_rate": 1.9367547633681623e-05, "loss": 1.7677, "step": 2058 }, { "epoch": 0.15826674861708667, "grad_norm": 4.654025554656982, "learning_rate": 1.9366933005531653e-05, "loss": 1.63, "step": 2060 }, { "epoch": 0.158420405654579, "grad_norm": 4.145837306976318, "learning_rate": 1.9366318377381686e-05, "loss": 1.7693, "step": 2062 }, { "epoch": 0.1585740626920713, "grad_norm": 4.076268196105957, "learning_rate": 1.9365703749231716e-05, "loss": 1.5436, "step": 2064 }, { "epoch": 0.15872771972956362, "grad_norm": 3.9100687503814697, "learning_rate": 1.936508912108175e-05, "loss": 1.5738, "step": 2066 }, { "epoch": 0.15888137676705594, "grad_norm": 4.173727989196777, "learning_rate": 1.936447449293178e-05, "loss": 1.6155, "step": 2068 }, { "epoch": 0.15903503380454825, "grad_norm": 4.555833339691162, "learning_rate": 1.936385986478181e-05, "loss": 1.6879, "step": 2070 }, { "epoch": 0.15918869084204057, "grad_norm": 4.008725166320801, "learning_rate": 1.936324523663184e-05, "loss": 1.5752, "step": 2072 }, { "epoch": 0.1593423478795329, "grad_norm": 5.628279209136963, "learning_rate": 1.9362630608481868e-05, "loss": 1.6574, "step": 2074 }, { "epoch": 0.1594960049170252, "grad_norm": 4.112339496612549, "learning_rate": 1.93620159803319e-05, "loss": 1.6789, "step": 2076 }, { "epoch": 0.15964966195451752, "grad_norm": 4.378570556640625, "learning_rate": 1.9361401352181934e-05, "loss": 1.4837, "step": 2078 }, { "epoch": 0.15980331899200984, "grad_norm": 3.819582223892212, "learning_rate": 1.936078672403196e-05, "loss": 1.573, "step": 2080 }, { "epoch": 0.15995697602950215, "grad_norm": 4.303280830383301, "learning_rate": 1.9360172095881993e-05, "loss": 1.4278, "step": 2082 }, { "epoch": 0.16011063306699447, "grad_norm": 3.9183764457702637, "learning_rate": 1.9359557467732023e-05, "loss": 1.4808, "step": 2084 }, { "epoch": 0.16026429010448678, "grad_norm": 4.484659671783447, "learning_rate": 1.9358942839582053e-05, "loss": 1.557, "step": 2086 }, { "epoch": 0.1604179471419791, "grad_norm": 4.600409984588623, "learning_rate": 1.9358328211432086e-05, "loss": 1.5848, "step": 2088 }, { "epoch": 0.16057160417947142, "grad_norm": 3.4427108764648438, "learning_rate": 1.9357713583282115e-05, "loss": 1.5589, "step": 2090 }, { "epoch": 0.16072526121696373, "grad_norm": 7.8328375816345215, "learning_rate": 1.935709895513215e-05, "loss": 1.5519, "step": 2092 }, { "epoch": 0.16087891825445605, "grad_norm": 4.630655765533447, "learning_rate": 1.9356484326982178e-05, "loss": 1.6441, "step": 2094 }, { "epoch": 0.16103257529194837, "grad_norm": 4.217165946960449, "learning_rate": 1.9355869698832208e-05, "loss": 1.5559, "step": 2096 }, { "epoch": 0.16118623232944068, "grad_norm": 4.941442012786865, "learning_rate": 1.935525507068224e-05, "loss": 1.5065, "step": 2098 }, { "epoch": 0.161339889366933, "grad_norm": 4.171699523925781, "learning_rate": 1.9354640442532267e-05, "loss": 1.4993, "step": 2100 }, { "epoch": 0.16149354640442531, "grad_norm": 3.8235647678375244, "learning_rate": 1.93540258143823e-05, "loss": 1.6035, "step": 2102 }, { "epoch": 0.16164720344191763, "grad_norm": 5.202590465545654, "learning_rate": 1.935341118623233e-05, "loss": 1.6397, "step": 2104 }, { "epoch": 0.16180086047940995, "grad_norm": 3.8114516735076904, "learning_rate": 1.935279655808236e-05, "loss": 1.5103, "step": 2106 }, { "epoch": 0.16195451751690226, "grad_norm": 4.666793346405029, "learning_rate": 1.9352181929932393e-05, "loss": 1.5363, "step": 2108 }, { "epoch": 0.16210817455439458, "grad_norm": 4.18889856338501, "learning_rate": 1.9351567301782422e-05, "loss": 1.662, "step": 2110 }, { "epoch": 0.1622618315918869, "grad_norm": 4.392826557159424, "learning_rate": 1.9350952673632455e-05, "loss": 1.5492, "step": 2112 }, { "epoch": 0.1624154886293792, "grad_norm": 4.0653839111328125, "learning_rate": 1.9350338045482485e-05, "loss": 1.7512, "step": 2114 }, { "epoch": 0.16256914566687156, "grad_norm": 4.020386219024658, "learning_rate": 1.9349723417332515e-05, "loss": 1.7604, "step": 2116 }, { "epoch": 0.16272280270436387, "grad_norm": 4.062155723571777, "learning_rate": 1.9349108789182548e-05, "loss": 1.7181, "step": 2118 }, { "epoch": 0.1628764597418562, "grad_norm": 4.4019999504089355, "learning_rate": 1.9348494161032578e-05, "loss": 1.639, "step": 2120 }, { "epoch": 0.1630301167793485, "grad_norm": 3.761319875717163, "learning_rate": 1.9347879532882607e-05, "loss": 1.4084, "step": 2122 }, { "epoch": 0.16318377381684082, "grad_norm": 4.566369533538818, "learning_rate": 1.934726490473264e-05, "loss": 1.5193, "step": 2124 }, { "epoch": 0.16333743085433314, "grad_norm": 4.366701602935791, "learning_rate": 1.9346650276582667e-05, "loss": 1.5211, "step": 2126 }, { "epoch": 0.16349108789182545, "grad_norm": 4.0437116622924805, "learning_rate": 1.93460356484327e-05, "loss": 1.5901, "step": 2128 }, { "epoch": 0.16364474492931777, "grad_norm": 3.9914474487304688, "learning_rate": 1.934542102028273e-05, "loss": 1.4975, "step": 2130 }, { "epoch": 0.1637984019668101, "grad_norm": 4.267744541168213, "learning_rate": 1.9344806392132762e-05, "loss": 1.6574, "step": 2132 }, { "epoch": 0.1639520590043024, "grad_norm": 3.843414068222046, "learning_rate": 1.9344191763982792e-05, "loss": 1.4976, "step": 2134 }, { "epoch": 0.16410571604179472, "grad_norm": 4.3155975341796875, "learning_rate": 1.9343577135832822e-05, "loss": 1.5333, "step": 2136 }, { "epoch": 0.16425937307928704, "grad_norm": 4.159292697906494, "learning_rate": 1.9342962507682855e-05, "loss": 1.5053, "step": 2138 }, { "epoch": 0.16441303011677935, "grad_norm": 4.296607971191406, "learning_rate": 1.9342347879532885e-05, "loss": 1.6112, "step": 2140 }, { "epoch": 0.16456668715427167, "grad_norm": 3.9782469272613525, "learning_rate": 1.9341733251382914e-05, "loss": 1.5241, "step": 2142 }, { "epoch": 0.16472034419176398, "grad_norm": 4.183950901031494, "learning_rate": 1.9341118623232947e-05, "loss": 1.5901, "step": 2144 }, { "epoch": 0.1648740012292563, "grad_norm": 3.9188575744628906, "learning_rate": 1.9340503995082977e-05, "loss": 1.621, "step": 2146 }, { "epoch": 0.16502765826674862, "grad_norm": 4.474676132202148, "learning_rate": 1.9339889366933007e-05, "loss": 1.487, "step": 2148 }, { "epoch": 0.16518131530424093, "grad_norm": 4.6410017013549805, "learning_rate": 1.933927473878304e-05, "loss": 1.6997, "step": 2150 }, { "epoch": 0.16533497234173325, "grad_norm": 10.933167457580566, "learning_rate": 1.933866011063307e-05, "loss": 1.4752, "step": 2152 }, { "epoch": 0.16548862937922557, "grad_norm": 4.885360240936279, "learning_rate": 1.93380454824831e-05, "loss": 1.5604, "step": 2154 }, { "epoch": 0.16564228641671788, "grad_norm": 3.9420063495635986, "learning_rate": 1.933743085433313e-05, "loss": 1.5909, "step": 2156 }, { "epoch": 0.1657959434542102, "grad_norm": 3.5523521900177, "learning_rate": 1.9336816226183162e-05, "loss": 1.6274, "step": 2158 }, { "epoch": 0.16594960049170251, "grad_norm": 3.6621317863464355, "learning_rate": 1.933620159803319e-05, "loss": 1.5111, "step": 2160 }, { "epoch": 0.16610325752919483, "grad_norm": 4.57207727432251, "learning_rate": 1.933558696988322e-05, "loss": 1.6702, "step": 2162 }, { "epoch": 0.16625691456668715, "grad_norm": 3.9489428997039795, "learning_rate": 1.9334972341733254e-05, "loss": 1.589, "step": 2164 }, { "epoch": 0.16641057160417946, "grad_norm": 3.8509654998779297, "learning_rate": 1.9334357713583284e-05, "loss": 1.5522, "step": 2166 }, { "epoch": 0.16656422864167178, "grad_norm": 3.6893975734710693, "learning_rate": 1.9333743085433314e-05, "loss": 1.4767, "step": 2168 }, { "epoch": 0.1667178856791641, "grad_norm": 4.064876079559326, "learning_rate": 1.9333128457283347e-05, "loss": 1.5515, "step": 2170 }, { "epoch": 0.1668715427166564, "grad_norm": 4.427343845367432, "learning_rate": 1.9332513829133376e-05, "loss": 1.7098, "step": 2172 }, { "epoch": 0.16702519975414873, "grad_norm": 3.724740505218506, "learning_rate": 1.9331899200983406e-05, "loss": 1.5612, "step": 2174 }, { "epoch": 0.16717885679164105, "grad_norm": 4.302028656005859, "learning_rate": 1.933128457283344e-05, "loss": 1.641, "step": 2176 }, { "epoch": 0.16733251382913336, "grad_norm": 4.149264335632324, "learning_rate": 1.933066994468347e-05, "loss": 1.6923, "step": 2178 }, { "epoch": 0.16748617086662568, "grad_norm": 4.091360092163086, "learning_rate": 1.93300553165335e-05, "loss": 1.4934, "step": 2180 }, { "epoch": 0.16763982790411802, "grad_norm": 4.653087139129639, "learning_rate": 1.9329440688383528e-05, "loss": 1.6434, "step": 2182 }, { "epoch": 0.16779348494161034, "grad_norm": 4.141650199890137, "learning_rate": 1.932882606023356e-05, "loss": 1.5228, "step": 2184 }, { "epoch": 0.16794714197910265, "grad_norm": 4.257270336151123, "learning_rate": 1.932821143208359e-05, "loss": 1.7463, "step": 2186 }, { "epoch": 0.16810079901659497, "grad_norm": 4.1163554191589355, "learning_rate": 1.932759680393362e-05, "loss": 1.4543, "step": 2188 }, { "epoch": 0.1682544560540873, "grad_norm": 3.6683640480041504, "learning_rate": 1.9326982175783654e-05, "loss": 1.4778, "step": 2190 }, { "epoch": 0.1684081130915796, "grad_norm": 3.734006881713867, "learning_rate": 1.9326367547633683e-05, "loss": 1.5738, "step": 2192 }, { "epoch": 0.16856177012907192, "grad_norm": 4.454776287078857, "learning_rate": 1.9325752919483713e-05, "loss": 1.6569, "step": 2194 }, { "epoch": 0.16871542716656424, "grad_norm": 4.1497883796691895, "learning_rate": 1.9325138291333746e-05, "loss": 1.5172, "step": 2196 }, { "epoch": 0.16886908420405655, "grad_norm": 4.288064479827881, "learning_rate": 1.9324523663183776e-05, "loss": 1.6416, "step": 2198 }, { "epoch": 0.16902274124154887, "grad_norm": 3.463115930557251, "learning_rate": 1.9323909035033806e-05, "loss": 1.4196, "step": 2200 }, { "epoch": 0.16917639827904118, "grad_norm": 5.139834403991699, "learning_rate": 1.9323294406883835e-05, "loss": 1.6058, "step": 2202 }, { "epoch": 0.1693300553165335, "grad_norm": 4.1170806884765625, "learning_rate": 1.9322679778733868e-05, "loss": 1.4493, "step": 2204 }, { "epoch": 0.16948371235402582, "grad_norm": 3.8090291023254395, "learning_rate": 1.9322065150583898e-05, "loss": 1.6205, "step": 2206 }, { "epoch": 0.16963736939151813, "grad_norm": 3.461530923843384, "learning_rate": 1.9321450522433928e-05, "loss": 1.4338, "step": 2208 }, { "epoch": 0.16979102642901045, "grad_norm": 4.355661392211914, "learning_rate": 1.932083589428396e-05, "loss": 1.6044, "step": 2210 }, { "epoch": 0.16994468346650277, "grad_norm": 3.4141671657562256, "learning_rate": 1.932022126613399e-05, "loss": 1.5096, "step": 2212 }, { "epoch": 0.17009834050399508, "grad_norm": 4.202045917510986, "learning_rate": 1.931960663798402e-05, "loss": 1.7529, "step": 2214 }, { "epoch": 0.1702519975414874, "grad_norm": 3.8714635372161865, "learning_rate": 1.9318992009834053e-05, "loss": 1.5085, "step": 2216 }, { "epoch": 0.17040565457897972, "grad_norm": 5.047643184661865, "learning_rate": 1.9318377381684083e-05, "loss": 1.6937, "step": 2218 }, { "epoch": 0.17055931161647203, "grad_norm": 4.094550132751465, "learning_rate": 1.9317762753534113e-05, "loss": 1.7053, "step": 2220 }, { "epoch": 0.17071296865396435, "grad_norm": 3.65474796295166, "learning_rate": 1.9317148125384146e-05, "loss": 1.5531, "step": 2222 }, { "epoch": 0.17086662569145666, "grad_norm": 4.38794469833374, "learning_rate": 1.9316533497234175e-05, "loss": 1.7151, "step": 2224 }, { "epoch": 0.17102028272894898, "grad_norm": 4.336912631988525, "learning_rate": 1.9315918869084205e-05, "loss": 1.4871, "step": 2226 }, { "epoch": 0.1711739397664413, "grad_norm": 3.9137933254241943, "learning_rate": 1.9315304240934235e-05, "loss": 1.3785, "step": 2228 }, { "epoch": 0.1713275968039336, "grad_norm": 4.241963863372803, "learning_rate": 1.9314689612784268e-05, "loss": 1.482, "step": 2230 }, { "epoch": 0.17148125384142593, "grad_norm": 4.047958850860596, "learning_rate": 1.9314074984634297e-05, "loss": 1.7098, "step": 2232 }, { "epoch": 0.17163491087891825, "grad_norm": 4.688525676727295, "learning_rate": 1.9313460356484327e-05, "loss": 1.5618, "step": 2234 }, { "epoch": 0.17178856791641056, "grad_norm": 4.020751953125, "learning_rate": 1.931284572833436e-05, "loss": 1.5526, "step": 2236 }, { "epoch": 0.17194222495390288, "grad_norm": 3.93445086479187, "learning_rate": 1.931223110018439e-05, "loss": 1.6595, "step": 2238 }, { "epoch": 0.1720958819913952, "grad_norm": 5.324620723724365, "learning_rate": 1.931161647203442e-05, "loss": 1.657, "step": 2240 }, { "epoch": 0.1722495390288875, "grad_norm": 3.6193902492523193, "learning_rate": 1.9311001843884453e-05, "loss": 1.5, "step": 2242 }, { "epoch": 0.17240319606637983, "grad_norm": 4.382716178894043, "learning_rate": 1.9310387215734482e-05, "loss": 1.6785, "step": 2244 }, { "epoch": 0.17255685310387217, "grad_norm": 3.7359304428100586, "learning_rate": 1.9309772587584512e-05, "loss": 1.5003, "step": 2246 }, { "epoch": 0.1727105101413645, "grad_norm": 4.570140838623047, "learning_rate": 1.9309157959434545e-05, "loss": 1.6353, "step": 2248 }, { "epoch": 0.1728641671788568, "grad_norm": 4.809631824493408, "learning_rate": 1.9308543331284575e-05, "loss": 1.5898, "step": 2250 }, { "epoch": 0.17301782421634912, "grad_norm": 4.994627475738525, "learning_rate": 1.9307928703134604e-05, "loss": 1.6614, "step": 2252 }, { "epoch": 0.17317148125384144, "grad_norm": 4.121060371398926, "learning_rate": 1.9307314074984634e-05, "loss": 1.6817, "step": 2254 }, { "epoch": 0.17332513829133375, "grad_norm": 4.009014129638672, "learning_rate": 1.9306699446834667e-05, "loss": 1.5645, "step": 2256 }, { "epoch": 0.17347879532882607, "grad_norm": 4.27223539352417, "learning_rate": 1.9306084818684697e-05, "loss": 1.6764, "step": 2258 }, { "epoch": 0.17363245236631838, "grad_norm": 4.074213027954102, "learning_rate": 1.9305470190534727e-05, "loss": 1.6095, "step": 2260 }, { "epoch": 0.1737861094038107, "grad_norm": 3.6030173301696777, "learning_rate": 1.930485556238476e-05, "loss": 1.5907, "step": 2262 }, { "epoch": 0.17393976644130302, "grad_norm": 4.34961462020874, "learning_rate": 1.930424093423479e-05, "loss": 1.611, "step": 2264 }, { "epoch": 0.17409342347879533, "grad_norm": 3.9723429679870605, "learning_rate": 1.930362630608482e-05, "loss": 1.5256, "step": 2266 }, { "epoch": 0.17424708051628765, "grad_norm": 3.7899746894836426, "learning_rate": 1.9303011677934852e-05, "loss": 1.6923, "step": 2268 }, { "epoch": 0.17440073755377997, "grad_norm": 4.415828227996826, "learning_rate": 1.9302397049784882e-05, "loss": 1.4681, "step": 2270 }, { "epoch": 0.17455439459127228, "grad_norm": 5.616640567779541, "learning_rate": 1.930178242163491e-05, "loss": 1.6252, "step": 2272 }, { "epoch": 0.1747080516287646, "grad_norm": 3.5017950534820557, "learning_rate": 1.9301167793484944e-05, "loss": 1.4621, "step": 2274 }, { "epoch": 0.17486170866625692, "grad_norm": 4.120169639587402, "learning_rate": 1.9300553165334974e-05, "loss": 1.5537, "step": 2276 }, { "epoch": 0.17501536570374923, "grad_norm": 4.489522457122803, "learning_rate": 1.9299938537185007e-05, "loss": 1.6915, "step": 2278 }, { "epoch": 0.17516902274124155, "grad_norm": 4.285830974578857, "learning_rate": 1.9299323909035034e-05, "loss": 1.5101, "step": 2280 }, { "epoch": 0.17532267977873386, "grad_norm": 3.9038150310516357, "learning_rate": 1.9298709280885067e-05, "loss": 1.6067, "step": 2282 }, { "epoch": 0.17547633681622618, "grad_norm": 3.8521156311035156, "learning_rate": 1.9298094652735096e-05, "loss": 1.4494, "step": 2284 }, { "epoch": 0.1756299938537185, "grad_norm": 7.329223155975342, "learning_rate": 1.9297480024585126e-05, "loss": 1.5588, "step": 2286 }, { "epoch": 0.1757836508912108, "grad_norm": 3.97939395904541, "learning_rate": 1.929686539643516e-05, "loss": 1.4802, "step": 2288 }, { "epoch": 0.17593730792870313, "grad_norm": 3.464115858078003, "learning_rate": 1.929625076828519e-05, "loss": 1.4913, "step": 2290 }, { "epoch": 0.17609096496619545, "grad_norm": 4.677506446838379, "learning_rate": 1.929563614013522e-05, "loss": 1.5254, "step": 2292 }, { "epoch": 0.17624462200368776, "grad_norm": 3.7886929512023926, "learning_rate": 1.929502151198525e-05, "loss": 1.6297, "step": 2294 }, { "epoch": 0.17639827904118008, "grad_norm": 3.5035488605499268, "learning_rate": 1.929440688383528e-05, "loss": 1.657, "step": 2296 }, { "epoch": 0.1765519360786724, "grad_norm": 4.172173976898193, "learning_rate": 1.9293792255685314e-05, "loss": 1.5612, "step": 2298 }, { "epoch": 0.1767055931161647, "grad_norm": 3.9481425285339355, "learning_rate": 1.929317762753534e-05, "loss": 1.4419, "step": 2300 }, { "epoch": 0.17685925015365703, "grad_norm": 3.922159433364868, "learning_rate": 1.9292562999385374e-05, "loss": 1.5459, "step": 2302 }, { "epoch": 0.17701290719114934, "grad_norm": 4.2247233390808105, "learning_rate": 1.9291948371235403e-05, "loss": 1.6869, "step": 2304 }, { "epoch": 0.17716656422864166, "grad_norm": 4.960201740264893, "learning_rate": 1.9291333743085433e-05, "loss": 1.4055, "step": 2306 }, { "epoch": 0.17732022126613398, "grad_norm": 4.675178527832031, "learning_rate": 1.9290719114935466e-05, "loss": 1.4635, "step": 2308 }, { "epoch": 0.1774738783036263, "grad_norm": 4.3724589347839355, "learning_rate": 1.9290104486785496e-05, "loss": 1.4423, "step": 2310 }, { "epoch": 0.17762753534111864, "grad_norm": 4.629543304443359, "learning_rate": 1.9289489858635525e-05, "loss": 1.6696, "step": 2312 }, { "epoch": 0.17778119237861095, "grad_norm": 3.8183395862579346, "learning_rate": 1.928887523048556e-05, "loss": 1.5558, "step": 2314 }, { "epoch": 0.17793484941610327, "grad_norm": 3.7984275817871094, "learning_rate": 1.9288260602335588e-05, "loss": 1.5884, "step": 2316 }, { "epoch": 0.17808850645359559, "grad_norm": 3.9068145751953125, "learning_rate": 1.928764597418562e-05, "loss": 1.6217, "step": 2318 }, { "epoch": 0.1782421634910879, "grad_norm": 4.159458160400391, "learning_rate": 1.928703134603565e-05, "loss": 1.5934, "step": 2320 }, { "epoch": 0.17839582052858022, "grad_norm": 4.013321876525879, "learning_rate": 1.928641671788568e-05, "loss": 1.5542, "step": 2322 }, { "epoch": 0.17854947756607253, "grad_norm": 4.504942893981934, "learning_rate": 1.9285802089735714e-05, "loss": 1.7811, "step": 2324 }, { "epoch": 0.17870313460356485, "grad_norm": 4.721269130706787, "learning_rate": 1.928518746158574e-05, "loss": 1.4974, "step": 2326 }, { "epoch": 0.17885679164105717, "grad_norm": 3.662440776824951, "learning_rate": 1.9284572833435773e-05, "loss": 1.6497, "step": 2328 }, { "epoch": 0.17901044867854948, "grad_norm": 3.8075759410858154, "learning_rate": 1.9283958205285803e-05, "loss": 1.4787, "step": 2330 }, { "epoch": 0.1791641057160418, "grad_norm": 4.013290882110596, "learning_rate": 1.9283343577135832e-05, "loss": 1.5371, "step": 2332 }, { "epoch": 0.17931776275353412, "grad_norm": 4.095331192016602, "learning_rate": 1.9282728948985865e-05, "loss": 1.6089, "step": 2334 }, { "epoch": 0.17947141979102643, "grad_norm": 4.137665748596191, "learning_rate": 1.9282114320835895e-05, "loss": 1.6971, "step": 2336 }, { "epoch": 0.17962507682851875, "grad_norm": 4.847195625305176, "learning_rate": 1.9281499692685928e-05, "loss": 1.365, "step": 2338 }, { "epoch": 0.17977873386601106, "grad_norm": 4.068114280700684, "learning_rate": 1.9280885064535958e-05, "loss": 1.7029, "step": 2340 }, { "epoch": 0.17993239090350338, "grad_norm": 4.104188442230225, "learning_rate": 1.9280270436385988e-05, "loss": 1.636, "step": 2342 }, { "epoch": 0.1800860479409957, "grad_norm": 4.033984661102295, "learning_rate": 1.927965580823602e-05, "loss": 1.5451, "step": 2344 }, { "epoch": 0.180239704978488, "grad_norm": 4.00771951675415, "learning_rate": 1.927904118008605e-05, "loss": 1.5162, "step": 2346 }, { "epoch": 0.18039336201598033, "grad_norm": 4.097219467163086, "learning_rate": 1.927842655193608e-05, "loss": 1.6558, "step": 2348 }, { "epoch": 0.18054701905347265, "grad_norm": 4.354104042053223, "learning_rate": 1.9277811923786113e-05, "loss": 1.61, "step": 2350 }, { "epoch": 0.18070067609096496, "grad_norm": 4.535645484924316, "learning_rate": 1.927719729563614e-05, "loss": 1.6289, "step": 2352 }, { "epoch": 0.18085433312845728, "grad_norm": 4.078785419464111, "learning_rate": 1.9276582667486172e-05, "loss": 1.7284, "step": 2354 }, { "epoch": 0.1810079901659496, "grad_norm": 4.275857448577881, "learning_rate": 1.9275968039336202e-05, "loss": 1.5461, "step": 2356 }, { "epoch": 0.1811616472034419, "grad_norm": 4.156821250915527, "learning_rate": 1.9275353411186232e-05, "loss": 1.5956, "step": 2358 }, { "epoch": 0.18131530424093423, "grad_norm": 3.6036367416381836, "learning_rate": 1.9274738783036265e-05, "loss": 1.5949, "step": 2360 }, { "epoch": 0.18146896127842654, "grad_norm": 4.079240798950195, "learning_rate": 1.9274124154886295e-05, "loss": 1.5671, "step": 2362 }, { "epoch": 0.18162261831591886, "grad_norm": 4.024125576019287, "learning_rate": 1.9273509526736328e-05, "loss": 1.4904, "step": 2364 }, { "epoch": 0.18177627535341118, "grad_norm": 3.7651987075805664, "learning_rate": 1.9272894898586357e-05, "loss": 1.5179, "step": 2366 }, { "epoch": 0.1819299323909035, "grad_norm": 3.8718831539154053, "learning_rate": 1.9272280270436387e-05, "loss": 1.4699, "step": 2368 }, { "epoch": 0.1820835894283958, "grad_norm": 4.548869609832764, "learning_rate": 1.927166564228642e-05, "loss": 1.6476, "step": 2370 }, { "epoch": 0.18223724646588813, "grad_norm": 4.528201580047607, "learning_rate": 1.927105101413645e-05, "loss": 1.6755, "step": 2372 }, { "epoch": 0.18239090350338044, "grad_norm": 4.0802388191223145, "learning_rate": 1.927043638598648e-05, "loss": 1.5722, "step": 2374 }, { "epoch": 0.18254456054087279, "grad_norm": 4.145775318145752, "learning_rate": 1.9269821757836513e-05, "loss": 1.5816, "step": 2376 }, { "epoch": 0.1826982175783651, "grad_norm": 3.925696611404419, "learning_rate": 1.926920712968654e-05, "loss": 1.625, "step": 2378 }, { "epoch": 0.18285187461585742, "grad_norm": 3.8499910831451416, "learning_rate": 1.9268592501536572e-05, "loss": 1.5522, "step": 2380 }, { "epoch": 0.18300553165334973, "grad_norm": 4.174738883972168, "learning_rate": 1.92679778733866e-05, "loss": 1.561, "step": 2382 }, { "epoch": 0.18315918869084205, "grad_norm": 3.801260232925415, "learning_rate": 1.9267363245236635e-05, "loss": 1.5159, "step": 2384 }, { "epoch": 0.18331284572833437, "grad_norm": 4.0040202140808105, "learning_rate": 1.9266748617086664e-05, "loss": 1.6021, "step": 2386 }, { "epoch": 0.18346650276582668, "grad_norm": 4.132852554321289, "learning_rate": 1.9266133988936694e-05, "loss": 1.6887, "step": 2388 }, { "epoch": 0.183620159803319, "grad_norm": 3.7313075065612793, "learning_rate": 1.9265519360786727e-05, "loss": 1.4582, "step": 2390 }, { "epoch": 0.18377381684081132, "grad_norm": 3.824453115463257, "learning_rate": 1.9264904732636757e-05, "loss": 1.4394, "step": 2392 }, { "epoch": 0.18392747387830363, "grad_norm": 4.368152141571045, "learning_rate": 1.9264290104486786e-05, "loss": 1.5397, "step": 2394 }, { "epoch": 0.18408113091579595, "grad_norm": 3.7525463104248047, "learning_rate": 1.926367547633682e-05, "loss": 1.6875, "step": 2396 }, { "epoch": 0.18423478795328826, "grad_norm": 4.229045391082764, "learning_rate": 1.9263060848186846e-05, "loss": 1.6248, "step": 2398 }, { "epoch": 0.18438844499078058, "grad_norm": 3.9596312046051025, "learning_rate": 1.926244622003688e-05, "loss": 1.5278, "step": 2400 } ], "logging_steps": 2, "max_steps": 65080, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5320091651263693e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }