{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5608481868469576, "eval_steps": 500, "global_step": 7300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015365703749231714, "grad_norm": 54.03105926513672, "learning_rate": 1.9999385371850035e-05, "loss": 5.8486, "step": 2 }, { "epoch": 0.00030731407498463427, "grad_norm": 60.26078414916992, "learning_rate": 1.999877074370006e-05, "loss": 5.5476, "step": 4 }, { "epoch": 0.00046097111247695143, "grad_norm": 172.71730041503906, "learning_rate": 1.9998156115550094e-05, "loss": 4.8213, "step": 6 }, { "epoch": 0.0006146281499692685, "grad_norm": 80.93025970458984, "learning_rate": 1.9997541487400124e-05, "loss": 4.8849, "step": 8 }, { "epoch": 0.0007682851874615857, "grad_norm": 118.46244049072266, "learning_rate": 1.9996926859250153e-05, "loss": 4.0143, "step": 10 }, { "epoch": 0.0009219422249539029, "grad_norm": 58.34870910644531, "learning_rate": 1.9996312231100187e-05, "loss": 4.0411, "step": 12 }, { "epoch": 0.0010755992624462201, "grad_norm": 35.50437927246094, "learning_rate": 1.9995697602950216e-05, "loss": 3.9182, "step": 14 }, { "epoch": 0.001229256299938537, "grad_norm": 60.75965118408203, "learning_rate": 1.999508297480025e-05, "loss": 3.7527, "step": 16 }, { "epoch": 0.0013829133374308542, "grad_norm": 53.328765869140625, "learning_rate": 1.999446834665028e-05, "loss": 3.5424, "step": 18 }, { "epoch": 0.0015365703749231714, "grad_norm": 40.73623275756836, "learning_rate": 1.999385371850031e-05, "loss": 3.7179, "step": 20 }, { "epoch": 0.0016902274124154886, "grad_norm": 15.511174201965332, "learning_rate": 1.9993239090350342e-05, "loss": 3.3177, "step": 22 }, { "epoch": 0.0018438844499078057, "grad_norm": 62.24359130859375, "learning_rate": 1.9992624462200368e-05, "loss": 3.5207, "step": 24 }, { "epoch": 0.001997541487400123, "grad_norm": 69.20399475097656, "learning_rate": 1.99920098340504e-05, "loss": 3.6051, "step": 26 }, { "epoch": 0.0021511985248924403, "grad_norm": 39.98881530761719, "learning_rate": 1.999139520590043e-05, "loss": 3.3193, "step": 28 }, { "epoch": 0.0023048555623847574, "grad_norm": 10.113142013549805, "learning_rate": 1.999078057775046e-05, "loss": 3.2187, "step": 30 }, { "epoch": 0.002458512599877074, "grad_norm": 28.31175422668457, "learning_rate": 1.9990165949600494e-05, "loss": 3.4458, "step": 32 }, { "epoch": 0.0026121696373693913, "grad_norm": 21.829612731933594, "learning_rate": 1.9989551321450523e-05, "loss": 3.5843, "step": 34 }, { "epoch": 0.0027658266748617085, "grad_norm": 18.00796127319336, "learning_rate": 1.9988936693300556e-05, "loss": 3.2827, "step": 36 }, { "epoch": 0.0029194837123540257, "grad_norm": 16.2840576171875, "learning_rate": 1.9988322065150586e-05, "loss": 3.2059, "step": 38 }, { "epoch": 0.003073140749846343, "grad_norm": 11.987384796142578, "learning_rate": 1.9987707437000616e-05, "loss": 3.1313, "step": 40 }, { "epoch": 0.00322679778733866, "grad_norm": 6.873617649078369, "learning_rate": 1.998709280885065e-05, "loss": 2.8568, "step": 42 }, { "epoch": 0.003380454824830977, "grad_norm": 6.603146076202393, "learning_rate": 1.998647818070068e-05, "loss": 3.0781, "step": 44 }, { "epoch": 0.0035341118623232943, "grad_norm": 25.308164596557617, "learning_rate": 1.9985863552550708e-05, "loss": 3.0764, "step": 46 }, { "epoch": 0.0036877688998156115, "grad_norm": 15.176654815673828, "learning_rate": 1.998524892440074e-05, "loss": 3.0356, "step": 48 }, { "epoch": 0.0038414259373079286, "grad_norm": 7.444390773773193, "learning_rate": 1.9984634296250767e-05, "loss": 2.9488, "step": 50 }, { "epoch": 0.003995082974800246, "grad_norm": 18.565139770507812, "learning_rate": 1.99840196681008e-05, "loss": 2.7179, "step": 52 }, { "epoch": 0.004148740012292563, "grad_norm": 10.658416748046875, "learning_rate": 1.998340503995083e-05, "loss": 2.716, "step": 54 }, { "epoch": 0.0043023970497848806, "grad_norm": 9.682657241821289, "learning_rate": 1.9982790411800863e-05, "loss": 2.9189, "step": 56 }, { "epoch": 0.004456054087277198, "grad_norm": 20.967639923095703, "learning_rate": 1.9982175783650893e-05, "loss": 2.8078, "step": 58 }, { "epoch": 0.004609711124769515, "grad_norm": 16.931556701660156, "learning_rate": 1.9981561155500923e-05, "loss": 2.837, "step": 60 }, { "epoch": 0.004763368162261831, "grad_norm": 12.055686950683594, "learning_rate": 1.9980946527350956e-05, "loss": 2.7392, "step": 62 }, { "epoch": 0.004917025199754148, "grad_norm": 7.959167957305908, "learning_rate": 1.9980331899200985e-05, "loss": 2.8915, "step": 64 }, { "epoch": 0.0050706822372464655, "grad_norm": 9.24318790435791, "learning_rate": 1.9979717271051015e-05, "loss": 2.5171, "step": 66 }, { "epoch": 0.005224339274738783, "grad_norm": 20.02304458618164, "learning_rate": 1.9979102642901048e-05, "loss": 2.7947, "step": 68 }, { "epoch": 0.0053779963122311, "grad_norm": 8.09688663482666, "learning_rate": 1.9978488014751078e-05, "loss": 2.7323, "step": 70 }, { "epoch": 0.005531653349723417, "grad_norm": 8.636987686157227, "learning_rate": 1.9977873386601108e-05, "loss": 2.6309, "step": 72 }, { "epoch": 0.005685310387215734, "grad_norm": 6.815808296203613, "learning_rate": 1.997725875845114e-05, "loss": 2.4741, "step": 74 }, { "epoch": 0.005838967424708051, "grad_norm": 7.532662868499756, "learning_rate": 1.997664413030117e-05, "loss": 2.5875, "step": 76 }, { "epoch": 0.0059926244622003685, "grad_norm": 6.733164310455322, "learning_rate": 1.99760295021512e-05, "loss": 2.6103, "step": 78 }, { "epoch": 0.006146281499692686, "grad_norm": 6.442116737365723, "learning_rate": 1.997541487400123e-05, "loss": 2.6208, "step": 80 }, { "epoch": 0.006299938537185003, "grad_norm": 6.882765769958496, "learning_rate": 1.9974800245851263e-05, "loss": 2.5554, "step": 82 }, { "epoch": 0.00645359557467732, "grad_norm": 6.64527702331543, "learning_rate": 1.9974185617701292e-05, "loss": 2.5667, "step": 84 }, { "epoch": 0.006607252612169637, "grad_norm": 7.69775390625, "learning_rate": 1.9973570989551322e-05, "loss": 2.6117, "step": 86 }, { "epoch": 0.006760909649661954, "grad_norm": 7.077218532562256, "learning_rate": 1.9972956361401355e-05, "loss": 2.4501, "step": 88 }, { "epoch": 0.0069145666871542714, "grad_norm": 5.539189338684082, "learning_rate": 1.9972341733251385e-05, "loss": 2.4775, "step": 90 }, { "epoch": 0.007068223724646589, "grad_norm": 6.602914333343506, "learning_rate": 1.9971727105101415e-05, "loss": 2.3944, "step": 92 }, { "epoch": 0.007221880762138906, "grad_norm": 5.995626449584961, "learning_rate": 1.9971112476951448e-05, "loss": 2.4826, "step": 94 }, { "epoch": 0.007375537799631223, "grad_norm": 6.836587429046631, "learning_rate": 1.9970497848801477e-05, "loss": 2.468, "step": 96 }, { "epoch": 0.00752919483712354, "grad_norm": 6.4697651863098145, "learning_rate": 1.9969883220651507e-05, "loss": 2.2833, "step": 98 }, { "epoch": 0.007682851874615857, "grad_norm": 8.081903457641602, "learning_rate": 1.996926859250154e-05, "loss": 2.6544, "step": 100 }, { "epoch": 0.007836508912108174, "grad_norm": 6.688724517822266, "learning_rate": 1.996865396435157e-05, "loss": 2.55, "step": 102 }, { "epoch": 0.007990165949600492, "grad_norm": 6.878283977508545, "learning_rate": 1.99680393362016e-05, "loss": 2.2658, "step": 104 }, { "epoch": 0.008143822987092809, "grad_norm": 7.079164505004883, "learning_rate": 1.996742470805163e-05, "loss": 2.4793, "step": 106 }, { "epoch": 0.008297480024585127, "grad_norm": 6.391737461090088, "learning_rate": 1.9966810079901662e-05, "loss": 2.4154, "step": 108 }, { "epoch": 0.008451137062077443, "grad_norm": 7.503854274749756, "learning_rate": 1.9966195451751692e-05, "loss": 2.3137, "step": 110 }, { "epoch": 0.008604794099569761, "grad_norm": 6.10397481918335, "learning_rate": 1.996558082360172e-05, "loss": 2.4306, "step": 112 }, { "epoch": 0.008758451137062077, "grad_norm": 6.1603264808654785, "learning_rate": 1.9964966195451755e-05, "loss": 2.2859, "step": 114 }, { "epoch": 0.008912108174554395, "grad_norm": 7.389194011688232, "learning_rate": 1.9964351567301784e-05, "loss": 2.3876, "step": 116 }, { "epoch": 0.009065765212046712, "grad_norm": 6.887446403503418, "learning_rate": 1.9963736939151814e-05, "loss": 2.5139, "step": 118 }, { "epoch": 0.00921942224953903, "grad_norm": 7.2416768074035645, "learning_rate": 1.9963122311001847e-05, "loss": 2.5653, "step": 120 }, { "epoch": 0.009373079287031346, "grad_norm": 7.454037189483643, "learning_rate": 1.9962507682851877e-05, "loss": 2.3707, "step": 122 }, { "epoch": 0.009526736324523662, "grad_norm": 6.9176483154296875, "learning_rate": 1.9961893054701906e-05, "loss": 2.4158, "step": 124 }, { "epoch": 0.00968039336201598, "grad_norm": 7.838490009307861, "learning_rate": 1.9961278426551936e-05, "loss": 2.45, "step": 126 }, { "epoch": 0.009834050399508297, "grad_norm": 6.680061340332031, "learning_rate": 1.996066379840197e-05, "loss": 2.1975, "step": 128 }, { "epoch": 0.009987707437000615, "grad_norm": 7.567671775817871, "learning_rate": 1.9960049170252e-05, "loss": 2.1892, "step": 130 }, { "epoch": 0.010141364474492931, "grad_norm": 6.0987396240234375, "learning_rate": 1.995943454210203e-05, "loss": 2.2957, "step": 132 }, { "epoch": 0.010295021511985249, "grad_norm": 6.579552173614502, "learning_rate": 1.995881991395206e-05, "loss": 2.4326, "step": 134 }, { "epoch": 0.010448678549477565, "grad_norm": 7.131938934326172, "learning_rate": 1.995820528580209e-05, "loss": 2.4767, "step": 136 }, { "epoch": 0.010602335586969883, "grad_norm": 6.883522033691406, "learning_rate": 1.995759065765212e-05, "loss": 2.3893, "step": 138 }, { "epoch": 0.0107559926244622, "grad_norm": 5.52859354019165, "learning_rate": 1.9956976029502154e-05, "loss": 2.1851, "step": 140 }, { "epoch": 0.010909649661954518, "grad_norm": 6.14478874206543, "learning_rate": 1.9956361401352184e-05, "loss": 2.2019, "step": 142 }, { "epoch": 0.011063306699446834, "grad_norm": 6.477922439575195, "learning_rate": 1.9955746773202213e-05, "loss": 2.2746, "step": 144 }, { "epoch": 0.011216963736939152, "grad_norm": 7.661022186279297, "learning_rate": 1.9955132145052246e-05, "loss": 2.3499, "step": 146 }, { "epoch": 0.011370620774431468, "grad_norm": 7.439324378967285, "learning_rate": 1.9954517516902276e-05, "loss": 2.1848, "step": 148 }, { "epoch": 0.011524277811923786, "grad_norm": 7.070183753967285, "learning_rate": 1.9953902888752306e-05, "loss": 2.2816, "step": 150 }, { "epoch": 0.011677934849416103, "grad_norm": 5.912161350250244, "learning_rate": 1.9953288260602336e-05, "loss": 2.3688, "step": 152 }, { "epoch": 0.01183159188690842, "grad_norm": 6.827462673187256, "learning_rate": 1.995267363245237e-05, "loss": 2.3945, "step": 154 }, { "epoch": 0.011985248924400737, "grad_norm": 5.7712082862854, "learning_rate": 1.9952059004302398e-05, "loss": 2.1618, "step": 156 }, { "epoch": 0.012138905961893055, "grad_norm": 5.9169020652771, "learning_rate": 1.9951444376152428e-05, "loss": 2.1781, "step": 158 }, { "epoch": 0.012292562999385371, "grad_norm": 5.994232177734375, "learning_rate": 1.995082974800246e-05, "loss": 2.1474, "step": 160 }, { "epoch": 0.01244622003687769, "grad_norm": 6.10550594329834, "learning_rate": 1.995021511985249e-05, "loss": 2.2227, "step": 162 }, { "epoch": 0.012599877074370006, "grad_norm": 7.107779502868652, "learning_rate": 1.994960049170252e-05, "loss": 2.334, "step": 164 }, { "epoch": 0.012753534111862324, "grad_norm": 4.990610122680664, "learning_rate": 1.9948985863552553e-05, "loss": 2.2313, "step": 166 }, { "epoch": 0.01290719114935464, "grad_norm": 8.93641185760498, "learning_rate": 1.9948371235402583e-05, "loss": 2.1062, "step": 168 }, { "epoch": 0.013060848186846958, "grad_norm": 5.389564037322998, "learning_rate": 1.9947756607252613e-05, "loss": 2.1729, "step": 170 }, { "epoch": 0.013214505224339274, "grad_norm": 5.347591400146484, "learning_rate": 1.9947141979102646e-05, "loss": 2.0474, "step": 172 }, { "epoch": 0.013368162261831592, "grad_norm": 6.475700378417969, "learning_rate": 1.9946527350952676e-05, "loss": 2.1939, "step": 174 }, { "epoch": 0.013521819299323909, "grad_norm": 6.144668102264404, "learning_rate": 1.9945912722802705e-05, "loss": 2.217, "step": 176 }, { "epoch": 0.013675476336816227, "grad_norm": 6.778875350952148, "learning_rate": 1.9945298094652735e-05, "loss": 2.139, "step": 178 }, { "epoch": 0.013829133374308543, "grad_norm": 7.560453414916992, "learning_rate": 1.9944683466502768e-05, "loss": 2.1931, "step": 180 }, { "epoch": 0.013982790411800861, "grad_norm": 5.251035690307617, "learning_rate": 1.9944068838352798e-05, "loss": 2.1596, "step": 182 }, { "epoch": 0.014136447449293177, "grad_norm": 5.9772162437438965, "learning_rate": 1.9943454210202827e-05, "loss": 2.232, "step": 184 }, { "epoch": 0.014290104486785495, "grad_norm": 7.088453769683838, "learning_rate": 1.994283958205286e-05, "loss": 2.3468, "step": 186 }, { "epoch": 0.014443761524277812, "grad_norm": 6.209799289703369, "learning_rate": 1.994222495390289e-05, "loss": 2.3158, "step": 188 }, { "epoch": 0.01459741856177013, "grad_norm": 6.048709392547607, "learning_rate": 1.994161032575292e-05, "loss": 1.9986, "step": 190 }, { "epoch": 0.014751075599262446, "grad_norm": 5.292468070983887, "learning_rate": 1.9940995697602953e-05, "loss": 2.1564, "step": 192 }, { "epoch": 0.014904732636754764, "grad_norm": 6.045801639556885, "learning_rate": 1.9940381069452983e-05, "loss": 2.2064, "step": 194 }, { "epoch": 0.01505838967424708, "grad_norm": 6.204288482666016, "learning_rate": 1.9939766441303012e-05, "loss": 2.2869, "step": 196 }, { "epoch": 0.015212046711739398, "grad_norm": 6.579591274261475, "learning_rate": 1.9939151813153045e-05, "loss": 2.1494, "step": 198 }, { "epoch": 0.015365703749231715, "grad_norm": 6.20919942855835, "learning_rate": 1.9938537185003075e-05, "loss": 2.0245, "step": 200 }, { "epoch": 0.015519360786724033, "grad_norm": 6.129773139953613, "learning_rate": 1.9937922556853108e-05, "loss": 2.0684, "step": 202 }, { "epoch": 0.01567301782421635, "grad_norm": 7.500084400177002, "learning_rate": 1.9937307928703134e-05, "loss": 2.1818, "step": 204 }, { "epoch": 0.015826674861708665, "grad_norm": 6.189898490905762, "learning_rate": 1.9936693300553167e-05, "loss": 2.1377, "step": 206 }, { "epoch": 0.015980331899200985, "grad_norm": 5.788628101348877, "learning_rate": 1.9936078672403197e-05, "loss": 2.1195, "step": 208 }, { "epoch": 0.0161339889366933, "grad_norm": 6.9061055183410645, "learning_rate": 1.9935464044253227e-05, "loss": 2.1815, "step": 210 }, { "epoch": 0.016287645974185617, "grad_norm": 7.366201877593994, "learning_rate": 1.993484941610326e-05, "loss": 2.2884, "step": 212 }, { "epoch": 0.016441303011677934, "grad_norm": 5.979190826416016, "learning_rate": 1.993423478795329e-05, "loss": 2.3251, "step": 214 }, { "epoch": 0.016594960049170254, "grad_norm": 6.170030117034912, "learning_rate": 1.993362015980332e-05, "loss": 2.2108, "step": 216 }, { "epoch": 0.01674861708666257, "grad_norm": 6.819857120513916, "learning_rate": 1.9933005531653352e-05, "loss": 2.2231, "step": 218 }, { "epoch": 0.016902274124154886, "grad_norm": 7.386382579803467, "learning_rate": 1.9932390903503382e-05, "loss": 2.1647, "step": 220 }, { "epoch": 0.017055931161647202, "grad_norm": 5.797331809997559, "learning_rate": 1.9931776275353415e-05, "loss": 2.2092, "step": 222 }, { "epoch": 0.017209588199139522, "grad_norm": 5.605097770690918, "learning_rate": 1.993116164720344e-05, "loss": 2.2266, "step": 224 }, { "epoch": 0.01736324523663184, "grad_norm": 5.865804672241211, "learning_rate": 1.9930547019053474e-05, "loss": 2.0874, "step": 226 }, { "epoch": 0.017516902274124155, "grad_norm": 7.769106864929199, "learning_rate": 1.9929932390903508e-05, "loss": 2.1032, "step": 228 }, { "epoch": 0.01767055931161647, "grad_norm": 6.673518180847168, "learning_rate": 1.9929317762753534e-05, "loss": 2.0957, "step": 230 }, { "epoch": 0.01782421634910879, "grad_norm": 6.331215858459473, "learning_rate": 1.9928703134603567e-05, "loss": 2.143, "step": 232 }, { "epoch": 0.017977873386601107, "grad_norm": 5.792760848999023, "learning_rate": 1.9928088506453597e-05, "loss": 2.0157, "step": 234 }, { "epoch": 0.018131530424093423, "grad_norm": 6.460434436798096, "learning_rate": 1.9927473878303626e-05, "loss": 2.0018, "step": 236 }, { "epoch": 0.01828518746158574, "grad_norm": 6.339091777801514, "learning_rate": 1.992685925015366e-05, "loss": 2.2635, "step": 238 }, { "epoch": 0.01843884449907806, "grad_norm": 5.446582317352295, "learning_rate": 1.992624462200369e-05, "loss": 2.036, "step": 240 }, { "epoch": 0.018592501536570376, "grad_norm": 6.4099273681640625, "learning_rate": 1.9925629993853722e-05, "loss": 2.0031, "step": 242 }, { "epoch": 0.018746158574062692, "grad_norm": 7.307748794555664, "learning_rate": 1.9925015365703752e-05, "loss": 2.0746, "step": 244 }, { "epoch": 0.01889981561155501, "grad_norm": 5.755754470825195, "learning_rate": 1.992440073755378e-05, "loss": 2.1756, "step": 246 }, { "epoch": 0.019053472649047325, "grad_norm": 5.9470744132995605, "learning_rate": 1.9923786109403815e-05, "loss": 2.1579, "step": 248 }, { "epoch": 0.019207129686539644, "grad_norm": 5.4200873374938965, "learning_rate": 1.992317148125384e-05, "loss": 2.1868, "step": 250 }, { "epoch": 0.01936078672403196, "grad_norm": 6.8247175216674805, "learning_rate": 1.9922556853103874e-05, "loss": 2.1525, "step": 252 }, { "epoch": 0.019514443761524277, "grad_norm": 6.334802627563477, "learning_rate": 1.9921942224953904e-05, "loss": 2.1261, "step": 254 }, { "epoch": 0.019668100799016593, "grad_norm": 7.025927543640137, "learning_rate": 1.9921327596803933e-05, "loss": 2.2474, "step": 256 }, { "epoch": 0.019821757836508913, "grad_norm": 6.594686508178711, "learning_rate": 1.9920712968653966e-05, "loss": 1.9885, "step": 258 }, { "epoch": 0.01997541487400123, "grad_norm": 6.713582992553711, "learning_rate": 1.9920098340503996e-05, "loss": 2.3728, "step": 260 }, { "epoch": 0.020129071911493546, "grad_norm": 5.78023099899292, "learning_rate": 1.9919483712354026e-05, "loss": 2.0887, "step": 262 }, { "epoch": 0.020282728948985862, "grad_norm": 5.462549686431885, "learning_rate": 1.991886908420406e-05, "loss": 2.0673, "step": 264 }, { "epoch": 0.020436385986478182, "grad_norm": 6.792922019958496, "learning_rate": 1.991825445605409e-05, "loss": 2.177, "step": 266 }, { "epoch": 0.020590043023970498, "grad_norm": 6.281880855560303, "learning_rate": 1.991763982790412e-05, "loss": 1.9686, "step": 268 }, { "epoch": 0.020743700061462814, "grad_norm": 5.745354175567627, "learning_rate": 1.991702519975415e-05, "loss": 2.1414, "step": 270 }, { "epoch": 0.02089735709895513, "grad_norm": 6.046512126922607, "learning_rate": 1.991641057160418e-05, "loss": 2.1541, "step": 272 }, { "epoch": 0.02105101413644745, "grad_norm": 7.513150691986084, "learning_rate": 1.9915795943454214e-05, "loss": 2.1383, "step": 274 }, { "epoch": 0.021204671173939767, "grad_norm": 8.351797103881836, "learning_rate": 1.991518131530424e-05, "loss": 2.209, "step": 276 }, { "epoch": 0.021358328211432083, "grad_norm": 6.781789302825928, "learning_rate": 1.9914566687154273e-05, "loss": 1.9494, "step": 278 }, { "epoch": 0.0215119852489244, "grad_norm": 5.912288188934326, "learning_rate": 1.9913952059004303e-05, "loss": 1.9871, "step": 280 }, { "epoch": 0.02166564228641672, "grad_norm": 5.441234111785889, "learning_rate": 1.9913337430854333e-05, "loss": 2.118, "step": 282 }, { "epoch": 0.021819299323909035, "grad_norm": 6.041057109832764, "learning_rate": 1.9912722802704366e-05, "loss": 2.0064, "step": 284 }, { "epoch": 0.02197295636140135, "grad_norm": 6.26601505279541, "learning_rate": 1.9912108174554395e-05, "loss": 1.9593, "step": 286 }, { "epoch": 0.022126613398893668, "grad_norm": 6.992424488067627, "learning_rate": 1.991149354640443e-05, "loss": 2.1785, "step": 288 }, { "epoch": 0.022280270436385988, "grad_norm": 7.048946857452393, "learning_rate": 1.9910878918254458e-05, "loss": 2.0809, "step": 290 }, { "epoch": 0.022433927473878304, "grad_norm": 7.00367546081543, "learning_rate": 1.9910264290104488e-05, "loss": 2.0688, "step": 292 }, { "epoch": 0.02258758451137062, "grad_norm": 6.326030731201172, "learning_rate": 1.990964966195452e-05, "loss": 2.1279, "step": 294 }, { "epoch": 0.022741241548862937, "grad_norm": 5.886343002319336, "learning_rate": 1.990903503380455e-05, "loss": 1.9146, "step": 296 }, { "epoch": 0.022894898586355256, "grad_norm": 6.407416820526123, "learning_rate": 1.990842040565458e-05, "loss": 2.073, "step": 298 }, { "epoch": 0.023048555623847573, "grad_norm": 5.35817289352417, "learning_rate": 1.9907805777504613e-05, "loss": 2.064, "step": 300 }, { "epoch": 0.02320221266133989, "grad_norm": 5.71148157119751, "learning_rate": 1.990719114935464e-05, "loss": 2.2207, "step": 302 }, { "epoch": 0.023355869698832205, "grad_norm": 7.2422051429748535, "learning_rate": 1.9906576521204673e-05, "loss": 2.1518, "step": 304 }, { "epoch": 0.023509526736324525, "grad_norm": 7.267468452453613, "learning_rate": 1.9905961893054702e-05, "loss": 2.0082, "step": 306 }, { "epoch": 0.02366318377381684, "grad_norm": 6.504114627838135, "learning_rate": 1.9905347264904736e-05, "loss": 1.9722, "step": 308 }, { "epoch": 0.023816840811309158, "grad_norm": 7.074812889099121, "learning_rate": 1.9904732636754765e-05, "loss": 2.1789, "step": 310 }, { "epoch": 0.023970497848801474, "grad_norm": 6.774876117706299, "learning_rate": 1.9904118008604795e-05, "loss": 2.219, "step": 312 }, { "epoch": 0.024124154886293794, "grad_norm": 5.666469097137451, "learning_rate": 1.9903503380454828e-05, "loss": 1.8294, "step": 314 }, { "epoch": 0.02427781192378611, "grad_norm": 6.548127174377441, "learning_rate": 1.9902888752304858e-05, "loss": 2.0859, "step": 316 }, { "epoch": 0.024431468961278426, "grad_norm": 5.174642562866211, "learning_rate": 1.9902274124154887e-05, "loss": 1.989, "step": 318 }, { "epoch": 0.024585125998770743, "grad_norm": 5.891490936279297, "learning_rate": 1.990165949600492e-05, "loss": 2.0776, "step": 320 }, { "epoch": 0.024738783036263062, "grad_norm": 5.7647504806518555, "learning_rate": 1.9901044867854947e-05, "loss": 1.9681, "step": 322 }, { "epoch": 0.02489244007375538, "grad_norm": 5.61868143081665, "learning_rate": 1.990043023970498e-05, "loss": 1.8923, "step": 324 }, { "epoch": 0.025046097111247695, "grad_norm": 7.358055114746094, "learning_rate": 1.9899815611555013e-05, "loss": 1.9859, "step": 326 }, { "epoch": 0.02519975414874001, "grad_norm": 5.265814781188965, "learning_rate": 1.9899200983405043e-05, "loss": 1.939, "step": 328 }, { "epoch": 0.02535341118623233, "grad_norm": 9.370257377624512, "learning_rate": 1.9898586355255072e-05, "loss": 1.9538, "step": 330 }, { "epoch": 0.025507068223724647, "grad_norm": 7.504848003387451, "learning_rate": 1.9897971727105102e-05, "loss": 2.0802, "step": 332 }, { "epoch": 0.025660725261216964, "grad_norm": 5.975841045379639, "learning_rate": 1.9897357098955135e-05, "loss": 1.853, "step": 334 }, { "epoch": 0.02581438229870928, "grad_norm": 6.099985122680664, "learning_rate": 1.9896742470805165e-05, "loss": 2.0014, "step": 336 }, { "epoch": 0.0259680393362016, "grad_norm": 6.825030326843262, "learning_rate": 1.9896127842655194e-05, "loss": 1.9608, "step": 338 }, { "epoch": 0.026121696373693916, "grad_norm": 6.16441535949707, "learning_rate": 1.9895513214505227e-05, "loss": 2.0848, "step": 340 }, { "epoch": 0.026275353411186232, "grad_norm": 6.392692565917969, "learning_rate": 1.9894898586355257e-05, "loss": 1.9651, "step": 342 }, { "epoch": 0.02642901044867855, "grad_norm": 5.567882537841797, "learning_rate": 1.9894283958205287e-05, "loss": 2.1211, "step": 344 }, { "epoch": 0.026582667486170868, "grad_norm": 10.182480812072754, "learning_rate": 1.989366933005532e-05, "loss": 1.9924, "step": 346 }, { "epoch": 0.026736324523663185, "grad_norm": 5.608663558959961, "learning_rate": 1.989305470190535e-05, "loss": 1.936, "step": 348 }, { "epoch": 0.0268899815611555, "grad_norm": 5.883683204650879, "learning_rate": 1.989244007375538e-05, "loss": 2.0998, "step": 350 }, { "epoch": 0.027043638598647817, "grad_norm": 8.584614753723145, "learning_rate": 1.989182544560541e-05, "loss": 2.1266, "step": 352 }, { "epoch": 0.027197295636140137, "grad_norm": 6.828667640686035, "learning_rate": 1.9891210817455442e-05, "loss": 1.8693, "step": 354 }, { "epoch": 0.027350952673632453, "grad_norm": 7.0278449058532715, "learning_rate": 1.989059618930547e-05, "loss": 1.9258, "step": 356 }, { "epoch": 0.02750460971112477, "grad_norm": 5.643075466156006, "learning_rate": 1.98899815611555e-05, "loss": 2.071, "step": 358 }, { "epoch": 0.027658266748617086, "grad_norm": 6.685908794403076, "learning_rate": 1.9889366933005534e-05, "loss": 1.8658, "step": 360 }, { "epoch": 0.027811923786109402, "grad_norm": 5.766722679138184, "learning_rate": 1.9888752304855564e-05, "loss": 2.0608, "step": 362 }, { "epoch": 0.027965580823601722, "grad_norm": 6.229999542236328, "learning_rate": 1.9888137676705594e-05, "loss": 1.8478, "step": 364 }, { "epoch": 0.028119237861094038, "grad_norm": 14.6449613571167, "learning_rate": 1.9887523048555627e-05, "loss": 2.0233, "step": 366 }, { "epoch": 0.028272894898586354, "grad_norm": 5.458970069885254, "learning_rate": 1.9886908420405657e-05, "loss": 1.8742, "step": 368 }, { "epoch": 0.02842655193607867, "grad_norm": 9.708429336547852, "learning_rate": 1.9886293792255686e-05, "loss": 2.0435, "step": 370 }, { "epoch": 0.02858020897357099, "grad_norm": 8.345685958862305, "learning_rate": 1.988567916410572e-05, "loss": 1.9448, "step": 372 }, { "epoch": 0.028733866011063307, "grad_norm": 5.213901519775391, "learning_rate": 1.988506453595575e-05, "loss": 1.8902, "step": 374 }, { "epoch": 0.028887523048555623, "grad_norm": 6.842494964599609, "learning_rate": 1.988444990780578e-05, "loss": 2.0958, "step": 376 }, { "epoch": 0.02904118008604794, "grad_norm": 6.533809185028076, "learning_rate": 1.988383527965581e-05, "loss": 2.0073, "step": 378 }, { "epoch": 0.02919483712354026, "grad_norm": 5.832721710205078, "learning_rate": 1.988322065150584e-05, "loss": 1.9462, "step": 380 }, { "epoch": 0.029348494161032575, "grad_norm": 6.040827751159668, "learning_rate": 1.988260602335587e-05, "loss": 2.0111, "step": 382 }, { "epoch": 0.02950215119852489, "grad_norm": 6.082043647766113, "learning_rate": 1.98819913952059e-05, "loss": 2.0088, "step": 384 }, { "epoch": 0.029655808236017208, "grad_norm": 4.5363383293151855, "learning_rate": 1.9881376767055934e-05, "loss": 1.9059, "step": 386 }, { "epoch": 0.029809465273509528, "grad_norm": 4.769321918487549, "learning_rate": 1.9880762138905964e-05, "loss": 1.8781, "step": 388 }, { "epoch": 0.029963122311001844, "grad_norm": 6.1424994468688965, "learning_rate": 1.9880147510755993e-05, "loss": 2.0232, "step": 390 }, { "epoch": 0.03011677934849416, "grad_norm": 6.081544399261475, "learning_rate": 1.9879532882606026e-05, "loss": 1.8908, "step": 392 }, { "epoch": 0.030270436385986477, "grad_norm": 6.146285057067871, "learning_rate": 1.9878918254456056e-05, "loss": 2.0144, "step": 394 }, { "epoch": 0.030424093423478796, "grad_norm": 5.401834011077881, "learning_rate": 1.9878303626306086e-05, "loss": 1.8258, "step": 396 }, { "epoch": 0.030577750460971113, "grad_norm": 6.835007667541504, "learning_rate": 1.987768899815612e-05, "loss": 2.0515, "step": 398 }, { "epoch": 0.03073140749846343, "grad_norm": 7.031691551208496, "learning_rate": 1.987707437000615e-05, "loss": 2.0362, "step": 400 }, { "epoch": 0.030885064535955745, "grad_norm": 5.733877182006836, "learning_rate": 1.9876459741856178e-05, "loss": 2.099, "step": 402 }, { "epoch": 0.031038721573448065, "grad_norm": 6.152698516845703, "learning_rate": 1.9875845113706208e-05, "loss": 1.9393, "step": 404 }, { "epoch": 0.03119237861094038, "grad_norm": 5.859741687774658, "learning_rate": 1.987523048555624e-05, "loss": 1.995, "step": 406 }, { "epoch": 0.0313460356484327, "grad_norm": 6.834084510803223, "learning_rate": 1.987461585740627e-05, "loss": 2.0035, "step": 408 }, { "epoch": 0.031499692685925014, "grad_norm": 6.169229030609131, "learning_rate": 1.98740012292563e-05, "loss": 2.1276, "step": 410 }, { "epoch": 0.03165334972341733, "grad_norm": 5.270079135894775, "learning_rate": 1.9873386601106333e-05, "loss": 2.0073, "step": 412 }, { "epoch": 0.03180700676090965, "grad_norm": 5.952144145965576, "learning_rate": 1.9872771972956363e-05, "loss": 1.8817, "step": 414 }, { "epoch": 0.03196066379840197, "grad_norm": 6.3290019035339355, "learning_rate": 1.9872157344806393e-05, "loss": 1.9526, "step": 416 }, { "epoch": 0.032114320835894286, "grad_norm": 5.712306499481201, "learning_rate": 1.9871542716656426e-05, "loss": 2.0793, "step": 418 }, { "epoch": 0.0322679778733866, "grad_norm": 5.497166156768799, "learning_rate": 1.9870928088506455e-05, "loss": 1.9061, "step": 420 }, { "epoch": 0.03242163491087892, "grad_norm": 6.435750484466553, "learning_rate": 1.9870313460356485e-05, "loss": 1.8971, "step": 422 }, { "epoch": 0.032575291948371235, "grad_norm": 5.9519734382629395, "learning_rate": 1.9869698832206518e-05, "loss": 2.0295, "step": 424 }, { "epoch": 0.03272894898586355, "grad_norm": 6.359841823577881, "learning_rate": 1.9869084204056548e-05, "loss": 1.9017, "step": 426 }, { "epoch": 0.03288260602335587, "grad_norm": 6.195022106170654, "learning_rate": 1.9868469575906578e-05, "loss": 2.0663, "step": 428 }, { "epoch": 0.033036263060848184, "grad_norm": 5.500522613525391, "learning_rate": 1.9867854947756607e-05, "loss": 1.9694, "step": 430 }, { "epoch": 0.03318992009834051, "grad_norm": 7.16880464553833, "learning_rate": 1.986724031960664e-05, "loss": 1.918, "step": 432 }, { "epoch": 0.03334357713583282, "grad_norm": 6.0987348556518555, "learning_rate": 1.986662569145667e-05, "loss": 1.8705, "step": 434 }, { "epoch": 0.03349723417332514, "grad_norm": 6.8652753829956055, "learning_rate": 1.98660110633067e-05, "loss": 1.9383, "step": 436 }, { "epoch": 0.033650891210817456, "grad_norm": 5.421166896820068, "learning_rate": 1.9865396435156733e-05, "loss": 1.879, "step": 438 }, { "epoch": 0.03380454824830977, "grad_norm": 5.929842948913574, "learning_rate": 1.9864781807006762e-05, "loss": 1.7183, "step": 440 }, { "epoch": 0.03395820528580209, "grad_norm": 5.500015735626221, "learning_rate": 1.9864167178856792e-05, "loss": 1.9168, "step": 442 }, { "epoch": 0.034111862323294405, "grad_norm": 6.267481327056885, "learning_rate": 1.9863552550706825e-05, "loss": 1.8126, "step": 444 }, { "epoch": 0.03426551936078672, "grad_norm": 6.300197124481201, "learning_rate": 1.9862937922556855e-05, "loss": 2.0519, "step": 446 }, { "epoch": 0.034419176398279044, "grad_norm": 8.094818115234375, "learning_rate": 1.9862323294406885e-05, "loss": 1.8122, "step": 448 }, { "epoch": 0.03457283343577136, "grad_norm": 5.738587379455566, "learning_rate": 1.9861708666256914e-05, "loss": 1.8155, "step": 450 }, { "epoch": 0.03472649047326368, "grad_norm": 5.194686412811279, "learning_rate": 1.9861094038106947e-05, "loss": 1.9198, "step": 452 }, { "epoch": 0.03488014751075599, "grad_norm": 4.97174072265625, "learning_rate": 1.9860479409956977e-05, "loss": 1.9955, "step": 454 }, { "epoch": 0.03503380454824831, "grad_norm": 5.790378570556641, "learning_rate": 1.9859864781807007e-05, "loss": 1.8218, "step": 456 }, { "epoch": 0.035187461585740626, "grad_norm": 5.287135124206543, "learning_rate": 1.985925015365704e-05, "loss": 1.9169, "step": 458 }, { "epoch": 0.03534111862323294, "grad_norm": 8.098136901855469, "learning_rate": 1.985863552550707e-05, "loss": 1.9039, "step": 460 }, { "epoch": 0.03549477566072526, "grad_norm": 6.957726955413818, "learning_rate": 1.98580208973571e-05, "loss": 2.0036, "step": 462 }, { "epoch": 0.03564843269821758, "grad_norm": 4.368841171264648, "learning_rate": 1.9857406269207132e-05, "loss": 1.8883, "step": 464 }, { "epoch": 0.0358020897357099, "grad_norm": 5.95673131942749, "learning_rate": 1.9856791641057162e-05, "loss": 1.8977, "step": 466 }, { "epoch": 0.035955746773202214, "grad_norm": 7.365513324737549, "learning_rate": 1.985617701290719e-05, "loss": 1.9865, "step": 468 }, { "epoch": 0.03610940381069453, "grad_norm": 5.386063098907471, "learning_rate": 1.9855562384757225e-05, "loss": 1.8164, "step": 470 }, { "epoch": 0.03626306084818685, "grad_norm": 6.155988693237305, "learning_rate": 1.9854947756607254e-05, "loss": 2.0083, "step": 472 }, { "epoch": 0.03641671788567916, "grad_norm": 6.110922336578369, "learning_rate": 1.9854333128457287e-05, "loss": 1.8688, "step": 474 }, { "epoch": 0.03657037492317148, "grad_norm": 5.692699909210205, "learning_rate": 1.9853718500307314e-05, "loss": 1.8501, "step": 476 }, { "epoch": 0.036724031960663796, "grad_norm": 6.044013977050781, "learning_rate": 1.9853103872157347e-05, "loss": 1.8486, "step": 478 }, { "epoch": 0.03687768899815612, "grad_norm": 6.102372169494629, "learning_rate": 1.9852489244007376e-05, "loss": 2.0873, "step": 480 }, { "epoch": 0.037031346035648435, "grad_norm": 5.4327239990234375, "learning_rate": 1.9851874615857406e-05, "loss": 1.8635, "step": 482 }, { "epoch": 0.03718500307314075, "grad_norm": 5.779347896575928, "learning_rate": 1.985125998770744e-05, "loss": 2.0413, "step": 484 }, { "epoch": 0.03733866011063307, "grad_norm": 5.000186920166016, "learning_rate": 1.985064535955747e-05, "loss": 2.0214, "step": 486 }, { "epoch": 0.037492317148125384, "grad_norm": 6.581515312194824, "learning_rate": 1.98500307314075e-05, "loss": 1.9141, "step": 488 }, { "epoch": 0.0376459741856177, "grad_norm": 6.037952423095703, "learning_rate": 1.984941610325753e-05, "loss": 1.9475, "step": 490 }, { "epoch": 0.03779963122311002, "grad_norm": 4.99038553237915, "learning_rate": 1.984880147510756e-05, "loss": 1.8296, "step": 492 }, { "epoch": 0.03795328826060233, "grad_norm": 5.351291656494141, "learning_rate": 1.9848186846957594e-05, "loss": 1.9845, "step": 494 }, { "epoch": 0.03810694529809465, "grad_norm": 6.249404430389404, "learning_rate": 1.9847572218807624e-05, "loss": 1.8824, "step": 496 }, { "epoch": 0.03826060233558697, "grad_norm": 5.460664749145508, "learning_rate": 1.9846957590657654e-05, "loss": 1.9348, "step": 498 }, { "epoch": 0.03841425937307929, "grad_norm": 5.399702072143555, "learning_rate": 1.9846342962507687e-05, "loss": 1.8646, "step": 500 }, { "epoch": 0.038567916410571605, "grad_norm": 6.00943660736084, "learning_rate": 1.9845728334357713e-05, "loss": 1.8804, "step": 502 }, { "epoch": 0.03872157344806392, "grad_norm": 6.057244300842285, "learning_rate": 1.9845113706207746e-05, "loss": 1.8876, "step": 504 }, { "epoch": 0.03887523048555624, "grad_norm": 5.178292274475098, "learning_rate": 1.9844499078057776e-05, "loss": 1.8163, "step": 506 }, { "epoch": 0.039028887523048554, "grad_norm": 5.430099964141846, "learning_rate": 1.9843884449907806e-05, "loss": 1.9221, "step": 508 }, { "epoch": 0.03918254456054087, "grad_norm": 5.2391791343688965, "learning_rate": 1.984326982175784e-05, "loss": 1.9671, "step": 510 }, { "epoch": 0.03933620159803319, "grad_norm": 6.54328727722168, "learning_rate": 1.9842655193607868e-05, "loss": 1.9916, "step": 512 }, { "epoch": 0.03948985863552551, "grad_norm": 5.6781134605407715, "learning_rate": 1.98420405654579e-05, "loss": 1.9, "step": 514 }, { "epoch": 0.039643515673017826, "grad_norm": 5.34329891204834, "learning_rate": 1.984142593730793e-05, "loss": 1.7433, "step": 516 }, { "epoch": 0.03979717271051014, "grad_norm": 6.142169952392578, "learning_rate": 1.984081130915796e-05, "loss": 1.8559, "step": 518 }, { "epoch": 0.03995082974800246, "grad_norm": 5.825856685638428, "learning_rate": 1.9840196681007994e-05, "loss": 1.7434, "step": 520 }, { "epoch": 0.040104486785494775, "grad_norm": 4.883429050445557, "learning_rate": 1.9839582052858023e-05, "loss": 1.8403, "step": 522 }, { "epoch": 0.04025814382298709, "grad_norm": 5.759003162384033, "learning_rate": 1.9838967424708053e-05, "loss": 1.872, "step": 524 }, { "epoch": 0.04041180086047941, "grad_norm": 5.845025539398193, "learning_rate": 1.9838352796558086e-05, "loss": 1.818, "step": 526 }, { "epoch": 0.040565457897971724, "grad_norm": 6.238631248474121, "learning_rate": 1.9837738168408113e-05, "loss": 1.9553, "step": 528 }, { "epoch": 0.04071911493546405, "grad_norm": 5.450825214385986, "learning_rate": 1.9837123540258146e-05, "loss": 1.9314, "step": 530 }, { "epoch": 0.040872771972956363, "grad_norm": 5.4290385246276855, "learning_rate": 1.9836508912108175e-05, "loss": 1.8316, "step": 532 }, { "epoch": 0.04102642901044868, "grad_norm": 6.243955612182617, "learning_rate": 1.9835894283958205e-05, "loss": 1.9605, "step": 534 }, { "epoch": 0.041180086047940996, "grad_norm": 5.5207672119140625, "learning_rate": 1.9835279655808238e-05, "loss": 1.9377, "step": 536 }, { "epoch": 0.04133374308543331, "grad_norm": 5.570779323577881, "learning_rate": 1.9834665027658268e-05, "loss": 1.9706, "step": 538 }, { "epoch": 0.04148740012292563, "grad_norm": 4.921234130859375, "learning_rate": 1.98340503995083e-05, "loss": 1.8666, "step": 540 }, { "epoch": 0.041641057160417945, "grad_norm": 6.029317855834961, "learning_rate": 1.983343577135833e-05, "loss": 1.8431, "step": 542 }, { "epoch": 0.04179471419791026, "grad_norm": 5.6237664222717285, "learning_rate": 1.983282114320836e-05, "loss": 2.0265, "step": 544 }, { "epoch": 0.041948371235402585, "grad_norm": 4.848809719085693, "learning_rate": 1.9832206515058393e-05, "loss": 1.851, "step": 546 }, { "epoch": 0.0421020282728949, "grad_norm": 6.06104040145874, "learning_rate": 1.983159188690842e-05, "loss": 1.9252, "step": 548 }, { "epoch": 0.04225568531038722, "grad_norm": 6.721662521362305, "learning_rate": 1.9830977258758453e-05, "loss": 1.9046, "step": 550 }, { "epoch": 0.04240934234787953, "grad_norm": 5.039158821105957, "learning_rate": 1.9830362630608482e-05, "loss": 1.9457, "step": 552 }, { "epoch": 0.04256299938537185, "grad_norm": 4.985758304595947, "learning_rate": 1.9829748002458512e-05, "loss": 1.7706, "step": 554 }, { "epoch": 0.042716656422864166, "grad_norm": 5.59445858001709, "learning_rate": 1.9829133374308545e-05, "loss": 1.9232, "step": 556 }, { "epoch": 0.04287031346035648, "grad_norm": 5.786518573760986, "learning_rate": 1.9828518746158575e-05, "loss": 1.9535, "step": 558 }, { "epoch": 0.0430239704978488, "grad_norm": 5.362064838409424, "learning_rate": 1.9827904118008608e-05, "loss": 1.774, "step": 560 }, { "epoch": 0.04317762753534112, "grad_norm": 6.807535171508789, "learning_rate": 1.9827289489858637e-05, "loss": 1.9963, "step": 562 }, { "epoch": 0.04333128457283344, "grad_norm": 4.927182197570801, "learning_rate": 1.9826674861708667e-05, "loss": 1.8839, "step": 564 }, { "epoch": 0.043484941610325754, "grad_norm": 7.077647686004639, "learning_rate": 1.98260602335587e-05, "loss": 1.8577, "step": 566 }, { "epoch": 0.04363859864781807, "grad_norm": 4.930956840515137, "learning_rate": 1.982544560540873e-05, "loss": 1.9032, "step": 568 }, { "epoch": 0.04379225568531039, "grad_norm": 5.537839889526367, "learning_rate": 1.982483097725876e-05, "loss": 1.8599, "step": 570 }, { "epoch": 0.0439459127228027, "grad_norm": 4.91294527053833, "learning_rate": 1.9824216349108793e-05, "loss": 1.8962, "step": 572 }, { "epoch": 0.04409956976029502, "grad_norm": 7.946929931640625, "learning_rate": 1.982360172095882e-05, "loss": 2.0401, "step": 574 }, { "epoch": 0.044253226797787336, "grad_norm": 5.566417217254639, "learning_rate": 1.9822987092808852e-05, "loss": 1.7317, "step": 576 }, { "epoch": 0.04440688383527966, "grad_norm": 6.196030616760254, "learning_rate": 1.9822372464658882e-05, "loss": 1.9818, "step": 578 }, { "epoch": 0.044560540872771975, "grad_norm": 5.8990888595581055, "learning_rate": 1.9821757836508915e-05, "loss": 1.9209, "step": 580 }, { "epoch": 0.04471419791026429, "grad_norm": 4.752439022064209, "learning_rate": 1.9821143208358944e-05, "loss": 1.8661, "step": 582 }, { "epoch": 0.04486785494775661, "grad_norm": 5.3692121505737305, "learning_rate": 1.9820528580208974e-05, "loss": 1.8574, "step": 584 }, { "epoch": 0.045021511985248924, "grad_norm": 4.94577169418335, "learning_rate": 1.9819913952059007e-05, "loss": 1.76, "step": 586 }, { "epoch": 0.04517516902274124, "grad_norm": 5.1533708572387695, "learning_rate": 1.9819299323909037e-05, "loss": 1.8634, "step": 588 }, { "epoch": 0.04532882606023356, "grad_norm": 5.460253715515137, "learning_rate": 1.9818684695759067e-05, "loss": 1.7615, "step": 590 }, { "epoch": 0.04548248309772587, "grad_norm": 6.106910705566406, "learning_rate": 1.98180700676091e-05, "loss": 1.8658, "step": 592 }, { "epoch": 0.045636140135218196, "grad_norm": 8.604896545410156, "learning_rate": 1.981745543945913e-05, "loss": 1.8234, "step": 594 }, { "epoch": 0.04578979717271051, "grad_norm": 5.533381938934326, "learning_rate": 1.981684081130916e-05, "loss": 1.8133, "step": 596 }, { "epoch": 0.04594345421020283, "grad_norm": 5.140172481536865, "learning_rate": 1.9816226183159192e-05, "loss": 1.7655, "step": 598 }, { "epoch": 0.046097111247695145, "grad_norm": 5.633389472961426, "learning_rate": 1.9815611555009222e-05, "loss": 1.8804, "step": 600 }, { "epoch": 0.04625076828518746, "grad_norm": 5.397654056549072, "learning_rate": 1.981499692685925e-05, "loss": 1.9422, "step": 602 }, { "epoch": 0.04640442532267978, "grad_norm": 5.916885852813721, "learning_rate": 1.981438229870928e-05, "loss": 1.9222, "step": 604 }, { "epoch": 0.046558082360172094, "grad_norm": 4.4198198318481445, "learning_rate": 1.9813767670559314e-05, "loss": 1.8088, "step": 606 }, { "epoch": 0.04671173939766441, "grad_norm": 6.035666465759277, "learning_rate": 1.9813153042409344e-05, "loss": 1.9505, "step": 608 }, { "epoch": 0.04686539643515673, "grad_norm": 5.293002605438232, "learning_rate": 1.9812538414259374e-05, "loss": 1.9354, "step": 610 }, { "epoch": 0.04701905347264905, "grad_norm": 5.066743850708008, "learning_rate": 1.9811923786109407e-05, "loss": 2.001, "step": 612 }, { "epoch": 0.047172710510141366, "grad_norm": 6.867171764373779, "learning_rate": 1.9811309157959436e-05, "loss": 1.86, "step": 614 }, { "epoch": 0.04732636754763368, "grad_norm": 4.908615589141846, "learning_rate": 1.9810694529809466e-05, "loss": 1.8855, "step": 616 }, { "epoch": 0.047480024585126, "grad_norm": 5.6588006019592285, "learning_rate": 1.98100799016595e-05, "loss": 1.8047, "step": 618 }, { "epoch": 0.047633681622618315, "grad_norm": 5.6555304527282715, "learning_rate": 1.980946527350953e-05, "loss": 1.7656, "step": 620 }, { "epoch": 0.04778733866011063, "grad_norm": 4.742602348327637, "learning_rate": 1.980885064535956e-05, "loss": 1.976, "step": 622 }, { "epoch": 0.04794099569760295, "grad_norm": 5.0910868644714355, "learning_rate": 1.980823601720959e-05, "loss": 1.8894, "step": 624 }, { "epoch": 0.048094652735095264, "grad_norm": 5.279669761657715, "learning_rate": 1.980762138905962e-05, "loss": 1.9323, "step": 626 }, { "epoch": 0.04824830977258759, "grad_norm": 5.603051662445068, "learning_rate": 1.980700676090965e-05, "loss": 1.9327, "step": 628 }, { "epoch": 0.048401966810079904, "grad_norm": 5.823456764221191, "learning_rate": 1.980639213275968e-05, "loss": 1.9087, "step": 630 }, { "epoch": 0.04855562384757222, "grad_norm": 4.226296424865723, "learning_rate": 1.9805777504609714e-05, "loss": 1.7298, "step": 632 }, { "epoch": 0.048709280885064536, "grad_norm": 4.537020683288574, "learning_rate": 1.9805162876459743e-05, "loss": 1.8588, "step": 634 }, { "epoch": 0.04886293792255685, "grad_norm": 5.843430519104004, "learning_rate": 1.9804548248309773e-05, "loss": 1.8581, "step": 636 }, { "epoch": 0.04901659496004917, "grad_norm": 5.234043598175049, "learning_rate": 1.9803933620159806e-05, "loss": 1.8016, "step": 638 }, { "epoch": 0.049170251997541485, "grad_norm": 6.091218948364258, "learning_rate": 1.9803318992009836e-05, "loss": 1.8419, "step": 640 }, { "epoch": 0.0493239090350338, "grad_norm": 5.473825454711914, "learning_rate": 1.9802704363859865e-05, "loss": 1.8742, "step": 642 }, { "epoch": 0.049477566072526125, "grad_norm": 5.018134117126465, "learning_rate": 1.98020897357099e-05, "loss": 1.9246, "step": 644 }, { "epoch": 0.04963122311001844, "grad_norm": 5.1250505447387695, "learning_rate": 1.9801475107559928e-05, "loss": 1.8988, "step": 646 }, { "epoch": 0.04978488014751076, "grad_norm": 5.310157299041748, "learning_rate": 1.9800860479409958e-05, "loss": 1.9718, "step": 648 }, { "epoch": 0.049938537185003073, "grad_norm": 5.5490570068359375, "learning_rate": 1.980024585125999e-05, "loss": 1.8779, "step": 650 }, { "epoch": 0.05009219422249539, "grad_norm": 5.242208480834961, "learning_rate": 1.979963122311002e-05, "loss": 1.8712, "step": 652 }, { "epoch": 0.050245851259987706, "grad_norm": 4.680446624755859, "learning_rate": 1.979901659496005e-05, "loss": 1.9475, "step": 654 }, { "epoch": 0.05039950829748002, "grad_norm": 12.400496482849121, "learning_rate": 1.979840196681008e-05, "loss": 1.9387, "step": 656 }, { "epoch": 0.05055316533497234, "grad_norm": 4.818700313568115, "learning_rate": 1.9797787338660113e-05, "loss": 1.7356, "step": 658 }, { "epoch": 0.05070682237246466, "grad_norm": 4.733686923980713, "learning_rate": 1.9797172710510143e-05, "loss": 1.7161, "step": 660 }, { "epoch": 0.05086047940995698, "grad_norm": 5.9219865798950195, "learning_rate": 1.9796558082360172e-05, "loss": 1.9821, "step": 662 }, { "epoch": 0.051014136447449294, "grad_norm": 4.954675197601318, "learning_rate": 1.9795943454210206e-05, "loss": 1.9392, "step": 664 }, { "epoch": 0.05116779348494161, "grad_norm": 4.482631206512451, "learning_rate": 1.9795328826060235e-05, "loss": 1.9687, "step": 666 }, { "epoch": 0.05132145052243393, "grad_norm": 6.749068737030029, "learning_rate": 1.9794714197910265e-05, "loss": 1.8242, "step": 668 }, { "epoch": 0.05147510755992624, "grad_norm": 4.532095909118652, "learning_rate": 1.9794099569760298e-05, "loss": 1.7596, "step": 670 }, { "epoch": 0.05162876459741856, "grad_norm": 5.727676868438721, "learning_rate": 1.9793484941610328e-05, "loss": 1.94, "step": 672 }, { "epoch": 0.051782421634910876, "grad_norm": 5.493950843811035, "learning_rate": 1.9792870313460357e-05, "loss": 1.9243, "step": 674 }, { "epoch": 0.0519360786724032, "grad_norm": 5.48468017578125, "learning_rate": 1.9792255685310387e-05, "loss": 1.7862, "step": 676 }, { "epoch": 0.052089735709895515, "grad_norm": 5.862773895263672, "learning_rate": 1.979164105716042e-05, "loss": 1.843, "step": 678 }, { "epoch": 0.05224339274738783, "grad_norm": 5.505096912384033, "learning_rate": 1.979102642901045e-05, "loss": 1.8366, "step": 680 }, { "epoch": 0.05239704978488015, "grad_norm": 5.697121620178223, "learning_rate": 1.979041180086048e-05, "loss": 1.9764, "step": 682 }, { "epoch": 0.052550706822372464, "grad_norm": 4.900547027587891, "learning_rate": 1.9789797172710513e-05, "loss": 1.9252, "step": 684 }, { "epoch": 0.05270436385986478, "grad_norm": 5.347836017608643, "learning_rate": 1.9789182544560542e-05, "loss": 1.8527, "step": 686 }, { "epoch": 0.0528580208973571, "grad_norm": 5.393474102020264, "learning_rate": 1.9788567916410572e-05, "loss": 1.8422, "step": 688 }, { "epoch": 0.05301167793484941, "grad_norm": 5.27833366394043, "learning_rate": 1.9787953288260605e-05, "loss": 1.8933, "step": 690 }, { "epoch": 0.053165334972341736, "grad_norm": 5.38336181640625, "learning_rate": 1.9787338660110635e-05, "loss": 1.9456, "step": 692 }, { "epoch": 0.05331899200983405, "grad_norm": 5.273176193237305, "learning_rate": 1.9786724031960664e-05, "loss": 1.8142, "step": 694 }, { "epoch": 0.05347264904732637, "grad_norm": 5.413751125335693, "learning_rate": 1.9786109403810697e-05, "loss": 1.7652, "step": 696 }, { "epoch": 0.053626306084818685, "grad_norm": 5.373195648193359, "learning_rate": 1.9785494775660727e-05, "loss": 1.883, "step": 698 }, { "epoch": 0.053779963122311, "grad_norm": 4.942586421966553, "learning_rate": 1.9784880147510757e-05, "loss": 1.76, "step": 700 }, { "epoch": 0.05393362015980332, "grad_norm": 5.6196980476379395, "learning_rate": 1.9784265519360786e-05, "loss": 1.7673, "step": 702 }, { "epoch": 0.054087277197295634, "grad_norm": 5.702764987945557, "learning_rate": 1.978365089121082e-05, "loss": 1.8437, "step": 704 }, { "epoch": 0.05424093423478795, "grad_norm": 4.99530553817749, "learning_rate": 1.978303626306085e-05, "loss": 1.8747, "step": 706 }, { "epoch": 0.054394591272280274, "grad_norm": 5.105679035186768, "learning_rate": 1.978242163491088e-05, "loss": 1.6632, "step": 708 }, { "epoch": 0.05454824830977259, "grad_norm": 4.710418701171875, "learning_rate": 1.9781807006760912e-05, "loss": 1.8736, "step": 710 }, { "epoch": 0.054701905347264906, "grad_norm": 4.792379856109619, "learning_rate": 1.978119237861094e-05, "loss": 1.8016, "step": 712 }, { "epoch": 0.05485556238475722, "grad_norm": 4.937024116516113, "learning_rate": 1.978057775046097e-05, "loss": 1.7436, "step": 714 }, { "epoch": 0.05500921942224954, "grad_norm": 5.5544867515563965, "learning_rate": 1.9779963122311004e-05, "loss": 1.926, "step": 716 }, { "epoch": 0.055162876459741855, "grad_norm": 6.484194278717041, "learning_rate": 1.9779348494161034e-05, "loss": 1.9102, "step": 718 }, { "epoch": 0.05531653349723417, "grad_norm": 5.408361434936523, "learning_rate": 1.9778733866011064e-05, "loss": 1.7786, "step": 720 }, { "epoch": 0.05547019053472649, "grad_norm": 5.705206394195557, "learning_rate": 1.9778119237861097e-05, "loss": 1.779, "step": 722 }, { "epoch": 0.055623847572218804, "grad_norm": 6.138594627380371, "learning_rate": 1.9777504609711127e-05, "loss": 1.6926, "step": 724 }, { "epoch": 0.05577750460971113, "grad_norm": 5.507882595062256, "learning_rate": 1.977688998156116e-05, "loss": 2.0119, "step": 726 }, { "epoch": 0.055931161647203444, "grad_norm": 5.1471710205078125, "learning_rate": 1.9776275353411186e-05, "loss": 1.7674, "step": 728 }, { "epoch": 0.05608481868469576, "grad_norm": 5.558322906494141, "learning_rate": 1.977566072526122e-05, "loss": 1.7262, "step": 730 }, { "epoch": 0.056238475722188076, "grad_norm": 5.859812259674072, "learning_rate": 1.977504609711125e-05, "loss": 2.0273, "step": 732 }, { "epoch": 0.05639213275968039, "grad_norm": 4.931456565856934, "learning_rate": 1.977443146896128e-05, "loss": 1.7008, "step": 734 }, { "epoch": 0.05654578979717271, "grad_norm": 4.835200786590576, "learning_rate": 1.977381684081131e-05, "loss": 1.7615, "step": 736 }, { "epoch": 0.056699446834665025, "grad_norm": 5.542105674743652, "learning_rate": 1.977320221266134e-05, "loss": 1.7991, "step": 738 }, { "epoch": 0.05685310387215734, "grad_norm": 5.737773895263672, "learning_rate": 1.977258758451137e-05, "loss": 1.7067, "step": 740 }, { "epoch": 0.057006760909649665, "grad_norm": 4.556394100189209, "learning_rate": 1.9771972956361404e-05, "loss": 1.8418, "step": 742 }, { "epoch": 0.05716041794714198, "grad_norm": 4.682400226593018, "learning_rate": 1.9771358328211434e-05, "loss": 1.7565, "step": 744 }, { "epoch": 0.0573140749846343, "grad_norm": 5.617753982543945, "learning_rate": 1.9770743700061467e-05, "loss": 1.8314, "step": 746 }, { "epoch": 0.057467732022126614, "grad_norm": 4.796401500701904, "learning_rate": 1.9770129071911496e-05, "loss": 1.6892, "step": 748 }, { "epoch": 0.05762138905961893, "grad_norm": 5.084446430206299, "learning_rate": 1.9769514443761526e-05, "loss": 1.6731, "step": 750 }, { "epoch": 0.057775046097111246, "grad_norm": 5.344216823577881, "learning_rate": 1.976889981561156e-05, "loss": 1.8096, "step": 752 }, { "epoch": 0.05792870313460356, "grad_norm": 4.87506103515625, "learning_rate": 1.9768285187461585e-05, "loss": 1.9015, "step": 754 }, { "epoch": 0.05808236017209588, "grad_norm": 5.019058704376221, "learning_rate": 1.976767055931162e-05, "loss": 1.9467, "step": 756 }, { "epoch": 0.0582360172095882, "grad_norm": 5.275008678436279, "learning_rate": 1.9767055931161648e-05, "loss": 1.6159, "step": 758 }, { "epoch": 0.05838967424708052, "grad_norm": 5.17955207824707, "learning_rate": 1.9766441303011678e-05, "loss": 1.6819, "step": 760 }, { "epoch": 0.058543331284572835, "grad_norm": 5.578658580780029, "learning_rate": 1.976582667486171e-05, "loss": 1.8369, "step": 762 }, { "epoch": 0.05869698832206515, "grad_norm": 4.934607982635498, "learning_rate": 1.976521204671174e-05, "loss": 1.8909, "step": 764 }, { "epoch": 0.05885064535955747, "grad_norm": 5.5896759033203125, "learning_rate": 1.9764597418561774e-05, "loss": 1.7238, "step": 766 }, { "epoch": 0.05900430239704978, "grad_norm": 5.263469696044922, "learning_rate": 1.9763982790411803e-05, "loss": 1.7776, "step": 768 }, { "epoch": 0.0591579594345421, "grad_norm": 4.459990978240967, "learning_rate": 1.9763368162261833e-05, "loss": 1.7082, "step": 770 }, { "epoch": 0.059311616472034416, "grad_norm": 5.528759002685547, "learning_rate": 1.9762753534111866e-05, "loss": 1.9014, "step": 772 }, { "epoch": 0.05946527350952674, "grad_norm": 5.372073650360107, "learning_rate": 1.9762138905961892e-05, "loss": 1.691, "step": 774 }, { "epoch": 0.059618930547019056, "grad_norm": 5.765900135040283, "learning_rate": 1.9761524277811925e-05, "loss": 1.9243, "step": 776 }, { "epoch": 0.05977258758451137, "grad_norm": 5.123989105224609, "learning_rate": 1.9760909649661955e-05, "loss": 2.0223, "step": 778 }, { "epoch": 0.05992624462200369, "grad_norm": 5.149808406829834, "learning_rate": 1.9760295021511985e-05, "loss": 1.8063, "step": 780 }, { "epoch": 0.060079901659496004, "grad_norm": 5.047703266143799, "learning_rate": 1.9759680393362018e-05, "loss": 1.9353, "step": 782 }, { "epoch": 0.06023355869698832, "grad_norm": 5.555423259735107, "learning_rate": 1.9759065765212048e-05, "loss": 1.695, "step": 784 }, { "epoch": 0.06038721573448064, "grad_norm": 5.100247859954834, "learning_rate": 1.9758451137062077e-05, "loss": 1.6758, "step": 786 }, { "epoch": 0.06054087277197295, "grad_norm": 4.941176891326904, "learning_rate": 1.975783650891211e-05, "loss": 1.7391, "step": 788 }, { "epoch": 0.06069452980946528, "grad_norm": 5.3119964599609375, "learning_rate": 1.975722188076214e-05, "loss": 2.0366, "step": 790 }, { "epoch": 0.06084818684695759, "grad_norm": 6.0686235427856445, "learning_rate": 1.9756607252612173e-05, "loss": 1.7149, "step": 792 }, { "epoch": 0.06100184388444991, "grad_norm": 6.141575336456299, "learning_rate": 1.9755992624462203e-05, "loss": 1.7808, "step": 794 }, { "epoch": 0.061155500921942225, "grad_norm": 5.157688140869141, "learning_rate": 1.9755377996312232e-05, "loss": 1.8069, "step": 796 }, { "epoch": 0.06130915795943454, "grad_norm": 5.358695983886719, "learning_rate": 1.9754763368162266e-05, "loss": 1.9632, "step": 798 }, { "epoch": 0.06146281499692686, "grad_norm": 5.3423261642456055, "learning_rate": 1.9754148740012292e-05, "loss": 1.6711, "step": 800 }, { "epoch": 0.061616472034419174, "grad_norm": 5.9911980628967285, "learning_rate": 1.9753534111862325e-05, "loss": 1.6384, "step": 802 }, { "epoch": 0.06177012907191149, "grad_norm": 5.021694183349609, "learning_rate": 1.9752919483712355e-05, "loss": 1.7412, "step": 804 }, { "epoch": 0.061923786109403814, "grad_norm": 5.38372802734375, "learning_rate": 1.9752304855562384e-05, "loss": 1.5895, "step": 806 }, { "epoch": 0.06207744314689613, "grad_norm": 5.618641376495361, "learning_rate": 1.9751690227412417e-05, "loss": 1.8097, "step": 808 }, { "epoch": 0.062231100184388446, "grad_norm": 5.081387519836426, "learning_rate": 1.9751075599262447e-05, "loss": 1.7789, "step": 810 }, { "epoch": 0.06238475722188076, "grad_norm": 5.361464500427246, "learning_rate": 1.975046097111248e-05, "loss": 1.7821, "step": 812 }, { "epoch": 0.06253841425937308, "grad_norm": 5.113397598266602, "learning_rate": 1.974984634296251e-05, "loss": 1.7292, "step": 814 }, { "epoch": 0.0626920712968654, "grad_norm": 5.261277198791504, "learning_rate": 1.974923171481254e-05, "loss": 1.6956, "step": 816 }, { "epoch": 0.06284572833435771, "grad_norm": 7.06874942779541, "learning_rate": 1.9748617086662573e-05, "loss": 1.865, "step": 818 }, { "epoch": 0.06299938537185003, "grad_norm": 4.949324131011963, "learning_rate": 1.9748002458512602e-05, "loss": 1.7885, "step": 820 }, { "epoch": 0.06315304240934234, "grad_norm": 6.291264533996582, "learning_rate": 1.9747387830362632e-05, "loss": 1.8685, "step": 822 }, { "epoch": 0.06330669944683466, "grad_norm": 4.500913143157959, "learning_rate": 1.9746773202212665e-05, "loss": 1.6422, "step": 824 }, { "epoch": 0.06346035648432698, "grad_norm": 5.313440322875977, "learning_rate": 1.974615857406269e-05, "loss": 1.8328, "step": 826 }, { "epoch": 0.0636140135218193, "grad_norm": 5.809798240661621, "learning_rate": 1.9745543945912724e-05, "loss": 1.8912, "step": 828 }, { "epoch": 0.06376767055931162, "grad_norm": 4.69051456451416, "learning_rate": 1.9744929317762754e-05, "loss": 1.6786, "step": 830 }, { "epoch": 0.06392132759680394, "grad_norm": 5.292459487915039, "learning_rate": 1.9744314689612787e-05, "loss": 1.8497, "step": 832 }, { "epoch": 0.06407498463429626, "grad_norm": 4.772144794464111, "learning_rate": 1.9743700061462817e-05, "loss": 1.6261, "step": 834 }, { "epoch": 0.06422864167178857, "grad_norm": 4.984364032745361, "learning_rate": 1.9743085433312846e-05, "loss": 1.7354, "step": 836 }, { "epoch": 0.06438229870928089, "grad_norm": 4.450577735900879, "learning_rate": 1.974247080516288e-05, "loss": 1.7349, "step": 838 }, { "epoch": 0.0645359557467732, "grad_norm": 5.341747760772705, "learning_rate": 1.974185617701291e-05, "loss": 1.8332, "step": 840 }, { "epoch": 0.06468961278426552, "grad_norm": 5.368303298950195, "learning_rate": 1.974124154886294e-05, "loss": 1.8219, "step": 842 }, { "epoch": 0.06484326982175784, "grad_norm": 4.4096360206604, "learning_rate": 1.9740626920712972e-05, "loss": 1.8158, "step": 844 }, { "epoch": 0.06499692685925015, "grad_norm": 6.098479270935059, "learning_rate": 1.9740012292563e-05, "loss": 1.7323, "step": 846 }, { "epoch": 0.06515058389674247, "grad_norm": 4.606769561767578, "learning_rate": 1.973939766441303e-05, "loss": 1.7527, "step": 848 }, { "epoch": 0.06530424093423479, "grad_norm": 6.082258701324463, "learning_rate": 1.9738783036263064e-05, "loss": 1.8471, "step": 850 }, { "epoch": 0.0654578979717271, "grad_norm": 5.389958381652832, "learning_rate": 1.9738168408113094e-05, "loss": 1.8564, "step": 852 }, { "epoch": 0.06561155500921942, "grad_norm": 5.574385643005371, "learning_rate": 1.9737553779963124e-05, "loss": 1.784, "step": 854 }, { "epoch": 0.06576521204671174, "grad_norm": 5.1567487716674805, "learning_rate": 1.9736939151813153e-05, "loss": 1.713, "step": 856 }, { "epoch": 0.06591886908420405, "grad_norm": 5.475706577301025, "learning_rate": 1.9736324523663187e-05, "loss": 1.7555, "step": 858 }, { "epoch": 0.06607252612169637, "grad_norm": 4.831605434417725, "learning_rate": 1.9735709895513216e-05, "loss": 1.8695, "step": 860 }, { "epoch": 0.06622618315918868, "grad_norm": 6.022873878479004, "learning_rate": 1.9735095267363246e-05, "loss": 1.7765, "step": 862 }, { "epoch": 0.06637984019668101, "grad_norm": 4.874941825866699, "learning_rate": 1.973448063921328e-05, "loss": 1.757, "step": 864 }, { "epoch": 0.06653349723417333, "grad_norm": 4.488655090332031, "learning_rate": 1.973386601106331e-05, "loss": 1.8837, "step": 866 }, { "epoch": 0.06668715427166565, "grad_norm": 4.5713090896606445, "learning_rate": 1.973325138291334e-05, "loss": 1.6627, "step": 868 }, { "epoch": 0.06684081130915796, "grad_norm": 5.312070846557617, "learning_rate": 1.973263675476337e-05, "loss": 1.7551, "step": 870 }, { "epoch": 0.06699446834665028, "grad_norm": 5.104644775390625, "learning_rate": 1.97320221266134e-05, "loss": 1.8789, "step": 872 }, { "epoch": 0.0671481253841426, "grad_norm": 4.595895290374756, "learning_rate": 1.973140749846343e-05, "loss": 1.8319, "step": 874 }, { "epoch": 0.06730178242163491, "grad_norm": 5.223522186279297, "learning_rate": 1.973079287031346e-05, "loss": 1.6661, "step": 876 }, { "epoch": 0.06745543945912723, "grad_norm": 4.466522693634033, "learning_rate": 1.9730178242163494e-05, "loss": 1.8338, "step": 878 }, { "epoch": 0.06760909649661954, "grad_norm": 4.613927841186523, "learning_rate": 1.9729563614013523e-05, "loss": 1.7562, "step": 880 }, { "epoch": 0.06776275353411186, "grad_norm": 5.868101119995117, "learning_rate": 1.9728948985863553e-05, "loss": 1.9181, "step": 882 }, { "epoch": 0.06791641057160418, "grad_norm": 5.407005786895752, "learning_rate": 1.9728334357713586e-05, "loss": 1.8462, "step": 884 }, { "epoch": 0.0680700676090965, "grad_norm": 6.075082778930664, "learning_rate": 1.9727719729563616e-05, "loss": 1.816, "step": 886 }, { "epoch": 0.06822372464658881, "grad_norm": 5.164397716522217, "learning_rate": 1.9727105101413645e-05, "loss": 1.9236, "step": 888 }, { "epoch": 0.06837738168408113, "grad_norm": 5.688714981079102, "learning_rate": 1.972649047326368e-05, "loss": 1.7171, "step": 890 }, { "epoch": 0.06853103872157344, "grad_norm": 4.9518842697143555, "learning_rate": 1.9725875845113708e-05, "loss": 1.7829, "step": 892 }, { "epoch": 0.06868469575906576, "grad_norm": 5.185763835906982, "learning_rate": 1.9725261216963738e-05, "loss": 1.7146, "step": 894 }, { "epoch": 0.06883835279655809, "grad_norm": 5.043625354766846, "learning_rate": 1.972464658881377e-05, "loss": 1.8594, "step": 896 }, { "epoch": 0.0689920098340504, "grad_norm": 4.783642292022705, "learning_rate": 1.97240319606638e-05, "loss": 1.7053, "step": 898 }, { "epoch": 0.06914566687154272, "grad_norm": 4.887513637542725, "learning_rate": 1.972341733251383e-05, "loss": 1.7929, "step": 900 }, { "epoch": 0.06929932390903504, "grad_norm": 4.4108123779296875, "learning_rate": 1.972280270436386e-05, "loss": 1.7296, "step": 902 }, { "epoch": 0.06945298094652735, "grad_norm": 5.531246662139893, "learning_rate": 1.9722188076213893e-05, "loss": 1.7792, "step": 904 }, { "epoch": 0.06960663798401967, "grad_norm": 4.462593078613281, "learning_rate": 1.9721573448063923e-05, "loss": 1.7121, "step": 906 }, { "epoch": 0.06976029502151199, "grad_norm": 4.543118000030518, "learning_rate": 1.9720958819913952e-05, "loss": 1.7871, "step": 908 }, { "epoch": 0.0699139520590043, "grad_norm": 5.9536638259887695, "learning_rate": 1.9720344191763985e-05, "loss": 1.7956, "step": 910 }, { "epoch": 0.07006760909649662, "grad_norm": 4.735901832580566, "learning_rate": 1.9719729563614015e-05, "loss": 1.7511, "step": 912 }, { "epoch": 0.07022126613398894, "grad_norm": 4.490820407867432, "learning_rate": 1.9719114935464045e-05, "loss": 1.7364, "step": 914 }, { "epoch": 0.07037492317148125, "grad_norm": 4.837772846221924, "learning_rate": 1.9718500307314078e-05, "loss": 1.8204, "step": 916 }, { "epoch": 0.07052858020897357, "grad_norm": 4.400464057922363, "learning_rate": 1.9717885679164108e-05, "loss": 1.8637, "step": 918 }, { "epoch": 0.07068223724646588, "grad_norm": 4.991397857666016, "learning_rate": 1.9717271051014137e-05, "loss": 1.9175, "step": 920 }, { "epoch": 0.0708358942839582, "grad_norm": 4.983181953430176, "learning_rate": 1.971665642286417e-05, "loss": 1.7065, "step": 922 }, { "epoch": 0.07098955132145052, "grad_norm": 4.5055341720581055, "learning_rate": 1.97160417947142e-05, "loss": 1.7113, "step": 924 }, { "epoch": 0.07114320835894283, "grad_norm": 5.021308422088623, "learning_rate": 1.971542716656423e-05, "loss": 1.8903, "step": 926 }, { "epoch": 0.07129686539643516, "grad_norm": 6.712131023406982, "learning_rate": 1.971481253841426e-05, "loss": 1.7608, "step": 928 }, { "epoch": 0.07145052243392748, "grad_norm": 5.711028575897217, "learning_rate": 1.9714197910264292e-05, "loss": 1.7826, "step": 930 }, { "epoch": 0.0716041794714198, "grad_norm": 5.202549457550049, "learning_rate": 1.9713583282114322e-05, "loss": 1.7201, "step": 932 }, { "epoch": 0.07175783650891211, "grad_norm": 4.809873580932617, "learning_rate": 1.9712968653964352e-05, "loss": 1.7824, "step": 934 }, { "epoch": 0.07191149354640443, "grad_norm": 4.417870998382568, "learning_rate": 1.9712354025814385e-05, "loss": 1.8275, "step": 936 }, { "epoch": 0.07206515058389674, "grad_norm": 4.823970794677734, "learning_rate": 1.9711739397664415e-05, "loss": 1.7293, "step": 938 }, { "epoch": 0.07221880762138906, "grad_norm": 5.289034843444824, "learning_rate": 1.9711124769514444e-05, "loss": 1.7507, "step": 940 }, { "epoch": 0.07237246465888138, "grad_norm": 4.538127422332764, "learning_rate": 1.9710510141364477e-05, "loss": 1.6391, "step": 942 }, { "epoch": 0.0725261216963737, "grad_norm": 4.500412464141846, "learning_rate": 1.9709895513214507e-05, "loss": 1.6816, "step": 944 }, { "epoch": 0.07267977873386601, "grad_norm": 5.149693012237549, "learning_rate": 1.9709280885064537e-05, "loss": 1.6366, "step": 946 }, { "epoch": 0.07283343577135833, "grad_norm": 4.554830074310303, "learning_rate": 1.970866625691457e-05, "loss": 1.7712, "step": 948 }, { "epoch": 0.07298709280885064, "grad_norm": 5.352977752685547, "learning_rate": 1.97080516287646e-05, "loss": 1.5819, "step": 950 }, { "epoch": 0.07314074984634296, "grad_norm": 5.3114094734191895, "learning_rate": 1.970743700061463e-05, "loss": 1.802, "step": 952 }, { "epoch": 0.07329440688383528, "grad_norm": 4.19597053527832, "learning_rate": 1.970682237246466e-05, "loss": 1.5855, "step": 954 }, { "epoch": 0.07344806392132759, "grad_norm": 4.087234020233154, "learning_rate": 1.9706207744314692e-05, "loss": 1.7158, "step": 956 }, { "epoch": 0.07360172095881991, "grad_norm": 5.064235210418701, "learning_rate": 1.970559311616472e-05, "loss": 1.7579, "step": 958 }, { "epoch": 0.07375537799631224, "grad_norm": 5.033870697021484, "learning_rate": 1.970497848801475e-05, "loss": 1.8105, "step": 960 }, { "epoch": 0.07390903503380455, "grad_norm": 4.546055793762207, "learning_rate": 1.9704363859864784e-05, "loss": 1.6964, "step": 962 }, { "epoch": 0.07406269207129687, "grad_norm": 5.068551540374756, "learning_rate": 1.9703749231714814e-05, "loss": 1.6664, "step": 964 }, { "epoch": 0.07421634910878919, "grad_norm": 5.1122050285339355, "learning_rate": 1.9703134603564844e-05, "loss": 1.819, "step": 966 }, { "epoch": 0.0743700061462815, "grad_norm": 4.410829544067383, "learning_rate": 1.9702519975414877e-05, "loss": 1.7872, "step": 968 }, { "epoch": 0.07452366318377382, "grad_norm": 4.5425190925598145, "learning_rate": 1.9701905347264906e-05, "loss": 1.6679, "step": 970 }, { "epoch": 0.07467732022126614, "grad_norm": 4.571905612945557, "learning_rate": 1.9701290719114936e-05, "loss": 1.7135, "step": 972 }, { "epoch": 0.07483097725875845, "grad_norm": 4.853597640991211, "learning_rate": 1.970067609096497e-05, "loss": 1.6162, "step": 974 }, { "epoch": 0.07498463429625077, "grad_norm": 5.288270473480225, "learning_rate": 1.9700061462815e-05, "loss": 1.7473, "step": 976 }, { "epoch": 0.07513829133374308, "grad_norm": 4.418172359466553, "learning_rate": 1.9699446834665032e-05, "loss": 1.6986, "step": 978 }, { "epoch": 0.0752919483712354, "grad_norm": 4.6486496925354, "learning_rate": 1.9698832206515058e-05, "loss": 1.6914, "step": 980 }, { "epoch": 0.07544560540872772, "grad_norm": 5.205509185791016, "learning_rate": 1.969821757836509e-05, "loss": 1.6129, "step": 982 }, { "epoch": 0.07559926244622003, "grad_norm": 4.540061950683594, "learning_rate": 1.969760295021512e-05, "loss": 1.7026, "step": 984 }, { "epoch": 0.07575291948371235, "grad_norm": 4.7810845375061035, "learning_rate": 1.969698832206515e-05, "loss": 1.686, "step": 986 }, { "epoch": 0.07590657652120467, "grad_norm": 4.475192546844482, "learning_rate": 1.9696373693915184e-05, "loss": 1.739, "step": 988 }, { "epoch": 0.07606023355869698, "grad_norm": 5.317092418670654, "learning_rate": 1.9695759065765213e-05, "loss": 2.0029, "step": 990 }, { "epoch": 0.0762138905961893, "grad_norm": 5.178996562957764, "learning_rate": 1.9695144437615243e-05, "loss": 1.7278, "step": 992 }, { "epoch": 0.07636754763368163, "grad_norm": 5.976894855499268, "learning_rate": 1.9694529809465276e-05, "loss": 1.753, "step": 994 }, { "epoch": 0.07652120467117395, "grad_norm": 4.536045551300049, "learning_rate": 1.9693915181315306e-05, "loss": 1.6081, "step": 996 }, { "epoch": 0.07667486170866626, "grad_norm": 4.751937389373779, "learning_rate": 1.969330055316534e-05, "loss": 1.6765, "step": 998 }, { "epoch": 0.07682851874615858, "grad_norm": 5.145371437072754, "learning_rate": 1.9692685925015365e-05, "loss": 1.9088, "step": 1000 }, { "epoch": 0.0769821757836509, "grad_norm": 5.149151802062988, "learning_rate": 1.9692071296865398e-05, "loss": 1.7855, "step": 1002 }, { "epoch": 0.07713583282114321, "grad_norm": 4.331295490264893, "learning_rate": 1.9691456668715428e-05, "loss": 1.6398, "step": 1004 }, { "epoch": 0.07728948985863553, "grad_norm": 5.288293361663818, "learning_rate": 1.9690842040565458e-05, "loss": 1.8451, "step": 1006 }, { "epoch": 0.07744314689612784, "grad_norm": 4.709437370300293, "learning_rate": 1.969022741241549e-05, "loss": 1.8546, "step": 1008 }, { "epoch": 0.07759680393362016, "grad_norm": 5.373138427734375, "learning_rate": 1.968961278426552e-05, "loss": 1.7561, "step": 1010 }, { "epoch": 0.07775046097111248, "grad_norm": 6.308657169342041, "learning_rate": 1.968899815611555e-05, "loss": 1.8271, "step": 1012 }, { "epoch": 0.07790411800860479, "grad_norm": 5.380505561828613, "learning_rate": 1.9688383527965583e-05, "loss": 1.7852, "step": 1014 }, { "epoch": 0.07805777504609711, "grad_norm": 5.468803405761719, "learning_rate": 1.9687768899815613e-05, "loss": 1.7797, "step": 1016 }, { "epoch": 0.07821143208358942, "grad_norm": 4.73374080657959, "learning_rate": 1.9687154271665646e-05, "loss": 1.6201, "step": 1018 }, { "epoch": 0.07836508912108174, "grad_norm": 4.9570631980896, "learning_rate": 1.9686539643515676e-05, "loss": 1.7538, "step": 1020 }, { "epoch": 0.07851874615857406, "grad_norm": 4.574160575866699, "learning_rate": 1.9685925015365705e-05, "loss": 1.7656, "step": 1022 }, { "epoch": 0.07867240319606637, "grad_norm": 5.078851222991943, "learning_rate": 1.968531038721574e-05, "loss": 1.806, "step": 1024 }, { "epoch": 0.0788260602335587, "grad_norm": 5.474549293518066, "learning_rate": 1.9684695759065765e-05, "loss": 1.8226, "step": 1026 }, { "epoch": 0.07897971727105102, "grad_norm": 6.257920265197754, "learning_rate": 1.9684081130915798e-05, "loss": 1.7714, "step": 1028 }, { "epoch": 0.07913337430854334, "grad_norm": 4.811653137207031, "learning_rate": 1.9683466502765827e-05, "loss": 1.7903, "step": 1030 }, { "epoch": 0.07928703134603565, "grad_norm": 4.90382194519043, "learning_rate": 1.9682851874615857e-05, "loss": 1.9185, "step": 1032 }, { "epoch": 0.07944068838352797, "grad_norm": 5.112819194793701, "learning_rate": 1.968223724646589e-05, "loss": 1.8098, "step": 1034 }, { "epoch": 0.07959434542102028, "grad_norm": 4.832859039306641, "learning_rate": 1.968162261831592e-05, "loss": 1.7965, "step": 1036 }, { "epoch": 0.0797480024585126, "grad_norm": 5.124858379364014, "learning_rate": 1.9681007990165953e-05, "loss": 1.6745, "step": 1038 }, { "epoch": 0.07990165949600492, "grad_norm": 4.530187606811523, "learning_rate": 1.9680393362015983e-05, "loss": 1.7152, "step": 1040 }, { "epoch": 0.08005531653349723, "grad_norm": 4.918298244476318, "learning_rate": 1.9679778733866012e-05, "loss": 1.7445, "step": 1042 }, { "epoch": 0.08020897357098955, "grad_norm": 4.637542247772217, "learning_rate": 1.9679164105716045e-05, "loss": 1.6802, "step": 1044 }, { "epoch": 0.08036263060848187, "grad_norm": 5.314078330993652, "learning_rate": 1.9678549477566075e-05, "loss": 1.7547, "step": 1046 }, { "epoch": 0.08051628764597418, "grad_norm": 5.02266263961792, "learning_rate": 1.9677934849416105e-05, "loss": 1.7107, "step": 1048 }, { "epoch": 0.0806699446834665, "grad_norm": 4.683084487915039, "learning_rate": 1.9677320221266138e-05, "loss": 1.7415, "step": 1050 }, { "epoch": 0.08082360172095882, "grad_norm": 4.539281845092773, "learning_rate": 1.9676705593116164e-05, "loss": 1.9356, "step": 1052 }, { "epoch": 0.08097725875845113, "grad_norm": 4.526500701904297, "learning_rate": 1.9676090964966197e-05, "loss": 1.6487, "step": 1054 }, { "epoch": 0.08113091579594345, "grad_norm": 4.272531509399414, "learning_rate": 1.9675476336816227e-05, "loss": 1.6478, "step": 1056 }, { "epoch": 0.08128457283343578, "grad_norm": 4.874919414520264, "learning_rate": 1.9674861708666257e-05, "loss": 1.715, "step": 1058 }, { "epoch": 0.0814382298709281, "grad_norm": 5.863064765930176, "learning_rate": 1.967424708051629e-05, "loss": 1.8635, "step": 1060 }, { "epoch": 0.08159188690842041, "grad_norm": 5.5292205810546875, "learning_rate": 1.967363245236632e-05, "loss": 1.7227, "step": 1062 }, { "epoch": 0.08174554394591273, "grad_norm": 4.343425273895264, "learning_rate": 1.9673017824216352e-05, "loss": 1.8257, "step": 1064 }, { "epoch": 0.08189920098340504, "grad_norm": 5.900411128997803, "learning_rate": 1.9672403196066382e-05, "loss": 1.8205, "step": 1066 }, { "epoch": 0.08205285802089736, "grad_norm": 4.439347743988037, "learning_rate": 1.9671788567916412e-05, "loss": 1.6958, "step": 1068 }, { "epoch": 0.08220651505838968, "grad_norm": 5.694681644439697, "learning_rate": 1.9671173939766445e-05, "loss": 1.8959, "step": 1070 }, { "epoch": 0.08236017209588199, "grad_norm": 4.767448425292969, "learning_rate": 1.9670559311616474e-05, "loss": 1.6721, "step": 1072 }, { "epoch": 0.08251382913337431, "grad_norm": 4.426461696624756, "learning_rate": 1.9669944683466504e-05, "loss": 1.607, "step": 1074 }, { "epoch": 0.08266748617086662, "grad_norm": 4.794045925140381, "learning_rate": 1.9669330055316537e-05, "loss": 1.6254, "step": 1076 }, { "epoch": 0.08282114320835894, "grad_norm": 4.544212341308594, "learning_rate": 1.9668715427166564e-05, "loss": 1.6082, "step": 1078 }, { "epoch": 0.08297480024585126, "grad_norm": 4.256971836090088, "learning_rate": 1.9668100799016597e-05, "loss": 1.8001, "step": 1080 }, { "epoch": 0.08312845728334357, "grad_norm": 6.058056831359863, "learning_rate": 1.9667486170866626e-05, "loss": 1.7258, "step": 1082 }, { "epoch": 0.08328211432083589, "grad_norm": 4.815703868865967, "learning_rate": 1.966687154271666e-05, "loss": 1.7528, "step": 1084 }, { "epoch": 0.0834357713583282, "grad_norm": 4.661309719085693, "learning_rate": 1.966625691456669e-05, "loss": 1.7316, "step": 1086 }, { "epoch": 0.08358942839582052, "grad_norm": 4.863770961761475, "learning_rate": 1.966564228641672e-05, "loss": 1.8824, "step": 1088 }, { "epoch": 0.08374308543331284, "grad_norm": 5.061347961425781, "learning_rate": 1.9665027658266752e-05, "loss": 1.6873, "step": 1090 }, { "epoch": 0.08389674247080517, "grad_norm": 4.252581596374512, "learning_rate": 1.966441303011678e-05, "loss": 1.8093, "step": 1092 }, { "epoch": 0.08405039950829749, "grad_norm": 5.112542152404785, "learning_rate": 1.966379840196681e-05, "loss": 1.7366, "step": 1094 }, { "epoch": 0.0842040565457898, "grad_norm": 5.147494792938232, "learning_rate": 1.9663183773816844e-05, "loss": 1.8676, "step": 1096 }, { "epoch": 0.08435771358328212, "grad_norm": 4.757107257843018, "learning_rate": 1.966256914566687e-05, "loss": 1.5888, "step": 1098 }, { "epoch": 0.08451137062077443, "grad_norm": 5.5242600440979, "learning_rate": 1.9661954517516904e-05, "loss": 1.7018, "step": 1100 }, { "epoch": 0.08466502765826675, "grad_norm": 5.5325117111206055, "learning_rate": 1.9661339889366933e-05, "loss": 1.6928, "step": 1102 }, { "epoch": 0.08481868469575907, "grad_norm": 5.308017253875732, "learning_rate": 1.9660725261216966e-05, "loss": 1.7695, "step": 1104 }, { "epoch": 0.08497234173325138, "grad_norm": 4.460916519165039, "learning_rate": 1.9660110633066996e-05, "loss": 1.649, "step": 1106 }, { "epoch": 0.0851259987707437, "grad_norm": 5.222771644592285, "learning_rate": 1.9659496004917026e-05, "loss": 1.8017, "step": 1108 }, { "epoch": 0.08527965580823602, "grad_norm": 4.484593391418457, "learning_rate": 1.965888137676706e-05, "loss": 1.697, "step": 1110 }, { "epoch": 0.08543331284572833, "grad_norm": 4.95808219909668, "learning_rate": 1.965826674861709e-05, "loss": 1.6171, "step": 1112 }, { "epoch": 0.08558696988322065, "grad_norm": 5.313704967498779, "learning_rate": 1.9657652120467118e-05, "loss": 1.665, "step": 1114 }, { "epoch": 0.08574062692071296, "grad_norm": 4.555895805358887, "learning_rate": 1.965703749231715e-05, "loss": 1.7004, "step": 1116 }, { "epoch": 0.08589428395820528, "grad_norm": 4.939544677734375, "learning_rate": 1.965642286416718e-05, "loss": 1.7204, "step": 1118 }, { "epoch": 0.0860479409956976, "grad_norm": 4.291268348693848, "learning_rate": 1.965580823601721e-05, "loss": 1.6592, "step": 1120 }, { "epoch": 0.08620159803318991, "grad_norm": 5.050233840942383, "learning_rate": 1.9655193607867244e-05, "loss": 1.7656, "step": 1122 }, { "epoch": 0.08635525507068224, "grad_norm": 4.5748090744018555, "learning_rate": 1.9654578979717273e-05, "loss": 1.8491, "step": 1124 }, { "epoch": 0.08650891210817456, "grad_norm": 4.3803391456604, "learning_rate": 1.9653964351567303e-05, "loss": 1.697, "step": 1126 }, { "epoch": 0.08666256914566688, "grad_norm": 5.346717834472656, "learning_rate": 1.9653349723417333e-05, "loss": 1.817, "step": 1128 }, { "epoch": 0.08681622618315919, "grad_norm": 7.8311944007873535, "learning_rate": 1.9652735095267366e-05, "loss": 1.7637, "step": 1130 }, { "epoch": 0.08696988322065151, "grad_norm": 4.9822845458984375, "learning_rate": 1.9652120467117395e-05, "loss": 1.6673, "step": 1132 }, { "epoch": 0.08712354025814383, "grad_norm": 5.492576599121094, "learning_rate": 1.9651505838967425e-05, "loss": 1.7831, "step": 1134 }, { "epoch": 0.08727719729563614, "grad_norm": 5.493293285369873, "learning_rate": 1.9650891210817458e-05, "loss": 1.698, "step": 1136 }, { "epoch": 0.08743085433312846, "grad_norm": 6.50536584854126, "learning_rate": 1.9650276582667488e-05, "loss": 1.7632, "step": 1138 }, { "epoch": 0.08758451137062077, "grad_norm": 4.753702640533447, "learning_rate": 1.9649661954517518e-05, "loss": 1.64, "step": 1140 }, { "epoch": 0.08773816840811309, "grad_norm": 4.77907657623291, "learning_rate": 1.964904732636755e-05, "loss": 1.9073, "step": 1142 }, { "epoch": 0.0878918254456054, "grad_norm": 4.242752552032471, "learning_rate": 1.964843269821758e-05, "loss": 1.7625, "step": 1144 }, { "epoch": 0.08804548248309772, "grad_norm": 5.127992630004883, "learning_rate": 1.964781807006761e-05, "loss": 1.6271, "step": 1146 }, { "epoch": 0.08819913952059004, "grad_norm": 4.7297749519348145, "learning_rate": 1.9647203441917643e-05, "loss": 1.6252, "step": 1148 }, { "epoch": 0.08835279655808236, "grad_norm": 5.08091926574707, "learning_rate": 1.9646588813767673e-05, "loss": 1.7145, "step": 1150 }, { "epoch": 0.08850645359557467, "grad_norm": 4.986452102661133, "learning_rate": 1.9645974185617702e-05, "loss": 1.8065, "step": 1152 }, { "epoch": 0.08866011063306699, "grad_norm": 5.4139204025268555, "learning_rate": 1.9645359557467732e-05, "loss": 1.6284, "step": 1154 }, { "epoch": 0.08881376767055932, "grad_norm": 5.257991313934326, "learning_rate": 1.9644744929317765e-05, "loss": 1.8338, "step": 1156 }, { "epoch": 0.08896742470805163, "grad_norm": 4.872262001037598, "learning_rate": 1.9644130301167795e-05, "loss": 1.8488, "step": 1158 }, { "epoch": 0.08912108174554395, "grad_norm": 5.82914924621582, "learning_rate": 1.9643515673017825e-05, "loss": 1.7731, "step": 1160 }, { "epoch": 0.08927473878303627, "grad_norm": 4.288515090942383, "learning_rate": 1.9642901044867858e-05, "loss": 1.6748, "step": 1162 }, { "epoch": 0.08942839582052858, "grad_norm": 4.828827381134033, "learning_rate": 1.9642286416717887e-05, "loss": 1.7893, "step": 1164 }, { "epoch": 0.0895820528580209, "grad_norm": 4.2209577560424805, "learning_rate": 1.9641671788567917e-05, "loss": 1.6139, "step": 1166 }, { "epoch": 0.08973570989551322, "grad_norm": 5.542986869812012, "learning_rate": 1.964105716041795e-05, "loss": 1.5873, "step": 1168 }, { "epoch": 0.08988936693300553, "grad_norm": 4.995078086853027, "learning_rate": 1.964044253226798e-05, "loss": 1.5976, "step": 1170 }, { "epoch": 0.09004302397049785, "grad_norm": 4.493627071380615, "learning_rate": 1.963982790411801e-05, "loss": 1.6423, "step": 1172 }, { "epoch": 0.09019668100799016, "grad_norm": 5.041021823883057, "learning_rate": 1.9639213275968043e-05, "loss": 1.6996, "step": 1174 }, { "epoch": 0.09035033804548248, "grad_norm": 5.527616024017334, "learning_rate": 1.9638598647818072e-05, "loss": 1.7338, "step": 1176 }, { "epoch": 0.0905039950829748, "grad_norm": 4.937674045562744, "learning_rate": 1.9637984019668102e-05, "loss": 1.8472, "step": 1178 }, { "epoch": 0.09065765212046711, "grad_norm": 5.515735149383545, "learning_rate": 1.963736939151813e-05, "loss": 1.8828, "step": 1180 }, { "epoch": 0.09081130915795943, "grad_norm": 5.273295879364014, "learning_rate": 1.9636754763368165e-05, "loss": 1.6819, "step": 1182 }, { "epoch": 0.09096496619545175, "grad_norm": 4.745762825012207, "learning_rate": 1.9636140135218194e-05, "loss": 1.7752, "step": 1184 }, { "epoch": 0.09111862323294406, "grad_norm": 4.8165130615234375, "learning_rate": 1.9635525507068224e-05, "loss": 1.6133, "step": 1186 }, { "epoch": 0.09127228027043639, "grad_norm": 4.156392574310303, "learning_rate": 1.9634910878918257e-05, "loss": 1.7126, "step": 1188 }, { "epoch": 0.09142593730792871, "grad_norm": 4.523599624633789, "learning_rate": 1.9634296250768287e-05, "loss": 1.616, "step": 1190 }, { "epoch": 0.09157959434542103, "grad_norm": 4.08304500579834, "learning_rate": 1.9633681622618316e-05, "loss": 1.6602, "step": 1192 }, { "epoch": 0.09173325138291334, "grad_norm": 4.6315202713012695, "learning_rate": 1.963306699446835e-05, "loss": 1.7004, "step": 1194 }, { "epoch": 0.09188690842040566, "grad_norm": 4.613175868988037, "learning_rate": 1.963245236631838e-05, "loss": 1.7607, "step": 1196 }, { "epoch": 0.09204056545789797, "grad_norm": 5.473268508911133, "learning_rate": 1.963183773816841e-05, "loss": 1.7788, "step": 1198 }, { "epoch": 0.09219422249539029, "grad_norm": 4.436158657073975, "learning_rate": 1.963122311001844e-05, "loss": 1.507, "step": 1200 }, { "epoch": 0.0923478795328826, "grad_norm": 4.081661701202393, "learning_rate": 1.963060848186847e-05, "loss": 1.6724, "step": 1202 }, { "epoch": 0.09250153657037492, "grad_norm": 4.4669318199157715, "learning_rate": 1.96299938537185e-05, "loss": 1.6776, "step": 1204 }, { "epoch": 0.09265519360786724, "grad_norm": 4.446565628051758, "learning_rate": 1.962937922556853e-05, "loss": 1.7225, "step": 1206 }, { "epoch": 0.09280885064535956, "grad_norm": 4.1054816246032715, "learning_rate": 1.9628764597418564e-05, "loss": 1.5964, "step": 1208 }, { "epoch": 0.09296250768285187, "grad_norm": 3.9810330867767334, "learning_rate": 1.9628149969268594e-05, "loss": 1.6933, "step": 1210 }, { "epoch": 0.09311616472034419, "grad_norm": 5.808743476867676, "learning_rate": 1.9627535341118623e-05, "loss": 1.796, "step": 1212 }, { "epoch": 0.0932698217578365, "grad_norm": 5.4757795333862305, "learning_rate": 1.9626920712968657e-05, "loss": 1.6707, "step": 1214 }, { "epoch": 0.09342347879532882, "grad_norm": 4.4285430908203125, "learning_rate": 1.9626306084818686e-05, "loss": 1.6214, "step": 1216 }, { "epoch": 0.09357713583282114, "grad_norm": 4.347642421722412, "learning_rate": 1.9625691456668716e-05, "loss": 1.7012, "step": 1218 }, { "epoch": 0.09373079287031345, "grad_norm": 4.711369037628174, "learning_rate": 1.962507682851875e-05, "loss": 1.6307, "step": 1220 }, { "epoch": 0.09388444990780578, "grad_norm": 4.427557945251465, "learning_rate": 1.962446220036878e-05, "loss": 1.7988, "step": 1222 }, { "epoch": 0.0940381069452981, "grad_norm": 4.078681945800781, "learning_rate": 1.962384757221881e-05, "loss": 1.6596, "step": 1224 }, { "epoch": 0.09419176398279042, "grad_norm": 4.403939247131348, "learning_rate": 1.9623232944068838e-05, "loss": 1.7761, "step": 1226 }, { "epoch": 0.09434542102028273, "grad_norm": 4.222808361053467, "learning_rate": 1.962261831591887e-05, "loss": 1.7145, "step": 1228 }, { "epoch": 0.09449907805777505, "grad_norm": 4.754458904266357, "learning_rate": 1.96220036877689e-05, "loss": 1.6256, "step": 1230 }, { "epoch": 0.09465273509526737, "grad_norm": 4.7824201583862305, "learning_rate": 1.962138905961893e-05, "loss": 1.7846, "step": 1232 }, { "epoch": 0.09480639213275968, "grad_norm": 4.751527786254883, "learning_rate": 1.9620774431468964e-05, "loss": 1.6602, "step": 1234 }, { "epoch": 0.094960049170252, "grad_norm": 4.74616813659668, "learning_rate": 1.9620159803318993e-05, "loss": 1.6707, "step": 1236 }, { "epoch": 0.09511370620774431, "grad_norm": 5.099863052368164, "learning_rate": 1.9619545175169023e-05, "loss": 1.7293, "step": 1238 }, { "epoch": 0.09526736324523663, "grad_norm": 5.293537616729736, "learning_rate": 1.9618930547019056e-05, "loss": 1.7172, "step": 1240 }, { "epoch": 0.09542102028272895, "grad_norm": 5.443637847900391, "learning_rate": 1.9618315918869086e-05, "loss": 1.7516, "step": 1242 }, { "epoch": 0.09557467732022126, "grad_norm": 4.447843551635742, "learning_rate": 1.9617701290719115e-05, "loss": 1.7449, "step": 1244 }, { "epoch": 0.09572833435771358, "grad_norm": 4.490113258361816, "learning_rate": 1.961708666256915e-05, "loss": 1.6253, "step": 1246 }, { "epoch": 0.0958819913952059, "grad_norm": 4.979306221008301, "learning_rate": 1.9616472034419178e-05, "loss": 1.6743, "step": 1248 }, { "epoch": 0.09603564843269821, "grad_norm": 4.146381855010986, "learning_rate": 1.961585740626921e-05, "loss": 1.7014, "step": 1250 }, { "epoch": 0.09618930547019053, "grad_norm": 4.571809768676758, "learning_rate": 1.9615242778119237e-05, "loss": 1.7867, "step": 1252 }, { "epoch": 0.09634296250768286, "grad_norm": 4.7382988929748535, "learning_rate": 1.961462814996927e-05, "loss": 1.7669, "step": 1254 }, { "epoch": 0.09649661954517517, "grad_norm": 4.332629203796387, "learning_rate": 1.96140135218193e-05, "loss": 1.8072, "step": 1256 }, { "epoch": 0.09665027658266749, "grad_norm": 4.376523494720459, "learning_rate": 1.961339889366933e-05, "loss": 1.7595, "step": 1258 }, { "epoch": 0.09680393362015981, "grad_norm": 4.876426696777344, "learning_rate": 1.9612784265519363e-05, "loss": 1.5174, "step": 1260 }, { "epoch": 0.09695759065765212, "grad_norm": 4.8033905029296875, "learning_rate": 1.9612169637369393e-05, "loss": 1.7284, "step": 1262 }, { "epoch": 0.09711124769514444, "grad_norm": 4.221518039703369, "learning_rate": 1.9611555009219422e-05, "loss": 1.7409, "step": 1264 }, { "epoch": 0.09726490473263676, "grad_norm": 4.445687294006348, "learning_rate": 1.9610940381069455e-05, "loss": 1.6211, "step": 1266 }, { "epoch": 0.09741856177012907, "grad_norm": 4.590234279632568, "learning_rate": 1.9610325752919485e-05, "loss": 1.5695, "step": 1268 }, { "epoch": 0.09757221880762139, "grad_norm": 5.60252571105957, "learning_rate": 1.9609711124769518e-05, "loss": 1.6369, "step": 1270 }, { "epoch": 0.0977258758451137, "grad_norm": 4.938035011291504, "learning_rate": 1.9609096496619548e-05, "loss": 1.7329, "step": 1272 }, { "epoch": 0.09787953288260602, "grad_norm": 5.1367106437683105, "learning_rate": 1.9608481868469578e-05, "loss": 1.8568, "step": 1274 }, { "epoch": 0.09803318992009834, "grad_norm": 4.405098915100098, "learning_rate": 1.960786724031961e-05, "loss": 1.6226, "step": 1276 }, { "epoch": 0.09818684695759065, "grad_norm": 5.822478771209717, "learning_rate": 1.9607252612169637e-05, "loss": 1.7561, "step": 1278 }, { "epoch": 0.09834050399508297, "grad_norm": 4.770538806915283, "learning_rate": 1.960663798401967e-05, "loss": 1.6214, "step": 1280 }, { "epoch": 0.09849416103257529, "grad_norm": 6.316437244415283, "learning_rate": 1.96060233558697e-05, "loss": 1.8268, "step": 1282 }, { "epoch": 0.0986478180700676, "grad_norm": 4.640567302703857, "learning_rate": 1.960540872771973e-05, "loss": 1.7484, "step": 1284 }, { "epoch": 0.09880147510755993, "grad_norm": 4.596543312072754, "learning_rate": 1.9604794099569762e-05, "loss": 1.6986, "step": 1286 }, { "epoch": 0.09895513214505225, "grad_norm": 4.36724328994751, "learning_rate": 1.9604179471419792e-05, "loss": 1.6108, "step": 1288 }, { "epoch": 0.09910878918254457, "grad_norm": 5.017337322235107, "learning_rate": 1.9603564843269825e-05, "loss": 1.7047, "step": 1290 }, { "epoch": 0.09926244622003688, "grad_norm": 4.327188491821289, "learning_rate": 1.9602950215119855e-05, "loss": 1.6083, "step": 1292 }, { "epoch": 0.0994161032575292, "grad_norm": 5.734022617340088, "learning_rate": 1.9602335586969885e-05, "loss": 1.7501, "step": 1294 }, { "epoch": 0.09956976029502151, "grad_norm": 4.524082183837891, "learning_rate": 1.9601720958819918e-05, "loss": 1.7556, "step": 1296 }, { "epoch": 0.09972341733251383, "grad_norm": 4.61295747756958, "learning_rate": 1.9601106330669947e-05, "loss": 1.6598, "step": 1298 }, { "epoch": 0.09987707437000615, "grad_norm": 4.453684329986572, "learning_rate": 1.9600491702519977e-05, "loss": 1.6554, "step": 1300 }, { "epoch": 0.10003073140749846, "grad_norm": 4.732148170471191, "learning_rate": 1.959987707437001e-05, "loss": 1.7181, "step": 1302 }, { "epoch": 0.10018438844499078, "grad_norm": 4.715574741363525, "learning_rate": 1.9599262446220036e-05, "loss": 1.6849, "step": 1304 }, { "epoch": 0.1003380454824831, "grad_norm": 4.356414318084717, "learning_rate": 1.959864781807007e-05, "loss": 1.687, "step": 1306 }, { "epoch": 0.10049170251997541, "grad_norm": 4.813374996185303, "learning_rate": 1.95980331899201e-05, "loss": 1.5262, "step": 1308 }, { "epoch": 0.10064535955746773, "grad_norm": 4.9926981925964355, "learning_rate": 1.959741856177013e-05, "loss": 1.8137, "step": 1310 }, { "epoch": 0.10079901659496004, "grad_norm": 5.103787422180176, "learning_rate": 1.9596803933620162e-05, "loss": 1.64, "step": 1312 }, { "epoch": 0.10095267363245236, "grad_norm": 4.895768165588379, "learning_rate": 1.959618930547019e-05, "loss": 1.693, "step": 1314 }, { "epoch": 0.10110633066994468, "grad_norm": 4.513470649719238, "learning_rate": 1.9595574677320225e-05, "loss": 1.5745, "step": 1316 }, { "epoch": 0.10125998770743701, "grad_norm": 5.475149154663086, "learning_rate": 1.9594960049170254e-05, "loss": 1.6189, "step": 1318 }, { "epoch": 0.10141364474492932, "grad_norm": 4.828972339630127, "learning_rate": 1.9594345421020284e-05, "loss": 1.6547, "step": 1320 }, { "epoch": 0.10156730178242164, "grad_norm": 5.218929290771484, "learning_rate": 1.9593730792870317e-05, "loss": 1.6494, "step": 1322 }, { "epoch": 0.10172095881991396, "grad_norm": 4.358766078948975, "learning_rate": 1.9593116164720343e-05, "loss": 1.7283, "step": 1324 }, { "epoch": 0.10187461585740627, "grad_norm": 4.14285135269165, "learning_rate": 1.9592501536570376e-05, "loss": 1.7769, "step": 1326 }, { "epoch": 0.10202827289489859, "grad_norm": 4.319285869598389, "learning_rate": 1.9591886908420406e-05, "loss": 1.4707, "step": 1328 }, { "epoch": 0.1021819299323909, "grad_norm": 5.230128288269043, "learning_rate": 1.9591272280270436e-05, "loss": 1.4906, "step": 1330 }, { "epoch": 0.10233558696988322, "grad_norm": 5.243448257446289, "learning_rate": 1.959065765212047e-05, "loss": 1.8825, "step": 1332 }, { "epoch": 0.10248924400737554, "grad_norm": 4.784072399139404, "learning_rate": 1.95900430239705e-05, "loss": 1.7553, "step": 1334 }, { "epoch": 0.10264290104486785, "grad_norm": 5.595427513122559, "learning_rate": 1.958942839582053e-05, "loss": 1.6913, "step": 1336 }, { "epoch": 0.10279655808236017, "grad_norm": 4.856276512145996, "learning_rate": 1.958881376767056e-05, "loss": 1.7659, "step": 1338 }, { "epoch": 0.10295021511985249, "grad_norm": 5.188042640686035, "learning_rate": 1.958819913952059e-05, "loss": 1.8205, "step": 1340 }, { "epoch": 0.1031038721573448, "grad_norm": 4.261306285858154, "learning_rate": 1.9587584511370624e-05, "loss": 1.5887, "step": 1342 }, { "epoch": 0.10325752919483712, "grad_norm": 4.269975185394287, "learning_rate": 1.9586969883220654e-05, "loss": 1.5869, "step": 1344 }, { "epoch": 0.10341118623232944, "grad_norm": 5.029308795928955, "learning_rate": 1.9586355255070683e-05, "loss": 1.8049, "step": 1346 }, { "epoch": 0.10356484326982175, "grad_norm": 4.857789516448975, "learning_rate": 1.9585740626920716e-05, "loss": 1.5154, "step": 1348 }, { "epoch": 0.10371850030731407, "grad_norm": 4.701939582824707, "learning_rate": 1.9585125998770743e-05, "loss": 1.6822, "step": 1350 }, { "epoch": 0.1038721573448064, "grad_norm": 4.069787979125977, "learning_rate": 1.9584511370620776e-05, "loss": 1.7238, "step": 1352 }, { "epoch": 0.10402581438229871, "grad_norm": 4.703420162200928, "learning_rate": 1.9583896742470806e-05, "loss": 1.6951, "step": 1354 }, { "epoch": 0.10417947141979103, "grad_norm": 4.920733451843262, "learning_rate": 1.958328211432084e-05, "loss": 1.6335, "step": 1356 }, { "epoch": 0.10433312845728335, "grad_norm": 4.38323974609375, "learning_rate": 1.9582667486170868e-05, "loss": 1.7094, "step": 1358 }, { "epoch": 0.10448678549477566, "grad_norm": 4.646501541137695, "learning_rate": 1.9582052858020898e-05, "loss": 1.6878, "step": 1360 }, { "epoch": 0.10464044253226798, "grad_norm": 4.569819450378418, "learning_rate": 1.958143822987093e-05, "loss": 1.7198, "step": 1362 }, { "epoch": 0.1047940995697603, "grad_norm": 4.552595615386963, "learning_rate": 1.958082360172096e-05, "loss": 1.7335, "step": 1364 }, { "epoch": 0.10494775660725261, "grad_norm": 3.9051506519317627, "learning_rate": 1.958020897357099e-05, "loss": 1.5694, "step": 1366 }, { "epoch": 0.10510141364474493, "grad_norm": 3.9420411586761475, "learning_rate": 1.9579594345421023e-05, "loss": 1.7298, "step": 1368 }, { "epoch": 0.10525507068223725, "grad_norm": 4.996294021606445, "learning_rate": 1.9578979717271053e-05, "loss": 1.7769, "step": 1370 }, { "epoch": 0.10540872771972956, "grad_norm": 4.845794677734375, "learning_rate": 1.9578365089121083e-05, "loss": 1.694, "step": 1372 }, { "epoch": 0.10556238475722188, "grad_norm": 4.156089782714844, "learning_rate": 1.9577750460971116e-05, "loss": 1.5621, "step": 1374 }, { "epoch": 0.1057160417947142, "grad_norm": 5.298906326293945, "learning_rate": 1.9577135832821146e-05, "loss": 1.6016, "step": 1376 }, { "epoch": 0.10586969883220651, "grad_norm": 4.974923610687256, "learning_rate": 1.9576521204671175e-05, "loss": 1.9024, "step": 1378 }, { "epoch": 0.10602335586969883, "grad_norm": 4.5802998542785645, "learning_rate": 1.9575906576521205e-05, "loss": 1.8249, "step": 1380 }, { "epoch": 0.10617701290719114, "grad_norm": 5.364488124847412, "learning_rate": 1.9575291948371238e-05, "loss": 1.6205, "step": 1382 }, { "epoch": 0.10633066994468347, "grad_norm": 4.810891151428223, "learning_rate": 1.9574677320221268e-05, "loss": 1.6702, "step": 1384 }, { "epoch": 0.10648432698217579, "grad_norm": 5.155327320098877, "learning_rate": 1.9574062692071297e-05, "loss": 1.6251, "step": 1386 }, { "epoch": 0.1066379840196681, "grad_norm": 4.292688369750977, "learning_rate": 1.957344806392133e-05, "loss": 1.505, "step": 1388 }, { "epoch": 0.10679164105716042, "grad_norm": 4.611319541931152, "learning_rate": 1.957283343577136e-05, "loss": 1.7301, "step": 1390 }, { "epoch": 0.10694529809465274, "grad_norm": 4.324422359466553, "learning_rate": 1.957221880762139e-05, "loss": 1.6551, "step": 1392 }, { "epoch": 0.10709895513214505, "grad_norm": 4.826112747192383, "learning_rate": 1.9571604179471423e-05, "loss": 1.607, "step": 1394 }, { "epoch": 0.10725261216963737, "grad_norm": 4.303924560546875, "learning_rate": 1.9570989551321453e-05, "loss": 1.6128, "step": 1396 }, { "epoch": 0.10740626920712969, "grad_norm": 5.093891620635986, "learning_rate": 1.9570374923171482e-05, "loss": 1.6747, "step": 1398 }, { "epoch": 0.107559926244622, "grad_norm": 4.303253650665283, "learning_rate": 1.9569760295021515e-05, "loss": 1.5981, "step": 1400 }, { "epoch": 0.10771358328211432, "grad_norm": 4.165480613708496, "learning_rate": 1.9569145666871545e-05, "loss": 1.5993, "step": 1402 }, { "epoch": 0.10786724031960664, "grad_norm": 4.655346393585205, "learning_rate": 1.9568531038721575e-05, "loss": 1.6806, "step": 1404 }, { "epoch": 0.10802089735709895, "grad_norm": 4.743736743927002, "learning_rate": 1.9567916410571604e-05, "loss": 1.6624, "step": 1406 }, { "epoch": 0.10817455439459127, "grad_norm": 4.2791643142700195, "learning_rate": 1.9567301782421637e-05, "loss": 1.6776, "step": 1408 }, { "epoch": 0.10832821143208358, "grad_norm": 5.005465030670166, "learning_rate": 1.9566687154271667e-05, "loss": 1.5962, "step": 1410 }, { "epoch": 0.1084818684695759, "grad_norm": 4.345304012298584, "learning_rate": 1.9566072526121697e-05, "loss": 1.5865, "step": 1412 }, { "epoch": 0.10863552550706822, "grad_norm": 3.8712103366851807, "learning_rate": 1.956545789797173e-05, "loss": 1.6583, "step": 1414 }, { "epoch": 0.10878918254456055, "grad_norm": 4.381411075592041, "learning_rate": 1.956484326982176e-05, "loss": 1.6203, "step": 1416 }, { "epoch": 0.10894283958205286, "grad_norm": 3.933609962463379, "learning_rate": 1.956422864167179e-05, "loss": 1.7631, "step": 1418 }, { "epoch": 0.10909649661954518, "grad_norm": 5.570189952850342, "learning_rate": 1.9563614013521822e-05, "loss": 1.7173, "step": 1420 }, { "epoch": 0.1092501536570375, "grad_norm": 4.816314220428467, "learning_rate": 1.9562999385371852e-05, "loss": 1.6941, "step": 1422 }, { "epoch": 0.10940381069452981, "grad_norm": 4.110052585601807, "learning_rate": 1.9562384757221882e-05, "loss": 1.6782, "step": 1424 }, { "epoch": 0.10955746773202213, "grad_norm": 4.069727420806885, "learning_rate": 1.956177012907191e-05, "loss": 1.657, "step": 1426 }, { "epoch": 0.10971112476951445, "grad_norm": 5.244446277618408, "learning_rate": 1.9561155500921944e-05, "loss": 1.683, "step": 1428 }, { "epoch": 0.10986478180700676, "grad_norm": 5.359142780303955, "learning_rate": 1.9560540872771974e-05, "loss": 1.6764, "step": 1430 }, { "epoch": 0.11001843884449908, "grad_norm": 5.057417869567871, "learning_rate": 1.9559926244622004e-05, "loss": 1.787, "step": 1432 }, { "epoch": 0.1101720958819914, "grad_norm": 4.59893274307251, "learning_rate": 1.9559311616472037e-05, "loss": 1.6119, "step": 1434 }, { "epoch": 0.11032575291948371, "grad_norm": 4.8417744636535645, "learning_rate": 1.9558696988322067e-05, "loss": 1.7965, "step": 1436 }, { "epoch": 0.11047940995697603, "grad_norm": 4.829365253448486, "learning_rate": 1.9558082360172096e-05, "loss": 1.6125, "step": 1438 }, { "epoch": 0.11063306699446834, "grad_norm": 4.74966287612915, "learning_rate": 1.955746773202213e-05, "loss": 1.5761, "step": 1440 }, { "epoch": 0.11078672403196066, "grad_norm": 4.8681535720825195, "learning_rate": 1.955685310387216e-05, "loss": 1.5977, "step": 1442 }, { "epoch": 0.11094038106945298, "grad_norm": 4.576766014099121, "learning_rate": 1.955623847572219e-05, "loss": 1.6297, "step": 1444 }, { "epoch": 0.11109403810694529, "grad_norm": 4.206700325012207, "learning_rate": 1.9555623847572222e-05, "loss": 1.6246, "step": 1446 }, { "epoch": 0.11124769514443761, "grad_norm": 4.753570079803467, "learning_rate": 1.955500921942225e-05, "loss": 1.6627, "step": 1448 }, { "epoch": 0.11140135218192994, "grad_norm": 4.992982864379883, "learning_rate": 1.955439459127228e-05, "loss": 1.7223, "step": 1450 }, { "epoch": 0.11155500921942225, "grad_norm": 4.912965297698975, "learning_rate": 1.955377996312231e-05, "loss": 1.5616, "step": 1452 }, { "epoch": 0.11170866625691457, "grad_norm": 4.4759840965271, "learning_rate": 1.9553165334972344e-05, "loss": 1.5403, "step": 1454 }, { "epoch": 0.11186232329440689, "grad_norm": 5.181031703948975, "learning_rate": 1.9552550706822374e-05, "loss": 1.6365, "step": 1456 }, { "epoch": 0.1120159803318992, "grad_norm": 4.845396518707275, "learning_rate": 1.9551936078672403e-05, "loss": 1.6279, "step": 1458 }, { "epoch": 0.11216963736939152, "grad_norm": 4.756799221038818, "learning_rate": 1.9551321450522436e-05, "loss": 1.6919, "step": 1460 }, { "epoch": 0.11232329440688384, "grad_norm": 5.1768107414245605, "learning_rate": 1.9550706822372466e-05, "loss": 1.5765, "step": 1462 }, { "epoch": 0.11247695144437615, "grad_norm": 4.743069648742676, "learning_rate": 1.9550092194222496e-05, "loss": 1.8469, "step": 1464 }, { "epoch": 0.11263060848186847, "grad_norm": 4.831038951873779, "learning_rate": 1.954947756607253e-05, "loss": 1.7746, "step": 1466 }, { "epoch": 0.11278426551936079, "grad_norm": 4.309507846832275, "learning_rate": 1.954886293792256e-05, "loss": 1.5142, "step": 1468 }, { "epoch": 0.1129379225568531, "grad_norm": 55.4850959777832, "learning_rate": 1.9548248309772588e-05, "loss": 1.683, "step": 1470 }, { "epoch": 0.11309157959434542, "grad_norm": 4.364781856536865, "learning_rate": 1.954763368162262e-05, "loss": 1.8977, "step": 1472 }, { "epoch": 0.11324523663183773, "grad_norm": 4.795863151550293, "learning_rate": 1.954701905347265e-05, "loss": 1.6436, "step": 1474 }, { "epoch": 0.11339889366933005, "grad_norm": 4.47898530960083, "learning_rate": 1.954640442532268e-05, "loss": 1.7043, "step": 1476 }, { "epoch": 0.11355255070682237, "grad_norm": 5.761251926422119, "learning_rate": 1.954578979717271e-05, "loss": 1.6943, "step": 1478 }, { "epoch": 0.11370620774431468, "grad_norm": 5.274534225463867, "learning_rate": 1.9545175169022743e-05, "loss": 1.6005, "step": 1480 }, { "epoch": 0.11385986478180701, "grad_norm": 4.412993431091309, "learning_rate": 1.9544560540872773e-05, "loss": 1.4977, "step": 1482 }, { "epoch": 0.11401352181929933, "grad_norm": 4.08578634262085, "learning_rate": 1.9543945912722803e-05, "loss": 1.6459, "step": 1484 }, { "epoch": 0.11416717885679165, "grad_norm": 3.7015979290008545, "learning_rate": 1.9543331284572836e-05, "loss": 1.5978, "step": 1486 }, { "epoch": 0.11432083589428396, "grad_norm": 4.919078826904297, "learning_rate": 1.9542716656422865e-05, "loss": 1.5456, "step": 1488 }, { "epoch": 0.11447449293177628, "grad_norm": 4.756066799163818, "learning_rate": 1.9542102028272895e-05, "loss": 1.7221, "step": 1490 }, { "epoch": 0.1146281499692686, "grad_norm": 4.4432525634765625, "learning_rate": 1.9541487400122928e-05, "loss": 1.5918, "step": 1492 }, { "epoch": 0.11478180700676091, "grad_norm": 5.371875286102295, "learning_rate": 1.9540872771972958e-05, "loss": 1.7009, "step": 1494 }, { "epoch": 0.11493546404425323, "grad_norm": 3.5335211753845215, "learning_rate": 1.9540258143822988e-05, "loss": 1.6021, "step": 1496 }, { "epoch": 0.11508912108174554, "grad_norm": 4.77205753326416, "learning_rate": 1.953964351567302e-05, "loss": 1.5721, "step": 1498 }, { "epoch": 0.11524277811923786, "grad_norm": 5.020537376403809, "learning_rate": 1.953902888752305e-05, "loss": 1.6367, "step": 1500 }, { "epoch": 0.11539643515673018, "grad_norm": 4.866142272949219, "learning_rate": 1.9538414259373083e-05, "loss": 1.6035, "step": 1502 }, { "epoch": 0.11555009219422249, "grad_norm": 3.647397756576538, "learning_rate": 1.953779963122311e-05, "loss": 1.5274, "step": 1504 }, { "epoch": 0.11570374923171481, "grad_norm": 3.999390125274658, "learning_rate": 1.9537185003073143e-05, "loss": 1.7818, "step": 1506 }, { "epoch": 0.11585740626920712, "grad_norm": 4.787381172180176, "learning_rate": 1.9536570374923172e-05, "loss": 1.7033, "step": 1508 }, { "epoch": 0.11601106330669944, "grad_norm": 4.415989398956299, "learning_rate": 1.9535955746773202e-05, "loss": 1.831, "step": 1510 }, { "epoch": 0.11616472034419176, "grad_norm": 4.548354148864746, "learning_rate": 1.9535341118623235e-05, "loss": 1.5145, "step": 1512 }, { "epoch": 0.11631837738168409, "grad_norm": 5.4493560791015625, "learning_rate": 1.9534726490473265e-05, "loss": 1.876, "step": 1514 }, { "epoch": 0.1164720344191764, "grad_norm": 3.9988834857940674, "learning_rate": 1.9534111862323295e-05, "loss": 1.6325, "step": 1516 }, { "epoch": 0.11662569145666872, "grad_norm": 4.861139297485352, "learning_rate": 1.9533497234173328e-05, "loss": 1.6743, "step": 1518 }, { "epoch": 0.11677934849416104, "grad_norm": 5.388833522796631, "learning_rate": 1.9532882606023357e-05, "loss": 1.5854, "step": 1520 }, { "epoch": 0.11693300553165335, "grad_norm": 4.772726058959961, "learning_rate": 1.953226797787339e-05, "loss": 1.6481, "step": 1522 }, { "epoch": 0.11708666256914567, "grad_norm": 4.285337924957275, "learning_rate": 1.9531653349723417e-05, "loss": 1.7054, "step": 1524 }, { "epoch": 0.11724031960663799, "grad_norm": 4.5872626304626465, "learning_rate": 1.953103872157345e-05, "loss": 1.6569, "step": 1526 }, { "epoch": 0.1173939766441303, "grad_norm": 4.3280463218688965, "learning_rate": 1.9530424093423483e-05, "loss": 1.5277, "step": 1528 }, { "epoch": 0.11754763368162262, "grad_norm": 4.480382919311523, "learning_rate": 1.952980946527351e-05, "loss": 1.7289, "step": 1530 }, { "epoch": 0.11770129071911493, "grad_norm": 4.207196235656738, "learning_rate": 1.9529194837123542e-05, "loss": 1.803, "step": 1532 }, { "epoch": 0.11785494775660725, "grad_norm": 4.125123023986816, "learning_rate": 1.9528580208973572e-05, "loss": 1.6289, "step": 1534 }, { "epoch": 0.11800860479409957, "grad_norm": 6.329103469848633, "learning_rate": 1.95279655808236e-05, "loss": 1.6592, "step": 1536 }, { "epoch": 0.11816226183159188, "grad_norm": 4.436602592468262, "learning_rate": 1.9527350952673635e-05, "loss": 1.5904, "step": 1538 }, { "epoch": 0.1183159188690842, "grad_norm": 4.564888954162598, "learning_rate": 1.9526736324523664e-05, "loss": 1.7565, "step": 1540 }, { "epoch": 0.11846957590657652, "grad_norm": 4.3771514892578125, "learning_rate": 1.9526121696373697e-05, "loss": 1.6746, "step": 1542 }, { "epoch": 0.11862323294406883, "grad_norm": 4.449161529541016, "learning_rate": 1.9525507068223727e-05, "loss": 1.6377, "step": 1544 }, { "epoch": 0.11877688998156116, "grad_norm": 4.770364761352539, "learning_rate": 1.9524892440073757e-05, "loss": 1.5953, "step": 1546 }, { "epoch": 0.11893054701905348, "grad_norm": 4.0749640464782715, "learning_rate": 1.952427781192379e-05, "loss": 1.5511, "step": 1548 }, { "epoch": 0.1190842040565458, "grad_norm": 4.361663341522217, "learning_rate": 1.9523663183773816e-05, "loss": 1.5562, "step": 1550 }, { "epoch": 0.11923786109403811, "grad_norm": 4.269155025482178, "learning_rate": 1.952304855562385e-05, "loss": 1.5725, "step": 1552 }, { "epoch": 0.11939151813153043, "grad_norm": 4.128551483154297, "learning_rate": 1.952243392747388e-05, "loss": 1.5, "step": 1554 }, { "epoch": 0.11954517516902274, "grad_norm": 4.763240814208984, "learning_rate": 1.952181929932391e-05, "loss": 1.7572, "step": 1556 }, { "epoch": 0.11969883220651506, "grad_norm": 4.871914386749268, "learning_rate": 1.952120467117394e-05, "loss": 1.609, "step": 1558 }, { "epoch": 0.11985248924400738, "grad_norm": 4.267725467681885, "learning_rate": 1.952059004302397e-05, "loss": 1.5832, "step": 1560 }, { "epoch": 0.12000614628149969, "grad_norm": 4.569482326507568, "learning_rate": 1.9519975414874e-05, "loss": 1.6262, "step": 1562 }, { "epoch": 0.12015980331899201, "grad_norm": 4.285094261169434, "learning_rate": 1.9519360786724034e-05, "loss": 1.7479, "step": 1564 }, { "epoch": 0.12031346035648433, "grad_norm": 4.529351234436035, "learning_rate": 1.9518746158574064e-05, "loss": 1.6297, "step": 1566 }, { "epoch": 0.12046711739397664, "grad_norm": 4.966389179229736, "learning_rate": 1.9518131530424097e-05, "loss": 1.7544, "step": 1568 }, { "epoch": 0.12062077443146896, "grad_norm": 4.608340263366699, "learning_rate": 1.9517516902274127e-05, "loss": 1.4635, "step": 1570 }, { "epoch": 0.12077443146896127, "grad_norm": 3.8790552616119385, "learning_rate": 1.9516902274124156e-05, "loss": 1.6345, "step": 1572 }, { "epoch": 0.12092808850645359, "grad_norm": 5.229369163513184, "learning_rate": 1.951628764597419e-05, "loss": 1.6829, "step": 1574 }, { "epoch": 0.1210817455439459, "grad_norm": 4.269663333892822, "learning_rate": 1.9515673017824216e-05, "loss": 1.568, "step": 1576 }, { "epoch": 0.12123540258143822, "grad_norm": 4.905238151550293, "learning_rate": 1.951505838967425e-05, "loss": 1.671, "step": 1578 }, { "epoch": 0.12138905961893055, "grad_norm": 4.5513596534729, "learning_rate": 1.951444376152428e-05, "loss": 1.6636, "step": 1580 }, { "epoch": 0.12154271665642287, "grad_norm": 4.586058616638184, "learning_rate": 1.9513829133374308e-05, "loss": 1.7669, "step": 1582 }, { "epoch": 0.12169637369391519, "grad_norm": 5.4855170249938965, "learning_rate": 1.951321450522434e-05, "loss": 1.5033, "step": 1584 }, { "epoch": 0.1218500307314075, "grad_norm": 4.668776035308838, "learning_rate": 1.951259987707437e-05, "loss": 1.5859, "step": 1586 }, { "epoch": 0.12200368776889982, "grad_norm": 3.9210376739501953, "learning_rate": 1.9511985248924404e-05, "loss": 1.5757, "step": 1588 }, { "epoch": 0.12215734480639213, "grad_norm": 4.558568000793457, "learning_rate": 1.9511370620774434e-05, "loss": 1.5945, "step": 1590 }, { "epoch": 0.12231100184388445, "grad_norm": 4.247246265411377, "learning_rate": 1.9510755992624463e-05, "loss": 1.624, "step": 1592 }, { "epoch": 0.12246465888137677, "grad_norm": 4.2471604347229, "learning_rate": 1.9510141364474496e-05, "loss": 1.5873, "step": 1594 }, { "epoch": 0.12261831591886908, "grad_norm": 4.362886428833008, "learning_rate": 1.9509526736324526e-05, "loss": 1.7448, "step": 1596 }, { "epoch": 0.1227719729563614, "grad_norm": 5.111678123474121, "learning_rate": 1.9508912108174556e-05, "loss": 1.8134, "step": 1598 }, { "epoch": 0.12292562999385372, "grad_norm": 4.4582624435424805, "learning_rate": 1.950829748002459e-05, "loss": 1.7155, "step": 1600 }, { "epoch": 0.12307928703134603, "grad_norm": 3.796780586242676, "learning_rate": 1.9507682851874615e-05, "loss": 1.5636, "step": 1602 }, { "epoch": 0.12323294406883835, "grad_norm": 4.517824649810791, "learning_rate": 1.9507068223724648e-05, "loss": 1.6092, "step": 1604 }, { "epoch": 0.12338660110633067, "grad_norm": 4.659684181213379, "learning_rate": 1.9506453595574678e-05, "loss": 1.641, "step": 1606 }, { "epoch": 0.12354025814382298, "grad_norm": 4.470782279968262, "learning_rate": 1.950583896742471e-05, "loss": 1.618, "step": 1608 }, { "epoch": 0.1236939151813153, "grad_norm": 4.486400604248047, "learning_rate": 1.950522433927474e-05, "loss": 1.6912, "step": 1610 }, { "epoch": 0.12384757221880763, "grad_norm": 4.459258556365967, "learning_rate": 1.950460971112477e-05, "loss": 1.5627, "step": 1612 }, { "epoch": 0.12400122925629994, "grad_norm": 4.486885070800781, "learning_rate": 1.9503995082974803e-05, "loss": 1.8642, "step": 1614 }, { "epoch": 0.12415488629379226, "grad_norm": 4.576472282409668, "learning_rate": 1.9503380454824833e-05, "loss": 1.6411, "step": 1616 }, { "epoch": 0.12430854333128458, "grad_norm": 4.349391460418701, "learning_rate": 1.9502765826674863e-05, "loss": 1.6382, "step": 1618 }, { "epoch": 0.12446220036877689, "grad_norm": 4.264526844024658, "learning_rate": 1.9502151198524896e-05, "loss": 1.621, "step": 1620 }, { "epoch": 0.12461585740626921, "grad_norm": 4.798770904541016, "learning_rate": 1.9501536570374925e-05, "loss": 1.8124, "step": 1622 }, { "epoch": 0.12476951444376153, "grad_norm": 3.747992515563965, "learning_rate": 1.9500921942224955e-05, "loss": 1.516, "step": 1624 }, { "epoch": 0.12492317148125384, "grad_norm": 4.410411834716797, "learning_rate": 1.9500307314074988e-05, "loss": 1.5645, "step": 1626 }, { "epoch": 0.12507682851874616, "grad_norm": 4.139060020446777, "learning_rate": 1.9499692685925018e-05, "loss": 1.6217, "step": 1628 }, { "epoch": 0.12523048555623847, "grad_norm": 4.380125045776367, "learning_rate": 1.9499078057775048e-05, "loss": 1.6909, "step": 1630 }, { "epoch": 0.1253841425937308, "grad_norm": 4.449796676635742, "learning_rate": 1.9498463429625077e-05, "loss": 1.7215, "step": 1632 }, { "epoch": 0.1255377996312231, "grad_norm": 4.043376922607422, "learning_rate": 1.949784880147511e-05, "loss": 1.6326, "step": 1634 }, { "epoch": 0.12569145666871542, "grad_norm": 4.427875518798828, "learning_rate": 1.949723417332514e-05, "loss": 1.6962, "step": 1636 }, { "epoch": 0.12584511370620774, "grad_norm": 4.617554187774658, "learning_rate": 1.949661954517517e-05, "loss": 1.5711, "step": 1638 }, { "epoch": 0.12599877074370006, "grad_norm": 4.245482444763184, "learning_rate": 1.9496004917025203e-05, "loss": 1.6479, "step": 1640 }, { "epoch": 0.12615242778119237, "grad_norm": 4.876771926879883, "learning_rate": 1.9495390288875232e-05, "loss": 1.7238, "step": 1642 }, { "epoch": 0.1263060848186847, "grad_norm": 4.263737678527832, "learning_rate": 1.9494775660725262e-05, "loss": 1.5666, "step": 1644 }, { "epoch": 0.126459741856177, "grad_norm": 6.202945232391357, "learning_rate": 1.9494161032575295e-05, "loss": 1.6217, "step": 1646 }, { "epoch": 0.12661339889366932, "grad_norm": 4.307828426361084, "learning_rate": 1.9493546404425325e-05, "loss": 1.49, "step": 1648 }, { "epoch": 0.12676705593116164, "grad_norm": 4.122886657714844, "learning_rate": 1.9492931776275355e-05, "loss": 1.7121, "step": 1650 }, { "epoch": 0.12692071296865395, "grad_norm": 4.3632426261901855, "learning_rate": 1.9492317148125384e-05, "loss": 1.6835, "step": 1652 }, { "epoch": 0.12707437000614627, "grad_norm": 4.4186625480651855, "learning_rate": 1.9491702519975417e-05, "loss": 1.7579, "step": 1654 }, { "epoch": 0.1272280270436386, "grad_norm": 4.411682605743408, "learning_rate": 1.9491087891825447e-05, "loss": 1.5771, "step": 1656 }, { "epoch": 0.1273816840811309, "grad_norm": 4.259854316711426, "learning_rate": 1.9490473263675477e-05, "loss": 1.6239, "step": 1658 }, { "epoch": 0.12753534111862325, "grad_norm": 4.225386619567871, "learning_rate": 1.948985863552551e-05, "loss": 1.6777, "step": 1660 }, { "epoch": 0.12768899815611556, "grad_norm": 4.977676868438721, "learning_rate": 1.948924400737554e-05, "loss": 1.6166, "step": 1662 }, { "epoch": 0.12784265519360788, "grad_norm": 3.7306509017944336, "learning_rate": 1.948862937922557e-05, "loss": 1.5834, "step": 1664 }, { "epoch": 0.1279963122311002, "grad_norm": 4.451853275299072, "learning_rate": 1.9488014751075602e-05, "loss": 1.6464, "step": 1666 }, { "epoch": 0.1281499692685925, "grad_norm": 4.641234397888184, "learning_rate": 1.9487400122925632e-05, "loss": 1.6698, "step": 1668 }, { "epoch": 0.12830362630608483, "grad_norm": 5.218206882476807, "learning_rate": 1.948678549477566e-05, "loss": 1.6614, "step": 1670 }, { "epoch": 0.12845728334357714, "grad_norm": 4.623648166656494, "learning_rate": 1.9486170866625695e-05, "loss": 1.6586, "step": 1672 }, { "epoch": 0.12861094038106946, "grad_norm": 5.1708478927612305, "learning_rate": 1.9485556238475724e-05, "loss": 1.6275, "step": 1674 }, { "epoch": 0.12876459741856178, "grad_norm": 4.305856227874756, "learning_rate": 1.9484941610325754e-05, "loss": 1.6349, "step": 1676 }, { "epoch": 0.1289182544560541, "grad_norm": 4.788485050201416, "learning_rate": 1.9484326982175784e-05, "loss": 1.4676, "step": 1678 }, { "epoch": 0.1290719114935464, "grad_norm": 4.4581379890441895, "learning_rate": 1.9483712354025817e-05, "loss": 1.5062, "step": 1680 }, { "epoch": 0.12922556853103873, "grad_norm": 3.9021549224853516, "learning_rate": 1.9483097725875846e-05, "loss": 1.7848, "step": 1682 }, { "epoch": 0.12937922556853104, "grad_norm": 4.530584812164307, "learning_rate": 1.9482483097725876e-05, "loss": 1.6594, "step": 1684 }, { "epoch": 0.12953288260602336, "grad_norm": 4.8017497062683105, "learning_rate": 1.948186846957591e-05, "loss": 1.6167, "step": 1686 }, { "epoch": 0.12968653964351567, "grad_norm": 4.41823148727417, "learning_rate": 1.948125384142594e-05, "loss": 1.5293, "step": 1688 }, { "epoch": 0.129840196681008, "grad_norm": 4.470682144165039, "learning_rate": 1.948063921327597e-05, "loss": 1.6036, "step": 1690 }, { "epoch": 0.1299938537185003, "grad_norm": 3.947842597961426, "learning_rate": 1.9480024585126e-05, "loss": 1.7037, "step": 1692 }, { "epoch": 0.13014751075599262, "grad_norm": 4.953098297119141, "learning_rate": 1.947940995697603e-05, "loss": 1.6436, "step": 1694 }, { "epoch": 0.13030116779348494, "grad_norm": 4.112635135650635, "learning_rate": 1.947879532882606e-05, "loss": 1.4595, "step": 1696 }, { "epoch": 0.13045482483097726, "grad_norm": 4.197033882141113, "learning_rate": 1.9478180700676094e-05, "loss": 1.657, "step": 1698 }, { "epoch": 0.13060848186846957, "grad_norm": 4.02692985534668, "learning_rate": 1.9477566072526124e-05, "loss": 1.6321, "step": 1700 }, { "epoch": 0.1307621389059619, "grad_norm": 4.7861809730529785, "learning_rate": 1.9476951444376153e-05, "loss": 1.5609, "step": 1702 }, { "epoch": 0.1309157959434542, "grad_norm": 4.392903804779053, "learning_rate": 1.9476336816226183e-05, "loss": 1.7549, "step": 1704 }, { "epoch": 0.13106945298094652, "grad_norm": 4.314429759979248, "learning_rate": 1.9475722188076216e-05, "loss": 1.5698, "step": 1706 }, { "epoch": 0.13122311001843884, "grad_norm": 4.254858016967773, "learning_rate": 1.9475107559926246e-05, "loss": 1.6291, "step": 1708 }, { "epoch": 0.13137676705593115, "grad_norm": 4.288058757781982, "learning_rate": 1.9474492931776276e-05, "loss": 1.822, "step": 1710 }, { "epoch": 0.13153042409342347, "grad_norm": 4.206986904144287, "learning_rate": 1.947387830362631e-05, "loss": 1.6372, "step": 1712 }, { "epoch": 0.1316840811309158, "grad_norm": 3.9056224822998047, "learning_rate": 1.947326367547634e-05, "loss": 1.6141, "step": 1714 }, { "epoch": 0.1318377381684081, "grad_norm": 5.1152777671813965, "learning_rate": 1.9472649047326368e-05, "loss": 1.6361, "step": 1716 }, { "epoch": 0.13199139520590042, "grad_norm": 4.0903120040893555, "learning_rate": 1.94720344191764e-05, "loss": 1.5559, "step": 1718 }, { "epoch": 0.13214505224339274, "grad_norm": 4.825276851654053, "learning_rate": 1.947141979102643e-05, "loss": 1.732, "step": 1720 }, { "epoch": 0.13229870928088505, "grad_norm": 4.649293899536133, "learning_rate": 1.947080516287646e-05, "loss": 1.5941, "step": 1722 }, { "epoch": 0.13245236631837737, "grad_norm": 4.052992820739746, "learning_rate": 1.9470190534726494e-05, "loss": 1.5664, "step": 1724 }, { "epoch": 0.1326060233558697, "grad_norm": 4.36129903793335, "learning_rate": 1.9469575906576523e-05, "loss": 1.7345, "step": 1726 }, { "epoch": 0.13275968039336203, "grad_norm": 4.522770404815674, "learning_rate": 1.9468961278426553e-05, "loss": 1.6731, "step": 1728 }, { "epoch": 0.13291333743085434, "grad_norm": 4.922299385070801, "learning_rate": 1.9468346650276583e-05, "loss": 1.8072, "step": 1730 }, { "epoch": 0.13306699446834666, "grad_norm": 4.385134220123291, "learning_rate": 1.9467732022126616e-05, "loss": 1.5836, "step": 1732 }, { "epoch": 0.13322065150583898, "grad_norm": 4.031277179718018, "learning_rate": 1.9467117393976645e-05, "loss": 1.465, "step": 1734 }, { "epoch": 0.1333743085433313, "grad_norm": 4.437002182006836, "learning_rate": 1.9466502765826675e-05, "loss": 1.5624, "step": 1736 }, { "epoch": 0.1335279655808236, "grad_norm": 3.754696846008301, "learning_rate": 1.9465888137676708e-05, "loss": 1.6044, "step": 1738 }, { "epoch": 0.13368162261831593, "grad_norm": 3.967130661010742, "learning_rate": 1.9465273509526738e-05, "loss": 1.5832, "step": 1740 }, { "epoch": 0.13383527965580824, "grad_norm": 3.958448648452759, "learning_rate": 1.9464658881376767e-05, "loss": 1.6812, "step": 1742 }, { "epoch": 0.13398893669330056, "grad_norm": 5.2511982917785645, "learning_rate": 1.94640442532268e-05, "loss": 1.8565, "step": 1744 }, { "epoch": 0.13414259373079288, "grad_norm": 4.229193210601807, "learning_rate": 1.946342962507683e-05, "loss": 1.6365, "step": 1746 }, { "epoch": 0.1342962507682852, "grad_norm": 3.8518741130828857, "learning_rate": 1.946281499692686e-05, "loss": 1.571, "step": 1748 }, { "epoch": 0.1344499078057775, "grad_norm": 4.383627414703369, "learning_rate": 1.946220036877689e-05, "loss": 1.5181, "step": 1750 }, { "epoch": 0.13460356484326982, "grad_norm": 4.58341121673584, "learning_rate": 1.9461585740626923e-05, "loss": 1.6387, "step": 1752 }, { "epoch": 0.13475722188076214, "grad_norm": 4.656858921051025, "learning_rate": 1.9460971112476956e-05, "loss": 1.6474, "step": 1754 }, { "epoch": 0.13491087891825446, "grad_norm": 5.039700031280518, "learning_rate": 1.9460356484326982e-05, "loss": 1.6755, "step": 1756 }, { "epoch": 0.13506453595574677, "grad_norm": 4.46349573135376, "learning_rate": 1.9459741856177015e-05, "loss": 1.5728, "step": 1758 }, { "epoch": 0.1352181929932391, "grad_norm": 4.041154861450195, "learning_rate": 1.9459127228027045e-05, "loss": 1.5847, "step": 1760 }, { "epoch": 0.1353718500307314, "grad_norm": 4.126910209655762, "learning_rate": 1.9458512599877074e-05, "loss": 1.6807, "step": 1762 }, { "epoch": 0.13552550706822372, "grad_norm": 4.063604831695557, "learning_rate": 1.9457897971727108e-05, "loss": 1.5294, "step": 1764 }, { "epoch": 0.13567916410571604, "grad_norm": 4.1347150802612305, "learning_rate": 1.9457283343577137e-05, "loss": 1.5728, "step": 1766 }, { "epoch": 0.13583282114320835, "grad_norm": 4.593793869018555, "learning_rate": 1.9456668715427167e-05, "loss": 1.7155, "step": 1768 }, { "epoch": 0.13598647818070067, "grad_norm": 4.340649127960205, "learning_rate": 1.94560540872772e-05, "loss": 1.6996, "step": 1770 }, { "epoch": 0.136140135218193, "grad_norm": 4.278517246246338, "learning_rate": 1.945543945912723e-05, "loss": 1.6012, "step": 1772 }, { "epoch": 0.1362937922556853, "grad_norm": 4.626030445098877, "learning_rate": 1.9454824830977263e-05, "loss": 1.6195, "step": 1774 }, { "epoch": 0.13644744929317762, "grad_norm": 4.450915813446045, "learning_rate": 1.945421020282729e-05, "loss": 1.6398, "step": 1776 }, { "epoch": 0.13660110633066994, "grad_norm": 4.265727996826172, "learning_rate": 1.9453595574677322e-05, "loss": 1.5958, "step": 1778 }, { "epoch": 0.13675476336816225, "grad_norm": 4.036159038543701, "learning_rate": 1.9452980946527352e-05, "loss": 1.5647, "step": 1780 }, { "epoch": 0.13690842040565457, "grad_norm": 4.2282257080078125, "learning_rate": 1.945236631837738e-05, "loss": 1.6079, "step": 1782 }, { "epoch": 0.13706207744314688, "grad_norm": 4.005040645599365, "learning_rate": 1.9451751690227415e-05, "loss": 1.5044, "step": 1784 }, { "epoch": 0.1372157344806392, "grad_norm": 4.676270484924316, "learning_rate": 1.9451137062077444e-05, "loss": 1.6304, "step": 1786 }, { "epoch": 0.13736939151813152, "grad_norm": 4.598161697387695, "learning_rate": 1.9450522433927474e-05, "loss": 1.6887, "step": 1788 }, { "epoch": 0.13752304855562386, "grad_norm": 5.0116448402404785, "learning_rate": 1.9449907805777507e-05, "loss": 1.6998, "step": 1790 }, { "epoch": 0.13767670559311618, "grad_norm": 4.892838954925537, "learning_rate": 1.9449293177627537e-05, "loss": 1.6274, "step": 1792 }, { "epoch": 0.1378303626306085, "grad_norm": 5.293637752532959, "learning_rate": 1.944867854947757e-05, "loss": 1.628, "step": 1794 }, { "epoch": 0.1379840196681008, "grad_norm": 4.583549976348877, "learning_rate": 1.94480639213276e-05, "loss": 1.4792, "step": 1796 }, { "epoch": 0.13813767670559313, "grad_norm": 3.773277759552002, "learning_rate": 1.944744929317763e-05, "loss": 1.5219, "step": 1798 }, { "epoch": 0.13829133374308544, "grad_norm": 4.440420150756836, "learning_rate": 1.9446834665027662e-05, "loss": 1.6732, "step": 1800 }, { "epoch": 0.13844499078057776, "grad_norm": 4.711763858795166, "learning_rate": 1.944622003687769e-05, "loss": 1.5463, "step": 1802 }, { "epoch": 0.13859864781807008, "grad_norm": 5.035058498382568, "learning_rate": 1.944560540872772e-05, "loss": 1.6961, "step": 1804 }, { "epoch": 0.1387523048555624, "grad_norm": 3.963282346725464, "learning_rate": 1.944499078057775e-05, "loss": 1.5568, "step": 1806 }, { "epoch": 0.1389059618930547, "grad_norm": 4.577483654022217, "learning_rate": 1.944437615242778e-05, "loss": 1.5428, "step": 1808 }, { "epoch": 0.13905961893054702, "grad_norm": 4.509146690368652, "learning_rate": 1.9443761524277814e-05, "loss": 1.6397, "step": 1810 }, { "epoch": 0.13921327596803934, "grad_norm": 4.317050933837891, "learning_rate": 1.9443146896127844e-05, "loss": 1.7304, "step": 1812 }, { "epoch": 0.13936693300553166, "grad_norm": 4.572277069091797, "learning_rate": 1.9442532267977877e-05, "loss": 1.6759, "step": 1814 }, { "epoch": 0.13952059004302397, "grad_norm": 4.773606777191162, "learning_rate": 1.9441917639827906e-05, "loss": 1.6869, "step": 1816 }, { "epoch": 0.1396742470805163, "grad_norm": 4.57815408706665, "learning_rate": 1.9441303011677936e-05, "loss": 1.647, "step": 1818 }, { "epoch": 0.1398279041180086, "grad_norm": 4.822877407073975, "learning_rate": 1.944068838352797e-05, "loss": 1.6182, "step": 1820 }, { "epoch": 0.13998156115550092, "grad_norm": 4.272431373596191, "learning_rate": 1.9440073755378e-05, "loss": 1.6215, "step": 1822 }, { "epoch": 0.14013521819299324, "grad_norm": 4.476557731628418, "learning_rate": 1.943945912722803e-05, "loss": 1.6277, "step": 1824 }, { "epoch": 0.14028887523048555, "grad_norm": 4.522927284240723, "learning_rate": 1.943884449907806e-05, "loss": 1.526, "step": 1826 }, { "epoch": 0.14044253226797787, "grad_norm": 3.991070032119751, "learning_rate": 1.9438229870928088e-05, "loss": 1.6106, "step": 1828 }, { "epoch": 0.1405961893054702, "grad_norm": 4.189483165740967, "learning_rate": 1.943761524277812e-05, "loss": 1.619, "step": 1830 }, { "epoch": 0.1407498463429625, "grad_norm": 4.5693159103393555, "learning_rate": 1.943700061462815e-05, "loss": 1.7438, "step": 1832 }, { "epoch": 0.14090350338045482, "grad_norm": 3.8766119480133057, "learning_rate": 1.943638598647818e-05, "loss": 1.4953, "step": 1834 }, { "epoch": 0.14105716041794714, "grad_norm": 4.294021129608154, "learning_rate": 1.9435771358328213e-05, "loss": 1.5746, "step": 1836 }, { "epoch": 0.14121081745543945, "grad_norm": 4.195743083953857, "learning_rate": 1.9435156730178243e-05, "loss": 1.5049, "step": 1838 }, { "epoch": 0.14136447449293177, "grad_norm": 4.331358909606934, "learning_rate": 1.9434542102028276e-05, "loss": 1.8416, "step": 1840 }, { "epoch": 0.14151813153042408, "grad_norm": 4.328099727630615, "learning_rate": 1.9433927473878306e-05, "loss": 1.7201, "step": 1842 }, { "epoch": 0.1416717885679164, "grad_norm": 4.2462005615234375, "learning_rate": 1.9433312845728336e-05, "loss": 1.6308, "step": 1844 }, { "epoch": 0.14182544560540872, "grad_norm": 4.253352165222168, "learning_rate": 1.943269821757837e-05, "loss": 1.6856, "step": 1846 }, { "epoch": 0.14197910264290103, "grad_norm": 4.154186248779297, "learning_rate": 1.9432083589428395e-05, "loss": 1.6079, "step": 1848 }, { "epoch": 0.14213275968039335, "grad_norm": 6.227648735046387, "learning_rate": 1.9431468961278428e-05, "loss": 1.73, "step": 1850 }, { "epoch": 0.14228641671788567, "grad_norm": 4.038461208343506, "learning_rate": 1.943085433312846e-05, "loss": 1.7227, "step": 1852 }, { "epoch": 0.14244007375537798, "grad_norm": 4.844911098480225, "learning_rate": 1.9430239704978487e-05, "loss": 1.5841, "step": 1854 }, { "epoch": 0.14259373079287033, "grad_norm": 3.845120429992676, "learning_rate": 1.942962507682852e-05, "loss": 1.6545, "step": 1856 }, { "epoch": 0.14274738783036264, "grad_norm": 4.25357723236084, "learning_rate": 1.942901044867855e-05, "loss": 1.6038, "step": 1858 }, { "epoch": 0.14290104486785496, "grad_norm": 4.518612861633301, "learning_rate": 1.9428395820528583e-05, "loss": 1.703, "step": 1860 }, { "epoch": 0.14305470190534728, "grad_norm": 4.541075229644775, "learning_rate": 1.9427781192378613e-05, "loss": 1.6203, "step": 1862 }, { "epoch": 0.1432083589428396, "grad_norm": 4.06412935256958, "learning_rate": 1.9427166564228643e-05, "loss": 1.714, "step": 1864 }, { "epoch": 0.1433620159803319, "grad_norm": 4.289870738983154, "learning_rate": 1.9426551936078676e-05, "loss": 1.5524, "step": 1866 }, { "epoch": 0.14351567301782422, "grad_norm": 3.937469005584717, "learning_rate": 1.9425937307928705e-05, "loss": 1.5572, "step": 1868 }, { "epoch": 0.14366933005531654, "grad_norm": 4.361362457275391, "learning_rate": 1.9425322679778735e-05, "loss": 1.493, "step": 1870 }, { "epoch": 0.14382298709280886, "grad_norm": 3.9257559776306152, "learning_rate": 1.9424708051628768e-05, "loss": 1.502, "step": 1872 }, { "epoch": 0.14397664413030117, "grad_norm": 4.2765655517578125, "learning_rate": 1.9424093423478794e-05, "loss": 1.6001, "step": 1874 }, { "epoch": 0.1441303011677935, "grad_norm": 3.724155902862549, "learning_rate": 1.9423478795328827e-05, "loss": 1.5722, "step": 1876 }, { "epoch": 0.1442839582052858, "grad_norm": 4.135402679443359, "learning_rate": 1.9422864167178857e-05, "loss": 1.7137, "step": 1878 }, { "epoch": 0.14443761524277812, "grad_norm": 4.522433280944824, "learning_rate": 1.942224953902889e-05, "loss": 1.6161, "step": 1880 }, { "epoch": 0.14459127228027044, "grad_norm": 4.247946262359619, "learning_rate": 1.942163491087892e-05, "loss": 1.67, "step": 1882 }, { "epoch": 0.14474492931776275, "grad_norm": 4.322160243988037, "learning_rate": 1.942102028272895e-05, "loss": 1.5362, "step": 1884 }, { "epoch": 0.14489858635525507, "grad_norm": 4.25150728225708, "learning_rate": 1.9420405654578983e-05, "loss": 1.6026, "step": 1886 }, { "epoch": 0.1450522433927474, "grad_norm": 4.95831823348999, "learning_rate": 1.9419791026429012e-05, "loss": 1.6806, "step": 1888 }, { "epoch": 0.1452059004302397, "grad_norm": 4.125936031341553, "learning_rate": 1.9419176398279042e-05, "loss": 1.634, "step": 1890 }, { "epoch": 0.14535955746773202, "grad_norm": 3.6493210792541504, "learning_rate": 1.9418561770129075e-05, "loss": 1.4507, "step": 1892 }, { "epoch": 0.14551321450522434, "grad_norm": 4.338488578796387, "learning_rate": 1.9417947141979105e-05, "loss": 1.5127, "step": 1894 }, { "epoch": 0.14566687154271665, "grad_norm": 4.250901222229004, "learning_rate": 1.9417332513829134e-05, "loss": 1.4201, "step": 1896 }, { "epoch": 0.14582052858020897, "grad_norm": 3.8600480556488037, "learning_rate": 1.9416717885679167e-05, "loss": 1.5595, "step": 1898 }, { "epoch": 0.14597418561770129, "grad_norm": 4.317285537719727, "learning_rate": 1.9416103257529197e-05, "loss": 1.6094, "step": 1900 }, { "epoch": 0.1461278426551936, "grad_norm": 4.718072891235352, "learning_rate": 1.9415488629379227e-05, "loss": 1.768, "step": 1902 }, { "epoch": 0.14628149969268592, "grad_norm": 4.9370808601379395, "learning_rate": 1.9414874001229257e-05, "loss": 1.5801, "step": 1904 }, { "epoch": 0.14643515673017823, "grad_norm": 4.436810493469238, "learning_rate": 1.941425937307929e-05, "loss": 1.5847, "step": 1906 }, { "epoch": 0.14658881376767055, "grad_norm": 4.890700817108154, "learning_rate": 1.941364474492932e-05, "loss": 1.5092, "step": 1908 }, { "epoch": 0.14674247080516287, "grad_norm": 3.926815986633301, "learning_rate": 1.941303011677935e-05, "loss": 1.6612, "step": 1910 }, { "epoch": 0.14689612784265518, "grad_norm": 4.331315994262695, "learning_rate": 1.9412415488629382e-05, "loss": 1.7669, "step": 1912 }, { "epoch": 0.1470497848801475, "grad_norm": 5.178247928619385, "learning_rate": 1.9411800860479412e-05, "loss": 1.6749, "step": 1914 }, { "epoch": 0.14720344191763982, "grad_norm": 3.871377944946289, "learning_rate": 1.941118623232944e-05, "loss": 1.4508, "step": 1916 }, { "epoch": 0.14735709895513213, "grad_norm": 4.062928676605225, "learning_rate": 1.9410571604179474e-05, "loss": 1.458, "step": 1918 }, { "epoch": 0.14751075599262448, "grad_norm": 4.205310344696045, "learning_rate": 1.9409956976029504e-05, "loss": 1.6419, "step": 1920 }, { "epoch": 0.1476644130301168, "grad_norm": 3.822014093399048, "learning_rate": 1.9409342347879534e-05, "loss": 1.558, "step": 1922 }, { "epoch": 0.1478180700676091, "grad_norm": 4.9377593994140625, "learning_rate": 1.9408727719729567e-05, "loss": 1.687, "step": 1924 }, { "epoch": 0.14797172710510142, "grad_norm": 4.531102657318115, "learning_rate": 1.9408113091579597e-05, "loss": 1.7183, "step": 1926 }, { "epoch": 0.14812538414259374, "grad_norm": 4.2398881912231445, "learning_rate": 1.9407498463429626e-05, "loss": 1.5952, "step": 1928 }, { "epoch": 0.14827904118008606, "grad_norm": 4.426001071929932, "learning_rate": 1.9406883835279656e-05, "loss": 1.7844, "step": 1930 }, { "epoch": 0.14843269821757837, "grad_norm": 4.2123494148254395, "learning_rate": 1.940626920712969e-05, "loss": 1.6711, "step": 1932 }, { "epoch": 0.1485863552550707, "grad_norm": 4.681150913238525, "learning_rate": 1.940565457897972e-05, "loss": 1.6319, "step": 1934 }, { "epoch": 0.148740012292563, "grad_norm": 4.499131202697754, "learning_rate": 1.940503995082975e-05, "loss": 1.6108, "step": 1936 }, { "epoch": 0.14889366933005532, "grad_norm": 5.171452522277832, "learning_rate": 1.940442532267978e-05, "loss": 1.7528, "step": 1938 }, { "epoch": 0.14904732636754764, "grad_norm": 4.263631343841553, "learning_rate": 1.940381069452981e-05, "loss": 1.5191, "step": 1940 }, { "epoch": 0.14920098340503996, "grad_norm": 5.029245853424072, "learning_rate": 1.940319606637984e-05, "loss": 1.5623, "step": 1942 }, { "epoch": 0.14935464044253227, "grad_norm": 4.505579948425293, "learning_rate": 1.9402581438229874e-05, "loss": 1.4555, "step": 1944 }, { "epoch": 0.1495082974800246, "grad_norm": 4.269020080566406, "learning_rate": 1.9401966810079904e-05, "loss": 1.5765, "step": 1946 }, { "epoch": 0.1496619545175169, "grad_norm": 3.9689948558807373, "learning_rate": 1.9401352181929933e-05, "loss": 1.6277, "step": 1948 }, { "epoch": 0.14981561155500922, "grad_norm": 3.684130907058716, "learning_rate": 1.9400737553779966e-05, "loss": 1.5172, "step": 1950 }, { "epoch": 0.14996926859250154, "grad_norm": 3.713602066040039, "learning_rate": 1.9400122925629996e-05, "loss": 1.5211, "step": 1952 }, { "epoch": 0.15012292562999385, "grad_norm": 4.927131175994873, "learning_rate": 1.9399508297480026e-05, "loss": 1.6324, "step": 1954 }, { "epoch": 0.15027658266748617, "grad_norm": 4.4799299240112305, "learning_rate": 1.9398893669330055e-05, "loss": 1.5679, "step": 1956 }, { "epoch": 0.15043023970497849, "grad_norm": 4.372833251953125, "learning_rate": 1.939827904118009e-05, "loss": 1.5638, "step": 1958 }, { "epoch": 0.1505838967424708, "grad_norm": 4.321191787719727, "learning_rate": 1.9397664413030118e-05, "loss": 1.4329, "step": 1960 }, { "epoch": 0.15073755377996312, "grad_norm": 4.75023078918457, "learning_rate": 1.9397049784880148e-05, "loss": 1.6682, "step": 1962 }, { "epoch": 0.15089121081745543, "grad_norm": 4.172933101654053, "learning_rate": 1.939643515673018e-05, "loss": 1.7316, "step": 1964 }, { "epoch": 0.15104486785494775, "grad_norm": 4.329415321350098, "learning_rate": 1.939582052858021e-05, "loss": 1.7017, "step": 1966 }, { "epoch": 0.15119852489244007, "grad_norm": 4.249721527099609, "learning_rate": 1.939520590043024e-05, "loss": 1.5796, "step": 1968 }, { "epoch": 0.15135218192993238, "grad_norm": 4.071712970733643, "learning_rate": 1.9394591272280273e-05, "loss": 1.4623, "step": 1970 }, { "epoch": 0.1515058389674247, "grad_norm": 4.0507731437683105, "learning_rate": 1.9393976644130303e-05, "loss": 1.4821, "step": 1972 }, { "epoch": 0.15165949600491702, "grad_norm": 4.356963634490967, "learning_rate": 1.9393362015980333e-05, "loss": 1.547, "step": 1974 }, { "epoch": 0.15181315304240933, "grad_norm": 5.182737350463867, "learning_rate": 1.9392747387830362e-05, "loss": 1.6367, "step": 1976 }, { "epoch": 0.15196681007990165, "grad_norm": 3.9492363929748535, "learning_rate": 1.9392132759680395e-05, "loss": 1.679, "step": 1978 }, { "epoch": 0.15212046711739396, "grad_norm": 4.404160976409912, "learning_rate": 1.9391518131530425e-05, "loss": 1.5288, "step": 1980 }, { "epoch": 0.15227412415488628, "grad_norm": 4.500973701477051, "learning_rate": 1.9390903503380455e-05, "loss": 1.7715, "step": 1982 }, { "epoch": 0.1524277811923786, "grad_norm": 4.121068954467773, "learning_rate": 1.9390288875230488e-05, "loss": 1.5507, "step": 1984 }, { "epoch": 0.15258143822987094, "grad_norm": 3.7095515727996826, "learning_rate": 1.9389674247080518e-05, "loss": 1.616, "step": 1986 }, { "epoch": 0.15273509526736326, "grad_norm": 5.333407878875732, "learning_rate": 1.9389059618930547e-05, "loss": 1.7196, "step": 1988 }, { "epoch": 0.15288875230485557, "grad_norm": 4.188971042633057, "learning_rate": 1.938844499078058e-05, "loss": 1.6256, "step": 1990 }, { "epoch": 0.1530424093423479, "grad_norm": 4.126604080200195, "learning_rate": 1.938783036263061e-05, "loss": 1.5089, "step": 1992 }, { "epoch": 0.1531960663798402, "grad_norm": 4.127197742462158, "learning_rate": 1.938721573448064e-05, "loss": 1.5598, "step": 1994 }, { "epoch": 0.15334972341733252, "grad_norm": 4.481958389282227, "learning_rate": 1.9386601106330673e-05, "loss": 1.6238, "step": 1996 }, { "epoch": 0.15350338045482484, "grad_norm": 4.15784215927124, "learning_rate": 1.9385986478180702e-05, "loss": 1.6611, "step": 1998 }, { "epoch": 0.15365703749231716, "grad_norm": 4.322861194610596, "learning_rate": 1.9385371850030732e-05, "loss": 1.513, "step": 2000 }, { "epoch": 0.15381069452980947, "grad_norm": 3.9926345348358154, "learning_rate": 1.9384757221880762e-05, "loss": 1.5764, "step": 2002 }, { "epoch": 0.1539643515673018, "grad_norm": 5.112368583679199, "learning_rate": 1.9384142593730795e-05, "loss": 1.6181, "step": 2004 }, { "epoch": 0.1541180086047941, "grad_norm": 3.6655466556549072, "learning_rate": 1.9383527965580825e-05, "loss": 1.3721, "step": 2006 }, { "epoch": 0.15427166564228642, "grad_norm": 4.533908367156982, "learning_rate": 1.9382913337430854e-05, "loss": 1.5841, "step": 2008 }, { "epoch": 0.15442532267977874, "grad_norm": 3.722304344177246, "learning_rate": 1.9382298709280887e-05, "loss": 1.5965, "step": 2010 }, { "epoch": 0.15457897971727105, "grad_norm": 4.5336012840271, "learning_rate": 1.9381684081130917e-05, "loss": 1.5995, "step": 2012 }, { "epoch": 0.15473263675476337, "grad_norm": 4.144698619842529, "learning_rate": 1.9381069452980947e-05, "loss": 1.6343, "step": 2014 }, { "epoch": 0.15488629379225569, "grad_norm": 4.033304691314697, "learning_rate": 1.938045482483098e-05, "loss": 1.6888, "step": 2016 }, { "epoch": 0.155039950829748, "grad_norm": 4.00151252746582, "learning_rate": 1.937984019668101e-05, "loss": 1.5246, "step": 2018 }, { "epoch": 0.15519360786724032, "grad_norm": 4.766987323760986, "learning_rate": 1.937922556853104e-05, "loss": 1.7345, "step": 2020 }, { "epoch": 0.15534726490473263, "grad_norm": 4.144290924072266, "learning_rate": 1.9378610940381072e-05, "loss": 1.5848, "step": 2022 }, { "epoch": 0.15550092194222495, "grad_norm": 4.038874626159668, "learning_rate": 1.9377996312231102e-05, "loss": 1.6425, "step": 2024 }, { "epoch": 0.15565457897971727, "grad_norm": 3.832429885864258, "learning_rate": 1.9377381684081135e-05, "loss": 1.5454, "step": 2026 }, { "epoch": 0.15580823601720958, "grad_norm": 4.775087833404541, "learning_rate": 1.937676705593116e-05, "loss": 1.6767, "step": 2028 }, { "epoch": 0.1559618930547019, "grad_norm": 3.997192144393921, "learning_rate": 1.9376152427781194e-05, "loss": 1.5539, "step": 2030 }, { "epoch": 0.15611555009219422, "grad_norm": 3.764519453048706, "learning_rate": 1.9375537799631224e-05, "loss": 1.46, "step": 2032 }, { "epoch": 0.15626920712968653, "grad_norm": 4.074234962463379, "learning_rate": 1.9374923171481254e-05, "loss": 1.5064, "step": 2034 }, { "epoch": 0.15642286416717885, "grad_norm": 4.302229881286621, "learning_rate": 1.9374308543331287e-05, "loss": 1.6276, "step": 2036 }, { "epoch": 0.15657652120467117, "grad_norm": 4.602327346801758, "learning_rate": 1.9373693915181316e-05, "loss": 1.5551, "step": 2038 }, { "epoch": 0.15673017824216348, "grad_norm": 4.131155490875244, "learning_rate": 1.9373079287031346e-05, "loss": 1.4554, "step": 2040 }, { "epoch": 0.1568838352796558, "grad_norm": 4.661727428436279, "learning_rate": 1.937246465888138e-05, "loss": 1.6566, "step": 2042 }, { "epoch": 0.15703749231714811, "grad_norm": 4.23723030090332, "learning_rate": 1.937185003073141e-05, "loss": 1.6038, "step": 2044 }, { "epoch": 0.15719114935464043, "grad_norm": 3.995077133178711, "learning_rate": 1.9371235402581442e-05, "loss": 1.7374, "step": 2046 }, { "epoch": 0.15734480639213275, "grad_norm": 4.001912593841553, "learning_rate": 1.937062077443147e-05, "loss": 1.5991, "step": 2048 }, { "epoch": 0.15749846342962506, "grad_norm": 4.558352470397949, "learning_rate": 1.93700061462815e-05, "loss": 1.6115, "step": 2050 }, { "epoch": 0.1576521204671174, "grad_norm": 4.651041030883789, "learning_rate": 1.9369391518131534e-05, "loss": 1.5451, "step": 2052 }, { "epoch": 0.15780577750460972, "grad_norm": 4.203875541687012, "learning_rate": 1.936877688998156e-05, "loss": 1.5506, "step": 2054 }, { "epoch": 0.15795943454210204, "grad_norm": 4.153205394744873, "learning_rate": 1.9368162261831594e-05, "loss": 1.5209, "step": 2056 }, { "epoch": 0.15811309157959436, "grad_norm": 4.187285900115967, "learning_rate": 1.9367547633681623e-05, "loss": 1.7677, "step": 2058 }, { "epoch": 0.15826674861708667, "grad_norm": 4.654025554656982, "learning_rate": 1.9366933005531653e-05, "loss": 1.63, "step": 2060 }, { "epoch": 0.158420405654579, "grad_norm": 4.145837306976318, "learning_rate": 1.9366318377381686e-05, "loss": 1.7693, "step": 2062 }, { "epoch": 0.1585740626920713, "grad_norm": 4.076268196105957, "learning_rate": 1.9365703749231716e-05, "loss": 1.5436, "step": 2064 }, { "epoch": 0.15872771972956362, "grad_norm": 3.9100687503814697, "learning_rate": 1.936508912108175e-05, "loss": 1.5738, "step": 2066 }, { "epoch": 0.15888137676705594, "grad_norm": 4.173727989196777, "learning_rate": 1.936447449293178e-05, "loss": 1.6155, "step": 2068 }, { "epoch": 0.15903503380454825, "grad_norm": 4.555833339691162, "learning_rate": 1.936385986478181e-05, "loss": 1.6879, "step": 2070 }, { "epoch": 0.15918869084204057, "grad_norm": 4.008725166320801, "learning_rate": 1.936324523663184e-05, "loss": 1.5752, "step": 2072 }, { "epoch": 0.1593423478795329, "grad_norm": 5.628279209136963, "learning_rate": 1.9362630608481868e-05, "loss": 1.6574, "step": 2074 }, { "epoch": 0.1594960049170252, "grad_norm": 4.112339496612549, "learning_rate": 1.93620159803319e-05, "loss": 1.6789, "step": 2076 }, { "epoch": 0.15964966195451752, "grad_norm": 4.378570556640625, "learning_rate": 1.9361401352181934e-05, "loss": 1.4837, "step": 2078 }, { "epoch": 0.15980331899200984, "grad_norm": 3.819582223892212, "learning_rate": 1.936078672403196e-05, "loss": 1.573, "step": 2080 }, { "epoch": 0.15995697602950215, "grad_norm": 4.303280830383301, "learning_rate": 1.9360172095881993e-05, "loss": 1.4278, "step": 2082 }, { "epoch": 0.16011063306699447, "grad_norm": 3.9183764457702637, "learning_rate": 1.9359557467732023e-05, "loss": 1.4808, "step": 2084 }, { "epoch": 0.16026429010448678, "grad_norm": 4.484659671783447, "learning_rate": 1.9358942839582053e-05, "loss": 1.557, "step": 2086 }, { "epoch": 0.1604179471419791, "grad_norm": 4.600409984588623, "learning_rate": 1.9358328211432086e-05, "loss": 1.5848, "step": 2088 }, { "epoch": 0.16057160417947142, "grad_norm": 3.4427108764648438, "learning_rate": 1.9357713583282115e-05, "loss": 1.5589, "step": 2090 }, { "epoch": 0.16072526121696373, "grad_norm": 7.8328375816345215, "learning_rate": 1.935709895513215e-05, "loss": 1.5519, "step": 2092 }, { "epoch": 0.16087891825445605, "grad_norm": 4.630655765533447, "learning_rate": 1.9356484326982178e-05, "loss": 1.6441, "step": 2094 }, { "epoch": 0.16103257529194837, "grad_norm": 4.217165946960449, "learning_rate": 1.9355869698832208e-05, "loss": 1.5559, "step": 2096 }, { "epoch": 0.16118623232944068, "grad_norm": 4.941442012786865, "learning_rate": 1.935525507068224e-05, "loss": 1.5065, "step": 2098 }, { "epoch": 0.161339889366933, "grad_norm": 4.171699523925781, "learning_rate": 1.9354640442532267e-05, "loss": 1.4993, "step": 2100 }, { "epoch": 0.16149354640442531, "grad_norm": 3.8235647678375244, "learning_rate": 1.93540258143823e-05, "loss": 1.6035, "step": 2102 }, { "epoch": 0.16164720344191763, "grad_norm": 5.202590465545654, "learning_rate": 1.935341118623233e-05, "loss": 1.6397, "step": 2104 }, { "epoch": 0.16180086047940995, "grad_norm": 3.8114516735076904, "learning_rate": 1.935279655808236e-05, "loss": 1.5103, "step": 2106 }, { "epoch": 0.16195451751690226, "grad_norm": 4.666793346405029, "learning_rate": 1.9352181929932393e-05, "loss": 1.5363, "step": 2108 }, { "epoch": 0.16210817455439458, "grad_norm": 4.18889856338501, "learning_rate": 1.9351567301782422e-05, "loss": 1.662, "step": 2110 }, { "epoch": 0.1622618315918869, "grad_norm": 4.392826557159424, "learning_rate": 1.9350952673632455e-05, "loss": 1.5492, "step": 2112 }, { "epoch": 0.1624154886293792, "grad_norm": 4.0653839111328125, "learning_rate": 1.9350338045482485e-05, "loss": 1.7512, "step": 2114 }, { "epoch": 0.16256914566687156, "grad_norm": 4.020386219024658, "learning_rate": 1.9349723417332515e-05, "loss": 1.7604, "step": 2116 }, { "epoch": 0.16272280270436387, "grad_norm": 4.062155723571777, "learning_rate": 1.9349108789182548e-05, "loss": 1.7181, "step": 2118 }, { "epoch": 0.1628764597418562, "grad_norm": 4.4019999504089355, "learning_rate": 1.9348494161032578e-05, "loss": 1.639, "step": 2120 }, { "epoch": 0.1630301167793485, "grad_norm": 3.761319875717163, "learning_rate": 1.9347879532882607e-05, "loss": 1.4084, "step": 2122 }, { "epoch": 0.16318377381684082, "grad_norm": 4.566369533538818, "learning_rate": 1.934726490473264e-05, "loss": 1.5193, "step": 2124 }, { "epoch": 0.16333743085433314, "grad_norm": 4.366701602935791, "learning_rate": 1.9346650276582667e-05, "loss": 1.5211, "step": 2126 }, { "epoch": 0.16349108789182545, "grad_norm": 4.0437116622924805, "learning_rate": 1.93460356484327e-05, "loss": 1.5901, "step": 2128 }, { "epoch": 0.16364474492931777, "grad_norm": 3.9914474487304688, "learning_rate": 1.934542102028273e-05, "loss": 1.4975, "step": 2130 }, { "epoch": 0.1637984019668101, "grad_norm": 4.267744541168213, "learning_rate": 1.9344806392132762e-05, "loss": 1.6574, "step": 2132 }, { "epoch": 0.1639520590043024, "grad_norm": 3.843414068222046, "learning_rate": 1.9344191763982792e-05, "loss": 1.4976, "step": 2134 }, { "epoch": 0.16410571604179472, "grad_norm": 4.3155975341796875, "learning_rate": 1.9343577135832822e-05, "loss": 1.5333, "step": 2136 }, { "epoch": 0.16425937307928704, "grad_norm": 4.159292697906494, "learning_rate": 1.9342962507682855e-05, "loss": 1.5053, "step": 2138 }, { "epoch": 0.16441303011677935, "grad_norm": 4.296607971191406, "learning_rate": 1.9342347879532885e-05, "loss": 1.6112, "step": 2140 }, { "epoch": 0.16456668715427167, "grad_norm": 3.9782469272613525, "learning_rate": 1.9341733251382914e-05, "loss": 1.5241, "step": 2142 }, { "epoch": 0.16472034419176398, "grad_norm": 4.183950901031494, "learning_rate": 1.9341118623232947e-05, "loss": 1.5901, "step": 2144 }, { "epoch": 0.1648740012292563, "grad_norm": 3.9188575744628906, "learning_rate": 1.9340503995082977e-05, "loss": 1.621, "step": 2146 }, { "epoch": 0.16502765826674862, "grad_norm": 4.474676132202148, "learning_rate": 1.9339889366933007e-05, "loss": 1.487, "step": 2148 }, { "epoch": 0.16518131530424093, "grad_norm": 4.6410017013549805, "learning_rate": 1.933927473878304e-05, "loss": 1.6997, "step": 2150 }, { "epoch": 0.16533497234173325, "grad_norm": 10.933167457580566, "learning_rate": 1.933866011063307e-05, "loss": 1.4752, "step": 2152 }, { "epoch": 0.16548862937922557, "grad_norm": 4.885360240936279, "learning_rate": 1.93380454824831e-05, "loss": 1.5604, "step": 2154 }, { "epoch": 0.16564228641671788, "grad_norm": 3.9420063495635986, "learning_rate": 1.933743085433313e-05, "loss": 1.5909, "step": 2156 }, { "epoch": 0.1657959434542102, "grad_norm": 3.5523521900177, "learning_rate": 1.9336816226183162e-05, "loss": 1.6274, "step": 2158 }, { "epoch": 0.16594960049170251, "grad_norm": 3.6621317863464355, "learning_rate": 1.933620159803319e-05, "loss": 1.5111, "step": 2160 }, { "epoch": 0.16610325752919483, "grad_norm": 4.57207727432251, "learning_rate": 1.933558696988322e-05, "loss": 1.6702, "step": 2162 }, { "epoch": 0.16625691456668715, "grad_norm": 3.9489428997039795, "learning_rate": 1.9334972341733254e-05, "loss": 1.589, "step": 2164 }, { "epoch": 0.16641057160417946, "grad_norm": 3.8509654998779297, "learning_rate": 1.9334357713583284e-05, "loss": 1.5522, "step": 2166 }, { "epoch": 0.16656422864167178, "grad_norm": 3.6893975734710693, "learning_rate": 1.9333743085433314e-05, "loss": 1.4767, "step": 2168 }, { "epoch": 0.1667178856791641, "grad_norm": 4.064876079559326, "learning_rate": 1.9333128457283347e-05, "loss": 1.5515, "step": 2170 }, { "epoch": 0.1668715427166564, "grad_norm": 4.427343845367432, "learning_rate": 1.9332513829133376e-05, "loss": 1.7098, "step": 2172 }, { "epoch": 0.16702519975414873, "grad_norm": 3.724740505218506, "learning_rate": 1.9331899200983406e-05, "loss": 1.5612, "step": 2174 }, { "epoch": 0.16717885679164105, "grad_norm": 4.302028656005859, "learning_rate": 1.933128457283344e-05, "loss": 1.641, "step": 2176 }, { "epoch": 0.16733251382913336, "grad_norm": 4.149264335632324, "learning_rate": 1.933066994468347e-05, "loss": 1.6923, "step": 2178 }, { "epoch": 0.16748617086662568, "grad_norm": 4.091360092163086, "learning_rate": 1.93300553165335e-05, "loss": 1.4934, "step": 2180 }, { "epoch": 0.16763982790411802, "grad_norm": 4.653087139129639, "learning_rate": 1.9329440688383528e-05, "loss": 1.6434, "step": 2182 }, { "epoch": 0.16779348494161034, "grad_norm": 4.141650199890137, "learning_rate": 1.932882606023356e-05, "loss": 1.5228, "step": 2184 }, { "epoch": 0.16794714197910265, "grad_norm": 4.257270336151123, "learning_rate": 1.932821143208359e-05, "loss": 1.7463, "step": 2186 }, { "epoch": 0.16810079901659497, "grad_norm": 4.1163554191589355, "learning_rate": 1.932759680393362e-05, "loss": 1.4543, "step": 2188 }, { "epoch": 0.1682544560540873, "grad_norm": 3.6683640480041504, "learning_rate": 1.9326982175783654e-05, "loss": 1.4778, "step": 2190 }, { "epoch": 0.1684081130915796, "grad_norm": 3.734006881713867, "learning_rate": 1.9326367547633683e-05, "loss": 1.5738, "step": 2192 }, { "epoch": 0.16856177012907192, "grad_norm": 4.454776287078857, "learning_rate": 1.9325752919483713e-05, "loss": 1.6569, "step": 2194 }, { "epoch": 0.16871542716656424, "grad_norm": 4.1497883796691895, "learning_rate": 1.9325138291333746e-05, "loss": 1.5172, "step": 2196 }, { "epoch": 0.16886908420405655, "grad_norm": 4.288064479827881, "learning_rate": 1.9324523663183776e-05, "loss": 1.6416, "step": 2198 }, { "epoch": 0.16902274124154887, "grad_norm": 3.463115930557251, "learning_rate": 1.9323909035033806e-05, "loss": 1.4196, "step": 2200 }, { "epoch": 0.16917639827904118, "grad_norm": 5.139834403991699, "learning_rate": 1.9323294406883835e-05, "loss": 1.6058, "step": 2202 }, { "epoch": 0.1693300553165335, "grad_norm": 4.1170806884765625, "learning_rate": 1.9322679778733868e-05, "loss": 1.4493, "step": 2204 }, { "epoch": 0.16948371235402582, "grad_norm": 3.8090291023254395, "learning_rate": 1.9322065150583898e-05, "loss": 1.6205, "step": 2206 }, { "epoch": 0.16963736939151813, "grad_norm": 3.461530923843384, "learning_rate": 1.9321450522433928e-05, "loss": 1.4338, "step": 2208 }, { "epoch": 0.16979102642901045, "grad_norm": 4.355661392211914, "learning_rate": 1.932083589428396e-05, "loss": 1.6044, "step": 2210 }, { "epoch": 0.16994468346650277, "grad_norm": 3.4141671657562256, "learning_rate": 1.932022126613399e-05, "loss": 1.5096, "step": 2212 }, { "epoch": 0.17009834050399508, "grad_norm": 4.202045917510986, "learning_rate": 1.931960663798402e-05, "loss": 1.7529, "step": 2214 }, { "epoch": 0.1702519975414874, "grad_norm": 3.8714635372161865, "learning_rate": 1.9318992009834053e-05, "loss": 1.5085, "step": 2216 }, { "epoch": 0.17040565457897972, "grad_norm": 5.047643184661865, "learning_rate": 1.9318377381684083e-05, "loss": 1.6937, "step": 2218 }, { "epoch": 0.17055931161647203, "grad_norm": 4.094550132751465, "learning_rate": 1.9317762753534113e-05, "loss": 1.7053, "step": 2220 }, { "epoch": 0.17071296865396435, "grad_norm": 3.65474796295166, "learning_rate": 1.9317148125384146e-05, "loss": 1.5531, "step": 2222 }, { "epoch": 0.17086662569145666, "grad_norm": 4.38794469833374, "learning_rate": 1.9316533497234175e-05, "loss": 1.7151, "step": 2224 }, { "epoch": 0.17102028272894898, "grad_norm": 4.336912631988525, "learning_rate": 1.9315918869084205e-05, "loss": 1.4871, "step": 2226 }, { "epoch": 0.1711739397664413, "grad_norm": 3.9137933254241943, "learning_rate": 1.9315304240934235e-05, "loss": 1.3785, "step": 2228 }, { "epoch": 0.1713275968039336, "grad_norm": 4.241963863372803, "learning_rate": 1.9314689612784268e-05, "loss": 1.482, "step": 2230 }, { "epoch": 0.17148125384142593, "grad_norm": 4.047958850860596, "learning_rate": 1.9314074984634297e-05, "loss": 1.7098, "step": 2232 }, { "epoch": 0.17163491087891825, "grad_norm": 4.688525676727295, "learning_rate": 1.9313460356484327e-05, "loss": 1.5618, "step": 2234 }, { "epoch": 0.17178856791641056, "grad_norm": 4.020751953125, "learning_rate": 1.931284572833436e-05, "loss": 1.5526, "step": 2236 }, { "epoch": 0.17194222495390288, "grad_norm": 3.93445086479187, "learning_rate": 1.931223110018439e-05, "loss": 1.6595, "step": 2238 }, { "epoch": 0.1720958819913952, "grad_norm": 5.324620723724365, "learning_rate": 1.931161647203442e-05, "loss": 1.657, "step": 2240 }, { "epoch": 0.1722495390288875, "grad_norm": 3.6193902492523193, "learning_rate": 1.9311001843884453e-05, "loss": 1.5, "step": 2242 }, { "epoch": 0.17240319606637983, "grad_norm": 4.382716178894043, "learning_rate": 1.9310387215734482e-05, "loss": 1.6785, "step": 2244 }, { "epoch": 0.17255685310387217, "grad_norm": 3.7359304428100586, "learning_rate": 1.9309772587584512e-05, "loss": 1.5003, "step": 2246 }, { "epoch": 0.1727105101413645, "grad_norm": 4.570140838623047, "learning_rate": 1.9309157959434545e-05, "loss": 1.6353, "step": 2248 }, { "epoch": 0.1728641671788568, "grad_norm": 4.809631824493408, "learning_rate": 1.9308543331284575e-05, "loss": 1.5898, "step": 2250 }, { "epoch": 0.17301782421634912, "grad_norm": 4.994627475738525, "learning_rate": 1.9307928703134604e-05, "loss": 1.6614, "step": 2252 }, { "epoch": 0.17317148125384144, "grad_norm": 4.121060371398926, "learning_rate": 1.9307314074984634e-05, "loss": 1.6817, "step": 2254 }, { "epoch": 0.17332513829133375, "grad_norm": 4.009014129638672, "learning_rate": 1.9306699446834667e-05, "loss": 1.5645, "step": 2256 }, { "epoch": 0.17347879532882607, "grad_norm": 4.27223539352417, "learning_rate": 1.9306084818684697e-05, "loss": 1.6764, "step": 2258 }, { "epoch": 0.17363245236631838, "grad_norm": 4.074213027954102, "learning_rate": 1.9305470190534727e-05, "loss": 1.6095, "step": 2260 }, { "epoch": 0.1737861094038107, "grad_norm": 3.6030173301696777, "learning_rate": 1.930485556238476e-05, "loss": 1.5907, "step": 2262 }, { "epoch": 0.17393976644130302, "grad_norm": 4.34961462020874, "learning_rate": 1.930424093423479e-05, "loss": 1.611, "step": 2264 }, { "epoch": 0.17409342347879533, "grad_norm": 3.9723429679870605, "learning_rate": 1.930362630608482e-05, "loss": 1.5256, "step": 2266 }, { "epoch": 0.17424708051628765, "grad_norm": 3.7899746894836426, "learning_rate": 1.9303011677934852e-05, "loss": 1.6923, "step": 2268 }, { "epoch": 0.17440073755377997, "grad_norm": 4.415828227996826, "learning_rate": 1.9302397049784882e-05, "loss": 1.4681, "step": 2270 }, { "epoch": 0.17455439459127228, "grad_norm": 5.616640567779541, "learning_rate": 1.930178242163491e-05, "loss": 1.6252, "step": 2272 }, { "epoch": 0.1747080516287646, "grad_norm": 3.5017950534820557, "learning_rate": 1.9301167793484944e-05, "loss": 1.4621, "step": 2274 }, { "epoch": 0.17486170866625692, "grad_norm": 4.120169639587402, "learning_rate": 1.9300553165334974e-05, "loss": 1.5537, "step": 2276 }, { "epoch": 0.17501536570374923, "grad_norm": 4.489522457122803, "learning_rate": 1.9299938537185007e-05, "loss": 1.6915, "step": 2278 }, { "epoch": 0.17516902274124155, "grad_norm": 4.285830974578857, "learning_rate": 1.9299323909035034e-05, "loss": 1.5101, "step": 2280 }, { "epoch": 0.17532267977873386, "grad_norm": 3.9038150310516357, "learning_rate": 1.9298709280885067e-05, "loss": 1.6067, "step": 2282 }, { "epoch": 0.17547633681622618, "grad_norm": 3.8521156311035156, "learning_rate": 1.9298094652735096e-05, "loss": 1.4494, "step": 2284 }, { "epoch": 0.1756299938537185, "grad_norm": 7.329223155975342, "learning_rate": 1.9297480024585126e-05, "loss": 1.5588, "step": 2286 }, { "epoch": 0.1757836508912108, "grad_norm": 3.97939395904541, "learning_rate": 1.929686539643516e-05, "loss": 1.4802, "step": 2288 }, { "epoch": 0.17593730792870313, "grad_norm": 3.464115858078003, "learning_rate": 1.929625076828519e-05, "loss": 1.4913, "step": 2290 }, { "epoch": 0.17609096496619545, "grad_norm": 4.677506446838379, "learning_rate": 1.929563614013522e-05, "loss": 1.5254, "step": 2292 }, { "epoch": 0.17624462200368776, "grad_norm": 3.7886929512023926, "learning_rate": 1.929502151198525e-05, "loss": 1.6297, "step": 2294 }, { "epoch": 0.17639827904118008, "grad_norm": 3.5035488605499268, "learning_rate": 1.929440688383528e-05, "loss": 1.657, "step": 2296 }, { "epoch": 0.1765519360786724, "grad_norm": 4.172173976898193, "learning_rate": 1.9293792255685314e-05, "loss": 1.5612, "step": 2298 }, { "epoch": 0.1767055931161647, "grad_norm": 3.9481425285339355, "learning_rate": 1.929317762753534e-05, "loss": 1.4419, "step": 2300 }, { "epoch": 0.17685925015365703, "grad_norm": 3.922159433364868, "learning_rate": 1.9292562999385374e-05, "loss": 1.5459, "step": 2302 }, { "epoch": 0.17701290719114934, "grad_norm": 4.2247233390808105, "learning_rate": 1.9291948371235403e-05, "loss": 1.6869, "step": 2304 }, { "epoch": 0.17716656422864166, "grad_norm": 4.960201740264893, "learning_rate": 1.9291333743085433e-05, "loss": 1.4055, "step": 2306 }, { "epoch": 0.17732022126613398, "grad_norm": 4.675178527832031, "learning_rate": 1.9290719114935466e-05, "loss": 1.4635, "step": 2308 }, { "epoch": 0.1774738783036263, "grad_norm": 4.3724589347839355, "learning_rate": 1.9290104486785496e-05, "loss": 1.4423, "step": 2310 }, { "epoch": 0.17762753534111864, "grad_norm": 4.629543304443359, "learning_rate": 1.9289489858635525e-05, "loss": 1.6696, "step": 2312 }, { "epoch": 0.17778119237861095, "grad_norm": 3.8183395862579346, "learning_rate": 1.928887523048556e-05, "loss": 1.5558, "step": 2314 }, { "epoch": 0.17793484941610327, "grad_norm": 3.7984275817871094, "learning_rate": 1.9288260602335588e-05, "loss": 1.5884, "step": 2316 }, { "epoch": 0.17808850645359559, "grad_norm": 3.9068145751953125, "learning_rate": 1.928764597418562e-05, "loss": 1.6217, "step": 2318 }, { "epoch": 0.1782421634910879, "grad_norm": 4.159458160400391, "learning_rate": 1.928703134603565e-05, "loss": 1.5934, "step": 2320 }, { "epoch": 0.17839582052858022, "grad_norm": 4.013321876525879, "learning_rate": 1.928641671788568e-05, "loss": 1.5542, "step": 2322 }, { "epoch": 0.17854947756607253, "grad_norm": 4.504942893981934, "learning_rate": 1.9285802089735714e-05, "loss": 1.7811, "step": 2324 }, { "epoch": 0.17870313460356485, "grad_norm": 4.721269130706787, "learning_rate": 1.928518746158574e-05, "loss": 1.4974, "step": 2326 }, { "epoch": 0.17885679164105717, "grad_norm": 3.662440776824951, "learning_rate": 1.9284572833435773e-05, "loss": 1.6497, "step": 2328 }, { "epoch": 0.17901044867854948, "grad_norm": 3.8075759410858154, "learning_rate": 1.9283958205285803e-05, "loss": 1.4787, "step": 2330 }, { "epoch": 0.1791641057160418, "grad_norm": 4.013290882110596, "learning_rate": 1.9283343577135832e-05, "loss": 1.5371, "step": 2332 }, { "epoch": 0.17931776275353412, "grad_norm": 4.095331192016602, "learning_rate": 1.9282728948985865e-05, "loss": 1.6089, "step": 2334 }, { "epoch": 0.17947141979102643, "grad_norm": 4.137665748596191, "learning_rate": 1.9282114320835895e-05, "loss": 1.6971, "step": 2336 }, { "epoch": 0.17962507682851875, "grad_norm": 4.847195625305176, "learning_rate": 1.9281499692685928e-05, "loss": 1.365, "step": 2338 }, { "epoch": 0.17977873386601106, "grad_norm": 4.068114280700684, "learning_rate": 1.9280885064535958e-05, "loss": 1.7029, "step": 2340 }, { "epoch": 0.17993239090350338, "grad_norm": 4.104188442230225, "learning_rate": 1.9280270436385988e-05, "loss": 1.636, "step": 2342 }, { "epoch": 0.1800860479409957, "grad_norm": 4.033984661102295, "learning_rate": 1.927965580823602e-05, "loss": 1.5451, "step": 2344 }, { "epoch": 0.180239704978488, "grad_norm": 4.00771951675415, "learning_rate": 1.927904118008605e-05, "loss": 1.5162, "step": 2346 }, { "epoch": 0.18039336201598033, "grad_norm": 4.097219467163086, "learning_rate": 1.927842655193608e-05, "loss": 1.6558, "step": 2348 }, { "epoch": 0.18054701905347265, "grad_norm": 4.354104042053223, "learning_rate": 1.9277811923786113e-05, "loss": 1.61, "step": 2350 }, { "epoch": 0.18070067609096496, "grad_norm": 4.535645484924316, "learning_rate": 1.927719729563614e-05, "loss": 1.6289, "step": 2352 }, { "epoch": 0.18085433312845728, "grad_norm": 4.078785419464111, "learning_rate": 1.9276582667486172e-05, "loss": 1.7284, "step": 2354 }, { "epoch": 0.1810079901659496, "grad_norm": 4.275857448577881, "learning_rate": 1.9275968039336202e-05, "loss": 1.5461, "step": 2356 }, { "epoch": 0.1811616472034419, "grad_norm": 4.156821250915527, "learning_rate": 1.9275353411186232e-05, "loss": 1.5956, "step": 2358 }, { "epoch": 0.18131530424093423, "grad_norm": 3.6036367416381836, "learning_rate": 1.9274738783036265e-05, "loss": 1.5949, "step": 2360 }, { "epoch": 0.18146896127842654, "grad_norm": 4.079240798950195, "learning_rate": 1.9274124154886295e-05, "loss": 1.5671, "step": 2362 }, { "epoch": 0.18162261831591886, "grad_norm": 4.024125576019287, "learning_rate": 1.9273509526736328e-05, "loss": 1.4904, "step": 2364 }, { "epoch": 0.18177627535341118, "grad_norm": 3.7651987075805664, "learning_rate": 1.9272894898586357e-05, "loss": 1.5179, "step": 2366 }, { "epoch": 0.1819299323909035, "grad_norm": 3.8718831539154053, "learning_rate": 1.9272280270436387e-05, "loss": 1.4699, "step": 2368 }, { "epoch": 0.1820835894283958, "grad_norm": 4.548869609832764, "learning_rate": 1.927166564228642e-05, "loss": 1.6476, "step": 2370 }, { "epoch": 0.18223724646588813, "grad_norm": 4.528201580047607, "learning_rate": 1.927105101413645e-05, "loss": 1.6755, "step": 2372 }, { "epoch": 0.18239090350338044, "grad_norm": 4.0802388191223145, "learning_rate": 1.927043638598648e-05, "loss": 1.5722, "step": 2374 }, { "epoch": 0.18254456054087279, "grad_norm": 4.145775318145752, "learning_rate": 1.9269821757836513e-05, "loss": 1.5816, "step": 2376 }, { "epoch": 0.1826982175783651, "grad_norm": 3.925696611404419, "learning_rate": 1.926920712968654e-05, "loss": 1.625, "step": 2378 }, { "epoch": 0.18285187461585742, "grad_norm": 3.8499910831451416, "learning_rate": 1.9268592501536572e-05, "loss": 1.5522, "step": 2380 }, { "epoch": 0.18300553165334973, "grad_norm": 4.174738883972168, "learning_rate": 1.92679778733866e-05, "loss": 1.561, "step": 2382 }, { "epoch": 0.18315918869084205, "grad_norm": 3.801260232925415, "learning_rate": 1.9267363245236635e-05, "loss": 1.5159, "step": 2384 }, { "epoch": 0.18331284572833437, "grad_norm": 4.0040202140808105, "learning_rate": 1.9266748617086664e-05, "loss": 1.6021, "step": 2386 }, { "epoch": 0.18346650276582668, "grad_norm": 4.132852554321289, "learning_rate": 1.9266133988936694e-05, "loss": 1.6887, "step": 2388 }, { "epoch": 0.183620159803319, "grad_norm": 3.7313075065612793, "learning_rate": 1.9265519360786727e-05, "loss": 1.4582, "step": 2390 }, { "epoch": 0.18377381684081132, "grad_norm": 3.824453115463257, "learning_rate": 1.9264904732636757e-05, "loss": 1.4394, "step": 2392 }, { "epoch": 0.18392747387830363, "grad_norm": 4.368152141571045, "learning_rate": 1.9264290104486786e-05, "loss": 1.5397, "step": 2394 }, { "epoch": 0.18408113091579595, "grad_norm": 3.7525463104248047, "learning_rate": 1.926367547633682e-05, "loss": 1.6875, "step": 2396 }, { "epoch": 0.18423478795328826, "grad_norm": 4.229045391082764, "learning_rate": 1.9263060848186846e-05, "loss": 1.6248, "step": 2398 }, { "epoch": 0.18438844499078058, "grad_norm": 3.9596312046051025, "learning_rate": 1.926244622003688e-05, "loss": 1.5278, "step": 2400 }, { "epoch": 0.1845421020282729, "grad_norm": 3.97829270362854, "learning_rate": 1.9261831591886912e-05, "loss": 1.5206, "step": 2402 }, { "epoch": 0.1846957590657652, "grad_norm": 3.6935653686523438, "learning_rate": 1.926121696373694e-05, "loss": 1.5395, "step": 2404 }, { "epoch": 0.18484941610325753, "grad_norm": 3.869833469390869, "learning_rate": 1.926060233558697e-05, "loss": 1.4951, "step": 2406 }, { "epoch": 0.18500307314074985, "grad_norm": 3.3914384841918945, "learning_rate": 1.9259987707437e-05, "loss": 1.3401, "step": 2408 }, { "epoch": 0.18515673017824216, "grad_norm": 4.250161647796631, "learning_rate": 1.9259373079287034e-05, "loss": 1.6384, "step": 2410 }, { "epoch": 0.18531038721573448, "grad_norm": 4.362060070037842, "learning_rate": 1.9258758451137064e-05, "loss": 1.5895, "step": 2412 }, { "epoch": 0.1854640442532268, "grad_norm": 3.7243545055389404, "learning_rate": 1.9258143822987093e-05, "loss": 1.4715, "step": 2414 }, { "epoch": 0.1856177012907191, "grad_norm": 4.146886825561523, "learning_rate": 1.9257529194837127e-05, "loss": 1.5387, "step": 2416 }, { "epoch": 0.18577135832821143, "grad_norm": 3.769721746444702, "learning_rate": 1.9256914566687156e-05, "loss": 1.5978, "step": 2418 }, { "epoch": 0.18592501536570374, "grad_norm": 4.210140228271484, "learning_rate": 1.9256299938537186e-05, "loss": 1.4916, "step": 2420 }, { "epoch": 0.18607867240319606, "grad_norm": 3.8122193813323975, "learning_rate": 1.925568531038722e-05, "loss": 1.4786, "step": 2422 }, { "epoch": 0.18623232944068838, "grad_norm": 4.192070484161377, "learning_rate": 1.925507068223725e-05, "loss": 1.5604, "step": 2424 }, { "epoch": 0.1863859864781807, "grad_norm": 4.002556800842285, "learning_rate": 1.925445605408728e-05, "loss": 1.6205, "step": 2426 }, { "epoch": 0.186539643515673, "grad_norm": 4.130584716796875, "learning_rate": 1.9253841425937308e-05, "loss": 1.6588, "step": 2428 }, { "epoch": 0.18669330055316533, "grad_norm": 4.271806240081787, "learning_rate": 1.925322679778734e-05, "loss": 1.4771, "step": 2430 }, { "epoch": 0.18684695759065764, "grad_norm": 3.9257447719573975, "learning_rate": 1.925261216963737e-05, "loss": 1.4746, "step": 2432 }, { "epoch": 0.18700061462814996, "grad_norm": 3.5739989280700684, "learning_rate": 1.92519975414874e-05, "loss": 1.6682, "step": 2434 }, { "epoch": 0.18715427166564227, "grad_norm": 4.071542263031006, "learning_rate": 1.9251382913337434e-05, "loss": 1.7167, "step": 2436 }, { "epoch": 0.1873079287031346, "grad_norm": 3.6181557178497314, "learning_rate": 1.9250768285187463e-05, "loss": 1.4887, "step": 2438 }, { "epoch": 0.1874615857406269, "grad_norm": 3.9054434299468994, "learning_rate": 1.9250153657037493e-05, "loss": 1.5903, "step": 2440 }, { "epoch": 0.18761524277811925, "grad_norm": 3.884770631790161, "learning_rate": 1.9249539028887526e-05, "loss": 1.533, "step": 2442 }, { "epoch": 0.18776889981561157, "grad_norm": 4.007852554321289, "learning_rate": 1.9248924400737556e-05, "loss": 1.4776, "step": 2444 }, { "epoch": 0.18792255685310388, "grad_norm": 4.36082649230957, "learning_rate": 1.9248309772587585e-05, "loss": 1.6813, "step": 2446 }, { "epoch": 0.1880762138905962, "grad_norm": 3.7035789489746094, "learning_rate": 1.924769514443762e-05, "loss": 1.5808, "step": 2448 }, { "epoch": 0.18822987092808852, "grad_norm": 4.472562789916992, "learning_rate": 1.9247080516287648e-05, "loss": 1.6157, "step": 2450 }, { "epoch": 0.18838352796558083, "grad_norm": 3.8329532146453857, "learning_rate": 1.9246465888137678e-05, "loss": 1.6105, "step": 2452 }, { "epoch": 0.18853718500307315, "grad_norm": 3.5638465881347656, "learning_rate": 1.9245851259987707e-05, "loss": 1.3973, "step": 2454 }, { "epoch": 0.18869084204056547, "grad_norm": 3.769814968109131, "learning_rate": 1.924523663183774e-05, "loss": 1.436, "step": 2456 }, { "epoch": 0.18884449907805778, "grad_norm": 3.7624809741973877, "learning_rate": 1.924462200368777e-05, "loss": 1.6093, "step": 2458 }, { "epoch": 0.1889981561155501, "grad_norm": 3.934250593185425, "learning_rate": 1.92440073755378e-05, "loss": 1.5041, "step": 2460 }, { "epoch": 0.18915181315304241, "grad_norm": 4.421424865722656, "learning_rate": 1.9243392747387833e-05, "loss": 1.4708, "step": 2462 }, { "epoch": 0.18930547019053473, "grad_norm": 4.019033432006836, "learning_rate": 1.9242778119237863e-05, "loss": 1.5314, "step": 2464 }, { "epoch": 0.18945912722802705, "grad_norm": 4.172586917877197, "learning_rate": 1.9242163491087892e-05, "loss": 1.5259, "step": 2466 }, { "epoch": 0.18961278426551936, "grad_norm": 3.919337034225464, "learning_rate": 1.9241548862937925e-05, "loss": 1.6305, "step": 2468 }, { "epoch": 0.18976644130301168, "grad_norm": 4.666354656219482, "learning_rate": 1.9240934234787955e-05, "loss": 1.66, "step": 2470 }, { "epoch": 0.189920098340504, "grad_norm": 3.4628660678863525, "learning_rate": 1.9240319606637985e-05, "loss": 1.4733, "step": 2472 }, { "epoch": 0.1900737553779963, "grad_norm": 4.151192665100098, "learning_rate": 1.9239704978488018e-05, "loss": 1.6414, "step": 2474 }, { "epoch": 0.19022741241548863, "grad_norm": 3.8772740364074707, "learning_rate": 1.9239090350338048e-05, "loss": 1.6062, "step": 2476 }, { "epoch": 0.19038106945298094, "grad_norm": 4.541169166564941, "learning_rate": 1.9238475722188077e-05, "loss": 1.5332, "step": 2478 }, { "epoch": 0.19053472649047326, "grad_norm": 3.821932315826416, "learning_rate": 1.9237861094038107e-05, "loss": 1.4712, "step": 2480 }, { "epoch": 0.19068838352796558, "grad_norm": 4.174224853515625, "learning_rate": 1.923724646588814e-05, "loss": 1.5465, "step": 2482 }, { "epoch": 0.1908420405654579, "grad_norm": 3.4857685565948486, "learning_rate": 1.923663183773817e-05, "loss": 1.3857, "step": 2484 }, { "epoch": 0.1909956976029502, "grad_norm": 3.488931655883789, "learning_rate": 1.92360172095882e-05, "loss": 1.4848, "step": 2486 }, { "epoch": 0.19114935464044253, "grad_norm": 4.406091213226318, "learning_rate": 1.9235402581438232e-05, "loss": 1.5634, "step": 2488 }, { "epoch": 0.19130301167793484, "grad_norm": 4.208408832550049, "learning_rate": 1.9234787953288262e-05, "loss": 1.4935, "step": 2490 }, { "epoch": 0.19145666871542716, "grad_norm": 4.038366794586182, "learning_rate": 1.9234173325138292e-05, "loss": 1.435, "step": 2492 }, { "epoch": 0.19161032575291947, "grad_norm": 3.6954140663146973, "learning_rate": 1.9233558696988325e-05, "loss": 1.6045, "step": 2494 }, { "epoch": 0.1917639827904118, "grad_norm": 3.928567409515381, "learning_rate": 1.9232944068838355e-05, "loss": 1.6917, "step": 2496 }, { "epoch": 0.1919176398279041, "grad_norm": 3.8990769386291504, "learning_rate": 1.9232329440688384e-05, "loss": 1.6294, "step": 2498 }, { "epoch": 0.19207129686539642, "grad_norm": 4.251073360443115, "learning_rate": 1.9231714812538417e-05, "loss": 1.7429, "step": 2500 }, { "epoch": 0.19222495390288874, "grad_norm": 3.8998966217041016, "learning_rate": 1.9231100184388447e-05, "loss": 1.5395, "step": 2502 }, { "epoch": 0.19237861094038106, "grad_norm": 3.5932552814483643, "learning_rate": 1.9230485556238477e-05, "loss": 1.4188, "step": 2504 }, { "epoch": 0.1925322679778734, "grad_norm": 4.385290622711182, "learning_rate": 1.9229870928088506e-05, "loss": 1.5512, "step": 2506 }, { "epoch": 0.19268592501536572, "grad_norm": 4.489023685455322, "learning_rate": 1.922925629993854e-05, "loss": 1.5199, "step": 2508 }, { "epoch": 0.19283958205285803, "grad_norm": 3.464556932449341, "learning_rate": 1.922864167178857e-05, "loss": 1.6343, "step": 2510 }, { "epoch": 0.19299323909035035, "grad_norm": 3.967116594314575, "learning_rate": 1.92280270436386e-05, "loss": 1.5112, "step": 2512 }, { "epoch": 0.19314689612784267, "grad_norm": 3.6863479614257812, "learning_rate": 1.9227412415488632e-05, "loss": 1.4394, "step": 2514 }, { "epoch": 0.19330055316533498, "grad_norm": 3.8512773513793945, "learning_rate": 1.922679778733866e-05, "loss": 1.5762, "step": 2516 }, { "epoch": 0.1934542102028273, "grad_norm": 4.207437515258789, "learning_rate": 1.922618315918869e-05, "loss": 1.5081, "step": 2518 }, { "epoch": 0.19360786724031961, "grad_norm": 3.882289409637451, "learning_rate": 1.9225568531038724e-05, "loss": 1.542, "step": 2520 }, { "epoch": 0.19376152427781193, "grad_norm": 3.5873258113861084, "learning_rate": 1.9224953902888754e-05, "loss": 1.4955, "step": 2522 }, { "epoch": 0.19391518131530425, "grad_norm": 4.062012195587158, "learning_rate": 1.9224339274738784e-05, "loss": 1.6206, "step": 2524 }, { "epoch": 0.19406883835279656, "grad_norm": 4.009846210479736, "learning_rate": 1.9223724646588813e-05, "loss": 1.4736, "step": 2526 }, { "epoch": 0.19422249539028888, "grad_norm": 3.5200116634368896, "learning_rate": 1.9223110018438846e-05, "loss": 1.5089, "step": 2528 }, { "epoch": 0.1943761524277812, "grad_norm": 3.888718366622925, "learning_rate": 1.9222495390288876e-05, "loss": 1.5568, "step": 2530 }, { "epoch": 0.1945298094652735, "grad_norm": 4.041849136352539, "learning_rate": 1.9221880762138906e-05, "loss": 1.57, "step": 2532 }, { "epoch": 0.19468346650276583, "grad_norm": 3.235793352127075, "learning_rate": 1.922126613398894e-05, "loss": 1.562, "step": 2534 }, { "epoch": 0.19483712354025814, "grad_norm": 3.7582755088806152, "learning_rate": 1.922065150583897e-05, "loss": 1.6906, "step": 2536 }, { "epoch": 0.19499078057775046, "grad_norm": 4.138317108154297, "learning_rate": 1.9220036877688998e-05, "loss": 1.6045, "step": 2538 }, { "epoch": 0.19514443761524278, "grad_norm": 3.8657236099243164, "learning_rate": 1.921942224953903e-05, "loss": 1.6388, "step": 2540 }, { "epoch": 0.1952980946527351, "grad_norm": 4.225650310516357, "learning_rate": 1.921880762138906e-05, "loss": 1.5472, "step": 2542 }, { "epoch": 0.1954517516902274, "grad_norm": 4.000063896179199, "learning_rate": 1.921819299323909e-05, "loss": 1.5838, "step": 2544 }, { "epoch": 0.19560540872771973, "grad_norm": 3.2265849113464355, "learning_rate": 1.9217578365089124e-05, "loss": 1.401, "step": 2546 }, { "epoch": 0.19575906576521204, "grad_norm": 4.049646854400635, "learning_rate": 1.9216963736939153e-05, "loss": 1.6306, "step": 2548 }, { "epoch": 0.19591272280270436, "grad_norm": 4.136661052703857, "learning_rate": 1.9216349108789187e-05, "loss": 1.6809, "step": 2550 }, { "epoch": 0.19606637984019668, "grad_norm": 4.1102294921875, "learning_rate": 1.9215734480639213e-05, "loss": 1.5619, "step": 2552 }, { "epoch": 0.196220036877689, "grad_norm": 3.632099151611328, "learning_rate": 1.9215119852489246e-05, "loss": 1.4556, "step": 2554 }, { "epoch": 0.1963736939151813, "grad_norm": 3.7188732624053955, "learning_rate": 1.9214505224339276e-05, "loss": 1.6132, "step": 2556 }, { "epoch": 0.19652735095267362, "grad_norm": 3.4973299503326416, "learning_rate": 1.9213890596189305e-05, "loss": 1.5363, "step": 2558 }, { "epoch": 0.19668100799016594, "grad_norm": 4.495885848999023, "learning_rate": 1.921327596803934e-05, "loss": 1.4821, "step": 2560 }, { "epoch": 0.19683466502765826, "grad_norm": 4.349817752838135, "learning_rate": 1.9212661339889368e-05, "loss": 1.5153, "step": 2562 }, { "epoch": 0.19698832206515057, "grad_norm": 3.5700700283050537, "learning_rate": 1.9212046711739398e-05, "loss": 1.6255, "step": 2564 }, { "epoch": 0.1971419791026429, "grad_norm": 3.944878101348877, "learning_rate": 1.921143208358943e-05, "loss": 1.5505, "step": 2566 }, { "epoch": 0.1972956361401352, "grad_norm": 4.354259490966797, "learning_rate": 1.921081745543946e-05, "loss": 1.6005, "step": 2568 }, { "epoch": 0.19744929317762752, "grad_norm": 4.167532920837402, "learning_rate": 1.9210202827289494e-05, "loss": 1.6639, "step": 2570 }, { "epoch": 0.19760295021511987, "grad_norm": 4.1236958503723145, "learning_rate": 1.9209588199139523e-05, "loss": 1.6543, "step": 2572 }, { "epoch": 0.19775660725261218, "grad_norm": 4.3486328125, "learning_rate": 1.9208973570989553e-05, "loss": 1.6486, "step": 2574 }, { "epoch": 0.1979102642901045, "grad_norm": 4.074930191040039, "learning_rate": 1.9208358942839586e-05, "loss": 1.5789, "step": 2576 }, { "epoch": 0.19806392132759681, "grad_norm": 4.678352355957031, "learning_rate": 1.9207744314689612e-05, "loss": 1.5615, "step": 2578 }, { "epoch": 0.19821757836508913, "grad_norm": 3.689697504043579, "learning_rate": 1.9207129686539645e-05, "loss": 1.4933, "step": 2580 }, { "epoch": 0.19837123540258145, "grad_norm": 3.895163059234619, "learning_rate": 1.9206515058389675e-05, "loss": 1.5115, "step": 2582 }, { "epoch": 0.19852489244007376, "grad_norm": 4.155092239379883, "learning_rate": 1.9205900430239705e-05, "loss": 1.4538, "step": 2584 }, { "epoch": 0.19867854947756608, "grad_norm": 4.396788120269775, "learning_rate": 1.9205285802089738e-05, "loss": 1.5629, "step": 2586 }, { "epoch": 0.1988322065150584, "grad_norm": 4.8076934814453125, "learning_rate": 1.9204671173939767e-05, "loss": 1.4844, "step": 2588 }, { "epoch": 0.1989858635525507, "grad_norm": 3.6861722469329834, "learning_rate": 1.92040565457898e-05, "loss": 1.501, "step": 2590 }, { "epoch": 0.19913952059004303, "grad_norm": 4.285679817199707, "learning_rate": 1.920344191763983e-05, "loss": 1.6599, "step": 2592 }, { "epoch": 0.19929317762753535, "grad_norm": 4.052687644958496, "learning_rate": 1.920282728948986e-05, "loss": 1.4244, "step": 2594 }, { "epoch": 0.19944683466502766, "grad_norm": 3.6357898712158203, "learning_rate": 1.9202212661339893e-05, "loss": 1.5088, "step": 2596 }, { "epoch": 0.19960049170251998, "grad_norm": 3.789736270904541, "learning_rate": 1.9201598033189923e-05, "loss": 1.5779, "step": 2598 }, { "epoch": 0.1997541487400123, "grad_norm": 4.087263107299805, "learning_rate": 1.9200983405039952e-05, "loss": 1.6094, "step": 2600 }, { "epoch": 0.1999078057775046, "grad_norm": 4.693444728851318, "learning_rate": 1.9200368776889985e-05, "loss": 1.3526, "step": 2602 }, { "epoch": 0.20006146281499693, "grad_norm": 3.850968837738037, "learning_rate": 1.919975414874001e-05, "loss": 1.5124, "step": 2604 }, { "epoch": 0.20021511985248924, "grad_norm": 3.671071767807007, "learning_rate": 1.9199139520590045e-05, "loss": 1.5842, "step": 2606 }, { "epoch": 0.20036877688998156, "grad_norm": 3.6253607273101807, "learning_rate": 1.9198524892440074e-05, "loss": 1.5148, "step": 2608 }, { "epoch": 0.20052243392747388, "grad_norm": 4.358476638793945, "learning_rate": 1.9197910264290104e-05, "loss": 1.531, "step": 2610 }, { "epoch": 0.2006760909649662, "grad_norm": 3.9361257553100586, "learning_rate": 1.9197295636140137e-05, "loss": 1.5476, "step": 2612 }, { "epoch": 0.2008297480024585, "grad_norm": 4.064403533935547, "learning_rate": 1.9196681007990167e-05, "loss": 1.5514, "step": 2614 }, { "epoch": 0.20098340503995082, "grad_norm": 4.13236665725708, "learning_rate": 1.91960663798402e-05, "loss": 1.5569, "step": 2616 }, { "epoch": 0.20113706207744314, "grad_norm": 3.7941882610321045, "learning_rate": 1.919545175169023e-05, "loss": 1.511, "step": 2618 }, { "epoch": 0.20129071911493546, "grad_norm": 3.5626373291015625, "learning_rate": 1.919483712354026e-05, "loss": 1.4677, "step": 2620 }, { "epoch": 0.20144437615242777, "grad_norm": 4.068457126617432, "learning_rate": 1.9194222495390292e-05, "loss": 1.5312, "step": 2622 }, { "epoch": 0.2015980331899201, "grad_norm": 4.036095142364502, "learning_rate": 1.919360786724032e-05, "loss": 1.6681, "step": 2624 }, { "epoch": 0.2017516902274124, "grad_norm": 3.8620471954345703, "learning_rate": 1.9192993239090352e-05, "loss": 1.5411, "step": 2626 }, { "epoch": 0.20190534726490472, "grad_norm": 3.6075820922851562, "learning_rate": 1.919237861094038e-05, "loss": 1.5125, "step": 2628 }, { "epoch": 0.20205900430239704, "grad_norm": 3.6271731853485107, "learning_rate": 1.919176398279041e-05, "loss": 1.5795, "step": 2630 }, { "epoch": 0.20221266133988935, "grad_norm": 4.1661810874938965, "learning_rate": 1.9191149354640444e-05, "loss": 1.5523, "step": 2632 }, { "epoch": 0.20236631837738167, "grad_norm": 4.115680694580078, "learning_rate": 1.9190534726490474e-05, "loss": 1.5146, "step": 2634 }, { "epoch": 0.20251997541487401, "grad_norm": 3.4160807132720947, "learning_rate": 1.9189920098340507e-05, "loss": 1.5035, "step": 2636 }, { "epoch": 0.20267363245236633, "grad_norm": 4.1160969734191895, "learning_rate": 1.9189305470190537e-05, "loss": 1.481, "step": 2638 }, { "epoch": 0.20282728948985865, "grad_norm": 4.941380500793457, "learning_rate": 1.9188690842040566e-05, "loss": 1.7115, "step": 2640 }, { "epoch": 0.20298094652735096, "grad_norm": 3.743597984313965, "learning_rate": 1.91880762138906e-05, "loss": 1.5442, "step": 2642 }, { "epoch": 0.20313460356484328, "grad_norm": 4.364147186279297, "learning_rate": 1.918746158574063e-05, "loss": 1.3992, "step": 2644 }, { "epoch": 0.2032882606023356, "grad_norm": 3.69368052482605, "learning_rate": 1.918684695759066e-05, "loss": 1.4778, "step": 2646 }, { "epoch": 0.2034419176398279, "grad_norm": 4.028282642364502, "learning_rate": 1.9186232329440692e-05, "loss": 1.6028, "step": 2648 }, { "epoch": 0.20359557467732023, "grad_norm": 3.767077922821045, "learning_rate": 1.9185617701290718e-05, "loss": 1.4907, "step": 2650 }, { "epoch": 0.20374923171481255, "grad_norm": 3.511185646057129, "learning_rate": 1.918500307314075e-05, "loss": 1.6204, "step": 2652 }, { "epoch": 0.20390288875230486, "grad_norm": 3.6800639629364014, "learning_rate": 1.918438844499078e-05, "loss": 1.5762, "step": 2654 }, { "epoch": 0.20405654578979718, "grad_norm": 3.3730273246765137, "learning_rate": 1.9183773816840814e-05, "loss": 1.4401, "step": 2656 }, { "epoch": 0.2042102028272895, "grad_norm": 4.639964580535889, "learning_rate": 1.9183159188690844e-05, "loss": 1.5344, "step": 2658 }, { "epoch": 0.2043638598647818, "grad_norm": 3.624997138977051, "learning_rate": 1.9182544560540873e-05, "loss": 1.5386, "step": 2660 }, { "epoch": 0.20451751690227413, "grad_norm": 4.5834503173828125, "learning_rate": 1.9181929932390906e-05, "loss": 1.6546, "step": 2662 }, { "epoch": 0.20467117393976644, "grad_norm": 3.5726184844970703, "learning_rate": 1.9181315304240936e-05, "loss": 1.4267, "step": 2664 }, { "epoch": 0.20482483097725876, "grad_norm": 3.3437399864196777, "learning_rate": 1.9180700676090966e-05, "loss": 1.6013, "step": 2666 }, { "epoch": 0.20497848801475108, "grad_norm": 3.5842888355255127, "learning_rate": 1.9180086047941e-05, "loss": 1.4479, "step": 2668 }, { "epoch": 0.2051321450522434, "grad_norm": 4.096655368804932, "learning_rate": 1.917947141979103e-05, "loss": 1.4553, "step": 2670 }, { "epoch": 0.2052858020897357, "grad_norm": 4.347322940826416, "learning_rate": 1.9178856791641058e-05, "loss": 1.3843, "step": 2672 }, { "epoch": 0.20543945912722802, "grad_norm": 3.9682047367095947, "learning_rate": 1.917824216349109e-05, "loss": 1.6752, "step": 2674 }, { "epoch": 0.20559311616472034, "grad_norm": 4.238104343414307, "learning_rate": 1.917762753534112e-05, "loss": 1.5107, "step": 2676 }, { "epoch": 0.20574677320221266, "grad_norm": 3.6419684886932373, "learning_rate": 1.917701290719115e-05, "loss": 1.6268, "step": 2678 }, { "epoch": 0.20590043023970497, "grad_norm": 4.493232727050781, "learning_rate": 1.917639827904118e-05, "loss": 1.5479, "step": 2680 }, { "epoch": 0.2060540872771973, "grad_norm": 3.5530130863189697, "learning_rate": 1.9175783650891213e-05, "loss": 1.4865, "step": 2682 }, { "epoch": 0.2062077443146896, "grad_norm": 3.930048942565918, "learning_rate": 1.9175169022741243e-05, "loss": 1.4627, "step": 2684 }, { "epoch": 0.20636140135218192, "grad_norm": 4.193355560302734, "learning_rate": 1.9174554394591273e-05, "loss": 1.5918, "step": 2686 }, { "epoch": 0.20651505838967424, "grad_norm": 3.4783825874328613, "learning_rate": 1.9173939766441306e-05, "loss": 1.441, "step": 2688 }, { "epoch": 0.20666871542716655, "grad_norm": 3.6716883182525635, "learning_rate": 1.9173325138291336e-05, "loss": 1.5617, "step": 2690 }, { "epoch": 0.20682237246465887, "grad_norm": 3.806039810180664, "learning_rate": 1.9172710510141365e-05, "loss": 1.5566, "step": 2692 }, { "epoch": 0.2069760295021512, "grad_norm": 3.7331125736236572, "learning_rate": 1.9172095881991398e-05, "loss": 1.4934, "step": 2694 }, { "epoch": 0.2071296865396435, "grad_norm": 4.192966938018799, "learning_rate": 1.9171481253841428e-05, "loss": 1.5679, "step": 2696 }, { "epoch": 0.20728334357713582, "grad_norm": 3.590785503387451, "learning_rate": 1.9170866625691458e-05, "loss": 1.4876, "step": 2698 }, { "epoch": 0.20743700061462814, "grad_norm": 4.1026458740234375, "learning_rate": 1.917025199754149e-05, "loss": 1.5708, "step": 2700 }, { "epoch": 0.20759065765212048, "grad_norm": 3.8768205642700195, "learning_rate": 1.916963736939152e-05, "loss": 1.4715, "step": 2702 }, { "epoch": 0.2077443146896128, "grad_norm": 3.648543357849121, "learning_rate": 1.916902274124155e-05, "loss": 1.6127, "step": 2704 }, { "epoch": 0.2078979717271051, "grad_norm": 3.6755056381225586, "learning_rate": 1.916840811309158e-05, "loss": 1.6711, "step": 2706 }, { "epoch": 0.20805162876459743, "grad_norm": 3.4196550846099854, "learning_rate": 1.9167793484941613e-05, "loss": 1.4866, "step": 2708 }, { "epoch": 0.20820528580208975, "grad_norm": 3.9521472454071045, "learning_rate": 1.9167178856791643e-05, "loss": 1.5474, "step": 2710 }, { "epoch": 0.20835894283958206, "grad_norm": 4.077064037322998, "learning_rate": 1.9166564228641672e-05, "loss": 1.6824, "step": 2712 }, { "epoch": 0.20851259987707438, "grad_norm": 4.164952278137207, "learning_rate": 1.9165949600491705e-05, "loss": 1.4411, "step": 2714 }, { "epoch": 0.2086662569145667, "grad_norm": 3.788529396057129, "learning_rate": 1.9165334972341735e-05, "loss": 1.6062, "step": 2716 }, { "epoch": 0.208819913952059, "grad_norm": 4.074838638305664, "learning_rate": 1.9164720344191765e-05, "loss": 1.567, "step": 2718 }, { "epoch": 0.20897357098955133, "grad_norm": 3.398486852645874, "learning_rate": 1.9164105716041798e-05, "loss": 1.6186, "step": 2720 }, { "epoch": 0.20912722802704364, "grad_norm": 3.534534215927124, "learning_rate": 1.9163491087891827e-05, "loss": 1.4404, "step": 2722 }, { "epoch": 0.20928088506453596, "grad_norm": 3.3846664428710938, "learning_rate": 1.9162876459741857e-05, "loss": 1.4741, "step": 2724 }, { "epoch": 0.20943454210202828, "grad_norm": 3.8569769859313965, "learning_rate": 1.916226183159189e-05, "loss": 1.4941, "step": 2726 }, { "epoch": 0.2095881991395206, "grad_norm": 3.6158618927001953, "learning_rate": 1.916164720344192e-05, "loss": 1.5539, "step": 2728 }, { "epoch": 0.2097418561770129, "grad_norm": 4.233087062835693, "learning_rate": 1.916103257529195e-05, "loss": 1.5474, "step": 2730 }, { "epoch": 0.20989551321450522, "grad_norm": 3.693319797515869, "learning_rate": 1.916041794714198e-05, "loss": 1.5927, "step": 2732 }, { "epoch": 0.21004917025199754, "grad_norm": 3.5169527530670166, "learning_rate": 1.9159803318992012e-05, "loss": 1.3885, "step": 2734 }, { "epoch": 0.21020282728948986, "grad_norm": 3.507129430770874, "learning_rate": 1.9159188690842042e-05, "loss": 1.4943, "step": 2736 }, { "epoch": 0.21035648432698217, "grad_norm": 4.948350429534912, "learning_rate": 1.915857406269207e-05, "loss": 1.505, "step": 2738 }, { "epoch": 0.2105101413644745, "grad_norm": 3.834376573562622, "learning_rate": 1.9157959434542105e-05, "loss": 1.6174, "step": 2740 }, { "epoch": 0.2106637984019668, "grad_norm": 4.351836204528809, "learning_rate": 1.9157344806392134e-05, "loss": 1.5882, "step": 2742 }, { "epoch": 0.21081745543945912, "grad_norm": 3.8399126529693604, "learning_rate": 1.9156730178242164e-05, "loss": 1.5669, "step": 2744 }, { "epoch": 0.21097111247695144, "grad_norm": 4.092167377471924, "learning_rate": 1.9156115550092197e-05, "loss": 1.5737, "step": 2746 }, { "epoch": 0.21112476951444376, "grad_norm": 3.5932557582855225, "learning_rate": 1.9155500921942227e-05, "loss": 1.5082, "step": 2748 }, { "epoch": 0.21127842655193607, "grad_norm": 4.129446506500244, "learning_rate": 1.9154886293792257e-05, "loss": 1.6286, "step": 2750 }, { "epoch": 0.2114320835894284, "grad_norm": 3.489421844482422, "learning_rate": 1.9154271665642286e-05, "loss": 1.5005, "step": 2752 }, { "epoch": 0.2115857406269207, "grad_norm": 3.7528555393218994, "learning_rate": 1.915365703749232e-05, "loss": 1.5174, "step": 2754 }, { "epoch": 0.21173939766441302, "grad_norm": 3.6098313331604004, "learning_rate": 1.915304240934235e-05, "loss": 1.4932, "step": 2756 }, { "epoch": 0.21189305470190534, "grad_norm": 3.8902368545532227, "learning_rate": 1.915242778119238e-05, "loss": 1.4891, "step": 2758 }, { "epoch": 0.21204671173939765, "grad_norm": 4.011685371398926, "learning_rate": 1.9151813153042412e-05, "loss": 1.5252, "step": 2760 }, { "epoch": 0.21220036877688997, "grad_norm": 3.6917941570281982, "learning_rate": 1.915119852489244e-05, "loss": 1.5622, "step": 2762 }, { "epoch": 0.21235402581438229, "grad_norm": 3.5468204021453857, "learning_rate": 1.915058389674247e-05, "loss": 1.3541, "step": 2764 }, { "epoch": 0.21250768285187463, "grad_norm": 3.7897398471832275, "learning_rate": 1.9149969268592504e-05, "loss": 1.7065, "step": 2766 }, { "epoch": 0.21266133988936695, "grad_norm": 3.539816379547119, "learning_rate": 1.9149354640442534e-05, "loss": 1.4987, "step": 2768 }, { "epoch": 0.21281499692685926, "grad_norm": 3.866915702819824, "learning_rate": 1.9148740012292564e-05, "loss": 1.6211, "step": 2770 }, { "epoch": 0.21296865396435158, "grad_norm": 4.1955180168151855, "learning_rate": 1.9148125384142597e-05, "loss": 1.448, "step": 2772 }, { "epoch": 0.2131223110018439, "grad_norm": 3.6780476570129395, "learning_rate": 1.9147510755992626e-05, "loss": 1.5676, "step": 2774 }, { "epoch": 0.2132759680393362, "grad_norm": 4.097123622894287, "learning_rate": 1.9146896127842656e-05, "loss": 1.5146, "step": 2776 }, { "epoch": 0.21342962507682853, "grad_norm": 3.8207781314849854, "learning_rate": 1.9146281499692686e-05, "loss": 1.4983, "step": 2778 }, { "epoch": 0.21358328211432084, "grad_norm": 3.585047960281372, "learning_rate": 1.914566687154272e-05, "loss": 1.4271, "step": 2780 }, { "epoch": 0.21373693915181316, "grad_norm": 3.4344520568847656, "learning_rate": 1.914505224339275e-05, "loss": 1.5673, "step": 2782 }, { "epoch": 0.21389059618930548, "grad_norm": 3.5881597995758057, "learning_rate": 1.9144437615242778e-05, "loss": 1.4111, "step": 2784 }, { "epoch": 0.2140442532267978, "grad_norm": 3.65913724899292, "learning_rate": 1.914382298709281e-05, "loss": 1.5894, "step": 2786 }, { "epoch": 0.2141979102642901, "grad_norm": 3.761591911315918, "learning_rate": 1.914320835894284e-05, "loss": 1.4502, "step": 2788 }, { "epoch": 0.21435156730178243, "grad_norm": 3.943924903869629, "learning_rate": 1.914259373079287e-05, "loss": 1.4754, "step": 2790 }, { "epoch": 0.21450522433927474, "grad_norm": 4.111385345458984, "learning_rate": 1.9141979102642904e-05, "loss": 1.5946, "step": 2792 }, { "epoch": 0.21465888137676706, "grad_norm": 3.8199594020843506, "learning_rate": 1.9141364474492933e-05, "loss": 1.4817, "step": 2794 }, { "epoch": 0.21481253841425937, "grad_norm": 3.667856454849243, "learning_rate": 1.9140749846342963e-05, "loss": 1.5357, "step": 2796 }, { "epoch": 0.2149661954517517, "grad_norm": 3.980133295059204, "learning_rate": 1.9140135218192996e-05, "loss": 1.6328, "step": 2798 }, { "epoch": 0.215119852489244, "grad_norm": 4.220106601715088, "learning_rate": 1.9139520590043026e-05, "loss": 1.5825, "step": 2800 }, { "epoch": 0.21527350952673632, "grad_norm": 3.8425889015197754, "learning_rate": 1.913890596189306e-05, "loss": 1.5565, "step": 2802 }, { "epoch": 0.21542716656422864, "grad_norm": 3.5814743041992188, "learning_rate": 1.9138291333743085e-05, "loss": 1.5044, "step": 2804 }, { "epoch": 0.21558082360172096, "grad_norm": 4.015439510345459, "learning_rate": 1.9137676705593118e-05, "loss": 1.6277, "step": 2806 }, { "epoch": 0.21573448063921327, "grad_norm": 4.553494930267334, "learning_rate": 1.9137062077443148e-05, "loss": 1.4374, "step": 2808 }, { "epoch": 0.2158881376767056, "grad_norm": 3.7021353244781494, "learning_rate": 1.9136447449293178e-05, "loss": 1.5173, "step": 2810 }, { "epoch": 0.2160417947141979, "grad_norm": 4.46372652053833, "learning_rate": 1.913583282114321e-05, "loss": 1.7864, "step": 2812 }, { "epoch": 0.21619545175169022, "grad_norm": 3.551791191101074, "learning_rate": 1.913521819299324e-05, "loss": 1.3458, "step": 2814 }, { "epoch": 0.21634910878918254, "grad_norm": 3.224550724029541, "learning_rate": 1.913460356484327e-05, "loss": 1.4272, "step": 2816 }, { "epoch": 0.21650276582667485, "grad_norm": 4.008483409881592, "learning_rate": 1.9133988936693303e-05, "loss": 1.5502, "step": 2818 }, { "epoch": 0.21665642286416717, "grad_norm": 3.782654047012329, "learning_rate": 1.9133374308543333e-05, "loss": 1.413, "step": 2820 }, { "epoch": 0.21681007990165949, "grad_norm": 3.2302143573760986, "learning_rate": 1.9132759680393366e-05, "loss": 1.4368, "step": 2822 }, { "epoch": 0.2169637369391518, "grad_norm": 3.4619314670562744, "learning_rate": 1.9132145052243395e-05, "loss": 1.4042, "step": 2824 }, { "epoch": 0.21711739397664412, "grad_norm": 4.229294300079346, "learning_rate": 1.9131530424093425e-05, "loss": 1.5187, "step": 2826 }, { "epoch": 0.21727105101413643, "grad_norm": 3.9806225299835205, "learning_rate": 1.9130915795943458e-05, "loss": 1.4521, "step": 2828 }, { "epoch": 0.21742470805162875, "grad_norm": 3.7580087184906006, "learning_rate": 1.9130301167793485e-05, "loss": 1.5982, "step": 2830 }, { "epoch": 0.2175783650891211, "grad_norm": 3.9534852504730225, "learning_rate": 1.9129686539643518e-05, "loss": 1.6518, "step": 2832 }, { "epoch": 0.2177320221266134, "grad_norm": 3.770359992980957, "learning_rate": 1.9129071911493547e-05, "loss": 1.3771, "step": 2834 }, { "epoch": 0.21788567916410573, "grad_norm": 4.148795127868652, "learning_rate": 1.9128457283343577e-05, "loss": 1.6236, "step": 2836 }, { "epoch": 0.21803933620159804, "grad_norm": 4.049807548522949, "learning_rate": 1.912784265519361e-05, "loss": 1.6702, "step": 2838 }, { "epoch": 0.21819299323909036, "grad_norm": 3.7045743465423584, "learning_rate": 1.912722802704364e-05, "loss": 1.54, "step": 2840 }, { "epoch": 0.21834665027658268, "grad_norm": 3.9694302082061768, "learning_rate": 1.9126613398893673e-05, "loss": 1.6067, "step": 2842 }, { "epoch": 0.218500307314075, "grad_norm": 3.4496958255767822, "learning_rate": 1.9125998770743702e-05, "loss": 1.5128, "step": 2844 }, { "epoch": 0.2186539643515673, "grad_norm": 3.6536827087402344, "learning_rate": 1.9125384142593732e-05, "loss": 1.5447, "step": 2846 }, { "epoch": 0.21880762138905963, "grad_norm": 3.9362754821777344, "learning_rate": 1.9124769514443765e-05, "loss": 1.4012, "step": 2848 }, { "epoch": 0.21896127842655194, "grad_norm": 3.2885048389434814, "learning_rate": 1.912415488629379e-05, "loss": 1.4489, "step": 2850 }, { "epoch": 0.21911493546404426, "grad_norm": 3.519296884536743, "learning_rate": 1.9123540258143825e-05, "loss": 1.4015, "step": 2852 }, { "epoch": 0.21926859250153657, "grad_norm": 4.157144546508789, "learning_rate": 1.9122925629993854e-05, "loss": 1.5355, "step": 2854 }, { "epoch": 0.2194222495390289, "grad_norm": 3.6727182865142822, "learning_rate": 1.9122311001843884e-05, "loss": 1.5225, "step": 2856 }, { "epoch": 0.2195759065765212, "grad_norm": 3.801722288131714, "learning_rate": 1.9121696373693917e-05, "loss": 1.4482, "step": 2858 }, { "epoch": 0.21972956361401352, "grad_norm": 4.15856409072876, "learning_rate": 1.9121081745543947e-05, "loss": 1.5494, "step": 2860 }, { "epoch": 0.21988322065150584, "grad_norm": 4.247369289398193, "learning_rate": 1.912046711739398e-05, "loss": 1.4578, "step": 2862 }, { "epoch": 0.22003687768899816, "grad_norm": 4.413219928741455, "learning_rate": 1.911985248924401e-05, "loss": 1.5631, "step": 2864 }, { "epoch": 0.22019053472649047, "grad_norm": 3.894334077835083, "learning_rate": 1.911923786109404e-05, "loss": 1.5072, "step": 2866 }, { "epoch": 0.2203441917639828, "grad_norm": 3.5060431957244873, "learning_rate": 1.9118623232944072e-05, "loss": 1.5318, "step": 2868 }, { "epoch": 0.2204978488014751, "grad_norm": 3.312140464782715, "learning_rate": 1.9118008604794102e-05, "loss": 1.5173, "step": 2870 }, { "epoch": 0.22065150583896742, "grad_norm": 4.122204780578613, "learning_rate": 1.911739397664413e-05, "loss": 1.5222, "step": 2872 }, { "epoch": 0.22080516287645974, "grad_norm": 3.994349956512451, "learning_rate": 1.9116779348494165e-05, "loss": 1.6783, "step": 2874 }, { "epoch": 0.22095881991395205, "grad_norm": 3.831480026245117, "learning_rate": 1.911616472034419e-05, "loss": 1.4112, "step": 2876 }, { "epoch": 0.22111247695144437, "grad_norm": 3.1337056159973145, "learning_rate": 1.9115550092194224e-05, "loss": 1.4297, "step": 2878 }, { "epoch": 0.2212661339889367, "grad_norm": 3.5576846599578857, "learning_rate": 1.9114935464044254e-05, "loss": 1.5911, "step": 2880 }, { "epoch": 0.221419791026429, "grad_norm": 4.290283679962158, "learning_rate": 1.9114320835894283e-05, "loss": 1.381, "step": 2882 }, { "epoch": 0.22157344806392132, "grad_norm": 3.7715091705322266, "learning_rate": 1.9113706207744316e-05, "loss": 1.4368, "step": 2884 }, { "epoch": 0.22172710510141364, "grad_norm": 3.3459980487823486, "learning_rate": 1.9113091579594346e-05, "loss": 1.3408, "step": 2886 }, { "epoch": 0.22188076213890595, "grad_norm": 3.626512050628662, "learning_rate": 1.911247695144438e-05, "loss": 1.5328, "step": 2888 }, { "epoch": 0.22203441917639827, "grad_norm": 4.11099100112915, "learning_rate": 1.911186232329441e-05, "loss": 1.3911, "step": 2890 }, { "epoch": 0.22218807621389058, "grad_norm": 3.386157751083374, "learning_rate": 1.911124769514444e-05, "loss": 1.4783, "step": 2892 }, { "epoch": 0.2223417332513829, "grad_norm": 3.7897472381591797, "learning_rate": 1.911063306699447e-05, "loss": 1.723, "step": 2894 }, { "epoch": 0.22249539028887522, "grad_norm": 4.359883785247803, "learning_rate": 1.91100184388445e-05, "loss": 1.515, "step": 2896 }, { "epoch": 0.22264904732636756, "grad_norm": 3.69624924659729, "learning_rate": 1.910940381069453e-05, "loss": 1.4789, "step": 2898 }, { "epoch": 0.22280270436385988, "grad_norm": 3.6302242279052734, "learning_rate": 1.9108789182544564e-05, "loss": 1.4188, "step": 2900 }, { "epoch": 0.2229563614013522, "grad_norm": 3.765510082244873, "learning_rate": 1.910817455439459e-05, "loss": 1.5475, "step": 2902 }, { "epoch": 0.2231100184388445, "grad_norm": 3.6644978523254395, "learning_rate": 1.9107559926244623e-05, "loss": 1.3725, "step": 2904 }, { "epoch": 0.22326367547633683, "grad_norm": 3.9660632610321045, "learning_rate": 1.9106945298094653e-05, "loss": 1.6626, "step": 2906 }, { "epoch": 0.22341733251382914, "grad_norm": 3.578211545944214, "learning_rate": 1.9106330669944686e-05, "loss": 1.7112, "step": 2908 }, { "epoch": 0.22357098955132146, "grad_norm": 3.61378812789917, "learning_rate": 1.9105716041794716e-05, "loss": 1.4498, "step": 2910 }, { "epoch": 0.22372464658881377, "grad_norm": 3.705375909805298, "learning_rate": 1.9105101413644746e-05, "loss": 1.5779, "step": 2912 }, { "epoch": 0.2238783036263061, "grad_norm": 3.6211791038513184, "learning_rate": 1.910448678549478e-05, "loss": 1.5316, "step": 2914 }, { "epoch": 0.2240319606637984, "grad_norm": 3.84789776802063, "learning_rate": 1.910387215734481e-05, "loss": 1.4939, "step": 2916 }, { "epoch": 0.22418561770129072, "grad_norm": 3.532919406890869, "learning_rate": 1.9103257529194838e-05, "loss": 1.4652, "step": 2918 }, { "epoch": 0.22433927473878304, "grad_norm": 4.090033531188965, "learning_rate": 1.910264290104487e-05, "loss": 1.6013, "step": 2920 }, { "epoch": 0.22449293177627536, "grad_norm": 3.964073896408081, "learning_rate": 1.91020282728949e-05, "loss": 1.5672, "step": 2922 }, { "epoch": 0.22464658881376767, "grad_norm": 3.7312428951263428, "learning_rate": 1.910141364474493e-05, "loss": 1.4724, "step": 2924 }, { "epoch": 0.22480024585126, "grad_norm": 3.863544464111328, "learning_rate": 1.9100799016594964e-05, "loss": 1.4465, "step": 2926 }, { "epoch": 0.2249539028887523, "grad_norm": 3.6252338886260986, "learning_rate": 1.9100184388444993e-05, "loss": 1.5351, "step": 2928 }, { "epoch": 0.22510755992624462, "grad_norm": 3.6253201961517334, "learning_rate": 1.9099569760295023e-05, "loss": 1.487, "step": 2930 }, { "epoch": 0.22526121696373694, "grad_norm": 4.433292865753174, "learning_rate": 1.9098955132145053e-05, "loss": 1.5361, "step": 2932 }, { "epoch": 0.22541487400122925, "grad_norm": 3.5505495071411133, "learning_rate": 1.9098340503995086e-05, "loss": 1.5658, "step": 2934 }, { "epoch": 0.22556853103872157, "grad_norm": 3.8930625915527344, "learning_rate": 1.9097725875845115e-05, "loss": 1.6665, "step": 2936 }, { "epoch": 0.2257221880762139, "grad_norm": 3.6677374839782715, "learning_rate": 1.9097111247695145e-05, "loss": 1.4741, "step": 2938 }, { "epoch": 0.2258758451137062, "grad_norm": 3.793630838394165, "learning_rate": 1.9096496619545178e-05, "loss": 1.5319, "step": 2940 }, { "epoch": 0.22602950215119852, "grad_norm": 3.9335010051727295, "learning_rate": 1.9095881991395208e-05, "loss": 1.5558, "step": 2942 }, { "epoch": 0.22618315918869084, "grad_norm": 3.583728551864624, "learning_rate": 1.9095267363245237e-05, "loss": 1.4462, "step": 2944 }, { "epoch": 0.22633681622618315, "grad_norm": 3.9286651611328125, "learning_rate": 1.909465273509527e-05, "loss": 1.438, "step": 2946 }, { "epoch": 0.22649047326367547, "grad_norm": 3.526460647583008, "learning_rate": 1.90940381069453e-05, "loss": 1.3429, "step": 2948 }, { "epoch": 0.22664413030116778, "grad_norm": 4.004605293273926, "learning_rate": 1.909342347879533e-05, "loss": 1.6075, "step": 2950 }, { "epoch": 0.2267977873386601, "grad_norm": 3.9438345432281494, "learning_rate": 1.909280885064536e-05, "loss": 1.5637, "step": 2952 }, { "epoch": 0.22695144437615242, "grad_norm": 4.000015735626221, "learning_rate": 1.9092194222495393e-05, "loss": 1.4984, "step": 2954 }, { "epoch": 0.22710510141364473, "grad_norm": 3.9805843830108643, "learning_rate": 1.9091579594345422e-05, "loss": 1.5232, "step": 2956 }, { "epoch": 0.22725875845113705, "grad_norm": 3.8267416954040527, "learning_rate": 1.9090964966195452e-05, "loss": 1.496, "step": 2958 }, { "epoch": 0.22741241548862937, "grad_norm": 3.9185121059417725, "learning_rate": 1.9090350338045485e-05, "loss": 1.5375, "step": 2960 }, { "epoch": 0.2275660725261217, "grad_norm": 3.3180205821990967, "learning_rate": 1.9089735709895515e-05, "loss": 1.3771, "step": 2962 }, { "epoch": 0.22771972956361403, "grad_norm": 3.9198102951049805, "learning_rate": 1.9089121081745544e-05, "loss": 1.4476, "step": 2964 }, { "epoch": 0.22787338660110634, "grad_norm": 4.271195411682129, "learning_rate": 1.9088506453595578e-05, "loss": 1.5008, "step": 2966 }, { "epoch": 0.22802704363859866, "grad_norm": 3.8531033992767334, "learning_rate": 1.9087891825445607e-05, "loss": 1.5622, "step": 2968 }, { "epoch": 0.22818070067609098, "grad_norm": 3.83855938911438, "learning_rate": 1.9087277197295637e-05, "loss": 1.5225, "step": 2970 }, { "epoch": 0.2283343577135833, "grad_norm": 3.9290051460266113, "learning_rate": 1.908666256914567e-05, "loss": 1.5943, "step": 2972 }, { "epoch": 0.2284880147510756, "grad_norm": 3.9568498134613037, "learning_rate": 1.90860479409957e-05, "loss": 1.4396, "step": 2974 }, { "epoch": 0.22864167178856792, "grad_norm": 3.8864598274230957, "learning_rate": 1.908543331284573e-05, "loss": 1.5007, "step": 2976 }, { "epoch": 0.22879532882606024, "grad_norm": 3.8972291946411133, "learning_rate": 1.908481868469576e-05, "loss": 1.4453, "step": 2978 }, { "epoch": 0.22894898586355256, "grad_norm": 3.3269383907318115, "learning_rate": 1.9084204056545792e-05, "loss": 1.4875, "step": 2980 }, { "epoch": 0.22910264290104487, "grad_norm": 4.227247714996338, "learning_rate": 1.9083589428395822e-05, "loss": 1.4625, "step": 2982 }, { "epoch": 0.2292562999385372, "grad_norm": 4.22311544418335, "learning_rate": 1.908297480024585e-05, "loss": 1.6099, "step": 2984 }, { "epoch": 0.2294099569760295, "grad_norm": 3.618152141571045, "learning_rate": 1.9082360172095885e-05, "loss": 1.3739, "step": 2986 }, { "epoch": 0.22956361401352182, "grad_norm": 4.052188873291016, "learning_rate": 1.9081745543945914e-05, "loss": 1.4373, "step": 2988 }, { "epoch": 0.22971727105101414, "grad_norm": 3.879241943359375, "learning_rate": 1.9081130915795944e-05, "loss": 1.4802, "step": 2990 }, { "epoch": 0.22987092808850645, "grad_norm": 4.114073276519775, "learning_rate": 1.9080516287645977e-05, "loss": 1.4496, "step": 2992 }, { "epoch": 0.23002458512599877, "grad_norm": 3.420759677886963, "learning_rate": 1.9079901659496007e-05, "loss": 1.4345, "step": 2994 }, { "epoch": 0.2301782421634911, "grad_norm": 3.7326130867004395, "learning_rate": 1.9079287031346036e-05, "loss": 1.4949, "step": 2996 }, { "epoch": 0.2303318992009834, "grad_norm": 4.011677265167236, "learning_rate": 1.907867240319607e-05, "loss": 1.504, "step": 2998 }, { "epoch": 0.23048555623847572, "grad_norm": 4.186577796936035, "learning_rate": 1.90780577750461e-05, "loss": 1.5432, "step": 3000 }, { "epoch": 0.23063921327596804, "grad_norm": 3.768660306930542, "learning_rate": 1.907744314689613e-05, "loss": 1.403, "step": 3002 }, { "epoch": 0.23079287031346035, "grad_norm": 3.8193624019622803, "learning_rate": 1.907682851874616e-05, "loss": 1.401, "step": 3004 }, { "epoch": 0.23094652735095267, "grad_norm": 3.425020217895508, "learning_rate": 1.907621389059619e-05, "loss": 1.4576, "step": 3006 }, { "epoch": 0.23110018438844498, "grad_norm": 3.75453782081604, "learning_rate": 1.907559926244622e-05, "loss": 1.5563, "step": 3008 }, { "epoch": 0.2312538414259373, "grad_norm": 3.68115496635437, "learning_rate": 1.907498463429625e-05, "loss": 1.3805, "step": 3010 }, { "epoch": 0.23140749846342962, "grad_norm": 3.3006389141082764, "learning_rate": 1.9074370006146284e-05, "loss": 1.5252, "step": 3012 }, { "epoch": 0.23156115550092193, "grad_norm": 4.148028373718262, "learning_rate": 1.9073755377996314e-05, "loss": 1.5676, "step": 3014 }, { "epoch": 0.23171481253841425, "grad_norm": 3.5741395950317383, "learning_rate": 1.9073140749846343e-05, "loss": 1.5295, "step": 3016 }, { "epoch": 0.23186846957590657, "grad_norm": 3.9004082679748535, "learning_rate": 1.9072526121696376e-05, "loss": 1.4535, "step": 3018 }, { "epoch": 0.23202212661339888, "grad_norm": 4.176894664764404, "learning_rate": 1.9071911493546406e-05, "loss": 1.3629, "step": 3020 }, { "epoch": 0.2321757836508912, "grad_norm": 5.075558662414551, "learning_rate": 1.9071296865396436e-05, "loss": 1.5217, "step": 3022 }, { "epoch": 0.23232944068838352, "grad_norm": 3.4459147453308105, "learning_rate": 1.907068223724647e-05, "loss": 1.4933, "step": 3024 }, { "epoch": 0.23248309772587583, "grad_norm": 7.433279037475586, "learning_rate": 1.90700676090965e-05, "loss": 1.5533, "step": 3026 }, { "epoch": 0.23263675476336818, "grad_norm": 3.6367766857147217, "learning_rate": 1.9069452980946528e-05, "loss": 1.5066, "step": 3028 }, { "epoch": 0.2327904118008605, "grad_norm": 3.537909746170044, "learning_rate": 1.9068838352796558e-05, "loss": 1.337, "step": 3030 }, { "epoch": 0.2329440688383528, "grad_norm": 4.002386093139648, "learning_rate": 1.906822372464659e-05, "loss": 1.6041, "step": 3032 }, { "epoch": 0.23309772587584512, "grad_norm": 3.76108717918396, "learning_rate": 1.906760909649662e-05, "loss": 1.415, "step": 3034 }, { "epoch": 0.23325138291333744, "grad_norm": 3.783128023147583, "learning_rate": 1.906699446834665e-05, "loss": 1.3389, "step": 3036 }, { "epoch": 0.23340503995082976, "grad_norm": 3.628431558609009, "learning_rate": 1.9066379840196683e-05, "loss": 1.4391, "step": 3038 }, { "epoch": 0.23355869698832207, "grad_norm": 3.474383592605591, "learning_rate": 1.9065765212046713e-05, "loss": 1.6119, "step": 3040 }, { "epoch": 0.2337123540258144, "grad_norm": 3.4599947929382324, "learning_rate": 1.9065150583896743e-05, "loss": 1.5611, "step": 3042 }, { "epoch": 0.2338660110633067, "grad_norm": 3.5529699325561523, "learning_rate": 1.9064535955746776e-05, "loss": 1.5067, "step": 3044 }, { "epoch": 0.23401966810079902, "grad_norm": 3.7097628116607666, "learning_rate": 1.9063921327596806e-05, "loss": 1.5433, "step": 3046 }, { "epoch": 0.23417332513829134, "grad_norm": 3.5367672443389893, "learning_rate": 1.9063306699446835e-05, "loss": 1.5336, "step": 3048 }, { "epoch": 0.23432698217578365, "grad_norm": 3.9764819145202637, "learning_rate": 1.9062692071296868e-05, "loss": 1.4671, "step": 3050 }, { "epoch": 0.23448063921327597, "grad_norm": 3.8383781909942627, "learning_rate": 1.9062077443146898e-05, "loss": 1.5263, "step": 3052 }, { "epoch": 0.2346342962507683, "grad_norm": 3.899062395095825, "learning_rate": 1.906146281499693e-05, "loss": 1.4592, "step": 3054 }, { "epoch": 0.2347879532882606, "grad_norm": 3.9547131061553955, "learning_rate": 1.9060848186846957e-05, "loss": 1.4835, "step": 3056 }, { "epoch": 0.23494161032575292, "grad_norm": 4.053800582885742, "learning_rate": 1.906023355869699e-05, "loss": 1.3099, "step": 3058 }, { "epoch": 0.23509526736324524, "grad_norm": 3.65938401222229, "learning_rate": 1.905961893054702e-05, "loss": 1.5249, "step": 3060 }, { "epoch": 0.23524892440073755, "grad_norm": 3.514390707015991, "learning_rate": 1.905900430239705e-05, "loss": 1.4615, "step": 3062 }, { "epoch": 0.23540258143822987, "grad_norm": 3.5489819049835205, "learning_rate": 1.9058389674247083e-05, "loss": 1.3366, "step": 3064 }, { "epoch": 0.23555623847572218, "grad_norm": 3.88149356842041, "learning_rate": 1.9057775046097113e-05, "loss": 1.6467, "step": 3066 }, { "epoch": 0.2357098955132145, "grad_norm": 3.912346601486206, "learning_rate": 1.9057160417947142e-05, "loss": 1.6762, "step": 3068 }, { "epoch": 0.23586355255070682, "grad_norm": 3.9258928298950195, "learning_rate": 1.9056545789797175e-05, "loss": 1.6048, "step": 3070 }, { "epoch": 0.23601720958819913, "grad_norm": 4.021396636962891, "learning_rate": 1.9055931161647205e-05, "loss": 1.6012, "step": 3072 }, { "epoch": 0.23617086662569145, "grad_norm": 3.756319999694824, "learning_rate": 1.9055316533497238e-05, "loss": 1.505, "step": 3074 }, { "epoch": 0.23632452366318377, "grad_norm": 3.8204259872436523, "learning_rate": 1.9054701905347264e-05, "loss": 1.4867, "step": 3076 }, { "epoch": 0.23647818070067608, "grad_norm": 3.6957874298095703, "learning_rate": 1.9054087277197297e-05, "loss": 1.6277, "step": 3078 }, { "epoch": 0.2366318377381684, "grad_norm": 3.272287607192993, "learning_rate": 1.9053472649047327e-05, "loss": 1.276, "step": 3080 }, { "epoch": 0.23678549477566072, "grad_norm": 3.6933488845825195, "learning_rate": 1.9052858020897357e-05, "loss": 1.569, "step": 3082 }, { "epoch": 0.23693915181315303, "grad_norm": 4.391809940338135, "learning_rate": 1.905224339274739e-05, "loss": 1.4694, "step": 3084 }, { "epoch": 0.23709280885064535, "grad_norm": 3.9479763507843018, "learning_rate": 1.905162876459742e-05, "loss": 1.4931, "step": 3086 }, { "epoch": 0.23724646588813766, "grad_norm": 3.6774582862854004, "learning_rate": 1.905101413644745e-05, "loss": 1.6148, "step": 3088 }, { "epoch": 0.23740012292562998, "grad_norm": 3.2491109371185303, "learning_rate": 1.9050399508297482e-05, "loss": 1.629, "step": 3090 }, { "epoch": 0.23755377996312232, "grad_norm": 3.8977487087249756, "learning_rate": 1.9049784880147512e-05, "loss": 1.4423, "step": 3092 }, { "epoch": 0.23770743700061464, "grad_norm": 3.438864231109619, "learning_rate": 1.9049170251997545e-05, "loss": 1.3834, "step": 3094 }, { "epoch": 0.23786109403810696, "grad_norm": 3.799286365509033, "learning_rate": 1.9048555623847575e-05, "loss": 1.6176, "step": 3096 }, { "epoch": 0.23801475107559927, "grad_norm": 3.665806770324707, "learning_rate": 1.9047940995697604e-05, "loss": 1.6198, "step": 3098 }, { "epoch": 0.2381684081130916, "grad_norm": 4.014431476593018, "learning_rate": 1.9047326367547637e-05, "loss": 1.4675, "step": 3100 }, { "epoch": 0.2383220651505839, "grad_norm": 3.293631076812744, "learning_rate": 1.9046711739397664e-05, "loss": 1.4766, "step": 3102 }, { "epoch": 0.23847572218807622, "grad_norm": 3.3270959854125977, "learning_rate": 1.9046097111247697e-05, "loss": 1.3699, "step": 3104 }, { "epoch": 0.23862937922556854, "grad_norm": 3.4034950733184814, "learning_rate": 1.9045482483097727e-05, "loss": 1.3973, "step": 3106 }, { "epoch": 0.23878303626306085, "grad_norm": 4.333171367645264, "learning_rate": 1.9044867854947756e-05, "loss": 1.4756, "step": 3108 }, { "epoch": 0.23893669330055317, "grad_norm": 3.8026325702667236, "learning_rate": 1.904425322679779e-05, "loss": 1.5391, "step": 3110 }, { "epoch": 0.2390903503380455, "grad_norm": 4.236443996429443, "learning_rate": 1.904363859864782e-05, "loss": 1.5477, "step": 3112 }, { "epoch": 0.2392440073755378, "grad_norm": 3.979402780532837, "learning_rate": 1.9043023970497852e-05, "loss": 1.5373, "step": 3114 }, { "epoch": 0.23939766441303012, "grad_norm": 4.0372443199157715, "learning_rate": 1.9042409342347882e-05, "loss": 1.5543, "step": 3116 }, { "epoch": 0.23955132145052244, "grad_norm": 3.884770393371582, "learning_rate": 1.904179471419791e-05, "loss": 1.4172, "step": 3118 }, { "epoch": 0.23970497848801475, "grad_norm": 3.852644205093384, "learning_rate": 1.9041180086047944e-05, "loss": 1.5288, "step": 3120 }, { "epoch": 0.23985863552550707, "grad_norm": 3.6635940074920654, "learning_rate": 1.9040565457897974e-05, "loss": 1.4201, "step": 3122 }, { "epoch": 0.24001229256299939, "grad_norm": 3.9604082107543945, "learning_rate": 1.9039950829748004e-05, "loss": 1.5166, "step": 3124 }, { "epoch": 0.2401659496004917, "grad_norm": 3.084805965423584, "learning_rate": 1.9039336201598037e-05, "loss": 1.472, "step": 3126 }, { "epoch": 0.24031960663798402, "grad_norm": 3.741560697555542, "learning_rate": 1.9038721573448063e-05, "loss": 1.5564, "step": 3128 }, { "epoch": 0.24047326367547633, "grad_norm": 3.3543457984924316, "learning_rate": 1.9038106945298096e-05, "loss": 1.5025, "step": 3130 }, { "epoch": 0.24062692071296865, "grad_norm": 3.7456090450286865, "learning_rate": 1.9037492317148126e-05, "loss": 1.5189, "step": 3132 }, { "epoch": 0.24078057775046097, "grad_norm": 3.4162986278533936, "learning_rate": 1.9036877688998156e-05, "loss": 1.622, "step": 3134 }, { "epoch": 0.24093423478795328, "grad_norm": 4.069525241851807, "learning_rate": 1.903626306084819e-05, "loss": 1.6334, "step": 3136 }, { "epoch": 0.2410878918254456, "grad_norm": 3.6952528953552246, "learning_rate": 1.903564843269822e-05, "loss": 1.4343, "step": 3138 }, { "epoch": 0.24124154886293792, "grad_norm": 3.562882900238037, "learning_rate": 1.903503380454825e-05, "loss": 1.3561, "step": 3140 }, { "epoch": 0.24139520590043023, "grad_norm": 3.8922019004821777, "learning_rate": 1.903441917639828e-05, "loss": 1.4585, "step": 3142 }, { "epoch": 0.24154886293792255, "grad_norm": 3.665839910507202, "learning_rate": 1.903380454824831e-05, "loss": 1.7023, "step": 3144 }, { "epoch": 0.24170251997541486, "grad_norm": 3.4272682666778564, "learning_rate": 1.9033189920098344e-05, "loss": 1.5242, "step": 3146 }, { "epoch": 0.24185617701290718, "grad_norm": 3.8519303798675537, "learning_rate": 1.9032575291948374e-05, "loss": 1.4245, "step": 3148 }, { "epoch": 0.2420098340503995, "grad_norm": 3.6139211654663086, "learning_rate": 1.9031960663798403e-05, "loss": 1.3585, "step": 3150 }, { "epoch": 0.2421634910878918, "grad_norm": 4.0420966148376465, "learning_rate": 1.9031346035648436e-05, "loss": 1.577, "step": 3152 }, { "epoch": 0.24231714812538413, "grad_norm": 3.215604305267334, "learning_rate": 1.9030731407498463e-05, "loss": 1.4625, "step": 3154 }, { "epoch": 0.24247080516287645, "grad_norm": 4.21508264541626, "learning_rate": 1.9030116779348496e-05, "loss": 1.4881, "step": 3156 }, { "epoch": 0.2426244622003688, "grad_norm": 3.351759910583496, "learning_rate": 1.9029502151198525e-05, "loss": 1.5047, "step": 3158 }, { "epoch": 0.2427781192378611, "grad_norm": 3.681420087814331, "learning_rate": 1.902888752304856e-05, "loss": 1.3997, "step": 3160 }, { "epoch": 0.24293177627535342, "grad_norm": 3.751452922821045, "learning_rate": 1.9028272894898588e-05, "loss": 1.5441, "step": 3162 }, { "epoch": 0.24308543331284574, "grad_norm": 3.7483127117156982, "learning_rate": 1.9027658266748618e-05, "loss": 1.5719, "step": 3164 }, { "epoch": 0.24323909035033806, "grad_norm": 4.023811340332031, "learning_rate": 1.902704363859865e-05, "loss": 1.5686, "step": 3166 }, { "epoch": 0.24339274738783037, "grad_norm": 3.655235767364502, "learning_rate": 1.902642901044868e-05, "loss": 1.4781, "step": 3168 }, { "epoch": 0.2435464044253227, "grad_norm": 3.8731863498687744, "learning_rate": 1.902581438229871e-05, "loss": 1.4679, "step": 3170 }, { "epoch": 0.243700061462815, "grad_norm": 3.9271247386932373, "learning_rate": 1.9025199754148743e-05, "loss": 1.6756, "step": 3172 }, { "epoch": 0.24385371850030732, "grad_norm": 3.7917888164520264, "learning_rate": 1.902458512599877e-05, "loss": 1.5004, "step": 3174 }, { "epoch": 0.24400737553779964, "grad_norm": 3.341165781021118, "learning_rate": 1.9023970497848803e-05, "loss": 1.4226, "step": 3176 }, { "epoch": 0.24416103257529195, "grad_norm": 3.6708719730377197, "learning_rate": 1.9023355869698832e-05, "loss": 1.4637, "step": 3178 }, { "epoch": 0.24431468961278427, "grad_norm": 3.8046674728393555, "learning_rate": 1.9022741241548865e-05, "loss": 1.4156, "step": 3180 }, { "epoch": 0.24446834665027659, "grad_norm": 3.7367637157440186, "learning_rate": 1.9022126613398895e-05, "loss": 1.5416, "step": 3182 }, { "epoch": 0.2446220036877689, "grad_norm": 3.453035354614258, "learning_rate": 1.9021511985248925e-05, "loss": 1.459, "step": 3184 }, { "epoch": 0.24477566072526122, "grad_norm": 3.8223092555999756, "learning_rate": 1.9020897357098958e-05, "loss": 1.5786, "step": 3186 }, { "epoch": 0.24492931776275353, "grad_norm": 3.4199178218841553, "learning_rate": 1.9020282728948988e-05, "loss": 1.6042, "step": 3188 }, { "epoch": 0.24508297480024585, "grad_norm": 3.828610420227051, "learning_rate": 1.9019668100799017e-05, "loss": 1.4459, "step": 3190 }, { "epoch": 0.24523663183773817, "grad_norm": 3.5266432762145996, "learning_rate": 1.901905347264905e-05, "loss": 1.5675, "step": 3192 }, { "epoch": 0.24539028887523048, "grad_norm": 3.738628387451172, "learning_rate": 1.901843884449908e-05, "loss": 1.4532, "step": 3194 }, { "epoch": 0.2455439459127228, "grad_norm": 3.32661771774292, "learning_rate": 1.901782421634911e-05, "loss": 1.4821, "step": 3196 }, { "epoch": 0.24569760295021512, "grad_norm": 3.8040380477905273, "learning_rate": 1.9017209588199143e-05, "loss": 1.5357, "step": 3198 }, { "epoch": 0.24585125998770743, "grad_norm": 3.24855637550354, "learning_rate": 1.9016594960049172e-05, "loss": 1.36, "step": 3200 }, { "epoch": 0.24600491702519975, "grad_norm": 3.494410514831543, "learning_rate": 1.9015980331899202e-05, "loss": 1.705, "step": 3202 }, { "epoch": 0.24615857406269206, "grad_norm": 3.760875940322876, "learning_rate": 1.9015365703749232e-05, "loss": 1.4332, "step": 3204 }, { "epoch": 0.24631223110018438, "grad_norm": 3.2367358207702637, "learning_rate": 1.9014751075599265e-05, "loss": 1.429, "step": 3206 }, { "epoch": 0.2464658881376767, "grad_norm": 3.4821407794952393, "learning_rate": 1.9014136447449295e-05, "loss": 1.5069, "step": 3208 }, { "epoch": 0.246619545175169, "grad_norm": 3.640047073364258, "learning_rate": 1.9013521819299324e-05, "loss": 1.4913, "step": 3210 }, { "epoch": 0.24677320221266133, "grad_norm": 3.7262701988220215, "learning_rate": 1.9012907191149357e-05, "loss": 1.4896, "step": 3212 }, { "epoch": 0.24692685925015365, "grad_norm": 4.199042797088623, "learning_rate": 1.9012292562999387e-05, "loss": 1.5823, "step": 3214 }, { "epoch": 0.24708051628764596, "grad_norm": 3.3966314792633057, "learning_rate": 1.9011677934849417e-05, "loss": 1.4982, "step": 3216 }, { "epoch": 0.24723417332513828, "grad_norm": 3.9125616550445557, "learning_rate": 1.901106330669945e-05, "loss": 1.4921, "step": 3218 }, { "epoch": 0.2473878303626306, "grad_norm": 3.583458423614502, "learning_rate": 1.901044867854948e-05, "loss": 1.4831, "step": 3220 }, { "epoch": 0.24754148740012294, "grad_norm": 3.9417824745178223, "learning_rate": 1.900983405039951e-05, "loss": 1.533, "step": 3222 }, { "epoch": 0.24769514443761526, "grad_norm": 3.6411452293395996, "learning_rate": 1.9009219422249542e-05, "loss": 1.4955, "step": 3224 }, { "epoch": 0.24784880147510757, "grad_norm": 3.75683856010437, "learning_rate": 1.9008604794099572e-05, "loss": 1.5469, "step": 3226 }, { "epoch": 0.2480024585125999, "grad_norm": 4.777811050415039, "learning_rate": 1.90079901659496e-05, "loss": 1.4417, "step": 3228 }, { "epoch": 0.2481561155500922, "grad_norm": 3.5624022483825684, "learning_rate": 1.900737553779963e-05, "loss": 1.4786, "step": 3230 }, { "epoch": 0.24830977258758452, "grad_norm": 3.511544704437256, "learning_rate": 1.9006760909649664e-05, "loss": 1.4188, "step": 3232 }, { "epoch": 0.24846342962507684, "grad_norm": 3.966139078140259, "learning_rate": 1.9006146281499694e-05, "loss": 1.5502, "step": 3234 }, { "epoch": 0.24861708666256915, "grad_norm": 3.792802095413208, "learning_rate": 1.9005531653349724e-05, "loss": 1.4778, "step": 3236 }, { "epoch": 0.24877074370006147, "grad_norm": 4.079867839813232, "learning_rate": 1.9004917025199757e-05, "loss": 1.5482, "step": 3238 }, { "epoch": 0.24892440073755379, "grad_norm": 3.424563407897949, "learning_rate": 1.9004302397049786e-05, "loss": 1.5189, "step": 3240 }, { "epoch": 0.2490780577750461, "grad_norm": 3.5327744483947754, "learning_rate": 1.9003687768899816e-05, "loss": 1.4631, "step": 3242 }, { "epoch": 0.24923171481253842, "grad_norm": 3.8727915287017822, "learning_rate": 1.900307314074985e-05, "loss": 1.5833, "step": 3244 }, { "epoch": 0.24938537185003073, "grad_norm": 4.0103302001953125, "learning_rate": 1.900245851259988e-05, "loss": 1.4591, "step": 3246 }, { "epoch": 0.24953902888752305, "grad_norm": 4.012939453125, "learning_rate": 1.900184388444991e-05, "loss": 1.5609, "step": 3248 }, { "epoch": 0.24969268592501537, "grad_norm": 3.425726890563965, "learning_rate": 1.900122925629994e-05, "loss": 1.3594, "step": 3250 }, { "epoch": 0.24984634296250768, "grad_norm": 3.660951614379883, "learning_rate": 1.900061462814997e-05, "loss": 1.3977, "step": 3252 }, { "epoch": 0.25, "grad_norm": 3.4327516555786133, "learning_rate": 1.9e-05, "loss": 1.5269, "step": 3254 }, { "epoch": 0.2501536570374923, "grad_norm": 3.792966365814209, "learning_rate": 1.899938537185003e-05, "loss": 1.4157, "step": 3256 }, { "epoch": 0.25030731407498463, "grad_norm": 3.777806282043457, "learning_rate": 1.8998770743700064e-05, "loss": 1.61, "step": 3258 }, { "epoch": 0.25046097111247695, "grad_norm": 3.6547727584838867, "learning_rate": 1.8998156115550093e-05, "loss": 1.578, "step": 3260 }, { "epoch": 0.25061462814996927, "grad_norm": 3.7142393589019775, "learning_rate": 1.8997541487400123e-05, "loss": 1.5387, "step": 3262 }, { "epoch": 0.2507682851874616, "grad_norm": 3.795875310897827, "learning_rate": 1.8996926859250156e-05, "loss": 1.4108, "step": 3264 }, { "epoch": 0.2509219422249539, "grad_norm": 4.018312931060791, "learning_rate": 1.8996312231100186e-05, "loss": 1.609, "step": 3266 }, { "epoch": 0.2510755992624462, "grad_norm": 3.576310157775879, "learning_rate": 1.8995697602950216e-05, "loss": 1.5313, "step": 3268 }, { "epoch": 0.25122925629993853, "grad_norm": 3.7783751487731934, "learning_rate": 1.899508297480025e-05, "loss": 1.5393, "step": 3270 }, { "epoch": 0.25138291333743085, "grad_norm": 3.8343939781188965, "learning_rate": 1.899446834665028e-05, "loss": 1.4593, "step": 3272 }, { "epoch": 0.25153657037492316, "grad_norm": 3.8293821811676025, "learning_rate": 1.8993853718500308e-05, "loss": 1.4991, "step": 3274 }, { "epoch": 0.2516902274124155, "grad_norm": 3.804919958114624, "learning_rate": 1.8993239090350338e-05, "loss": 1.5475, "step": 3276 }, { "epoch": 0.2518438844499078, "grad_norm": 3.3940484523773193, "learning_rate": 1.899262446220037e-05, "loss": 1.329, "step": 3278 }, { "epoch": 0.2519975414874001, "grad_norm": 4.106419086456299, "learning_rate": 1.89920098340504e-05, "loss": 1.6195, "step": 3280 }, { "epoch": 0.25215119852489243, "grad_norm": 3.56506609916687, "learning_rate": 1.899139520590043e-05, "loss": 1.6574, "step": 3282 }, { "epoch": 0.25230485556238474, "grad_norm": 3.6036717891693115, "learning_rate": 1.8990780577750463e-05, "loss": 1.4525, "step": 3284 }, { "epoch": 0.25245851259987706, "grad_norm": 4.163785934448242, "learning_rate": 1.8990165949600493e-05, "loss": 1.4881, "step": 3286 }, { "epoch": 0.2526121696373694, "grad_norm": 4.115818023681641, "learning_rate": 1.8989551321450523e-05, "loss": 1.5126, "step": 3288 }, { "epoch": 0.2527658266748617, "grad_norm": 3.6225428581237793, "learning_rate": 1.8988936693300556e-05, "loss": 1.4663, "step": 3290 }, { "epoch": 0.252919483712354, "grad_norm": 3.335973024368286, "learning_rate": 1.8988322065150585e-05, "loss": 1.3223, "step": 3292 }, { "epoch": 0.2530731407498463, "grad_norm": 3.5263774394989014, "learning_rate": 1.8987707437000615e-05, "loss": 1.4948, "step": 3294 }, { "epoch": 0.25322679778733864, "grad_norm": 3.5017597675323486, "learning_rate": 1.8987092808850648e-05, "loss": 1.2772, "step": 3296 }, { "epoch": 0.25338045482483096, "grad_norm": 3.744292736053467, "learning_rate": 1.8986478180700678e-05, "loss": 1.3346, "step": 3298 }, { "epoch": 0.2535341118623233, "grad_norm": 4.142531871795654, "learning_rate": 1.8985863552550707e-05, "loss": 1.4425, "step": 3300 }, { "epoch": 0.2536877688998156, "grad_norm": 3.734403371810913, "learning_rate": 1.8985248924400737e-05, "loss": 1.5465, "step": 3302 }, { "epoch": 0.2538414259373079, "grad_norm": 3.28890061378479, "learning_rate": 1.898463429625077e-05, "loss": 1.4063, "step": 3304 }, { "epoch": 0.2539950829748002, "grad_norm": 3.3916125297546387, "learning_rate": 1.89840196681008e-05, "loss": 1.5272, "step": 3306 }, { "epoch": 0.25414874001229254, "grad_norm": 3.462782382965088, "learning_rate": 1.898340503995083e-05, "loss": 1.5789, "step": 3308 }, { "epoch": 0.25430239704978486, "grad_norm": 3.189033031463623, "learning_rate": 1.8982790411800863e-05, "loss": 1.4295, "step": 3310 }, { "epoch": 0.2544560540872772, "grad_norm": 3.8161280155181885, "learning_rate": 1.8982175783650892e-05, "loss": 1.4154, "step": 3312 }, { "epoch": 0.2546097111247695, "grad_norm": 3.279613494873047, "learning_rate": 1.8981561155500922e-05, "loss": 1.3957, "step": 3314 }, { "epoch": 0.2547633681622618, "grad_norm": 3.655524492263794, "learning_rate": 1.8980946527350955e-05, "loss": 1.5173, "step": 3316 }, { "epoch": 0.2549170251997541, "grad_norm": 3.407174587249756, "learning_rate": 1.8980331899200985e-05, "loss": 1.4034, "step": 3318 }, { "epoch": 0.2550706822372465, "grad_norm": 3.6055874824523926, "learning_rate": 1.8979717271051014e-05, "loss": 1.4338, "step": 3320 }, { "epoch": 0.2552243392747388, "grad_norm": 3.903757333755493, "learning_rate": 1.8979102642901048e-05, "loss": 1.4843, "step": 3322 }, { "epoch": 0.2553779963122311, "grad_norm": 3.445702314376831, "learning_rate": 1.8978488014751077e-05, "loss": 1.5697, "step": 3324 }, { "epoch": 0.25553165334972344, "grad_norm": 3.888247013092041, "learning_rate": 1.897787338660111e-05, "loss": 1.3772, "step": 3326 }, { "epoch": 0.25568531038721576, "grad_norm": 3.8378067016601562, "learning_rate": 1.8977258758451137e-05, "loss": 1.461, "step": 3328 }, { "epoch": 0.2558389674247081, "grad_norm": 4.017933368682861, "learning_rate": 1.897664413030117e-05, "loss": 1.4029, "step": 3330 }, { "epoch": 0.2559926244622004, "grad_norm": 3.43816876411438, "learning_rate": 1.89760295021512e-05, "loss": 1.4058, "step": 3332 }, { "epoch": 0.2561462814996927, "grad_norm": 3.6177170276641846, "learning_rate": 1.897541487400123e-05, "loss": 1.5816, "step": 3334 }, { "epoch": 0.256299938537185, "grad_norm": 3.708869218826294, "learning_rate": 1.8974800245851262e-05, "loss": 1.5383, "step": 3336 }, { "epoch": 0.25645359557467734, "grad_norm": 3.4780690670013428, "learning_rate": 1.8974185617701292e-05, "loss": 1.4352, "step": 3338 }, { "epoch": 0.25660725261216966, "grad_norm": 3.964945077896118, "learning_rate": 1.897357098955132e-05, "loss": 1.4872, "step": 3340 }, { "epoch": 0.256760909649662, "grad_norm": 3.3842320442199707, "learning_rate": 1.8972956361401355e-05, "loss": 1.4933, "step": 3342 }, { "epoch": 0.2569145666871543, "grad_norm": 3.7447729110717773, "learning_rate": 1.8972341733251384e-05, "loss": 1.6113, "step": 3344 }, { "epoch": 0.2570682237246466, "grad_norm": 4.0971174240112305, "learning_rate": 1.8971727105101417e-05, "loss": 1.4749, "step": 3346 }, { "epoch": 0.2572218807621389, "grad_norm": 3.2165610790252686, "learning_rate": 1.8971112476951447e-05, "loss": 1.4361, "step": 3348 }, { "epoch": 0.25737553779963124, "grad_norm": 3.8490447998046875, "learning_rate": 1.8970497848801477e-05, "loss": 1.5455, "step": 3350 }, { "epoch": 0.25752919483712355, "grad_norm": 4.487706661224365, "learning_rate": 1.896988322065151e-05, "loss": 1.5265, "step": 3352 }, { "epoch": 0.25768285187461587, "grad_norm": 3.487905502319336, "learning_rate": 1.8969268592501536e-05, "loss": 1.4864, "step": 3354 }, { "epoch": 0.2578365089121082, "grad_norm": 4.049886703491211, "learning_rate": 1.896865396435157e-05, "loss": 1.3788, "step": 3356 }, { "epoch": 0.2579901659496005, "grad_norm": 3.7233083248138428, "learning_rate": 1.89680393362016e-05, "loss": 1.4227, "step": 3358 }, { "epoch": 0.2581438229870928, "grad_norm": 3.6753907203674316, "learning_rate": 1.896742470805163e-05, "loss": 1.5348, "step": 3360 }, { "epoch": 0.25829748002458514, "grad_norm": 3.5778372287750244, "learning_rate": 1.896681007990166e-05, "loss": 1.4769, "step": 3362 }, { "epoch": 0.25845113706207745, "grad_norm": 3.8472750186920166, "learning_rate": 1.896619545175169e-05, "loss": 1.4646, "step": 3364 }, { "epoch": 0.25860479409956977, "grad_norm": 3.4961729049682617, "learning_rate": 1.8965580823601724e-05, "loss": 1.4237, "step": 3366 }, { "epoch": 0.2587584511370621, "grad_norm": 3.384688377380371, "learning_rate": 1.8964966195451754e-05, "loss": 1.3757, "step": 3368 }, { "epoch": 0.2589121081745544, "grad_norm": 3.461256265640259, "learning_rate": 1.8964351567301784e-05, "loss": 1.6313, "step": 3370 }, { "epoch": 0.2590657652120467, "grad_norm": 3.5917470455169678, "learning_rate": 1.8963736939151817e-05, "loss": 1.6363, "step": 3372 }, { "epoch": 0.25921942224953903, "grad_norm": 4.257856845855713, "learning_rate": 1.8963122311001846e-05, "loss": 1.5657, "step": 3374 }, { "epoch": 0.25937307928703135, "grad_norm": 3.825030565261841, "learning_rate": 1.8962507682851876e-05, "loss": 1.4029, "step": 3376 }, { "epoch": 0.25952673632452367, "grad_norm": 3.698003053665161, "learning_rate": 1.896189305470191e-05, "loss": 1.4794, "step": 3378 }, { "epoch": 0.259680393362016, "grad_norm": 3.656878709793091, "learning_rate": 1.8961278426551935e-05, "loss": 1.4764, "step": 3380 }, { "epoch": 0.2598340503995083, "grad_norm": 3.9345102310180664, "learning_rate": 1.896066379840197e-05, "loss": 1.4978, "step": 3382 }, { "epoch": 0.2599877074370006, "grad_norm": 3.2846667766571045, "learning_rate": 1.8960049170251998e-05, "loss": 1.4882, "step": 3384 }, { "epoch": 0.26014136447449293, "grad_norm": 3.6656949520111084, "learning_rate": 1.8959434542102028e-05, "loss": 1.5028, "step": 3386 }, { "epoch": 0.26029502151198525, "grad_norm": 3.3560938835144043, "learning_rate": 1.895881991395206e-05, "loss": 1.462, "step": 3388 }, { "epoch": 0.26044867854947756, "grad_norm": 3.903484582901001, "learning_rate": 1.895820528580209e-05, "loss": 1.5055, "step": 3390 }, { "epoch": 0.2606023355869699, "grad_norm": 3.4434473514556885, "learning_rate": 1.8957590657652124e-05, "loss": 1.4052, "step": 3392 }, { "epoch": 0.2607559926244622, "grad_norm": 3.8662304878234863, "learning_rate": 1.8956976029502153e-05, "loss": 1.4433, "step": 3394 }, { "epoch": 0.2609096496619545, "grad_norm": 3.921478509902954, "learning_rate": 1.8956361401352183e-05, "loss": 1.4143, "step": 3396 }, { "epoch": 0.26106330669944683, "grad_norm": 4.5299601554870605, "learning_rate": 1.8955746773202216e-05, "loss": 1.5001, "step": 3398 }, { "epoch": 0.26121696373693915, "grad_norm": 4.293244361877441, "learning_rate": 1.8955132145052242e-05, "loss": 1.4988, "step": 3400 }, { "epoch": 0.26137062077443146, "grad_norm": 3.8071391582489014, "learning_rate": 1.8954517516902276e-05, "loss": 1.5376, "step": 3402 }, { "epoch": 0.2615242778119238, "grad_norm": 3.9262189865112305, "learning_rate": 1.8953902888752305e-05, "loss": 1.6795, "step": 3404 }, { "epoch": 0.2616779348494161, "grad_norm": 3.64919376373291, "learning_rate": 1.8953288260602335e-05, "loss": 1.4907, "step": 3406 }, { "epoch": 0.2618315918869084, "grad_norm": 3.7635629177093506, "learning_rate": 1.8952673632452368e-05, "loss": 1.6125, "step": 3408 }, { "epoch": 0.2619852489244007, "grad_norm": 3.243130683898926, "learning_rate": 1.8952059004302398e-05, "loss": 1.4673, "step": 3410 }, { "epoch": 0.26213890596189304, "grad_norm": 4.027158737182617, "learning_rate": 1.895144437615243e-05, "loss": 1.572, "step": 3412 }, { "epoch": 0.26229256299938536, "grad_norm": 3.7150518894195557, "learning_rate": 1.895082974800246e-05, "loss": 1.5544, "step": 3414 }, { "epoch": 0.2624462200368777, "grad_norm": 3.56063175201416, "learning_rate": 1.895021511985249e-05, "loss": 1.4073, "step": 3416 }, { "epoch": 0.26259987707437, "grad_norm": 3.2205371856689453, "learning_rate": 1.8949600491702523e-05, "loss": 1.3674, "step": 3418 }, { "epoch": 0.2627535341118623, "grad_norm": 3.5181539058685303, "learning_rate": 1.8948985863552553e-05, "loss": 1.5704, "step": 3420 }, { "epoch": 0.2629071911493546, "grad_norm": 3.9559669494628906, "learning_rate": 1.8948371235402583e-05, "loss": 1.4059, "step": 3422 }, { "epoch": 0.26306084818684694, "grad_norm": 3.9452176094055176, "learning_rate": 1.8947756607252616e-05, "loss": 1.5914, "step": 3424 }, { "epoch": 0.26321450522433926, "grad_norm": 3.5629098415374756, "learning_rate": 1.8947141979102642e-05, "loss": 1.4329, "step": 3426 }, { "epoch": 0.2633681622618316, "grad_norm": 3.84218430519104, "learning_rate": 1.8946527350952675e-05, "loss": 1.4207, "step": 3428 }, { "epoch": 0.2635218192993239, "grad_norm": 3.7253074645996094, "learning_rate": 1.8945912722802705e-05, "loss": 1.623, "step": 3430 }, { "epoch": 0.2636754763368162, "grad_norm": 3.367321729660034, "learning_rate": 1.8945298094652738e-05, "loss": 1.3962, "step": 3432 }, { "epoch": 0.2638291333743085, "grad_norm": 3.7452688217163086, "learning_rate": 1.8944683466502767e-05, "loss": 1.5708, "step": 3434 }, { "epoch": 0.26398279041180084, "grad_norm": 4.207433700561523, "learning_rate": 1.8944068838352797e-05, "loss": 1.5347, "step": 3436 }, { "epoch": 0.26413644744929315, "grad_norm": 3.2880661487579346, "learning_rate": 1.894345421020283e-05, "loss": 1.4852, "step": 3438 }, { "epoch": 0.26429010448678547, "grad_norm": 3.8437421321868896, "learning_rate": 1.894283958205286e-05, "loss": 1.4737, "step": 3440 }, { "epoch": 0.2644437615242778, "grad_norm": 4.380143642425537, "learning_rate": 1.894222495390289e-05, "loss": 1.4646, "step": 3442 }, { "epoch": 0.2645974185617701, "grad_norm": 3.446812391281128, "learning_rate": 1.8941610325752923e-05, "loss": 1.5911, "step": 3444 }, { "epoch": 0.2647510755992624, "grad_norm": 3.8308660984039307, "learning_rate": 1.8940995697602952e-05, "loss": 1.4695, "step": 3446 }, { "epoch": 0.26490473263675474, "grad_norm": 3.475552558898926, "learning_rate": 1.8940381069452982e-05, "loss": 1.6034, "step": 3448 }, { "epoch": 0.2650583896742471, "grad_norm": 3.3068044185638428, "learning_rate": 1.8939766441303015e-05, "loss": 1.4709, "step": 3450 }, { "epoch": 0.2652120467117394, "grad_norm": 3.57163143157959, "learning_rate": 1.8939151813153045e-05, "loss": 1.4525, "step": 3452 }, { "epoch": 0.26536570374923174, "grad_norm": 3.2639644145965576, "learning_rate": 1.8938537185003074e-05, "loss": 1.5822, "step": 3454 }, { "epoch": 0.26551936078672406, "grad_norm": 3.7956414222717285, "learning_rate": 1.8937922556853104e-05, "loss": 1.4841, "step": 3456 }, { "epoch": 0.2656730178242164, "grad_norm": 3.958355665206909, "learning_rate": 1.8937307928703137e-05, "loss": 1.6366, "step": 3458 }, { "epoch": 0.2658266748617087, "grad_norm": 4.118971347808838, "learning_rate": 1.8936693300553167e-05, "loss": 1.6656, "step": 3460 }, { "epoch": 0.265980331899201, "grad_norm": 3.3143534660339355, "learning_rate": 1.8936078672403197e-05, "loss": 1.5126, "step": 3462 }, { "epoch": 0.2661339889366933, "grad_norm": 3.237959384918213, "learning_rate": 1.893546404425323e-05, "loss": 1.4093, "step": 3464 }, { "epoch": 0.26628764597418564, "grad_norm": 3.183522939682007, "learning_rate": 1.893484941610326e-05, "loss": 1.3261, "step": 3466 }, { "epoch": 0.26644130301167795, "grad_norm": 3.7242038249969482, "learning_rate": 1.893423478795329e-05, "loss": 1.5207, "step": 3468 }, { "epoch": 0.26659496004917027, "grad_norm": 3.6746206283569336, "learning_rate": 1.8933620159803322e-05, "loss": 1.5599, "step": 3470 }, { "epoch": 0.2667486170866626, "grad_norm": 3.866694211959839, "learning_rate": 1.8933005531653352e-05, "loss": 1.4262, "step": 3472 }, { "epoch": 0.2669022741241549, "grad_norm": 3.508993148803711, "learning_rate": 1.893239090350338e-05, "loss": 1.4758, "step": 3474 }, { "epoch": 0.2670559311616472, "grad_norm": 3.6842212677001953, "learning_rate": 1.8931776275353415e-05, "loss": 1.5336, "step": 3476 }, { "epoch": 0.26720958819913954, "grad_norm": 3.6138060092926025, "learning_rate": 1.8931161647203444e-05, "loss": 1.4907, "step": 3478 }, { "epoch": 0.26736324523663185, "grad_norm": 3.9442241191864014, "learning_rate": 1.8930547019053474e-05, "loss": 1.4674, "step": 3480 }, { "epoch": 0.26751690227412417, "grad_norm": 3.853156089782715, "learning_rate": 1.8929932390903504e-05, "loss": 1.5482, "step": 3482 }, { "epoch": 0.2676705593116165, "grad_norm": 3.863734245300293, "learning_rate": 1.8929317762753537e-05, "loss": 1.3643, "step": 3484 }, { "epoch": 0.2678242163491088, "grad_norm": 3.796949625015259, "learning_rate": 1.8928703134603566e-05, "loss": 1.5439, "step": 3486 }, { "epoch": 0.2679778733866011, "grad_norm": 3.865708827972412, "learning_rate": 1.8928088506453596e-05, "loss": 1.3416, "step": 3488 }, { "epoch": 0.26813153042409343, "grad_norm": 3.347963571548462, "learning_rate": 1.892747387830363e-05, "loss": 1.495, "step": 3490 }, { "epoch": 0.26828518746158575, "grad_norm": 3.663733959197998, "learning_rate": 1.892685925015366e-05, "loss": 1.5101, "step": 3492 }, { "epoch": 0.26843884449907807, "grad_norm": 3.4801948070526123, "learning_rate": 1.892624462200369e-05, "loss": 1.5782, "step": 3494 }, { "epoch": 0.2685925015365704, "grad_norm": 3.6601758003234863, "learning_rate": 1.892562999385372e-05, "loss": 1.6072, "step": 3496 }, { "epoch": 0.2687461585740627, "grad_norm": 3.8066141605377197, "learning_rate": 1.892501536570375e-05, "loss": 1.4949, "step": 3498 }, { "epoch": 0.268899815611555, "grad_norm": 4.257297515869141, "learning_rate": 1.892440073755378e-05, "loss": 1.5364, "step": 3500 }, { "epoch": 0.26905347264904733, "grad_norm": 3.5987801551818848, "learning_rate": 1.892378610940381e-05, "loss": 1.4022, "step": 3502 }, { "epoch": 0.26920712968653965, "grad_norm": 3.342484474182129, "learning_rate": 1.8923171481253844e-05, "loss": 1.5268, "step": 3504 }, { "epoch": 0.26936078672403196, "grad_norm": 3.992051601409912, "learning_rate": 1.8922556853103873e-05, "loss": 1.4786, "step": 3506 }, { "epoch": 0.2695144437615243, "grad_norm": 3.877612829208374, "learning_rate": 1.8921942224953903e-05, "loss": 1.5529, "step": 3508 }, { "epoch": 0.2696681007990166, "grad_norm": 3.6077497005462646, "learning_rate": 1.8921327596803936e-05, "loss": 1.577, "step": 3510 }, { "epoch": 0.2698217578365089, "grad_norm": 3.497678279876709, "learning_rate": 1.8920712968653966e-05, "loss": 1.4717, "step": 3512 }, { "epoch": 0.26997541487400123, "grad_norm": 3.7486648559570312, "learning_rate": 1.8920098340503995e-05, "loss": 1.3829, "step": 3514 }, { "epoch": 0.27012907191149355, "grad_norm": 3.9724059104919434, "learning_rate": 1.891948371235403e-05, "loss": 1.6304, "step": 3516 }, { "epoch": 0.27028272894898586, "grad_norm": 3.657163381576538, "learning_rate": 1.8918869084204058e-05, "loss": 1.5072, "step": 3518 }, { "epoch": 0.2704363859864782, "grad_norm": 3.7878036499023438, "learning_rate": 1.8918254456054088e-05, "loss": 1.5066, "step": 3520 }, { "epoch": 0.2705900430239705, "grad_norm": 3.5070736408233643, "learning_rate": 1.891763982790412e-05, "loss": 1.5437, "step": 3522 }, { "epoch": 0.2707437000614628, "grad_norm": 3.5397820472717285, "learning_rate": 1.891702519975415e-05, "loss": 1.502, "step": 3524 }, { "epoch": 0.2708973570989551, "grad_norm": 3.4602363109588623, "learning_rate": 1.891641057160418e-05, "loss": 1.4634, "step": 3526 }, { "epoch": 0.27105101413644744, "grad_norm": 3.507657051086426, "learning_rate": 1.891579594345421e-05, "loss": 1.4342, "step": 3528 }, { "epoch": 0.27120467117393976, "grad_norm": 3.7547881603240967, "learning_rate": 1.8915181315304243e-05, "loss": 1.3555, "step": 3530 }, { "epoch": 0.2713583282114321, "grad_norm": 3.522979497909546, "learning_rate": 1.8914566687154273e-05, "loss": 1.4903, "step": 3532 }, { "epoch": 0.2715119852489244, "grad_norm": 3.673581123352051, "learning_rate": 1.8913952059004302e-05, "loss": 1.3561, "step": 3534 }, { "epoch": 0.2716656422864167, "grad_norm": 3.447091579437256, "learning_rate": 1.8913337430854336e-05, "loss": 1.5173, "step": 3536 }, { "epoch": 0.271819299323909, "grad_norm": 3.915567398071289, "learning_rate": 1.8912722802704365e-05, "loss": 1.502, "step": 3538 }, { "epoch": 0.27197295636140134, "grad_norm": 3.157322883605957, "learning_rate": 1.8912108174554395e-05, "loss": 1.4469, "step": 3540 }, { "epoch": 0.27212661339889366, "grad_norm": 3.3162407875061035, "learning_rate": 1.8911493546404428e-05, "loss": 1.3842, "step": 3542 }, { "epoch": 0.272280270436386, "grad_norm": 4.137020111083984, "learning_rate": 1.8910878918254458e-05, "loss": 1.4895, "step": 3544 }, { "epoch": 0.2724339274738783, "grad_norm": 3.5766658782958984, "learning_rate": 1.8910264290104487e-05, "loss": 1.4133, "step": 3546 }, { "epoch": 0.2725875845113706, "grad_norm": 3.5843989849090576, "learning_rate": 1.890964966195452e-05, "loss": 1.5026, "step": 3548 }, { "epoch": 0.2727412415488629, "grad_norm": 3.568018913269043, "learning_rate": 1.890903503380455e-05, "loss": 1.4175, "step": 3550 }, { "epoch": 0.27289489858635524, "grad_norm": 3.131871461868286, "learning_rate": 1.890842040565458e-05, "loss": 1.348, "step": 3552 }, { "epoch": 0.27304855562384756, "grad_norm": 4.097682476043701, "learning_rate": 1.890780577750461e-05, "loss": 1.4924, "step": 3554 }, { "epoch": 0.27320221266133987, "grad_norm": 3.9119415283203125, "learning_rate": 1.8907191149354643e-05, "loss": 1.431, "step": 3556 }, { "epoch": 0.2733558696988322, "grad_norm": 3.3313307762145996, "learning_rate": 1.8906576521204672e-05, "loss": 1.5508, "step": 3558 }, { "epoch": 0.2735095267363245, "grad_norm": 3.149726152420044, "learning_rate": 1.8905961893054702e-05, "loss": 1.4358, "step": 3560 }, { "epoch": 0.2736631837738168, "grad_norm": 3.852184534072876, "learning_rate": 1.8905347264904735e-05, "loss": 1.4537, "step": 3562 }, { "epoch": 0.27381684081130914, "grad_norm": 3.5580079555511475, "learning_rate": 1.8904732636754765e-05, "loss": 1.6419, "step": 3564 }, { "epoch": 0.27397049784880145, "grad_norm": 3.484915018081665, "learning_rate": 1.8904118008604794e-05, "loss": 1.4562, "step": 3566 }, { "epoch": 0.27412415488629377, "grad_norm": 4.093445301055908, "learning_rate": 1.8903503380454827e-05, "loss": 1.4078, "step": 3568 }, { "epoch": 0.2742778119237861, "grad_norm": 3.514277458190918, "learning_rate": 1.8902888752304857e-05, "loss": 1.4331, "step": 3570 }, { "epoch": 0.2744314689612784, "grad_norm": 4.0411810874938965, "learning_rate": 1.8902274124154887e-05, "loss": 1.4517, "step": 3572 }, { "epoch": 0.2745851259987707, "grad_norm": 3.505648374557495, "learning_rate": 1.890165949600492e-05, "loss": 1.415, "step": 3574 }, { "epoch": 0.27473878303626303, "grad_norm": 4.12168550491333, "learning_rate": 1.890104486785495e-05, "loss": 1.4809, "step": 3576 }, { "epoch": 0.27489244007375535, "grad_norm": 3.5654399394989014, "learning_rate": 1.8900430239704983e-05, "loss": 1.5533, "step": 3578 }, { "epoch": 0.2750460971112477, "grad_norm": 4.086021900177002, "learning_rate": 1.889981561155501e-05, "loss": 1.5845, "step": 3580 }, { "epoch": 0.27519975414874004, "grad_norm": 3.6145718097686768, "learning_rate": 1.8899200983405042e-05, "loss": 1.3608, "step": 3582 }, { "epoch": 0.27535341118623236, "grad_norm": 3.477968692779541, "learning_rate": 1.889858635525507e-05, "loss": 1.4685, "step": 3584 }, { "epoch": 0.27550706822372467, "grad_norm": 3.627117872238159, "learning_rate": 1.88979717271051e-05, "loss": 1.4376, "step": 3586 }, { "epoch": 0.275660725261217, "grad_norm": 3.631131649017334, "learning_rate": 1.8897357098955134e-05, "loss": 1.5543, "step": 3588 }, { "epoch": 0.2758143822987093, "grad_norm": 3.5396790504455566, "learning_rate": 1.8896742470805164e-05, "loss": 1.4909, "step": 3590 }, { "epoch": 0.2759680393362016, "grad_norm": 3.816535472869873, "learning_rate": 1.8896127842655194e-05, "loss": 1.399, "step": 3592 }, { "epoch": 0.27612169637369394, "grad_norm": 3.565800428390503, "learning_rate": 1.8895513214505227e-05, "loss": 1.2989, "step": 3594 }, { "epoch": 0.27627535341118625, "grad_norm": 4.057579040527344, "learning_rate": 1.8894898586355257e-05, "loss": 1.484, "step": 3596 }, { "epoch": 0.27642901044867857, "grad_norm": 3.517569065093994, "learning_rate": 1.889428395820529e-05, "loss": 1.5513, "step": 3598 }, { "epoch": 0.2765826674861709, "grad_norm": 3.6325526237487793, "learning_rate": 1.8893669330055316e-05, "loss": 1.4866, "step": 3600 }, { "epoch": 0.2767363245236632, "grad_norm": 3.697222948074341, "learning_rate": 1.889305470190535e-05, "loss": 1.5287, "step": 3602 }, { "epoch": 0.2768899815611555, "grad_norm": 3.3492066860198975, "learning_rate": 1.8892440073755382e-05, "loss": 1.5102, "step": 3604 }, { "epoch": 0.27704363859864783, "grad_norm": 3.629889726638794, "learning_rate": 1.889182544560541e-05, "loss": 1.5843, "step": 3606 }, { "epoch": 0.27719729563614015, "grad_norm": 4.252458095550537, "learning_rate": 1.889121081745544e-05, "loss": 1.4569, "step": 3608 }, { "epoch": 0.27735095267363247, "grad_norm": 3.4219071865081787, "learning_rate": 1.889059618930547e-05, "loss": 1.5789, "step": 3610 }, { "epoch": 0.2775046097111248, "grad_norm": 3.4524829387664795, "learning_rate": 1.88899815611555e-05, "loss": 1.5203, "step": 3612 }, { "epoch": 0.2776582667486171, "grad_norm": 3.130014181137085, "learning_rate": 1.8889366933005534e-05, "loss": 1.3825, "step": 3614 }, { "epoch": 0.2778119237861094, "grad_norm": 3.439162015914917, "learning_rate": 1.8888752304855564e-05, "loss": 1.3977, "step": 3616 }, { "epoch": 0.27796558082360173, "grad_norm": 3.3094050884246826, "learning_rate": 1.8888137676705597e-05, "loss": 1.4268, "step": 3618 }, { "epoch": 0.27811923786109405, "grad_norm": 3.6266913414001465, "learning_rate": 1.8887523048555626e-05, "loss": 1.7052, "step": 3620 }, { "epoch": 0.27827289489858636, "grad_norm": 3.5086283683776855, "learning_rate": 1.8886908420405656e-05, "loss": 1.4714, "step": 3622 }, { "epoch": 0.2784265519360787, "grad_norm": 3.7588820457458496, "learning_rate": 1.888629379225569e-05, "loss": 1.414, "step": 3624 }, { "epoch": 0.278580208973571, "grad_norm": 3.1712088584899902, "learning_rate": 1.8885679164105715e-05, "loss": 1.4258, "step": 3626 }, { "epoch": 0.2787338660110633, "grad_norm": 3.323568820953369, "learning_rate": 1.888506453595575e-05, "loss": 1.5158, "step": 3628 }, { "epoch": 0.27888752304855563, "grad_norm": 3.520214557647705, "learning_rate": 1.8884449907805778e-05, "loss": 1.5343, "step": 3630 }, { "epoch": 0.27904118008604795, "grad_norm": 3.6342597007751465, "learning_rate": 1.8883835279655808e-05, "loss": 1.5991, "step": 3632 }, { "epoch": 0.27919483712354026, "grad_norm": 4.191102027893066, "learning_rate": 1.888322065150584e-05, "loss": 1.4219, "step": 3634 }, { "epoch": 0.2793484941610326, "grad_norm": 4.2305779457092285, "learning_rate": 1.888260602335587e-05, "loss": 1.6116, "step": 3636 }, { "epoch": 0.2795021511985249, "grad_norm": 3.5002593994140625, "learning_rate": 1.8881991395205904e-05, "loss": 1.4798, "step": 3638 }, { "epoch": 0.2796558082360172, "grad_norm": 2.9989280700683594, "learning_rate": 1.8881376767055933e-05, "loss": 1.2869, "step": 3640 }, { "epoch": 0.27980946527350953, "grad_norm": 3.6314494609832764, "learning_rate": 1.8880762138905963e-05, "loss": 1.3329, "step": 3642 }, { "epoch": 0.27996312231100184, "grad_norm": 3.1968257427215576, "learning_rate": 1.8880147510755996e-05, "loss": 1.4851, "step": 3644 }, { "epoch": 0.28011677934849416, "grad_norm": 3.2353439331054688, "learning_rate": 1.8879532882606026e-05, "loss": 1.5025, "step": 3646 }, { "epoch": 0.2802704363859865, "grad_norm": 3.394658327102661, "learning_rate": 1.8878918254456055e-05, "loss": 1.4325, "step": 3648 }, { "epoch": 0.2804240934234788, "grad_norm": 3.303316831588745, "learning_rate": 1.887830362630609e-05, "loss": 1.482, "step": 3650 }, { "epoch": 0.2805777504609711, "grad_norm": 3.44572114944458, "learning_rate": 1.8877688998156115e-05, "loss": 1.4126, "step": 3652 }, { "epoch": 0.2807314074984634, "grad_norm": 3.64277982711792, "learning_rate": 1.8877074370006148e-05, "loss": 1.4845, "step": 3654 }, { "epoch": 0.28088506453595574, "grad_norm": 3.9562630653381348, "learning_rate": 1.8876459741856178e-05, "loss": 1.4596, "step": 3656 }, { "epoch": 0.28103872157344806, "grad_norm": 3.3409039974212646, "learning_rate": 1.8875845113706207e-05, "loss": 1.4672, "step": 3658 }, { "epoch": 0.2811923786109404, "grad_norm": 3.783766031265259, "learning_rate": 1.887523048555624e-05, "loss": 1.662, "step": 3660 }, { "epoch": 0.2813460356484327, "grad_norm": 4.031844139099121, "learning_rate": 1.887461585740627e-05, "loss": 1.5427, "step": 3662 }, { "epoch": 0.281499692685925, "grad_norm": 3.9957854747772217, "learning_rate": 1.8874001229256303e-05, "loss": 1.505, "step": 3664 }, { "epoch": 0.2816533497234173, "grad_norm": 3.1964595317840576, "learning_rate": 1.8873386601106333e-05, "loss": 1.3482, "step": 3666 }, { "epoch": 0.28180700676090964, "grad_norm": 4.197731018066406, "learning_rate": 1.8872771972956362e-05, "loss": 1.4717, "step": 3668 }, { "epoch": 0.28196066379840196, "grad_norm": 3.6347153186798096, "learning_rate": 1.8872157344806395e-05, "loss": 1.5804, "step": 3670 }, { "epoch": 0.28211432083589427, "grad_norm": 3.2941391468048096, "learning_rate": 1.8871542716656425e-05, "loss": 1.4578, "step": 3672 }, { "epoch": 0.2822679778733866, "grad_norm": 3.4472978115081787, "learning_rate": 1.8870928088506455e-05, "loss": 1.445, "step": 3674 }, { "epoch": 0.2824216349108789, "grad_norm": 3.385477304458618, "learning_rate": 1.8870313460356488e-05, "loss": 1.3812, "step": 3676 }, { "epoch": 0.2825752919483712, "grad_norm": 3.4385907649993896, "learning_rate": 1.8869698832206514e-05, "loss": 1.4371, "step": 3678 }, { "epoch": 0.28272894898586354, "grad_norm": 3.3431503772735596, "learning_rate": 1.8869084204056547e-05, "loss": 1.4938, "step": 3680 }, { "epoch": 0.28288260602335585, "grad_norm": 3.215934991836548, "learning_rate": 1.8868469575906577e-05, "loss": 1.4857, "step": 3682 }, { "epoch": 0.28303626306084817, "grad_norm": 3.9575302600860596, "learning_rate": 1.886785494775661e-05, "loss": 1.4792, "step": 3684 }, { "epoch": 0.2831899200983405, "grad_norm": 2.991589307785034, "learning_rate": 1.886724031960664e-05, "loss": 1.3701, "step": 3686 }, { "epoch": 0.2833435771358328, "grad_norm": 3.557347536087036, "learning_rate": 1.886662569145667e-05, "loss": 1.4585, "step": 3688 }, { "epoch": 0.2834972341733251, "grad_norm": 3.5175206661224365, "learning_rate": 1.8866011063306702e-05, "loss": 1.6076, "step": 3690 }, { "epoch": 0.28365089121081744, "grad_norm": 4.530017375946045, "learning_rate": 1.8865396435156732e-05, "loss": 1.6293, "step": 3692 }, { "epoch": 0.28380454824830975, "grad_norm": 4.1233320236206055, "learning_rate": 1.8864781807006762e-05, "loss": 1.3835, "step": 3694 }, { "epoch": 0.28395820528580207, "grad_norm": 3.4351844787597656, "learning_rate": 1.8864167178856795e-05, "loss": 1.3736, "step": 3696 }, { "epoch": 0.2841118623232944, "grad_norm": 3.3016061782836914, "learning_rate": 1.8863552550706825e-05, "loss": 1.5559, "step": 3698 }, { "epoch": 0.2842655193607867, "grad_norm": 4.053586006164551, "learning_rate": 1.8862937922556854e-05, "loss": 1.492, "step": 3700 }, { "epoch": 0.284419176398279, "grad_norm": 3.7303128242492676, "learning_rate": 1.8862323294406887e-05, "loss": 1.368, "step": 3702 }, { "epoch": 0.28457283343577133, "grad_norm": 3.206644296646118, "learning_rate": 1.8861708666256917e-05, "loss": 1.405, "step": 3704 }, { "epoch": 0.28472649047326365, "grad_norm": 3.58850359916687, "learning_rate": 1.8861094038106947e-05, "loss": 1.3416, "step": 3706 }, { "epoch": 0.28488014751075597, "grad_norm": 4.489683628082275, "learning_rate": 1.8860479409956976e-05, "loss": 1.315, "step": 3708 }, { "epoch": 0.28503380454824834, "grad_norm": 3.5646450519561768, "learning_rate": 1.885986478180701e-05, "loss": 1.4451, "step": 3710 }, { "epoch": 0.28518746158574065, "grad_norm": 3.7019946575164795, "learning_rate": 1.885925015365704e-05, "loss": 1.4403, "step": 3712 }, { "epoch": 0.28534111862323297, "grad_norm": 3.1615312099456787, "learning_rate": 1.885863552550707e-05, "loss": 1.4083, "step": 3714 }, { "epoch": 0.2854947756607253, "grad_norm": 3.486154556274414, "learning_rate": 1.8858020897357102e-05, "loss": 1.46, "step": 3716 }, { "epoch": 0.2856484326982176, "grad_norm": 4.051648139953613, "learning_rate": 1.885740626920713e-05, "loss": 1.4501, "step": 3718 }, { "epoch": 0.2858020897357099, "grad_norm": 3.6418347358703613, "learning_rate": 1.885679164105716e-05, "loss": 1.4339, "step": 3720 }, { "epoch": 0.28595574677320224, "grad_norm": 3.5506417751312256, "learning_rate": 1.8856177012907194e-05, "loss": 1.4697, "step": 3722 }, { "epoch": 0.28610940381069455, "grad_norm": 3.2061004638671875, "learning_rate": 1.8855562384757224e-05, "loss": 1.4207, "step": 3724 }, { "epoch": 0.28626306084818687, "grad_norm": 3.7919108867645264, "learning_rate": 1.8854947756607254e-05, "loss": 1.4708, "step": 3726 }, { "epoch": 0.2864167178856792, "grad_norm": 3.9891295433044434, "learning_rate": 1.8854333128457283e-05, "loss": 1.4744, "step": 3728 }, { "epoch": 0.2865703749231715, "grad_norm": 3.63608455657959, "learning_rate": 1.8853718500307316e-05, "loss": 1.5032, "step": 3730 }, { "epoch": 0.2867240319606638, "grad_norm": 3.5813047885894775, "learning_rate": 1.8853103872157346e-05, "loss": 1.581, "step": 3732 }, { "epoch": 0.28687768899815613, "grad_norm": 2.9888648986816406, "learning_rate": 1.8852489244007376e-05, "loss": 1.5092, "step": 3734 }, { "epoch": 0.28703134603564845, "grad_norm": 3.926024913787842, "learning_rate": 1.885187461585741e-05, "loss": 1.6346, "step": 3736 }, { "epoch": 0.28718500307314077, "grad_norm": 3.657963275909424, "learning_rate": 1.885125998770744e-05, "loss": 1.4397, "step": 3738 }, { "epoch": 0.2873386601106331, "grad_norm": 3.7995994091033936, "learning_rate": 1.8850645359557468e-05, "loss": 1.5096, "step": 3740 }, { "epoch": 0.2874923171481254, "grad_norm": 3.9522006511688232, "learning_rate": 1.88500307314075e-05, "loss": 1.3611, "step": 3742 }, { "epoch": 0.2876459741856177, "grad_norm": 4.043625831604004, "learning_rate": 1.884941610325753e-05, "loss": 1.5588, "step": 3744 }, { "epoch": 0.28779963122311003, "grad_norm": 3.6213269233703613, "learning_rate": 1.884880147510756e-05, "loss": 1.6376, "step": 3746 }, { "epoch": 0.28795328826060235, "grad_norm": 3.4300777912139893, "learning_rate": 1.8848186846957594e-05, "loss": 1.6227, "step": 3748 }, { "epoch": 0.28810694529809466, "grad_norm": 3.5119426250457764, "learning_rate": 1.8847572218807623e-05, "loss": 1.479, "step": 3750 }, { "epoch": 0.288260602335587, "grad_norm": 3.7524571418762207, "learning_rate": 1.8846957590657653e-05, "loss": 1.4244, "step": 3752 }, { "epoch": 0.2884142593730793, "grad_norm": 2.923687219619751, "learning_rate": 1.8846342962507683e-05, "loss": 1.4444, "step": 3754 }, { "epoch": 0.2885679164105716, "grad_norm": 4.0984296798706055, "learning_rate": 1.8845728334357716e-05, "loss": 1.5281, "step": 3756 }, { "epoch": 0.28872157344806393, "grad_norm": 3.296266794204712, "learning_rate": 1.8845113706207746e-05, "loss": 1.4179, "step": 3758 }, { "epoch": 0.28887523048555624, "grad_norm": 3.4929823875427246, "learning_rate": 1.8844499078057775e-05, "loss": 1.5606, "step": 3760 }, { "epoch": 0.28902888752304856, "grad_norm": 3.201611042022705, "learning_rate": 1.884388444990781e-05, "loss": 1.4212, "step": 3762 }, { "epoch": 0.2891825445605409, "grad_norm": 3.264183282852173, "learning_rate": 1.8843269821757838e-05, "loss": 1.3158, "step": 3764 }, { "epoch": 0.2893362015980332, "grad_norm": 2.94294810295105, "learning_rate": 1.8842655193607868e-05, "loss": 1.4594, "step": 3766 }, { "epoch": 0.2894898586355255, "grad_norm": 18.42613410949707, "learning_rate": 1.88420405654579e-05, "loss": 1.4853, "step": 3768 }, { "epoch": 0.2896435156730178, "grad_norm": 3.516327381134033, "learning_rate": 1.884142593730793e-05, "loss": 1.4182, "step": 3770 }, { "epoch": 0.28979717271051014, "grad_norm": 3.5198872089385986, "learning_rate": 1.884081130915796e-05, "loss": 1.3558, "step": 3772 }, { "epoch": 0.28995082974800246, "grad_norm": 3.396672487258911, "learning_rate": 1.8840196681007993e-05, "loss": 1.292, "step": 3774 }, { "epoch": 0.2901044867854948, "grad_norm": 3.304868698120117, "learning_rate": 1.8839582052858023e-05, "loss": 1.5348, "step": 3776 }, { "epoch": 0.2902581438229871, "grad_norm": 3.4228885173797607, "learning_rate": 1.8838967424708053e-05, "loss": 1.4141, "step": 3778 }, { "epoch": 0.2904118008604794, "grad_norm": 3.2031636238098145, "learning_rate": 1.8838352796558082e-05, "loss": 1.3834, "step": 3780 }, { "epoch": 0.2905654578979717, "grad_norm": 3.6728415489196777, "learning_rate": 1.8837738168408115e-05, "loss": 1.3677, "step": 3782 }, { "epoch": 0.29071911493546404, "grad_norm": 3.418509006500244, "learning_rate": 1.8837123540258145e-05, "loss": 1.5158, "step": 3784 }, { "epoch": 0.29087277197295636, "grad_norm": 3.4060676097869873, "learning_rate": 1.8836508912108175e-05, "loss": 1.561, "step": 3786 }, { "epoch": 0.2910264290104487, "grad_norm": 3.5852086544036865, "learning_rate": 1.8835894283958208e-05, "loss": 1.4347, "step": 3788 }, { "epoch": 0.291180086047941, "grad_norm": 3.987565517425537, "learning_rate": 1.8835279655808237e-05, "loss": 1.4233, "step": 3790 }, { "epoch": 0.2913337430854333, "grad_norm": 3.6521356105804443, "learning_rate": 1.8834665027658267e-05, "loss": 1.4408, "step": 3792 }, { "epoch": 0.2914874001229256, "grad_norm": 5.208387851715088, "learning_rate": 1.88340503995083e-05, "loss": 1.6046, "step": 3794 }, { "epoch": 0.29164105716041794, "grad_norm": 3.065742254257202, "learning_rate": 1.883343577135833e-05, "loss": 1.4079, "step": 3796 }, { "epoch": 0.29179471419791025, "grad_norm": 3.513312816619873, "learning_rate": 1.883282114320836e-05, "loss": 1.547, "step": 3798 }, { "epoch": 0.29194837123540257, "grad_norm": 2.8478775024414062, "learning_rate": 1.8832206515058393e-05, "loss": 1.3478, "step": 3800 }, { "epoch": 0.2921020282728949, "grad_norm": 3.3886022567749023, "learning_rate": 1.8831591886908422e-05, "loss": 1.3928, "step": 3802 }, { "epoch": 0.2922556853103872, "grad_norm": 3.5944290161132812, "learning_rate": 1.8830977258758452e-05, "loss": 1.3548, "step": 3804 }, { "epoch": 0.2924093423478795, "grad_norm": 4.22208309173584, "learning_rate": 1.8830362630608482e-05, "loss": 1.5221, "step": 3806 }, { "epoch": 0.29256299938537184, "grad_norm": 3.431133985519409, "learning_rate": 1.8829748002458515e-05, "loss": 1.5649, "step": 3808 }, { "epoch": 0.29271665642286415, "grad_norm": 3.4773547649383545, "learning_rate": 1.8829133374308544e-05, "loss": 1.3719, "step": 3810 }, { "epoch": 0.29287031346035647, "grad_norm": 3.4864048957824707, "learning_rate": 1.8828518746158574e-05, "loss": 1.3556, "step": 3812 }, { "epoch": 0.2930239704978488, "grad_norm": 3.3974108695983887, "learning_rate": 1.8827904118008607e-05, "loss": 1.6966, "step": 3814 }, { "epoch": 0.2931776275353411, "grad_norm": 3.71232533454895, "learning_rate": 1.8827289489858637e-05, "loss": 1.5968, "step": 3816 }, { "epoch": 0.2933312845728334, "grad_norm": 3.693206787109375, "learning_rate": 1.8826674861708667e-05, "loss": 1.5184, "step": 3818 }, { "epoch": 0.29348494161032573, "grad_norm": 3.710134744644165, "learning_rate": 1.88260602335587e-05, "loss": 1.5327, "step": 3820 }, { "epoch": 0.29363859864781805, "grad_norm": 3.417041540145874, "learning_rate": 1.882544560540873e-05, "loss": 1.5009, "step": 3822 }, { "epoch": 0.29379225568531037, "grad_norm": 3.0500283241271973, "learning_rate": 1.882483097725876e-05, "loss": 1.5554, "step": 3824 }, { "epoch": 0.2939459127228027, "grad_norm": 3.6560235023498535, "learning_rate": 1.882421634910879e-05, "loss": 1.4728, "step": 3826 }, { "epoch": 0.294099569760295, "grad_norm": 3.733311653137207, "learning_rate": 1.8823601720958822e-05, "loss": 1.291, "step": 3828 }, { "epoch": 0.2942532267977873, "grad_norm": 3.633963108062744, "learning_rate": 1.8822987092808855e-05, "loss": 1.6095, "step": 3830 }, { "epoch": 0.29440688383527963, "grad_norm": 3.664761781692505, "learning_rate": 1.882237246465888e-05, "loss": 1.3628, "step": 3832 }, { "epoch": 0.29456054087277195, "grad_norm": 4.00673246383667, "learning_rate": 1.8821757836508914e-05, "loss": 1.5943, "step": 3834 }, { "epoch": 0.29471419791026426, "grad_norm": 3.2937426567077637, "learning_rate": 1.8821143208358944e-05, "loss": 1.341, "step": 3836 }, { "epoch": 0.2948678549477566, "grad_norm": 2.9937968254089355, "learning_rate": 1.8820528580208974e-05, "loss": 1.5026, "step": 3838 }, { "epoch": 0.29502151198524895, "grad_norm": 3.309907913208008, "learning_rate": 1.8819913952059007e-05, "loss": 1.4908, "step": 3840 }, { "epoch": 0.29517516902274127, "grad_norm": 3.4216926097869873, "learning_rate": 1.8819299323909036e-05, "loss": 1.4074, "step": 3842 }, { "epoch": 0.2953288260602336, "grad_norm": 3.285968780517578, "learning_rate": 1.8818684695759066e-05, "loss": 1.4249, "step": 3844 }, { "epoch": 0.2954824830977259, "grad_norm": 3.103705644607544, "learning_rate": 1.88180700676091e-05, "loss": 1.3529, "step": 3846 }, { "epoch": 0.2956361401352182, "grad_norm": 3.3987631797790527, "learning_rate": 1.881745543945913e-05, "loss": 1.5761, "step": 3848 }, { "epoch": 0.29578979717271053, "grad_norm": 3.2349257469177246, "learning_rate": 1.8816840811309162e-05, "loss": 1.5087, "step": 3850 }, { "epoch": 0.29594345421020285, "grad_norm": 3.374030113220215, "learning_rate": 1.8816226183159188e-05, "loss": 1.4778, "step": 3852 }, { "epoch": 0.29609711124769517, "grad_norm": 3.8146629333496094, "learning_rate": 1.881561155500922e-05, "loss": 1.4556, "step": 3854 }, { "epoch": 0.2962507682851875, "grad_norm": 3.6150548458099365, "learning_rate": 1.881499692685925e-05, "loss": 1.2892, "step": 3856 }, { "epoch": 0.2964044253226798, "grad_norm": 3.345280170440674, "learning_rate": 1.881438229870928e-05, "loss": 1.4552, "step": 3858 }, { "epoch": 0.2965580823601721, "grad_norm": 2.790616273880005, "learning_rate": 1.8813767670559314e-05, "loss": 1.4375, "step": 3860 }, { "epoch": 0.29671173939766443, "grad_norm": 3.415255546569824, "learning_rate": 1.8813153042409343e-05, "loss": 1.5357, "step": 3862 }, { "epoch": 0.29686539643515675, "grad_norm": 3.4177236557006836, "learning_rate": 1.8812538414259373e-05, "loss": 1.6121, "step": 3864 }, { "epoch": 0.29701905347264906, "grad_norm": 3.5526952743530273, "learning_rate": 1.8811923786109406e-05, "loss": 1.4373, "step": 3866 }, { "epoch": 0.2971727105101414, "grad_norm": 3.729252576828003, "learning_rate": 1.8811309157959436e-05, "loss": 1.3217, "step": 3868 }, { "epoch": 0.2973263675476337, "grad_norm": 3.5976545810699463, "learning_rate": 1.881069452980947e-05, "loss": 1.4549, "step": 3870 }, { "epoch": 0.297480024585126, "grad_norm": 3.2108335494995117, "learning_rate": 1.88100799016595e-05, "loss": 1.5147, "step": 3872 }, { "epoch": 0.29763368162261833, "grad_norm": 3.279672622680664, "learning_rate": 1.8809465273509528e-05, "loss": 1.3839, "step": 3874 }, { "epoch": 0.29778733866011065, "grad_norm": 3.832663059234619, "learning_rate": 1.880885064535956e-05, "loss": 1.4755, "step": 3876 }, { "epoch": 0.29794099569760296, "grad_norm": 3.84242844581604, "learning_rate": 1.8808236017209588e-05, "loss": 1.4298, "step": 3878 }, { "epoch": 0.2980946527350953, "grad_norm": 3.9651095867156982, "learning_rate": 1.880762138905962e-05, "loss": 1.4646, "step": 3880 }, { "epoch": 0.2982483097725876, "grad_norm": 3.306433916091919, "learning_rate": 1.880700676090965e-05, "loss": 1.44, "step": 3882 }, { "epoch": 0.2984019668100799, "grad_norm": 3.3314905166625977, "learning_rate": 1.880639213275968e-05, "loss": 1.6509, "step": 3884 }, { "epoch": 0.2985556238475722, "grad_norm": 3.340235471725464, "learning_rate": 1.8805777504609713e-05, "loss": 1.4151, "step": 3886 }, { "epoch": 0.29870928088506454, "grad_norm": 4.01276159286499, "learning_rate": 1.8805162876459743e-05, "loss": 1.4507, "step": 3888 }, { "epoch": 0.29886293792255686, "grad_norm": 3.1373071670532227, "learning_rate": 1.8804548248309776e-05, "loss": 1.3502, "step": 3890 }, { "epoch": 0.2990165949600492, "grad_norm": 3.7189245223999023, "learning_rate": 1.8803933620159806e-05, "loss": 1.5203, "step": 3892 }, { "epoch": 0.2991702519975415, "grad_norm": 3.4619784355163574, "learning_rate": 1.8803318992009835e-05, "loss": 1.4551, "step": 3894 }, { "epoch": 0.2993239090350338, "grad_norm": 4.235640048980713, "learning_rate": 1.8802704363859868e-05, "loss": 1.4857, "step": 3896 }, { "epoch": 0.2994775660725261, "grad_norm": 3.1417219638824463, "learning_rate": 1.8802089735709898e-05, "loss": 1.4426, "step": 3898 }, { "epoch": 0.29963122311001844, "grad_norm": 3.5244154930114746, "learning_rate": 1.8801475107559928e-05, "loss": 1.4637, "step": 3900 }, { "epoch": 0.29978488014751076, "grad_norm": 3.3270487785339355, "learning_rate": 1.880086047940996e-05, "loss": 1.2798, "step": 3902 }, { "epoch": 0.2999385371850031, "grad_norm": 4.055757522583008, "learning_rate": 1.8800245851259987e-05, "loss": 1.4502, "step": 3904 }, { "epoch": 0.3000921942224954, "grad_norm": 3.4112069606781006, "learning_rate": 1.879963122311002e-05, "loss": 1.4785, "step": 3906 }, { "epoch": 0.3002458512599877, "grad_norm": 3.5137009620666504, "learning_rate": 1.879901659496005e-05, "loss": 1.4411, "step": 3908 }, { "epoch": 0.30039950829748, "grad_norm": 3.7082371711730957, "learning_rate": 1.879840196681008e-05, "loss": 1.4781, "step": 3910 }, { "epoch": 0.30055316533497234, "grad_norm": 3.384655237197876, "learning_rate": 1.8797787338660113e-05, "loss": 1.4317, "step": 3912 }, { "epoch": 0.30070682237246465, "grad_norm": 3.7674429416656494, "learning_rate": 1.8797172710510142e-05, "loss": 1.4593, "step": 3914 }, { "epoch": 0.30086047940995697, "grad_norm": 3.340071439743042, "learning_rate": 1.8796558082360175e-05, "loss": 1.5556, "step": 3916 }, { "epoch": 0.3010141364474493, "grad_norm": 4.023772239685059, "learning_rate": 1.8795943454210205e-05, "loss": 1.4398, "step": 3918 }, { "epoch": 0.3011677934849416, "grad_norm": 3.8660881519317627, "learning_rate": 1.8795328826060235e-05, "loss": 1.5777, "step": 3920 }, { "epoch": 0.3013214505224339, "grad_norm": 3.843226909637451, "learning_rate": 1.8794714197910268e-05, "loss": 1.5498, "step": 3922 }, { "epoch": 0.30147510755992624, "grad_norm": 3.1243956089019775, "learning_rate": 1.8794099569760294e-05, "loss": 1.4583, "step": 3924 }, { "epoch": 0.30162876459741855, "grad_norm": 5.0555219650268555, "learning_rate": 1.8793484941610327e-05, "loss": 1.5479, "step": 3926 }, { "epoch": 0.30178242163491087, "grad_norm": 3.7559964656829834, "learning_rate": 1.879287031346036e-05, "loss": 1.3887, "step": 3928 }, { "epoch": 0.3019360786724032, "grad_norm": 4.365975856781006, "learning_rate": 1.8792255685310386e-05, "loss": 1.5871, "step": 3930 }, { "epoch": 0.3020897357098955, "grad_norm": 3.261347770690918, "learning_rate": 1.879164105716042e-05, "loss": 1.3479, "step": 3932 }, { "epoch": 0.3022433927473878, "grad_norm": 3.1579368114471436, "learning_rate": 1.879102642901045e-05, "loss": 1.5464, "step": 3934 }, { "epoch": 0.30239704978488013, "grad_norm": 3.574812173843384, "learning_rate": 1.8790411800860482e-05, "loss": 1.4928, "step": 3936 }, { "epoch": 0.30255070682237245, "grad_norm": 3.7639217376708984, "learning_rate": 1.8789797172710512e-05, "loss": 1.4009, "step": 3938 }, { "epoch": 0.30270436385986477, "grad_norm": 3.222525119781494, "learning_rate": 1.878918254456054e-05, "loss": 1.3771, "step": 3940 }, { "epoch": 0.3028580208973571, "grad_norm": 3.6078226566314697, "learning_rate": 1.8788567916410575e-05, "loss": 1.4167, "step": 3942 }, { "epoch": 0.3030116779348494, "grad_norm": 3.52523136138916, "learning_rate": 1.8787953288260604e-05, "loss": 1.5286, "step": 3944 }, { "epoch": 0.3031653349723417, "grad_norm": 3.7852256298065186, "learning_rate": 1.8787338660110634e-05, "loss": 1.5492, "step": 3946 }, { "epoch": 0.30331899200983403, "grad_norm": 3.428528308868408, "learning_rate": 1.8786724031960667e-05, "loss": 1.5489, "step": 3948 }, { "epoch": 0.30347264904732635, "grad_norm": 3.4774372577667236, "learning_rate": 1.8786109403810693e-05, "loss": 1.4985, "step": 3950 }, { "epoch": 0.30362630608481866, "grad_norm": 3.802961587905884, "learning_rate": 1.8785494775660727e-05, "loss": 1.4524, "step": 3952 }, { "epoch": 0.303779963122311, "grad_norm": 3.172055959701538, "learning_rate": 1.8784880147510756e-05, "loss": 1.519, "step": 3954 }, { "epoch": 0.3039336201598033, "grad_norm": 3.412998676300049, "learning_rate": 1.878426551936079e-05, "loss": 1.2872, "step": 3956 }, { "epoch": 0.3040872771972956, "grad_norm": 3.683685302734375, "learning_rate": 1.878365089121082e-05, "loss": 1.5138, "step": 3958 }, { "epoch": 0.30424093423478793, "grad_norm": 3.2270660400390625, "learning_rate": 1.878303626306085e-05, "loss": 1.4364, "step": 3960 }, { "epoch": 0.30439459127228025, "grad_norm": 3.330509901046753, "learning_rate": 1.8782421634910882e-05, "loss": 1.5553, "step": 3962 }, { "epoch": 0.30454824830977256, "grad_norm": 4.157684803009033, "learning_rate": 1.878180700676091e-05, "loss": 1.5988, "step": 3964 }, { "epoch": 0.3047019053472649, "grad_norm": 3.3523051738739014, "learning_rate": 1.878119237861094e-05, "loss": 1.5015, "step": 3966 }, { "epoch": 0.3048555623847572, "grad_norm": 3.5124311447143555, "learning_rate": 1.8780577750460974e-05, "loss": 1.4289, "step": 3968 }, { "epoch": 0.30500921942224957, "grad_norm": 3.454429864883423, "learning_rate": 1.8779963122311004e-05, "loss": 1.4927, "step": 3970 }, { "epoch": 0.3051628764597419, "grad_norm": 3.9791088104248047, "learning_rate": 1.8779348494161034e-05, "loss": 1.6143, "step": 3972 }, { "epoch": 0.3053165334972342, "grad_norm": 3.819364547729492, "learning_rate": 1.8778733866011067e-05, "loss": 1.5797, "step": 3974 }, { "epoch": 0.3054701905347265, "grad_norm": 3.4424610137939453, "learning_rate": 1.8778119237861096e-05, "loss": 1.4364, "step": 3976 }, { "epoch": 0.30562384757221883, "grad_norm": 3.86603045463562, "learning_rate": 1.8777504609711126e-05, "loss": 1.5102, "step": 3978 }, { "epoch": 0.30577750460971115, "grad_norm": 3.8857436180114746, "learning_rate": 1.8776889981561156e-05, "loss": 1.5109, "step": 3980 }, { "epoch": 0.30593116164720346, "grad_norm": 3.8796331882476807, "learning_rate": 1.877627535341119e-05, "loss": 1.4598, "step": 3982 }, { "epoch": 0.3060848186846958, "grad_norm": 3.510911464691162, "learning_rate": 1.877566072526122e-05, "loss": 1.4632, "step": 3984 }, { "epoch": 0.3062384757221881, "grad_norm": 3.9845762252807617, "learning_rate": 1.8775046097111248e-05, "loss": 1.5021, "step": 3986 }, { "epoch": 0.3063921327596804, "grad_norm": 3.142535448074341, "learning_rate": 1.877443146896128e-05, "loss": 1.4725, "step": 3988 }, { "epoch": 0.30654578979717273, "grad_norm": 3.894141435623169, "learning_rate": 1.877381684081131e-05, "loss": 1.5427, "step": 3990 }, { "epoch": 0.30669944683466505, "grad_norm": 3.3294742107391357, "learning_rate": 1.877320221266134e-05, "loss": 1.5218, "step": 3992 }, { "epoch": 0.30685310387215736, "grad_norm": 3.9828765392303467, "learning_rate": 1.8772587584511374e-05, "loss": 1.5058, "step": 3994 }, { "epoch": 0.3070067609096497, "grad_norm": 3.469477415084839, "learning_rate": 1.8771972956361403e-05, "loss": 1.4868, "step": 3996 }, { "epoch": 0.307160417947142, "grad_norm": 3.239800453186035, "learning_rate": 1.8771358328211433e-05, "loss": 1.3784, "step": 3998 }, { "epoch": 0.3073140749846343, "grad_norm": 3.649010419845581, "learning_rate": 1.8770743700061466e-05, "loss": 1.3653, "step": 4000 }, { "epoch": 0.3074677320221266, "grad_norm": 3.0160863399505615, "learning_rate": 1.8770129071911496e-05, "loss": 1.2752, "step": 4002 }, { "epoch": 0.30762138905961894, "grad_norm": 3.026254892349243, "learning_rate": 1.8769514443761525e-05, "loss": 1.52, "step": 4004 }, { "epoch": 0.30777504609711126, "grad_norm": 3.2732443809509277, "learning_rate": 1.8768899815611555e-05, "loss": 1.4611, "step": 4006 }, { "epoch": 0.3079287031346036, "grad_norm": 3.575145721435547, "learning_rate": 1.8768285187461588e-05, "loss": 1.4099, "step": 4008 }, { "epoch": 0.3080823601720959, "grad_norm": 4.473994255065918, "learning_rate": 1.8767670559311618e-05, "loss": 1.5474, "step": 4010 }, { "epoch": 0.3082360172095882, "grad_norm": 3.5895843505859375, "learning_rate": 1.8767055931161648e-05, "loss": 1.3964, "step": 4012 }, { "epoch": 0.3083896742470805, "grad_norm": 3.707585334777832, "learning_rate": 1.876644130301168e-05, "loss": 1.4779, "step": 4014 }, { "epoch": 0.30854333128457284, "grad_norm": 2.8725762367248535, "learning_rate": 1.876582667486171e-05, "loss": 1.4191, "step": 4016 }, { "epoch": 0.30869698832206516, "grad_norm": 4.254179000854492, "learning_rate": 1.876521204671174e-05, "loss": 1.5689, "step": 4018 }, { "epoch": 0.3088506453595575, "grad_norm": 3.4411301612854004, "learning_rate": 1.8764597418561773e-05, "loss": 1.4418, "step": 4020 }, { "epoch": 0.3090043023970498, "grad_norm": 3.499048948287964, "learning_rate": 1.8763982790411803e-05, "loss": 1.4129, "step": 4022 }, { "epoch": 0.3091579594345421, "grad_norm": 3.3201608657836914, "learning_rate": 1.8763368162261832e-05, "loss": 1.4546, "step": 4024 }, { "epoch": 0.3093116164720344, "grad_norm": 3.6823911666870117, "learning_rate": 1.8762753534111865e-05, "loss": 1.3802, "step": 4026 }, { "epoch": 0.30946527350952674, "grad_norm": 3.8812990188598633, "learning_rate": 1.8762138905961895e-05, "loss": 1.4478, "step": 4028 }, { "epoch": 0.30961893054701906, "grad_norm": 3.55722713470459, "learning_rate": 1.8761524277811925e-05, "loss": 1.5156, "step": 4030 }, { "epoch": 0.30977258758451137, "grad_norm": 3.832512378692627, "learning_rate": 1.8760909649661955e-05, "loss": 1.4308, "step": 4032 }, { "epoch": 0.3099262446220037, "grad_norm": 3.5508055686950684, "learning_rate": 1.8760295021511988e-05, "loss": 1.3902, "step": 4034 }, { "epoch": 0.310079901659496, "grad_norm": 3.4884133338928223, "learning_rate": 1.8759680393362017e-05, "loss": 1.3769, "step": 4036 }, { "epoch": 0.3102335586969883, "grad_norm": 3.4577252864837646, "learning_rate": 1.8759065765212047e-05, "loss": 1.4883, "step": 4038 }, { "epoch": 0.31038721573448064, "grad_norm": 3.4539401531219482, "learning_rate": 1.875845113706208e-05, "loss": 1.4841, "step": 4040 }, { "epoch": 0.31054087277197295, "grad_norm": 3.1228976249694824, "learning_rate": 1.875783650891211e-05, "loss": 1.366, "step": 4042 }, { "epoch": 0.31069452980946527, "grad_norm": 3.5434763431549072, "learning_rate": 1.875722188076214e-05, "loss": 1.5336, "step": 4044 }, { "epoch": 0.3108481868469576, "grad_norm": 3.5703988075256348, "learning_rate": 1.8756607252612172e-05, "loss": 1.3806, "step": 4046 }, { "epoch": 0.3110018438844499, "grad_norm": 3.933476686477661, "learning_rate": 1.8755992624462202e-05, "loss": 1.4786, "step": 4048 }, { "epoch": 0.3111555009219422, "grad_norm": 3.1267027854919434, "learning_rate": 1.8755377996312232e-05, "loss": 1.3594, "step": 4050 }, { "epoch": 0.31130915795943453, "grad_norm": 3.2897634506225586, "learning_rate": 1.875476336816226e-05, "loss": 1.4306, "step": 4052 }, { "epoch": 0.31146281499692685, "grad_norm": 3.3133482933044434, "learning_rate": 1.8754148740012295e-05, "loss": 1.4541, "step": 4054 }, { "epoch": 0.31161647203441917, "grad_norm": 3.548305034637451, "learning_rate": 1.8753534111862324e-05, "loss": 1.3178, "step": 4056 }, { "epoch": 0.3117701290719115, "grad_norm": 3.642669200897217, "learning_rate": 1.8752919483712354e-05, "loss": 1.6155, "step": 4058 }, { "epoch": 0.3119237861094038, "grad_norm": 3.6712703704833984, "learning_rate": 1.8752304855562387e-05, "loss": 1.4287, "step": 4060 }, { "epoch": 0.3120774431468961, "grad_norm": 3.957775592803955, "learning_rate": 1.8751690227412417e-05, "loss": 1.5324, "step": 4062 }, { "epoch": 0.31223110018438843, "grad_norm": 3.234064817428589, "learning_rate": 1.8751075599262446e-05, "loss": 1.4191, "step": 4064 }, { "epoch": 0.31238475722188075, "grad_norm": 4.080970287322998, "learning_rate": 1.875046097111248e-05, "loss": 1.5567, "step": 4066 }, { "epoch": 0.31253841425937307, "grad_norm": 3.7865617275238037, "learning_rate": 1.874984634296251e-05, "loss": 1.3855, "step": 4068 }, { "epoch": 0.3126920712968654, "grad_norm": 3.3486435413360596, "learning_rate": 1.874923171481254e-05, "loss": 1.4618, "step": 4070 }, { "epoch": 0.3128457283343577, "grad_norm": 3.3620622158050537, "learning_rate": 1.8748617086662572e-05, "loss": 1.4594, "step": 4072 }, { "epoch": 0.31299938537185, "grad_norm": 3.3821537494659424, "learning_rate": 1.87480024585126e-05, "loss": 1.3137, "step": 4074 }, { "epoch": 0.31315304240934233, "grad_norm": 3.1241402626037598, "learning_rate": 1.874738783036263e-05, "loss": 1.5348, "step": 4076 }, { "epoch": 0.31330669944683465, "grad_norm": 3.696268081665039, "learning_rate": 1.874677320221266e-05, "loss": 1.5934, "step": 4078 }, { "epoch": 0.31346035648432696, "grad_norm": 3.3421854972839355, "learning_rate": 1.8746158574062694e-05, "loss": 1.4672, "step": 4080 }, { "epoch": 0.3136140135218193, "grad_norm": 4.347311496734619, "learning_rate": 1.8745543945912724e-05, "loss": 1.4414, "step": 4082 }, { "epoch": 0.3137676705593116, "grad_norm": 3.2006402015686035, "learning_rate": 1.8744929317762753e-05, "loss": 1.4057, "step": 4084 }, { "epoch": 0.3139213275968039, "grad_norm": 3.5505783557891846, "learning_rate": 1.8744314689612786e-05, "loss": 1.4324, "step": 4086 }, { "epoch": 0.31407498463429623, "grad_norm": 3.9366791248321533, "learning_rate": 1.8743700061462816e-05, "loss": 1.5607, "step": 4088 }, { "epoch": 0.31422864167178854, "grad_norm": 3.4869041442871094, "learning_rate": 1.8743085433312846e-05, "loss": 1.368, "step": 4090 }, { "epoch": 0.31438229870928086, "grad_norm": 3.363190174102783, "learning_rate": 1.874247080516288e-05, "loss": 1.4657, "step": 4092 }, { "epoch": 0.3145359557467732, "grad_norm": 3.5293331146240234, "learning_rate": 1.874185617701291e-05, "loss": 1.527, "step": 4094 }, { "epoch": 0.3146896127842655, "grad_norm": 3.2504899501800537, "learning_rate": 1.8741241548862938e-05, "loss": 1.4395, "step": 4096 }, { "epoch": 0.3148432698217578, "grad_norm": 3.309957981109619, "learning_rate": 1.874062692071297e-05, "loss": 1.4809, "step": 4098 }, { "epoch": 0.3149969268592501, "grad_norm": 3.5390985012054443, "learning_rate": 1.8740012292563e-05, "loss": 1.4781, "step": 4100 }, { "epoch": 0.3151505838967425, "grad_norm": 3.856665849685669, "learning_rate": 1.8739397664413034e-05, "loss": 1.569, "step": 4102 }, { "epoch": 0.3153042409342348, "grad_norm": 3.2815372943878174, "learning_rate": 1.873878303626306e-05, "loss": 1.4387, "step": 4104 }, { "epoch": 0.31545789797172713, "grad_norm": 3.4734060764312744, "learning_rate": 1.8738168408113093e-05, "loss": 1.5414, "step": 4106 }, { "epoch": 0.31561155500921945, "grad_norm": 3.0709571838378906, "learning_rate": 1.8737553779963123e-05, "loss": 1.3674, "step": 4108 }, { "epoch": 0.31576521204671176, "grad_norm": 3.460763692855835, "learning_rate": 1.8736939151813153e-05, "loss": 1.4759, "step": 4110 }, { "epoch": 0.3159188690842041, "grad_norm": 3.14346981048584, "learning_rate": 1.8736324523663186e-05, "loss": 1.4407, "step": 4112 }, { "epoch": 0.3160725261216964, "grad_norm": 3.1773383617401123, "learning_rate": 1.8735709895513216e-05, "loss": 1.4686, "step": 4114 }, { "epoch": 0.3162261831591887, "grad_norm": 3.2053041458129883, "learning_rate": 1.8735095267363245e-05, "loss": 1.4797, "step": 4116 }, { "epoch": 0.31637984019668103, "grad_norm": 3.4825661182403564, "learning_rate": 1.873448063921328e-05, "loss": 1.3502, "step": 4118 }, { "epoch": 0.31653349723417334, "grad_norm": 3.6239278316497803, "learning_rate": 1.8733866011063308e-05, "loss": 1.5613, "step": 4120 }, { "epoch": 0.31668715427166566, "grad_norm": 3.647029161453247, "learning_rate": 1.873325138291334e-05, "loss": 1.504, "step": 4122 }, { "epoch": 0.316840811309158, "grad_norm": 3.517030715942383, "learning_rate": 1.873263675476337e-05, "loss": 1.4534, "step": 4124 }, { "epoch": 0.3169944683466503, "grad_norm": 3.620820999145508, "learning_rate": 1.87320221266134e-05, "loss": 1.6111, "step": 4126 }, { "epoch": 0.3171481253841426, "grad_norm": 3.410433769226074, "learning_rate": 1.8731407498463434e-05, "loss": 1.6134, "step": 4128 }, { "epoch": 0.3173017824216349, "grad_norm": 3.1551625728607178, "learning_rate": 1.873079287031346e-05, "loss": 1.3274, "step": 4130 }, { "epoch": 0.31745543945912724, "grad_norm": 2.9773385524749756, "learning_rate": 1.8730178242163493e-05, "loss": 1.3743, "step": 4132 }, { "epoch": 0.31760909649661956, "grad_norm": 3.5205631256103516, "learning_rate": 1.8729563614013523e-05, "loss": 1.5177, "step": 4134 }, { "epoch": 0.3177627535341119, "grad_norm": 3.157318592071533, "learning_rate": 1.8728948985863552e-05, "loss": 1.3897, "step": 4136 }, { "epoch": 0.3179164105716042, "grad_norm": 3.2768332958221436, "learning_rate": 1.8728334357713585e-05, "loss": 1.5896, "step": 4138 }, { "epoch": 0.3180700676090965, "grad_norm": 3.3517439365386963, "learning_rate": 1.8727719729563615e-05, "loss": 1.4006, "step": 4140 }, { "epoch": 0.3182237246465888, "grad_norm": 3.4043068885803223, "learning_rate": 1.8727105101413648e-05, "loss": 1.4092, "step": 4142 }, { "epoch": 0.31837738168408114, "grad_norm": 4.253170490264893, "learning_rate": 1.8726490473263678e-05, "loss": 1.499, "step": 4144 }, { "epoch": 0.31853103872157346, "grad_norm": 3.3389856815338135, "learning_rate": 1.8725875845113707e-05, "loss": 1.3179, "step": 4146 }, { "epoch": 0.3186846957590658, "grad_norm": 3.3309342861175537, "learning_rate": 1.872526121696374e-05, "loss": 1.3684, "step": 4148 }, { "epoch": 0.3188383527965581, "grad_norm": 3.2987401485443115, "learning_rate": 1.8724646588813767e-05, "loss": 1.4179, "step": 4150 }, { "epoch": 0.3189920098340504, "grad_norm": 4.022191047668457, "learning_rate": 1.87240319606638e-05, "loss": 1.6518, "step": 4152 }, { "epoch": 0.3191456668715427, "grad_norm": 3.6432507038116455, "learning_rate": 1.8723417332513833e-05, "loss": 1.6108, "step": 4154 }, { "epoch": 0.31929932390903504, "grad_norm": 3.7931549549102783, "learning_rate": 1.872280270436386e-05, "loss": 1.4346, "step": 4156 }, { "epoch": 0.31945298094652735, "grad_norm": 3.829770088195801, "learning_rate": 1.8722188076213892e-05, "loss": 1.3766, "step": 4158 }, { "epoch": 0.31960663798401967, "grad_norm": 3.3749098777770996, "learning_rate": 1.8721573448063922e-05, "loss": 1.5031, "step": 4160 }, { "epoch": 0.319760295021512, "grad_norm": 3.3764944076538086, "learning_rate": 1.8720958819913955e-05, "loss": 1.4594, "step": 4162 }, { "epoch": 0.3199139520590043, "grad_norm": 3.541229248046875, "learning_rate": 1.8720344191763985e-05, "loss": 1.5586, "step": 4164 }, { "epoch": 0.3200676090964966, "grad_norm": 3.983872175216675, "learning_rate": 1.8719729563614014e-05, "loss": 1.4576, "step": 4166 }, { "epoch": 0.32022126613398894, "grad_norm": 3.3289988040924072, "learning_rate": 1.8719114935464048e-05, "loss": 1.5158, "step": 4168 }, { "epoch": 0.32037492317148125, "grad_norm": 3.6405656337738037, "learning_rate": 1.8718500307314077e-05, "loss": 1.3827, "step": 4170 }, { "epoch": 0.32052858020897357, "grad_norm": 3.8185787200927734, "learning_rate": 1.8717885679164107e-05, "loss": 1.4772, "step": 4172 }, { "epoch": 0.3206822372464659, "grad_norm": 2.896965980529785, "learning_rate": 1.871727105101414e-05, "loss": 1.4799, "step": 4174 }, { "epoch": 0.3208358942839582, "grad_norm": 3.191843271255493, "learning_rate": 1.8716656422864166e-05, "loss": 1.4894, "step": 4176 }, { "epoch": 0.3209895513214505, "grad_norm": 3.1855785846710205, "learning_rate": 1.87160417947142e-05, "loss": 1.3661, "step": 4178 }, { "epoch": 0.32114320835894283, "grad_norm": 3.66670298576355, "learning_rate": 1.871542716656423e-05, "loss": 1.4934, "step": 4180 }, { "epoch": 0.32129686539643515, "grad_norm": 2.9798126220703125, "learning_rate": 1.871481253841426e-05, "loss": 1.3584, "step": 4182 }, { "epoch": 0.32145052243392747, "grad_norm": 3.9265804290771484, "learning_rate": 1.8714197910264292e-05, "loss": 1.3534, "step": 4184 }, { "epoch": 0.3216041794714198, "grad_norm": 3.341581344604492, "learning_rate": 1.871358328211432e-05, "loss": 1.349, "step": 4186 }, { "epoch": 0.3217578365089121, "grad_norm": 3.206651449203491, "learning_rate": 1.8712968653964355e-05, "loss": 1.368, "step": 4188 }, { "epoch": 0.3219114935464044, "grad_norm": 3.7142248153686523, "learning_rate": 1.8712354025814384e-05, "loss": 1.4761, "step": 4190 }, { "epoch": 0.32206515058389673, "grad_norm": 3.654179096221924, "learning_rate": 1.8711739397664414e-05, "loss": 1.5724, "step": 4192 }, { "epoch": 0.32221880762138905, "grad_norm": 3.130634069442749, "learning_rate": 1.8711124769514447e-05, "loss": 1.3865, "step": 4194 }, { "epoch": 0.32237246465888136, "grad_norm": 3.973550319671631, "learning_rate": 1.8710510141364477e-05, "loss": 1.4493, "step": 4196 }, { "epoch": 0.3225261216963737, "grad_norm": 3.044903039932251, "learning_rate": 1.8709895513214506e-05, "loss": 1.4508, "step": 4198 }, { "epoch": 0.322679778733866, "grad_norm": 3.1042187213897705, "learning_rate": 1.870928088506454e-05, "loss": 1.5592, "step": 4200 }, { "epoch": 0.3228334357713583, "grad_norm": 3.174510955810547, "learning_rate": 1.8708666256914566e-05, "loss": 1.5262, "step": 4202 }, { "epoch": 0.32298709280885063, "grad_norm": 3.3536434173583984, "learning_rate": 1.87080516287646e-05, "loss": 1.3904, "step": 4204 }, { "epoch": 0.32314074984634295, "grad_norm": 3.7183215618133545, "learning_rate": 1.870743700061463e-05, "loss": 1.3558, "step": 4206 }, { "epoch": 0.32329440688383526, "grad_norm": 3.7605910301208496, "learning_rate": 1.870682237246466e-05, "loss": 1.4543, "step": 4208 }, { "epoch": 0.3234480639213276, "grad_norm": 3.1257007122039795, "learning_rate": 1.870620774431469e-05, "loss": 1.3025, "step": 4210 }, { "epoch": 0.3236017209588199, "grad_norm": 3.429586172103882, "learning_rate": 1.870559311616472e-05, "loss": 1.4547, "step": 4212 }, { "epoch": 0.3237553779963122, "grad_norm": 3.820537805557251, "learning_rate": 1.8704978488014754e-05, "loss": 1.5153, "step": 4214 }, { "epoch": 0.3239090350338045, "grad_norm": 3.532174825668335, "learning_rate": 1.8704363859864784e-05, "loss": 1.3529, "step": 4216 }, { "epoch": 0.32406269207129684, "grad_norm": 3.619643211364746, "learning_rate": 1.8703749231714813e-05, "loss": 1.4404, "step": 4218 }, { "epoch": 0.32421634910878916, "grad_norm": 3.474963665008545, "learning_rate": 1.8703134603564846e-05, "loss": 1.4391, "step": 4220 }, { "epoch": 0.3243700061462815, "grad_norm": 3.2901859283447266, "learning_rate": 1.8702519975414876e-05, "loss": 1.4308, "step": 4222 }, { "epoch": 0.3245236631837738, "grad_norm": 3.482391834259033, "learning_rate": 1.8701905347264906e-05, "loss": 1.4473, "step": 4224 }, { "epoch": 0.3246773202212661, "grad_norm": 3.2671616077423096, "learning_rate": 1.870129071911494e-05, "loss": 1.385, "step": 4226 }, { "epoch": 0.3248309772587584, "grad_norm": 3.6331212520599365, "learning_rate": 1.870067609096497e-05, "loss": 1.4433, "step": 4228 }, { "epoch": 0.32498463429625074, "grad_norm": 3.0199925899505615, "learning_rate": 1.8700061462814998e-05, "loss": 1.4552, "step": 4230 }, { "epoch": 0.3251382913337431, "grad_norm": 3.5435800552368164, "learning_rate": 1.8699446834665028e-05, "loss": 1.4098, "step": 4232 }, { "epoch": 0.32529194837123543, "grad_norm": 3.5522637367248535, "learning_rate": 1.869883220651506e-05, "loss": 1.3704, "step": 4234 }, { "epoch": 0.32544560540872775, "grad_norm": 3.2306251525878906, "learning_rate": 1.869821757836509e-05, "loss": 1.3251, "step": 4236 }, { "epoch": 0.32559926244622006, "grad_norm": 3.570638656616211, "learning_rate": 1.869760295021512e-05, "loss": 1.3477, "step": 4238 }, { "epoch": 0.3257529194837124, "grad_norm": 3.919713020324707, "learning_rate": 1.8696988322065153e-05, "loss": 1.455, "step": 4240 }, { "epoch": 0.3259065765212047, "grad_norm": 3.3150136470794678, "learning_rate": 1.8696373693915183e-05, "loss": 1.5392, "step": 4242 }, { "epoch": 0.326060233558697, "grad_norm": 3.362748146057129, "learning_rate": 1.8695759065765213e-05, "loss": 1.3796, "step": 4244 }, { "epoch": 0.3262138905961893, "grad_norm": 3.556650161743164, "learning_rate": 1.8695144437615246e-05, "loss": 1.3962, "step": 4246 }, { "epoch": 0.32636754763368164, "grad_norm": 3.651918649673462, "learning_rate": 1.8694529809465276e-05, "loss": 1.4432, "step": 4248 }, { "epoch": 0.32652120467117396, "grad_norm": 3.641026496887207, "learning_rate": 1.8693915181315305e-05, "loss": 1.3968, "step": 4250 }, { "epoch": 0.3266748617086663, "grad_norm": 3.573974370956421, "learning_rate": 1.869330055316534e-05, "loss": 1.5365, "step": 4252 }, { "epoch": 0.3268285187461586, "grad_norm": 3.4720635414123535, "learning_rate": 1.8692685925015368e-05, "loss": 1.4469, "step": 4254 }, { "epoch": 0.3269821757836509, "grad_norm": 3.5822393894195557, "learning_rate": 1.8692071296865398e-05, "loss": 1.5314, "step": 4256 }, { "epoch": 0.3271358328211432, "grad_norm": 3.3091721534729004, "learning_rate": 1.8691456668715427e-05, "loss": 1.5533, "step": 4258 }, { "epoch": 0.32728948985863554, "grad_norm": 3.3937950134277344, "learning_rate": 1.869084204056546e-05, "loss": 1.498, "step": 4260 }, { "epoch": 0.32744314689612786, "grad_norm": 3.1424973011016846, "learning_rate": 1.869022741241549e-05, "loss": 1.384, "step": 4262 }, { "epoch": 0.3275968039336202, "grad_norm": 3.1673672199249268, "learning_rate": 1.868961278426552e-05, "loss": 1.3467, "step": 4264 }, { "epoch": 0.3277504609711125, "grad_norm": 3.0541131496429443, "learning_rate": 1.8688998156115553e-05, "loss": 1.5197, "step": 4266 }, { "epoch": 0.3279041180086048, "grad_norm": 3.423753261566162, "learning_rate": 1.8688383527965583e-05, "loss": 1.4421, "step": 4268 }, { "epoch": 0.3280577750460971, "grad_norm": 3.221930980682373, "learning_rate": 1.8687768899815612e-05, "loss": 1.3816, "step": 4270 }, { "epoch": 0.32821143208358944, "grad_norm": 3.318981170654297, "learning_rate": 1.8687154271665645e-05, "loss": 1.3417, "step": 4272 }, { "epoch": 0.32836508912108175, "grad_norm": 3.644233226776123, "learning_rate": 1.8686539643515675e-05, "loss": 1.5216, "step": 4274 }, { "epoch": 0.32851874615857407, "grad_norm": 3.196216106414795, "learning_rate": 1.8685925015365705e-05, "loss": 1.313, "step": 4276 }, { "epoch": 0.3286724031960664, "grad_norm": 3.06890606880188, "learning_rate": 1.8685310387215734e-05, "loss": 1.446, "step": 4278 }, { "epoch": 0.3288260602335587, "grad_norm": 3.106513500213623, "learning_rate": 1.8684695759065767e-05, "loss": 1.3891, "step": 4280 }, { "epoch": 0.328979717271051, "grad_norm": 3.394026517868042, "learning_rate": 1.8684081130915797e-05, "loss": 1.3584, "step": 4282 }, { "epoch": 0.32913337430854334, "grad_norm": 3.0050766468048096, "learning_rate": 1.8683466502765827e-05, "loss": 1.4602, "step": 4284 }, { "epoch": 0.32928703134603565, "grad_norm": 3.0258212089538574, "learning_rate": 1.868285187461586e-05, "loss": 1.3084, "step": 4286 }, { "epoch": 0.32944068838352797, "grad_norm": 3.1330137252807617, "learning_rate": 1.868223724646589e-05, "loss": 1.367, "step": 4288 }, { "epoch": 0.3295943454210203, "grad_norm": 3.145908832550049, "learning_rate": 1.868162261831592e-05, "loss": 1.3691, "step": 4290 }, { "epoch": 0.3297480024585126, "grad_norm": 3.0322062969207764, "learning_rate": 1.8681007990165952e-05, "loss": 1.4524, "step": 4292 }, { "epoch": 0.3299016594960049, "grad_norm": 8.492423057556152, "learning_rate": 1.8680393362015982e-05, "loss": 1.4982, "step": 4294 }, { "epoch": 0.33005531653349723, "grad_norm": 3.640282154083252, "learning_rate": 1.867977873386601e-05, "loss": 1.4207, "step": 4296 }, { "epoch": 0.33020897357098955, "grad_norm": 3.1707777976989746, "learning_rate": 1.8679164105716045e-05, "loss": 1.4874, "step": 4298 }, { "epoch": 0.33036263060848187, "grad_norm": 3.1674551963806152, "learning_rate": 1.8678549477566074e-05, "loss": 1.5869, "step": 4300 }, { "epoch": 0.3305162876459742, "grad_norm": 3.2966535091400146, "learning_rate": 1.8677934849416104e-05, "loss": 1.4055, "step": 4302 }, { "epoch": 0.3306699446834665, "grad_norm": 3.6385233402252197, "learning_rate": 1.8677320221266134e-05, "loss": 1.6527, "step": 4304 }, { "epoch": 0.3308236017209588, "grad_norm": 3.1749191284179688, "learning_rate": 1.8676705593116167e-05, "loss": 1.3133, "step": 4306 }, { "epoch": 0.33097725875845113, "grad_norm": 3.473065137863159, "learning_rate": 1.8676090964966197e-05, "loss": 1.5052, "step": 4308 }, { "epoch": 0.33113091579594345, "grad_norm": 3.3100507259368896, "learning_rate": 1.8675476336816226e-05, "loss": 1.4369, "step": 4310 }, { "epoch": 0.33128457283343576, "grad_norm": 3.4080681800842285, "learning_rate": 1.867486170866626e-05, "loss": 1.5524, "step": 4312 }, { "epoch": 0.3314382298709281, "grad_norm": 3.419074296951294, "learning_rate": 1.867424708051629e-05, "loss": 1.515, "step": 4314 }, { "epoch": 0.3315918869084204, "grad_norm": 4.8144965171813965, "learning_rate": 1.867363245236632e-05, "loss": 1.425, "step": 4316 }, { "epoch": 0.3317455439459127, "grad_norm": 2.8420140743255615, "learning_rate": 1.8673017824216352e-05, "loss": 1.3018, "step": 4318 }, { "epoch": 0.33189920098340503, "grad_norm": 3.841867208480835, "learning_rate": 1.867240319606638e-05, "loss": 1.4571, "step": 4320 }, { "epoch": 0.33205285802089735, "grad_norm": 3.476323127746582, "learning_rate": 1.867178856791641e-05, "loss": 1.4041, "step": 4322 }, { "epoch": 0.33220651505838966, "grad_norm": 3.4778249263763428, "learning_rate": 1.8671173939766444e-05, "loss": 1.5176, "step": 4324 }, { "epoch": 0.332360172095882, "grad_norm": 3.6123509407043457, "learning_rate": 1.8670559311616474e-05, "loss": 1.4301, "step": 4326 }, { "epoch": 0.3325138291333743, "grad_norm": 3.1025562286376953, "learning_rate": 1.8669944683466504e-05, "loss": 1.3542, "step": 4328 }, { "epoch": 0.3326674861708666, "grad_norm": 3.7515599727630615, "learning_rate": 1.8669330055316533e-05, "loss": 1.5406, "step": 4330 }, { "epoch": 0.3328211432083589, "grad_norm": 3.2796072959899902, "learning_rate": 1.8668715427166566e-05, "loss": 1.4538, "step": 4332 }, { "epoch": 0.33297480024585124, "grad_norm": 3.37237548828125, "learning_rate": 1.8668100799016596e-05, "loss": 1.4781, "step": 4334 }, { "epoch": 0.33312845728334356, "grad_norm": 3.1982147693634033, "learning_rate": 1.8667486170866626e-05, "loss": 1.3791, "step": 4336 }, { "epoch": 0.3332821143208359, "grad_norm": 3.7179698944091797, "learning_rate": 1.866687154271666e-05, "loss": 1.4296, "step": 4338 }, { "epoch": 0.3334357713583282, "grad_norm": 3.3739073276519775, "learning_rate": 1.866625691456669e-05, "loss": 1.4229, "step": 4340 }, { "epoch": 0.3335894283958205, "grad_norm": 4.1158447265625, "learning_rate": 1.8665642286416718e-05, "loss": 1.4937, "step": 4342 }, { "epoch": 0.3337430854333128, "grad_norm": 3.25079345703125, "learning_rate": 1.866502765826675e-05, "loss": 1.3979, "step": 4344 }, { "epoch": 0.33389674247080514, "grad_norm": 3.4430036544799805, "learning_rate": 1.866441303011678e-05, "loss": 1.3599, "step": 4346 }, { "epoch": 0.33405039950829746, "grad_norm": 3.156423568725586, "learning_rate": 1.866379840196681e-05, "loss": 1.3951, "step": 4348 }, { "epoch": 0.3342040565457898, "grad_norm": 3.685227632522583, "learning_rate": 1.8663183773816844e-05, "loss": 1.4817, "step": 4350 }, { "epoch": 0.3343577135832821, "grad_norm": 3.6018106937408447, "learning_rate": 1.8662569145666873e-05, "loss": 1.4947, "step": 4352 }, { "epoch": 0.3345113706207744, "grad_norm": 3.826826572418213, "learning_rate": 1.8661954517516906e-05, "loss": 1.4161, "step": 4354 }, { "epoch": 0.3346650276582667, "grad_norm": 3.780466318130493, "learning_rate": 1.8661339889366933e-05, "loss": 1.4293, "step": 4356 }, { "epoch": 0.33481868469575904, "grad_norm": 3.5221405029296875, "learning_rate": 1.8660725261216966e-05, "loss": 1.359, "step": 4358 }, { "epoch": 0.33497234173325136, "grad_norm": 5.303091526031494, "learning_rate": 1.8660110633066995e-05, "loss": 1.5138, "step": 4360 }, { "epoch": 0.3351259987707437, "grad_norm": 4.311324596405029, "learning_rate": 1.8659496004917025e-05, "loss": 1.3831, "step": 4362 }, { "epoch": 0.33527965580823604, "grad_norm": 3.16186261177063, "learning_rate": 1.8658881376767058e-05, "loss": 1.4087, "step": 4364 }, { "epoch": 0.33543331284572836, "grad_norm": 3.3038687705993652, "learning_rate": 1.8658266748617088e-05, "loss": 1.4246, "step": 4366 }, { "epoch": 0.3355869698832207, "grad_norm": 3.750396490097046, "learning_rate": 1.8657652120467118e-05, "loss": 1.4069, "step": 4368 }, { "epoch": 0.335740626920713, "grad_norm": 3.6830053329467773, "learning_rate": 1.865703749231715e-05, "loss": 1.4182, "step": 4370 }, { "epoch": 0.3358942839582053, "grad_norm": 3.1328139305114746, "learning_rate": 1.865642286416718e-05, "loss": 1.573, "step": 4372 }, { "epoch": 0.3360479409956976, "grad_norm": 3.484109401702881, "learning_rate": 1.8655808236017213e-05, "loss": 1.4309, "step": 4374 }, { "epoch": 0.33620159803318994, "grad_norm": 3.408421516418457, "learning_rate": 1.865519360786724e-05, "loss": 1.4844, "step": 4376 }, { "epoch": 0.33635525507068226, "grad_norm": 3.305617094039917, "learning_rate": 1.8654578979717273e-05, "loss": 1.5163, "step": 4378 }, { "epoch": 0.3365089121081746, "grad_norm": 3.2654824256896973, "learning_rate": 1.8653964351567302e-05, "loss": 1.4274, "step": 4380 }, { "epoch": 0.3366625691456669, "grad_norm": 3.064093589782715, "learning_rate": 1.8653349723417332e-05, "loss": 1.504, "step": 4382 }, { "epoch": 0.3368162261831592, "grad_norm": 3.359511375427246, "learning_rate": 1.8652735095267365e-05, "loss": 1.486, "step": 4384 }, { "epoch": 0.3369698832206515, "grad_norm": 4.074959754943848, "learning_rate": 1.8652120467117395e-05, "loss": 1.5215, "step": 4386 }, { "epoch": 0.33712354025814384, "grad_norm": 3.4263994693756104, "learning_rate": 1.8651505838967425e-05, "loss": 1.2797, "step": 4388 }, { "epoch": 0.33727719729563616, "grad_norm": 3.5367822647094727, "learning_rate": 1.8650891210817458e-05, "loss": 1.3524, "step": 4390 }, { "epoch": 0.33743085433312847, "grad_norm": 5.375179290771484, "learning_rate": 1.8650276582667487e-05, "loss": 1.5596, "step": 4392 }, { "epoch": 0.3375845113706208, "grad_norm": 2.911515474319458, "learning_rate": 1.864966195451752e-05, "loss": 1.3833, "step": 4394 }, { "epoch": 0.3377381684081131, "grad_norm": 3.0453405380249023, "learning_rate": 1.864904732636755e-05, "loss": 1.4149, "step": 4396 }, { "epoch": 0.3378918254456054, "grad_norm": 5.521705150604248, "learning_rate": 1.864843269821758e-05, "loss": 1.4082, "step": 4398 }, { "epoch": 0.33804548248309774, "grad_norm": 3.640129804611206, "learning_rate": 1.8647818070067613e-05, "loss": 1.6099, "step": 4400 }, { "epoch": 0.33819913952059005, "grad_norm": 3.4472126960754395, "learning_rate": 1.864720344191764e-05, "loss": 1.4893, "step": 4402 }, { "epoch": 0.33835279655808237, "grad_norm": 3.440143585205078, "learning_rate": 1.8646588813767672e-05, "loss": 1.4867, "step": 4404 }, { "epoch": 0.3385064535955747, "grad_norm": 2.8615074157714844, "learning_rate": 1.8645974185617702e-05, "loss": 1.3287, "step": 4406 }, { "epoch": 0.338660110633067, "grad_norm": 3.6857492923736572, "learning_rate": 1.864535955746773e-05, "loss": 1.4894, "step": 4408 }, { "epoch": 0.3388137676705593, "grad_norm": 3.317317485809326, "learning_rate": 1.8644744929317765e-05, "loss": 1.4137, "step": 4410 }, { "epoch": 0.33896742470805163, "grad_norm": 3.293468952178955, "learning_rate": 1.8644130301167794e-05, "loss": 1.3174, "step": 4412 }, { "epoch": 0.33912108174554395, "grad_norm": 2.963331699371338, "learning_rate": 1.8643515673017827e-05, "loss": 1.396, "step": 4414 }, { "epoch": 0.33927473878303627, "grad_norm": 3.2345709800720215, "learning_rate": 1.8642901044867857e-05, "loss": 1.428, "step": 4416 }, { "epoch": 0.3394283958205286, "grad_norm": 3.399947166442871, "learning_rate": 1.8642286416717887e-05, "loss": 1.3047, "step": 4418 }, { "epoch": 0.3395820528580209, "grad_norm": 3.6432290077209473, "learning_rate": 1.864167178856792e-05, "loss": 1.4678, "step": 4420 }, { "epoch": 0.3397357098955132, "grad_norm": 3.2603604793548584, "learning_rate": 1.864105716041795e-05, "loss": 1.4103, "step": 4422 }, { "epoch": 0.33988936693300553, "grad_norm": 2.959282159805298, "learning_rate": 1.864044253226798e-05, "loss": 1.4762, "step": 4424 }, { "epoch": 0.34004302397049785, "grad_norm": 3.5913710594177246, "learning_rate": 1.8639827904118012e-05, "loss": 1.5115, "step": 4426 }, { "epoch": 0.34019668100799016, "grad_norm": 3.229787826538086, "learning_rate": 1.863921327596804e-05, "loss": 1.3089, "step": 4428 }, { "epoch": 0.3403503380454825, "grad_norm": 3.356717109680176, "learning_rate": 1.863859864781807e-05, "loss": 1.5121, "step": 4430 }, { "epoch": 0.3405039950829748, "grad_norm": 3.2696216106414795, "learning_rate": 1.86379840196681e-05, "loss": 1.5658, "step": 4432 }, { "epoch": 0.3406576521204671, "grad_norm": 3.3706064224243164, "learning_rate": 1.863736939151813e-05, "loss": 1.4288, "step": 4434 }, { "epoch": 0.34081130915795943, "grad_norm": 3.3257944583892822, "learning_rate": 1.8636754763368164e-05, "loss": 1.4619, "step": 4436 }, { "epoch": 0.34096496619545175, "grad_norm": 3.006544589996338, "learning_rate": 1.8636140135218194e-05, "loss": 1.3658, "step": 4438 }, { "epoch": 0.34111862323294406, "grad_norm": 3.39756441116333, "learning_rate": 1.8635525507068227e-05, "loss": 1.2458, "step": 4440 }, { "epoch": 0.3412722802704364, "grad_norm": 3.541398286819458, "learning_rate": 1.8634910878918257e-05, "loss": 1.4737, "step": 4442 }, { "epoch": 0.3414259373079287, "grad_norm": 3.466864824295044, "learning_rate": 1.8634296250768286e-05, "loss": 1.518, "step": 4444 }, { "epoch": 0.341579594345421, "grad_norm": 3.820302963256836, "learning_rate": 1.863368162261832e-05, "loss": 1.4187, "step": 4446 }, { "epoch": 0.34173325138291333, "grad_norm": 3.4347641468048096, "learning_rate": 1.863306699446835e-05, "loss": 1.3245, "step": 4448 }, { "epoch": 0.34188690842040564, "grad_norm": 3.3268067836761475, "learning_rate": 1.863245236631838e-05, "loss": 1.3987, "step": 4450 }, { "epoch": 0.34204056545789796, "grad_norm": 3.1883060932159424, "learning_rate": 1.8631837738168412e-05, "loss": 1.3047, "step": 4452 }, { "epoch": 0.3421942224953903, "grad_norm": 3.4428231716156006, "learning_rate": 1.8631223110018438e-05, "loss": 1.5088, "step": 4454 }, { "epoch": 0.3423478795328826, "grad_norm": 3.522672414779663, "learning_rate": 1.863060848186847e-05, "loss": 1.3722, "step": 4456 }, { "epoch": 0.3425015365703749, "grad_norm": 3.2964699268341064, "learning_rate": 1.86299938537185e-05, "loss": 1.4182, "step": 4458 }, { "epoch": 0.3426551936078672, "grad_norm": 3.4219284057617188, "learning_rate": 1.8629379225568534e-05, "loss": 1.3521, "step": 4460 }, { "epoch": 0.34280885064535954, "grad_norm": 3.896960496902466, "learning_rate": 1.8628764597418564e-05, "loss": 1.5334, "step": 4462 }, { "epoch": 0.34296250768285186, "grad_norm": 3.8269264698028564, "learning_rate": 1.8628149969268593e-05, "loss": 1.4035, "step": 4464 }, { "epoch": 0.3431161647203442, "grad_norm": 3.4882993698120117, "learning_rate": 1.8627535341118626e-05, "loss": 1.4092, "step": 4466 }, { "epoch": 0.3432698217578365, "grad_norm": 3.7473082542419434, "learning_rate": 1.8626920712968656e-05, "loss": 1.5193, "step": 4468 }, { "epoch": 0.3434234787953288, "grad_norm": 3.521932601928711, "learning_rate": 1.8626306084818686e-05, "loss": 1.3961, "step": 4470 }, { "epoch": 0.3435771358328211, "grad_norm": 3.807311534881592, "learning_rate": 1.862569145666872e-05, "loss": 1.5003, "step": 4472 }, { "epoch": 0.34373079287031344, "grad_norm": 3.610978603363037, "learning_rate": 1.8625076828518745e-05, "loss": 1.4156, "step": 4474 }, { "epoch": 0.34388444990780576, "grad_norm": 3.065843343734741, "learning_rate": 1.8624462200368778e-05, "loss": 1.5464, "step": 4476 }, { "epoch": 0.34403810694529807, "grad_norm": 3.488435745239258, "learning_rate": 1.862384757221881e-05, "loss": 1.4361, "step": 4478 }, { "epoch": 0.3441917639827904, "grad_norm": 4.204352378845215, "learning_rate": 1.862323294406884e-05, "loss": 1.5593, "step": 4480 }, { "epoch": 0.3443454210202827, "grad_norm": 3.2348999977111816, "learning_rate": 1.862261831591887e-05, "loss": 1.3608, "step": 4482 }, { "epoch": 0.344499078057775, "grad_norm": 3.1907153129577637, "learning_rate": 1.86220036877689e-05, "loss": 1.3123, "step": 4484 }, { "epoch": 0.34465273509526734, "grad_norm": 3.5538814067840576, "learning_rate": 1.8621389059618933e-05, "loss": 1.4566, "step": 4486 }, { "epoch": 0.34480639213275965, "grad_norm": 3.1382246017456055, "learning_rate": 1.8620774431468963e-05, "loss": 1.4596, "step": 4488 }, { "epoch": 0.34496004917025197, "grad_norm": 3.199798583984375, "learning_rate": 1.8620159803318993e-05, "loss": 1.313, "step": 4490 }, { "epoch": 0.34511370620774434, "grad_norm": 3.067171812057495, "learning_rate": 1.8619545175169026e-05, "loss": 1.3282, "step": 4492 }, { "epoch": 0.34526736324523666, "grad_norm": 3.06500506401062, "learning_rate": 1.8618930547019055e-05, "loss": 1.5624, "step": 4494 }, { "epoch": 0.345421020282729, "grad_norm": 3.439253091812134, "learning_rate": 1.8618315918869085e-05, "loss": 1.4413, "step": 4496 }, { "epoch": 0.3455746773202213, "grad_norm": 3.672922134399414, "learning_rate": 1.8617701290719118e-05, "loss": 1.2986, "step": 4498 }, { "epoch": 0.3457283343577136, "grad_norm": 3.6982719898223877, "learning_rate": 1.8617086662569148e-05, "loss": 1.503, "step": 4500 }, { "epoch": 0.3458819913952059, "grad_norm": 3.3970751762390137, "learning_rate": 1.8616472034419178e-05, "loss": 1.5047, "step": 4502 }, { "epoch": 0.34603564843269824, "grad_norm": 3.2691447734832764, "learning_rate": 1.8615857406269207e-05, "loss": 1.4006, "step": 4504 }, { "epoch": 0.34618930547019056, "grad_norm": 3.3114163875579834, "learning_rate": 1.861524277811924e-05, "loss": 1.4555, "step": 4506 }, { "epoch": 0.34634296250768287, "grad_norm": 3.086209535598755, "learning_rate": 1.861462814996927e-05, "loss": 1.2947, "step": 4508 }, { "epoch": 0.3464966195451752, "grad_norm": 3.334463596343994, "learning_rate": 1.86140135218193e-05, "loss": 1.4271, "step": 4510 }, { "epoch": 0.3466502765826675, "grad_norm": 3.682609796524048, "learning_rate": 1.8613398893669333e-05, "loss": 1.5643, "step": 4512 }, { "epoch": 0.3468039336201598, "grad_norm": 3.6021106243133545, "learning_rate": 1.8612784265519362e-05, "loss": 1.5502, "step": 4514 }, { "epoch": 0.34695759065765214, "grad_norm": 3.319687843322754, "learning_rate": 1.8612169637369392e-05, "loss": 1.5814, "step": 4516 }, { "epoch": 0.34711124769514445, "grad_norm": 3.140190362930298, "learning_rate": 1.8611555009219425e-05, "loss": 1.4551, "step": 4518 }, { "epoch": 0.34726490473263677, "grad_norm": 3.0607619285583496, "learning_rate": 1.8610940381069455e-05, "loss": 1.3777, "step": 4520 }, { "epoch": 0.3474185617701291, "grad_norm": 2.9984660148620605, "learning_rate": 1.8610325752919485e-05, "loss": 1.4872, "step": 4522 }, { "epoch": 0.3475722188076214, "grad_norm": 3.7872512340545654, "learning_rate": 1.8609711124769518e-05, "loss": 1.4314, "step": 4524 }, { "epoch": 0.3477258758451137, "grad_norm": 3.393026113510132, "learning_rate": 1.8609096496619547e-05, "loss": 1.5033, "step": 4526 }, { "epoch": 0.34787953288260604, "grad_norm": 3.257423162460327, "learning_rate": 1.8608481868469577e-05, "loss": 1.4024, "step": 4528 }, { "epoch": 0.34803318992009835, "grad_norm": 3.547748565673828, "learning_rate": 1.8607867240319607e-05, "loss": 1.5249, "step": 4530 }, { "epoch": 0.34818684695759067, "grad_norm": 3.3628032207489014, "learning_rate": 1.860725261216964e-05, "loss": 1.4415, "step": 4532 }, { "epoch": 0.348340503995083, "grad_norm": 3.198662519454956, "learning_rate": 1.860663798401967e-05, "loss": 1.4072, "step": 4534 }, { "epoch": 0.3484941610325753, "grad_norm": 4.2712721824646, "learning_rate": 1.86060233558697e-05, "loss": 1.4693, "step": 4536 }, { "epoch": 0.3486478180700676, "grad_norm": 2.8440463542938232, "learning_rate": 1.8605408727719732e-05, "loss": 1.4625, "step": 4538 }, { "epoch": 0.34880147510755993, "grad_norm": 3.3915908336639404, "learning_rate": 1.8604794099569762e-05, "loss": 1.2992, "step": 4540 }, { "epoch": 0.34895513214505225, "grad_norm": 3.731377601623535, "learning_rate": 1.860417947141979e-05, "loss": 1.4686, "step": 4542 }, { "epoch": 0.34910878918254457, "grad_norm": 3.094580888748169, "learning_rate": 1.8603564843269825e-05, "loss": 1.5452, "step": 4544 }, { "epoch": 0.3492624462200369, "grad_norm": 3.4237303733825684, "learning_rate": 1.8602950215119854e-05, "loss": 1.498, "step": 4546 }, { "epoch": 0.3494161032575292, "grad_norm": 3.4610507488250732, "learning_rate": 1.8602335586969884e-05, "loss": 1.3702, "step": 4548 }, { "epoch": 0.3495697602950215, "grad_norm": 3.4906504154205322, "learning_rate": 1.8601720958819917e-05, "loss": 1.3723, "step": 4550 }, { "epoch": 0.34972341733251383, "grad_norm": 3.214956521987915, "learning_rate": 1.8601106330669947e-05, "loss": 1.5002, "step": 4552 }, { "epoch": 0.34987707437000615, "grad_norm": 3.5659472942352295, "learning_rate": 1.8600491702519976e-05, "loss": 1.4491, "step": 4554 }, { "epoch": 0.35003073140749846, "grad_norm": 3.7536747455596924, "learning_rate": 1.8599877074370006e-05, "loss": 1.6745, "step": 4556 }, { "epoch": 0.3501843884449908, "grad_norm": 3.7435832023620605, "learning_rate": 1.859926244622004e-05, "loss": 1.3983, "step": 4558 }, { "epoch": 0.3503380454824831, "grad_norm": 3.1772406101226807, "learning_rate": 1.859864781807007e-05, "loss": 1.2819, "step": 4560 }, { "epoch": 0.3504917025199754, "grad_norm": 3.307325839996338, "learning_rate": 1.85980331899201e-05, "loss": 1.4716, "step": 4562 }, { "epoch": 0.35064535955746773, "grad_norm": 3.418013334274292, "learning_rate": 1.859741856177013e-05, "loss": 1.3982, "step": 4564 }, { "epoch": 0.35079901659496004, "grad_norm": 3.2819008827209473, "learning_rate": 1.859680393362016e-05, "loss": 1.468, "step": 4566 }, { "epoch": 0.35095267363245236, "grad_norm": 3.145008087158203, "learning_rate": 1.859618930547019e-05, "loss": 1.3179, "step": 4568 }, { "epoch": 0.3511063306699447, "grad_norm": 3.330179452896118, "learning_rate": 1.8595574677320224e-05, "loss": 1.3268, "step": 4570 }, { "epoch": 0.351259987707437, "grad_norm": 3.0777249336242676, "learning_rate": 1.8594960049170254e-05, "loss": 1.4471, "step": 4572 }, { "epoch": 0.3514136447449293, "grad_norm": 3.1006267070770264, "learning_rate": 1.8594345421020283e-05, "loss": 1.3894, "step": 4574 }, { "epoch": 0.3515673017824216, "grad_norm": 3.358553409576416, "learning_rate": 1.8593730792870316e-05, "loss": 1.4034, "step": 4576 }, { "epoch": 0.35172095881991394, "grad_norm": 3.211599349975586, "learning_rate": 1.8593116164720346e-05, "loss": 1.5581, "step": 4578 }, { "epoch": 0.35187461585740626, "grad_norm": 3.6108851432800293, "learning_rate": 1.8592501536570376e-05, "loss": 1.5294, "step": 4580 }, { "epoch": 0.3520282728948986, "grad_norm": 3.4219422340393066, "learning_rate": 1.8591886908420406e-05, "loss": 1.2853, "step": 4582 }, { "epoch": 0.3521819299323909, "grad_norm": 3.706087112426758, "learning_rate": 1.859127228027044e-05, "loss": 1.4577, "step": 4584 }, { "epoch": 0.3523355869698832, "grad_norm": 4.492853164672852, "learning_rate": 1.8590657652120468e-05, "loss": 1.5574, "step": 4586 }, { "epoch": 0.3524892440073755, "grad_norm": 2.998133420944214, "learning_rate": 1.8590043023970498e-05, "loss": 1.39, "step": 4588 }, { "epoch": 0.35264290104486784, "grad_norm": 4.351780414581299, "learning_rate": 1.858942839582053e-05, "loss": 1.4118, "step": 4590 }, { "epoch": 0.35279655808236016, "grad_norm": 3.8137764930725098, "learning_rate": 1.858881376767056e-05, "loss": 1.324, "step": 4592 }, { "epoch": 0.3529502151198525, "grad_norm": 3.1416056156158447, "learning_rate": 1.858819913952059e-05, "loss": 1.3979, "step": 4594 }, { "epoch": 0.3531038721573448, "grad_norm": 3.3871965408325195, "learning_rate": 1.8587584511370623e-05, "loss": 1.3873, "step": 4596 }, { "epoch": 0.3532575291948371, "grad_norm": 3.1268036365509033, "learning_rate": 1.8586969883220653e-05, "loss": 1.4496, "step": 4598 }, { "epoch": 0.3534111862323294, "grad_norm": 4.005764007568359, "learning_rate": 1.8586355255070683e-05, "loss": 1.5423, "step": 4600 }, { "epoch": 0.35356484326982174, "grad_norm": 3.157153367996216, "learning_rate": 1.8585740626920713e-05, "loss": 1.3039, "step": 4602 }, { "epoch": 0.35371850030731405, "grad_norm": 3.5315701961517334, "learning_rate": 1.8585125998770746e-05, "loss": 1.3771, "step": 4604 }, { "epoch": 0.35387215734480637, "grad_norm": 3.5096662044525146, "learning_rate": 1.8584511370620775e-05, "loss": 1.4756, "step": 4606 }, { "epoch": 0.3540258143822987, "grad_norm": 3.7469310760498047, "learning_rate": 1.8583896742470805e-05, "loss": 1.5494, "step": 4608 }, { "epoch": 0.354179471419791, "grad_norm": 3.916029691696167, "learning_rate": 1.8583282114320838e-05, "loss": 1.4195, "step": 4610 }, { "epoch": 0.3543331284572833, "grad_norm": 3.3973281383514404, "learning_rate": 1.8582667486170868e-05, "loss": 1.4924, "step": 4612 }, { "epoch": 0.35448678549477564, "grad_norm": 3.974994421005249, "learning_rate": 1.8582052858020897e-05, "loss": 1.3841, "step": 4614 }, { "epoch": 0.35464044253226795, "grad_norm": 3.317247152328491, "learning_rate": 1.858143822987093e-05, "loss": 1.3693, "step": 4616 }, { "epoch": 0.35479409956976027, "grad_norm": 3.2448232173919678, "learning_rate": 1.858082360172096e-05, "loss": 1.5438, "step": 4618 }, { "epoch": 0.3549477566072526, "grad_norm": 3.4983530044555664, "learning_rate": 1.858020897357099e-05, "loss": 1.5307, "step": 4620 }, { "epoch": 0.35510141364474496, "grad_norm": 4.106202602386475, "learning_rate": 1.8579594345421023e-05, "loss": 1.3608, "step": 4622 }, { "epoch": 0.3552550706822373, "grad_norm": 3.7495827674865723, "learning_rate": 1.8578979717271053e-05, "loss": 1.4775, "step": 4624 }, { "epoch": 0.3554087277197296, "grad_norm": 3.6079421043395996, "learning_rate": 1.8578365089121086e-05, "loss": 1.3453, "step": 4626 }, { "epoch": 0.3555623847572219, "grad_norm": 3.424074649810791, "learning_rate": 1.8577750460971112e-05, "loss": 1.4793, "step": 4628 }, { "epoch": 0.3557160417947142, "grad_norm": 3.2874410152435303, "learning_rate": 1.8577135832821145e-05, "loss": 1.4401, "step": 4630 }, { "epoch": 0.35586969883220654, "grad_norm": 3.3648531436920166, "learning_rate": 1.8576521204671175e-05, "loss": 1.3748, "step": 4632 }, { "epoch": 0.35602335586969885, "grad_norm": 3.250709056854248, "learning_rate": 1.8575906576521204e-05, "loss": 1.4623, "step": 4634 }, { "epoch": 0.35617701290719117, "grad_norm": 3.538073778152466, "learning_rate": 1.8575291948371237e-05, "loss": 1.5085, "step": 4636 }, { "epoch": 0.3563306699446835, "grad_norm": 3.82171630859375, "learning_rate": 1.8574677320221267e-05, "loss": 1.5804, "step": 4638 }, { "epoch": 0.3564843269821758, "grad_norm": 3.834066390991211, "learning_rate": 1.8574062692071297e-05, "loss": 1.4279, "step": 4640 }, { "epoch": 0.3566379840196681, "grad_norm": 3.344313144683838, "learning_rate": 1.857344806392133e-05, "loss": 1.3728, "step": 4642 }, { "epoch": 0.35679164105716044, "grad_norm": 3.3498642444610596, "learning_rate": 1.857283343577136e-05, "loss": 1.3697, "step": 4644 }, { "epoch": 0.35694529809465275, "grad_norm": 3.3379592895507812, "learning_rate": 1.8572218807621393e-05, "loss": 1.2771, "step": 4646 }, { "epoch": 0.35709895513214507, "grad_norm": 3.0588574409484863, "learning_rate": 1.8571604179471422e-05, "loss": 1.4029, "step": 4648 }, { "epoch": 0.3572526121696374, "grad_norm": 3.230083703994751, "learning_rate": 1.8570989551321452e-05, "loss": 1.5251, "step": 4650 }, { "epoch": 0.3574062692071297, "grad_norm": 3.0492098331451416, "learning_rate": 1.8570374923171485e-05, "loss": 1.4231, "step": 4652 }, { "epoch": 0.357559926244622, "grad_norm": 3.1024155616760254, "learning_rate": 1.856976029502151e-05, "loss": 1.3485, "step": 4654 }, { "epoch": 0.35771358328211433, "grad_norm": 3.091562509536743, "learning_rate": 1.8569145666871544e-05, "loss": 1.4922, "step": 4656 }, { "epoch": 0.35786724031960665, "grad_norm": 3.7615408897399902, "learning_rate": 1.8568531038721574e-05, "loss": 1.4149, "step": 4658 }, { "epoch": 0.35802089735709897, "grad_norm": 2.9202842712402344, "learning_rate": 1.8567916410571604e-05, "loss": 1.3638, "step": 4660 }, { "epoch": 0.3581745543945913, "grad_norm": 3.4049057960510254, "learning_rate": 1.8567301782421637e-05, "loss": 1.3405, "step": 4662 }, { "epoch": 0.3583282114320836, "grad_norm": 3.389155864715576, "learning_rate": 1.8566687154271667e-05, "loss": 1.4706, "step": 4664 }, { "epoch": 0.3584818684695759, "grad_norm": 3.2135565280914307, "learning_rate": 1.85660725261217e-05, "loss": 1.3997, "step": 4666 }, { "epoch": 0.35863552550706823, "grad_norm": 3.588104009628296, "learning_rate": 1.856545789797173e-05, "loss": 1.4596, "step": 4668 }, { "epoch": 0.35878918254456055, "grad_norm": 3.40895676612854, "learning_rate": 1.856484326982176e-05, "loss": 1.4499, "step": 4670 }, { "epoch": 0.35894283958205286, "grad_norm": 3.071735143661499, "learning_rate": 1.8564228641671792e-05, "loss": 1.369, "step": 4672 }, { "epoch": 0.3590964966195452, "grad_norm": 3.500002861022949, "learning_rate": 1.8563614013521822e-05, "loss": 1.3623, "step": 4674 }, { "epoch": 0.3592501536570375, "grad_norm": 3.3356125354766846, "learning_rate": 1.856299938537185e-05, "loss": 1.4402, "step": 4676 }, { "epoch": 0.3594038106945298, "grad_norm": 3.249798059463501, "learning_rate": 1.8562384757221885e-05, "loss": 1.335, "step": 4678 }, { "epoch": 0.35955746773202213, "grad_norm": 3.477461099624634, "learning_rate": 1.856177012907191e-05, "loss": 1.4934, "step": 4680 }, { "epoch": 0.35971112476951445, "grad_norm": 3.0536158084869385, "learning_rate": 1.8561155500921944e-05, "loss": 1.3237, "step": 4682 }, { "epoch": 0.35986478180700676, "grad_norm": 2.8353586196899414, "learning_rate": 1.8560540872771974e-05, "loss": 1.4192, "step": 4684 }, { "epoch": 0.3600184388444991, "grad_norm": 3.6377627849578857, "learning_rate": 1.8559926244622003e-05, "loss": 1.5224, "step": 4686 }, { "epoch": 0.3601720958819914, "grad_norm": 3.3342795372009277, "learning_rate": 1.8559311616472036e-05, "loss": 1.5283, "step": 4688 }, { "epoch": 0.3603257529194837, "grad_norm": 3.5192248821258545, "learning_rate": 1.8558696988322066e-05, "loss": 1.4173, "step": 4690 }, { "epoch": 0.360479409956976, "grad_norm": 3.1181795597076416, "learning_rate": 1.85580823601721e-05, "loss": 1.3981, "step": 4692 }, { "epoch": 0.36063306699446834, "grad_norm": 3.264928102493286, "learning_rate": 1.855746773202213e-05, "loss": 1.3971, "step": 4694 }, { "epoch": 0.36078672403196066, "grad_norm": 3.1449787616729736, "learning_rate": 1.855685310387216e-05, "loss": 1.3952, "step": 4696 }, { "epoch": 0.360940381069453, "grad_norm": 3.4378745555877686, "learning_rate": 1.855623847572219e-05, "loss": 1.4382, "step": 4698 }, { "epoch": 0.3610940381069453, "grad_norm": 3.424116373062134, "learning_rate": 1.8555623847572218e-05, "loss": 1.3557, "step": 4700 }, { "epoch": 0.3612476951444376, "grad_norm": 3.6776251792907715, "learning_rate": 1.855500921942225e-05, "loss": 1.5211, "step": 4702 }, { "epoch": 0.3614013521819299, "grad_norm": 3.3191077709198, "learning_rate": 1.855439459127228e-05, "loss": 1.2799, "step": 4704 }, { "epoch": 0.36155500921942224, "grad_norm": 3.217733383178711, "learning_rate": 1.855377996312231e-05, "loss": 1.3677, "step": 4706 }, { "epoch": 0.36170866625691456, "grad_norm": 3.2171108722686768, "learning_rate": 1.8553165334972343e-05, "loss": 1.5173, "step": 4708 }, { "epoch": 0.3618623232944069, "grad_norm": 3.5864431858062744, "learning_rate": 1.8552550706822373e-05, "loss": 1.4608, "step": 4710 }, { "epoch": 0.3620159803318992, "grad_norm": 3.2654552459716797, "learning_rate": 1.8551936078672406e-05, "loss": 1.3668, "step": 4712 }, { "epoch": 0.3621696373693915, "grad_norm": 3.540778160095215, "learning_rate": 1.8551321450522436e-05, "loss": 1.346, "step": 4714 }, { "epoch": 0.3623232944068838, "grad_norm": 3.4130847454071045, "learning_rate": 1.8550706822372465e-05, "loss": 1.3636, "step": 4716 }, { "epoch": 0.36247695144437614, "grad_norm": 3.9183061122894287, "learning_rate": 1.85500921942225e-05, "loss": 1.3392, "step": 4718 }, { "epoch": 0.36263060848186845, "grad_norm": 3.619670867919922, "learning_rate": 1.8549477566072528e-05, "loss": 1.4334, "step": 4720 }, { "epoch": 0.36278426551936077, "grad_norm": 3.4983813762664795, "learning_rate": 1.8548862937922558e-05, "loss": 1.3492, "step": 4722 }, { "epoch": 0.3629379225568531, "grad_norm": 3.3781394958496094, "learning_rate": 1.854824830977259e-05, "loss": 1.4709, "step": 4724 }, { "epoch": 0.3630915795943454, "grad_norm": 3.9609100818634033, "learning_rate": 1.8547633681622617e-05, "loss": 1.3665, "step": 4726 }, { "epoch": 0.3632452366318377, "grad_norm": 3.8073222637176514, "learning_rate": 1.854701905347265e-05, "loss": 1.441, "step": 4728 }, { "epoch": 0.36339889366933004, "grad_norm": 3.3414063453674316, "learning_rate": 1.854640442532268e-05, "loss": 1.3758, "step": 4730 }, { "epoch": 0.36355255070682235, "grad_norm": 3.2114644050598145, "learning_rate": 1.8545789797172713e-05, "loss": 1.3025, "step": 4732 }, { "epoch": 0.36370620774431467, "grad_norm": 3.299943447113037, "learning_rate": 1.8545175169022743e-05, "loss": 1.4549, "step": 4734 }, { "epoch": 0.363859864781807, "grad_norm": 3.959162473678589, "learning_rate": 1.8544560540872772e-05, "loss": 1.431, "step": 4736 }, { "epoch": 0.3640135218192993, "grad_norm": 3.4247779846191406, "learning_rate": 1.8543945912722806e-05, "loss": 1.5114, "step": 4738 }, { "epoch": 0.3641671788567916, "grad_norm": 3.407259941101074, "learning_rate": 1.8543331284572835e-05, "loss": 1.5001, "step": 4740 }, { "epoch": 0.36432083589428393, "grad_norm": 3.3463499546051025, "learning_rate": 1.8542716656422865e-05, "loss": 1.3259, "step": 4742 }, { "epoch": 0.36447449293177625, "grad_norm": 2.8831546306610107, "learning_rate": 1.8542102028272898e-05, "loss": 1.4593, "step": 4744 }, { "epoch": 0.36462814996926857, "grad_norm": 3.3898556232452393, "learning_rate": 1.8541487400122928e-05, "loss": 1.4173, "step": 4746 }, { "epoch": 0.3647818070067609, "grad_norm": 3.1895546913146973, "learning_rate": 1.8540872771972957e-05, "loss": 1.4014, "step": 4748 }, { "epoch": 0.3649354640442532, "grad_norm": 3.217935085296631, "learning_rate": 1.854025814382299e-05, "loss": 1.3577, "step": 4750 }, { "epoch": 0.36508912108174557, "grad_norm": 3.2080564498901367, "learning_rate": 1.853964351567302e-05, "loss": 1.2894, "step": 4752 }, { "epoch": 0.3652427781192379, "grad_norm": 3.4375319480895996, "learning_rate": 1.853902888752305e-05, "loss": 1.4444, "step": 4754 }, { "epoch": 0.3653964351567302, "grad_norm": 24.822725296020508, "learning_rate": 1.853841425937308e-05, "loss": 1.3199, "step": 4756 }, { "epoch": 0.3655500921942225, "grad_norm": 3.4176087379455566, "learning_rate": 1.8537799631223113e-05, "loss": 1.4149, "step": 4758 }, { "epoch": 0.36570374923171484, "grad_norm": 3.6621382236480713, "learning_rate": 1.8537185003073142e-05, "loss": 1.4023, "step": 4760 }, { "epoch": 0.36585740626920715, "grad_norm": 3.013288974761963, "learning_rate": 1.8536570374923172e-05, "loss": 1.4663, "step": 4762 }, { "epoch": 0.36601106330669947, "grad_norm": 3.411132335662842, "learning_rate": 1.8535955746773205e-05, "loss": 1.379, "step": 4764 }, { "epoch": 0.3661647203441918, "grad_norm": 3.7102386951446533, "learning_rate": 1.8535341118623235e-05, "loss": 1.4119, "step": 4766 }, { "epoch": 0.3663183773816841, "grad_norm": 3.6515626907348633, "learning_rate": 1.8534726490473264e-05, "loss": 1.4011, "step": 4768 }, { "epoch": 0.3664720344191764, "grad_norm": 2.929818630218506, "learning_rate": 1.8534111862323297e-05, "loss": 1.3097, "step": 4770 }, { "epoch": 0.36662569145666873, "grad_norm": 3.2975289821624756, "learning_rate": 1.8533497234173327e-05, "loss": 1.3629, "step": 4772 }, { "epoch": 0.36677934849416105, "grad_norm": 3.4439969062805176, "learning_rate": 1.8532882606023357e-05, "loss": 1.3984, "step": 4774 }, { "epoch": 0.36693300553165337, "grad_norm": 3.5669538974761963, "learning_rate": 1.853226797787339e-05, "loss": 1.4848, "step": 4776 }, { "epoch": 0.3670866625691457, "grad_norm": 3.3473563194274902, "learning_rate": 1.853165334972342e-05, "loss": 1.3792, "step": 4778 }, { "epoch": 0.367240319606638, "grad_norm": 3.0193610191345215, "learning_rate": 1.853103872157345e-05, "loss": 1.3099, "step": 4780 }, { "epoch": 0.3673939766441303, "grad_norm": 3.254918336868286, "learning_rate": 1.853042409342348e-05, "loss": 1.4501, "step": 4782 }, { "epoch": 0.36754763368162263, "grad_norm": 3.2965073585510254, "learning_rate": 1.8529809465273512e-05, "loss": 1.5442, "step": 4784 }, { "epoch": 0.36770129071911495, "grad_norm": 3.3364098072052, "learning_rate": 1.852919483712354e-05, "loss": 1.3758, "step": 4786 }, { "epoch": 0.36785494775660726, "grad_norm": 3.033813714981079, "learning_rate": 1.852858020897357e-05, "loss": 1.4798, "step": 4788 }, { "epoch": 0.3680086047940996, "grad_norm": 3.194025993347168, "learning_rate": 1.8527965580823604e-05, "loss": 1.2675, "step": 4790 }, { "epoch": 0.3681622618315919, "grad_norm": 3.411519765853882, "learning_rate": 1.8527350952673634e-05, "loss": 1.5546, "step": 4792 }, { "epoch": 0.3683159188690842, "grad_norm": 3.634873628616333, "learning_rate": 1.8526736324523664e-05, "loss": 1.5041, "step": 4794 }, { "epoch": 0.36846957590657653, "grad_norm": 2.9618473052978516, "learning_rate": 1.8526121696373697e-05, "loss": 1.4126, "step": 4796 }, { "epoch": 0.36862323294406885, "grad_norm": 3.2538132667541504, "learning_rate": 1.8525507068223727e-05, "loss": 1.4021, "step": 4798 }, { "epoch": 0.36877688998156116, "grad_norm": 3.3406293392181396, "learning_rate": 1.8524892440073756e-05, "loss": 1.349, "step": 4800 }, { "epoch": 0.3689305470190535, "grad_norm": 3.3935546875, "learning_rate": 1.852427781192379e-05, "loss": 1.2304, "step": 4802 }, { "epoch": 0.3690842040565458, "grad_norm": 3.685398578643799, "learning_rate": 1.852366318377382e-05, "loss": 1.3805, "step": 4804 }, { "epoch": 0.3692378610940381, "grad_norm": 3.6336817741394043, "learning_rate": 1.852304855562385e-05, "loss": 1.5106, "step": 4806 }, { "epoch": 0.3693915181315304, "grad_norm": 3.2699592113494873, "learning_rate": 1.852243392747388e-05, "loss": 1.3488, "step": 4808 }, { "epoch": 0.36954517516902274, "grad_norm": 3.5057644844055176, "learning_rate": 1.852181929932391e-05, "loss": 1.4388, "step": 4810 }, { "epoch": 0.36969883220651506, "grad_norm": 3.1006228923797607, "learning_rate": 1.852120467117394e-05, "loss": 1.6204, "step": 4812 }, { "epoch": 0.3698524892440074, "grad_norm": 3.2444615364074707, "learning_rate": 1.852059004302397e-05, "loss": 1.3704, "step": 4814 }, { "epoch": 0.3700061462814997, "grad_norm": 3.3723206520080566, "learning_rate": 1.8519975414874004e-05, "loss": 1.3931, "step": 4816 }, { "epoch": 0.370159803318992, "grad_norm": 3.3920679092407227, "learning_rate": 1.8519360786724034e-05, "loss": 1.2844, "step": 4818 }, { "epoch": 0.3703134603564843, "grad_norm": 3.732646942138672, "learning_rate": 1.8518746158574063e-05, "loss": 1.3408, "step": 4820 }, { "epoch": 0.37046711739397664, "grad_norm": 3.0246357917785645, "learning_rate": 1.8518131530424096e-05, "loss": 1.2567, "step": 4822 }, { "epoch": 0.37062077443146896, "grad_norm": 3.309549331665039, "learning_rate": 1.8517516902274126e-05, "loss": 1.3551, "step": 4824 }, { "epoch": 0.3707744314689613, "grad_norm": 3.1484642028808594, "learning_rate": 1.8516902274124156e-05, "loss": 1.4835, "step": 4826 }, { "epoch": 0.3709280885064536, "grad_norm": 3.240321636199951, "learning_rate": 1.8516287645974185e-05, "loss": 1.4054, "step": 4828 }, { "epoch": 0.3710817455439459, "grad_norm": 3.533933162689209, "learning_rate": 1.851567301782422e-05, "loss": 1.4685, "step": 4830 }, { "epoch": 0.3712354025814382, "grad_norm": 3.4116733074188232, "learning_rate": 1.8515058389674248e-05, "loss": 1.3839, "step": 4832 }, { "epoch": 0.37138905961893054, "grad_norm": 3.154917001724243, "learning_rate": 1.8514443761524278e-05, "loss": 1.4637, "step": 4834 }, { "epoch": 0.37154271665642286, "grad_norm": 2.889686107635498, "learning_rate": 1.851382913337431e-05, "loss": 1.38, "step": 4836 }, { "epoch": 0.37169637369391517, "grad_norm": 3.351166009902954, "learning_rate": 1.851321450522434e-05, "loss": 1.4498, "step": 4838 }, { "epoch": 0.3718500307314075, "grad_norm": 3.114797592163086, "learning_rate": 1.851259987707437e-05, "loss": 1.3205, "step": 4840 }, { "epoch": 0.3720036877688998, "grad_norm": 3.2657954692840576, "learning_rate": 1.8511985248924403e-05, "loss": 1.3892, "step": 4842 }, { "epoch": 0.3721573448063921, "grad_norm": 3.140523910522461, "learning_rate": 1.8511370620774433e-05, "loss": 1.5039, "step": 4844 }, { "epoch": 0.37231100184388444, "grad_norm": 3.459165334701538, "learning_rate": 1.8510755992624463e-05, "loss": 1.4855, "step": 4846 }, { "epoch": 0.37246465888137675, "grad_norm": 3.286754846572876, "learning_rate": 1.8510141364474496e-05, "loss": 1.4934, "step": 4848 }, { "epoch": 0.37261831591886907, "grad_norm": 3.208974599838257, "learning_rate": 1.8509526736324525e-05, "loss": 1.3993, "step": 4850 }, { "epoch": 0.3727719729563614, "grad_norm": 3.7249045372009277, "learning_rate": 1.8508912108174555e-05, "loss": 1.5347, "step": 4852 }, { "epoch": 0.3729256299938537, "grad_norm": 2.87788724899292, "learning_rate": 1.8508297480024585e-05, "loss": 1.2677, "step": 4854 }, { "epoch": 0.373079287031346, "grad_norm": 3.778860330581665, "learning_rate": 1.8507682851874618e-05, "loss": 1.3966, "step": 4856 }, { "epoch": 0.37323294406883833, "grad_norm": 3.1479995250701904, "learning_rate": 1.8507068223724648e-05, "loss": 1.268, "step": 4858 }, { "epoch": 0.37338660110633065, "grad_norm": 3.2577826976776123, "learning_rate": 1.8506453595574677e-05, "loss": 1.4929, "step": 4860 }, { "epoch": 0.37354025814382297, "grad_norm": 3.440727472305298, "learning_rate": 1.850583896742471e-05, "loss": 1.4138, "step": 4862 }, { "epoch": 0.3736939151813153, "grad_norm": 4.165016174316406, "learning_rate": 1.850522433927474e-05, "loss": 1.4426, "step": 4864 }, { "epoch": 0.3738475722188076, "grad_norm": 3.2028725147247314, "learning_rate": 1.850460971112477e-05, "loss": 1.2641, "step": 4866 }, { "epoch": 0.3740012292562999, "grad_norm": 2.9112865924835205, "learning_rate": 1.8503995082974803e-05, "loss": 1.4361, "step": 4868 }, { "epoch": 0.37415488629379223, "grad_norm": 3.539193868637085, "learning_rate": 1.8503380454824832e-05, "loss": 1.3404, "step": 4870 }, { "epoch": 0.37430854333128455, "grad_norm": 3.1732802391052246, "learning_rate": 1.8502765826674862e-05, "loss": 1.4955, "step": 4872 }, { "epoch": 0.37446220036877687, "grad_norm": 3.662275791168213, "learning_rate": 1.8502151198524895e-05, "loss": 1.4514, "step": 4874 }, { "epoch": 0.3746158574062692, "grad_norm": 3.213059186935425, "learning_rate": 1.8501536570374925e-05, "loss": 1.517, "step": 4876 }, { "epoch": 0.3747695144437615, "grad_norm": 3.332075834274292, "learning_rate": 1.8500921942224958e-05, "loss": 1.4251, "step": 4878 }, { "epoch": 0.3749231714812538, "grad_norm": 3.2899796962738037, "learning_rate": 1.8500307314074984e-05, "loss": 1.3975, "step": 4880 }, { "epoch": 0.3750768285187462, "grad_norm": 3.4021992683410645, "learning_rate": 1.8499692685925017e-05, "loss": 1.4664, "step": 4882 }, { "epoch": 0.3752304855562385, "grad_norm": 3.553896188735962, "learning_rate": 1.8499078057775047e-05, "loss": 1.356, "step": 4884 }, { "epoch": 0.3753841425937308, "grad_norm": 3.311842203140259, "learning_rate": 1.8498463429625077e-05, "loss": 1.4542, "step": 4886 }, { "epoch": 0.37553779963122313, "grad_norm": 3.7482736110687256, "learning_rate": 1.849784880147511e-05, "loss": 1.4163, "step": 4888 }, { "epoch": 0.37569145666871545, "grad_norm": 3.5204427242279053, "learning_rate": 1.849723417332514e-05, "loss": 1.3673, "step": 4890 }, { "epoch": 0.37584511370620777, "grad_norm": 3.4200925827026367, "learning_rate": 1.849661954517517e-05, "loss": 1.3482, "step": 4892 }, { "epoch": 0.3759987707437001, "grad_norm": 3.378998279571533, "learning_rate": 1.8496004917025202e-05, "loss": 1.5857, "step": 4894 }, { "epoch": 0.3761524277811924, "grad_norm": 3.0784049034118652, "learning_rate": 1.8495390288875232e-05, "loss": 1.4947, "step": 4896 }, { "epoch": 0.3763060848186847, "grad_norm": 3.1790988445281982, "learning_rate": 1.8494775660725265e-05, "loss": 1.4122, "step": 4898 }, { "epoch": 0.37645974185617703, "grad_norm": 3.191070318222046, "learning_rate": 1.8494161032575295e-05, "loss": 1.3629, "step": 4900 }, { "epoch": 0.37661339889366935, "grad_norm": 3.2839808464050293, "learning_rate": 1.8493546404425324e-05, "loss": 1.3496, "step": 4902 }, { "epoch": 0.37676705593116167, "grad_norm": 3.522803783416748, "learning_rate": 1.8492931776275357e-05, "loss": 1.4566, "step": 4904 }, { "epoch": 0.376920712968654, "grad_norm": 3.301510810852051, "learning_rate": 1.8492317148125384e-05, "loss": 1.4704, "step": 4906 }, { "epoch": 0.3770743700061463, "grad_norm": 3.3755650520324707, "learning_rate": 1.8491702519975417e-05, "loss": 1.33, "step": 4908 }, { "epoch": 0.3772280270436386, "grad_norm": 3.328902244567871, "learning_rate": 1.8491087891825446e-05, "loss": 1.2691, "step": 4910 }, { "epoch": 0.37738168408113093, "grad_norm": 3.257596254348755, "learning_rate": 1.8490473263675476e-05, "loss": 1.5409, "step": 4912 }, { "epoch": 0.37753534111862325, "grad_norm": 3.0310218334198, "learning_rate": 1.848985863552551e-05, "loss": 1.3193, "step": 4914 }, { "epoch": 0.37768899815611556, "grad_norm": 3.495089054107666, "learning_rate": 1.848924400737554e-05, "loss": 1.5036, "step": 4916 }, { "epoch": 0.3778426551936079, "grad_norm": 3.5136234760284424, "learning_rate": 1.8488629379225572e-05, "loss": 1.4914, "step": 4918 }, { "epoch": 0.3779963122311002, "grad_norm": 2.943639039993286, "learning_rate": 1.84880147510756e-05, "loss": 1.3518, "step": 4920 }, { "epoch": 0.3781499692685925, "grad_norm": 3.4134724140167236, "learning_rate": 1.848740012292563e-05, "loss": 1.3874, "step": 4922 }, { "epoch": 0.37830362630608483, "grad_norm": 3.4609227180480957, "learning_rate": 1.8486785494775664e-05, "loss": 1.4883, "step": 4924 }, { "epoch": 0.37845728334357714, "grad_norm": 3.085495710372925, "learning_rate": 1.848617086662569e-05, "loss": 1.429, "step": 4926 }, { "epoch": 0.37861094038106946, "grad_norm": 3.304070234298706, "learning_rate": 1.8485556238475724e-05, "loss": 1.5389, "step": 4928 }, { "epoch": 0.3787645974185618, "grad_norm": 2.85673189163208, "learning_rate": 1.8484941610325753e-05, "loss": 1.4052, "step": 4930 }, { "epoch": 0.3789182544560541, "grad_norm": 3.134815216064453, "learning_rate": 1.8484326982175783e-05, "loss": 1.4306, "step": 4932 }, { "epoch": 0.3790719114935464, "grad_norm": 3.489353656768799, "learning_rate": 1.8483712354025816e-05, "loss": 1.3598, "step": 4934 }, { "epoch": 0.3792255685310387, "grad_norm": 3.4628639221191406, "learning_rate": 1.8483097725875846e-05, "loss": 1.3958, "step": 4936 }, { "epoch": 0.37937922556853104, "grad_norm": 3.2604832649230957, "learning_rate": 1.848248309772588e-05, "loss": 1.3876, "step": 4938 }, { "epoch": 0.37953288260602336, "grad_norm": 3.188798189163208, "learning_rate": 1.848186846957591e-05, "loss": 1.3992, "step": 4940 }, { "epoch": 0.3796865396435157, "grad_norm": 3.7080070972442627, "learning_rate": 1.8481253841425938e-05, "loss": 1.4537, "step": 4942 }, { "epoch": 0.379840196681008, "grad_norm": 2.9099485874176025, "learning_rate": 1.848063921327597e-05, "loss": 1.4259, "step": 4944 }, { "epoch": 0.3799938537185003, "grad_norm": 3.4322965145111084, "learning_rate": 1.8480024585126e-05, "loss": 1.4752, "step": 4946 }, { "epoch": 0.3801475107559926, "grad_norm": 3.3011248111724854, "learning_rate": 1.847940995697603e-05, "loss": 1.4632, "step": 4948 }, { "epoch": 0.38030116779348494, "grad_norm": 2.9199092388153076, "learning_rate": 1.8478795328826064e-05, "loss": 1.3915, "step": 4950 }, { "epoch": 0.38045482483097726, "grad_norm": 2.939312219619751, "learning_rate": 1.847818070067609e-05, "loss": 1.3709, "step": 4952 }, { "epoch": 0.3806084818684696, "grad_norm": 3.100517749786377, "learning_rate": 1.8477566072526123e-05, "loss": 1.3626, "step": 4954 }, { "epoch": 0.3807621389059619, "grad_norm": 3.323359489440918, "learning_rate": 1.8476951444376153e-05, "loss": 1.388, "step": 4956 }, { "epoch": 0.3809157959434542, "grad_norm": 3.2481391429901123, "learning_rate": 1.8476336816226183e-05, "loss": 1.3469, "step": 4958 }, { "epoch": 0.3810694529809465, "grad_norm": 2.9566774368286133, "learning_rate": 1.8475722188076216e-05, "loss": 1.4155, "step": 4960 }, { "epoch": 0.38122311001843884, "grad_norm": 3.426717758178711, "learning_rate": 1.8475107559926245e-05, "loss": 1.509, "step": 4962 }, { "epoch": 0.38137676705593115, "grad_norm": 3.242187738418579, "learning_rate": 1.847449293177628e-05, "loss": 1.5115, "step": 4964 }, { "epoch": 0.38153042409342347, "grad_norm": 3.4956090450286865, "learning_rate": 1.8473878303626308e-05, "loss": 1.3576, "step": 4966 }, { "epoch": 0.3816840811309158, "grad_norm": 12.086480140686035, "learning_rate": 1.8473263675476338e-05, "loss": 1.3618, "step": 4968 }, { "epoch": 0.3818377381684081, "grad_norm": 3.5222816467285156, "learning_rate": 1.847264904732637e-05, "loss": 1.4648, "step": 4970 }, { "epoch": 0.3819913952059004, "grad_norm": 2.8597071170806885, "learning_rate": 1.84720344191764e-05, "loss": 1.4428, "step": 4972 }, { "epoch": 0.38214505224339274, "grad_norm": 2.9223949909210205, "learning_rate": 1.847141979102643e-05, "loss": 1.3655, "step": 4974 }, { "epoch": 0.38229870928088505, "grad_norm": 3.160203695297241, "learning_rate": 1.8470805162876463e-05, "loss": 1.5182, "step": 4976 }, { "epoch": 0.38245236631837737, "grad_norm": 3.2447376251220703, "learning_rate": 1.847019053472649e-05, "loss": 1.5215, "step": 4978 }, { "epoch": 0.3826060233558697, "grad_norm": 3.0082247257232666, "learning_rate": 1.8469575906576523e-05, "loss": 1.3198, "step": 4980 }, { "epoch": 0.382759680393362, "grad_norm": 3.1510775089263916, "learning_rate": 1.8468961278426552e-05, "loss": 1.3984, "step": 4982 }, { "epoch": 0.3829133374308543, "grad_norm": 3.150709629058838, "learning_rate": 1.8468346650276585e-05, "loss": 1.3621, "step": 4984 }, { "epoch": 0.38306699446834663, "grad_norm": 2.9365999698638916, "learning_rate": 1.8467732022126615e-05, "loss": 1.4187, "step": 4986 }, { "epoch": 0.38322065150583895, "grad_norm": 3.299741744995117, "learning_rate": 1.8467117393976645e-05, "loss": 1.4882, "step": 4988 }, { "epoch": 0.38337430854333127, "grad_norm": 3.2206766605377197, "learning_rate": 1.8466502765826678e-05, "loss": 1.4838, "step": 4990 }, { "epoch": 0.3835279655808236, "grad_norm": 3.3576810359954834, "learning_rate": 1.8465888137676707e-05, "loss": 1.3716, "step": 4992 }, { "epoch": 0.3836816226183159, "grad_norm": 3.3749849796295166, "learning_rate": 1.8465273509526737e-05, "loss": 1.5736, "step": 4994 }, { "epoch": 0.3838352796558082, "grad_norm": 3.3339991569519043, "learning_rate": 1.846465888137677e-05, "loss": 1.4127, "step": 4996 }, { "epoch": 0.38398893669330053, "grad_norm": 3.272212266921997, "learning_rate": 1.84640442532268e-05, "loss": 1.3956, "step": 4998 }, { "epoch": 0.38414259373079285, "grad_norm": 3.243849515914917, "learning_rate": 1.846342962507683e-05, "loss": 1.4803, "step": 5000 }, { "epoch": 0.38429625076828516, "grad_norm": 3.5325818061828613, "learning_rate": 1.8462814996926863e-05, "loss": 1.4568, "step": 5002 }, { "epoch": 0.3844499078057775, "grad_norm": 3.187891721725464, "learning_rate": 1.8462200368776892e-05, "loss": 1.4075, "step": 5004 }, { "epoch": 0.3846035648432698, "grad_norm": 3.035670757293701, "learning_rate": 1.8461585740626922e-05, "loss": 1.3194, "step": 5006 }, { "epoch": 0.3847572218807621, "grad_norm": 3.086202383041382, "learning_rate": 1.8460971112476952e-05, "loss": 1.5004, "step": 5008 }, { "epoch": 0.38491087891825443, "grad_norm": 3.264146327972412, "learning_rate": 1.8460356484326985e-05, "loss": 1.4502, "step": 5010 }, { "epoch": 0.3850645359557468, "grad_norm": 3.0813944339752197, "learning_rate": 1.8459741856177014e-05, "loss": 1.4018, "step": 5012 }, { "epoch": 0.3852181929932391, "grad_norm": 3.097153663635254, "learning_rate": 1.8459127228027044e-05, "loss": 1.3495, "step": 5014 }, { "epoch": 0.38537185003073143, "grad_norm": 3.213472366333008, "learning_rate": 1.8458512599877077e-05, "loss": 1.3695, "step": 5016 }, { "epoch": 0.38552550706822375, "grad_norm": 3.733430862426758, "learning_rate": 1.8457897971727107e-05, "loss": 1.5006, "step": 5018 }, { "epoch": 0.38567916410571607, "grad_norm": 3.3197145462036133, "learning_rate": 1.8457283343577137e-05, "loss": 1.4598, "step": 5020 }, { "epoch": 0.3858328211432084, "grad_norm": 2.798349618911743, "learning_rate": 1.845666871542717e-05, "loss": 1.2987, "step": 5022 }, { "epoch": 0.3859864781807007, "grad_norm": 3.056670665740967, "learning_rate": 1.84560540872772e-05, "loss": 1.5097, "step": 5024 }, { "epoch": 0.386140135218193, "grad_norm": 4.263172149658203, "learning_rate": 1.845543945912723e-05, "loss": 1.4077, "step": 5026 }, { "epoch": 0.38629379225568533, "grad_norm": 2.9781241416931152, "learning_rate": 1.845482483097726e-05, "loss": 1.4007, "step": 5028 }, { "epoch": 0.38644744929317765, "grad_norm": 3.4377338886260986, "learning_rate": 1.8454210202827292e-05, "loss": 1.2981, "step": 5030 }, { "epoch": 0.38660110633066996, "grad_norm": 2.9486420154571533, "learning_rate": 1.845359557467732e-05, "loss": 1.3136, "step": 5032 }, { "epoch": 0.3867547633681623, "grad_norm": 3.4080779552459717, "learning_rate": 1.845298094652735e-05, "loss": 1.4167, "step": 5034 }, { "epoch": 0.3869084204056546, "grad_norm": 4.366278648376465, "learning_rate": 1.8452366318377384e-05, "loss": 1.3073, "step": 5036 }, { "epoch": 0.3870620774431469, "grad_norm": 3.295130491256714, "learning_rate": 1.8451751690227414e-05, "loss": 1.391, "step": 5038 }, { "epoch": 0.38721573448063923, "grad_norm": 3.4234602451324463, "learning_rate": 1.8451137062077444e-05, "loss": 1.4918, "step": 5040 }, { "epoch": 0.38736939151813155, "grad_norm": 3.76176118850708, "learning_rate": 1.8450522433927477e-05, "loss": 1.3417, "step": 5042 }, { "epoch": 0.38752304855562386, "grad_norm": 2.9911139011383057, "learning_rate": 1.8449907805777506e-05, "loss": 1.3087, "step": 5044 }, { "epoch": 0.3876767055931162, "grad_norm": 3.2155096530914307, "learning_rate": 1.8449293177627536e-05, "loss": 1.2759, "step": 5046 }, { "epoch": 0.3878303626306085, "grad_norm": 3.25315523147583, "learning_rate": 1.844867854947757e-05, "loss": 1.4139, "step": 5048 }, { "epoch": 0.3879840196681008, "grad_norm": 3.077777862548828, "learning_rate": 1.84480639213276e-05, "loss": 1.4828, "step": 5050 }, { "epoch": 0.3881376767055931, "grad_norm": 3.0072484016418457, "learning_rate": 1.844744929317763e-05, "loss": 1.2987, "step": 5052 }, { "epoch": 0.38829133374308544, "grad_norm": 3.023391008377075, "learning_rate": 1.8446834665027658e-05, "loss": 1.4497, "step": 5054 }, { "epoch": 0.38844499078057776, "grad_norm": 2.8816518783569336, "learning_rate": 1.844622003687769e-05, "loss": 1.316, "step": 5056 }, { "epoch": 0.3885986478180701, "grad_norm": 3.7871954441070557, "learning_rate": 1.844560540872772e-05, "loss": 1.3195, "step": 5058 }, { "epoch": 0.3887523048555624, "grad_norm": 3.246473550796509, "learning_rate": 1.844499078057775e-05, "loss": 1.4516, "step": 5060 }, { "epoch": 0.3889059618930547, "grad_norm": 3.081164836883545, "learning_rate": 1.8444376152427784e-05, "loss": 1.4083, "step": 5062 }, { "epoch": 0.389059618930547, "grad_norm": 3.482865810394287, "learning_rate": 1.8443761524277813e-05, "loss": 1.4287, "step": 5064 }, { "epoch": 0.38921327596803934, "grad_norm": 3.7230827808380127, "learning_rate": 1.8443146896127843e-05, "loss": 1.4355, "step": 5066 }, { "epoch": 0.38936693300553166, "grad_norm": 2.9292538166046143, "learning_rate": 1.8442532267977876e-05, "loss": 1.3523, "step": 5068 }, { "epoch": 0.389520590043024, "grad_norm": 3.3166749477386475, "learning_rate": 1.8441917639827906e-05, "loss": 1.3417, "step": 5070 }, { "epoch": 0.3896742470805163, "grad_norm": 3.5913453102111816, "learning_rate": 1.8441303011677935e-05, "loss": 1.4755, "step": 5072 }, { "epoch": 0.3898279041180086, "grad_norm": 3.5193488597869873, "learning_rate": 1.844068838352797e-05, "loss": 1.5068, "step": 5074 }, { "epoch": 0.3899815611555009, "grad_norm": 3.6477646827697754, "learning_rate": 1.8440073755377998e-05, "loss": 1.4358, "step": 5076 }, { "epoch": 0.39013521819299324, "grad_norm": 3.212559700012207, "learning_rate": 1.8439459127228028e-05, "loss": 1.3363, "step": 5078 }, { "epoch": 0.39028887523048555, "grad_norm": 3.282552480697632, "learning_rate": 1.8438844499078058e-05, "loss": 1.4447, "step": 5080 }, { "epoch": 0.39044253226797787, "grad_norm": 3.2416465282440186, "learning_rate": 1.843822987092809e-05, "loss": 1.4563, "step": 5082 }, { "epoch": 0.3905961893054702, "grad_norm": 3.1679558753967285, "learning_rate": 1.843761524277812e-05, "loss": 1.4569, "step": 5084 }, { "epoch": 0.3907498463429625, "grad_norm": 2.9866411685943604, "learning_rate": 1.843700061462815e-05, "loss": 1.3765, "step": 5086 }, { "epoch": 0.3909035033804548, "grad_norm": 3.1638734340667725, "learning_rate": 1.8436385986478183e-05, "loss": 1.5417, "step": 5088 }, { "epoch": 0.39105716041794714, "grad_norm": 3.5159435272216797, "learning_rate": 1.8435771358328213e-05, "loss": 1.4869, "step": 5090 }, { "epoch": 0.39121081745543945, "grad_norm": 3.1329610347747803, "learning_rate": 1.8435156730178242e-05, "loss": 1.4025, "step": 5092 }, { "epoch": 0.39136447449293177, "grad_norm": 3.4398860931396484, "learning_rate": 1.8434542102028276e-05, "loss": 1.5109, "step": 5094 }, { "epoch": 0.3915181315304241, "grad_norm": 3.5402066707611084, "learning_rate": 1.8433927473878305e-05, "loss": 1.4089, "step": 5096 }, { "epoch": 0.3916717885679164, "grad_norm": 3.4249396324157715, "learning_rate": 1.8433312845728335e-05, "loss": 1.4015, "step": 5098 }, { "epoch": 0.3918254456054087, "grad_norm": 2.852504014968872, "learning_rate": 1.8432698217578368e-05, "loss": 1.4679, "step": 5100 }, { "epoch": 0.39197910264290103, "grad_norm": 3.3587238788604736, "learning_rate": 1.8432083589428398e-05, "loss": 1.5021, "step": 5102 }, { "epoch": 0.39213275968039335, "grad_norm": 3.091439723968506, "learning_rate": 1.8431468961278427e-05, "loss": 1.531, "step": 5104 }, { "epoch": 0.39228641671788567, "grad_norm": 3.6447200775146484, "learning_rate": 1.8430854333128457e-05, "loss": 1.2355, "step": 5106 }, { "epoch": 0.392440073755378, "grad_norm": 3.580310821533203, "learning_rate": 1.843023970497849e-05, "loss": 1.5096, "step": 5108 }, { "epoch": 0.3925937307928703, "grad_norm": 5.039089679718018, "learning_rate": 1.842962507682852e-05, "loss": 1.3975, "step": 5110 }, { "epoch": 0.3927473878303626, "grad_norm": 3.476210594177246, "learning_rate": 1.842901044867855e-05, "loss": 1.4081, "step": 5112 }, { "epoch": 0.39290104486785493, "grad_norm": 3.437044382095337, "learning_rate": 1.8428395820528583e-05, "loss": 1.4921, "step": 5114 }, { "epoch": 0.39305470190534725, "grad_norm": 3.2391202449798584, "learning_rate": 1.8427781192378612e-05, "loss": 1.3004, "step": 5116 }, { "epoch": 0.39320835894283956, "grad_norm": 3.0900301933288574, "learning_rate": 1.8427166564228642e-05, "loss": 1.3734, "step": 5118 }, { "epoch": 0.3933620159803319, "grad_norm": 3.2002086639404297, "learning_rate": 1.8426551936078675e-05, "loss": 1.3398, "step": 5120 }, { "epoch": 0.3935156730178242, "grad_norm": 3.1345863342285156, "learning_rate": 1.8425937307928705e-05, "loss": 1.4848, "step": 5122 }, { "epoch": 0.3936693300553165, "grad_norm": 3.710662603378296, "learning_rate": 1.8425322679778734e-05, "loss": 1.3935, "step": 5124 }, { "epoch": 0.39382298709280883, "grad_norm": 3.5913710594177246, "learning_rate": 1.8424708051628767e-05, "loss": 1.4266, "step": 5126 }, { "epoch": 0.39397664413030115, "grad_norm": 3.0973658561706543, "learning_rate": 1.8424093423478797e-05, "loss": 1.516, "step": 5128 }, { "epoch": 0.39413030116779346, "grad_norm": 3.180147171020508, "learning_rate": 1.842347879532883e-05, "loss": 1.3113, "step": 5130 }, { "epoch": 0.3942839582052858, "grad_norm": 3.239496946334839, "learning_rate": 1.8422864167178856e-05, "loss": 1.4985, "step": 5132 }, { "epoch": 0.3944376152427781, "grad_norm": 3.4438583850860596, "learning_rate": 1.842224953902889e-05, "loss": 1.4734, "step": 5134 }, { "epoch": 0.3945912722802704, "grad_norm": 3.0532734394073486, "learning_rate": 1.842163491087892e-05, "loss": 1.3405, "step": 5136 }, { "epoch": 0.3947449293177627, "grad_norm": 3.0784571170806885, "learning_rate": 1.842102028272895e-05, "loss": 1.3506, "step": 5138 }, { "epoch": 0.39489858635525504, "grad_norm": 3.235511064529419, "learning_rate": 1.8420405654578982e-05, "loss": 1.3963, "step": 5140 }, { "epoch": 0.3950522433927474, "grad_norm": 2.876587390899658, "learning_rate": 1.841979102642901e-05, "loss": 1.4846, "step": 5142 }, { "epoch": 0.39520590043023973, "grad_norm": 3.13354754447937, "learning_rate": 1.841917639827904e-05, "loss": 1.3671, "step": 5144 }, { "epoch": 0.39535955746773205, "grad_norm": 3.12070894241333, "learning_rate": 1.8418561770129074e-05, "loss": 1.4328, "step": 5146 }, { "epoch": 0.39551321450522436, "grad_norm": 3.660349130630493, "learning_rate": 1.8417947141979104e-05, "loss": 1.4445, "step": 5148 }, { "epoch": 0.3956668715427167, "grad_norm": 3.272484064102173, "learning_rate": 1.8417332513829137e-05, "loss": 1.3548, "step": 5150 }, { "epoch": 0.395820528580209, "grad_norm": 2.723353624343872, "learning_rate": 1.8416717885679163e-05, "loss": 1.3979, "step": 5152 }, { "epoch": 0.3959741856177013, "grad_norm": 3.0933709144592285, "learning_rate": 1.8416103257529197e-05, "loss": 1.4523, "step": 5154 }, { "epoch": 0.39612784265519363, "grad_norm": 3.3411343097686768, "learning_rate": 1.8415488629379226e-05, "loss": 1.5236, "step": 5156 }, { "epoch": 0.39628149969268595, "grad_norm": 3.618424415588379, "learning_rate": 1.8414874001229256e-05, "loss": 1.3047, "step": 5158 }, { "epoch": 0.39643515673017826, "grad_norm": 3.5820114612579346, "learning_rate": 1.841425937307929e-05, "loss": 1.4815, "step": 5160 }, { "epoch": 0.3965888137676706, "grad_norm": 3.064063549041748, "learning_rate": 1.841364474492932e-05, "loss": 1.4813, "step": 5162 }, { "epoch": 0.3967424708051629, "grad_norm": 3.082622528076172, "learning_rate": 1.841303011677935e-05, "loss": 1.3751, "step": 5164 }, { "epoch": 0.3968961278426552, "grad_norm": 3.184774398803711, "learning_rate": 1.841241548862938e-05, "loss": 1.3173, "step": 5166 }, { "epoch": 0.3970497848801475, "grad_norm": 3.2373135089874268, "learning_rate": 1.841180086047941e-05, "loss": 1.5307, "step": 5168 }, { "epoch": 0.39720344191763984, "grad_norm": 2.656855583190918, "learning_rate": 1.8411186232329444e-05, "loss": 1.2303, "step": 5170 }, { "epoch": 0.39735709895513216, "grad_norm": 3.1388304233551025, "learning_rate": 1.8410571604179474e-05, "loss": 1.4094, "step": 5172 }, { "epoch": 0.3975107559926245, "grad_norm": 3.365753173828125, "learning_rate": 1.8409956976029504e-05, "loss": 1.3659, "step": 5174 }, { "epoch": 0.3976644130301168, "grad_norm": 3.192204236984253, "learning_rate": 1.8409342347879537e-05, "loss": 1.3244, "step": 5176 }, { "epoch": 0.3978180700676091, "grad_norm": 3.1940553188323975, "learning_rate": 1.8408727719729563e-05, "loss": 1.4838, "step": 5178 }, { "epoch": 0.3979717271051014, "grad_norm": 3.3534750938415527, "learning_rate": 1.8408113091579596e-05, "loss": 1.3472, "step": 5180 }, { "epoch": 0.39812538414259374, "grad_norm": 3.4053544998168945, "learning_rate": 1.8407498463429626e-05, "loss": 1.4988, "step": 5182 }, { "epoch": 0.39827904118008606, "grad_norm": 3.329324960708618, "learning_rate": 1.8406883835279655e-05, "loss": 1.5343, "step": 5184 }, { "epoch": 0.3984326982175784, "grad_norm": 3.0776827335357666, "learning_rate": 1.840626920712969e-05, "loss": 1.3744, "step": 5186 }, { "epoch": 0.3985863552550707, "grad_norm": 3.1414880752563477, "learning_rate": 1.8405654578979718e-05, "loss": 1.3941, "step": 5188 }, { "epoch": 0.398740012292563, "grad_norm": 3.3760602474212646, "learning_rate": 1.840503995082975e-05, "loss": 1.4755, "step": 5190 }, { "epoch": 0.3988936693300553, "grad_norm": 3.2794857025146484, "learning_rate": 1.840442532267978e-05, "loss": 1.3525, "step": 5192 }, { "epoch": 0.39904732636754764, "grad_norm": 2.952638864517212, "learning_rate": 1.840381069452981e-05, "loss": 1.3713, "step": 5194 }, { "epoch": 0.39920098340503996, "grad_norm": 3.3649606704711914, "learning_rate": 1.8403196066379844e-05, "loss": 1.4609, "step": 5196 }, { "epoch": 0.39935464044253227, "grad_norm": 3.112147569656372, "learning_rate": 1.8402581438229873e-05, "loss": 1.4294, "step": 5198 }, { "epoch": 0.3995082974800246, "grad_norm": 3.05891489982605, "learning_rate": 1.8401966810079903e-05, "loss": 1.4269, "step": 5200 }, { "epoch": 0.3996619545175169, "grad_norm": 3.5872931480407715, "learning_rate": 1.8401352181929936e-05, "loss": 1.4834, "step": 5202 }, { "epoch": 0.3998156115550092, "grad_norm": 2.9866600036621094, "learning_rate": 1.8400737553779962e-05, "loss": 1.2641, "step": 5204 }, { "epoch": 0.39996926859250154, "grad_norm": 3.228916645050049, "learning_rate": 1.8400122925629995e-05, "loss": 1.3528, "step": 5206 }, { "epoch": 0.40012292562999385, "grad_norm": 3.571171998977661, "learning_rate": 1.8399508297480025e-05, "loss": 1.3518, "step": 5208 }, { "epoch": 0.40027658266748617, "grad_norm": 3.4012258052825928, "learning_rate": 1.8398893669330055e-05, "loss": 1.3364, "step": 5210 }, { "epoch": 0.4004302397049785, "grad_norm": 3.279327630996704, "learning_rate": 1.8398279041180088e-05, "loss": 1.3591, "step": 5212 }, { "epoch": 0.4005838967424708, "grad_norm": 3.180152416229248, "learning_rate": 1.8397664413030118e-05, "loss": 1.4357, "step": 5214 }, { "epoch": 0.4007375537799631, "grad_norm": 3.127516269683838, "learning_rate": 1.839704978488015e-05, "loss": 1.3258, "step": 5216 }, { "epoch": 0.40089121081745543, "grad_norm": 3.2782299518585205, "learning_rate": 1.839643515673018e-05, "loss": 1.3646, "step": 5218 }, { "epoch": 0.40104486785494775, "grad_norm": 3.528404712677002, "learning_rate": 1.839582052858021e-05, "loss": 1.3139, "step": 5220 }, { "epoch": 0.40119852489244007, "grad_norm": 3.2569220066070557, "learning_rate": 1.8395205900430243e-05, "loss": 1.4107, "step": 5222 }, { "epoch": 0.4013521819299324, "grad_norm": 3.212218761444092, "learning_rate": 1.8394591272280273e-05, "loss": 1.4875, "step": 5224 }, { "epoch": 0.4015058389674247, "grad_norm": 3.204953670501709, "learning_rate": 1.8393976644130302e-05, "loss": 1.4317, "step": 5226 }, { "epoch": 0.401659496004917, "grad_norm": 3.2656290531158447, "learning_rate": 1.8393362015980336e-05, "loss": 1.4636, "step": 5228 }, { "epoch": 0.40181315304240933, "grad_norm": 2.8223979473114014, "learning_rate": 1.8392747387830362e-05, "loss": 1.4513, "step": 5230 }, { "epoch": 0.40196681007990165, "grad_norm": 2.845686912536621, "learning_rate": 1.8392132759680395e-05, "loss": 1.3737, "step": 5232 }, { "epoch": 0.40212046711739396, "grad_norm": 2.8757717609405518, "learning_rate": 1.8391518131530425e-05, "loss": 1.3632, "step": 5234 }, { "epoch": 0.4022741241548863, "grad_norm": 3.4212610721588135, "learning_rate": 1.8390903503380458e-05, "loss": 1.357, "step": 5236 }, { "epoch": 0.4024277811923786, "grad_norm": 3.0520594120025635, "learning_rate": 1.8390288875230487e-05, "loss": 1.3139, "step": 5238 }, { "epoch": 0.4025814382298709, "grad_norm": 3.0812041759490967, "learning_rate": 1.8389674247080517e-05, "loss": 1.3224, "step": 5240 }, { "epoch": 0.40273509526736323, "grad_norm": 3.380028009414673, "learning_rate": 1.838905961893055e-05, "loss": 1.3579, "step": 5242 }, { "epoch": 0.40288875230485555, "grad_norm": 3.7895190715789795, "learning_rate": 1.838844499078058e-05, "loss": 1.3326, "step": 5244 }, { "epoch": 0.40304240934234786, "grad_norm": 3.471078395843506, "learning_rate": 1.838783036263061e-05, "loss": 1.2679, "step": 5246 }, { "epoch": 0.4031960663798402, "grad_norm": 3.2340338230133057, "learning_rate": 1.8387215734480643e-05, "loss": 1.4917, "step": 5248 }, { "epoch": 0.4033497234173325, "grad_norm": 3.610034942626953, "learning_rate": 1.838660110633067e-05, "loss": 1.549, "step": 5250 }, { "epoch": 0.4035033804548248, "grad_norm": 3.2047059535980225, "learning_rate": 1.8385986478180702e-05, "loss": 1.538, "step": 5252 }, { "epoch": 0.40365703749231713, "grad_norm": 3.190843105316162, "learning_rate": 1.838537185003073e-05, "loss": 1.5433, "step": 5254 }, { "epoch": 0.40381069452980944, "grad_norm": 3.0473310947418213, "learning_rate": 1.8384757221880765e-05, "loss": 1.4303, "step": 5256 }, { "epoch": 0.40396435156730176, "grad_norm": 3.305148124694824, "learning_rate": 1.8384142593730794e-05, "loss": 1.4647, "step": 5258 }, { "epoch": 0.4041180086047941, "grad_norm": 3.2546324729919434, "learning_rate": 1.8383527965580824e-05, "loss": 1.403, "step": 5260 }, { "epoch": 0.4042716656422864, "grad_norm": 3.26485276222229, "learning_rate": 1.8382913337430857e-05, "loss": 1.3281, "step": 5262 }, { "epoch": 0.4044253226797787, "grad_norm": 3.755387306213379, "learning_rate": 1.8382298709280887e-05, "loss": 1.3544, "step": 5264 }, { "epoch": 0.404578979717271, "grad_norm": 3.23287034034729, "learning_rate": 1.8381684081130916e-05, "loss": 1.4258, "step": 5266 }, { "epoch": 0.40473263675476334, "grad_norm": 3.2097320556640625, "learning_rate": 1.838106945298095e-05, "loss": 1.3425, "step": 5268 }, { "epoch": 0.40488629379225566, "grad_norm": 3.3704915046691895, "learning_rate": 1.838045482483098e-05, "loss": 1.5884, "step": 5270 }, { "epoch": 0.40503995082974803, "grad_norm": 2.9925537109375, "learning_rate": 1.837984019668101e-05, "loss": 1.3085, "step": 5272 }, { "epoch": 0.40519360786724035, "grad_norm": 2.810943841934204, "learning_rate": 1.8379225568531042e-05, "loss": 1.3259, "step": 5274 }, { "epoch": 0.40534726490473266, "grad_norm": 3.005441904067993, "learning_rate": 1.837861094038107e-05, "loss": 1.5093, "step": 5276 }, { "epoch": 0.405500921942225, "grad_norm": 3.141219139099121, "learning_rate": 1.83779963122311e-05, "loss": 1.3509, "step": 5278 }, { "epoch": 0.4056545789797173, "grad_norm": 3.446096658706665, "learning_rate": 1.837738168408113e-05, "loss": 1.4114, "step": 5280 }, { "epoch": 0.4058082360172096, "grad_norm": 3.5960264205932617, "learning_rate": 1.8376767055931164e-05, "loss": 1.5067, "step": 5282 }, { "epoch": 0.40596189305470193, "grad_norm": 3.441776990890503, "learning_rate": 1.8376152427781194e-05, "loss": 1.3281, "step": 5284 }, { "epoch": 0.40611555009219424, "grad_norm": 2.811274290084839, "learning_rate": 1.8375537799631223e-05, "loss": 1.4239, "step": 5286 }, { "epoch": 0.40626920712968656, "grad_norm": 2.918977737426758, "learning_rate": 1.8374923171481257e-05, "loss": 1.3116, "step": 5288 }, { "epoch": 0.4064228641671789, "grad_norm": 3.3775150775909424, "learning_rate": 1.8374308543331286e-05, "loss": 1.476, "step": 5290 }, { "epoch": 0.4065765212046712, "grad_norm": 3.5085885524749756, "learning_rate": 1.8373693915181316e-05, "loss": 1.4227, "step": 5292 }, { "epoch": 0.4067301782421635, "grad_norm": 3.496614933013916, "learning_rate": 1.837307928703135e-05, "loss": 1.3704, "step": 5294 }, { "epoch": 0.4068838352796558, "grad_norm": 3.1365761756896973, "learning_rate": 1.837246465888138e-05, "loss": 1.4542, "step": 5296 }, { "epoch": 0.40703749231714814, "grad_norm": 3.266559600830078, "learning_rate": 1.837185003073141e-05, "loss": 1.2663, "step": 5298 }, { "epoch": 0.40719114935464046, "grad_norm": 3.0944910049438477, "learning_rate": 1.837123540258144e-05, "loss": 1.3376, "step": 5300 }, { "epoch": 0.4073448063921328, "grad_norm": 3.3353304862976074, "learning_rate": 1.837062077443147e-05, "loss": 1.4512, "step": 5302 }, { "epoch": 0.4074984634296251, "grad_norm": 3.0446059703826904, "learning_rate": 1.83700061462815e-05, "loss": 1.3703, "step": 5304 }, { "epoch": 0.4076521204671174, "grad_norm": 2.9475607872009277, "learning_rate": 1.836939151813153e-05, "loss": 1.4063, "step": 5306 }, { "epoch": 0.4078057775046097, "grad_norm": 3.077463388442993, "learning_rate": 1.8368776889981564e-05, "loss": 1.378, "step": 5308 }, { "epoch": 0.40795943454210204, "grad_norm": 3.3104872703552246, "learning_rate": 1.8368162261831593e-05, "loss": 1.3273, "step": 5310 }, { "epoch": 0.40811309157959436, "grad_norm": 3.311328649520874, "learning_rate": 1.8367547633681623e-05, "loss": 1.3674, "step": 5312 }, { "epoch": 0.40826674861708667, "grad_norm": 3.025520086288452, "learning_rate": 1.8366933005531656e-05, "loss": 1.3716, "step": 5314 }, { "epoch": 0.408420405654579, "grad_norm": 3.5785486698150635, "learning_rate": 1.8366318377381686e-05, "loss": 1.4323, "step": 5316 }, { "epoch": 0.4085740626920713, "grad_norm": 3.210676670074463, "learning_rate": 1.8365703749231715e-05, "loss": 1.5345, "step": 5318 }, { "epoch": 0.4087277197295636, "grad_norm": 3.0286824703216553, "learning_rate": 1.836508912108175e-05, "loss": 1.423, "step": 5320 }, { "epoch": 0.40888137676705594, "grad_norm": 3.5791726112365723, "learning_rate": 1.8364474492931778e-05, "loss": 1.4609, "step": 5322 }, { "epoch": 0.40903503380454825, "grad_norm": 2.9176530838012695, "learning_rate": 1.8363859864781808e-05, "loss": 1.2732, "step": 5324 }, { "epoch": 0.40918869084204057, "grad_norm": 3.217237949371338, "learning_rate": 1.836324523663184e-05, "loss": 1.4485, "step": 5326 }, { "epoch": 0.4093423478795329, "grad_norm": 3.778691530227661, "learning_rate": 1.836263060848187e-05, "loss": 1.3841, "step": 5328 }, { "epoch": 0.4094960049170252, "grad_norm": 2.9347708225250244, "learning_rate": 1.83620159803319e-05, "loss": 1.3844, "step": 5330 }, { "epoch": 0.4096496619545175, "grad_norm": 3.1779749393463135, "learning_rate": 1.836140135218193e-05, "loss": 1.4284, "step": 5332 }, { "epoch": 0.40980331899200984, "grad_norm": 3.506448745727539, "learning_rate": 1.8360786724031963e-05, "loss": 1.4299, "step": 5334 }, { "epoch": 0.40995697602950215, "grad_norm": 2.704458713531494, "learning_rate": 1.8360172095881993e-05, "loss": 1.3064, "step": 5336 }, { "epoch": 0.41011063306699447, "grad_norm": 2.8528892993927, "learning_rate": 1.8359557467732022e-05, "loss": 1.2785, "step": 5338 }, { "epoch": 0.4102642901044868, "grad_norm": 3.1593194007873535, "learning_rate": 1.8358942839582055e-05, "loss": 1.4634, "step": 5340 }, { "epoch": 0.4104179471419791, "grad_norm": 4.361567974090576, "learning_rate": 1.8358328211432085e-05, "loss": 1.3619, "step": 5342 }, { "epoch": 0.4105716041794714, "grad_norm": 3.1610984802246094, "learning_rate": 1.8357713583282115e-05, "loss": 1.4343, "step": 5344 }, { "epoch": 0.41072526121696373, "grad_norm": 3.0249135494232178, "learning_rate": 1.8357098955132148e-05, "loss": 1.3613, "step": 5346 }, { "epoch": 0.41087891825445605, "grad_norm": 3.4013311862945557, "learning_rate": 1.8356484326982178e-05, "loss": 1.2754, "step": 5348 }, { "epoch": 0.41103257529194837, "grad_norm": 3.1088316440582275, "learning_rate": 1.8355869698832207e-05, "loss": 1.328, "step": 5350 }, { "epoch": 0.4111862323294407, "grad_norm": 2.942134141921997, "learning_rate": 1.8355255070682237e-05, "loss": 1.4541, "step": 5352 }, { "epoch": 0.411339889366933, "grad_norm": 3.704908847808838, "learning_rate": 1.835464044253227e-05, "loss": 1.477, "step": 5354 }, { "epoch": 0.4114935464044253, "grad_norm": 3.170875072479248, "learning_rate": 1.83540258143823e-05, "loss": 1.4399, "step": 5356 }, { "epoch": 0.41164720344191763, "grad_norm": 2.908482074737549, "learning_rate": 1.835341118623233e-05, "loss": 1.2963, "step": 5358 }, { "epoch": 0.41180086047940995, "grad_norm": 3.5838634967803955, "learning_rate": 1.8352796558082362e-05, "loss": 1.4518, "step": 5360 }, { "epoch": 0.41195451751690226, "grad_norm": 3.6148693561553955, "learning_rate": 1.8352181929932392e-05, "loss": 1.3671, "step": 5362 }, { "epoch": 0.4121081745543946, "grad_norm": 3.1762709617614746, "learning_rate": 1.8351567301782422e-05, "loss": 1.3727, "step": 5364 }, { "epoch": 0.4122618315918869, "grad_norm": 3.1139426231384277, "learning_rate": 1.8350952673632455e-05, "loss": 1.4239, "step": 5366 }, { "epoch": 0.4124154886293792, "grad_norm": 3.6204798221588135, "learning_rate": 1.8350338045482485e-05, "loss": 1.3177, "step": 5368 }, { "epoch": 0.41256914566687153, "grad_norm": 5.793487071990967, "learning_rate": 1.8349723417332514e-05, "loss": 1.4471, "step": 5370 }, { "epoch": 0.41272280270436384, "grad_norm": 3.2737765312194824, "learning_rate": 1.8349108789182547e-05, "loss": 1.4576, "step": 5372 }, { "epoch": 0.41287645974185616, "grad_norm": 3.0231080055236816, "learning_rate": 1.8348494161032577e-05, "loss": 1.3666, "step": 5374 }, { "epoch": 0.4130301167793485, "grad_norm": 2.8385233879089355, "learning_rate": 1.8347879532882607e-05, "loss": 1.3454, "step": 5376 }, { "epoch": 0.4131837738168408, "grad_norm": 3.1396775245666504, "learning_rate": 1.8347264904732636e-05, "loss": 1.4084, "step": 5378 }, { "epoch": 0.4133374308543331, "grad_norm": 3.143645763397217, "learning_rate": 1.834665027658267e-05, "loss": 1.3337, "step": 5380 }, { "epoch": 0.4134910878918254, "grad_norm": 2.823948383331299, "learning_rate": 1.83460356484327e-05, "loss": 1.3681, "step": 5382 }, { "epoch": 0.41364474492931774, "grad_norm": 3.2592108249664307, "learning_rate": 1.834542102028273e-05, "loss": 1.4719, "step": 5384 }, { "epoch": 0.41379840196681006, "grad_norm": 2.9613685607910156, "learning_rate": 1.8344806392132762e-05, "loss": 1.3567, "step": 5386 }, { "epoch": 0.4139520590043024, "grad_norm": 3.1878020763397217, "learning_rate": 1.834419176398279e-05, "loss": 1.4586, "step": 5388 }, { "epoch": 0.4141057160417947, "grad_norm": 3.22352933883667, "learning_rate": 1.834357713583282e-05, "loss": 1.3671, "step": 5390 }, { "epoch": 0.414259373079287, "grad_norm": 3.33892822265625, "learning_rate": 1.8342962507682854e-05, "loss": 1.3421, "step": 5392 }, { "epoch": 0.4144130301167793, "grad_norm": 3.0953426361083984, "learning_rate": 1.8342347879532884e-05, "loss": 1.4798, "step": 5394 }, { "epoch": 0.41456668715427164, "grad_norm": 3.559352397918701, "learning_rate": 1.8341733251382914e-05, "loss": 1.4564, "step": 5396 }, { "epoch": 0.41472034419176396, "grad_norm": 3.4220855236053467, "learning_rate": 1.8341118623232947e-05, "loss": 1.3482, "step": 5398 }, { "epoch": 0.4148740012292563, "grad_norm": 3.376953363418579, "learning_rate": 1.8340503995082976e-05, "loss": 1.3386, "step": 5400 }, { "epoch": 0.41502765826674864, "grad_norm": 3.256744861602783, "learning_rate": 1.833988936693301e-05, "loss": 1.4248, "step": 5402 }, { "epoch": 0.41518131530424096, "grad_norm": 3.20052170753479, "learning_rate": 1.8339274738783036e-05, "loss": 1.3001, "step": 5404 }, { "epoch": 0.4153349723417333, "grad_norm": 3.402547836303711, "learning_rate": 1.833866011063307e-05, "loss": 1.3549, "step": 5406 }, { "epoch": 0.4154886293792256, "grad_norm": 3.4516971111297607, "learning_rate": 1.83380454824831e-05, "loss": 1.4308, "step": 5408 }, { "epoch": 0.4156422864167179, "grad_norm": 3.404031753540039, "learning_rate": 1.8337430854333128e-05, "loss": 1.4279, "step": 5410 }, { "epoch": 0.4157959434542102, "grad_norm": 3.22731876373291, "learning_rate": 1.833681622618316e-05, "loss": 1.3986, "step": 5412 }, { "epoch": 0.41594960049170254, "grad_norm": 3.5908353328704834, "learning_rate": 1.833620159803319e-05, "loss": 1.4378, "step": 5414 }, { "epoch": 0.41610325752919486, "grad_norm": 2.8062829971313477, "learning_rate": 1.833558696988322e-05, "loss": 1.3553, "step": 5416 }, { "epoch": 0.4162569145666872, "grad_norm": 3.7070417404174805, "learning_rate": 1.8334972341733254e-05, "loss": 1.473, "step": 5418 }, { "epoch": 0.4164105716041795, "grad_norm": 3.7002766132354736, "learning_rate": 1.8334357713583283e-05, "loss": 1.5181, "step": 5420 }, { "epoch": 0.4165642286416718, "grad_norm": 3.2762959003448486, "learning_rate": 1.8333743085433316e-05, "loss": 1.2852, "step": 5422 }, { "epoch": 0.4167178856791641, "grad_norm": 3.2527976036071777, "learning_rate": 1.8333128457283346e-05, "loss": 1.5333, "step": 5424 }, { "epoch": 0.41687154271665644, "grad_norm": 3.0944836139678955, "learning_rate": 1.8332513829133376e-05, "loss": 1.3534, "step": 5426 }, { "epoch": 0.41702519975414876, "grad_norm": 2.9821205139160156, "learning_rate": 1.833189920098341e-05, "loss": 1.3932, "step": 5428 }, { "epoch": 0.4171788567916411, "grad_norm": 2.741713285446167, "learning_rate": 1.8331284572833435e-05, "loss": 1.3016, "step": 5430 }, { "epoch": 0.4173325138291334, "grad_norm": 2.732677459716797, "learning_rate": 1.8330669944683468e-05, "loss": 1.2872, "step": 5432 }, { "epoch": 0.4174861708666257, "grad_norm": 3.852531909942627, "learning_rate": 1.8330055316533498e-05, "loss": 1.4938, "step": 5434 }, { "epoch": 0.417639827904118, "grad_norm": 2.77213716506958, "learning_rate": 1.8329440688383528e-05, "loss": 1.2675, "step": 5436 }, { "epoch": 0.41779348494161034, "grad_norm": 3.0634377002716064, "learning_rate": 1.832882606023356e-05, "loss": 1.4362, "step": 5438 }, { "epoch": 0.41794714197910265, "grad_norm": 3.7038559913635254, "learning_rate": 1.832821143208359e-05, "loss": 1.5786, "step": 5440 }, { "epoch": 0.41810079901659497, "grad_norm": 2.922403335571289, "learning_rate": 1.8327596803933623e-05, "loss": 1.36, "step": 5442 }, { "epoch": 0.4182544560540873, "grad_norm": 3.7074828147888184, "learning_rate": 1.8326982175783653e-05, "loss": 1.4259, "step": 5444 }, { "epoch": 0.4184081130915796, "grad_norm": 3.401524305343628, "learning_rate": 1.8326367547633683e-05, "loss": 1.4031, "step": 5446 }, { "epoch": 0.4185617701290719, "grad_norm": 3.994004487991333, "learning_rate": 1.8325752919483716e-05, "loss": 1.3395, "step": 5448 }, { "epoch": 0.41871542716656424, "grad_norm": 3.46433162689209, "learning_rate": 1.8325138291333746e-05, "loss": 1.3823, "step": 5450 }, { "epoch": 0.41886908420405655, "grad_norm": 3.4628031253814697, "learning_rate": 1.8324523663183775e-05, "loss": 1.4543, "step": 5452 }, { "epoch": 0.41902274124154887, "grad_norm": 3.4159624576568604, "learning_rate": 1.832390903503381e-05, "loss": 1.3729, "step": 5454 }, { "epoch": 0.4191763982790412, "grad_norm": 3.102025270462036, "learning_rate": 1.8323294406883835e-05, "loss": 1.3801, "step": 5456 }, { "epoch": 0.4193300553165335, "grad_norm": 3.3577568531036377, "learning_rate": 1.8322679778733868e-05, "loss": 1.3856, "step": 5458 }, { "epoch": 0.4194837123540258, "grad_norm": 3.043654203414917, "learning_rate": 1.8322065150583897e-05, "loss": 1.5341, "step": 5460 }, { "epoch": 0.41963736939151813, "grad_norm": 3.0720834732055664, "learning_rate": 1.832145052243393e-05, "loss": 1.3789, "step": 5462 }, { "epoch": 0.41979102642901045, "grad_norm": 3.7414183616638184, "learning_rate": 1.832083589428396e-05, "loss": 1.3286, "step": 5464 }, { "epoch": 0.41994468346650277, "grad_norm": 3.149899482727051, "learning_rate": 1.832022126613399e-05, "loss": 1.5493, "step": 5466 }, { "epoch": 0.4200983405039951, "grad_norm": 3.5700736045837402, "learning_rate": 1.8319606637984023e-05, "loss": 1.4777, "step": 5468 }, { "epoch": 0.4202519975414874, "grad_norm": 2.8731086254119873, "learning_rate": 1.8318992009834053e-05, "loss": 1.3664, "step": 5470 }, { "epoch": 0.4204056545789797, "grad_norm": 3.048271417617798, "learning_rate": 1.8318377381684082e-05, "loss": 1.5034, "step": 5472 }, { "epoch": 0.42055931161647203, "grad_norm": 2.9073920249938965, "learning_rate": 1.8317762753534115e-05, "loss": 1.3296, "step": 5474 }, { "epoch": 0.42071296865396435, "grad_norm": 3.0303492546081543, "learning_rate": 1.831714812538414e-05, "loss": 1.3736, "step": 5476 }, { "epoch": 0.42086662569145666, "grad_norm": 3.4354753494262695, "learning_rate": 1.8316533497234175e-05, "loss": 1.4299, "step": 5478 }, { "epoch": 0.421020282728949, "grad_norm": 3.6933016777038574, "learning_rate": 1.8315918869084204e-05, "loss": 1.4725, "step": 5480 }, { "epoch": 0.4211739397664413, "grad_norm": 3.131046772003174, "learning_rate": 1.8315304240934234e-05, "loss": 1.3902, "step": 5482 }, { "epoch": 0.4213275968039336, "grad_norm": 3.8743679523468018, "learning_rate": 1.8314689612784267e-05, "loss": 1.3884, "step": 5484 }, { "epoch": 0.42148125384142593, "grad_norm": 3.1388769149780273, "learning_rate": 1.8314074984634297e-05, "loss": 1.3651, "step": 5486 }, { "epoch": 0.42163491087891825, "grad_norm": 3.010105609893799, "learning_rate": 1.831346035648433e-05, "loss": 1.3886, "step": 5488 }, { "epoch": 0.42178856791641056, "grad_norm": 3.1552770137786865, "learning_rate": 1.831284572833436e-05, "loss": 1.511, "step": 5490 }, { "epoch": 0.4219422249539029, "grad_norm": 3.428985834121704, "learning_rate": 1.831223110018439e-05, "loss": 1.3053, "step": 5492 }, { "epoch": 0.4220958819913952, "grad_norm": 3.2660410404205322, "learning_rate": 1.8311616472034422e-05, "loss": 1.4879, "step": 5494 }, { "epoch": 0.4222495390288875, "grad_norm": 3.157233238220215, "learning_rate": 1.8311001843884452e-05, "loss": 1.3789, "step": 5496 }, { "epoch": 0.4224031960663798, "grad_norm": 3.170839786529541, "learning_rate": 1.8310387215734482e-05, "loss": 1.2828, "step": 5498 }, { "epoch": 0.42255685310387214, "grad_norm": 4.559313774108887, "learning_rate": 1.8309772587584515e-05, "loss": 1.4493, "step": 5500 }, { "epoch": 0.42271051014136446, "grad_norm": 3.0980498790740967, "learning_rate": 1.830915795943454e-05, "loss": 1.2786, "step": 5502 }, { "epoch": 0.4228641671788568, "grad_norm": 3.488553047180176, "learning_rate": 1.8308543331284574e-05, "loss": 1.5031, "step": 5504 }, { "epoch": 0.4230178242163491, "grad_norm": 3.1194798946380615, "learning_rate": 1.8307928703134604e-05, "loss": 1.2781, "step": 5506 }, { "epoch": 0.4231714812538414, "grad_norm": 3.33617901802063, "learning_rate": 1.8307314074984637e-05, "loss": 1.3998, "step": 5508 }, { "epoch": 0.4233251382913337, "grad_norm": 3.321866273880005, "learning_rate": 1.8306699446834667e-05, "loss": 1.412, "step": 5510 }, { "epoch": 0.42347879532882604, "grad_norm": 3.68853497505188, "learning_rate": 1.8306084818684696e-05, "loss": 1.374, "step": 5512 }, { "epoch": 0.42363245236631836, "grad_norm": 3.372403383255005, "learning_rate": 1.830547019053473e-05, "loss": 1.3341, "step": 5514 }, { "epoch": 0.4237861094038107, "grad_norm": 3.3488733768463135, "learning_rate": 1.830485556238476e-05, "loss": 1.3271, "step": 5516 }, { "epoch": 0.423939766441303, "grad_norm": 3.3170018196105957, "learning_rate": 1.830424093423479e-05, "loss": 1.5108, "step": 5518 }, { "epoch": 0.4240934234787953, "grad_norm": 3.225128412246704, "learning_rate": 1.8303626306084822e-05, "loss": 1.5062, "step": 5520 }, { "epoch": 0.4242470805162876, "grad_norm": 3.2453081607818604, "learning_rate": 1.830301167793485e-05, "loss": 1.3055, "step": 5522 }, { "epoch": 0.42440073755377994, "grad_norm": 2.8871963024139404, "learning_rate": 1.830239704978488e-05, "loss": 1.4496, "step": 5524 }, { "epoch": 0.42455439459127225, "grad_norm": 3.092216730117798, "learning_rate": 1.8301782421634914e-05, "loss": 1.3905, "step": 5526 }, { "epoch": 0.42470805162876457, "grad_norm": 3.2610034942626953, "learning_rate": 1.8301167793484944e-05, "loss": 1.4187, "step": 5528 }, { "epoch": 0.4248617086662569, "grad_norm": 3.0660836696624756, "learning_rate": 1.8300553165334974e-05, "loss": 1.4302, "step": 5530 }, { "epoch": 0.42501536570374926, "grad_norm": 2.9052979946136475, "learning_rate": 1.8299938537185003e-05, "loss": 1.3238, "step": 5532 }, { "epoch": 0.4251690227412416, "grad_norm": 3.2881360054016113, "learning_rate": 1.8299323909035036e-05, "loss": 1.6043, "step": 5534 }, { "epoch": 0.4253226797787339, "grad_norm": 3.306642532348633, "learning_rate": 1.8298709280885066e-05, "loss": 1.398, "step": 5536 }, { "epoch": 0.4254763368162262, "grad_norm": 3.069509983062744, "learning_rate": 1.8298094652735096e-05, "loss": 1.2468, "step": 5538 }, { "epoch": 0.4256299938537185, "grad_norm": 3.1374897956848145, "learning_rate": 1.829748002458513e-05, "loss": 1.479, "step": 5540 }, { "epoch": 0.42578365089121084, "grad_norm": 8.134029388427734, "learning_rate": 1.829686539643516e-05, "loss": 1.4549, "step": 5542 }, { "epoch": 0.42593730792870316, "grad_norm": 3.1467490196228027, "learning_rate": 1.8296250768285188e-05, "loss": 1.3115, "step": 5544 }, { "epoch": 0.4260909649661955, "grad_norm": 2.9459738731384277, "learning_rate": 1.829563614013522e-05, "loss": 1.2946, "step": 5546 }, { "epoch": 0.4262446220036878, "grad_norm": 2.924170970916748, "learning_rate": 1.829502151198525e-05, "loss": 1.4114, "step": 5548 }, { "epoch": 0.4263982790411801, "grad_norm": 3.5444743633270264, "learning_rate": 1.829440688383528e-05, "loss": 1.413, "step": 5550 }, { "epoch": 0.4265519360786724, "grad_norm": 3.2071783542633057, "learning_rate": 1.8293792255685314e-05, "loss": 1.4042, "step": 5552 }, { "epoch": 0.42670559311616474, "grad_norm": 3.367691993713379, "learning_rate": 1.8293177627535343e-05, "loss": 1.3649, "step": 5554 }, { "epoch": 0.42685925015365705, "grad_norm": 3.16751766204834, "learning_rate": 1.8292562999385373e-05, "loss": 1.3487, "step": 5556 }, { "epoch": 0.42701290719114937, "grad_norm": 3.2429380416870117, "learning_rate": 1.8291948371235403e-05, "loss": 1.3935, "step": 5558 }, { "epoch": 0.4271665642286417, "grad_norm": 4.151123046875, "learning_rate": 1.8291333743085436e-05, "loss": 1.3883, "step": 5560 }, { "epoch": 0.427320221266134, "grad_norm": 3.531403064727783, "learning_rate": 1.8290719114935465e-05, "loss": 1.4176, "step": 5562 }, { "epoch": 0.4274738783036263, "grad_norm": 2.87608003616333, "learning_rate": 1.8290104486785495e-05, "loss": 1.3478, "step": 5564 }, { "epoch": 0.42762753534111864, "grad_norm": 3.214362621307373, "learning_rate": 1.8289489858635528e-05, "loss": 1.3697, "step": 5566 }, { "epoch": 0.42778119237861095, "grad_norm": 3.0433409214019775, "learning_rate": 1.8288875230485558e-05, "loss": 1.4356, "step": 5568 }, { "epoch": 0.42793484941610327, "grad_norm": 3.267775297164917, "learning_rate": 1.8288260602335588e-05, "loss": 1.4391, "step": 5570 }, { "epoch": 0.4280885064535956, "grad_norm": 3.465855121612549, "learning_rate": 1.828764597418562e-05, "loss": 1.3637, "step": 5572 }, { "epoch": 0.4282421634910879, "grad_norm": 2.8692171573638916, "learning_rate": 1.828703134603565e-05, "loss": 1.3866, "step": 5574 }, { "epoch": 0.4283958205285802, "grad_norm": 2.991672992706299, "learning_rate": 1.828641671788568e-05, "loss": 1.3646, "step": 5576 }, { "epoch": 0.42854947756607253, "grad_norm": 3.042018413543701, "learning_rate": 1.828580208973571e-05, "loss": 1.3627, "step": 5578 }, { "epoch": 0.42870313460356485, "grad_norm": 3.0607712268829346, "learning_rate": 1.8285187461585743e-05, "loss": 1.4435, "step": 5580 }, { "epoch": 0.42885679164105717, "grad_norm": 3.6141839027404785, "learning_rate": 1.8284572833435772e-05, "loss": 1.3652, "step": 5582 }, { "epoch": 0.4290104486785495, "grad_norm": 2.990736722946167, "learning_rate": 1.8283958205285802e-05, "loss": 1.3635, "step": 5584 }, { "epoch": 0.4291641057160418, "grad_norm": 3.5047953128814697, "learning_rate": 1.8283343577135835e-05, "loss": 1.3506, "step": 5586 }, { "epoch": 0.4293177627535341, "grad_norm": 3.17103910446167, "learning_rate": 1.8282728948985865e-05, "loss": 1.2826, "step": 5588 }, { "epoch": 0.42947141979102643, "grad_norm": 2.9763083457946777, "learning_rate": 1.8282114320835895e-05, "loss": 1.5198, "step": 5590 }, { "epoch": 0.42962507682851875, "grad_norm": 3.388418197631836, "learning_rate": 1.8281499692685928e-05, "loss": 1.3798, "step": 5592 }, { "epoch": 0.42977873386601106, "grad_norm": 3.1480460166931152, "learning_rate": 1.8280885064535957e-05, "loss": 1.3923, "step": 5594 }, { "epoch": 0.4299323909035034, "grad_norm": 3.1141629219055176, "learning_rate": 1.8280270436385987e-05, "loss": 1.4089, "step": 5596 }, { "epoch": 0.4300860479409957, "grad_norm": 2.6607182025909424, "learning_rate": 1.827965580823602e-05, "loss": 1.3245, "step": 5598 }, { "epoch": 0.430239704978488, "grad_norm": 3.3764638900756836, "learning_rate": 1.827904118008605e-05, "loss": 1.3798, "step": 5600 }, { "epoch": 0.43039336201598033, "grad_norm": 2.802183151245117, "learning_rate": 1.827842655193608e-05, "loss": 1.3138, "step": 5602 }, { "epoch": 0.43054701905347265, "grad_norm": 3.2850708961486816, "learning_rate": 1.827781192378611e-05, "loss": 1.3341, "step": 5604 }, { "epoch": 0.43070067609096496, "grad_norm": 3.2753207683563232, "learning_rate": 1.8277197295636142e-05, "loss": 1.3529, "step": 5606 }, { "epoch": 0.4308543331284573, "grad_norm": 3.2558462619781494, "learning_rate": 1.8276582667486172e-05, "loss": 1.4033, "step": 5608 }, { "epoch": 0.4310079901659496, "grad_norm": 2.962024211883545, "learning_rate": 1.82759680393362e-05, "loss": 1.3368, "step": 5610 }, { "epoch": 0.4311616472034419, "grad_norm": 3.1479384899139404, "learning_rate": 1.8275353411186235e-05, "loss": 1.3557, "step": 5612 }, { "epoch": 0.4313153042409342, "grad_norm": 3.0928778648376465, "learning_rate": 1.8274738783036264e-05, "loss": 1.4615, "step": 5614 }, { "epoch": 0.43146896127842654, "grad_norm": 3.5253334045410156, "learning_rate": 1.8274124154886294e-05, "loss": 1.3321, "step": 5616 }, { "epoch": 0.43162261831591886, "grad_norm": 3.1185035705566406, "learning_rate": 1.8273509526736327e-05, "loss": 1.4343, "step": 5618 }, { "epoch": 0.4317762753534112, "grad_norm": 3.1733176708221436, "learning_rate": 1.8272894898586357e-05, "loss": 1.4088, "step": 5620 }, { "epoch": 0.4319299323909035, "grad_norm": 3.129121780395508, "learning_rate": 1.8272280270436386e-05, "loss": 1.3631, "step": 5622 }, { "epoch": 0.4320835894283958, "grad_norm": 3.0119454860687256, "learning_rate": 1.827166564228642e-05, "loss": 1.3877, "step": 5624 }, { "epoch": 0.4322372464658881, "grad_norm": 3.1937038898468018, "learning_rate": 1.827105101413645e-05, "loss": 1.3881, "step": 5626 }, { "epoch": 0.43239090350338044, "grad_norm": 2.997652769088745, "learning_rate": 1.827043638598648e-05, "loss": 1.3087, "step": 5628 }, { "epoch": 0.43254456054087276, "grad_norm": 2.841203212738037, "learning_rate": 1.826982175783651e-05, "loss": 1.2979, "step": 5630 }, { "epoch": 0.4326982175783651, "grad_norm": 3.034036636352539, "learning_rate": 1.826920712968654e-05, "loss": 1.229, "step": 5632 }, { "epoch": 0.4328518746158574, "grad_norm": 2.866893768310547, "learning_rate": 1.826859250153657e-05, "loss": 1.4217, "step": 5634 }, { "epoch": 0.4330055316533497, "grad_norm": 2.9957258701324463, "learning_rate": 1.82679778733866e-05, "loss": 1.5528, "step": 5636 }, { "epoch": 0.433159188690842, "grad_norm": 2.990673303604126, "learning_rate": 1.8267363245236634e-05, "loss": 1.4186, "step": 5638 }, { "epoch": 0.43331284572833434, "grad_norm": 3.3053412437438965, "learning_rate": 1.8266748617086664e-05, "loss": 1.4479, "step": 5640 }, { "epoch": 0.43346650276582666, "grad_norm": 3.35445237159729, "learning_rate": 1.8266133988936693e-05, "loss": 1.3, "step": 5642 }, { "epoch": 0.43362015980331897, "grad_norm": 3.1358654499053955, "learning_rate": 1.8265519360786727e-05, "loss": 1.3337, "step": 5644 }, { "epoch": 0.4337738168408113, "grad_norm": 3.132446050643921, "learning_rate": 1.8264904732636756e-05, "loss": 1.5941, "step": 5646 }, { "epoch": 0.4339274738783036, "grad_norm": 3.0220460891723633, "learning_rate": 1.8264290104486786e-05, "loss": 1.336, "step": 5648 }, { "epoch": 0.4340811309157959, "grad_norm": 3.412433385848999, "learning_rate": 1.826367547633682e-05, "loss": 1.4985, "step": 5650 }, { "epoch": 0.43423478795328824, "grad_norm": 3.2753655910491943, "learning_rate": 1.826306084818685e-05, "loss": 1.3647, "step": 5652 }, { "epoch": 0.43438844499078055, "grad_norm": 3.021898031234741, "learning_rate": 1.8262446220036882e-05, "loss": 1.416, "step": 5654 }, { "epoch": 0.43454210202827287, "grad_norm": 2.817472457885742, "learning_rate": 1.8261831591886908e-05, "loss": 1.3628, "step": 5656 }, { "epoch": 0.4346957590657652, "grad_norm": 2.896252155303955, "learning_rate": 1.826121696373694e-05, "loss": 1.4208, "step": 5658 }, { "epoch": 0.4348494161032575, "grad_norm": 3.5040619373321533, "learning_rate": 1.826060233558697e-05, "loss": 1.4589, "step": 5660 }, { "epoch": 0.4350030731407499, "grad_norm": 2.9098873138427734, "learning_rate": 1.8259987707437e-05, "loss": 1.4208, "step": 5662 }, { "epoch": 0.4351567301782422, "grad_norm": 3.0636959075927734, "learning_rate": 1.8259373079287034e-05, "loss": 1.4636, "step": 5664 }, { "epoch": 0.4353103872157345, "grad_norm": 2.761655807495117, "learning_rate": 1.8258758451137063e-05, "loss": 1.2639, "step": 5666 }, { "epoch": 0.4354640442532268, "grad_norm": 2.907930374145508, "learning_rate": 1.8258143822987093e-05, "loss": 1.498, "step": 5668 }, { "epoch": 0.43561770129071914, "grad_norm": 3.2700185775756836, "learning_rate": 1.8257529194837126e-05, "loss": 1.3444, "step": 5670 }, { "epoch": 0.43577135832821146, "grad_norm": 3.1657397747039795, "learning_rate": 1.8256914566687156e-05, "loss": 1.4352, "step": 5672 }, { "epoch": 0.43592501536570377, "grad_norm": 3.0951035022735596, "learning_rate": 1.825629993853719e-05, "loss": 1.3414, "step": 5674 }, { "epoch": 0.4360786724031961, "grad_norm": 2.763122797012329, "learning_rate": 1.8255685310387215e-05, "loss": 1.3182, "step": 5676 }, { "epoch": 0.4362323294406884, "grad_norm": 3.504037618637085, "learning_rate": 1.8255070682237248e-05, "loss": 1.3866, "step": 5678 }, { "epoch": 0.4363859864781807, "grad_norm": 3.1605093479156494, "learning_rate": 1.825445605408728e-05, "loss": 1.4574, "step": 5680 }, { "epoch": 0.43653964351567304, "grad_norm": 3.2070469856262207, "learning_rate": 1.8253841425937307e-05, "loss": 1.2922, "step": 5682 }, { "epoch": 0.43669330055316535, "grad_norm": 3.452393054962158, "learning_rate": 1.825322679778734e-05, "loss": 1.4231, "step": 5684 }, { "epoch": 0.43684695759065767, "grad_norm": 2.997011184692383, "learning_rate": 1.825261216963737e-05, "loss": 1.2949, "step": 5686 }, { "epoch": 0.43700061462815, "grad_norm": 3.404040813446045, "learning_rate": 1.82519975414874e-05, "loss": 1.3552, "step": 5688 }, { "epoch": 0.4371542716656423, "grad_norm": 3.3398168087005615, "learning_rate": 1.8251382913337433e-05, "loss": 1.3336, "step": 5690 }, { "epoch": 0.4373079287031346, "grad_norm": 3.090134859085083, "learning_rate": 1.8250768285187463e-05, "loss": 1.351, "step": 5692 }, { "epoch": 0.43746158574062693, "grad_norm": 3.0853729248046875, "learning_rate": 1.8250153657037496e-05, "loss": 1.2877, "step": 5694 }, { "epoch": 0.43761524277811925, "grad_norm": 2.9759137630462646, "learning_rate": 1.8249539028887525e-05, "loss": 1.3217, "step": 5696 }, { "epoch": 0.43776889981561157, "grad_norm": 3.0432422161102295, "learning_rate": 1.8248924400737555e-05, "loss": 1.4036, "step": 5698 }, { "epoch": 0.4379225568531039, "grad_norm": 2.5743117332458496, "learning_rate": 1.8248309772587588e-05, "loss": 1.3252, "step": 5700 }, { "epoch": 0.4380762138905962, "grad_norm": 2.912013530731201, "learning_rate": 1.8247695144437614e-05, "loss": 1.3125, "step": 5702 }, { "epoch": 0.4382298709280885, "grad_norm": 3.1190555095672607, "learning_rate": 1.8247080516287648e-05, "loss": 1.5832, "step": 5704 }, { "epoch": 0.43838352796558083, "grad_norm": 3.2913010120391846, "learning_rate": 1.8246465888137677e-05, "loss": 1.4592, "step": 5706 }, { "epoch": 0.43853718500307315, "grad_norm": 3.0158445835113525, "learning_rate": 1.8245851259987707e-05, "loss": 1.258, "step": 5708 }, { "epoch": 0.43869084204056547, "grad_norm": 2.7523088455200195, "learning_rate": 1.824523663183774e-05, "loss": 1.3033, "step": 5710 }, { "epoch": 0.4388444990780578, "grad_norm": 3.2428712844848633, "learning_rate": 1.824462200368777e-05, "loss": 1.5227, "step": 5712 }, { "epoch": 0.4389981561155501, "grad_norm": 3.1375772953033447, "learning_rate": 1.8244007375537803e-05, "loss": 1.2947, "step": 5714 }, { "epoch": 0.4391518131530424, "grad_norm": 3.0907890796661377, "learning_rate": 1.8243392747387832e-05, "loss": 1.5986, "step": 5716 }, { "epoch": 0.43930547019053473, "grad_norm": 2.9191091060638428, "learning_rate": 1.8242778119237862e-05, "loss": 1.3305, "step": 5718 }, { "epoch": 0.43945912722802705, "grad_norm": 2.765789747238159, "learning_rate": 1.8242163491087895e-05, "loss": 1.3996, "step": 5720 }, { "epoch": 0.43961278426551936, "grad_norm": 3.196028470993042, "learning_rate": 1.8241548862937925e-05, "loss": 1.4643, "step": 5722 }, { "epoch": 0.4397664413030117, "grad_norm": 3.159723997116089, "learning_rate": 1.8240934234787955e-05, "loss": 1.2982, "step": 5724 }, { "epoch": 0.439920098340504, "grad_norm": 4.066509246826172, "learning_rate": 1.8240319606637988e-05, "loss": 1.4798, "step": 5726 }, { "epoch": 0.4400737553779963, "grad_norm": 3.310293436050415, "learning_rate": 1.8239704978488014e-05, "loss": 1.3133, "step": 5728 }, { "epoch": 0.44022741241548863, "grad_norm": 3.166078805923462, "learning_rate": 1.8239090350338047e-05, "loss": 1.3163, "step": 5730 }, { "epoch": 0.44038106945298094, "grad_norm": 3.0429303646087646, "learning_rate": 1.8238475722188077e-05, "loss": 1.3178, "step": 5732 }, { "epoch": 0.44053472649047326, "grad_norm": 2.984344959259033, "learning_rate": 1.8237861094038106e-05, "loss": 1.4449, "step": 5734 }, { "epoch": 0.4406883835279656, "grad_norm": 3.379456043243408, "learning_rate": 1.823724646588814e-05, "loss": 1.3448, "step": 5736 }, { "epoch": 0.4408420405654579, "grad_norm": 3.0715973377227783, "learning_rate": 1.823663183773817e-05, "loss": 1.4384, "step": 5738 }, { "epoch": 0.4409956976029502, "grad_norm": 3.0291173458099365, "learning_rate": 1.8236017209588202e-05, "loss": 1.452, "step": 5740 }, { "epoch": 0.4411493546404425, "grad_norm": 3.0475244522094727, "learning_rate": 1.8235402581438232e-05, "loss": 1.3113, "step": 5742 }, { "epoch": 0.44130301167793484, "grad_norm": 3.0464026927948, "learning_rate": 1.823478795328826e-05, "loss": 1.2598, "step": 5744 }, { "epoch": 0.44145666871542716, "grad_norm": 3.1016945838928223, "learning_rate": 1.8234173325138295e-05, "loss": 1.396, "step": 5746 }, { "epoch": 0.4416103257529195, "grad_norm": 3.356527328491211, "learning_rate": 1.8233558696988324e-05, "loss": 1.4148, "step": 5748 }, { "epoch": 0.4417639827904118, "grad_norm": 2.8270328044891357, "learning_rate": 1.8232944068838354e-05, "loss": 1.346, "step": 5750 }, { "epoch": 0.4419176398279041, "grad_norm": 3.364140510559082, "learning_rate": 1.8232329440688387e-05, "loss": 1.3203, "step": 5752 }, { "epoch": 0.4420712968653964, "grad_norm": 3.246849298477173, "learning_rate": 1.8231714812538413e-05, "loss": 1.5842, "step": 5754 }, { "epoch": 0.44222495390288874, "grad_norm": 3.0349695682525635, "learning_rate": 1.8231100184388446e-05, "loss": 1.3516, "step": 5756 }, { "epoch": 0.44237861094038106, "grad_norm": 3.3905465602874756, "learning_rate": 1.8230485556238476e-05, "loss": 1.3804, "step": 5758 }, { "epoch": 0.4425322679778734, "grad_norm": 3.100095510482788, "learning_rate": 1.822987092808851e-05, "loss": 1.3149, "step": 5760 }, { "epoch": 0.4426859250153657, "grad_norm": 2.685727596282959, "learning_rate": 1.822925629993854e-05, "loss": 1.4374, "step": 5762 }, { "epoch": 0.442839582052858, "grad_norm": 3.614854335784912, "learning_rate": 1.822864167178857e-05, "loss": 1.4598, "step": 5764 }, { "epoch": 0.4429932390903503, "grad_norm": 3.2616031169891357, "learning_rate": 1.82280270436386e-05, "loss": 1.3823, "step": 5766 }, { "epoch": 0.44314689612784264, "grad_norm": 3.1923673152923584, "learning_rate": 1.822741241548863e-05, "loss": 1.3283, "step": 5768 }, { "epoch": 0.44330055316533495, "grad_norm": 2.938978433609009, "learning_rate": 1.822679778733866e-05, "loss": 1.3428, "step": 5770 }, { "epoch": 0.44345421020282727, "grad_norm": 2.983696222305298, "learning_rate": 1.8226183159188694e-05, "loss": 1.4124, "step": 5772 }, { "epoch": 0.4436078672403196, "grad_norm": 2.893930673599243, "learning_rate": 1.8225568531038724e-05, "loss": 1.3289, "step": 5774 }, { "epoch": 0.4437615242778119, "grad_norm": 2.9682881832122803, "learning_rate": 1.8224953902888753e-05, "loss": 1.3341, "step": 5776 }, { "epoch": 0.4439151813153042, "grad_norm": 3.4092354774475098, "learning_rate": 1.8224339274738786e-05, "loss": 1.4816, "step": 5778 }, { "epoch": 0.44406883835279654, "grad_norm": 3.190753698348999, "learning_rate": 1.8223724646588816e-05, "loss": 1.4722, "step": 5780 }, { "epoch": 0.44422249539028885, "grad_norm": 3.21653413772583, "learning_rate": 1.8223110018438846e-05, "loss": 1.3672, "step": 5782 }, { "epoch": 0.44437615242778117, "grad_norm": 2.8916025161743164, "learning_rate": 1.8222495390288876e-05, "loss": 1.3928, "step": 5784 }, { "epoch": 0.4445298094652735, "grad_norm": 3.229156494140625, "learning_rate": 1.822188076213891e-05, "loss": 1.3865, "step": 5786 }, { "epoch": 0.4446834665027658, "grad_norm": 3.3504526615142822, "learning_rate": 1.8221266133988938e-05, "loss": 1.4361, "step": 5788 }, { "epoch": 0.4448371235402581, "grad_norm": 3.143098831176758, "learning_rate": 1.8220651505838968e-05, "loss": 1.3759, "step": 5790 }, { "epoch": 0.44499078057775043, "grad_norm": 3.4050183296203613, "learning_rate": 1.8220036877689e-05, "loss": 1.4189, "step": 5792 }, { "epoch": 0.4451444376152428, "grad_norm": 2.914482593536377, "learning_rate": 1.821942224953903e-05, "loss": 1.4096, "step": 5794 }, { "epoch": 0.4452980946527351, "grad_norm": 3.0142135620117188, "learning_rate": 1.821880762138906e-05, "loss": 1.3218, "step": 5796 }, { "epoch": 0.44545175169022744, "grad_norm": 3.1696937084198, "learning_rate": 1.8218192993239093e-05, "loss": 1.3977, "step": 5798 }, { "epoch": 0.44560540872771975, "grad_norm": 3.157047986984253, "learning_rate": 1.8217578365089123e-05, "loss": 1.4031, "step": 5800 }, { "epoch": 0.44575906576521207, "grad_norm": 3.1851911544799805, "learning_rate": 1.8216963736939153e-05, "loss": 1.4248, "step": 5802 }, { "epoch": 0.4459127228027044, "grad_norm": 2.895350456237793, "learning_rate": 1.8216349108789183e-05, "loss": 1.3526, "step": 5804 }, { "epoch": 0.4460663798401967, "grad_norm": 3.438236713409424, "learning_rate": 1.8215734480639216e-05, "loss": 1.4285, "step": 5806 }, { "epoch": 0.446220036877689, "grad_norm": 3.439058780670166, "learning_rate": 1.8215119852489245e-05, "loss": 1.3343, "step": 5808 }, { "epoch": 0.44637369391518134, "grad_norm": 3.126304864883423, "learning_rate": 1.8214505224339275e-05, "loss": 1.4201, "step": 5810 }, { "epoch": 0.44652735095267365, "grad_norm": 2.642670154571533, "learning_rate": 1.8213890596189308e-05, "loss": 1.3266, "step": 5812 }, { "epoch": 0.44668100799016597, "grad_norm": 3.0952022075653076, "learning_rate": 1.8213275968039338e-05, "loss": 1.4502, "step": 5814 }, { "epoch": 0.4468346650276583, "grad_norm": 3.7239139080047607, "learning_rate": 1.8212661339889367e-05, "loss": 1.5575, "step": 5816 }, { "epoch": 0.4469883220651506, "grad_norm": 3.190653085708618, "learning_rate": 1.82120467117394e-05, "loss": 1.4421, "step": 5818 }, { "epoch": 0.4471419791026429, "grad_norm": 3.4979770183563232, "learning_rate": 1.821143208358943e-05, "loss": 1.3664, "step": 5820 }, { "epoch": 0.44729563614013523, "grad_norm": 3.044233798980713, "learning_rate": 1.821081745543946e-05, "loss": 1.3876, "step": 5822 }, { "epoch": 0.44744929317762755, "grad_norm": 3.0839719772338867, "learning_rate": 1.8210202827289493e-05, "loss": 1.294, "step": 5824 }, { "epoch": 0.44760295021511987, "grad_norm": 3.3031883239746094, "learning_rate": 1.8209588199139523e-05, "loss": 1.4067, "step": 5826 }, { "epoch": 0.4477566072526122, "grad_norm": 3.392949104309082, "learning_rate": 1.8208973570989552e-05, "loss": 1.4638, "step": 5828 }, { "epoch": 0.4479102642901045, "grad_norm": 3.3899407386779785, "learning_rate": 1.8208358942839582e-05, "loss": 1.3866, "step": 5830 }, { "epoch": 0.4480639213275968, "grad_norm": 2.626540422439575, "learning_rate": 1.8207744314689615e-05, "loss": 1.2544, "step": 5832 }, { "epoch": 0.44821757836508913, "grad_norm": 3.6084189414978027, "learning_rate": 1.8207129686539645e-05, "loss": 1.3872, "step": 5834 }, { "epoch": 0.44837123540258145, "grad_norm": 2.835602045059204, "learning_rate": 1.8206515058389674e-05, "loss": 1.3862, "step": 5836 }, { "epoch": 0.44852489244007376, "grad_norm": 2.9570329189300537, "learning_rate": 1.8205900430239707e-05, "loss": 1.4073, "step": 5838 }, { "epoch": 0.4486785494775661, "grad_norm": 2.910385847091675, "learning_rate": 1.8205285802089737e-05, "loss": 1.3464, "step": 5840 }, { "epoch": 0.4488322065150584, "grad_norm": 3.0790252685546875, "learning_rate": 1.8204671173939767e-05, "loss": 1.3937, "step": 5842 }, { "epoch": 0.4489858635525507, "grad_norm": 3.0580132007598877, "learning_rate": 1.82040565457898e-05, "loss": 1.2775, "step": 5844 }, { "epoch": 0.44913952059004303, "grad_norm": 2.8806533813476562, "learning_rate": 1.820344191763983e-05, "loss": 1.4138, "step": 5846 }, { "epoch": 0.44929317762753535, "grad_norm": 3.8401284217834473, "learning_rate": 1.820282728948986e-05, "loss": 1.3532, "step": 5848 }, { "epoch": 0.44944683466502766, "grad_norm": 3.237717628479004, "learning_rate": 1.8202212661339892e-05, "loss": 1.3316, "step": 5850 }, { "epoch": 0.44960049170252, "grad_norm": 2.958611011505127, "learning_rate": 1.8201598033189922e-05, "loss": 1.2768, "step": 5852 }, { "epoch": 0.4497541487400123, "grad_norm": 3.4589743614196777, "learning_rate": 1.8200983405039952e-05, "loss": 1.5463, "step": 5854 }, { "epoch": 0.4499078057775046, "grad_norm": 2.9172708988189697, "learning_rate": 1.820036877688998e-05, "loss": 1.4517, "step": 5856 }, { "epoch": 0.4500614628149969, "grad_norm": 3.213388681411743, "learning_rate": 1.8199754148740014e-05, "loss": 1.2623, "step": 5858 }, { "epoch": 0.45021511985248924, "grad_norm": 2.901362180709839, "learning_rate": 1.8199139520590044e-05, "loss": 1.2491, "step": 5860 }, { "epoch": 0.45036877688998156, "grad_norm": 3.0259957313537598, "learning_rate": 1.8198524892440074e-05, "loss": 1.5275, "step": 5862 }, { "epoch": 0.4505224339274739, "grad_norm": 3.00325870513916, "learning_rate": 1.8197910264290107e-05, "loss": 1.2732, "step": 5864 }, { "epoch": 0.4506760909649662, "grad_norm": 3.229722023010254, "learning_rate": 1.8197295636140137e-05, "loss": 1.4167, "step": 5866 }, { "epoch": 0.4508297480024585, "grad_norm": 3.2484633922576904, "learning_rate": 1.8196681007990166e-05, "loss": 1.3736, "step": 5868 }, { "epoch": 0.4509834050399508, "grad_norm": 2.872192859649658, "learning_rate": 1.81960663798402e-05, "loss": 1.4017, "step": 5870 }, { "epoch": 0.45113706207744314, "grad_norm": 3.5792553424835205, "learning_rate": 1.819545175169023e-05, "loss": 1.3938, "step": 5872 }, { "epoch": 0.45129071911493546, "grad_norm": 3.1069889068603516, "learning_rate": 1.819483712354026e-05, "loss": 1.3554, "step": 5874 }, { "epoch": 0.4514443761524278, "grad_norm": 3.3849682807922363, "learning_rate": 1.8194222495390292e-05, "loss": 1.3308, "step": 5876 }, { "epoch": 0.4515980331899201, "grad_norm": 3.07407808303833, "learning_rate": 1.819360786724032e-05, "loss": 1.3343, "step": 5878 }, { "epoch": 0.4517516902274124, "grad_norm": 3.1967318058013916, "learning_rate": 1.819299323909035e-05, "loss": 1.3538, "step": 5880 }, { "epoch": 0.4519053472649047, "grad_norm": 3.133248805999756, "learning_rate": 1.819237861094038e-05, "loss": 1.4287, "step": 5882 }, { "epoch": 0.45205900430239704, "grad_norm": 3.287682056427002, "learning_rate": 1.8191763982790414e-05, "loss": 1.3651, "step": 5884 }, { "epoch": 0.45221266133988935, "grad_norm": 3.5104236602783203, "learning_rate": 1.8191149354640444e-05, "loss": 1.3389, "step": 5886 }, { "epoch": 0.45236631837738167, "grad_norm": 3.1581168174743652, "learning_rate": 1.8190534726490473e-05, "loss": 1.493, "step": 5888 }, { "epoch": 0.452519975414874, "grad_norm": 3.408336877822876, "learning_rate": 1.8189920098340506e-05, "loss": 1.3866, "step": 5890 }, { "epoch": 0.4526736324523663, "grad_norm": 3.840761661529541, "learning_rate": 1.8189305470190536e-05, "loss": 1.5172, "step": 5892 }, { "epoch": 0.4528272894898586, "grad_norm": 3.4090771675109863, "learning_rate": 1.8188690842040566e-05, "loss": 1.3251, "step": 5894 }, { "epoch": 0.45298094652735094, "grad_norm": 2.792132616043091, "learning_rate": 1.81880762138906e-05, "loss": 1.4536, "step": 5896 }, { "epoch": 0.45313460356484325, "grad_norm": 2.6732709407806396, "learning_rate": 1.818746158574063e-05, "loss": 1.3353, "step": 5898 }, { "epoch": 0.45328826060233557, "grad_norm": 3.2554478645324707, "learning_rate": 1.8186846957590658e-05, "loss": 1.3426, "step": 5900 }, { "epoch": 0.4534419176398279, "grad_norm": 3.0684661865234375, "learning_rate": 1.8186232329440688e-05, "loss": 1.2457, "step": 5902 }, { "epoch": 0.4535955746773202, "grad_norm": 2.9400198459625244, "learning_rate": 1.818561770129072e-05, "loss": 1.386, "step": 5904 }, { "epoch": 0.4537492317148125, "grad_norm": 3.3350679874420166, "learning_rate": 1.8185003073140754e-05, "loss": 1.4366, "step": 5906 }, { "epoch": 0.45390288875230483, "grad_norm": 2.902355670928955, "learning_rate": 1.818438844499078e-05, "loss": 1.2785, "step": 5908 }, { "epoch": 0.45405654578979715, "grad_norm": 2.7429327964782715, "learning_rate": 1.8183773816840813e-05, "loss": 1.2129, "step": 5910 }, { "epoch": 0.45421020282728947, "grad_norm": 3.4028749465942383, "learning_rate": 1.8183159188690843e-05, "loss": 1.5344, "step": 5912 }, { "epoch": 0.4543638598647818, "grad_norm": 3.4940731525421143, "learning_rate": 1.8182544560540873e-05, "loss": 1.3376, "step": 5914 }, { "epoch": 0.4545175169022741, "grad_norm": 3.099562644958496, "learning_rate": 1.8181929932390906e-05, "loss": 1.3071, "step": 5916 }, { "epoch": 0.4546711739397664, "grad_norm": 2.8082468509674072, "learning_rate": 1.8181315304240935e-05, "loss": 1.273, "step": 5918 }, { "epoch": 0.45482483097725873, "grad_norm": 2.951359510421753, "learning_rate": 1.8180700676090965e-05, "loss": 1.2602, "step": 5920 }, { "epoch": 0.45497848801475105, "grad_norm": 3.421891212463379, "learning_rate": 1.8180086047940998e-05, "loss": 1.4899, "step": 5922 }, { "epoch": 0.4551321450522434, "grad_norm": 3.0956802368164062, "learning_rate": 1.8179471419791028e-05, "loss": 1.4078, "step": 5924 }, { "epoch": 0.45528580208973574, "grad_norm": 3.26657772064209, "learning_rate": 1.817885679164106e-05, "loss": 1.4338, "step": 5926 }, { "epoch": 0.45543945912722805, "grad_norm": 3.3916943073272705, "learning_rate": 1.8178242163491087e-05, "loss": 1.4113, "step": 5928 }, { "epoch": 0.45559311616472037, "grad_norm": 3.281963348388672, "learning_rate": 1.817762753534112e-05, "loss": 1.3594, "step": 5930 }, { "epoch": 0.4557467732022127, "grad_norm": 2.989858627319336, "learning_rate": 1.817701290719115e-05, "loss": 1.4444, "step": 5932 }, { "epoch": 0.455900430239705, "grad_norm": 2.8269715309143066, "learning_rate": 1.817639827904118e-05, "loss": 1.3883, "step": 5934 }, { "epoch": 0.4560540872771973, "grad_norm": 3.383490800857544, "learning_rate": 1.8175783650891213e-05, "loss": 1.4248, "step": 5936 }, { "epoch": 0.45620774431468963, "grad_norm": 3.237833023071289, "learning_rate": 1.8175169022741242e-05, "loss": 1.3974, "step": 5938 }, { "epoch": 0.45636140135218195, "grad_norm": 3.240793466567993, "learning_rate": 1.8174554394591272e-05, "loss": 1.3285, "step": 5940 }, { "epoch": 0.45651505838967427, "grad_norm": 3.475192070007324, "learning_rate": 1.8173939766441305e-05, "loss": 1.384, "step": 5942 }, { "epoch": 0.4566687154271666, "grad_norm": 3.198943853378296, "learning_rate": 1.8173325138291335e-05, "loss": 1.4785, "step": 5944 }, { "epoch": 0.4568223724646589, "grad_norm": 3.021594524383545, "learning_rate": 1.8172710510141368e-05, "loss": 1.3124, "step": 5946 }, { "epoch": 0.4569760295021512, "grad_norm": 3.0245521068573, "learning_rate": 1.8172095881991398e-05, "loss": 1.439, "step": 5948 }, { "epoch": 0.45712968653964353, "grad_norm": 3.3448681831359863, "learning_rate": 1.8171481253841427e-05, "loss": 1.401, "step": 5950 }, { "epoch": 0.45728334357713585, "grad_norm": 3.0003669261932373, "learning_rate": 1.817086662569146e-05, "loss": 1.4555, "step": 5952 }, { "epoch": 0.45743700061462816, "grad_norm": 3.3074393272399902, "learning_rate": 1.8170251997541487e-05, "loss": 1.3984, "step": 5954 }, { "epoch": 0.4575906576521205, "grad_norm": 3.2424259185791016, "learning_rate": 1.816963736939152e-05, "loss": 1.3531, "step": 5956 }, { "epoch": 0.4577443146896128, "grad_norm": 3.414992094039917, "learning_rate": 1.816902274124155e-05, "loss": 1.4555, "step": 5958 }, { "epoch": 0.4578979717271051, "grad_norm": 3.0615060329437256, "learning_rate": 1.816840811309158e-05, "loss": 1.5183, "step": 5960 }, { "epoch": 0.45805162876459743, "grad_norm": 3.127685070037842, "learning_rate": 1.8167793484941612e-05, "loss": 1.4996, "step": 5962 }, { "epoch": 0.45820528580208975, "grad_norm": 2.8747687339782715, "learning_rate": 1.8167178856791642e-05, "loss": 1.363, "step": 5964 }, { "epoch": 0.45835894283958206, "grad_norm": 3.215275764465332, "learning_rate": 1.8166564228641675e-05, "loss": 1.32, "step": 5966 }, { "epoch": 0.4585125998770744, "grad_norm": 2.8502352237701416, "learning_rate": 1.8165949600491705e-05, "loss": 1.3669, "step": 5968 }, { "epoch": 0.4586662569145667, "grad_norm": 3.226792812347412, "learning_rate": 1.8165334972341734e-05, "loss": 1.4084, "step": 5970 }, { "epoch": 0.458819913952059, "grad_norm": 2.946282386779785, "learning_rate": 1.8164720344191767e-05, "loss": 1.3186, "step": 5972 }, { "epoch": 0.4589735709895513, "grad_norm": 2.6620097160339355, "learning_rate": 1.8164105716041797e-05, "loss": 1.219, "step": 5974 }, { "epoch": 0.45912722802704364, "grad_norm": 3.052964925765991, "learning_rate": 1.8163491087891827e-05, "loss": 1.2451, "step": 5976 }, { "epoch": 0.45928088506453596, "grad_norm": 2.9971115589141846, "learning_rate": 1.816287645974186e-05, "loss": 1.3349, "step": 5978 }, { "epoch": 0.4594345421020283, "grad_norm": 2.765615701675415, "learning_rate": 1.8162261831591886e-05, "loss": 1.3993, "step": 5980 }, { "epoch": 0.4595881991395206, "grad_norm": 3.110050916671753, "learning_rate": 1.816164720344192e-05, "loss": 1.3793, "step": 5982 }, { "epoch": 0.4597418561770129, "grad_norm": 3.004164934158325, "learning_rate": 1.816103257529195e-05, "loss": 1.3538, "step": 5984 }, { "epoch": 0.4598955132145052, "grad_norm": 2.9276111125946045, "learning_rate": 1.8160417947141982e-05, "loss": 1.4343, "step": 5986 }, { "epoch": 0.46004917025199754, "grad_norm": 3.2313802242279053, "learning_rate": 1.815980331899201e-05, "loss": 1.4701, "step": 5988 }, { "epoch": 0.46020282728948986, "grad_norm": 3.9364774227142334, "learning_rate": 1.815918869084204e-05, "loss": 1.4805, "step": 5990 }, { "epoch": 0.4603564843269822, "grad_norm": 3.274184465408325, "learning_rate": 1.8158574062692074e-05, "loss": 1.3289, "step": 5992 }, { "epoch": 0.4605101413644745, "grad_norm": 3.1217334270477295, "learning_rate": 1.8157959434542104e-05, "loss": 1.4466, "step": 5994 }, { "epoch": 0.4606637984019668, "grad_norm": 2.6427552700042725, "learning_rate": 1.8157344806392134e-05, "loss": 1.3293, "step": 5996 }, { "epoch": 0.4608174554394591, "grad_norm": 3.3423068523406982, "learning_rate": 1.8156730178242167e-05, "loss": 1.3001, "step": 5998 }, { "epoch": 0.46097111247695144, "grad_norm": 3.421719551086426, "learning_rate": 1.8156115550092193e-05, "loss": 1.2599, "step": 6000 }, { "epoch": 0.46112476951444376, "grad_norm": 2.9063069820404053, "learning_rate": 1.8155500921942226e-05, "loss": 1.2448, "step": 6002 }, { "epoch": 0.46127842655193607, "grad_norm": 3.389843225479126, "learning_rate": 1.815488629379226e-05, "loss": 1.465, "step": 6004 }, { "epoch": 0.4614320835894284, "grad_norm": 3.1748673915863037, "learning_rate": 1.8154271665642286e-05, "loss": 1.442, "step": 6006 }, { "epoch": 0.4615857406269207, "grad_norm": 3.1274852752685547, "learning_rate": 1.815365703749232e-05, "loss": 1.3625, "step": 6008 }, { "epoch": 0.461739397664413, "grad_norm": 2.8795769214630127, "learning_rate": 1.815304240934235e-05, "loss": 1.3289, "step": 6010 }, { "epoch": 0.46189305470190534, "grad_norm": 2.991797924041748, "learning_rate": 1.815242778119238e-05, "loss": 1.3398, "step": 6012 }, { "epoch": 0.46204671173939765, "grad_norm": 2.745926856994629, "learning_rate": 1.815181315304241e-05, "loss": 1.3411, "step": 6014 }, { "epoch": 0.46220036877688997, "grad_norm": 3.4233834743499756, "learning_rate": 1.815119852489244e-05, "loss": 1.476, "step": 6016 }, { "epoch": 0.4623540258143823, "grad_norm": 3.179094076156616, "learning_rate": 1.8150583896742474e-05, "loss": 1.3088, "step": 6018 }, { "epoch": 0.4625076828518746, "grad_norm": 3.251293897628784, "learning_rate": 1.8149969268592504e-05, "loss": 1.3724, "step": 6020 }, { "epoch": 0.4626613398893669, "grad_norm": 3.3762452602386475, "learning_rate": 1.8149354640442533e-05, "loss": 1.4215, "step": 6022 }, { "epoch": 0.46281499692685923, "grad_norm": 2.5178112983703613, "learning_rate": 1.8148740012292566e-05, "loss": 1.2476, "step": 6024 }, { "epoch": 0.46296865396435155, "grad_norm": 3.1534478664398193, "learning_rate": 1.8148125384142593e-05, "loss": 1.2818, "step": 6026 }, { "epoch": 0.46312231100184387, "grad_norm": 2.7992680072784424, "learning_rate": 1.8147510755992626e-05, "loss": 1.3746, "step": 6028 }, { "epoch": 0.4632759680393362, "grad_norm": 3.016869068145752, "learning_rate": 1.8146896127842655e-05, "loss": 1.1759, "step": 6030 }, { "epoch": 0.4634296250768285, "grad_norm": 3.1666297912597656, "learning_rate": 1.814628149969269e-05, "loss": 1.2996, "step": 6032 }, { "epoch": 0.4635832821143208, "grad_norm": 3.2005155086517334, "learning_rate": 1.8145666871542718e-05, "loss": 1.354, "step": 6034 }, { "epoch": 0.46373693915181313, "grad_norm": 2.9398839473724365, "learning_rate": 1.8145052243392748e-05, "loss": 1.3371, "step": 6036 }, { "epoch": 0.46389059618930545, "grad_norm": 2.675481081008911, "learning_rate": 1.814443761524278e-05, "loss": 1.4086, "step": 6038 }, { "epoch": 0.46404425322679776, "grad_norm": 2.6939587593078613, "learning_rate": 1.814382298709281e-05, "loss": 1.4703, "step": 6040 }, { "epoch": 0.4641979102642901, "grad_norm": 3.3314614295959473, "learning_rate": 1.814320835894284e-05, "loss": 1.3802, "step": 6042 }, { "epoch": 0.4643515673017824, "grad_norm": 2.7407829761505127, "learning_rate": 1.8142593730792873e-05, "loss": 1.2443, "step": 6044 }, { "epoch": 0.4645052243392747, "grad_norm": 3.5611183643341064, "learning_rate": 1.8141979102642903e-05, "loss": 1.3102, "step": 6046 }, { "epoch": 0.46465888137676703, "grad_norm": 3.135925054550171, "learning_rate": 1.8141364474492933e-05, "loss": 1.2848, "step": 6048 }, { "epoch": 0.46481253841425935, "grad_norm": 2.807861566543579, "learning_rate": 1.8140749846342966e-05, "loss": 1.498, "step": 6050 }, { "epoch": 0.46496619545175166, "grad_norm": 2.9039602279663086, "learning_rate": 1.8140135218192995e-05, "loss": 1.3336, "step": 6052 }, { "epoch": 0.46511985248924403, "grad_norm": 3.3446543216705322, "learning_rate": 1.8139520590043025e-05, "loss": 1.3786, "step": 6054 }, { "epoch": 0.46527350952673635, "grad_norm": 3.125364065170288, "learning_rate": 1.8138905961893055e-05, "loss": 1.3462, "step": 6056 }, { "epoch": 0.46542716656422867, "grad_norm": 3.407083511352539, "learning_rate": 1.8138291333743088e-05, "loss": 1.4261, "step": 6058 }, { "epoch": 0.465580823601721, "grad_norm": 2.6646227836608887, "learning_rate": 1.8137676705593118e-05, "loss": 1.2944, "step": 6060 }, { "epoch": 0.4657344806392133, "grad_norm": 3.1010098457336426, "learning_rate": 1.8137062077443147e-05, "loss": 1.4409, "step": 6062 }, { "epoch": 0.4658881376767056, "grad_norm": 2.994729518890381, "learning_rate": 1.813644744929318e-05, "loss": 1.2887, "step": 6064 }, { "epoch": 0.46604179471419793, "grad_norm": 2.98126482963562, "learning_rate": 1.813583282114321e-05, "loss": 1.4378, "step": 6066 }, { "epoch": 0.46619545175169025, "grad_norm": 3.0406136512756348, "learning_rate": 1.813521819299324e-05, "loss": 1.4089, "step": 6068 }, { "epoch": 0.46634910878918256, "grad_norm": 2.912886619567871, "learning_rate": 1.8134603564843273e-05, "loss": 1.297, "step": 6070 }, { "epoch": 0.4665027658266749, "grad_norm": 3.1055288314819336, "learning_rate": 1.8133988936693302e-05, "loss": 1.3387, "step": 6072 }, { "epoch": 0.4666564228641672, "grad_norm": 2.698050022125244, "learning_rate": 1.8133374308543332e-05, "loss": 1.3589, "step": 6074 }, { "epoch": 0.4668100799016595, "grad_norm": 3.304744243621826, "learning_rate": 1.8132759680393365e-05, "loss": 1.3344, "step": 6076 }, { "epoch": 0.46696373693915183, "grad_norm": 3.2374093532562256, "learning_rate": 1.8132145052243395e-05, "loss": 1.3627, "step": 6078 }, { "epoch": 0.46711739397664415, "grad_norm": 3.138913631439209, "learning_rate": 1.8131530424093425e-05, "loss": 1.3703, "step": 6080 }, { "epoch": 0.46727105101413646, "grad_norm": 3.517970085144043, "learning_rate": 1.8130915795943454e-05, "loss": 1.3284, "step": 6082 }, { "epoch": 0.4674247080516288, "grad_norm": 2.9830198287963867, "learning_rate": 1.8130301167793487e-05, "loss": 1.3917, "step": 6084 }, { "epoch": 0.4675783650891211, "grad_norm": 2.6390559673309326, "learning_rate": 1.8129686539643517e-05, "loss": 1.3801, "step": 6086 }, { "epoch": 0.4677320221266134, "grad_norm": 3.0859718322753906, "learning_rate": 1.8129071911493547e-05, "loss": 1.4656, "step": 6088 }, { "epoch": 0.46788567916410573, "grad_norm": 2.984755516052246, "learning_rate": 1.812845728334358e-05, "loss": 1.4782, "step": 6090 }, { "epoch": 0.46803933620159804, "grad_norm": 3.045100450515747, "learning_rate": 1.812784265519361e-05, "loss": 1.4161, "step": 6092 }, { "epoch": 0.46819299323909036, "grad_norm": 3.148865222930908, "learning_rate": 1.812722802704364e-05, "loss": 1.3084, "step": 6094 }, { "epoch": 0.4683466502765827, "grad_norm": 3.177959680557251, "learning_rate": 1.8126613398893672e-05, "loss": 1.3234, "step": 6096 }, { "epoch": 0.468500307314075, "grad_norm": 2.8266592025756836, "learning_rate": 1.8125998770743702e-05, "loss": 1.4313, "step": 6098 }, { "epoch": 0.4686539643515673, "grad_norm": 3.3804705142974854, "learning_rate": 1.812538414259373e-05, "loss": 1.3802, "step": 6100 }, { "epoch": 0.4688076213890596, "grad_norm": 3.3395400047302246, "learning_rate": 1.8124769514443765e-05, "loss": 1.2999, "step": 6102 }, { "epoch": 0.46896127842655194, "grad_norm": 2.902346134185791, "learning_rate": 1.8124154886293794e-05, "loss": 1.3411, "step": 6104 }, { "epoch": 0.46911493546404426, "grad_norm": 3.264467477798462, "learning_rate": 1.8123540258143824e-05, "loss": 1.4063, "step": 6106 }, { "epoch": 0.4692685925015366, "grad_norm": 2.926862955093384, "learning_rate": 1.8122925629993854e-05, "loss": 1.2848, "step": 6108 }, { "epoch": 0.4694222495390289, "grad_norm": 3.159416913986206, "learning_rate": 1.8122311001843887e-05, "loss": 1.526, "step": 6110 }, { "epoch": 0.4695759065765212, "grad_norm": 2.464360237121582, "learning_rate": 1.8121696373693916e-05, "loss": 1.2014, "step": 6112 }, { "epoch": 0.4697295636140135, "grad_norm": 3.0088672637939453, "learning_rate": 1.8121081745543946e-05, "loss": 1.4058, "step": 6114 }, { "epoch": 0.46988322065150584, "grad_norm": 2.786822557449341, "learning_rate": 1.812046711739398e-05, "loss": 1.2714, "step": 6116 }, { "epoch": 0.47003687768899816, "grad_norm": 2.730426549911499, "learning_rate": 1.811985248924401e-05, "loss": 1.1898, "step": 6118 }, { "epoch": 0.47019053472649047, "grad_norm": 2.9403066635131836, "learning_rate": 1.811923786109404e-05, "loss": 1.4679, "step": 6120 }, { "epoch": 0.4703441917639828, "grad_norm": 2.8969335556030273, "learning_rate": 1.811862323294407e-05, "loss": 1.3628, "step": 6122 }, { "epoch": 0.4704978488014751, "grad_norm": 3.1321475505828857, "learning_rate": 1.81180086047941e-05, "loss": 1.2599, "step": 6124 }, { "epoch": 0.4706515058389674, "grad_norm": 2.7310781478881836, "learning_rate": 1.811739397664413e-05, "loss": 1.343, "step": 6126 }, { "epoch": 0.47080516287645974, "grad_norm": 3.2368597984313965, "learning_rate": 1.811677934849416e-05, "loss": 1.452, "step": 6128 }, { "epoch": 0.47095881991395205, "grad_norm": 3.1757235527038574, "learning_rate": 1.8116164720344194e-05, "loss": 1.2629, "step": 6130 }, { "epoch": 0.47111247695144437, "grad_norm": 3.524467706680298, "learning_rate": 1.8115550092194223e-05, "loss": 1.3946, "step": 6132 }, { "epoch": 0.4712661339889367, "grad_norm": 2.9439032077789307, "learning_rate": 1.8114935464044253e-05, "loss": 1.3863, "step": 6134 }, { "epoch": 0.471419791026429, "grad_norm": 3.1204442977905273, "learning_rate": 1.8114320835894286e-05, "loss": 1.3305, "step": 6136 }, { "epoch": 0.4715734480639213, "grad_norm": 2.8491554260253906, "learning_rate": 1.8113706207744316e-05, "loss": 1.3718, "step": 6138 }, { "epoch": 0.47172710510141364, "grad_norm": 3.3984553813934326, "learning_rate": 1.8113091579594346e-05, "loss": 1.3786, "step": 6140 }, { "epoch": 0.47188076213890595, "grad_norm": 2.843414545059204, "learning_rate": 1.811247695144438e-05, "loss": 1.4403, "step": 6142 }, { "epoch": 0.47203441917639827, "grad_norm": 3.248155355453491, "learning_rate": 1.811186232329441e-05, "loss": 1.3445, "step": 6144 }, { "epoch": 0.4721880762138906, "grad_norm": 2.920718193054199, "learning_rate": 1.8111247695144438e-05, "loss": 1.4953, "step": 6146 }, { "epoch": 0.4723417332513829, "grad_norm": 2.642434597015381, "learning_rate": 1.811063306699447e-05, "loss": 1.4154, "step": 6148 }, { "epoch": 0.4724953902888752, "grad_norm": 2.8133106231689453, "learning_rate": 1.81100184388445e-05, "loss": 1.2154, "step": 6150 }, { "epoch": 0.47264904732636753, "grad_norm": 2.921673059463501, "learning_rate": 1.810940381069453e-05, "loss": 1.4372, "step": 6152 }, { "epoch": 0.47280270436385985, "grad_norm": 2.9900777339935303, "learning_rate": 1.810878918254456e-05, "loss": 1.3489, "step": 6154 }, { "epoch": 0.47295636140135217, "grad_norm": 3.1354458332061768, "learning_rate": 1.8108174554394593e-05, "loss": 1.3848, "step": 6156 }, { "epoch": 0.4731100184388445, "grad_norm": 2.98395037651062, "learning_rate": 1.8107559926244623e-05, "loss": 1.1782, "step": 6158 }, { "epoch": 0.4732636754763368, "grad_norm": 3.0558385848999023, "learning_rate": 1.8106945298094653e-05, "loss": 1.4362, "step": 6160 }, { "epoch": 0.4734173325138291, "grad_norm": 2.871683120727539, "learning_rate": 1.8106330669944686e-05, "loss": 1.361, "step": 6162 }, { "epoch": 0.47357098955132143, "grad_norm": 3.1952767372131348, "learning_rate": 1.8105716041794715e-05, "loss": 1.371, "step": 6164 }, { "epoch": 0.47372464658881375, "grad_norm": 3.256314754486084, "learning_rate": 1.8105101413644745e-05, "loss": 1.5385, "step": 6166 }, { "epoch": 0.47387830362630606, "grad_norm": 3.1399924755096436, "learning_rate": 1.8104486785494778e-05, "loss": 1.4099, "step": 6168 }, { "epoch": 0.4740319606637984, "grad_norm": 2.872492551803589, "learning_rate": 1.8103872157344808e-05, "loss": 1.3904, "step": 6170 }, { "epoch": 0.4741856177012907, "grad_norm": 3.3130695819854736, "learning_rate": 1.8103257529194837e-05, "loss": 1.4766, "step": 6172 }, { "epoch": 0.474339274738783, "grad_norm": 3.3041231632232666, "learning_rate": 1.810264290104487e-05, "loss": 1.3754, "step": 6174 }, { "epoch": 0.47449293177627533, "grad_norm": 2.94787859916687, "learning_rate": 1.81020282728949e-05, "loss": 1.407, "step": 6176 }, { "epoch": 0.47464658881376764, "grad_norm": 2.9895570278167725, "learning_rate": 1.8101413644744933e-05, "loss": 1.4253, "step": 6178 }, { "epoch": 0.47480024585125996, "grad_norm": 3.096012830734253, "learning_rate": 1.810079901659496e-05, "loss": 1.4598, "step": 6180 }, { "epoch": 0.4749539028887523, "grad_norm": 3.4266176223754883, "learning_rate": 1.8100184388444993e-05, "loss": 1.3014, "step": 6182 }, { "epoch": 0.47510755992624465, "grad_norm": 3.3502449989318848, "learning_rate": 1.8099569760295022e-05, "loss": 1.403, "step": 6184 }, { "epoch": 0.47526121696373697, "grad_norm": 2.791080951690674, "learning_rate": 1.8098955132145052e-05, "loss": 1.3161, "step": 6186 }, { "epoch": 0.4754148740012293, "grad_norm": 3.389315366744995, "learning_rate": 1.8098340503995085e-05, "loss": 1.3105, "step": 6188 }, { "epoch": 0.4755685310387216, "grad_norm": 3.1894092559814453, "learning_rate": 1.8097725875845115e-05, "loss": 1.334, "step": 6190 }, { "epoch": 0.4757221880762139, "grad_norm": 3.219374418258667, "learning_rate": 1.8097111247695144e-05, "loss": 1.2583, "step": 6192 }, { "epoch": 0.47587584511370623, "grad_norm": 2.9629125595092773, "learning_rate": 1.8096496619545178e-05, "loss": 1.472, "step": 6194 }, { "epoch": 0.47602950215119855, "grad_norm": 2.8309929370880127, "learning_rate": 1.8095881991395207e-05, "loss": 1.4128, "step": 6196 }, { "epoch": 0.47618315918869086, "grad_norm": 3.1029868125915527, "learning_rate": 1.809526736324524e-05, "loss": 1.4316, "step": 6198 }, { "epoch": 0.4763368162261832, "grad_norm": 3.910332679748535, "learning_rate": 1.809465273509527e-05, "loss": 1.4264, "step": 6200 }, { "epoch": 0.4764904732636755, "grad_norm": 3.1213109493255615, "learning_rate": 1.80940381069453e-05, "loss": 1.3166, "step": 6202 }, { "epoch": 0.4766441303011678, "grad_norm": 3.3485963344573975, "learning_rate": 1.8093423478795333e-05, "loss": 1.4312, "step": 6204 }, { "epoch": 0.47679778733866013, "grad_norm": 2.8326988220214844, "learning_rate": 1.809280885064536e-05, "loss": 1.3337, "step": 6206 }, { "epoch": 0.47695144437615244, "grad_norm": 3.0213944911956787, "learning_rate": 1.8092194222495392e-05, "loss": 1.3796, "step": 6208 }, { "epoch": 0.47710510141364476, "grad_norm": 3.090485095977783, "learning_rate": 1.8091579594345422e-05, "loss": 1.3794, "step": 6210 }, { "epoch": 0.4772587584511371, "grad_norm": 2.931671619415283, "learning_rate": 1.809096496619545e-05, "loss": 1.4677, "step": 6212 }, { "epoch": 0.4774124154886294, "grad_norm": 3.267240524291992, "learning_rate": 1.8090350338045485e-05, "loss": 1.3751, "step": 6214 }, { "epoch": 0.4775660725261217, "grad_norm": 2.553067207336426, "learning_rate": 1.8089735709895514e-05, "loss": 1.3013, "step": 6216 }, { "epoch": 0.477719729563614, "grad_norm": 3.1998229026794434, "learning_rate": 1.8089121081745547e-05, "loss": 1.3474, "step": 6218 }, { "epoch": 0.47787338660110634, "grad_norm": 11.610280990600586, "learning_rate": 1.8088506453595577e-05, "loss": 1.2955, "step": 6220 }, { "epoch": 0.47802704363859866, "grad_norm": 2.8859567642211914, "learning_rate": 1.8087891825445607e-05, "loss": 1.3495, "step": 6222 }, { "epoch": 0.478180700676091, "grad_norm": 2.9452872276306152, "learning_rate": 1.808727719729564e-05, "loss": 1.276, "step": 6224 }, { "epoch": 0.4783343577135833, "grad_norm": 3.28800892829895, "learning_rate": 1.8086662569145666e-05, "loss": 1.4918, "step": 6226 }, { "epoch": 0.4784880147510756, "grad_norm": 3.066276788711548, "learning_rate": 1.80860479409957e-05, "loss": 1.3771, "step": 6228 }, { "epoch": 0.4786416717885679, "grad_norm": 2.8848133087158203, "learning_rate": 1.8085433312845732e-05, "loss": 1.3354, "step": 6230 }, { "epoch": 0.47879532882606024, "grad_norm": 2.9447460174560547, "learning_rate": 1.808481868469576e-05, "loss": 1.4236, "step": 6232 }, { "epoch": 0.47894898586355256, "grad_norm": 3.061072826385498, "learning_rate": 1.808420405654579e-05, "loss": 1.2652, "step": 6234 }, { "epoch": 0.4791026429010449, "grad_norm": 3.3094377517700195, "learning_rate": 1.808358942839582e-05, "loss": 1.3274, "step": 6236 }, { "epoch": 0.4792562999385372, "grad_norm": 2.868401050567627, "learning_rate": 1.8082974800245854e-05, "loss": 1.361, "step": 6238 }, { "epoch": 0.4794099569760295, "grad_norm": 2.7703821659088135, "learning_rate": 1.8082360172095884e-05, "loss": 1.3177, "step": 6240 }, { "epoch": 0.4795636140135218, "grad_norm": 2.5703728199005127, "learning_rate": 1.8081745543945914e-05, "loss": 1.3978, "step": 6242 }, { "epoch": 0.47971727105101414, "grad_norm": 3.038760185241699, "learning_rate": 1.8081130915795947e-05, "loss": 1.3651, "step": 6244 }, { "epoch": 0.47987092808850645, "grad_norm": 3.6976478099823, "learning_rate": 1.8080516287645976e-05, "loss": 1.4703, "step": 6246 }, { "epoch": 0.48002458512599877, "grad_norm": 3.0694258213043213, "learning_rate": 1.8079901659496006e-05, "loss": 1.3212, "step": 6248 }, { "epoch": 0.4801782421634911, "grad_norm": 3.3709535598754883, "learning_rate": 1.807928703134604e-05, "loss": 1.36, "step": 6250 }, { "epoch": 0.4803318992009834, "grad_norm": 3.294551134109497, "learning_rate": 1.8078672403196065e-05, "loss": 1.5437, "step": 6252 }, { "epoch": 0.4804855562384757, "grad_norm": 3.8295395374298096, "learning_rate": 1.80780577750461e-05, "loss": 1.2844, "step": 6254 }, { "epoch": 0.48063921327596804, "grad_norm": 2.620631217956543, "learning_rate": 1.8077443146896128e-05, "loss": 1.2676, "step": 6256 }, { "epoch": 0.48079287031346035, "grad_norm": 2.7182457447052, "learning_rate": 1.8076828518746158e-05, "loss": 1.4544, "step": 6258 }, { "epoch": 0.48094652735095267, "grad_norm": 3.396521806716919, "learning_rate": 1.807621389059619e-05, "loss": 1.3121, "step": 6260 }, { "epoch": 0.481100184388445, "grad_norm": 2.63437557220459, "learning_rate": 1.807559926244622e-05, "loss": 1.234, "step": 6262 }, { "epoch": 0.4812538414259373, "grad_norm": 2.776506185531616, "learning_rate": 1.8074984634296254e-05, "loss": 1.4487, "step": 6264 }, { "epoch": 0.4814074984634296, "grad_norm": 3.227975845336914, "learning_rate": 1.8074370006146283e-05, "loss": 1.2383, "step": 6266 }, { "epoch": 0.48156115550092193, "grad_norm": 2.9529471397399902, "learning_rate": 1.8073755377996313e-05, "loss": 1.3515, "step": 6268 }, { "epoch": 0.48171481253841425, "grad_norm": 3.3522536754608154, "learning_rate": 1.8073140749846346e-05, "loss": 1.3238, "step": 6270 }, { "epoch": 0.48186846957590657, "grad_norm": 3.3435351848602295, "learning_rate": 1.8072526121696376e-05, "loss": 1.4359, "step": 6272 }, { "epoch": 0.4820221266133989, "grad_norm": 2.8637821674346924, "learning_rate": 1.8071911493546406e-05, "loss": 1.2188, "step": 6274 }, { "epoch": 0.4821757836508912, "grad_norm": 2.939358949661255, "learning_rate": 1.807129686539644e-05, "loss": 1.2918, "step": 6276 }, { "epoch": 0.4823294406883835, "grad_norm": 3.1831789016723633, "learning_rate": 1.8070682237246465e-05, "loss": 1.4176, "step": 6278 }, { "epoch": 0.48248309772587583, "grad_norm": 3.3042044639587402, "learning_rate": 1.8070067609096498e-05, "loss": 1.3979, "step": 6280 }, { "epoch": 0.48263675476336815, "grad_norm": 3.7356269359588623, "learning_rate": 1.8069452980946528e-05, "loss": 1.3986, "step": 6282 }, { "epoch": 0.48279041180086046, "grad_norm": 3.174906015396118, "learning_rate": 1.806883835279656e-05, "loss": 1.2884, "step": 6284 }, { "epoch": 0.4829440688383528, "grad_norm": 2.9945216178894043, "learning_rate": 1.806822372464659e-05, "loss": 1.4439, "step": 6286 }, { "epoch": 0.4830977258758451, "grad_norm": 3.113851308822632, "learning_rate": 1.806760909649662e-05, "loss": 1.3341, "step": 6288 }, { "epoch": 0.4832513829133374, "grad_norm": 3.578928232192993, "learning_rate": 1.8066994468346653e-05, "loss": 1.4566, "step": 6290 }, { "epoch": 0.48340503995082973, "grad_norm": 3.227860450744629, "learning_rate": 1.8066379840196683e-05, "loss": 1.3845, "step": 6292 }, { "epoch": 0.48355869698832205, "grad_norm": 3.249185562133789, "learning_rate": 1.8065765212046713e-05, "loss": 1.3953, "step": 6294 }, { "epoch": 0.48371235402581436, "grad_norm": 2.9868662357330322, "learning_rate": 1.8065150583896746e-05, "loss": 1.2176, "step": 6296 }, { "epoch": 0.4838660110633067, "grad_norm": 2.748054265975952, "learning_rate": 1.8064535955746775e-05, "loss": 1.251, "step": 6298 }, { "epoch": 0.484019668100799, "grad_norm": 3.2431583404541016, "learning_rate": 1.8063921327596805e-05, "loss": 1.3409, "step": 6300 }, { "epoch": 0.4841733251382913, "grad_norm": 3.484886407852173, "learning_rate": 1.8063306699446838e-05, "loss": 1.3708, "step": 6302 }, { "epoch": 0.4843269821757836, "grad_norm": 3.2029964923858643, "learning_rate": 1.8062692071296868e-05, "loss": 1.4806, "step": 6304 }, { "epoch": 0.48448063921327594, "grad_norm": 2.78397274017334, "learning_rate": 1.8062077443146897e-05, "loss": 1.2184, "step": 6306 }, { "epoch": 0.48463429625076826, "grad_norm": 2.9794437885284424, "learning_rate": 1.8061462814996927e-05, "loss": 1.4035, "step": 6308 }, { "epoch": 0.4847879532882606, "grad_norm": 2.6800053119659424, "learning_rate": 1.806084818684696e-05, "loss": 1.3018, "step": 6310 }, { "epoch": 0.4849416103257529, "grad_norm": 2.774409055709839, "learning_rate": 1.806023355869699e-05, "loss": 1.4247, "step": 6312 }, { "epoch": 0.48509526736324526, "grad_norm": 3.3923118114471436, "learning_rate": 1.805961893054702e-05, "loss": 1.1499, "step": 6314 }, { "epoch": 0.4852489244007376, "grad_norm": 2.7925145626068115, "learning_rate": 1.8059004302397053e-05, "loss": 1.2022, "step": 6316 }, { "epoch": 0.4854025814382299, "grad_norm": 3.3250386714935303, "learning_rate": 1.8058389674247082e-05, "loss": 1.3755, "step": 6318 }, { "epoch": 0.4855562384757222, "grad_norm": 3.531944990158081, "learning_rate": 1.8057775046097112e-05, "loss": 1.4049, "step": 6320 }, { "epoch": 0.48570989551321453, "grad_norm": 3.0159146785736084, "learning_rate": 1.8057160417947145e-05, "loss": 1.4002, "step": 6322 }, { "epoch": 0.48586355255070685, "grad_norm": 3.2475900650024414, "learning_rate": 1.8056545789797175e-05, "loss": 1.3624, "step": 6324 }, { "epoch": 0.48601720958819916, "grad_norm": 3.2522149085998535, "learning_rate": 1.8055931161647204e-05, "loss": 1.3565, "step": 6326 }, { "epoch": 0.4861708666256915, "grad_norm": 3.1061174869537354, "learning_rate": 1.8055316533497237e-05, "loss": 1.3549, "step": 6328 }, { "epoch": 0.4863245236631838, "grad_norm": 3.162954330444336, "learning_rate": 1.8054701905347267e-05, "loss": 1.4947, "step": 6330 }, { "epoch": 0.4864781807006761, "grad_norm": 3.179232597351074, "learning_rate": 1.8054087277197297e-05, "loss": 1.32, "step": 6332 }, { "epoch": 0.4866318377381684, "grad_norm": 2.8867878913879395, "learning_rate": 1.8053472649047327e-05, "loss": 1.2925, "step": 6334 }, { "epoch": 0.48678549477566074, "grad_norm": 3.4606575965881348, "learning_rate": 1.805285802089736e-05, "loss": 1.4834, "step": 6336 }, { "epoch": 0.48693915181315306, "grad_norm": 3.118943214416504, "learning_rate": 1.805224339274739e-05, "loss": 1.227, "step": 6338 }, { "epoch": 0.4870928088506454, "grad_norm": 3.1794466972351074, "learning_rate": 1.805162876459742e-05, "loss": 1.4092, "step": 6340 }, { "epoch": 0.4872464658881377, "grad_norm": 2.719515562057495, "learning_rate": 1.8051014136447452e-05, "loss": 1.3534, "step": 6342 }, { "epoch": 0.48740012292563, "grad_norm": 3.8740384578704834, "learning_rate": 1.8050399508297482e-05, "loss": 1.4607, "step": 6344 }, { "epoch": 0.4875537799631223, "grad_norm": 3.121920585632324, "learning_rate": 1.804978488014751e-05, "loss": 1.365, "step": 6346 }, { "epoch": 0.48770743700061464, "grad_norm": 2.8695993423461914, "learning_rate": 1.8049170251997544e-05, "loss": 1.2682, "step": 6348 }, { "epoch": 0.48786109403810696, "grad_norm": 2.9940059185028076, "learning_rate": 1.8048555623847574e-05, "loss": 1.3376, "step": 6350 }, { "epoch": 0.4880147510755993, "grad_norm": 3.0862960815429688, "learning_rate": 1.8047940995697604e-05, "loss": 1.3186, "step": 6352 }, { "epoch": 0.4881684081130916, "grad_norm": 3.2420318126678467, "learning_rate": 1.8047326367547634e-05, "loss": 1.3098, "step": 6354 }, { "epoch": 0.4883220651505839, "grad_norm": 2.9130239486694336, "learning_rate": 1.8046711739397667e-05, "loss": 1.3989, "step": 6356 }, { "epoch": 0.4884757221880762, "grad_norm": 3.1051688194274902, "learning_rate": 1.8046097111247696e-05, "loss": 1.3173, "step": 6358 }, { "epoch": 0.48862937922556854, "grad_norm": 2.9414143562316895, "learning_rate": 1.8045482483097726e-05, "loss": 1.3141, "step": 6360 }, { "epoch": 0.48878303626306085, "grad_norm": 2.926856279373169, "learning_rate": 1.804486785494776e-05, "loss": 1.3659, "step": 6362 }, { "epoch": 0.48893669330055317, "grad_norm": 3.0181140899658203, "learning_rate": 1.804425322679779e-05, "loss": 1.4083, "step": 6364 }, { "epoch": 0.4890903503380455, "grad_norm": 2.822953462600708, "learning_rate": 1.804363859864782e-05, "loss": 1.3084, "step": 6366 }, { "epoch": 0.4892440073755378, "grad_norm": 3.2265994548797607, "learning_rate": 1.804302397049785e-05, "loss": 1.4421, "step": 6368 }, { "epoch": 0.4893976644130301, "grad_norm": 2.932751417160034, "learning_rate": 1.804240934234788e-05, "loss": 1.3666, "step": 6370 }, { "epoch": 0.48955132145052244, "grad_norm": 3.190852403640747, "learning_rate": 1.804179471419791e-05, "loss": 1.4584, "step": 6372 }, { "epoch": 0.48970497848801475, "grad_norm": 2.9671835899353027, "learning_rate": 1.8041180086047944e-05, "loss": 1.4951, "step": 6374 }, { "epoch": 0.48985863552550707, "grad_norm": 3.1517322063446045, "learning_rate": 1.8040565457897974e-05, "loss": 1.4506, "step": 6376 }, { "epoch": 0.4900122925629994, "grad_norm": 3.0485892295837402, "learning_rate": 1.8039950829748003e-05, "loss": 1.3996, "step": 6378 }, { "epoch": 0.4901659496004917, "grad_norm": 2.753948450088501, "learning_rate": 1.8039336201598033e-05, "loss": 1.4296, "step": 6380 }, { "epoch": 0.490319606637984, "grad_norm": 2.8919365406036377, "learning_rate": 1.8038721573448066e-05, "loss": 1.3162, "step": 6382 }, { "epoch": 0.49047326367547633, "grad_norm": 2.782630205154419, "learning_rate": 1.8038106945298096e-05, "loss": 1.3588, "step": 6384 }, { "epoch": 0.49062692071296865, "grad_norm": 2.8316001892089844, "learning_rate": 1.8037492317148125e-05, "loss": 1.4375, "step": 6386 }, { "epoch": 0.49078057775046097, "grad_norm": 2.9996912479400635, "learning_rate": 1.803687768899816e-05, "loss": 1.3141, "step": 6388 }, { "epoch": 0.4909342347879533, "grad_norm": 2.8670809268951416, "learning_rate": 1.8036263060848188e-05, "loss": 1.424, "step": 6390 }, { "epoch": 0.4910878918254456, "grad_norm": 3.0059220790863037, "learning_rate": 1.8035648432698218e-05, "loss": 1.3933, "step": 6392 }, { "epoch": 0.4912415488629379, "grad_norm": 3.1974833011627197, "learning_rate": 1.803503380454825e-05, "loss": 1.3125, "step": 6394 }, { "epoch": 0.49139520590043023, "grad_norm": 2.683246612548828, "learning_rate": 1.803441917639828e-05, "loss": 1.258, "step": 6396 }, { "epoch": 0.49154886293792255, "grad_norm": 3.330538034439087, "learning_rate": 1.803380454824831e-05, "loss": 1.4212, "step": 6398 }, { "epoch": 0.49170251997541486, "grad_norm": 3.116828680038452, "learning_rate": 1.8033189920098343e-05, "loss": 1.3991, "step": 6400 }, { "epoch": 0.4918561770129072, "grad_norm": 3.45032000541687, "learning_rate": 1.8032575291948373e-05, "loss": 1.3591, "step": 6402 }, { "epoch": 0.4920098340503995, "grad_norm": 3.455242872238159, "learning_rate": 1.8031960663798403e-05, "loss": 1.3312, "step": 6404 }, { "epoch": 0.4921634910878918, "grad_norm": 2.821232557296753, "learning_rate": 1.8031346035648432e-05, "loss": 1.2594, "step": 6406 }, { "epoch": 0.49231714812538413, "grad_norm": 3.163733720779419, "learning_rate": 1.8030731407498465e-05, "loss": 1.3575, "step": 6408 }, { "epoch": 0.49247080516287645, "grad_norm": 3.2715537548065186, "learning_rate": 1.8030116779348495e-05, "loss": 1.4909, "step": 6410 }, { "epoch": 0.49262446220036876, "grad_norm": 3.1828911304473877, "learning_rate": 1.8029502151198525e-05, "loss": 1.3965, "step": 6412 }, { "epoch": 0.4927781192378611, "grad_norm": 3.2268998622894287, "learning_rate": 1.8028887523048558e-05, "loss": 1.3785, "step": 6414 }, { "epoch": 0.4929317762753534, "grad_norm": 3.106019973754883, "learning_rate": 1.8028272894898588e-05, "loss": 1.2841, "step": 6416 }, { "epoch": 0.4930854333128457, "grad_norm": 3.220978260040283, "learning_rate": 1.8027658266748617e-05, "loss": 1.5033, "step": 6418 }, { "epoch": 0.493239090350338, "grad_norm": 3.288722276687622, "learning_rate": 1.802704363859865e-05, "loss": 1.3628, "step": 6420 }, { "epoch": 0.49339274738783034, "grad_norm": 3.134910821914673, "learning_rate": 1.802642901044868e-05, "loss": 1.3892, "step": 6422 }, { "epoch": 0.49354640442532266, "grad_norm": 2.868943929672241, "learning_rate": 1.802581438229871e-05, "loss": 1.4807, "step": 6424 }, { "epoch": 0.493700061462815, "grad_norm": 3.257479190826416, "learning_rate": 1.8025199754148743e-05, "loss": 1.3037, "step": 6426 }, { "epoch": 0.4938537185003073, "grad_norm": 2.6661643981933594, "learning_rate": 1.8024585125998772e-05, "loss": 1.3603, "step": 6428 }, { "epoch": 0.4940073755377996, "grad_norm": 3.075345039367676, "learning_rate": 1.8023970497848806e-05, "loss": 1.3357, "step": 6430 }, { "epoch": 0.4941610325752919, "grad_norm": 3.3657965660095215, "learning_rate": 1.8023355869698832e-05, "loss": 1.3358, "step": 6432 }, { "epoch": 0.49431468961278424, "grad_norm": 3.102015495300293, "learning_rate": 1.8022741241548865e-05, "loss": 1.4151, "step": 6434 }, { "epoch": 0.49446834665027656, "grad_norm": 2.722320795059204, "learning_rate": 1.8022126613398895e-05, "loss": 1.3975, "step": 6436 }, { "epoch": 0.4946220036877689, "grad_norm": 3.373051166534424, "learning_rate": 1.8021511985248924e-05, "loss": 1.2497, "step": 6438 }, { "epoch": 0.4947756607252612, "grad_norm": 2.7113919258117676, "learning_rate": 1.8020897357098957e-05, "loss": 1.3809, "step": 6440 }, { "epoch": 0.4949293177627535, "grad_norm": 3.1061747074127197, "learning_rate": 1.8020282728948987e-05, "loss": 1.4414, "step": 6442 }, { "epoch": 0.4950829748002459, "grad_norm": 3.091012477874756, "learning_rate": 1.8019668100799017e-05, "loss": 1.3563, "step": 6444 }, { "epoch": 0.4952366318377382, "grad_norm": 2.845564126968384, "learning_rate": 1.801905347264905e-05, "loss": 1.3844, "step": 6446 }, { "epoch": 0.4953902888752305, "grad_norm": 3.5024054050445557, "learning_rate": 1.801843884449908e-05, "loss": 1.4408, "step": 6448 }, { "epoch": 0.4955439459127228, "grad_norm": 2.915093421936035, "learning_rate": 1.8017824216349113e-05, "loss": 1.2814, "step": 6450 }, { "epoch": 0.49569760295021514, "grad_norm": 3.042811632156372, "learning_rate": 1.801720958819914e-05, "loss": 1.5103, "step": 6452 }, { "epoch": 0.49585125998770746, "grad_norm": 2.851787805557251, "learning_rate": 1.8016594960049172e-05, "loss": 1.4586, "step": 6454 }, { "epoch": 0.4960049170251998, "grad_norm": 3.1319057941436768, "learning_rate": 1.80159803318992e-05, "loss": 1.2987, "step": 6456 }, { "epoch": 0.4961585740626921, "grad_norm": 2.650089740753174, "learning_rate": 1.801536570374923e-05, "loss": 1.3441, "step": 6458 }, { "epoch": 0.4963122311001844, "grad_norm": 3.493260383605957, "learning_rate": 1.8014751075599264e-05, "loss": 1.6013, "step": 6460 }, { "epoch": 0.4964658881376767, "grad_norm": 2.9392683506011963, "learning_rate": 1.8014136447449294e-05, "loss": 1.3622, "step": 6462 }, { "epoch": 0.49661954517516904, "grad_norm": 3.106326103210449, "learning_rate": 1.8013521819299324e-05, "loss": 1.3347, "step": 6464 }, { "epoch": 0.49677320221266136, "grad_norm": 2.914437770843506, "learning_rate": 1.8012907191149357e-05, "loss": 1.2599, "step": 6466 }, { "epoch": 0.4969268592501537, "grad_norm": 2.910841464996338, "learning_rate": 1.8012292562999386e-05, "loss": 1.3622, "step": 6468 }, { "epoch": 0.497080516287646, "grad_norm": 2.89884352684021, "learning_rate": 1.801167793484942e-05, "loss": 1.3582, "step": 6470 }, { "epoch": 0.4972341733251383, "grad_norm": 2.8821210861206055, "learning_rate": 1.801106330669945e-05, "loss": 1.4164, "step": 6472 }, { "epoch": 0.4973878303626306, "grad_norm": 2.601987838745117, "learning_rate": 1.801044867854948e-05, "loss": 1.5184, "step": 6474 }, { "epoch": 0.49754148740012294, "grad_norm": 2.958704710006714, "learning_rate": 1.8009834050399512e-05, "loss": 1.3903, "step": 6476 }, { "epoch": 0.49769514443761526, "grad_norm": 2.9979889392852783, "learning_rate": 1.8009219422249538e-05, "loss": 1.2592, "step": 6478 }, { "epoch": 0.49784880147510757, "grad_norm": 2.916813611984253, "learning_rate": 1.800860479409957e-05, "loss": 1.3302, "step": 6480 }, { "epoch": 0.4980024585125999, "grad_norm": 3.3828582763671875, "learning_rate": 1.80079901659496e-05, "loss": 1.4222, "step": 6482 }, { "epoch": 0.4981561155500922, "grad_norm": 3.1152589321136475, "learning_rate": 1.800737553779963e-05, "loss": 1.3731, "step": 6484 }, { "epoch": 0.4983097725875845, "grad_norm": 2.974968671798706, "learning_rate": 1.8006760909649664e-05, "loss": 1.2744, "step": 6486 }, { "epoch": 0.49846342962507684, "grad_norm": 3.652846336364746, "learning_rate": 1.8006146281499693e-05, "loss": 1.4291, "step": 6488 }, { "epoch": 0.49861708666256915, "grad_norm": 2.827791213989258, "learning_rate": 1.8005531653349727e-05, "loss": 1.2982, "step": 6490 }, { "epoch": 0.49877074370006147, "grad_norm": 3.1473135948181152, "learning_rate": 1.8004917025199756e-05, "loss": 1.3051, "step": 6492 }, { "epoch": 0.4989244007375538, "grad_norm": 3.156839370727539, "learning_rate": 1.8004302397049786e-05, "loss": 1.4366, "step": 6494 }, { "epoch": 0.4990780577750461, "grad_norm": 2.8511626720428467, "learning_rate": 1.800368776889982e-05, "loss": 1.3214, "step": 6496 }, { "epoch": 0.4992317148125384, "grad_norm": 2.7867062091827393, "learning_rate": 1.800307314074985e-05, "loss": 1.3791, "step": 6498 }, { "epoch": 0.49938537185003073, "grad_norm": 3.0567257404327393, "learning_rate": 1.800245851259988e-05, "loss": 1.3549, "step": 6500 }, { "epoch": 0.49953902888752305, "grad_norm": 2.9443788528442383, "learning_rate": 1.800184388444991e-05, "loss": 1.3798, "step": 6502 }, { "epoch": 0.49969268592501537, "grad_norm": 2.903205156326294, "learning_rate": 1.8001229256299938e-05, "loss": 1.3004, "step": 6504 }, { "epoch": 0.4998463429625077, "grad_norm": 3.2189865112304688, "learning_rate": 1.800061462814997e-05, "loss": 1.3023, "step": 6506 }, { "epoch": 0.5, "grad_norm": 3.1087801456451416, "learning_rate": 1.8e-05, "loss": 1.508, "step": 6508 }, { "epoch": 0.5001536570374924, "grad_norm": 3.1676154136657715, "learning_rate": 1.799938537185003e-05, "loss": 1.4198, "step": 6510 }, { "epoch": 0.5003073140749846, "grad_norm": 2.717998504638672, "learning_rate": 1.7998770743700063e-05, "loss": 1.3673, "step": 6512 }, { "epoch": 0.500460971112477, "grad_norm": 2.779644012451172, "learning_rate": 1.7998156115550093e-05, "loss": 1.493, "step": 6514 }, { "epoch": 0.5006146281499693, "grad_norm": 2.706282377243042, "learning_rate": 1.7997541487400126e-05, "loss": 1.2036, "step": 6516 }, { "epoch": 0.5007682851874616, "grad_norm": 2.846618175506592, "learning_rate": 1.7996926859250156e-05, "loss": 1.431, "step": 6518 }, { "epoch": 0.5009219422249539, "grad_norm": 2.9443068504333496, "learning_rate": 1.7996312231100185e-05, "loss": 1.2797, "step": 6520 }, { "epoch": 0.5010755992624463, "grad_norm": 3.471630334854126, "learning_rate": 1.799569760295022e-05, "loss": 1.436, "step": 6522 }, { "epoch": 0.5012292562999385, "grad_norm": 3.2783944606781006, "learning_rate": 1.7995082974800248e-05, "loss": 1.4904, "step": 6524 }, { "epoch": 0.5013829133374309, "grad_norm": 3.51324725151062, "learning_rate": 1.7994468346650278e-05, "loss": 1.3418, "step": 6526 }, { "epoch": 0.5015365703749232, "grad_norm": 2.648725748062134, "learning_rate": 1.799385371850031e-05, "loss": 1.2583, "step": 6528 }, { "epoch": 0.5016902274124155, "grad_norm": 2.8676857948303223, "learning_rate": 1.7993239090350337e-05, "loss": 1.3348, "step": 6530 }, { "epoch": 0.5018438844499078, "grad_norm": 3.1224286556243896, "learning_rate": 1.799262446220037e-05, "loss": 1.3221, "step": 6532 }, { "epoch": 0.5019975414874002, "grad_norm": 3.003473997116089, "learning_rate": 1.79920098340504e-05, "loss": 1.3466, "step": 6534 }, { "epoch": 0.5021511985248924, "grad_norm": 2.8612453937530518, "learning_rate": 1.7991395205900433e-05, "loss": 1.2505, "step": 6536 }, { "epoch": 0.5023048555623848, "grad_norm": 2.9750072956085205, "learning_rate": 1.7990780577750463e-05, "loss": 1.3482, "step": 6538 }, { "epoch": 0.5024585125998771, "grad_norm": 2.6988656520843506, "learning_rate": 1.7990165949600492e-05, "loss": 1.2996, "step": 6540 }, { "epoch": 0.5026121696373694, "grad_norm": 3.0278279781341553, "learning_rate": 1.7989551321450525e-05, "loss": 1.3919, "step": 6542 }, { "epoch": 0.5027658266748617, "grad_norm": 2.9088683128356934, "learning_rate": 1.7988936693300555e-05, "loss": 1.4706, "step": 6544 }, { "epoch": 0.5029194837123541, "grad_norm": 2.8318252563476562, "learning_rate": 1.7988322065150585e-05, "loss": 1.4209, "step": 6546 }, { "epoch": 0.5030731407498463, "grad_norm": 2.994474411010742, "learning_rate": 1.7987707437000618e-05, "loss": 1.3159, "step": 6548 }, { "epoch": 0.5032267977873387, "grad_norm": 2.9801907539367676, "learning_rate": 1.7987092808850644e-05, "loss": 1.2645, "step": 6550 }, { "epoch": 0.503380454824831, "grad_norm": 3.4261326789855957, "learning_rate": 1.7986478180700677e-05, "loss": 1.4416, "step": 6552 }, { "epoch": 0.5035341118623233, "grad_norm": 2.65868878364563, "learning_rate": 1.798586355255071e-05, "loss": 1.2989, "step": 6554 }, { "epoch": 0.5036877688998156, "grad_norm": 3.106370210647583, "learning_rate": 1.798524892440074e-05, "loss": 1.3748, "step": 6556 }, { "epoch": 0.503841425937308, "grad_norm": 2.6410956382751465, "learning_rate": 1.798463429625077e-05, "loss": 1.3152, "step": 6558 }, { "epoch": 0.5039950829748002, "grad_norm": 2.8190648555755615, "learning_rate": 1.79840196681008e-05, "loss": 1.4078, "step": 6560 }, { "epoch": 0.5041487400122926, "grad_norm": 2.9152209758758545, "learning_rate": 1.7983405039950832e-05, "loss": 1.2513, "step": 6562 }, { "epoch": 0.5043023970497849, "grad_norm": 3.070107936859131, "learning_rate": 1.7982790411800862e-05, "loss": 1.4488, "step": 6564 }, { "epoch": 0.5044560540872772, "grad_norm": 2.9198601245880127, "learning_rate": 1.7982175783650892e-05, "loss": 1.2694, "step": 6566 }, { "epoch": 0.5046097111247695, "grad_norm": 3.084261417388916, "learning_rate": 1.7981561155500925e-05, "loss": 1.3638, "step": 6568 }, { "epoch": 0.5047633681622619, "grad_norm": 3.0732178688049316, "learning_rate": 1.7980946527350955e-05, "loss": 1.2477, "step": 6570 }, { "epoch": 0.5049170251997541, "grad_norm": 3.3027431964874268, "learning_rate": 1.7980331899200984e-05, "loss": 1.335, "step": 6572 }, { "epoch": 0.5050706822372465, "grad_norm": 3.3048105239868164, "learning_rate": 1.7979717271051017e-05, "loss": 1.4374, "step": 6574 }, { "epoch": 0.5052243392747388, "grad_norm": 3.292642831802368, "learning_rate": 1.7979102642901047e-05, "loss": 1.353, "step": 6576 }, { "epoch": 0.5053779963122311, "grad_norm": 3.257822036743164, "learning_rate": 1.7978488014751077e-05, "loss": 1.3081, "step": 6578 }, { "epoch": 0.5055316533497234, "grad_norm": 2.796616792678833, "learning_rate": 1.7977873386601106e-05, "loss": 1.3497, "step": 6580 }, { "epoch": 0.5056853103872158, "grad_norm": 3.194371223449707, "learning_rate": 1.797725875845114e-05, "loss": 1.3587, "step": 6582 }, { "epoch": 0.505838967424708, "grad_norm": 2.6798551082611084, "learning_rate": 1.797664413030117e-05, "loss": 1.3413, "step": 6584 }, { "epoch": 0.5059926244622004, "grad_norm": 3.123992681503296, "learning_rate": 1.79760295021512e-05, "loss": 1.2649, "step": 6586 }, { "epoch": 0.5061462814996927, "grad_norm": 3.01867413520813, "learning_rate": 1.7975414874001232e-05, "loss": 1.3729, "step": 6588 }, { "epoch": 0.506299938537185, "grad_norm": 3.5075738430023193, "learning_rate": 1.797480024585126e-05, "loss": 1.3458, "step": 6590 }, { "epoch": 0.5064535955746773, "grad_norm": 3.0404345989227295, "learning_rate": 1.797418561770129e-05, "loss": 1.4686, "step": 6592 }, { "epoch": 0.5066072526121697, "grad_norm": 2.7560205459594727, "learning_rate": 1.7973570989551324e-05, "loss": 1.3492, "step": 6594 }, { "epoch": 0.5067609096496619, "grad_norm": 3.127852439880371, "learning_rate": 1.7972956361401354e-05, "loss": 1.3995, "step": 6596 }, { "epoch": 0.5069145666871543, "grad_norm": 2.963434934616089, "learning_rate": 1.7972341733251384e-05, "loss": 1.3773, "step": 6598 }, { "epoch": 0.5070682237246465, "grad_norm": 3.146491050720215, "learning_rate": 1.7971727105101417e-05, "loss": 1.282, "step": 6600 }, { "epoch": 0.5072218807621389, "grad_norm": 2.8593227863311768, "learning_rate": 1.7971112476951446e-05, "loss": 1.2804, "step": 6602 }, { "epoch": 0.5073755377996312, "grad_norm": 3.011699914932251, "learning_rate": 1.7970497848801476e-05, "loss": 1.3739, "step": 6604 }, { "epoch": 0.5075291948371236, "grad_norm": 3.7797420024871826, "learning_rate": 1.7969883220651506e-05, "loss": 1.4324, "step": 6606 }, { "epoch": 0.5076828518746158, "grad_norm": 2.8261046409606934, "learning_rate": 1.796926859250154e-05, "loss": 1.3562, "step": 6608 }, { "epoch": 0.5078365089121082, "grad_norm": 2.9160585403442383, "learning_rate": 1.796865396435157e-05, "loss": 1.5061, "step": 6610 }, { "epoch": 0.5079901659496004, "grad_norm": 3.1277146339416504, "learning_rate": 1.7968039336201598e-05, "loss": 1.3759, "step": 6612 }, { "epoch": 0.5081438229870928, "grad_norm": 3.4111275672912598, "learning_rate": 1.796742470805163e-05, "loss": 1.3682, "step": 6614 }, { "epoch": 0.5082974800245851, "grad_norm": 3.055345296859741, "learning_rate": 1.796681007990166e-05, "loss": 1.4154, "step": 6616 }, { "epoch": 0.5084511370620775, "grad_norm": 3.3309273719787598, "learning_rate": 1.796619545175169e-05, "loss": 1.3364, "step": 6618 }, { "epoch": 0.5086047940995697, "grad_norm": 2.7727885246276855, "learning_rate": 1.7965580823601724e-05, "loss": 1.2019, "step": 6620 }, { "epoch": 0.5087584511370621, "grad_norm": 3.5600733757019043, "learning_rate": 1.7964966195451753e-05, "loss": 1.4411, "step": 6622 }, { "epoch": 0.5089121081745543, "grad_norm": 3.1073739528656006, "learning_rate": 1.7964351567301783e-05, "loss": 1.4255, "step": 6624 }, { "epoch": 0.5090657652120467, "grad_norm": 2.9542529582977295, "learning_rate": 1.7963736939151816e-05, "loss": 1.2744, "step": 6626 }, { "epoch": 0.509219422249539, "grad_norm": 3.2199599742889404, "learning_rate": 1.7963122311001846e-05, "loss": 1.289, "step": 6628 }, { "epoch": 0.5093730792870313, "grad_norm": 3.095400333404541, "learning_rate": 1.7962507682851876e-05, "loss": 1.3363, "step": 6630 }, { "epoch": 0.5095267363245236, "grad_norm": 3.337625503540039, "learning_rate": 1.7961893054701905e-05, "loss": 1.4096, "step": 6632 }, { "epoch": 0.509680393362016, "grad_norm": 3.3786509037017822, "learning_rate": 1.7961278426551938e-05, "loss": 1.3938, "step": 6634 }, { "epoch": 0.5098340503995082, "grad_norm": 2.652902364730835, "learning_rate": 1.7960663798401968e-05, "loss": 1.2738, "step": 6636 }, { "epoch": 0.5099877074370006, "grad_norm": 3.4588985443115234, "learning_rate": 1.7960049170251998e-05, "loss": 1.2758, "step": 6638 }, { "epoch": 0.510141364474493, "grad_norm": 3.1711127758026123, "learning_rate": 1.795943454210203e-05, "loss": 1.3747, "step": 6640 }, { "epoch": 0.5102950215119852, "grad_norm": 3.4093017578125, "learning_rate": 1.795881991395206e-05, "loss": 1.2861, "step": 6642 }, { "epoch": 0.5104486785494776, "grad_norm": 3.0328543186187744, "learning_rate": 1.795820528580209e-05, "loss": 1.5437, "step": 6644 }, { "epoch": 0.5106023355869699, "grad_norm": 2.8662827014923096, "learning_rate": 1.7957590657652123e-05, "loss": 1.3798, "step": 6646 }, { "epoch": 0.5107559926244623, "grad_norm": 2.6307129859924316, "learning_rate": 1.7956976029502153e-05, "loss": 1.1287, "step": 6648 }, { "epoch": 0.5109096496619545, "grad_norm": 3.1315627098083496, "learning_rate": 1.7956361401352183e-05, "loss": 1.4458, "step": 6650 }, { "epoch": 0.5110633066994469, "grad_norm": 3.4800686836242676, "learning_rate": 1.7955746773202216e-05, "loss": 1.5149, "step": 6652 }, { "epoch": 0.5112169637369391, "grad_norm": 3.0013492107391357, "learning_rate": 1.7955132145052245e-05, "loss": 1.3806, "step": 6654 }, { "epoch": 0.5113706207744315, "grad_norm": 3.450124502182007, "learning_rate": 1.7954517516902275e-05, "loss": 1.4322, "step": 6656 }, { "epoch": 0.5115242778119238, "grad_norm": 2.9068808555603027, "learning_rate": 1.7953902888752305e-05, "loss": 1.3998, "step": 6658 }, { "epoch": 0.5116779348494161, "grad_norm": 2.6469430923461914, "learning_rate": 1.7953288260602338e-05, "loss": 1.4466, "step": 6660 }, { "epoch": 0.5118315918869084, "grad_norm": 2.9947774410247803, "learning_rate": 1.7952673632452367e-05, "loss": 1.3114, "step": 6662 }, { "epoch": 0.5119852489244008, "grad_norm": 3.5101516246795654, "learning_rate": 1.7952059004302397e-05, "loss": 1.3693, "step": 6664 }, { "epoch": 0.512138905961893, "grad_norm": 3.2445108890533447, "learning_rate": 1.795144437615243e-05, "loss": 1.3274, "step": 6666 }, { "epoch": 0.5122925629993854, "grad_norm": 3.050546169281006, "learning_rate": 1.795082974800246e-05, "loss": 1.3715, "step": 6668 }, { "epoch": 0.5124462200368777, "grad_norm": 3.0119810104370117, "learning_rate": 1.795021511985249e-05, "loss": 1.3978, "step": 6670 }, { "epoch": 0.51259987707437, "grad_norm": 3.514958381652832, "learning_rate": 1.7949600491702523e-05, "loss": 1.4204, "step": 6672 }, { "epoch": 0.5127535341118623, "grad_norm": 2.9083685874938965, "learning_rate": 1.7948985863552552e-05, "loss": 1.2959, "step": 6674 }, { "epoch": 0.5129071911493547, "grad_norm": 2.6381547451019287, "learning_rate": 1.7948371235402582e-05, "loss": 1.3359, "step": 6676 }, { "epoch": 0.5130608481868469, "grad_norm": 3.478752374649048, "learning_rate": 1.794775660725261e-05, "loss": 1.4583, "step": 6678 }, { "epoch": 0.5132145052243393, "grad_norm": 2.834408760070801, "learning_rate": 1.7947141979102645e-05, "loss": 1.3429, "step": 6680 }, { "epoch": 0.5133681622618316, "grad_norm": 3.0341265201568604, "learning_rate": 1.7946527350952674e-05, "loss": 1.3716, "step": 6682 }, { "epoch": 0.513521819299324, "grad_norm": 2.8251383304595947, "learning_rate": 1.7945912722802704e-05, "loss": 1.5223, "step": 6684 }, { "epoch": 0.5136754763368162, "grad_norm": 3.265641212463379, "learning_rate": 1.7945298094652737e-05, "loss": 1.269, "step": 6686 }, { "epoch": 0.5138291333743086, "grad_norm": 3.247816562652588, "learning_rate": 1.7944683466502767e-05, "loss": 1.2121, "step": 6688 }, { "epoch": 0.5139827904118008, "grad_norm": 3.2081658840179443, "learning_rate": 1.7944068838352797e-05, "loss": 1.2807, "step": 6690 }, { "epoch": 0.5141364474492932, "grad_norm": 3.1236660480499268, "learning_rate": 1.794345421020283e-05, "loss": 1.5633, "step": 6692 }, { "epoch": 0.5142901044867855, "grad_norm": 2.951101064682007, "learning_rate": 1.794283958205286e-05, "loss": 1.3899, "step": 6694 }, { "epoch": 0.5144437615242778, "grad_norm": 3.1760647296905518, "learning_rate": 1.794222495390289e-05, "loss": 1.2689, "step": 6696 }, { "epoch": 0.5145974185617701, "grad_norm": 3.310821294784546, "learning_rate": 1.7941610325752922e-05, "loss": 1.4301, "step": 6698 }, { "epoch": 0.5147510755992625, "grad_norm": 3.1752665042877197, "learning_rate": 1.7940995697602952e-05, "loss": 1.2948, "step": 6700 }, { "epoch": 0.5149047326367547, "grad_norm": 3.2648708820343018, "learning_rate": 1.7940381069452985e-05, "loss": 1.3254, "step": 6702 }, { "epoch": 0.5150583896742471, "grad_norm": 3.0156726837158203, "learning_rate": 1.793976644130301e-05, "loss": 1.4047, "step": 6704 }, { "epoch": 0.5152120467117394, "grad_norm": 3.873260974884033, "learning_rate": 1.7939151813153044e-05, "loss": 1.4925, "step": 6706 }, { "epoch": 0.5153657037492317, "grad_norm": 3.176854372024536, "learning_rate": 1.7938537185003074e-05, "loss": 1.301, "step": 6708 }, { "epoch": 0.515519360786724, "grad_norm": 2.7950966358184814, "learning_rate": 1.7937922556853104e-05, "loss": 1.3525, "step": 6710 }, { "epoch": 0.5156730178242164, "grad_norm": 3.167912244796753, "learning_rate": 1.7937307928703137e-05, "loss": 1.5551, "step": 6712 }, { "epoch": 0.5158266748617086, "grad_norm": 3.4793012142181396, "learning_rate": 1.7936693300553166e-05, "loss": 1.5019, "step": 6714 }, { "epoch": 0.515980331899201, "grad_norm": 2.9153974056243896, "learning_rate": 1.7936078672403196e-05, "loss": 1.407, "step": 6716 }, { "epoch": 0.5161339889366933, "grad_norm": 2.8722434043884277, "learning_rate": 1.793546404425323e-05, "loss": 1.3882, "step": 6718 }, { "epoch": 0.5162876459741856, "grad_norm": 2.9194371700286865, "learning_rate": 1.793484941610326e-05, "loss": 1.4366, "step": 6720 }, { "epoch": 0.5164413030116779, "grad_norm": 3.2603633403778076, "learning_rate": 1.7934234787953292e-05, "loss": 1.3387, "step": 6722 }, { "epoch": 0.5165949600491703, "grad_norm": 3.0995850563049316, "learning_rate": 1.793362015980332e-05, "loss": 1.3719, "step": 6724 }, { "epoch": 0.5167486170866625, "grad_norm": 2.8780972957611084, "learning_rate": 1.793300553165335e-05, "loss": 1.3721, "step": 6726 }, { "epoch": 0.5169022741241549, "grad_norm": 3.1328043937683105, "learning_rate": 1.7932390903503384e-05, "loss": 1.2831, "step": 6728 }, { "epoch": 0.5170559311616472, "grad_norm": 3.0199432373046875, "learning_rate": 1.793177627535341e-05, "loss": 1.3505, "step": 6730 }, { "epoch": 0.5172095881991395, "grad_norm": 2.7379181385040283, "learning_rate": 1.7931161647203444e-05, "loss": 1.3088, "step": 6732 }, { "epoch": 0.5173632452366318, "grad_norm": 2.929868459701538, "learning_rate": 1.7930547019053473e-05, "loss": 1.3396, "step": 6734 }, { "epoch": 0.5175169022741242, "grad_norm": 2.787508249282837, "learning_rate": 1.7929932390903503e-05, "loss": 1.3644, "step": 6736 }, { "epoch": 0.5176705593116164, "grad_norm": 2.7956087589263916, "learning_rate": 1.7929317762753536e-05, "loss": 1.2359, "step": 6738 }, { "epoch": 0.5178242163491088, "grad_norm": 2.53226900100708, "learning_rate": 1.7928703134603566e-05, "loss": 1.307, "step": 6740 }, { "epoch": 0.5179778733866011, "grad_norm": 2.9932830333709717, "learning_rate": 1.79280885064536e-05, "loss": 1.3173, "step": 6742 }, { "epoch": 0.5181315304240934, "grad_norm": 2.900578260421753, "learning_rate": 1.792747387830363e-05, "loss": 1.4958, "step": 6744 }, { "epoch": 0.5182851874615857, "grad_norm": 3.190718173980713, "learning_rate": 1.7926859250153658e-05, "loss": 1.3313, "step": 6746 }, { "epoch": 0.5184388444990781, "grad_norm": 3.0104870796203613, "learning_rate": 1.792624462200369e-05, "loss": 1.3749, "step": 6748 }, { "epoch": 0.5185925015365703, "grad_norm": 3.335132598876953, "learning_rate": 1.792562999385372e-05, "loss": 1.3441, "step": 6750 }, { "epoch": 0.5187461585740627, "grad_norm": 2.9974780082702637, "learning_rate": 1.792501536570375e-05, "loss": 1.4476, "step": 6752 }, { "epoch": 0.518899815611555, "grad_norm": 2.8957419395446777, "learning_rate": 1.7924400737553784e-05, "loss": 1.3322, "step": 6754 }, { "epoch": 0.5190534726490473, "grad_norm": 3.018171787261963, "learning_rate": 1.792378610940381e-05, "loss": 1.3581, "step": 6756 }, { "epoch": 0.5192071296865396, "grad_norm": 2.8184151649475098, "learning_rate": 1.7923171481253843e-05, "loss": 1.4037, "step": 6758 }, { "epoch": 0.519360786724032, "grad_norm": 2.97739315032959, "learning_rate": 1.7922556853103873e-05, "loss": 1.2547, "step": 6760 }, { "epoch": 0.5195144437615242, "grad_norm": 3.4361817836761475, "learning_rate": 1.7921942224953906e-05, "loss": 1.3367, "step": 6762 }, { "epoch": 0.5196681007990166, "grad_norm": 25.288660049438477, "learning_rate": 1.7921327596803935e-05, "loss": 1.4986, "step": 6764 }, { "epoch": 0.5198217578365089, "grad_norm": 3.100538492202759, "learning_rate": 1.7920712968653965e-05, "loss": 1.3149, "step": 6766 }, { "epoch": 0.5199754148740012, "grad_norm": 2.8900961875915527, "learning_rate": 1.7920098340503998e-05, "loss": 1.352, "step": 6768 }, { "epoch": 0.5201290719114936, "grad_norm": 2.9886999130249023, "learning_rate": 1.7919483712354028e-05, "loss": 1.2732, "step": 6770 }, { "epoch": 0.5202827289489859, "grad_norm": 2.7704336643218994, "learning_rate": 1.7918869084204058e-05, "loss": 1.1976, "step": 6772 }, { "epoch": 0.5204363859864782, "grad_norm": 2.448591709136963, "learning_rate": 1.791825445605409e-05, "loss": 1.2853, "step": 6774 }, { "epoch": 0.5205900430239705, "grad_norm": 2.9498369693756104, "learning_rate": 1.7917639827904117e-05, "loss": 1.306, "step": 6776 }, { "epoch": 0.5207437000614629, "grad_norm": 3.2364501953125, "learning_rate": 1.791702519975415e-05, "loss": 1.379, "step": 6778 }, { "epoch": 0.5208973570989551, "grad_norm": 3.1900076866149902, "learning_rate": 1.791641057160418e-05, "loss": 1.4177, "step": 6780 }, { "epoch": 0.5210510141364475, "grad_norm": 3.0492565631866455, "learning_rate": 1.791579594345421e-05, "loss": 1.4582, "step": 6782 }, { "epoch": 0.5212046711739398, "grad_norm": 2.8004837036132812, "learning_rate": 1.7915181315304242e-05, "loss": 1.4414, "step": 6784 }, { "epoch": 0.5213583282114321, "grad_norm": 2.4796876907348633, "learning_rate": 1.7914566687154272e-05, "loss": 1.388, "step": 6786 }, { "epoch": 0.5215119852489244, "grad_norm": 2.892904281616211, "learning_rate": 1.7913952059004305e-05, "loss": 1.4414, "step": 6788 }, { "epoch": 0.5216656422864168, "grad_norm": 3.042837381362915, "learning_rate": 1.7913337430854335e-05, "loss": 1.4302, "step": 6790 }, { "epoch": 0.521819299323909, "grad_norm": 3.3126227855682373, "learning_rate": 1.7912722802704365e-05, "loss": 1.4458, "step": 6792 }, { "epoch": 0.5219729563614014, "grad_norm": 2.8981101512908936, "learning_rate": 1.7912108174554398e-05, "loss": 1.4403, "step": 6794 }, { "epoch": 0.5221266133988937, "grad_norm": 3.016014337539673, "learning_rate": 1.7911493546404427e-05, "loss": 1.235, "step": 6796 }, { "epoch": 0.522280270436386, "grad_norm": 2.693057060241699, "learning_rate": 1.7910878918254457e-05, "loss": 1.3, "step": 6798 }, { "epoch": 0.5224339274738783, "grad_norm": 2.8503472805023193, "learning_rate": 1.791026429010449e-05, "loss": 1.2262, "step": 6800 }, { "epoch": 0.5225875845113707, "grad_norm": 3.0016205310821533, "learning_rate": 1.7909649661954516e-05, "loss": 1.328, "step": 6802 }, { "epoch": 0.5227412415488629, "grad_norm": 2.7652969360351562, "learning_rate": 1.790903503380455e-05, "loss": 1.1953, "step": 6804 }, { "epoch": 0.5228948985863553, "grad_norm": 3.02223539352417, "learning_rate": 1.790842040565458e-05, "loss": 1.1621, "step": 6806 }, { "epoch": 0.5230485556238476, "grad_norm": 3.1197988986968994, "learning_rate": 1.7907805777504612e-05, "loss": 1.4014, "step": 6808 }, { "epoch": 0.5232022126613399, "grad_norm": 2.959120273590088, "learning_rate": 1.7907191149354642e-05, "loss": 1.4844, "step": 6810 }, { "epoch": 0.5233558696988322, "grad_norm": 3.0649282932281494, "learning_rate": 1.790657652120467e-05, "loss": 1.4225, "step": 6812 }, { "epoch": 0.5235095267363246, "grad_norm": 2.9492135047912598, "learning_rate": 1.7905961893054705e-05, "loss": 1.4206, "step": 6814 }, { "epoch": 0.5236631837738168, "grad_norm": 3.1265509128570557, "learning_rate": 1.7905347264904734e-05, "loss": 1.317, "step": 6816 }, { "epoch": 0.5238168408113092, "grad_norm": 2.831228256225586, "learning_rate": 1.7904732636754764e-05, "loss": 1.4686, "step": 6818 }, { "epoch": 0.5239704978488015, "grad_norm": 3.228123903274536, "learning_rate": 1.7904118008604797e-05, "loss": 1.4721, "step": 6820 }, { "epoch": 0.5241241548862938, "grad_norm": 3.5068886280059814, "learning_rate": 1.7903503380454827e-05, "loss": 1.4167, "step": 6822 }, { "epoch": 0.5242778119237861, "grad_norm": 2.8935916423797607, "learning_rate": 1.7902888752304856e-05, "loss": 1.362, "step": 6824 }, { "epoch": 0.5244314689612785, "grad_norm": 3.246994972229004, "learning_rate": 1.790227412415489e-05, "loss": 1.3156, "step": 6826 }, { "epoch": 0.5245851259987707, "grad_norm": 3.070075035095215, "learning_rate": 1.790165949600492e-05, "loss": 1.3155, "step": 6828 }, { "epoch": 0.5247387830362631, "grad_norm": 3.2165653705596924, "learning_rate": 1.790104486785495e-05, "loss": 1.4158, "step": 6830 }, { "epoch": 0.5248924400737554, "grad_norm": 3.122731924057007, "learning_rate": 1.790043023970498e-05, "loss": 1.312, "step": 6832 }, { "epoch": 0.5250460971112477, "grad_norm": 2.7937536239624023, "learning_rate": 1.789981561155501e-05, "loss": 1.4255, "step": 6834 }, { "epoch": 0.52519975414874, "grad_norm": 2.8714072704315186, "learning_rate": 1.789920098340504e-05, "loss": 1.3561, "step": 6836 }, { "epoch": 0.5253534111862324, "grad_norm": 3.3876631259918213, "learning_rate": 1.789858635525507e-05, "loss": 1.2653, "step": 6838 }, { "epoch": 0.5255070682237246, "grad_norm": 3.067852258682251, "learning_rate": 1.7897971727105104e-05, "loss": 1.462, "step": 6840 }, { "epoch": 0.525660725261217, "grad_norm": 2.623149871826172, "learning_rate": 1.7897357098955134e-05, "loss": 1.358, "step": 6842 }, { "epoch": 0.5258143822987092, "grad_norm": 3.001631021499634, "learning_rate": 1.7896742470805163e-05, "loss": 1.4515, "step": 6844 }, { "epoch": 0.5259680393362016, "grad_norm": 2.7457780838012695, "learning_rate": 1.7896127842655197e-05, "loss": 1.2601, "step": 6846 }, { "epoch": 0.5261216963736939, "grad_norm": 3.05543851852417, "learning_rate": 1.7895513214505226e-05, "loss": 1.3837, "step": 6848 }, { "epoch": 0.5262753534111863, "grad_norm": 2.7177581787109375, "learning_rate": 1.7894898586355256e-05, "loss": 1.2661, "step": 6850 }, { "epoch": 0.5264290104486785, "grad_norm": 2.9022562503814697, "learning_rate": 1.789428395820529e-05, "loss": 1.3782, "step": 6852 }, { "epoch": 0.5265826674861709, "grad_norm": 3.01700758934021, "learning_rate": 1.789366933005532e-05, "loss": 1.3188, "step": 6854 }, { "epoch": 0.5267363245236631, "grad_norm": 2.9770376682281494, "learning_rate": 1.789305470190535e-05, "loss": 1.4046, "step": 6856 }, { "epoch": 0.5268899815611555, "grad_norm": 3.635504961013794, "learning_rate": 1.7892440073755378e-05, "loss": 1.3997, "step": 6858 }, { "epoch": 0.5270436385986478, "grad_norm": 3.3790907859802246, "learning_rate": 1.789182544560541e-05, "loss": 1.3248, "step": 6860 }, { "epoch": 0.5271972956361402, "grad_norm": 3.0795631408691406, "learning_rate": 1.789121081745544e-05, "loss": 1.2888, "step": 6862 }, { "epoch": 0.5273509526736324, "grad_norm": 2.7538647651672363, "learning_rate": 1.789059618930547e-05, "loss": 1.5236, "step": 6864 }, { "epoch": 0.5275046097111248, "grad_norm": 3.4936375617980957, "learning_rate": 1.7889981561155504e-05, "loss": 1.4455, "step": 6866 }, { "epoch": 0.527658266748617, "grad_norm": 3.7572572231292725, "learning_rate": 1.7889366933005533e-05, "loss": 1.4181, "step": 6868 }, { "epoch": 0.5278119237861094, "grad_norm": 3.407961130142212, "learning_rate": 1.7888752304855563e-05, "loss": 1.3646, "step": 6870 }, { "epoch": 0.5279655808236017, "grad_norm": 2.7877607345581055, "learning_rate": 1.7888137676705596e-05, "loss": 1.259, "step": 6872 }, { "epoch": 0.528119237861094, "grad_norm": 2.999338150024414, "learning_rate": 1.7887523048555626e-05, "loss": 1.3156, "step": 6874 }, { "epoch": 0.5282728948985863, "grad_norm": 3.449953079223633, "learning_rate": 1.7886908420405655e-05, "loss": 1.4194, "step": 6876 }, { "epoch": 0.5284265519360787, "grad_norm": 3.0166075229644775, "learning_rate": 1.788629379225569e-05, "loss": 1.3544, "step": 6878 }, { "epoch": 0.5285802089735709, "grad_norm": 2.9438271522521973, "learning_rate": 1.7885679164105718e-05, "loss": 1.4669, "step": 6880 }, { "epoch": 0.5287338660110633, "grad_norm": 2.558922529220581, "learning_rate": 1.7885064535955748e-05, "loss": 1.2715, "step": 6882 }, { "epoch": 0.5288875230485556, "grad_norm": 2.997262954711914, "learning_rate": 1.7884449907805777e-05, "loss": 1.3229, "step": 6884 }, { "epoch": 0.529041180086048, "grad_norm": 2.7784371376037598, "learning_rate": 1.788383527965581e-05, "loss": 1.2801, "step": 6886 }, { "epoch": 0.5291948371235402, "grad_norm": 3.280125379562378, "learning_rate": 1.788322065150584e-05, "loss": 1.3505, "step": 6888 }, { "epoch": 0.5293484941610326, "grad_norm": 3.9201979637145996, "learning_rate": 1.788260602335587e-05, "loss": 1.3235, "step": 6890 }, { "epoch": 0.5295021511985248, "grad_norm": 3.0474581718444824, "learning_rate": 1.7881991395205903e-05, "loss": 1.3269, "step": 6892 }, { "epoch": 0.5296558082360172, "grad_norm": 2.956028938293457, "learning_rate": 1.7881376767055933e-05, "loss": 1.2897, "step": 6894 }, { "epoch": 0.5298094652735095, "grad_norm": 3.143683433532715, "learning_rate": 1.7880762138905962e-05, "loss": 1.3913, "step": 6896 }, { "epoch": 0.5299631223110018, "grad_norm": 2.942600727081299, "learning_rate": 1.7880147510755995e-05, "loss": 1.2993, "step": 6898 }, { "epoch": 0.5301167793484942, "grad_norm": 2.7133402824401855, "learning_rate": 1.7879532882606025e-05, "loss": 1.2878, "step": 6900 }, { "epoch": 0.5302704363859865, "grad_norm": 2.7319624423980713, "learning_rate": 1.7878918254456055e-05, "loss": 1.2711, "step": 6902 }, { "epoch": 0.5304240934234788, "grad_norm": 3.0787789821624756, "learning_rate": 1.7878303626306084e-05, "loss": 1.4048, "step": 6904 }, { "epoch": 0.5305777504609711, "grad_norm": 3.218059539794922, "learning_rate": 1.7877688998156118e-05, "loss": 1.346, "step": 6906 }, { "epoch": 0.5307314074984635, "grad_norm": 3.2375054359436035, "learning_rate": 1.7877074370006147e-05, "loss": 1.3984, "step": 6908 }, { "epoch": 0.5308850645359557, "grad_norm": 2.9513885974884033, "learning_rate": 1.7876459741856177e-05, "loss": 1.4062, "step": 6910 }, { "epoch": 0.5310387215734481, "grad_norm": 3.1400296688079834, "learning_rate": 1.787584511370621e-05, "loss": 1.4292, "step": 6912 }, { "epoch": 0.5311923786109404, "grad_norm": 3.114333152770996, "learning_rate": 1.787523048555624e-05, "loss": 1.5008, "step": 6914 }, { "epoch": 0.5313460356484327, "grad_norm": 2.8262038230895996, "learning_rate": 1.787461585740627e-05, "loss": 1.4744, "step": 6916 }, { "epoch": 0.531499692685925, "grad_norm": 3.2163949012756348, "learning_rate": 1.7874001229256302e-05, "loss": 1.3959, "step": 6918 }, { "epoch": 0.5316533497234174, "grad_norm": 2.7985939979553223, "learning_rate": 1.7873386601106332e-05, "loss": 1.4022, "step": 6920 }, { "epoch": 0.5318070067609096, "grad_norm": 2.731464147567749, "learning_rate": 1.7872771972956362e-05, "loss": 1.3438, "step": 6922 }, { "epoch": 0.531960663798402, "grad_norm": 3.187049627304077, "learning_rate": 1.7872157344806395e-05, "loss": 1.4009, "step": 6924 }, { "epoch": 0.5321143208358943, "grad_norm": 2.9336483478546143, "learning_rate": 1.7871542716656425e-05, "loss": 1.412, "step": 6926 }, { "epoch": 0.5322679778733866, "grad_norm": 3.132073402404785, "learning_rate": 1.7870928088506454e-05, "loss": 1.3688, "step": 6928 }, { "epoch": 0.5324216349108789, "grad_norm": 2.981800079345703, "learning_rate": 1.7870313460356484e-05, "loss": 1.3054, "step": 6930 }, { "epoch": 0.5325752919483713, "grad_norm": 3.3410542011260986, "learning_rate": 1.7869698832206517e-05, "loss": 1.3939, "step": 6932 }, { "epoch": 0.5327289489858635, "grad_norm": 2.8164236545562744, "learning_rate": 1.7869084204056547e-05, "loss": 1.3269, "step": 6934 }, { "epoch": 0.5328826060233559, "grad_norm": 3.8438332080841064, "learning_rate": 1.7868469575906576e-05, "loss": 1.3612, "step": 6936 }, { "epoch": 0.5330362630608482, "grad_norm": 2.7539196014404297, "learning_rate": 1.786785494775661e-05, "loss": 1.351, "step": 6938 }, { "epoch": 0.5331899200983405, "grad_norm": 3.0499660968780518, "learning_rate": 1.786724031960664e-05, "loss": 1.4173, "step": 6940 }, { "epoch": 0.5333435771358328, "grad_norm": 3.459529399871826, "learning_rate": 1.786662569145667e-05, "loss": 1.4837, "step": 6942 }, { "epoch": 0.5334972341733252, "grad_norm": 3.2329859733581543, "learning_rate": 1.7866011063306702e-05, "loss": 1.4251, "step": 6944 }, { "epoch": 0.5336508912108174, "grad_norm": 3.210688352584839, "learning_rate": 1.786539643515673e-05, "loss": 1.3004, "step": 6946 }, { "epoch": 0.5338045482483098, "grad_norm": 3.564739227294922, "learning_rate": 1.786478180700676e-05, "loss": 1.443, "step": 6948 }, { "epoch": 0.5339582052858021, "grad_norm": 2.8052964210510254, "learning_rate": 1.7864167178856794e-05, "loss": 1.2934, "step": 6950 }, { "epoch": 0.5341118623232944, "grad_norm": 3.189671039581299, "learning_rate": 1.7863552550706824e-05, "loss": 1.3459, "step": 6952 }, { "epoch": 0.5342655193607867, "grad_norm": 3.1696484088897705, "learning_rate": 1.7862937922556857e-05, "loss": 1.4234, "step": 6954 }, { "epoch": 0.5344191763982791, "grad_norm": 2.973815679550171, "learning_rate": 1.7862323294406883e-05, "loss": 1.2077, "step": 6956 }, { "epoch": 0.5345728334357713, "grad_norm": 3.508512258529663, "learning_rate": 1.7861708666256916e-05, "loss": 1.3832, "step": 6958 }, { "epoch": 0.5347264904732637, "grad_norm": 2.8029847145080566, "learning_rate": 1.7861094038106946e-05, "loss": 1.3648, "step": 6960 }, { "epoch": 0.534880147510756, "grad_norm": 3.183760404586792, "learning_rate": 1.7860479409956976e-05, "loss": 1.4012, "step": 6962 }, { "epoch": 0.5350338045482483, "grad_norm": 3.181417942047119, "learning_rate": 1.785986478180701e-05, "loss": 1.3642, "step": 6964 }, { "epoch": 0.5351874615857406, "grad_norm": 3.075376033782959, "learning_rate": 1.785925015365704e-05, "loss": 1.4268, "step": 6966 }, { "epoch": 0.535341118623233, "grad_norm": 3.069875478744507, "learning_rate": 1.7858635525507068e-05, "loss": 1.4777, "step": 6968 }, { "epoch": 0.5354947756607252, "grad_norm": 3.1056337356567383, "learning_rate": 1.78580208973571e-05, "loss": 1.4871, "step": 6970 }, { "epoch": 0.5356484326982176, "grad_norm": 3.0267107486724854, "learning_rate": 1.785740626920713e-05, "loss": 1.4342, "step": 6972 }, { "epoch": 0.5358020897357099, "grad_norm": 3.082707643508911, "learning_rate": 1.7856791641057164e-05, "loss": 1.4585, "step": 6974 }, { "epoch": 0.5359557467732022, "grad_norm": 3.154895544052124, "learning_rate": 1.7856177012907194e-05, "loss": 1.4232, "step": 6976 }, { "epoch": 0.5361094038106945, "grad_norm": 2.978407621383667, "learning_rate": 1.7855562384757223e-05, "loss": 1.3511, "step": 6978 }, { "epoch": 0.5362630608481869, "grad_norm": 2.940761089324951, "learning_rate": 1.7854947756607257e-05, "loss": 1.372, "step": 6980 }, { "epoch": 0.5364167178856791, "grad_norm": 2.9799039363861084, "learning_rate": 1.7854333128457283e-05, "loss": 1.5043, "step": 6982 }, { "epoch": 0.5365703749231715, "grad_norm": 2.7917842864990234, "learning_rate": 1.7853718500307316e-05, "loss": 1.2237, "step": 6984 }, { "epoch": 0.5367240319606638, "grad_norm": 2.8482396602630615, "learning_rate": 1.7853103872157346e-05, "loss": 1.4475, "step": 6986 }, { "epoch": 0.5368776889981561, "grad_norm": 3.9226222038269043, "learning_rate": 1.7852489244007375e-05, "loss": 1.3577, "step": 6988 }, { "epoch": 0.5370313460356484, "grad_norm": 2.828793525695801, "learning_rate": 1.785187461585741e-05, "loss": 1.3318, "step": 6990 }, { "epoch": 0.5371850030731408, "grad_norm": 2.9199106693267822, "learning_rate": 1.7851259987707438e-05, "loss": 1.3467, "step": 6992 }, { "epoch": 0.537338660110633, "grad_norm": 2.835475444793701, "learning_rate": 1.785064535955747e-05, "loss": 1.3634, "step": 6994 }, { "epoch": 0.5374923171481254, "grad_norm": 3.4660840034484863, "learning_rate": 1.78500307314075e-05, "loss": 1.2317, "step": 6996 }, { "epoch": 0.5376459741856177, "grad_norm": 4.105753421783447, "learning_rate": 1.784941610325753e-05, "loss": 1.3137, "step": 6998 }, { "epoch": 0.53779963122311, "grad_norm": 2.8999927043914795, "learning_rate": 1.7848801475107564e-05, "loss": 1.2568, "step": 7000 }, { "epoch": 0.5379532882606023, "grad_norm": 2.814363718032837, "learning_rate": 1.784818684695759e-05, "loss": 1.4887, "step": 7002 }, { "epoch": 0.5381069452980947, "grad_norm": 2.9154648780822754, "learning_rate": 1.7847572218807623e-05, "loss": 1.1712, "step": 7004 }, { "epoch": 0.5382606023355869, "grad_norm": 3.22001051902771, "learning_rate": 1.7846957590657653e-05, "loss": 1.3, "step": 7006 }, { "epoch": 0.5384142593730793, "grad_norm": 3.288647174835205, "learning_rate": 1.7846342962507682e-05, "loss": 1.3765, "step": 7008 }, { "epoch": 0.5385679164105716, "grad_norm": 3.0418827533721924, "learning_rate": 1.7845728334357715e-05, "loss": 1.454, "step": 7010 }, { "epoch": 0.5387215734480639, "grad_norm": 2.8727383613586426, "learning_rate": 1.7845113706207745e-05, "loss": 1.3323, "step": 7012 }, { "epoch": 0.5388752304855562, "grad_norm": 3.0164737701416016, "learning_rate": 1.7844499078057778e-05, "loss": 1.2651, "step": 7014 }, { "epoch": 0.5390288875230486, "grad_norm": 2.6829867362976074, "learning_rate": 1.7843884449907808e-05, "loss": 1.3604, "step": 7016 }, { "epoch": 0.5391825445605408, "grad_norm": 3.5748116970062256, "learning_rate": 1.7843269821757837e-05, "loss": 1.4137, "step": 7018 }, { "epoch": 0.5393362015980332, "grad_norm": 2.9325039386749268, "learning_rate": 1.784265519360787e-05, "loss": 1.3057, "step": 7020 }, { "epoch": 0.5394898586355255, "grad_norm": 2.7085931301116943, "learning_rate": 1.78420405654579e-05, "loss": 1.1975, "step": 7022 }, { "epoch": 0.5396435156730178, "grad_norm": 2.880764961242676, "learning_rate": 1.784142593730793e-05, "loss": 1.35, "step": 7024 }, { "epoch": 0.5397971727105101, "grad_norm": 3.0631964206695557, "learning_rate": 1.7840811309157963e-05, "loss": 1.2638, "step": 7026 }, { "epoch": 0.5399508297480025, "grad_norm": 4.236493110656738, "learning_rate": 1.784019668100799e-05, "loss": 1.2436, "step": 7028 }, { "epoch": 0.5401044867854948, "grad_norm": 2.839618682861328, "learning_rate": 1.7839582052858022e-05, "loss": 1.3736, "step": 7030 }, { "epoch": 0.5402581438229871, "grad_norm": 3.1497669219970703, "learning_rate": 1.7838967424708052e-05, "loss": 1.4248, "step": 7032 }, { "epoch": 0.5404118008604795, "grad_norm": 2.8350067138671875, "learning_rate": 1.783835279655808e-05, "loss": 1.2533, "step": 7034 }, { "epoch": 0.5405654578979717, "grad_norm": 33.07500457763672, "learning_rate": 1.7837738168408115e-05, "loss": 1.4906, "step": 7036 }, { "epoch": 0.5407191149354641, "grad_norm": 3.066897392272949, "learning_rate": 1.7837123540258144e-05, "loss": 1.4008, "step": 7038 }, { "epoch": 0.5408727719729564, "grad_norm": 3.2352633476257324, "learning_rate": 1.7836508912108178e-05, "loss": 1.3084, "step": 7040 }, { "epoch": 0.5410264290104487, "grad_norm": 3.37577486038208, "learning_rate": 1.7835894283958207e-05, "loss": 1.3395, "step": 7042 }, { "epoch": 0.541180086047941, "grad_norm": 2.8560941219329834, "learning_rate": 1.7835279655808237e-05, "loss": 1.3559, "step": 7044 }, { "epoch": 0.5413337430854334, "grad_norm": 3.0230565071105957, "learning_rate": 1.783466502765827e-05, "loss": 1.3866, "step": 7046 }, { "epoch": 0.5414874001229256, "grad_norm": 2.6992247104644775, "learning_rate": 1.78340503995083e-05, "loss": 1.2888, "step": 7048 }, { "epoch": 0.541641057160418, "grad_norm": 3.213545322418213, "learning_rate": 1.783343577135833e-05, "loss": 1.4569, "step": 7050 }, { "epoch": 0.5417947141979103, "grad_norm": 3.045323610305786, "learning_rate": 1.7832821143208362e-05, "loss": 1.406, "step": 7052 }, { "epoch": 0.5419483712354026, "grad_norm": 3.1526753902435303, "learning_rate": 1.783220651505839e-05, "loss": 1.4462, "step": 7054 }, { "epoch": 0.5421020282728949, "grad_norm": 2.5964386463165283, "learning_rate": 1.7831591886908422e-05, "loss": 1.4212, "step": 7056 }, { "epoch": 0.5422556853103873, "grad_norm": 3.107309341430664, "learning_rate": 1.783097725875845e-05, "loss": 1.2955, "step": 7058 }, { "epoch": 0.5424093423478795, "grad_norm": 3.1672708988189697, "learning_rate": 1.7830362630608485e-05, "loss": 1.282, "step": 7060 }, { "epoch": 0.5425629993853719, "grad_norm": 3.1407082080841064, "learning_rate": 1.7829748002458514e-05, "loss": 1.2697, "step": 7062 }, { "epoch": 0.5427166564228642, "grad_norm": 2.588823080062866, "learning_rate": 1.7829133374308544e-05, "loss": 1.2566, "step": 7064 }, { "epoch": 0.5428703134603565, "grad_norm": 2.709662914276123, "learning_rate": 1.7828518746158577e-05, "loss": 1.3653, "step": 7066 }, { "epoch": 0.5430239704978488, "grad_norm": 2.963088274002075, "learning_rate": 1.7827904118008607e-05, "loss": 1.3215, "step": 7068 }, { "epoch": 0.5431776275353412, "grad_norm": 3.0335934162139893, "learning_rate": 1.7827289489858636e-05, "loss": 1.4173, "step": 7070 }, { "epoch": 0.5433312845728334, "grad_norm": 3.156770944595337, "learning_rate": 1.782667486170867e-05, "loss": 1.3679, "step": 7072 }, { "epoch": 0.5434849416103258, "grad_norm": 3.3931353092193604, "learning_rate": 1.78260602335587e-05, "loss": 1.4272, "step": 7074 }, { "epoch": 0.543638598647818, "grad_norm": 2.5168139934539795, "learning_rate": 1.782544560540873e-05, "loss": 1.3466, "step": 7076 }, { "epoch": 0.5437922556853104, "grad_norm": 2.82444429397583, "learning_rate": 1.7824830977258762e-05, "loss": 1.3579, "step": 7078 }, { "epoch": 0.5439459127228027, "grad_norm": 3.1184284687042236, "learning_rate": 1.782421634910879e-05, "loss": 1.4413, "step": 7080 }, { "epoch": 0.544099569760295, "grad_norm": 3.5062296390533447, "learning_rate": 1.782360172095882e-05, "loss": 1.4573, "step": 7082 }, { "epoch": 0.5442532267977873, "grad_norm": 3.173992395401001, "learning_rate": 1.782298709280885e-05, "loss": 1.3502, "step": 7084 }, { "epoch": 0.5444068838352797, "grad_norm": 2.8979501724243164, "learning_rate": 1.7822372464658884e-05, "loss": 1.4154, "step": 7086 }, { "epoch": 0.544560540872772, "grad_norm": 3.195020914077759, "learning_rate": 1.7821757836508914e-05, "loss": 1.408, "step": 7088 }, { "epoch": 0.5447141979102643, "grad_norm": 2.883970022201538, "learning_rate": 1.7821143208358943e-05, "loss": 1.3705, "step": 7090 }, { "epoch": 0.5448678549477566, "grad_norm": 3.1312525272369385, "learning_rate": 1.7820528580208976e-05, "loss": 1.3827, "step": 7092 }, { "epoch": 0.545021511985249, "grad_norm": 2.855011224746704, "learning_rate": 1.7819913952059006e-05, "loss": 1.1705, "step": 7094 }, { "epoch": 0.5451751690227412, "grad_norm": 2.7898807525634766, "learning_rate": 1.7819299323909036e-05, "loss": 1.2252, "step": 7096 }, { "epoch": 0.5453288260602336, "grad_norm": 3.0114316940307617, "learning_rate": 1.781868469575907e-05, "loss": 1.3664, "step": 7098 }, { "epoch": 0.5454824830977258, "grad_norm": 3.1177165508270264, "learning_rate": 1.78180700676091e-05, "loss": 1.4363, "step": 7100 }, { "epoch": 0.5456361401352182, "grad_norm": 3.265270471572876, "learning_rate": 1.7817455439459128e-05, "loss": 1.4604, "step": 7102 }, { "epoch": 0.5457897971727105, "grad_norm": 2.9632492065429688, "learning_rate": 1.7816840811309158e-05, "loss": 1.4497, "step": 7104 }, { "epoch": 0.5459434542102028, "grad_norm": 2.721161127090454, "learning_rate": 1.781622618315919e-05, "loss": 1.3612, "step": 7106 }, { "epoch": 0.5460971112476951, "grad_norm": 2.503848075866699, "learning_rate": 1.781561155500922e-05, "loss": 1.2765, "step": 7108 }, { "epoch": 0.5462507682851875, "grad_norm": 3.5437700748443604, "learning_rate": 1.781499692685925e-05, "loss": 1.3824, "step": 7110 }, { "epoch": 0.5464044253226797, "grad_norm": 2.859767436981201, "learning_rate": 1.7814382298709283e-05, "loss": 1.3918, "step": 7112 }, { "epoch": 0.5465580823601721, "grad_norm": 2.664762496948242, "learning_rate": 1.7813767670559313e-05, "loss": 1.3302, "step": 7114 }, { "epoch": 0.5467117393976644, "grad_norm": 2.8141744136810303, "learning_rate": 1.7813153042409343e-05, "loss": 1.2234, "step": 7116 }, { "epoch": 0.5468653964351567, "grad_norm": 3.569658041000366, "learning_rate": 1.7812538414259376e-05, "loss": 1.3897, "step": 7118 }, { "epoch": 0.547019053472649, "grad_norm": 2.765789747238159, "learning_rate": 1.7811923786109406e-05, "loss": 1.3527, "step": 7120 }, { "epoch": 0.5471727105101414, "grad_norm": 3.1543784141540527, "learning_rate": 1.7811309157959435e-05, "loss": 1.3532, "step": 7122 }, { "epoch": 0.5473263675476336, "grad_norm": 3.019174337387085, "learning_rate": 1.7810694529809468e-05, "loss": 1.3695, "step": 7124 }, { "epoch": 0.547480024585126, "grad_norm": 3.103487968444824, "learning_rate": 1.7810079901659498e-05, "loss": 1.3873, "step": 7126 }, { "epoch": 0.5476336816226183, "grad_norm": 2.756798505783081, "learning_rate": 1.7809465273509528e-05, "loss": 1.2963, "step": 7128 }, { "epoch": 0.5477873386601106, "grad_norm": 3.0003297328948975, "learning_rate": 1.7808850645359557e-05, "loss": 1.3152, "step": 7130 }, { "epoch": 0.5479409956976029, "grad_norm": 2.684039831161499, "learning_rate": 1.780823601720959e-05, "loss": 1.4226, "step": 7132 }, { "epoch": 0.5480946527350953, "grad_norm": 2.9724183082580566, "learning_rate": 1.780762138905962e-05, "loss": 1.4356, "step": 7134 }, { "epoch": 0.5482483097725875, "grad_norm": 2.8482649326324463, "learning_rate": 1.780700676090965e-05, "loss": 1.1933, "step": 7136 }, { "epoch": 0.5484019668100799, "grad_norm": 3.3711745738983154, "learning_rate": 1.7806392132759683e-05, "loss": 1.3696, "step": 7138 }, { "epoch": 0.5485556238475722, "grad_norm": 3.0896215438842773, "learning_rate": 1.7805777504609713e-05, "loss": 1.4287, "step": 7140 }, { "epoch": 0.5487092808850645, "grad_norm": 3.777125120162964, "learning_rate": 1.7805162876459742e-05, "loss": 1.3143, "step": 7142 }, { "epoch": 0.5488629379225568, "grad_norm": 3.1980645656585693, "learning_rate": 1.7804548248309775e-05, "loss": 1.3388, "step": 7144 }, { "epoch": 0.5490165949600492, "grad_norm": 3.0263702869415283, "learning_rate": 1.7803933620159805e-05, "loss": 1.2497, "step": 7146 }, { "epoch": 0.5491702519975414, "grad_norm": 3.010733127593994, "learning_rate": 1.7803318992009835e-05, "loss": 1.1814, "step": 7148 }, { "epoch": 0.5493239090350338, "grad_norm": 2.8031928539276123, "learning_rate": 1.7802704363859868e-05, "loss": 1.251, "step": 7150 }, { "epoch": 0.5494775660725261, "grad_norm": 3.2427356243133545, "learning_rate": 1.7802089735709897e-05, "loss": 1.3079, "step": 7152 }, { "epoch": 0.5496312231100184, "grad_norm": 3.0464084148406982, "learning_rate": 1.7801475107559927e-05, "loss": 1.3989, "step": 7154 }, { "epoch": 0.5497848801475107, "grad_norm": 2.847355604171753, "learning_rate": 1.7800860479409957e-05, "loss": 1.3637, "step": 7156 }, { "epoch": 0.5499385371850031, "grad_norm": 3.1915597915649414, "learning_rate": 1.780024585125999e-05, "loss": 1.3317, "step": 7158 }, { "epoch": 0.5500921942224954, "grad_norm": 3.0718259811401367, "learning_rate": 1.779963122311002e-05, "loss": 1.3284, "step": 7160 }, { "epoch": 0.5502458512599877, "grad_norm": 3.015608310699463, "learning_rate": 1.779901659496005e-05, "loss": 1.3458, "step": 7162 }, { "epoch": 0.5503995082974801, "grad_norm": 3.1199285984039307, "learning_rate": 1.7798401966810082e-05, "loss": 1.3407, "step": 7164 }, { "epoch": 0.5505531653349723, "grad_norm": 3.2667245864868164, "learning_rate": 1.7797787338660112e-05, "loss": 1.3999, "step": 7166 }, { "epoch": 0.5507068223724647, "grad_norm": 2.711768865585327, "learning_rate": 1.779717271051014e-05, "loss": 1.2374, "step": 7168 }, { "epoch": 0.550860479409957, "grad_norm": 2.933833599090576, "learning_rate": 1.7796558082360175e-05, "loss": 1.2441, "step": 7170 }, { "epoch": 0.5510141364474493, "grad_norm": 2.5497310161590576, "learning_rate": 1.7795943454210204e-05, "loss": 1.3304, "step": 7172 }, { "epoch": 0.5511677934849416, "grad_norm": 2.6509580612182617, "learning_rate": 1.7795328826060234e-05, "loss": 1.3488, "step": 7174 }, { "epoch": 0.551321450522434, "grad_norm": 2.720896005630493, "learning_rate": 1.7794714197910267e-05, "loss": 1.2818, "step": 7176 }, { "epoch": 0.5514751075599262, "grad_norm": 3.513991355895996, "learning_rate": 1.7794099569760297e-05, "loss": 1.2761, "step": 7178 }, { "epoch": 0.5516287645974186, "grad_norm": 3.5131587982177734, "learning_rate": 1.7793484941610327e-05, "loss": 1.4, "step": 7180 }, { "epoch": 0.5517824216349109, "grad_norm": 3.281924247741699, "learning_rate": 1.7792870313460356e-05, "loss": 1.3619, "step": 7182 }, { "epoch": 0.5519360786724032, "grad_norm": 2.9389426708221436, "learning_rate": 1.779225568531039e-05, "loss": 1.6219, "step": 7184 }, { "epoch": 0.5520897357098955, "grad_norm": 2.482316493988037, "learning_rate": 1.779164105716042e-05, "loss": 1.2586, "step": 7186 }, { "epoch": 0.5522433927473879, "grad_norm": 3.0932509899139404, "learning_rate": 1.779102642901045e-05, "loss": 1.3397, "step": 7188 }, { "epoch": 0.5523970497848801, "grad_norm": 3.1350340843200684, "learning_rate": 1.7790411800860482e-05, "loss": 1.255, "step": 7190 }, { "epoch": 0.5525507068223725, "grad_norm": 2.5633490085601807, "learning_rate": 1.778979717271051e-05, "loss": 1.1744, "step": 7192 }, { "epoch": 0.5527043638598648, "grad_norm": 2.803712844848633, "learning_rate": 1.778918254456054e-05, "loss": 1.4449, "step": 7194 }, { "epoch": 0.5528580208973571, "grad_norm": 2.5207178592681885, "learning_rate": 1.7788567916410574e-05, "loss": 1.4056, "step": 7196 }, { "epoch": 0.5530116779348494, "grad_norm": 2.7499046325683594, "learning_rate": 1.7787953288260604e-05, "loss": 1.2398, "step": 7198 }, { "epoch": 0.5531653349723418, "grad_norm": 3.0889878273010254, "learning_rate": 1.7787338660110634e-05, "loss": 1.4251, "step": 7200 }, { "epoch": 0.553318992009834, "grad_norm": 2.5837948322296143, "learning_rate": 1.7786724031960667e-05, "loss": 1.345, "step": 7202 }, { "epoch": 0.5534726490473264, "grad_norm": 3.2794241905212402, "learning_rate": 1.7786109403810696e-05, "loss": 1.4219, "step": 7204 }, { "epoch": 0.5536263060848187, "grad_norm": 2.494079828262329, "learning_rate": 1.778549477566073e-05, "loss": 1.2795, "step": 7206 }, { "epoch": 0.553779963122311, "grad_norm": 3.3279178142547607, "learning_rate": 1.7784880147510756e-05, "loss": 1.3046, "step": 7208 }, { "epoch": 0.5539336201598033, "grad_norm": 2.7889404296875, "learning_rate": 1.778426551936079e-05, "loss": 1.2484, "step": 7210 }, { "epoch": 0.5540872771972957, "grad_norm": 2.8925845623016357, "learning_rate": 1.778365089121082e-05, "loss": 1.2489, "step": 7212 }, { "epoch": 0.5542409342347879, "grad_norm": 3.148367166519165, "learning_rate": 1.7783036263060848e-05, "loss": 1.4886, "step": 7214 }, { "epoch": 0.5543945912722803, "grad_norm": 3.505509376525879, "learning_rate": 1.778242163491088e-05, "loss": 1.4575, "step": 7216 }, { "epoch": 0.5545482483097726, "grad_norm": 2.391782760620117, "learning_rate": 1.778180700676091e-05, "loss": 1.2687, "step": 7218 }, { "epoch": 0.5547019053472649, "grad_norm": 2.8791768550872803, "learning_rate": 1.778119237861094e-05, "loss": 1.4009, "step": 7220 }, { "epoch": 0.5548555623847572, "grad_norm": 3.043830633163452, "learning_rate": 1.7780577750460974e-05, "loss": 1.3839, "step": 7222 }, { "epoch": 0.5550092194222496, "grad_norm": 3.7112839221954346, "learning_rate": 1.7779963122311003e-05, "loss": 1.3653, "step": 7224 }, { "epoch": 0.5551628764597418, "grad_norm": 2.8248283863067627, "learning_rate": 1.7779348494161036e-05, "loss": 1.3582, "step": 7226 }, { "epoch": 0.5553165334972342, "grad_norm": 2.891206979751587, "learning_rate": 1.7778733866011063e-05, "loss": 1.2634, "step": 7228 }, { "epoch": 0.5554701905347265, "grad_norm": 2.6088945865631104, "learning_rate": 1.7778119237861096e-05, "loss": 1.4858, "step": 7230 }, { "epoch": 0.5556238475722188, "grad_norm": 3.5376429557800293, "learning_rate": 1.7777504609711125e-05, "loss": 1.3223, "step": 7232 }, { "epoch": 0.5557775046097111, "grad_norm": 2.7143330574035645, "learning_rate": 1.7776889981561155e-05, "loss": 1.2609, "step": 7234 }, { "epoch": 0.5559311616472035, "grad_norm": 2.7578842639923096, "learning_rate": 1.7776275353411188e-05, "loss": 1.3574, "step": 7236 }, { "epoch": 0.5560848186846957, "grad_norm": 2.8860747814178467, "learning_rate": 1.7775660725261218e-05, "loss": 1.2957, "step": 7238 }, { "epoch": 0.5562384757221881, "grad_norm": 3.3761367797851562, "learning_rate": 1.7775046097111248e-05, "loss": 1.396, "step": 7240 }, { "epoch": 0.5563921327596804, "grad_norm": 2.8714826107025146, "learning_rate": 1.777443146896128e-05, "loss": 1.289, "step": 7242 }, { "epoch": 0.5565457897971727, "grad_norm": 2.9392268657684326, "learning_rate": 1.777381684081131e-05, "loss": 1.3034, "step": 7244 }, { "epoch": 0.556699446834665, "grad_norm": 2.9862611293792725, "learning_rate": 1.7773202212661343e-05, "loss": 1.2842, "step": 7246 }, { "epoch": 0.5568531038721574, "grad_norm": 3.157655954360962, "learning_rate": 1.7772587584511373e-05, "loss": 1.3206, "step": 7248 }, { "epoch": 0.5570067609096496, "grad_norm": 2.7796456813812256, "learning_rate": 1.7771972956361403e-05, "loss": 1.3415, "step": 7250 }, { "epoch": 0.557160417947142, "grad_norm": 2.800544023513794, "learning_rate": 1.7771358328211436e-05, "loss": 1.3417, "step": 7252 }, { "epoch": 0.5573140749846343, "grad_norm": 2.797663688659668, "learning_rate": 1.7770743700061462e-05, "loss": 1.2614, "step": 7254 }, { "epoch": 0.5574677320221266, "grad_norm": 2.899327278137207, "learning_rate": 1.7770129071911495e-05, "loss": 1.4148, "step": 7256 }, { "epoch": 0.5576213890596189, "grad_norm": 3.1357805728912354, "learning_rate": 1.7769514443761525e-05, "loss": 1.4304, "step": 7258 }, { "epoch": 0.5577750460971113, "grad_norm": 2.704547643661499, "learning_rate": 1.7768899815611555e-05, "loss": 1.3199, "step": 7260 }, { "epoch": 0.5579287031346035, "grad_norm": 3.0023064613342285, "learning_rate": 1.7768285187461588e-05, "loss": 1.3294, "step": 7262 }, { "epoch": 0.5580823601720959, "grad_norm": 2.6803040504455566, "learning_rate": 1.7767670559311617e-05, "loss": 1.3632, "step": 7264 }, { "epoch": 0.5582360172095882, "grad_norm": 3.395805597305298, "learning_rate": 1.776705593116165e-05, "loss": 1.513, "step": 7266 }, { "epoch": 0.5583896742470805, "grad_norm": 3.188375949859619, "learning_rate": 1.776644130301168e-05, "loss": 1.2284, "step": 7268 }, { "epoch": 0.5585433312845728, "grad_norm": 3.19944167137146, "learning_rate": 1.776582667486171e-05, "loss": 1.5306, "step": 7270 }, { "epoch": 0.5586969883220652, "grad_norm": 3.132715940475464, "learning_rate": 1.7765212046711743e-05, "loss": 1.5381, "step": 7272 }, { "epoch": 0.5588506453595574, "grad_norm": 2.777874231338501, "learning_rate": 1.7764597418561772e-05, "loss": 1.3303, "step": 7274 }, { "epoch": 0.5590043023970498, "grad_norm": 3.418839931488037, "learning_rate": 1.7763982790411802e-05, "loss": 1.3595, "step": 7276 }, { "epoch": 0.559157959434542, "grad_norm": 3.0690410137176514, "learning_rate": 1.7763368162261835e-05, "loss": 1.3164, "step": 7278 }, { "epoch": 0.5593116164720344, "grad_norm": 3.341252326965332, "learning_rate": 1.776275353411186e-05, "loss": 1.4019, "step": 7280 }, { "epoch": 0.5594652735095267, "grad_norm": 2.947756052017212, "learning_rate": 1.7762138905961895e-05, "loss": 1.1799, "step": 7282 }, { "epoch": 0.5596189305470191, "grad_norm": 3.1042239665985107, "learning_rate": 1.7761524277811924e-05, "loss": 1.3689, "step": 7284 }, { "epoch": 0.5597725875845113, "grad_norm": 3.0481724739074707, "learning_rate": 1.7760909649661957e-05, "loss": 1.3483, "step": 7286 }, { "epoch": 0.5599262446220037, "grad_norm": 3.041898250579834, "learning_rate": 1.7760295021511987e-05, "loss": 1.3574, "step": 7288 }, { "epoch": 0.5600799016594961, "grad_norm": 2.627877712249756, "learning_rate": 1.7759680393362017e-05, "loss": 1.3905, "step": 7290 }, { "epoch": 0.5602335586969883, "grad_norm": 2.628779649734497, "learning_rate": 1.775906576521205e-05, "loss": 1.3318, "step": 7292 }, { "epoch": 0.5603872157344807, "grad_norm": 2.788853406906128, "learning_rate": 1.775845113706208e-05, "loss": 1.3654, "step": 7294 }, { "epoch": 0.560540872771973, "grad_norm": 2.898160696029663, "learning_rate": 1.775783650891211e-05, "loss": 1.3316, "step": 7296 }, { "epoch": 0.5606945298094653, "grad_norm": 3.0796875953674316, "learning_rate": 1.7757221880762142e-05, "loss": 1.2504, "step": 7298 }, { "epoch": 0.5608481868469576, "grad_norm": 2.4467694759368896, "learning_rate": 1.7756607252612172e-05, "loss": 1.2919, "step": 7300 } ], "logging_steps": 2, "max_steps": 65080, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.6598612105927066e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }