diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5283 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.11524277811923786, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00015365703749231714, + "grad_norm": 54.03105926513672, + "learning_rate": 1.9999385371850035e-05, + "loss": 5.8486, + "step": 2 + }, + { + "epoch": 0.00030731407498463427, + "grad_norm": 60.26078414916992, + "learning_rate": 1.999877074370006e-05, + "loss": 5.5476, + "step": 4 + }, + { + "epoch": 0.00046097111247695143, + "grad_norm": 172.71730041503906, + "learning_rate": 1.9998156115550094e-05, + "loss": 4.8213, + "step": 6 + }, + { + "epoch": 0.0006146281499692685, + "grad_norm": 80.93025970458984, + "learning_rate": 1.9997541487400124e-05, + "loss": 4.8849, + "step": 8 + }, + { + "epoch": 0.0007682851874615857, + "grad_norm": 118.46244049072266, + "learning_rate": 1.9996926859250153e-05, + "loss": 4.0143, + "step": 10 + }, + { + "epoch": 0.0009219422249539029, + "grad_norm": 58.34870910644531, + "learning_rate": 1.9996312231100187e-05, + "loss": 4.0411, + "step": 12 + }, + { + "epoch": 0.0010755992624462201, + "grad_norm": 35.50437927246094, + "learning_rate": 1.9995697602950216e-05, + "loss": 3.9182, + "step": 14 + }, + { + "epoch": 0.001229256299938537, + "grad_norm": 60.75965118408203, + "learning_rate": 1.999508297480025e-05, + "loss": 3.7527, + "step": 16 + }, + { + "epoch": 0.0013829133374308542, + "grad_norm": 53.328765869140625, + "learning_rate": 1.999446834665028e-05, + "loss": 3.5424, + "step": 18 + }, + { + "epoch": 0.0015365703749231714, + "grad_norm": 40.73623275756836, + "learning_rate": 1.999385371850031e-05, + "loss": 3.7179, + "step": 20 + }, + { + "epoch": 0.0016902274124154886, + "grad_norm": 15.511174201965332, + "learning_rate": 1.9993239090350342e-05, + "loss": 3.3177, + "step": 22 + }, + { + "epoch": 0.0018438844499078057, + "grad_norm": 62.24359130859375, + "learning_rate": 1.9992624462200368e-05, + "loss": 3.5207, + "step": 24 + }, + { + "epoch": 0.001997541487400123, + "grad_norm": 69.20399475097656, + "learning_rate": 1.99920098340504e-05, + "loss": 3.6051, + "step": 26 + }, + { + "epoch": 0.0021511985248924403, + "grad_norm": 39.98881530761719, + "learning_rate": 1.999139520590043e-05, + "loss": 3.3193, + "step": 28 + }, + { + "epoch": 0.0023048555623847574, + "grad_norm": 10.113142013549805, + "learning_rate": 1.999078057775046e-05, + "loss": 3.2187, + "step": 30 + }, + { + "epoch": 0.002458512599877074, + "grad_norm": 28.31175422668457, + "learning_rate": 1.9990165949600494e-05, + "loss": 3.4458, + "step": 32 + }, + { + "epoch": 0.0026121696373693913, + "grad_norm": 21.829612731933594, + "learning_rate": 1.9989551321450523e-05, + "loss": 3.5843, + "step": 34 + }, + { + "epoch": 0.0027658266748617085, + "grad_norm": 18.00796127319336, + "learning_rate": 1.9988936693300556e-05, + "loss": 3.2827, + "step": 36 + }, + { + "epoch": 0.0029194837123540257, + "grad_norm": 16.2840576171875, + "learning_rate": 1.9988322065150586e-05, + "loss": 3.2059, + "step": 38 + }, + { + "epoch": 0.003073140749846343, + "grad_norm": 11.987384796142578, + "learning_rate": 1.9987707437000616e-05, + "loss": 3.1313, + "step": 40 + }, + { + "epoch": 0.00322679778733866, + "grad_norm": 6.873617649078369, + "learning_rate": 1.998709280885065e-05, + "loss": 2.8568, + "step": 42 + }, + { + "epoch": 0.003380454824830977, + "grad_norm": 6.603146076202393, + "learning_rate": 1.998647818070068e-05, + "loss": 3.0781, + "step": 44 + }, + { + "epoch": 0.0035341118623232943, + "grad_norm": 25.308164596557617, + "learning_rate": 1.9985863552550708e-05, + "loss": 3.0764, + "step": 46 + }, + { + "epoch": 0.0036877688998156115, + "grad_norm": 15.176654815673828, + "learning_rate": 1.998524892440074e-05, + "loss": 3.0356, + "step": 48 + }, + { + "epoch": 0.0038414259373079286, + "grad_norm": 7.444390773773193, + "learning_rate": 1.9984634296250767e-05, + "loss": 2.9488, + "step": 50 + }, + { + "epoch": 0.003995082974800246, + "grad_norm": 18.565139770507812, + "learning_rate": 1.99840196681008e-05, + "loss": 2.7179, + "step": 52 + }, + { + "epoch": 0.004148740012292563, + "grad_norm": 10.658416748046875, + "learning_rate": 1.998340503995083e-05, + "loss": 2.716, + "step": 54 + }, + { + "epoch": 0.0043023970497848806, + "grad_norm": 9.682657241821289, + "learning_rate": 1.9982790411800863e-05, + "loss": 2.9189, + "step": 56 + }, + { + "epoch": 0.004456054087277198, + "grad_norm": 20.967639923095703, + "learning_rate": 1.9982175783650893e-05, + "loss": 2.8078, + "step": 58 + }, + { + "epoch": 0.004609711124769515, + "grad_norm": 16.931556701660156, + "learning_rate": 1.9981561155500923e-05, + "loss": 2.837, + "step": 60 + }, + { + "epoch": 0.004763368162261831, + "grad_norm": 12.055686950683594, + "learning_rate": 1.9980946527350956e-05, + "loss": 2.7392, + "step": 62 + }, + { + "epoch": 0.004917025199754148, + "grad_norm": 7.959167957305908, + "learning_rate": 1.9980331899200985e-05, + "loss": 2.8915, + "step": 64 + }, + { + "epoch": 0.0050706822372464655, + "grad_norm": 9.24318790435791, + "learning_rate": 1.9979717271051015e-05, + "loss": 2.5171, + "step": 66 + }, + { + "epoch": 0.005224339274738783, + "grad_norm": 20.02304458618164, + "learning_rate": 1.9979102642901048e-05, + "loss": 2.7947, + "step": 68 + }, + { + "epoch": 0.0053779963122311, + "grad_norm": 8.09688663482666, + "learning_rate": 1.9978488014751078e-05, + "loss": 2.7323, + "step": 70 + }, + { + "epoch": 0.005531653349723417, + "grad_norm": 8.636987686157227, + "learning_rate": 1.9977873386601108e-05, + "loss": 2.6309, + "step": 72 + }, + { + "epoch": 0.005685310387215734, + "grad_norm": 6.815808296203613, + "learning_rate": 1.997725875845114e-05, + "loss": 2.4741, + "step": 74 + }, + { + "epoch": 0.005838967424708051, + "grad_norm": 7.532662868499756, + "learning_rate": 1.997664413030117e-05, + "loss": 2.5875, + "step": 76 + }, + { + "epoch": 0.0059926244622003685, + "grad_norm": 6.733164310455322, + "learning_rate": 1.99760295021512e-05, + "loss": 2.6103, + "step": 78 + }, + { + "epoch": 0.006146281499692686, + "grad_norm": 6.442116737365723, + "learning_rate": 1.997541487400123e-05, + "loss": 2.6208, + "step": 80 + }, + { + "epoch": 0.006299938537185003, + "grad_norm": 6.882765769958496, + "learning_rate": 1.9974800245851263e-05, + "loss": 2.5554, + "step": 82 + }, + { + "epoch": 0.00645359557467732, + "grad_norm": 6.64527702331543, + "learning_rate": 1.9974185617701292e-05, + "loss": 2.5667, + "step": 84 + }, + { + "epoch": 0.006607252612169637, + "grad_norm": 7.69775390625, + "learning_rate": 1.9973570989551322e-05, + "loss": 2.6117, + "step": 86 + }, + { + "epoch": 0.006760909649661954, + "grad_norm": 7.077218532562256, + "learning_rate": 1.9972956361401355e-05, + "loss": 2.4501, + "step": 88 + }, + { + "epoch": 0.0069145666871542714, + "grad_norm": 5.539189338684082, + "learning_rate": 1.9972341733251385e-05, + "loss": 2.4775, + "step": 90 + }, + { + "epoch": 0.007068223724646589, + "grad_norm": 6.602914333343506, + "learning_rate": 1.9971727105101415e-05, + "loss": 2.3944, + "step": 92 + }, + { + "epoch": 0.007221880762138906, + "grad_norm": 5.995626449584961, + "learning_rate": 1.9971112476951448e-05, + "loss": 2.4826, + "step": 94 + }, + { + "epoch": 0.007375537799631223, + "grad_norm": 6.836587429046631, + "learning_rate": 1.9970497848801477e-05, + "loss": 2.468, + "step": 96 + }, + { + "epoch": 0.00752919483712354, + "grad_norm": 6.4697651863098145, + "learning_rate": 1.9969883220651507e-05, + "loss": 2.2833, + "step": 98 + }, + { + "epoch": 0.007682851874615857, + "grad_norm": 8.081903457641602, + "learning_rate": 1.996926859250154e-05, + "loss": 2.6544, + "step": 100 + }, + { + "epoch": 0.007836508912108174, + "grad_norm": 6.688724517822266, + "learning_rate": 1.996865396435157e-05, + "loss": 2.55, + "step": 102 + }, + { + "epoch": 0.007990165949600492, + "grad_norm": 6.878283977508545, + "learning_rate": 1.99680393362016e-05, + "loss": 2.2658, + "step": 104 + }, + { + "epoch": 0.008143822987092809, + "grad_norm": 7.079164505004883, + "learning_rate": 1.996742470805163e-05, + "loss": 2.4793, + "step": 106 + }, + { + "epoch": 0.008297480024585127, + "grad_norm": 6.391737461090088, + "learning_rate": 1.9966810079901662e-05, + "loss": 2.4154, + "step": 108 + }, + { + "epoch": 0.008451137062077443, + "grad_norm": 7.503854274749756, + "learning_rate": 1.9966195451751692e-05, + "loss": 2.3137, + "step": 110 + }, + { + "epoch": 0.008604794099569761, + "grad_norm": 6.10397481918335, + "learning_rate": 1.996558082360172e-05, + "loss": 2.4306, + "step": 112 + }, + { + "epoch": 0.008758451137062077, + "grad_norm": 6.1603264808654785, + "learning_rate": 1.9964966195451755e-05, + "loss": 2.2859, + "step": 114 + }, + { + "epoch": 0.008912108174554395, + "grad_norm": 7.389194011688232, + "learning_rate": 1.9964351567301784e-05, + "loss": 2.3876, + "step": 116 + }, + { + "epoch": 0.009065765212046712, + "grad_norm": 6.887446403503418, + "learning_rate": 1.9963736939151814e-05, + "loss": 2.5139, + "step": 118 + }, + { + "epoch": 0.00921942224953903, + "grad_norm": 7.2416768074035645, + "learning_rate": 1.9963122311001847e-05, + "loss": 2.5653, + "step": 120 + }, + { + "epoch": 0.009373079287031346, + "grad_norm": 7.454037189483643, + "learning_rate": 1.9962507682851877e-05, + "loss": 2.3707, + "step": 122 + }, + { + "epoch": 0.009526736324523662, + "grad_norm": 6.9176483154296875, + "learning_rate": 1.9961893054701906e-05, + "loss": 2.4158, + "step": 124 + }, + { + "epoch": 0.00968039336201598, + "grad_norm": 7.838490009307861, + "learning_rate": 1.9961278426551936e-05, + "loss": 2.45, + "step": 126 + }, + { + "epoch": 0.009834050399508297, + "grad_norm": 6.680061340332031, + "learning_rate": 1.996066379840197e-05, + "loss": 2.1975, + "step": 128 + }, + { + "epoch": 0.009987707437000615, + "grad_norm": 7.567671775817871, + "learning_rate": 1.9960049170252e-05, + "loss": 2.1892, + "step": 130 + }, + { + "epoch": 0.010141364474492931, + "grad_norm": 6.0987396240234375, + "learning_rate": 1.995943454210203e-05, + "loss": 2.2957, + "step": 132 + }, + { + "epoch": 0.010295021511985249, + "grad_norm": 6.579552173614502, + "learning_rate": 1.995881991395206e-05, + "loss": 2.4326, + "step": 134 + }, + { + "epoch": 0.010448678549477565, + "grad_norm": 7.131938934326172, + "learning_rate": 1.995820528580209e-05, + "loss": 2.4767, + "step": 136 + }, + { + "epoch": 0.010602335586969883, + "grad_norm": 6.883522033691406, + "learning_rate": 1.995759065765212e-05, + "loss": 2.3893, + "step": 138 + }, + { + "epoch": 0.0107559926244622, + "grad_norm": 5.52859354019165, + "learning_rate": 1.9956976029502154e-05, + "loss": 2.1851, + "step": 140 + }, + { + "epoch": 0.010909649661954518, + "grad_norm": 6.14478874206543, + "learning_rate": 1.9956361401352184e-05, + "loss": 2.2019, + "step": 142 + }, + { + "epoch": 0.011063306699446834, + "grad_norm": 6.477922439575195, + "learning_rate": 1.9955746773202213e-05, + "loss": 2.2746, + "step": 144 + }, + { + "epoch": 0.011216963736939152, + "grad_norm": 7.661022186279297, + "learning_rate": 1.9955132145052246e-05, + "loss": 2.3499, + "step": 146 + }, + { + "epoch": 0.011370620774431468, + "grad_norm": 7.439324378967285, + "learning_rate": 1.9954517516902276e-05, + "loss": 2.1848, + "step": 148 + }, + { + "epoch": 0.011524277811923786, + "grad_norm": 7.070183753967285, + "learning_rate": 1.9953902888752306e-05, + "loss": 2.2816, + "step": 150 + }, + { + "epoch": 0.011677934849416103, + "grad_norm": 5.912161350250244, + "learning_rate": 1.9953288260602336e-05, + "loss": 2.3688, + "step": 152 + }, + { + "epoch": 0.01183159188690842, + "grad_norm": 6.827462673187256, + "learning_rate": 1.995267363245237e-05, + "loss": 2.3945, + "step": 154 + }, + { + "epoch": 0.011985248924400737, + "grad_norm": 5.7712082862854, + "learning_rate": 1.9952059004302398e-05, + "loss": 2.1618, + "step": 156 + }, + { + "epoch": 0.012138905961893055, + "grad_norm": 5.9169020652771, + "learning_rate": 1.9951444376152428e-05, + "loss": 2.1781, + "step": 158 + }, + { + "epoch": 0.012292562999385371, + "grad_norm": 5.994232177734375, + "learning_rate": 1.995082974800246e-05, + "loss": 2.1474, + "step": 160 + }, + { + "epoch": 0.01244622003687769, + "grad_norm": 6.10550594329834, + "learning_rate": 1.995021511985249e-05, + "loss": 2.2227, + "step": 162 + }, + { + "epoch": 0.012599877074370006, + "grad_norm": 7.107779502868652, + "learning_rate": 1.994960049170252e-05, + "loss": 2.334, + "step": 164 + }, + { + "epoch": 0.012753534111862324, + "grad_norm": 4.990610122680664, + "learning_rate": 1.9948985863552553e-05, + "loss": 2.2313, + "step": 166 + }, + { + "epoch": 0.01290719114935464, + "grad_norm": 8.93641185760498, + "learning_rate": 1.9948371235402583e-05, + "loss": 2.1062, + "step": 168 + }, + { + "epoch": 0.013060848186846958, + "grad_norm": 5.389564037322998, + "learning_rate": 1.9947756607252613e-05, + "loss": 2.1729, + "step": 170 + }, + { + "epoch": 0.013214505224339274, + "grad_norm": 5.347591400146484, + "learning_rate": 1.9947141979102646e-05, + "loss": 2.0474, + "step": 172 + }, + { + "epoch": 0.013368162261831592, + "grad_norm": 6.475700378417969, + "learning_rate": 1.9946527350952676e-05, + "loss": 2.1939, + "step": 174 + }, + { + "epoch": 0.013521819299323909, + "grad_norm": 6.144668102264404, + "learning_rate": 1.9945912722802705e-05, + "loss": 2.217, + "step": 176 + }, + { + "epoch": 0.013675476336816227, + "grad_norm": 6.778875350952148, + "learning_rate": 1.9945298094652735e-05, + "loss": 2.139, + "step": 178 + }, + { + "epoch": 0.013829133374308543, + "grad_norm": 7.560453414916992, + "learning_rate": 1.9944683466502768e-05, + "loss": 2.1931, + "step": 180 + }, + { + "epoch": 0.013982790411800861, + "grad_norm": 5.251035690307617, + "learning_rate": 1.9944068838352798e-05, + "loss": 2.1596, + "step": 182 + }, + { + "epoch": 0.014136447449293177, + "grad_norm": 5.9772162437438965, + "learning_rate": 1.9943454210202827e-05, + "loss": 2.232, + "step": 184 + }, + { + "epoch": 0.014290104486785495, + "grad_norm": 7.088453769683838, + "learning_rate": 1.994283958205286e-05, + "loss": 2.3468, + "step": 186 + }, + { + "epoch": 0.014443761524277812, + "grad_norm": 6.209799289703369, + "learning_rate": 1.994222495390289e-05, + "loss": 2.3158, + "step": 188 + }, + { + "epoch": 0.01459741856177013, + "grad_norm": 6.048709392547607, + "learning_rate": 1.994161032575292e-05, + "loss": 1.9986, + "step": 190 + }, + { + "epoch": 0.014751075599262446, + "grad_norm": 5.292468070983887, + "learning_rate": 1.9940995697602953e-05, + "loss": 2.1564, + "step": 192 + }, + { + "epoch": 0.014904732636754764, + "grad_norm": 6.045801639556885, + "learning_rate": 1.9940381069452983e-05, + "loss": 2.2064, + "step": 194 + }, + { + "epoch": 0.01505838967424708, + "grad_norm": 6.204288482666016, + "learning_rate": 1.9939766441303012e-05, + "loss": 2.2869, + "step": 196 + }, + { + "epoch": 0.015212046711739398, + "grad_norm": 6.579591274261475, + "learning_rate": 1.9939151813153045e-05, + "loss": 2.1494, + "step": 198 + }, + { + "epoch": 0.015365703749231715, + "grad_norm": 6.20919942855835, + "learning_rate": 1.9938537185003075e-05, + "loss": 2.0245, + "step": 200 + }, + { + "epoch": 0.015519360786724033, + "grad_norm": 6.129773139953613, + "learning_rate": 1.9937922556853108e-05, + "loss": 2.0684, + "step": 202 + }, + { + "epoch": 0.01567301782421635, + "grad_norm": 7.500084400177002, + "learning_rate": 1.9937307928703134e-05, + "loss": 2.1818, + "step": 204 + }, + { + "epoch": 0.015826674861708665, + "grad_norm": 6.189898490905762, + "learning_rate": 1.9936693300553167e-05, + "loss": 2.1377, + "step": 206 + }, + { + "epoch": 0.015980331899200985, + "grad_norm": 5.788628101348877, + "learning_rate": 1.9936078672403197e-05, + "loss": 2.1195, + "step": 208 + }, + { + "epoch": 0.0161339889366933, + "grad_norm": 6.9061055183410645, + "learning_rate": 1.9935464044253227e-05, + "loss": 2.1815, + "step": 210 + }, + { + "epoch": 0.016287645974185617, + "grad_norm": 7.366201877593994, + "learning_rate": 1.993484941610326e-05, + "loss": 2.2884, + "step": 212 + }, + { + "epoch": 0.016441303011677934, + "grad_norm": 5.979190826416016, + "learning_rate": 1.993423478795329e-05, + "loss": 2.3251, + "step": 214 + }, + { + "epoch": 0.016594960049170254, + "grad_norm": 6.170030117034912, + "learning_rate": 1.993362015980332e-05, + "loss": 2.2108, + "step": 216 + }, + { + "epoch": 0.01674861708666257, + "grad_norm": 6.819857120513916, + "learning_rate": 1.9933005531653352e-05, + "loss": 2.2231, + "step": 218 + }, + { + "epoch": 0.016902274124154886, + "grad_norm": 7.386382579803467, + "learning_rate": 1.9932390903503382e-05, + "loss": 2.1647, + "step": 220 + }, + { + "epoch": 0.017055931161647202, + "grad_norm": 5.797331809997559, + "learning_rate": 1.9931776275353415e-05, + "loss": 2.2092, + "step": 222 + }, + { + "epoch": 0.017209588199139522, + "grad_norm": 5.605097770690918, + "learning_rate": 1.993116164720344e-05, + "loss": 2.2266, + "step": 224 + }, + { + "epoch": 0.01736324523663184, + "grad_norm": 5.865804672241211, + "learning_rate": 1.9930547019053474e-05, + "loss": 2.0874, + "step": 226 + }, + { + "epoch": 0.017516902274124155, + "grad_norm": 7.769106864929199, + "learning_rate": 1.9929932390903508e-05, + "loss": 2.1032, + "step": 228 + }, + { + "epoch": 0.01767055931161647, + "grad_norm": 6.673518180847168, + "learning_rate": 1.9929317762753534e-05, + "loss": 2.0957, + "step": 230 + }, + { + "epoch": 0.01782421634910879, + "grad_norm": 6.331215858459473, + "learning_rate": 1.9928703134603567e-05, + "loss": 2.143, + "step": 232 + }, + { + "epoch": 0.017977873386601107, + "grad_norm": 5.792760848999023, + "learning_rate": 1.9928088506453597e-05, + "loss": 2.0157, + "step": 234 + }, + { + "epoch": 0.018131530424093423, + "grad_norm": 6.460434436798096, + "learning_rate": 1.9927473878303626e-05, + "loss": 2.0018, + "step": 236 + }, + { + "epoch": 0.01828518746158574, + "grad_norm": 6.339091777801514, + "learning_rate": 1.992685925015366e-05, + "loss": 2.2635, + "step": 238 + }, + { + "epoch": 0.01843884449907806, + "grad_norm": 5.446582317352295, + "learning_rate": 1.992624462200369e-05, + "loss": 2.036, + "step": 240 + }, + { + "epoch": 0.018592501536570376, + "grad_norm": 6.4099273681640625, + "learning_rate": 1.9925629993853722e-05, + "loss": 2.0031, + "step": 242 + }, + { + "epoch": 0.018746158574062692, + "grad_norm": 7.307748794555664, + "learning_rate": 1.9925015365703752e-05, + "loss": 2.0746, + "step": 244 + }, + { + "epoch": 0.01889981561155501, + "grad_norm": 5.755754470825195, + "learning_rate": 1.992440073755378e-05, + "loss": 2.1756, + "step": 246 + }, + { + "epoch": 0.019053472649047325, + "grad_norm": 5.9470744132995605, + "learning_rate": 1.9923786109403815e-05, + "loss": 2.1579, + "step": 248 + }, + { + "epoch": 0.019207129686539644, + "grad_norm": 5.4200873374938965, + "learning_rate": 1.992317148125384e-05, + "loss": 2.1868, + "step": 250 + }, + { + "epoch": 0.01936078672403196, + "grad_norm": 6.8247175216674805, + "learning_rate": 1.9922556853103874e-05, + "loss": 2.1525, + "step": 252 + }, + { + "epoch": 0.019514443761524277, + "grad_norm": 6.334802627563477, + "learning_rate": 1.9921942224953904e-05, + "loss": 2.1261, + "step": 254 + }, + { + "epoch": 0.019668100799016593, + "grad_norm": 7.025927543640137, + "learning_rate": 1.9921327596803933e-05, + "loss": 2.2474, + "step": 256 + }, + { + "epoch": 0.019821757836508913, + "grad_norm": 6.594686508178711, + "learning_rate": 1.9920712968653966e-05, + "loss": 1.9885, + "step": 258 + }, + { + "epoch": 0.01997541487400123, + "grad_norm": 6.713582992553711, + "learning_rate": 1.9920098340503996e-05, + "loss": 2.3728, + "step": 260 + }, + { + "epoch": 0.020129071911493546, + "grad_norm": 5.78023099899292, + "learning_rate": 1.9919483712354026e-05, + "loss": 2.0887, + "step": 262 + }, + { + "epoch": 0.020282728948985862, + "grad_norm": 5.462549686431885, + "learning_rate": 1.991886908420406e-05, + "loss": 2.0673, + "step": 264 + }, + { + "epoch": 0.020436385986478182, + "grad_norm": 6.792922019958496, + "learning_rate": 1.991825445605409e-05, + "loss": 2.177, + "step": 266 + }, + { + "epoch": 0.020590043023970498, + "grad_norm": 6.281880855560303, + "learning_rate": 1.991763982790412e-05, + "loss": 1.9686, + "step": 268 + }, + { + "epoch": 0.020743700061462814, + "grad_norm": 5.745354175567627, + "learning_rate": 1.991702519975415e-05, + "loss": 2.1414, + "step": 270 + }, + { + "epoch": 0.02089735709895513, + "grad_norm": 6.046512126922607, + "learning_rate": 1.991641057160418e-05, + "loss": 2.1541, + "step": 272 + }, + { + "epoch": 0.02105101413644745, + "grad_norm": 7.513150691986084, + "learning_rate": 1.9915795943454214e-05, + "loss": 2.1383, + "step": 274 + }, + { + "epoch": 0.021204671173939767, + "grad_norm": 8.351797103881836, + "learning_rate": 1.991518131530424e-05, + "loss": 2.209, + "step": 276 + }, + { + "epoch": 0.021358328211432083, + "grad_norm": 6.781789302825928, + "learning_rate": 1.9914566687154273e-05, + "loss": 1.9494, + "step": 278 + }, + { + "epoch": 0.0215119852489244, + "grad_norm": 5.912288188934326, + "learning_rate": 1.9913952059004303e-05, + "loss": 1.9871, + "step": 280 + }, + { + "epoch": 0.02166564228641672, + "grad_norm": 5.441234111785889, + "learning_rate": 1.9913337430854333e-05, + "loss": 2.118, + "step": 282 + }, + { + "epoch": 0.021819299323909035, + "grad_norm": 6.041057109832764, + "learning_rate": 1.9912722802704366e-05, + "loss": 2.0064, + "step": 284 + }, + { + "epoch": 0.02197295636140135, + "grad_norm": 6.26601505279541, + "learning_rate": 1.9912108174554395e-05, + "loss": 1.9593, + "step": 286 + }, + { + "epoch": 0.022126613398893668, + "grad_norm": 6.992424488067627, + "learning_rate": 1.991149354640443e-05, + "loss": 2.1785, + "step": 288 + }, + { + "epoch": 0.022280270436385988, + "grad_norm": 7.048946857452393, + "learning_rate": 1.9910878918254458e-05, + "loss": 2.0809, + "step": 290 + }, + { + "epoch": 0.022433927473878304, + "grad_norm": 7.00367546081543, + "learning_rate": 1.9910264290104488e-05, + "loss": 2.0688, + "step": 292 + }, + { + "epoch": 0.02258758451137062, + "grad_norm": 6.326030731201172, + "learning_rate": 1.990964966195452e-05, + "loss": 2.1279, + "step": 294 + }, + { + "epoch": 0.022741241548862937, + "grad_norm": 5.886343002319336, + "learning_rate": 1.990903503380455e-05, + "loss": 1.9146, + "step": 296 + }, + { + "epoch": 0.022894898586355256, + "grad_norm": 6.407416820526123, + "learning_rate": 1.990842040565458e-05, + "loss": 2.073, + "step": 298 + }, + { + "epoch": 0.023048555623847573, + "grad_norm": 5.35817289352417, + "learning_rate": 1.9907805777504613e-05, + "loss": 2.064, + "step": 300 + }, + { + "epoch": 0.02320221266133989, + "grad_norm": 5.71148157119751, + "learning_rate": 1.990719114935464e-05, + "loss": 2.2207, + "step": 302 + }, + { + "epoch": 0.023355869698832205, + "grad_norm": 7.2422051429748535, + "learning_rate": 1.9906576521204673e-05, + "loss": 2.1518, + "step": 304 + }, + { + "epoch": 0.023509526736324525, + "grad_norm": 7.267468452453613, + "learning_rate": 1.9905961893054702e-05, + "loss": 2.0082, + "step": 306 + }, + { + "epoch": 0.02366318377381684, + "grad_norm": 6.504114627838135, + "learning_rate": 1.9905347264904736e-05, + "loss": 1.9722, + "step": 308 + }, + { + "epoch": 0.023816840811309158, + "grad_norm": 7.074812889099121, + "learning_rate": 1.9904732636754765e-05, + "loss": 2.1789, + "step": 310 + }, + { + "epoch": 0.023970497848801474, + "grad_norm": 6.774876117706299, + "learning_rate": 1.9904118008604795e-05, + "loss": 2.219, + "step": 312 + }, + { + "epoch": 0.024124154886293794, + "grad_norm": 5.666469097137451, + "learning_rate": 1.9903503380454828e-05, + "loss": 1.8294, + "step": 314 + }, + { + "epoch": 0.02427781192378611, + "grad_norm": 6.548127174377441, + "learning_rate": 1.9902888752304858e-05, + "loss": 2.0859, + "step": 316 + }, + { + "epoch": 0.024431468961278426, + "grad_norm": 5.174642562866211, + "learning_rate": 1.9902274124154887e-05, + "loss": 1.989, + "step": 318 + }, + { + "epoch": 0.024585125998770743, + "grad_norm": 5.891490936279297, + "learning_rate": 1.990165949600492e-05, + "loss": 2.0776, + "step": 320 + }, + { + "epoch": 0.024738783036263062, + "grad_norm": 5.7647504806518555, + "learning_rate": 1.9901044867854947e-05, + "loss": 1.9681, + "step": 322 + }, + { + "epoch": 0.02489244007375538, + "grad_norm": 5.61868143081665, + "learning_rate": 1.990043023970498e-05, + "loss": 1.8923, + "step": 324 + }, + { + "epoch": 0.025046097111247695, + "grad_norm": 7.358055114746094, + "learning_rate": 1.9899815611555013e-05, + "loss": 1.9859, + "step": 326 + }, + { + "epoch": 0.02519975414874001, + "grad_norm": 5.265814781188965, + "learning_rate": 1.9899200983405043e-05, + "loss": 1.939, + "step": 328 + }, + { + "epoch": 0.02535341118623233, + "grad_norm": 9.370257377624512, + "learning_rate": 1.9898586355255072e-05, + "loss": 1.9538, + "step": 330 + }, + { + "epoch": 0.025507068223724647, + "grad_norm": 7.504848003387451, + "learning_rate": 1.9897971727105102e-05, + "loss": 2.0802, + "step": 332 + }, + { + "epoch": 0.025660725261216964, + "grad_norm": 5.975841045379639, + "learning_rate": 1.9897357098955135e-05, + "loss": 1.853, + "step": 334 + }, + { + "epoch": 0.02581438229870928, + "grad_norm": 6.099985122680664, + "learning_rate": 1.9896742470805165e-05, + "loss": 2.0014, + "step": 336 + }, + { + "epoch": 0.0259680393362016, + "grad_norm": 6.825030326843262, + "learning_rate": 1.9896127842655194e-05, + "loss": 1.9608, + "step": 338 + }, + { + "epoch": 0.026121696373693916, + "grad_norm": 6.16441535949707, + "learning_rate": 1.9895513214505227e-05, + "loss": 2.0848, + "step": 340 + }, + { + "epoch": 0.026275353411186232, + "grad_norm": 6.392692565917969, + "learning_rate": 1.9894898586355257e-05, + "loss": 1.9651, + "step": 342 + }, + { + "epoch": 0.02642901044867855, + "grad_norm": 5.567882537841797, + "learning_rate": 1.9894283958205287e-05, + "loss": 2.1211, + "step": 344 + }, + { + "epoch": 0.026582667486170868, + "grad_norm": 10.182480812072754, + "learning_rate": 1.989366933005532e-05, + "loss": 1.9924, + "step": 346 + }, + { + "epoch": 0.026736324523663185, + "grad_norm": 5.608663558959961, + "learning_rate": 1.989305470190535e-05, + "loss": 1.936, + "step": 348 + }, + { + "epoch": 0.0268899815611555, + "grad_norm": 5.883683204650879, + "learning_rate": 1.989244007375538e-05, + "loss": 2.0998, + "step": 350 + }, + { + "epoch": 0.027043638598647817, + "grad_norm": 8.584614753723145, + "learning_rate": 1.989182544560541e-05, + "loss": 2.1266, + "step": 352 + }, + { + "epoch": 0.027197295636140137, + "grad_norm": 6.828667640686035, + "learning_rate": 1.9891210817455442e-05, + "loss": 1.8693, + "step": 354 + }, + { + "epoch": 0.027350952673632453, + "grad_norm": 7.0278449058532715, + "learning_rate": 1.989059618930547e-05, + "loss": 1.9258, + "step": 356 + }, + { + "epoch": 0.02750460971112477, + "grad_norm": 5.643075466156006, + "learning_rate": 1.98899815611555e-05, + "loss": 2.071, + "step": 358 + }, + { + "epoch": 0.027658266748617086, + "grad_norm": 6.685908794403076, + "learning_rate": 1.9889366933005534e-05, + "loss": 1.8658, + "step": 360 + }, + { + "epoch": 0.027811923786109402, + "grad_norm": 5.766722679138184, + "learning_rate": 1.9888752304855564e-05, + "loss": 2.0608, + "step": 362 + }, + { + "epoch": 0.027965580823601722, + "grad_norm": 6.229999542236328, + "learning_rate": 1.9888137676705594e-05, + "loss": 1.8478, + "step": 364 + }, + { + "epoch": 0.028119237861094038, + "grad_norm": 14.6449613571167, + "learning_rate": 1.9887523048555627e-05, + "loss": 2.0233, + "step": 366 + }, + { + "epoch": 0.028272894898586354, + "grad_norm": 5.458970069885254, + "learning_rate": 1.9886908420405657e-05, + "loss": 1.8742, + "step": 368 + }, + { + "epoch": 0.02842655193607867, + "grad_norm": 9.708429336547852, + "learning_rate": 1.9886293792255686e-05, + "loss": 2.0435, + "step": 370 + }, + { + "epoch": 0.02858020897357099, + "grad_norm": 8.345685958862305, + "learning_rate": 1.988567916410572e-05, + "loss": 1.9448, + "step": 372 + }, + { + "epoch": 0.028733866011063307, + "grad_norm": 5.213901519775391, + "learning_rate": 1.988506453595575e-05, + "loss": 1.8902, + "step": 374 + }, + { + "epoch": 0.028887523048555623, + "grad_norm": 6.842494964599609, + "learning_rate": 1.988444990780578e-05, + "loss": 2.0958, + "step": 376 + }, + { + "epoch": 0.02904118008604794, + "grad_norm": 6.533809185028076, + "learning_rate": 1.988383527965581e-05, + "loss": 2.0073, + "step": 378 + }, + { + "epoch": 0.02919483712354026, + "grad_norm": 5.832721710205078, + "learning_rate": 1.988322065150584e-05, + "loss": 1.9462, + "step": 380 + }, + { + "epoch": 0.029348494161032575, + "grad_norm": 6.040827751159668, + "learning_rate": 1.988260602335587e-05, + "loss": 2.0111, + "step": 382 + }, + { + "epoch": 0.02950215119852489, + "grad_norm": 6.082043647766113, + "learning_rate": 1.98819913952059e-05, + "loss": 2.0088, + "step": 384 + }, + { + "epoch": 0.029655808236017208, + "grad_norm": 4.5363383293151855, + "learning_rate": 1.9881376767055934e-05, + "loss": 1.9059, + "step": 386 + }, + { + "epoch": 0.029809465273509528, + "grad_norm": 4.769321918487549, + "learning_rate": 1.9880762138905964e-05, + "loss": 1.8781, + "step": 388 + }, + { + "epoch": 0.029963122311001844, + "grad_norm": 6.1424994468688965, + "learning_rate": 1.9880147510755993e-05, + "loss": 2.0232, + "step": 390 + }, + { + "epoch": 0.03011677934849416, + "grad_norm": 6.081544399261475, + "learning_rate": 1.9879532882606026e-05, + "loss": 1.8908, + "step": 392 + }, + { + "epoch": 0.030270436385986477, + "grad_norm": 6.146285057067871, + "learning_rate": 1.9878918254456056e-05, + "loss": 2.0144, + "step": 394 + }, + { + "epoch": 0.030424093423478796, + "grad_norm": 5.401834011077881, + "learning_rate": 1.9878303626306086e-05, + "loss": 1.8258, + "step": 396 + }, + { + "epoch": 0.030577750460971113, + "grad_norm": 6.835007667541504, + "learning_rate": 1.987768899815612e-05, + "loss": 2.0515, + "step": 398 + }, + { + "epoch": 0.03073140749846343, + "grad_norm": 7.031691551208496, + "learning_rate": 1.987707437000615e-05, + "loss": 2.0362, + "step": 400 + }, + { + "epoch": 0.030885064535955745, + "grad_norm": 5.733877182006836, + "learning_rate": 1.9876459741856178e-05, + "loss": 2.099, + "step": 402 + }, + { + "epoch": 0.031038721573448065, + "grad_norm": 6.152698516845703, + "learning_rate": 1.9875845113706208e-05, + "loss": 1.9393, + "step": 404 + }, + { + "epoch": 0.03119237861094038, + "grad_norm": 5.859741687774658, + "learning_rate": 1.987523048555624e-05, + "loss": 1.995, + "step": 406 + }, + { + "epoch": 0.0313460356484327, + "grad_norm": 6.834084510803223, + "learning_rate": 1.987461585740627e-05, + "loss": 2.0035, + "step": 408 + }, + { + "epoch": 0.031499692685925014, + "grad_norm": 6.169229030609131, + "learning_rate": 1.98740012292563e-05, + "loss": 2.1276, + "step": 410 + }, + { + "epoch": 0.03165334972341733, + "grad_norm": 5.270079135894775, + "learning_rate": 1.9873386601106333e-05, + "loss": 2.0073, + "step": 412 + }, + { + "epoch": 0.03180700676090965, + "grad_norm": 5.952144145965576, + "learning_rate": 1.9872771972956363e-05, + "loss": 1.8817, + "step": 414 + }, + { + "epoch": 0.03196066379840197, + "grad_norm": 6.3290019035339355, + "learning_rate": 1.9872157344806393e-05, + "loss": 1.9526, + "step": 416 + }, + { + "epoch": 0.032114320835894286, + "grad_norm": 5.712306499481201, + "learning_rate": 1.9871542716656426e-05, + "loss": 2.0793, + "step": 418 + }, + { + "epoch": 0.0322679778733866, + "grad_norm": 5.497166156768799, + "learning_rate": 1.9870928088506455e-05, + "loss": 1.9061, + "step": 420 + }, + { + "epoch": 0.03242163491087892, + "grad_norm": 6.435750484466553, + "learning_rate": 1.9870313460356485e-05, + "loss": 1.8971, + "step": 422 + }, + { + "epoch": 0.032575291948371235, + "grad_norm": 5.9519734382629395, + "learning_rate": 1.9869698832206518e-05, + "loss": 2.0295, + "step": 424 + }, + { + "epoch": 0.03272894898586355, + "grad_norm": 6.359841823577881, + "learning_rate": 1.9869084204056548e-05, + "loss": 1.9017, + "step": 426 + }, + { + "epoch": 0.03288260602335587, + "grad_norm": 6.195022106170654, + "learning_rate": 1.9868469575906578e-05, + "loss": 2.0663, + "step": 428 + }, + { + "epoch": 0.033036263060848184, + "grad_norm": 5.500522613525391, + "learning_rate": 1.9867854947756607e-05, + "loss": 1.9694, + "step": 430 + }, + { + "epoch": 0.03318992009834051, + "grad_norm": 7.16880464553833, + "learning_rate": 1.986724031960664e-05, + "loss": 1.918, + "step": 432 + }, + { + "epoch": 0.03334357713583282, + "grad_norm": 6.0987348556518555, + "learning_rate": 1.986662569145667e-05, + "loss": 1.8705, + "step": 434 + }, + { + "epoch": 0.03349723417332514, + "grad_norm": 6.8652753829956055, + "learning_rate": 1.98660110633067e-05, + "loss": 1.9383, + "step": 436 + }, + { + "epoch": 0.033650891210817456, + "grad_norm": 5.421166896820068, + "learning_rate": 1.9865396435156733e-05, + "loss": 1.879, + "step": 438 + }, + { + "epoch": 0.03380454824830977, + "grad_norm": 5.929842948913574, + "learning_rate": 1.9864781807006762e-05, + "loss": 1.7183, + "step": 440 + }, + { + "epoch": 0.03395820528580209, + "grad_norm": 5.500015735626221, + "learning_rate": 1.9864167178856792e-05, + "loss": 1.9168, + "step": 442 + }, + { + "epoch": 0.034111862323294405, + "grad_norm": 6.267481327056885, + "learning_rate": 1.9863552550706825e-05, + "loss": 1.8126, + "step": 444 + }, + { + "epoch": 0.03426551936078672, + "grad_norm": 6.300197124481201, + "learning_rate": 1.9862937922556855e-05, + "loss": 2.0519, + "step": 446 + }, + { + "epoch": 0.034419176398279044, + "grad_norm": 8.094818115234375, + "learning_rate": 1.9862323294406885e-05, + "loss": 1.8122, + "step": 448 + }, + { + "epoch": 0.03457283343577136, + "grad_norm": 5.738587379455566, + "learning_rate": 1.9861708666256914e-05, + "loss": 1.8155, + "step": 450 + }, + { + "epoch": 0.03472649047326368, + "grad_norm": 5.194686412811279, + "learning_rate": 1.9861094038106947e-05, + "loss": 1.9198, + "step": 452 + }, + { + "epoch": 0.03488014751075599, + "grad_norm": 4.97174072265625, + "learning_rate": 1.9860479409956977e-05, + "loss": 1.9955, + "step": 454 + }, + { + "epoch": 0.03503380454824831, + "grad_norm": 5.790378570556641, + "learning_rate": 1.9859864781807007e-05, + "loss": 1.8218, + "step": 456 + }, + { + "epoch": 0.035187461585740626, + "grad_norm": 5.287135124206543, + "learning_rate": 1.985925015365704e-05, + "loss": 1.9169, + "step": 458 + }, + { + "epoch": 0.03534111862323294, + "grad_norm": 8.098136901855469, + "learning_rate": 1.985863552550707e-05, + "loss": 1.9039, + "step": 460 + }, + { + "epoch": 0.03549477566072526, + "grad_norm": 6.957726955413818, + "learning_rate": 1.98580208973571e-05, + "loss": 2.0036, + "step": 462 + }, + { + "epoch": 0.03564843269821758, + "grad_norm": 4.368841171264648, + "learning_rate": 1.9857406269207132e-05, + "loss": 1.8883, + "step": 464 + }, + { + "epoch": 0.0358020897357099, + "grad_norm": 5.95673131942749, + "learning_rate": 1.9856791641057162e-05, + "loss": 1.8977, + "step": 466 + }, + { + "epoch": 0.035955746773202214, + "grad_norm": 7.365513324737549, + "learning_rate": 1.985617701290719e-05, + "loss": 1.9865, + "step": 468 + }, + { + "epoch": 0.03610940381069453, + "grad_norm": 5.386063098907471, + "learning_rate": 1.9855562384757225e-05, + "loss": 1.8164, + "step": 470 + }, + { + "epoch": 0.03626306084818685, + "grad_norm": 6.155988693237305, + "learning_rate": 1.9854947756607254e-05, + "loss": 2.0083, + "step": 472 + }, + { + "epoch": 0.03641671788567916, + "grad_norm": 6.110922336578369, + "learning_rate": 1.9854333128457287e-05, + "loss": 1.8688, + "step": 474 + }, + { + "epoch": 0.03657037492317148, + "grad_norm": 5.692699909210205, + "learning_rate": 1.9853718500307314e-05, + "loss": 1.8501, + "step": 476 + }, + { + "epoch": 0.036724031960663796, + "grad_norm": 6.044013977050781, + "learning_rate": 1.9853103872157347e-05, + "loss": 1.8486, + "step": 478 + }, + { + "epoch": 0.03687768899815612, + "grad_norm": 6.102372169494629, + "learning_rate": 1.9852489244007376e-05, + "loss": 2.0873, + "step": 480 + }, + { + "epoch": 0.037031346035648435, + "grad_norm": 5.4327239990234375, + "learning_rate": 1.9851874615857406e-05, + "loss": 1.8635, + "step": 482 + }, + { + "epoch": 0.03718500307314075, + "grad_norm": 5.779347896575928, + "learning_rate": 1.985125998770744e-05, + "loss": 2.0413, + "step": 484 + }, + { + "epoch": 0.03733866011063307, + "grad_norm": 5.000186920166016, + "learning_rate": 1.985064535955747e-05, + "loss": 2.0214, + "step": 486 + }, + { + "epoch": 0.037492317148125384, + "grad_norm": 6.581515312194824, + "learning_rate": 1.98500307314075e-05, + "loss": 1.9141, + "step": 488 + }, + { + "epoch": 0.0376459741856177, + "grad_norm": 6.037952423095703, + "learning_rate": 1.984941610325753e-05, + "loss": 1.9475, + "step": 490 + }, + { + "epoch": 0.03779963122311002, + "grad_norm": 4.99038553237915, + "learning_rate": 1.984880147510756e-05, + "loss": 1.8296, + "step": 492 + }, + { + "epoch": 0.03795328826060233, + "grad_norm": 5.351291656494141, + "learning_rate": 1.9848186846957594e-05, + "loss": 1.9845, + "step": 494 + }, + { + "epoch": 0.03810694529809465, + "grad_norm": 6.249404430389404, + "learning_rate": 1.9847572218807624e-05, + "loss": 1.8824, + "step": 496 + }, + { + "epoch": 0.03826060233558697, + "grad_norm": 5.460664749145508, + "learning_rate": 1.9846957590657654e-05, + "loss": 1.9348, + "step": 498 + }, + { + "epoch": 0.03841425937307929, + "grad_norm": 5.399702072143555, + "learning_rate": 1.9846342962507687e-05, + "loss": 1.8646, + "step": 500 + }, + { + "epoch": 0.038567916410571605, + "grad_norm": 6.00943660736084, + "learning_rate": 1.9845728334357713e-05, + "loss": 1.8804, + "step": 502 + }, + { + "epoch": 0.03872157344806392, + "grad_norm": 6.057244300842285, + "learning_rate": 1.9845113706207746e-05, + "loss": 1.8876, + "step": 504 + }, + { + "epoch": 0.03887523048555624, + "grad_norm": 5.178292274475098, + "learning_rate": 1.9844499078057776e-05, + "loss": 1.8163, + "step": 506 + }, + { + "epoch": 0.039028887523048554, + "grad_norm": 5.430099964141846, + "learning_rate": 1.9843884449907806e-05, + "loss": 1.9221, + "step": 508 + }, + { + "epoch": 0.03918254456054087, + "grad_norm": 5.2391791343688965, + "learning_rate": 1.984326982175784e-05, + "loss": 1.9671, + "step": 510 + }, + { + "epoch": 0.03933620159803319, + "grad_norm": 6.54328727722168, + "learning_rate": 1.9842655193607868e-05, + "loss": 1.9916, + "step": 512 + }, + { + "epoch": 0.03948985863552551, + "grad_norm": 5.6781134605407715, + "learning_rate": 1.98420405654579e-05, + "loss": 1.9, + "step": 514 + }, + { + "epoch": 0.039643515673017826, + "grad_norm": 5.34329891204834, + "learning_rate": 1.984142593730793e-05, + "loss": 1.7433, + "step": 516 + }, + { + "epoch": 0.03979717271051014, + "grad_norm": 6.142169952392578, + "learning_rate": 1.984081130915796e-05, + "loss": 1.8559, + "step": 518 + }, + { + "epoch": 0.03995082974800246, + "grad_norm": 5.825856685638428, + "learning_rate": 1.9840196681007994e-05, + "loss": 1.7434, + "step": 520 + }, + { + "epoch": 0.040104486785494775, + "grad_norm": 4.883429050445557, + "learning_rate": 1.9839582052858023e-05, + "loss": 1.8403, + "step": 522 + }, + { + "epoch": 0.04025814382298709, + "grad_norm": 5.759003162384033, + "learning_rate": 1.9838967424708053e-05, + "loss": 1.872, + "step": 524 + }, + { + "epoch": 0.04041180086047941, + "grad_norm": 5.845025539398193, + "learning_rate": 1.9838352796558086e-05, + "loss": 1.818, + "step": 526 + }, + { + "epoch": 0.040565457897971724, + "grad_norm": 6.238631248474121, + "learning_rate": 1.9837738168408113e-05, + "loss": 1.9553, + "step": 528 + }, + { + "epoch": 0.04071911493546405, + "grad_norm": 5.450825214385986, + "learning_rate": 1.9837123540258146e-05, + "loss": 1.9314, + "step": 530 + }, + { + "epoch": 0.040872771972956363, + "grad_norm": 5.4290385246276855, + "learning_rate": 1.9836508912108175e-05, + "loss": 1.8316, + "step": 532 + }, + { + "epoch": 0.04102642901044868, + "grad_norm": 6.243955612182617, + "learning_rate": 1.9835894283958205e-05, + "loss": 1.9605, + "step": 534 + }, + { + "epoch": 0.041180086047940996, + "grad_norm": 5.5207672119140625, + "learning_rate": 1.9835279655808238e-05, + "loss": 1.9377, + "step": 536 + }, + { + "epoch": 0.04133374308543331, + "grad_norm": 5.570779323577881, + "learning_rate": 1.9834665027658268e-05, + "loss": 1.9706, + "step": 538 + }, + { + "epoch": 0.04148740012292563, + "grad_norm": 4.921234130859375, + "learning_rate": 1.98340503995083e-05, + "loss": 1.8666, + "step": 540 + }, + { + "epoch": 0.041641057160417945, + "grad_norm": 6.029317855834961, + "learning_rate": 1.983343577135833e-05, + "loss": 1.8431, + "step": 542 + }, + { + "epoch": 0.04179471419791026, + "grad_norm": 5.6237664222717285, + "learning_rate": 1.983282114320836e-05, + "loss": 2.0265, + "step": 544 + }, + { + "epoch": 0.041948371235402585, + "grad_norm": 4.848809719085693, + "learning_rate": 1.9832206515058393e-05, + "loss": 1.851, + "step": 546 + }, + { + "epoch": 0.0421020282728949, + "grad_norm": 6.06104040145874, + "learning_rate": 1.983159188690842e-05, + "loss": 1.9252, + "step": 548 + }, + { + "epoch": 0.04225568531038722, + "grad_norm": 6.721662521362305, + "learning_rate": 1.9830977258758453e-05, + "loss": 1.9046, + "step": 550 + }, + { + "epoch": 0.04240934234787953, + "grad_norm": 5.039158821105957, + "learning_rate": 1.9830362630608482e-05, + "loss": 1.9457, + "step": 552 + }, + { + "epoch": 0.04256299938537185, + "grad_norm": 4.985758304595947, + "learning_rate": 1.9829748002458512e-05, + "loss": 1.7706, + "step": 554 + }, + { + "epoch": 0.042716656422864166, + "grad_norm": 5.59445858001709, + "learning_rate": 1.9829133374308545e-05, + "loss": 1.9232, + "step": 556 + }, + { + "epoch": 0.04287031346035648, + "grad_norm": 5.786518573760986, + "learning_rate": 1.9828518746158575e-05, + "loss": 1.9535, + "step": 558 + }, + { + "epoch": 0.0430239704978488, + "grad_norm": 5.362064838409424, + "learning_rate": 1.9827904118008608e-05, + "loss": 1.774, + "step": 560 + }, + { + "epoch": 0.04317762753534112, + "grad_norm": 6.807535171508789, + "learning_rate": 1.9827289489858637e-05, + "loss": 1.9963, + "step": 562 + }, + { + "epoch": 0.04333128457283344, + "grad_norm": 4.927182197570801, + "learning_rate": 1.9826674861708667e-05, + "loss": 1.8839, + "step": 564 + }, + { + "epoch": 0.043484941610325754, + "grad_norm": 7.077647686004639, + "learning_rate": 1.98260602335587e-05, + "loss": 1.8577, + "step": 566 + }, + { + "epoch": 0.04363859864781807, + "grad_norm": 4.930956840515137, + "learning_rate": 1.982544560540873e-05, + "loss": 1.9032, + "step": 568 + }, + { + "epoch": 0.04379225568531039, + "grad_norm": 5.537839889526367, + "learning_rate": 1.982483097725876e-05, + "loss": 1.8599, + "step": 570 + }, + { + "epoch": 0.0439459127228027, + "grad_norm": 4.91294527053833, + "learning_rate": 1.9824216349108793e-05, + "loss": 1.8962, + "step": 572 + }, + { + "epoch": 0.04409956976029502, + "grad_norm": 7.946929931640625, + "learning_rate": 1.982360172095882e-05, + "loss": 2.0401, + "step": 574 + }, + { + "epoch": 0.044253226797787336, + "grad_norm": 5.566417217254639, + "learning_rate": 1.9822987092808852e-05, + "loss": 1.7317, + "step": 576 + }, + { + "epoch": 0.04440688383527966, + "grad_norm": 6.196030616760254, + "learning_rate": 1.9822372464658882e-05, + "loss": 1.9818, + "step": 578 + }, + { + "epoch": 0.044560540872771975, + "grad_norm": 5.8990888595581055, + "learning_rate": 1.9821757836508915e-05, + "loss": 1.9209, + "step": 580 + }, + { + "epoch": 0.04471419791026429, + "grad_norm": 4.752439022064209, + "learning_rate": 1.9821143208358944e-05, + "loss": 1.8661, + "step": 582 + }, + { + "epoch": 0.04486785494775661, + "grad_norm": 5.3692121505737305, + "learning_rate": 1.9820528580208974e-05, + "loss": 1.8574, + "step": 584 + }, + { + "epoch": 0.045021511985248924, + "grad_norm": 4.94577169418335, + "learning_rate": 1.9819913952059007e-05, + "loss": 1.76, + "step": 586 + }, + { + "epoch": 0.04517516902274124, + "grad_norm": 5.1533708572387695, + "learning_rate": 1.9819299323909037e-05, + "loss": 1.8634, + "step": 588 + }, + { + "epoch": 0.04532882606023356, + "grad_norm": 5.460253715515137, + "learning_rate": 1.9818684695759067e-05, + "loss": 1.7615, + "step": 590 + }, + { + "epoch": 0.04548248309772587, + "grad_norm": 6.106910705566406, + "learning_rate": 1.98180700676091e-05, + "loss": 1.8658, + "step": 592 + }, + { + "epoch": 0.045636140135218196, + "grad_norm": 8.604896545410156, + "learning_rate": 1.981745543945913e-05, + "loss": 1.8234, + "step": 594 + }, + { + "epoch": 0.04578979717271051, + "grad_norm": 5.533381938934326, + "learning_rate": 1.981684081130916e-05, + "loss": 1.8133, + "step": 596 + }, + { + "epoch": 0.04594345421020283, + "grad_norm": 5.140172481536865, + "learning_rate": 1.9816226183159192e-05, + "loss": 1.7655, + "step": 598 + }, + { + "epoch": 0.046097111247695145, + "grad_norm": 5.633389472961426, + "learning_rate": 1.9815611555009222e-05, + "loss": 1.8804, + "step": 600 + }, + { + "epoch": 0.04625076828518746, + "grad_norm": 5.397654056549072, + "learning_rate": 1.981499692685925e-05, + "loss": 1.9422, + "step": 602 + }, + { + "epoch": 0.04640442532267978, + "grad_norm": 5.916885852813721, + "learning_rate": 1.981438229870928e-05, + "loss": 1.9222, + "step": 604 + }, + { + "epoch": 0.046558082360172094, + "grad_norm": 4.4198198318481445, + "learning_rate": 1.9813767670559314e-05, + "loss": 1.8088, + "step": 606 + }, + { + "epoch": 0.04671173939766441, + "grad_norm": 6.035666465759277, + "learning_rate": 1.9813153042409344e-05, + "loss": 1.9505, + "step": 608 + }, + { + "epoch": 0.04686539643515673, + "grad_norm": 5.293002605438232, + "learning_rate": 1.9812538414259374e-05, + "loss": 1.9354, + "step": 610 + }, + { + "epoch": 0.04701905347264905, + "grad_norm": 5.066743850708008, + "learning_rate": 1.9811923786109407e-05, + "loss": 2.001, + "step": 612 + }, + { + "epoch": 0.047172710510141366, + "grad_norm": 6.867171764373779, + "learning_rate": 1.9811309157959436e-05, + "loss": 1.86, + "step": 614 + }, + { + "epoch": 0.04732636754763368, + "grad_norm": 4.908615589141846, + "learning_rate": 1.9810694529809466e-05, + "loss": 1.8855, + "step": 616 + }, + { + "epoch": 0.047480024585126, + "grad_norm": 5.6588006019592285, + "learning_rate": 1.98100799016595e-05, + "loss": 1.8047, + "step": 618 + }, + { + "epoch": 0.047633681622618315, + "grad_norm": 5.6555304527282715, + "learning_rate": 1.980946527350953e-05, + "loss": 1.7656, + "step": 620 + }, + { + "epoch": 0.04778733866011063, + "grad_norm": 4.742602348327637, + "learning_rate": 1.980885064535956e-05, + "loss": 1.976, + "step": 622 + }, + { + "epoch": 0.04794099569760295, + "grad_norm": 5.0910868644714355, + "learning_rate": 1.980823601720959e-05, + "loss": 1.8894, + "step": 624 + }, + { + "epoch": 0.048094652735095264, + "grad_norm": 5.279669761657715, + "learning_rate": 1.980762138905962e-05, + "loss": 1.9323, + "step": 626 + }, + { + "epoch": 0.04824830977258759, + "grad_norm": 5.603051662445068, + "learning_rate": 1.980700676090965e-05, + "loss": 1.9327, + "step": 628 + }, + { + "epoch": 0.048401966810079904, + "grad_norm": 5.823456764221191, + "learning_rate": 1.980639213275968e-05, + "loss": 1.9087, + "step": 630 + }, + { + "epoch": 0.04855562384757222, + "grad_norm": 4.226296424865723, + "learning_rate": 1.9805777504609714e-05, + "loss": 1.7298, + "step": 632 + }, + { + "epoch": 0.048709280885064536, + "grad_norm": 4.537020683288574, + "learning_rate": 1.9805162876459743e-05, + "loss": 1.8588, + "step": 634 + }, + { + "epoch": 0.04886293792255685, + "grad_norm": 5.843430519104004, + "learning_rate": 1.9804548248309773e-05, + "loss": 1.8581, + "step": 636 + }, + { + "epoch": 0.04901659496004917, + "grad_norm": 5.234043598175049, + "learning_rate": 1.9803933620159806e-05, + "loss": 1.8016, + "step": 638 + }, + { + "epoch": 0.049170251997541485, + "grad_norm": 6.091218948364258, + "learning_rate": 1.9803318992009836e-05, + "loss": 1.8419, + "step": 640 + }, + { + "epoch": 0.0493239090350338, + "grad_norm": 5.473825454711914, + "learning_rate": 1.9802704363859865e-05, + "loss": 1.8742, + "step": 642 + }, + { + "epoch": 0.049477566072526125, + "grad_norm": 5.018134117126465, + "learning_rate": 1.98020897357099e-05, + "loss": 1.9246, + "step": 644 + }, + { + "epoch": 0.04963122311001844, + "grad_norm": 5.1250505447387695, + "learning_rate": 1.9801475107559928e-05, + "loss": 1.8988, + "step": 646 + }, + { + "epoch": 0.04978488014751076, + "grad_norm": 5.310157299041748, + "learning_rate": 1.9800860479409958e-05, + "loss": 1.9718, + "step": 648 + }, + { + "epoch": 0.049938537185003073, + "grad_norm": 5.5490570068359375, + "learning_rate": 1.980024585125999e-05, + "loss": 1.8779, + "step": 650 + }, + { + "epoch": 0.05009219422249539, + "grad_norm": 5.242208480834961, + "learning_rate": 1.979963122311002e-05, + "loss": 1.8712, + "step": 652 + }, + { + "epoch": 0.050245851259987706, + "grad_norm": 4.680446624755859, + "learning_rate": 1.979901659496005e-05, + "loss": 1.9475, + "step": 654 + }, + { + "epoch": 0.05039950829748002, + "grad_norm": 12.400496482849121, + "learning_rate": 1.979840196681008e-05, + "loss": 1.9387, + "step": 656 + }, + { + "epoch": 0.05055316533497234, + "grad_norm": 4.818700313568115, + "learning_rate": 1.9797787338660113e-05, + "loss": 1.7356, + "step": 658 + }, + { + "epoch": 0.05070682237246466, + "grad_norm": 4.733686923980713, + "learning_rate": 1.9797172710510143e-05, + "loss": 1.7161, + "step": 660 + }, + { + "epoch": 0.05086047940995698, + "grad_norm": 5.9219865798950195, + "learning_rate": 1.9796558082360172e-05, + "loss": 1.9821, + "step": 662 + }, + { + "epoch": 0.051014136447449294, + "grad_norm": 4.954675197601318, + "learning_rate": 1.9795943454210206e-05, + "loss": 1.9392, + "step": 664 + }, + { + "epoch": 0.05116779348494161, + "grad_norm": 4.482631206512451, + "learning_rate": 1.9795328826060235e-05, + "loss": 1.9687, + "step": 666 + }, + { + "epoch": 0.05132145052243393, + "grad_norm": 6.749068737030029, + "learning_rate": 1.9794714197910265e-05, + "loss": 1.8242, + "step": 668 + }, + { + "epoch": 0.05147510755992624, + "grad_norm": 4.532095909118652, + "learning_rate": 1.9794099569760298e-05, + "loss": 1.7596, + "step": 670 + }, + { + "epoch": 0.05162876459741856, + "grad_norm": 5.727676868438721, + "learning_rate": 1.9793484941610328e-05, + "loss": 1.94, + "step": 672 + }, + { + "epoch": 0.051782421634910876, + "grad_norm": 5.493950843811035, + "learning_rate": 1.9792870313460357e-05, + "loss": 1.9243, + "step": 674 + }, + { + "epoch": 0.0519360786724032, + "grad_norm": 5.48468017578125, + "learning_rate": 1.9792255685310387e-05, + "loss": 1.7862, + "step": 676 + }, + { + "epoch": 0.052089735709895515, + "grad_norm": 5.862773895263672, + "learning_rate": 1.979164105716042e-05, + "loss": 1.843, + "step": 678 + }, + { + "epoch": 0.05224339274738783, + "grad_norm": 5.505096912384033, + "learning_rate": 1.979102642901045e-05, + "loss": 1.8366, + "step": 680 + }, + { + "epoch": 0.05239704978488015, + "grad_norm": 5.697121620178223, + "learning_rate": 1.979041180086048e-05, + "loss": 1.9764, + "step": 682 + }, + { + "epoch": 0.052550706822372464, + "grad_norm": 4.900547027587891, + "learning_rate": 1.9789797172710513e-05, + "loss": 1.9252, + "step": 684 + }, + { + "epoch": 0.05270436385986478, + "grad_norm": 5.347836017608643, + "learning_rate": 1.9789182544560542e-05, + "loss": 1.8527, + "step": 686 + }, + { + "epoch": 0.0528580208973571, + "grad_norm": 5.393474102020264, + "learning_rate": 1.9788567916410572e-05, + "loss": 1.8422, + "step": 688 + }, + { + "epoch": 0.05301167793484941, + "grad_norm": 5.27833366394043, + "learning_rate": 1.9787953288260605e-05, + "loss": 1.8933, + "step": 690 + }, + { + "epoch": 0.053165334972341736, + "grad_norm": 5.38336181640625, + "learning_rate": 1.9787338660110635e-05, + "loss": 1.9456, + "step": 692 + }, + { + "epoch": 0.05331899200983405, + "grad_norm": 5.273176193237305, + "learning_rate": 1.9786724031960664e-05, + "loss": 1.8142, + "step": 694 + }, + { + "epoch": 0.05347264904732637, + "grad_norm": 5.413751125335693, + "learning_rate": 1.9786109403810697e-05, + "loss": 1.7652, + "step": 696 + }, + { + "epoch": 0.053626306084818685, + "grad_norm": 5.373195648193359, + "learning_rate": 1.9785494775660727e-05, + "loss": 1.883, + "step": 698 + }, + { + "epoch": 0.053779963122311, + "grad_norm": 4.942586421966553, + "learning_rate": 1.9784880147510757e-05, + "loss": 1.76, + "step": 700 + }, + { + "epoch": 0.05393362015980332, + "grad_norm": 5.6196980476379395, + "learning_rate": 1.9784265519360786e-05, + "loss": 1.7673, + "step": 702 + }, + { + "epoch": 0.054087277197295634, + "grad_norm": 5.702764987945557, + "learning_rate": 1.978365089121082e-05, + "loss": 1.8437, + "step": 704 + }, + { + "epoch": 0.05424093423478795, + "grad_norm": 4.99530553817749, + "learning_rate": 1.978303626306085e-05, + "loss": 1.8747, + "step": 706 + }, + { + "epoch": 0.054394591272280274, + "grad_norm": 5.105679035186768, + "learning_rate": 1.978242163491088e-05, + "loss": 1.6632, + "step": 708 + }, + { + "epoch": 0.05454824830977259, + "grad_norm": 4.710418701171875, + "learning_rate": 1.9781807006760912e-05, + "loss": 1.8736, + "step": 710 + }, + { + "epoch": 0.054701905347264906, + "grad_norm": 4.792379856109619, + "learning_rate": 1.978119237861094e-05, + "loss": 1.8016, + "step": 712 + }, + { + "epoch": 0.05485556238475722, + "grad_norm": 4.937024116516113, + "learning_rate": 1.978057775046097e-05, + "loss": 1.7436, + "step": 714 + }, + { + "epoch": 0.05500921942224954, + "grad_norm": 5.5544867515563965, + "learning_rate": 1.9779963122311004e-05, + "loss": 1.926, + "step": 716 + }, + { + "epoch": 0.055162876459741855, + "grad_norm": 6.484194278717041, + "learning_rate": 1.9779348494161034e-05, + "loss": 1.9102, + "step": 718 + }, + { + "epoch": 0.05531653349723417, + "grad_norm": 5.408361434936523, + "learning_rate": 1.9778733866011064e-05, + "loss": 1.7786, + "step": 720 + }, + { + "epoch": 0.05547019053472649, + "grad_norm": 5.705206394195557, + "learning_rate": 1.9778119237861097e-05, + "loss": 1.779, + "step": 722 + }, + { + "epoch": 0.055623847572218804, + "grad_norm": 6.138594627380371, + "learning_rate": 1.9777504609711127e-05, + "loss": 1.6926, + "step": 724 + }, + { + "epoch": 0.05577750460971113, + "grad_norm": 5.507882595062256, + "learning_rate": 1.977688998156116e-05, + "loss": 2.0119, + "step": 726 + }, + { + "epoch": 0.055931161647203444, + "grad_norm": 5.1471710205078125, + "learning_rate": 1.9776275353411186e-05, + "loss": 1.7674, + "step": 728 + }, + { + "epoch": 0.05608481868469576, + "grad_norm": 5.558322906494141, + "learning_rate": 1.977566072526122e-05, + "loss": 1.7262, + "step": 730 + }, + { + "epoch": 0.056238475722188076, + "grad_norm": 5.859812259674072, + "learning_rate": 1.977504609711125e-05, + "loss": 2.0273, + "step": 732 + }, + { + "epoch": 0.05639213275968039, + "grad_norm": 4.931456565856934, + "learning_rate": 1.977443146896128e-05, + "loss": 1.7008, + "step": 734 + }, + { + "epoch": 0.05654578979717271, + "grad_norm": 4.835200786590576, + "learning_rate": 1.977381684081131e-05, + "loss": 1.7615, + "step": 736 + }, + { + "epoch": 0.056699446834665025, + "grad_norm": 5.542105674743652, + "learning_rate": 1.977320221266134e-05, + "loss": 1.7991, + "step": 738 + }, + { + "epoch": 0.05685310387215734, + "grad_norm": 5.737773895263672, + "learning_rate": 1.977258758451137e-05, + "loss": 1.7067, + "step": 740 + }, + { + "epoch": 0.057006760909649665, + "grad_norm": 4.556394100189209, + "learning_rate": 1.9771972956361404e-05, + "loss": 1.8418, + "step": 742 + }, + { + "epoch": 0.05716041794714198, + "grad_norm": 4.682400226593018, + "learning_rate": 1.9771358328211434e-05, + "loss": 1.7565, + "step": 744 + }, + { + "epoch": 0.0573140749846343, + "grad_norm": 5.617753982543945, + "learning_rate": 1.9770743700061467e-05, + "loss": 1.8314, + "step": 746 + }, + { + "epoch": 0.057467732022126614, + "grad_norm": 4.796401500701904, + "learning_rate": 1.9770129071911496e-05, + "loss": 1.6892, + "step": 748 + }, + { + "epoch": 0.05762138905961893, + "grad_norm": 5.084446430206299, + "learning_rate": 1.9769514443761526e-05, + "loss": 1.6731, + "step": 750 + }, + { + "epoch": 0.057775046097111246, + "grad_norm": 5.344216823577881, + "learning_rate": 1.976889981561156e-05, + "loss": 1.8096, + "step": 752 + }, + { + "epoch": 0.05792870313460356, + "grad_norm": 4.87506103515625, + "learning_rate": 1.9768285187461585e-05, + "loss": 1.9015, + "step": 754 + }, + { + "epoch": 0.05808236017209588, + "grad_norm": 5.019058704376221, + "learning_rate": 1.976767055931162e-05, + "loss": 1.9467, + "step": 756 + }, + { + "epoch": 0.0582360172095882, + "grad_norm": 5.275008678436279, + "learning_rate": 1.9767055931161648e-05, + "loss": 1.6159, + "step": 758 + }, + { + "epoch": 0.05838967424708052, + "grad_norm": 5.17955207824707, + "learning_rate": 1.9766441303011678e-05, + "loss": 1.6819, + "step": 760 + }, + { + "epoch": 0.058543331284572835, + "grad_norm": 5.578658580780029, + "learning_rate": 1.976582667486171e-05, + "loss": 1.8369, + "step": 762 + }, + { + "epoch": 0.05869698832206515, + "grad_norm": 4.934607982635498, + "learning_rate": 1.976521204671174e-05, + "loss": 1.8909, + "step": 764 + }, + { + "epoch": 0.05885064535955747, + "grad_norm": 5.5896759033203125, + "learning_rate": 1.9764597418561774e-05, + "loss": 1.7238, + "step": 766 + }, + { + "epoch": 0.05900430239704978, + "grad_norm": 5.263469696044922, + "learning_rate": 1.9763982790411803e-05, + "loss": 1.7776, + "step": 768 + }, + { + "epoch": 0.0591579594345421, + "grad_norm": 4.459990978240967, + "learning_rate": 1.9763368162261833e-05, + "loss": 1.7082, + "step": 770 + }, + { + "epoch": 0.059311616472034416, + "grad_norm": 5.528759002685547, + "learning_rate": 1.9762753534111866e-05, + "loss": 1.9014, + "step": 772 + }, + { + "epoch": 0.05946527350952674, + "grad_norm": 5.372073650360107, + "learning_rate": 1.9762138905961892e-05, + "loss": 1.691, + "step": 774 + }, + { + "epoch": 0.059618930547019056, + "grad_norm": 5.765900135040283, + "learning_rate": 1.9761524277811925e-05, + "loss": 1.9243, + "step": 776 + }, + { + "epoch": 0.05977258758451137, + "grad_norm": 5.123989105224609, + "learning_rate": 1.9760909649661955e-05, + "loss": 2.0223, + "step": 778 + }, + { + "epoch": 0.05992624462200369, + "grad_norm": 5.149808406829834, + "learning_rate": 1.9760295021511985e-05, + "loss": 1.8063, + "step": 780 + }, + { + "epoch": 0.060079901659496004, + "grad_norm": 5.047703266143799, + "learning_rate": 1.9759680393362018e-05, + "loss": 1.9353, + "step": 782 + }, + { + "epoch": 0.06023355869698832, + "grad_norm": 5.555423259735107, + "learning_rate": 1.9759065765212048e-05, + "loss": 1.695, + "step": 784 + }, + { + "epoch": 0.06038721573448064, + "grad_norm": 5.100247859954834, + "learning_rate": 1.9758451137062077e-05, + "loss": 1.6758, + "step": 786 + }, + { + "epoch": 0.06054087277197295, + "grad_norm": 4.941176891326904, + "learning_rate": 1.975783650891211e-05, + "loss": 1.7391, + "step": 788 + }, + { + "epoch": 0.06069452980946528, + "grad_norm": 5.3119964599609375, + "learning_rate": 1.975722188076214e-05, + "loss": 2.0366, + "step": 790 + }, + { + "epoch": 0.06084818684695759, + "grad_norm": 6.0686235427856445, + "learning_rate": 1.9756607252612173e-05, + "loss": 1.7149, + "step": 792 + }, + { + "epoch": 0.06100184388444991, + "grad_norm": 6.141575336456299, + "learning_rate": 1.9755992624462203e-05, + "loss": 1.7808, + "step": 794 + }, + { + "epoch": 0.061155500921942225, + "grad_norm": 5.157688140869141, + "learning_rate": 1.9755377996312232e-05, + "loss": 1.8069, + "step": 796 + }, + { + "epoch": 0.06130915795943454, + "grad_norm": 5.358695983886719, + "learning_rate": 1.9754763368162266e-05, + "loss": 1.9632, + "step": 798 + }, + { + "epoch": 0.06146281499692686, + "grad_norm": 5.3423261642456055, + "learning_rate": 1.9754148740012292e-05, + "loss": 1.6711, + "step": 800 + }, + { + "epoch": 0.061616472034419174, + "grad_norm": 5.9911980628967285, + "learning_rate": 1.9753534111862325e-05, + "loss": 1.6384, + "step": 802 + }, + { + "epoch": 0.06177012907191149, + "grad_norm": 5.021694183349609, + "learning_rate": 1.9752919483712355e-05, + "loss": 1.7412, + "step": 804 + }, + { + "epoch": 0.061923786109403814, + "grad_norm": 5.38372802734375, + "learning_rate": 1.9752304855562384e-05, + "loss": 1.5895, + "step": 806 + }, + { + "epoch": 0.06207744314689613, + "grad_norm": 5.618641376495361, + "learning_rate": 1.9751690227412417e-05, + "loss": 1.8097, + "step": 808 + }, + { + "epoch": 0.062231100184388446, + "grad_norm": 5.081387519836426, + "learning_rate": 1.9751075599262447e-05, + "loss": 1.7789, + "step": 810 + }, + { + "epoch": 0.06238475722188076, + "grad_norm": 5.361464500427246, + "learning_rate": 1.975046097111248e-05, + "loss": 1.7821, + "step": 812 + }, + { + "epoch": 0.06253841425937308, + "grad_norm": 5.113397598266602, + "learning_rate": 1.974984634296251e-05, + "loss": 1.7292, + "step": 814 + }, + { + "epoch": 0.0626920712968654, + "grad_norm": 5.261277198791504, + "learning_rate": 1.974923171481254e-05, + "loss": 1.6956, + "step": 816 + }, + { + "epoch": 0.06284572833435771, + "grad_norm": 7.06874942779541, + "learning_rate": 1.9748617086662573e-05, + "loss": 1.865, + "step": 818 + }, + { + "epoch": 0.06299938537185003, + "grad_norm": 4.949324131011963, + "learning_rate": 1.9748002458512602e-05, + "loss": 1.7885, + "step": 820 + }, + { + "epoch": 0.06315304240934234, + "grad_norm": 6.291264533996582, + "learning_rate": 1.9747387830362632e-05, + "loss": 1.8685, + "step": 822 + }, + { + "epoch": 0.06330669944683466, + "grad_norm": 4.500913143157959, + "learning_rate": 1.9746773202212665e-05, + "loss": 1.6422, + "step": 824 + }, + { + "epoch": 0.06346035648432698, + "grad_norm": 5.313440322875977, + "learning_rate": 1.974615857406269e-05, + "loss": 1.8328, + "step": 826 + }, + { + "epoch": 0.0636140135218193, + "grad_norm": 5.809798240661621, + "learning_rate": 1.9745543945912724e-05, + "loss": 1.8912, + "step": 828 + }, + { + "epoch": 0.06376767055931162, + "grad_norm": 4.69051456451416, + "learning_rate": 1.9744929317762754e-05, + "loss": 1.6786, + "step": 830 + }, + { + "epoch": 0.06392132759680394, + "grad_norm": 5.292459487915039, + "learning_rate": 1.9744314689612787e-05, + "loss": 1.8497, + "step": 832 + }, + { + "epoch": 0.06407498463429626, + "grad_norm": 4.772144794464111, + "learning_rate": 1.9743700061462817e-05, + "loss": 1.6261, + "step": 834 + }, + { + "epoch": 0.06422864167178857, + "grad_norm": 4.984364032745361, + "learning_rate": 1.9743085433312846e-05, + "loss": 1.7354, + "step": 836 + }, + { + "epoch": 0.06438229870928089, + "grad_norm": 4.450577735900879, + "learning_rate": 1.974247080516288e-05, + "loss": 1.7349, + "step": 838 + }, + { + "epoch": 0.0645359557467732, + "grad_norm": 5.341747760772705, + "learning_rate": 1.974185617701291e-05, + "loss": 1.8332, + "step": 840 + }, + { + "epoch": 0.06468961278426552, + "grad_norm": 5.368303298950195, + "learning_rate": 1.974124154886294e-05, + "loss": 1.8219, + "step": 842 + }, + { + "epoch": 0.06484326982175784, + "grad_norm": 4.4096360206604, + "learning_rate": 1.9740626920712972e-05, + "loss": 1.8158, + "step": 844 + }, + { + "epoch": 0.06499692685925015, + "grad_norm": 6.098479270935059, + "learning_rate": 1.9740012292563e-05, + "loss": 1.7323, + "step": 846 + }, + { + "epoch": 0.06515058389674247, + "grad_norm": 4.606769561767578, + "learning_rate": 1.973939766441303e-05, + "loss": 1.7527, + "step": 848 + }, + { + "epoch": 0.06530424093423479, + "grad_norm": 6.082258701324463, + "learning_rate": 1.9738783036263064e-05, + "loss": 1.8471, + "step": 850 + }, + { + "epoch": 0.0654578979717271, + "grad_norm": 5.389958381652832, + "learning_rate": 1.9738168408113094e-05, + "loss": 1.8564, + "step": 852 + }, + { + "epoch": 0.06561155500921942, + "grad_norm": 5.574385643005371, + "learning_rate": 1.9737553779963124e-05, + "loss": 1.784, + "step": 854 + }, + { + "epoch": 0.06576521204671174, + "grad_norm": 5.1567487716674805, + "learning_rate": 1.9736939151813153e-05, + "loss": 1.713, + "step": 856 + }, + { + "epoch": 0.06591886908420405, + "grad_norm": 5.475706577301025, + "learning_rate": 1.9736324523663187e-05, + "loss": 1.7555, + "step": 858 + }, + { + "epoch": 0.06607252612169637, + "grad_norm": 4.831605434417725, + "learning_rate": 1.9735709895513216e-05, + "loss": 1.8695, + "step": 860 + }, + { + "epoch": 0.06622618315918868, + "grad_norm": 6.022873878479004, + "learning_rate": 1.9735095267363246e-05, + "loss": 1.7765, + "step": 862 + }, + { + "epoch": 0.06637984019668101, + "grad_norm": 4.874941825866699, + "learning_rate": 1.973448063921328e-05, + "loss": 1.757, + "step": 864 + }, + { + "epoch": 0.06653349723417333, + "grad_norm": 4.488655090332031, + "learning_rate": 1.973386601106331e-05, + "loss": 1.8837, + "step": 866 + }, + { + "epoch": 0.06668715427166565, + "grad_norm": 4.5713090896606445, + "learning_rate": 1.973325138291334e-05, + "loss": 1.6627, + "step": 868 + }, + { + "epoch": 0.06684081130915796, + "grad_norm": 5.312070846557617, + "learning_rate": 1.973263675476337e-05, + "loss": 1.7551, + "step": 870 + }, + { + "epoch": 0.06699446834665028, + "grad_norm": 5.104644775390625, + "learning_rate": 1.97320221266134e-05, + "loss": 1.8789, + "step": 872 + }, + { + "epoch": 0.0671481253841426, + "grad_norm": 4.595895290374756, + "learning_rate": 1.973140749846343e-05, + "loss": 1.8319, + "step": 874 + }, + { + "epoch": 0.06730178242163491, + "grad_norm": 5.223522186279297, + "learning_rate": 1.973079287031346e-05, + "loss": 1.6661, + "step": 876 + }, + { + "epoch": 0.06745543945912723, + "grad_norm": 4.466522693634033, + "learning_rate": 1.9730178242163494e-05, + "loss": 1.8338, + "step": 878 + }, + { + "epoch": 0.06760909649661954, + "grad_norm": 4.613927841186523, + "learning_rate": 1.9729563614013523e-05, + "loss": 1.7562, + "step": 880 + }, + { + "epoch": 0.06776275353411186, + "grad_norm": 5.868101119995117, + "learning_rate": 1.9728948985863553e-05, + "loss": 1.9181, + "step": 882 + }, + { + "epoch": 0.06791641057160418, + "grad_norm": 5.407005786895752, + "learning_rate": 1.9728334357713586e-05, + "loss": 1.8462, + "step": 884 + }, + { + "epoch": 0.0680700676090965, + "grad_norm": 6.075082778930664, + "learning_rate": 1.9727719729563616e-05, + "loss": 1.816, + "step": 886 + }, + { + "epoch": 0.06822372464658881, + "grad_norm": 5.164397716522217, + "learning_rate": 1.9727105101413645e-05, + "loss": 1.9236, + "step": 888 + }, + { + "epoch": 0.06837738168408113, + "grad_norm": 5.688714981079102, + "learning_rate": 1.972649047326368e-05, + "loss": 1.7171, + "step": 890 + }, + { + "epoch": 0.06853103872157344, + "grad_norm": 4.9518842697143555, + "learning_rate": 1.9725875845113708e-05, + "loss": 1.7829, + "step": 892 + }, + { + "epoch": 0.06868469575906576, + "grad_norm": 5.185763835906982, + "learning_rate": 1.9725261216963738e-05, + "loss": 1.7146, + "step": 894 + }, + { + "epoch": 0.06883835279655809, + "grad_norm": 5.043625354766846, + "learning_rate": 1.972464658881377e-05, + "loss": 1.8594, + "step": 896 + }, + { + "epoch": 0.0689920098340504, + "grad_norm": 4.783642292022705, + "learning_rate": 1.97240319606638e-05, + "loss": 1.7053, + "step": 898 + }, + { + "epoch": 0.06914566687154272, + "grad_norm": 4.887513637542725, + "learning_rate": 1.972341733251383e-05, + "loss": 1.7929, + "step": 900 + }, + { + "epoch": 0.06929932390903504, + "grad_norm": 4.4108123779296875, + "learning_rate": 1.972280270436386e-05, + "loss": 1.7296, + "step": 902 + }, + { + "epoch": 0.06945298094652735, + "grad_norm": 5.531246662139893, + "learning_rate": 1.9722188076213893e-05, + "loss": 1.7792, + "step": 904 + }, + { + "epoch": 0.06960663798401967, + "grad_norm": 4.462593078613281, + "learning_rate": 1.9721573448063923e-05, + "loss": 1.7121, + "step": 906 + }, + { + "epoch": 0.06976029502151199, + "grad_norm": 4.543118000030518, + "learning_rate": 1.9720958819913952e-05, + "loss": 1.7871, + "step": 908 + }, + { + "epoch": 0.0699139520590043, + "grad_norm": 5.9536638259887695, + "learning_rate": 1.9720344191763985e-05, + "loss": 1.7956, + "step": 910 + }, + { + "epoch": 0.07006760909649662, + "grad_norm": 4.735901832580566, + "learning_rate": 1.9719729563614015e-05, + "loss": 1.7511, + "step": 912 + }, + { + "epoch": 0.07022126613398894, + "grad_norm": 4.490820407867432, + "learning_rate": 1.9719114935464045e-05, + "loss": 1.7364, + "step": 914 + }, + { + "epoch": 0.07037492317148125, + "grad_norm": 4.837772846221924, + "learning_rate": 1.9718500307314078e-05, + "loss": 1.8204, + "step": 916 + }, + { + "epoch": 0.07052858020897357, + "grad_norm": 4.400464057922363, + "learning_rate": 1.9717885679164108e-05, + "loss": 1.8637, + "step": 918 + }, + { + "epoch": 0.07068223724646588, + "grad_norm": 4.991397857666016, + "learning_rate": 1.9717271051014137e-05, + "loss": 1.9175, + "step": 920 + }, + { + "epoch": 0.0708358942839582, + "grad_norm": 4.983181953430176, + "learning_rate": 1.971665642286417e-05, + "loss": 1.7065, + "step": 922 + }, + { + "epoch": 0.07098955132145052, + "grad_norm": 4.5055341720581055, + "learning_rate": 1.97160417947142e-05, + "loss": 1.7113, + "step": 924 + }, + { + "epoch": 0.07114320835894283, + "grad_norm": 5.021308422088623, + "learning_rate": 1.971542716656423e-05, + "loss": 1.8903, + "step": 926 + }, + { + "epoch": 0.07129686539643516, + "grad_norm": 6.712131023406982, + "learning_rate": 1.971481253841426e-05, + "loss": 1.7608, + "step": 928 + }, + { + "epoch": 0.07145052243392748, + "grad_norm": 5.711028575897217, + "learning_rate": 1.9714197910264292e-05, + "loss": 1.7826, + "step": 930 + }, + { + "epoch": 0.0716041794714198, + "grad_norm": 5.202549457550049, + "learning_rate": 1.9713583282114322e-05, + "loss": 1.7201, + "step": 932 + }, + { + "epoch": 0.07175783650891211, + "grad_norm": 4.809873580932617, + "learning_rate": 1.9712968653964352e-05, + "loss": 1.7824, + "step": 934 + }, + { + "epoch": 0.07191149354640443, + "grad_norm": 4.417870998382568, + "learning_rate": 1.9712354025814385e-05, + "loss": 1.8275, + "step": 936 + }, + { + "epoch": 0.07206515058389674, + "grad_norm": 4.823970794677734, + "learning_rate": 1.9711739397664415e-05, + "loss": 1.7293, + "step": 938 + }, + { + "epoch": 0.07221880762138906, + "grad_norm": 5.289034843444824, + "learning_rate": 1.9711124769514444e-05, + "loss": 1.7507, + "step": 940 + }, + { + "epoch": 0.07237246465888138, + "grad_norm": 4.538127422332764, + "learning_rate": 1.9710510141364477e-05, + "loss": 1.6391, + "step": 942 + }, + { + "epoch": 0.0725261216963737, + "grad_norm": 4.500412464141846, + "learning_rate": 1.9709895513214507e-05, + "loss": 1.6816, + "step": 944 + }, + { + "epoch": 0.07267977873386601, + "grad_norm": 5.149693012237549, + "learning_rate": 1.9709280885064537e-05, + "loss": 1.6366, + "step": 946 + }, + { + "epoch": 0.07283343577135833, + "grad_norm": 4.554830074310303, + "learning_rate": 1.970866625691457e-05, + "loss": 1.7712, + "step": 948 + }, + { + "epoch": 0.07298709280885064, + "grad_norm": 5.352977752685547, + "learning_rate": 1.97080516287646e-05, + "loss": 1.5819, + "step": 950 + }, + { + "epoch": 0.07314074984634296, + "grad_norm": 5.3114094734191895, + "learning_rate": 1.970743700061463e-05, + "loss": 1.802, + "step": 952 + }, + { + "epoch": 0.07329440688383528, + "grad_norm": 4.19597053527832, + "learning_rate": 1.970682237246466e-05, + "loss": 1.5855, + "step": 954 + }, + { + "epoch": 0.07344806392132759, + "grad_norm": 4.087234020233154, + "learning_rate": 1.9706207744314692e-05, + "loss": 1.7158, + "step": 956 + }, + { + "epoch": 0.07360172095881991, + "grad_norm": 5.064235210418701, + "learning_rate": 1.970559311616472e-05, + "loss": 1.7579, + "step": 958 + }, + { + "epoch": 0.07375537799631224, + "grad_norm": 5.033870697021484, + "learning_rate": 1.970497848801475e-05, + "loss": 1.8105, + "step": 960 + }, + { + "epoch": 0.07390903503380455, + "grad_norm": 4.546055793762207, + "learning_rate": 1.9704363859864784e-05, + "loss": 1.6964, + "step": 962 + }, + { + "epoch": 0.07406269207129687, + "grad_norm": 5.068551540374756, + "learning_rate": 1.9703749231714814e-05, + "loss": 1.6664, + "step": 964 + }, + { + "epoch": 0.07421634910878919, + "grad_norm": 5.1122050285339355, + "learning_rate": 1.9703134603564844e-05, + "loss": 1.819, + "step": 966 + }, + { + "epoch": 0.0743700061462815, + "grad_norm": 4.410829544067383, + "learning_rate": 1.9702519975414877e-05, + "loss": 1.7872, + "step": 968 + }, + { + "epoch": 0.07452366318377382, + "grad_norm": 4.5425190925598145, + "learning_rate": 1.9701905347264906e-05, + "loss": 1.6679, + "step": 970 + }, + { + "epoch": 0.07467732022126614, + "grad_norm": 4.571905612945557, + "learning_rate": 1.9701290719114936e-05, + "loss": 1.7135, + "step": 972 + }, + { + "epoch": 0.07483097725875845, + "grad_norm": 4.853597640991211, + "learning_rate": 1.970067609096497e-05, + "loss": 1.6162, + "step": 974 + }, + { + "epoch": 0.07498463429625077, + "grad_norm": 5.288270473480225, + "learning_rate": 1.9700061462815e-05, + "loss": 1.7473, + "step": 976 + }, + { + "epoch": 0.07513829133374308, + "grad_norm": 4.418172359466553, + "learning_rate": 1.9699446834665032e-05, + "loss": 1.6986, + "step": 978 + }, + { + "epoch": 0.0752919483712354, + "grad_norm": 4.6486496925354, + "learning_rate": 1.9698832206515058e-05, + "loss": 1.6914, + "step": 980 + }, + { + "epoch": 0.07544560540872772, + "grad_norm": 5.205509185791016, + "learning_rate": 1.969821757836509e-05, + "loss": 1.6129, + "step": 982 + }, + { + "epoch": 0.07559926244622003, + "grad_norm": 4.540061950683594, + "learning_rate": 1.969760295021512e-05, + "loss": 1.7026, + "step": 984 + }, + { + "epoch": 0.07575291948371235, + "grad_norm": 4.7810845375061035, + "learning_rate": 1.969698832206515e-05, + "loss": 1.686, + "step": 986 + }, + { + "epoch": 0.07590657652120467, + "grad_norm": 4.475192546844482, + "learning_rate": 1.9696373693915184e-05, + "loss": 1.739, + "step": 988 + }, + { + "epoch": 0.07606023355869698, + "grad_norm": 5.317092418670654, + "learning_rate": 1.9695759065765213e-05, + "loss": 2.0029, + "step": 990 + }, + { + "epoch": 0.0762138905961893, + "grad_norm": 5.178996562957764, + "learning_rate": 1.9695144437615243e-05, + "loss": 1.7278, + "step": 992 + }, + { + "epoch": 0.07636754763368163, + "grad_norm": 5.976894855499268, + "learning_rate": 1.9694529809465276e-05, + "loss": 1.753, + "step": 994 + }, + { + "epoch": 0.07652120467117395, + "grad_norm": 4.536045551300049, + "learning_rate": 1.9693915181315306e-05, + "loss": 1.6081, + "step": 996 + }, + { + "epoch": 0.07667486170866626, + "grad_norm": 4.751937389373779, + "learning_rate": 1.969330055316534e-05, + "loss": 1.6765, + "step": 998 + }, + { + "epoch": 0.07682851874615858, + "grad_norm": 5.145371437072754, + "learning_rate": 1.9692685925015365e-05, + "loss": 1.9088, + "step": 1000 + }, + { + "epoch": 0.0769821757836509, + "grad_norm": 5.149151802062988, + "learning_rate": 1.9692071296865398e-05, + "loss": 1.7855, + "step": 1002 + }, + { + "epoch": 0.07713583282114321, + "grad_norm": 4.331295490264893, + "learning_rate": 1.9691456668715428e-05, + "loss": 1.6398, + "step": 1004 + }, + { + "epoch": 0.07728948985863553, + "grad_norm": 5.288293361663818, + "learning_rate": 1.9690842040565458e-05, + "loss": 1.8451, + "step": 1006 + }, + { + "epoch": 0.07744314689612784, + "grad_norm": 4.709437370300293, + "learning_rate": 1.969022741241549e-05, + "loss": 1.8546, + "step": 1008 + }, + { + "epoch": 0.07759680393362016, + "grad_norm": 5.373138427734375, + "learning_rate": 1.968961278426552e-05, + "loss": 1.7561, + "step": 1010 + }, + { + "epoch": 0.07775046097111248, + "grad_norm": 6.308657169342041, + "learning_rate": 1.968899815611555e-05, + "loss": 1.8271, + "step": 1012 + }, + { + "epoch": 0.07790411800860479, + "grad_norm": 5.380505561828613, + "learning_rate": 1.9688383527965583e-05, + "loss": 1.7852, + "step": 1014 + }, + { + "epoch": 0.07805777504609711, + "grad_norm": 5.468803405761719, + "learning_rate": 1.9687768899815613e-05, + "loss": 1.7797, + "step": 1016 + }, + { + "epoch": 0.07821143208358942, + "grad_norm": 4.73374080657959, + "learning_rate": 1.9687154271665646e-05, + "loss": 1.6201, + "step": 1018 + }, + { + "epoch": 0.07836508912108174, + "grad_norm": 4.9570631980896, + "learning_rate": 1.9686539643515676e-05, + "loss": 1.7538, + "step": 1020 + }, + { + "epoch": 0.07851874615857406, + "grad_norm": 4.574160575866699, + "learning_rate": 1.9685925015365705e-05, + "loss": 1.7656, + "step": 1022 + }, + { + "epoch": 0.07867240319606637, + "grad_norm": 5.078851222991943, + "learning_rate": 1.968531038721574e-05, + "loss": 1.806, + "step": 1024 + }, + { + "epoch": 0.0788260602335587, + "grad_norm": 5.474549293518066, + "learning_rate": 1.9684695759065765e-05, + "loss": 1.8226, + "step": 1026 + }, + { + "epoch": 0.07897971727105102, + "grad_norm": 6.257920265197754, + "learning_rate": 1.9684081130915798e-05, + "loss": 1.7714, + "step": 1028 + }, + { + "epoch": 0.07913337430854334, + "grad_norm": 4.811653137207031, + "learning_rate": 1.9683466502765827e-05, + "loss": 1.7903, + "step": 1030 + }, + { + "epoch": 0.07928703134603565, + "grad_norm": 4.90382194519043, + "learning_rate": 1.9682851874615857e-05, + "loss": 1.9185, + "step": 1032 + }, + { + "epoch": 0.07944068838352797, + "grad_norm": 5.112819194793701, + "learning_rate": 1.968223724646589e-05, + "loss": 1.8098, + "step": 1034 + }, + { + "epoch": 0.07959434542102028, + "grad_norm": 4.832859039306641, + "learning_rate": 1.968162261831592e-05, + "loss": 1.7965, + "step": 1036 + }, + { + "epoch": 0.0797480024585126, + "grad_norm": 5.124858379364014, + "learning_rate": 1.9681007990165953e-05, + "loss": 1.6745, + "step": 1038 + }, + { + "epoch": 0.07990165949600492, + "grad_norm": 4.530187606811523, + "learning_rate": 1.9680393362015983e-05, + "loss": 1.7152, + "step": 1040 + }, + { + "epoch": 0.08005531653349723, + "grad_norm": 4.918298244476318, + "learning_rate": 1.9679778733866012e-05, + "loss": 1.7445, + "step": 1042 + }, + { + "epoch": 0.08020897357098955, + "grad_norm": 4.637542247772217, + "learning_rate": 1.9679164105716045e-05, + "loss": 1.6802, + "step": 1044 + }, + { + "epoch": 0.08036263060848187, + "grad_norm": 5.314078330993652, + "learning_rate": 1.9678549477566075e-05, + "loss": 1.7547, + "step": 1046 + }, + { + "epoch": 0.08051628764597418, + "grad_norm": 5.02266263961792, + "learning_rate": 1.9677934849416105e-05, + "loss": 1.7107, + "step": 1048 + }, + { + "epoch": 0.0806699446834665, + "grad_norm": 4.683084487915039, + "learning_rate": 1.9677320221266138e-05, + "loss": 1.7415, + "step": 1050 + }, + { + "epoch": 0.08082360172095882, + "grad_norm": 4.539281845092773, + "learning_rate": 1.9676705593116164e-05, + "loss": 1.9356, + "step": 1052 + }, + { + "epoch": 0.08097725875845113, + "grad_norm": 4.526500701904297, + "learning_rate": 1.9676090964966197e-05, + "loss": 1.6487, + "step": 1054 + }, + { + "epoch": 0.08113091579594345, + "grad_norm": 4.272531509399414, + "learning_rate": 1.9675476336816227e-05, + "loss": 1.6478, + "step": 1056 + }, + { + "epoch": 0.08128457283343578, + "grad_norm": 4.874919414520264, + "learning_rate": 1.9674861708666257e-05, + "loss": 1.715, + "step": 1058 + }, + { + "epoch": 0.0814382298709281, + "grad_norm": 5.863064765930176, + "learning_rate": 1.967424708051629e-05, + "loss": 1.8635, + "step": 1060 + }, + { + "epoch": 0.08159188690842041, + "grad_norm": 5.5292205810546875, + "learning_rate": 1.967363245236632e-05, + "loss": 1.7227, + "step": 1062 + }, + { + "epoch": 0.08174554394591273, + "grad_norm": 4.343425273895264, + "learning_rate": 1.9673017824216352e-05, + "loss": 1.8257, + "step": 1064 + }, + { + "epoch": 0.08189920098340504, + "grad_norm": 5.900411128997803, + "learning_rate": 1.9672403196066382e-05, + "loss": 1.8205, + "step": 1066 + }, + { + "epoch": 0.08205285802089736, + "grad_norm": 4.439347743988037, + "learning_rate": 1.9671788567916412e-05, + "loss": 1.6958, + "step": 1068 + }, + { + "epoch": 0.08220651505838968, + "grad_norm": 5.694681644439697, + "learning_rate": 1.9671173939766445e-05, + "loss": 1.8959, + "step": 1070 + }, + { + "epoch": 0.08236017209588199, + "grad_norm": 4.767448425292969, + "learning_rate": 1.9670559311616474e-05, + "loss": 1.6721, + "step": 1072 + }, + { + "epoch": 0.08251382913337431, + "grad_norm": 4.426461696624756, + "learning_rate": 1.9669944683466504e-05, + "loss": 1.607, + "step": 1074 + }, + { + "epoch": 0.08266748617086662, + "grad_norm": 4.794045925140381, + "learning_rate": 1.9669330055316537e-05, + "loss": 1.6254, + "step": 1076 + }, + { + "epoch": 0.08282114320835894, + "grad_norm": 4.544212341308594, + "learning_rate": 1.9668715427166564e-05, + "loss": 1.6082, + "step": 1078 + }, + { + "epoch": 0.08297480024585126, + "grad_norm": 4.256971836090088, + "learning_rate": 1.9668100799016597e-05, + "loss": 1.8001, + "step": 1080 + }, + { + "epoch": 0.08312845728334357, + "grad_norm": 6.058056831359863, + "learning_rate": 1.9667486170866626e-05, + "loss": 1.7258, + "step": 1082 + }, + { + "epoch": 0.08328211432083589, + "grad_norm": 4.815703868865967, + "learning_rate": 1.966687154271666e-05, + "loss": 1.7528, + "step": 1084 + }, + { + "epoch": 0.0834357713583282, + "grad_norm": 4.661309719085693, + "learning_rate": 1.966625691456669e-05, + "loss": 1.7316, + "step": 1086 + }, + { + "epoch": 0.08358942839582052, + "grad_norm": 4.863770961761475, + "learning_rate": 1.966564228641672e-05, + "loss": 1.8824, + "step": 1088 + }, + { + "epoch": 0.08374308543331284, + "grad_norm": 5.061347961425781, + "learning_rate": 1.9665027658266752e-05, + "loss": 1.6873, + "step": 1090 + }, + { + "epoch": 0.08389674247080517, + "grad_norm": 4.252581596374512, + "learning_rate": 1.966441303011678e-05, + "loss": 1.8093, + "step": 1092 + }, + { + "epoch": 0.08405039950829749, + "grad_norm": 5.112542152404785, + "learning_rate": 1.966379840196681e-05, + "loss": 1.7366, + "step": 1094 + }, + { + "epoch": 0.0842040565457898, + "grad_norm": 5.147494792938232, + "learning_rate": 1.9663183773816844e-05, + "loss": 1.8676, + "step": 1096 + }, + { + "epoch": 0.08435771358328212, + "grad_norm": 4.757107257843018, + "learning_rate": 1.966256914566687e-05, + "loss": 1.5888, + "step": 1098 + }, + { + "epoch": 0.08451137062077443, + "grad_norm": 5.5242600440979, + "learning_rate": 1.9661954517516904e-05, + "loss": 1.7018, + "step": 1100 + }, + { + "epoch": 0.08466502765826675, + "grad_norm": 5.5325117111206055, + "learning_rate": 1.9661339889366933e-05, + "loss": 1.6928, + "step": 1102 + }, + { + "epoch": 0.08481868469575907, + "grad_norm": 5.308017253875732, + "learning_rate": 1.9660725261216966e-05, + "loss": 1.7695, + "step": 1104 + }, + { + "epoch": 0.08497234173325138, + "grad_norm": 4.460916519165039, + "learning_rate": 1.9660110633066996e-05, + "loss": 1.649, + "step": 1106 + }, + { + "epoch": 0.0851259987707437, + "grad_norm": 5.222771644592285, + "learning_rate": 1.9659496004917026e-05, + "loss": 1.8017, + "step": 1108 + }, + { + "epoch": 0.08527965580823602, + "grad_norm": 4.484593391418457, + "learning_rate": 1.965888137676706e-05, + "loss": 1.697, + "step": 1110 + }, + { + "epoch": 0.08543331284572833, + "grad_norm": 4.95808219909668, + "learning_rate": 1.965826674861709e-05, + "loss": 1.6171, + "step": 1112 + }, + { + "epoch": 0.08558696988322065, + "grad_norm": 5.313704967498779, + "learning_rate": 1.9657652120467118e-05, + "loss": 1.665, + "step": 1114 + }, + { + "epoch": 0.08574062692071296, + "grad_norm": 4.555895805358887, + "learning_rate": 1.965703749231715e-05, + "loss": 1.7004, + "step": 1116 + }, + { + "epoch": 0.08589428395820528, + "grad_norm": 4.939544677734375, + "learning_rate": 1.965642286416718e-05, + "loss": 1.7204, + "step": 1118 + }, + { + "epoch": 0.0860479409956976, + "grad_norm": 4.291268348693848, + "learning_rate": 1.965580823601721e-05, + "loss": 1.6592, + "step": 1120 + }, + { + "epoch": 0.08620159803318991, + "grad_norm": 5.050233840942383, + "learning_rate": 1.9655193607867244e-05, + "loss": 1.7656, + "step": 1122 + }, + { + "epoch": 0.08635525507068224, + "grad_norm": 4.5748090744018555, + "learning_rate": 1.9654578979717273e-05, + "loss": 1.8491, + "step": 1124 + }, + { + "epoch": 0.08650891210817456, + "grad_norm": 4.3803391456604, + "learning_rate": 1.9653964351567303e-05, + "loss": 1.697, + "step": 1126 + }, + { + "epoch": 0.08666256914566688, + "grad_norm": 5.346717834472656, + "learning_rate": 1.9653349723417333e-05, + "loss": 1.817, + "step": 1128 + }, + { + "epoch": 0.08681622618315919, + "grad_norm": 7.8311944007873535, + "learning_rate": 1.9652735095267366e-05, + "loss": 1.7637, + "step": 1130 + }, + { + "epoch": 0.08696988322065151, + "grad_norm": 4.9822845458984375, + "learning_rate": 1.9652120467117395e-05, + "loss": 1.6673, + "step": 1132 + }, + { + "epoch": 0.08712354025814383, + "grad_norm": 5.492576599121094, + "learning_rate": 1.9651505838967425e-05, + "loss": 1.7831, + "step": 1134 + }, + { + "epoch": 0.08727719729563614, + "grad_norm": 5.493293285369873, + "learning_rate": 1.9650891210817458e-05, + "loss": 1.698, + "step": 1136 + }, + { + "epoch": 0.08743085433312846, + "grad_norm": 6.50536584854126, + "learning_rate": 1.9650276582667488e-05, + "loss": 1.7632, + "step": 1138 + }, + { + "epoch": 0.08758451137062077, + "grad_norm": 4.753702640533447, + "learning_rate": 1.9649661954517518e-05, + "loss": 1.64, + "step": 1140 + }, + { + "epoch": 0.08773816840811309, + "grad_norm": 4.77907657623291, + "learning_rate": 1.964904732636755e-05, + "loss": 1.9073, + "step": 1142 + }, + { + "epoch": 0.0878918254456054, + "grad_norm": 4.242752552032471, + "learning_rate": 1.964843269821758e-05, + "loss": 1.7625, + "step": 1144 + }, + { + "epoch": 0.08804548248309772, + "grad_norm": 5.127992630004883, + "learning_rate": 1.964781807006761e-05, + "loss": 1.6271, + "step": 1146 + }, + { + "epoch": 0.08819913952059004, + "grad_norm": 4.7297749519348145, + "learning_rate": 1.9647203441917643e-05, + "loss": 1.6252, + "step": 1148 + }, + { + "epoch": 0.08835279655808236, + "grad_norm": 5.08091926574707, + "learning_rate": 1.9646588813767673e-05, + "loss": 1.7145, + "step": 1150 + }, + { + "epoch": 0.08850645359557467, + "grad_norm": 4.986452102661133, + "learning_rate": 1.9645974185617702e-05, + "loss": 1.8065, + "step": 1152 + }, + { + "epoch": 0.08866011063306699, + "grad_norm": 5.4139204025268555, + "learning_rate": 1.9645359557467732e-05, + "loss": 1.6284, + "step": 1154 + }, + { + "epoch": 0.08881376767055932, + "grad_norm": 5.257991313934326, + "learning_rate": 1.9644744929317765e-05, + "loss": 1.8338, + "step": 1156 + }, + { + "epoch": 0.08896742470805163, + "grad_norm": 4.872262001037598, + "learning_rate": 1.9644130301167795e-05, + "loss": 1.8488, + "step": 1158 + }, + { + "epoch": 0.08912108174554395, + "grad_norm": 5.82914924621582, + "learning_rate": 1.9643515673017825e-05, + "loss": 1.7731, + "step": 1160 + }, + { + "epoch": 0.08927473878303627, + "grad_norm": 4.288515090942383, + "learning_rate": 1.9642901044867858e-05, + "loss": 1.6748, + "step": 1162 + }, + { + "epoch": 0.08942839582052858, + "grad_norm": 4.828827381134033, + "learning_rate": 1.9642286416717887e-05, + "loss": 1.7893, + "step": 1164 + }, + { + "epoch": 0.0895820528580209, + "grad_norm": 4.2209577560424805, + "learning_rate": 1.9641671788567917e-05, + "loss": 1.6139, + "step": 1166 + }, + { + "epoch": 0.08973570989551322, + "grad_norm": 5.542986869812012, + "learning_rate": 1.964105716041795e-05, + "loss": 1.5873, + "step": 1168 + }, + { + "epoch": 0.08988936693300553, + "grad_norm": 4.995078086853027, + "learning_rate": 1.964044253226798e-05, + "loss": 1.5976, + "step": 1170 + }, + { + "epoch": 0.09004302397049785, + "grad_norm": 4.493627071380615, + "learning_rate": 1.963982790411801e-05, + "loss": 1.6423, + "step": 1172 + }, + { + "epoch": 0.09019668100799016, + "grad_norm": 5.041021823883057, + "learning_rate": 1.9639213275968043e-05, + "loss": 1.6996, + "step": 1174 + }, + { + "epoch": 0.09035033804548248, + "grad_norm": 5.527616024017334, + "learning_rate": 1.9638598647818072e-05, + "loss": 1.7338, + "step": 1176 + }, + { + "epoch": 0.0905039950829748, + "grad_norm": 4.937674045562744, + "learning_rate": 1.9637984019668102e-05, + "loss": 1.8472, + "step": 1178 + }, + { + "epoch": 0.09065765212046711, + "grad_norm": 5.515735149383545, + "learning_rate": 1.963736939151813e-05, + "loss": 1.8828, + "step": 1180 + }, + { + "epoch": 0.09081130915795943, + "grad_norm": 5.273295879364014, + "learning_rate": 1.9636754763368165e-05, + "loss": 1.6819, + "step": 1182 + }, + { + "epoch": 0.09096496619545175, + "grad_norm": 4.745762825012207, + "learning_rate": 1.9636140135218194e-05, + "loss": 1.7752, + "step": 1184 + }, + { + "epoch": 0.09111862323294406, + "grad_norm": 4.8165130615234375, + "learning_rate": 1.9635525507068224e-05, + "loss": 1.6133, + "step": 1186 + }, + { + "epoch": 0.09127228027043639, + "grad_norm": 4.156392574310303, + "learning_rate": 1.9634910878918257e-05, + "loss": 1.7126, + "step": 1188 + }, + { + "epoch": 0.09142593730792871, + "grad_norm": 4.523599624633789, + "learning_rate": 1.9634296250768287e-05, + "loss": 1.616, + "step": 1190 + }, + { + "epoch": 0.09157959434542103, + "grad_norm": 4.08304500579834, + "learning_rate": 1.9633681622618316e-05, + "loss": 1.6602, + "step": 1192 + }, + { + "epoch": 0.09173325138291334, + "grad_norm": 4.6315202713012695, + "learning_rate": 1.963306699446835e-05, + "loss": 1.7004, + "step": 1194 + }, + { + "epoch": 0.09188690842040566, + "grad_norm": 4.613175868988037, + "learning_rate": 1.963245236631838e-05, + "loss": 1.7607, + "step": 1196 + }, + { + "epoch": 0.09204056545789797, + "grad_norm": 5.473268508911133, + "learning_rate": 1.963183773816841e-05, + "loss": 1.7788, + "step": 1198 + }, + { + "epoch": 0.09219422249539029, + "grad_norm": 4.436158657073975, + "learning_rate": 1.963122311001844e-05, + "loss": 1.507, + "step": 1200 + }, + { + "epoch": 0.0923478795328826, + "grad_norm": 4.081661701202393, + "learning_rate": 1.963060848186847e-05, + "loss": 1.6724, + "step": 1202 + }, + { + "epoch": 0.09250153657037492, + "grad_norm": 4.4669318199157715, + "learning_rate": 1.96299938537185e-05, + "loss": 1.6776, + "step": 1204 + }, + { + "epoch": 0.09265519360786724, + "grad_norm": 4.446565628051758, + "learning_rate": 1.962937922556853e-05, + "loss": 1.7225, + "step": 1206 + }, + { + "epoch": 0.09280885064535956, + "grad_norm": 4.1054816246032715, + "learning_rate": 1.9628764597418564e-05, + "loss": 1.5964, + "step": 1208 + }, + { + "epoch": 0.09296250768285187, + "grad_norm": 3.9810330867767334, + "learning_rate": 1.9628149969268594e-05, + "loss": 1.6933, + "step": 1210 + }, + { + "epoch": 0.09311616472034419, + "grad_norm": 5.808743476867676, + "learning_rate": 1.9627535341118623e-05, + "loss": 1.796, + "step": 1212 + }, + { + "epoch": 0.0932698217578365, + "grad_norm": 5.4757795333862305, + "learning_rate": 1.9626920712968657e-05, + "loss": 1.6707, + "step": 1214 + }, + { + "epoch": 0.09342347879532882, + "grad_norm": 4.4285430908203125, + "learning_rate": 1.9626306084818686e-05, + "loss": 1.6214, + "step": 1216 + }, + { + "epoch": 0.09357713583282114, + "grad_norm": 4.347642421722412, + "learning_rate": 1.9625691456668716e-05, + "loss": 1.7012, + "step": 1218 + }, + { + "epoch": 0.09373079287031345, + "grad_norm": 4.711369037628174, + "learning_rate": 1.962507682851875e-05, + "loss": 1.6307, + "step": 1220 + }, + { + "epoch": 0.09388444990780578, + "grad_norm": 4.427557945251465, + "learning_rate": 1.962446220036878e-05, + "loss": 1.7988, + "step": 1222 + }, + { + "epoch": 0.0940381069452981, + "grad_norm": 4.078681945800781, + "learning_rate": 1.962384757221881e-05, + "loss": 1.6596, + "step": 1224 + }, + { + "epoch": 0.09419176398279042, + "grad_norm": 4.403939247131348, + "learning_rate": 1.9623232944068838e-05, + "loss": 1.7761, + "step": 1226 + }, + { + "epoch": 0.09434542102028273, + "grad_norm": 4.222808361053467, + "learning_rate": 1.962261831591887e-05, + "loss": 1.7145, + "step": 1228 + }, + { + "epoch": 0.09449907805777505, + "grad_norm": 4.754458904266357, + "learning_rate": 1.96220036877689e-05, + "loss": 1.6256, + "step": 1230 + }, + { + "epoch": 0.09465273509526737, + "grad_norm": 4.7824201583862305, + "learning_rate": 1.962138905961893e-05, + "loss": 1.7846, + "step": 1232 + }, + { + "epoch": 0.09480639213275968, + "grad_norm": 4.751527786254883, + "learning_rate": 1.9620774431468964e-05, + "loss": 1.6602, + "step": 1234 + }, + { + "epoch": 0.094960049170252, + "grad_norm": 4.74616813659668, + "learning_rate": 1.9620159803318993e-05, + "loss": 1.6707, + "step": 1236 + }, + { + "epoch": 0.09511370620774431, + "grad_norm": 5.099863052368164, + "learning_rate": 1.9619545175169023e-05, + "loss": 1.7293, + "step": 1238 + }, + { + "epoch": 0.09526736324523663, + "grad_norm": 5.293537616729736, + "learning_rate": 1.9618930547019056e-05, + "loss": 1.7172, + "step": 1240 + }, + { + "epoch": 0.09542102028272895, + "grad_norm": 5.443637847900391, + "learning_rate": 1.9618315918869086e-05, + "loss": 1.7516, + "step": 1242 + }, + { + "epoch": 0.09557467732022126, + "grad_norm": 4.447843551635742, + "learning_rate": 1.9617701290719115e-05, + "loss": 1.7449, + "step": 1244 + }, + { + "epoch": 0.09572833435771358, + "grad_norm": 4.490113258361816, + "learning_rate": 1.961708666256915e-05, + "loss": 1.6253, + "step": 1246 + }, + { + "epoch": 0.0958819913952059, + "grad_norm": 4.979306221008301, + "learning_rate": 1.9616472034419178e-05, + "loss": 1.6743, + "step": 1248 + }, + { + "epoch": 0.09603564843269821, + "grad_norm": 4.146381855010986, + "learning_rate": 1.961585740626921e-05, + "loss": 1.7014, + "step": 1250 + }, + { + "epoch": 0.09618930547019053, + "grad_norm": 4.571809768676758, + "learning_rate": 1.9615242778119237e-05, + "loss": 1.7867, + "step": 1252 + }, + { + "epoch": 0.09634296250768286, + "grad_norm": 4.7382988929748535, + "learning_rate": 1.961462814996927e-05, + "loss": 1.7669, + "step": 1254 + }, + { + "epoch": 0.09649661954517517, + "grad_norm": 4.332629203796387, + "learning_rate": 1.96140135218193e-05, + "loss": 1.8072, + "step": 1256 + }, + { + "epoch": 0.09665027658266749, + "grad_norm": 4.376523494720459, + "learning_rate": 1.961339889366933e-05, + "loss": 1.7595, + "step": 1258 + }, + { + "epoch": 0.09680393362015981, + "grad_norm": 4.876426696777344, + "learning_rate": 1.9612784265519363e-05, + "loss": 1.5174, + "step": 1260 + }, + { + "epoch": 0.09695759065765212, + "grad_norm": 4.8033905029296875, + "learning_rate": 1.9612169637369393e-05, + "loss": 1.7284, + "step": 1262 + }, + { + "epoch": 0.09711124769514444, + "grad_norm": 4.221518039703369, + "learning_rate": 1.9611555009219422e-05, + "loss": 1.7409, + "step": 1264 + }, + { + "epoch": 0.09726490473263676, + "grad_norm": 4.445687294006348, + "learning_rate": 1.9610940381069455e-05, + "loss": 1.6211, + "step": 1266 + }, + { + "epoch": 0.09741856177012907, + "grad_norm": 4.590234279632568, + "learning_rate": 1.9610325752919485e-05, + "loss": 1.5695, + "step": 1268 + }, + { + "epoch": 0.09757221880762139, + "grad_norm": 5.60252571105957, + "learning_rate": 1.9609711124769518e-05, + "loss": 1.6369, + "step": 1270 + }, + { + "epoch": 0.0977258758451137, + "grad_norm": 4.938035011291504, + "learning_rate": 1.9609096496619548e-05, + "loss": 1.7329, + "step": 1272 + }, + { + "epoch": 0.09787953288260602, + "grad_norm": 5.1367106437683105, + "learning_rate": 1.9608481868469578e-05, + "loss": 1.8568, + "step": 1274 + }, + { + "epoch": 0.09803318992009834, + "grad_norm": 4.405098915100098, + "learning_rate": 1.960786724031961e-05, + "loss": 1.6226, + "step": 1276 + }, + { + "epoch": 0.09818684695759065, + "grad_norm": 5.822478771209717, + "learning_rate": 1.9607252612169637e-05, + "loss": 1.7561, + "step": 1278 + }, + { + "epoch": 0.09834050399508297, + "grad_norm": 4.770538806915283, + "learning_rate": 1.960663798401967e-05, + "loss": 1.6214, + "step": 1280 + }, + { + "epoch": 0.09849416103257529, + "grad_norm": 6.316437244415283, + "learning_rate": 1.96060233558697e-05, + "loss": 1.8268, + "step": 1282 + }, + { + "epoch": 0.0986478180700676, + "grad_norm": 4.640567302703857, + "learning_rate": 1.960540872771973e-05, + "loss": 1.7484, + "step": 1284 + }, + { + "epoch": 0.09880147510755993, + "grad_norm": 4.596543312072754, + "learning_rate": 1.9604794099569762e-05, + "loss": 1.6986, + "step": 1286 + }, + { + "epoch": 0.09895513214505225, + "grad_norm": 4.36724328994751, + "learning_rate": 1.9604179471419792e-05, + "loss": 1.6108, + "step": 1288 + }, + { + "epoch": 0.09910878918254457, + "grad_norm": 5.017337322235107, + "learning_rate": 1.9603564843269825e-05, + "loss": 1.7047, + "step": 1290 + }, + { + "epoch": 0.09926244622003688, + "grad_norm": 4.327188491821289, + "learning_rate": 1.9602950215119855e-05, + "loss": 1.6083, + "step": 1292 + }, + { + "epoch": 0.0994161032575292, + "grad_norm": 5.734022617340088, + "learning_rate": 1.9602335586969885e-05, + "loss": 1.7501, + "step": 1294 + }, + { + "epoch": 0.09956976029502151, + "grad_norm": 4.524082183837891, + "learning_rate": 1.9601720958819918e-05, + "loss": 1.7556, + "step": 1296 + }, + { + "epoch": 0.09972341733251383, + "grad_norm": 4.61295747756958, + "learning_rate": 1.9601106330669947e-05, + "loss": 1.6598, + "step": 1298 + }, + { + "epoch": 0.09987707437000615, + "grad_norm": 4.453684329986572, + "learning_rate": 1.9600491702519977e-05, + "loss": 1.6554, + "step": 1300 + }, + { + "epoch": 0.10003073140749846, + "grad_norm": 4.732148170471191, + "learning_rate": 1.959987707437001e-05, + "loss": 1.7181, + "step": 1302 + }, + { + "epoch": 0.10018438844499078, + "grad_norm": 4.715574741363525, + "learning_rate": 1.9599262446220036e-05, + "loss": 1.6849, + "step": 1304 + }, + { + "epoch": 0.1003380454824831, + "grad_norm": 4.356414318084717, + "learning_rate": 1.959864781807007e-05, + "loss": 1.687, + "step": 1306 + }, + { + "epoch": 0.10049170251997541, + "grad_norm": 4.813374996185303, + "learning_rate": 1.95980331899201e-05, + "loss": 1.5262, + "step": 1308 + }, + { + "epoch": 0.10064535955746773, + "grad_norm": 4.9926981925964355, + "learning_rate": 1.959741856177013e-05, + "loss": 1.8137, + "step": 1310 + }, + { + "epoch": 0.10079901659496004, + "grad_norm": 5.103787422180176, + "learning_rate": 1.9596803933620162e-05, + "loss": 1.64, + "step": 1312 + }, + { + "epoch": 0.10095267363245236, + "grad_norm": 4.895768165588379, + "learning_rate": 1.959618930547019e-05, + "loss": 1.693, + "step": 1314 + }, + { + "epoch": 0.10110633066994468, + "grad_norm": 4.513470649719238, + "learning_rate": 1.9595574677320225e-05, + "loss": 1.5745, + "step": 1316 + }, + { + "epoch": 0.10125998770743701, + "grad_norm": 5.475149154663086, + "learning_rate": 1.9594960049170254e-05, + "loss": 1.6189, + "step": 1318 + }, + { + "epoch": 0.10141364474492932, + "grad_norm": 4.828972339630127, + "learning_rate": 1.9594345421020284e-05, + "loss": 1.6547, + "step": 1320 + }, + { + "epoch": 0.10156730178242164, + "grad_norm": 5.218929290771484, + "learning_rate": 1.9593730792870317e-05, + "loss": 1.6494, + "step": 1322 + }, + { + "epoch": 0.10172095881991396, + "grad_norm": 4.358766078948975, + "learning_rate": 1.9593116164720343e-05, + "loss": 1.7283, + "step": 1324 + }, + { + "epoch": 0.10187461585740627, + "grad_norm": 4.14285135269165, + "learning_rate": 1.9592501536570376e-05, + "loss": 1.7769, + "step": 1326 + }, + { + "epoch": 0.10202827289489859, + "grad_norm": 4.319285869598389, + "learning_rate": 1.9591886908420406e-05, + "loss": 1.4707, + "step": 1328 + }, + { + "epoch": 0.1021819299323909, + "grad_norm": 5.230128288269043, + "learning_rate": 1.9591272280270436e-05, + "loss": 1.4906, + "step": 1330 + }, + { + "epoch": 0.10233558696988322, + "grad_norm": 5.243448257446289, + "learning_rate": 1.959065765212047e-05, + "loss": 1.8825, + "step": 1332 + }, + { + "epoch": 0.10248924400737554, + "grad_norm": 4.784072399139404, + "learning_rate": 1.95900430239705e-05, + "loss": 1.7553, + "step": 1334 + }, + { + "epoch": 0.10264290104486785, + "grad_norm": 5.595427513122559, + "learning_rate": 1.958942839582053e-05, + "loss": 1.6913, + "step": 1336 + }, + { + "epoch": 0.10279655808236017, + "grad_norm": 4.856276512145996, + "learning_rate": 1.958881376767056e-05, + "loss": 1.7659, + "step": 1338 + }, + { + "epoch": 0.10295021511985249, + "grad_norm": 5.188042640686035, + "learning_rate": 1.958819913952059e-05, + "loss": 1.8205, + "step": 1340 + }, + { + "epoch": 0.1031038721573448, + "grad_norm": 4.261306285858154, + "learning_rate": 1.9587584511370624e-05, + "loss": 1.5887, + "step": 1342 + }, + { + "epoch": 0.10325752919483712, + "grad_norm": 4.269975185394287, + "learning_rate": 1.9586969883220654e-05, + "loss": 1.5869, + "step": 1344 + }, + { + "epoch": 0.10341118623232944, + "grad_norm": 5.029308795928955, + "learning_rate": 1.9586355255070683e-05, + "loss": 1.8049, + "step": 1346 + }, + { + "epoch": 0.10356484326982175, + "grad_norm": 4.857789516448975, + "learning_rate": 1.9585740626920716e-05, + "loss": 1.5154, + "step": 1348 + }, + { + "epoch": 0.10371850030731407, + "grad_norm": 4.701939582824707, + "learning_rate": 1.9585125998770743e-05, + "loss": 1.6822, + "step": 1350 + }, + { + "epoch": 0.1038721573448064, + "grad_norm": 4.069787979125977, + "learning_rate": 1.9584511370620776e-05, + "loss": 1.7238, + "step": 1352 + }, + { + "epoch": 0.10402581438229871, + "grad_norm": 4.703420162200928, + "learning_rate": 1.9583896742470806e-05, + "loss": 1.6951, + "step": 1354 + }, + { + "epoch": 0.10417947141979103, + "grad_norm": 4.920733451843262, + "learning_rate": 1.958328211432084e-05, + "loss": 1.6335, + "step": 1356 + }, + { + "epoch": 0.10433312845728335, + "grad_norm": 4.38323974609375, + "learning_rate": 1.9582667486170868e-05, + "loss": 1.7094, + "step": 1358 + }, + { + "epoch": 0.10448678549477566, + "grad_norm": 4.646501541137695, + "learning_rate": 1.9582052858020898e-05, + "loss": 1.6878, + "step": 1360 + }, + { + "epoch": 0.10464044253226798, + "grad_norm": 4.569819450378418, + "learning_rate": 1.958143822987093e-05, + "loss": 1.7198, + "step": 1362 + }, + { + "epoch": 0.1047940995697603, + "grad_norm": 4.552595615386963, + "learning_rate": 1.958082360172096e-05, + "loss": 1.7335, + "step": 1364 + }, + { + "epoch": 0.10494775660725261, + "grad_norm": 3.9051506519317627, + "learning_rate": 1.958020897357099e-05, + "loss": 1.5694, + "step": 1366 + }, + { + "epoch": 0.10510141364474493, + "grad_norm": 3.9420411586761475, + "learning_rate": 1.9579594345421023e-05, + "loss": 1.7298, + "step": 1368 + }, + { + "epoch": 0.10525507068223725, + "grad_norm": 4.996294021606445, + "learning_rate": 1.9578979717271053e-05, + "loss": 1.7769, + "step": 1370 + }, + { + "epoch": 0.10540872771972956, + "grad_norm": 4.845794677734375, + "learning_rate": 1.9578365089121083e-05, + "loss": 1.694, + "step": 1372 + }, + { + "epoch": 0.10556238475722188, + "grad_norm": 4.156089782714844, + "learning_rate": 1.9577750460971116e-05, + "loss": 1.5621, + "step": 1374 + }, + { + "epoch": 0.1057160417947142, + "grad_norm": 5.298906326293945, + "learning_rate": 1.9577135832821146e-05, + "loss": 1.6016, + "step": 1376 + }, + { + "epoch": 0.10586969883220651, + "grad_norm": 4.974923610687256, + "learning_rate": 1.9576521204671175e-05, + "loss": 1.9024, + "step": 1378 + }, + { + "epoch": 0.10602335586969883, + "grad_norm": 4.5802998542785645, + "learning_rate": 1.9575906576521205e-05, + "loss": 1.8249, + "step": 1380 + }, + { + "epoch": 0.10617701290719114, + "grad_norm": 5.364488124847412, + "learning_rate": 1.9575291948371238e-05, + "loss": 1.6205, + "step": 1382 + }, + { + "epoch": 0.10633066994468347, + "grad_norm": 4.810891151428223, + "learning_rate": 1.9574677320221268e-05, + "loss": 1.6702, + "step": 1384 + }, + { + "epoch": 0.10648432698217579, + "grad_norm": 5.155327320098877, + "learning_rate": 1.9574062692071297e-05, + "loss": 1.6251, + "step": 1386 + }, + { + "epoch": 0.1066379840196681, + "grad_norm": 4.292688369750977, + "learning_rate": 1.957344806392133e-05, + "loss": 1.505, + "step": 1388 + }, + { + "epoch": 0.10679164105716042, + "grad_norm": 4.611319541931152, + "learning_rate": 1.957283343577136e-05, + "loss": 1.7301, + "step": 1390 + }, + { + "epoch": 0.10694529809465274, + "grad_norm": 4.324422359466553, + "learning_rate": 1.957221880762139e-05, + "loss": 1.6551, + "step": 1392 + }, + { + "epoch": 0.10709895513214505, + "grad_norm": 4.826112747192383, + "learning_rate": 1.9571604179471423e-05, + "loss": 1.607, + "step": 1394 + }, + { + "epoch": 0.10725261216963737, + "grad_norm": 4.303924560546875, + "learning_rate": 1.9570989551321453e-05, + "loss": 1.6128, + "step": 1396 + }, + { + "epoch": 0.10740626920712969, + "grad_norm": 5.093891620635986, + "learning_rate": 1.9570374923171482e-05, + "loss": 1.6747, + "step": 1398 + }, + { + "epoch": 0.107559926244622, + "grad_norm": 4.303253650665283, + "learning_rate": 1.9569760295021515e-05, + "loss": 1.5981, + "step": 1400 + }, + { + "epoch": 0.10771358328211432, + "grad_norm": 4.165480613708496, + "learning_rate": 1.9569145666871545e-05, + "loss": 1.5993, + "step": 1402 + }, + { + "epoch": 0.10786724031960664, + "grad_norm": 4.655346393585205, + "learning_rate": 1.9568531038721575e-05, + "loss": 1.6806, + "step": 1404 + }, + { + "epoch": 0.10802089735709895, + "grad_norm": 4.743736743927002, + "learning_rate": 1.9567916410571604e-05, + "loss": 1.6624, + "step": 1406 + }, + { + "epoch": 0.10817455439459127, + "grad_norm": 4.2791643142700195, + "learning_rate": 1.9567301782421637e-05, + "loss": 1.6776, + "step": 1408 + }, + { + "epoch": 0.10832821143208358, + "grad_norm": 5.005465030670166, + "learning_rate": 1.9566687154271667e-05, + "loss": 1.5962, + "step": 1410 + }, + { + "epoch": 0.1084818684695759, + "grad_norm": 4.345304012298584, + "learning_rate": 1.9566072526121697e-05, + "loss": 1.5865, + "step": 1412 + }, + { + "epoch": 0.10863552550706822, + "grad_norm": 3.8712103366851807, + "learning_rate": 1.956545789797173e-05, + "loss": 1.6583, + "step": 1414 + }, + { + "epoch": 0.10878918254456055, + "grad_norm": 4.381411075592041, + "learning_rate": 1.956484326982176e-05, + "loss": 1.6203, + "step": 1416 + }, + { + "epoch": 0.10894283958205286, + "grad_norm": 3.933609962463379, + "learning_rate": 1.956422864167179e-05, + "loss": 1.7631, + "step": 1418 + }, + { + "epoch": 0.10909649661954518, + "grad_norm": 5.570189952850342, + "learning_rate": 1.9563614013521822e-05, + "loss": 1.7173, + "step": 1420 + }, + { + "epoch": 0.1092501536570375, + "grad_norm": 4.816314220428467, + "learning_rate": 1.9562999385371852e-05, + "loss": 1.6941, + "step": 1422 + }, + { + "epoch": 0.10940381069452981, + "grad_norm": 4.110052585601807, + "learning_rate": 1.9562384757221882e-05, + "loss": 1.6782, + "step": 1424 + }, + { + "epoch": 0.10955746773202213, + "grad_norm": 4.069727420806885, + "learning_rate": 1.956177012907191e-05, + "loss": 1.657, + "step": 1426 + }, + { + "epoch": 0.10971112476951445, + "grad_norm": 5.244446277618408, + "learning_rate": 1.9561155500921944e-05, + "loss": 1.683, + "step": 1428 + }, + { + "epoch": 0.10986478180700676, + "grad_norm": 5.359142780303955, + "learning_rate": 1.9560540872771974e-05, + "loss": 1.6764, + "step": 1430 + }, + { + "epoch": 0.11001843884449908, + "grad_norm": 5.057417869567871, + "learning_rate": 1.9559926244622004e-05, + "loss": 1.787, + "step": 1432 + }, + { + "epoch": 0.1101720958819914, + "grad_norm": 4.59893274307251, + "learning_rate": 1.9559311616472037e-05, + "loss": 1.6119, + "step": 1434 + }, + { + "epoch": 0.11032575291948371, + "grad_norm": 4.8417744636535645, + "learning_rate": 1.9558696988322067e-05, + "loss": 1.7965, + "step": 1436 + }, + { + "epoch": 0.11047940995697603, + "grad_norm": 4.829365253448486, + "learning_rate": 1.9558082360172096e-05, + "loss": 1.6125, + "step": 1438 + }, + { + "epoch": 0.11063306699446834, + "grad_norm": 4.74966287612915, + "learning_rate": 1.955746773202213e-05, + "loss": 1.5761, + "step": 1440 + }, + { + "epoch": 0.11078672403196066, + "grad_norm": 4.8681535720825195, + "learning_rate": 1.955685310387216e-05, + "loss": 1.5977, + "step": 1442 + }, + { + "epoch": 0.11094038106945298, + "grad_norm": 4.576766014099121, + "learning_rate": 1.955623847572219e-05, + "loss": 1.6297, + "step": 1444 + }, + { + "epoch": 0.11109403810694529, + "grad_norm": 4.206700325012207, + "learning_rate": 1.9555623847572222e-05, + "loss": 1.6246, + "step": 1446 + }, + { + "epoch": 0.11124769514443761, + "grad_norm": 4.753570079803467, + "learning_rate": 1.955500921942225e-05, + "loss": 1.6627, + "step": 1448 + }, + { + "epoch": 0.11140135218192994, + "grad_norm": 4.992982864379883, + "learning_rate": 1.955439459127228e-05, + "loss": 1.7223, + "step": 1450 + }, + { + "epoch": 0.11155500921942225, + "grad_norm": 4.912965297698975, + "learning_rate": 1.955377996312231e-05, + "loss": 1.5616, + "step": 1452 + }, + { + "epoch": 0.11170866625691457, + "grad_norm": 4.4759840965271, + "learning_rate": 1.9553165334972344e-05, + "loss": 1.5403, + "step": 1454 + }, + { + "epoch": 0.11186232329440689, + "grad_norm": 5.181031703948975, + "learning_rate": 1.9552550706822374e-05, + "loss": 1.6365, + "step": 1456 + }, + { + "epoch": 0.1120159803318992, + "grad_norm": 4.845396518707275, + "learning_rate": 1.9551936078672403e-05, + "loss": 1.6279, + "step": 1458 + }, + { + "epoch": 0.11216963736939152, + "grad_norm": 4.756799221038818, + "learning_rate": 1.9551321450522436e-05, + "loss": 1.6919, + "step": 1460 + }, + { + "epoch": 0.11232329440688384, + "grad_norm": 5.1768107414245605, + "learning_rate": 1.9550706822372466e-05, + "loss": 1.5765, + "step": 1462 + }, + { + "epoch": 0.11247695144437615, + "grad_norm": 4.743069648742676, + "learning_rate": 1.9550092194222496e-05, + "loss": 1.8469, + "step": 1464 + }, + { + "epoch": 0.11263060848186847, + "grad_norm": 4.831038951873779, + "learning_rate": 1.954947756607253e-05, + "loss": 1.7746, + "step": 1466 + }, + { + "epoch": 0.11278426551936079, + "grad_norm": 4.309507846832275, + "learning_rate": 1.954886293792256e-05, + "loss": 1.5142, + "step": 1468 + }, + { + "epoch": 0.1129379225568531, + "grad_norm": 55.4850959777832, + "learning_rate": 1.9548248309772588e-05, + "loss": 1.683, + "step": 1470 + }, + { + "epoch": 0.11309157959434542, + "grad_norm": 4.364781856536865, + "learning_rate": 1.954763368162262e-05, + "loss": 1.8977, + "step": 1472 + }, + { + "epoch": 0.11324523663183773, + "grad_norm": 4.795863151550293, + "learning_rate": 1.954701905347265e-05, + "loss": 1.6436, + "step": 1474 + }, + { + "epoch": 0.11339889366933005, + "grad_norm": 4.47898530960083, + "learning_rate": 1.954640442532268e-05, + "loss": 1.7043, + "step": 1476 + }, + { + "epoch": 0.11355255070682237, + "grad_norm": 5.761251926422119, + "learning_rate": 1.954578979717271e-05, + "loss": 1.6943, + "step": 1478 + }, + { + "epoch": 0.11370620774431468, + "grad_norm": 5.274534225463867, + "learning_rate": 1.9545175169022743e-05, + "loss": 1.6005, + "step": 1480 + }, + { + "epoch": 0.11385986478180701, + "grad_norm": 4.412993431091309, + "learning_rate": 1.9544560540872773e-05, + "loss": 1.4977, + "step": 1482 + }, + { + "epoch": 0.11401352181929933, + "grad_norm": 4.08578634262085, + "learning_rate": 1.9543945912722803e-05, + "loss": 1.6459, + "step": 1484 + }, + { + "epoch": 0.11416717885679165, + "grad_norm": 3.7015979290008545, + "learning_rate": 1.9543331284572836e-05, + "loss": 1.5978, + "step": 1486 + }, + { + "epoch": 0.11432083589428396, + "grad_norm": 4.919078826904297, + "learning_rate": 1.9542716656422865e-05, + "loss": 1.5456, + "step": 1488 + }, + { + "epoch": 0.11447449293177628, + "grad_norm": 4.756066799163818, + "learning_rate": 1.9542102028272895e-05, + "loss": 1.7221, + "step": 1490 + }, + { + "epoch": 0.1146281499692686, + "grad_norm": 4.4432525634765625, + "learning_rate": 1.9541487400122928e-05, + "loss": 1.5918, + "step": 1492 + }, + { + "epoch": 0.11478180700676091, + "grad_norm": 5.371875286102295, + "learning_rate": 1.9540872771972958e-05, + "loss": 1.7009, + "step": 1494 + }, + { + "epoch": 0.11493546404425323, + "grad_norm": 3.5335211753845215, + "learning_rate": 1.9540258143822988e-05, + "loss": 1.6021, + "step": 1496 + }, + { + "epoch": 0.11508912108174554, + "grad_norm": 4.77205753326416, + "learning_rate": 1.953964351567302e-05, + "loss": 1.5721, + "step": 1498 + }, + { + "epoch": 0.11524277811923786, + "grad_norm": 5.020537376403809, + "learning_rate": 1.953902888752305e-05, + "loss": 1.6367, + "step": 1500 + } + ], + "logging_steps": 2, + "max_steps": 65080, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.575057282039808e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}