{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 54075, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011095700416088765, "grad_norm": 0.44862157106399536, "learning_rate": 9.246417013407304e-10, "loss": 1.1784, "step": 20 }, { "epoch": 0.002219140083217753, "grad_norm": 0.3120013475418091, "learning_rate": 1.8492834026814607e-09, "loss": 1.1808, "step": 40 }, { "epoch": 0.00332871012482663, "grad_norm": 0.22801850736141205, "learning_rate": 2.7739251040221912e-09, "loss": 1.1287, "step": 60 }, { "epoch": 0.004438280166435506, "grad_norm": 0.3955777585506439, "learning_rate": 3.6985668053629215e-09, "loss": 1.1498, "step": 80 }, { "epoch": 0.005547850208044383, "grad_norm": 0.47019973397254944, "learning_rate": 4.623208506703652e-09, "loss": 1.1717, "step": 100 }, { "epoch": 0.00665742024965326, "grad_norm": 0.3231058716773987, "learning_rate": 5.5478502080443824e-09, "loss": 1.1402, "step": 120 }, { "epoch": 0.007766990291262136, "grad_norm": 0.2898150384426117, "learning_rate": 6.472491909385113e-09, "loss": 1.1833, "step": 140 }, { "epoch": 0.008876560332871012, "grad_norm": 0.2865077257156372, "learning_rate": 7.397133610725843e-09, "loss": 1.1964, "step": 160 }, { "epoch": 0.009986130374479889, "grad_norm": 0.38055843114852905, "learning_rate": 8.321775312066573e-09, "loss": 1.1923, "step": 180 }, { "epoch": 0.011095700416088766, "grad_norm": 0.2376246601343155, "learning_rate": 9.246417013407303e-09, "loss": 1.1406, "step": 200 }, { "epoch": 0.012205270457697643, "grad_norm": 0.35449135303497314, "learning_rate": 1.0171058714748034e-08, "loss": 1.1704, "step": 220 }, { "epoch": 0.01331484049930652, "grad_norm": 0.4456291198730469, "learning_rate": 1.1095700416088765e-08, "loss": 1.1868, "step": 240 }, { "epoch": 0.014424410540915394, "grad_norm": 0.20839697122573853, "learning_rate": 1.2020342117429496e-08, "loss": 1.1654, "step": 260 }, { "epoch": 0.015533980582524271, "grad_norm": 0.41858506202697754, "learning_rate": 1.2944983818770226e-08, "loss": 1.1544, "step": 280 }, { "epoch": 0.016643550624133148, "grad_norm": 0.4229697287082672, "learning_rate": 1.3869625520110957e-08, "loss": 1.1883, "step": 300 }, { "epoch": 0.017753120665742025, "grad_norm": 0.4215857982635498, "learning_rate": 1.4794267221451686e-08, "loss": 1.1318, "step": 320 }, { "epoch": 0.0188626907073509, "grad_norm": 0.39319831132888794, "learning_rate": 1.5718908922792418e-08, "loss": 1.1315, "step": 340 }, { "epoch": 0.019972260748959778, "grad_norm": 0.2704245150089264, "learning_rate": 1.6643550624133146e-08, "loss": 1.239, "step": 360 }, { "epoch": 0.021081830790568655, "grad_norm": 0.25417041778564453, "learning_rate": 1.756819232547388e-08, "loss": 1.1537, "step": 380 }, { "epoch": 0.022191400832177532, "grad_norm": 0.3657750189304352, "learning_rate": 1.8492834026814607e-08, "loss": 1.223, "step": 400 }, { "epoch": 0.02330097087378641, "grad_norm": 0.3478376269340515, "learning_rate": 1.9417475728155338e-08, "loss": 1.1284, "step": 420 }, { "epoch": 0.024410540915395285, "grad_norm": 0.2471611499786377, "learning_rate": 2.0342117429496068e-08, "loss": 1.0695, "step": 440 }, { "epoch": 0.025520110957004162, "grad_norm": 0.21677105128765106, "learning_rate": 2.12667591308368e-08, "loss": 1.2161, "step": 460 }, { "epoch": 0.02662968099861304, "grad_norm": 0.2597770392894745, "learning_rate": 2.219140083217753e-08, "loss": 1.1675, "step": 480 }, { "epoch": 0.027739251040221916, "grad_norm": 0.3683781027793884, "learning_rate": 2.311604253351826e-08, "loss": 1.1218, "step": 500 }, { "epoch": 0.02884882108183079, "grad_norm": 0.4040614068508148, "learning_rate": 2.404068423485899e-08, "loss": 1.2517, "step": 520 }, { "epoch": 0.029958391123439666, "grad_norm": 0.38302311301231384, "learning_rate": 2.4965325936199722e-08, "loss": 1.1996, "step": 540 }, { "epoch": 0.031067961165048542, "grad_norm": 0.3522479832172394, "learning_rate": 2.5889967637540452e-08, "loss": 1.1333, "step": 560 }, { "epoch": 0.03217753120665742, "grad_norm": 0.20584695041179657, "learning_rate": 2.6814609338881183e-08, "loss": 1.0692, "step": 580 }, { "epoch": 0.033287101248266296, "grad_norm": 0.29942065477371216, "learning_rate": 2.7739251040221914e-08, "loss": 1.1579, "step": 600 }, { "epoch": 0.034396671289875176, "grad_norm": 0.3697648048400879, "learning_rate": 2.8663892741562644e-08, "loss": 1.1319, "step": 620 }, { "epoch": 0.03550624133148405, "grad_norm": 0.2926139831542969, "learning_rate": 2.9588534442903372e-08, "loss": 1.1272, "step": 640 }, { "epoch": 0.03661581137309293, "grad_norm": 0.3973024785518646, "learning_rate": 3.0513176144244106e-08, "loss": 1.1075, "step": 660 }, { "epoch": 0.0377253814147018, "grad_norm": 0.36896416544914246, "learning_rate": 3.1437817845584836e-08, "loss": 1.2397, "step": 680 }, { "epoch": 0.038834951456310676, "grad_norm": 0.21944652497768402, "learning_rate": 3.236245954692556e-08, "loss": 1.123, "step": 700 }, { "epoch": 0.039944521497919556, "grad_norm": 0.37778010964393616, "learning_rate": 3.328710124826629e-08, "loss": 1.1646, "step": 720 }, { "epoch": 0.04105409153952843, "grad_norm": 0.35726967453956604, "learning_rate": 3.421174294960703e-08, "loss": 1.1871, "step": 740 }, { "epoch": 0.04216366158113731, "grad_norm": 0.40661826729774475, "learning_rate": 3.513638465094776e-08, "loss": 1.2048, "step": 760 }, { "epoch": 0.04327323162274618, "grad_norm": 0.397662878036499, "learning_rate": 3.606102635228848e-08, "loss": 1.1275, "step": 780 }, { "epoch": 0.044382801664355064, "grad_norm": 0.36767083406448364, "learning_rate": 3.6985668053629214e-08, "loss": 1.2062, "step": 800 }, { "epoch": 0.04549237170596394, "grad_norm": 0.13555480539798737, "learning_rate": 3.791030975496995e-08, "loss": 1.0942, "step": 820 }, { "epoch": 0.04660194174757282, "grad_norm": 0.3170234262943268, "learning_rate": 3.8834951456310675e-08, "loss": 1.2095, "step": 840 }, { "epoch": 0.04771151178918169, "grad_norm": 0.42094990611076355, "learning_rate": 3.9759593157651406e-08, "loss": 1.0647, "step": 860 }, { "epoch": 0.04882108183079057, "grad_norm": 0.21436959505081177, "learning_rate": 4.0684234858992137e-08, "loss": 1.1466, "step": 880 }, { "epoch": 0.049930651872399444, "grad_norm": 0.33519357442855835, "learning_rate": 4.1608876560332874e-08, "loss": 1.2308, "step": 900 }, { "epoch": 0.051040221914008324, "grad_norm": 0.4318859875202179, "learning_rate": 4.25335182616736e-08, "loss": 1.1723, "step": 920 }, { "epoch": 0.0521497919556172, "grad_norm": 0.23297818005084991, "learning_rate": 4.345815996301433e-08, "loss": 1.1742, "step": 940 }, { "epoch": 0.05325936199722608, "grad_norm": 0.2868468463420868, "learning_rate": 4.438280166435506e-08, "loss": 1.178, "step": 960 }, { "epoch": 0.05436893203883495, "grad_norm": 0.2635517418384552, "learning_rate": 4.5307443365695797e-08, "loss": 1.2392, "step": 980 }, { "epoch": 0.05547850208044383, "grad_norm": 0.29612234234809875, "learning_rate": 4.623208506703652e-08, "loss": 1.1298, "step": 1000 }, { "epoch": 0.056588072122052704, "grad_norm": 0.23258255422115326, "learning_rate": 4.715672676837725e-08, "loss": 1.197, "step": 1020 }, { "epoch": 0.05769764216366158, "grad_norm": 0.20158788561820984, "learning_rate": 4.808136846971798e-08, "loss": 1.2257, "step": 1040 }, { "epoch": 0.05880721220527046, "grad_norm": 0.24551551043987274, "learning_rate": 4.9006010171058706e-08, "loss": 1.145, "step": 1060 }, { "epoch": 0.05991678224687933, "grad_norm": 0.2524837553501129, "learning_rate": 4.9930651872399443e-08, "loss": 1.1793, "step": 1080 }, { "epoch": 0.06102635228848821, "grad_norm": 0.3862685263156891, "learning_rate": 5.0855293573740174e-08, "loss": 1.161, "step": 1100 }, { "epoch": 0.062135922330097085, "grad_norm": 0.2651582956314087, "learning_rate": 5.1779935275080905e-08, "loss": 1.1724, "step": 1120 }, { "epoch": 0.06324549237170596, "grad_norm": 0.2645418643951416, "learning_rate": 5.270457697642163e-08, "loss": 1.1569, "step": 1140 }, { "epoch": 0.06435506241331485, "grad_norm": 0.1873205304145813, "learning_rate": 5.3629218677762366e-08, "loss": 1.1896, "step": 1160 }, { "epoch": 0.06546463245492372, "grad_norm": 0.19177453219890594, "learning_rate": 5.45538603791031e-08, "loss": 1.2085, "step": 1180 }, { "epoch": 0.06657420249653259, "grad_norm": 0.21481531858444214, "learning_rate": 5.547850208044383e-08, "loss": 1.1974, "step": 1200 }, { "epoch": 0.06768377253814147, "grad_norm": 0.31887128949165344, "learning_rate": 5.640314378178455e-08, "loss": 1.102, "step": 1220 }, { "epoch": 0.06879334257975035, "grad_norm": 0.30679967999458313, "learning_rate": 5.732778548312529e-08, "loss": 1.2065, "step": 1240 }, { "epoch": 0.06990291262135923, "grad_norm": 0.22024956345558167, "learning_rate": 5.825242718446602e-08, "loss": 1.175, "step": 1260 }, { "epoch": 0.0710124826629681, "grad_norm": 0.29602691531181335, "learning_rate": 5.9177068885806744e-08, "loss": 1.0207, "step": 1280 }, { "epoch": 0.07212205270457697, "grad_norm": 0.28703388571739197, "learning_rate": 6.010171058714747e-08, "loss": 1.1092, "step": 1300 }, { "epoch": 0.07323162274618586, "grad_norm": 0.2126881331205368, "learning_rate": 6.102635228848821e-08, "loss": 1.1546, "step": 1320 }, { "epoch": 0.07434119278779473, "grad_norm": 0.4742228090763092, "learning_rate": 6.195099398982894e-08, "loss": 1.1905, "step": 1340 }, { "epoch": 0.0754507628294036, "grad_norm": 0.2526935338973999, "learning_rate": 6.287563569116967e-08, "loss": 1.2051, "step": 1360 }, { "epoch": 0.07656033287101248, "grad_norm": 0.35156166553497314, "learning_rate": 6.38002773925104e-08, "loss": 1.0984, "step": 1380 }, { "epoch": 0.07766990291262135, "grad_norm": 0.355660617351532, "learning_rate": 6.472491909385112e-08, "loss": 1.1748, "step": 1400 }, { "epoch": 0.07877947295423024, "grad_norm": 0.4230602979660034, "learning_rate": 6.564956079519186e-08, "loss": 1.212, "step": 1420 }, { "epoch": 0.07988904299583911, "grad_norm": 0.3976147174835205, "learning_rate": 6.657420249653258e-08, "loss": 1.1657, "step": 1440 }, { "epoch": 0.08099861303744799, "grad_norm": 0.31203871965408325, "learning_rate": 6.749884419787332e-08, "loss": 1.1244, "step": 1460 }, { "epoch": 0.08210818307905686, "grad_norm": 0.41778573393821716, "learning_rate": 6.842348589921406e-08, "loss": 1.224, "step": 1480 }, { "epoch": 0.08321775312066575, "grad_norm": 0.27035555243492126, "learning_rate": 6.934812760055478e-08, "loss": 1.1, "step": 1500 }, { "epoch": 0.08432732316227462, "grad_norm": 0.34498125314712524, "learning_rate": 7.027276930189552e-08, "loss": 1.2051, "step": 1520 }, { "epoch": 0.0854368932038835, "grad_norm": 0.2543281614780426, "learning_rate": 7.119741100323624e-08, "loss": 1.2203, "step": 1540 }, { "epoch": 0.08654646324549237, "grad_norm": 0.2934819161891937, "learning_rate": 7.212205270457697e-08, "loss": 1.1473, "step": 1560 }, { "epoch": 0.08765603328710125, "grad_norm": 0.24872970581054688, "learning_rate": 7.30466944059177e-08, "loss": 1.1447, "step": 1580 }, { "epoch": 0.08876560332871013, "grad_norm": 0.40305283665657043, "learning_rate": 7.397133610725843e-08, "loss": 1.0738, "step": 1600 }, { "epoch": 0.089875173370319, "grad_norm": 0.33497557044029236, "learning_rate": 7.489597780859917e-08, "loss": 1.149, "step": 1620 }, { "epoch": 0.09098474341192787, "grad_norm": 0.4571235477924347, "learning_rate": 7.58206195099399e-08, "loss": 1.1907, "step": 1640 }, { "epoch": 0.09209431345353676, "grad_norm": 0.23668278753757477, "learning_rate": 7.674526121128063e-08, "loss": 1.1192, "step": 1660 }, { "epoch": 0.09320388349514563, "grad_norm": 0.2423340231180191, "learning_rate": 7.766990291262135e-08, "loss": 1.0773, "step": 1680 }, { "epoch": 0.09431345353675451, "grad_norm": 0.30953505635261536, "learning_rate": 7.859454461396209e-08, "loss": 1.0677, "step": 1700 }, { "epoch": 0.09542302357836338, "grad_norm": 0.27933236956596375, "learning_rate": 7.951918631530281e-08, "loss": 1.1163, "step": 1720 }, { "epoch": 0.09653259361997225, "grad_norm": 0.24581144750118256, "learning_rate": 8.044382801664355e-08, "loss": 1.2097, "step": 1740 }, { "epoch": 0.09764216366158114, "grad_norm": 0.2823852300643921, "learning_rate": 8.136846971798427e-08, "loss": 1.1019, "step": 1760 }, { "epoch": 0.09875173370319001, "grad_norm": 0.18255861103534698, "learning_rate": 8.2293111419325e-08, "loss": 1.0751, "step": 1780 }, { "epoch": 0.09986130374479889, "grad_norm": 0.23463492095470428, "learning_rate": 8.321775312066575e-08, "loss": 1.159, "step": 1800 }, { "epoch": 0.10097087378640776, "grad_norm": 0.3392224609851837, "learning_rate": 8.414239482200647e-08, "loss": 1.2215, "step": 1820 }, { "epoch": 0.10208044382801665, "grad_norm": 0.4015384614467621, "learning_rate": 8.50670365233472e-08, "loss": 1.1327, "step": 1840 }, { "epoch": 0.10319001386962552, "grad_norm": 0.2506745457649231, "learning_rate": 8.599167822468793e-08, "loss": 1.119, "step": 1860 }, { "epoch": 0.1042995839112344, "grad_norm": 0.32273781299591064, "learning_rate": 8.691631992602866e-08, "loss": 1.153, "step": 1880 }, { "epoch": 0.10540915395284327, "grad_norm": 0.22436083853244781, "learning_rate": 8.784096162736938e-08, "loss": 1.1221, "step": 1900 }, { "epoch": 0.10651872399445216, "grad_norm": 0.41386881470680237, "learning_rate": 8.876560332871012e-08, "loss": 1.1214, "step": 1920 }, { "epoch": 0.10762829403606103, "grad_norm": 0.23942351341247559, "learning_rate": 8.969024503005084e-08, "loss": 1.1011, "step": 1940 }, { "epoch": 0.1087378640776699, "grad_norm": 0.4087466597557068, "learning_rate": 9.061488673139159e-08, "loss": 1.1332, "step": 1960 }, { "epoch": 0.10984743411927878, "grad_norm": 0.4878149926662445, "learning_rate": 9.153952843273232e-08, "loss": 1.1254, "step": 1980 }, { "epoch": 0.11095700416088766, "grad_norm": 0.32123082876205444, "learning_rate": 9.246417013407304e-08, "loss": 1.0995, "step": 2000 }, { "epoch": 0.11206657420249654, "grad_norm": 0.3750733733177185, "learning_rate": 9.338881183541378e-08, "loss": 1.1586, "step": 2020 }, { "epoch": 0.11317614424410541, "grad_norm": 0.45832639932632446, "learning_rate": 9.43134535367545e-08, "loss": 1.0434, "step": 2040 }, { "epoch": 0.11428571428571428, "grad_norm": 0.47171518206596375, "learning_rate": 9.523809523809523e-08, "loss": 1.245, "step": 2060 }, { "epoch": 0.11539528432732316, "grad_norm": 0.2627200782299042, "learning_rate": 9.616273693943596e-08, "loss": 1.1494, "step": 2080 }, { "epoch": 0.11650485436893204, "grad_norm": 0.32793936133384705, "learning_rate": 9.708737864077669e-08, "loss": 1.084, "step": 2100 }, { "epoch": 0.11761442441054092, "grad_norm": 0.38566404581069946, "learning_rate": 9.801202034211741e-08, "loss": 1.1231, "step": 2120 }, { "epoch": 0.11872399445214979, "grad_norm": 0.6227118372917175, "learning_rate": 9.893666204345816e-08, "loss": 1.2392, "step": 2140 }, { "epoch": 0.11983356449375866, "grad_norm": 0.33338844776153564, "learning_rate": 9.986130374479889e-08, "loss": 1.1348, "step": 2160 }, { "epoch": 0.12094313453536755, "grad_norm": 0.24396933615207672, "learning_rate": 1.0078594544613962e-07, "loss": 1.1444, "step": 2180 }, { "epoch": 0.12205270457697642, "grad_norm": 0.29720935225486755, "learning_rate": 1.0171058714748035e-07, "loss": 1.1379, "step": 2200 }, { "epoch": 0.1231622746185853, "grad_norm": 0.22837242484092712, "learning_rate": 1.0263522884882107e-07, "loss": 1.1409, "step": 2220 }, { "epoch": 0.12427184466019417, "grad_norm": 0.20681488513946533, "learning_rate": 1.0355987055016181e-07, "loss": 1.172, "step": 2240 }, { "epoch": 0.12538141470180306, "grad_norm": 0.251580148935318, "learning_rate": 1.0448451225150253e-07, "loss": 1.1073, "step": 2260 }, { "epoch": 0.12649098474341192, "grad_norm": 0.29831400513648987, "learning_rate": 1.0540915395284326e-07, "loss": 1.0839, "step": 2280 }, { "epoch": 0.1276005547850208, "grad_norm": 0.34739160537719727, "learning_rate": 1.0633379565418401e-07, "loss": 1.1509, "step": 2300 }, { "epoch": 0.1287101248266297, "grad_norm": 0.32550665736198425, "learning_rate": 1.0725843735552473e-07, "loss": 1.1788, "step": 2320 }, { "epoch": 0.12981969486823855, "grad_norm": 0.3324652314186096, "learning_rate": 1.0818307905686546e-07, "loss": 1.126, "step": 2340 }, { "epoch": 0.13092926490984744, "grad_norm": 0.4541341960430145, "learning_rate": 1.091077207582062e-07, "loss": 1.0964, "step": 2360 }, { "epoch": 0.13203883495145632, "grad_norm": 0.49399781227111816, "learning_rate": 1.1003236245954692e-07, "loss": 1.1449, "step": 2380 }, { "epoch": 0.13314840499306518, "grad_norm": 0.3027644753456116, "learning_rate": 1.1095700416088766e-07, "loss": 1.0903, "step": 2400 }, { "epoch": 0.13425797503467407, "grad_norm": 0.32247117161750793, "learning_rate": 1.1188164586222838e-07, "loss": 1.0205, "step": 2420 }, { "epoch": 0.13536754507628293, "grad_norm": 0.2825808823108673, "learning_rate": 1.128062875635691e-07, "loss": 1.0647, "step": 2440 }, { "epoch": 0.13647711511789182, "grad_norm": 0.24574926495552063, "learning_rate": 1.1373092926490985e-07, "loss": 1.1129, "step": 2460 }, { "epoch": 0.1375866851595007, "grad_norm": 0.1870080679655075, "learning_rate": 1.1465557096625058e-07, "loss": 1.0684, "step": 2480 }, { "epoch": 0.13869625520110956, "grad_norm": 0.3495355546474457, "learning_rate": 1.155802126675913e-07, "loss": 1.0948, "step": 2500 }, { "epoch": 0.13980582524271845, "grad_norm": 0.439216673374176, "learning_rate": 1.1650485436893204e-07, "loss": 1.156, "step": 2520 }, { "epoch": 0.1409153952843273, "grad_norm": 0.2272769957780838, "learning_rate": 1.1742949607027276e-07, "loss": 1.0684, "step": 2540 }, { "epoch": 0.1420249653259362, "grad_norm": 0.3861547112464905, "learning_rate": 1.1835413777161349e-07, "loss": 1.0817, "step": 2560 }, { "epoch": 0.14313453536754508, "grad_norm": 0.3330051302909851, "learning_rate": 1.1927877947295422e-07, "loss": 1.0578, "step": 2580 }, { "epoch": 0.14424410540915394, "grad_norm": 0.2546771466732025, "learning_rate": 1.2020342117429495e-07, "loss": 1.0279, "step": 2600 }, { "epoch": 0.14535367545076283, "grad_norm": 0.3805169463157654, "learning_rate": 1.211280628756357e-07, "loss": 1.0598, "step": 2620 }, { "epoch": 0.14646324549237172, "grad_norm": 0.2446746826171875, "learning_rate": 1.2205270457697642e-07, "loss": 1.1417, "step": 2640 }, { "epoch": 0.14757281553398058, "grad_norm": 0.4452511966228485, "learning_rate": 1.2297734627831715e-07, "loss": 1.0638, "step": 2660 }, { "epoch": 0.14868238557558947, "grad_norm": 0.1700054258108139, "learning_rate": 1.2390198797965787e-07, "loss": 1.0659, "step": 2680 }, { "epoch": 0.14979195561719832, "grad_norm": 0.207583487033844, "learning_rate": 1.248266296809986e-07, "loss": 1.0412, "step": 2700 }, { "epoch": 0.1509015256588072, "grad_norm": 0.43273523449897766, "learning_rate": 1.2575127138233935e-07, "loss": 1.1948, "step": 2720 }, { "epoch": 0.1520110957004161, "grad_norm": 0.3201959431171417, "learning_rate": 1.2667591308368007e-07, "loss": 1.0085, "step": 2740 }, { "epoch": 0.15312066574202496, "grad_norm": 0.4740298390388489, "learning_rate": 1.276005547850208e-07, "loss": 1.0372, "step": 2760 }, { "epoch": 0.15423023578363385, "grad_norm": 0.25857725739479065, "learning_rate": 1.2852519648636154e-07, "loss": 1.1155, "step": 2780 }, { "epoch": 0.1553398058252427, "grad_norm": 0.2147607058286667, "learning_rate": 1.2944983818770224e-07, "loss": 1.0727, "step": 2800 }, { "epoch": 0.1564493758668516, "grad_norm": 0.3122655153274536, "learning_rate": 1.30374479889043e-07, "loss": 1.0618, "step": 2820 }, { "epoch": 0.15755894590846048, "grad_norm": 0.3170627951622009, "learning_rate": 1.3129912159038372e-07, "loss": 1.1382, "step": 2840 }, { "epoch": 0.15866851595006934, "grad_norm": 0.30361172556877136, "learning_rate": 1.3222376329172444e-07, "loss": 1.1187, "step": 2860 }, { "epoch": 0.15977808599167823, "grad_norm": 0.4461187422275543, "learning_rate": 1.3314840499306516e-07, "loss": 1.0829, "step": 2880 }, { "epoch": 0.1608876560332871, "grad_norm": 0.18614862859249115, "learning_rate": 1.3407304669440592e-07, "loss": 1.0718, "step": 2900 }, { "epoch": 0.16199722607489597, "grad_norm": 0.4238240420818329, "learning_rate": 1.3499768839574664e-07, "loss": 1.0038, "step": 2920 }, { "epoch": 0.16310679611650486, "grad_norm": 0.27849137783050537, "learning_rate": 1.3592233009708736e-07, "loss": 1.0431, "step": 2940 }, { "epoch": 0.16421636615811372, "grad_norm": 0.19552382826805115, "learning_rate": 1.3684697179842811e-07, "loss": 1.0423, "step": 2960 }, { "epoch": 0.1653259361997226, "grad_norm": 0.3785472810268402, "learning_rate": 1.3777161349976884e-07, "loss": 1.0663, "step": 2980 }, { "epoch": 0.1664355062413315, "grad_norm": 0.3098989725112915, "learning_rate": 1.3869625520110956e-07, "loss": 1.0992, "step": 3000 }, { "epoch": 0.16754507628294035, "grad_norm": 0.34039440751075745, "learning_rate": 1.3962089690245029e-07, "loss": 1.1163, "step": 3020 }, { "epoch": 0.16865464632454924, "grad_norm": 0.46390464901924133, "learning_rate": 1.4054553860379104e-07, "loss": 1.0407, "step": 3040 }, { "epoch": 0.16976421636615813, "grad_norm": 0.2549150586128235, "learning_rate": 1.4147018030513176e-07, "loss": 1.0808, "step": 3060 }, { "epoch": 0.170873786407767, "grad_norm": 0.3573547601699829, "learning_rate": 1.4239482200647248e-07, "loss": 1.0538, "step": 3080 }, { "epoch": 0.17198335644937587, "grad_norm": 0.27099859714508057, "learning_rate": 1.4331946370781324e-07, "loss": 1.1846, "step": 3100 }, { "epoch": 0.17309292649098473, "grad_norm": 0.267699658870697, "learning_rate": 1.4424410540915393e-07, "loss": 1.0546, "step": 3120 }, { "epoch": 0.17420249653259362, "grad_norm": 0.39820596575737, "learning_rate": 1.4516874711049468e-07, "loss": 1.0369, "step": 3140 }, { "epoch": 0.1753120665742025, "grad_norm": 0.4000251889228821, "learning_rate": 1.460933888118354e-07, "loss": 0.9517, "step": 3160 }, { "epoch": 0.17642163661581137, "grad_norm": 0.38491562008857727, "learning_rate": 1.4701803051317613e-07, "loss": 1.1196, "step": 3180 }, { "epoch": 0.17753120665742025, "grad_norm": 0.22453276813030243, "learning_rate": 1.4794267221451686e-07, "loss": 1.0868, "step": 3200 }, { "epoch": 0.1786407766990291, "grad_norm": 0.39847996830940247, "learning_rate": 1.488673139158576e-07, "loss": 1.0271, "step": 3220 }, { "epoch": 0.179750346740638, "grad_norm": 0.4303145110607147, "learning_rate": 1.4979195561719833e-07, "loss": 1.1178, "step": 3240 }, { "epoch": 0.1808599167822469, "grad_norm": 0.19996902346611023, "learning_rate": 1.5071659731853905e-07, "loss": 0.9999, "step": 3260 }, { "epoch": 0.18196948682385575, "grad_norm": 0.37863689661026, "learning_rate": 1.516412390198798e-07, "loss": 1.0327, "step": 3280 }, { "epoch": 0.18307905686546463, "grad_norm": 0.4372310936450958, "learning_rate": 1.525658807212205e-07, "loss": 1.078, "step": 3300 }, { "epoch": 0.18418862690707352, "grad_norm": 0.3684535026550293, "learning_rate": 1.5349052242256125e-07, "loss": 1.0549, "step": 3320 }, { "epoch": 0.18529819694868238, "grad_norm": 0.3560003340244293, "learning_rate": 1.5441516412390198e-07, "loss": 1.0249, "step": 3340 }, { "epoch": 0.18640776699029127, "grad_norm": 0.22536809742450714, "learning_rate": 1.553398058252427e-07, "loss": 1.0848, "step": 3360 }, { "epoch": 0.18751733703190013, "grad_norm": 0.28466135263442993, "learning_rate": 1.5626444752658343e-07, "loss": 1.0897, "step": 3380 }, { "epoch": 0.18862690707350901, "grad_norm": 0.2622385323047638, "learning_rate": 1.5718908922792418e-07, "loss": 1.0799, "step": 3400 }, { "epoch": 0.1897364771151179, "grad_norm": 0.2542628347873688, "learning_rate": 1.5811373092926493e-07, "loss": 0.9971, "step": 3420 }, { "epoch": 0.19084604715672676, "grad_norm": 0.3928048610687256, "learning_rate": 1.5903837263060562e-07, "loss": 1.1834, "step": 3440 }, { "epoch": 0.19195561719833565, "grad_norm": 0.2728850841522217, "learning_rate": 1.5996301433194637e-07, "loss": 1.0756, "step": 3460 }, { "epoch": 0.1930651872399445, "grad_norm": 0.35969096422195435, "learning_rate": 1.608876560332871e-07, "loss": 1.0397, "step": 3480 }, { "epoch": 0.1941747572815534, "grad_norm": 0.2783578336238861, "learning_rate": 1.6181229773462782e-07, "loss": 1.0745, "step": 3500 }, { "epoch": 0.19528432732316228, "grad_norm": 0.3080522418022156, "learning_rate": 1.6273693943596855e-07, "loss": 1.0008, "step": 3520 }, { "epoch": 0.19639389736477114, "grad_norm": 0.3989417254924774, "learning_rate": 1.636615811373093e-07, "loss": 1.0358, "step": 3540 }, { "epoch": 0.19750346740638003, "grad_norm": 0.3332618474960327, "learning_rate": 1.6458622283865e-07, "loss": 1.0504, "step": 3560 }, { "epoch": 0.19861303744798892, "grad_norm": 0.35313040018081665, "learning_rate": 1.6551086453999075e-07, "loss": 1.0025, "step": 3580 }, { "epoch": 0.19972260748959778, "grad_norm": 0.43535152077674866, "learning_rate": 1.664355062413315e-07, "loss": 0.9747, "step": 3600 }, { "epoch": 0.20083217753120666, "grad_norm": 0.4195978045463562, "learning_rate": 1.673601479426722e-07, "loss": 1.0189, "step": 3620 }, { "epoch": 0.20194174757281552, "grad_norm": 0.49405789375305176, "learning_rate": 1.6828478964401294e-07, "loss": 1.0559, "step": 3640 }, { "epoch": 0.2030513176144244, "grad_norm": 0.266400009393692, "learning_rate": 1.6920943134535367e-07, "loss": 1.0002, "step": 3660 }, { "epoch": 0.2041608876560333, "grad_norm": 0.3552699089050293, "learning_rate": 1.701340730466944e-07, "loss": 0.9755, "step": 3680 }, { "epoch": 0.20527045769764216, "grad_norm": 0.3917396664619446, "learning_rate": 1.7105871474803512e-07, "loss": 1.0176, "step": 3700 }, { "epoch": 0.20638002773925104, "grad_norm": 0.2186136543750763, "learning_rate": 1.7198335644937587e-07, "loss": 1.05, "step": 3720 }, { "epoch": 0.20748959778085993, "grad_norm": 0.2564769387245178, "learning_rate": 1.729079981507166e-07, "loss": 0.9835, "step": 3740 }, { "epoch": 0.2085991678224688, "grad_norm": 0.3103283643722534, "learning_rate": 1.7383263985205731e-07, "loss": 0.9864, "step": 3760 }, { "epoch": 0.20970873786407768, "grad_norm": 0.3148307800292969, "learning_rate": 1.7475728155339807e-07, "loss": 0.9537, "step": 3780 }, { "epoch": 0.21081830790568654, "grad_norm": 0.4601244032382965, "learning_rate": 1.7568192325473876e-07, "loss": 1.0277, "step": 3800 }, { "epoch": 0.21192787794729542, "grad_norm": 0.29893559217453003, "learning_rate": 1.7660656495607951e-07, "loss": 1.0037, "step": 3820 }, { "epoch": 0.2130374479889043, "grad_norm": 0.29135629534721375, "learning_rate": 1.7753120665742024e-07, "loss": 0.9838, "step": 3840 }, { "epoch": 0.21414701803051317, "grad_norm": 0.2114720493555069, "learning_rate": 1.78455848358761e-07, "loss": 1.0458, "step": 3860 }, { "epoch": 0.21525658807212206, "grad_norm": 0.4669742286205292, "learning_rate": 1.7938049006010169e-07, "loss": 0.9457, "step": 3880 }, { "epoch": 0.21636615811373092, "grad_norm": 0.3802589774131775, "learning_rate": 1.8030513176144244e-07, "loss": 0.9013, "step": 3900 }, { "epoch": 0.2174757281553398, "grad_norm": 0.28152763843536377, "learning_rate": 1.8122977346278319e-07, "loss": 0.9474, "step": 3920 }, { "epoch": 0.2185852981969487, "grad_norm": 0.26750996708869934, "learning_rate": 1.8215441516412388e-07, "loss": 0.9364, "step": 3940 }, { "epoch": 0.21969486823855755, "grad_norm": 0.23337991535663605, "learning_rate": 1.8307905686546463e-07, "loss": 1.0202, "step": 3960 }, { "epoch": 0.22080443828016644, "grad_norm": 0.35977551341056824, "learning_rate": 1.8400369856680536e-07, "loss": 0.9818, "step": 3980 }, { "epoch": 0.22191400832177532, "grad_norm": 0.4401928186416626, "learning_rate": 1.8492834026814608e-07, "loss": 0.937, "step": 4000 }, { "epoch": 0.22302357836338418, "grad_norm": 0.3906930983066559, "learning_rate": 1.858529819694868e-07, "loss": 0.9727, "step": 4020 }, { "epoch": 0.22413314840499307, "grad_norm": 0.23399241268634796, "learning_rate": 1.8677762367082756e-07, "loss": 0.9365, "step": 4040 }, { "epoch": 0.22524271844660193, "grad_norm": 0.37429577112197876, "learning_rate": 1.8770226537216826e-07, "loss": 0.9626, "step": 4060 }, { "epoch": 0.22635228848821082, "grad_norm": 0.30430614948272705, "learning_rate": 1.88626907073509e-07, "loss": 0.9892, "step": 4080 }, { "epoch": 0.2274618585298197, "grad_norm": 0.4138410687446594, "learning_rate": 1.8955154877484976e-07, "loss": 0.9339, "step": 4100 }, { "epoch": 0.22857142857142856, "grad_norm": 0.3020586371421814, "learning_rate": 1.9047619047619045e-07, "loss": 0.8529, "step": 4120 }, { "epoch": 0.22968099861303745, "grad_norm": 0.4777414798736572, "learning_rate": 1.914008321775312e-07, "loss": 0.8509, "step": 4140 }, { "epoch": 0.2307905686546463, "grad_norm": 0.27581244707107544, "learning_rate": 1.9232547387887193e-07, "loss": 0.955, "step": 4160 }, { "epoch": 0.2319001386962552, "grad_norm": 0.23865193128585815, "learning_rate": 1.9325011558021265e-07, "loss": 0.9277, "step": 4180 }, { "epoch": 0.23300970873786409, "grad_norm": 0.2712233364582062, "learning_rate": 1.9417475728155338e-07, "loss": 0.9568, "step": 4200 }, { "epoch": 0.23411927877947294, "grad_norm": 0.25536400079727173, "learning_rate": 1.9509939898289413e-07, "loss": 0.916, "step": 4220 }, { "epoch": 0.23522884882108183, "grad_norm": 0.3087388873100281, "learning_rate": 1.9602404068423482e-07, "loss": 0.9352, "step": 4240 }, { "epoch": 0.23633841886269072, "grad_norm": 0.28416091203689575, "learning_rate": 1.9694868238557558e-07, "loss": 0.8952, "step": 4260 }, { "epoch": 0.23744798890429958, "grad_norm": 0.24602976441383362, "learning_rate": 1.9787332408691633e-07, "loss": 0.8668, "step": 4280 }, { "epoch": 0.23855755894590847, "grad_norm": 0.29788491129875183, "learning_rate": 1.9879796578825705e-07, "loss": 0.9627, "step": 4300 }, { "epoch": 0.23966712898751732, "grad_norm": 0.5096386671066284, "learning_rate": 1.9972260748959777e-07, "loss": 0.9935, "step": 4320 }, { "epoch": 0.2407766990291262, "grad_norm": 0.313719779253006, "learning_rate": 2.006472491909385e-07, "loss": 0.9366, "step": 4340 }, { "epoch": 0.2418862690707351, "grad_norm": 0.32236772775650024, "learning_rate": 2.0157189089227925e-07, "loss": 0.88, "step": 4360 }, { "epoch": 0.24299583911234396, "grad_norm": 0.4075660705566406, "learning_rate": 2.0249653259361995e-07, "loss": 0.8998, "step": 4380 }, { "epoch": 0.24410540915395285, "grad_norm": 0.33630868792533875, "learning_rate": 2.034211742949607e-07, "loss": 0.964, "step": 4400 }, { "epoch": 0.24521497919556173, "grad_norm": 0.283672571182251, "learning_rate": 2.0434581599630145e-07, "loss": 0.92, "step": 4420 }, { "epoch": 0.2463245492371706, "grad_norm": 0.5094546675682068, "learning_rate": 2.0527045769764214e-07, "loss": 0.934, "step": 4440 }, { "epoch": 0.24743411927877948, "grad_norm": 0.41356781125068665, "learning_rate": 2.061950993989829e-07, "loss": 0.8351, "step": 4460 }, { "epoch": 0.24854368932038834, "grad_norm": 0.4449295103549957, "learning_rate": 2.0711974110032362e-07, "loss": 0.902, "step": 4480 }, { "epoch": 0.24965325936199723, "grad_norm": 0.28505608439445496, "learning_rate": 2.0804438280166434e-07, "loss": 0.9378, "step": 4500 }, { "epoch": 0.2507628294036061, "grad_norm": 0.31138724088668823, "learning_rate": 2.0896902450300507e-07, "loss": 0.9258, "step": 4520 }, { "epoch": 0.251872399445215, "grad_norm": 0.4205388128757477, "learning_rate": 2.0989366620434582e-07, "loss": 0.8769, "step": 4540 }, { "epoch": 0.25298196948682383, "grad_norm": 0.38944011926651, "learning_rate": 2.1081830790568652e-07, "loss": 0.8419, "step": 4560 }, { "epoch": 0.2540915395284327, "grad_norm": 0.3001191020011902, "learning_rate": 2.1174294960702727e-07, "loss": 0.8571, "step": 4580 }, { "epoch": 0.2552011095700416, "grad_norm": 0.2667725086212158, "learning_rate": 2.1266759130836802e-07, "loss": 0.8594, "step": 4600 }, { "epoch": 0.2563106796116505, "grad_norm": 0.3324829936027527, "learning_rate": 2.1359223300970871e-07, "loss": 0.8438, "step": 4620 }, { "epoch": 0.2574202496532594, "grad_norm": 0.3353516459465027, "learning_rate": 2.1451687471104946e-07, "loss": 0.8239, "step": 4640 }, { "epoch": 0.2585298196948682, "grad_norm": 0.6700271964073181, "learning_rate": 2.154415164123902e-07, "loss": 0.842, "step": 4660 }, { "epoch": 0.2596393897364771, "grad_norm": 0.3062458336353302, "learning_rate": 2.163661581137309e-07, "loss": 0.8877, "step": 4680 }, { "epoch": 0.260748959778086, "grad_norm": 0.3098693788051605, "learning_rate": 2.1729079981507164e-07, "loss": 0.8408, "step": 4700 }, { "epoch": 0.2618585298196949, "grad_norm": 0.5123829245567322, "learning_rate": 2.182154415164124e-07, "loss": 0.8418, "step": 4720 }, { "epoch": 0.26296809986130376, "grad_norm": 0.3035115897655487, "learning_rate": 2.191400832177531e-07, "loss": 0.7362, "step": 4740 }, { "epoch": 0.26407766990291265, "grad_norm": 0.42991915345191956, "learning_rate": 2.2006472491909384e-07, "loss": 0.7718, "step": 4760 }, { "epoch": 0.2651872399445215, "grad_norm": 0.35365360975265503, "learning_rate": 2.2098936662043459e-07, "loss": 0.8369, "step": 4780 }, { "epoch": 0.26629680998613037, "grad_norm": 0.21612907946109772, "learning_rate": 2.219140083217753e-07, "loss": 0.8563, "step": 4800 }, { "epoch": 0.26740638002773925, "grad_norm": 0.2655947506427765, "learning_rate": 2.2283865002311603e-07, "loss": 0.8626, "step": 4820 }, { "epoch": 0.26851595006934814, "grad_norm": 0.41583916544914246, "learning_rate": 2.2376329172445676e-07, "loss": 0.882, "step": 4840 }, { "epoch": 0.26962552011095703, "grad_norm": 0.34641966223716736, "learning_rate": 2.246879334257975e-07, "loss": 0.8227, "step": 4860 }, { "epoch": 0.27073509015256586, "grad_norm": 0.3992447853088379, "learning_rate": 2.256125751271382e-07, "loss": 0.8154, "step": 4880 }, { "epoch": 0.27184466019417475, "grad_norm": 0.3981979191303253, "learning_rate": 2.2653721682847896e-07, "loss": 0.7943, "step": 4900 }, { "epoch": 0.27295423023578363, "grad_norm": 0.4129197299480438, "learning_rate": 2.274618585298197e-07, "loss": 0.7888, "step": 4920 }, { "epoch": 0.2740638002773925, "grad_norm": 0.8618291616439819, "learning_rate": 2.283865002311604e-07, "loss": 0.7547, "step": 4940 }, { "epoch": 0.2751733703190014, "grad_norm": 0.7133357524871826, "learning_rate": 2.2931114193250116e-07, "loss": 0.7943, "step": 4960 }, { "epoch": 0.27628294036061024, "grad_norm": 0.3470710515975952, "learning_rate": 2.3023578363384188e-07, "loss": 0.7815, "step": 4980 }, { "epoch": 0.27739251040221913, "grad_norm": 0.364749938249588, "learning_rate": 2.311604253351826e-07, "loss": 0.7657, "step": 5000 }, { "epoch": 0.278502080443828, "grad_norm": 0.9267297983169556, "learning_rate": 2.3208506703652333e-07, "loss": 0.7172, "step": 5020 }, { "epoch": 0.2796116504854369, "grad_norm": 0.3076070547103882, "learning_rate": 2.3300970873786408e-07, "loss": 0.8122, "step": 5040 }, { "epoch": 0.2807212205270458, "grad_norm": 0.5886161923408508, "learning_rate": 2.3393435043920478e-07, "loss": 0.7661, "step": 5060 }, { "epoch": 0.2818307905686546, "grad_norm": 0.4739508628845215, "learning_rate": 2.3485899214054553e-07, "loss": 0.771, "step": 5080 }, { "epoch": 0.2829403606102635, "grad_norm": 0.2893088161945343, "learning_rate": 2.3578363384188628e-07, "loss": 0.7285, "step": 5100 }, { "epoch": 0.2840499306518724, "grad_norm": 0.7013764381408691, "learning_rate": 2.3670827554322697e-07, "loss": 0.7849, "step": 5120 }, { "epoch": 0.2851595006934813, "grad_norm": 0.5786705613136292, "learning_rate": 2.3763291724456772e-07, "loss": 0.6774, "step": 5140 }, { "epoch": 0.28626907073509017, "grad_norm": 0.4135472774505615, "learning_rate": 2.3855755894590845e-07, "loss": 0.7398, "step": 5160 }, { "epoch": 0.287378640776699, "grad_norm": 0.8986484408378601, "learning_rate": 2.3948220064724917e-07, "loss": 0.8406, "step": 5180 }, { "epoch": 0.2884882108183079, "grad_norm": 0.445899099111557, "learning_rate": 2.404068423485899e-07, "loss": 0.7581, "step": 5200 }, { "epoch": 0.2895977808599168, "grad_norm": 0.40542036294937134, "learning_rate": 2.413314840499306e-07, "loss": 0.7268, "step": 5220 }, { "epoch": 0.29070735090152566, "grad_norm": 0.535135805606842, "learning_rate": 2.422561257512714e-07, "loss": 0.7137, "step": 5240 }, { "epoch": 0.29181692094313455, "grad_norm": 0.3063076436519623, "learning_rate": 2.4318076745261207e-07, "loss": 0.7303, "step": 5260 }, { "epoch": 0.29292649098474344, "grad_norm": 0.5731391310691833, "learning_rate": 2.4410540915395285e-07, "loss": 0.7095, "step": 5280 }, { "epoch": 0.29403606102635227, "grad_norm": 0.4280565679073334, "learning_rate": 2.4503005085529357e-07, "loss": 0.7378, "step": 5300 }, { "epoch": 0.29514563106796116, "grad_norm": 0.5090285539627075, "learning_rate": 2.459546925566343e-07, "loss": 0.7125, "step": 5320 }, { "epoch": 0.29625520110957004, "grad_norm": 0.4406924843788147, "learning_rate": 2.46879334257975e-07, "loss": 0.7061, "step": 5340 }, { "epoch": 0.29736477115117893, "grad_norm": 0.40675410628318787, "learning_rate": 2.4780397595931574e-07, "loss": 0.6986, "step": 5360 }, { "epoch": 0.2984743411927878, "grad_norm": 0.517400324344635, "learning_rate": 2.4872861766065647e-07, "loss": 0.6082, "step": 5380 }, { "epoch": 0.29958391123439665, "grad_norm": 0.6128582954406738, "learning_rate": 2.496532593619972e-07, "loss": 0.6309, "step": 5400 }, { "epoch": 0.30069348127600554, "grad_norm": 0.6010112762451172, "learning_rate": 2.505779010633379e-07, "loss": 0.6738, "step": 5420 }, { "epoch": 0.3018030513176144, "grad_norm": 0.2821389138698578, "learning_rate": 2.515025427646787e-07, "loss": 0.6867, "step": 5440 }, { "epoch": 0.3029126213592233, "grad_norm": 0.2796347737312317, "learning_rate": 2.524271844660194e-07, "loss": 0.6871, "step": 5460 }, { "epoch": 0.3040221914008322, "grad_norm": 0.27248141169548035, "learning_rate": 2.5335182616736014e-07, "loss": 0.651, "step": 5480 }, { "epoch": 0.30513176144244103, "grad_norm": 0.5140066146850586, "learning_rate": 2.5427646786870086e-07, "loss": 0.6305, "step": 5500 }, { "epoch": 0.3062413314840499, "grad_norm": 0.5648813843727112, "learning_rate": 2.552011095700416e-07, "loss": 0.5943, "step": 5520 }, { "epoch": 0.3073509015256588, "grad_norm": 0.25366437435150146, "learning_rate": 2.561257512713823e-07, "loss": 0.6986, "step": 5540 }, { "epoch": 0.3084604715672677, "grad_norm": 0.24973690509796143, "learning_rate": 2.570503929727231e-07, "loss": 0.6144, "step": 5560 }, { "epoch": 0.3095700416088766, "grad_norm": 0.5073511004447937, "learning_rate": 2.5797503467406376e-07, "loss": 0.617, "step": 5580 }, { "epoch": 0.3106796116504854, "grad_norm": 0.3252747356891632, "learning_rate": 2.588996763754045e-07, "loss": 0.6301, "step": 5600 }, { "epoch": 0.3117891816920943, "grad_norm": 0.3252374827861786, "learning_rate": 2.5982431807674526e-07, "loss": 0.6752, "step": 5620 }, { "epoch": 0.3128987517337032, "grad_norm": 0.34135308861732483, "learning_rate": 2.60748959778086e-07, "loss": 0.7187, "step": 5640 }, { "epoch": 0.31400832177531207, "grad_norm": 0.38045448064804077, "learning_rate": 2.616736014794267e-07, "loss": 0.6102, "step": 5660 }, { "epoch": 0.31511789181692096, "grad_norm": 0.2146141082048416, "learning_rate": 2.6259824318076743e-07, "loss": 0.7325, "step": 5680 }, { "epoch": 0.31622746185852985, "grad_norm": 0.29172056913375854, "learning_rate": 2.6352288488210816e-07, "loss": 0.6375, "step": 5700 }, { "epoch": 0.3173370319001387, "grad_norm": 0.3667593002319336, "learning_rate": 2.644475265834489e-07, "loss": 0.638, "step": 5720 }, { "epoch": 0.31844660194174756, "grad_norm": 0.5305479168891907, "learning_rate": 2.6537216828478966e-07, "loss": 0.6497, "step": 5740 }, { "epoch": 0.31955617198335645, "grad_norm": 0.30270981788635254, "learning_rate": 2.6629680998613033e-07, "loss": 0.6235, "step": 5760 }, { "epoch": 0.32066574202496534, "grad_norm": 0.37413808703422546, "learning_rate": 2.6722145168747105e-07, "loss": 0.5404, "step": 5780 }, { "epoch": 0.3217753120665742, "grad_norm": 0.26315727829933167, "learning_rate": 2.6814609338881183e-07, "loss": 0.6259, "step": 5800 }, { "epoch": 0.32288488210818306, "grad_norm": 0.3610723614692688, "learning_rate": 2.6907073509015255e-07, "loss": 0.5868, "step": 5820 }, { "epoch": 0.32399445214979194, "grad_norm": 0.3144702911376953, "learning_rate": 2.699953767914933e-07, "loss": 0.6099, "step": 5840 }, { "epoch": 0.32510402219140083, "grad_norm": 0.21689702570438385, "learning_rate": 2.70920018492834e-07, "loss": 0.577, "step": 5860 }, { "epoch": 0.3262135922330097, "grad_norm": 0.2816571891307831, "learning_rate": 2.7184466019417473e-07, "loss": 0.5438, "step": 5880 }, { "epoch": 0.3273231622746186, "grad_norm": 0.3756202757358551, "learning_rate": 2.727693018955155e-07, "loss": 0.6079, "step": 5900 }, { "epoch": 0.32843273231622744, "grad_norm": 0.23747026920318604, "learning_rate": 2.7369394359685623e-07, "loss": 0.6084, "step": 5920 }, { "epoch": 0.3295423023578363, "grad_norm": 0.557923436164856, "learning_rate": 2.746185852981969e-07, "loss": 0.5785, "step": 5940 }, { "epoch": 0.3306518723994452, "grad_norm": 0.36645594239234924, "learning_rate": 2.755432269995377e-07, "loss": 0.6088, "step": 5960 }, { "epoch": 0.3317614424410541, "grad_norm": 0.24802128970623016, "learning_rate": 2.764678687008784e-07, "loss": 0.5693, "step": 5980 }, { "epoch": 0.332871012482663, "grad_norm": 0.2049698382616043, "learning_rate": 2.773925104022191e-07, "loss": 0.5963, "step": 6000 }, { "epoch": 0.3339805825242718, "grad_norm": 0.2446167916059494, "learning_rate": 2.783171521035599e-07, "loss": 0.4995, "step": 6020 }, { "epoch": 0.3350901525658807, "grad_norm": 0.5112882852554321, "learning_rate": 2.7924179380490057e-07, "loss": 0.6231, "step": 6040 }, { "epoch": 0.3361997226074896, "grad_norm": 0.6222682595252991, "learning_rate": 2.801664355062413e-07, "loss": 0.5756, "step": 6060 }, { "epoch": 0.3373092926490985, "grad_norm": 0.3166469931602478, "learning_rate": 2.8109107720758207e-07, "loss": 0.654, "step": 6080 }, { "epoch": 0.33841886269070737, "grad_norm": 0.4047017991542816, "learning_rate": 2.820157189089228e-07, "loss": 0.4953, "step": 6100 }, { "epoch": 0.33952843273231625, "grad_norm": 0.2853200137615204, "learning_rate": 2.829403606102635e-07, "loss": 0.5572, "step": 6120 }, { "epoch": 0.3406380027739251, "grad_norm": 0.7440733909606934, "learning_rate": 2.8386500231160425e-07, "loss": 0.6331, "step": 6140 }, { "epoch": 0.341747572815534, "grad_norm": 0.34607845544815063, "learning_rate": 2.8478964401294497e-07, "loss": 0.5891, "step": 6160 }, { "epoch": 0.34285714285714286, "grad_norm": 0.22683803737163544, "learning_rate": 2.857142857142857e-07, "loss": 0.5694, "step": 6180 }, { "epoch": 0.34396671289875175, "grad_norm": 0.43753308057785034, "learning_rate": 2.8663892741562647e-07, "loss": 0.5114, "step": 6200 }, { "epoch": 0.34507628294036063, "grad_norm": 0.23977980017662048, "learning_rate": 2.8756356911696714e-07, "loss": 0.6944, "step": 6220 }, { "epoch": 0.34618585298196947, "grad_norm": 0.26347365975379944, "learning_rate": 2.8848821081830787e-07, "loss": 0.5931, "step": 6240 }, { "epoch": 0.34729542302357835, "grad_norm": 0.3672482371330261, "learning_rate": 2.8941285251964864e-07, "loss": 0.5772, "step": 6260 }, { "epoch": 0.34840499306518724, "grad_norm": 0.42619141936302185, "learning_rate": 2.9033749422098937e-07, "loss": 0.5409, "step": 6280 }, { "epoch": 0.34951456310679613, "grad_norm": 0.2415861189365387, "learning_rate": 2.912621359223301e-07, "loss": 0.6142, "step": 6300 }, { "epoch": 0.350624133148405, "grad_norm": 0.20353947579860687, "learning_rate": 2.921867776236708e-07, "loss": 0.5562, "step": 6320 }, { "epoch": 0.35173370319001385, "grad_norm": 0.2843568027019501, "learning_rate": 2.9311141932501154e-07, "loss": 0.5702, "step": 6340 }, { "epoch": 0.35284327323162273, "grad_norm": 0.3476616144180298, "learning_rate": 2.9403606102635226e-07, "loss": 0.5592, "step": 6360 }, { "epoch": 0.3539528432732316, "grad_norm": 0.2824789583683014, "learning_rate": 2.9496070272769304e-07, "loss": 0.584, "step": 6380 }, { "epoch": 0.3550624133148405, "grad_norm": 0.36539188027381897, "learning_rate": 2.958853444290337e-07, "loss": 0.5319, "step": 6400 }, { "epoch": 0.3561719833564494, "grad_norm": 0.2157587707042694, "learning_rate": 2.9680998613037444e-07, "loss": 0.664, "step": 6420 }, { "epoch": 0.3572815533980582, "grad_norm": 0.2750494182109833, "learning_rate": 2.977346278317152e-07, "loss": 0.6424, "step": 6440 }, { "epoch": 0.3583911234396671, "grad_norm": 0.38685059547424316, "learning_rate": 2.9865926953305594e-07, "loss": 0.4633, "step": 6460 }, { "epoch": 0.359500693481276, "grad_norm": 0.31798675656318665, "learning_rate": 2.9958391123439666e-07, "loss": 0.4868, "step": 6480 }, { "epoch": 0.3606102635228849, "grad_norm": 0.335330069065094, "learning_rate": 3.005085529357374e-07, "loss": 0.5568, "step": 6500 }, { "epoch": 0.3617198335644938, "grad_norm": 0.19244511425495148, "learning_rate": 3.014331946370781e-07, "loss": 0.5513, "step": 6520 }, { "epoch": 0.36282940360610266, "grad_norm": 0.29192841053009033, "learning_rate": 3.0235783633841883e-07, "loss": 0.6241, "step": 6540 }, { "epoch": 0.3639389736477115, "grad_norm": 0.2889971137046814, "learning_rate": 3.032824780397596e-07, "loss": 0.634, "step": 6560 }, { "epoch": 0.3650485436893204, "grad_norm": 0.2602294981479645, "learning_rate": 3.042071197411003e-07, "loss": 0.5739, "step": 6580 }, { "epoch": 0.36615811373092927, "grad_norm": 0.21736390888690948, "learning_rate": 3.05131761442441e-07, "loss": 0.5948, "step": 6600 }, { "epoch": 0.36726768377253816, "grad_norm": 0.35423943400382996, "learning_rate": 3.060564031437818e-07, "loss": 0.582, "step": 6620 }, { "epoch": 0.36837725381414704, "grad_norm": 0.22004581987857819, "learning_rate": 3.069810448451225e-07, "loss": 0.5741, "step": 6640 }, { "epoch": 0.3694868238557559, "grad_norm": 0.18029527366161346, "learning_rate": 3.0790568654646323e-07, "loss": 0.5314, "step": 6660 }, { "epoch": 0.37059639389736476, "grad_norm": 0.318406343460083, "learning_rate": 3.0883032824780395e-07, "loss": 0.5543, "step": 6680 }, { "epoch": 0.37170596393897365, "grad_norm": 0.21545974910259247, "learning_rate": 3.097549699491447e-07, "loss": 0.588, "step": 6700 }, { "epoch": 0.37281553398058254, "grad_norm": 0.37375134229660034, "learning_rate": 3.106796116504854e-07, "loss": 0.5656, "step": 6720 }, { "epoch": 0.3739251040221914, "grad_norm": 0.19006316363811493, "learning_rate": 3.116042533518262e-07, "loss": 0.5082, "step": 6740 }, { "epoch": 0.37503467406380026, "grad_norm": 0.22540980577468872, "learning_rate": 3.1252889505316685e-07, "loss": 0.6843, "step": 6760 }, { "epoch": 0.37614424410540914, "grad_norm": 0.6511667370796204, "learning_rate": 3.1345353675450763e-07, "loss": 0.4452, "step": 6780 }, { "epoch": 0.37725381414701803, "grad_norm": 0.40717339515686035, "learning_rate": 3.1437817845584835e-07, "loss": 0.5769, "step": 6800 }, { "epoch": 0.3783633841886269, "grad_norm": 0.2875766456127167, "learning_rate": 3.153028201571891e-07, "loss": 0.637, "step": 6820 }, { "epoch": 0.3794729542302358, "grad_norm": 0.3348805904388428, "learning_rate": 3.1622746185852985e-07, "loss": 0.5086, "step": 6840 }, { "epoch": 0.38058252427184464, "grad_norm": 0.24617819488048553, "learning_rate": 3.171521035598705e-07, "loss": 0.5657, "step": 6860 }, { "epoch": 0.3816920943134535, "grad_norm": 0.29950428009033203, "learning_rate": 3.1807674526121125e-07, "loss": 0.5385, "step": 6880 }, { "epoch": 0.3828016643550624, "grad_norm": 0.26588067412376404, "learning_rate": 3.19001386962552e-07, "loss": 0.6407, "step": 6900 }, { "epoch": 0.3839112343966713, "grad_norm": 0.31910252571105957, "learning_rate": 3.1992602866389275e-07, "loss": 0.5534, "step": 6920 }, { "epoch": 0.3850208044382802, "grad_norm": 0.32439926266670227, "learning_rate": 3.208506703652334e-07, "loss": 0.4795, "step": 6940 }, { "epoch": 0.386130374479889, "grad_norm": 0.3166181743144989, "learning_rate": 3.217753120665742e-07, "loss": 0.4811, "step": 6960 }, { "epoch": 0.3872399445214979, "grad_norm": 0.8823560476303101, "learning_rate": 3.226999537679149e-07, "loss": 0.4817, "step": 6980 }, { "epoch": 0.3883495145631068, "grad_norm": 0.24186256527900696, "learning_rate": 3.2362459546925565e-07, "loss": 0.4932, "step": 7000 }, { "epoch": 0.3894590846047157, "grad_norm": 0.21879135072231293, "learning_rate": 3.245492371705964e-07, "loss": 0.4926, "step": 7020 }, { "epoch": 0.39056865464632456, "grad_norm": 0.2966933250427246, "learning_rate": 3.254738788719371e-07, "loss": 0.5091, "step": 7040 }, { "epoch": 0.39167822468793345, "grad_norm": 0.22823984920978546, "learning_rate": 3.263985205732778e-07, "loss": 0.5552, "step": 7060 }, { "epoch": 0.3927877947295423, "grad_norm": 0.1864573359489441, "learning_rate": 3.273231622746186e-07, "loss": 0.5541, "step": 7080 }, { "epoch": 0.39389736477115117, "grad_norm": 0.19437795877456665, "learning_rate": 3.282478039759593e-07, "loss": 0.4623, "step": 7100 }, { "epoch": 0.39500693481276006, "grad_norm": 0.25148576498031616, "learning_rate": 3.291724456773e-07, "loss": 0.5465, "step": 7120 }, { "epoch": 0.39611650485436894, "grad_norm": 0.2807692587375641, "learning_rate": 3.3009708737864077e-07, "loss": 0.5433, "step": 7140 }, { "epoch": 0.39722607489597783, "grad_norm": 0.4560348391532898, "learning_rate": 3.310217290799815e-07, "loss": 0.5504, "step": 7160 }, { "epoch": 0.39833564493758666, "grad_norm": 0.2733548581600189, "learning_rate": 3.319463707813222e-07, "loss": 0.6042, "step": 7180 }, { "epoch": 0.39944521497919555, "grad_norm": 0.25350791215896606, "learning_rate": 3.32871012482663e-07, "loss": 0.5028, "step": 7200 }, { "epoch": 0.40055478502080444, "grad_norm": 0.21779541671276093, "learning_rate": 3.3379565418400366e-07, "loss": 0.5257, "step": 7220 }, { "epoch": 0.4016643550624133, "grad_norm": 0.21930049359798431, "learning_rate": 3.347202958853444e-07, "loss": 0.5564, "step": 7240 }, { "epoch": 0.4027739251040222, "grad_norm": 0.22925446927547455, "learning_rate": 3.3564493758668516e-07, "loss": 0.6265, "step": 7260 }, { "epoch": 0.40388349514563104, "grad_norm": 0.1459244042634964, "learning_rate": 3.365695792880259e-07, "loss": 0.589, "step": 7280 }, { "epoch": 0.40499306518723993, "grad_norm": 0.26222601532936096, "learning_rate": 3.3749422098936656e-07, "loss": 0.5161, "step": 7300 }, { "epoch": 0.4061026352288488, "grad_norm": 0.2893272042274475, "learning_rate": 3.3841886269070734e-07, "loss": 0.4828, "step": 7320 }, { "epoch": 0.4072122052704577, "grad_norm": 0.1881682425737381, "learning_rate": 3.3934350439204806e-07, "loss": 0.4906, "step": 7340 }, { "epoch": 0.4083217753120666, "grad_norm": 0.2624022364616394, "learning_rate": 3.402681460933888e-07, "loss": 0.5221, "step": 7360 }, { "epoch": 0.4094313453536754, "grad_norm": 0.19354553520679474, "learning_rate": 3.4119278779472956e-07, "loss": 0.5277, "step": 7380 }, { "epoch": 0.4105409153952843, "grad_norm": 0.38209661841392517, "learning_rate": 3.4211742949607023e-07, "loss": 0.5448, "step": 7400 }, { "epoch": 0.4116504854368932, "grad_norm": 0.16549670696258545, "learning_rate": 3.4304207119741096e-07, "loss": 0.4758, "step": 7420 }, { "epoch": 0.4127600554785021, "grad_norm": 0.45479658246040344, "learning_rate": 3.4396671289875173e-07, "loss": 0.5189, "step": 7440 }, { "epoch": 0.413869625520111, "grad_norm": 0.28281068801879883, "learning_rate": 3.4489135460009246e-07, "loss": 0.6025, "step": 7460 }, { "epoch": 0.41497919556171986, "grad_norm": 0.2903256416320801, "learning_rate": 3.458159963014332e-07, "loss": 0.4768, "step": 7480 }, { "epoch": 0.4160887656033287, "grad_norm": 0.16982723772525787, "learning_rate": 3.467406380027739e-07, "loss": 0.5871, "step": 7500 }, { "epoch": 0.4171983356449376, "grad_norm": 0.21312560141086578, "learning_rate": 3.4766527970411463e-07, "loss": 0.5071, "step": 7520 }, { "epoch": 0.41830790568654647, "grad_norm": 0.8553511500358582, "learning_rate": 3.4858992140545535e-07, "loss": 0.6352, "step": 7540 }, { "epoch": 0.41941747572815535, "grad_norm": 0.3417006731033325, "learning_rate": 3.4951456310679613e-07, "loss": 0.4903, "step": 7560 }, { "epoch": 0.42052704576976424, "grad_norm": 0.200980082154274, "learning_rate": 3.504392048081368e-07, "loss": 0.517, "step": 7580 }, { "epoch": 0.42163661581137307, "grad_norm": 0.26966115832328796, "learning_rate": 3.513638465094775e-07, "loss": 0.5992, "step": 7600 }, { "epoch": 0.42274618585298196, "grad_norm": 1.115612506866455, "learning_rate": 3.522884882108183e-07, "loss": 0.5035, "step": 7620 }, { "epoch": 0.42385575589459085, "grad_norm": 0.20158541202545166, "learning_rate": 3.5321312991215903e-07, "loss": 0.588, "step": 7640 }, { "epoch": 0.42496532593619973, "grad_norm": 0.21740297973155975, "learning_rate": 3.5413777161349975e-07, "loss": 0.5009, "step": 7660 }, { "epoch": 0.4260748959778086, "grad_norm": 0.31125953793525696, "learning_rate": 3.550624133148405e-07, "loss": 0.4338, "step": 7680 }, { "epoch": 0.42718446601941745, "grad_norm": 0.3379075825214386, "learning_rate": 3.559870550161812e-07, "loss": 0.5567, "step": 7700 }, { "epoch": 0.42829403606102634, "grad_norm": 0.19968168437480927, "learning_rate": 3.56911696717522e-07, "loss": 0.5802, "step": 7720 }, { "epoch": 0.4294036061026352, "grad_norm": 0.25363612174987793, "learning_rate": 3.578363384188627e-07, "loss": 0.5118, "step": 7740 }, { "epoch": 0.4305131761442441, "grad_norm": 0.24938738346099854, "learning_rate": 3.5876098012020337e-07, "loss": 0.5081, "step": 7760 }, { "epoch": 0.431622746185853, "grad_norm": 0.21712304651737213, "learning_rate": 3.5968562182154415e-07, "loss": 0.4873, "step": 7780 }, { "epoch": 0.43273231622746183, "grad_norm": 0.18558524549007416, "learning_rate": 3.6061026352288487e-07, "loss": 0.5528, "step": 7800 }, { "epoch": 0.4338418862690707, "grad_norm": 0.33640581369400024, "learning_rate": 3.615349052242256e-07, "loss": 0.5902, "step": 7820 }, { "epoch": 0.4349514563106796, "grad_norm": 0.293722540140152, "learning_rate": 3.6245954692556637e-07, "loss": 0.6092, "step": 7840 }, { "epoch": 0.4360610263522885, "grad_norm": 0.20892654359340668, "learning_rate": 3.6338418862690704e-07, "loss": 0.5196, "step": 7860 }, { "epoch": 0.4371705963938974, "grad_norm": 0.2536463439464569, "learning_rate": 3.6430883032824777e-07, "loss": 0.5111, "step": 7880 }, { "epoch": 0.43828016643550627, "grad_norm": 0.19541151821613312, "learning_rate": 3.6523347202958855e-07, "loss": 0.6389, "step": 7900 }, { "epoch": 0.4393897364771151, "grad_norm": 0.296332985162735, "learning_rate": 3.6615811373092927e-07, "loss": 0.4797, "step": 7920 }, { "epoch": 0.440499306518724, "grad_norm": 0.18503420054912567, "learning_rate": 3.6708275543226994e-07, "loss": 0.5355, "step": 7940 }, { "epoch": 0.4416088765603329, "grad_norm": 0.190393328666687, "learning_rate": 3.680073971336107e-07, "loss": 0.566, "step": 7960 }, { "epoch": 0.44271844660194176, "grad_norm": 0.3332795798778534, "learning_rate": 3.6893203883495144e-07, "loss": 0.5165, "step": 7980 }, { "epoch": 0.44382801664355065, "grad_norm": 0.23749427497386932, "learning_rate": 3.6985668053629217e-07, "loss": 0.4872, "step": 8000 }, { "epoch": 0.4449375866851595, "grad_norm": 0.23810884356498718, "learning_rate": 3.7078132223763294e-07, "loss": 0.4595, "step": 8020 }, { "epoch": 0.44604715672676837, "grad_norm": 0.21136030554771423, "learning_rate": 3.717059639389736e-07, "loss": 0.5681, "step": 8040 }, { "epoch": 0.44715672676837726, "grad_norm": 0.23159508407115936, "learning_rate": 3.7263060564031434e-07, "loss": 0.5322, "step": 8060 }, { "epoch": 0.44826629680998614, "grad_norm": 0.2050127238035202, "learning_rate": 3.735552473416551e-07, "loss": 0.4856, "step": 8080 }, { "epoch": 0.44937586685159503, "grad_norm": 0.17540262639522552, "learning_rate": 3.7447988904299584e-07, "loss": 0.4884, "step": 8100 }, { "epoch": 0.45048543689320386, "grad_norm": 0.3013472557067871, "learning_rate": 3.754045307443365e-07, "loss": 0.5283, "step": 8120 }, { "epoch": 0.45159500693481275, "grad_norm": 0.25530165433883667, "learning_rate": 3.763291724456773e-07, "loss": 0.5542, "step": 8140 }, { "epoch": 0.45270457697642164, "grad_norm": 0.18685097992420197, "learning_rate": 3.77253814147018e-07, "loss": 0.5015, "step": 8160 }, { "epoch": 0.4538141470180305, "grad_norm": 0.19330184161663055, "learning_rate": 3.7817845584835874e-07, "loss": 0.596, "step": 8180 }, { "epoch": 0.4549237170596394, "grad_norm": 0.25760722160339355, "learning_rate": 3.791030975496995e-07, "loss": 0.5555, "step": 8200 }, { "epoch": 0.45603328710124824, "grad_norm": 0.27315178513526917, "learning_rate": 3.800277392510402e-07, "loss": 0.6167, "step": 8220 }, { "epoch": 0.45714285714285713, "grad_norm": 0.22821161150932312, "learning_rate": 3.809523809523809e-07, "loss": 0.4666, "step": 8240 }, { "epoch": 0.458252427184466, "grad_norm": 0.19657063484191895, "learning_rate": 3.818770226537217e-07, "loss": 0.5827, "step": 8260 }, { "epoch": 0.4593619972260749, "grad_norm": 0.2064313292503357, "learning_rate": 3.828016643550624e-07, "loss": 0.5292, "step": 8280 }, { "epoch": 0.4604715672676838, "grad_norm": 0.19304735958576202, "learning_rate": 3.837263060564031e-07, "loss": 0.5352, "step": 8300 }, { "epoch": 0.4615811373092926, "grad_norm": 0.2460714429616928, "learning_rate": 3.8465094775774386e-07, "loss": 0.5334, "step": 8320 }, { "epoch": 0.4626907073509015, "grad_norm": 0.22690536081790924, "learning_rate": 3.855755894590846e-07, "loss": 0.5174, "step": 8340 }, { "epoch": 0.4638002773925104, "grad_norm": 0.22172993421554565, "learning_rate": 3.865002311604253e-07, "loss": 0.5454, "step": 8360 }, { "epoch": 0.4649098474341193, "grad_norm": 0.2455633282661438, "learning_rate": 3.874248728617661e-07, "loss": 0.5311, "step": 8380 }, { "epoch": 0.46601941747572817, "grad_norm": 0.19094975292682648, "learning_rate": 3.8834951456310675e-07, "loss": 0.5407, "step": 8400 }, { "epoch": 0.46712898751733706, "grad_norm": 0.2559053301811218, "learning_rate": 3.892741562644475e-07, "loss": 0.4831, "step": 8420 }, { "epoch": 0.4682385575589459, "grad_norm": 0.27790650725364685, "learning_rate": 3.9019879796578825e-07, "loss": 0.5029, "step": 8440 }, { "epoch": 0.4693481276005548, "grad_norm": 0.18584692478179932, "learning_rate": 3.91123439667129e-07, "loss": 0.4638, "step": 8460 }, { "epoch": 0.47045769764216366, "grad_norm": 0.21301785111427307, "learning_rate": 3.9204808136846965e-07, "loss": 0.5047, "step": 8480 }, { "epoch": 0.47156726768377255, "grad_norm": 0.3375946283340454, "learning_rate": 3.929727230698104e-07, "loss": 0.5263, "step": 8500 }, { "epoch": 0.47267683772538144, "grad_norm": 0.2399173080921173, "learning_rate": 3.9389736477115115e-07, "loss": 0.4831, "step": 8520 }, { "epoch": 0.47378640776699027, "grad_norm": 0.15032389760017395, "learning_rate": 3.948220064724919e-07, "loss": 0.5644, "step": 8540 }, { "epoch": 0.47489597780859916, "grad_norm": 0.22850152850151062, "learning_rate": 3.9574664817383265e-07, "loss": 0.448, "step": 8560 }, { "epoch": 0.47600554785020804, "grad_norm": 0.26218274235725403, "learning_rate": 3.966712898751733e-07, "loss": 0.5731, "step": 8580 }, { "epoch": 0.47711511789181693, "grad_norm": 0.21220628917217255, "learning_rate": 3.975959315765141e-07, "loss": 0.6342, "step": 8600 }, { "epoch": 0.4782246879334258, "grad_norm": 0.17864075303077698, "learning_rate": 3.985205732778548e-07, "loss": 0.4639, "step": 8620 }, { "epoch": 0.47933425797503465, "grad_norm": 0.2803802490234375, "learning_rate": 3.9944521497919555e-07, "loss": 0.6018, "step": 8640 }, { "epoch": 0.48044382801664354, "grad_norm": 0.2241954654455185, "learning_rate": 4.003698566805363e-07, "loss": 0.5111, "step": 8660 }, { "epoch": 0.4815533980582524, "grad_norm": 0.25983789563179016, "learning_rate": 4.01294498381877e-07, "loss": 0.4778, "step": 8680 }, { "epoch": 0.4826629680998613, "grad_norm": 0.48745396733283997, "learning_rate": 4.022191400832177e-07, "loss": 0.5965, "step": 8700 }, { "epoch": 0.4837725381414702, "grad_norm": 0.23990671336650848, "learning_rate": 4.031437817845585e-07, "loss": 0.5125, "step": 8720 }, { "epoch": 0.48488210818307903, "grad_norm": 0.21298658847808838, "learning_rate": 4.040684234858992e-07, "loss": 0.5075, "step": 8740 }, { "epoch": 0.4859916782246879, "grad_norm": 0.24653124809265137, "learning_rate": 4.049930651872399e-07, "loss": 0.5749, "step": 8760 }, { "epoch": 0.4871012482662968, "grad_norm": 0.3421865403652191, "learning_rate": 4.0591770688858067e-07, "loss": 0.5388, "step": 8780 }, { "epoch": 0.4882108183079057, "grad_norm": 0.2061384618282318, "learning_rate": 4.068423485899214e-07, "loss": 0.5221, "step": 8800 }, { "epoch": 0.4893203883495146, "grad_norm": 0.16795580089092255, "learning_rate": 4.077669902912621e-07, "loss": 0.5583, "step": 8820 }, { "epoch": 0.49042995839112347, "grad_norm": 0.28502967953681946, "learning_rate": 4.086916319926029e-07, "loss": 0.4824, "step": 8840 }, { "epoch": 0.4915395284327323, "grad_norm": 0.20258374512195587, "learning_rate": 4.0961627369394357e-07, "loss": 0.495, "step": 8860 }, { "epoch": 0.4926490984743412, "grad_norm": 0.6956618428230286, "learning_rate": 4.105409153952843e-07, "loss": 0.4872, "step": 8880 }, { "epoch": 0.49375866851595007, "grad_norm": 0.3878124952316284, "learning_rate": 4.1146555709662507e-07, "loss": 0.5019, "step": 8900 }, { "epoch": 0.49486823855755896, "grad_norm": 0.24756085872650146, "learning_rate": 4.123901987979658e-07, "loss": 0.5817, "step": 8920 }, { "epoch": 0.49597780859916785, "grad_norm": 0.27632543444633484, "learning_rate": 4.1331484049930646e-07, "loss": 0.5946, "step": 8940 }, { "epoch": 0.4970873786407767, "grad_norm": 0.25847044587135315, "learning_rate": 4.1423948220064724e-07, "loss": 0.5606, "step": 8960 }, { "epoch": 0.49819694868238557, "grad_norm": 0.27678734064102173, "learning_rate": 4.1516412390198796e-07, "loss": 0.5293, "step": 8980 }, { "epoch": 0.49930651872399445, "grad_norm": 0.25392892956733704, "learning_rate": 4.160887656033287e-07, "loss": 0.6604, "step": 9000 }, { "epoch": 0.5004160887656033, "grad_norm": 0.20299489796161652, "learning_rate": 4.1701340730466946e-07, "loss": 0.4616, "step": 9020 }, { "epoch": 0.5015256588072122, "grad_norm": 0.2838829457759857, "learning_rate": 4.1793804900601013e-07, "loss": 0.5392, "step": 9040 }, { "epoch": 0.5026352288488211, "grad_norm": 0.30037567019462585, "learning_rate": 4.1886269070735086e-07, "loss": 0.3846, "step": 9060 }, { "epoch": 0.50374479889043, "grad_norm": 0.25013217329978943, "learning_rate": 4.1978733240869164e-07, "loss": 0.4969, "step": 9080 }, { "epoch": 0.5048543689320388, "grad_norm": 0.3404879868030548, "learning_rate": 4.2071197411003236e-07, "loss": 0.4925, "step": 9100 }, { "epoch": 0.5059639389736477, "grad_norm": 0.4206264913082123, "learning_rate": 4.2163661581137303e-07, "loss": 0.4729, "step": 9120 }, { "epoch": 0.5070735090152566, "grad_norm": 0.2556391656398773, "learning_rate": 4.225612575127138e-07, "loss": 0.4663, "step": 9140 }, { "epoch": 0.5081830790568654, "grad_norm": 0.19968748092651367, "learning_rate": 4.2348589921405453e-07, "loss": 0.4627, "step": 9160 }, { "epoch": 0.5092926490984744, "grad_norm": 0.18693120777606964, "learning_rate": 4.2441054091539526e-07, "loss": 0.5116, "step": 9180 }, { "epoch": 0.5104022191400832, "grad_norm": 0.6120763421058655, "learning_rate": 4.2533518261673603e-07, "loss": 0.5455, "step": 9200 }, { "epoch": 0.511511789181692, "grad_norm": 0.2741752564907074, "learning_rate": 4.262598243180767e-07, "loss": 0.5641, "step": 9220 }, { "epoch": 0.512621359223301, "grad_norm": 0.32368993759155273, "learning_rate": 4.2718446601941743e-07, "loss": 0.5138, "step": 9240 }, { "epoch": 0.5137309292649098, "grad_norm": 0.18577614426612854, "learning_rate": 4.281091077207582e-07, "loss": 0.5589, "step": 9260 }, { "epoch": 0.5148404993065188, "grad_norm": 0.3895021378993988, "learning_rate": 4.2903374942209893e-07, "loss": 0.5672, "step": 9280 }, { "epoch": 0.5159500693481276, "grad_norm": 0.2861137092113495, "learning_rate": 4.299583911234396e-07, "loss": 0.447, "step": 9300 }, { "epoch": 0.5170596393897364, "grad_norm": 0.1956733763217926, "learning_rate": 4.308830328247804e-07, "loss": 0.4737, "step": 9320 }, { "epoch": 0.5181692094313454, "grad_norm": 0.21330368518829346, "learning_rate": 4.318076745261211e-07, "loss": 0.5488, "step": 9340 }, { "epoch": 0.5192787794729542, "grad_norm": 0.22797897458076477, "learning_rate": 4.327323162274618e-07, "loss": 0.6304, "step": 9360 }, { "epoch": 0.5203883495145631, "grad_norm": 0.3280169367790222, "learning_rate": 4.336569579288026e-07, "loss": 0.449, "step": 9380 }, { "epoch": 0.521497919556172, "grad_norm": 0.1938370317220688, "learning_rate": 4.345815996301433e-07, "loss": 0.6148, "step": 9400 }, { "epoch": 0.5226074895977808, "grad_norm": 0.1874140352010727, "learning_rate": 4.35506241331484e-07, "loss": 0.4351, "step": 9420 }, { "epoch": 0.5237170596393897, "grad_norm": 0.32289639115333557, "learning_rate": 4.364308830328248e-07, "loss": 0.5614, "step": 9440 }, { "epoch": 0.5248266296809986, "grad_norm": 0.33790576457977295, "learning_rate": 4.373555247341655e-07, "loss": 0.5315, "step": 9460 }, { "epoch": 0.5259361997226075, "grad_norm": 0.11760276556015015, "learning_rate": 4.382801664355062e-07, "loss": 0.5404, "step": 9480 }, { "epoch": 0.5270457697642164, "grad_norm": 0.29093876481056213, "learning_rate": 4.3920480813684695e-07, "loss": 0.5708, "step": 9500 }, { "epoch": 0.5281553398058253, "grad_norm": 0.2900814116001129, "learning_rate": 4.4012944983818767e-07, "loss": 0.4933, "step": 9520 }, { "epoch": 0.5292649098474341, "grad_norm": 0.2806960642337799, "learning_rate": 4.4105409153952845e-07, "loss": 0.5684, "step": 9540 }, { "epoch": 0.530374479889043, "grad_norm": 0.20187591016292572, "learning_rate": 4.4197873324086917e-07, "loss": 0.5541, "step": 9560 }, { "epoch": 0.5314840499306519, "grad_norm": 0.25328460335731506, "learning_rate": 4.4290337494220984e-07, "loss": 0.4609, "step": 9580 }, { "epoch": 0.5325936199722607, "grad_norm": 0.22789980471134186, "learning_rate": 4.438280166435506e-07, "loss": 0.6008, "step": 9600 }, { "epoch": 0.5337031900138697, "grad_norm": 0.22686462104320526, "learning_rate": 4.4475265834489134e-07, "loss": 0.5901, "step": 9620 }, { "epoch": 0.5348127600554785, "grad_norm": 0.20367446541786194, "learning_rate": 4.4567730004623207e-07, "loss": 0.5921, "step": 9640 }, { "epoch": 0.5359223300970873, "grad_norm": 0.24750830233097076, "learning_rate": 4.4660194174757285e-07, "loss": 0.5244, "step": 9660 }, { "epoch": 0.5370319001386963, "grad_norm": 0.22665967047214508, "learning_rate": 4.475265834489135e-07, "loss": 0.6619, "step": 9680 }, { "epoch": 0.5381414701803051, "grad_norm": 0.21742811799049377, "learning_rate": 4.4845122515025424e-07, "loss": 0.4535, "step": 9700 }, { "epoch": 0.5392510402219141, "grad_norm": 0.24885281920433044, "learning_rate": 4.49375866851595e-07, "loss": 0.4427, "step": 9720 }, { "epoch": 0.5403606102635229, "grad_norm": 0.2896670997142792, "learning_rate": 4.5030050855293574e-07, "loss": 0.4553, "step": 9740 }, { "epoch": 0.5414701803051317, "grad_norm": 0.16223928332328796, "learning_rate": 4.512251502542764e-07, "loss": 0.4865, "step": 9760 }, { "epoch": 0.5425797503467407, "grad_norm": 0.21577787399291992, "learning_rate": 4.521497919556172e-07, "loss": 0.4797, "step": 9780 }, { "epoch": 0.5436893203883495, "grad_norm": 0.28907138109207153, "learning_rate": 4.530744336569579e-07, "loss": 0.5, "step": 9800 }, { "epoch": 0.5447988904299584, "grad_norm": 0.19837988913059235, "learning_rate": 4.5399907535829864e-07, "loss": 0.54, "step": 9820 }, { "epoch": 0.5459084604715673, "grad_norm": 0.2459855079650879, "learning_rate": 4.549237170596394e-07, "loss": 0.5042, "step": 9840 }, { "epoch": 0.5470180305131761, "grad_norm": 0.2531174123287201, "learning_rate": 4.558483587609801e-07, "loss": 0.494, "step": 9860 }, { "epoch": 0.548127600554785, "grad_norm": 0.27641522884368896, "learning_rate": 4.567730004623208e-07, "loss": 0.5629, "step": 9880 }, { "epoch": 0.5492371705963939, "grad_norm": 0.19625480473041534, "learning_rate": 4.576976421636616e-07, "loss": 0.4881, "step": 9900 }, { "epoch": 0.5503467406380028, "grad_norm": 0.3862428665161133, "learning_rate": 4.586222838650023e-07, "loss": 0.5441, "step": 9920 }, { "epoch": 0.5514563106796116, "grad_norm": 0.2694646120071411, "learning_rate": 4.59546925566343e-07, "loss": 0.4859, "step": 9940 }, { "epoch": 0.5525658807212205, "grad_norm": 0.3000504970550537, "learning_rate": 4.6047156726768376e-07, "loss": 0.4917, "step": 9960 }, { "epoch": 0.5536754507628294, "grad_norm": 0.21275319159030914, "learning_rate": 4.613962089690245e-07, "loss": 0.5584, "step": 9980 }, { "epoch": 0.5547850208044383, "grad_norm": 0.22570380568504333, "learning_rate": 4.623208506703652e-07, "loss": 0.5654, "step": 10000 }, { "epoch": 0.5558945908460472, "grad_norm": 0.19912026822566986, "learning_rate": 4.63245492371706e-07, "loss": 0.5072, "step": 10020 }, { "epoch": 0.557004160887656, "grad_norm": 0.2832677364349365, "learning_rate": 4.6417013407304666e-07, "loss": 0.4946, "step": 10040 }, { "epoch": 0.5581137309292649, "grad_norm": 0.41166970133781433, "learning_rate": 4.650947757743874e-07, "loss": 0.4583, "step": 10060 }, { "epoch": 0.5592233009708738, "grad_norm": 0.36281099915504456, "learning_rate": 4.6601941747572816e-07, "loss": 0.5193, "step": 10080 }, { "epoch": 0.5603328710124826, "grad_norm": 0.23534733057022095, "learning_rate": 4.669440591770689e-07, "loss": 0.5753, "step": 10100 }, { "epoch": 0.5614424410540916, "grad_norm": 0.3915897607803345, "learning_rate": 4.6786870087840955e-07, "loss": 0.4895, "step": 10120 }, { "epoch": 0.5625520110957004, "grad_norm": 0.19305333495140076, "learning_rate": 4.6879334257975033e-07, "loss": 0.5521, "step": 10140 }, { "epoch": 0.5636615811373092, "grad_norm": 0.24136780202388763, "learning_rate": 4.6971798428109105e-07, "loss": 0.5126, "step": 10160 }, { "epoch": 0.5647711511789182, "grad_norm": 0.7551522254943848, "learning_rate": 4.706426259824318e-07, "loss": 0.5578, "step": 10180 }, { "epoch": 0.565880721220527, "grad_norm": 0.24454346299171448, "learning_rate": 4.7156726768377255e-07, "loss": 0.6188, "step": 10200 }, { "epoch": 0.566990291262136, "grad_norm": 0.37767449021339417, "learning_rate": 4.724919093851132e-07, "loss": 0.4718, "step": 10220 }, { "epoch": 0.5680998613037448, "grad_norm": 0.18873493373394012, "learning_rate": 4.7341655108645395e-07, "loss": 0.5381, "step": 10240 }, { "epoch": 0.5692094313453536, "grad_norm": 0.20388391613960266, "learning_rate": 4.743411927877947e-07, "loss": 0.5298, "step": 10260 }, { "epoch": 0.5703190013869626, "grad_norm": 0.21988770365715027, "learning_rate": 4.7526583448913545e-07, "loss": 0.4575, "step": 10280 }, { "epoch": 0.5714285714285714, "grad_norm": 0.2506229281425476, "learning_rate": 4.761904761904761e-07, "loss": 0.571, "step": 10300 }, { "epoch": 0.5725381414701803, "grad_norm": 0.21189585328102112, "learning_rate": 4.771151178918169e-07, "loss": 0.5025, "step": 10320 }, { "epoch": 0.5736477115117892, "grad_norm": 0.18935170769691467, "learning_rate": 4.780397595931576e-07, "loss": 0.5407, "step": 10340 }, { "epoch": 0.574757281553398, "grad_norm": 0.47174862027168274, "learning_rate": 4.789644012944983e-07, "loss": 0.5619, "step": 10360 }, { "epoch": 0.575866851595007, "grad_norm": 0.280070036649704, "learning_rate": 4.798890429958391e-07, "loss": 0.6197, "step": 10380 }, { "epoch": 0.5769764216366158, "grad_norm": 0.24726368486881256, "learning_rate": 4.808136846971798e-07, "loss": 0.4462, "step": 10400 }, { "epoch": 0.5780859916782247, "grad_norm": 0.17450051009655, "learning_rate": 4.817383263985206e-07, "loss": 0.5471, "step": 10420 }, { "epoch": 0.5791955617198336, "grad_norm": 0.26398560404777527, "learning_rate": 4.826629680998612e-07, "loss": 0.5714, "step": 10440 }, { "epoch": 0.5803051317614425, "grad_norm": 0.24109946191310883, "learning_rate": 4.83587609801202e-07, "loss": 0.5268, "step": 10460 }, { "epoch": 0.5814147018030513, "grad_norm": 0.18299829959869385, "learning_rate": 4.845122515025428e-07, "loss": 0.4802, "step": 10480 }, { "epoch": 0.5825242718446602, "grad_norm": 0.24511303007602692, "learning_rate": 4.854368932038835e-07, "loss": 0.5393, "step": 10500 }, { "epoch": 0.5836338418862691, "grad_norm": 0.21095749735832214, "learning_rate": 4.863615349052241e-07, "loss": 0.5132, "step": 10520 }, { "epoch": 0.5847434119278779, "grad_norm": 0.25618451833724976, "learning_rate": 4.87286176606565e-07, "loss": 0.5151, "step": 10540 }, { "epoch": 0.5858529819694869, "grad_norm": 0.2276226133108139, "learning_rate": 4.882108183079057e-07, "loss": 0.6023, "step": 10560 }, { "epoch": 0.5869625520110957, "grad_norm": 0.18411724269390106, "learning_rate": 4.891354600092464e-07, "loss": 0.5539, "step": 10580 }, { "epoch": 0.5880721220527045, "grad_norm": 0.2630547881126404, "learning_rate": 4.900601017105871e-07, "loss": 0.5431, "step": 10600 }, { "epoch": 0.5891816920943135, "grad_norm": 0.20443211495876312, "learning_rate": 4.909847434119279e-07, "loss": 0.5149, "step": 10620 }, { "epoch": 0.5902912621359223, "grad_norm": 0.23013320565223694, "learning_rate": 4.919093851132686e-07, "loss": 0.5168, "step": 10640 }, { "epoch": 0.5914008321775313, "grad_norm": 0.21951639652252197, "learning_rate": 4.928340268146093e-07, "loss": 0.4965, "step": 10660 }, { "epoch": 0.5925104022191401, "grad_norm": 0.23171420395374298, "learning_rate": 4.9375866851595e-07, "loss": 0.6251, "step": 10680 }, { "epoch": 0.5936199722607489, "grad_norm": 0.19360388815402985, "learning_rate": 4.946833102172908e-07, "loss": 0.4962, "step": 10700 }, { "epoch": 0.5947295423023579, "grad_norm": 0.23048527538776398, "learning_rate": 4.956079519186315e-07, "loss": 0.5471, "step": 10720 }, { "epoch": 0.5958391123439667, "grad_norm": 0.19450080394744873, "learning_rate": 4.965325936199722e-07, "loss": 0.4633, "step": 10740 }, { "epoch": 0.5969486823855756, "grad_norm": 0.2471095770597458, "learning_rate": 4.974572353213129e-07, "loss": 0.4779, "step": 10760 }, { "epoch": 0.5980582524271845, "grad_norm": 0.22223465144634247, "learning_rate": 4.983818770226538e-07, "loss": 0.5368, "step": 10780 }, { "epoch": 0.5991678224687933, "grad_norm": 0.2931700348854065, "learning_rate": 4.993065187239944e-07, "loss": 0.5382, "step": 10800 }, { "epoch": 0.6002773925104022, "grad_norm": 0.28318122029304504, "learning_rate": 4.999999835192591e-07, "loss": 0.5871, "step": 10820 }, { "epoch": 0.6013869625520111, "grad_norm": 0.1852101981639862, "learning_rate": 4.999995879815858e-07, "loss": 0.4431, "step": 10840 }, { "epoch": 0.60249653259362, "grad_norm": 0.20264457166194916, "learning_rate": 4.999986650611594e-07, "loss": 0.475, "step": 10860 }, { "epoch": 0.6036061026352288, "grad_norm": 0.14976227283477783, "learning_rate": 4.999972147599267e-07, "loss": 0.5002, "step": 10880 }, { "epoch": 0.6047156726768377, "grad_norm": 0.26538553833961487, "learning_rate": 4.999952370809473e-07, "loss": 0.616, "step": 10900 }, { "epoch": 0.6058252427184466, "grad_norm": 0.22870361804962158, "learning_rate": 4.999927320283929e-07, "loss": 0.492, "step": 10920 }, { "epoch": 0.6069348127600555, "grad_norm": 0.2551482915878296, "learning_rate": 4.999896996075486e-07, "loss": 0.4574, "step": 10940 }, { "epoch": 0.6080443828016644, "grad_norm": 0.24273602664470673, "learning_rate": 4.999861398248107e-07, "loss": 0.5455, "step": 10960 }, { "epoch": 0.6091539528432732, "grad_norm": 0.21139869093894958, "learning_rate": 4.999820526876891e-07, "loss": 0.5297, "step": 10980 }, { "epoch": 0.6102635228848821, "grad_norm": 0.3627355992794037, "learning_rate": 4.999774382048057e-07, "loss": 0.6048, "step": 11000 }, { "epoch": 0.611373092926491, "grad_norm": 0.1958065778017044, "learning_rate": 4.999722963858947e-07, "loss": 0.5966, "step": 11020 }, { "epoch": 0.6124826629680998, "grad_norm": 0.23947326838970184, "learning_rate": 4.999666272418033e-07, "loss": 0.4801, "step": 11040 }, { "epoch": 0.6135922330097088, "grad_norm": 0.2646409571170807, "learning_rate": 4.999604307844905e-07, "loss": 0.4903, "step": 11060 }, { "epoch": 0.6147018030513176, "grad_norm": 0.28446322679519653, "learning_rate": 4.999537070270278e-07, "loss": 0.5325, "step": 11080 }, { "epoch": 0.6158113730929264, "grad_norm": 0.3418862223625183, "learning_rate": 4.999464559835997e-07, "loss": 0.4894, "step": 11100 }, { "epoch": 0.6169209431345354, "grad_norm": 0.24407543241977692, "learning_rate": 4.999386776695021e-07, "loss": 0.5557, "step": 11120 }, { "epoch": 0.6180305131761442, "grad_norm": 0.3358474671840668, "learning_rate": 4.999303721011437e-07, "loss": 0.5208, "step": 11140 }, { "epoch": 0.6191400832177532, "grad_norm": 0.21251265704631805, "learning_rate": 4.999215392960455e-07, "loss": 0.5094, "step": 11160 }, { "epoch": 0.620249653259362, "grad_norm": 0.19353316724300385, "learning_rate": 4.999121792728404e-07, "loss": 0.5487, "step": 11180 }, { "epoch": 0.6213592233009708, "grad_norm": 0.22420887649059296, "learning_rate": 4.99902292051274e-07, "loss": 0.4533, "step": 11200 }, { "epoch": 0.6224687933425798, "grad_norm": 0.685371994972229, "learning_rate": 4.998918776522036e-07, "loss": 0.532, "step": 11220 }, { "epoch": 0.6235783633841886, "grad_norm": 0.3081361651420593, "learning_rate": 4.998809360975985e-07, "loss": 0.5977, "step": 11240 }, { "epoch": 0.6246879334257975, "grad_norm": 0.2112504243850708, "learning_rate": 4.998694674105406e-07, "loss": 0.5674, "step": 11260 }, { "epoch": 0.6257975034674064, "grad_norm": 0.30409491062164307, "learning_rate": 4.998574716152234e-07, "loss": 0.5374, "step": 11280 }, { "epoch": 0.6269070735090153, "grad_norm": 0.1710476279258728, "learning_rate": 4.998449487369523e-07, "loss": 0.5585, "step": 11300 }, { "epoch": 0.6280166435506241, "grad_norm": 0.21146634221076965, "learning_rate": 4.998318988021449e-07, "loss": 0.4967, "step": 11320 }, { "epoch": 0.629126213592233, "grad_norm": 0.4319445490837097, "learning_rate": 4.998183218383305e-07, "loss": 0.5381, "step": 11340 }, { "epoch": 0.6302357836338419, "grad_norm": 0.2634948492050171, "learning_rate": 4.998042178741502e-07, "loss": 0.4865, "step": 11360 }, { "epoch": 0.6313453536754507, "grad_norm": 0.23158107697963715, "learning_rate": 4.997895869393566e-07, "loss": 0.5132, "step": 11380 }, { "epoch": 0.6324549237170597, "grad_norm": 0.1735488921403885, "learning_rate": 4.997744290648143e-07, "loss": 0.4402, "step": 11400 }, { "epoch": 0.6335644937586685, "grad_norm": 0.20482942461967468, "learning_rate": 4.997587442824993e-07, "loss": 0.4954, "step": 11420 }, { "epoch": 0.6346740638002774, "grad_norm": 0.23735348880290985, "learning_rate": 4.997425326254993e-07, "loss": 0.4866, "step": 11440 }, { "epoch": 0.6357836338418863, "grad_norm": 0.24048744142055511, "learning_rate": 4.997257941280133e-07, "loss": 0.5499, "step": 11460 }, { "epoch": 0.6368932038834951, "grad_norm": 0.27225053310394287, "learning_rate": 4.997085288253516e-07, "loss": 0.5484, "step": 11480 }, { "epoch": 0.6380027739251041, "grad_norm": 0.21932609379291534, "learning_rate": 4.996907367539362e-07, "loss": 0.5618, "step": 11500 }, { "epoch": 0.6391123439667129, "grad_norm": 0.2289571762084961, "learning_rate": 4.996724179512999e-07, "loss": 0.6138, "step": 11520 }, { "epoch": 0.6402219140083217, "grad_norm": 0.2370901256799698, "learning_rate": 4.99653572456087e-07, "loss": 0.4856, "step": 11540 }, { "epoch": 0.6413314840499307, "grad_norm": 0.2043846994638443, "learning_rate": 4.996342003080525e-07, "loss": 0.4488, "step": 11560 }, { "epoch": 0.6424410540915395, "grad_norm": 0.24291053414344788, "learning_rate": 4.996143015480629e-07, "loss": 0.4688, "step": 11580 }, { "epoch": 0.6435506241331485, "grad_norm": 0.3703598082065582, "learning_rate": 4.99593876218095e-07, "loss": 0.5123, "step": 11600 }, { "epoch": 0.6446601941747573, "grad_norm": 0.2228018194437027, "learning_rate": 4.995729243612369e-07, "loss": 0.4482, "step": 11620 }, { "epoch": 0.6457697642163661, "grad_norm": 0.3408016264438629, "learning_rate": 4.995514460216873e-07, "loss": 0.5718, "step": 11640 }, { "epoch": 0.6468793342579751, "grad_norm": 0.2187211811542511, "learning_rate": 4.995294412447555e-07, "loss": 0.5386, "step": 11660 }, { "epoch": 0.6479889042995839, "grad_norm": 0.2644013464450836, "learning_rate": 4.995069100768613e-07, "loss": 0.4799, "step": 11680 }, { "epoch": 0.6490984743411928, "grad_norm": 0.28515705466270447, "learning_rate": 4.994838525655349e-07, "loss": 0.5356, "step": 11700 }, { "epoch": 0.6502080443828017, "grad_norm": 0.22527480125427246, "learning_rate": 4.99460268759417e-07, "loss": 0.6369, "step": 11720 }, { "epoch": 0.6513176144244105, "grad_norm": 0.24099533259868622, "learning_rate": 4.994361587082586e-07, "loss": 0.52, "step": 11740 }, { "epoch": 0.6524271844660194, "grad_norm": 0.28258323669433594, "learning_rate": 4.994115224629204e-07, "loss": 0.5285, "step": 11760 }, { "epoch": 0.6535367545076283, "grad_norm": 0.28382718563079834, "learning_rate": 4.993863600753734e-07, "loss": 0.461, "step": 11780 }, { "epoch": 0.6546463245492372, "grad_norm": 0.2652598023414612, "learning_rate": 4.99360671598699e-07, "loss": 0.4041, "step": 11800 }, { "epoch": 0.655755894590846, "grad_norm": 0.2193400114774704, "learning_rate": 4.993344570870874e-07, "loss": 0.5637, "step": 11820 }, { "epoch": 0.6568654646324549, "grad_norm": 0.23770926892757416, "learning_rate": 4.993077165958391e-07, "loss": 0.5826, "step": 11840 }, { "epoch": 0.6579750346740638, "grad_norm": 0.31088513135910034, "learning_rate": 4.992804501813643e-07, "loss": 0.4451, "step": 11860 }, { "epoch": 0.6590846047156727, "grad_norm": 0.24922266602516174, "learning_rate": 4.992526579011823e-07, "loss": 0.6032, "step": 11880 }, { "epoch": 0.6601941747572816, "grad_norm": 0.4216838777065277, "learning_rate": 4.99224339813922e-07, "loss": 0.5633, "step": 11900 }, { "epoch": 0.6613037447988904, "grad_norm": 0.23628100752830505, "learning_rate": 4.991954959793212e-07, "loss": 0.5307, "step": 11920 }, { "epoch": 0.6624133148404993, "grad_norm": 0.24533213675022125, "learning_rate": 4.991661264582271e-07, "loss": 0.5425, "step": 11940 }, { "epoch": 0.6635228848821082, "grad_norm": 0.261923223733902, "learning_rate": 4.991362313125957e-07, "loss": 0.4876, "step": 11960 }, { "epoch": 0.664632454923717, "grad_norm": 0.4596007466316223, "learning_rate": 4.991058106054917e-07, "loss": 0.4515, "step": 11980 }, { "epoch": 0.665742024965326, "grad_norm": 0.17913702130317688, "learning_rate": 4.990748644010888e-07, "loss": 0.509, "step": 12000 }, { "epoch": 0.6668515950069348, "grad_norm": 0.26999178528785706, "learning_rate": 4.99043392764669e-07, "loss": 0.4939, "step": 12020 }, { "epoch": 0.6679611650485436, "grad_norm": 0.31055986881256104, "learning_rate": 4.990113957626229e-07, "loss": 0.5451, "step": 12040 }, { "epoch": 0.6690707350901526, "grad_norm": 0.23110932111740112, "learning_rate": 4.989788734624492e-07, "loss": 0.4863, "step": 12060 }, { "epoch": 0.6701803051317614, "grad_norm": 0.24311134219169617, "learning_rate": 4.989458259327548e-07, "loss": 0.5626, "step": 12080 }, { "epoch": 0.6712898751733704, "grad_norm": 0.33318567276000977, "learning_rate": 4.989122532432546e-07, "loss": 0.4121, "step": 12100 }, { "epoch": 0.6723994452149792, "grad_norm": 0.20019620656967163, "learning_rate": 4.988781554647714e-07, "loss": 0.5623, "step": 12120 }, { "epoch": 0.673509015256588, "grad_norm": 0.20741215348243713, "learning_rate": 4.988435326692356e-07, "loss": 0.6168, "step": 12140 }, { "epoch": 0.674618585298197, "grad_norm": 0.25016623735427856, "learning_rate": 4.988083849296852e-07, "loss": 0.4316, "step": 12160 }, { "epoch": 0.6757281553398058, "grad_norm": 0.21898433566093445, "learning_rate": 4.987727123202655e-07, "loss": 0.5066, "step": 12180 }, { "epoch": 0.6768377253814147, "grad_norm": 0.29984351992607117, "learning_rate": 4.987365149162293e-07, "loss": 0.4435, "step": 12200 }, { "epoch": 0.6779472954230236, "grad_norm": 0.2449224442243576, "learning_rate": 4.98699792793936e-07, "loss": 0.4222, "step": 12220 }, { "epoch": 0.6790568654646325, "grad_norm": 0.2419872134923935, "learning_rate": 4.986625460308524e-07, "loss": 0.4595, "step": 12240 }, { "epoch": 0.6801664355062413, "grad_norm": 0.19066084921360016, "learning_rate": 4.986247747055517e-07, "loss": 0.469, "step": 12260 }, { "epoch": 0.6812760055478502, "grad_norm": 0.19902339577674866, "learning_rate": 4.98586478897714e-07, "loss": 0.5446, "step": 12280 }, { "epoch": 0.6823855755894591, "grad_norm": 0.4300174117088318, "learning_rate": 4.985476586881254e-07, "loss": 0.5844, "step": 12300 }, { "epoch": 0.683495145631068, "grad_norm": 0.19871722161769867, "learning_rate": 4.985083141586786e-07, "loss": 0.4553, "step": 12320 }, { "epoch": 0.6846047156726769, "grad_norm": 0.2638891935348511, "learning_rate": 4.984684453923721e-07, "loss": 0.48, "step": 12340 }, { "epoch": 0.6857142857142857, "grad_norm": 0.2136056274175644, "learning_rate": 4.984280524733107e-07, "loss": 0.5821, "step": 12360 }, { "epoch": 0.6868238557558946, "grad_norm": 0.27467551827430725, "learning_rate": 4.983871354867043e-07, "loss": 0.6047, "step": 12380 }, { "epoch": 0.6879334257975035, "grad_norm": 0.1906892955303192, "learning_rate": 4.98345694518869e-07, "loss": 0.4991, "step": 12400 }, { "epoch": 0.6890429958391123, "grad_norm": 0.21737554669380188, "learning_rate": 4.983037296572259e-07, "loss": 0.5198, "step": 12420 }, { "epoch": 0.6901525658807213, "grad_norm": 0.2989285886287689, "learning_rate": 4.982612409903012e-07, "loss": 0.4793, "step": 12440 }, { "epoch": 0.6912621359223301, "grad_norm": 0.5304551124572754, "learning_rate": 4.982182286077262e-07, "loss": 0.5495, "step": 12460 }, { "epoch": 0.6923717059639389, "grad_norm": 0.3350374698638916, "learning_rate": 4.981746926002372e-07, "loss": 0.4756, "step": 12480 }, { "epoch": 0.6934812760055479, "grad_norm": 0.23341912031173706, "learning_rate": 4.981306330596747e-07, "loss": 0.4574, "step": 12500 }, { "epoch": 0.6945908460471567, "grad_norm": 0.20473182201385498, "learning_rate": 4.98086050078984e-07, "loss": 0.5174, "step": 12520 }, { "epoch": 0.6957004160887656, "grad_norm": 0.2923905849456787, "learning_rate": 4.980409437522143e-07, "loss": 0.4867, "step": 12540 }, { "epoch": 0.6968099861303745, "grad_norm": 0.25249674916267395, "learning_rate": 4.97995314174519e-07, "loss": 0.5913, "step": 12560 }, { "epoch": 0.6979195561719833, "grad_norm": 0.23195558786392212, "learning_rate": 4.979491614421553e-07, "loss": 0.5315, "step": 12580 }, { "epoch": 0.6990291262135923, "grad_norm": 0.32216912508010864, "learning_rate": 4.979024856524839e-07, "loss": 0.4512, "step": 12600 }, { "epoch": 0.7001386962552011, "grad_norm": 0.25318339467048645, "learning_rate": 4.978552869039691e-07, "loss": 0.5246, "step": 12620 }, { "epoch": 0.70124826629681, "grad_norm": 0.3519690930843353, "learning_rate": 4.978075652961781e-07, "loss": 0.4516, "step": 12640 }, { "epoch": 0.7023578363384189, "grad_norm": 0.2068701535463333, "learning_rate": 4.977593209297814e-07, "loss": 0.4496, "step": 12660 }, { "epoch": 0.7034674063800277, "grad_norm": 0.22099046409130096, "learning_rate": 4.977105539065522e-07, "loss": 0.5329, "step": 12680 }, { "epoch": 0.7045769764216366, "grad_norm": 0.21252214908599854, "learning_rate": 4.976612643293663e-07, "loss": 0.4534, "step": 12700 }, { "epoch": 0.7056865464632455, "grad_norm": 0.2109062522649765, "learning_rate": 4.976114523022015e-07, "loss": 0.4855, "step": 12720 }, { "epoch": 0.7067961165048544, "grad_norm": 0.25755247473716736, "learning_rate": 4.975611179301381e-07, "loss": 0.5593, "step": 12740 }, { "epoch": 0.7079056865464632, "grad_norm": 0.24992936849594116, "learning_rate": 4.975102613193583e-07, "loss": 0.5806, "step": 12760 }, { "epoch": 0.7090152565880721, "grad_norm": 0.2244323492050171, "learning_rate": 4.974588825771457e-07, "loss": 0.4997, "step": 12780 }, { "epoch": 0.710124826629681, "grad_norm": 0.19922678172588348, "learning_rate": 4.974069818118858e-07, "loss": 0.4892, "step": 12800 }, { "epoch": 0.7112343966712898, "grad_norm": 0.40158599615097046, "learning_rate": 4.973545591330647e-07, "loss": 0.5177, "step": 12820 }, { "epoch": 0.7123439667128988, "grad_norm": 0.34475815296173096, "learning_rate": 4.9730161465127e-07, "loss": 0.5986, "step": 12840 }, { "epoch": 0.7134535367545076, "grad_norm": 0.2620779871940613, "learning_rate": 4.972481484781901e-07, "loss": 0.6164, "step": 12860 }, { "epoch": 0.7145631067961165, "grad_norm": 0.33956992626190186, "learning_rate": 4.971941607266134e-07, "loss": 0.4266, "step": 12880 }, { "epoch": 0.7156726768377254, "grad_norm": 0.26485252380371094, "learning_rate": 4.971396515104292e-07, "loss": 0.4735, "step": 12900 }, { "epoch": 0.7167822468793342, "grad_norm": 0.27967050671577454, "learning_rate": 4.970846209446264e-07, "loss": 0.5209, "step": 12920 }, { "epoch": 0.7178918169209432, "grad_norm": 0.1987200528383255, "learning_rate": 4.97029069145294e-07, "loss": 0.5776, "step": 12940 }, { "epoch": 0.719001386962552, "grad_norm": 0.199361190199852, "learning_rate": 4.969729962296203e-07, "loss": 0.476, "step": 12960 }, { "epoch": 0.7201109570041608, "grad_norm": 0.2446381151676178, "learning_rate": 4.969164023158933e-07, "loss": 0.5148, "step": 12980 }, { "epoch": 0.7212205270457698, "grad_norm": 0.33316630125045776, "learning_rate": 4.968592875234995e-07, "loss": 0.4937, "step": 13000 }, { "epoch": 0.7223300970873786, "grad_norm": 0.23806354403495789, "learning_rate": 4.968016519729246e-07, "loss": 0.5441, "step": 13020 }, { "epoch": 0.7234396671289876, "grad_norm": 0.25994622707366943, "learning_rate": 4.967434957857529e-07, "loss": 0.4716, "step": 13040 }, { "epoch": 0.7245492371705964, "grad_norm": 0.23756317794322968, "learning_rate": 4.966848190846669e-07, "loss": 0.5581, "step": 13060 }, { "epoch": 0.7256588072122053, "grad_norm": 0.325259804725647, "learning_rate": 4.966256219934471e-07, "loss": 0.5346, "step": 13080 }, { "epoch": 0.7267683772538142, "grad_norm": 0.1814831644296646, "learning_rate": 4.965659046369716e-07, "loss": 0.5328, "step": 13100 }, { "epoch": 0.727877947295423, "grad_norm": 0.4260367453098297, "learning_rate": 4.965056671412164e-07, "loss": 0.4851, "step": 13120 }, { "epoch": 0.7289875173370319, "grad_norm": 0.25289660692214966, "learning_rate": 4.964449096332547e-07, "loss": 0.4328, "step": 13140 }, { "epoch": 0.7300970873786408, "grad_norm": 0.237093985080719, "learning_rate": 4.963836322412563e-07, "loss": 0.5327, "step": 13160 }, { "epoch": 0.7312066574202497, "grad_norm": 0.22632507979869843, "learning_rate": 4.963218350944881e-07, "loss": 0.5619, "step": 13180 }, { "epoch": 0.7323162274618585, "grad_norm": 0.28966596722602844, "learning_rate": 4.962595183233133e-07, "loss": 0.5084, "step": 13200 }, { "epoch": 0.7334257975034674, "grad_norm": 0.28821441531181335, "learning_rate": 4.961966820591913e-07, "loss": 0.5239, "step": 13220 }, { "epoch": 0.7345353675450763, "grad_norm": 0.34235697984695435, "learning_rate": 4.961333264346774e-07, "loss": 0.4959, "step": 13240 }, { "epoch": 0.7356449375866851, "grad_norm": 0.30707985162734985, "learning_rate": 4.960694515834224e-07, "loss": 0.4504, "step": 13260 }, { "epoch": 0.7367545076282941, "grad_norm": 0.2812998592853546, "learning_rate": 4.960050576401724e-07, "loss": 0.4862, "step": 13280 }, { "epoch": 0.7378640776699029, "grad_norm": 0.29537180066108704, "learning_rate": 4.95940144740769e-07, "loss": 0.4552, "step": 13300 }, { "epoch": 0.7389736477115117, "grad_norm": 0.6961839199066162, "learning_rate": 4.958747130221477e-07, "loss": 0.5532, "step": 13320 }, { "epoch": 0.7400832177531207, "grad_norm": 0.24016644060611725, "learning_rate": 4.958087626223394e-07, "loss": 0.4068, "step": 13340 }, { "epoch": 0.7411927877947295, "grad_norm": 0.17492686212062836, "learning_rate": 4.957422936804684e-07, "loss": 0.5467, "step": 13360 }, { "epoch": 0.7423023578363385, "grad_norm": 0.19463686645030975, "learning_rate": 4.956753063367537e-07, "loss": 0.5138, "step": 13380 }, { "epoch": 0.7434119278779473, "grad_norm": 0.23167824745178223, "learning_rate": 4.956078007325069e-07, "loss": 0.5657, "step": 13400 }, { "epoch": 0.7445214979195561, "grad_norm": 0.2615565061569214, "learning_rate": 4.955397770101336e-07, "loss": 0.5774, "step": 13420 }, { "epoch": 0.7456310679611651, "grad_norm": 0.21568907797336578, "learning_rate": 4.954712353131323e-07, "loss": 0.4273, "step": 13440 }, { "epoch": 0.7467406380027739, "grad_norm": 0.18977397680282593, "learning_rate": 4.954021757860939e-07, "loss": 0.4739, "step": 13460 }, { "epoch": 0.7478502080443828, "grad_norm": 0.30827805399894714, "learning_rate": 4.95332598574702e-07, "loss": 0.4992, "step": 13480 }, { "epoch": 0.7489597780859917, "grad_norm": 0.22292642295360565, "learning_rate": 4.952625038257321e-07, "loss": 0.5482, "step": 13500 }, { "epoch": 0.7500693481276005, "grad_norm": 0.2590494155883789, "learning_rate": 4.951918916870514e-07, "loss": 0.5736, "step": 13520 }, { "epoch": 0.7511789181692095, "grad_norm": 0.3594379723072052, "learning_rate": 4.951207623076186e-07, "loss": 0.485, "step": 13540 }, { "epoch": 0.7522884882108183, "grad_norm": 0.20570561289787292, "learning_rate": 4.950491158374837e-07, "loss": 0.5755, "step": 13560 }, { "epoch": 0.7533980582524272, "grad_norm": 0.29242074489593506, "learning_rate": 4.949769524277874e-07, "loss": 0.5644, "step": 13580 }, { "epoch": 0.7545076282940361, "grad_norm": 0.3922889232635498, "learning_rate": 4.949042722307608e-07, "loss": 0.5359, "step": 13600 }, { "epoch": 0.7556171983356449, "grad_norm": 0.22301004827022552, "learning_rate": 4.948310753997254e-07, "loss": 0.4671, "step": 13620 }, { "epoch": 0.7567267683772538, "grad_norm": 0.30847975611686707, "learning_rate": 4.947573620890924e-07, "loss": 0.4983, "step": 13640 }, { "epoch": 0.7578363384188627, "grad_norm": 0.23073406517505646, "learning_rate": 4.946831324543627e-07, "loss": 0.4901, "step": 13660 }, { "epoch": 0.7589459084604716, "grad_norm": 0.32450634241104126, "learning_rate": 4.94608386652126e-07, "loss": 0.4763, "step": 13680 }, { "epoch": 0.7600554785020804, "grad_norm": 0.18560366332530975, "learning_rate": 4.945331248400613e-07, "loss": 0.4744, "step": 13700 }, { "epoch": 0.7611650485436893, "grad_norm": 0.20490926504135132, "learning_rate": 4.94457347176936e-07, "loss": 0.5917, "step": 13720 }, { "epoch": 0.7622746185852982, "grad_norm": 0.1755695343017578, "learning_rate": 4.943810538226056e-07, "loss": 0.5368, "step": 13740 }, { "epoch": 0.763384188626907, "grad_norm": 0.3012818992137909, "learning_rate": 4.943042449380137e-07, "loss": 0.4551, "step": 13760 }, { "epoch": 0.764493758668516, "grad_norm": 0.2918994128704071, "learning_rate": 4.942269206851912e-07, "loss": 0.5907, "step": 13780 }, { "epoch": 0.7656033287101248, "grad_norm": 0.20111998915672302, "learning_rate": 4.941490812272563e-07, "loss": 0.4781, "step": 13800 }, { "epoch": 0.7667128987517337, "grad_norm": 0.2861482501029968, "learning_rate": 4.94070726728414e-07, "loss": 0.4925, "step": 13820 }, { "epoch": 0.7678224687933426, "grad_norm": 0.380003422498703, "learning_rate": 4.939918573539559e-07, "loss": 0.544, "step": 13840 }, { "epoch": 0.7689320388349514, "grad_norm": 0.23532161116600037, "learning_rate": 4.939124732702595e-07, "loss": 0.4854, "step": 13860 }, { "epoch": 0.7700416088765604, "grad_norm": 0.21966029703617096, "learning_rate": 4.938325746447884e-07, "loss": 0.4671, "step": 13880 }, { "epoch": 0.7711511789181692, "grad_norm": 0.22129754722118378, "learning_rate": 4.937521616460915e-07, "loss": 0.5384, "step": 13900 }, { "epoch": 0.772260748959778, "grad_norm": 0.23980213701725006, "learning_rate": 4.936712344438028e-07, "loss": 0.5326, "step": 13920 }, { "epoch": 0.773370319001387, "grad_norm": 0.22208735346794128, "learning_rate": 4.935897932086409e-07, "loss": 0.5939, "step": 13940 }, { "epoch": 0.7744798890429958, "grad_norm": 0.3069957196712494, "learning_rate": 4.935078381124091e-07, "loss": 0.4534, "step": 13960 }, { "epoch": 0.7755894590846047, "grad_norm": 0.25610312819480896, "learning_rate": 4.934253693279943e-07, "loss": 0.4865, "step": 13980 }, { "epoch": 0.7766990291262136, "grad_norm": 0.18258234858512878, "learning_rate": 4.933423870293673e-07, "loss": 0.4846, "step": 14000 }, { "epoch": 0.7778085991678225, "grad_norm": 0.29724743962287903, "learning_rate": 4.932588913915822e-07, "loss": 0.5323, "step": 14020 }, { "epoch": 0.7789181692094314, "grad_norm": 0.29876524209976196, "learning_rate": 4.931748825907759e-07, "loss": 0.4786, "step": 14040 }, { "epoch": 0.7800277392510402, "grad_norm": 0.17633473873138428, "learning_rate": 4.930903608041679e-07, "loss": 0.5135, "step": 14060 }, { "epoch": 0.7811373092926491, "grad_norm": 0.30462852120399475, "learning_rate": 4.930053262100596e-07, "loss": 0.5304, "step": 14080 }, { "epoch": 0.782246879334258, "grad_norm": 0.2663354277610779, "learning_rate": 4.929197789878347e-07, "loss": 0.4568, "step": 14100 }, { "epoch": 0.7833564493758669, "grad_norm": 0.32569876313209534, "learning_rate": 4.928337193179578e-07, "loss": 0.4756, "step": 14120 }, { "epoch": 0.7844660194174757, "grad_norm": 0.2273908406496048, "learning_rate": 4.92747147381975e-07, "loss": 0.556, "step": 14140 }, { "epoch": 0.7855755894590846, "grad_norm": 0.2550768256187439, "learning_rate": 4.926600633625126e-07, "loss": 0.4907, "step": 14160 }, { "epoch": 0.7866851595006935, "grad_norm": 0.3085142970085144, "learning_rate": 4.925724674432771e-07, "loss": 0.3765, "step": 14180 }, { "epoch": 0.7877947295423023, "grad_norm": 0.22788256406784058, "learning_rate": 4.924843598090557e-07, "loss": 0.4527, "step": 14200 }, { "epoch": 0.7889042995839113, "grad_norm": 0.41495075821876526, "learning_rate": 4.92395740645714e-07, "loss": 0.5893, "step": 14220 }, { "epoch": 0.7900138696255201, "grad_norm": 0.2527763843536377, "learning_rate": 4.923066101401973e-07, "loss": 0.5653, "step": 14240 }, { "epoch": 0.791123439667129, "grad_norm": 0.22646237909793854, "learning_rate": 4.922169684805296e-07, "loss": 0.5204, "step": 14260 }, { "epoch": 0.7922330097087379, "grad_norm": 0.23035015165805817, "learning_rate": 4.92126815855813e-07, "loss": 0.5439, "step": 14280 }, { "epoch": 0.7933425797503467, "grad_norm": 0.2976521849632263, "learning_rate": 4.920361524562276e-07, "loss": 0.5024, "step": 14300 }, { "epoch": 0.7944521497919557, "grad_norm": 0.27207639813423157, "learning_rate": 4.919449784730308e-07, "loss": 0.4756, "step": 14320 }, { "epoch": 0.7955617198335645, "grad_norm": 0.24028311669826508, "learning_rate": 4.918532940985576e-07, "loss": 0.4261, "step": 14340 }, { "epoch": 0.7966712898751733, "grad_norm": 0.21439999341964722, "learning_rate": 4.91761099526219e-07, "loss": 0.4244, "step": 14360 }, { "epoch": 0.7977808599167823, "grad_norm": 0.26036378741264343, "learning_rate": 4.916683949505028e-07, "loss": 0.4687, "step": 14380 }, { "epoch": 0.7988904299583911, "grad_norm": 0.2510151267051697, "learning_rate": 4.915751805669725e-07, "loss": 0.5159, "step": 14400 }, { "epoch": 0.8, "grad_norm": 0.28550541400909424, "learning_rate": 4.91481456572267e-07, "loss": 0.5684, "step": 14420 }, { "epoch": 0.8011095700416089, "grad_norm": 0.39744970202445984, "learning_rate": 4.913872231641005e-07, "loss": 0.4914, "step": 14440 }, { "epoch": 0.8022191400832177, "grad_norm": 0.21175946295261383, "learning_rate": 4.912924805412613e-07, "loss": 0.5646, "step": 14460 }, { "epoch": 0.8033287101248267, "grad_norm": 1.472761631011963, "learning_rate": 4.911972289036124e-07, "loss": 0.5229, "step": 14480 }, { "epoch": 0.8044382801664355, "grad_norm": 0.2281097024679184, "learning_rate": 4.911014684520904e-07, "loss": 0.5514, "step": 14500 }, { "epoch": 0.8055478502080444, "grad_norm": 0.3583027124404907, "learning_rate": 4.910051993887053e-07, "loss": 0.611, "step": 14520 }, { "epoch": 0.8066574202496533, "grad_norm": 0.33134713768959045, "learning_rate": 4.909084219165399e-07, "loss": 0.5303, "step": 14540 }, { "epoch": 0.8077669902912621, "grad_norm": 0.2871852219104767, "learning_rate": 4.908111362397499e-07, "loss": 0.5066, "step": 14560 }, { "epoch": 0.808876560332871, "grad_norm": 0.2712966203689575, "learning_rate": 4.907133425635625e-07, "loss": 0.42, "step": 14580 }, { "epoch": 0.8099861303744799, "grad_norm": 0.24459266662597656, "learning_rate": 4.906150410942768e-07, "loss": 0.4223, "step": 14600 }, { "epoch": 0.8110957004160888, "grad_norm": 0.23019199073314667, "learning_rate": 4.905162320392635e-07, "loss": 0.4866, "step": 14620 }, { "epoch": 0.8122052704576976, "grad_norm": 0.21123717725276947, "learning_rate": 4.904169156069633e-07, "loss": 0.5185, "step": 14640 }, { "epoch": 0.8133148404993065, "grad_norm": 0.41238492727279663, "learning_rate": 4.903170920068879e-07, "loss": 0.4581, "step": 14660 }, { "epoch": 0.8144244105409154, "grad_norm": 0.27384158968925476, "learning_rate": 4.902167614496185e-07, "loss": 0.4927, "step": 14680 }, { "epoch": 0.8155339805825242, "grad_norm": 0.5437495112419128, "learning_rate": 4.90115924146806e-07, "loss": 0.5507, "step": 14700 }, { "epoch": 0.8166435506241332, "grad_norm": 0.25934696197509766, "learning_rate": 4.9001458031117e-07, "loss": 0.5462, "step": 14720 }, { "epoch": 0.817753120665742, "grad_norm": 0.34160590171813965, "learning_rate": 4.899127301564989e-07, "loss": 0.5152, "step": 14740 }, { "epoch": 0.8188626907073508, "grad_norm": 0.22587896883487701, "learning_rate": 4.898103738976491e-07, "loss": 0.561, "step": 14760 }, { "epoch": 0.8199722607489598, "grad_norm": 0.23106370866298676, "learning_rate": 4.897075117505447e-07, "loss": 0.4836, "step": 14780 }, { "epoch": 0.8210818307905686, "grad_norm": 0.25426605343818665, "learning_rate": 4.89604143932177e-07, "loss": 0.487, "step": 14800 }, { "epoch": 0.8221914008321776, "grad_norm": 0.24273595213890076, "learning_rate": 4.895002706606037e-07, "loss": 0.4938, "step": 14820 }, { "epoch": 0.8233009708737864, "grad_norm": 0.2597360610961914, "learning_rate": 4.893958921549494e-07, "loss": 0.5628, "step": 14840 }, { "epoch": 0.8244105409153952, "grad_norm": 0.276300311088562, "learning_rate": 4.89291008635404e-07, "loss": 0.4875, "step": 14860 }, { "epoch": 0.8255201109570042, "grad_norm": 0.38235533237457275, "learning_rate": 4.891856203232228e-07, "loss": 0.5863, "step": 14880 }, { "epoch": 0.826629680998613, "grad_norm": 0.21681709587574005, "learning_rate": 4.890797274407263e-07, "loss": 0.5836, "step": 14900 }, { "epoch": 0.827739251040222, "grad_norm": 0.18042901158332825, "learning_rate": 4.889733302112991e-07, "loss": 0.4501, "step": 14920 }, { "epoch": 0.8288488210818308, "grad_norm": 0.35691049695014954, "learning_rate": 4.888664288593896e-07, "loss": 0.4889, "step": 14940 }, { "epoch": 0.8299583911234397, "grad_norm": 0.5920586585998535, "learning_rate": 4.887590236105102e-07, "loss": 0.6062, "step": 14960 }, { "epoch": 0.8310679611650486, "grad_norm": 0.24850663542747498, "learning_rate": 4.886511146912358e-07, "loss": 0.5384, "step": 14980 }, { "epoch": 0.8321775312066574, "grad_norm": 0.25241413712501526, "learning_rate": 4.885427023292043e-07, "loss": 0.5477, "step": 15000 }, { "epoch": 0.8332871012482663, "grad_norm": 0.26729947328567505, "learning_rate": 4.884337867531148e-07, "loss": 0.4749, "step": 15020 }, { "epoch": 0.8343966712898752, "grad_norm": 0.31970369815826416, "learning_rate": 4.883243681927288e-07, "loss": 0.4941, "step": 15040 }, { "epoch": 0.8355062413314841, "grad_norm": 0.2843242883682251, "learning_rate": 4.882144468788685e-07, "loss": 0.4704, "step": 15060 }, { "epoch": 0.8366158113730929, "grad_norm": 0.25197479128837585, "learning_rate": 4.881040230434166e-07, "loss": 0.4648, "step": 15080 }, { "epoch": 0.8377253814147018, "grad_norm": 0.22234483063220978, "learning_rate": 4.879930969193161e-07, "loss": 0.5144, "step": 15100 }, { "epoch": 0.8388349514563107, "grad_norm": 0.21174634993076324, "learning_rate": 4.878816687405694e-07, "loss": 0.4968, "step": 15120 }, { "epoch": 0.8399445214979195, "grad_norm": 0.2078089416027069, "learning_rate": 4.877697387422382e-07, "loss": 0.428, "step": 15140 }, { "epoch": 0.8410540915395285, "grad_norm": 0.18942753970623016, "learning_rate": 4.876573071604425e-07, "loss": 0.468, "step": 15160 }, { "epoch": 0.8421636615811373, "grad_norm": 0.3018622398376465, "learning_rate": 4.875443742323607e-07, "loss": 0.5183, "step": 15180 }, { "epoch": 0.8432732316227461, "grad_norm": 0.3080112934112549, "learning_rate": 4.874309401962287e-07, "loss": 0.4939, "step": 15200 }, { "epoch": 0.8443828016643551, "grad_norm": 0.25513333082199097, "learning_rate": 4.873170052913397e-07, "loss": 0.5199, "step": 15220 }, { "epoch": 0.8454923717059639, "grad_norm": 0.24920856952667236, "learning_rate": 4.872025697580431e-07, "loss": 0.4165, "step": 15240 }, { "epoch": 0.8466019417475729, "grad_norm": 0.24633198976516724, "learning_rate": 4.870876338377446e-07, "loss": 0.4684, "step": 15260 }, { "epoch": 0.8477115117891817, "grad_norm": 0.4820736050605774, "learning_rate": 4.869721977729054e-07, "loss": 0.497, "step": 15280 }, { "epoch": 0.8488210818307905, "grad_norm": 0.213221937417984, "learning_rate": 4.868562618070422e-07, "loss": 0.4241, "step": 15300 }, { "epoch": 0.8499306518723995, "grad_norm": 2.244574546813965, "learning_rate": 4.867398261847256e-07, "loss": 0.4696, "step": 15320 }, { "epoch": 0.8510402219140083, "grad_norm": 0.31189611554145813, "learning_rate": 4.866228911515806e-07, "loss": 0.4882, "step": 15340 }, { "epoch": 0.8521497919556172, "grad_norm": 0.2364753931760788, "learning_rate": 4.865054569542859e-07, "loss": 0.4234, "step": 15360 }, { "epoch": 0.8532593619972261, "grad_norm": 0.28598833084106445, "learning_rate": 4.863875238405728e-07, "loss": 0.5155, "step": 15380 }, { "epoch": 0.8543689320388349, "grad_norm": 0.29290738701820374, "learning_rate": 4.862690920592253e-07, "loss": 0.4822, "step": 15400 }, { "epoch": 0.8554785020804438, "grad_norm": 0.2712767720222473, "learning_rate": 4.861501618600794e-07, "loss": 0.4605, "step": 15420 }, { "epoch": 0.8565880721220527, "grad_norm": 0.24093014001846313, "learning_rate": 4.860307334940223e-07, "loss": 0.489, "step": 15440 }, { "epoch": 0.8576976421636616, "grad_norm": 0.23672768473625183, "learning_rate": 4.859108072129925e-07, "loss": 0.5159, "step": 15460 }, { "epoch": 0.8588072122052705, "grad_norm": 0.24443398416042328, "learning_rate": 4.857903832699784e-07, "loss": 0.589, "step": 15480 }, { "epoch": 0.8599167822468793, "grad_norm": 0.35630983114242554, "learning_rate": 4.856694619190186e-07, "loss": 0.4189, "step": 15500 }, { "epoch": 0.8610263522884882, "grad_norm": 0.322094589471817, "learning_rate": 4.855480434152007e-07, "loss": 0.4768, "step": 15520 }, { "epoch": 0.8621359223300971, "grad_norm": 0.31887516379356384, "learning_rate": 4.854261280146615e-07, "loss": 0.5078, "step": 15540 }, { "epoch": 0.863245492371706, "grad_norm": 0.2811984717845917, "learning_rate": 4.853037159745854e-07, "loss": 0.4921, "step": 15560 }, { "epoch": 0.8643550624133148, "grad_norm": 0.24148902297019958, "learning_rate": 4.85180807553205e-07, "loss": 0.5171, "step": 15580 }, { "epoch": 0.8654646324549237, "grad_norm": 0.26906171441078186, "learning_rate": 4.850574030097999e-07, "loss": 0.4867, "step": 15600 }, { "epoch": 0.8665742024965326, "grad_norm": 0.28062304854393005, "learning_rate": 4.849335026046963e-07, "loss": 0.5582, "step": 15620 }, { "epoch": 0.8676837725381414, "grad_norm": 0.41734233498573303, "learning_rate": 4.848091065992661e-07, "loss": 0.5553, "step": 15640 }, { "epoch": 0.8687933425797504, "grad_norm": 0.5747133493423462, "learning_rate": 4.846842152559272e-07, "loss": 0.547, "step": 15660 }, { "epoch": 0.8699029126213592, "grad_norm": 0.25351420044898987, "learning_rate": 4.845588288381421e-07, "loss": 0.5038, "step": 15680 }, { "epoch": 0.871012482662968, "grad_norm": 0.5361616611480713, "learning_rate": 4.844329476104177e-07, "loss": 0.617, "step": 15700 }, { "epoch": 0.872122052704577, "grad_norm": 0.2942206859588623, "learning_rate": 4.843065718383051e-07, "loss": 0.5598, "step": 15720 }, { "epoch": 0.8732316227461858, "grad_norm": 0.18865200877189636, "learning_rate": 4.841797017883981e-07, "loss": 0.5397, "step": 15740 }, { "epoch": 0.8743411927877948, "grad_norm": 0.34044092893600464, "learning_rate": 4.840523377283333e-07, "loss": 0.5253, "step": 15760 }, { "epoch": 0.8754507628294036, "grad_norm": 0.2387341409921646, "learning_rate": 4.839244799267899e-07, "loss": 0.5348, "step": 15780 }, { "epoch": 0.8765603328710125, "grad_norm": 0.36708223819732666, "learning_rate": 4.837961286534882e-07, "loss": 0.4274, "step": 15800 }, { "epoch": 0.8776699029126214, "grad_norm": 0.36376726627349854, "learning_rate": 4.836672841791895e-07, "loss": 0.4612, "step": 15820 }, { "epoch": 0.8787794729542302, "grad_norm": 0.2821993827819824, "learning_rate": 4.83537946775696e-07, "loss": 0.4341, "step": 15840 }, { "epoch": 0.8798890429958391, "grad_norm": 0.23787397146224976, "learning_rate": 4.834081167158492e-07, "loss": 0.4838, "step": 15860 }, { "epoch": 0.880998613037448, "grad_norm": 0.31258630752563477, "learning_rate": 4.8327779427353e-07, "loss": 0.5118, "step": 15880 }, { "epoch": 0.8821081830790569, "grad_norm": 0.23163455724716187, "learning_rate": 4.831469797236582e-07, "loss": 0.5126, "step": 15900 }, { "epoch": 0.8832177531206657, "grad_norm": 0.23242497444152832, "learning_rate": 4.830156733421916e-07, "loss": 0.4473, "step": 15920 }, { "epoch": 0.8843273231622746, "grad_norm": 0.35846635699272156, "learning_rate": 4.828838754061255e-07, "loss": 0.5921, "step": 15940 }, { "epoch": 0.8854368932038835, "grad_norm": 0.2894617021083832, "learning_rate": 4.827515861934924e-07, "loss": 0.5557, "step": 15960 }, { "epoch": 0.8865464632454924, "grad_norm": 0.822104811668396, "learning_rate": 4.826188059833606e-07, "loss": 0.5137, "step": 15980 }, { "epoch": 0.8876560332871013, "grad_norm": 0.3071456253528595, "learning_rate": 4.824855350558348e-07, "loss": 0.5825, "step": 16000 }, { "epoch": 0.8887656033287101, "grad_norm": 0.22043250501155853, "learning_rate": 4.823517736920546e-07, "loss": 0.5755, "step": 16020 }, { "epoch": 0.889875173370319, "grad_norm": 0.3122783601284027, "learning_rate": 4.822175221741941e-07, "loss": 0.5379, "step": 16040 }, { "epoch": 0.8909847434119279, "grad_norm": 0.22958078980445862, "learning_rate": 4.820827807854615e-07, "loss": 0.5153, "step": 16060 }, { "epoch": 0.8920943134535367, "grad_norm": 0.2927316725254059, "learning_rate": 4.819475498100985e-07, "loss": 0.5817, "step": 16080 }, { "epoch": 0.8932038834951457, "grad_norm": 0.2785501182079315, "learning_rate": 4.818118295333794e-07, "loss": 0.5238, "step": 16100 }, { "epoch": 0.8943134535367545, "grad_norm": 0.23847341537475586, "learning_rate": 4.81675620241611e-07, "loss": 0.5132, "step": 16120 }, { "epoch": 0.8954230235783633, "grad_norm": 0.26902398467063904, "learning_rate": 4.815389222221313e-07, "loss": 0.5198, "step": 16140 }, { "epoch": 0.8965325936199723, "grad_norm": 0.3341926336288452, "learning_rate": 4.814017357633096e-07, "loss": 0.6031, "step": 16160 }, { "epoch": 0.8976421636615811, "grad_norm": 0.2657605707645416, "learning_rate": 4.812640611545453e-07, "loss": 0.5887, "step": 16180 }, { "epoch": 0.8987517337031901, "grad_norm": 0.34312596917152405, "learning_rate": 4.81125898686268e-07, "loss": 0.4871, "step": 16200 }, { "epoch": 0.8998613037447989, "grad_norm": 0.24543847143650055, "learning_rate": 4.80987248649936e-07, "loss": 0.554, "step": 16220 }, { "epoch": 0.9009708737864077, "grad_norm": 0.2780109941959381, "learning_rate": 4.808481113380364e-07, "loss": 0.4506, "step": 16240 }, { "epoch": 0.9020804438280167, "grad_norm": 0.26394328474998474, "learning_rate": 4.80708487044084e-07, "loss": 0.5513, "step": 16260 }, { "epoch": 0.9031900138696255, "grad_norm": 0.2913033664226532, "learning_rate": 4.805683760626214e-07, "loss": 0.5081, "step": 16280 }, { "epoch": 0.9042995839112344, "grad_norm": 0.33289602398872375, "learning_rate": 4.804277786892173e-07, "loss": 0.4713, "step": 16300 }, { "epoch": 0.9054091539528433, "grad_norm": 0.26058298349380493, "learning_rate": 4.802866952204667e-07, "loss": 0.4457, "step": 16320 }, { "epoch": 0.9065187239944521, "grad_norm": 0.18589697778224945, "learning_rate": 4.801451259539902e-07, "loss": 0.54, "step": 16340 }, { "epoch": 0.907628294036061, "grad_norm": 0.32331371307373047, "learning_rate": 4.800030711884329e-07, "loss": 0.5062, "step": 16360 }, { "epoch": 0.9087378640776699, "grad_norm": 0.8287452459335327, "learning_rate": 4.798605312234643e-07, "loss": 0.5013, "step": 16380 }, { "epoch": 0.9098474341192788, "grad_norm": 0.2520425617694855, "learning_rate": 4.797175063597774e-07, "loss": 0.5413, "step": 16400 }, { "epoch": 0.9109570041608877, "grad_norm": 0.21053974330425262, "learning_rate": 4.795739968990879e-07, "loss": 0.4441, "step": 16420 }, { "epoch": 0.9120665742024965, "grad_norm": 0.368002325296402, "learning_rate": 4.794300031441342e-07, "loss": 0.4178, "step": 16440 }, { "epoch": 0.9131761442441054, "grad_norm": 0.5419968962669373, "learning_rate": 4.79285525398676e-07, "loss": 0.5216, "step": 16460 }, { "epoch": 0.9142857142857143, "grad_norm": 0.3541153073310852, "learning_rate": 4.79140563967494e-07, "loss": 0.4944, "step": 16480 }, { "epoch": 0.9153952843273232, "grad_norm": 0.28327611088752747, "learning_rate": 4.789951191563895e-07, "loss": 0.5079, "step": 16500 }, { "epoch": 0.916504854368932, "grad_norm": 0.3249143064022064, "learning_rate": 4.788491912721832e-07, "loss": 0.5471, "step": 16520 }, { "epoch": 0.9176144244105409, "grad_norm": 0.3213510513305664, "learning_rate": 4.787027806227151e-07, "loss": 0.501, "step": 16540 }, { "epoch": 0.9187239944521498, "grad_norm": 0.25237423181533813, "learning_rate": 4.785558875168434e-07, "loss": 0.4508, "step": 16560 }, { "epoch": 0.9198335644937586, "grad_norm": 0.28759607672691345, "learning_rate": 4.784085122644443e-07, "loss": 0.4848, "step": 16580 }, { "epoch": 0.9209431345353676, "grad_norm": 0.3305744528770447, "learning_rate": 4.78260655176411e-07, "loss": 0.6155, "step": 16600 }, { "epoch": 0.9220527045769764, "grad_norm": 0.22068889439105988, "learning_rate": 4.781123165646529e-07, "loss": 0.5193, "step": 16620 }, { "epoch": 0.9231622746185852, "grad_norm": 0.22431910037994385, "learning_rate": 4.779634967420957e-07, "loss": 0.4973, "step": 16640 }, { "epoch": 0.9242718446601942, "grad_norm": 0.21701250970363617, "learning_rate": 4.778141960226798e-07, "loss": 0.549, "step": 16660 }, { "epoch": 0.925381414701803, "grad_norm": 0.19255466759204865, "learning_rate": 4.776644147213602e-07, "loss": 0.4236, "step": 16680 }, { "epoch": 0.926490984743412, "grad_norm": 0.28331053256988525, "learning_rate": 4.775141531541059e-07, "loss": 0.4638, "step": 16700 }, { "epoch": 0.9276005547850208, "grad_norm": 0.3694329559803009, "learning_rate": 4.773634116378985e-07, "loss": 0.5719, "step": 16720 }, { "epoch": 0.9287101248266297, "grad_norm": 0.2507230341434479, "learning_rate": 4.772121904907328e-07, "loss": 0.5021, "step": 16740 }, { "epoch": 0.9298196948682386, "grad_norm": 0.35515108704566956, "learning_rate": 4.770604900316148e-07, "loss": 0.5663, "step": 16760 }, { "epoch": 0.9309292649098474, "grad_norm": 0.24078282713890076, "learning_rate": 4.769083105805619e-07, "loss": 0.5412, "step": 16780 }, { "epoch": 0.9320388349514563, "grad_norm": 0.19905756413936615, "learning_rate": 4.7675565245860195e-07, "loss": 0.5761, "step": 16800 }, { "epoch": 0.9331484049930652, "grad_norm": 0.2559676170349121, "learning_rate": 4.7660251598777243e-07, "loss": 0.4588, "step": 16820 }, { "epoch": 0.9342579750346741, "grad_norm": 0.3492356538772583, "learning_rate": 4.7644890149112023e-07, "loss": 0.4998, "step": 16840 }, { "epoch": 0.935367545076283, "grad_norm": 0.3238747715950012, "learning_rate": 4.7629480929270014e-07, "loss": 0.5082, "step": 16860 }, { "epoch": 0.9364771151178918, "grad_norm": 0.2711605727672577, "learning_rate": 4.761402397175752e-07, "loss": 0.5553, "step": 16880 }, { "epoch": 0.9375866851595007, "grad_norm": 0.24044501781463623, "learning_rate": 4.759851930918151e-07, "loss": 0.5454, "step": 16900 }, { "epoch": 0.9386962552011096, "grad_norm": 0.3525417149066925, "learning_rate": 4.7582966974249607e-07, "loss": 0.4522, "step": 16920 }, { "epoch": 0.9398058252427185, "grad_norm": 0.2660657465457916, "learning_rate": 4.7567366999770004e-07, "loss": 0.4965, "step": 16940 }, { "epoch": 0.9409153952843273, "grad_norm": 0.3178681433200836, "learning_rate": 4.755171941865138e-07, "loss": 0.5931, "step": 16960 }, { "epoch": 0.9420249653259362, "grad_norm": 0.20523175597190857, "learning_rate": 4.753602426390285e-07, "loss": 0.4806, "step": 16980 }, { "epoch": 0.9431345353675451, "grad_norm": 0.2907808721065521, "learning_rate": 4.752028156863389e-07, "loss": 0.4961, "step": 17000 }, { "epoch": 0.9442441054091539, "grad_norm": 0.25513818860054016, "learning_rate": 4.750449136605424e-07, "loss": 0.4866, "step": 17020 }, { "epoch": 0.9453536754507629, "grad_norm": 0.2840083837509155, "learning_rate": 4.7488653689473903e-07, "loss": 0.4707, "step": 17040 }, { "epoch": 0.9464632454923717, "grad_norm": 0.524346113204956, "learning_rate": 4.7472768572302985e-07, "loss": 0.4442, "step": 17060 }, { "epoch": 0.9475728155339805, "grad_norm": 0.18961882591247559, "learning_rate": 4.7456836048051695e-07, "loss": 0.5037, "step": 17080 }, { "epoch": 0.9486823855755895, "grad_norm": 0.23285827040672302, "learning_rate": 4.744085615033023e-07, "loss": 0.4856, "step": 17100 }, { "epoch": 0.9497919556171983, "grad_norm": 0.22163750231266022, "learning_rate": 4.742482891284876e-07, "loss": 0.4732, "step": 17120 }, { "epoch": 0.9509015256588073, "grad_norm": 0.2964208424091339, "learning_rate": 4.740875436941728e-07, "loss": 0.5997, "step": 17140 }, { "epoch": 0.9520110957004161, "grad_norm": 0.3355337679386139, "learning_rate": 4.739263255394559e-07, "loss": 0.5664, "step": 17160 }, { "epoch": 0.9531206657420249, "grad_norm": 0.27029746770858765, "learning_rate": 4.7376463500443227e-07, "loss": 0.4439, "step": 17180 }, { "epoch": 0.9542302357836339, "grad_norm": 0.4982469379901886, "learning_rate": 4.7360247243019355e-07, "loss": 0.433, "step": 17200 }, { "epoch": 0.9553398058252427, "grad_norm": 0.33958593010902405, "learning_rate": 4.734398381588274e-07, "loss": 0.4981, "step": 17220 }, { "epoch": 0.9564493758668516, "grad_norm": 0.3105083107948303, "learning_rate": 4.732767325334163e-07, "loss": 0.5258, "step": 17240 }, { "epoch": 0.9575589459084605, "grad_norm": 0.2882620692253113, "learning_rate": 4.7311315589803734e-07, "loss": 0.6116, "step": 17260 }, { "epoch": 0.9586685159500693, "grad_norm": 0.3724370300769806, "learning_rate": 4.7294910859776095e-07, "loss": 0.5438, "step": 17280 }, { "epoch": 0.9597780859916782, "grad_norm": 0.2626511752605438, "learning_rate": 4.7278459097865065e-07, "loss": 0.4972, "step": 17300 }, { "epoch": 0.9608876560332871, "grad_norm": 0.6237671375274658, "learning_rate": 4.72619603387762e-07, "loss": 0.5099, "step": 17320 }, { "epoch": 0.961997226074896, "grad_norm": 0.29267674684524536, "learning_rate": 4.7245414617314193e-07, "loss": 0.4413, "step": 17340 }, { "epoch": 0.9631067961165048, "grad_norm": 0.3022725284099579, "learning_rate": 4.722882196838283e-07, "loss": 0.5421, "step": 17360 }, { "epoch": 0.9642163661581137, "grad_norm": 0.28904616832733154, "learning_rate": 4.721218242698486e-07, "loss": 0.5416, "step": 17380 }, { "epoch": 0.9653259361997226, "grad_norm": 0.3465704321861267, "learning_rate": 4.719549602822199e-07, "loss": 0.493, "step": 17400 }, { "epoch": 0.9664355062413315, "grad_norm": 0.21620279550552368, "learning_rate": 4.7178762807294737e-07, "loss": 0.5027, "step": 17420 }, { "epoch": 0.9675450762829404, "grad_norm": 0.3273220658302307, "learning_rate": 4.716198279950241e-07, "loss": 0.5453, "step": 17440 }, { "epoch": 0.9686546463245492, "grad_norm": 0.367601603269577, "learning_rate": 4.7145156040243017e-07, "loss": 0.4352, "step": 17460 }, { "epoch": 0.9697642163661581, "grad_norm": 0.26952314376831055, "learning_rate": 4.712828256501318e-07, "loss": 0.4507, "step": 17480 }, { "epoch": 0.970873786407767, "grad_norm": 0.23673859238624573, "learning_rate": 4.711136240940809e-07, "loss": 0.5201, "step": 17500 }, { "epoch": 0.9719833564493758, "grad_norm": 0.29072991013526917, "learning_rate": 4.709439560912139e-07, "loss": 0.4529, "step": 17520 }, { "epoch": 0.9730929264909848, "grad_norm": 0.28428131341934204, "learning_rate": 4.707738219994513e-07, "loss": 0.4963, "step": 17540 }, { "epoch": 0.9742024965325936, "grad_norm": 0.36110901832580566, "learning_rate": 4.706032221776969e-07, "loss": 0.5713, "step": 17560 }, { "epoch": 0.9753120665742026, "grad_norm": 0.317875474691391, "learning_rate": 4.704321569858368e-07, "loss": 0.5807, "step": 17580 }, { "epoch": 0.9764216366158114, "grad_norm": 0.2890872359275818, "learning_rate": 4.702606267847391e-07, "loss": 0.5196, "step": 17600 }, { "epoch": 0.9775312066574202, "grad_norm": 0.2355562150478363, "learning_rate": 4.7008863193625247e-07, "loss": 0.514, "step": 17620 }, { "epoch": 0.9786407766990292, "grad_norm": 0.32910382747650146, "learning_rate": 4.6991617280320614e-07, "loss": 0.5281, "step": 17640 }, { "epoch": 0.979750346740638, "grad_norm": 0.26359501481056213, "learning_rate": 4.697432497494085e-07, "loss": 0.5029, "step": 17660 }, { "epoch": 0.9808599167822469, "grad_norm": 0.37599971890449524, "learning_rate": 4.6956986313964664e-07, "loss": 0.5226, "step": 17680 }, { "epoch": 0.9819694868238558, "grad_norm": 0.455422967672348, "learning_rate": 4.6939601333968583e-07, "loss": 0.4839, "step": 17700 }, { "epoch": 0.9830790568654646, "grad_norm": 0.3952491879463196, "learning_rate": 4.6922170071626794e-07, "loss": 0.4934, "step": 17720 }, { "epoch": 0.9841886269070735, "grad_norm": 0.21913796663284302, "learning_rate": 4.690469256371116e-07, "loss": 0.4858, "step": 17740 }, { "epoch": 0.9852981969486824, "grad_norm": 0.5274518728256226, "learning_rate": 4.6887168847091085e-07, "loss": 0.5531, "step": 17760 }, { "epoch": 0.9864077669902913, "grad_norm": 0.23438116908073425, "learning_rate": 4.6869598958733457e-07, "loss": 0.5863, "step": 17780 }, { "epoch": 0.9875173370319001, "grad_norm": 0.35722559690475464, "learning_rate": 4.685198293570256e-07, "loss": 0.495, "step": 17800 }, { "epoch": 0.988626907073509, "grad_norm": 0.24522677063941956, "learning_rate": 4.683432081516e-07, "loss": 0.5098, "step": 17820 }, { "epoch": 0.9897364771151179, "grad_norm": 0.2502312660217285, "learning_rate": 4.6816612634364626e-07, "loss": 0.4977, "step": 17840 }, { "epoch": 0.9908460471567268, "grad_norm": 0.2745305895805359, "learning_rate": 4.6798858430672465e-07, "loss": 0.453, "step": 17860 }, { "epoch": 0.9919556171983357, "grad_norm": 0.32878631353378296, "learning_rate": 4.678105824153662e-07, "loss": 0.4941, "step": 17880 }, { "epoch": 0.9930651872399445, "grad_norm": 0.33521202206611633, "learning_rate": 4.676321210450719e-07, "loss": 0.5406, "step": 17900 }, { "epoch": 0.9941747572815534, "grad_norm": 0.2142360955476761, "learning_rate": 4.6745320057231235e-07, "loss": 0.5053, "step": 17920 }, { "epoch": 0.9952843273231623, "grad_norm": 0.3933190405368805, "learning_rate": 4.6727382137452644e-07, "loss": 0.4929, "step": 17940 }, { "epoch": 0.9963938973647711, "grad_norm": 0.29542025923728943, "learning_rate": 4.670939838301206e-07, "loss": 0.4723, "step": 17960 }, { "epoch": 0.9975034674063801, "grad_norm": 0.33013424277305603, "learning_rate": 4.669136883184685e-07, "loss": 0.5297, "step": 17980 }, { "epoch": 0.9986130374479889, "grad_norm": 0.2505033016204834, "learning_rate": 4.6673293521990966e-07, "loss": 0.523, "step": 18000 }, { "epoch": 0.9997226074895977, "grad_norm": 0.32065272331237793, "learning_rate": 4.665517249157491e-07, "loss": 0.4935, "step": 18020 }, { "epoch": 1.0008321775312066, "grad_norm": 0.3525514304637909, "learning_rate": 4.66370057788256e-07, "loss": 0.5569, "step": 18040 }, { "epoch": 1.0019417475728156, "grad_norm": 0.3695344030857086, "learning_rate": 4.661879342206636e-07, "loss": 0.5232, "step": 18060 }, { "epoch": 1.0030513176144245, "grad_norm": 0.21442274749279022, "learning_rate": 4.660053545971678e-07, "loss": 0.6005, "step": 18080 }, { "epoch": 1.0041608876560333, "grad_norm": 0.3651989996433258, "learning_rate": 4.658223193029266e-07, "loss": 0.622, "step": 18100 }, { "epoch": 1.0052704576976421, "grad_norm": 0.2379089742898941, "learning_rate": 4.6563882872405924e-07, "loss": 0.5101, "step": 18120 }, { "epoch": 1.006380027739251, "grad_norm": 0.2602407932281494, "learning_rate": 4.654548832476455e-07, "loss": 0.5576, "step": 18140 }, { "epoch": 1.00748959778086, "grad_norm": 0.2794329822063446, "learning_rate": 4.6527048326172465e-07, "loss": 0.5923, "step": 18160 }, { "epoch": 1.0085991678224688, "grad_norm": 0.29430291056632996, "learning_rate": 4.650856291552948e-07, "loss": 0.4838, "step": 18180 }, { "epoch": 1.0097087378640777, "grad_norm": 0.2219160944223404, "learning_rate": 4.6490032131831216e-07, "loss": 0.4734, "step": 18200 }, { "epoch": 1.0108183079056865, "grad_norm": 0.26343485713005066, "learning_rate": 4.647145601416899e-07, "loss": 0.44, "step": 18220 }, { "epoch": 1.0119278779472953, "grad_norm": 0.3491497337818146, "learning_rate": 4.645283460172976e-07, "loss": 0.5182, "step": 18240 }, { "epoch": 1.0130374479889044, "grad_norm": 0.36246126890182495, "learning_rate": 4.6434167933796047e-07, "loss": 0.4677, "step": 18260 }, { "epoch": 1.0141470180305132, "grad_norm": 0.24133969843387604, "learning_rate": 4.6415456049745833e-07, "loss": 0.5098, "step": 18280 }, { "epoch": 1.015256588072122, "grad_norm": 0.25202351808547974, "learning_rate": 4.6396698989052473e-07, "loss": 0.52, "step": 18300 }, { "epoch": 1.0163661581137309, "grad_norm": 0.25879043340682983, "learning_rate": 4.6377896791284645e-07, "loss": 0.575, "step": 18320 }, { "epoch": 1.0174757281553397, "grad_norm": 0.21422986686229706, "learning_rate": 4.6359049496106225e-07, "loss": 0.4624, "step": 18340 }, { "epoch": 1.0185852981969488, "grad_norm": 0.25253644585609436, "learning_rate": 4.6340157143276233e-07, "loss": 0.5388, "step": 18360 }, { "epoch": 1.0196948682385576, "grad_norm": 0.24280954897403717, "learning_rate": 4.632121977264874e-07, "loss": 0.5206, "step": 18380 }, { "epoch": 1.0208044382801664, "grad_norm": 0.41219544410705566, "learning_rate": 4.6302237424172786e-07, "loss": 0.4168, "step": 18400 }, { "epoch": 1.0219140083217753, "grad_norm": 0.3510596454143524, "learning_rate": 4.628321013789228e-07, "loss": 0.4499, "step": 18420 }, { "epoch": 1.023023578363384, "grad_norm": 0.29174432158470154, "learning_rate": 4.626413795394595e-07, "loss": 0.5625, "step": 18440 }, { "epoch": 1.0241331484049931, "grad_norm": 0.3954429030418396, "learning_rate": 4.624502091256722e-07, "loss": 0.5267, "step": 18460 }, { "epoch": 1.025242718446602, "grad_norm": 0.2540510296821594, "learning_rate": 4.622585905408414e-07, "loss": 0.5273, "step": 18480 }, { "epoch": 1.0263522884882108, "grad_norm": 0.30282846093177795, "learning_rate": 4.6206652418919333e-07, "loss": 0.506, "step": 18500 }, { "epoch": 1.0274618585298196, "grad_norm": 0.33365193009376526, "learning_rate": 4.618740104758984e-07, "loss": 0.4436, "step": 18520 }, { "epoch": 1.0285714285714285, "grad_norm": 0.27561724185943604, "learning_rate": 4.6168104980707103e-07, "loss": 0.5251, "step": 18540 }, { "epoch": 1.0296809986130375, "grad_norm": 0.22135314345359802, "learning_rate": 4.6148764258976835e-07, "loss": 0.5589, "step": 18560 }, { "epoch": 1.0307905686546464, "grad_norm": 0.3224560022354126, "learning_rate": 4.612937892319895e-07, "loss": 0.4638, "step": 18580 }, { "epoch": 1.0319001386962552, "grad_norm": 0.23120611906051636, "learning_rate": 4.6109949014267494e-07, "loss": 0.5193, "step": 18600 }, { "epoch": 1.033009708737864, "grad_norm": 0.4292590618133545, "learning_rate": 4.6090474573170524e-07, "loss": 0.5064, "step": 18620 }, { "epoch": 1.0341192787794729, "grad_norm": 0.24833281338214874, "learning_rate": 4.607095564099005e-07, "loss": 0.5628, "step": 18640 }, { "epoch": 1.035228848821082, "grad_norm": 0.3108525276184082, "learning_rate": 4.605139225890192e-07, "loss": 0.5728, "step": 18660 }, { "epoch": 1.0363384188626907, "grad_norm": 0.3050781190395355, "learning_rate": 4.603178446817578e-07, "loss": 0.5498, "step": 18680 }, { "epoch": 1.0374479889042996, "grad_norm": 0.2089320868253708, "learning_rate": 4.6012132310174936e-07, "loss": 0.5008, "step": 18700 }, { "epoch": 1.0385575589459084, "grad_norm": 0.2967858612537384, "learning_rate": 4.5992435826356286e-07, "loss": 0.4734, "step": 18720 }, { "epoch": 1.0396671289875172, "grad_norm": 0.32540586590766907, "learning_rate": 4.597269505827024e-07, "loss": 0.4788, "step": 18740 }, { "epoch": 1.0407766990291263, "grad_norm": 0.29043784737586975, "learning_rate": 4.5952910047560646e-07, "loss": 0.4512, "step": 18760 }, { "epoch": 1.0418862690707351, "grad_norm": 0.2867758274078369, "learning_rate": 4.593308083596464e-07, "loss": 0.4818, "step": 18780 }, { "epoch": 1.042995839112344, "grad_norm": 0.3159836232662201, "learning_rate": 4.591320746531266e-07, "loss": 0.541, "step": 18800 }, { "epoch": 1.0441054091539528, "grad_norm": 0.34301233291625977, "learning_rate": 4.589328997752824e-07, "loss": 0.5545, "step": 18820 }, { "epoch": 1.0452149791955616, "grad_norm": 0.1924845278263092, "learning_rate": 4.587332841462802e-07, "loss": 0.4931, "step": 18840 }, { "epoch": 1.0463245492371707, "grad_norm": 0.2547283172607422, "learning_rate": 4.58533228187216e-07, "loss": 0.5095, "step": 18860 }, { "epoch": 1.0474341192787795, "grad_norm": 0.26863807439804077, "learning_rate": 4.5833273232011483e-07, "loss": 0.5062, "step": 18880 }, { "epoch": 1.0485436893203883, "grad_norm": 0.25382405519485474, "learning_rate": 4.581317969679296e-07, "loss": 0.4967, "step": 18900 }, { "epoch": 1.0496532593619972, "grad_norm": 0.31816625595092773, "learning_rate": 4.5793042255454026e-07, "loss": 0.5513, "step": 18920 }, { "epoch": 1.050762829403606, "grad_norm": 0.2712040841579437, "learning_rate": 4.577286095047532e-07, "loss": 0.3711, "step": 18940 }, { "epoch": 1.051872399445215, "grad_norm": 0.21581771969795227, "learning_rate": 4.575263582443e-07, "loss": 0.416, "step": 18960 }, { "epoch": 1.0529819694868239, "grad_norm": 0.2240162491798401, "learning_rate": 4.573236691998366e-07, "loss": 0.4615, "step": 18980 }, { "epoch": 1.0540915395284327, "grad_norm": 0.359831303358078, "learning_rate": 4.571205427989426e-07, "loss": 0.4797, "step": 19000 }, { "epoch": 1.0552011095700415, "grad_norm": 0.3368038833141327, "learning_rate": 4.5691697947012016e-07, "loss": 0.5954, "step": 19020 }, { "epoch": 1.0563106796116504, "grad_norm": 0.3113715648651123, "learning_rate": 4.5671297964279305e-07, "loss": 0.4476, "step": 19040 }, { "epoch": 1.0574202496532594, "grad_norm": 0.2818163335323334, "learning_rate": 4.56508543747306e-07, "loss": 0.479, "step": 19060 }, { "epoch": 1.0585298196948683, "grad_norm": 0.3632010221481323, "learning_rate": 4.563036722149236e-07, "loss": 0.5212, "step": 19080 }, { "epoch": 1.059639389736477, "grad_norm": 0.23106127977371216, "learning_rate": 4.560983654778294e-07, "loss": 0.5956, "step": 19100 }, { "epoch": 1.060748959778086, "grad_norm": 0.22520194947719574, "learning_rate": 4.5589262396912504e-07, "loss": 0.5062, "step": 19120 }, { "epoch": 1.061858529819695, "grad_norm": 0.31032079458236694, "learning_rate": 4.556864481228293e-07, "loss": 0.435, "step": 19140 }, { "epoch": 1.0629680998613038, "grad_norm": 0.522094190120697, "learning_rate": 4.5547983837387727e-07, "loss": 0.4194, "step": 19160 }, { "epoch": 1.0640776699029126, "grad_norm": 0.29584962129592896, "learning_rate": 4.5527279515811935e-07, "loss": 0.4847, "step": 19180 }, { "epoch": 1.0651872399445215, "grad_norm": 0.289877325296402, "learning_rate": 4.5506531891232036e-07, "loss": 0.5036, "step": 19200 }, { "epoch": 1.0662968099861303, "grad_norm": 0.28097590804100037, "learning_rate": 4.548574100741585e-07, "loss": 0.467, "step": 19220 }, { "epoch": 1.0674063800277394, "grad_norm": 0.28440162539482117, "learning_rate": 4.5464906908222474e-07, "loss": 0.4652, "step": 19240 }, { "epoch": 1.0685159500693482, "grad_norm": 0.2917669117450714, "learning_rate": 4.5444029637602154e-07, "loss": 0.497, "step": 19260 }, { "epoch": 1.069625520110957, "grad_norm": 0.25734850764274597, "learning_rate": 4.542310923959621e-07, "loss": 0.47, "step": 19280 }, { "epoch": 1.0707350901525658, "grad_norm": 0.29548370838165283, "learning_rate": 4.540214575833695e-07, "loss": 0.4564, "step": 19300 }, { "epoch": 1.0718446601941747, "grad_norm": 0.2588016092777252, "learning_rate": 4.5381139238047553e-07, "loss": 0.5313, "step": 19320 }, { "epoch": 1.0729542302357837, "grad_norm": 0.34810346364974976, "learning_rate": 4.536008972304201e-07, "loss": 0.6632, "step": 19340 }, { "epoch": 1.0740638002773926, "grad_norm": 0.27274230122566223, "learning_rate": 4.5338997257724985e-07, "loss": 0.4953, "step": 19360 }, { "epoch": 1.0751733703190014, "grad_norm": 0.3211914598941803, "learning_rate": 4.531786188659177e-07, "loss": 0.5724, "step": 19380 }, { "epoch": 1.0762829403606102, "grad_norm": 0.28688207268714905, "learning_rate": 4.5296683654228153e-07, "loss": 0.6232, "step": 19400 }, { "epoch": 1.077392510402219, "grad_norm": 0.20882871747016907, "learning_rate": 4.527546260531035e-07, "loss": 0.5383, "step": 19420 }, { "epoch": 1.0785020804438281, "grad_norm": 0.2768833637237549, "learning_rate": 4.525419878460489e-07, "loss": 0.4915, "step": 19440 }, { "epoch": 1.079611650485437, "grad_norm": 0.39787665009498596, "learning_rate": 4.523289223696855e-07, "loss": 0.4576, "step": 19460 }, { "epoch": 1.0807212205270458, "grad_norm": 0.3390793800354004, "learning_rate": 4.521154300734821e-07, "loss": 0.543, "step": 19480 }, { "epoch": 1.0818307905686546, "grad_norm": 0.2765834629535675, "learning_rate": 4.519015114078082e-07, "loss": 0.5154, "step": 19500 }, { "epoch": 1.0829403606102634, "grad_norm": 0.2756231427192688, "learning_rate": 4.5168716682393254e-07, "loss": 0.4608, "step": 19520 }, { "epoch": 1.0840499306518725, "grad_norm": 0.2993645668029785, "learning_rate": 4.514723967740224e-07, "loss": 0.4793, "step": 19540 }, { "epoch": 1.0851595006934813, "grad_norm": 0.2577601373195648, "learning_rate": 4.5125720171114265e-07, "loss": 0.5259, "step": 19560 }, { "epoch": 1.0862690707350902, "grad_norm": 0.24818478524684906, "learning_rate": 4.510415820892546e-07, "loss": 0.5614, "step": 19580 }, { "epoch": 1.087378640776699, "grad_norm": 0.22790955007076263, "learning_rate": 4.508255383632154e-07, "loss": 0.5098, "step": 19600 }, { "epoch": 1.0884882108183078, "grad_norm": 0.3550455868244171, "learning_rate": 4.506090709887767e-07, "loss": 0.538, "step": 19620 }, { "epoch": 1.0895977808599169, "grad_norm": 0.30925559997558594, "learning_rate": 4.5039218042258386e-07, "loss": 0.517, "step": 19640 }, { "epoch": 1.0907073509015257, "grad_norm": 0.3214380443096161, "learning_rate": 4.50174867122175e-07, "loss": 0.4464, "step": 19660 }, { "epoch": 1.0918169209431345, "grad_norm": 0.28385454416275024, "learning_rate": 4.4995713154598014e-07, "loss": 0.5747, "step": 19680 }, { "epoch": 1.0929264909847434, "grad_norm": 0.33149564266204834, "learning_rate": 4.4973897415331983e-07, "loss": 0.4896, "step": 19700 }, { "epoch": 1.0940360610263522, "grad_norm": 0.42345455288887024, "learning_rate": 4.4952039540440473e-07, "loss": 0.4689, "step": 19720 }, { "epoch": 1.0951456310679613, "grad_norm": 0.24603191018104553, "learning_rate": 4.493013957603342e-07, "loss": 0.5852, "step": 19740 }, { "epoch": 1.09625520110957, "grad_norm": 0.27773013710975647, "learning_rate": 4.490819756830955e-07, "loss": 0.5989, "step": 19760 }, { "epoch": 1.097364771151179, "grad_norm": 0.29582101106643677, "learning_rate": 4.48862135635563e-07, "loss": 0.4715, "step": 19780 }, { "epoch": 1.0984743411927878, "grad_norm": 0.3566180467605591, "learning_rate": 4.4864187608149664e-07, "loss": 0.5523, "step": 19800 }, { "epoch": 1.0995839112343966, "grad_norm": 0.25772908329963684, "learning_rate": 4.484211974855417e-07, "loss": 0.5799, "step": 19820 }, { "epoch": 1.1006934812760056, "grad_norm": 0.3181874454021454, "learning_rate": 4.4820010031322733e-07, "loss": 0.4732, "step": 19840 }, { "epoch": 1.1018030513176145, "grad_norm": 0.2680630683898926, "learning_rate": 4.4797858503096553e-07, "loss": 0.5103, "step": 19860 }, { "epoch": 1.1029126213592233, "grad_norm": 0.30635958909988403, "learning_rate": 4.477566521060504e-07, "loss": 0.4626, "step": 19880 }, { "epoch": 1.1040221914008321, "grad_norm": 0.2925454080104828, "learning_rate": 4.4753430200665723e-07, "loss": 0.612, "step": 19900 }, { "epoch": 1.105131761442441, "grad_norm": 0.21165505051612854, "learning_rate": 4.473115352018412e-07, "loss": 0.498, "step": 19920 }, { "epoch": 1.10624133148405, "grad_norm": 0.21539141237735748, "learning_rate": 4.470883521615364e-07, "loss": 0.5084, "step": 19940 }, { "epoch": 1.1073509015256588, "grad_norm": 0.3781816363334656, "learning_rate": 4.468647533565554e-07, "loss": 0.5101, "step": 19960 }, { "epoch": 1.1084604715672677, "grad_norm": 0.29596197605133057, "learning_rate": 4.4664073925858737e-07, "loss": 0.4068, "step": 19980 }, { "epoch": 1.1095700416088765, "grad_norm": 0.26415279507637024, "learning_rate": 4.4641631034019787e-07, "loss": 0.5141, "step": 20000 }, { "epoch": 1.1106796116504853, "grad_norm": 0.3133207857608795, "learning_rate": 4.461914670748275e-07, "loss": 0.4239, "step": 20020 }, { "epoch": 1.1117891816920944, "grad_norm": 0.1802404820919037, "learning_rate": 4.459662099367908e-07, "loss": 0.545, "step": 20040 }, { "epoch": 1.1128987517337032, "grad_norm": 0.2777576446533203, "learning_rate": 4.457405394012755e-07, "loss": 0.4294, "step": 20060 }, { "epoch": 1.114008321775312, "grad_norm": 0.3770914673805237, "learning_rate": 4.4551445594434145e-07, "loss": 0.5412, "step": 20080 }, { "epoch": 1.115117891816921, "grad_norm": 0.324416846036911, "learning_rate": 4.4528796004291937e-07, "loss": 0.548, "step": 20100 }, { "epoch": 1.1162274618585297, "grad_norm": 0.2394169420003891, "learning_rate": 4.450610521748103e-07, "loss": 0.5329, "step": 20120 }, { "epoch": 1.1173370319001388, "grad_norm": 0.2715405225753784, "learning_rate": 4.44833732818684e-07, "loss": 0.4943, "step": 20140 }, { "epoch": 1.1184466019417476, "grad_norm": 0.19607490301132202, "learning_rate": 4.4460600245407876e-07, "loss": 0.5201, "step": 20160 }, { "epoch": 1.1195561719833564, "grad_norm": 0.2596384584903717, "learning_rate": 4.4437786156139944e-07, "loss": 0.5307, "step": 20180 }, { "epoch": 1.1206657420249653, "grad_norm": 0.18252524733543396, "learning_rate": 4.441493106219173e-07, "loss": 0.585, "step": 20200 }, { "epoch": 1.121775312066574, "grad_norm": 0.30697065591812134, "learning_rate": 4.439203501177683e-07, "loss": 0.4802, "step": 20220 }, { "epoch": 1.1228848821081832, "grad_norm": 0.2932966649532318, "learning_rate": 4.4369098053195255e-07, "loss": 0.4496, "step": 20240 }, { "epoch": 1.123994452149792, "grad_norm": 0.33387163281440735, "learning_rate": 4.434612023483331e-07, "loss": 0.4765, "step": 20260 }, { "epoch": 1.1251040221914008, "grad_norm": 0.23948165774345398, "learning_rate": 4.432310160516348e-07, "loss": 0.4675, "step": 20280 }, { "epoch": 1.1262135922330097, "grad_norm": 0.31499868631362915, "learning_rate": 4.430004221274439e-07, "loss": 0.5634, "step": 20300 }, { "epoch": 1.1273231622746187, "grad_norm": 0.24027635157108307, "learning_rate": 4.4276942106220605e-07, "loss": 0.5106, "step": 20320 }, { "epoch": 1.1284327323162275, "grad_norm": 0.2582188546657562, "learning_rate": 4.42538013343226e-07, "loss": 0.5235, "step": 20340 }, { "epoch": 1.1295423023578364, "grad_norm": 0.3221554458141327, "learning_rate": 4.423061994586663e-07, "loss": 0.5387, "step": 20360 }, { "epoch": 1.1306518723994452, "grad_norm": 0.2051803469657898, "learning_rate": 4.420739798975463e-07, "loss": 0.5277, "step": 20380 }, { "epoch": 1.131761442441054, "grad_norm": 0.3561082184314728, "learning_rate": 4.4184135514974117e-07, "loss": 0.4213, "step": 20400 }, { "epoch": 1.132871012482663, "grad_norm": 0.22326581180095673, "learning_rate": 4.4160832570598095e-07, "loss": 0.4193, "step": 20420 }, { "epoch": 1.133980582524272, "grad_norm": 0.2613493502140045, "learning_rate": 4.413748920578493e-07, "loss": 0.5388, "step": 20440 }, { "epoch": 1.1350901525658808, "grad_norm": 0.2132934182882309, "learning_rate": 4.411410546977823e-07, "loss": 0.3635, "step": 20460 }, { "epoch": 1.1361997226074896, "grad_norm": 0.5562141537666321, "learning_rate": 4.4090681411906814e-07, "loss": 0.4427, "step": 20480 }, { "epoch": 1.1373092926490984, "grad_norm": 0.3076641857624054, "learning_rate": 4.406721708158454e-07, "loss": 0.4496, "step": 20500 }, { "epoch": 1.1384188626907075, "grad_norm": 0.2895612418651581, "learning_rate": 4.4043712528310217e-07, "loss": 0.5904, "step": 20520 }, { "epoch": 1.1395284327323163, "grad_norm": 0.297904372215271, "learning_rate": 4.4020167801667504e-07, "loss": 0.5294, "step": 20540 }, { "epoch": 1.1406380027739251, "grad_norm": 0.3360684812068939, "learning_rate": 4.399658295132483e-07, "loss": 0.5356, "step": 20560 }, { "epoch": 1.141747572815534, "grad_norm": 0.3303326964378357, "learning_rate": 4.397295802703523e-07, "loss": 0.5241, "step": 20580 }, { "epoch": 1.1428571428571428, "grad_norm": 0.23933163285255432, "learning_rate": 4.394929307863632e-07, "loss": 0.4993, "step": 20600 }, { "epoch": 1.1439667128987518, "grad_norm": 0.19543717801570892, "learning_rate": 4.392558815605011e-07, "loss": 0.4915, "step": 20620 }, { "epoch": 1.1450762829403607, "grad_norm": 0.2979204058647156, "learning_rate": 4.390184330928295e-07, "loss": 0.5925, "step": 20640 }, { "epoch": 1.1461858529819695, "grad_norm": 0.27096354961395264, "learning_rate": 4.3878058588425424e-07, "loss": 0.5704, "step": 20660 }, { "epoch": 1.1472954230235783, "grad_norm": 0.3115093410015106, "learning_rate": 4.3854234043652216e-07, "loss": 0.5943, "step": 20680 }, { "epoch": 1.1484049930651872, "grad_norm": 0.2984101474285126, "learning_rate": 4.3830369725222017e-07, "loss": 0.4713, "step": 20700 }, { "epoch": 1.1495145631067962, "grad_norm": 0.26506519317626953, "learning_rate": 4.3806465683477436e-07, "loss": 0.5698, "step": 20720 }, { "epoch": 1.150624133148405, "grad_norm": 0.35192254185676575, "learning_rate": 4.3782521968844876e-07, "loss": 0.5665, "step": 20740 }, { "epoch": 1.151733703190014, "grad_norm": 1.5585196018218994, "learning_rate": 4.375853863183443e-07, "loss": 0.5211, "step": 20760 }, { "epoch": 1.1528432732316227, "grad_norm": 0.2659168243408203, "learning_rate": 4.3734515723039766e-07, "loss": 0.482, "step": 20780 }, { "epoch": 1.1539528432732316, "grad_norm": 0.31809118390083313, "learning_rate": 4.371045329313805e-07, "loss": 0.5172, "step": 20800 }, { "epoch": 1.1550624133148406, "grad_norm": 0.32345712184906006, "learning_rate": 4.3686351392889793e-07, "loss": 0.5514, "step": 20820 }, { "epoch": 1.1561719833564494, "grad_norm": 0.38529345393180847, "learning_rate": 4.36622100731388e-07, "loss": 0.4934, "step": 20840 }, { "epoch": 1.1572815533980583, "grad_norm": 0.25609254837036133, "learning_rate": 4.3638029384812006e-07, "loss": 0.4618, "step": 20860 }, { "epoch": 1.158391123439667, "grad_norm": 0.25315091013908386, "learning_rate": 4.361380937891942e-07, "loss": 0.4727, "step": 20880 }, { "epoch": 1.159500693481276, "grad_norm": 0.3340308964252472, "learning_rate": 4.358955010655396e-07, "loss": 0.5607, "step": 20900 }, { "epoch": 1.160610263522885, "grad_norm": 0.30985623598098755, "learning_rate": 4.3565251618891427e-07, "loss": 0.4592, "step": 20920 }, { "epoch": 1.1617198335644938, "grad_norm": 0.33738934993743896, "learning_rate": 4.3540913967190286e-07, "loss": 0.4911, "step": 20940 }, { "epoch": 1.1628294036061027, "grad_norm": 0.35902056097984314, "learning_rate": 4.3516537202791677e-07, "loss": 0.563, "step": 20960 }, { "epoch": 1.1639389736477115, "grad_norm": 1.2402771711349487, "learning_rate": 4.3492121377119214e-07, "loss": 0.471, "step": 20980 }, { "epoch": 1.1650485436893203, "grad_norm": 0.3261203169822693, "learning_rate": 4.346766654167893e-07, "loss": 0.4717, "step": 21000 }, { "epoch": 1.1661581137309294, "grad_norm": 0.28907689452171326, "learning_rate": 4.344317274805913e-07, "loss": 0.5464, "step": 21020 }, { "epoch": 1.1672676837725382, "grad_norm": 0.2829582095146179, "learning_rate": 4.341864004793033e-07, "loss": 0.4097, "step": 21040 }, { "epoch": 1.168377253814147, "grad_norm": 0.2297755926847458, "learning_rate": 4.33940684930451e-07, "loss": 0.4202, "step": 21060 }, { "epoch": 1.1694868238557559, "grad_norm": 0.29886823892593384, "learning_rate": 4.336945813523799e-07, "loss": 0.4971, "step": 21080 }, { "epoch": 1.1705963938973647, "grad_norm": 0.3400444984436035, "learning_rate": 4.3344809026425396e-07, "loss": 0.536, "step": 21100 }, { "epoch": 1.1717059639389737, "grad_norm": 0.3287181258201599, "learning_rate": 4.3320121218605454e-07, "loss": 0.5319, "step": 21120 }, { "epoch": 1.1728155339805826, "grad_norm": 0.26089945435523987, "learning_rate": 4.3295394763857956e-07, "loss": 0.4538, "step": 21140 }, { "epoch": 1.1739251040221914, "grad_norm": 0.14349904656410217, "learning_rate": 4.327062971434421e-07, "loss": 0.4549, "step": 21160 }, { "epoch": 1.1750346740638002, "grad_norm": 0.23498626053333282, "learning_rate": 4.324582612230694e-07, "loss": 0.5169, "step": 21180 }, { "epoch": 1.176144244105409, "grad_norm": 0.3627133071422577, "learning_rate": 4.322098404007017e-07, "loss": 0.4583, "step": 21200 }, { "epoch": 1.1772538141470181, "grad_norm": 0.28728342056274414, "learning_rate": 4.319610352003915e-07, "loss": 0.4944, "step": 21220 }, { "epoch": 1.178363384188627, "grad_norm": 0.33951136469841003, "learning_rate": 4.3171184614700185e-07, "loss": 0.5665, "step": 21240 }, { "epoch": 1.1794729542302358, "grad_norm": 0.2991965413093567, "learning_rate": 4.314622737662056e-07, "loss": 0.5121, "step": 21260 }, { "epoch": 1.1805825242718446, "grad_norm": 0.3339698910713196, "learning_rate": 4.3121231858448434e-07, "loss": 0.5318, "step": 21280 }, { "epoch": 1.1816920943134535, "grad_norm": 0.294872522354126, "learning_rate": 4.309619811291271e-07, "loss": 0.5298, "step": 21300 }, { "epoch": 1.1828016643550625, "grad_norm": 0.23444300889968872, "learning_rate": 4.3071126192822946e-07, "loss": 0.4842, "step": 21320 }, { "epoch": 1.1839112343966713, "grad_norm": 0.3687494099140167, "learning_rate": 4.3046016151069216e-07, "loss": 0.4355, "step": 21340 }, { "epoch": 1.1850208044382802, "grad_norm": 0.19200222194194794, "learning_rate": 4.3020868040622023e-07, "loss": 0.5193, "step": 21360 }, { "epoch": 1.186130374479889, "grad_norm": 0.2897944748401642, "learning_rate": 4.2995681914532165e-07, "loss": 0.4015, "step": 21380 }, { "epoch": 1.1872399445214978, "grad_norm": 0.21029962599277496, "learning_rate": 4.2970457825930656e-07, "loss": 0.4983, "step": 21400 }, { "epoch": 1.188349514563107, "grad_norm": 0.3207491636276245, "learning_rate": 4.294519582802857e-07, "loss": 0.5792, "step": 21420 }, { "epoch": 1.1894590846047157, "grad_norm": 0.3624780476093292, "learning_rate": 4.2919895974116975e-07, "loss": 0.4505, "step": 21440 }, { "epoch": 1.1905686546463246, "grad_norm": 0.25255516171455383, "learning_rate": 4.2894558317566776e-07, "loss": 0.5011, "step": 21460 }, { "epoch": 1.1916782246879334, "grad_norm": 0.34456872940063477, "learning_rate": 4.2869182911828627e-07, "loss": 0.5302, "step": 21480 }, { "epoch": 1.1927877947295422, "grad_norm": 0.40119966864585876, "learning_rate": 4.284376981043284e-07, "loss": 0.5695, "step": 21500 }, { "epoch": 1.1938973647711513, "grad_norm": 0.2503664493560791, "learning_rate": 4.281831906698921e-07, "loss": 0.5656, "step": 21520 }, { "epoch": 1.19500693481276, "grad_norm": 0.3286171555519104, "learning_rate": 4.2792830735186976e-07, "loss": 0.4887, "step": 21540 }, { "epoch": 1.196116504854369, "grad_norm": 0.4181727468967438, "learning_rate": 4.276730486879464e-07, "loss": 0.4552, "step": 21560 }, { "epoch": 1.1972260748959778, "grad_norm": 0.2780179977416992, "learning_rate": 4.2741741521659897e-07, "loss": 0.4763, "step": 21580 }, { "epoch": 1.1983356449375866, "grad_norm": 0.2816377580165863, "learning_rate": 4.2716140747709516e-07, "loss": 0.583, "step": 21600 }, { "epoch": 1.1994452149791957, "grad_norm": 0.31979119777679443, "learning_rate": 4.269050260094922e-07, "loss": 0.5195, "step": 21620 }, { "epoch": 1.2005547850208045, "grad_norm": 0.24650464951992035, "learning_rate": 4.2664827135463544e-07, "loss": 0.5408, "step": 21640 }, { "epoch": 1.2016643550624133, "grad_norm": 0.30070972442626953, "learning_rate": 4.2639114405415777e-07, "loss": 0.5127, "step": 21660 }, { "epoch": 1.2027739251040221, "grad_norm": 0.2763945162296295, "learning_rate": 4.2613364465047805e-07, "loss": 0.4677, "step": 21680 }, { "epoch": 1.203883495145631, "grad_norm": 0.3006082773208618, "learning_rate": 4.258757736868004e-07, "loss": 0.4983, "step": 21700 }, { "epoch": 1.20499306518724, "grad_norm": 0.2762427031993866, "learning_rate": 4.256175317071122e-07, "loss": 0.4958, "step": 21720 }, { "epoch": 1.2061026352288489, "grad_norm": 0.26127681136131287, "learning_rate": 4.2535891925618394e-07, "loss": 0.509, "step": 21740 }, { "epoch": 1.2072122052704577, "grad_norm": 0.2729744613170624, "learning_rate": 4.250999368795676e-07, "loss": 0.4558, "step": 21760 }, { "epoch": 1.2083217753120665, "grad_norm": 0.44533059000968933, "learning_rate": 4.248405851235952e-07, "loss": 0.4441, "step": 21780 }, { "epoch": 1.2094313453536754, "grad_norm": 0.19278019666671753, "learning_rate": 4.2458086453537847e-07, "loss": 0.4977, "step": 21800 }, { "epoch": 1.2105409153952844, "grad_norm": 0.21814994513988495, "learning_rate": 4.2432077566280676e-07, "loss": 0.4991, "step": 21820 }, { "epoch": 1.2116504854368932, "grad_norm": 0.35878831148147583, "learning_rate": 4.2406031905454664e-07, "loss": 0.408, "step": 21840 }, { "epoch": 1.212760055478502, "grad_norm": 0.3183673620223999, "learning_rate": 4.237994952600402e-07, "loss": 0.4578, "step": 21860 }, { "epoch": 1.213869625520111, "grad_norm": 0.5679528117179871, "learning_rate": 4.235383048295044e-07, "loss": 0.4924, "step": 21880 }, { "epoch": 1.2149791955617197, "grad_norm": 0.19501584768295288, "learning_rate": 4.2327674831392923e-07, "loss": 0.5554, "step": 21900 }, { "epoch": 1.2160887656033288, "grad_norm": 0.3771109879016876, "learning_rate": 4.2301482626507733e-07, "loss": 0.48, "step": 21920 }, { "epoch": 1.2171983356449376, "grad_norm": 0.2654207646846771, "learning_rate": 4.2275253923548225e-07, "loss": 0.5021, "step": 21940 }, { "epoch": 1.2183079056865465, "grad_norm": 0.2312295138835907, "learning_rate": 4.2248988777844756e-07, "loss": 0.4928, "step": 21960 }, { "epoch": 1.2194174757281553, "grad_norm": 0.28616973757743835, "learning_rate": 4.2222687244804557e-07, "loss": 0.4519, "step": 21980 }, { "epoch": 1.2205270457697641, "grad_norm": 2.1662755012512207, "learning_rate": 4.219634937991161e-07, "loss": 0.5318, "step": 22000 }, { "epoch": 1.2216366158113732, "grad_norm": 0.4827166199684143, "learning_rate": 4.216997523872656e-07, "loss": 0.5303, "step": 22020 }, { "epoch": 1.222746185852982, "grad_norm": 0.368572860956192, "learning_rate": 4.214356487688656e-07, "loss": 0.4898, "step": 22040 }, { "epoch": 1.2238557558945908, "grad_norm": 0.296252578496933, "learning_rate": 4.211711835010519e-07, "loss": 0.472, "step": 22060 }, { "epoch": 1.2249653259361997, "grad_norm": 0.2842251658439636, "learning_rate": 4.2090635714172295e-07, "loss": 0.5373, "step": 22080 }, { "epoch": 1.2260748959778085, "grad_norm": 0.34295085072517395, "learning_rate": 4.206411702495393e-07, "loss": 0.4507, "step": 22100 }, { "epoch": 1.2271844660194176, "grad_norm": 0.2523220479488373, "learning_rate": 4.2037562338392156e-07, "loss": 0.5072, "step": 22120 }, { "epoch": 1.2282940360610264, "grad_norm": 0.25443604588508606, "learning_rate": 4.2010971710505024e-07, "loss": 0.4905, "step": 22140 }, { "epoch": 1.2294036061026352, "grad_norm": 0.23542262613773346, "learning_rate": 4.198434519738637e-07, "loss": 0.459, "step": 22160 }, { "epoch": 1.230513176144244, "grad_norm": 0.2792638838291168, "learning_rate": 4.1957682855205747e-07, "loss": 0.5183, "step": 22180 }, { "epoch": 1.2316227461858529, "grad_norm": 0.27320483326911926, "learning_rate": 4.1930984740208277e-07, "loss": 0.5027, "step": 22200 }, { "epoch": 1.232732316227462, "grad_norm": 0.25877195596694946, "learning_rate": 4.190425090871456e-07, "loss": 0.4876, "step": 22220 }, { "epoch": 1.2338418862690708, "grad_norm": 0.8559215664863586, "learning_rate": 4.187748141712054e-07, "loss": 0.5595, "step": 22240 }, { "epoch": 1.2349514563106796, "grad_norm": 0.537159264087677, "learning_rate": 4.185067632189737e-07, "loss": 0.5979, "step": 22260 }, { "epoch": 1.2360610263522884, "grad_norm": 0.4771907329559326, "learning_rate": 4.1823835679591333e-07, "loss": 0.4271, "step": 22280 }, { "epoch": 1.2371705963938973, "grad_norm": 0.23294192552566528, "learning_rate": 4.1796959546823694e-07, "loss": 0.5128, "step": 22300 }, { "epoch": 1.2382801664355063, "grad_norm": 0.42406392097473145, "learning_rate": 4.177004798029058e-07, "loss": 0.4261, "step": 22320 }, { "epoch": 1.2393897364771151, "grad_norm": 0.26407167315483093, "learning_rate": 4.174310103676285e-07, "loss": 0.4733, "step": 22340 }, { "epoch": 1.240499306518724, "grad_norm": 0.3006958067417145, "learning_rate": 4.1716118773086034e-07, "loss": 0.4061, "step": 22360 }, { "epoch": 1.2416088765603328, "grad_norm": 0.2354767620563507, "learning_rate": 4.1689101246180134e-07, "loss": 0.5419, "step": 22380 }, { "epoch": 1.2427184466019416, "grad_norm": 0.3211475610733032, "learning_rate": 4.166204851303957e-07, "loss": 0.5099, "step": 22400 }, { "epoch": 1.2438280166435507, "grad_norm": 0.30135178565979004, "learning_rate": 4.1634960630732996e-07, "loss": 0.5932, "step": 22420 }, { "epoch": 1.2449375866851595, "grad_norm": 0.3333330452442169, "learning_rate": 4.1607837656403245e-07, "loss": 0.487, "step": 22440 }, { "epoch": 1.2460471567267684, "grad_norm": 0.2941558063030243, "learning_rate": 4.158067964726716e-07, "loss": 0.6, "step": 22460 }, { "epoch": 1.2471567267683772, "grad_norm": 0.2161741554737091, "learning_rate": 4.1553486660615513e-07, "loss": 0.5328, "step": 22480 }, { "epoch": 1.248266296809986, "grad_norm": 0.27961641550064087, "learning_rate": 4.1526258753812833e-07, "loss": 0.493, "step": 22500 }, { "epoch": 1.249375866851595, "grad_norm": 0.31312963366508484, "learning_rate": 4.149899598429733e-07, "loss": 0.5104, "step": 22520 }, { "epoch": 1.250485436893204, "grad_norm": 0.30593591928482056, "learning_rate": 4.1471698409580765e-07, "loss": 0.4835, "step": 22540 }, { "epoch": 1.2515950069348127, "grad_norm": 0.2754952013492584, "learning_rate": 4.1444366087248304e-07, "loss": 0.5762, "step": 22560 }, { "epoch": 1.2527045769764216, "grad_norm": 0.2530832886695862, "learning_rate": 4.1416999074958427e-07, "loss": 0.5277, "step": 22580 }, { "epoch": 1.2538141470180304, "grad_norm": 0.3305167853832245, "learning_rate": 4.138959743044279e-07, "loss": 0.4397, "step": 22600 }, { "epoch": 1.2549237170596395, "grad_norm": 0.6943464279174805, "learning_rate": 4.136216121150611e-07, "loss": 0.5096, "step": 22620 }, { "epoch": 1.2560332871012483, "grad_norm": 0.3612828552722931, "learning_rate": 4.133469047602604e-07, "loss": 0.4614, "step": 22640 }, { "epoch": 1.2571428571428571, "grad_norm": 0.31668516993522644, "learning_rate": 4.130718528195303e-07, "loss": 0.4404, "step": 22660 }, { "epoch": 1.258252427184466, "grad_norm": 0.3303450345993042, "learning_rate": 4.1279645687310245e-07, "loss": 0.595, "step": 22680 }, { "epoch": 1.2593619972260748, "grad_norm": 0.3007012903690338, "learning_rate": 4.125207175019341e-07, "loss": 0.5484, "step": 22700 }, { "epoch": 1.2604715672676838, "grad_norm": 0.34879636764526367, "learning_rate": 4.12244635287707e-07, "loss": 0.4766, "step": 22720 }, { "epoch": 1.2615811373092927, "grad_norm": 0.21622873842716217, "learning_rate": 4.11968210812826e-07, "loss": 0.4977, "step": 22740 }, { "epoch": 1.2626907073509015, "grad_norm": 0.2922593355178833, "learning_rate": 4.1169144466041814e-07, "loss": 0.4522, "step": 22760 }, { "epoch": 1.2638002773925103, "grad_norm": 0.31872883439064026, "learning_rate": 4.114143374143311e-07, "loss": 0.4877, "step": 22780 }, { "epoch": 1.2649098474341192, "grad_norm": 0.23865756392478943, "learning_rate": 4.111368896591323e-07, "loss": 0.5128, "step": 22800 }, { "epoch": 1.2660194174757282, "grad_norm": 0.20843331515789032, "learning_rate": 4.108591019801072e-07, "loss": 0.5097, "step": 22820 }, { "epoch": 1.267128987517337, "grad_norm": 0.2617850601673126, "learning_rate": 4.1058097496325863e-07, "loss": 0.5396, "step": 22840 }, { "epoch": 1.2682385575589459, "grad_norm": 0.2932295501232147, "learning_rate": 4.10302509195305e-07, "loss": 0.4559, "step": 22860 }, { "epoch": 1.2693481276005547, "grad_norm": 0.23737436532974243, "learning_rate": 4.100237052636795e-07, "loss": 0.4344, "step": 22880 }, { "epoch": 1.2704576976421635, "grad_norm": 0.3473213016986847, "learning_rate": 4.0974456375652864e-07, "loss": 0.4274, "step": 22900 }, { "epoch": 1.2715672676837726, "grad_norm": 0.3310716450214386, "learning_rate": 4.0946508526271107e-07, "loss": 0.484, "step": 22920 }, { "epoch": 1.2726768377253814, "grad_norm": 0.4026238024234772, "learning_rate": 4.091852703717963e-07, "loss": 0.4494, "step": 22940 }, { "epoch": 1.2737864077669903, "grad_norm": 0.42452314496040344, "learning_rate": 4.089051196740635e-07, "loss": 0.5366, "step": 22960 }, { "epoch": 1.274895977808599, "grad_norm": 0.26845407485961914, "learning_rate": 4.086246337605002e-07, "loss": 0.4537, "step": 22980 }, { "epoch": 1.276005547850208, "grad_norm": 0.24991346895694733, "learning_rate": 4.0834381322280114e-07, "loss": 0.5199, "step": 23000 }, { "epoch": 1.277115117891817, "grad_norm": 0.30734777450561523, "learning_rate": 4.0806265865336693e-07, "loss": 0.4655, "step": 23020 }, { "epoch": 1.2782246879334258, "grad_norm": 0.4869508445262909, "learning_rate": 4.077811706453028e-07, "loss": 0.507, "step": 23040 }, { "epoch": 1.2793342579750346, "grad_norm": 0.26691967248916626, "learning_rate": 4.0749934979241746e-07, "loss": 0.5778, "step": 23060 }, { "epoch": 1.2804438280166435, "grad_norm": 0.410972535610199, "learning_rate": 4.072171966892216e-07, "loss": 0.4825, "step": 23080 }, { "epoch": 1.2815533980582523, "grad_norm": 0.28712522983551025, "learning_rate": 4.069347119309271e-07, "loss": 0.4653, "step": 23100 }, { "epoch": 1.2826629680998614, "grad_norm": 0.2712896466255188, "learning_rate": 4.066518961134452e-07, "loss": 0.5359, "step": 23120 }, { "epoch": 1.2837725381414702, "grad_norm": 0.330881804227829, "learning_rate": 4.063687498333856e-07, "loss": 0.605, "step": 23140 }, { "epoch": 1.284882108183079, "grad_norm": 0.24023129045963287, "learning_rate": 4.060852736880553e-07, "loss": 0.5086, "step": 23160 }, { "epoch": 1.2859916782246879, "grad_norm": 0.40944501757621765, "learning_rate": 4.058014682754568e-07, "loss": 0.5199, "step": 23180 }, { "epoch": 1.2871012482662967, "grad_norm": 0.3029615581035614, "learning_rate": 4.055173341942874e-07, "loss": 0.4488, "step": 23200 }, { "epoch": 1.2882108183079057, "grad_norm": 0.2231222540140152, "learning_rate": 4.0523287204393795e-07, "loss": 0.4456, "step": 23220 }, { "epoch": 1.2893203883495146, "grad_norm": 0.32356059551239014, "learning_rate": 4.04948082424491e-07, "loss": 0.4551, "step": 23240 }, { "epoch": 1.2904299583911234, "grad_norm": 0.4188784956932068, "learning_rate": 4.046629659367201e-07, "loss": 0.4847, "step": 23260 }, { "epoch": 1.2915395284327325, "grad_norm": 0.27792972326278687, "learning_rate": 4.0437752318208846e-07, "loss": 0.4571, "step": 23280 }, { "epoch": 1.292649098474341, "grad_norm": 0.22208185493946075, "learning_rate": 4.040917547627472e-07, "loss": 0.4765, "step": 23300 }, { "epoch": 1.2937586685159501, "grad_norm": 0.29756030440330505, "learning_rate": 4.0380566128153474e-07, "loss": 0.5847, "step": 23320 }, { "epoch": 1.294868238557559, "grad_norm": 0.36027979850769043, "learning_rate": 4.0351924334197516e-07, "loss": 0.4265, "step": 23340 }, { "epoch": 1.2959778085991678, "grad_norm": 0.2652425467967987, "learning_rate": 4.0323250154827703e-07, "loss": 0.5125, "step": 23360 }, { "epoch": 1.2970873786407768, "grad_norm": 0.24833956360816956, "learning_rate": 4.02945436505332e-07, "loss": 0.5339, "step": 23380 }, { "epoch": 1.2981969486823854, "grad_norm": 0.6940369009971619, "learning_rate": 4.0265804881871366e-07, "loss": 0.4995, "step": 23400 }, { "epoch": 1.2993065187239945, "grad_norm": 0.3002369701862335, "learning_rate": 4.023703390946762e-07, "loss": 0.4073, "step": 23420 }, { "epoch": 1.3004160887656033, "grad_norm": 0.21742357313632965, "learning_rate": 4.0208230794015344e-07, "loss": 0.6188, "step": 23440 }, { "epoch": 1.3015256588072122, "grad_norm": 0.31908342242240906, "learning_rate": 4.0179395596275665e-07, "loss": 0.6126, "step": 23460 }, { "epoch": 1.3026352288488212, "grad_norm": 0.3762455880641937, "learning_rate": 4.015052837707746e-07, "loss": 0.4659, "step": 23480 }, { "epoch": 1.3037447988904298, "grad_norm": 0.3659731149673462, "learning_rate": 4.0121629197317117e-07, "loss": 0.533, "step": 23500 }, { "epoch": 1.3048543689320389, "grad_norm": 0.39868220686912537, "learning_rate": 4.0092698117958447e-07, "loss": 0.5323, "step": 23520 }, { "epoch": 1.3059639389736477, "grad_norm": 0.33988723158836365, "learning_rate": 4.006373520003256e-07, "loss": 0.4713, "step": 23540 }, { "epoch": 1.3070735090152565, "grad_norm": 0.2681286036968231, "learning_rate": 4.003474050463772e-07, "loss": 0.5123, "step": 23560 }, { "epoch": 1.3081830790568656, "grad_norm": 0.3168198764324188, "learning_rate": 4.0005714092939255e-07, "loss": 0.5196, "step": 23580 }, { "epoch": 1.3092926490984742, "grad_norm": 0.3387150764465332, "learning_rate": 3.997665602616938e-07, "loss": 0.4943, "step": 23600 }, { "epoch": 1.3104022191400833, "grad_norm": 0.29789161682128906, "learning_rate": 3.9947566365627077e-07, "loss": 0.4378, "step": 23620 }, { "epoch": 1.311511789181692, "grad_norm": 0.3228195309638977, "learning_rate": 3.9918445172677995e-07, "loss": 0.428, "step": 23640 }, { "epoch": 1.312621359223301, "grad_norm": 0.2895941436290741, "learning_rate": 3.9889292508754304e-07, "loss": 0.5233, "step": 23660 }, { "epoch": 1.31373092926491, "grad_norm": 0.27492010593414307, "learning_rate": 3.986010843535453e-07, "loss": 0.4765, "step": 23680 }, { "epoch": 1.3148404993065188, "grad_norm": 0.370553195476532, "learning_rate": 3.983089301404351e-07, "loss": 0.5703, "step": 23700 }, { "epoch": 1.3159500693481276, "grad_norm": 0.22017182409763336, "learning_rate": 3.980164630645215e-07, "loss": 0.5387, "step": 23720 }, { "epoch": 1.3170596393897365, "grad_norm": 0.1595495194196701, "learning_rate": 3.9772368374277427e-07, "loss": 0.4841, "step": 23740 }, { "epoch": 1.3181692094313453, "grad_norm": 0.2701757550239563, "learning_rate": 3.9743059279282126e-07, "loss": 0.565, "step": 23760 }, { "epoch": 1.3192787794729544, "grad_norm": 0.24846290051937103, "learning_rate": 3.9713719083294795e-07, "loss": 0.5165, "step": 23780 }, { "epoch": 1.3203883495145632, "grad_norm": 0.5249803066253662, "learning_rate": 3.968434784820959e-07, "loss": 0.5461, "step": 23800 }, { "epoch": 1.321497919556172, "grad_norm": 0.3887864053249359, "learning_rate": 3.9654945635986155e-07, "loss": 0.4551, "step": 23820 }, { "epoch": 1.3226074895977808, "grad_norm": 0.2729036509990692, "learning_rate": 3.962551250864945e-07, "loss": 0.5549, "step": 23840 }, { "epoch": 1.3237170596393897, "grad_norm": 0.3999611437320709, "learning_rate": 3.9596048528289695e-07, "loss": 0.4977, "step": 23860 }, { "epoch": 1.3248266296809987, "grad_norm": 0.3275601863861084, "learning_rate": 3.9566553757062154e-07, "loss": 0.5857, "step": 23880 }, { "epoch": 1.3259361997226076, "grad_norm": 0.20718099176883698, "learning_rate": 3.953702825718708e-07, "loss": 0.5084, "step": 23900 }, { "epoch": 1.3270457697642164, "grad_norm": 0.26601970195770264, "learning_rate": 3.950747209094952e-07, "loss": 0.5272, "step": 23920 }, { "epoch": 1.3281553398058252, "grad_norm": 0.34316569566726685, "learning_rate": 3.947788532069923e-07, "loss": 0.5372, "step": 23940 }, { "epoch": 1.329264909847434, "grad_norm": 0.2882291078567505, "learning_rate": 3.9448268008850526e-07, "loss": 0.5044, "step": 23960 }, { "epoch": 1.3303744798890431, "grad_norm": 0.4047781229019165, "learning_rate": 3.9418620217882146e-07, "loss": 0.5598, "step": 23980 }, { "epoch": 1.331484049930652, "grad_norm": 0.32583579421043396, "learning_rate": 3.938894201033713e-07, "loss": 0.542, "step": 24000 }, { "epoch": 1.3325936199722608, "grad_norm": 0.29183679819107056, "learning_rate": 3.9359233448822684e-07, "loss": 0.4634, "step": 24020 }, { "epoch": 1.3337031900138696, "grad_norm": 0.38822606205940247, "learning_rate": 3.9329494596010024e-07, "loss": 0.5621, "step": 24040 }, { "epoch": 1.3348127600554784, "grad_norm": 0.3026890158653259, "learning_rate": 3.929972551463431e-07, "loss": 0.4908, "step": 24060 }, { "epoch": 1.3359223300970875, "grad_norm": 0.3104611933231354, "learning_rate": 3.9269926267494437e-07, "loss": 0.4917, "step": 24080 }, { "epoch": 1.3370319001386963, "grad_norm": 0.26182132959365845, "learning_rate": 3.9240096917452935e-07, "loss": 0.4852, "step": 24100 }, { "epoch": 1.3381414701803052, "grad_norm": 0.22805210947990417, "learning_rate": 3.9210237527435864e-07, "loss": 0.4179, "step": 24120 }, { "epoch": 1.339251040221914, "grad_norm": 0.5565185546875, "learning_rate": 3.9180348160432617e-07, "loss": 0.4638, "step": 24140 }, { "epoch": 1.3403606102635228, "grad_norm": 0.24348342418670654, "learning_rate": 3.9150428879495855e-07, "loss": 0.5448, "step": 24160 }, { "epoch": 1.3414701803051319, "grad_norm": 0.312476247549057, "learning_rate": 3.9120479747741344e-07, "loss": 0.4453, "step": 24180 }, { "epoch": 1.3425797503467407, "grad_norm": 0.210465207695961, "learning_rate": 3.9090500828347794e-07, "loss": 0.4625, "step": 24200 }, { "epoch": 1.3436893203883495, "grad_norm": 0.328029066324234, "learning_rate": 3.9060492184556777e-07, "loss": 0.5302, "step": 24220 }, { "epoch": 1.3447988904299584, "grad_norm": 0.2610151171684265, "learning_rate": 3.903045387967256e-07, "loss": 0.5442, "step": 24240 }, { "epoch": 1.3459084604715672, "grad_norm": 0.3073832392692566, "learning_rate": 3.9000385977061977e-07, "loss": 0.53, "step": 24260 }, { "epoch": 1.3470180305131763, "grad_norm": 0.35046860575675964, "learning_rate": 3.8970288540154315e-07, "loss": 0.4835, "step": 24280 }, { "epoch": 1.348127600554785, "grad_norm": 0.2784862518310547, "learning_rate": 3.8940161632441157e-07, "loss": 0.5455, "step": 24300 }, { "epoch": 1.349237170596394, "grad_norm": 0.29548051953315735, "learning_rate": 3.891000531747625e-07, "loss": 0.4628, "step": 24320 }, { "epoch": 1.3503467406380028, "grad_norm": 0.2380640059709549, "learning_rate": 3.887981965887537e-07, "loss": 0.5328, "step": 24340 }, { "epoch": 1.3514563106796116, "grad_norm": 0.2840331196784973, "learning_rate": 3.884960472031622e-07, "loss": 0.4684, "step": 24360 }, { "epoch": 1.3525658807212206, "grad_norm": 0.34044143557548523, "learning_rate": 3.881936056553825e-07, "loss": 0.6123, "step": 24380 }, { "epoch": 1.3536754507628295, "grad_norm": 0.32320764660835266, "learning_rate": 3.8789087258342544e-07, "loss": 0.5392, "step": 24400 }, { "epoch": 1.3547850208044383, "grad_norm": 0.3448775112628937, "learning_rate": 3.87587848625917e-07, "loss": 0.4576, "step": 24420 }, { "epoch": 1.3558945908460471, "grad_norm": 0.25193849205970764, "learning_rate": 3.872845344220965e-07, "loss": 0.4429, "step": 24440 }, { "epoch": 1.357004160887656, "grad_norm": 0.421467661857605, "learning_rate": 3.8698093061181574e-07, "loss": 0.6663, "step": 24460 }, { "epoch": 1.358113730929265, "grad_norm": 0.25269341468811035, "learning_rate": 3.866770378355375e-07, "loss": 0.5185, "step": 24480 }, { "epoch": 1.3592233009708738, "grad_norm": 0.34419819712638855, "learning_rate": 3.863728567343341e-07, "loss": 0.4848, "step": 24500 }, { "epoch": 1.3603328710124827, "grad_norm": 0.37951138615608215, "learning_rate": 3.8606838794988603e-07, "loss": 0.5418, "step": 24520 }, { "epoch": 1.3614424410540915, "grad_norm": 0.6216520667076111, "learning_rate": 3.8576363212448057e-07, "loss": 0.5088, "step": 24540 }, { "epoch": 1.3625520110957003, "grad_norm": 0.298691064119339, "learning_rate": 3.854585899010109e-07, "loss": 0.6277, "step": 24560 }, { "epoch": 1.3636615811373094, "grad_norm": 0.38073039054870605, "learning_rate": 3.851532619229738e-07, "loss": 0.4725, "step": 24580 }, { "epoch": 1.3647711511789182, "grad_norm": 0.25239279866218567, "learning_rate": 3.8484764883446944e-07, "loss": 0.4936, "step": 24600 }, { "epoch": 1.365880721220527, "grad_norm": 0.28390464186668396, "learning_rate": 3.8454175128019905e-07, "loss": 0.4708, "step": 24620 }, { "epoch": 1.366990291262136, "grad_norm": 0.1831650286912918, "learning_rate": 3.842355699054641e-07, "loss": 0.5109, "step": 24640 }, { "epoch": 1.3680998613037447, "grad_norm": 0.23556751012802124, "learning_rate": 3.8392910535616476e-07, "loss": 0.5446, "step": 24660 }, { "epoch": 1.3692094313453538, "grad_norm": 0.2929559051990509, "learning_rate": 3.836223582787985e-07, "loss": 0.5635, "step": 24680 }, { "epoch": 1.3703190013869626, "grad_norm": 0.3317455053329468, "learning_rate": 3.8331532932045884e-07, "loss": 0.466, "step": 24700 }, { "epoch": 1.3714285714285714, "grad_norm": 0.6451997756958008, "learning_rate": 3.8300801912883414e-07, "loss": 0.5717, "step": 24720 }, { "epoch": 1.3725381414701803, "grad_norm": 0.40009111166000366, "learning_rate": 3.8270042835220575e-07, "loss": 0.5186, "step": 24740 }, { "epoch": 1.373647711511789, "grad_norm": 0.34240442514419556, "learning_rate": 3.823925576394469e-07, "loss": 0.616, "step": 24760 }, { "epoch": 1.3747572815533982, "grad_norm": 0.38171151280403137, "learning_rate": 3.820844076400216e-07, "loss": 0.4937, "step": 24780 }, { "epoch": 1.375866851595007, "grad_norm": 0.330308735370636, "learning_rate": 3.8177597900398295e-07, "loss": 0.4526, "step": 24800 }, { "epoch": 1.3769764216366158, "grad_norm": 0.2901124060153961, "learning_rate": 3.8146727238197167e-07, "loss": 0.4587, "step": 24820 }, { "epoch": 1.3780859916782247, "grad_norm": 0.30086854100227356, "learning_rate": 3.8115828842521514e-07, "loss": 0.523, "step": 24840 }, { "epoch": 1.3791955617198335, "grad_norm": 0.32858511805534363, "learning_rate": 3.808490277855256e-07, "loss": 0.4959, "step": 24860 }, { "epoch": 1.3803051317614425, "grad_norm": 0.2536601424217224, "learning_rate": 3.80539491115299e-07, "loss": 0.5447, "step": 24880 }, { "epoch": 1.3814147018030514, "grad_norm": 0.29281720519065857, "learning_rate": 3.802296790675137e-07, "loss": 0.4897, "step": 24900 }, { "epoch": 1.3825242718446602, "grad_norm": 0.344677597284317, "learning_rate": 3.79919592295729e-07, "loss": 0.5809, "step": 24920 }, { "epoch": 1.383633841886269, "grad_norm": 0.3403691053390503, "learning_rate": 3.796092314540834e-07, "loss": 0.5333, "step": 24940 }, { "epoch": 1.3847434119278779, "grad_norm": 0.28125202655792236, "learning_rate": 3.7929859719729394e-07, "loss": 0.5742, "step": 24960 }, { "epoch": 1.385852981969487, "grad_norm": 0.2931448221206665, "learning_rate": 3.7898769018065425e-07, "loss": 0.4605, "step": 24980 }, { "epoch": 1.3869625520110958, "grad_norm": 0.3080357611179352, "learning_rate": 3.786765110600334e-07, "loss": 0.4492, "step": 25000 }, { "epoch": 1.3880721220527046, "grad_norm": 0.3714500665664673, "learning_rate": 3.783650604918746e-07, "loss": 0.4219, "step": 25020 }, { "epoch": 1.3891816920943134, "grad_norm": 0.31632566452026367, "learning_rate": 3.7805333913319346e-07, "loss": 0.4306, "step": 25040 }, { "epoch": 1.3902912621359222, "grad_norm": 0.24262356758117676, "learning_rate": 3.777413476415769e-07, "loss": 0.6016, "step": 25060 }, { "epoch": 1.3914008321775313, "grad_norm": 0.3781633675098419, "learning_rate": 3.7742908667518175e-07, "loss": 0.56, "step": 25080 }, { "epoch": 1.3925104022191401, "grad_norm": 0.30562499165534973, "learning_rate": 3.771165568927335e-07, "loss": 0.5271, "step": 25100 }, { "epoch": 1.393619972260749, "grad_norm": 0.6770728230476379, "learning_rate": 3.768037589535241e-07, "loss": 0.4877, "step": 25120 }, { "epoch": 1.3947295423023578, "grad_norm": 0.31421026587486267, "learning_rate": 3.7649069351741185e-07, "loss": 0.5459, "step": 25140 }, { "epoch": 1.3958391123439666, "grad_norm": 0.34310925006866455, "learning_rate": 3.7617736124481913e-07, "loss": 0.4435, "step": 25160 }, { "epoch": 1.3969486823855757, "grad_norm": 0.3297117352485657, "learning_rate": 3.758637627967311e-07, "loss": 0.5602, "step": 25180 }, { "epoch": 1.3980582524271845, "grad_norm": 0.29948461055755615, "learning_rate": 3.755498988346945e-07, "loss": 0.5443, "step": 25200 }, { "epoch": 1.3991678224687933, "grad_norm": 0.3100070357322693, "learning_rate": 3.7523577002081607e-07, "loss": 0.5776, "step": 25220 }, { "epoch": 1.4002773925104022, "grad_norm": 0.36609548330307007, "learning_rate": 3.749213770177616e-07, "loss": 0.4609, "step": 25240 }, { "epoch": 1.401386962552011, "grad_norm": 0.3436796963214874, "learning_rate": 3.746067204887538e-07, "loss": 0.52, "step": 25260 }, { "epoch": 1.40249653259362, "grad_norm": 0.300744891166687, "learning_rate": 3.742918010975716e-07, "loss": 0.5269, "step": 25280 }, { "epoch": 1.403606102635229, "grad_norm": 0.29174962639808655, "learning_rate": 3.7397661950854813e-07, "loss": 0.4875, "step": 25300 }, { "epoch": 1.4047156726768377, "grad_norm": 0.2820412516593933, "learning_rate": 3.7366117638657e-07, "loss": 0.523, "step": 25320 }, { "epoch": 1.4058252427184466, "grad_norm": 0.24443332850933075, "learning_rate": 3.7334547239707536e-07, "loss": 0.5159, "step": 25340 }, { "epoch": 1.4069348127600554, "grad_norm": 0.29381799697875977, "learning_rate": 3.730295082060525e-07, "loss": 0.4793, "step": 25360 }, { "epoch": 1.4080443828016644, "grad_norm": 0.26477178931236267, "learning_rate": 3.72713284480039e-07, "loss": 0.3968, "step": 25380 }, { "epoch": 1.4091539528432733, "grad_norm": 0.239963561296463, "learning_rate": 3.723968018861195e-07, "loss": 0.4335, "step": 25400 }, { "epoch": 1.410263522884882, "grad_norm": 0.3080999553203583, "learning_rate": 3.7208006109192503e-07, "loss": 0.4871, "step": 25420 }, { "epoch": 1.411373092926491, "grad_norm": 0.3935830593109131, "learning_rate": 3.7176306276563126e-07, "loss": 0.5126, "step": 25440 }, { "epoch": 1.4124826629680998, "grad_norm": 0.26854172348976135, "learning_rate": 3.7144580757595704e-07, "loss": 0.5425, "step": 25460 }, { "epoch": 1.4135922330097088, "grad_norm": 0.28879329562187195, "learning_rate": 3.711282961921632e-07, "loss": 0.5079, "step": 25480 }, { "epoch": 1.4147018030513177, "grad_norm": 0.371822327375412, "learning_rate": 3.708105292840509e-07, "loss": 0.5236, "step": 25500 }, { "epoch": 1.4158113730929265, "grad_norm": 0.34833621978759766, "learning_rate": 3.7049250752196036e-07, "loss": 0.5434, "step": 25520 }, { "epoch": 1.4169209431345353, "grad_norm": 0.25428009033203125, "learning_rate": 3.7017423157676965e-07, "loss": 0.5739, "step": 25540 }, { "epoch": 1.4180305131761441, "grad_norm": 0.3593008816242218, "learning_rate": 3.698557021198925e-07, "loss": 0.5076, "step": 25560 }, { "epoch": 1.4191400832177532, "grad_norm": 0.3411901891231537, "learning_rate": 3.695369198232781e-07, "loss": 0.4725, "step": 25580 }, { "epoch": 1.420249653259362, "grad_norm": 0.3985610902309418, "learning_rate": 3.6921788535940854e-07, "loss": 0.5859, "step": 25600 }, { "epoch": 1.4213592233009709, "grad_norm": 0.4536857604980469, "learning_rate": 3.6889859940129814e-07, "loss": 0.5451, "step": 25620 }, { "epoch": 1.4224687933425797, "grad_norm": 0.34659093618392944, "learning_rate": 3.685790626224916e-07, "loss": 0.501, "step": 25640 }, { "epoch": 1.4235783633841885, "grad_norm": 0.2774284780025482, "learning_rate": 3.682592756970626e-07, "loss": 0.5851, "step": 25660 }, { "epoch": 1.4246879334257976, "grad_norm": 0.35417017340660095, "learning_rate": 3.6793923929961296e-07, "loss": 0.495, "step": 25680 }, { "epoch": 1.4257975034674064, "grad_norm": 0.3469982147216797, "learning_rate": 3.6761895410527034e-07, "loss": 0.5818, "step": 25700 }, { "epoch": 1.4269070735090152, "grad_norm": 0.3335203230381012, "learning_rate": 3.6729842078968744e-07, "loss": 0.4706, "step": 25720 }, { "epoch": 1.428016643550624, "grad_norm": 0.29470697045326233, "learning_rate": 3.669776400290403e-07, "loss": 0.5086, "step": 25740 }, { "epoch": 1.429126213592233, "grad_norm": 0.3146023154258728, "learning_rate": 3.6665661250002713e-07, "loss": 0.4697, "step": 25760 }, { "epoch": 1.430235783633842, "grad_norm": 0.2865731418132782, "learning_rate": 3.6633533887986645e-07, "loss": 0.5199, "step": 25780 }, { "epoch": 1.4313453536754508, "grad_norm": 0.38033854961395264, "learning_rate": 3.66013819846296e-07, "loss": 0.5234, "step": 25800 }, { "epoch": 1.4324549237170596, "grad_norm": 0.3382117450237274, "learning_rate": 3.6569205607757147e-07, "loss": 0.522, "step": 25820 }, { "epoch": 1.4335644937586685, "grad_norm": 0.29803699254989624, "learning_rate": 3.6537004825246443e-07, "loss": 0.4439, "step": 25840 }, { "epoch": 1.4346740638002773, "grad_norm": 0.24666158854961395, "learning_rate": 3.6504779705026156e-07, "loss": 0.5336, "step": 25860 }, { "epoch": 1.4357836338418863, "grad_norm": 0.37512490153312683, "learning_rate": 3.64725303150763e-07, "loss": 0.5119, "step": 25880 }, { "epoch": 1.4368932038834952, "grad_norm": 0.32666176557540894, "learning_rate": 3.6440256723428053e-07, "loss": 0.4829, "step": 25900 }, { "epoch": 1.438002773925104, "grad_norm": 0.30997928977012634, "learning_rate": 3.6407958998163687e-07, "loss": 0.4334, "step": 25920 }, { "epoch": 1.4391123439667128, "grad_norm": 0.2500138580799103, "learning_rate": 3.6375637207416365e-07, "loss": 0.5359, "step": 25940 }, { "epoch": 1.4402219140083217, "grad_norm": 0.22119645774364471, "learning_rate": 3.634329141937e-07, "loss": 0.5046, "step": 25960 }, { "epoch": 1.4413314840499307, "grad_norm": 0.32077789306640625, "learning_rate": 3.6310921702259184e-07, "loss": 0.5031, "step": 25980 }, { "epoch": 1.4424410540915396, "grad_norm": 0.2976148724555969, "learning_rate": 3.627852812436892e-07, "loss": 0.5256, "step": 26000 }, { "epoch": 1.4435506241331484, "grad_norm": 0.2778477072715759, "learning_rate": 3.62461107540346e-07, "loss": 0.5425, "step": 26020 }, { "epoch": 1.4446601941747572, "grad_norm": 0.3003707230091095, "learning_rate": 3.6213669659641757e-07, "loss": 0.4741, "step": 26040 }, { "epoch": 1.445769764216366, "grad_norm": 0.24113766849040985, "learning_rate": 3.6181204909626027e-07, "loss": 0.5264, "step": 26060 }, { "epoch": 1.446879334257975, "grad_norm": 0.29167383909225464, "learning_rate": 3.614871657247291e-07, "loss": 0.5621, "step": 26080 }, { "epoch": 1.447988904299584, "grad_norm": 0.33470118045806885, "learning_rate": 3.611620471671766e-07, "loss": 0.3946, "step": 26100 }, { "epoch": 1.4490984743411928, "grad_norm": 0.22758501768112183, "learning_rate": 3.608366941094518e-07, "loss": 0.4709, "step": 26120 }, { "epoch": 1.4502080443828016, "grad_norm": 0.41667652130126953, "learning_rate": 3.6051110723789813e-07, "loss": 0.5822, "step": 26140 }, { "epoch": 1.4513176144244104, "grad_norm": 0.33780637383461, "learning_rate": 3.6018528723935214e-07, "loss": 0.3956, "step": 26160 }, { "epoch": 1.4524271844660195, "grad_norm": 0.22279702126979828, "learning_rate": 3.5985923480114263e-07, "loss": 0.4754, "step": 26180 }, { "epoch": 1.4535367545076283, "grad_norm": 0.2962397634983063, "learning_rate": 3.595329506110883e-07, "loss": 0.4423, "step": 26200 }, { "epoch": 1.4546463245492371, "grad_norm": 0.4138476550579071, "learning_rate": 3.5920643535749696e-07, "loss": 0.5364, "step": 26220 }, { "epoch": 1.455755894590846, "grad_norm": 0.835754930973053, "learning_rate": 3.588796897291638e-07, "loss": 0.4798, "step": 26240 }, { "epoch": 1.4568654646324548, "grad_norm": 0.2033246010541916, "learning_rate": 3.5855271441536996e-07, "loss": 0.5367, "step": 26260 }, { "epoch": 1.4579750346740639, "grad_norm": 0.32556480169296265, "learning_rate": 3.582255101058811e-07, "loss": 0.5065, "step": 26280 }, { "epoch": 1.4590846047156727, "grad_norm": 0.3730859160423279, "learning_rate": 3.578980774909461e-07, "loss": 0.5272, "step": 26300 }, { "epoch": 1.4601941747572815, "grad_norm": 0.3475755453109741, "learning_rate": 3.575704172612953e-07, "loss": 0.4888, "step": 26320 }, { "epoch": 1.4613037447988904, "grad_norm": 0.32908475399017334, "learning_rate": 3.572425301081392e-07, "loss": 0.5496, "step": 26340 }, { "epoch": 1.4624133148404992, "grad_norm": 0.24404336512088776, "learning_rate": 3.569144167231672e-07, "loss": 0.5632, "step": 26360 }, { "epoch": 1.4635228848821082, "grad_norm": 0.2568022608757019, "learning_rate": 3.5658607779854566e-07, "loss": 0.5337, "step": 26380 }, { "epoch": 1.464632454923717, "grad_norm": 0.27923357486724854, "learning_rate": 3.5625751402691693e-07, "loss": 0.3616, "step": 26400 }, { "epoch": 1.465742024965326, "grad_norm": 0.2678179442882538, "learning_rate": 3.559287261013977e-07, "loss": 0.4167, "step": 26420 }, { "epoch": 1.4668515950069347, "grad_norm": 0.28212153911590576, "learning_rate": 3.5559971471557725e-07, "loss": 0.499, "step": 26440 }, { "epoch": 1.4679611650485436, "grad_norm": 0.24012410640716553, "learning_rate": 3.5527048056351654e-07, "loss": 0.5544, "step": 26460 }, { "epoch": 1.4690707350901526, "grad_norm": 0.2977813482284546, "learning_rate": 3.549410243397464e-07, "loss": 0.524, "step": 26480 }, { "epoch": 1.4701803051317615, "grad_norm": 0.24951153993606567, "learning_rate": 3.5461134673926615e-07, "loss": 0.4429, "step": 26500 }, { "epoch": 1.4712898751733703, "grad_norm": 0.4653339087963104, "learning_rate": 3.542814484575419e-07, "loss": 0.5169, "step": 26520 }, { "epoch": 1.4723994452149791, "grad_norm": 0.23980718851089478, "learning_rate": 3.5395133019050557e-07, "loss": 0.523, "step": 26540 }, { "epoch": 1.473509015256588, "grad_norm": 0.38057467341423035, "learning_rate": 3.53620992634553e-07, "loss": 0.5357, "step": 26560 }, { "epoch": 1.474618585298197, "grad_norm": 0.2189929336309433, "learning_rate": 3.532904364865426e-07, "loss": 0.4166, "step": 26580 }, { "epoch": 1.4757281553398058, "grad_norm": 0.27599650621414185, "learning_rate": 3.5295966244379407e-07, "loss": 0.4656, "step": 26600 }, { "epoch": 1.4768377253814147, "grad_norm": 0.29793888330459595, "learning_rate": 3.5262867120408655e-07, "loss": 0.4882, "step": 26620 }, { "epoch": 1.4779472954230235, "grad_norm": 0.3658837080001831, "learning_rate": 3.522974634656576e-07, "loss": 0.5683, "step": 26640 }, { "epoch": 1.4790568654646323, "grad_norm": 0.5472880601882935, "learning_rate": 3.5196603992720125e-07, "loss": 0.4744, "step": 26660 }, { "epoch": 1.4801664355062414, "grad_norm": 0.6574767827987671, "learning_rate": 3.5163440128786696e-07, "loss": 0.4488, "step": 26680 }, { "epoch": 1.4812760055478502, "grad_norm": 0.3188958168029785, "learning_rate": 3.5130254824725787e-07, "loss": 0.4608, "step": 26700 }, { "epoch": 1.482385575589459, "grad_norm": 0.26202791929244995, "learning_rate": 3.509704815054294e-07, "loss": 0.4767, "step": 26720 }, { "epoch": 1.4834951456310679, "grad_norm": 0.37773221731185913, "learning_rate": 3.506382017628878e-07, "loss": 0.4856, "step": 26740 }, { "epoch": 1.4846047156726767, "grad_norm": 0.337298721075058, "learning_rate": 3.503057097205885e-07, "loss": 0.4589, "step": 26760 }, { "epoch": 1.4857142857142858, "grad_norm": 0.21895577013492584, "learning_rate": 3.499730060799352e-07, "loss": 0.461, "step": 26780 }, { "epoch": 1.4868238557558946, "grad_norm": 0.31171756982803345, "learning_rate": 3.4964009154277756e-07, "loss": 0.4823, "step": 26800 }, { "epoch": 1.4879334257975034, "grad_norm": 0.3064799904823303, "learning_rate": 3.4930696681141034e-07, "loss": 0.4772, "step": 26820 }, { "epoch": 1.4890429958391125, "grad_norm": 0.29456526041030884, "learning_rate": 3.4897363258857156e-07, "loss": 0.5023, "step": 26840 }, { "epoch": 1.490152565880721, "grad_norm": 0.3055489659309387, "learning_rate": 3.486400895774413e-07, "loss": 0.535, "step": 26860 }, { "epoch": 1.4912621359223301, "grad_norm": 0.3658798038959503, "learning_rate": 3.4830633848164006e-07, "loss": 0.4723, "step": 26880 }, { "epoch": 1.492371705963939, "grad_norm": 0.2875451445579529, "learning_rate": 3.479723800052272e-07, "loss": 0.5867, "step": 26900 }, { "epoch": 1.4934812760055478, "grad_norm": 0.33762866258621216, "learning_rate": 3.4763821485269985e-07, "loss": 0.4568, "step": 26920 }, { "epoch": 1.4945908460471569, "grad_norm": 0.280441015958786, "learning_rate": 3.473038437289907e-07, "loss": 0.5172, "step": 26940 }, { "epoch": 1.4957004160887655, "grad_norm": 0.2871154546737671, "learning_rate": 3.469692673394673e-07, "loss": 0.5014, "step": 26960 }, { "epoch": 1.4968099861303745, "grad_norm": 0.2808482050895691, "learning_rate": 3.466344863899301e-07, "loss": 0.4633, "step": 26980 }, { "epoch": 1.4979195561719834, "grad_norm": 0.2705095708370209, "learning_rate": 3.462995015866109e-07, "loss": 0.6002, "step": 27000 }, { "epoch": 1.4990291262135922, "grad_norm": 0.2726444602012634, "learning_rate": 3.4596431363617186e-07, "loss": 0.3903, "step": 27020 }, { "epoch": 1.5001386962552012, "grad_norm": 0.2183285653591156, "learning_rate": 3.4562892324570347e-07, "loss": 0.5002, "step": 27040 }, { "epoch": 1.5012482662968099, "grad_norm": 0.31023529171943665, "learning_rate": 3.452933311227232e-07, "loss": 0.4914, "step": 27060 }, { "epoch": 1.502357836338419, "grad_norm": 0.2810976505279541, "learning_rate": 3.449575379751744e-07, "loss": 0.5259, "step": 27080 }, { "epoch": 1.5034674063800277, "grad_norm": 0.2652877867221832, "learning_rate": 3.446215445114243e-07, "loss": 0.4479, "step": 27100 }, { "epoch": 1.5045769764216366, "grad_norm": 0.1707787811756134, "learning_rate": 3.442853514402626e-07, "loss": 0.5258, "step": 27120 }, { "epoch": 1.5056865464632456, "grad_norm": 0.3659198582172394, "learning_rate": 3.4394895947090007e-07, "loss": 0.5216, "step": 27140 }, { "epoch": 1.5067961165048542, "grad_norm": 0.24395674467086792, "learning_rate": 3.4361236931296746e-07, "loss": 0.4976, "step": 27160 }, { "epoch": 1.5079056865464633, "grad_norm": 0.3418830335140228, "learning_rate": 3.432755816765131e-07, "loss": 0.5203, "step": 27180 }, { "epoch": 1.5090152565880721, "grad_norm": 0.3920184373855591, "learning_rate": 3.429385972720022e-07, "loss": 0.4878, "step": 27200 }, { "epoch": 1.510124826629681, "grad_norm": 0.24508875608444214, "learning_rate": 3.42601416810315e-07, "loss": 0.4905, "step": 27220 }, { "epoch": 1.51123439667129, "grad_norm": 0.5051547288894653, "learning_rate": 3.422640410027451e-07, "loss": 0.4829, "step": 27240 }, { "epoch": 1.5123439667128986, "grad_norm": 0.40496188402175903, "learning_rate": 3.4192647056099876e-07, "loss": 0.4523, "step": 27260 }, { "epoch": 1.5134535367545077, "grad_norm": 0.3536234200000763, "learning_rate": 3.415887061971923e-07, "loss": 0.5565, "step": 27280 }, { "epoch": 1.5145631067961165, "grad_norm": 0.3378537595272064, "learning_rate": 3.412507486238512e-07, "loss": 0.5054, "step": 27300 }, { "epoch": 1.5156726768377253, "grad_norm": 0.40263503789901733, "learning_rate": 3.4091259855390873e-07, "loss": 0.5951, "step": 27320 }, { "epoch": 1.5167822468793344, "grad_norm": 0.2306205928325653, "learning_rate": 3.4057425670070416e-07, "loss": 0.5499, "step": 27340 }, { "epoch": 1.517891816920943, "grad_norm": 0.3210078477859497, "learning_rate": 3.4023572377798116e-07, "loss": 0.525, "step": 27360 }, { "epoch": 1.519001386962552, "grad_norm": 0.40825071930885315, "learning_rate": 3.398970004998867e-07, "loss": 0.4936, "step": 27380 }, { "epoch": 1.5201109570041609, "grad_norm": 0.23699213564395905, "learning_rate": 3.395580875809692e-07, "loss": 0.5873, "step": 27400 }, { "epoch": 1.5212205270457697, "grad_norm": 0.22610828280448914, "learning_rate": 3.3921898573617715e-07, "loss": 0.5254, "step": 27420 }, { "epoch": 1.5223300970873788, "grad_norm": 0.26229390501976013, "learning_rate": 3.3887969568085756e-07, "loss": 0.4093, "step": 27440 }, { "epoch": 1.5234396671289874, "grad_norm": 0.34853002429008484, "learning_rate": 3.3854021813075463e-07, "loss": 0.5494, "step": 27460 }, { "epoch": 1.5245492371705964, "grad_norm": 0.42384761571884155, "learning_rate": 3.382005538020078e-07, "loss": 0.5947, "step": 27480 }, { "epoch": 1.5256588072122053, "grad_norm": 0.2898954153060913, "learning_rate": 3.378607034111507e-07, "loss": 0.4911, "step": 27500 }, { "epoch": 1.526768377253814, "grad_norm": 0.3806048631668091, "learning_rate": 3.3752066767510956e-07, "loss": 0.447, "step": 27520 }, { "epoch": 1.5278779472954231, "grad_norm": 0.3625750243663788, "learning_rate": 3.371804473112014e-07, "loss": 0.5926, "step": 27540 }, { "epoch": 1.5289875173370318, "grad_norm": 0.3748339116573334, "learning_rate": 3.368400430371329e-07, "loss": 0.49, "step": 27560 }, { "epoch": 1.5300970873786408, "grad_norm": 0.2171182781457901, "learning_rate": 3.364994555709986e-07, "loss": 0.5603, "step": 27580 }, { "epoch": 1.5312066574202496, "grad_norm": 0.24239978194236755, "learning_rate": 3.3615868563127937e-07, "loss": 0.5481, "step": 27600 }, { "epoch": 1.5323162274618585, "grad_norm": 0.31223204731941223, "learning_rate": 3.3581773393684124e-07, "loss": 0.4589, "step": 27620 }, { "epoch": 1.5334257975034675, "grad_norm": 0.3772710859775543, "learning_rate": 3.354766012069337e-07, "loss": 0.4176, "step": 27640 }, { "epoch": 1.5345353675450761, "grad_norm": 0.2354874461889267, "learning_rate": 3.3513528816118775e-07, "loss": 0.3866, "step": 27660 }, { "epoch": 1.5356449375866852, "grad_norm": 0.3374041020870209, "learning_rate": 3.3479379551961516e-07, "loss": 0.4333, "step": 27680 }, { "epoch": 1.536754507628294, "grad_norm": 0.3103543221950531, "learning_rate": 3.344521240026066e-07, "loss": 0.5075, "step": 27700 }, { "epoch": 1.5378640776699029, "grad_norm": 0.3151879608631134, "learning_rate": 3.341102743309296e-07, "loss": 0.5045, "step": 27720 }, { "epoch": 1.538973647711512, "grad_norm": 0.3394266366958618, "learning_rate": 3.3376824722572803e-07, "loss": 0.5983, "step": 27740 }, { "epoch": 1.5400832177531205, "grad_norm": 0.44573837518692017, "learning_rate": 3.334260434085199e-07, "loss": 0.4152, "step": 27760 }, { "epoch": 1.5411927877947296, "grad_norm": 0.30766671895980835, "learning_rate": 3.3308366360119584e-07, "loss": 0.5279, "step": 27780 }, { "epoch": 1.5423023578363384, "grad_norm": 0.3044489026069641, "learning_rate": 3.327411085260179e-07, "loss": 0.5547, "step": 27800 }, { "epoch": 1.5434119278779472, "grad_norm": 0.26974210143089294, "learning_rate": 3.323983789056179e-07, "loss": 0.5485, "step": 27820 }, { "epoch": 1.5445214979195563, "grad_norm": 0.4256914258003235, "learning_rate": 3.3205547546299575e-07, "loss": 0.5058, "step": 27840 }, { "epoch": 1.545631067961165, "grad_norm": 1.356689691543579, "learning_rate": 3.317123989215179e-07, "loss": 0.4561, "step": 27860 }, { "epoch": 1.546740638002774, "grad_norm": 0.30659595131874084, "learning_rate": 3.313691500049165e-07, "loss": 0.5168, "step": 27880 }, { "epoch": 1.5478502080443828, "grad_norm": 0.26571470499038696, "learning_rate": 3.3102572943728673e-07, "loss": 0.5332, "step": 27900 }, { "epoch": 1.5489597780859916, "grad_norm": 0.2585001587867737, "learning_rate": 3.3068213794308624e-07, "loss": 0.4772, "step": 27920 }, { "epoch": 1.5500693481276007, "grad_norm": 0.23646335303783417, "learning_rate": 3.30338376247133e-07, "loss": 0.4898, "step": 27940 }, { "epoch": 1.5511789181692093, "grad_norm": 0.529796838760376, "learning_rate": 3.2999444507460437e-07, "loss": 0.4594, "step": 27960 }, { "epoch": 1.5522884882108183, "grad_norm": 0.2717756927013397, "learning_rate": 3.296503451510348e-07, "loss": 0.424, "step": 27980 }, { "epoch": 1.5533980582524272, "grad_norm": 0.35993632674217224, "learning_rate": 3.2930607720231513e-07, "loss": 0.481, "step": 28000 }, { "epoch": 1.554507628294036, "grad_norm": 0.3204074800014496, "learning_rate": 3.2896164195469033e-07, "loss": 0.5457, "step": 28020 }, { "epoch": 1.555617198335645, "grad_norm": 0.3053014278411865, "learning_rate": 3.2861704013475854e-07, "loss": 0.4867, "step": 28040 }, { "epoch": 1.5567267683772537, "grad_norm": 0.23565566539764404, "learning_rate": 3.2827227246946907e-07, "loss": 0.4876, "step": 28060 }, { "epoch": 1.5578363384188627, "grad_norm": 0.3225576877593994, "learning_rate": 3.279273396861214e-07, "loss": 0.4907, "step": 28080 }, { "epoch": 1.5589459084604715, "grad_norm": 0.2811879813671112, "learning_rate": 3.275822425123629e-07, "loss": 0.4639, "step": 28100 }, { "epoch": 1.5600554785020804, "grad_norm": 0.22023139894008636, "learning_rate": 3.27236981676188e-07, "loss": 0.5236, "step": 28120 }, { "epoch": 1.5611650485436894, "grad_norm": 0.2729092538356781, "learning_rate": 3.268915579059366e-07, "loss": 0.3504, "step": 28140 }, { "epoch": 1.562274618585298, "grad_norm": 0.32027074694633484, "learning_rate": 3.265459719302917e-07, "loss": 0.5132, "step": 28160 }, { "epoch": 1.563384188626907, "grad_norm": 0.341043621301651, "learning_rate": 3.26200224478279e-07, "loss": 0.4224, "step": 28180 }, { "epoch": 1.564493758668516, "grad_norm": 0.22810377180576324, "learning_rate": 3.2585431627926476e-07, "loss": 0.5314, "step": 28200 }, { "epoch": 1.5656033287101248, "grad_norm": 0.5231753587722778, "learning_rate": 3.255082480629542e-07, "loss": 0.5429, "step": 28220 }, { "epoch": 1.5667128987517338, "grad_norm": 0.3144018352031708, "learning_rate": 3.2516202055939e-07, "loss": 0.4748, "step": 28240 }, { "epoch": 1.5678224687933424, "grad_norm": 0.37708231806755066, "learning_rate": 3.248156344989512e-07, "loss": 0.3939, "step": 28260 }, { "epoch": 1.5689320388349515, "grad_norm": 0.37304365634918213, "learning_rate": 3.2446909061235106e-07, "loss": 0.5667, "step": 28280 }, { "epoch": 1.5700416088765603, "grad_norm": 0.438184916973114, "learning_rate": 3.241223896306359e-07, "loss": 0.5251, "step": 28300 }, { "epoch": 1.5711511789181691, "grad_norm": 0.25777795910835266, "learning_rate": 3.237755322851834e-07, "loss": 0.5139, "step": 28320 }, { "epoch": 1.5722607489597782, "grad_norm": 0.310349702835083, "learning_rate": 3.2342851930770103e-07, "loss": 0.5077, "step": 28340 }, { "epoch": 1.5733703190013868, "grad_norm": 0.33580219745635986, "learning_rate": 3.2308135143022475e-07, "loss": 0.4802, "step": 28360 }, { "epoch": 1.5744798890429959, "grad_norm": 0.39907827973365784, "learning_rate": 3.2273402938511706e-07, "loss": 0.4963, "step": 28380 }, { "epoch": 1.5755894590846047, "grad_norm": 0.45628952980041504, "learning_rate": 3.223865539050659e-07, "loss": 0.5515, "step": 28400 }, { "epoch": 1.5766990291262135, "grad_norm": 0.27358758449554443, "learning_rate": 3.2203892572308255e-07, "loss": 0.4145, "step": 28420 }, { "epoch": 1.5778085991678226, "grad_norm": 0.25609341263771057, "learning_rate": 3.2169114557250103e-07, "loss": 0.4556, "step": 28440 }, { "epoch": 1.5789181692094314, "grad_norm": 0.24854277074337006, "learning_rate": 3.213432141869752e-07, "loss": 0.4863, "step": 28460 }, { "epoch": 1.5800277392510402, "grad_norm": 0.39399391412734985, "learning_rate": 3.209951323004785e-07, "loss": 0.48, "step": 28480 }, { "epoch": 1.581137309292649, "grad_norm": 0.29452070593833923, "learning_rate": 3.206469006473017e-07, "loss": 0.5251, "step": 28500 }, { "epoch": 1.582246879334258, "grad_norm": 0.32692965865135193, "learning_rate": 3.202985199620514e-07, "loss": 0.5051, "step": 28520 }, { "epoch": 1.583356449375867, "grad_norm": 0.382464736700058, "learning_rate": 3.199499909796486e-07, "loss": 0.5003, "step": 28540 }, { "epoch": 1.5844660194174758, "grad_norm": 0.26259946823120117, "learning_rate": 3.196013144353274e-07, "loss": 0.5013, "step": 28560 }, { "epoch": 1.5855755894590846, "grad_norm": 0.2503226101398468, "learning_rate": 3.1925249106463294e-07, "loss": 0.4975, "step": 28580 }, { "epoch": 1.5866851595006934, "grad_norm": 0.27847903966903687, "learning_rate": 3.189035216034199e-07, "loss": 0.4302, "step": 28600 }, { "epoch": 1.5877947295423023, "grad_norm": 0.3335626423358917, "learning_rate": 3.185544067878518e-07, "loss": 0.4663, "step": 28620 }, { "epoch": 1.5889042995839113, "grad_norm": 0.32258400321006775, "learning_rate": 3.182051473543981e-07, "loss": 0.4788, "step": 28640 }, { "epoch": 1.5900138696255202, "grad_norm": 0.3986503481864929, "learning_rate": 3.1785574403983377e-07, "loss": 0.4587, "step": 28660 }, { "epoch": 1.591123439667129, "grad_norm": 0.29908812046051025, "learning_rate": 3.175061975812371e-07, "loss": 0.4209, "step": 28680 }, { "epoch": 1.5922330097087378, "grad_norm": 0.3167146146297455, "learning_rate": 3.171565087159883e-07, "loss": 0.5408, "step": 28700 }, { "epoch": 1.5933425797503467, "grad_norm": 0.3292248249053955, "learning_rate": 3.168066781817682e-07, "loss": 0.5484, "step": 28720 }, { "epoch": 1.5944521497919557, "grad_norm": 0.38223186135292053, "learning_rate": 3.1645670671655645e-07, "loss": 0.5149, "step": 28740 }, { "epoch": 1.5955617198335645, "grad_norm": 0.28162357211112976, "learning_rate": 3.161065950586298e-07, "loss": 0.477, "step": 28760 }, { "epoch": 1.5966712898751734, "grad_norm": 0.32921168208122253, "learning_rate": 3.157563439465608e-07, "loss": 0.5128, "step": 28780 }, { "epoch": 1.5977808599167822, "grad_norm": 0.22339418530464172, "learning_rate": 3.154059541192164e-07, "loss": 0.4067, "step": 28800 }, { "epoch": 1.598890429958391, "grad_norm": 0.21557636559009552, "learning_rate": 3.150554263157561e-07, "loss": 0.4716, "step": 28820 }, { "epoch": 1.6, "grad_norm": 0.27148211002349854, "learning_rate": 3.147047612756302e-07, "loss": 0.5051, "step": 28840 }, { "epoch": 1.601109570041609, "grad_norm": 0.39929234981536865, "learning_rate": 3.1435395973857876e-07, "loss": 0.4799, "step": 28860 }, { "epoch": 1.6022191400832178, "grad_norm": 0.3441343903541565, "learning_rate": 3.140030224446297e-07, "loss": 0.5458, "step": 28880 }, { "epoch": 1.6033287101248266, "grad_norm": 0.39072877168655396, "learning_rate": 3.136519501340974e-07, "loss": 0.5566, "step": 28900 }, { "epoch": 1.6044382801664354, "grad_norm": 0.2531302273273468, "learning_rate": 3.1330074354758094e-07, "loss": 0.5074, "step": 28920 }, { "epoch": 1.6055478502080445, "grad_norm": 0.3106899857521057, "learning_rate": 3.129494034259628e-07, "loss": 0.5314, "step": 28940 }, { "epoch": 1.6066574202496533, "grad_norm": 0.361161470413208, "learning_rate": 3.125979305104071e-07, "loss": 0.4433, "step": 28960 }, { "epoch": 1.6077669902912621, "grad_norm": 0.19862572848796844, "learning_rate": 3.12246325542358e-07, "loss": 0.5041, "step": 28980 }, { "epoch": 1.608876560332871, "grad_norm": 0.24220655858516693, "learning_rate": 3.118945892635383e-07, "loss": 0.4919, "step": 29000 }, { "epoch": 1.6099861303744798, "grad_norm": 0.2106894701719284, "learning_rate": 3.115427224159479e-07, "loss": 0.42, "step": 29020 }, { "epoch": 1.6110957004160888, "grad_norm": 0.30059361457824707, "learning_rate": 3.11190725741862e-07, "loss": 0.5269, "step": 29040 }, { "epoch": 1.6122052704576977, "grad_norm": 0.31522464752197266, "learning_rate": 3.1083859998382966e-07, "loss": 0.4995, "step": 29060 }, { "epoch": 1.6133148404993065, "grad_norm": 0.3345507085323334, "learning_rate": 3.104863458846724e-07, "loss": 0.5447, "step": 29080 }, { "epoch": 1.6144244105409153, "grad_norm": 0.32452645897865295, "learning_rate": 3.1013396418748234e-07, "loss": 0.4673, "step": 29100 }, { "epoch": 1.6155339805825242, "grad_norm": 0.22615940868854523, "learning_rate": 3.0978145563562093e-07, "loss": 0.4503, "step": 29120 }, { "epoch": 1.6166435506241332, "grad_norm": 0.28898218274116516, "learning_rate": 3.0942882097271696e-07, "loss": 0.5572, "step": 29140 }, { "epoch": 1.617753120665742, "grad_norm": 0.22564494609832764, "learning_rate": 3.090760609426655e-07, "loss": 0.3937, "step": 29160 }, { "epoch": 1.618862690707351, "grad_norm": 0.30136924982070923, "learning_rate": 3.0872317628962604e-07, "loss": 0.4654, "step": 29180 }, { "epoch": 1.6199722607489597, "grad_norm": 0.3415296673774719, "learning_rate": 3.083701677580208e-07, "loss": 0.4134, "step": 29200 }, { "epoch": 1.6210818307905686, "grad_norm": 0.2911455035209656, "learning_rate": 3.080170360925336e-07, "loss": 0.5795, "step": 29220 }, { "epoch": 1.6221914008321776, "grad_norm": 0.3195721507072449, "learning_rate": 3.0766378203810775e-07, "loss": 0.4975, "step": 29240 }, { "epoch": 1.6233009708737864, "grad_norm": 0.26215308904647827, "learning_rate": 3.0731040633994493e-07, "loss": 0.5678, "step": 29260 }, { "epoch": 1.6244105409153953, "grad_norm": 0.4036589562892914, "learning_rate": 3.069569097435033e-07, "loss": 0.5047, "step": 29280 }, { "epoch": 1.6255201109570043, "grad_norm": 0.32612839341163635, "learning_rate": 3.066032929944962e-07, "loss": 0.5211, "step": 29300 }, { "epoch": 1.626629680998613, "grad_norm": 0.3161689341068268, "learning_rate": 3.062495568388903e-07, "loss": 0.4639, "step": 29320 }, { "epoch": 1.627739251040222, "grad_norm": 0.34872516989707947, "learning_rate": 3.0589570202290433e-07, "loss": 0.5102, "step": 29340 }, { "epoch": 1.6288488210818308, "grad_norm": 0.2274002581834793, "learning_rate": 3.0554172929300695e-07, "loss": 0.533, "step": 29360 }, { "epoch": 1.6299583911234397, "grad_norm": 0.2623968720436096, "learning_rate": 3.051876393959162e-07, "loss": 0.4273, "step": 29380 }, { "epoch": 1.6310679611650487, "grad_norm": 0.32831913232803345, "learning_rate": 3.0483343307859663e-07, "loss": 0.4442, "step": 29400 }, { "epoch": 1.6321775312066573, "grad_norm": 0.3770638704299927, "learning_rate": 3.0447911108825897e-07, "loss": 0.4815, "step": 29420 }, { "epoch": 1.6332871012482664, "grad_norm": 0.3095746338367462, "learning_rate": 3.0412467417235745e-07, "loss": 0.4063, "step": 29440 }, { "epoch": 1.6343966712898752, "grad_norm": 0.4561549425125122, "learning_rate": 3.0377012307858904e-07, "loss": 0.5787, "step": 29460 }, { "epoch": 1.635506241331484, "grad_norm": 0.4631495773792267, "learning_rate": 3.034154585548915e-07, "loss": 0.5254, "step": 29480 }, { "epoch": 1.636615811373093, "grad_norm": 0.3094829022884369, "learning_rate": 3.0306068134944185e-07, "loss": 0.4618, "step": 29500 }, { "epoch": 1.6377253814147017, "grad_norm": 0.26945701241493225, "learning_rate": 3.027057922106549e-07, "loss": 0.4745, "step": 29520 }, { "epoch": 1.6388349514563108, "grad_norm": 0.31297236680984497, "learning_rate": 3.023507918871814e-07, "loss": 0.5437, "step": 29540 }, { "epoch": 1.6399445214979196, "grad_norm": 0.32407745718955994, "learning_rate": 3.0199568112790704e-07, "loss": 0.5942, "step": 29560 }, { "epoch": 1.6410540915395284, "grad_norm": 0.37164920568466187, "learning_rate": 3.0164046068195e-07, "loss": 0.573, "step": 29580 }, { "epoch": 1.6421636615811375, "grad_norm": 0.31025248765945435, "learning_rate": 3.0128513129866e-07, "loss": 0.4245, "step": 29600 }, { "epoch": 1.643273231622746, "grad_norm": 0.35869303345680237, "learning_rate": 3.0092969372761685e-07, "loss": 0.4382, "step": 29620 }, { "epoch": 1.6443828016643551, "grad_norm": 0.25680845975875854, "learning_rate": 3.0057414871862816e-07, "loss": 0.4148, "step": 29640 }, { "epoch": 1.645492371705964, "grad_norm": 0.3561042845249176, "learning_rate": 3.0021849702172854e-07, "loss": 0.4139, "step": 29660 }, { "epoch": 1.6466019417475728, "grad_norm": 0.2987665832042694, "learning_rate": 2.998627393871774e-07, "loss": 0.4972, "step": 29680 }, { "epoch": 1.6477115117891818, "grad_norm": 0.3060048222541809, "learning_rate": 2.9950687656545787e-07, "loss": 0.57, "step": 29700 }, { "epoch": 1.6488210818307905, "grad_norm": 0.37241366505622864, "learning_rate": 2.9915090930727474e-07, "loss": 0.5076, "step": 29720 }, { "epoch": 1.6499306518723995, "grad_norm": 0.27067720890045166, "learning_rate": 2.9879483836355323e-07, "loss": 0.4567, "step": 29740 }, { "epoch": 1.6510402219140083, "grad_norm": 0.23468643426895142, "learning_rate": 2.9843866448543727e-07, "loss": 0.5183, "step": 29760 }, { "epoch": 1.6521497919556172, "grad_norm": 0.35854992270469666, "learning_rate": 2.980823884242881e-07, "loss": 0.5279, "step": 29780 }, { "epoch": 1.6532593619972262, "grad_norm": 0.32893064618110657, "learning_rate": 2.97726010931682e-07, "loss": 0.5942, "step": 29800 }, { "epoch": 1.6543689320388348, "grad_norm": 0.4739121198654175, "learning_rate": 2.973695327594099e-07, "loss": 0.4604, "step": 29820 }, { "epoch": 1.655478502080444, "grad_norm": 0.29216960072517395, "learning_rate": 2.9701295465947477e-07, "loss": 0.5115, "step": 29840 }, { "epoch": 1.6565880721220527, "grad_norm": 0.2660972774028778, "learning_rate": 2.966562773840903e-07, "loss": 0.4512, "step": 29860 }, { "epoch": 1.6576976421636616, "grad_norm": 0.365914911031723, "learning_rate": 2.9629950168567954e-07, "loss": 0.4894, "step": 29880 }, { "epoch": 1.6588072122052706, "grad_norm": 0.26106780767440796, "learning_rate": 2.959426283168731e-07, "loss": 0.4553, "step": 29900 }, { "epoch": 1.6599167822468792, "grad_norm": 0.2803400456905365, "learning_rate": 2.955856580305078e-07, "loss": 0.45, "step": 29920 }, { "epoch": 1.6610263522884883, "grad_norm": 0.28876036405563354, "learning_rate": 2.9522859157962454e-07, "loss": 0.5181, "step": 29940 }, { "epoch": 1.662135922330097, "grad_norm": 0.3790487051010132, "learning_rate": 2.9487142971746755e-07, "loss": 0.4827, "step": 29960 }, { "epoch": 1.663245492371706, "grad_norm": 0.1955181360244751, "learning_rate": 2.9451417319748187e-07, "loss": 0.4856, "step": 29980 }, { "epoch": 1.664355062413315, "grad_norm": 0.32159459590911865, "learning_rate": 2.9415682277331265e-07, "loss": 0.434, "step": 30000 }, { "epoch": 1.6654646324549236, "grad_norm": 0.3358677327632904, "learning_rate": 2.937993791988029e-07, "loss": 0.5575, "step": 30020 }, { "epoch": 1.6665742024965327, "grad_norm": 0.2520512640476227, "learning_rate": 2.9344184322799197e-07, "loss": 0.4819, "step": 30040 }, { "epoch": 1.6676837725381415, "grad_norm": 0.3289657533168793, "learning_rate": 2.930842156151146e-07, "loss": 0.6709, "step": 30060 }, { "epoch": 1.6687933425797503, "grad_norm": 0.4591546952724457, "learning_rate": 2.927264971145984e-07, "loss": 0.4905, "step": 30080 }, { "epoch": 1.6699029126213594, "grad_norm": 0.5178128480911255, "learning_rate": 2.9236868848106296e-07, "loss": 0.4705, "step": 30100 }, { "epoch": 1.671012482662968, "grad_norm": 0.28582507371902466, "learning_rate": 2.920107904693178e-07, "loss": 0.5952, "step": 30120 }, { "epoch": 1.672122052704577, "grad_norm": 0.17550955712795258, "learning_rate": 2.916528038343613e-07, "loss": 0.452, "step": 30140 }, { "epoch": 1.6732316227461859, "grad_norm": 0.25457873940467834, "learning_rate": 2.9129472933137857e-07, "loss": 0.5889, "step": 30160 }, { "epoch": 1.6743411927877947, "grad_norm": 0.3667411506175995, "learning_rate": 2.9093656771574006e-07, "loss": 0.5109, "step": 30180 }, { "epoch": 1.6754507628294038, "grad_norm": 0.34097346663475037, "learning_rate": 2.905783197430001e-07, "loss": 0.6018, "step": 30200 }, { "epoch": 1.6765603328710124, "grad_norm": 0.3572954833507538, "learning_rate": 2.902199861688951e-07, "loss": 0.4753, "step": 30220 }, { "epoch": 1.6776699029126214, "grad_norm": 0.2566346824169159, "learning_rate": 2.8986156774934204e-07, "loss": 0.5425, "step": 30240 }, { "epoch": 1.6787794729542302, "grad_norm": 0.22325794398784637, "learning_rate": 2.895030652404371e-07, "loss": 0.4758, "step": 30260 }, { "epoch": 1.679889042995839, "grad_norm": 0.19706304371356964, "learning_rate": 2.891444793984536e-07, "loss": 0.3958, "step": 30280 }, { "epoch": 1.6809986130374481, "grad_norm": 0.36266472935676575, "learning_rate": 2.8878581097984075e-07, "loss": 0.5407, "step": 30300 }, { "epoch": 1.6821081830790567, "grad_norm": 0.279533714056015, "learning_rate": 2.8842706074122193e-07, "loss": 0.4125, "step": 30320 }, { "epoch": 1.6832177531206658, "grad_norm": 0.3364698588848114, "learning_rate": 2.8806822943939315e-07, "loss": 0.5194, "step": 30340 }, { "epoch": 1.6843273231622746, "grad_norm": 0.27354753017425537, "learning_rate": 2.877093178313214e-07, "loss": 0.5438, "step": 30360 }, { "epoch": 1.6854368932038835, "grad_norm": 0.24867723882198334, "learning_rate": 2.8735032667414315e-07, "loss": 0.5495, "step": 30380 }, { "epoch": 1.6865464632454925, "grad_norm": 0.3060404658317566, "learning_rate": 2.8699125672516254e-07, "loss": 0.4865, "step": 30400 }, { "epoch": 1.6876560332871011, "grad_norm": 0.41411465406417847, "learning_rate": 2.8663210874185013e-07, "loss": 0.4341, "step": 30420 }, { "epoch": 1.6887656033287102, "grad_norm": 0.36156442761421204, "learning_rate": 2.862728834818409e-07, "loss": 0.4191, "step": 30440 }, { "epoch": 1.689875173370319, "grad_norm": 0.2508484125137329, "learning_rate": 2.8591358170293297e-07, "loss": 0.5103, "step": 30460 }, { "epoch": 1.6909847434119278, "grad_norm": 0.26582667231559753, "learning_rate": 2.8555420416308573e-07, "loss": 0.4938, "step": 30480 }, { "epoch": 1.692094313453537, "grad_norm": 0.29400238394737244, "learning_rate": 2.851947516204186e-07, "loss": 0.462, "step": 30500 }, { "epoch": 1.6932038834951455, "grad_norm": 0.28862330317497253, "learning_rate": 2.848352248332091e-07, "loss": 0.4949, "step": 30520 }, { "epoch": 1.6943134535367546, "grad_norm": 0.2672070264816284, "learning_rate": 2.8447562455989134e-07, "loss": 0.5244, "step": 30540 }, { "epoch": 1.6954230235783634, "grad_norm": 0.3128918707370758, "learning_rate": 2.8411595155905457e-07, "loss": 0.5553, "step": 30560 }, { "epoch": 1.6965325936199722, "grad_norm": 0.3227311372756958, "learning_rate": 2.8375620658944133e-07, "loss": 0.5354, "step": 30580 }, { "epoch": 1.6976421636615813, "grad_norm": 0.2765336036682129, "learning_rate": 2.8339639040994604e-07, "loss": 0.5288, "step": 30600 }, { "epoch": 1.6987517337031899, "grad_norm": 0.3127990961074829, "learning_rate": 2.830365037796134e-07, "loss": 0.4439, "step": 30620 }, { "epoch": 1.699861303744799, "grad_norm": 0.32751116156578064, "learning_rate": 2.8267654745763656e-07, "loss": 0.579, "step": 30640 }, { "epoch": 1.7009708737864078, "grad_norm": 0.40528789162635803, "learning_rate": 2.8231652220335603e-07, "loss": 0.5668, "step": 30660 }, { "epoch": 1.7020804438280166, "grad_norm": 0.1772594451904297, "learning_rate": 2.819564287762572e-07, "loss": 0.4557, "step": 30680 }, { "epoch": 1.7031900138696257, "grad_norm": 0.48305651545524597, "learning_rate": 2.815962679359697e-07, "loss": 0.5199, "step": 30700 }, { "epoch": 1.7042995839112343, "grad_norm": 0.3395373821258545, "learning_rate": 2.812360404422653e-07, "loss": 0.5083, "step": 30720 }, { "epoch": 1.7054091539528433, "grad_norm": 0.2559817135334015, "learning_rate": 2.808757470550563e-07, "loss": 0.4611, "step": 30740 }, { "epoch": 1.7065187239944521, "grad_norm": 0.29759660363197327, "learning_rate": 2.80515388534394e-07, "loss": 0.4843, "step": 30760 }, { "epoch": 1.707628294036061, "grad_norm": 0.19269053637981415, "learning_rate": 2.80154965640467e-07, "loss": 0.4592, "step": 30780 }, { "epoch": 1.70873786407767, "grad_norm": 0.22289779782295227, "learning_rate": 2.7979447913360017e-07, "loss": 0.4618, "step": 30800 }, { "epoch": 1.7098474341192786, "grad_norm": 0.3606151342391968, "learning_rate": 2.7943392977425187e-07, "loss": 0.4639, "step": 30820 }, { "epoch": 1.7109570041608877, "grad_norm": 0.2358284443616867, "learning_rate": 2.790733183230136e-07, "loss": 0.4582, "step": 30840 }, { "epoch": 1.7120665742024965, "grad_norm": 0.7422751188278198, "learning_rate": 2.7871264554060766e-07, "loss": 0.4957, "step": 30860 }, { "epoch": 1.7131761442441054, "grad_norm": 0.2739536464214325, "learning_rate": 2.7835191218788557e-07, "loss": 0.4503, "step": 30880 }, { "epoch": 1.7142857142857144, "grad_norm": 0.31112849712371826, "learning_rate": 2.7799111902582693e-07, "loss": 0.5007, "step": 30900 }, { "epoch": 1.715395284327323, "grad_norm": 0.5001584887504578, "learning_rate": 2.7763026681553734e-07, "loss": 0.4872, "step": 30920 }, { "epoch": 1.716504854368932, "grad_norm": 0.36047807335853577, "learning_rate": 2.7726935631824694e-07, "loss": 0.5274, "step": 30940 }, { "epoch": 1.717614424410541, "grad_norm": 0.3823573589324951, "learning_rate": 2.7690838829530886e-07, "loss": 0.5245, "step": 30960 }, { "epoch": 1.7187239944521497, "grad_norm": 0.26852449774742126, "learning_rate": 2.765473635081977e-07, "loss": 0.4243, "step": 30980 }, { "epoch": 1.7198335644937588, "grad_norm": 0.2846018075942993, "learning_rate": 2.761862827185075e-07, "loss": 0.4546, "step": 31000 }, { "epoch": 1.7209431345353674, "grad_norm": 0.3950466215610504, "learning_rate": 2.758251466879508e-07, "loss": 0.49, "step": 31020 }, { "epoch": 1.7220527045769765, "grad_norm": 0.36862340569496155, "learning_rate": 2.7546395617835656e-07, "loss": 0.4718, "step": 31040 }, { "epoch": 1.7231622746185853, "grad_norm": 0.21566441655158997, "learning_rate": 2.751027119516684e-07, "loss": 0.5766, "step": 31060 }, { "epoch": 1.7242718446601941, "grad_norm": 0.30930423736572266, "learning_rate": 2.7474141476994366e-07, "loss": 0.4301, "step": 31080 }, { "epoch": 1.7253814147018032, "grad_norm": 0.3274851441383362, "learning_rate": 2.743800653953511e-07, "loss": 0.4368, "step": 31100 }, { "epoch": 1.7264909847434118, "grad_norm": 0.36067548394203186, "learning_rate": 2.740186645901699e-07, "loss": 0.4363, "step": 31120 }, { "epoch": 1.7276005547850208, "grad_norm": 0.3733062446117401, "learning_rate": 2.736572131167872e-07, "loss": 0.4926, "step": 31140 }, { "epoch": 1.7287101248266297, "grad_norm": 0.2768867611885071, "learning_rate": 2.7329571173769756e-07, "loss": 0.4902, "step": 31160 }, { "epoch": 1.7298196948682385, "grad_norm": 0.24901051819324493, "learning_rate": 2.729341612155005e-07, "loss": 0.4946, "step": 31180 }, { "epoch": 1.7309292649098476, "grad_norm": 0.31498342752456665, "learning_rate": 2.725725623128994e-07, "loss": 0.6087, "step": 31200 }, { "epoch": 1.7320388349514562, "grad_norm": 0.39936602115631104, "learning_rate": 2.7221091579269966e-07, "loss": 0.4526, "step": 31220 }, { "epoch": 1.7331484049930652, "grad_norm": 0.285976380109787, "learning_rate": 2.7184922241780697e-07, "loss": 0.5414, "step": 31240 }, { "epoch": 1.734257975034674, "grad_norm": 0.28418394923210144, "learning_rate": 2.71487482951226e-07, "loss": 0.5414, "step": 31260 }, { "epoch": 1.7353675450762829, "grad_norm": 0.24928802251815796, "learning_rate": 2.7112569815605876e-07, "loss": 0.5215, "step": 31280 }, { "epoch": 1.736477115117892, "grad_norm": 0.3119857907295227, "learning_rate": 2.707638687955025e-07, "loss": 0.5494, "step": 31300 }, { "epoch": 1.7375866851595005, "grad_norm": 0.5153653025627136, "learning_rate": 2.7040199563284894e-07, "loss": 0.5877, "step": 31320 }, { "epoch": 1.7386962552011096, "grad_norm": 0.24885863065719604, "learning_rate": 2.70040079431482e-07, "loss": 0.393, "step": 31340 }, { "epoch": 1.7398058252427184, "grad_norm": 0.463762104511261, "learning_rate": 2.6967812095487626e-07, "loss": 0.5564, "step": 31360 }, { "epoch": 1.7409153952843273, "grad_norm": 0.355584055185318, "learning_rate": 2.6931612096659566e-07, "loss": 0.4844, "step": 31380 }, { "epoch": 1.7420249653259363, "grad_norm": 0.5023449659347534, "learning_rate": 2.6895408023029175e-07, "loss": 0.5218, "step": 31400 }, { "epoch": 1.743134535367545, "grad_norm": 0.28821951150894165, "learning_rate": 2.685919995097019e-07, "loss": 0.5613, "step": 31420 }, { "epoch": 1.744244105409154, "grad_norm": 0.34325727820396423, "learning_rate": 2.682298795686478e-07, "loss": 0.4855, "step": 31440 }, { "epoch": 1.7453536754507628, "grad_norm": 0.5021767020225525, "learning_rate": 2.67867721171034e-07, "loss": 0.5383, "step": 31460 }, { "epoch": 1.7464632454923716, "grad_norm": 0.28583070635795593, "learning_rate": 2.675055250808462e-07, "loss": 0.5306, "step": 31480 }, { "epoch": 1.7475728155339807, "grad_norm": 0.28275159001350403, "learning_rate": 2.671432920621495e-07, "loss": 0.5037, "step": 31500 }, { "epoch": 1.7486823855755893, "grad_norm": 0.57793128490448, "learning_rate": 2.6678102287908686e-07, "loss": 0.5919, "step": 31520 }, { "epoch": 1.7497919556171984, "grad_norm": 0.3371540307998657, "learning_rate": 2.664187182958777e-07, "loss": 0.4734, "step": 31540 }, { "epoch": 1.7509015256588072, "grad_norm": 0.3013628125190735, "learning_rate": 2.6605637907681613e-07, "loss": 0.4833, "step": 31560 }, { "epoch": 1.752011095700416, "grad_norm": 0.3028537929058075, "learning_rate": 2.65694005986269e-07, "loss": 0.4394, "step": 31580 }, { "epoch": 1.753120665742025, "grad_norm": 0.21752415597438812, "learning_rate": 2.65331599788675e-07, "loss": 0.4717, "step": 31600 }, { "epoch": 1.7542302357836337, "grad_norm": 0.2948974072933197, "learning_rate": 2.6496916124854244e-07, "loss": 0.4704, "step": 31620 }, { "epoch": 1.7553398058252427, "grad_norm": 0.38149356842041016, "learning_rate": 2.6460669113044805e-07, "loss": 0.5134, "step": 31640 }, { "epoch": 1.7564493758668516, "grad_norm": 0.3391442596912384, "learning_rate": 2.6424419019903495e-07, "loss": 0.5296, "step": 31660 }, { "epoch": 1.7575589459084604, "grad_norm": 0.3338284492492676, "learning_rate": 2.638816592190112e-07, "loss": 0.4526, "step": 31680 }, { "epoch": 1.7586685159500695, "grad_norm": 0.4283836781978607, "learning_rate": 2.635190989551487e-07, "loss": 0.4335, "step": 31700 }, { "epoch": 1.759778085991678, "grad_norm": 0.21237985789775848, "learning_rate": 2.631565101722807e-07, "loss": 0.4812, "step": 31720 }, { "epoch": 1.7608876560332871, "grad_norm": 0.40082094073295593, "learning_rate": 2.627938936353006e-07, "loss": 0.5211, "step": 31740 }, { "epoch": 1.761997226074896, "grad_norm": 0.4209996163845062, "learning_rate": 2.6243125010916067e-07, "loss": 0.4823, "step": 31760 }, { "epoch": 1.7631067961165048, "grad_norm": 0.27289316058158875, "learning_rate": 2.6206858035887e-07, "loss": 0.484, "step": 31780 }, { "epoch": 1.7642163661581138, "grad_norm": 0.2887282371520996, "learning_rate": 2.617058851494927e-07, "loss": 0.5137, "step": 31800 }, { "epoch": 1.7653259361997224, "grad_norm": 0.3484319746494293, "learning_rate": 2.6134316524614696e-07, "loss": 0.594, "step": 31820 }, { "epoch": 1.7664355062413315, "grad_norm": 0.4235683083534241, "learning_rate": 2.60980421414003e-07, "loss": 0.5024, "step": 31840 }, { "epoch": 1.7675450762829403, "grad_norm": 0.35101646184921265, "learning_rate": 2.606176544182813e-07, "loss": 0.4447, "step": 31860 }, { "epoch": 1.7686546463245492, "grad_norm": 0.39119628071784973, "learning_rate": 2.6025486502425144e-07, "loss": 0.4998, "step": 31880 }, { "epoch": 1.7697642163661582, "grad_norm": 0.35422465205192566, "learning_rate": 2.598920539972301e-07, "loss": 0.3535, "step": 31900 }, { "epoch": 1.7708737864077668, "grad_norm": 0.41485661268234253, "learning_rate": 2.5952922210257964e-07, "loss": 0.4735, "step": 31920 }, { "epoch": 1.7719833564493759, "grad_norm": 0.34744489192962646, "learning_rate": 2.5916637010570656e-07, "loss": 0.4809, "step": 31940 }, { "epoch": 1.7730929264909847, "grad_norm": 0.31952229142189026, "learning_rate": 2.588034987720596e-07, "loss": 0.3837, "step": 31960 }, { "epoch": 1.7742024965325935, "grad_norm": 0.3148404359817505, "learning_rate": 2.584406088671284e-07, "loss": 0.4116, "step": 31980 }, { "epoch": 1.7753120665742026, "grad_norm": 0.3295808732509613, "learning_rate": 2.580777011564416e-07, "loss": 0.4851, "step": 32000 }, { "epoch": 1.7764216366158114, "grad_norm": 0.3079177439212799, "learning_rate": 2.577147764055657e-07, "loss": 0.5326, "step": 32020 }, { "epoch": 1.7775312066574203, "grad_norm": 0.28215643763542175, "learning_rate": 2.573518353801028e-07, "loss": 0.4242, "step": 32040 }, { "epoch": 1.778640776699029, "grad_norm": 0.33437782526016235, "learning_rate": 2.569888788456896e-07, "loss": 0.4981, "step": 32060 }, { "epoch": 1.779750346740638, "grad_norm": 0.4504397213459015, "learning_rate": 2.566259075679956e-07, "loss": 0.5685, "step": 32080 }, { "epoch": 1.780859916782247, "grad_norm": 0.24355018138885498, "learning_rate": 2.5626292231272086e-07, "loss": 0.4218, "step": 32100 }, { "epoch": 1.7819694868238558, "grad_norm": 0.3099643588066101, "learning_rate": 2.558999238455956e-07, "loss": 0.4148, "step": 32120 }, { "epoch": 1.7830790568654646, "grad_norm": 0.2772121727466583, "learning_rate": 2.5553691293237744e-07, "loss": 0.586, "step": 32140 }, { "epoch": 1.7841886269070735, "grad_norm": 0.3421294689178467, "learning_rate": 2.5517389033885056e-07, "loss": 0.4856, "step": 32160 }, { "epoch": 1.7852981969486823, "grad_norm": 0.25106149911880493, "learning_rate": 2.5481085683082346e-07, "loss": 0.5189, "step": 32180 }, { "epoch": 1.7864077669902914, "grad_norm": 0.3564590513706207, "learning_rate": 2.544478131741281e-07, "loss": 0.4875, "step": 32200 }, { "epoch": 1.7875173370319002, "grad_norm": 0.25506940484046936, "learning_rate": 2.540847601346173e-07, "loss": 0.471, "step": 32220 }, { "epoch": 1.788626907073509, "grad_norm": 0.341706246137619, "learning_rate": 2.537216984781642e-07, "loss": 0.5568, "step": 32240 }, { "epoch": 1.7897364771151179, "grad_norm": 0.4286288321018219, "learning_rate": 2.5335862897065983e-07, "loss": 0.5943, "step": 32260 }, { "epoch": 1.7908460471567267, "grad_norm": 0.28730303049087524, "learning_rate": 2.5299555237801176e-07, "loss": 0.5034, "step": 32280 }, { "epoch": 1.7919556171983357, "grad_norm": 0.4705427587032318, "learning_rate": 2.5263246946614263e-07, "loss": 0.4988, "step": 32300 }, { "epoch": 1.7930651872399446, "grad_norm": 0.4834686517715454, "learning_rate": 2.5226938100098843e-07, "loss": 0.49, "step": 32320 }, { "epoch": 1.7941747572815534, "grad_norm": 0.3516719341278076, "learning_rate": 2.5190628774849667e-07, "loss": 0.4999, "step": 32340 }, { "epoch": 1.7952843273231622, "grad_norm": 0.24008581042289734, "learning_rate": 2.5154319047462514e-07, "loss": 0.5243, "step": 32360 }, { "epoch": 1.796393897364771, "grad_norm": 0.285537987947464, "learning_rate": 2.511800899453402e-07, "loss": 0.5006, "step": 32380 }, { "epoch": 1.7975034674063801, "grad_norm": 0.4226382374763489, "learning_rate": 2.5081698692661475e-07, "loss": 0.5995, "step": 32400 }, { "epoch": 1.798613037447989, "grad_norm": 0.3707716166973114, "learning_rate": 2.5045388218442715e-07, "loss": 0.436, "step": 32420 }, { "epoch": 1.7997226074895978, "grad_norm": 0.3749970495700836, "learning_rate": 2.5009077648475944e-07, "loss": 0.5176, "step": 32440 }, { "epoch": 1.8008321775312066, "grad_norm": 0.3354260325431824, "learning_rate": 2.497276705935957e-07, "loss": 0.5082, "step": 32460 }, { "epoch": 1.8019417475728154, "grad_norm": 0.3313789367675781, "learning_rate": 2.4936456527692016e-07, "loss": 0.5363, "step": 32480 }, { "epoch": 1.8030513176144245, "grad_norm": 0.3185703754425049, "learning_rate": 2.49001461300716e-07, "loss": 0.4129, "step": 32500 }, { "epoch": 1.8041608876560333, "grad_norm": 0.2886795997619629, "learning_rate": 2.4863835943096386e-07, "loss": 0.567, "step": 32520 }, { "epoch": 1.8052704576976422, "grad_norm": 0.16472795605659485, "learning_rate": 2.4827526043363937e-07, "loss": 0.5241, "step": 32540 }, { "epoch": 1.806380027739251, "grad_norm": 0.3610054552555084, "learning_rate": 2.479121650747124e-07, "loss": 0.4988, "step": 32560 }, { "epoch": 1.8074895977808598, "grad_norm": 0.38332366943359375, "learning_rate": 2.4754907412014526e-07, "loss": 0.5975, "step": 32580 }, { "epoch": 1.8085991678224689, "grad_norm": 0.6330152750015259, "learning_rate": 2.4718598833589085e-07, "loss": 0.5855, "step": 32600 }, { "epoch": 1.8097087378640777, "grad_norm": 0.3983088433742523, "learning_rate": 2.468229084878911e-07, "loss": 0.528, "step": 32620 }, { "epoch": 1.8108183079056865, "grad_norm": 0.3223673105239868, "learning_rate": 2.464598353420754e-07, "loss": 0.5263, "step": 32640 }, { "epoch": 1.8119278779472954, "grad_norm": 0.23156464099884033, "learning_rate": 2.460967696643592e-07, "loss": 0.503, "step": 32660 }, { "epoch": 1.8130374479889042, "grad_norm": 0.5105547904968262, "learning_rate": 2.45733712220642e-07, "loss": 0.4407, "step": 32680 }, { "epoch": 1.8141470180305133, "grad_norm": 0.6725867390632629, "learning_rate": 2.45370663776806e-07, "loss": 0.5376, "step": 32700 }, { "epoch": 1.815256588072122, "grad_norm": 0.2037682682275772, "learning_rate": 2.4500762509871445e-07, "loss": 0.4683, "step": 32720 }, { "epoch": 1.816366158113731, "grad_norm": 0.27555859088897705, "learning_rate": 2.4464459695220987e-07, "loss": 0.4556, "step": 32740 }, { "epoch": 1.8174757281553398, "grad_norm": 0.4941754937171936, "learning_rate": 2.442815801031128e-07, "loss": 0.4526, "step": 32760 }, { "epoch": 1.8185852981969486, "grad_norm": 0.28933095932006836, "learning_rate": 2.4391857531721976e-07, "loss": 0.5054, "step": 32780 }, { "epoch": 1.8196948682385576, "grad_norm": 0.2108532041311264, "learning_rate": 2.435555833603017e-07, "loss": 0.5612, "step": 32800 }, { "epoch": 1.8208044382801665, "grad_norm": 0.36382246017456055, "learning_rate": 2.431926049981029e-07, "loss": 0.535, "step": 32820 }, { "epoch": 1.8219140083217753, "grad_norm": 0.36497408151626587, "learning_rate": 2.4282964099633863e-07, "loss": 0.5158, "step": 32840 }, { "epoch": 1.8230235783633841, "grad_norm": 0.28695353865623474, "learning_rate": 2.424666921206939e-07, "loss": 0.4803, "step": 32860 }, { "epoch": 1.824133148404993, "grad_norm": 0.2558686435222626, "learning_rate": 2.4210375913682203e-07, "loss": 0.4537, "step": 32880 }, { "epoch": 1.825242718446602, "grad_norm": 0.2963564395904541, "learning_rate": 2.4174084281034263e-07, "loss": 0.6176, "step": 32900 }, { "epoch": 1.8263522884882109, "grad_norm": 0.3271200656890869, "learning_rate": 2.413779439068401e-07, "loss": 0.5457, "step": 32920 }, { "epoch": 1.8274618585298197, "grad_norm": 0.3468210697174072, "learning_rate": 2.4101506319186234e-07, "loss": 0.3909, "step": 32940 }, { "epoch": 1.8285714285714287, "grad_norm": 0.3404419720172882, "learning_rate": 2.406522014309186e-07, "loss": 0.4748, "step": 32960 }, { "epoch": 1.8296809986130373, "grad_norm": 0.28537169098854065, "learning_rate": 2.4028935938947834e-07, "loss": 0.5204, "step": 32980 }, { "epoch": 1.8307905686546464, "grad_norm": 0.2639816105365753, "learning_rate": 2.399265378329694e-07, "loss": 0.4139, "step": 33000 }, { "epoch": 1.8319001386962552, "grad_norm": 2.110807418823242, "learning_rate": 2.3956373752677637e-07, "loss": 0.4592, "step": 33020 }, { "epoch": 1.833009708737864, "grad_norm": 0.3347322642803192, "learning_rate": 2.392009592362388e-07, "loss": 0.4915, "step": 33040 }, { "epoch": 1.8341192787794731, "grad_norm": 0.3435039222240448, "learning_rate": 2.388382037266504e-07, "loss": 0.5238, "step": 33060 }, { "epoch": 1.8352288488210817, "grad_norm": 0.25648629665374756, "learning_rate": 2.384754717632561e-07, "loss": 0.5157, "step": 33080 }, { "epoch": 1.8363384188626908, "grad_norm": 0.28603076934814453, "learning_rate": 2.3811276411125145e-07, "loss": 0.5136, "step": 33100 }, { "epoch": 1.8374479889042996, "grad_norm": 0.3912927210330963, "learning_rate": 2.3775008153578108e-07, "loss": 0.6174, "step": 33120 }, { "epoch": 1.8385575589459084, "grad_norm": 0.2747021019458771, "learning_rate": 2.3738742480193616e-07, "loss": 0.4905, "step": 33140 }, { "epoch": 1.8396671289875175, "grad_norm": 0.37825027108192444, "learning_rate": 2.370247946747535e-07, "loss": 0.4688, "step": 33160 }, { "epoch": 1.840776699029126, "grad_norm": 0.40013495087623596, "learning_rate": 2.366621919192141e-07, "loss": 0.549, "step": 33180 }, { "epoch": 1.8418862690707352, "grad_norm": 0.3099905252456665, "learning_rate": 2.3629961730024084e-07, "loss": 0.5117, "step": 33200 }, { "epoch": 1.842995839112344, "grad_norm": 0.2892189621925354, "learning_rate": 2.3593707158269736e-07, "loss": 0.4422, "step": 33220 }, { "epoch": 1.8441054091539528, "grad_norm": 0.25907695293426514, "learning_rate": 2.3557455553138645e-07, "loss": 0.4912, "step": 33240 }, { "epoch": 1.8452149791955619, "grad_norm": 0.3768417239189148, "learning_rate": 2.3521206991104816e-07, "loss": 0.4637, "step": 33260 }, { "epoch": 1.8463245492371705, "grad_norm": 0.36686909198760986, "learning_rate": 2.3484961548635837e-07, "loss": 0.5043, "step": 33280 }, { "epoch": 1.8474341192787795, "grad_norm": 0.31872138381004333, "learning_rate": 2.3448719302192729e-07, "loss": 0.5698, "step": 33300 }, { "epoch": 1.8485436893203884, "grad_norm": 0.37085631489753723, "learning_rate": 2.341248032822976e-07, "loss": 0.4876, "step": 33320 }, { "epoch": 1.8496532593619972, "grad_norm": 0.19931206107139587, "learning_rate": 2.3376244703194278e-07, "loss": 0.5031, "step": 33340 }, { "epoch": 1.8507628294036063, "grad_norm": 0.40558522939682007, "learning_rate": 2.3340012503526607e-07, "loss": 0.5489, "step": 33360 }, { "epoch": 1.8518723994452149, "grad_norm": 0.2731511890888214, "learning_rate": 2.330378380565981e-07, "loss": 0.5319, "step": 33380 }, { "epoch": 1.852981969486824, "grad_norm": 0.2972654104232788, "learning_rate": 2.3267558686019567e-07, "loss": 0.5179, "step": 33400 }, { "epoch": 1.8540915395284328, "grad_norm": 0.22593317925930023, "learning_rate": 2.323133722102404e-07, "loss": 0.4404, "step": 33420 }, { "epoch": 1.8552011095700416, "grad_norm": 0.30599352717399597, "learning_rate": 2.3195119487083643e-07, "loss": 0.5293, "step": 33440 }, { "epoch": 1.8563106796116506, "grad_norm": 0.45779117941856384, "learning_rate": 2.3158905560600926e-07, "loss": 0.4989, "step": 33460 }, { "epoch": 1.8574202496532592, "grad_norm": 0.3490274250507355, "learning_rate": 2.3122695517970434e-07, "loss": 0.5544, "step": 33480 }, { "epoch": 1.8585298196948683, "grad_norm": 0.45047861337661743, "learning_rate": 2.3086489435578503e-07, "loss": 0.5675, "step": 33500 }, { "epoch": 1.8596393897364771, "grad_norm": 0.16335023939609528, "learning_rate": 2.3050287389803087e-07, "loss": 0.4616, "step": 33520 }, { "epoch": 1.860748959778086, "grad_norm": 0.4132339060306549, "learning_rate": 2.3014089457013675e-07, "loss": 0.4824, "step": 33540 }, { "epoch": 1.861858529819695, "grad_norm": 0.21777623891830444, "learning_rate": 2.2977895713571047e-07, "loss": 0.4781, "step": 33560 }, { "epoch": 1.8629680998613036, "grad_norm": 0.28226935863494873, "learning_rate": 2.2941706235827147e-07, "loss": 0.417, "step": 33580 }, { "epoch": 1.8640776699029127, "grad_norm": 0.23804087936878204, "learning_rate": 2.2905521100124935e-07, "loss": 0.4615, "step": 33600 }, { "epoch": 1.8651872399445215, "grad_norm": 0.39199724793434143, "learning_rate": 2.28693403827982e-07, "loss": 0.4911, "step": 33620 }, { "epoch": 1.8662968099861303, "grad_norm": 0.34928959608078003, "learning_rate": 2.283316416017141e-07, "loss": 0.4439, "step": 33640 }, { "epoch": 1.8674063800277394, "grad_norm": 0.43283700942993164, "learning_rate": 2.2796992508559563e-07, "loss": 0.558, "step": 33660 }, { "epoch": 1.868515950069348, "grad_norm": 0.3502650558948517, "learning_rate": 2.2760825504267993e-07, "loss": 0.4684, "step": 33680 }, { "epoch": 1.869625520110957, "grad_norm": 0.3979421854019165, "learning_rate": 2.2724663223592243e-07, "loss": 0.5095, "step": 33700 }, { "epoch": 1.870735090152566, "grad_norm": 0.3323058485984802, "learning_rate": 2.2688505742817916e-07, "loss": 0.4125, "step": 33720 }, { "epoch": 1.8718446601941747, "grad_norm": 0.33879172801971436, "learning_rate": 2.2652353138220436e-07, "loss": 0.5019, "step": 33740 }, { "epoch": 1.8729542302357838, "grad_norm": 0.50958651304245, "learning_rate": 2.261620548606497e-07, "loss": 0.5099, "step": 33760 }, { "epoch": 1.8740638002773924, "grad_norm": 0.44224172830581665, "learning_rate": 2.258006286260626e-07, "loss": 0.5051, "step": 33780 }, { "epoch": 1.8751733703190014, "grad_norm": 0.42298296093940735, "learning_rate": 2.2543925344088407e-07, "loss": 0.481, "step": 33800 }, { "epoch": 1.8762829403606103, "grad_norm": 0.2881152629852295, "learning_rate": 2.2507793006744734e-07, "loss": 0.3972, "step": 33820 }, { "epoch": 1.877392510402219, "grad_norm": 0.3136497735977173, "learning_rate": 2.2471665926797676e-07, "loss": 0.5154, "step": 33840 }, { "epoch": 1.8785020804438282, "grad_norm": 0.27369651198387146, "learning_rate": 2.2435544180458546e-07, "loss": 0.4653, "step": 33860 }, { "epoch": 1.8796116504854368, "grad_norm": 0.29867157340049744, "learning_rate": 2.239942784392741e-07, "loss": 0.4452, "step": 33880 }, { "epoch": 1.8807212205270458, "grad_norm": 0.3626418709754944, "learning_rate": 2.2363316993392932e-07, "loss": 0.4583, "step": 33900 }, { "epoch": 1.8818307905686547, "grad_norm": 0.3023126721382141, "learning_rate": 2.2327211705032198e-07, "loss": 0.484, "step": 33920 }, { "epoch": 1.8829403606102635, "grad_norm": 0.24008943140506744, "learning_rate": 2.2291112055010546e-07, "loss": 0.4933, "step": 33940 }, { "epoch": 1.8840499306518725, "grad_norm": 0.3629247844219208, "learning_rate": 2.225501811948145e-07, "loss": 0.4078, "step": 33960 }, { "epoch": 1.8851595006934811, "grad_norm": 0.2726888060569763, "learning_rate": 2.2218929974586302e-07, "loss": 0.4322, "step": 33980 }, { "epoch": 1.8862690707350902, "grad_norm": 0.49839699268341064, "learning_rate": 2.2182847696454283e-07, "loss": 0.5129, "step": 34000 }, { "epoch": 1.887378640776699, "grad_norm": 0.36724430322647095, "learning_rate": 2.2146771361202215e-07, "loss": 0.5735, "step": 34020 }, { "epoch": 1.8884882108183079, "grad_norm": 0.3907157778739929, "learning_rate": 2.2110701044934368e-07, "loss": 0.4933, "step": 34040 }, { "epoch": 1.889597780859917, "grad_norm": 0.270027756690979, "learning_rate": 2.2074636823742304e-07, "loss": 0.4752, "step": 34060 }, { "epoch": 1.8907073509015255, "grad_norm": 0.4066958427429199, "learning_rate": 2.203857877370477e-07, "loss": 0.5235, "step": 34080 }, { "epoch": 1.8918169209431346, "grad_norm": 0.40386998653411865, "learning_rate": 2.200252697088745e-07, "loss": 0.4646, "step": 34100 }, { "epoch": 1.8929264909847434, "grad_norm": 0.4584975242614746, "learning_rate": 2.1966481491342851e-07, "loss": 0.4122, "step": 34120 }, { "epoch": 1.8940360610263522, "grad_norm": 0.2802693545818329, "learning_rate": 2.193044241111018e-07, "loss": 0.4842, "step": 34140 }, { "epoch": 1.8951456310679613, "grad_norm": 0.3565486669540405, "learning_rate": 2.1894409806215114e-07, "loss": 0.5021, "step": 34160 }, { "epoch": 1.89625520110957, "grad_norm": 0.26126715540885925, "learning_rate": 2.185838375266966e-07, "loss": 0.4888, "step": 34180 }, { "epoch": 1.897364771151179, "grad_norm": 0.3850455582141876, "learning_rate": 2.182236432647204e-07, "loss": 0.5249, "step": 34200 }, { "epoch": 1.8984743411927878, "grad_norm": 0.5834473371505737, "learning_rate": 2.1786351603606467e-07, "loss": 0.5246, "step": 34220 }, { "epoch": 1.8995839112343966, "grad_norm": 0.3303210437297821, "learning_rate": 2.175034566004302e-07, "loss": 0.6079, "step": 34240 }, { "epoch": 1.9006934812760057, "grad_norm": 0.27707669138908386, "learning_rate": 2.1714346571737485e-07, "loss": 0.4991, "step": 34260 }, { "epoch": 1.9018030513176143, "grad_norm": 0.2626209259033203, "learning_rate": 2.1678354414631166e-07, "loss": 0.5055, "step": 34280 }, { "epoch": 1.9029126213592233, "grad_norm": 0.34815868735313416, "learning_rate": 2.1642369264650788e-07, "loss": 0.5837, "step": 34300 }, { "epoch": 1.9040221914008322, "grad_norm": 0.3966573476791382, "learning_rate": 2.160639119770824e-07, "loss": 0.5064, "step": 34320 }, { "epoch": 1.905131761442441, "grad_norm": 0.3128938674926758, "learning_rate": 2.1570420289700495e-07, "loss": 0.45, "step": 34340 }, { "epoch": 1.90624133148405, "grad_norm": 0.3131668269634247, "learning_rate": 2.1534456616509444e-07, "loss": 0.533, "step": 34360 }, { "epoch": 1.9073509015256587, "grad_norm": 0.3010883927345276, "learning_rate": 2.1498500254001683e-07, "loss": 0.5209, "step": 34380 }, { "epoch": 1.9084604715672677, "grad_norm": 0.29310786724090576, "learning_rate": 2.1462551278028395e-07, "loss": 0.5577, "step": 34400 }, { "epoch": 1.9095700416088766, "grad_norm": 0.25145503878593445, "learning_rate": 2.1426609764425196e-07, "loss": 0.4535, "step": 34420 }, { "epoch": 1.9106796116504854, "grad_norm": 0.16989769041538239, "learning_rate": 2.1390675789011945e-07, "loss": 0.4278, "step": 34440 }, { "epoch": 1.9117891816920944, "grad_norm": 0.736035943031311, "learning_rate": 2.1354749427592592e-07, "loss": 0.5231, "step": 34460 }, { "epoch": 1.912898751733703, "grad_norm": 0.5227901339530945, "learning_rate": 2.131883075595505e-07, "loss": 0.4819, "step": 34480 }, { "epoch": 1.914008321775312, "grad_norm": 0.24113894999027252, "learning_rate": 2.128291984987099e-07, "loss": 0.4644, "step": 34500 }, { "epoch": 1.915117891816921, "grad_norm": 0.4220200181007385, "learning_rate": 2.12470167850957e-07, "loss": 0.4548, "step": 34520 }, { "epoch": 1.9162274618585298, "grad_norm": 0.2872553765773773, "learning_rate": 2.1211121637367944e-07, "loss": 0.4633, "step": 34540 }, { "epoch": 1.9173370319001388, "grad_norm": 0.29027271270751953, "learning_rate": 2.117523448240977e-07, "loss": 0.4725, "step": 34560 }, { "epoch": 1.9184466019417474, "grad_norm": 0.4183167517185211, "learning_rate": 2.1139355395926358e-07, "loss": 0.4417, "step": 34580 }, { "epoch": 1.9195561719833565, "grad_norm": 0.4276142716407776, "learning_rate": 2.1103484453605907e-07, "loss": 0.5137, "step": 34600 }, { "epoch": 1.9206657420249653, "grad_norm": 0.20618025958538055, "learning_rate": 2.1067621731119384e-07, "loss": 0.5177, "step": 34620 }, { "epoch": 1.9217753120665741, "grad_norm": 0.2810036242008209, "learning_rate": 2.103176730412043e-07, "loss": 0.5401, "step": 34640 }, { "epoch": 1.9228848821081832, "grad_norm": 0.2688840329647064, "learning_rate": 2.0995921248245224e-07, "loss": 0.512, "step": 34660 }, { "epoch": 1.9239944521497918, "grad_norm": 0.383444219827652, "learning_rate": 2.0960083639112243e-07, "loss": 0.4812, "step": 34680 }, { "epoch": 1.9251040221914009, "grad_norm": 0.3267551064491272, "learning_rate": 2.0924254552322152e-07, "loss": 0.4753, "step": 34700 }, { "epoch": 1.9262135922330097, "grad_norm": 0.24109479784965515, "learning_rate": 2.0888434063457654e-07, "loss": 0.6108, "step": 34720 }, { "epoch": 1.9273231622746185, "grad_norm": 0.2657538950443268, "learning_rate": 2.0852622248083308e-07, "loss": 0.552, "step": 34740 }, { "epoch": 1.9284327323162276, "grad_norm": 0.5456156134605408, "learning_rate": 2.0816819181745365e-07, "loss": 0.4368, "step": 34760 }, { "epoch": 1.9295423023578362, "grad_norm": 0.24174275994300842, "learning_rate": 2.078102493997164e-07, "loss": 0.5476, "step": 34780 }, { "epoch": 1.9306518723994452, "grad_norm": 0.33184370398521423, "learning_rate": 2.0745239598271312e-07, "loss": 0.4561, "step": 34800 }, { "epoch": 1.931761442441054, "grad_norm": 0.32091259956359863, "learning_rate": 2.070946323213479e-07, "loss": 0.4672, "step": 34820 }, { "epoch": 1.932871012482663, "grad_norm": 0.2712443172931671, "learning_rate": 2.0673695917033562e-07, "loss": 0.4876, "step": 34840 }, { "epoch": 1.933980582524272, "grad_norm": 0.30243590474128723, "learning_rate": 2.0637937728420008e-07, "loss": 0.5649, "step": 34860 }, { "epoch": 1.9350901525658806, "grad_norm": 0.3122914433479309, "learning_rate": 2.060218874172725e-07, "loss": 0.4203, "step": 34880 }, { "epoch": 1.9361997226074896, "grad_norm": 0.27437761425971985, "learning_rate": 2.0566449032369034e-07, "loss": 0.5277, "step": 34900 }, { "epoch": 1.9373092926490985, "grad_norm": 0.24069464206695557, "learning_rate": 2.0530718675739488e-07, "loss": 0.5751, "step": 34920 }, { "epoch": 1.9384188626907073, "grad_norm": 0.25943881273269653, "learning_rate": 2.049499774721303e-07, "loss": 0.4512, "step": 34940 }, { "epoch": 1.9395284327323163, "grad_norm": 0.520367443561554, "learning_rate": 2.045928632214421e-07, "loss": 0.5096, "step": 34960 }, { "epoch": 1.940638002773925, "grad_norm": 0.34084126353263855, "learning_rate": 2.0423584475867504e-07, "loss": 0.5124, "step": 34980 }, { "epoch": 1.941747572815534, "grad_norm": 0.3656160831451416, "learning_rate": 2.0387892283697166e-07, "loss": 0.5617, "step": 35000 }, { "epoch": 1.9428571428571428, "grad_norm": 0.33541634678840637, "learning_rate": 2.0352209820927135e-07, "loss": 0.5652, "step": 35020 }, { "epoch": 1.9439667128987517, "grad_norm": 0.4019622206687927, "learning_rate": 2.0316537162830784e-07, "loss": 0.5575, "step": 35040 }, { "epoch": 1.9450762829403607, "grad_norm": 0.22843344509601593, "learning_rate": 2.0280874384660815e-07, "loss": 0.5174, "step": 35060 }, { "epoch": 1.9461858529819693, "grad_norm": 0.26440104842185974, "learning_rate": 2.0245221561649094e-07, "loss": 0.6065, "step": 35080 }, { "epoch": 1.9472954230235784, "grad_norm": 0.26198408007621765, "learning_rate": 2.020957876900648e-07, "loss": 0.4758, "step": 35100 }, { "epoch": 1.9484049930651872, "grad_norm": 0.27166131138801575, "learning_rate": 2.0173946081922666e-07, "loss": 0.4114, "step": 35120 }, { "epoch": 1.949514563106796, "grad_norm": 0.3279228210449219, "learning_rate": 2.0138323575566044e-07, "loss": 0.4517, "step": 35140 }, { "epoch": 1.950624133148405, "grad_norm": 0.3527485430240631, "learning_rate": 2.0102711325083513e-07, "loss": 0.4707, "step": 35160 }, { "epoch": 1.9517337031900137, "grad_norm": 0.29928019642829895, "learning_rate": 2.0067109405600336e-07, "loss": 0.5108, "step": 35180 }, { "epoch": 1.9528432732316228, "grad_norm": 0.3652009069919586, "learning_rate": 2.0031517892220017e-07, "loss": 0.5011, "step": 35200 }, { "epoch": 1.9539528432732316, "grad_norm": 0.253216415643692, "learning_rate": 1.999593686002406e-07, "loss": 0.467, "step": 35220 }, { "epoch": 1.9550624133148404, "grad_norm": 0.27623605728149414, "learning_rate": 1.996036638407187e-07, "loss": 0.5629, "step": 35240 }, { "epoch": 1.9561719833564495, "grad_norm": 0.3090532124042511, "learning_rate": 1.9924806539400617e-07, "loss": 0.516, "step": 35260 }, { "epoch": 1.957281553398058, "grad_norm": 0.368063360452652, "learning_rate": 1.9889257401025015e-07, "loss": 0.4518, "step": 35280 }, { "epoch": 1.9583911234396671, "grad_norm": 0.4081331491470337, "learning_rate": 1.9853719043937177e-07, "loss": 0.4635, "step": 35300 }, { "epoch": 1.959500693481276, "grad_norm": 0.43525028228759766, "learning_rate": 1.9818191543106516e-07, "loss": 0.5303, "step": 35320 }, { "epoch": 1.9606102635228848, "grad_norm": 0.3463977873325348, "learning_rate": 1.978267497347951e-07, "loss": 0.4874, "step": 35340 }, { "epoch": 1.9617198335644939, "grad_norm": 0.4796011745929718, "learning_rate": 1.9747169409979587e-07, "loss": 0.5602, "step": 35360 }, { "epoch": 1.9628294036061025, "grad_norm": 0.46094682812690735, "learning_rate": 1.9711674927506966e-07, "loss": 0.5223, "step": 35380 }, { "epoch": 1.9639389736477115, "grad_norm": 0.27011212706565857, "learning_rate": 1.9676191600938474e-07, "loss": 0.4505, "step": 35400 }, { "epoch": 1.9650485436893204, "grad_norm": 0.3924122452735901, "learning_rate": 1.964071950512741e-07, "loss": 0.5663, "step": 35420 }, { "epoch": 1.9661581137309292, "grad_norm": 0.42884013056755066, "learning_rate": 1.960525871490339e-07, "loss": 0.483, "step": 35440 }, { "epoch": 1.9672676837725382, "grad_norm": 0.3565513789653778, "learning_rate": 1.9569809305072177e-07, "loss": 0.536, "step": 35460 }, { "epoch": 1.9683772538141469, "grad_norm": 0.3264450430870056, "learning_rate": 1.9534371350415504e-07, "loss": 0.4616, "step": 35480 }, { "epoch": 1.969486823855756, "grad_norm": 0.26990899443626404, "learning_rate": 1.949894492569099e-07, "loss": 0.4728, "step": 35500 }, { "epoch": 1.9705963938973647, "grad_norm": 0.31033796072006226, "learning_rate": 1.9463530105631877e-07, "loss": 0.4385, "step": 35520 }, { "epoch": 1.9717059639389736, "grad_norm": 0.2820804715156555, "learning_rate": 1.942812696494695e-07, "loss": 0.5654, "step": 35540 }, { "epoch": 1.9728155339805826, "grad_norm": 0.3638303279876709, "learning_rate": 1.9392735578320362e-07, "loss": 0.4042, "step": 35560 }, { "epoch": 1.9739251040221912, "grad_norm": 0.5269972085952759, "learning_rate": 1.9357356020411475e-07, "loss": 0.4412, "step": 35580 }, { "epoch": 1.9750346740638003, "grad_norm": 0.5268856287002563, "learning_rate": 1.9321988365854652e-07, "loss": 0.4744, "step": 35600 }, { "epoch": 1.9761442441054091, "grad_norm": 0.294504314661026, "learning_rate": 1.9286632689259217e-07, "loss": 0.495, "step": 35620 }, { "epoch": 1.977253814147018, "grad_norm": 0.934911847114563, "learning_rate": 1.925128906520917e-07, "loss": 0.5032, "step": 35640 }, { "epoch": 1.978363384188627, "grad_norm": 0.22908180952072144, "learning_rate": 1.921595756826311e-07, "loss": 0.4728, "step": 35660 }, { "epoch": 1.9794729542302358, "grad_norm": 0.27198266983032227, "learning_rate": 1.9180638272954048e-07, "loss": 0.4442, "step": 35680 }, { "epoch": 1.9805825242718447, "grad_norm": 0.37855255603790283, "learning_rate": 1.9145331253789253e-07, "loss": 0.4843, "step": 35700 }, { "epoch": 1.9816920943134535, "grad_norm": 0.4885648787021637, "learning_rate": 1.91100365852501e-07, "loss": 0.5166, "step": 35720 }, { "epoch": 1.9828016643550623, "grad_norm": 0.2763024568557739, "learning_rate": 1.9074754341791914e-07, "loss": 0.4332, "step": 35740 }, { "epoch": 1.9839112343966714, "grad_norm": 0.45498839020729065, "learning_rate": 1.90394845978438e-07, "loss": 0.5004, "step": 35760 }, { "epoch": 1.9850208044382802, "grad_norm": 0.38357841968536377, "learning_rate": 1.90042274278085e-07, "loss": 0.4622, "step": 35780 }, { "epoch": 1.986130374479889, "grad_norm": 0.23842637240886688, "learning_rate": 1.8968982906062237e-07, "loss": 0.5426, "step": 35800 }, { "epoch": 1.9872399445214979, "grad_norm": 0.47405555844306946, "learning_rate": 1.8933751106954535e-07, "loss": 0.4915, "step": 35820 }, { "epoch": 1.9883495145631067, "grad_norm": 0.2285192906856537, "learning_rate": 1.8898532104808084e-07, "loss": 0.5166, "step": 35840 }, { "epoch": 1.9894590846047158, "grad_norm": 0.2909943461418152, "learning_rate": 1.8863325973918604e-07, "loss": 0.4753, "step": 35860 }, { "epoch": 1.9905686546463246, "grad_norm": 0.29150089621543884, "learning_rate": 1.8828132788554638e-07, "loss": 0.5338, "step": 35880 }, { "epoch": 1.9916782246879334, "grad_norm": 0.37628743052482605, "learning_rate": 1.87929526229574e-07, "loss": 0.4994, "step": 35900 }, { "epoch": 1.9927877947295423, "grad_norm": 0.45627090334892273, "learning_rate": 1.8757785551340694e-07, "loss": 0.5643, "step": 35920 }, { "epoch": 1.993897364771151, "grad_norm": 0.27300581336021423, "learning_rate": 1.8722631647890657e-07, "loss": 0.5266, "step": 35940 }, { "epoch": 1.9950069348127601, "grad_norm": 0.2636503279209137, "learning_rate": 1.8687490986765653e-07, "loss": 0.485, "step": 35960 }, { "epoch": 1.996116504854369, "grad_norm": 0.3490878939628601, "learning_rate": 1.8652363642096133e-07, "loss": 0.5015, "step": 35980 }, { "epoch": 1.9972260748959778, "grad_norm": 0.3447932302951813, "learning_rate": 1.8617249687984434e-07, "loss": 0.4982, "step": 36000 }, { "epoch": 1.9983356449375866, "grad_norm": 0.2836887538433075, "learning_rate": 1.8582149198504655e-07, "loss": 0.5322, "step": 36020 }, { "epoch": 1.9994452149791955, "grad_norm": 0.3658786714076996, "learning_rate": 1.8547062247702488e-07, "loss": 0.4878, "step": 36040 }, { "epoch": 2.0005547850208045, "grad_norm": 0.4797327220439911, "learning_rate": 1.8511988909595067e-07, "loss": 0.5086, "step": 36060 }, { "epoch": 2.001664355062413, "grad_norm": 0.33241429924964905, "learning_rate": 1.8476929258170806e-07, "loss": 0.4764, "step": 36080 }, { "epoch": 2.002773925104022, "grad_norm": 0.34977927803993225, "learning_rate": 1.8441883367389254e-07, "loss": 0.5302, "step": 36100 }, { "epoch": 2.0038834951456312, "grad_norm": 0.2433873862028122, "learning_rate": 1.8406851311180926e-07, "loss": 0.5659, "step": 36120 }, { "epoch": 2.00499306518724, "grad_norm": 0.2682408094406128, "learning_rate": 1.8371833163447137e-07, "loss": 0.4875, "step": 36140 }, { "epoch": 2.006102635228849, "grad_norm": 0.2715090811252594, "learning_rate": 1.8336828998059903e-07, "loss": 0.469, "step": 36160 }, { "epoch": 2.0072122052704575, "grad_norm": 0.3055861294269562, "learning_rate": 1.8301838888861709e-07, "loss": 0.4318, "step": 36180 }, { "epoch": 2.0083217753120666, "grad_norm": 0.23031096160411835, "learning_rate": 1.826686290966538e-07, "loss": 0.4636, "step": 36200 }, { "epoch": 2.0094313453536756, "grad_norm": 0.18845374882221222, "learning_rate": 1.8231901134253975e-07, "loss": 0.4893, "step": 36220 }, { "epoch": 2.0105409153952842, "grad_norm": 0.3928230106830597, "learning_rate": 1.819695363638055e-07, "loss": 0.5167, "step": 36240 }, { "epoch": 2.0116504854368933, "grad_norm": 0.31636807322502136, "learning_rate": 1.816202048976806e-07, "loss": 0.4803, "step": 36260 }, { "epoch": 2.012760055478502, "grad_norm": 0.26821890473365784, "learning_rate": 1.8127101768109188e-07, "loss": 0.5435, "step": 36280 }, { "epoch": 2.013869625520111, "grad_norm": 0.2751017212867737, "learning_rate": 1.809219754506618e-07, "loss": 0.5128, "step": 36300 }, { "epoch": 2.01497919556172, "grad_norm": 0.3263298273086548, "learning_rate": 1.805730789427069e-07, "loss": 0.4537, "step": 36320 }, { "epoch": 2.0160887656033286, "grad_norm": 0.322975218296051, "learning_rate": 1.8022432889323646e-07, "loss": 0.4849, "step": 36340 }, { "epoch": 2.0171983356449377, "grad_norm": 0.3045096695423126, "learning_rate": 1.7987572603795078e-07, "loss": 0.5178, "step": 36360 }, { "epoch": 2.0183079056865463, "grad_norm": 0.26343196630477905, "learning_rate": 1.795272711122395e-07, "loss": 0.4415, "step": 36380 }, { "epoch": 2.0194174757281553, "grad_norm": 0.2388848215341568, "learning_rate": 1.791789648511804e-07, "loss": 0.5934, "step": 36400 }, { "epoch": 2.0205270457697644, "grad_norm": 0.4802308678627014, "learning_rate": 1.7883080798953754e-07, "loss": 0.4956, "step": 36420 }, { "epoch": 2.021636615811373, "grad_norm": 0.28895866870880127, "learning_rate": 1.784828012617597e-07, "loss": 0.4957, "step": 36440 }, { "epoch": 2.022746185852982, "grad_norm": 0.5173145532608032, "learning_rate": 1.7813494540197927e-07, "loss": 0.4727, "step": 36460 }, { "epoch": 2.0238557558945907, "grad_norm": 0.36505502462387085, "learning_rate": 1.777872411440101e-07, "loss": 0.4557, "step": 36480 }, { "epoch": 2.0249653259361997, "grad_norm": 0.32368674874305725, "learning_rate": 1.7743968922134622e-07, "loss": 0.4708, "step": 36500 }, { "epoch": 2.0260748959778088, "grad_norm": 0.3907826542854309, "learning_rate": 1.7709229036716057e-07, "loss": 0.523, "step": 36520 }, { "epoch": 2.0271844660194174, "grad_norm": 0.29961493611335754, "learning_rate": 1.767450453143029e-07, "loss": 0.5372, "step": 36540 }, { "epoch": 2.0282940360610264, "grad_norm": 0.37403976917266846, "learning_rate": 1.763979547952986e-07, "loss": 0.4807, "step": 36560 }, { "epoch": 2.029403606102635, "grad_norm": 0.3249925673007965, "learning_rate": 1.7605101954234718e-07, "loss": 0.5105, "step": 36580 }, { "epoch": 2.030513176144244, "grad_norm": 0.48967990279197693, "learning_rate": 1.757042402873205e-07, "loss": 0.4874, "step": 36600 }, { "epoch": 2.031622746185853, "grad_norm": 0.33546438813209534, "learning_rate": 1.7535761776176133e-07, "loss": 0.4582, "step": 36620 }, { "epoch": 2.0327323162274618, "grad_norm": 0.30081143975257874, "learning_rate": 1.7501115269688188e-07, "loss": 0.5263, "step": 36640 }, { "epoch": 2.033841886269071, "grad_norm": 0.26733970642089844, "learning_rate": 1.7466484582356212e-07, "loss": 0.5446, "step": 36660 }, { "epoch": 2.0349514563106794, "grad_norm": 0.33630067110061646, "learning_rate": 1.743186978723483e-07, "loss": 0.5219, "step": 36680 }, { "epoch": 2.0360610263522885, "grad_norm": 0.40593501925468445, "learning_rate": 1.7397270957345157e-07, "loss": 0.5359, "step": 36700 }, { "epoch": 2.0371705963938975, "grad_norm": 0.3428402245044708, "learning_rate": 1.736268816567461e-07, "loss": 0.469, "step": 36720 }, { "epoch": 2.038280166435506, "grad_norm": 0.5262805819511414, "learning_rate": 1.7328121485176774e-07, "loss": 0.4824, "step": 36740 }, { "epoch": 2.039389736477115, "grad_norm": 0.47333571314811707, "learning_rate": 1.729357098877128e-07, "loss": 0.5049, "step": 36760 }, { "epoch": 2.040499306518724, "grad_norm": 0.2494775354862213, "learning_rate": 1.725903674934357e-07, "loss": 0.4211, "step": 36780 }, { "epoch": 2.041608876560333, "grad_norm": 0.23243889212608337, "learning_rate": 1.7224518839744807e-07, "loss": 0.4739, "step": 36800 }, { "epoch": 2.042718446601942, "grad_norm": 0.3029046952724457, "learning_rate": 1.7190017332791728e-07, "loss": 0.495, "step": 36820 }, { "epoch": 2.0438280166435505, "grad_norm": 0.2532845139503479, "learning_rate": 1.715553230126645e-07, "loss": 0.4618, "step": 36840 }, { "epoch": 2.0449375866851596, "grad_norm": 0.5111905336380005, "learning_rate": 1.712106381791633e-07, "loss": 0.4447, "step": 36860 }, { "epoch": 2.046047156726768, "grad_norm": 0.3672882616519928, "learning_rate": 1.7086611955453825e-07, "loss": 0.5154, "step": 36880 }, { "epoch": 2.0471567267683772, "grad_norm": 0.3546757996082306, "learning_rate": 1.705217678655633e-07, "loss": 0.4995, "step": 36900 }, { "epoch": 2.0482662968099863, "grad_norm": 0.3925154209136963, "learning_rate": 1.7017758383866013e-07, "loss": 0.5134, "step": 36920 }, { "epoch": 2.049375866851595, "grad_norm": 0.2889503538608551, "learning_rate": 1.6983356819989692e-07, "loss": 0.5261, "step": 36940 }, { "epoch": 2.050485436893204, "grad_norm": 1.1773898601531982, "learning_rate": 1.6948972167498649e-07, "loss": 0.475, "step": 36960 }, { "epoch": 2.0515950069348126, "grad_norm": 0.2554333209991455, "learning_rate": 1.6914604498928488e-07, "loss": 0.6754, "step": 36980 }, { "epoch": 2.0527045769764216, "grad_norm": 0.44611790776252747, "learning_rate": 1.6880253886779e-07, "loss": 0.5599, "step": 37000 }, { "epoch": 2.0538141470180307, "grad_norm": 0.3371647298336029, "learning_rate": 1.684592040351398e-07, "loss": 0.5677, "step": 37020 }, { "epoch": 2.0549237170596393, "grad_norm": 0.33603009581565857, "learning_rate": 1.681160412156109e-07, "loss": 0.476, "step": 37040 }, { "epoch": 2.0560332871012483, "grad_norm": 0.3955811858177185, "learning_rate": 1.6777305113311734e-07, "loss": 0.5197, "step": 37060 }, { "epoch": 2.057142857142857, "grad_norm": 0.28556638956069946, "learning_rate": 1.674302345112083e-07, "loss": 0.4628, "step": 37080 }, { "epoch": 2.058252427184466, "grad_norm": 0.36463966965675354, "learning_rate": 1.6708759207306723e-07, "loss": 0.4556, "step": 37100 }, { "epoch": 2.059361997226075, "grad_norm": 0.36078131198883057, "learning_rate": 1.6674512454151036e-07, "loss": 0.4958, "step": 37120 }, { "epoch": 2.0604715672676837, "grad_norm": 0.2134753316640854, "learning_rate": 1.664028326389847e-07, "loss": 0.5533, "step": 37140 }, { "epoch": 2.0615811373092927, "grad_norm": 0.35123831033706665, "learning_rate": 1.660607170875667e-07, "loss": 0.4849, "step": 37160 }, { "epoch": 2.0626907073509013, "grad_norm": 0.38204720616340637, "learning_rate": 1.657187786089611e-07, "loss": 0.5141, "step": 37180 }, { "epoch": 2.0638002773925104, "grad_norm": 0.38928312063217163, "learning_rate": 1.6537701792449882e-07, "loss": 0.4209, "step": 37200 }, { "epoch": 2.0649098474341194, "grad_norm": 0.2671926021575928, "learning_rate": 1.6503543575513576e-07, "loss": 0.5063, "step": 37220 }, { "epoch": 2.066019417475728, "grad_norm": 0.46130502223968506, "learning_rate": 1.6469403282145138e-07, "loss": 0.5047, "step": 37240 }, { "epoch": 2.067128987517337, "grad_norm": 0.37296372652053833, "learning_rate": 1.6435280984364692e-07, "loss": 0.4436, "step": 37260 }, { "epoch": 2.0682385575589457, "grad_norm": 0.2673152685165405, "learning_rate": 1.6401176754154398e-07, "loss": 0.5601, "step": 37280 }, { "epoch": 2.0693481276005548, "grad_norm": 0.38090190291404724, "learning_rate": 1.636709066345831e-07, "loss": 0.5131, "step": 37300 }, { "epoch": 2.070457697642164, "grad_norm": 0.3103052079677582, "learning_rate": 1.633302278418221e-07, "loss": 0.5525, "step": 37320 }, { "epoch": 2.0715672676837724, "grad_norm": 0.363341748714447, "learning_rate": 1.6298973188193458e-07, "loss": 0.5362, "step": 37340 }, { "epoch": 2.0726768377253815, "grad_norm": 0.38937488198280334, "learning_rate": 1.626494194732087e-07, "loss": 0.5699, "step": 37360 }, { "epoch": 2.07378640776699, "grad_norm": 0.31126880645751953, "learning_rate": 1.6230929133354506e-07, "loss": 0.5913, "step": 37380 }, { "epoch": 2.074895977808599, "grad_norm": 0.33595922589302063, "learning_rate": 1.6196934818045566e-07, "loss": 0.5503, "step": 37400 }, { "epoch": 2.076005547850208, "grad_norm": 0.3240179717540741, "learning_rate": 1.6162959073106248e-07, "loss": 0.4163, "step": 37420 }, { "epoch": 2.077115117891817, "grad_norm": 0.24516253173351288, "learning_rate": 1.6129001970209552e-07, "loss": 0.4369, "step": 37440 }, { "epoch": 2.078224687933426, "grad_norm": 0.40774011611938477, "learning_rate": 1.609506358098914e-07, "loss": 0.5236, "step": 37460 }, { "epoch": 2.0793342579750345, "grad_norm": 0.35175076127052307, "learning_rate": 1.6061143977039239e-07, "loss": 0.4817, "step": 37480 }, { "epoch": 2.0804438280166435, "grad_norm": 0.309893399477005, "learning_rate": 1.6027243229914414e-07, "loss": 0.5567, "step": 37500 }, { "epoch": 2.0815533980582526, "grad_norm": 0.26271405816078186, "learning_rate": 1.5993361411129453e-07, "loss": 0.4499, "step": 37520 }, { "epoch": 2.082662968099861, "grad_norm": 0.3389107882976532, "learning_rate": 1.5959498592159238e-07, "loss": 0.4943, "step": 37540 }, { "epoch": 2.0837725381414702, "grad_norm": 0.3142915666103363, "learning_rate": 1.5925654844438536e-07, "loss": 0.4814, "step": 37560 }, { "epoch": 2.084882108183079, "grad_norm": 0.31424760818481445, "learning_rate": 1.5891830239361907e-07, "loss": 0.4875, "step": 37580 }, { "epoch": 2.085991678224688, "grad_norm": 0.24209196865558624, "learning_rate": 1.5858024848283523e-07, "loss": 0.455, "step": 37600 }, { "epoch": 2.087101248266297, "grad_norm": 0.3856252133846283, "learning_rate": 1.582423874251703e-07, "loss": 0.6782, "step": 37620 }, { "epoch": 2.0882108183079056, "grad_norm": 0.2996671795845032, "learning_rate": 1.5790471993335357e-07, "loss": 0.4943, "step": 37640 }, { "epoch": 2.0893203883495146, "grad_norm": 0.3790593147277832, "learning_rate": 1.5756724671970668e-07, "loss": 0.5278, "step": 37660 }, { "epoch": 2.090429958391123, "grad_norm": 0.2942624092102051, "learning_rate": 1.5722996849614066e-07, "loss": 0.4043, "step": 37680 }, { "epoch": 2.0915395284327323, "grad_norm": 0.4559268057346344, "learning_rate": 1.5689288597415555e-07, "loss": 0.5467, "step": 37700 }, { "epoch": 2.0926490984743413, "grad_norm": 0.2624240219593048, "learning_rate": 1.565559998648388e-07, "loss": 0.5071, "step": 37720 }, { "epoch": 2.09375866851595, "grad_norm": 0.5491320490837097, "learning_rate": 1.5621931087886324e-07, "loss": 0.5552, "step": 37740 }, { "epoch": 2.094868238557559, "grad_norm": 0.37411582469940186, "learning_rate": 1.5588281972648565e-07, "loss": 0.4976, "step": 37760 }, { "epoch": 2.0959778085991676, "grad_norm": 0.2763647735118866, "learning_rate": 1.5554652711754595e-07, "loss": 0.5838, "step": 37780 }, { "epoch": 2.0970873786407767, "grad_norm": 0.246172696352005, "learning_rate": 1.5521043376146494e-07, "loss": 0.4158, "step": 37800 }, { "epoch": 2.0981969486823857, "grad_norm": 0.24387991428375244, "learning_rate": 1.548745403672431e-07, "loss": 0.3899, "step": 37820 }, { "epoch": 2.0993065187239943, "grad_norm": 0.2620500922203064, "learning_rate": 1.545388476434592e-07, "loss": 0.4574, "step": 37840 }, { "epoch": 2.1004160887656034, "grad_norm": 0.3492521047592163, "learning_rate": 1.5420335629826856e-07, "loss": 0.444, "step": 37860 }, { "epoch": 2.101525658807212, "grad_norm": 0.2339642196893692, "learning_rate": 1.538680670394018e-07, "loss": 0.5694, "step": 37880 }, { "epoch": 2.102635228848821, "grad_norm": 0.9007330536842346, "learning_rate": 1.5353298057416315e-07, "loss": 0.4938, "step": 37900 }, { "epoch": 2.10374479889043, "grad_norm": 0.3108046054840088, "learning_rate": 1.5319809760942896e-07, "loss": 0.4735, "step": 37920 }, { "epoch": 2.1048543689320387, "grad_norm": 0.42015713453292847, "learning_rate": 1.5286341885164654e-07, "loss": 0.4562, "step": 37940 }, { "epoch": 2.1059639389736478, "grad_norm": 0.32474932074546814, "learning_rate": 1.5252894500683234e-07, "loss": 0.4281, "step": 37960 }, { "epoch": 2.1070735090152564, "grad_norm": 0.4648033678531647, "learning_rate": 1.5219467678057017e-07, "loss": 0.6079, "step": 37980 }, { "epoch": 2.1081830790568654, "grad_norm": 0.3230738043785095, "learning_rate": 1.5186061487801073e-07, "loss": 0.4223, "step": 38000 }, { "epoch": 2.1092926490984745, "grad_norm": 0.3691207766532898, "learning_rate": 1.515267600038689e-07, "loss": 0.6263, "step": 38020 }, { "epoch": 2.110402219140083, "grad_norm": 0.36342281103134155, "learning_rate": 1.511931128624231e-07, "loss": 0.4388, "step": 38040 }, { "epoch": 2.111511789181692, "grad_norm": 0.36960119009017944, "learning_rate": 1.5085967415751353e-07, "loss": 0.4507, "step": 38060 }, { "epoch": 2.1126213592233007, "grad_norm": 0.2285505086183548, "learning_rate": 1.505264445925406e-07, "loss": 0.4821, "step": 38080 }, { "epoch": 2.11373092926491, "grad_norm": 0.34993231296539307, "learning_rate": 1.5019342487046355e-07, "loss": 0.5455, "step": 38100 }, { "epoch": 2.114840499306519, "grad_norm": 0.3273991644382477, "learning_rate": 1.49860615693799e-07, "loss": 0.5748, "step": 38120 }, { "epoch": 2.1159500693481275, "grad_norm": 0.3436887860298157, "learning_rate": 1.4952801776461942e-07, "loss": 0.4812, "step": 38140 }, { "epoch": 2.1170596393897365, "grad_norm": 0.39011484384536743, "learning_rate": 1.4919563178455153e-07, "loss": 0.4091, "step": 38160 }, { "epoch": 2.1181692094313456, "grad_norm": 0.2777954638004303, "learning_rate": 1.4886345845477505e-07, "loss": 0.4589, "step": 38180 }, { "epoch": 2.119278779472954, "grad_norm": 0.2919027507305145, "learning_rate": 1.4853149847602114e-07, "loss": 0.5081, "step": 38200 }, { "epoch": 2.1203883495145632, "grad_norm": 0.30477458238601685, "learning_rate": 1.4819975254857066e-07, "loss": 0.4559, "step": 38220 }, { "epoch": 2.121497919556172, "grad_norm": 0.20105265080928802, "learning_rate": 1.4786822137225334e-07, "loss": 0.4908, "step": 38240 }, { "epoch": 2.122607489597781, "grad_norm": 0.3497171401977539, "learning_rate": 1.475369056464455e-07, "loss": 0.4889, "step": 38260 }, { "epoch": 2.12371705963939, "grad_norm": 0.3250243365764618, "learning_rate": 1.472058060700689e-07, "loss": 0.5051, "step": 38280 }, { "epoch": 2.1248266296809986, "grad_norm": 0.42375174164772034, "learning_rate": 1.4687492334158984e-07, "loss": 0.5421, "step": 38300 }, { "epoch": 2.1259361997226076, "grad_norm": 0.4394356310367584, "learning_rate": 1.4654425815901667e-07, "loss": 0.5779, "step": 38320 }, { "epoch": 2.127045769764216, "grad_norm": 0.33973628282546997, "learning_rate": 1.46213811219899e-07, "loss": 0.4788, "step": 38340 }, { "epoch": 2.1281553398058253, "grad_norm": 0.3649098873138428, "learning_rate": 1.4588358322132612e-07, "loss": 0.5793, "step": 38360 }, { "epoch": 2.129264909847434, "grad_norm": 0.28052374720573425, "learning_rate": 1.4555357485992537e-07, "loss": 0.4062, "step": 38380 }, { "epoch": 2.130374479889043, "grad_norm": 0.28619879484176636, "learning_rate": 1.452237868318606e-07, "loss": 0.44, "step": 38400 }, { "epoch": 2.131484049930652, "grad_norm": 0.44376733899116516, "learning_rate": 1.4489421983283125e-07, "loss": 0.4882, "step": 38420 }, { "epoch": 2.1325936199722606, "grad_norm": 0.352039098739624, "learning_rate": 1.4456487455807032e-07, "loss": 0.4966, "step": 38440 }, { "epoch": 2.1337031900138697, "grad_norm": 0.3198799192905426, "learning_rate": 1.4423575170234267e-07, "loss": 0.5213, "step": 38460 }, { "epoch": 2.1348127600554787, "grad_norm": 0.3184233009815216, "learning_rate": 1.4390685195994467e-07, "loss": 0.4252, "step": 38480 }, { "epoch": 2.1359223300970873, "grad_norm": 0.3668297231197357, "learning_rate": 1.435781760247015e-07, "loss": 0.5051, "step": 38500 }, { "epoch": 2.1370319001386964, "grad_norm": 0.34509968757629395, "learning_rate": 1.4324972458996638e-07, "loss": 0.5712, "step": 38520 }, { "epoch": 2.138141470180305, "grad_norm": 0.3810400664806366, "learning_rate": 1.42921498348619e-07, "loss": 0.5537, "step": 38540 }, { "epoch": 2.139251040221914, "grad_norm": 0.35781317949295044, "learning_rate": 1.4259349799306398e-07, "loss": 0.4972, "step": 38560 }, { "epoch": 2.140360610263523, "grad_norm": 0.30156922340393066, "learning_rate": 1.422657242152293e-07, "loss": 0.4984, "step": 38580 }, { "epoch": 2.1414701803051317, "grad_norm": 0.2956472933292389, "learning_rate": 1.4193817770656504e-07, "loss": 0.5652, "step": 38600 }, { "epoch": 2.1425797503467408, "grad_norm": 0.41743120551109314, "learning_rate": 1.4161085915804192e-07, "loss": 0.5276, "step": 38620 }, { "epoch": 2.1436893203883494, "grad_norm": 0.30117279291152954, "learning_rate": 1.4128376926014957e-07, "loss": 0.5354, "step": 38640 }, { "epoch": 2.1447988904299584, "grad_norm": 0.33347612619400024, "learning_rate": 1.4095690870289557e-07, "loss": 0.548, "step": 38660 }, { "epoch": 2.1459084604715675, "grad_norm": 0.32350632548332214, "learning_rate": 1.4063027817580353e-07, "loss": 0.4408, "step": 38680 }, { "epoch": 2.147018030513176, "grad_norm": 0.30435916781425476, "learning_rate": 1.4030387836791164e-07, "loss": 0.499, "step": 38700 }, { "epoch": 2.148127600554785, "grad_norm": 0.4015420079231262, "learning_rate": 1.399777099677718e-07, "loss": 0.5225, "step": 38720 }, { "epoch": 2.1492371705963937, "grad_norm": 0.32129380106925964, "learning_rate": 1.3965177366344743e-07, "loss": 0.4629, "step": 38740 }, { "epoch": 2.150346740638003, "grad_norm": 0.36064520478248596, "learning_rate": 1.3932607014251218e-07, "loss": 0.5055, "step": 38760 }, { "epoch": 2.151456310679612, "grad_norm": 0.4212459623813629, "learning_rate": 1.390006000920491e-07, "loss": 0.4743, "step": 38780 }, { "epoch": 2.1525658807212205, "grad_norm": 0.3538709282875061, "learning_rate": 1.3867536419864846e-07, "loss": 0.4742, "step": 38800 }, { "epoch": 2.1536754507628295, "grad_norm": 0.2761835753917694, "learning_rate": 1.3835036314840643e-07, "loss": 0.545, "step": 38820 }, { "epoch": 2.154785020804438, "grad_norm": 0.39690110087394714, "learning_rate": 1.3802559762692417e-07, "loss": 0.5275, "step": 38840 }, { "epoch": 2.155894590846047, "grad_norm": 0.36765530705451965, "learning_rate": 1.3770106831930562e-07, "loss": 0.5256, "step": 38860 }, { "epoch": 2.1570041608876562, "grad_norm": 0.22159022092819214, "learning_rate": 1.3737677591015657e-07, "loss": 0.5085, "step": 38880 }, { "epoch": 2.158113730929265, "grad_norm": 0.3043636083602905, "learning_rate": 1.3705272108358304e-07, "loss": 0.4874, "step": 38900 }, { "epoch": 2.159223300970874, "grad_norm": 0.26278674602508545, "learning_rate": 1.3672890452318986e-07, "loss": 0.5335, "step": 38920 }, { "epoch": 2.1603328710124825, "grad_norm": 0.33735939860343933, "learning_rate": 1.364053269120791e-07, "loss": 0.5155, "step": 38940 }, { "epoch": 2.1614424410540916, "grad_norm": 0.2501813769340515, "learning_rate": 1.3608198893284913e-07, "loss": 0.4073, "step": 38960 }, { "epoch": 2.1625520110957006, "grad_norm": 0.32090553641319275, "learning_rate": 1.357588912675925e-07, "loss": 0.6106, "step": 38980 }, { "epoch": 2.163661581137309, "grad_norm": 0.4311932325363159, "learning_rate": 1.3543603459789466e-07, "loss": 0.474, "step": 39000 }, { "epoch": 2.1647711511789183, "grad_norm": 0.4605388641357422, "learning_rate": 1.351134196048332e-07, "loss": 0.5517, "step": 39020 }, { "epoch": 2.165880721220527, "grad_norm": 0.3975484371185303, "learning_rate": 1.3479104696897558e-07, "loss": 0.4502, "step": 39040 }, { "epoch": 2.166990291262136, "grad_norm": 0.3403488099575043, "learning_rate": 1.3446891737037762e-07, "loss": 0.5531, "step": 39060 }, { "epoch": 2.168099861303745, "grad_norm": 0.335472047328949, "learning_rate": 1.3414703148858313e-07, "loss": 0.5118, "step": 39080 }, { "epoch": 2.1692094313453536, "grad_norm": 0.3395792245864868, "learning_rate": 1.3382539000262144e-07, "loss": 0.5259, "step": 39100 }, { "epoch": 2.1703190013869627, "grad_norm": 0.25248828530311584, "learning_rate": 1.3350399359100623e-07, "loss": 0.5946, "step": 39120 }, { "epoch": 2.1714285714285713, "grad_norm": 0.31494832038879395, "learning_rate": 1.3318284293173449e-07, "loss": 0.5181, "step": 39140 }, { "epoch": 2.1725381414701803, "grad_norm": 0.24149291217327118, "learning_rate": 1.3286193870228452e-07, "loss": 0.5202, "step": 39160 }, { "epoch": 2.1736477115117894, "grad_norm": 0.3055821359157562, "learning_rate": 1.3254128157961486e-07, "loss": 0.4186, "step": 39180 }, { "epoch": 2.174757281553398, "grad_norm": 0.30945852398872375, "learning_rate": 1.322208722401628e-07, "loss": 0.4906, "step": 39200 }, { "epoch": 2.175866851595007, "grad_norm": 0.324486643075943, "learning_rate": 1.3190071135984286e-07, "loss": 0.5227, "step": 39220 }, { "epoch": 2.1769764216366156, "grad_norm": 0.34117335081100464, "learning_rate": 1.3158079961404534e-07, "loss": 0.4648, "step": 39240 }, { "epoch": 2.1780859916782247, "grad_norm": 0.3016805052757263, "learning_rate": 1.3126113767763535e-07, "loss": 0.4665, "step": 39260 }, { "epoch": 2.1791955617198338, "grad_norm": 0.39678677916526794, "learning_rate": 1.3094172622495066e-07, "loss": 0.4352, "step": 39280 }, { "epoch": 2.1803051317614424, "grad_norm": 0.31056568026542664, "learning_rate": 1.3062256592980064e-07, "loss": 0.5466, "step": 39300 }, { "epoch": 2.1814147018030514, "grad_norm": 0.28223127126693726, "learning_rate": 1.3030365746546522e-07, "loss": 0.481, "step": 39320 }, { "epoch": 2.18252427184466, "grad_norm": 0.34469664096832275, "learning_rate": 1.2998500150469252e-07, "loss": 0.4697, "step": 39340 }, { "epoch": 2.183633841886269, "grad_norm": 0.24636615812778473, "learning_rate": 1.296665987196983e-07, "loss": 0.509, "step": 39360 }, { "epoch": 2.184743411927878, "grad_norm": 0.22806523740291595, "learning_rate": 1.2934844978216435e-07, "loss": 0.5512, "step": 39380 }, { "epoch": 2.1858529819694867, "grad_norm": 0.3925151526927948, "learning_rate": 1.290305553632368e-07, "loss": 0.4605, "step": 39400 }, { "epoch": 2.186962552011096, "grad_norm": 0.36504384875297546, "learning_rate": 1.2871291613352477e-07, "loss": 0.4209, "step": 39420 }, { "epoch": 2.1880721220527044, "grad_norm": 0.3153577148914337, "learning_rate": 1.2839553276309937e-07, "loss": 0.5425, "step": 39440 }, { "epoch": 2.1891816920943135, "grad_norm": 0.3118636906147003, "learning_rate": 1.2807840592149162e-07, "loss": 0.5442, "step": 39460 }, { "epoch": 2.1902912621359225, "grad_norm": 0.28142908215522766, "learning_rate": 1.2776153627769159e-07, "loss": 0.5088, "step": 39480 }, { "epoch": 2.191400832177531, "grad_norm": 0.3825930953025818, "learning_rate": 1.274449245001467e-07, "loss": 0.5357, "step": 39500 }, { "epoch": 2.19251040221914, "grad_norm": 0.2942569851875305, "learning_rate": 1.2712857125676044e-07, "loss": 0.6335, "step": 39520 }, { "epoch": 2.193619972260749, "grad_norm": 0.34247496724128723, "learning_rate": 1.2681247721489074e-07, "loss": 0.4206, "step": 39540 }, { "epoch": 2.194729542302358, "grad_norm": 0.36453333497047424, "learning_rate": 1.264966430413491e-07, "loss": 0.5839, "step": 39560 }, { "epoch": 2.195839112343967, "grad_norm": 0.37157902121543884, "learning_rate": 1.2618106940239853e-07, "loss": 0.4888, "step": 39580 }, { "epoch": 2.1969486823855755, "grad_norm": 0.35525181889533997, "learning_rate": 1.2586575696375238e-07, "loss": 0.4839, "step": 39600 }, { "epoch": 2.1980582524271846, "grad_norm": 0.2853974997997284, "learning_rate": 1.255507063905734e-07, "loss": 0.3924, "step": 39620 }, { "epoch": 2.199167822468793, "grad_norm": 0.3778701424598694, "learning_rate": 1.2523591834747138e-07, "loss": 0.4539, "step": 39640 }, { "epoch": 2.200277392510402, "grad_norm": 0.655086100101471, "learning_rate": 1.249213934985025e-07, "loss": 0.5212, "step": 39660 }, { "epoch": 2.2013869625520113, "grad_norm": 0.32354071736335754, "learning_rate": 1.2460713250716805e-07, "loss": 0.5377, "step": 39680 }, { "epoch": 2.20249653259362, "grad_norm": 0.2763678729534149, "learning_rate": 1.2429313603641226e-07, "loss": 0.4857, "step": 39700 }, { "epoch": 2.203606102635229, "grad_norm": 0.2953469753265381, "learning_rate": 1.2397940474862144e-07, "loss": 0.4952, "step": 39720 }, { "epoch": 2.2047156726768375, "grad_norm": 0.2942279279232025, "learning_rate": 1.2366593930562278e-07, "loss": 0.4512, "step": 39740 }, { "epoch": 2.2058252427184466, "grad_norm": 0.3509308397769928, "learning_rate": 1.2335274036868233e-07, "loss": 0.4655, "step": 39760 }, { "epoch": 2.2069348127600557, "grad_norm": 0.350507915019989, "learning_rate": 1.2303980859850402e-07, "loss": 0.5575, "step": 39780 }, { "epoch": 2.2080443828016643, "grad_norm": 0.3966754972934723, "learning_rate": 1.2272714465522827e-07, "loss": 0.5068, "step": 39800 }, { "epoch": 2.2091539528432733, "grad_norm": 0.36393553018569946, "learning_rate": 1.2241474919843043e-07, "loss": 0.4545, "step": 39820 }, { "epoch": 2.210263522884882, "grad_norm": 0.3185720443725586, "learning_rate": 1.2210262288711933e-07, "loss": 0.6028, "step": 39840 }, { "epoch": 2.211373092926491, "grad_norm": 0.3905843198299408, "learning_rate": 1.2179076637973647e-07, "loss": 0.4589, "step": 39860 }, { "epoch": 2.2124826629681, "grad_norm": 0.2925828993320465, "learning_rate": 1.214791803341538e-07, "loss": 0.5075, "step": 39880 }, { "epoch": 2.2135922330097086, "grad_norm": 0.342237263917923, "learning_rate": 1.2116786540767267e-07, "loss": 0.5803, "step": 39900 }, { "epoch": 2.2147018030513177, "grad_norm": 0.4244076609611511, "learning_rate": 1.2085682225702304e-07, "loss": 0.5171, "step": 39920 }, { "epoch": 2.2158113730929263, "grad_norm": 0.2605830729007721, "learning_rate": 1.2054605153836077e-07, "loss": 0.5234, "step": 39940 }, { "epoch": 2.2169209431345354, "grad_norm": 0.3428288400173187, "learning_rate": 1.2023555390726748e-07, "loss": 0.4514, "step": 39960 }, { "epoch": 2.2180305131761444, "grad_norm": 0.5148153305053711, "learning_rate": 1.199253300187488e-07, "loss": 0.5178, "step": 39980 }, { "epoch": 2.219140083217753, "grad_norm": 0.2536509037017822, "learning_rate": 1.1961538052723265e-07, "loss": 0.5071, "step": 40000 }, { "epoch": 2.220249653259362, "grad_norm": 0.3203612267971039, "learning_rate": 1.1930570608656803e-07, "loss": 0.5115, "step": 40020 }, { "epoch": 2.2213592233009707, "grad_norm": 0.4563620090484619, "learning_rate": 1.1899630735002409e-07, "loss": 0.496, "step": 40040 }, { "epoch": 2.2224687933425797, "grad_norm": 0.3274185061454773, "learning_rate": 1.1868718497028798e-07, "loss": 0.5512, "step": 40060 }, { "epoch": 2.223578363384189, "grad_norm": 0.4102165102958679, "learning_rate": 1.183783395994641e-07, "loss": 0.5122, "step": 40080 }, { "epoch": 2.2246879334257974, "grad_norm": 0.30960938334465027, "learning_rate": 1.1806977188907236e-07, "loss": 0.4545, "step": 40100 }, { "epoch": 2.2257975034674065, "grad_norm": 0.4142305552959442, "learning_rate": 1.17761482490047e-07, "loss": 0.5727, "step": 40120 }, { "epoch": 2.226907073509015, "grad_norm": 0.34385111927986145, "learning_rate": 1.1745347205273506e-07, "loss": 0.5081, "step": 40140 }, { "epoch": 2.228016643550624, "grad_norm": 0.3649786412715912, "learning_rate": 1.1714574122689536e-07, "loss": 0.4485, "step": 40160 }, { "epoch": 2.229126213592233, "grad_norm": 0.23757563531398773, "learning_rate": 1.1683829066169656e-07, "loss": 0.4645, "step": 40180 }, { "epoch": 2.230235783633842, "grad_norm": 0.26587429642677307, "learning_rate": 1.1653112100571619e-07, "loss": 0.5078, "step": 40200 }, { "epoch": 2.231345353675451, "grad_norm": 0.3870731294155121, "learning_rate": 1.162242329069395e-07, "loss": 0.4421, "step": 40220 }, { "epoch": 2.2324549237170594, "grad_norm": 0.38600581884384155, "learning_rate": 1.1591762701275723e-07, "loss": 0.4922, "step": 40240 }, { "epoch": 2.2335644937586685, "grad_norm": 0.21039997041225433, "learning_rate": 1.1561130396996508e-07, "loss": 0.4365, "step": 40260 }, { "epoch": 2.2346740638002776, "grad_norm": 0.3185214102268219, "learning_rate": 1.1530526442476226e-07, "loss": 0.4576, "step": 40280 }, { "epoch": 2.235783633841886, "grad_norm": 0.2597521245479584, "learning_rate": 1.1499950902274968e-07, "loss": 0.4908, "step": 40300 }, { "epoch": 2.236893203883495, "grad_norm": 0.3822251558303833, "learning_rate": 1.146940384089288e-07, "loss": 0.5693, "step": 40320 }, { "epoch": 2.238002773925104, "grad_norm": 0.3780229389667511, "learning_rate": 1.1438885322770062e-07, "loss": 0.463, "step": 40340 }, { "epoch": 2.239112343966713, "grad_norm": 0.41160452365875244, "learning_rate": 1.1408395412286365e-07, "loss": 0.5103, "step": 40360 }, { "epoch": 2.240221914008322, "grad_norm": 0.2547391653060913, "learning_rate": 1.1377934173761311e-07, "loss": 0.4918, "step": 40380 }, { "epoch": 2.2413314840499305, "grad_norm": 0.24462980031967163, "learning_rate": 1.1347501671453935e-07, "loss": 0.5173, "step": 40400 }, { "epoch": 2.2424410540915396, "grad_norm": 0.297760546207428, "learning_rate": 1.1317097969562647e-07, "loss": 0.4749, "step": 40420 }, { "epoch": 2.243550624133148, "grad_norm": 0.4383184313774109, "learning_rate": 1.1286723132225095e-07, "loss": 0.4449, "step": 40440 }, { "epoch": 2.2446601941747573, "grad_norm": 0.3082912862300873, "learning_rate": 1.1256377223518068e-07, "loss": 0.5519, "step": 40460 }, { "epoch": 2.2457697642163663, "grad_norm": 0.3220030963420868, "learning_rate": 1.1226060307457297e-07, "loss": 0.4995, "step": 40480 }, { "epoch": 2.246879334257975, "grad_norm": 0.29188576340675354, "learning_rate": 1.1195772447997348e-07, "loss": 0.4666, "step": 40500 }, { "epoch": 2.247988904299584, "grad_norm": 0.24548663198947906, "learning_rate": 1.116551370903154e-07, "loss": 0.4002, "step": 40520 }, { "epoch": 2.2490984743411926, "grad_norm": 0.46225422620773315, "learning_rate": 1.1135284154391694e-07, "loss": 0.4673, "step": 40540 }, { "epoch": 2.2502080443828016, "grad_norm": 0.35080599784851074, "learning_rate": 1.1105083847848101e-07, "loss": 0.6099, "step": 40560 }, { "epoch": 2.2513176144244107, "grad_norm": 0.36472031474113464, "learning_rate": 1.1074912853109364e-07, "loss": 0.6005, "step": 40580 }, { "epoch": 2.2524271844660193, "grad_norm": 0.26884734630584717, "learning_rate": 1.104477123382223e-07, "loss": 0.5572, "step": 40600 }, { "epoch": 2.2535367545076284, "grad_norm": 0.24598993360996246, "learning_rate": 1.1014659053571476e-07, "loss": 0.4303, "step": 40620 }, { "epoch": 2.2546463245492374, "grad_norm": 0.4250015914440155, "learning_rate": 1.0984576375879803e-07, "loss": 0.5128, "step": 40640 }, { "epoch": 2.255755894590846, "grad_norm": 0.359209269285202, "learning_rate": 1.0954523264207646e-07, "loss": 0.5249, "step": 40660 }, { "epoch": 2.256865464632455, "grad_norm": 0.248770609498024, "learning_rate": 1.092449978195308e-07, "loss": 0.5572, "step": 40680 }, { "epoch": 2.2579750346740637, "grad_norm": 0.2718575894832611, "learning_rate": 1.0894505992451682e-07, "loss": 0.484, "step": 40700 }, { "epoch": 2.2590846047156727, "grad_norm": 0.3930140435695648, "learning_rate": 1.0864541958976378e-07, "loss": 0.4952, "step": 40720 }, { "epoch": 2.260194174757282, "grad_norm": 0.6200651526451111, "learning_rate": 1.0834607744737329e-07, "loss": 0.4776, "step": 40740 }, { "epoch": 2.2613037447988904, "grad_norm": 0.22395218908786774, "learning_rate": 1.0804703412881802e-07, "loss": 0.5156, "step": 40760 }, { "epoch": 2.2624133148404995, "grad_norm": 0.29515132308006287, "learning_rate": 1.0774829026494011e-07, "loss": 0.4479, "step": 40780 }, { "epoch": 2.263522884882108, "grad_norm": 0.3003232479095459, "learning_rate": 1.0744984648595006e-07, "loss": 0.4863, "step": 40800 }, { "epoch": 2.264632454923717, "grad_norm": 0.3430972695350647, "learning_rate": 1.0715170342142532e-07, "loss": 0.4809, "step": 40820 }, { "epoch": 2.265742024965326, "grad_norm": 0.6491120457649231, "learning_rate": 1.0685386170030894e-07, "loss": 0.54, "step": 40840 }, { "epoch": 2.266851595006935, "grad_norm": 0.25227710604667664, "learning_rate": 1.0655632195090822e-07, "loss": 0.4213, "step": 40860 }, { "epoch": 2.267961165048544, "grad_norm": 0.2641170024871826, "learning_rate": 1.0625908480089371e-07, "loss": 0.3851, "step": 40880 }, { "epoch": 2.2690707350901524, "grad_norm": 0.24986933171749115, "learning_rate": 1.0596215087729732e-07, "loss": 0.5428, "step": 40900 }, { "epoch": 2.2701803051317615, "grad_norm": 0.3207925856113434, "learning_rate": 1.0566552080651133e-07, "loss": 0.5266, "step": 40920 }, { "epoch": 2.2712898751733706, "grad_norm": 0.23741750419139862, "learning_rate": 1.053691952142873e-07, "loss": 0.4839, "step": 40940 }, { "epoch": 2.272399445214979, "grad_norm": 0.3868573307991028, "learning_rate": 1.0507317472573418e-07, "loss": 0.5405, "step": 40960 }, { "epoch": 2.273509015256588, "grad_norm": 0.2984812557697296, "learning_rate": 1.0477745996531739e-07, "loss": 0.4968, "step": 40980 }, { "epoch": 2.274618585298197, "grad_norm": 0.38597699999809265, "learning_rate": 1.0448205155685744e-07, "loss": 0.5796, "step": 41000 }, { "epoch": 2.275728155339806, "grad_norm": 0.36040642857551575, "learning_rate": 1.0418695012352854e-07, "loss": 0.4754, "step": 41020 }, { "epoch": 2.276837725381415, "grad_norm": 0.3200027346611023, "learning_rate": 1.0389215628785725e-07, "loss": 0.4726, "step": 41040 }, { "epoch": 2.2779472954230235, "grad_norm": 0.40186208486557007, "learning_rate": 1.0359767067172151e-07, "loss": 0.4925, "step": 41060 }, { "epoch": 2.2790568654646326, "grad_norm": 0.3643325865268707, "learning_rate": 1.0330349389634882e-07, "loss": 0.4856, "step": 41080 }, { "epoch": 2.280166435506241, "grad_norm": 0.22765520215034485, "learning_rate": 1.0300962658231521e-07, "loss": 0.5707, "step": 41100 }, { "epoch": 2.2812760055478503, "grad_norm": 0.3594735562801361, "learning_rate": 1.0271606934954391e-07, "loss": 0.4974, "step": 41120 }, { "epoch": 2.2823855755894593, "grad_norm": 0.30506420135498047, "learning_rate": 1.0242282281730402e-07, "loss": 0.4429, "step": 41140 }, { "epoch": 2.283495145631068, "grad_norm": 0.31707510352134705, "learning_rate": 1.0212988760420918e-07, "loss": 0.542, "step": 41160 }, { "epoch": 2.284604715672677, "grad_norm": 0.33363673090934753, "learning_rate": 1.0183726432821643e-07, "loss": 0.4955, "step": 41180 }, { "epoch": 2.2857142857142856, "grad_norm": 0.4339458644390106, "learning_rate": 1.0154495360662463e-07, "loss": 0.5316, "step": 41200 }, { "epoch": 2.2868238557558946, "grad_norm": 0.32692259550094604, "learning_rate": 1.0125295605607324e-07, "loss": 0.4439, "step": 41220 }, { "epoch": 2.2879334257975037, "grad_norm": 0.32738301157951355, "learning_rate": 1.0096127229254128e-07, "loss": 0.5431, "step": 41240 }, { "epoch": 2.2890429958391123, "grad_norm": 0.33724966645240784, "learning_rate": 1.0066990293134567e-07, "loss": 0.4717, "step": 41260 }, { "epoch": 2.2901525658807214, "grad_norm": 0.296539843082428, "learning_rate": 1.0037884858714012e-07, "loss": 0.4757, "step": 41280 }, { "epoch": 2.29126213592233, "grad_norm": 0.3162001669406891, "learning_rate": 1.000881098739138e-07, "loss": 0.4939, "step": 41300 }, { "epoch": 2.292371705963939, "grad_norm": 0.19192631542682648, "learning_rate": 9.979768740499006e-08, "loss": 0.44, "step": 41320 }, { "epoch": 2.293481276005548, "grad_norm": 0.3006276488304138, "learning_rate": 9.950758179302504e-08, "loss": 0.4121, "step": 41340 }, { "epoch": 2.2945908460471567, "grad_norm": 0.4388786554336548, "learning_rate": 9.921779365000668e-08, "loss": 0.473, "step": 41360 }, { "epoch": 2.2957004160887657, "grad_norm": 0.3584080636501312, "learning_rate": 9.892832358725298e-08, "loss": 0.4483, "step": 41380 }, { "epoch": 2.2968099861303743, "grad_norm": 0.297114759683609, "learning_rate": 9.863917221541104e-08, "loss": 0.5627, "step": 41400 }, { "epoch": 2.2979195561719834, "grad_norm": 0.3420100212097168, "learning_rate": 9.835034014445561e-08, "loss": 0.4989, "step": 41420 }, { "epoch": 2.2990291262135925, "grad_norm": 0.2852362394332886, "learning_rate": 9.806182798368798e-08, "loss": 0.5818, "step": 41440 }, { "epoch": 2.300138696255201, "grad_norm": 0.3422713577747345, "learning_rate": 9.777363634173436e-08, "loss": 0.4433, "step": 41460 }, { "epoch": 2.30124826629681, "grad_norm": 0.39617717266082764, "learning_rate": 9.748576582654514e-08, "loss": 0.5065, "step": 41480 }, { "epoch": 2.3023578363384187, "grad_norm": 0.34729039669036865, "learning_rate": 9.719821704539291e-08, "loss": 0.4077, "step": 41500 }, { "epoch": 2.303467406380028, "grad_norm": 0.5843175649642944, "learning_rate": 9.691099060487196e-08, "loss": 0.4357, "step": 41520 }, { "epoch": 2.304576976421637, "grad_norm": 0.4193805754184723, "learning_rate": 9.662408711089626e-08, "loss": 0.6325, "step": 41540 }, { "epoch": 2.3056865464632454, "grad_norm": 0.21688449382781982, "learning_rate": 9.633750716869863e-08, "loss": 0.5129, "step": 41560 }, { "epoch": 2.3067961165048545, "grad_norm": 0.3370996415615082, "learning_rate": 9.605125138282935e-08, "loss": 0.551, "step": 41580 }, { "epoch": 2.307905686546463, "grad_norm": 0.32774749398231506, "learning_rate": 9.576532035715485e-08, "loss": 0.4544, "step": 41600 }, { "epoch": 2.309015256588072, "grad_norm": 0.3826076090335846, "learning_rate": 9.547971469485638e-08, "loss": 0.5031, "step": 41620 }, { "epoch": 2.310124826629681, "grad_norm": 0.33858364820480347, "learning_rate": 9.519443499842919e-08, "loss": 0.5205, "step": 41640 }, { "epoch": 2.31123439667129, "grad_norm": 0.31076088547706604, "learning_rate": 9.490948186968051e-08, "loss": 0.4732, "step": 41660 }, { "epoch": 2.312343966712899, "grad_norm": 0.6869786381721497, "learning_rate": 9.462485590972869e-08, "loss": 0.4242, "step": 41680 }, { "epoch": 2.3134535367545075, "grad_norm": 0.3365934491157532, "learning_rate": 9.434055771900227e-08, "loss": 0.5237, "step": 41700 }, { "epoch": 2.3145631067961165, "grad_norm": 0.3064265549182892, "learning_rate": 9.405658789723784e-08, "loss": 0.5844, "step": 41720 }, { "epoch": 2.3156726768377256, "grad_norm": 0.28392043709754944, "learning_rate": 9.377294704347955e-08, "loss": 0.4673, "step": 41740 }, { "epoch": 2.316782246879334, "grad_norm": 0.32479995489120483, "learning_rate": 9.348963575607771e-08, "loss": 0.4465, "step": 41760 }, { "epoch": 2.3178918169209433, "grad_norm": 0.24451689422130585, "learning_rate": 9.320665463268718e-08, "loss": 0.5395, "step": 41780 }, { "epoch": 2.319001386962552, "grad_norm": 0.3727029263973236, "learning_rate": 9.292400427026628e-08, "loss": 0.5177, "step": 41800 }, { "epoch": 2.320110957004161, "grad_norm": 0.5698169469833374, "learning_rate": 9.264168526507593e-08, "loss": 0.5147, "step": 41820 }, { "epoch": 2.32122052704577, "grad_norm": 0.33081555366516113, "learning_rate": 9.23596982126777e-08, "loss": 0.4584, "step": 41840 }, { "epoch": 2.3223300970873786, "grad_norm": 0.2574847936630249, "learning_rate": 9.207804370793295e-08, "loss": 0.4253, "step": 41860 }, { "epoch": 2.3234396671289876, "grad_norm": 0.36542317271232605, "learning_rate": 9.179672234500166e-08, "loss": 0.5536, "step": 41880 }, { "epoch": 2.3245492371705962, "grad_norm": 0.242173969745636, "learning_rate": 9.15157347173409e-08, "loss": 0.4867, "step": 41900 }, { "epoch": 2.3256588072122053, "grad_norm": 0.33480873703956604, "learning_rate": 9.123508141770367e-08, "loss": 0.4476, "step": 41920 }, { "epoch": 2.3267683772538144, "grad_norm": 0.41106659173965454, "learning_rate": 9.095476303813796e-08, "loss": 0.6361, "step": 41940 }, { "epoch": 2.327877947295423, "grad_norm": 0.28075936436653137, "learning_rate": 9.067478016998497e-08, "loss": 0.3721, "step": 41960 }, { "epoch": 2.328987517337032, "grad_norm": 0.2947435975074768, "learning_rate": 9.039513340387816e-08, "loss": 0.4817, "step": 41980 }, { "epoch": 2.3300970873786406, "grad_norm": 0.2995186746120453, "learning_rate": 9.011582332974227e-08, "loss": 0.5681, "step": 42000 }, { "epoch": 2.3312066574202497, "grad_norm": 0.3285873830318451, "learning_rate": 8.983685053679125e-08, "loss": 0.534, "step": 42020 }, { "epoch": 2.3323162274618587, "grad_norm": 0.3830658197402954, "learning_rate": 8.955821561352785e-08, "loss": 0.4772, "step": 42040 }, { "epoch": 2.3334257975034673, "grad_norm": 0.28851670026779175, "learning_rate": 8.927991914774227e-08, "loss": 0.4746, "step": 42060 }, { "epoch": 2.3345353675450764, "grad_norm": 0.35229215025901794, "learning_rate": 8.900196172651033e-08, "loss": 0.5892, "step": 42080 }, { "epoch": 2.335644937586685, "grad_norm": 0.2901206612586975, "learning_rate": 8.872434393619277e-08, "loss": 0.5064, "step": 42100 }, { "epoch": 2.336754507628294, "grad_norm": 0.28118959069252014, "learning_rate": 8.844706636243404e-08, "loss": 0.5849, "step": 42120 }, { "epoch": 2.337864077669903, "grad_norm": 0.27726155519485474, "learning_rate": 8.817012959016066e-08, "loss": 0.5584, "step": 42140 }, { "epoch": 2.3389736477115117, "grad_norm": 0.4250234365463257, "learning_rate": 8.789353420358032e-08, "loss": 0.5181, "step": 42160 }, { "epoch": 2.340083217753121, "grad_norm": 0.3469075560569763, "learning_rate": 8.761728078618049e-08, "loss": 0.4026, "step": 42180 }, { "epoch": 2.3411927877947294, "grad_norm": 0.43628400564193726, "learning_rate": 8.734136992072733e-08, "loss": 0.5175, "step": 42200 }, { "epoch": 2.3423023578363384, "grad_norm": 0.22146722674369812, "learning_rate": 8.706580218926421e-08, "loss": 0.6083, "step": 42220 }, { "epoch": 2.3434119278779475, "grad_norm": 0.48523247241973877, "learning_rate": 8.679057817311095e-08, "loss": 0.5215, "step": 42240 }, { "epoch": 2.344521497919556, "grad_norm": 0.2753826379776001, "learning_rate": 8.651569845286202e-08, "loss": 0.4968, "step": 42260 }, { "epoch": 2.345631067961165, "grad_norm": 0.38380247354507446, "learning_rate": 8.624116360838556e-08, "loss": 0.476, "step": 42280 }, { "epoch": 2.3467406380027738, "grad_norm": 0.31504160165786743, "learning_rate": 8.596697421882257e-08, "loss": 0.5232, "step": 42300 }, { "epoch": 2.347850208044383, "grad_norm": 0.43794184923171997, "learning_rate": 8.569313086258478e-08, "loss": 0.5038, "step": 42320 }, { "epoch": 2.348959778085992, "grad_norm": 0.29483968019485474, "learning_rate": 8.541963411735417e-08, "loss": 0.5646, "step": 42340 }, { "epoch": 2.3500693481276005, "grad_norm": 0.2609615623950958, "learning_rate": 8.514648456008173e-08, "loss": 0.5554, "step": 42360 }, { "epoch": 2.3511789181692095, "grad_norm": 0.29072296619415283, "learning_rate": 8.487368276698579e-08, "loss": 0.5307, "step": 42380 }, { "epoch": 2.352288488210818, "grad_norm": 0.39302295446395874, "learning_rate": 8.460122931355107e-08, "loss": 0.5426, "step": 42400 }, { "epoch": 2.353398058252427, "grad_norm": 0.3284793198108673, "learning_rate": 8.43291247745277e-08, "loss": 0.5076, "step": 42420 }, { "epoch": 2.3545076282940363, "grad_norm": 0.31506726145744324, "learning_rate": 8.405736972392946e-08, "loss": 0.4849, "step": 42440 }, { "epoch": 2.355617198335645, "grad_norm": 0.3414899706840515, "learning_rate": 8.378596473503304e-08, "loss": 0.5092, "step": 42460 }, { "epoch": 2.356726768377254, "grad_norm": 0.3895412087440491, "learning_rate": 8.351491038037662e-08, "loss": 0.5612, "step": 42480 }, { "epoch": 2.3578363384188625, "grad_norm": 0.3313666582107544, "learning_rate": 8.324420723175871e-08, "loss": 0.5123, "step": 42500 }, { "epoch": 2.3589459084604716, "grad_norm": 0.3673238158226013, "learning_rate": 8.297385586023683e-08, "loss": 0.5074, "step": 42520 }, { "epoch": 2.3600554785020806, "grad_norm": 0.31858721375465393, "learning_rate": 8.270385683612674e-08, "loss": 0.4792, "step": 42540 }, { "epoch": 2.3611650485436892, "grad_norm": 0.29056671261787415, "learning_rate": 8.243421072900059e-08, "loss": 0.446, "step": 42560 }, { "epoch": 2.3622746185852983, "grad_norm": 0.40199610590934753, "learning_rate": 8.21649181076861e-08, "loss": 0.5147, "step": 42580 }, { "epoch": 2.363384188626907, "grad_norm": 0.3294355869293213, "learning_rate": 8.189597954026539e-08, "loss": 0.4298, "step": 42600 }, { "epoch": 2.364493758668516, "grad_norm": 0.2900794744491577, "learning_rate": 8.162739559407364e-08, "loss": 0.5295, "step": 42620 }, { "epoch": 2.365603328710125, "grad_norm": 0.29145869612693787, "learning_rate": 8.135916683569785e-08, "loss": 0.5451, "step": 42640 }, { "epoch": 2.3667128987517336, "grad_norm": 0.4100785553455353, "learning_rate": 8.1091293830976e-08, "loss": 0.5674, "step": 42660 }, { "epoch": 2.3678224687933427, "grad_norm": 0.316750168800354, "learning_rate": 8.082377714499536e-08, "loss": 0.5323, "step": 42680 }, { "epoch": 2.3689320388349513, "grad_norm": 0.2979957163333893, "learning_rate": 8.055661734209151e-08, "loss": 0.4566, "step": 42700 }, { "epoch": 2.3700416088765603, "grad_norm": 0.3870287835597992, "learning_rate": 8.028981498584745e-08, "loss": 0.5271, "step": 42720 }, { "epoch": 2.3711511789181694, "grad_norm": 0.43149611353874207, "learning_rate": 8.002337063909187e-08, "loss": 0.469, "step": 42740 }, { "epoch": 2.372260748959778, "grad_norm": 0.23622258007526398, "learning_rate": 7.975728486389829e-08, "loss": 0.5567, "step": 42760 }, { "epoch": 2.373370319001387, "grad_norm": 0.38464128971099854, "learning_rate": 7.949155822158385e-08, "loss": 0.4559, "step": 42780 }, { "epoch": 2.3744798890429957, "grad_norm": 0.22643379867076874, "learning_rate": 7.922619127270804e-08, "loss": 0.5095, "step": 42800 }, { "epoch": 2.3755894590846047, "grad_norm": 0.24212543666362762, "learning_rate": 7.896118457707158e-08, "loss": 0.4167, "step": 42820 }, { "epoch": 2.376699029126214, "grad_norm": 0.34148135781288147, "learning_rate": 7.869653869371528e-08, "loss": 0.5313, "step": 42840 }, { "epoch": 2.3778085991678224, "grad_norm": 0.367553174495697, "learning_rate": 7.843225418091878e-08, "loss": 0.5177, "step": 42860 }, { "epoch": 2.3789181692094314, "grad_norm": 0.4402429759502411, "learning_rate": 7.816833159619932e-08, "loss": 0.5294, "step": 42880 }, { "epoch": 2.38002773925104, "grad_norm": 0.3316340446472168, "learning_rate": 7.790477149631072e-08, "loss": 0.4941, "step": 42900 }, { "epoch": 2.381137309292649, "grad_norm": 0.32757988572120667, "learning_rate": 7.764157443724209e-08, "loss": 0.4449, "step": 42920 }, { "epoch": 2.382246879334258, "grad_norm": 0.28232768177986145, "learning_rate": 7.737874097421662e-08, "loss": 0.5449, "step": 42940 }, { "epoch": 2.3833564493758668, "grad_norm": 0.2633494734764099, "learning_rate": 7.711627166169073e-08, "loss": 0.4966, "step": 42960 }, { "epoch": 2.384466019417476, "grad_norm": 0.40396252274513245, "learning_rate": 7.685416705335243e-08, "loss": 0.5622, "step": 42980 }, { "epoch": 2.3855755894590844, "grad_norm": 0.2596459984779358, "learning_rate": 7.659242770212032e-08, "loss": 0.4452, "step": 43000 }, { "epoch": 2.3866851595006935, "grad_norm": 0.31763955950737, "learning_rate": 7.633105416014277e-08, "loss": 0.4462, "step": 43020 }, { "epoch": 2.3877947295423025, "grad_norm": 0.924512505531311, "learning_rate": 7.607004697879615e-08, "loss": 0.5982, "step": 43040 }, { "epoch": 2.388904299583911, "grad_norm": 0.3287283778190613, "learning_rate": 7.580940670868419e-08, "loss": 0.4764, "step": 43060 }, { "epoch": 2.39001386962552, "grad_norm": 0.3352401852607727, "learning_rate": 7.554913389963646e-08, "loss": 0.5368, "step": 43080 }, { "epoch": 2.391123439667129, "grad_norm": 0.38678836822509766, "learning_rate": 7.528922910070748e-08, "loss": 0.4845, "step": 43100 }, { "epoch": 2.392233009708738, "grad_norm": 0.2506890892982483, "learning_rate": 7.502969286017527e-08, "loss": 0.5832, "step": 43120 }, { "epoch": 2.393342579750347, "grad_norm": 0.301279217004776, "learning_rate": 7.477052572554065e-08, "loss": 0.4107, "step": 43140 }, { "epoch": 2.3944521497919555, "grad_norm": 0.33143892884254456, "learning_rate": 7.451172824352559e-08, "loss": 0.5618, "step": 43160 }, { "epoch": 2.3955617198335646, "grad_norm": 0.34349754452705383, "learning_rate": 7.425330096007223e-08, "loss": 0.4775, "step": 43180 }, { "epoch": 2.396671289875173, "grad_norm": 0.36729952692985535, "learning_rate": 7.399524442034188e-08, "loss": 0.5857, "step": 43200 }, { "epoch": 2.3977808599167822, "grad_norm": 0.3756861984729767, "learning_rate": 7.373755916871371e-08, "loss": 0.4868, "step": 43220 }, { "epoch": 2.3988904299583913, "grad_norm": 0.27576547861099243, "learning_rate": 7.348024574878355e-08, "loss": 0.4276, "step": 43240 }, { "epoch": 2.4, "grad_norm": 0.2955794334411621, "learning_rate": 7.322330470336313e-08, "loss": 0.5018, "step": 43260 }, { "epoch": 2.401109570041609, "grad_norm": 0.4296010434627533, "learning_rate": 7.296673657447833e-08, "loss": 0.5238, "step": 43280 }, { "epoch": 2.4022191400832176, "grad_norm": 0.2728475034236908, "learning_rate": 7.271054190336839e-08, "loss": 0.4732, "step": 43300 }, { "epoch": 2.4033287101248266, "grad_norm": 0.35518208146095276, "learning_rate": 7.245472123048499e-08, "loss": 0.5106, "step": 43320 }, { "epoch": 2.4044382801664357, "grad_norm": 0.27670395374298096, "learning_rate": 7.219927509549057e-08, "loss": 0.4951, "step": 43340 }, { "epoch": 2.4055478502080443, "grad_norm": 0.3181087374687195, "learning_rate": 7.194420403725754e-08, "loss": 0.401, "step": 43360 }, { "epoch": 2.4066574202496533, "grad_norm": 0.29299256205558777, "learning_rate": 7.168950859386714e-08, "loss": 0.5158, "step": 43380 }, { "epoch": 2.407766990291262, "grad_norm": 0.3201359212398529, "learning_rate": 7.143518930260813e-08, "loss": 0.487, "step": 43400 }, { "epoch": 2.408876560332871, "grad_norm": 0.33857691287994385, "learning_rate": 7.11812466999758e-08, "loss": 0.49, "step": 43420 }, { "epoch": 2.40998613037448, "grad_norm": 0.3936862349510193, "learning_rate": 7.092768132167098e-08, "loss": 0.4823, "step": 43440 }, { "epoch": 2.4110957004160887, "grad_norm": 0.2676030099391937, "learning_rate": 7.067449370259848e-08, "loss": 0.4816, "step": 43460 }, { "epoch": 2.4122052704576977, "grad_norm": 0.27712443470954895, "learning_rate": 7.042168437686633e-08, "loss": 0.4576, "step": 43480 }, { "epoch": 2.4133148404993063, "grad_norm": 0.3102083206176758, "learning_rate": 7.01692538777845e-08, "loss": 0.4988, "step": 43500 }, { "epoch": 2.4144244105409154, "grad_norm": 0.24610158801078796, "learning_rate": 6.991720273786387e-08, "loss": 0.5064, "step": 43520 }, { "epoch": 2.4155339805825244, "grad_norm": 0.25599372386932373, "learning_rate": 6.966553148881491e-08, "loss": 0.5361, "step": 43540 }, { "epoch": 2.416643550624133, "grad_norm": 0.2825527489185333, "learning_rate": 6.941424066154697e-08, "loss": 0.4642, "step": 43560 }, { "epoch": 2.417753120665742, "grad_norm": 0.3034735321998596, "learning_rate": 6.916333078616663e-08, "loss": 0.5781, "step": 43580 }, { "epoch": 2.4188626907073507, "grad_norm": 0.39200812578201294, "learning_rate": 6.891280239197683e-08, "loss": 0.466, "step": 43600 }, { "epoch": 2.4199722607489598, "grad_norm": 0.346649169921875, "learning_rate": 6.866265600747604e-08, "loss": 0.5633, "step": 43620 }, { "epoch": 2.421081830790569, "grad_norm": 0.374523401260376, "learning_rate": 6.84128921603566e-08, "loss": 0.4992, "step": 43640 }, { "epoch": 2.4221914008321774, "grad_norm": 0.3052015006542206, "learning_rate": 6.816351137750398e-08, "loss": 0.4115, "step": 43660 }, { "epoch": 2.4233009708737865, "grad_norm": 0.2451123148202896, "learning_rate": 6.79145141849955e-08, "loss": 0.58, "step": 43680 }, { "epoch": 2.424410540915395, "grad_norm": 0.29950588941574097, "learning_rate": 6.766590110809933e-08, "loss": 0.5216, "step": 43700 }, { "epoch": 2.425520110957004, "grad_norm": 0.34000176191329956, "learning_rate": 6.74176726712733e-08, "loss": 0.5624, "step": 43720 }, { "epoch": 2.426629680998613, "grad_norm": 0.24485866725444794, "learning_rate": 6.716982939816398e-08, "loss": 0.4523, "step": 43740 }, { "epoch": 2.427739251040222, "grad_norm": 0.4311103820800781, "learning_rate": 6.692237181160523e-08, "loss": 0.5027, "step": 43760 }, { "epoch": 2.428848821081831, "grad_norm": 0.27732953429222107, "learning_rate": 6.667530043361735e-08, "loss": 0.6103, "step": 43780 }, { "epoch": 2.4299583911234395, "grad_norm": 0.4089241921901703, "learning_rate": 6.642861578540595e-08, "loss": 0.501, "step": 43800 }, { "epoch": 2.4310679611650485, "grad_norm": 0.28103822469711304, "learning_rate": 6.618231838736082e-08, "loss": 0.4628, "step": 43820 }, { "epoch": 2.4321775312066576, "grad_norm": 0.27052149176597595, "learning_rate": 6.59364087590547e-08, "loss": 0.4665, "step": 43840 }, { "epoch": 2.433287101248266, "grad_norm": 0.23979775607585907, "learning_rate": 6.569088741924261e-08, "loss": 0.4451, "step": 43860 }, { "epoch": 2.4343966712898752, "grad_norm": 0.3018569350242615, "learning_rate": 6.544575488586018e-08, "loss": 0.4619, "step": 43880 }, { "epoch": 2.435506241331484, "grad_norm": 0.650049090385437, "learning_rate": 6.520101167602293e-08, "loss": 0.5352, "step": 43900 }, { "epoch": 2.436615811373093, "grad_norm": 0.3691965639591217, "learning_rate": 6.495665830602518e-08, "loss": 0.5568, "step": 43920 }, { "epoch": 2.437725381414702, "grad_norm": 0.595257580280304, "learning_rate": 6.471269529133874e-08, "loss": 0.3967, "step": 43940 }, { "epoch": 2.4388349514563106, "grad_norm": 0.43867751955986023, "learning_rate": 6.446912314661205e-08, "loss": 0.5555, "step": 43960 }, { "epoch": 2.4399445214979196, "grad_norm": 0.49037832021713257, "learning_rate": 6.42259423856689e-08, "loss": 0.4883, "step": 43980 }, { "epoch": 2.4410540915395282, "grad_norm": 0.3651586174964905, "learning_rate": 6.398315352150749e-08, "loss": 0.5624, "step": 44000 }, { "epoch": 2.4421636615811373, "grad_norm": 0.3260118067264557, "learning_rate": 6.374075706629925e-08, "loss": 0.5441, "step": 44020 }, { "epoch": 2.4432732316227463, "grad_norm": 0.2301023006439209, "learning_rate": 6.349875353138801e-08, "loss": 0.5494, "step": 44040 }, { "epoch": 2.444382801664355, "grad_norm": 0.3623005151748657, "learning_rate": 6.325714342728847e-08, "loss": 0.5103, "step": 44060 }, { "epoch": 2.445492371705964, "grad_norm": 0.28617408871650696, "learning_rate": 6.301592726368551e-08, "loss": 0.5869, "step": 44080 }, { "epoch": 2.4466019417475726, "grad_norm": 0.3933884799480438, "learning_rate": 6.277510554943294e-08, "loss": 0.4689, "step": 44100 }, { "epoch": 2.4477115117891817, "grad_norm": 0.30622199177742004, "learning_rate": 6.253467879255247e-08, "loss": 0.4639, "step": 44120 }, { "epoch": 2.4488210818307907, "grad_norm": 0.3007581830024719, "learning_rate": 6.229464750023258e-08, "loss": 0.457, "step": 44140 }, { "epoch": 2.4499306518723993, "grad_norm": 0.3437626361846924, "learning_rate": 6.205501217882766e-08, "loss": 0.4846, "step": 44160 }, { "epoch": 2.4510402219140084, "grad_norm": 0.33551496267318726, "learning_rate": 6.181577333385665e-08, "loss": 0.4987, "step": 44180 }, { "epoch": 2.452149791955617, "grad_norm": 0.2560988664627075, "learning_rate": 6.157693147000206e-08, "loss": 0.5092, "step": 44200 }, { "epoch": 2.453259361997226, "grad_norm": 0.26454198360443115, "learning_rate": 6.13384870911092e-08, "loss": 0.5352, "step": 44220 }, { "epoch": 2.454368932038835, "grad_norm": 0.36754098534584045, "learning_rate": 6.110044070018466e-08, "loss": 0.3713, "step": 44240 }, { "epoch": 2.4554785020804437, "grad_norm": 0.31598976254463196, "learning_rate": 6.086279279939538e-08, "loss": 0.5121, "step": 44260 }, { "epoch": 2.4565880721220528, "grad_norm": 0.3758425712585449, "learning_rate": 6.062554389006794e-08, "loss": 0.4608, "step": 44280 }, { "epoch": 2.4576976421636614, "grad_norm": 0.4358053505420685, "learning_rate": 6.038869447268707e-08, "loss": 0.4237, "step": 44300 }, { "epoch": 2.4588072122052704, "grad_norm": 0.31615889072418213, "learning_rate": 6.015224504689468e-08, "loss": 0.5664, "step": 44320 }, { "epoch": 2.4599167822468795, "grad_norm": 0.25300636887550354, "learning_rate": 5.991619611148918e-08, "loss": 0.5065, "step": 44340 }, { "epoch": 2.461026352288488, "grad_norm": 0.3130981922149658, "learning_rate": 5.96805481644238e-08, "loss": 0.4434, "step": 44360 }, { "epoch": 2.462135922330097, "grad_norm": 0.32278120517730713, "learning_rate": 5.9445301702806094e-08, "loss": 0.4439, "step": 44380 }, { "epoch": 2.4632454923717058, "grad_norm": 0.49549317359924316, "learning_rate": 5.9210457222896524e-08, "loss": 0.4358, "step": 44400 }, { "epoch": 2.464355062413315, "grad_norm": 0.34794139862060547, "learning_rate": 5.8976015220107645e-08, "loss": 0.5261, "step": 44420 }, { "epoch": 2.465464632454924, "grad_norm": 0.2752000391483307, "learning_rate": 5.8741976189002865e-08, "loss": 0.6256, "step": 44440 }, { "epoch": 2.4665742024965325, "grad_norm": 0.38202446699142456, "learning_rate": 5.850834062329574e-08, "loss": 0.514, "step": 44460 }, { "epoch": 2.4676837725381415, "grad_norm": 0.2991621494293213, "learning_rate": 5.8275109015848505e-08, "loss": 0.409, "step": 44480 }, { "epoch": 2.46879334257975, "grad_norm": 0.2662794589996338, "learning_rate": 5.804228185867116e-08, "loss": 0.507, "step": 44500 }, { "epoch": 2.469902912621359, "grad_norm": 0.2497805655002594, "learning_rate": 5.780985964292079e-08, "loss": 0.5104, "step": 44520 }, { "epoch": 2.4710124826629682, "grad_norm": 0.2597384452819824, "learning_rate": 5.757784285890011e-08, "loss": 0.5693, "step": 44540 }, { "epoch": 2.472122052704577, "grad_norm": 0.4971882402896881, "learning_rate": 5.734623199605626e-08, "loss": 0.4467, "step": 44560 }, { "epoch": 2.473231622746186, "grad_norm": 0.3779316246509552, "learning_rate": 5.711502754298059e-08, "loss": 0.5103, "step": 44580 }, { "epoch": 2.4743411927877945, "grad_norm": 0.2746543288230896, "learning_rate": 5.688422998740677e-08, "loss": 0.5967, "step": 44600 }, { "epoch": 2.4754507628294036, "grad_norm": 0.34571078419685364, "learning_rate": 5.665383981621014e-08, "loss": 0.468, "step": 44620 }, { "epoch": 2.4765603328710126, "grad_norm": 0.25493454933166504, "learning_rate": 5.6423857515406876e-08, "loss": 0.4637, "step": 44640 }, { "epoch": 2.4776699029126212, "grad_norm": 0.3291374444961548, "learning_rate": 5.619428357015249e-08, "loss": 0.4298, "step": 44660 }, { "epoch": 2.4787794729542303, "grad_norm": 0.2322039157152176, "learning_rate": 5.596511846474114e-08, "loss": 0.4799, "step": 44680 }, { "epoch": 2.479889042995839, "grad_norm": 0.36044350266456604, "learning_rate": 5.573636268260451e-08, "loss": 0.5645, "step": 44700 }, { "epoch": 2.480998613037448, "grad_norm": 0.28443822264671326, "learning_rate": 5.550801670631083e-08, "loss": 0.4982, "step": 44720 }, { "epoch": 2.482108183079057, "grad_norm": 0.5585693717002869, "learning_rate": 5.528008101756371e-08, "loss": 0.5468, "step": 44740 }, { "epoch": 2.4832177531206656, "grad_norm": 0.4081180989742279, "learning_rate": 5.5052556097201525e-08, "loss": 0.4575, "step": 44760 }, { "epoch": 2.4843273231622747, "grad_norm": 0.2768988013267517, "learning_rate": 5.482544242519585e-08, "loss": 0.5575, "step": 44780 }, { "epoch": 2.4854368932038833, "grad_norm": 0.3548398017883301, "learning_rate": 5.459874048065072e-08, "loss": 0.614, "step": 44800 }, { "epoch": 2.4865464632454923, "grad_norm": 0.28826358914375305, "learning_rate": 5.437245074180191e-08, "loss": 0.5371, "step": 44820 }, { "epoch": 2.4876560332871014, "grad_norm": 0.4172697961330414, "learning_rate": 5.4146573686015354e-08, "loss": 0.4931, "step": 44840 }, { "epoch": 2.48876560332871, "grad_norm": 0.2831767797470093, "learning_rate": 5.3921109789786313e-08, "loss": 0.5248, "step": 44860 }, { "epoch": 2.489875173370319, "grad_norm": 0.2847730815410614, "learning_rate": 5.369605952873887e-08, "loss": 0.4043, "step": 44880 }, { "epoch": 2.4909847434119277, "grad_norm": 0.3498815894126892, "learning_rate": 5.347142337762425e-08, "loss": 0.5704, "step": 44900 }, { "epoch": 2.4920943134535367, "grad_norm": 0.33715176582336426, "learning_rate": 5.324720181032008e-08, "loss": 0.507, "step": 44920 }, { "epoch": 2.4932038834951458, "grad_norm": 0.31990915536880493, "learning_rate": 5.302339529982961e-08, "loss": 0.527, "step": 44940 }, { "epoch": 2.4943134535367544, "grad_norm": 0.3989587724208832, "learning_rate": 5.280000431828036e-08, "loss": 0.5949, "step": 44960 }, { "epoch": 2.4954230235783634, "grad_norm": 0.31968551874160767, "learning_rate": 5.2577029336923264e-08, "loss": 0.4847, "step": 44980 }, { "epoch": 2.496532593619972, "grad_norm": 0.27839186787605286, "learning_rate": 5.2354470826131785e-08, "loss": 0.5472, "step": 45000 }, { "epoch": 2.497642163661581, "grad_norm": 0.2550245225429535, "learning_rate": 5.213232925540073e-08, "loss": 0.6293, "step": 45020 }, { "epoch": 2.49875173370319, "grad_norm": 0.3091263771057129, "learning_rate": 5.1910605093345384e-08, "loss": 0.5004, "step": 45040 }, { "epoch": 2.4998613037447988, "grad_norm": 0.3626408278942108, "learning_rate": 5.168929880770062e-08, "loss": 0.4945, "step": 45060 }, { "epoch": 2.500970873786408, "grad_norm": 0.14909061789512634, "learning_rate": 5.1468410865319554e-08, "loss": 0.4684, "step": 45080 }, { "epoch": 2.5020804438280164, "grad_norm": 0.4817771017551422, "learning_rate": 5.124794173217303e-08, "loss": 0.5038, "step": 45100 }, { "epoch": 2.5031900138696255, "grad_norm": 0.46438390016555786, "learning_rate": 5.102789187334827e-08, "loss": 0.4765, "step": 45120 }, { "epoch": 2.5042995839112345, "grad_norm": 0.3004396855831146, "learning_rate": 5.080826175304798e-08, "loss": 0.501, "step": 45140 }, { "epoch": 2.505409153952843, "grad_norm": 0.3741270899772644, "learning_rate": 5.058905183458953e-08, "loss": 0.5124, "step": 45160 }, { "epoch": 2.506518723994452, "grad_norm": 0.2659628093242645, "learning_rate": 5.0370262580403775e-08, "loss": 0.4377, "step": 45180 }, { "epoch": 2.507628294036061, "grad_norm": 0.3916822373867035, "learning_rate": 5.0151894452034105e-08, "loss": 0.3723, "step": 45200 }, { "epoch": 2.50873786407767, "grad_norm": 0.28140518069267273, "learning_rate": 4.9933947910135805e-08, "loss": 0.4607, "step": 45220 }, { "epoch": 2.509847434119279, "grad_norm": 0.362035870552063, "learning_rate": 4.9716423414474515e-08, "loss": 0.4773, "step": 45240 }, { "epoch": 2.5109570041608875, "grad_norm": 0.2963012456893921, "learning_rate": 4.949932142392568e-08, "loss": 0.4572, "step": 45260 }, { "epoch": 2.5120665742024966, "grad_norm": 0.46244707703590393, "learning_rate": 4.928264239647342e-08, "loss": 0.5745, "step": 45280 }, { "epoch": 2.513176144244105, "grad_norm": 0.32611292600631714, "learning_rate": 4.906638678920963e-08, "loss": 0.4715, "step": 45300 }, { "epoch": 2.5142857142857142, "grad_norm": 0.3377882242202759, "learning_rate": 4.885055505833291e-08, "loss": 0.4526, "step": 45320 }, { "epoch": 2.5153952843273233, "grad_norm": 0.5239116549491882, "learning_rate": 4.863514765914786e-08, "loss": 0.5409, "step": 45340 }, { "epoch": 2.516504854368932, "grad_norm": 0.4017760455608368, "learning_rate": 4.842016504606375e-08, "loss": 0.5193, "step": 45360 }, { "epoch": 2.517614424410541, "grad_norm": 0.4771023392677307, "learning_rate": 4.820560767259374e-08, "loss": 0.5166, "step": 45380 }, { "epoch": 2.5187239944521496, "grad_norm": 0.513019859790802, "learning_rate": 4.79914759913542e-08, "loss": 0.526, "step": 45400 }, { "epoch": 2.5198335644937586, "grad_norm": 0.24345675110816956, "learning_rate": 4.777777045406314e-08, "loss": 0.4311, "step": 45420 }, { "epoch": 2.5209431345353677, "grad_norm": 0.47122231125831604, "learning_rate": 4.756449151153985e-08, "loss": 0.5066, "step": 45440 }, { "epoch": 2.5220527045769763, "grad_norm": 0.22235077619552612, "learning_rate": 4.735163961370356e-08, "loss": 0.549, "step": 45460 }, { "epoch": 2.5231622746185853, "grad_norm": 0.6681892275810242, "learning_rate": 4.71392152095727e-08, "loss": 0.4563, "step": 45480 }, { "epoch": 2.524271844660194, "grad_norm": 0.2912302613258362, "learning_rate": 4.6927218747263826e-08, "loss": 0.4611, "step": 45500 }, { "epoch": 2.525381414701803, "grad_norm": 0.245005264878273, "learning_rate": 4.671565067399091e-08, "loss": 0.4494, "step": 45520 }, { "epoch": 2.526490984743412, "grad_norm": 0.3203307092189789, "learning_rate": 4.6504511436064014e-08, "loss": 0.4389, "step": 45540 }, { "epoch": 2.5276005547850207, "grad_norm": 0.2912476360797882, "learning_rate": 4.6293801478888675e-08, "loss": 0.464, "step": 45560 }, { "epoch": 2.5287101248266297, "grad_norm": 0.326294481754303, "learning_rate": 4.60835212469648e-08, "loss": 0.5397, "step": 45580 }, { "epoch": 2.5298196948682383, "grad_norm": 0.5682638883590698, "learning_rate": 4.587367118388577e-08, "loss": 0.4704, "step": 45600 }, { "epoch": 2.5309292649098474, "grad_norm": 0.31526440382003784, "learning_rate": 4.566425173233751e-08, "loss": 0.4949, "step": 45620 }, { "epoch": 2.5320388349514564, "grad_norm": 0.19022785127162933, "learning_rate": 4.545526333409769e-08, "loss": 0.4227, "step": 45640 }, { "epoch": 2.533148404993065, "grad_norm": 0.17607265710830688, "learning_rate": 4.5246706430034445e-08, "loss": 0.5606, "step": 45660 }, { "epoch": 2.534257975034674, "grad_norm": 0.2546526789665222, "learning_rate": 4.503858146010575e-08, "loss": 0.5271, "step": 45680 }, { "epoch": 2.5353675450762827, "grad_norm": 0.35845425724983215, "learning_rate": 4.483088886335848e-08, "loss": 0.4797, "step": 45700 }, { "epoch": 2.5364771151178918, "grad_norm": 0.2444685995578766, "learning_rate": 4.4623629077927296e-08, "loss": 0.4588, "step": 45720 }, { "epoch": 2.537586685159501, "grad_norm": 0.2122899442911148, "learning_rate": 4.441680254103386e-08, "loss": 0.5253, "step": 45740 }, { "epoch": 2.5386962552011094, "grad_norm": 0.4066406786441803, "learning_rate": 4.4210409688985875e-08, "loss": 0.5891, "step": 45760 }, { "epoch": 2.5398058252427185, "grad_norm": 0.40800443291664124, "learning_rate": 4.40044509571762e-08, "loss": 0.4989, "step": 45780 }, { "epoch": 2.540915395284327, "grad_norm": 0.27279120683670044, "learning_rate": 4.379892678008182e-08, "loss": 0.5183, "step": 45800 }, { "epoch": 2.542024965325936, "grad_norm": 0.37215977907180786, "learning_rate": 4.3593837591263174e-08, "loss": 0.4472, "step": 45820 }, { "epoch": 2.543134535367545, "grad_norm": 0.35144948959350586, "learning_rate": 4.338918382336296e-08, "loss": 0.5111, "step": 45840 }, { "epoch": 2.544244105409154, "grad_norm": 0.434501975774765, "learning_rate": 4.3184965908105305e-08, "loss": 0.515, "step": 45860 }, { "epoch": 2.545353675450763, "grad_norm": 0.3157638907432556, "learning_rate": 4.298118427629499e-08, "loss": 0.4687, "step": 45880 }, { "epoch": 2.5464632454923715, "grad_norm": 0.36596953868865967, "learning_rate": 4.277783935781637e-08, "loss": 0.4039, "step": 45900 }, { "epoch": 2.5475728155339805, "grad_norm": 0.31997817754745483, "learning_rate": 4.25749315816325e-08, "loss": 0.4273, "step": 45920 }, { "epoch": 2.5486823855755896, "grad_norm": 0.2804777920246124, "learning_rate": 4.2372461375784455e-08, "loss": 0.5452, "step": 45940 }, { "epoch": 2.549791955617198, "grad_norm": 0.5422829985618591, "learning_rate": 4.217042916739011e-08, "loss": 0.4926, "step": 45960 }, { "epoch": 2.5509015256588072, "grad_norm": 0.38099366426467896, "learning_rate": 4.196883538264323e-08, "loss": 0.4768, "step": 45980 }, { "epoch": 2.552011095700416, "grad_norm": 0.36163070797920227, "learning_rate": 4.176768044681303e-08, "loss": 0.4387, "step": 46000 }, { "epoch": 2.553120665742025, "grad_norm": 0.3419199585914612, "learning_rate": 4.156696478424279e-08, "loss": 0.4529, "step": 46020 }, { "epoch": 2.554230235783634, "grad_norm": 0.39725953340530396, "learning_rate": 4.1366688818348926e-08, "loss": 0.4622, "step": 46040 }, { "epoch": 2.5553398058252426, "grad_norm": 0.379535973072052, "learning_rate": 4.1166852971620674e-08, "loss": 0.5784, "step": 46060 }, { "epoch": 2.5564493758668516, "grad_norm": 0.35484927892684937, "learning_rate": 4.096745766561857e-08, "loss": 0.5192, "step": 46080 }, { "epoch": 2.5575589459084602, "grad_norm": 0.24533678591251373, "learning_rate": 4.076850332097387e-08, "loss": 0.5289, "step": 46100 }, { "epoch": 2.5586685159500693, "grad_norm": 0.3287791609764099, "learning_rate": 4.05699903573877e-08, "loss": 0.4325, "step": 46120 }, { "epoch": 2.5597780859916783, "grad_norm": 0.4092911183834076, "learning_rate": 4.0371919193629975e-08, "loss": 0.4603, "step": 46140 }, { "epoch": 2.560887656033287, "grad_norm": 0.34958940744400024, "learning_rate": 4.0174290247538655e-08, "loss": 0.4465, "step": 46160 }, { "epoch": 2.561997226074896, "grad_norm": 0.27071332931518555, "learning_rate": 3.997710393601877e-08, "loss": 0.4478, "step": 46180 }, { "epoch": 2.5631067961165046, "grad_norm": 0.3072054088115692, "learning_rate": 3.9780360675041675e-08, "loss": 0.4237, "step": 46200 }, { "epoch": 2.5642163661581137, "grad_norm": 0.2954820990562439, "learning_rate": 3.9584060879644025e-08, "loss": 0.502, "step": 46220 }, { "epoch": 2.5653259361997227, "grad_norm": 0.2712906002998352, "learning_rate": 3.9388204963927114e-08, "loss": 0.5669, "step": 46240 }, { "epoch": 2.5664355062413313, "grad_norm": 0.328586220741272, "learning_rate": 3.9192793341055655e-08, "loss": 0.4407, "step": 46260 }, { "epoch": 2.5675450762829404, "grad_norm": 0.25344762206077576, "learning_rate": 3.899782642325719e-08, "loss": 0.5005, "step": 46280 }, { "epoch": 2.568654646324549, "grad_norm": 0.37006357312202454, "learning_rate": 3.880330462182127e-08, "loss": 0.4602, "step": 46300 }, { "epoch": 2.569764216366158, "grad_norm": 0.33011186122894287, "learning_rate": 3.860922834709832e-08, "loss": 0.4785, "step": 46320 }, { "epoch": 2.570873786407767, "grad_norm": 0.28053492307662964, "learning_rate": 3.841559800849878e-08, "loss": 0.4253, "step": 46340 }, { "epoch": 2.5719833564493757, "grad_norm": 0.31888270378112793, "learning_rate": 3.822241401449269e-08, "loss": 0.5732, "step": 46360 }, { "epoch": 2.5730929264909848, "grad_norm": 0.3503880500793457, "learning_rate": 3.8029676772608324e-08, "loss": 0.4529, "step": 46380 }, { "epoch": 2.5742024965325934, "grad_norm": 0.18899886310100555, "learning_rate": 3.783738668943143e-08, "loss": 0.4621, "step": 46400 }, { "epoch": 2.5753120665742024, "grad_norm": 0.3825567960739136, "learning_rate": 3.764554417060472e-08, "loss": 0.4826, "step": 46420 }, { "epoch": 2.5764216366158115, "grad_norm": 0.2637205123901367, "learning_rate": 3.745414962082655e-08, "loss": 0.4789, "step": 46440 }, { "epoch": 2.5775312066574205, "grad_norm": 0.3205200135707855, "learning_rate": 3.726320344385028e-08, "loss": 0.5234, "step": 46460 }, { "epoch": 2.578640776699029, "grad_norm": 0.3583070933818817, "learning_rate": 3.707270604248352e-08, "loss": 0.4859, "step": 46480 }, { "epoch": 2.5797503467406377, "grad_norm": 0.30797627568244934, "learning_rate": 3.688265781858707e-08, "loss": 0.5621, "step": 46500 }, { "epoch": 2.580859916782247, "grad_norm": 0.3863540291786194, "learning_rate": 3.669305917307416e-08, "loss": 0.4773, "step": 46520 }, { "epoch": 2.581969486823856, "grad_norm": 0.30976638197898865, "learning_rate": 3.650391050590978e-08, "loss": 0.5109, "step": 46540 }, { "epoch": 2.583079056865465, "grad_norm": 0.41036781668663025, "learning_rate": 3.631521221610953e-08, "loss": 0.511, "step": 46560 }, { "epoch": 2.5841886269070735, "grad_norm": 0.2725338637828827, "learning_rate": 3.612696470173887e-08, "loss": 0.4877, "step": 46580 }, { "epoch": 2.585298196948682, "grad_norm": 0.34197479486465454, "learning_rate": 3.593916835991259e-08, "loss": 0.526, "step": 46600 }, { "epoch": 2.586407766990291, "grad_norm": 0.2554982900619507, "learning_rate": 3.575182358679349e-08, "loss": 0.4952, "step": 46620 }, { "epoch": 2.5875173370319002, "grad_norm": 0.19301778078079224, "learning_rate": 3.556493077759171e-08, "loss": 0.5347, "step": 46640 }, { "epoch": 2.5886269070735093, "grad_norm": 0.4632357060909271, "learning_rate": 3.5378490326564234e-08, "loss": 0.4666, "step": 46660 }, { "epoch": 2.589736477115118, "grad_norm": 0.3314858376979828, "learning_rate": 3.5192502627013535e-08, "loss": 0.4808, "step": 46680 }, { "epoch": 2.5908460471567265, "grad_norm": 0.2741951644420624, "learning_rate": 3.500696807128706e-08, "loss": 0.5464, "step": 46700 }, { "epoch": 2.5919556171983356, "grad_norm": 0.42794179916381836, "learning_rate": 3.482188705077646e-08, "loss": 0.5595, "step": 46720 }, { "epoch": 2.5930651872399446, "grad_norm": 0.38187316060066223, "learning_rate": 3.463725995591646e-08, "loss": 0.4442, "step": 46740 }, { "epoch": 2.5941747572815537, "grad_norm": 0.4221172034740448, "learning_rate": 3.445308717618431e-08, "loss": 0.4983, "step": 46760 }, { "epoch": 2.5952843273231623, "grad_norm": 0.4255363345146179, "learning_rate": 3.426936910009881e-08, "loss": 0.4654, "step": 46780 }, { "epoch": 2.596393897364771, "grad_norm": 0.5041808485984802, "learning_rate": 3.408610611521959e-08, "loss": 0.5909, "step": 46800 }, { "epoch": 2.59750346740638, "grad_norm": 0.415483295917511, "learning_rate": 3.390329860814617e-08, "loss": 0.5683, "step": 46820 }, { "epoch": 2.598613037447989, "grad_norm": 0.4259670674800873, "learning_rate": 3.372094696451744e-08, "loss": 0.5359, "step": 46840 }, { "epoch": 2.599722607489598, "grad_norm": 0.38342469930648804, "learning_rate": 3.3539051569010376e-08, "loss": 0.458, "step": 46860 }, { "epoch": 2.6008321775312067, "grad_norm": 0.2447005957365036, "learning_rate": 3.335761280533958e-08, "loss": 0.4871, "step": 46880 }, { "epoch": 2.6019417475728153, "grad_norm": 0.3886164426803589, "learning_rate": 3.3176631056256485e-08, "loss": 0.5601, "step": 46900 }, { "epoch": 2.6030513176144243, "grad_norm": 0.38947737216949463, "learning_rate": 3.29961067035483e-08, "loss": 0.5381, "step": 46920 }, { "epoch": 2.6041608876560334, "grad_norm": 0.3751154839992523, "learning_rate": 3.281604012803721e-08, "loss": 0.4675, "step": 46940 }, { "epoch": 2.6052704576976424, "grad_norm": 0.3174614906311035, "learning_rate": 3.263643170958005e-08, "loss": 0.4802, "step": 46960 }, { "epoch": 2.606380027739251, "grad_norm": 0.24638549983501434, "learning_rate": 3.245728182706695e-08, "loss": 0.5218, "step": 46980 }, { "epoch": 2.6074895977808596, "grad_norm": 0.3457053601741791, "learning_rate": 3.227859085842063e-08, "loss": 0.3978, "step": 47000 }, { "epoch": 2.6085991678224687, "grad_norm": 0.2396436482667923, "learning_rate": 3.210035918059603e-08, "loss": 0.5215, "step": 47020 }, { "epoch": 2.6097087378640778, "grad_norm": 0.33602702617645264, "learning_rate": 3.1922587169578965e-08, "loss": 0.4256, "step": 47040 }, { "epoch": 2.610818307905687, "grad_norm": 0.5059120655059814, "learning_rate": 3.1745275200385584e-08, "loss": 0.3842, "step": 47060 }, { "epoch": 2.6119278779472954, "grad_norm": 0.4174138605594635, "learning_rate": 3.156842364706161e-08, "loss": 0.4968, "step": 47080 }, { "epoch": 2.613037447988904, "grad_norm": 0.46458983421325684, "learning_rate": 3.1392032882681524e-08, "loss": 0.4559, "step": 47100 }, { "epoch": 2.614147018030513, "grad_norm": 0.2873481214046478, "learning_rate": 3.121610327934765e-08, "loss": 0.529, "step": 47120 }, { "epoch": 2.615256588072122, "grad_norm": 0.37056469917297363, "learning_rate": 3.104063520818964e-08, "loss": 0.5568, "step": 47140 }, { "epoch": 2.616366158113731, "grad_norm": 0.34002047777175903, "learning_rate": 3.086562903936343e-08, "loss": 0.5291, "step": 47160 }, { "epoch": 2.61747572815534, "grad_norm": 0.3537414073944092, "learning_rate": 3.069108514205051e-08, "loss": 0.5551, "step": 47180 }, { "epoch": 2.6185852981969484, "grad_norm": 0.30064067244529724, "learning_rate": 3.0517003884457357e-08, "loss": 0.4769, "step": 47200 }, { "epoch": 2.6196948682385575, "grad_norm": 0.29317256808280945, "learning_rate": 3.0343385633814336e-08, "loss": 0.472, "step": 47220 }, { "epoch": 2.6208044382801665, "grad_norm": 0.2896140217781067, "learning_rate": 3.0170230756375036e-08, "loss": 0.4848, "step": 47240 }, { "epoch": 2.6219140083217756, "grad_norm": 0.24535700678825378, "learning_rate": 2.99975396174158e-08, "loss": 0.4625, "step": 47260 }, { "epoch": 2.623023578363384, "grad_norm": 0.35107865929603577, "learning_rate": 2.982531258123447e-08, "loss": 0.4287, "step": 47280 }, { "epoch": 2.624133148404993, "grad_norm": 0.4287501275539398, "learning_rate": 2.965355001114986e-08, "loss": 0.544, "step": 47300 }, { "epoch": 2.625242718446602, "grad_norm": 0.3570953607559204, "learning_rate": 2.9482252269501128e-08, "loss": 0.5382, "step": 47320 }, { "epoch": 2.626352288488211, "grad_norm": 0.37135937809944153, "learning_rate": 2.931141971764675e-08, "loss": 0.452, "step": 47340 }, { "epoch": 2.62746185852982, "grad_norm": 0.40913960337638855, "learning_rate": 2.91410527159639e-08, "loss": 0.4562, "step": 47360 }, { "epoch": 2.6285714285714286, "grad_norm": 0.4895024299621582, "learning_rate": 2.8971151623847584e-08, "loss": 0.5318, "step": 47380 }, { "epoch": 2.6296809986130376, "grad_norm": 0.42940640449523926, "learning_rate": 2.880171679971005e-08, "loss": 0.4224, "step": 47400 }, { "epoch": 2.6307905686546462, "grad_norm": 0.44355618953704834, "learning_rate": 2.863274860097986e-08, "loss": 0.5525, "step": 47420 }, { "epoch": 2.6319001386962553, "grad_norm": 0.2761595547199249, "learning_rate": 2.846424738410133e-08, "loss": 0.4939, "step": 47440 }, { "epoch": 2.6330097087378643, "grad_norm": 0.3673642873764038, "learning_rate": 2.8296213504533596e-08, "loss": 0.4933, "step": 47460 }, { "epoch": 2.634119278779473, "grad_norm": 0.5060901045799255, "learning_rate": 2.8128647316749797e-08, "loss": 0.538, "step": 47480 }, { "epoch": 2.635228848821082, "grad_norm": 0.359235018491745, "learning_rate": 2.7961549174236766e-08, "loss": 0.4504, "step": 47500 }, { "epoch": 2.6363384188626906, "grad_norm": 0.2648777663707733, "learning_rate": 2.779491942949369e-08, "loss": 0.522, "step": 47520 }, { "epoch": 2.6374479889042997, "grad_norm": 0.4029914438724518, "learning_rate": 2.76287584340317e-08, "loss": 0.4291, "step": 47540 }, { "epoch": 2.6385575589459087, "grad_norm": 0.268282026052475, "learning_rate": 2.7463066538373268e-08, "loss": 0.5023, "step": 47560 }, { "epoch": 2.6396671289875173, "grad_norm": 0.35931873321533203, "learning_rate": 2.7297844092051104e-08, "loss": 0.5443, "step": 47580 }, { "epoch": 2.6407766990291264, "grad_norm": 0.38801708817481995, "learning_rate": 2.7133091443607647e-08, "loss": 0.6324, "step": 47600 }, { "epoch": 2.641886269070735, "grad_norm": 0.3864608407020569, "learning_rate": 2.6968808940594363e-08, "loss": 0.4683, "step": 47620 }, { "epoch": 2.642995839112344, "grad_norm": 0.427723228931427, "learning_rate": 2.680499692957078e-08, "loss": 0.5113, "step": 47640 }, { "epoch": 2.644105409153953, "grad_norm": 0.3010461628437042, "learning_rate": 2.6641655756104053e-08, "loss": 0.4727, "step": 47660 }, { "epoch": 2.6452149791955617, "grad_norm": 0.31030094623565674, "learning_rate": 2.6478785764767945e-08, "loss": 0.4713, "step": 47680 }, { "epoch": 2.6463245492371708, "grad_norm": 0.3839551508426666, "learning_rate": 2.6316387299142374e-08, "loss": 0.4962, "step": 47700 }, { "epoch": 2.6474341192787794, "grad_norm": 0.37921831011772156, "learning_rate": 2.6154460701812432e-08, "loss": 0.5552, "step": 47720 }, { "epoch": 2.6485436893203884, "grad_norm": 0.2846652865409851, "learning_rate": 2.599300631436796e-08, "loss": 0.5311, "step": 47740 }, { "epoch": 2.6496532593619975, "grad_norm": 0.38263627886772156, "learning_rate": 2.5832024477402543e-08, "loss": 0.5345, "step": 47760 }, { "epoch": 2.650762829403606, "grad_norm": 0.4085031747817993, "learning_rate": 2.5671515530512806e-08, "loss": 0.4941, "step": 47780 }, { "epoch": 2.651872399445215, "grad_norm": 0.30303889513015747, "learning_rate": 2.551147981229809e-08, "loss": 0.5065, "step": 47800 }, { "epoch": 2.6529819694868237, "grad_norm": 0.4854058027267456, "learning_rate": 2.535191766035913e-08, "loss": 0.5219, "step": 47820 }, { "epoch": 2.654091539528433, "grad_norm": 0.3077492415904999, "learning_rate": 2.5192829411297743e-08, "loss": 0.5323, "step": 47840 }, { "epoch": 2.655201109570042, "grad_norm": 0.32661134004592896, "learning_rate": 2.5034215400716236e-08, "loss": 0.3887, "step": 47860 }, { "epoch": 2.6563106796116505, "grad_norm": 0.29467132687568665, "learning_rate": 2.4876075963216226e-08, "loss": 0.4351, "step": 47880 }, { "epoch": 2.6574202496532595, "grad_norm": 0.296843558549881, "learning_rate": 2.4718411432398308e-08, "loss": 0.4133, "step": 47900 }, { "epoch": 2.658529819694868, "grad_norm": 0.31993141770362854, "learning_rate": 2.4561222140861316e-08, "loss": 0.4297, "step": 47920 }, { "epoch": 2.659639389736477, "grad_norm": 0.3798917233943939, "learning_rate": 2.4404508420201446e-08, "loss": 0.4372, "step": 47940 }, { "epoch": 2.6607489597780862, "grad_norm": 0.3493525981903076, "learning_rate": 2.42482706010117e-08, "loss": 0.4476, "step": 47960 }, { "epoch": 2.661858529819695, "grad_norm": 0.3446005880832672, "learning_rate": 2.4092509012881123e-08, "loss": 0.395, "step": 47980 }, { "epoch": 2.662968099861304, "grad_norm": 0.26892855763435364, "learning_rate": 2.3937223984394212e-08, "loss": 0.4741, "step": 48000 }, { "epoch": 2.6640776699029125, "grad_norm": 0.31622636318206787, "learning_rate": 2.3782415843129956e-08, "loss": 0.5136, "step": 48020 }, { "epoch": 2.6651872399445216, "grad_norm": 0.3301359713077545, "learning_rate": 2.362808491566165e-08, "loss": 0.5047, "step": 48040 }, { "epoch": 2.6662968099861306, "grad_norm": 0.2515262961387634, "learning_rate": 2.3474231527555595e-08, "loss": 0.4239, "step": 48060 }, { "epoch": 2.667406380027739, "grad_norm": 0.5557871460914612, "learning_rate": 2.3320856003370832e-08, "loss": 0.5213, "step": 48080 }, { "epoch": 2.6685159500693483, "grad_norm": 0.22517164051532745, "learning_rate": 2.3167958666658412e-08, "loss": 0.4701, "step": 48100 }, { "epoch": 2.669625520110957, "grad_norm": 0.2352830469608307, "learning_rate": 2.301553983996041e-08, "loss": 0.4135, "step": 48120 }, { "epoch": 2.670735090152566, "grad_norm": 0.4745240807533264, "learning_rate": 2.2863599844809595e-08, "loss": 0.557, "step": 48140 }, { "epoch": 2.671844660194175, "grad_norm": 0.30073869228363037, "learning_rate": 2.2712139001728748e-08, "loss": 0.481, "step": 48160 }, { "epoch": 2.6729542302357836, "grad_norm": 0.34166544675827026, "learning_rate": 2.2561157630229673e-08, "loss": 0.4868, "step": 48180 }, { "epoch": 2.6740638002773927, "grad_norm": 0.2506662607192993, "learning_rate": 2.2410656048812733e-08, "loss": 0.439, "step": 48200 }, { "epoch": 2.6751733703190013, "grad_norm": 0.2926943302154541, "learning_rate": 2.2260634574966298e-08, "loss": 0.4855, "step": 48220 }, { "epoch": 2.6762829403606103, "grad_norm": 0.31673961877822876, "learning_rate": 2.2111093525165826e-08, "loss": 0.4528, "step": 48240 }, { "epoch": 2.6773925104022194, "grad_norm": 0.5393707752227783, "learning_rate": 2.196203321487325e-08, "loss": 0.4453, "step": 48260 }, { "epoch": 2.678502080443828, "grad_norm": 0.38518422842025757, "learning_rate": 2.181345395853651e-08, "loss": 0.5692, "step": 48280 }, { "epoch": 2.679611650485437, "grad_norm": 0.37991929054260254, "learning_rate": 2.1665356069588607e-08, "loss": 0.5454, "step": 48300 }, { "epoch": 2.6807212205270456, "grad_norm": 0.29949456453323364, "learning_rate": 2.1517739860447164e-08, "loss": 0.4711, "step": 48320 }, { "epoch": 2.6818307905686547, "grad_norm": 0.27533912658691406, "learning_rate": 2.1370605642513695e-08, "loss": 0.478, "step": 48340 }, { "epoch": 2.6829403606102638, "grad_norm": 0.2533174455165863, "learning_rate": 2.1223953726172917e-08, "loss": 0.4934, "step": 48360 }, { "epoch": 2.6840499306518724, "grad_norm": 0.36064958572387695, "learning_rate": 2.1077784420792028e-08, "loss": 0.5141, "step": 48380 }, { "epoch": 2.6851595006934814, "grad_norm": 0.31320858001708984, "learning_rate": 2.0932098034720347e-08, "loss": 0.471, "step": 48400 }, { "epoch": 2.68626907073509, "grad_norm": 0.30094367265701294, "learning_rate": 2.078689487528823e-08, "loss": 0.4443, "step": 48420 }, { "epoch": 2.687378640776699, "grad_norm": 0.29525718092918396, "learning_rate": 2.064217524880671e-08, "loss": 0.501, "step": 48440 }, { "epoch": 2.688488210818308, "grad_norm": 0.262055903673172, "learning_rate": 2.049793946056694e-08, "loss": 0.4651, "step": 48460 }, { "epoch": 2.6895977808599167, "grad_norm": 0.283216267824173, "learning_rate": 2.0354187814839248e-08, "loss": 0.4293, "step": 48480 }, { "epoch": 2.690707350901526, "grad_norm": 0.3899552524089813, "learning_rate": 2.0210920614872617e-08, "loss": 0.5598, "step": 48500 }, { "epoch": 2.6918169209431344, "grad_norm": 0.5513736605644226, "learning_rate": 2.0068138162894295e-08, "loss": 0.503, "step": 48520 }, { "epoch": 2.6929264909847435, "grad_norm": 0.25844162702560425, "learning_rate": 1.992584076010867e-08, "loss": 0.546, "step": 48540 }, { "epoch": 2.6940360610263525, "grad_norm": 0.24406981468200684, "learning_rate": 1.9784028706697097e-08, "loss": 0.5384, "step": 48560 }, { "epoch": 2.695145631067961, "grad_norm": 0.379564493894577, "learning_rate": 1.9642702301816917e-08, "loss": 0.4759, "step": 48580 }, { "epoch": 2.69625520110957, "grad_norm": 0.21882766485214233, "learning_rate": 1.9501861843601114e-08, "loss": 0.5072, "step": 48600 }, { "epoch": 2.697364771151179, "grad_norm": 0.25654175877571106, "learning_rate": 1.936150762915742e-08, "loss": 0.5513, "step": 48620 }, { "epoch": 2.698474341192788, "grad_norm": 0.24711990356445312, "learning_rate": 1.922163995456799e-08, "loss": 0.4447, "step": 48640 }, { "epoch": 2.699583911234397, "grad_norm": 0.24425290524959564, "learning_rate": 1.9082259114888477e-08, "loss": 0.517, "step": 48660 }, { "epoch": 2.7006934812760055, "grad_norm": 0.3406910300254822, "learning_rate": 1.894336540414748e-08, "loss": 0.498, "step": 48680 }, { "epoch": 2.7018030513176146, "grad_norm": 0.3013654053211212, "learning_rate": 1.8804959115346242e-08, "loss": 0.4515, "step": 48700 }, { "epoch": 2.702912621359223, "grad_norm": 0.30556216835975647, "learning_rate": 1.8667040540457423e-08, "loss": 0.4893, "step": 48720 }, { "epoch": 2.704022191400832, "grad_norm": 0.3394109606742859, "learning_rate": 1.852960997042513e-08, "loss": 0.5009, "step": 48740 }, { "epoch": 2.7051317614424413, "grad_norm": 0.35217949748039246, "learning_rate": 1.839266769516387e-08, "loss": 0.5229, "step": 48760 }, { "epoch": 2.70624133148405, "grad_norm": 0.30995404720306396, "learning_rate": 1.8256214003558035e-08, "loss": 0.4499, "step": 48780 }, { "epoch": 2.707350901525659, "grad_norm": 0.5276461243629456, "learning_rate": 1.8120249183461473e-08, "loss": 0.5264, "step": 48800 }, { "epoch": 2.7084604715672675, "grad_norm": 0.565543532371521, "learning_rate": 1.798477352169664e-08, "loss": 0.4961, "step": 48820 }, { "epoch": 2.7095700416088766, "grad_norm": 0.3824761211872101, "learning_rate": 1.7849787304054093e-08, "loss": 0.5422, "step": 48840 }, { "epoch": 2.7106796116504857, "grad_norm": 0.2933233082294464, "learning_rate": 1.771529081529194e-08, "loss": 0.4494, "step": 48860 }, { "epoch": 2.7117891816920943, "grad_norm": 0.6354488730430603, "learning_rate": 1.758128433913514e-08, "loss": 0.5203, "step": 48880 }, { "epoch": 2.7128987517337033, "grad_norm": 0.288551926612854, "learning_rate": 1.7447768158274923e-08, "loss": 0.5493, "step": 48900 }, { "epoch": 2.714008321775312, "grad_norm": 0.3104619085788727, "learning_rate": 1.7314742554368407e-08, "loss": 0.5203, "step": 48920 }, { "epoch": 2.715117891816921, "grad_norm": 0.29661014676094055, "learning_rate": 1.7182207808037613e-08, "loss": 0.4669, "step": 48940 }, { "epoch": 2.71622746185853, "grad_norm": 0.21396254003047943, "learning_rate": 1.7050164198869148e-08, "loss": 0.4788, "step": 48960 }, { "epoch": 2.7173370319001386, "grad_norm": 0.3975656032562256, "learning_rate": 1.6918612005413642e-08, "loss": 0.4529, "step": 48980 }, { "epoch": 2.7184466019417477, "grad_norm": 0.2710399627685547, "learning_rate": 1.6787551505184906e-08, "loss": 0.5908, "step": 49000 }, { "epoch": 2.7195561719833563, "grad_norm": 0.26105114817619324, "learning_rate": 1.6656982974659563e-08, "loss": 0.5593, "step": 49020 }, { "epoch": 2.7206657420249654, "grad_norm": 0.24644818902015686, "learning_rate": 1.6526906689276475e-08, "loss": 0.4263, "step": 49040 }, { "epoch": 2.7217753120665744, "grad_norm": 0.3648255467414856, "learning_rate": 1.6397322923436098e-08, "loss": 0.4967, "step": 49060 }, { "epoch": 2.722884882108183, "grad_norm": 0.35969844460487366, "learning_rate": 1.6268231950499727e-08, "loss": 0.5452, "step": 49080 }, { "epoch": 2.723994452149792, "grad_norm": 0.34863385558128357, "learning_rate": 1.6139634042789368e-08, "loss": 0.4802, "step": 49100 }, { "epoch": 2.7251040221914007, "grad_norm": 0.3086669445037842, "learning_rate": 1.6011529471586656e-08, "loss": 0.4045, "step": 49120 }, { "epoch": 2.7262135922330097, "grad_norm": 0.40755099058151245, "learning_rate": 1.5883918507132637e-08, "loss": 0.4844, "step": 49140 }, { "epoch": 2.727323162274619, "grad_norm": 0.30245617032051086, "learning_rate": 1.5756801418627035e-08, "loss": 0.4661, "step": 49160 }, { "epoch": 2.7284327323162274, "grad_norm": 0.36927253007888794, "learning_rate": 1.5630178474227706e-08, "loss": 0.5154, "step": 49180 }, { "epoch": 2.7295423023578365, "grad_norm": 0.28562623262405396, "learning_rate": 1.550404994105009e-08, "loss": 0.4673, "step": 49200 }, { "epoch": 2.730651872399445, "grad_norm": 0.4261753559112549, "learning_rate": 1.5378416085166768e-08, "loss": 0.4258, "step": 49220 }, { "epoch": 2.731761442441054, "grad_norm": 0.44709065556526184, "learning_rate": 1.525327717160668e-08, "loss": 0.5743, "step": 49240 }, { "epoch": 2.732871012482663, "grad_norm": 0.29340893030166626, "learning_rate": 1.5128633464354584e-08, "loss": 0.4577, "step": 49260 }, { "epoch": 2.733980582524272, "grad_norm": 0.5357069969177246, "learning_rate": 1.500448522635081e-08, "loss": 0.5096, "step": 49280 }, { "epoch": 2.735090152565881, "grad_norm": 0.3148517310619354, "learning_rate": 1.4880832719490255e-08, "loss": 0.567, "step": 49300 }, { "epoch": 2.7361997226074894, "grad_norm": 0.3797491490840912, "learning_rate": 1.475767620462215e-08, "loss": 0.561, "step": 49320 }, { "epoch": 2.7373092926490985, "grad_norm": 0.3674441874027252, "learning_rate": 1.463501594154945e-08, "loss": 0.5571, "step": 49340 }, { "epoch": 2.7384188626907076, "grad_norm": 0.30513426661491394, "learning_rate": 1.4512852189028202e-08, "loss": 0.5348, "step": 49360 }, { "epoch": 2.739528432732316, "grad_norm": 0.28775492310523987, "learning_rate": 1.439118520476701e-08, "loss": 0.4988, "step": 49380 }, { "epoch": 2.740638002773925, "grad_norm": 0.31999436020851135, "learning_rate": 1.4270015245426648e-08, "loss": 0.4999, "step": 49400 }, { "epoch": 2.741747572815534, "grad_norm": 0.2799537479877472, "learning_rate": 1.4149342566619288e-08, "loss": 0.4734, "step": 49420 }, { "epoch": 2.742857142857143, "grad_norm": 0.24496117234230042, "learning_rate": 1.4029167422908105e-08, "loss": 0.4369, "step": 49440 }, { "epoch": 2.743966712898752, "grad_norm": 0.440842866897583, "learning_rate": 1.390949006780673e-08, "loss": 0.5263, "step": 49460 }, { "epoch": 2.7450762829403605, "grad_norm": 0.3630158305168152, "learning_rate": 1.379031075377865e-08, "loss": 0.5451, "step": 49480 }, { "epoch": 2.7461858529819696, "grad_norm": 0.45627152919769287, "learning_rate": 1.3671629732236679e-08, "loss": 0.4608, "step": 49500 }, { "epoch": 2.747295423023578, "grad_norm": 0.556372880935669, "learning_rate": 1.3553447253542654e-08, "loss": 0.5564, "step": 49520 }, { "epoch": 2.7484049930651873, "grad_norm": 0.28986474871635437, "learning_rate": 1.3435763567006514e-08, "loss": 0.5255, "step": 49540 }, { "epoch": 2.7495145631067963, "grad_norm": 0.30257272720336914, "learning_rate": 1.3318578920886003e-08, "loss": 0.5751, "step": 49560 }, { "epoch": 2.750624133148405, "grad_norm": 0.46946805715560913, "learning_rate": 1.3201893562386323e-08, "loss": 0.4856, "step": 49580 }, { "epoch": 2.751733703190014, "grad_norm": 0.45060354471206665, "learning_rate": 1.3085707737659158e-08, "loss": 0.5475, "step": 49600 }, { "epoch": 2.7528432732316226, "grad_norm": 0.2520931661128998, "learning_rate": 1.2970021691802475e-08, "loss": 0.4122, "step": 49620 }, { "epoch": 2.7539528432732316, "grad_norm": 0.3641115128993988, "learning_rate": 1.2854835668860058e-08, "loss": 0.5901, "step": 49640 }, { "epoch": 2.7550624133148407, "grad_norm": 0.2932727038860321, "learning_rate": 1.2740149911820792e-08, "loss": 0.4393, "step": 49660 }, { "epoch": 2.7561719833564493, "grad_norm": 0.49297595024108887, "learning_rate": 1.2625964662618172e-08, "loss": 0.4601, "step": 49680 }, { "epoch": 2.7572815533980584, "grad_norm": 0.28914856910705566, "learning_rate": 1.251228016212999e-08, "loss": 0.5165, "step": 49700 }, { "epoch": 2.758391123439667, "grad_norm": 0.26816123723983765, "learning_rate": 1.2399096650177577e-08, "loss": 0.4418, "step": 49720 }, { "epoch": 2.759500693481276, "grad_norm": 0.3988465368747711, "learning_rate": 1.2286414365525494e-08, "loss": 0.5102, "step": 49740 }, { "epoch": 2.760610263522885, "grad_norm": 0.21637725830078125, "learning_rate": 1.2174233545880847e-08, "loss": 0.4146, "step": 49760 }, { "epoch": 2.7617198335644937, "grad_norm": 0.2590038776397705, "learning_rate": 1.2062554427893002e-08, "loss": 0.4969, "step": 49780 }, { "epoch": 2.7628294036061027, "grad_norm": 0.35655972361564636, "learning_rate": 1.1951377247152867e-08, "loss": 0.5629, "step": 49800 }, { "epoch": 2.7639389736477114, "grad_norm": 0.40170758962631226, "learning_rate": 1.1840702238192584e-08, "loss": 0.5073, "step": 49820 }, { "epoch": 2.7650485436893204, "grad_norm": 0.24098189175128937, "learning_rate": 1.173052963448487e-08, "loss": 0.4639, "step": 49840 }, { "epoch": 2.7661581137309295, "grad_norm": 0.40717729926109314, "learning_rate": 1.162085966844259e-08, "loss": 0.5941, "step": 49860 }, { "epoch": 2.767267683772538, "grad_norm": 0.5377121567726135, "learning_rate": 1.1511692571418457e-08, "loss": 0.4865, "step": 49880 }, { "epoch": 2.768377253814147, "grad_norm": 0.29645365476608276, "learning_rate": 1.140302857370412e-08, "loss": 0.5034, "step": 49900 }, { "epoch": 2.7694868238557557, "grad_norm": 0.35984981060028076, "learning_rate": 1.1294867904529992e-08, "loss": 0.4631, "step": 49920 }, { "epoch": 2.770596393897365, "grad_norm": 0.44465333223342896, "learning_rate": 1.1187210792064832e-08, "loss": 0.5394, "step": 49940 }, { "epoch": 2.771705963938974, "grad_norm": 0.3130132555961609, "learning_rate": 1.1080057463414977e-08, "loss": 0.4735, "step": 49960 }, { "epoch": 2.7728155339805824, "grad_norm": 0.31491196155548096, "learning_rate": 1.097340814462408e-08, "loss": 0.444, "step": 49980 }, { "epoch": 2.7739251040221915, "grad_norm": 0.3265323042869568, "learning_rate": 1.0867263060672566e-08, "loss": 0.547, "step": 50000 }, { "epoch": 2.7750346740638, "grad_norm": 0.3313845992088318, "learning_rate": 1.076162243547718e-08, "loss": 0.4732, "step": 50020 }, { "epoch": 2.776144244105409, "grad_norm": 0.28874528408050537, "learning_rate": 1.065648649189041e-08, "loss": 0.4561, "step": 50040 }, { "epoch": 2.777253814147018, "grad_norm": 0.39177921414375305, "learning_rate": 1.0551855451700148e-08, "loss": 0.489, "step": 50060 }, { "epoch": 2.778363384188627, "grad_norm": 0.3458656668663025, "learning_rate": 1.0447729535629252e-08, "loss": 0.4923, "step": 50080 }, { "epoch": 2.779472954230236, "grad_norm": 0.25984108448028564, "learning_rate": 1.0344108963334847e-08, "loss": 0.4004, "step": 50100 }, { "epoch": 2.7805825242718445, "grad_norm": 0.33904939889907837, "learning_rate": 1.024099395340816e-08, "loss": 0.5722, "step": 50120 }, { "epoch": 2.7816920943134535, "grad_norm": 0.3904574513435364, "learning_rate": 1.0138384723373889e-08, "loss": 0.5151, "step": 50140 }, { "epoch": 2.7828016643550626, "grad_norm": 0.32786476612091064, "learning_rate": 1.003628148968963e-08, "loss": 0.5247, "step": 50160 }, { "epoch": 2.783911234396671, "grad_norm": 0.41585350036621094, "learning_rate": 9.93468446774584e-09, "loss": 0.4361, "step": 50180 }, { "epoch": 2.7850208044382803, "grad_norm": 0.4058113992214203, "learning_rate": 9.833593871864803e-09, "loss": 0.6048, "step": 50200 }, { "epoch": 2.786130374479889, "grad_norm": 0.3305119574069977, "learning_rate": 9.733009915300628e-09, "loss": 0.4739, "step": 50220 }, { "epoch": 2.787239944521498, "grad_norm": 0.3286583423614502, "learning_rate": 9.63293281023872e-09, "loss": 0.4866, "step": 50240 }, { "epoch": 2.788349514563107, "grad_norm": 0.28699707984924316, "learning_rate": 9.53336276779515e-09, "loss": 0.4577, "step": 50260 }, { "epoch": 2.7894590846047156, "grad_norm": 0.314662367105484, "learning_rate": 9.434299998016287e-09, "loss": 0.5501, "step": 50280 }, { "epoch": 2.7905686546463246, "grad_norm": 0.6367086172103882, "learning_rate": 9.33574470987858e-09, "loss": 0.56, "step": 50300 }, { "epoch": 2.7916782246879333, "grad_norm": 0.2939661741256714, "learning_rate": 9.237697111287745e-09, "loss": 0.4477, "step": 50320 }, { "epoch": 2.7927877947295423, "grad_norm": 0.2913439869880676, "learning_rate": 9.140157409078559e-09, "loss": 0.4798, "step": 50340 }, { "epoch": 2.7938973647711514, "grad_norm": 0.5121108293533325, "learning_rate": 9.043125809014423e-09, "loss": 0.4724, "step": 50360 }, { "epoch": 2.79500693481276, "grad_norm": 0.31932052969932556, "learning_rate": 8.9466025157868e-09, "loss": 0.4639, "step": 50380 }, { "epoch": 2.796116504854369, "grad_norm": 0.40991172194480896, "learning_rate": 8.850587733014947e-09, "loss": 0.4622, "step": 50400 }, { "epoch": 2.7972260748959776, "grad_norm": 0.21632447838783264, "learning_rate": 8.755081663245345e-09, "loss": 0.4652, "step": 50420 }, { "epoch": 2.7983356449375867, "grad_norm": 0.3352177143096924, "learning_rate": 8.660084507951331e-09, "loss": 0.4699, "step": 50440 }, { "epoch": 2.7994452149791957, "grad_norm": 0.3686872720718384, "learning_rate": 8.565596467532715e-09, "loss": 0.458, "step": 50460 }, { "epoch": 2.8005547850208043, "grad_norm": 0.4523831605911255, "learning_rate": 8.471617741315245e-09, "loss": 0.5381, "step": 50480 }, { "epoch": 2.8016643550624134, "grad_norm": 0.30168017745018005, "learning_rate": 8.37814852755031e-09, "loss": 0.5196, "step": 50500 }, { "epoch": 2.802773925104022, "grad_norm": 0.37785762548446655, "learning_rate": 8.28518902341438e-09, "loss": 0.4645, "step": 50520 }, { "epoch": 2.803883495145631, "grad_norm": 0.2909519076347351, "learning_rate": 8.19273942500881e-09, "loss": 0.5273, "step": 50540 }, { "epoch": 2.80499306518724, "grad_norm": 0.30156806111335754, "learning_rate": 8.10079992735918e-09, "loss": 0.4077, "step": 50560 }, { "epoch": 2.8061026352288487, "grad_norm": 0.26899388432502747, "learning_rate": 8.009370724415015e-09, "loss": 0.4858, "step": 50580 }, { "epoch": 2.807212205270458, "grad_norm": 0.3285605311393738, "learning_rate": 7.918452009049447e-09, "loss": 0.4646, "step": 50600 }, { "epoch": 2.8083217753120664, "grad_norm": 0.23425298929214478, "learning_rate": 7.828043973058557e-09, "loss": 0.5356, "step": 50620 }, { "epoch": 2.8094313453536754, "grad_norm": 0.4782729744911194, "learning_rate": 7.738146807161255e-09, "loss": 0.6069, "step": 50640 }, { "epoch": 2.8105409153952845, "grad_norm": 0.23450233042240143, "learning_rate": 7.648760700998707e-09, "loss": 0.425, "step": 50660 }, { "epoch": 2.811650485436893, "grad_norm": 0.3057810664176941, "learning_rate": 7.559885843133968e-09, "loss": 0.4699, "step": 50680 }, { "epoch": 2.812760055478502, "grad_norm": 0.34332847595214844, "learning_rate": 7.471522421051618e-09, "loss": 0.477, "step": 50700 }, { "epoch": 2.8138696255201108, "grad_norm": 0.41291117668151855, "learning_rate": 7.383670621157412e-09, "loss": 0.4375, "step": 50720 }, { "epoch": 2.81497919556172, "grad_norm": 0.3535383939743042, "learning_rate": 7.296330628777658e-09, "loss": 0.4552, "step": 50740 }, { "epoch": 2.816088765603329, "grad_norm": 0.3838901221752167, "learning_rate": 7.209502628159142e-09, "loss": 0.4657, "step": 50760 }, { "epoch": 2.8171983356449375, "grad_norm": 0.3909391164779663, "learning_rate": 7.1231868024685116e-09, "loss": 0.5232, "step": 50780 }, { "epoch": 2.8183079056865465, "grad_norm": 0.33024367690086365, "learning_rate": 7.037383333791946e-09, "loss": 0.4702, "step": 50800 }, { "epoch": 2.819417475728155, "grad_norm": 0.26133590936660767, "learning_rate": 6.952092403134851e-09, "loss": 0.5234, "step": 50820 }, { "epoch": 2.820527045769764, "grad_norm": 0.2780231535434723, "learning_rate": 6.867314190421386e-09, "loss": 0.4387, "step": 50840 }, { "epoch": 2.8216366158113733, "grad_norm": 0.4538043737411499, "learning_rate": 6.783048874494129e-09, "loss": 0.5561, "step": 50860 }, { "epoch": 2.822746185852982, "grad_norm": 0.3469817042350769, "learning_rate": 6.69929663311361e-09, "loss": 0.4217, "step": 50880 }, { "epoch": 2.823855755894591, "grad_norm": 0.3227823078632355, "learning_rate": 6.616057642958167e-09, "loss": 0.3833, "step": 50900 }, { "epoch": 2.8249653259361995, "grad_norm": 0.43543004989624023, "learning_rate": 6.533332079623255e-09, "loss": 0.5223, "step": 50920 }, { "epoch": 2.8260748959778086, "grad_norm": 0.3666130602359772, "learning_rate": 6.451120117621306e-09, "loss": 0.539, "step": 50940 }, { "epoch": 2.8271844660194176, "grad_norm": 0.30034101009368896, "learning_rate": 6.3694219303813135e-09, "loss": 0.5399, "step": 50960 }, { "epoch": 2.8282940360610263, "grad_norm": 0.3210417330265045, "learning_rate": 6.288237690248388e-09, "loss": 0.4775, "step": 50980 }, { "epoch": 2.8294036061026353, "grad_norm": 0.2789684236049652, "learning_rate": 6.2075675684835075e-09, "loss": 0.5359, "step": 51000 }, { "epoch": 2.830513176144244, "grad_norm": 0.8644620180130005, "learning_rate": 6.1274117352630716e-09, "loss": 0.5753, "step": 51020 }, { "epoch": 2.831622746185853, "grad_norm": 0.2665315270423889, "learning_rate": 6.0477703596785725e-09, "loss": 0.5499, "step": 51040 }, { "epoch": 2.832732316227462, "grad_norm": 0.32534053921699524, "learning_rate": 5.968643609736257e-09, "loss": 0.5001, "step": 51060 }, { "epoch": 2.8338418862690706, "grad_norm": 0.34237363934516907, "learning_rate": 5.890031652356714e-09, "loss": 0.504, "step": 51080 }, { "epoch": 2.8349514563106797, "grad_norm": 0.21469056606292725, "learning_rate": 5.811934653374567e-09, "loss": 0.3941, "step": 51100 }, { "epoch": 2.8360610263522883, "grad_norm": 0.441135048866272, "learning_rate": 5.734352777538143e-09, "loss": 0.5122, "step": 51120 }, { "epoch": 2.8371705963938973, "grad_norm": 0.27943921089172363, "learning_rate": 5.657286188509081e-09, "loss": 0.4015, "step": 51140 }, { "epoch": 2.8382801664355064, "grad_norm": 0.3298356533050537, "learning_rate": 5.580735048862029e-09, "loss": 0.4523, "step": 51160 }, { "epoch": 2.839389736477115, "grad_norm": 0.30228736996650696, "learning_rate": 5.504699520084227e-09, "loss": 0.603, "step": 51180 }, { "epoch": 2.840499306518724, "grad_norm": 0.31227973103523254, "learning_rate": 5.4291797625752865e-09, "loss": 0.4986, "step": 51200 }, { "epoch": 2.8416088765603327, "grad_norm": 0.25893259048461914, "learning_rate": 5.354175935646688e-09, "loss": 0.5476, "step": 51220 }, { "epoch": 2.8427184466019417, "grad_norm": 0.3982257843017578, "learning_rate": 5.279688197521643e-09, "loss": 0.5335, "step": 51240 }, { "epoch": 2.843828016643551, "grad_norm": 0.4158751368522644, "learning_rate": 5.2057167053345675e-09, "loss": 0.465, "step": 51260 }, { "epoch": 2.8449375866851594, "grad_norm": 0.3218027651309967, "learning_rate": 5.132261615130862e-09, "loss": 0.5285, "step": 51280 }, { "epoch": 2.8460471567267684, "grad_norm": 0.32423874735832214, "learning_rate": 5.059323081866601e-09, "loss": 0.4603, "step": 51300 }, { "epoch": 2.847156726768377, "grad_norm": 0.24938726425170898, "learning_rate": 4.98690125940815e-09, "loss": 0.4171, "step": 51320 }, { "epoch": 2.848266296809986, "grad_norm": 0.40163180232048035, "learning_rate": 4.914996300531799e-09, "loss": 0.5131, "step": 51340 }, { "epoch": 2.849375866851595, "grad_norm": 0.3943190276622772, "learning_rate": 4.8436083569236004e-09, "loss": 0.5125, "step": 51360 }, { "epoch": 2.8504854368932038, "grad_norm": 0.5573268532752991, "learning_rate": 4.772737579178815e-09, "loss": 0.4755, "step": 51380 }, { "epoch": 2.851595006934813, "grad_norm": 0.47374945878982544, "learning_rate": 4.7023841168018495e-09, "loss": 0.4633, "step": 51400 }, { "epoch": 2.8527045769764214, "grad_norm": 0.43656882643699646, "learning_rate": 4.632548118205681e-09, "loss": 0.5299, "step": 51420 }, { "epoch": 2.8538141470180305, "grad_norm": 0.32251492142677307, "learning_rate": 4.563229730711854e-09, "loss": 0.5619, "step": 51440 }, { "epoch": 2.8549237170596395, "grad_norm": 0.29662254452705383, "learning_rate": 4.49442910054984e-09, "loss": 0.5264, "step": 51460 }, { "epoch": 2.856033287101248, "grad_norm": 0.8042982816696167, "learning_rate": 4.4261463728569315e-09, "loss": 0.5795, "step": 51480 }, { "epoch": 2.857142857142857, "grad_norm": 0.2703373432159424, "learning_rate": 4.358381691677931e-09, "loss": 0.5325, "step": 51500 }, { "epoch": 2.858252427184466, "grad_norm": 0.3735770881175995, "learning_rate": 4.291135199964768e-09, "loss": 0.4496, "step": 51520 }, { "epoch": 2.859361997226075, "grad_norm": 0.33529382944107056, "learning_rate": 4.224407039576244e-09, "loss": 0.4813, "step": 51540 }, { "epoch": 2.860471567267684, "grad_norm": 0.3222930133342743, "learning_rate": 4.1581973512776755e-09, "loss": 0.5761, "step": 51560 }, { "epoch": 2.8615811373092925, "grad_norm": 0.21736930310726166, "learning_rate": 4.092506274740726e-09, "loss": 0.5925, "step": 51580 }, { "epoch": 2.8626907073509016, "grad_norm": 0.7809433937072754, "learning_rate": 4.027333948542932e-09, "loss": 0.5683, "step": 51600 }, { "epoch": 2.86380027739251, "grad_norm": 0.3284660279750824, "learning_rate": 3.962680510167627e-09, "loss": 0.4051, "step": 51620 }, { "epoch": 2.8649098474341192, "grad_norm": 0.1596662998199463, "learning_rate": 3.89854609600343e-09, "loss": 0.4174, "step": 51640 }, { "epoch": 2.8660194174757283, "grad_norm": 0.249763622879982, "learning_rate": 3.834930841344119e-09, "loss": 0.5728, "step": 51660 }, { "epoch": 2.867128987517337, "grad_norm": 0.5664410591125488, "learning_rate": 3.771834880388264e-09, "loss": 0.5241, "step": 51680 }, { "epoch": 2.868238557558946, "grad_norm": 0.27903735637664795, "learning_rate": 3.709258346238947e-09, "loss": 0.4727, "step": 51700 }, { "epoch": 2.8693481276005546, "grad_norm": 0.3580375015735626, "learning_rate": 3.6472013709035464e-09, "loss": 0.4439, "step": 51720 }, { "epoch": 2.8704576976421636, "grad_norm": 0.38934141397476196, "learning_rate": 3.5856640852933994e-09, "loss": 0.5416, "step": 51740 }, { "epoch": 2.8715672676837727, "grad_norm": 0.2683168649673462, "learning_rate": 3.5246466192235256e-09, "loss": 0.5651, "step": 51760 }, { "epoch": 2.8726768377253813, "grad_norm": 0.2908162772655487, "learning_rate": 3.4641491014123224e-09, "loss": 0.4368, "step": 51780 }, { "epoch": 2.8737864077669903, "grad_norm": 0.3453535735607147, "learning_rate": 3.4041716594815085e-09, "loss": 0.4946, "step": 51800 }, { "epoch": 2.874895977808599, "grad_norm": 0.2649340033531189, "learning_rate": 3.344714419955458e-09, "loss": 0.4997, "step": 51820 }, { "epoch": 2.876005547850208, "grad_norm": 0.415551096200943, "learning_rate": 3.2857775082613115e-09, "loss": 0.4751, "step": 51840 }, { "epoch": 2.877115117891817, "grad_norm": 0.23369406163692474, "learning_rate": 3.227361048728533e-09, "loss": 0.5292, "step": 51860 }, { "epoch": 2.8782246879334257, "grad_norm": 0.28008100390434265, "learning_rate": 3.1694651645886026e-09, "loss": 0.4465, "step": 51880 }, { "epoch": 2.8793342579750347, "grad_norm": 0.3158840239048004, "learning_rate": 3.1120899779749354e-09, "loss": 0.6489, "step": 51900 }, { "epoch": 2.8804438280166433, "grad_norm": 0.3464593291282654, "learning_rate": 3.055235609922463e-09, "loss": 0.4581, "step": 51920 }, { "epoch": 2.8815533980582524, "grad_norm": 0.3101706802845001, "learning_rate": 2.998902180367413e-09, "loss": 0.4621, "step": 51940 }, { "epoch": 2.8826629680998614, "grad_norm": 0.2960689663887024, "learning_rate": 2.9430898081471144e-09, "loss": 0.4621, "step": 51960 }, { "epoch": 2.88377253814147, "grad_norm": 0.2680453062057495, "learning_rate": 2.8877986109996644e-09, "loss": 0.5489, "step": 51980 }, { "epoch": 2.884882108183079, "grad_norm": 0.25487950444221497, "learning_rate": 2.833028705563789e-09, "loss": 0.5897, "step": 52000 }, { "epoch": 2.8859916782246877, "grad_norm": 0.32062649726867676, "learning_rate": 2.7787802073784563e-09, "loss": 0.5899, "step": 52020 }, { "epoch": 2.8871012482662968, "grad_norm": 0.3527581989765167, "learning_rate": 2.7250532308827902e-09, "loss": 0.4854, "step": 52040 }, { "epoch": 2.888210818307906, "grad_norm": 0.30510571599006653, "learning_rate": 2.6718478894156572e-09, "loss": 0.5067, "step": 52060 }, { "epoch": 2.8893203883495144, "grad_norm": 0.32150235772132874, "learning_rate": 2.619164295215581e-09, "loss": 0.5021, "step": 52080 }, { "epoch": 2.8904299583911235, "grad_norm": 0.2723201811313629, "learning_rate": 2.5670025594204112e-09, "loss": 0.5872, "step": 52100 }, { "epoch": 2.891539528432732, "grad_norm": 0.3638918697834015, "learning_rate": 2.5153627920671274e-09, "loss": 0.4446, "step": 52120 }, { "epoch": 2.892649098474341, "grad_norm": 0.23144610226154327, "learning_rate": 2.4642451020916165e-09, "loss": 0.5607, "step": 52140 }, { "epoch": 2.89375866851595, "grad_norm": 0.22184501588344574, "learning_rate": 2.4136495973283986e-09, "loss": 0.4776, "step": 52160 }, { "epoch": 2.894868238557559, "grad_norm": 0.4573993682861328, "learning_rate": 2.3635763845104296e-09, "loss": 0.4748, "step": 52180 }, { "epoch": 2.895977808599168, "grad_norm": 0.26982975006103516, "learning_rate": 2.314025569268879e-09, "loss": 0.5204, "step": 52200 }, { "epoch": 2.8970873786407765, "grad_norm": 0.2678343653678894, "learning_rate": 2.2649972561329113e-09, "loss": 0.49, "step": 52220 }, { "epoch": 2.8981969486823855, "grad_norm": 0.461797297000885, "learning_rate": 2.2164915485294587e-09, "loss": 0.4908, "step": 52240 }, { "epoch": 2.8993065187239946, "grad_norm": 0.28805771470069885, "learning_rate": 2.1685085487829493e-09, "loss": 0.4627, "step": 52260 }, { "epoch": 2.900416088765603, "grad_norm": 0.33698198199272156, "learning_rate": 2.1210483581151904e-09, "loss": 0.5163, "step": 52280 }, { "epoch": 2.9015256588072122, "grad_norm": 0.42888399958610535, "learning_rate": 2.0741110766450686e-09, "loss": 0.4747, "step": 52300 }, { "epoch": 2.902635228848821, "grad_norm": 0.4557395577430725, "learning_rate": 2.0276968033884347e-09, "loss": 0.5147, "step": 52320 }, { "epoch": 2.90374479889043, "grad_norm": 0.27196362614631653, "learning_rate": 1.9818056362577996e-09, "loss": 0.4133, "step": 52340 }, { "epoch": 2.904854368932039, "grad_norm": 0.2950708270072937, "learning_rate": 1.936437672062141e-09, "loss": 0.5727, "step": 52360 }, { "epoch": 2.9059639389736476, "grad_norm": 0.34791111946105957, "learning_rate": 1.8915930065067365e-09, "loss": 0.3777, "step": 52380 }, { "epoch": 2.9070735090152566, "grad_norm": 0.37563133239746094, "learning_rate": 1.8472717341929955e-09, "loss": 0.3826, "step": 52400 }, { "epoch": 2.9081830790568652, "grad_norm": 0.34508243203163147, "learning_rate": 1.8034739486181282e-09, "loss": 0.513, "step": 52420 }, { "epoch": 2.9092926490984743, "grad_norm": 0.41222870349884033, "learning_rate": 1.760199742175089e-09, "loss": 0.47, "step": 52440 }, { "epoch": 2.9104022191400833, "grad_norm": 0.34041187167167664, "learning_rate": 1.717449206152327e-09, "loss": 0.6768, "step": 52460 }, { "epoch": 2.911511789181692, "grad_norm": 0.30594080686569214, "learning_rate": 1.6752224307335361e-09, "loss": 0.6169, "step": 52480 }, { "epoch": 2.912621359223301, "grad_norm": 0.2777078449726105, "learning_rate": 1.6335195049975992e-09, "loss": 0.5275, "step": 52500 }, { "epoch": 2.9137309292649096, "grad_norm": 0.27585068345069885, "learning_rate": 1.592340516918228e-09, "loss": 0.4655, "step": 52520 }, { "epoch": 2.9148404993065187, "grad_norm": 0.3215627670288086, "learning_rate": 1.5516855533639629e-09, "loss": 0.4625, "step": 52540 }, { "epoch": 2.9159500693481277, "grad_norm": 0.24131633341312408, "learning_rate": 1.5115547000978113e-09, "loss": 0.4845, "step": 52560 }, { "epoch": 2.9170596393897363, "grad_norm": 0.38613539934158325, "learning_rate": 1.4719480417772211e-09, "loss": 0.5244, "step": 52580 }, { "epoch": 2.9181692094313454, "grad_norm": 0.24121151864528656, "learning_rate": 1.4328656619538026e-09, "loss": 0.5572, "step": 52600 }, { "epoch": 2.919278779472954, "grad_norm": 0.2574670910835266, "learning_rate": 1.3943076430731614e-09, "loss": 0.4955, "step": 52620 }, { "epoch": 2.920388349514563, "grad_norm": 0.4427129328250885, "learning_rate": 1.3562740664747885e-09, "loss": 0.4747, "step": 52640 }, { "epoch": 2.921497919556172, "grad_norm": 0.2854710817337036, "learning_rate": 1.3187650123918092e-09, "loss": 0.4278, "step": 52660 }, { "epoch": 2.9226074895977807, "grad_norm": 0.2698379456996918, "learning_rate": 1.2817805599509014e-09, "loss": 0.4048, "step": 52680 }, { "epoch": 2.9237170596393898, "grad_norm": 0.8392530083656311, "learning_rate": 1.2453207871720161e-09, "loss": 0.5113, "step": 52700 }, { "epoch": 2.9248266296809984, "grad_norm": 0.4941146671772003, "learning_rate": 1.2093857709682965e-09, "loss": 0.5569, "step": 52720 }, { "epoch": 2.9259361997226074, "grad_norm": 0.25422412157058716, "learning_rate": 1.173975587145909e-09, "loss": 0.4504, "step": 52740 }, { "epoch": 2.9270457697642165, "grad_norm": 0.38003239035606384, "learning_rate": 1.1390903104038506e-09, "loss": 0.4404, "step": 52760 }, { "epoch": 2.928155339805825, "grad_norm": 0.3107146620750427, "learning_rate": 1.1047300143338379e-09, "loss": 0.4832, "step": 52780 }, { "epoch": 2.929264909847434, "grad_norm": 0.4226890504360199, "learning_rate": 1.0708947714200557e-09, "loss": 0.4564, "step": 52800 }, { "epoch": 2.9303744798890428, "grad_norm": 0.746756374835968, "learning_rate": 1.0375846530391586e-09, "loss": 0.5057, "step": 52820 }, { "epoch": 2.931484049930652, "grad_norm": 0.4394928514957428, "learning_rate": 1.004799729459993e-09, "loss": 0.505, "step": 52840 }, { "epoch": 2.932593619972261, "grad_norm": 0.3593466281890869, "learning_rate": 9.725400698434583e-10, "loss": 0.4402, "step": 52860 }, { "epoch": 2.9337031900138695, "grad_norm": 0.3878641724586487, "learning_rate": 9.408057422424232e-10, "loss": 0.5035, "step": 52880 }, { "epoch": 2.9348127600554785, "grad_norm": 0.26895925402641296, "learning_rate": 9.09596813601532e-10, "loss": 0.5122, "step": 52900 }, { "epoch": 2.935922330097087, "grad_norm": 0.3767130672931671, "learning_rate": 8.789133497571488e-10, "loss": 0.434, "step": 52920 }, { "epoch": 2.937031900138696, "grad_norm": 0.22045975923538208, "learning_rate": 8.487554154370524e-10, "loss": 0.5476, "step": 52940 }, { "epoch": 2.9381414701803052, "grad_norm": 0.24776870012283325, "learning_rate": 8.19123074260436e-10, "loss": 0.5232, "step": 52960 }, { "epoch": 2.939251040221914, "grad_norm": 0.6471778154373169, "learning_rate": 7.900163887377964e-10, "loss": 0.4766, "step": 52980 }, { "epoch": 2.940360610263523, "grad_norm": 0.2411019653081894, "learning_rate": 7.614354202706564e-10, "loss": 0.4448, "step": 53000 }, { "epoch": 2.9414701803051315, "grad_norm": 0.3050864338874817, "learning_rate": 7.333802291515645e-10, "loss": 0.5739, "step": 53020 }, { "epoch": 2.9425797503467406, "grad_norm": 0.3716188371181488, "learning_rate": 7.058508745639014e-10, "loss": 0.4181, "step": 53040 }, { "epoch": 2.9436893203883496, "grad_norm": 0.2360864281654358, "learning_rate": 6.788474145818234e-10, "loss": 0.5082, "step": 53060 }, { "epoch": 2.9447988904299582, "grad_norm": 0.39952409267425537, "learning_rate": 6.523699061699861e-10, "loss": 0.4608, "step": 53080 }, { "epoch": 2.9459084604715673, "grad_norm": 0.2753463387489319, "learning_rate": 6.264184051837096e-10, "loss": 0.5093, "step": 53100 }, { "epoch": 2.947018030513176, "grad_norm": 0.3914014995098114, "learning_rate": 6.009929663685076e-10, "loss": 0.4631, "step": 53120 }, { "epoch": 2.948127600554785, "grad_norm": 0.2853849232196808, "learning_rate": 5.760936433602259e-10, "loss": 0.439, "step": 53140 }, { "epoch": 2.949237170596394, "grad_norm": 0.4752857983112335, "learning_rate": 5.517204886848758e-10, "loss": 0.6328, "step": 53160 }, { "epoch": 2.9503467406380026, "grad_norm": 0.381692111492157, "learning_rate": 5.2787355375844e-10, "loss": 0.4876, "step": 53180 }, { "epoch": 2.9514563106796117, "grad_norm": 0.28281769156455994, "learning_rate": 5.045528888868722e-10, "loss": 0.5335, "step": 53200 }, { "epoch": 2.9525658807212203, "grad_norm": 0.4425702393054962, "learning_rate": 4.817585432659032e-10, "loss": 0.4976, "step": 53220 }, { "epoch": 2.9536754507628293, "grad_norm": 0.2663850486278534, "learning_rate": 4.5949056498098546e-10, "loss": 0.527, "step": 53240 }, { "epoch": 2.9547850208044384, "grad_norm": 0.24688832461833954, "learning_rate": 4.3774900100720934e-10, "loss": 0.463, "step": 53260 }, { "epoch": 2.955894590846047, "grad_norm": 0.3755951225757599, "learning_rate": 4.1653389720916474e-10, "loss": 0.4728, "step": 53280 }, { "epoch": 2.957004160887656, "grad_norm": 0.3009140193462372, "learning_rate": 3.958452983408023e-10, "loss": 0.5299, "step": 53300 }, { "epoch": 2.9581137309292647, "grad_norm": 0.28627830743789673, "learning_rate": 3.7568324804543305e-10, "loss": 0.472, "step": 53320 }, { "epoch": 2.9592233009708737, "grad_norm": 0.43121597170829773, "learning_rate": 3.5604778885564567e-10, "loss": 0.4318, "step": 53340 }, { "epoch": 2.9603328710124828, "grad_norm": 0.5034205317497253, "learning_rate": 3.369389621931118e-10, "loss": 0.595, "step": 53360 }, { "epoch": 2.9614424410540914, "grad_norm": 0.2886047065258026, "learning_rate": 3.1835680836850285e-10, "loss": 0.4757, "step": 53380 }, { "epoch": 2.9625520110957004, "grad_norm": 0.2768586575984955, "learning_rate": 3.0030136658157343e-10, "loss": 0.5009, "step": 53400 }, { "epoch": 2.963661581137309, "grad_norm": 0.2947545647621155, "learning_rate": 2.827726749208559e-10, "loss": 0.6028, "step": 53420 }, { "epoch": 2.964771151178918, "grad_norm": 0.2857181131839752, "learning_rate": 2.6577077036377126e-10, "loss": 0.5453, "step": 53440 }, { "epoch": 2.965880721220527, "grad_norm": 0.4124162197113037, "learning_rate": 2.492956887764075e-10, "loss": 0.492, "step": 53460 }, { "epoch": 2.9669902912621358, "grad_norm": 0.30150046944618225, "learning_rate": 2.333474649135192e-10, "loss": 0.4439, "step": 53480 }, { "epoch": 2.968099861303745, "grad_norm": 0.41767415404319763, "learning_rate": 2.1792613241841673e-10, "loss": 0.4787, "step": 53500 }, { "epoch": 2.9692094313453534, "grad_norm": 0.2768767774105072, "learning_rate": 2.0303172382293843e-10, "loss": 0.4596, "step": 53520 }, { "epoch": 2.9703190013869625, "grad_norm": 0.34780144691467285, "learning_rate": 1.886642705473951e-10, "loss": 0.4925, "step": 53540 }, { "epoch": 2.9714285714285715, "grad_norm": 0.3510110080242157, "learning_rate": 1.7482380290034792e-10, "loss": 0.4787, "step": 53560 }, { "epoch": 2.97253814147018, "grad_norm": 0.4793170094490051, "learning_rate": 1.6151035007883062e-10, "loss": 0.4023, "step": 53580 }, { "epoch": 2.973647711511789, "grad_norm": 0.277187705039978, "learning_rate": 1.4872394016796076e-10, "loss": 0.5901, "step": 53600 }, { "epoch": 2.974757281553398, "grad_norm": 0.3932417631149292, "learning_rate": 1.3646460014113403e-10, "loss": 0.4297, "step": 53620 }, { "epoch": 2.975866851595007, "grad_norm": 0.28129783272743225, "learning_rate": 1.2473235585983012e-10, "loss": 0.4588, "step": 53640 }, { "epoch": 2.976976421636616, "grad_norm": 0.5752376914024353, "learning_rate": 1.13527232073668e-10, "loss": 0.5642, "step": 53660 }, { "epoch": 2.978085991678225, "grad_norm": 0.24046343564987183, "learning_rate": 1.028492524202118e-10, "loss": 0.5053, "step": 53680 }, { "epoch": 2.9791955617198336, "grad_norm": 0.28925299644470215, "learning_rate": 9.269843942505407e-11, "loss": 0.4333, "step": 53700 }, { "epoch": 2.980305131761442, "grad_norm": 0.352924644947052, "learning_rate": 8.30748145016491e-11, "loss": 0.4762, "step": 53720 }, { "epoch": 2.9814147018030512, "grad_norm": 0.39334508776664734, "learning_rate": 7.397839795139637e-11, "loss": 0.614, "step": 53740 }, { "epoch": 2.9825242718446603, "grad_norm": 0.3364826440811157, "learning_rate": 6.54092089634739e-11, "loss": 0.4795, "step": 53760 }, { "epoch": 2.9836338418862693, "grad_norm": 0.30030718445777893, "learning_rate": 5.7367265614893803e-11, "loss": 0.4914, "step": 53780 }, { "epoch": 2.984743411927878, "grad_norm": 0.4692116975784302, "learning_rate": 4.985258487041899e-11, "loss": 0.462, "step": 53800 }, { "epoch": 2.9858529819694866, "grad_norm": 0.22229287028312683, "learning_rate": 4.286518258250771e-11, "loss": 0.4893, "step": 53820 }, { "epoch": 2.9869625520110956, "grad_norm": 0.2507055103778839, "learning_rate": 3.640507349134125e-11, "loss": 0.4693, "step": 53840 }, { "epoch": 2.9880721220527047, "grad_norm": 0.4118461608886719, "learning_rate": 3.0472271224768474e-11, "loss": 0.4531, "step": 53860 }, { "epoch": 2.9891816920943137, "grad_norm": 0.3519735038280487, "learning_rate": 2.506678829819475e-11, "loss": 0.5306, "step": 53880 }, { "epoch": 2.9902912621359223, "grad_norm": 0.3333680331707001, "learning_rate": 2.0188636114693013e-11, "loss": 0.4896, "step": 53900 }, { "epoch": 2.991400832177531, "grad_norm": 0.31058236956596375, "learning_rate": 1.583782496489272e-11, "loss": 0.56, "step": 53920 }, { "epoch": 2.99251040221914, "grad_norm": 0.37107500433921814, "learning_rate": 1.2014364026979862e-11, "loss": 0.523, "step": 53940 }, { "epoch": 2.993619972260749, "grad_norm": 0.5263276696205139, "learning_rate": 8.7182613666692e-12, "loss": 0.4493, "step": 53960 }, { "epoch": 2.994729542302358, "grad_norm": 0.2217492312192917, "learning_rate": 5.9495239372042705e-12, "loss": 0.4349, "step": 53980 }, { "epoch": 2.9958391123439667, "grad_norm": 0.29085150361061096, "learning_rate": 3.708157579357385e-12, "loss": 0.4581, "step": 54000 }, { "epoch": 2.9969486823855753, "grad_norm": 0.30060455203056335, "learning_rate": 1.9941670213463603e-12, "loss": 0.5357, "step": 54020 }, { "epoch": 2.9980582524271844, "grad_norm": 0.3470294773578644, "learning_rate": 8.075558788900316e-13, "loss": 0.4781, "step": 54040 }, { "epoch": 2.9991678224687934, "grad_norm": 0.29062414169311523, "learning_rate": 1.4832665518049737e-13, "loss": 0.464, "step": 54060 }, { "epoch": 3.0, "step": 54075, "total_flos": 1.3318012341259592e+18, "train_loss": 0.5598785406739636, "train_runtime": 54928.4731, "train_samples_per_second": 0.984, "train_steps_per_second": 0.984 } ], "logging_steps": 20, "max_steps": 54075, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3318012341259592e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }