diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,8970 +1,2948 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.9992804029743345, - "eval_steps": 1000, - "global_step": 6252, + "epoch": 2.9964020148716717, + "eval_steps": 1400, + "global_step": 2082, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0023986567522187576, - "grad_norm": 4.928963404047604, - "learning_rate": 7.987220447284345e-08, - "loss": 1.1911, + "epoch": 0.007195970256656272, + "grad_norm": 3.382328658618869, + "learning_rate": 1.9138755980861244e-07, + "loss": 1.1034, "step": 5 }, { - "epoch": 0.004797313504437515, - "grad_norm": 6.084456595330925, - "learning_rate": 1.597444089456869e-07, - "loss": 1.1865, + "epoch": 0.014391940513312544, + "grad_norm": 2.9332510464158155, + "learning_rate": 3.827751196172249e-07, + "loss": 1.1248, "step": 10 }, { - "epoch": 0.007195970256656272, - "grad_norm": 4.612988319344874, - "learning_rate": 2.396166134185304e-07, - "loss": 1.1439, + "epoch": 0.021587910769968816, + "grad_norm": 2.8021125575132175, + "learning_rate": 5.741626794258373e-07, + "loss": 1.0979, "step": 15 }, { - "epoch": 0.00959462700887503, - "grad_norm": 4.7903715576713894, - "learning_rate": 3.194888178913738e-07, - "loss": 1.1287, + "epoch": 0.02878388102662509, + "grad_norm": 2.5443691578126004, + "learning_rate": 7.655502392344498e-07, + "loss": 1.1128, "step": 20 }, { - "epoch": 0.011993283761093787, - "grad_norm": 5.511263638761197, - "learning_rate": 3.993610223642173e-07, - "loss": 1.212, + "epoch": 0.035979851283281364, + "grad_norm": 2.823918046213155, + "learning_rate": 9.569377990430622e-07, + "loss": 1.0726, "step": 25 }, { - "epoch": 0.014391940513312544, - "grad_norm": 4.364651246737439, - "learning_rate": 4.792332268370608e-07, - "loss": 1.2221, + "epoch": 0.04317582153993763, + "grad_norm": 2.0326907547127124, + "learning_rate": 1.1483253588516746e-06, + "loss": 1.0043, "step": 30 }, { - "epoch": 0.016790597265531303, - "grad_norm": 14.508142045947785, - "learning_rate": 5.591054313099041e-07, - "loss": 1.1268, + "epoch": 0.05037179179659391, + "grad_norm": 2.168998278242428, + "learning_rate": 1.339712918660287e-06, + "loss": 1.0079, "step": 35 }, { - "epoch": 0.01918925401775006, - "grad_norm": 3.618955442205435, - "learning_rate": 6.389776357827476e-07, - "loss": 1.0991, + "epoch": 0.05756776205325018, + "grad_norm": 2.152879633190319, + "learning_rate": 1.5311004784688995e-06, + "loss": 1.0086, "step": 40 }, { - "epoch": 0.021587910769968816, - "grad_norm": 3.3756865136406042, - "learning_rate": 7.188498402555912e-07, - "loss": 1.2278, + "epoch": 0.06476373230990645, + "grad_norm": 1.7771809680542394, + "learning_rate": 1.722488038277512e-06, + "loss": 0.9862, "step": 45 }, { - "epoch": 0.023986567522187575, - "grad_norm": 9.702984255699223, - "learning_rate": 7.987220447284346e-07, - "loss": 1.201, + "epoch": 0.07195970256656273, + "grad_norm": 1.6796335234145752, + "learning_rate": 1.9138755980861244e-06, + "loss": 0.9757, "step": 50 }, { - "epoch": 0.026385224274406333, - "grad_norm": 5.04285882630285, - "learning_rate": 8.78594249201278e-07, - "loss": 1.0465, + "epoch": 0.079155672823219, + "grad_norm": 1.7111004529632545, + "learning_rate": 2.1052631578947366e-06, + "loss": 0.9067, "step": 55 }, { - "epoch": 0.02878388102662509, - "grad_norm": 4.782314822096669, - "learning_rate": 9.584664536741215e-07, - "loss": 1.1928, + "epoch": 0.08635164307987526, + "grad_norm": 2.1161929062800184, + "learning_rate": 2.2966507177033493e-06, + "loss": 0.9534, "step": 60 }, { - "epoch": 0.031182537778843847, - "grad_norm": 4.72671053943802, - "learning_rate": 1.038338658146965e-06, - "loss": 1.2059, + "epoch": 0.09354761333653154, + "grad_norm": 1.739361359363285, + "learning_rate": 2.4880382775119615e-06, + "loss": 0.9368, "step": 65 }, { - "epoch": 0.033581194531062605, - "grad_norm": 6.921221031777395, - "learning_rate": 1.1182108626198083e-06, - "loss": 0.9764, + "epoch": 0.10074358359318782, + "grad_norm": 1.9645645051440501, + "learning_rate": 2.679425837320574e-06, + "loss": 0.8762, "step": 70 }, { - "epoch": 0.035979851283281364, - "grad_norm": 4.2701184749105785, - "learning_rate": 1.198083067092652e-06, - "loss": 1.091, + "epoch": 0.10793955384984409, + "grad_norm": 2.291876148211727, + "learning_rate": 2.8708133971291864e-06, + "loss": 0.9079, "step": 75 }, { - "epoch": 0.03837850803550012, - "grad_norm": 4.417378125112381, - "learning_rate": 1.2779552715654952e-06, - "loss": 0.9595, + "epoch": 0.11513552410650035, + "grad_norm": 1.9350312982517304, + "learning_rate": 3.062200956937799e-06, + "loss": 0.9063, "step": 80 }, { - "epoch": 0.04077716478771888, - "grad_norm": 6.151705197956376, - "learning_rate": 1.3578274760383387e-06, - "loss": 1.031, + "epoch": 0.12233149436315663, + "grad_norm": 1.723686001407234, + "learning_rate": 3.2535885167464113e-06, + "loss": 0.9141, "step": 85 }, { - "epoch": 0.04317582153993763, - "grad_norm": 4.20485503693391, - "learning_rate": 1.4376996805111824e-06, - "loss": 0.9779, + "epoch": 0.1295274646198129, + "grad_norm": 1.7291796951049525, + "learning_rate": 3.444976076555024e-06, + "loss": 0.8852, "step": 90 }, { - "epoch": 0.04557447829215639, - "grad_norm": 7.172834781189588, - "learning_rate": 1.5175718849840257e-06, - "loss": 1.0034, + "epoch": 0.13672343487646918, + "grad_norm": 1.5591618462436518, + "learning_rate": 3.636363636363636e-06, + "loss": 0.8812, "step": 95 }, { - "epoch": 0.04797313504437515, - "grad_norm": 3.5852162346181022, - "learning_rate": 1.5974440894568691e-06, - "loss": 1.0194, - "step": 100 - }, - { - "epoch": 0.04797313504437515, - "eval_loss": 1.0799548625946045, - "eval_runtime": 759.5931, - "eval_samples_per_second": 9.755, - "eval_steps_per_second": 0.611, + "epoch": 0.14391940513312546, + "grad_norm": 2.5857950863717596, + "learning_rate": 3.827751196172249e-06, + "loss": 0.8719, "step": 100 }, { - "epoch": 0.05037179179659391, - "grad_norm": 3.2060401406256314, - "learning_rate": 1.6773162939297128e-06, - "loss": 0.994, + "epoch": 0.15111537538978173, + "grad_norm": 1.882950492021297, + "learning_rate": 4.019138755980861e-06, + "loss": 0.923, "step": 105 }, { - "epoch": 0.052770448548812667, - "grad_norm": 3.8285067274755638, - "learning_rate": 1.757188498402556e-06, - "loss": 0.9704, + "epoch": 0.158311345646438, + "grad_norm": 1.9095611162862844, + "learning_rate": 4.210526315789473e-06, + "loss": 0.8466, "step": 110 }, { - "epoch": 0.055169105301031425, - "grad_norm": 4.008540740919368, - "learning_rate": 1.8370607028753996e-06, - "loss": 1.0667, + "epoch": 0.16550731590309425, + "grad_norm": 1.7349719997638062, + "learning_rate": 4.4019138755980855e-06, + "loss": 0.8433, "step": 115 }, { - "epoch": 0.05756776205325018, - "grad_norm": 4.778094922475321, - "learning_rate": 1.916932907348243e-06, - "loss": 0.9591, + "epoch": 0.17270328615975053, + "grad_norm": 1.6057643525733838, + "learning_rate": 4.5933014354066986e-06, + "loss": 0.8901, "step": 120 }, { - "epoch": 0.059966418805468935, - "grad_norm": 5.3861654284616955, - "learning_rate": 1.9968051118210867e-06, - "loss": 0.9443, + "epoch": 0.1798992564164068, + "grad_norm": 1.636440080236875, + "learning_rate": 4.784688995215311e-06, + "loss": 0.9158, "step": 125 }, { - "epoch": 0.062365075557687694, - "grad_norm": 4.224941464907675, - "learning_rate": 2.07667731629393e-06, - "loss": 0.932, + "epoch": 0.18709522667306308, + "grad_norm": 1.7680919340453238, + "learning_rate": 4.976076555023923e-06, + "loss": 0.9012, "step": 130 }, { - "epoch": 0.06476373230990645, - "grad_norm": 4.606013084447571, - "learning_rate": 2.1565495207667733e-06, - "loss": 0.9077, + "epoch": 0.19429119692971936, + "grad_norm": 1.73915276490131, + "learning_rate": 5.167464114832536e-06, + "loss": 0.8372, "step": 135 }, { - "epoch": 0.06716238906212521, - "grad_norm": 5.077386951712394, - "learning_rate": 2.2364217252396165e-06, - "loss": 0.8907, + "epoch": 0.20148716718637563, + "grad_norm": 1.4761634523053184, + "learning_rate": 5.358851674641148e-06, + "loss": 0.8631, "step": 140 }, { - "epoch": 0.06956104581434397, - "grad_norm": 3.8083760700058287, - "learning_rate": 2.3162939297124602e-06, - "loss": 0.9614, + "epoch": 0.2086831374430319, + "grad_norm": 1.7615083334015453, + "learning_rate": 5.5502392344497606e-06, + "loss": 0.8275, "step": 145 }, { - "epoch": 0.07195970256656273, - "grad_norm": 3.3687263984866314, - "learning_rate": 2.396166134185304e-06, - "loss": 0.9853, + "epoch": 0.21587910769968818, + "grad_norm": 1.5283874292360122, + "learning_rate": 5.741626794258373e-06, + "loss": 0.8625, "step": 150 }, { - "epoch": 0.07435835931878149, - "grad_norm": 3.9276886090521725, - "learning_rate": 2.476038338658147e-06, - "loss": 0.8836, + "epoch": 0.22307507795634446, + "grad_norm": 1.6114784875621726, + "learning_rate": 5.933014354066985e-06, + "loss": 0.891, "step": 155 }, { - "epoch": 0.07675701607100024, - "grad_norm": 4.504046079167171, - "learning_rate": 2.5559105431309904e-06, - "loss": 0.8545, + "epoch": 0.2302710482130007, + "grad_norm": 1.6981117694270553, + "learning_rate": 6.124401913875598e-06, + "loss": 0.8548, "step": 160 }, { - "epoch": 0.079155672823219, - "grad_norm": 3.33674905835483, - "learning_rate": 2.6357827476038337e-06, - "loss": 0.8817, + "epoch": 0.23746701846965698, + "grad_norm": 1.7093051041039082, + "learning_rate": 6.31578947368421e-06, + "loss": 0.8953, "step": 165 }, { - "epoch": 0.08155432957543776, - "grad_norm": 4.009856578725833, - "learning_rate": 2.7156549520766774e-06, - "loss": 0.9024, + "epoch": 0.24466298872631326, + "grad_norm": 1.6829774560988056, + "learning_rate": 6.5071770334928226e-06, + "loss": 0.7894, "step": 170 }, { - "epoch": 0.0839529863276565, - "grad_norm": 2.9677427686921694, - "learning_rate": 2.795527156549521e-06, - "loss": 0.9347, + "epoch": 0.25185895898296956, + "grad_norm": 1.7224253982363609, + "learning_rate": 6.698564593301436e-06, + "loss": 0.9342, "step": 175 }, { - "epoch": 0.08635164307987526, - "grad_norm": 6.375386023546387, - "learning_rate": 2.8753993610223648e-06, - "loss": 0.9421, + "epoch": 0.2590549292396258, + "grad_norm": 1.718825453842612, + "learning_rate": 6.889952153110048e-06, + "loss": 0.8765, "step": 180 }, { - "epoch": 0.08875029983209402, - "grad_norm": 3.33944743041201, - "learning_rate": 2.955271565495208e-06, - "loss": 0.9086, + "epoch": 0.26625089949628206, + "grad_norm": 1.8299643097592468, + "learning_rate": 7.081339712918659e-06, + "loss": 0.8562, "step": 185 }, { - "epoch": 0.09114895658431278, - "grad_norm": 4.394097556098567, - "learning_rate": 3.0351437699680513e-06, - "loss": 0.7906, + "epoch": 0.27344686975293836, + "grad_norm": 1.9650794343564884, + "learning_rate": 7.272727272727272e-06, + "loss": 0.8468, "step": 190 }, { - "epoch": 0.09354761333653154, - "grad_norm": 9.160425752039988, - "learning_rate": 3.1150159744408946e-06, - "loss": 0.9486, + "epoch": 0.2806428400095946, + "grad_norm": 1.6665409567649216, + "learning_rate": 7.4641148325358846e-06, + "loss": 0.8394, "step": 195 }, { - "epoch": 0.0959462700887503, - "grad_norm": 3.571966750853734, - "learning_rate": 3.1948881789137383e-06, - "loss": 0.8006, - "step": 200 - }, - { - "epoch": 0.0959462700887503, - "eval_loss": 0.8958361148834229, - "eval_runtime": 739.3588, - "eval_samples_per_second": 10.022, - "eval_steps_per_second": 0.628, + "epoch": 0.2878388102662509, + "grad_norm": 1.6178516494615538, + "learning_rate": 7.655502392344498e-06, + "loss": 0.9063, "step": 200 }, { - "epoch": 0.09834492684096906, - "grad_norm": 4.8180626358321295, - "learning_rate": 3.2747603833865815e-06, - "loss": 0.8072, + "epoch": 0.29503478052290716, + "grad_norm": 1.737750416660729, + "learning_rate": 7.84688995215311e-06, + "loss": 0.8376, "step": 205 }, { - "epoch": 0.10074358359318782, - "grad_norm": 3.7249040169973715, - "learning_rate": 3.3546325878594257e-06, - "loss": 0.9298, + "epoch": 0.30223075077956346, + "grad_norm": 1.6775958248025469, + "learning_rate": 7.9999943732958e-06, + "loss": 0.8839, "step": 210 }, { - "epoch": 0.10314224034540657, - "grad_norm": 4.440940528532116, - "learning_rate": 3.434504792332269e-06, - "loss": 0.9033, + "epoch": 0.3094267210362197, + "grad_norm": 1.6351855660641077, + "learning_rate": 7.999797440310976e-06, + "loss": 0.8138, "step": 215 }, { - "epoch": 0.10554089709762533, - "grad_norm": 2.7840742197428217, - "learning_rate": 3.514376996805112e-06, - "loss": 0.9216, + "epoch": 0.316622691292876, + "grad_norm": 1.5160913628770296, + "learning_rate": 7.999319187945908e-06, + "loss": 0.8634, "step": 220 }, { - "epoch": 0.10793955384984409, - "grad_norm": 4.303196845314569, - "learning_rate": 3.5942492012779555e-06, - "loss": 0.8446, + "epoch": 0.32381866154953226, + "grad_norm": 1.5346049341737504, + "learning_rate": 7.998559649837715e-06, + "loss": 0.8777, "step": 225 }, { - "epoch": 0.11033821060206285, - "grad_norm": 3.21207580531984, - "learning_rate": 3.674121405750799e-06, - "loss": 0.8883, + "epoch": 0.3310146318061885, + "grad_norm": 1.7202200592837338, + "learning_rate": 7.997518879407302e-06, + "loss": 0.9041, "step": 230 }, { - "epoch": 0.11273686735428161, - "grad_norm": 5.207754700223726, - "learning_rate": 3.7539936102236424e-06, - "loss": 0.8105, + "epoch": 0.3382106020628448, + "grad_norm": 1.6194787415884186, + "learning_rate": 7.996196949855597e-06, + "loss": 0.8567, "step": 235 }, { - "epoch": 0.11513552410650035, - "grad_norm": 4.057648237886164, - "learning_rate": 3.833865814696486e-06, - "loss": 0.9013, + "epoch": 0.34540657231950106, + "grad_norm": 1.5761200757620502, + "learning_rate": 7.994593954158409e-06, + "loss": 0.8683, "step": 240 }, { - "epoch": 0.11753418085871911, - "grad_norm": 3.1683543571860713, - "learning_rate": 3.913738019169329e-06, - "loss": 0.8875, + "epoch": 0.35260254257615736, + "grad_norm": 1.6216799441610352, + "learning_rate": 7.992710005059886e-06, + "loss": 0.8718, "step": 245 }, { - "epoch": 0.11993283761093787, - "grad_norm": 4.234833266114255, - "learning_rate": 3.9936102236421735e-06, - "loss": 0.8656, + "epoch": 0.3597985128328136, + "grad_norm": 1.4982517397299326, + "learning_rate": 7.990545235064588e-06, + "loss": 0.8491, "step": 250 }, { - "epoch": 0.12233149436315663, - "grad_norm": 2.909152649590802, - "learning_rate": 4.073482428115016e-06, - "loss": 0.8178, + "epoch": 0.3669944830894699, + "grad_norm": 1.7915522189289803, + "learning_rate": 7.988099796428161e-06, + "loss": 0.8546, "step": 255 }, { - "epoch": 0.12473015111537539, - "grad_norm": 2.751539094221403, - "learning_rate": 4.15335463258786e-06, - "loss": 0.9129, + "epoch": 0.37419045334612616, + "grad_norm": 1.6209807612052567, + "learning_rate": 7.985373861146636e-06, + "loss": 0.8112, "step": 260 }, { - "epoch": 0.12712880786759415, - "grad_norm": 3.0547647847167974, - "learning_rate": 4.233226837060703e-06, - "loss": 0.8499, + "epoch": 0.38138642360278247, + "grad_norm": 1.5354364067126935, + "learning_rate": 7.98236762094433e-06, + "loss": 0.8484, "step": 265 }, { - "epoch": 0.1295274646198129, - "grad_norm": 2.3819103654972023, - "learning_rate": 4.3130990415335465e-06, - "loss": 0.8459, + "epoch": 0.3885823938594387, + "grad_norm": 1.7666681382505078, + "learning_rate": 7.979081287260356e-06, + "loss": 0.8752, "step": 270 }, { - "epoch": 0.13192612137203166, - "grad_norm": 3.326228240795246, - "learning_rate": 4.39297124600639e-06, - "loss": 0.9211, + "epoch": 0.39577836411609496, + "grad_norm": 1.6458650430939799, + "learning_rate": 7.975515091233757e-06, + "loss": 0.8294, "step": 275 }, { - "epoch": 0.13432477812425042, - "grad_norm": 3.846786293390378, - "learning_rate": 4.472843450479233e-06, - "loss": 0.7904, + "epoch": 0.40297433437275126, + "grad_norm": 1.7589737639189482, + "learning_rate": 7.971669283687252e-06, + "loss": 0.8269, "step": 280 }, { - "epoch": 0.13672343487646918, - "grad_norm": 2.856338249786616, - "learning_rate": 4.552715654952077e-06, - "loss": 0.7958, + "epoch": 0.4101703046294075, + "grad_norm": 1.597742594232911, + "learning_rate": 7.967544135109583e-06, + "loss": 0.8873, "step": 285 }, { - "epoch": 0.13912209162868794, - "grad_norm": 3.452725703157698, - "learning_rate": 4.6325878594249205e-06, - "loss": 0.8353, + "epoch": 0.4173662748860638, + "grad_norm": 1.516759424294559, + "learning_rate": 7.963139935636505e-06, + "loss": 0.8162, "step": 290 }, { - "epoch": 0.1415207483809067, - "grad_norm": 2.8816531878068066, - "learning_rate": 4.712460063897764e-06, - "loss": 0.891, + "epoch": 0.42456224514272006, + "grad_norm": 1.6997534481298684, + "learning_rate": 7.958456995030372e-06, + "loss": 0.8202, "step": 295 }, { - "epoch": 0.14391940513312546, - "grad_norm": 4.1723661091130415, - "learning_rate": 4.792332268370608e-06, - "loss": 0.8183, - "step": 300 - }, - { - "epoch": 0.14391940513312546, - "eval_loss": 0.8849568963050842, - "eval_runtime": 741.1905, - "eval_samples_per_second": 9.997, - "eval_steps_per_second": 0.626, + "epoch": 0.43175821539937637, + "grad_norm": 1.726622337993047, + "learning_rate": 7.95349564265835e-06, + "loss": 0.8456, "step": 300 }, { - "epoch": 0.1463180618853442, - "grad_norm": 2.5376644618127506, - "learning_rate": 4.872204472843451e-06, - "loss": 0.8931, + "epoch": 0.4389541856560326, + "grad_norm": 1.647760739349251, + "learning_rate": 7.94825622746925e-06, + "loss": 0.8648, "step": 305 }, { - "epoch": 0.14871671863756297, - "grad_norm": 16.404073305191517, - "learning_rate": 4.952076677316294e-06, - "loss": 0.8317, + "epoch": 0.4461501559126889, + "grad_norm": 1.7211951131144776, + "learning_rate": 7.942739117968995e-06, + "loss": 0.8272, "step": 310 }, { - "epoch": 0.15111537538978173, - "grad_norm": 5.422899883781751, - "learning_rate": 5.031948881789138e-06, - "loss": 1.0199, + "epoch": 0.45334612616934516, + "grad_norm": 1.5335036085040443, + "learning_rate": 7.936944702194691e-06, + "loss": 0.878, "step": 315 }, { - "epoch": 0.1535140321420005, - "grad_norm": 3.618606897676806, - "learning_rate": 5.111821086261981e-06, - "loss": 0.8023, + "epoch": 0.4605420964260014, + "grad_norm": 1.4848768316508538, + "learning_rate": 7.93087338768734e-06, + "loss": 0.8456, "step": 320 }, { - "epoch": 0.15591268889421925, - "grad_norm": 3.2505122550483847, - "learning_rate": 5.191693290734825e-06, - "loss": 0.8634, + "epoch": 0.4677380666826577, + "grad_norm": 1.5394168018437981, + "learning_rate": 7.924525601463173e-06, + "loss": 0.8427, "step": 325 }, { - "epoch": 0.158311345646438, - "grad_norm": 4.168273711265947, - "learning_rate": 5.2715654952076674e-06, - "loss": 0.8512, + "epoch": 0.47493403693931396, + "grad_norm": 1.559312757501083, + "learning_rate": 7.91790178998362e-06, + "loss": 0.8202, "step": 330 }, { - "epoch": 0.16071000239865676, - "grad_norm": 3.8748834001471324, - "learning_rate": 5.351437699680512e-06, - "loss": 0.8893, + "epoch": 0.48213000719597027, + "grad_norm": 1.5792217882137543, + "learning_rate": 7.91100241912391e-06, + "loss": 0.8419, "step": 335 }, { - "epoch": 0.16310865915087552, - "grad_norm": 3.0369612897329143, - "learning_rate": 5.431309904153355e-06, - "loss": 0.8146, + "epoch": 0.4893259774526265, + "grad_norm": 1.738156211889958, + "learning_rate": 7.9038279741403e-06, + "loss": 0.8659, "step": 340 }, { - "epoch": 0.16550731590309425, - "grad_norm": 3.0242707239755164, - "learning_rate": 5.5111821086261985e-06, - "loss": 0.8655, + "epoch": 0.4965219477092828, + "grad_norm": 1.7060742398951603, + "learning_rate": 7.896378959635946e-06, + "loss": 0.8564, "step": 345 }, { - "epoch": 0.167905972655313, - "grad_norm": 2.807112546364469, - "learning_rate": 5.591054313099042e-06, - "loss": 0.8343, + "epoch": 0.5037179179659391, + "grad_norm": 1.483857852582522, + "learning_rate": 7.888655899525413e-06, + "loss": 0.8122, "step": 350 }, { - "epoch": 0.17030462940753177, - "grad_norm": 3.0445759143839974, - "learning_rate": 5.670926517571885e-06, - "loss": 0.8672, + "epoch": 0.5109138882225953, + "grad_norm": 1.468389601692516, + "learning_rate": 7.880659336997833e-06, + "loss": 0.887, "step": 355 }, { - "epoch": 0.17270328615975053, - "grad_norm": 2.533966215351572, - "learning_rate": 5.7507987220447296e-06, - "loss": 0.9204, + "epoch": 0.5181098584792516, + "grad_norm": 1.5870295716868312, + "learning_rate": 7.872389834478688e-06, + "loss": 0.8813, "step": 360 }, { - "epoch": 0.1751019429119693, - "grad_norm": 2.637089076537768, - "learning_rate": 5.830670926517572e-06, - "loss": 0.9182, + "epoch": 0.5253058287359079, + "grad_norm": 1.8895020621596357, + "learning_rate": 7.863847973590265e-06, + "loss": 0.8626, "step": 365 }, { - "epoch": 0.17750059966418805, - "grad_norm": 2.9535137171261545, - "learning_rate": 5.910543130990416e-06, - "loss": 0.9401, + "epoch": 0.5325017989925641, + "grad_norm": 1.550855518803781, + "learning_rate": 7.855034355110736e-06, + "loss": 0.8546, "step": 370 }, { - "epoch": 0.1798992564164068, - "grad_norm": 3.215701545330308, - "learning_rate": 5.990415335463259e-06, - "loss": 0.8791, + "epoch": 0.5396977692492204, + "grad_norm": 1.5611172491745864, + "learning_rate": 7.845949598931918e-06, + "loss": 0.848, "step": 375 }, { - "epoch": 0.18229791316862556, - "grad_norm": 2.7011931711589443, - "learning_rate": 6.070287539936103e-06, - "loss": 0.9199, + "epoch": 0.5468937395058767, + "grad_norm": 1.5611908671622932, + "learning_rate": 7.836594344015661e-06, + "loss": 0.8738, "step": 380 }, { - "epoch": 0.18469656992084432, - "grad_norm": 2.707790719957795, - "learning_rate": 6.150159744408946e-06, - "loss": 0.8737, + "epoch": 0.554089709762533, + "grad_norm": 1.5092812979555394, + "learning_rate": 7.826969248348915e-06, + "loss": 0.8693, "step": 385 }, { - "epoch": 0.18709522667306308, - "grad_norm": 3.597189776277077, - "learning_rate": 6.230031948881789e-06, - "loss": 0.9733, + "epoch": 0.5612856800191892, + "grad_norm": 2.804211699082011, + "learning_rate": 7.817074988897446e-06, + "loss": 0.8373, "step": 390 }, { - "epoch": 0.18949388342528184, - "grad_norm": 2.9193000070767354, - "learning_rate": 6.309904153354634e-06, - "loss": 0.8133, + "epoch": 0.5684816502758455, + "grad_norm": 1.6322846593476272, + "learning_rate": 7.806912261558232e-06, + "loss": 0.8179, "step": 395 }, { - "epoch": 0.1918925401775006, - "grad_norm": 2.7897280116321106, - "learning_rate": 6.3897763578274765e-06, - "loss": 0.7918, - "step": 400 - }, - { - "epoch": 0.1918925401775006, - "eval_loss": 0.8788505792617798, - "eval_runtime": 738.99, - "eval_samples_per_second": 10.027, - "eval_steps_per_second": 0.628, + "epoch": 0.5756776205325018, + "grad_norm": 1.537211750195004, + "learning_rate": 7.796481781110504e-06, + "loss": 0.8881, "step": 400 }, { - "epoch": 0.19429119692971936, - "grad_norm": 2.904956056545914, - "learning_rate": 6.46964856230032e-06, - "loss": 0.8769, + "epoch": 0.5828735907891581, + "grad_norm": 1.6424496003416038, + "learning_rate": 7.785784281165491e-06, + "loss": 0.8285, "step": 405 }, { - "epoch": 0.19668985368193811, - "grad_norm": 2.694702495778036, - "learning_rate": 6.549520766773163e-06, - "loss": 0.8796, + "epoch": 0.5900695610458143, + "grad_norm": 1.712983639226015, + "learning_rate": 7.774820514114804e-06, + "loss": 0.8471, "step": 410 }, { - "epoch": 0.19908851043415687, - "grad_norm": 2.958505594137407, - "learning_rate": 6.629392971246007e-06, - "loss": 0.7932, + "epoch": 0.5972655313024706, + "grad_norm": 1.6543355090148368, + "learning_rate": 7.763591251077532e-06, + "loss": 0.8181, "step": 415 }, { - "epoch": 0.20148716718637563, - "grad_norm": 2.954096827602939, - "learning_rate": 6.709265175718851e-06, - "loss": 0.9052, + "epoch": 0.6044615015591269, + "grad_norm": 1.7695569793499315, + "learning_rate": 7.752097281845998e-06, + "loss": 0.8317, "step": 420 }, { - "epoch": 0.2038858239385944, - "grad_norm": 4.110594691742628, - "learning_rate": 6.789137380191694e-06, - "loss": 0.7819, + "epoch": 0.6116574718157831, + "grad_norm": 1.6408594582199851, + "learning_rate": 7.740339414830216e-06, + "loss": 0.8822, "step": 425 }, { - "epoch": 0.20628448069081315, - "grad_norm": 2.8603447719001673, - "learning_rate": 6.869009584664538e-06, - "loss": 0.8848, + "epoch": 0.6188534420724394, + "grad_norm": 1.532763883790403, + "learning_rate": 7.72831847700103e-06, + "loss": 0.8858, "step": 430 }, { - "epoch": 0.2086831374430319, - "grad_norm": 3.677542996907206, - "learning_rate": 6.948881789137381e-06, - "loss": 0.8922, + "epoch": 0.6260494123290957, + "grad_norm": 1.5699531832045912, + "learning_rate": 7.71603531383195e-06, + "loss": 0.803, "step": 435 }, { - "epoch": 0.21108179419525067, - "grad_norm": 2.3643998409365006, - "learning_rate": 7.028753993610224e-06, - "loss": 0.8853, + "epoch": 0.633245382585752, + "grad_norm": 1.4023688398705096, + "learning_rate": 7.703490789239685e-06, + "loss": 0.8015, "step": 440 }, { - "epoch": 0.21348045094746942, - "grad_norm": 2.811619082486357, - "learning_rate": 7.108626198083067e-06, - "loss": 0.8435, + "epoch": 0.6404413528424082, + "grad_norm": 1.743812894669404, + "learning_rate": 7.690685785523388e-06, + "loss": 0.8398, "step": 445 }, { - "epoch": 0.21587910769968818, - "grad_norm": 2.7629535174547497, - "learning_rate": 7.188498402555911e-06, - "loss": 0.9014, + "epoch": 0.6476373230990645, + "grad_norm": 1.5621879493455977, + "learning_rate": 7.677621203302591e-06, + "loss": 0.7979, "step": 450 }, { - "epoch": 0.21827776445190694, - "grad_norm": 2.779613231443427, - "learning_rate": 7.268370607028754e-06, - "loss": 0.8528, + "epoch": 0.6548332933557208, + "grad_norm": 1.5193447362876678, + "learning_rate": 7.66429796145387e-06, + "loss": 0.8125, "step": 455 }, { - "epoch": 0.2206764212041257, - "grad_norm": 2.4729051800667845, - "learning_rate": 7.348242811501598e-06, - "loss": 0.9453, + "epoch": 0.662029263612377, + "grad_norm": 1.6795960658239877, + "learning_rate": 7.650716997046216e-06, + "loss": 0.8477, "step": 460 }, { - "epoch": 0.22307507795634446, - "grad_norm": 3.4684655010762393, - "learning_rate": 7.428115015974442e-06, - "loss": 0.9096, + "epoch": 0.6692252338690333, + "grad_norm": 1.585934440158675, + "learning_rate": 7.636879265275119e-06, + "loss": 0.845, "step": 465 }, { - "epoch": 0.22547373470856322, - "grad_norm": 3.9774609808031403, - "learning_rate": 7.507987220447285e-06, - "loss": 0.866, + "epoch": 0.6764212041256896, + "grad_norm": 1.550239333699456, + "learning_rate": 7.622785739395397e-06, + "loss": 0.8723, "step": 470 }, { - "epoch": 0.22787239146078198, - "grad_norm": 4.462097218795264, - "learning_rate": 7.5878594249201285e-06, - "loss": 0.9069, + "epoch": 0.6836171743823459, + "grad_norm": 1.6342679772831266, + "learning_rate": 7.608437410652739e-06, + "loss": 0.8237, "step": 475 }, { - "epoch": 0.2302710482130007, - "grad_norm": 2.687922202397471, - "learning_rate": 7.667731629392972e-06, - "loss": 0.8583, + "epoch": 0.6908131446390021, + "grad_norm": 1.9009580720490618, + "learning_rate": 7.593835288213984e-06, + "loss": 0.8525, "step": 480 }, { - "epoch": 0.23266970496521946, - "grad_norm": 3.03353543529184, - "learning_rate": 7.747603833865815e-06, - "loss": 0.8999, + "epoch": 0.6980091148956584, + "grad_norm": 1.4797149380660173, + "learning_rate": 7.578980399096153e-06, + "loss": 0.8343, "step": 485 }, { - "epoch": 0.23506836171743822, - "grad_norm": 3.645290023306533, - "learning_rate": 7.827476038338658e-06, - "loss": 0.9374, + "epoch": 0.7052050851523147, + "grad_norm": 1.6927806221547512, + "learning_rate": 7.5638737880942e-06, + "loss": 0.819, "step": 490 }, { - "epoch": 0.23746701846965698, - "grad_norm": 3.5934722365995095, - "learning_rate": 7.907348242811502e-06, - "loss": 0.8538, + "epoch": 0.712401055408971, + "grad_norm": 1.6132959543020267, + "learning_rate": 7.548516517707544e-06, + "loss": 0.8177, "step": 495 }, { - "epoch": 0.23986567522187574, - "grad_norm": 4.244150178459668, - "learning_rate": 7.987220447284347e-06, - "loss": 0.8661, - "step": 500 - }, - { - "epoch": 0.23986567522187574, - "eval_loss": 0.8783833980560303, - "eval_runtime": 739.2193, - "eval_samples_per_second": 10.024, - "eval_steps_per_second": 0.628, + "epoch": 0.7195970256656272, + "grad_norm": 1.5783090956116979, + "learning_rate": 7.532909668065329e-06, + "loss": 0.8217, "step": 500 }, { - "epoch": 0.2422643319740945, - "grad_norm": 3.363197601795703, - "learning_rate": 8.06709265175719e-06, - "loss": 0.7723, + "epoch": 0.7267929959222835, + "grad_norm": 1.439090732947757, + "learning_rate": 7.517054336850457e-06, + "loss": 0.8617, "step": 505 }, { - "epoch": 0.24466298872631326, - "grad_norm": 3.687217931851492, - "learning_rate": 8.146964856230033e-06, - "loss": 0.7713, + "epoch": 0.7339889661789398, + "grad_norm": 1.906650403594057, + "learning_rate": 7.500951639222389e-06, + "loss": 0.8427, "step": 510 }, { - "epoch": 0.24706164547853202, - "grad_norm": 3.91943742276457, - "learning_rate": 8.226837060702875e-06, - "loss": 0.9931, + "epoch": 0.741184936435596, + "grad_norm": 1.5718009905007941, + "learning_rate": 7.484602707738707e-06, + "loss": 0.9079, "step": 515 }, { - "epoch": 0.24946030223075077, - "grad_norm": 3.242784908627176, - "learning_rate": 8.30670926517572e-06, - "loss": 0.9052, + "epoch": 0.7483809066922523, + "grad_norm": 1.716197310859666, + "learning_rate": 7.468008692275457e-06, + "loss": 0.8278, "step": 520 }, { - "epoch": 0.25185895898296956, - "grad_norm": 5.452872481341522, - "learning_rate": 8.386581469648563e-06, - "loss": 0.9227, + "epoch": 0.7555768769489086, + "grad_norm": 1.5485542395076963, + "learning_rate": 7.45117075994628e-06, + "loss": 0.8326, "step": 525 }, { - "epoch": 0.2542576157351883, - "grad_norm": 2.801600778934709, - "learning_rate": 8.466453674121406e-06, - "loss": 0.9145, + "epoch": 0.7627728472055649, + "grad_norm": 1.4519889463898326, + "learning_rate": 7.434090095020318e-06, + "loss": 0.7923, "step": 530 }, { - "epoch": 0.2566562724874071, - "grad_norm": 2.80399708145586, - "learning_rate": 8.54632587859425e-06, - "loss": 0.8459, + "epoch": 0.7699688174622211, + "grad_norm": 1.4499255632268135, + "learning_rate": 7.416767898838926e-06, + "loss": 0.8449, "step": 535 }, { - "epoch": 0.2590549292396258, - "grad_norm": 3.55867842461135, - "learning_rate": 8.626198083067093e-06, - "loss": 0.9457, + "epoch": 0.7771647877188774, + "grad_norm": 1.498217506498266, + "learning_rate": 7.399205389731172e-06, + "loss": 0.8462, "step": 540 }, { - "epoch": 0.26145358599184454, - "grad_norm": 3.150113704402208, - "learning_rate": 8.706070287539938e-06, - "loss": 0.835, + "epoch": 0.7843607579755337, + "grad_norm": 1.441416826072204, + "learning_rate": 7.381403802928153e-06, + "loss": 0.7864, "step": 545 }, { - "epoch": 0.2638522427440633, - "grad_norm": 2.91320676896268, - "learning_rate": 8.78594249201278e-06, - "loss": 0.8667, + "epoch": 0.7915567282321899, + "grad_norm": 1.5728216952970062, + "learning_rate": 7.363364390476114e-06, + "loss": 0.779, "step": 550 }, { - "epoch": 0.26625089949628206, - "grad_norm": 3.2187525835358626, - "learning_rate": 8.865814696485623e-06, - "loss": 0.8478, + "epoch": 0.7987526984888462, + "grad_norm": 1.579757123001059, + "learning_rate": 7.34508842114839e-06, + "loss": 0.8342, "step": 555 }, { - "epoch": 0.26864955624850084, - "grad_norm": 3.3992782102360173, - "learning_rate": 8.945686900958466e-06, - "loss": 0.8523, + "epoch": 0.8059486687455025, + "grad_norm": 1.5204038228338386, + "learning_rate": 7.326577180356162e-06, + "loss": 0.8202, "step": 560 }, { - "epoch": 0.2710482130007196, - "grad_norm": 3.107174104616647, - "learning_rate": 9.02555910543131e-06, - "loss": 0.9199, + "epoch": 0.8131446390021588, + "grad_norm": 1.6070382704422732, + "learning_rate": 7.30783197005806e-06, + "loss": 0.7948, "step": 565 }, { - "epoch": 0.27344686975293836, - "grad_norm": 3.600873597055109, - "learning_rate": 9.105431309904154e-06, - "loss": 0.7872, + "epoch": 0.820340609258815, + "grad_norm": 1.4952361279508235, + "learning_rate": 7.288854108668586e-06, + "loss": 0.8451, "step": 570 }, { - "epoch": 0.2758455265051571, - "grad_norm": 2.9647080063438915, - "learning_rate": 9.185303514376998e-06, - "loss": 0.8401, + "epoch": 0.8275365795154713, + "grad_norm": 1.4373323975649135, + "learning_rate": 7.2696449309653795e-06, + "loss": 0.8381, "step": 575 }, { - "epoch": 0.2782441832573759, - "grad_norm": 2.7990639275693945, - "learning_rate": 9.265175718849841e-06, - "loss": 0.8267, + "epoch": 0.8347325497721276, + "grad_norm": 1.292833703910724, + "learning_rate": 7.250205787995353e-06, + "loss": 0.8286, "step": 580 }, { - "epoch": 0.2806428400095946, - "grad_norm": 3.3315727902538157, - "learning_rate": 9.345047923322684e-06, - "loss": 0.8867, + "epoch": 0.8419285200287839, + "grad_norm": 1.2885838799591653, + "learning_rate": 7.230538046979654e-06, + "loss": 0.8506, "step": 585 }, { - "epoch": 0.2830414967618134, - "grad_norm": 6.339874565170752, - "learning_rate": 9.424920127795528e-06, - "loss": 0.9029, + "epoch": 0.8491244902854401, + "grad_norm": 1.4134558176619048, + "learning_rate": 7.210643091217513e-06, + "loss": 0.8411, "step": 590 }, { - "epoch": 0.2854401535140321, - "grad_norm": 4.308904102490444, - "learning_rate": 9.504792332268371e-06, - "loss": 0.9859, + "epoch": 0.8563204605420964, + "grad_norm": 1.6657530487665673, + "learning_rate": 7.1905223199889425e-06, + "loss": 0.834, "step": 595 }, { - "epoch": 0.2878388102662509, - "grad_norm": 3.452224572712429, - "learning_rate": 9.584664536741216e-06, - "loss": 0.9071, - "step": 600 - }, - { - "epoch": 0.2878388102662509, - "eval_loss": 0.899307906627655, - "eval_runtime": 739.1395, - "eval_samples_per_second": 10.025, - "eval_steps_per_second": 0.628, + "epoch": 0.8635164307987527, + "grad_norm": 1.7801521918282026, + "learning_rate": 7.170177148456331e-06, + "loss": 0.8461, "step": 600 }, { - "epoch": 0.29023746701846964, - "grad_norm": 3.178382258183808, - "learning_rate": 9.664536741214059e-06, - "loss": 0.8549, + "epoch": 0.8707124010554089, + "grad_norm": 1.5712149389587902, + "learning_rate": 7.149609007564903e-06, + "loss": 0.8683, "step": 605 }, { - "epoch": 0.2926361237706884, - "grad_norm": 3.8066863018491994, - "learning_rate": 9.744408945686901e-06, - "loss": 0.8329, + "epoch": 0.8779083713120652, + "grad_norm": 1.5017107860836532, + "learning_rate": 7.128819343942077e-06, + "loss": 0.8442, "step": 610 }, { - "epoch": 0.29503478052290716, - "grad_norm": 4.217226984051162, - "learning_rate": 9.824281150159746e-06, - "loss": 0.884, + "epoch": 0.8851043415687215, + "grad_norm": 1.4727249376876361, + "learning_rate": 7.107809619795722e-06, + "loss": 0.8668, "step": 615 }, { - "epoch": 0.29743343727512594, - "grad_norm": 4.082578713375918, - "learning_rate": 9.904153354632589e-06, - "loss": 0.8972, + "epoch": 0.8923003118253778, + "grad_norm": 1.428036074094576, + "learning_rate": 7.086581312811309e-06, + "loss": 0.773, "step": 620 }, { - "epoch": 0.2998320940273447, - "grad_norm": 3.601173386435701, - "learning_rate": 9.984025559105433e-06, - "loss": 0.9211, + "epoch": 0.899496282082034, + "grad_norm": 1.459125555270072, + "learning_rate": 7.065135916047992e-06, + "loss": 0.8551, "step": 625 }, { - "epoch": 0.30223075077956346, - "grad_norm": 4.327224463154232, - "learning_rate": 9.999987527310802e-06, - "loss": 0.9326, + "epoch": 0.9066922523386903, + "grad_norm": 1.4437358399730527, + "learning_rate": 7.043474937833581e-06, + "loss": 0.8055, "step": 630 }, { - "epoch": 0.3046294075317822, - "grad_norm": 3.192949701933106, - "learning_rate": 9.99993685711758e-06, - "loss": 0.8407, + "epoch": 0.9138882225953466, + "grad_norm": 1.6487138533805716, + "learning_rate": 7.021599901658467e-06, + "loss": 0.8162, "step": 635 }, { - "epoch": 0.307028064284001, - "grad_norm": 3.7659658769393665, - "learning_rate": 9.999847210271954e-06, - "loss": 0.834, + "epoch": 0.9210841928520028, + "grad_norm": 1.5886402538030315, + "learning_rate": 6.999512346068467e-06, + "loss": 0.8472, "step": 640 }, { - "epoch": 0.3094267210362197, - "grad_norm": 2.4979512705164657, - "learning_rate": 9.999718587472758e-06, - "loss": 0.848, + "epoch": 0.9282801631086591, + "grad_norm": 1.580701659838036, + "learning_rate": 6.977213824556613e-06, + "loss": 0.8185, "step": 645 }, { - "epoch": 0.3118253777884385, - "grad_norm": 5.339110116490045, - "learning_rate": 9.999550989722662e-06, - "loss": 0.9671, + "epoch": 0.9354761333653154, + "grad_norm": 1.3791827207697653, + "learning_rate": 6.95470590545389e-06, + "loss": 0.8424, "step": 650 }, { - "epoch": 0.3142240345406572, - "grad_norm": 2.866052234022034, - "learning_rate": 9.999344418328161e-06, - "loss": 0.8325, + "epoch": 0.9426721036219717, + "grad_norm": 1.5248977943916064, + "learning_rate": 6.931990171818923e-06, + "loss": 0.8829, "step": 655 }, { - "epoch": 0.316622691292876, - "grad_norm": 3.050315691952451, - "learning_rate": 9.99909887489957e-06, - "loss": 0.8506, + "epoch": 0.9498680738786279, + "grad_norm": 1.4962085290443874, + "learning_rate": 6.909068221326647e-06, + "loss": 0.8236, "step": 660 }, { - "epoch": 0.31902134804509474, - "grad_norm": 2.906833616688605, - "learning_rate": 9.998814361351006e-06, - "loss": 0.8681, + "epoch": 0.9570640441352842, + "grad_norm": 1.6074178928012908, + "learning_rate": 6.88594166615593e-06, + "loss": 0.8165, "step": 665 }, { - "epoch": 0.32142000479731353, - "grad_norm": 3.108240781409998, - "learning_rate": 9.99849087990037e-06, - "loss": 0.9081, + "epoch": 0.9642600143919405, + "grad_norm": 1.538150422370725, + "learning_rate": 6.8626121328761824e-06, + "loss": 0.8155, "step": 670 }, { - "epoch": 0.32381866154953226, - "grad_norm": 2.772972214094703, - "learning_rate": 9.998128433069345e-06, - "loss": 0.9144, + "epoch": 0.9714559846485968, + "grad_norm": 1.4261928133080215, + "learning_rate": 6.839081262332957e-06, + "loss": 0.8271, "step": 675 }, { - "epoch": 0.32621731830175105, - "grad_norm": 4.96500038402326, - "learning_rate": 9.997727023683353e-06, - "loss": 0.987, + "epoch": 0.978651954905253, + "grad_norm": 1.4811633224353407, + "learning_rate": 6.815350709532544e-06, + "loss": 0.8417, "step": 680 }, { - "epoch": 0.3286159750539698, - "grad_norm": 2.651958496925647, - "learning_rate": 9.997286654871556e-06, - "loss": 0.8424, + "epoch": 0.9858479251619093, + "grad_norm": 1.5008044573312285, + "learning_rate": 6.791422143525564e-06, + "loss": 0.859, "step": 685 }, { - "epoch": 0.3310146318061885, - "grad_norm": 3.4588489379856484, - "learning_rate": 9.996807330066816e-06, - "loss": 0.9033, + "epoch": 0.9930438954185656, + "grad_norm": 1.5977699325206687, + "learning_rate": 6.767297247289585e-06, + "loss": 0.8663, "step": 690 }, { - "epoch": 0.3334132885584073, - "grad_norm": 2.79385907219515, - "learning_rate": 9.996289053005676e-06, - "loss": 0.8301, + "epoch": 1.0002398656752218, + "grad_norm": 2.198059715592546, + "learning_rate": 6.742977717610744e-06, + "loss": 0.8427, "step": 695 }, { - "epoch": 0.335811945310626, - "grad_norm": 3.7163749268253357, - "learning_rate": 9.99573182772833e-06, - "loss": 0.8955, - "step": 700 - }, - { - "epoch": 0.335811945310626, - "eval_loss": 0.8928011655807495, - "eval_runtime": 741.2881, - "eval_samples_per_second": 9.996, - "eval_steps_per_second": 0.626, + "epoch": 1.0074358359318782, + "grad_norm": 1.447486858474852, + "learning_rate": 6.718465264964414e-06, + "loss": 0.5445, "step": 700 }, { - "epoch": 0.3382106020628448, - "grad_norm": 2.5849874442305714, - "learning_rate": 9.995135658578587e-06, - "loss": 0.9328, + "epoch": 1.0146318061885344, + "grad_norm": 1.5387360085110118, + "learning_rate": 6.693761613394899e-06, + "loss": 0.5585, "step": 705 }, { - "epoch": 0.34060925881506354, - "grad_norm": 3.0610660646289705, - "learning_rate": 9.994500550203838e-06, - "loss": 0.9111, + "epoch": 1.0218277764451906, + "grad_norm": 1.3617154485025116, + "learning_rate": 6.668868500394172e-06, + "loss": 0.4605, "step": 710 }, { - "epoch": 0.34300791556728233, - "grad_norm": 3.310088400644419, - "learning_rate": 9.99382650755503e-06, - "loss": 0.8882, + "epoch": 1.029023746701847, + "grad_norm": 1.433780770938119, + "learning_rate": 6.643787676779671e-06, + "loss": 0.5254, "step": 715 }, { - "epoch": 0.34540657231950106, - "grad_norm": 2.989166571674383, - "learning_rate": 9.993113535886612e-06, - "loss": 0.9057, + "epoch": 1.0362197169585032, + "grad_norm": 1.4766138665890287, + "learning_rate": 6.618520906571171e-06, + "loss": 0.476, "step": 720 }, { - "epoch": 0.34780522907171985, - "grad_norm": 3.2839488015561638, - "learning_rate": 9.992361640756504e-06, - "loss": 0.9816, + "epoch": 1.0434156872151594, + "grad_norm": 1.4473624764279305, + "learning_rate": 6.593069966866694e-06, + "loss": 0.5404, "step": 725 }, { - "epoch": 0.3502038858239386, - "grad_norm": 2.952806180266413, - "learning_rate": 9.991570828026052e-06, - "loss": 0.9105, + "epoch": 1.0506116574718158, + "grad_norm": 1.4713255574009756, + "learning_rate": 6.567436647717535e-06, + "loss": 0.5293, "step": 730 }, { - "epoch": 0.35260254257615736, - "grad_norm": 3.0776125312419644, - "learning_rate": 9.99074110385998e-06, - "loss": 0.8911, + "epoch": 1.057807627728472, + "grad_norm": 1.3099551452546936, + "learning_rate": 6.541622752002355e-06, + "loss": 0.5168, "step": 735 }, { - "epoch": 0.3550011993283761, - "grad_norm": 2.8653507727619356, - "learning_rate": 9.989872474726341e-06, - "loss": 0.9403, + "epoch": 1.0650035979851282, + "grad_norm": 1.4756303619125208, + "learning_rate": 6.515630095300383e-06, + "loss": 0.5253, "step": 740 }, { - "epoch": 0.3573998560805949, - "grad_norm": 3.1505016143613047, - "learning_rate": 9.988964947396477e-06, - "loss": 0.8747, + "epoch": 1.0721995682417846, + "grad_norm": 1.2607903619444512, + "learning_rate": 6.489460505763713e-06, + "loss": 0.5203, "step": 745 }, { - "epoch": 0.3597985128328136, - "grad_norm": 2.6125502058810883, - "learning_rate": 9.98801852894495e-06, - "loss": 0.8193, + "epoch": 1.0793955384984408, + "grad_norm": 1.2668860008648557, + "learning_rate": 6.463115823988732e-06, + "loss": 0.5133, "step": 750 }, { - "epoch": 0.3621971695850324, - "grad_norm": 2.7512586189265003, - "learning_rate": 9.987033226749497e-06, - "loss": 0.851, + "epoch": 1.0865915087550972, + "grad_norm": 1.5445317289615172, + "learning_rate": 6.436597902886655e-06, + "loss": 0.5399, "step": 755 }, { - "epoch": 0.3645958263372511, - "grad_norm": 2.56627099994587, - "learning_rate": 9.986009048490971e-06, - "loss": 0.8504, + "epoch": 1.0937874790117534, + "grad_norm": 1.373346882134398, + "learning_rate": 6.409908607553217e-06, + "loss": 0.4742, "step": 760 }, { - "epoch": 0.3669944830894699, - "grad_norm": 3.2634625404493387, - "learning_rate": 9.984946002153284e-06, - "loss": 0.9531, + "epoch": 1.1009834492684096, + "grad_norm": 1.2423912396823111, + "learning_rate": 6.38304981513748e-06, + "loss": 0.4928, "step": 765 }, { - "epoch": 0.36939313984168864, - "grad_norm": 2.897125687255057, - "learning_rate": 9.983844096023334e-06, - "loss": 0.8097, + "epoch": 1.108179419525066, + "grad_norm": 1.564288911463839, + "learning_rate": 6.3560234147098155e-06, + "loss": 0.509, "step": 770 }, { - "epoch": 0.37179179659390743, - "grad_norm": 2.7944503893231474, - "learning_rate": 9.982703338690955e-06, - "loss": 0.9376, + "epoch": 1.1153753897817222, + "grad_norm": 1.592002787628597, + "learning_rate": 6.328831307129039e-06, + "loss": 0.5373, "step": 775 }, { - "epoch": 0.37419045334612616, - "grad_norm": 3.0897623363555455, - "learning_rate": 9.981523739048839e-06, - "loss": 0.8607, + "epoch": 1.1225713600383784, + "grad_norm": 1.2953958374191832, + "learning_rate": 6.30147540490871e-06, + "loss": 0.5053, "step": 780 }, { - "epoch": 0.37658911009834495, - "grad_norm": 2.825952708220128, - "learning_rate": 9.98030530629247e-06, - "loss": 0.9208, + "epoch": 1.1297673302950348, + "grad_norm": 1.3851707132475586, + "learning_rate": 6.27395763208263e-06, + "loss": 0.5138, "step": 785 }, { - "epoch": 0.3789877668505637, - "grad_norm": 3.21377979039193, - "learning_rate": 9.979048049920056e-06, - "loss": 0.9482, + "epoch": 1.136963300551691, + "grad_norm": 1.5083938363704992, + "learning_rate": 6.246279924069504e-06, + "loss": 0.4639, "step": 790 }, { - "epoch": 0.38138642360278247, - "grad_norm": 3.4941808752016015, - "learning_rate": 9.977751979732448e-06, - "loss": 0.8321, + "epoch": 1.1441592708083472, + "grad_norm": 1.4583984595795128, + "learning_rate": 6.218444227536832e-06, + "loss": 0.509, "step": 795 }, { - "epoch": 0.3837850803550012, - "grad_norm": 2.819630176729509, - "learning_rate": 9.97641710583307e-06, - "loss": 0.89, - "step": 800 - }, - { - "epoch": 0.3837850803550012, - "eval_loss": 0.8970605731010437, - "eval_runtime": 739.8413, - "eval_samples_per_second": 10.016, - "eval_steps_per_second": 0.627, + "epoch": 1.1513552410650036, + "grad_norm": 1.1915713777559744, + "learning_rate": 6.190452500263975e-06, + "loss": 0.4771, "step": 800 }, { - "epoch": 0.38618373710722, - "grad_norm": 2.6833336278666264, - "learning_rate": 9.975043438627836e-06, - "loss": 0.8416, + "epoch": 1.1585512113216598, + "grad_norm": 1.2538864728177044, + "learning_rate": 6.162306711004474e-06, + "loss": 0.4927, "step": 805 }, { - "epoch": 0.3885823938594387, - "grad_norm": 3.0978237393301735, - "learning_rate": 9.973630988825071e-06, - "loss": 0.9354, + "epoch": 1.165747181578316, + "grad_norm": 1.3142042016311857, + "learning_rate": 6.134008839347575e-06, + "loss": 0.4884, "step": 810 }, { - "epoch": 0.3909810506116575, - "grad_norm": 4.2752235499170705, - "learning_rate": 9.972179767435428e-06, - "loss": 0.9026, + "epoch": 1.1729431518349724, + "grad_norm": 1.232777503769632, + "learning_rate": 6.105560875578994e-06, + "loss": 0.5273, "step": 815 }, { - "epoch": 0.39337970736387623, - "grad_norm": 2.430423958859156, - "learning_rate": 9.970689785771798e-06, - "loss": 0.8654, + "epoch": 1.1801391220916286, + "grad_norm": 1.502479848796588, + "learning_rate": 6.076964820540937e-06, + "loss": 0.5086, "step": 820 }, { - "epoch": 0.39577836411609496, - "grad_norm": 2.9029155463869527, - "learning_rate": 9.96916105544923e-06, - "loss": 0.8403, + "epoch": 1.187335092348285, + "grad_norm": 1.5121283948236117, + "learning_rate": 6.048222685491374e-06, + "loss": 0.5374, "step": 825 }, { - "epoch": 0.39817702086831375, - "grad_norm": 2.7152107809882065, - "learning_rate": 9.967593588384832e-06, - "loss": 0.7184, + "epoch": 1.1945310626049412, + "grad_norm": 1.8544252686881426, + "learning_rate": 6.019336491962581e-06, + "loss": 0.5381, "step": 830 }, { - "epoch": 0.4005756776205325, - "grad_norm": 2.725981699021402, - "learning_rate": 9.965987396797678e-06, - "loss": 0.9184, + "epoch": 1.2017270328615974, + "grad_norm": 1.3559009500960952, + "learning_rate": 5.990308271618956e-06, + "loss": 0.4939, "step": 835 }, { - "epoch": 0.40297433437275126, - "grad_norm": 3.1490594445803626, - "learning_rate": 9.964342493208727e-06, - "loss": 0.9349, + "epoch": 1.2089230031182538, + "grad_norm": 1.6991974082062777, + "learning_rate": 5.961140066114128e-06, + "loss": 0.5429, "step": 840 }, { - "epoch": 0.40537299112497, - "grad_norm": 2.4002360503838847, - "learning_rate": 9.962658890440703e-06, - "loss": 0.8469, + "epoch": 1.21611897337491, + "grad_norm": 1.40613574888153, + "learning_rate": 5.931833926947358e-06, + "loss": 0.4778, "step": 845 }, { - "epoch": 0.4077716478771888, - "grad_norm": 3.3266215600879994, - "learning_rate": 9.960936601618014e-06, - "loss": 1.0367, + "epoch": 1.2233149436315662, + "grad_norm": 1.291827429944612, + "learning_rate": 5.902391915319252e-06, + "loss": 0.4604, "step": 850 }, { - "epoch": 0.4101703046294075, - "grad_norm": 2.965406316734529, - "learning_rate": 9.959175640166639e-06, - "loss": 0.8879, + "epoch": 1.2305109138882226, + "grad_norm": 1.5403421830718962, + "learning_rate": 5.872816101986789e-06, + "loss": 0.4993, "step": 855 }, { - "epoch": 0.4125689613816263, - "grad_norm": 2.5176873752286757, - "learning_rate": 9.957376019814028e-06, - "loss": 0.7977, + "epoch": 1.2377068841448788, + "grad_norm": 1.295984623620993, + "learning_rate": 5.843108567117678e-06, + "loss": 0.4972, "step": 860 }, { - "epoch": 0.41496761813384503, - "grad_norm": 3.221299896812408, - "learning_rate": 9.955537754588996e-06, - "loss": 0.8624, + "epoch": 1.244902854401535, + "grad_norm": 1.372473134492806, + "learning_rate": 5.813271400144051e-06, + "loss": 0.5199, "step": 865 }, { - "epoch": 0.4173662748860638, - "grad_norm": 3.4035598410054995, - "learning_rate": 9.953660858821604e-06, - "loss": 0.842, + "epoch": 1.2520988246581914, + "grad_norm": 1.3747684408273264, + "learning_rate": 5.783306699615512e-06, + "loss": 0.5136, "step": 870 }, { - "epoch": 0.41976493163828255, - "grad_norm": 2.759621830247372, - "learning_rate": 9.951745347143065e-06, - "loss": 0.831, + "epoch": 1.2592947949148476, + "grad_norm": 1.3792887268238399, + "learning_rate": 5.753216573051526e-06, + "loss": 0.5045, "step": 875 }, { - "epoch": 0.42216358839050133, - "grad_norm": 3.071767749542282, - "learning_rate": 9.94979123448561e-06, - "loss": 0.8394, + "epoch": 1.266490765171504, + "grad_norm": 1.6127625261536036, + "learning_rate": 5.723003136793208e-06, + "loss": 0.5003, "step": 880 }, { - "epoch": 0.42456224514272006, - "grad_norm": 2.728604857418505, - "learning_rate": 9.947798536082386e-06, - "loss": 0.9297, + "epoch": 1.2736867354281602, + "grad_norm": 1.4440630275399904, + "learning_rate": 5.692668515854457e-06, + "loss": 0.4521, "step": 885 }, { - "epoch": 0.42696090189493885, - "grad_norm": 3.2357927761859564, - "learning_rate": 9.945767267467335e-06, - "loss": 0.9248, + "epoch": 1.2808827056848164, + "grad_norm": 1.5683931030948375, + "learning_rate": 5.662214843772506e-06, + "loss": 0.5435, "step": 890 }, { - "epoch": 0.4293595586471576, - "grad_norm": 3.0566660546759326, - "learning_rate": 9.943697444475062e-06, - "loss": 0.9137, + "epoch": 1.2880786759414729, + "grad_norm": 1.4069551760135997, + "learning_rate": 5.631644262457861e-06, + "loss": 0.5326, "step": 895 }, { - "epoch": 0.43175821539937637, - "grad_norm": 3.281447736907799, - "learning_rate": 9.94158908324073e-06, - "loss": 0.8446, - "step": 900 - }, - { - "epoch": 0.43175821539937637, - "eval_loss": 0.891981303691864, - "eval_runtime": 739.0277, - "eval_samples_per_second": 10.027, - "eval_steps_per_second": 0.628, + "epoch": 1.295274646198129, + "grad_norm": 1.3933443999205188, + "learning_rate": 5.600958922043651e-06, + "loss": 0.4905, "step": 900 }, { - "epoch": 0.4341568721515951, - "grad_norm": 2.669716984027581, - "learning_rate": 9.939442200199917e-06, - "loss": 0.8603, + "epoch": 1.3024706164547855, + "grad_norm": 2.0908428594407344, + "learning_rate": 5.570160980734405e-06, + "loss": 0.4444, "step": 905 }, { - "epoch": 0.4365555289038139, - "grad_norm": 2.6752353630977854, - "learning_rate": 9.937256812088496e-06, - "loss": 0.8846, + "epoch": 1.3096665867114416, + "grad_norm": 1.6732069684734259, + "learning_rate": 5.539252604654256e-06, + "loss": 0.5535, "step": 910 }, { - "epoch": 0.4389541856560326, - "grad_norm": 4.071925103373792, - "learning_rate": 9.935032935942506e-06, - "loss": 0.9126, + "epoch": 1.3168625569680978, + "grad_norm": 1.4666496678005971, + "learning_rate": 5.50823596769459e-06, + "loss": 0.4977, "step": 915 }, { - "epoch": 0.4413528424082514, - "grad_norm": 2.9599856546225274, - "learning_rate": 9.932770589098015e-06, - "loss": 0.9073, + "epoch": 1.324058527224754, + "grad_norm": 1.3633392168469545, + "learning_rate": 5.477113251361149e-06, + "loss": 0.5118, "step": 920 }, { - "epoch": 0.44375149916047013, - "grad_norm": 2.4671508950529186, - "learning_rate": 9.930469789190988e-06, - "loss": 0.8806, + "epoch": 1.3312544974814104, + "grad_norm": 1.2762316788464472, + "learning_rate": 5.445886644620601e-06, + "loss": 0.5136, "step": 925 }, { - "epoch": 0.4461501559126889, - "grad_norm": 6.163188231431187, - "learning_rate": 9.928130554157144e-06, - "loss": 0.8668, + "epoch": 1.3384504677380666, + "grad_norm": 1.5424740001396617, + "learning_rate": 5.414558343746579e-06, + "loss": 0.4926, "step": 930 }, { - "epoch": 0.44854881266490765, - "grad_norm": 4.3889573037879535, - "learning_rate": 9.925752902231829e-06, - "loss": 0.9947, + "epoch": 1.345646437994723, + "grad_norm": 1.3847453947317292, + "learning_rate": 5.38313055216521e-06, + "loss": 0.5458, "step": 935 }, { - "epoch": 0.45094746941712643, - "grad_norm": 3.099558653114257, - "learning_rate": 9.923336851949855e-06, - "loss": 0.8214, + "epoch": 1.3528424082513792, + "grad_norm": 1.3877715039925074, + "learning_rate": 5.351605480300143e-06, + "loss": 0.4637, "step": 940 }, { - "epoch": 0.45334612616934516, - "grad_norm": 3.960300474080028, - "learning_rate": 9.920882422145372e-06, - "loss": 0.9271, + "epoch": 1.3600383785080354, + "grad_norm": 1.3969957173831602, + "learning_rate": 5.319985345417079e-06, + "loss": 0.4787, "step": 945 }, { - "epoch": 0.45574478292156395, - "grad_norm": 4.305333975019212, - "learning_rate": 9.918389631951712e-06, - "loss": 0.8557, + "epoch": 1.3672343487646919, + "grad_norm": 1.669129051283974, + "learning_rate": 5.288272371467827e-06, + "loss": 0.484, "step": 950 }, { - "epoch": 0.4581434396737827, - "grad_norm": 2.703221339883744, - "learning_rate": 9.915858500801245e-06, - "loss": 0.8476, + "epoch": 1.374430319021348, + "grad_norm": 1.3646255552767974, + "learning_rate": 5.256468788933881e-06, + "loss": 0.4782, "step": 955 }, { - "epoch": 0.4605420964260014, - "grad_norm": 2.0714764562806476, - "learning_rate": 9.913289048425225e-06, - "loss": 0.8258, + "epoch": 1.3816262892780042, + "grad_norm": 1.384029909257828, + "learning_rate": 5.2245768346695494e-06, + "loss": 0.5021, "step": 960 }, { - "epoch": 0.4629407531782202, - "grad_norm": 3.079398404907295, - "learning_rate": 9.910681294853632e-06, - "loss": 0.9569, + "epoch": 1.3888222595346607, + "grad_norm": 1.3642725031029292, + "learning_rate": 5.192598751744621e-06, + "loss": 0.476, "step": 965 }, { - "epoch": 0.46533940993043893, - "grad_norm": 3.1990029825784094, - "learning_rate": 9.908035260415028e-06, - "loss": 0.8552, + "epoch": 1.3960182297913168, + "grad_norm": 1.4324224845783868, + "learning_rate": 5.160536789286612e-06, + "loss": 0.4966, "step": 970 }, { - "epoch": 0.4677380666826577, - "grad_norm": 2.8699251494441738, - "learning_rate": 9.905350965736382e-06, - "loss": 0.8553, + "epoch": 1.403214200047973, + "grad_norm": 1.3426650482840705, + "learning_rate": 5.128393202322565e-06, + "loss": 0.5116, "step": 975 }, { - "epoch": 0.47013672343487645, - "grad_norm": 4.49023764637979, - "learning_rate": 9.902628431742924e-06, - "loss": 0.8762, + "epoch": 1.4104101703046295, + "grad_norm": 1.4860297463165402, + "learning_rate": 5.096170251620458e-06, + "loss": 0.512, "step": 980 }, { - "epoch": 0.47253538018709523, - "grad_norm": 3.0297852374638956, - "learning_rate": 9.899867679657974e-06, - "loss": 0.855, + "epoch": 1.4176061405612856, + "grad_norm": 1.3809734192523142, + "learning_rate": 5.063870203530188e-06, + "loss": 0.5128, "step": 985 }, { - "epoch": 0.47493403693931396, - "grad_norm": 3.4604086734874007, - "learning_rate": 9.89706873100278e-06, - "loss": 0.8564, + "epoch": 1.424802110817942, + "grad_norm": 1.61363323216821, + "learning_rate": 5.031495329824175e-06, + "loss": 0.5342, "step": 990 }, { - "epoch": 0.47733269369153275, - "grad_norm": 2.3167328747243365, - "learning_rate": 9.894231607596348e-06, - "loss": 0.8405, + "epoch": 1.4319980810745983, + "grad_norm": 2.088959286090713, + "learning_rate": 4.999047907537582e-06, + "loss": 0.489, "step": 995 }, { - "epoch": 0.4797313504437515, - "grad_norm": 2.334415977229892, - "learning_rate": 9.891356331555272e-06, - "loss": 0.8908, - "step": 1000 - }, - { - "epoch": 0.4797313504437515, - "eval_loss": 0.898030698299408, - "eval_runtime": 739.1272, - "eval_samples_per_second": 10.025, - "eval_steps_per_second": 0.628, + "epoch": 1.4391940513312544, + "grad_norm": 1.5116312432174273, + "learning_rate": 4.966530218808157e-06, + "loss": 0.4968, "step": 1000 }, { - "epoch": 0.48213000719597027, - "grad_norm": 3.3169042936391526, - "learning_rate": 9.888442925293555e-06, - "loss": 0.9384, + "epoch": 1.4463900215879106, + "grad_norm": 1.4453482163933629, + "learning_rate": 4.933944550715725e-06, + "loss": 0.5297, "step": 1005 }, { - "epoch": 0.484528663948189, - "grad_norm": 2.680090693916463, - "learning_rate": 9.885491411522457e-06, - "loss": 0.9831, + "epoch": 1.453585991844567, + "grad_norm": 1.5874368851505785, + "learning_rate": 4.901293195121338e-06, + "loss": 0.5005, "step": 1010 }, { - "epoch": 0.4869273207004078, - "grad_norm": 2.7070444583755195, - "learning_rate": 9.882501813250292e-06, - "loss": 0.8285, + "epoch": 1.4607819621012232, + "grad_norm": 1.5140612281307557, + "learning_rate": 4.868578448506067e-06, + "loss": 0.5425, "step": 1015 }, { - "epoch": 0.4893259774526265, - "grad_norm": 3.8337165580593204, - "learning_rate": 9.879474153782258e-06, - "loss": 0.9792, + "epoch": 1.4679779323578797, + "grad_norm": 1.5247613079347937, + "learning_rate": 4.835802611809492e-06, + "loss": 0.5246, "step": 1020 }, { - "epoch": 0.4917246342048453, - "grad_norm": 2.3683185044283706, - "learning_rate": 9.876408456720261e-06, - "loss": 0.8057, + "epoch": 1.4751739026145358, + "grad_norm": 1.4922253999621244, + "learning_rate": 4.802967990267867e-06, + "loss": 0.5129, "step": 1025 }, { - "epoch": 0.49412329095706403, - "grad_norm": 3.0270798858712586, - "learning_rate": 9.873304745962725e-06, - "loss": 0.8337, + "epoch": 1.482369872871192, + "grad_norm": 1.5098918292072203, + "learning_rate": 4.770076893251986e-06, + "loss": 0.5239, "step": 1030 }, { - "epoch": 0.4965219477092828, - "grad_norm": 3.6556525120504655, - "learning_rate": 9.870163045704403e-06, - "loss": 0.9186, + "epoch": 1.4895658431278485, + "grad_norm": 1.6715329520651785, + "learning_rate": 4.7371316341047484e-06, + "loss": 0.5659, "step": 1035 }, { - "epoch": 0.49892060446150155, - "grad_norm": 3.5057620622825887, - "learning_rate": 9.866983380436202e-06, - "loss": 0.8611, + "epoch": 1.4967618133845046, + "grad_norm": 1.4935236860112138, + "learning_rate": 4.704134529978471e-06, + "loss": 0.4914, "step": 1040 }, { - "epoch": 0.5013192612137203, - "grad_norm": 2.703061130697398, - "learning_rate": 9.863765774944973e-06, - "loss": 0.791, + "epoch": 1.503957783641161, + "grad_norm": 1.4122572036404084, + "learning_rate": 4.671087901671899e-06, + "loss": 0.4798, "step": 1045 }, { - "epoch": 0.5037179179659391, - "grad_norm": 2.58369559163552, - "learning_rate": 9.860510254313332e-06, - "loss": 0.9499, + "epoch": 1.5111537538978173, + "grad_norm": 1.4681688735493286, + "learning_rate": 4.637994073466981e-06, + "loss": 0.5051, "step": 1050 }, { - "epoch": 0.5061165747181579, - "grad_norm": 2.682160545465096, - "learning_rate": 9.857216843919457e-06, - "loss": 0.942, + "epoch": 1.5183497241544734, + "grad_norm": 1.44942902813713, + "learning_rate": 4.604855372965394e-06, + "loss": 0.539, "step": 1055 }, { - "epoch": 0.5085152314703766, - "grad_norm": 2.780862397297652, - "learning_rate": 9.853885569436896e-06, - "loss": 0.9212, + "epoch": 1.5255456944111296, + "grad_norm": 1.236269924531647, + "learning_rate": 4.5716741309248445e-06, + "loss": 0.5305, "step": 1060 }, { - "epoch": 0.5109138882225953, - "grad_norm": 2.022760314849931, - "learning_rate": 9.850516456834364e-06, - "loss": 0.8786, + "epoch": 1.532741664667786, + "grad_norm": 1.6240033228942266, + "learning_rate": 4.538452681095123e-06, + "loss": 0.5531, "step": 1065 }, { - "epoch": 0.5133125449748142, - "grad_norm": 2.604771721280209, - "learning_rate": 9.847109532375536e-06, - "loss": 0.9393, + "epoch": 1.5399376349244425, + "grad_norm": 1.5622316143038788, + "learning_rate": 4.5051933600539705e-06, + "loss": 0.494, "step": 1070 }, { - "epoch": 0.5157112017270329, - "grad_norm": 3.5331046029013553, - "learning_rate": 9.84366482261885e-06, - "loss": 0.8909, + "epoch": 1.5471336051810987, + "grad_norm": 1.5638109251408436, + "learning_rate": 4.471898507042745e-06, + "loss": 0.533, "step": 1075 }, { - "epoch": 0.5181098584792516, - "grad_norm": 2.640697846099872, - "learning_rate": 9.840182354417298e-06, - "loss": 0.9053, + "epoch": 1.5543295754377549, + "grad_norm": 1.5029077042303538, + "learning_rate": 4.438570463801884e-06, + "loss": 0.513, "step": 1080 }, { - "epoch": 0.5205085152314703, - "grad_norm": 2.5514635987479077, - "learning_rate": 9.836662154918212e-06, - "loss": 0.8935, + "epoch": 1.561525545694411, + "grad_norm": 1.372369046315647, + "learning_rate": 4.405211574406209e-06, + "loss": 0.4698, "step": 1085 }, { - "epoch": 0.5229071719836891, - "grad_norm": 2.7348046479736374, - "learning_rate": 9.833104251563058e-06, - "loss": 0.8916, + "epoch": 1.5687215159510672, + "grad_norm": 1.6322850083649267, + "learning_rate": 4.371824185100054e-06, + "loss": 0.4607, "step": 1090 }, { - "epoch": 0.5253058287359079, - "grad_norm": 2.8303414330543686, - "learning_rate": 9.829508672087216e-06, - "loss": 0.8965, + "epoch": 1.5759174862077237, + "grad_norm": 1.6038884982185644, + "learning_rate": 4.338410644132256e-06, + "loss": 0.4918, "step": 1095 }, { - "epoch": 0.5277044854881267, - "grad_norm": 2.9495795537053553, - "learning_rate": 9.825875444519772e-06, - "loss": 0.8806, - "step": 1100 - }, - { - "epoch": 0.5277044854881267, - "eval_loss": 0.8870354294776917, - "eval_runtime": 739.0135, - "eval_samples_per_second": 10.027, - "eval_steps_per_second": 0.628, + "epoch": 1.58311345646438, + "grad_norm": 1.510163759577701, + "learning_rate": 4.304973301590977e-06, + "loss": 0.5141, "step": 1100 }, { - "epoch": 0.5301031422403454, - "grad_norm": 2.6141874348758583, - "learning_rate": 9.822204597183295e-06, - "loss": 0.8272, + "epoch": 1.5903094267210363, + "grad_norm": 1.3447962919620353, + "learning_rate": 4.271514509238434e-06, + "loss": 0.5719, "step": 1105 }, { - "epoch": 0.5325017989925641, - "grad_norm": 2.2732654732109387, - "learning_rate": 9.818496158693611e-06, - "loss": 0.8833, + "epoch": 1.5975053969776924, + "grad_norm": 1.4095809553505452, + "learning_rate": 4.238036620345477e-06, + "loss": 0.5378, "step": 1110 }, { - "epoch": 0.534900455744783, - "grad_norm": 3.143024300051868, - "learning_rate": 9.814750157959591e-06, - "loss": 0.938, + "epoch": 1.6047013672343486, + "grad_norm": 1.5471600021180223, + "learning_rate": 4.204541989526083e-06, + "loss": 0.5159, "step": 1115 }, { - "epoch": 0.5372991124970017, - "grad_norm": 2.1585701131113737, - "learning_rate": 9.810966624182917e-06, - "loss": 0.7753, + "epoch": 1.611897337491005, + "grad_norm": 1.3407123732434036, + "learning_rate": 4.171032972571744e-06, + "loss": 0.514, "step": 1120 }, { - "epoch": 0.5396977692492204, - "grad_norm": 2.6267326211662914, - "learning_rate": 9.807145586857861e-06, - "loss": 0.8769, + "epoch": 1.6190933077476615, + "grad_norm": 1.5587421912806174, + "learning_rate": 4.137511926285779e-06, + "loss": 0.4943, "step": 1125 }, { - "epoch": 0.5420964260014391, - "grad_norm": 3.036194720896026, - "learning_rate": 9.80328707577104e-06, - "loss": 0.949, + "epoch": 1.6262892780043177, + "grad_norm": 1.5228893274275985, + "learning_rate": 4.103981208317571e-06, + "loss": 0.5161, "step": 1130 }, { - "epoch": 0.544495082753658, - "grad_norm": 2.9408794675718926, - "learning_rate": 9.79939112100121e-06, - "loss": 0.9483, + "epoch": 1.6334852482609739, + "grad_norm": 1.3816307683209126, + "learning_rate": 4.070443176996745e-06, + "loss": 0.5036, "step": 1135 }, { - "epoch": 0.5468937395058767, - "grad_norm": 3.0771625978930754, - "learning_rate": 9.795457752919007e-06, - "loss": 0.899, + "epoch": 1.64068121851763, + "grad_norm": 1.551360790563111, + "learning_rate": 4.036900191167301e-06, + "loss": 0.4973, "step": 1140 }, { - "epoch": 0.5492923962580955, - "grad_norm": 2.5540520239747444, - "learning_rate": 9.791487002186721e-06, - "loss": 1.0063, + "epoch": 1.6478771887742862, + "grad_norm": 1.3412086458854404, + "learning_rate": 4.003354610021701e-06, + "loss": 0.5029, "step": 1145 }, { - "epoch": 0.5516910530103142, - "grad_norm": 3.1679148544487155, - "learning_rate": 9.787478899758059e-06, - "loss": 0.8657, + "epoch": 1.6550731590309427, + "grad_norm": 1.4793955535305545, + "learning_rate": 3.96980879293495e-06, + "loss": 0.4925, "step": 1150 }, { - "epoch": 0.554089709762533, - "grad_norm": 2.976757653301214, - "learning_rate": 9.783433476877898e-06, - "loss": 0.8958, + "epoch": 1.662269129287599, + "grad_norm": 1.2902630164808577, + "learning_rate": 3.9362650992986465e-06, + "loss": 0.4906, "step": 1155 }, { - "epoch": 0.5564883665147518, - "grad_norm": 2.5443050711582513, - "learning_rate": 9.779350765082045e-06, - "loss": 0.857, + "epoch": 1.6694650995442553, + "grad_norm": 1.5152262911206174, + "learning_rate": 3.902725888355037e-06, + "loss": 0.5019, "step": 1160 }, { - "epoch": 0.5588870232669705, - "grad_norm": 2.86585544843531, - "learning_rate": 9.775230796196989e-06, - "loss": 0.9084, + "epoch": 1.6766610698009115, + "grad_norm": 1.577445286461562, + "learning_rate": 3.869193519031086e-06, + "loss": 0.49, "step": 1165 }, { - "epoch": 0.5612856800191892, - "grad_norm": 2.963216428022191, - "learning_rate": 9.771073602339655e-06, - "loss": 0.8169, + "epoch": 1.6838570400575676, + "grad_norm": 1.4532186104713023, + "learning_rate": 3.835670349772566e-06, + "loss": 0.47, "step": 1170 }, { - "epoch": 0.5636843367714081, - "grad_norm": 2.760026417999172, - "learning_rate": 9.766879215917152e-06, - "loss": 0.8466, + "epoch": 1.691053010314224, + "grad_norm": 1.2527595036522912, + "learning_rate": 3.802158738378176e-06, + "loss": 0.4508, "step": 1175 }, { - "epoch": 0.5660829935236268, - "grad_norm": 2.6454017962342125, - "learning_rate": 9.762647669626522e-06, - "loss": 0.8039, + "epoch": 1.6982489805708805, + "grad_norm": 1.4942224309571124, + "learning_rate": 3.7686610418337083e-06, + "loss": 0.5039, "step": 1180 }, { - "epoch": 0.5684816502758455, - "grad_norm": 2.4626436519353616, - "learning_rate": 9.758378996454482e-06, - "loss": 0.923, + "epoch": 1.7054449508275367, + "grad_norm": 1.72911326258422, + "learning_rate": 3.7351796161462796e-06, + "loss": 0.4808, "step": 1185 }, { - "epoch": 0.5708803070280642, - "grad_norm": 4.129179717625037, - "learning_rate": 9.754073229677171e-06, - "loss": 0.972, + "epoch": 1.7126409210841929, + "grad_norm": 1.3747290591542072, + "learning_rate": 3.7017168161786215e-06, + "loss": 0.4993, "step": 1190 }, { - "epoch": 0.5732789637802831, - "grad_norm": 2.2820077397958123, - "learning_rate": 9.749730402859887e-06, - "loss": 0.994, + "epoch": 1.719836891340849, + "grad_norm": 1.4764634140281585, + "learning_rate": 3.6682749954834548e-06, + "loss": 0.5115, "step": 1195 }, { - "epoch": 0.5756776205325018, - "grad_norm": 2.609944354559159, - "learning_rate": 9.745350549856831e-06, - "loss": 0.8549, - "step": 1200 - }, - { - "epoch": 0.5756776205325018, - "eval_loss": 0.8886787295341492, - "eval_runtime": 739.2649, - "eval_samples_per_second": 10.023, - "eval_steps_per_second": 0.628, + "epoch": 1.7270328615975052, + "grad_norm": 1.6701936544900278, + "learning_rate": 3.634856506137956e-06, + "loss": 0.5653, "step": 1200 }, { - "epoch": 0.5780762772847206, - "grad_norm": 2.8948575829375565, - "learning_rate": 9.740933704810832e-06, - "loss": 0.8167, + "epoch": 1.7342288318541617, + "grad_norm": 1.5206960936360632, + "learning_rate": 3.6014636985783287e-06, + "loss": 0.521, "step": 1205 }, { - "epoch": 0.5804749340369393, - "grad_norm": 2.3042933975353463, - "learning_rate": 9.736479902153093e-06, - "loss": 0.8773, + "epoch": 1.741424802110818, + "grad_norm": 1.5138010545314067, + "learning_rate": 3.568098921434488e-06, + "loss": 0.4856, "step": 1210 }, { - "epoch": 0.5828735907891581, - "grad_norm": 3.3042761579341153, - "learning_rate": 9.731989176602918e-06, - "loss": 0.8711, + "epoch": 1.7486207723674743, + "grad_norm": 1.5508467662920749, + "learning_rate": 3.534764521364879e-06, + "loss": 0.4846, "step": 1215 }, { - "epoch": 0.5852722475413769, - "grad_norm": 3.448353009463417, - "learning_rate": 9.727461563167436e-06, - "loss": 0.8695, + "epoch": 1.7558167426241305, + "grad_norm": 1.294214634658577, + "learning_rate": 3.501462842891418e-06, + "loss": 0.4876, "step": 1220 }, { - "epoch": 0.5876709042935956, - "grad_norm": 2.628400374964348, - "learning_rate": 9.722897097141336e-06, - "loss": 0.9043, + "epoch": 1.7630127128807866, + "grad_norm": 1.4066532330498682, + "learning_rate": 3.4681962282346023e-06, + "loss": 0.4644, "step": 1225 }, { - "epoch": 0.5900695610458143, - "grad_norm": 3.519964559295127, - "learning_rate": 9.718295814106589e-06, - "loss": 0.8924, + "epoch": 1.770208683137443, + "grad_norm": 1.4435875920097863, + "learning_rate": 3.4349670171487714e-06, + "loss": 0.5199, "step": 1230 }, { - "epoch": 0.592468217798033, - "grad_norm": 2.9319636027613742, - "learning_rate": 9.713657749932172e-06, - "loss": 0.931, + "epoch": 1.7774046533940993, + "grad_norm": 1.5007291535231901, + "learning_rate": 3.4017775467575446e-06, + "loss": 0.5224, "step": 1235 }, { - "epoch": 0.5948668745502519, - "grad_norm": 2.938976926867282, - "learning_rate": 9.70898294077378e-06, - "loss": 0.8561, + "epoch": 1.7846006236507557, + "grad_norm": 1.2699334841273793, + "learning_rate": 3.3686301513894416e-06, + "loss": 0.4914, "step": 1240 }, { - "epoch": 0.5972655313024706, - "grad_norm": 2.29795980845488, - "learning_rate": 9.704271423073562e-06, - "loss": 0.8199, + "epoch": 1.7917965939074119, + "grad_norm": 1.2195698381350315, + "learning_rate": 3.3355271624137037e-06, + "loss": 0.4719, "step": 1245 }, { - "epoch": 0.5996641880546894, - "grad_norm": 2.398089494212856, - "learning_rate": 9.699523233559813e-06, - "loss": 0.8939, + "epoch": 1.798992564164068, + "grad_norm": 1.3895500926839512, + "learning_rate": 3.3024709080763186e-06, + "loss": 0.5144, "step": 1250 }, { - "epoch": 0.6020628448069081, - "grad_norm": 2.380808540854109, - "learning_rate": 9.69473840924671e-06, - "loss": 0.8052, + "epoch": 1.8061885344207242, + "grad_norm": 1.48913348325029, + "learning_rate": 3.269463713336268e-06, + "loss": 0.5103, "step": 1255 }, { - "epoch": 0.6044615015591269, - "grad_norm": 2.8543322126129915, - "learning_rate": 9.689916987434012e-06, - "loss": 0.8934, + "epoch": 1.8133845046773807, + "grad_norm": 1.3658603273721261, + "learning_rate": 3.236507899702005e-06, + "loss": 0.473, "step": 1260 }, { - "epoch": 0.6068601583113457, - "grad_norm": 4.013675664035515, - "learning_rate": 9.685059005706767e-06, - "loss": 0.8592, + "epoch": 1.820580474934037, + "grad_norm": 1.3775281668274946, + "learning_rate": 3.2036057850681745e-06, + "loss": 0.514, "step": 1265 }, { - "epoch": 0.6092588150635644, - "grad_norm": 2.557169604456725, - "learning_rate": 9.680164501935027e-06, - "loss": 0.9077, + "epoch": 1.8277764451906933, + "grad_norm": 1.631774330401289, + "learning_rate": 3.170759683552586e-06, + "loss": 0.5163, "step": 1270 }, { - "epoch": 0.6116574718157831, - "grad_norm": 2.8906956541267745, - "learning_rate": 9.675233514273552e-06, - "loss": 0.9295, + "epoch": 1.8349724154473495, + "grad_norm": 1.384315211463836, + "learning_rate": 3.137971905333458e-06, + "loss": 0.4752, "step": 1275 }, { - "epoch": 0.614056128568002, - "grad_norm": 2.550036815779647, - "learning_rate": 9.670266081161501e-06, - "loss": 0.9126, + "epoch": 1.8421683857040057, + "grad_norm": 1.4207130788508293, + "learning_rate": 3.1052447564869343e-06, + "loss": 0.5018, "step": 1280 }, { - "epoch": 0.6164547853202207, - "grad_norm": 3.015435597841737, - "learning_rate": 9.665262241322148e-06, - "loss": 0.8321, + "epoch": 1.849364355960662, + "grad_norm": 1.5148685580490273, + "learning_rate": 3.0725805388248834e-06, + "loss": 0.5127, "step": 1285 }, { - "epoch": 0.6188534420724394, - "grad_norm": 2.437308403492592, - "learning_rate": 9.66022203376257e-06, - "loss": 0.8764, + "epoch": 1.8565603262173183, + "grad_norm": 1.4919953654248346, + "learning_rate": 3.039981549733014e-06, + "loss": 0.4971, "step": 1290 }, { - "epoch": 0.6212520988246581, - "grad_norm": 2.655419936930409, - "learning_rate": 9.655145497773348e-06, - "loss": 0.8112, + "epoch": 1.8637562964739747, + "grad_norm": 2.065773895317616, + "learning_rate": 3.007450082009283e-06, + "loss": 0.4843, "step": 1295 }, { - "epoch": 0.623650755576877, - "grad_norm": 2.856040503173194, - "learning_rate": 9.650032672928256e-06, - "loss": 0.9197, - "step": 1300 - }, - { - "epoch": 0.623650755576877, - "eval_loss": 0.891351580619812, - "eval_runtime": 739.9462, - "eval_samples_per_second": 10.014, - "eval_steps_per_second": 0.627, + "epoch": 1.8709522667306309, + "grad_norm": 1.4849907079900204, + "learning_rate": 2.9749884237026426e-06, + "loss": 0.5102, "step": 1300 }, { - "epoch": 0.6260494123290957, - "grad_norm": 2.7662185790099976, - "learning_rate": 9.644883599083959e-06, - "loss": 0.7763, + "epoch": 1.878148236987287, + "grad_norm": 1.7567340972637704, + "learning_rate": 2.9425988579521103e-06, + "loss": 0.4901, "step": 1305 }, { - "epoch": 0.6284480690813145, - "grad_norm": 2.9200612566129838, - "learning_rate": 9.639698316379692e-06, - "loss": 0.8978, + "epoch": 1.8853442072439432, + "grad_norm": 1.4271802301503538, + "learning_rate": 2.910283662826188e-06, + "loss": 0.4805, "step": 1310 }, { - "epoch": 0.6308467258335332, - "grad_norm": 2.0716196866252163, - "learning_rate": 9.634476865236964e-06, - "loss": 0.838, + "epoch": 1.8925401775005997, + "grad_norm": 1.479518679920681, + "learning_rate": 2.8780451111626384e-06, + "loss": 0.4908, "step": 1315 }, { - "epoch": 0.633245382585752, - "grad_norm": 2.437636233160616, - "learning_rate": 9.62921928635922e-06, - "loss": 0.8614, + "epoch": 1.899736147757256, + "grad_norm": 1.5062976034854971, + "learning_rate": 2.8458854704086275e-06, + "loss": 0.491, "step": 1320 }, { - "epoch": 0.6356440393379708, - "grad_norm": 2.8685025444969385, - "learning_rate": 9.623925620731546e-06, - "loss": 0.8602, + "epoch": 1.9069321180139123, + "grad_norm": 1.552858614788501, + "learning_rate": 2.8138070024612504e-06, + "loss": 0.4787, "step": 1325 }, { - "epoch": 0.6380426960901895, - "grad_norm": 2.7442682238420826, - "learning_rate": 9.618595909620335e-06, - "loss": 0.8944, + "epoch": 1.9141280882705685, + "grad_norm": 1.668691870366938, + "learning_rate": 2.7818119635084392e-06, + "loss": 0.536, "step": 1330 }, { - "epoch": 0.6404413528424082, - "grad_norm": 2.9361485822984124, - "learning_rate": 9.613230194572972e-06, - "loss": 0.8433, + "epoch": 1.9213240585272247, + "grad_norm": 1.4857517196291667, + "learning_rate": 2.749902603870283e-06, + "loss": 0.5047, "step": 1335 }, { - "epoch": 0.6428400095946271, - "grad_norm": 3.095568397022331, - "learning_rate": 9.607828517417507e-06, - "loss": 0.8038, + "epoch": 1.928520028783881, + "grad_norm": 2.9875786042021932, + "learning_rate": 2.7180811678407525e-06, + "loss": 0.504, "step": 1340 }, { - "epoch": 0.6452386663468458, - "grad_norm": 2.46360142167789, - "learning_rate": 9.60239092026233e-06, - "loss": 0.8382, + "epoch": 1.9357159990405373, + "grad_norm": 1.3595766267302944, + "learning_rate": 2.686349893529849e-06, + "loss": 0.4863, "step": 1345 }, { - "epoch": 0.6476373230990645, - "grad_norm": 2.7582607273277344, - "learning_rate": 9.596917445495843e-06, - "loss": 0.8037, + "epoch": 1.9429119692971937, + "grad_norm": 1.4889142920291538, + "learning_rate": 2.6547110127061975e-06, + "loss": 0.4926, "step": 1350 }, { - "epoch": 0.6500359798512833, - "grad_norm": 2.9601669174372693, - "learning_rate": 9.591408135786132e-06, - "loss": 0.8638, + "epoch": 1.9501079395538499, + "grad_norm": 1.451668900558599, + "learning_rate": 2.6231667506400706e-06, + "loss": 0.4984, "step": 1355 }, { - "epoch": 0.6524346366035021, - "grad_norm": 3.127369086710566, - "learning_rate": 9.585863034080624e-06, - "loss": 0.8225, + "epoch": 1.957303909810506, + "grad_norm": 1.47137694607456, + "learning_rate": 2.591719325946883e-06, + "loss": 0.5209, "step": 1360 }, { - "epoch": 0.6548332933557208, - "grad_norm": 3.3773196297676256, - "learning_rate": 9.58028218360577e-06, - "loss": 0.8978, + "epoch": 1.9644998800671623, + "grad_norm": 1.5284770765800149, + "learning_rate": 2.560370950431146e-06, + "loss": 0.4603, "step": 1365 }, { - "epoch": 0.6572319501079396, - "grad_norm": 3.7552856472244502, - "learning_rate": 9.574665627866692e-06, - "loss": 0.8582, + "epoch": 1.9716958503238187, + "grad_norm": 1.402896466872614, + "learning_rate": 2.5291238289309054e-06, + "loss": 0.5077, "step": 1370 }, { - "epoch": 0.6596306068601583, - "grad_norm": 2.958749010394765, - "learning_rate": 9.56901341064685e-06, - "loss": 0.8175, + "epoch": 1.978891820580475, + "grad_norm": 1.4817926638302614, + "learning_rate": 2.497980159162667e-06, + "loss": 0.4839, "step": 1375 }, { - "epoch": 0.662029263612377, - "grad_norm": 2.9983246522004054, - "learning_rate": 9.563325576007702e-06, - "loss": 0.8431, + "epoch": 1.9860877908371313, + "grad_norm": 1.5453436757112435, + "learning_rate": 2.466942131566824e-06, + "loss": 0.4888, "step": 1380 }, { - "epoch": 0.6644279203645959, - "grad_norm": 3.074984178917188, - "learning_rate": 9.557602168288357e-06, - "loss": 0.9192, + "epoch": 1.9932837610937875, + "grad_norm": 1.4335485637084342, + "learning_rate": 2.4360119291535955e-06, + "loss": 0.4917, "step": 1385 }, { - "epoch": 0.6668265771168146, - "grad_norm": 2.531045908501192, - "learning_rate": 9.551843232105239e-06, - "loss": 0.8562, + "epoch": 2.0004797313504437, + "grad_norm": 1.6143369616539034, + "learning_rate": 2.405191727349489e-06, + "loss": 0.4993, "step": 1390 }, { - "epoch": 0.6692252338690333, - "grad_norm": 3.4852818268873174, - "learning_rate": 9.54604881235172e-06, - "loss": 0.8687, + "epoch": 2.0076757016071, + "grad_norm": 1.2224443867251211, + "learning_rate": 2.3744836938442936e-06, + "loss": 0.2088, "step": 1395 }, { - "epoch": 0.671623890621252, - "grad_norm": 2.23016124603345, - "learning_rate": 9.540218954197789e-06, - "loss": 0.8864, + "epoch": 2.0148716718637565, + "grad_norm": 1.2602299678447657, + "learning_rate": 2.3438899884386185e-06, + "loss": 0.1941, "step": 1400 }, { - "epoch": 0.671623890621252, - "eval_loss": 0.8827491998672485, - "eval_runtime": 739.7102, - "eval_samples_per_second": 10.017, + "epoch": 2.0148716718637565, + "eval_loss": 0.9261869192123413, + "eval_runtime": 740.3886, + "eval_samples_per_second": 10.008, "eval_steps_per_second": 0.627, "step": 1400 }, { - "epoch": 0.6740225473734709, - "grad_norm": 3.018147763364397, - "learning_rate": 9.534353703089692e-06, - "loss": 0.9238, + "epoch": 2.0220676421204127, + "grad_norm": 1.323505095452205, + "learning_rate": 2.3134127628919927e-06, + "loss": 0.1915, "step": 1405 }, { - "epoch": 0.6764212041256896, - "grad_norm": 2.554891580216872, - "learning_rate": 9.528453104749575e-06, - "loss": 0.8289, + "epoch": 2.029263612377069, + "grad_norm": 1.2958926807268987, + "learning_rate": 2.2830541607715136e-06, + "loss": 0.1736, "step": 1410 }, { - "epoch": 0.6788198608779084, - "grad_norm": 2.258802758649112, - "learning_rate": 9.522517205175133e-06, - "loss": 0.8195, + "epoch": 2.036459582633725, + "grad_norm": 1.271545256175082, + "learning_rate": 2.2528163173010927e-06, + "loss": 0.1845, "step": 1415 }, { - "epoch": 0.6812185176301271, - "grad_norm": 3.1309363033906594, - "learning_rate": 9.516546050639253e-06, - "loss": 0.9182, + "epoch": 2.0436555528903813, + "grad_norm": 1.5029993532268295, + "learning_rate": 2.2227013592112757e-06, + "loss": 0.1893, "step": 1420 }, { - "epoch": 0.6836171743823459, - "grad_norm": 2.799964298869419, - "learning_rate": 9.510539687689641e-06, - "loss": 0.8926, + "epoch": 2.0508515231470374, + "grad_norm": 1.2921857666544403, + "learning_rate": 2.192711404589658e-06, + "loss": 0.1958, "step": 1425 }, { - "epoch": 0.6860158311345647, - "grad_norm": 7.855666861983768, - "learning_rate": 9.504498163148476e-06, - "loss": 0.8561, + "epoch": 2.058047493403694, + "grad_norm": 1.2460289218576504, + "learning_rate": 2.162848562731916e-06, + "loss": 0.1994, "step": 1430 }, { - "epoch": 0.6884144878867834, - "grad_norm": 3.0545825578502996, - "learning_rate": 9.498421524112032e-06, - "loss": 0.8985, + "epoch": 2.0652434636603503, + "grad_norm": 1.2881623243419067, + "learning_rate": 2.133114933993452e-06, + "loss": 0.1935, "step": 1435 }, { - "epoch": 0.6908131446390021, - "grad_norm": 3.2954044794247404, - "learning_rate": 9.492309817950315e-06, - "loss": 0.9906, + "epoch": 2.0724394339170065, + "grad_norm": 1.1792218418956621, + "learning_rate": 2.1035126096416704e-06, + "loss": 0.1951, "step": 1440 }, { - "epoch": 0.693211801391221, - "grad_norm": 2.5123813211532684, - "learning_rate": 9.486163092306699e-06, - "loss": 0.8587, + "epoch": 2.0796354041736627, + "grad_norm": 1.284870948942911, + "learning_rate": 2.07404367170889e-06, + "loss": 0.1948, "step": 1445 }, { - "epoch": 0.6956104581434397, - "grad_norm": 3.0011863719016527, - "learning_rate": 9.479981395097545e-06, - "loss": 0.8932, + "epoch": 2.086831374430319, + "grad_norm": 1.2222749381574636, + "learning_rate": 2.0447101928459083e-06, + "loss": 0.1927, "step": 1450 }, { - "epoch": 0.6980091148956584, - "grad_norm": 3.0839066691421775, - "learning_rate": 9.473764774511833e-06, - "loss": 0.8058, + "epoch": 2.0940273446869755, + "grad_norm": 1.4089391140981338, + "learning_rate": 2.0155142361762256e-06, + "loss": 0.1553, "step": 1455 }, { - "epoch": 0.7004077716478772, - "grad_norm": 2.2891983688771, - "learning_rate": 9.467513279010792e-06, - "loss": 0.7818, + "epoch": 2.1012233149436317, + "grad_norm": 1.1670069976157664, + "learning_rate": 1.986457855150937e-06, + "loss": 0.1882, "step": 1460 }, { - "epoch": 0.702806428400096, - "grad_norm": 2.725073147047602, - "learning_rate": 9.461226957327506e-06, - "loss": 0.8881, + "epoch": 2.108419285200288, + "grad_norm": 1.20035808468667, + "learning_rate": 1.957543093404309e-06, + "loss": 0.1723, "step": 1465 }, { - "epoch": 0.7052050851523147, - "grad_norm": 3.1730042623525083, - "learning_rate": 9.454905858466547e-06, - "loss": 0.8823, + "epoch": 2.115615255456944, + "grad_norm": 1.2485627377889825, + "learning_rate": 1.9287719846100366e-06, + "loss": 0.1841, "step": 1470 }, { - "epoch": 0.7076037419045335, - "grad_norm": 2.8737424156514706, - "learning_rate": 9.448550031703599e-06, - "loss": 0.8467, + "epoch": 2.1228112257136003, + "grad_norm": 1.4758861123400375, + "learning_rate": 1.900146552338222e-06, + "loss": 0.1989, "step": 1475 }, { - "epoch": 0.7100023986567522, - "grad_norm": 2.5220020914196914, - "learning_rate": 9.442159526585052e-06, - "loss": 0.8828, + "epoch": 2.1300071959702565, + "grad_norm": 1.415660377393989, + "learning_rate": 1.8716688099130336e-06, + "loss": 0.1792, "step": 1480 }, { - "epoch": 0.712401055408971, - "grad_norm": 3.219145860445945, - "learning_rate": 9.435734392927639e-06, - "loss": 0.8935, + "epoch": 2.137203166226913, + "grad_norm": 1.1398390415745234, + "learning_rate": 1.8433407602711122e-06, + "loss": 0.1828, "step": 1485 }, { - "epoch": 0.7147997121611898, - "grad_norm": 3.4596708965275673, - "learning_rate": 9.429274680818029e-06, - "loss": 0.8935, + "epoch": 2.1443991364835693, + "grad_norm": 1.436825768706905, + "learning_rate": 1.8151643958206963e-06, + "loss": 0.1873, "step": 1490 }, { - "epoch": 0.7171983689134085, - "grad_norm": 2.180515903835741, - "learning_rate": 9.422780440612449e-06, - "loss": 0.9376, + "epoch": 2.1515951067402255, + "grad_norm": 1.2111903866598819, + "learning_rate": 1.7871416983014864e-06, + "loss": 0.1747, "step": 1495 }, { - "epoch": 0.7195970256656272, - "grad_norm": 3.5672104188512, - "learning_rate": 9.416251722936289e-06, - "loss": 0.8231, - "step": 1500 - }, - { - "epoch": 0.7195970256656272, - "eval_loss": 0.8757531642913818, - "eval_runtime": 740.9577, - "eval_samples_per_second": 10.001, - "eval_steps_per_second": 0.626, + "epoch": 2.1587910769968817, + "grad_norm": 1.592322648486121, + "learning_rate": 1.7592746386452641e-06, + "loss": 0.1981, "step": 1500 }, { - "epoch": 0.721995682417846, - "grad_norm": 5.348376903765216, - "learning_rate": 9.409688578683702e-06, - "loss": 0.9555, + "epoch": 2.165987047253538, + "grad_norm": 1.3381476033081696, + "learning_rate": 1.7315651768372734e-06, + "loss": 0.1752, "step": 1505 }, { - "epoch": 0.7243943391700648, - "grad_norm": 3.0987915733129, - "learning_rate": 9.403091059017214e-06, - "loss": 0.894, + "epoch": 2.1731830175101945, + "grad_norm": 1.5243125399513529, + "learning_rate": 1.7040152617783607e-06, + "loss": 0.1797, "step": 1510 }, { - "epoch": 0.7267929959222835, - "grad_norm": 2.1853482972820113, - "learning_rate": 9.396459215367319e-06, - "loss": 0.7919, + "epoch": 2.1803789877668507, + "grad_norm": 1.5148343638714192, + "learning_rate": 1.6766268311479078e-06, + "loss": 0.193, "step": 1515 }, { - "epoch": 0.7291916526745023, - "grad_norm": 2.950634652802006, - "learning_rate": 9.389793099432083e-06, - "loss": 0.7889, + "epoch": 2.187574958023507, + "grad_norm": 1.315102374687142, + "learning_rate": 1.649401811267546e-06, + "loss": 0.1889, "step": 1520 }, { - "epoch": 0.731590309426721, - "grad_norm": 2.551117094087631, - "learning_rate": 9.38309276317674e-06, - "loss": 0.9133, + "epoch": 2.194770928280163, + "grad_norm": 1.5038597370043303, + "learning_rate": 1.622342116965672e-06, + "loss": 0.2193, "step": 1525 }, { - "epoch": 0.7339889661789398, - "grad_norm": 3.484732257795789, - "learning_rate": 9.376358258833283e-06, - "loss": 0.9706, + "epoch": 2.2019668985368193, + "grad_norm": 1.3456620640508148, + "learning_rate": 1.595449651442771e-06, + "loss": 0.1842, "step": 1530 }, { - "epoch": 0.7363876229311586, - "grad_norm": 2.526861763894699, - "learning_rate": 9.369589638900065e-06, - "loss": 0.9191, + "epoch": 2.2091628687934755, + "grad_norm": 1.3647300470767014, + "learning_rate": 1.5687263061375595e-06, + "loss": 0.1752, "step": 1535 }, { - "epoch": 0.7387862796833773, - "grad_norm": 3.206173841917802, - "learning_rate": 9.362786956141382e-06, - "loss": 0.8839, + "epoch": 2.216358839050132, + "grad_norm": 1.417987485184227, + "learning_rate": 1.5421739605939518e-06, + "loss": 0.1728, "step": 1540 }, { - "epoch": 0.741184936435596, - "grad_norm": 4.53746680005531, - "learning_rate": 9.35595026358706e-06, - "loss": 0.8787, + "epoch": 2.2235548093067883, + "grad_norm": 1.5887020304276804, + "learning_rate": 1.5157944823288672e-06, + "loss": 0.1637, "step": 1545 }, { - "epoch": 0.7435835931878149, - "grad_norm": 3.1177510145335425, - "learning_rate": 9.34907961453205e-06, - "loss": 0.9194, + "epoch": 2.2307507795634445, + "grad_norm": 1.3375708672110973, + "learning_rate": 1.4895897267008782e-06, + "loss": 0.1792, "step": 1550 }, { - "epoch": 0.7459822499400336, - "grad_norm": 2.3736625117965278, - "learning_rate": 9.342175062536012e-06, - "loss": 0.8542, + "epoch": 2.2379467498201007, + "grad_norm": 1.3565700525423485, + "learning_rate": 1.463561536779724e-06, + "loss": 0.1921, "step": 1555 }, { - "epoch": 0.7483809066922523, - "grad_norm": 3.443392708153097, - "learning_rate": 9.33523666142289e-06, - "loss": 0.8262, + "epoch": 2.245142720076757, + "grad_norm": 1.5551856772129453, + "learning_rate": 1.4377117432166718e-06, + "loss": 0.1618, "step": 1560 }, { - "epoch": 0.750779563444471, - "grad_norm": 2.4419951167114515, - "learning_rate": 9.328264465280494e-06, - "loss": 0.8791, + "epoch": 2.2523386903334135, + "grad_norm": 1.2100448164204372, + "learning_rate": 1.4120421641157662e-06, + "loss": 0.1928, "step": 1565 }, { - "epoch": 0.7531782201966899, - "grad_norm": 3.3955749482732402, - "learning_rate": 9.321258528460087e-06, - "loss": 0.8317, + "epoch": 2.2595346605900697, + "grad_norm": 1.438877153368831, + "learning_rate": 1.386554604905955e-06, + "loss": 0.1774, "step": 1570 }, { - "epoch": 0.7555768769489086, - "grad_norm": 2.7100494409509364, - "learning_rate": 9.314218905575947e-06, - "loss": 0.8661, + "epoch": 2.266730630846726, + "grad_norm": 1.2780217507242704, + "learning_rate": 1.3612508582141065e-06, + "loss": 0.1871, "step": 1575 }, { - "epoch": 0.7579755337011274, - "grad_norm": 2.8677532226727607, - "learning_rate": 9.307145651504959e-06, - "loss": 0.7924, + "epoch": 2.273926601103382, + "grad_norm": 1.3558845492725387, + "learning_rate": 1.3361327037389295e-06, + "loss": 0.2018, "step": 1580 }, { - "epoch": 0.7603741904533461, - "grad_norm": 2.8165977913128133, - "learning_rate": 9.300038821386167e-06, - "loss": 0.8922, + "epoch": 2.2811225713600383, + "grad_norm": 1.3490250928179355, + "learning_rate": 1.3112019081257986e-06, + "loss": 0.1731, "step": 1585 }, { - "epoch": 0.7627728472055649, - "grad_norm": 2.632477549615508, - "learning_rate": 9.292898470620364e-06, - "loss": 0.79, + "epoch": 2.2883185416166945, + "grad_norm": 1.2405141654870557, + "learning_rate": 1.2864602248425018e-06, + "loss": 0.1886, "step": 1590 }, { - "epoch": 0.7651715039577837, - "grad_norm": 3.2008058338812155, - "learning_rate": 9.285724654869646e-06, - "loss": 0.8729, + "epoch": 2.295514511873351, + "grad_norm": 1.2873724354006912, + "learning_rate": 1.2619093940559138e-06, + "loss": 0.1868, "step": 1595 }, { - "epoch": 0.7675701607100024, - "grad_norm": 2.8865776414732096, - "learning_rate": 9.278517430056983e-06, - "loss": 0.8658, - "step": 1600 - }, - { - "epoch": 0.7675701607100024, - "eval_loss": 0.8723194599151611, - "eval_runtime": 741.5116, - "eval_samples_per_second": 9.993, - "eval_steps_per_second": 0.626, + "epoch": 2.3027104821300073, + "grad_norm": 1.3107124153475105, + "learning_rate": 1.2375511425096013e-06, + "loss": 0.187, "step": 1600 }, { - "epoch": 0.7699688174622211, - "grad_norm": 3.215530709050795, - "learning_rate": 9.271276852365785e-06, - "loss": 0.8832, + "epoch": 2.3099064523866635, + "grad_norm": 1.3468010137925535, + "learning_rate": 1.213387183402378e-06, + "loss": 0.1771, "step": 1605 }, { - "epoch": 0.77236747421444, - "grad_norm": 2.0508885653823263, - "learning_rate": 9.264002978239459e-06, - "loss": 0.809, + "epoch": 2.3171024226433197, + "grad_norm": 1.4179240822671797, + "learning_rate": 1.1894192162678086e-06, + "loss": 0.1654, "step": 1610 }, { - "epoch": 0.7747661309666587, - "grad_norm": 2.752742925623021, - "learning_rate": 9.256695864380975e-06, - "loss": 0.8717, + "epoch": 2.324298392899976, + "grad_norm": 1.3848546480668056, + "learning_rate": 1.165648926854672e-06, + "loss": 0.1838, "step": 1615 }, { - "epoch": 0.7771647877188774, - "grad_norm": 2.6147386599664824, - "learning_rate": 9.24935556775242e-06, - "loss": 0.8645, + "epoch": 2.331494363156632, + "grad_norm": 1.5195023852589002, + "learning_rate": 1.1420779870084052e-06, + "loss": 0.1955, "step": 1620 }, { - "epoch": 0.7795634444710962, - "grad_norm": 2.7748844905168153, - "learning_rate": 9.241982145574555e-06, - "loss": 0.8061, + "epoch": 2.3386903334132887, + "grad_norm": 1.2410727385995408, + "learning_rate": 1.1187080545535064e-06, + "loss": 0.1685, "step": 1625 }, { - "epoch": 0.781962101223315, - "grad_norm": 2.8763749911858216, - "learning_rate": 9.234575655326369e-06, - "loss": 0.8543, + "epoch": 2.345886303669945, + "grad_norm": 1.1237477417805415, + "learning_rate": 1.09554077317694e-06, + "loss": 0.1824, "step": 1630 }, { - "epoch": 0.7843607579755337, - "grad_norm": 2.3381964392401318, - "learning_rate": 9.227136154744626e-06, - "loss": 0.7687, + "epoch": 2.353082273926601, + "grad_norm": 1.2937342096954545, + "learning_rate": 1.0725777723125301e-06, + "loss": 0.1943, "step": 1635 }, { - "epoch": 0.7867594147277525, - "grad_norm": 2.798456190922495, - "learning_rate": 9.219663701823427e-06, - "loss": 0.7947, + "epoch": 2.3602782441832573, + "grad_norm": 1.2828779926698606, + "learning_rate": 1.0498206670263567e-06, + "loss": 0.1832, "step": 1640 }, { - "epoch": 0.7891580714799712, - "grad_norm": 4.15300651724386, - "learning_rate": 9.21215835481375e-06, - "loss": 0.84, + "epoch": 2.3674742144399135, + "grad_norm": 1.253425033010922, + "learning_rate": 1.0272710579031616e-06, + "loss": 0.2044, "step": 1645 }, { - "epoch": 0.7915567282321899, - "grad_norm": 3.176612904560953, - "learning_rate": 9.20462017222299e-06, - "loss": 0.8498, + "epoch": 2.37467018469657, + "grad_norm": 1.3678742472737333, + "learning_rate": 1.0049305309337758e-06, + "loss": 0.1672, "step": 1650 }, { - "epoch": 0.7939553849844088, - "grad_norm": 2.6908797712355934, - "learning_rate": 9.197049212814518e-06, - "loss": 0.8092, + "epoch": 2.3818661549532263, + "grad_norm": 1.5542727998398753, + "learning_rate": 9.82800657403569e-07, + "loss": 0.1955, "step": 1655 }, { - "epoch": 0.7963540417366275, - "grad_norm": 3.5662583410447324, - "learning_rate": 9.189445535607207e-06, - "loss": 0.9158, + "epoch": 2.3890621252098825, + "grad_norm": 1.4017624513152087, + "learning_rate": 9.60882993781937e-07, + "loss": 0.1733, "step": 1660 }, { - "epoch": 0.7987526984888462, - "grad_norm": 2.882549375268096, - "learning_rate": 9.181809199874983e-06, - "loss": 0.8807, + "epoch": 2.3962580954665387, + "grad_norm": 1.199342554533447, + "learning_rate": 9.391790816128304e-07, + "loss": 0.1649, "step": 1665 }, { - "epoch": 0.801151355241065, - "grad_norm": 2.2541644302033452, - "learning_rate": 9.174140265146356e-06, - "loss": 0.8684, + "epoch": 2.403454065723195, + "grad_norm": 1.2335679341459465, + "learning_rate": 9.176904474063319e-07, + "loss": 0.198, "step": 1670 }, { - "epoch": 0.8035500119932838, - "grad_norm": 2.403498333088569, - "learning_rate": 9.166438791203967e-06, - "loss": 0.8122, + "epoch": 2.4106500359798515, + "grad_norm": 1.4585828923578052, + "learning_rate": 8.964186025312908e-07, + "loss": 0.1988, "step": 1675 }, { - "epoch": 0.8059486687455025, - "grad_norm": 2.9396774647398383, - "learning_rate": 9.158704838084102e-06, - "loss": 0.8704, + "epoch": 2.4178460062365077, + "grad_norm": 1.4518660782918198, + "learning_rate": 8.753650431090252e-07, + "loss": 0.1701, "step": 1680 }, { - "epoch": 0.8083473254977213, - "grad_norm": 2.291793929286392, - "learning_rate": 9.15093846607625e-06, - "loss": 0.8499, + "epoch": 2.425041976493164, + "grad_norm": 1.3322728405275928, + "learning_rate": 8.545312499080922e-07, + "loss": 0.1729, "step": 1685 }, { - "epoch": 0.81074598224994, - "grad_norm": 4.646259487690163, - "learning_rate": 9.143139735722607e-06, - "loss": 0.8904, + "epoch": 2.43223794674982, + "grad_norm": 1.3067316057050342, + "learning_rate": 8.339186882401445e-07, + "loss": 0.1874, "step": 1690 }, { - "epoch": 0.8131446390021588, - "grad_norm": 3.3102326837687173, - "learning_rate": 9.135308707817623e-06, - "loss": 0.79, + "epoch": 2.4394339170064763, + "grad_norm": 1.4177678336114292, + "learning_rate": 8.135288078568656e-07, + "loss": 0.2021, "step": 1695 }, { - "epoch": 0.8155432957543776, - "grad_norm": 6.864638773101543, - "learning_rate": 9.12744544340752e-06, - "loss": 0.8506, - "step": 1700 - }, - { - "epoch": 0.8155432957543776, - "eval_loss": 0.8721764087677002, - "eval_runtime": 740.915, - "eval_samples_per_second": 10.001, - "eval_steps_per_second": 0.626, + "epoch": 2.4466298872631325, + "grad_norm": 1.3121080863750958, + "learning_rate": 7.933630428480049e-07, + "loss": 0.1699, "step": 1700 }, { - "epoch": 0.8179419525065963, - "grad_norm": 2.9260118718517245, - "learning_rate": 9.119550003789815e-06, - "loss": 0.9302, + "epoch": 2.453825857519789, + "grad_norm": 1.3185780885959946, + "learning_rate": 7.734228115405161e-07, + "loss": 0.1624, "step": 1705 }, { - "epoch": 0.820340609258815, - "grad_norm": 2.7721431237181133, - "learning_rate": 9.111622450512846e-06, - "loss": 0.8453, + "epoch": 2.4610218277764453, + "grad_norm": 1.33019533604804, + "learning_rate": 7.537095163987972e-07, + "loss": 0.1784, "step": 1710 }, { - "epoch": 0.8227392660110339, - "grad_norm": 2.955453435688259, - "learning_rate": 9.103662845375294e-06, - "loss": 0.9435, + "epoch": 2.4682177980331015, + "grad_norm": 1.3853774517952444, + "learning_rate": 7.342245439260537e-07, + "loss": 0.1824, "step": 1715 }, { - "epoch": 0.8251379227632526, - "grad_norm": 2.658722267205169, - "learning_rate": 9.095671250425693e-06, - "loss": 0.8011, + "epoch": 2.4754137682897577, + "grad_norm": 1.1804687459435843, + "learning_rate": 7.149692645667804e-07, + "loss": 0.1693, "step": 1720 }, { - "epoch": 0.8275365795154713, - "grad_norm": 2.6804272332523036, - "learning_rate": 9.087647727961956e-06, - "loss": 0.8641, + "epoch": 2.482609738546414, + "grad_norm": 1.250231314429457, + "learning_rate": 6.959450326103722e-07, + "loss": 0.2067, "step": 1725 }, { - "epoch": 0.8299352362676901, - "grad_norm": 3.39051606914781, - "learning_rate": 9.079592340530879e-06, - "loss": 0.8148, + "epoch": 2.48980570880307, + "grad_norm": 1.3184620916504868, + "learning_rate": 6.771531860958726e-07, + "loss": 0.1557, "step": 1730 }, { - "epoch": 0.8323338930199089, - "grad_norm": 2.537680913646963, - "learning_rate": 9.071505150927663e-06, - "loss": 0.8964, + "epoch": 2.4970016790597267, + "grad_norm": 1.3996911523285738, + "learning_rate": 6.585950467178656e-07, + "loss": 0.1984, "step": 1735 }, { - "epoch": 0.8347325497721276, - "grad_norm": 5.12661162717378, - "learning_rate": 9.06338622219542e-06, - "loss": 0.8701, + "epoch": 2.504197649316383, + "grad_norm": 1.330732277956789, + "learning_rate": 6.402719197335181e-07, + "loss": 0.1656, "step": 1740 }, { - "epoch": 0.8371312065243464, - "grad_norm": 4.645974768815863, - "learning_rate": 9.055235617624682e-06, - "loss": 0.8514, + "epoch": 2.511393619573039, + "grad_norm": 1.3782406997114114, + "learning_rate": 6.22185093870772e-07, + "loss": 0.1669, "step": 1745 }, { - "epoch": 0.8395298632765651, - "grad_norm": 3.084982306927161, - "learning_rate": 9.047053400752907e-06, - "loss": 0.8877, + "epoch": 2.5185895898296953, + "grad_norm": 1.4431968846802443, + "learning_rate": 6.043358412377069e-07, + "loss": 0.1799, "step": 1750 }, { - "epoch": 0.8419285200287839, - "grad_norm": 2.576877326965564, - "learning_rate": 9.038839635363987e-06, - "loss": 0.8163, + "epoch": 2.5257855600863515, + "grad_norm": 1.1865288276002492, + "learning_rate": 5.867254172330689e-07, + "loss": 0.1614, "step": 1755 }, { - "epoch": 0.8443271767810027, - "grad_norm": 3.51019570508426, - "learning_rate": 9.030594385487745e-06, - "loss": 0.8915, + "epoch": 2.532981530343008, + "grad_norm": 1.3447844251083265, + "learning_rate": 5.693550604579722e-07, + "loss": 0.1761, "step": 1760 }, { - "epoch": 0.8467258335332214, - "grad_norm": 3.8600342540679025, - "learning_rate": 9.02231771539944e-06, - "loss": 0.9304, + "epoch": 2.5401775005996643, + "grad_norm": 1.312290863998097, + "learning_rate": 5.52225992628784e-07, + "loss": 0.175, "step": 1765 }, { - "epoch": 0.8491244902854401, - "grad_norm": 2.703374255650966, - "learning_rate": 9.014009689619267e-06, - "loss": 0.8352, + "epoch": 2.5473734708563205, + "grad_norm": 1.325480546799902, + "learning_rate": 5.353394184912012e-07, + "loss": 0.1893, "step": 1770 }, { - "epoch": 0.8515231470376589, - "grad_norm": 2.3896428975452553, - "learning_rate": 9.005670372911848e-06, - "loss": 0.8556, + "epoch": 2.5545694411129767, + "grad_norm": 1.211006197074522, + "learning_rate": 5.186965257355092e-07, + "loss": 0.1738, "step": 1775 }, { - "epoch": 0.8539218037898777, - "grad_norm": 3.008401281219767, - "learning_rate": 8.997299830285735e-06, - "loss": 0.8637, + "epoch": 2.561765411369633, + "grad_norm": 1.2613128106853304, + "learning_rate": 5.022984849130542e-07, + "loss": 0.1735, "step": 1780 }, { - "epoch": 0.8563204605420964, - "grad_norm": 3.3845621217286315, - "learning_rate": 8.9888981269929e-06, - "loss": 0.9349, + "epoch": 2.5689613816262895, + "grad_norm": 1.4240080375407917, + "learning_rate": 4.861464493539116e-07, + "loss": 0.209, "step": 1785 }, { - "epoch": 0.8587191172943152, - "grad_norm": 3.066551980706959, - "learning_rate": 8.98046532852822e-06, - "loss": 1.0207, + "epoch": 2.5761573518829457, + "grad_norm": 1.212642870699417, + "learning_rate": 4.702415550857668e-07, + "loss": 0.1661, "step": 1790 }, { - "epoch": 0.8611177740465339, - "grad_norm": 3.593558463375603, - "learning_rate": 8.972001500628978e-06, - "loss": 0.8669, + "epoch": 2.583353322139602, + "grad_norm": 1.19899124906289, + "learning_rate": 4.5458492075401845e-07, + "loss": 0.1871, "step": 1795 }, { - "epoch": 0.8635164307987527, - "grad_norm": 4.827873639267246, - "learning_rate": 8.963506709274344e-06, - "loss": 0.9533, - "step": 1800 - }, - { - "epoch": 0.8635164307987527, - "eval_loss": 0.8710312247276306, - "eval_runtime": 740.1915, - "eval_samples_per_second": 10.011, - "eval_steps_per_second": 0.627, + "epoch": 2.590549292396258, + "grad_norm": 1.2451776201467897, + "learning_rate": 4.391776475430964e-07, + "loss": 0.1736, "step": 1800 }, { - "epoch": 0.8659150875509715, - "grad_norm": 2.1671636510448913, - "learning_rate": 8.95498102068486e-06, - "loss": 0.8423, + "epoch": 2.5977452626529143, + "grad_norm": 1.4217111682942414, + "learning_rate": 4.240208190990149e-07, + "loss": 0.1656, "step": 1805 }, { - "epoch": 0.8683137443031902, - "grad_norm": 2.2993204223464954, - "learning_rate": 8.946424501321922e-06, - "loss": 0.9611, + "epoch": 2.604941232909571, + "grad_norm": 1.154023125578338, + "learning_rate": 4.0911550145315356e-07, + "loss": 0.176, "step": 1810 }, { - "epoch": 0.8707124010554089, - "grad_norm": 3.6181635510094323, - "learning_rate": 8.937837217887273e-06, - "loss": 0.9857, + "epoch": 2.6121372031662267, + "grad_norm": 1.2517982852871838, + "learning_rate": 3.944627429472809e-07, + "loss": 0.168, "step": 1815 }, { - "epoch": 0.8731110578076278, - "grad_norm": 2.3157461628034968, - "learning_rate": 8.929219237322468e-06, - "loss": 0.8967, + "epoch": 2.6193331734228833, + "grad_norm": 1.3001175217867729, + "learning_rate": 3.8006357415981947e-07, + "loss": 0.1582, "step": 1820 }, { - "epoch": 0.8755097145598465, - "grad_norm": 2.3766663633520593, - "learning_rate": 8.920570626808364e-06, - "loss": 0.8718, + "epoch": 2.6265291436795395, + "grad_norm": 1.4179539106113206, + "learning_rate": 3.659190078333667e-07, + "loss": 0.1901, "step": 1825 }, { - "epoch": 0.8779083713120652, - "grad_norm": 3.1542811825496617, - "learning_rate": 8.911891453764587e-06, - "loss": 0.8869, + "epoch": 2.6337251139361957, + "grad_norm": 1.2865481274768071, + "learning_rate": 3.5203003880345786e-07, + "loss": 0.1825, "step": 1830 }, { - "epoch": 0.880307028064284, - "grad_norm": 2.3205179227394765, - "learning_rate": 8.903181785849016e-06, - "loss": 0.9165, + "epoch": 2.640921084192852, + "grad_norm": 1.2107327771575902, + "learning_rate": 3.383976439286007e-07, + "loss": 0.178, "step": 1835 }, { - "epoch": 0.8827056848165028, - "grad_norm": 2.395604854053, - "learning_rate": 8.894441690957246e-06, - "loss": 0.8951, + "epoch": 2.648117054449508, + "grad_norm": 1.4930579520298934, + "learning_rate": 3.250227820215694e-07, + "loss": 0.1795, "step": 1840 }, { - "epoch": 0.8851043415687215, - "grad_norm": 2.4063566793229674, - "learning_rate": 8.88567123722207e-06, - "loss": 0.867, + "epoch": 2.6553130247061647, + "grad_norm": 1.7580144453795274, + "learning_rate": 3.119063937819666e-07, + "loss": 0.1988, "step": 1845 }, { - "epoch": 0.8875029983209403, - "grad_norm": 3.262096284773896, - "learning_rate": 8.876870493012931e-06, - "loss": 0.8811, + "epoch": 2.662508994962821, + "grad_norm": 1.389677232858989, + "learning_rate": 2.990494017300604e-07, + "loss": 0.189, "step": 1850 }, { - "epoch": 0.889901655073159, - "grad_norm": 4.069658033112594, - "learning_rate": 8.868039526935407e-06, - "loss": 0.8339, + "epoch": 2.669704965219477, + "grad_norm": 1.4778063736068945, + "learning_rate": 2.864527101419032e-07, + "loss": 0.2053, "step": 1855 }, { - "epoch": 0.8923003118253778, - "grad_norm": 3.192868682184835, - "learning_rate": 8.859178407830668e-06, - "loss": 0.7906, + "epoch": 2.6769009354761333, + "grad_norm": 1.2577420076989798, + "learning_rate": 2.7411720498572744e-07, + "loss": 0.1917, "step": 1860 }, { - "epoch": 0.8946989685775966, - "grad_norm": 2.478055612898803, - "learning_rate": 8.850287204774936e-06, - "loss": 0.9756, + "epoch": 2.6840969057327895, + "grad_norm": 1.5130645195940433, + "learning_rate": 2.6204375385963494e-07, + "loss": 0.161, "step": 1865 }, { - "epoch": 0.8970976253298153, - "grad_norm": 2.509818105526665, - "learning_rate": 8.841365987078955e-06, - "loss": 0.8408, + "epoch": 2.691292875989446, + "grad_norm": 1.080302530956707, + "learning_rate": 2.502332059305745e-07, + "loss": 0.1752, "step": 1870 }, { - "epoch": 0.899496282082034, - "grad_norm": 2.726184599074837, - "learning_rate": 8.832414824287441e-06, - "loss": 0.885, + "epoch": 2.6984888462461023, + "grad_norm": 1.306131392662643, + "learning_rate": 2.386863918746167e-07, + "loss": 0.1968, "step": 1875 }, { - "epoch": 0.9018949388342529, - "grad_norm": 3.068184221265625, - "learning_rate": 8.823433786178549e-06, - "loss": 0.8282, + "epoch": 2.7056848165027585, + "grad_norm": 1.3461515984684975, + "learning_rate": 2.2740412381853223e-07, + "loss": 0.183, "step": 1880 }, { - "epoch": 0.9042935955864716, - "grad_norm": 5.076920426744664, - "learning_rate": 8.814422942763322e-06, - "loss": 0.8414, + "epoch": 2.7128807867594147, + "grad_norm": 1.4486810753503954, + "learning_rate": 2.1638719528266835e-07, + "loss": 0.1938, "step": 1885 }, { - "epoch": 0.9066922523386903, - "grad_norm": 2.0951998113700445, - "learning_rate": 8.805382364285152e-06, - "loss": 0.9026, + "epoch": 2.720076757016071, + "grad_norm": 1.035281562927121, + "learning_rate": 2.0563638112514047e-07, + "loss": 0.1823, "step": 1890 }, { - "epoch": 0.9090909090909091, - "grad_norm": 2.5304664436244564, - "learning_rate": 8.79631212121922e-06, - "loss": 0.8819, + "epoch": 2.7272727272727275, + "grad_norm": 1.4407581503306328, + "learning_rate": 1.9515243748733455e-07, + "loss": 0.1648, "step": 1895 }, { - "epoch": 0.9114895658431279, - "grad_norm": 2.910233568728739, - "learning_rate": 8.787212284271969e-06, - "loss": 0.7901, - "step": 1900 - }, - { - "epoch": 0.9114895658431279, - "eval_loss": 0.865532398223877, - "eval_runtime": 739.6594, - "eval_samples_per_second": 10.018, - "eval_steps_per_second": 0.627, + "epoch": 2.7344686975293837, + "grad_norm": 1.2289916037617492, + "learning_rate": 1.8493610174072248e-07, + "loss": 0.1716, "step": 1900 }, { - "epoch": 0.9138882225953466, - "grad_norm": 3.0968516736110385, - "learning_rate": 8.778082924380527e-06, - "loss": 0.8504, + "epoch": 2.74166466778604, + "grad_norm": 1.1641895371111006, + "learning_rate": 1.7498809243500133e-07, + "loss": 0.1659, "step": 1905 }, { - "epoch": 0.9162868793475654, - "grad_norm": 2.843647032000645, - "learning_rate": 8.76892411271217e-06, - "loss": 0.8945, + "epoch": 2.748860638042696, + "grad_norm": 1.2177853046541605, + "learning_rate": 1.6530910924755603e-07, + "loss": 0.1905, "step": 1910 }, { - "epoch": 0.9186855360997841, - "grad_norm": 2.7639703028981915, - "learning_rate": 8.759735920663764e-06, - "loss": 0.8549, + "epoch": 2.7560566082993523, + "grad_norm": 1.373432351937655, + "learning_rate": 1.5589983293424802e-07, + "loss": 0.1948, "step": 1915 }, { - "epoch": 0.9210841928520028, - "grad_norm": 5.035770193454392, - "learning_rate": 8.750518419861206e-06, - "loss": 0.9528, + "epoch": 2.7632525785560085, + "grad_norm": 1.2223263222194338, + "learning_rate": 1.4676092528153495e-07, + "loss": 0.1635, "step": 1920 }, { - "epoch": 0.9234828496042217, - "grad_norm": 2.2246289636977004, - "learning_rate": 8.74127168215887e-06, - "loss": 0.8214, + "epoch": 2.7704485488126647, + "grad_norm": 1.323423227538223, + "learning_rate": 1.378930290599265e-07, + "loss": 0.1941, "step": 1925 }, { - "epoch": 0.9258815063564404, - "grad_norm": 3.5156544905473828, - "learning_rate": 8.731995779639042e-06, - "loss": 0.8866, + "epoch": 2.7776445190693213, + "grad_norm": 1.1747342118230812, + "learning_rate": 1.29296767978774e-07, + "loss": 0.1556, "step": 1930 }, { - "epoch": 0.9282801631086591, - "grad_norm": 3.0156056794105752, - "learning_rate": 8.72269078461136e-06, - "loss": 0.9573, + "epoch": 2.7848404893259775, + "grad_norm": 1.4551159106122102, + "learning_rate": 1.2097274664240486e-07, + "loss": 0.1778, "step": 1935 }, { - "epoch": 0.9306788198608779, - "grad_norm": 3.063423244582553, - "learning_rate": 8.713356769612254e-06, - "loss": 0.8277, + "epoch": 2.7920364595826337, + "grad_norm": 1.2601624341009796, + "learning_rate": 1.1292155050759689e-07, + "loss": 0.183, "step": 1940 }, { - "epoch": 0.9330774766130967, - "grad_norm": 2.900116122741938, - "learning_rate": 8.703993807404371e-06, - "loss": 0.8748, + "epoch": 2.79923242983929, + "grad_norm": 1.2755552581393579, + "learning_rate": 1.0514374584240338e-07, + "loss": 0.1623, "step": 1945 }, { - "epoch": 0.9354761333653154, - "grad_norm": 2.8160457003362196, - "learning_rate": 8.694601970976022e-06, - "loss": 0.9839, + "epoch": 2.806428400095946, + "grad_norm": 1.2967219351574655, + "learning_rate": 9.763987968632293e-08, + "loss": 0.1895, "step": 1950 }, { - "epoch": 0.9378747901175342, - "grad_norm": 2.9528327363198343, - "learning_rate": 8.685181333540596e-06, - "loss": 0.9682, + "epoch": 2.8136243703526027, + "grad_norm": 1.5216756650995487, + "learning_rate": 9.04104798118257e-08, + "loss": 0.18, "step": 1955 }, { - "epoch": 0.9402734468697529, - "grad_norm": 2.416154745371056, - "learning_rate": 8.675731968536004e-06, - "loss": 0.8996, + "epoch": 2.820820340609259, + "grad_norm": 1.2463724400999108, + "learning_rate": 8.345605468723427e-08, + "loss": 0.1855, "step": 1960 }, { - "epoch": 0.9426721036219717, - "grad_norm": 2.6496183375696902, - "learning_rate": 8.666253949624097e-06, - "loss": 0.8659, + "epoch": 2.828016310865915, + "grad_norm": 1.3612982361619894, + "learning_rate": 7.677709344095883e-08, + "loss": 0.1971, "step": 1965 }, { - "epoch": 0.9450707603741905, - "grad_norm": 2.5880937245783033, - "learning_rate": 8.656747350690102e-06, - "loss": 0.8052, + "epoch": 2.8352122811225713, + "grad_norm": 1.1485240309542117, + "learning_rate": 7.037406582709815e-08, + "loss": 0.1673, "step": 1970 }, { - "epoch": 0.9474694171264092, - "grad_norm": 3.7751706043240105, - "learning_rate": 8.647212245842035e-06, - "loss": 0.9261, + "epoch": 2.8424082513792275, + "grad_norm": 1.129849173373603, + "learning_rate": 6.424742219239698e-08, + "loss": 0.1688, "step": 1975 }, { - "epoch": 0.9498680738786279, - "grad_norm": 2.886044266318896, - "learning_rate": 8.637648709410125e-06, - "loss": 0.8753, + "epoch": 2.849604221635884, + "grad_norm": 1.271858319859489, + "learning_rate": 5.839759344457462e-08, + "loss": 0.1864, "step": 1980 }, { - "epoch": 0.9522667306308468, - "grad_norm": 2.5010200818615584, - "learning_rate": 8.628056815946243e-06, - "loss": 0.7647, + "epoch": 2.8568001918925403, + "grad_norm": 1.3695271997638596, + "learning_rate": 5.282499102201532e-08, + "loss": 0.182, "step": 1985 }, { - "epoch": 0.9546653873830655, - "grad_norm": 2.4230783414425443, - "learning_rate": 8.618436640223311e-06, - "loss": 0.9169, + "epoch": 2.8639961621491965, + "grad_norm": 1.2817216551660306, + "learning_rate": 4.753000686483189e-08, + "loss": 0.191, "step": 1990 }, { - "epoch": 0.9570640441352842, - "grad_norm": 5.70711797643376, - "learning_rate": 8.608788257234726e-06, - "loss": 0.9718, + "epoch": 2.8711921324058527, + "grad_norm": 1.2963119208915668, + "learning_rate": 4.2513013387298846e-08, + "loss": 0.1877, "step": 1995 }, { - "epoch": 0.959462700887503, - "grad_norm": 3.2409263975929146, - "learning_rate": 8.599111742193772e-06, - "loss": 0.8306, - "step": 2000 - }, - { - "epoch": 0.959462700887503, - "eval_loss": 0.8639265894889832, - "eval_runtime": 738.2244, - "eval_samples_per_second": 10.038, - "eval_steps_per_second": 0.629, + "epoch": 2.878388102662509, + "grad_norm": 1.290101189771667, + "learning_rate": 3.7774363451658744e-08, + "loss": 0.1796, "step": 2000 }, { - "epoch": 0.9618613576397218, - "grad_norm": 2.60287555153669, - "learning_rate": 8.589407170533035e-06, - "loss": 0.9463, + "epoch": 2.8855840729191655, + "grad_norm": 1.4819813983719878, + "learning_rate": 3.331439034330552e-08, + "loss": 0.1763, "step": 2005 }, { - "epoch": 0.9642600143919405, - "grad_norm": 4.0419154556516235, - "learning_rate": 8.579674617903812e-06, - "loss": 0.8895, + "epoch": 2.8927800431758213, + "grad_norm": 1.4019557468116053, + "learning_rate": 2.913340774734152e-08, + "loss": 0.1708, "step": 2010 }, { - "epoch": 0.9666586711441593, - "grad_norm": 3.1939130510098934, - "learning_rate": 8.569914160175527e-06, - "loss": 0.9188, + "epoch": 2.899976013432478, + "grad_norm": 1.2339439577357936, + "learning_rate": 2.5231709726516005e-08, + "loss": 0.1789, "step": 2015 }, { - "epoch": 0.969057327896378, - "grad_norm": 3.1878190598190304, - "learning_rate": 8.560125873435132e-06, - "loss": 0.8027, + "epoch": 2.907171983689134, + "grad_norm": 1.2294457543053794, + "learning_rate": 2.1609570700543478e-08, + "loss": 0.1575, "step": 2020 }, { - "epoch": 0.9714559846485968, - "grad_norm": 2.1973702930768937, - "learning_rate": 8.550309833986525e-06, - "loss": 0.7583, + "epoch": 2.9143679539457903, + "grad_norm": 1.6910934029285385, + "learning_rate": 1.826724542680047e-08, + "loss": 0.1853, "step": 2025 }, { - "epoch": 0.9738546414008156, - "grad_norm": 3.416693480749312, - "learning_rate": 8.540466118349938e-06, - "loss": 0.873, + "epoch": 2.9215639242024465, + "grad_norm": 1.5951062337979125, + "learning_rate": 1.5204968982410527e-08, + "loss": 0.1994, "step": 2030 }, { - "epoch": 0.9762532981530343, - "grad_norm": 4.297709957092606, - "learning_rate": 8.530594803261362e-06, - "loss": 0.8679, + "epoch": 2.9287598944591027, + "grad_norm": 1.244634963059678, + "learning_rate": 1.2422956747708546e-08, + "loss": 0.1792, "step": 2035 }, { - "epoch": 0.978651954905253, - "grad_norm": 2.7317028436026343, - "learning_rate": 8.520695965671928e-06, - "loss": 0.8326, + "epoch": 2.9359558647157593, + "grad_norm": 1.3731755256138354, + "learning_rate": 9.92140439109157e-09, + "loss": 0.1855, "step": 2040 }, { - "epoch": 0.9810506116574718, - "grad_norm": 3.819976358159541, - "learning_rate": 8.510769682747324e-06, - "loss": 0.992, + "epoch": 2.9431518349724155, + "grad_norm": 1.487158498377204, + "learning_rate": 7.700487855260007e-09, + "loss": 0.1713, "step": 2045 }, { - "epoch": 0.9834492684096906, - "grad_norm": 2.4139195174663883, - "learning_rate": 8.50081603186718e-06, - "loss": 0.7586, + "epoch": 2.9503478052290717, + "grad_norm": 1.195702473189198, + "learning_rate": 5.760363344839536e-09, + "loss": 0.1756, "step": 2050 }, { - "epoch": 0.9858479251619093, - "grad_norm": 2.438887753565038, - "learning_rate": 8.490835090624476e-06, - "loss": 0.9304, + "epoch": 2.957543775485728, + "grad_norm": 1.2219249635030713, + "learning_rate": 4.101167315396559e-09, + "loss": 0.1705, "step": 2055 }, { - "epoch": 0.9882465819141281, - "grad_norm": 3.320035848122713, - "learning_rate": 8.480826936824933e-06, - "loss": 0.8927, + "epoch": 2.964739745742384, + "grad_norm": 1.1937579061988086, + "learning_rate": 2.7230164638401e-09, + "loss": 0.1669, "step": 2060 }, { - "epoch": 0.9906452386663468, - "grad_norm": 2.0982737537627165, - "learning_rate": 8.470791648486396e-06, - "loss": 0.9131, + "epoch": 2.9719357159990407, + "grad_norm": 1.0911166566357549, + "learning_rate": 1.626007720214595e-09, + "loss": 0.1556, "step": 2065 }, { - "epoch": 0.9930438954185656, - "grad_norm": 4.233832052942202, - "learning_rate": 8.460729303838246e-06, - "loss": 0.8421, + "epoch": 2.979131686255697, + "grad_norm": 1.155309788494306, + "learning_rate": 8.102182408822322e-10, + "loss": 0.1572, "step": 2070 }, { - "epoch": 0.9954425521707844, - "grad_norm": 3.0600316774502523, - "learning_rate": 8.450639981320777e-06, - "loss": 0.8101, + "epoch": 2.986327656512353, + "grad_norm": 1.1809384871247852, + "learning_rate": 2.7570540309618253e-10, + "loss": 0.1651, "step": 2075 }, { - "epoch": 0.9978412089230031, - "grad_norm": 3.229891184239876, - "learning_rate": 8.440523759584583e-06, - "loss": 0.8834, + "epoch": 2.9935236267690093, + "grad_norm": 2.41813981269854, + "learning_rate": 2.2506800965604867e-11, + "loss": 0.1452, "step": 2080 }, { - "epoch": 1.0002398656752218, - "grad_norm": 4.476281436495972, - "learning_rate": 8.43038071748995e-06, - "loss": 0.9229, - "step": 2085 - }, - { - "epoch": 1.0026385224274406, - "grad_norm": 2.5593143307820743, - "learning_rate": 8.420210934106245e-06, - "loss": 0.4714, - "step": 2090 - }, - { - "epoch": 1.0050371791796593, - "grad_norm": 2.213171577306985, - "learning_rate": 8.41001448871129e-06, - "loss": 0.5274, - "step": 2095 - }, - { - "epoch": 1.0074358359318782, - "grad_norm": 2.0985588840636282, - "learning_rate": 8.399791460790752e-06, - "loss": 0.4803, - "step": 2100 - }, - { - "epoch": 1.009834492684097, - "grad_norm": 2.4553936796867637, - "learning_rate": 8.389541930037516e-06, - "loss": 0.4868, - "step": 2105 - }, - { - "epoch": 1.0122331494363157, - "grad_norm": 2.228861585976993, - "learning_rate": 8.379265976351074e-06, - "loss": 0.5068, - "step": 2110 - }, - { - "epoch": 1.0146318061885344, - "grad_norm": 2.890711742500512, - "learning_rate": 8.36896367983689e-06, - "loss": 0.4482, - "step": 2115 - }, - { - "epoch": 1.0170304629407532, - "grad_norm": 2.227985338155966, - "learning_rate": 8.358635120805783e-06, - "loss": 0.4309, - "step": 2120 - }, - { - "epoch": 1.019429119692972, - "grad_norm": 1.7753949748634992, - "learning_rate": 8.348280379773303e-06, - "loss": 0.3702, - "step": 2125 - }, - { - "epoch": 1.0218277764451906, - "grad_norm": 2.39040145648445, - "learning_rate": 8.337899537459098e-06, - "loss": 0.4513, - "step": 2130 - }, - { - "epoch": 1.0242264331974094, - "grad_norm": 2.1792944421413925, - "learning_rate": 8.327492674786286e-06, - "loss": 0.4522, - "step": 2135 - }, - { - "epoch": 1.0266250899496283, - "grad_norm": 2.704876180547788, - "learning_rate": 8.317059872880823e-06, - "loss": 0.4446, - "step": 2140 - }, - { - "epoch": 1.029023746701847, - "grad_norm": 2.5695079674783337, - "learning_rate": 8.30660121307088e-06, - "loss": 0.4808, - "step": 2145 - }, - { - "epoch": 1.0314224034540658, - "grad_norm": 2.4711435136084465, - "learning_rate": 8.296116776886192e-06, - "loss": 0.3853, - "step": 2150 - }, - { - "epoch": 1.0338210602062845, - "grad_norm": 2.9044848081202126, - "learning_rate": 8.285606646057435e-06, - "loss": 0.443, - "step": 2155 - }, - { - "epoch": 1.0362197169585032, - "grad_norm": 2.1130488032920303, - "learning_rate": 8.275070902515593e-06, - "loss": 0.4353, - "step": 2160 - }, - { - "epoch": 1.038618373710722, - "grad_norm": 1.6913616556429052, - "learning_rate": 8.264509628391299e-06, - "loss": 0.4457, - "step": 2165 - }, - { - "epoch": 1.0410170304629407, - "grad_norm": 2.6512115817567614, - "learning_rate": 8.25392290601422e-06, - "loss": 0.5117, - "step": 2170 - }, - { - "epoch": 1.0434156872151594, - "grad_norm": 3.956549519902379, - "learning_rate": 8.243310817912395e-06, - "loss": 0.4705, - "step": 2175 - }, - { - "epoch": 1.0458143439673782, - "grad_norm": 2.7210983741481565, - "learning_rate": 8.232673446811602e-06, - "loss": 0.5065, - "step": 2180 - }, - { - "epoch": 1.048213000719597, - "grad_norm": 2.5059382822841187, - "learning_rate": 8.222010875634713e-06, - "loss": 0.5251, - "step": 2185 - }, - { - "epoch": 1.0506116574718158, - "grad_norm": 2.262650183186798, - "learning_rate": 8.211323187501046e-06, - "loss": 0.4201, - "step": 2190 - }, - { - "epoch": 1.0530103142240346, - "grad_norm": 2.1701091490205453, - "learning_rate": 8.200610465725709e-06, - "loss": 0.432, - "step": 2195 - }, - { - "epoch": 1.0554089709762533, - "grad_norm": 3.203732105955941, - "learning_rate": 8.189872793818966e-06, - "loss": 0.4875, - "step": 2200 - }, - { - "epoch": 1.057807627728472, - "grad_norm": 2.129191102623214, - "learning_rate": 8.179110255485576e-06, - "loss": 0.4272, - "step": 2205 - }, - { - "epoch": 1.0602062844806908, - "grad_norm": 2.0600885302520506, - "learning_rate": 8.16832293462414e-06, - "loss": 0.4211, - "step": 2210 - }, - { - "epoch": 1.0626049412329095, - "grad_norm": 1.7498086907025407, - "learning_rate": 8.157510915326453e-06, - "loss": 0.4825, - "step": 2215 - }, - { - "epoch": 1.0650035979851282, - "grad_norm": 2.7615254217770895, - "learning_rate": 8.14667428187684e-06, - "loss": 0.4528, - "step": 2220 - }, - { - "epoch": 1.0674022547373472, - "grad_norm": 2.6704428087666607, - "learning_rate": 8.135813118751508e-06, - "loss": 0.502, - "step": 2225 - }, - { - "epoch": 1.069800911489566, - "grad_norm": 2.5880979319847977, - "learning_rate": 8.124927510617886e-06, - "loss": 0.4126, - "step": 2230 - }, - { - "epoch": 1.0721995682417846, - "grad_norm": 2.62434654551889, - "learning_rate": 8.114017542333955e-06, - "loss": 0.485, - "step": 2235 - }, - { - "epoch": 1.0745982249940034, - "grad_norm": 1.777812099558463, - "learning_rate": 8.103083298947594e-06, - "loss": 0.4729, - "step": 2240 - }, - { - "epoch": 1.076996881746222, - "grad_norm": 2.09259412996849, - "learning_rate": 8.092124865695928e-06, - "loss": 0.4283, - "step": 2245 - }, - { - "epoch": 1.0793955384984408, - "grad_norm": 2.38765858433518, - "learning_rate": 8.081142328004638e-06, - "loss": 0.523, - "step": 2250 - }, - { - "epoch": 1.0817941952506596, - "grad_norm": 3.237139015501116, - "learning_rate": 8.070135771487317e-06, - "loss": 0.519, - "step": 2255 - }, - { - "epoch": 1.0841928520028783, - "grad_norm": 2.4130326528367334, - "learning_rate": 8.059105281944799e-06, - "loss": 0.5109, - "step": 2260 - }, - { - "epoch": 1.0865915087550972, - "grad_norm": 2.4038606293430758, - "learning_rate": 8.048050945364477e-06, - "loss": 0.3813, - "step": 2265 - }, - { - "epoch": 1.088990165507316, - "grad_norm": 2.6627926967455187, - "learning_rate": 8.036972847919645e-06, - "loss": 0.4671, - "step": 2270 - }, - { - "epoch": 1.0913888222595347, - "grad_norm": 2.6725216982707547, - "learning_rate": 8.025871075968828e-06, - "loss": 0.4293, - "step": 2275 - }, - { - "epoch": 1.0937874790117534, - "grad_norm": 1.8642256511396682, - "learning_rate": 8.014745716055095e-06, - "loss": 0.3925, - "step": 2280 - }, - { - "epoch": 1.0961861357639722, - "grad_norm": 2.6136199232940176, - "learning_rate": 8.003596854905405e-06, - "loss": 0.4031, - "step": 2285 - }, - { - "epoch": 1.098584792516191, - "grad_norm": 1.9487935903728233, - "learning_rate": 7.99242457942991e-06, - "loss": 0.5139, - "step": 2290 - }, - { - "epoch": 1.1009834492684096, - "grad_norm": 2.259840081609626, - "learning_rate": 7.981228976721287e-06, - "loss": 0.4398, - "step": 2295 - }, - { - "epoch": 1.1033821060206284, - "grad_norm": 3.5633206330617555, - "learning_rate": 7.970010134054064e-06, - "loss": 0.4823, - "step": 2300 - }, - { - "epoch": 1.1057807627728473, - "grad_norm": 2.6015299960981144, - "learning_rate": 7.95876813888393e-06, - "loss": 0.4304, - "step": 2305 - }, - { - "epoch": 1.108179419525066, - "grad_norm": 2.7206316510278694, - "learning_rate": 7.947503078847061e-06, - "loss": 0.4586, - "step": 2310 - }, - { - "epoch": 1.1105780762772848, - "grad_norm": 2.794098011567633, - "learning_rate": 7.93621504175943e-06, - "loss": 0.471, - "step": 2315 - }, - { - "epoch": 1.1129767330295035, - "grad_norm": 2.7240226313790226, - "learning_rate": 7.92490411561613e-06, - "loss": 0.497, - "step": 2320 - }, - { - "epoch": 1.1153753897817222, - "grad_norm": 3.5098947402019443, - "learning_rate": 7.91357038859068e-06, - "loss": 0.4853, - "step": 2325 - }, - { - "epoch": 1.117774046533941, - "grad_norm": 2.0290357336252582, - "learning_rate": 7.902213949034345e-06, - "loss": 0.421, - "step": 2330 - }, - { - "epoch": 1.1201727032861597, - "grad_norm": 3.0022701990136302, - "learning_rate": 7.89083488547544e-06, - "loss": 0.4909, - "step": 2335 - }, - { - "epoch": 1.1225713600383784, - "grad_norm": 2.065209472044516, - "learning_rate": 7.87943328661865e-06, - "loss": 0.4806, - "step": 2340 - }, - { - "epoch": 1.1249700167905972, - "grad_norm": 2.8034071820688142, - "learning_rate": 7.868009241344323e-06, - "loss": 0.4474, - "step": 2345 - }, - { - "epoch": 1.1273686735428161, - "grad_norm": 2.580536233115346, - "learning_rate": 7.856562838707799e-06, - "loss": 0.4731, - "step": 2350 - }, - { - "epoch": 1.1297673302950348, - "grad_norm": 2.0441292412793426, - "learning_rate": 7.845094167938689e-06, - "loss": 0.4212, - "step": 2355 - }, - { - "epoch": 1.1321659870472536, - "grad_norm": 1.9698046885266003, - "learning_rate": 7.833603318440207e-06, - "loss": 0.4269, - "step": 2360 - }, - { - "epoch": 1.1345646437994723, - "grad_norm": 2.4020632540305336, - "learning_rate": 7.822090379788449e-06, - "loss": 0.4117, - "step": 2365 - }, - { - "epoch": 1.136963300551691, - "grad_norm": 2.3063432967429662, - "learning_rate": 7.810555441731715e-06, - "loss": 0.4061, - "step": 2370 - }, - { - "epoch": 1.1393619573039098, - "grad_norm": 2.6738090741389997, - "learning_rate": 7.79899859418979e-06, - "loss": 0.4746, - "step": 2375 - }, - { - "epoch": 1.1417606140561285, - "grad_norm": 2.22066686785184, - "learning_rate": 7.787419927253259e-06, - "loss": 0.4648, - "step": 2380 - }, - { - "epoch": 1.1441592708083472, - "grad_norm": 2.1609157489317, - "learning_rate": 7.775819531182797e-06, - "loss": 0.4836, - "step": 2385 - }, - { - "epoch": 1.1465579275605662, - "grad_norm": 2.825274671556735, - "learning_rate": 7.764197496408468e-06, - "loss": 0.3937, - "step": 2390 - }, - { - "epoch": 1.148956584312785, - "grad_norm": 1.8923634073444822, - "learning_rate": 7.752553913529019e-06, - "loss": 0.4561, - "step": 2395 - }, - { - "epoch": 1.1513552410650036, - "grad_norm": 2.4561463478420964, - "learning_rate": 7.74088887331117e-06, - "loss": 0.3996, - "step": 2400 - }, - { - "epoch": 1.1537538978172224, - "grad_norm": 2.3237076372641643, - "learning_rate": 7.729202466688914e-06, - "loss": 0.5374, - "step": 2405 - }, - { - "epoch": 1.156152554569441, - "grad_norm": 1.7336149449293585, - "learning_rate": 7.717494784762804e-06, - "loss": 0.4204, - "step": 2410 - }, - { - "epoch": 1.1585512113216598, - "grad_norm": 2.6072676927620244, - "learning_rate": 7.705765918799244e-06, - "loss": 0.3674, - "step": 2415 - }, - { - "epoch": 1.1609498680738786, - "grad_norm": 2.031726825542831, - "learning_rate": 7.69401596022977e-06, - "loss": 0.4854, - "step": 2420 - }, - { - "epoch": 1.1633485248260973, - "grad_norm": 3.044907096929622, - "learning_rate": 7.682245000650354e-06, - "loss": 0.4676, - "step": 2425 - }, - { - "epoch": 1.165747181578316, - "grad_norm": 1.7787272779987482, - "learning_rate": 7.670453131820672e-06, - "loss": 0.4123, - "step": 2430 - }, - { - "epoch": 1.168145838330535, - "grad_norm": 2.1958069851919646, - "learning_rate": 7.6586404456634e-06, - "loss": 0.4779, - "step": 2435 - }, - { - "epoch": 1.1705444950827537, - "grad_norm": 2.2233956624705424, - "learning_rate": 7.646807034263493e-06, - "loss": 0.4659, - "step": 2440 - }, - { - "epoch": 1.1729431518349724, - "grad_norm": 1.8414644463809167, - "learning_rate": 7.63495298986747e-06, - "loss": 0.4409, - "step": 2445 - }, - { - "epoch": 1.1753418085871912, - "grad_norm": 3.3591717272248878, - "learning_rate": 7.623078404882689e-06, - "loss": 0.4125, - "step": 2450 - }, - { - "epoch": 1.17774046533941, - "grad_norm": 2.2891639443640366, - "learning_rate": 7.611183371876636e-06, - "loss": 0.4907, - "step": 2455 - }, - { - "epoch": 1.1801391220916286, - "grad_norm": 2.099380667162895, - "learning_rate": 7.599267983576193e-06, - "loss": 0.4529, - "step": 2460 - }, - { - "epoch": 1.1825377788438474, - "grad_norm": 2.542301285127525, - "learning_rate": 7.587332332866923e-06, - "loss": 0.5088, - "step": 2465 - }, - { - "epoch": 1.1849364355960663, - "grad_norm": 2.2714860789638567, - "learning_rate": 7.5753765127923475e-06, - "loss": 0.4983, - "step": 2470 - }, - { - "epoch": 1.187335092348285, - "grad_norm": 2.949953798544259, - "learning_rate": 7.563400616553205e-06, - "loss": 0.4625, - "step": 2475 - }, - { - "epoch": 1.1897337491005038, - "grad_norm": 3.3907825639923255, - "learning_rate": 7.551404737506747e-06, - "loss": 0.5057, - "step": 2480 - }, - { - "epoch": 1.1921324058527225, - "grad_norm": 3.379571230136842, - "learning_rate": 7.539388969165996e-06, - "loss": 0.4563, - "step": 2485 - }, - { - "epoch": 1.1945310626049412, - "grad_norm": 2.278909491259441, - "learning_rate": 7.52735340519902e-06, - "loss": 0.4486, - "step": 2490 - }, - { - "epoch": 1.19692971935716, - "grad_norm": 2.009431291635904, - "learning_rate": 7.515298139428203e-06, - "loss": 0.416, - "step": 2495 - }, - { - "epoch": 1.1993283761093787, - "grad_norm": 2.326722204481952, - "learning_rate": 7.503223265829511e-06, - "loss": 0.4517, - "step": 2500 - }, - { - "epoch": 1.2017270328615974, - "grad_norm": 3.1329815616704404, - "learning_rate": 7.491128878531764e-06, - "loss": 0.5385, - "step": 2505 - }, - { - "epoch": 1.2041256896138162, - "grad_norm": 3.013574468212731, - "learning_rate": 7.479015071815899e-06, - "loss": 0.4193, - "step": 2510 - }, - { - "epoch": 1.2065243463660351, - "grad_norm": 2.1700351101261965, - "learning_rate": 7.466881940114232e-06, - "loss": 0.4916, - "step": 2515 - }, - { - "epoch": 1.2089230031182538, - "grad_norm": 2.6970784604552, - "learning_rate": 7.4547295780097336e-06, - "loss": 0.5202, - "step": 2520 - }, - { - "epoch": 1.2113216598704726, - "grad_norm": 1.9662106419481995, - "learning_rate": 7.442558080235276e-06, - "loss": 0.393, - "step": 2525 - }, - { - "epoch": 1.2137203166226913, - "grad_norm": 2.1077726186526786, - "learning_rate": 7.4303675416729075e-06, - "loss": 0.3749, - "step": 2530 - }, - { - "epoch": 1.21611897337491, - "grad_norm": 2.320554500963109, - "learning_rate": 7.418158057353099e-06, - "loss": 0.4419, - "step": 2535 - }, - { - "epoch": 1.2185176301271288, - "grad_norm": 2.895651302536835, - "learning_rate": 7.405929722454026e-06, - "loss": 0.4118, - "step": 2540 - }, - { - "epoch": 1.2209162868793475, - "grad_norm": 1.8616421523355307, - "learning_rate": 7.393682632300798e-06, - "loss": 0.4295, - "step": 2545 - }, - { - "epoch": 1.2233149436315662, - "grad_norm": 3.737431743510741, - "learning_rate": 7.38141688236474e-06, - "loss": 0.441, - "step": 2550 - }, - { - "epoch": 1.2257136003837852, - "grad_norm": 1.6958443120456206, - "learning_rate": 7.36913256826263e-06, - "loss": 0.4359, - "step": 2555 - }, - { - "epoch": 1.228112257136004, - "grad_norm": 2.313565281454391, - "learning_rate": 7.356829785755967e-06, - "loss": 0.4512, - "step": 2560 - }, - { - "epoch": 1.2305109138882226, - "grad_norm": 4.040641972263277, - "learning_rate": 7.344508630750219e-06, - "loss": 0.4191, - "step": 2565 - }, - { - "epoch": 1.2329095706404414, - "grad_norm": 2.4002855017322333, - "learning_rate": 7.3321691992940725e-06, - "loss": 0.5318, - "step": 2570 - }, - { - "epoch": 1.23530822739266, - "grad_norm": 2.30125067592132, - "learning_rate": 7.319811587578689e-06, - "loss": 0.4533, - "step": 2575 - }, - { - "epoch": 1.2377068841448788, - "grad_norm": 2.0772341503543466, - "learning_rate": 7.3074358919369535e-06, - "loss": 0.3884, - "step": 2580 - }, - { - "epoch": 1.2401055408970976, - "grad_norm": 2.0581617857738297, - "learning_rate": 7.295042208842718e-06, - "loss": 0.4596, - "step": 2585 - }, - { - "epoch": 1.2425041976493163, - "grad_norm": 2.044753740126548, - "learning_rate": 7.282630634910062e-06, - "loss": 0.3761, - "step": 2590 - }, - { - "epoch": 1.244902854401535, - "grad_norm": 1.8350309498563098, - "learning_rate": 7.270201266892528e-06, - "loss": 0.5365, - "step": 2595 - }, - { - "epoch": 1.247301511153754, - "grad_norm": 2.861599377026653, - "learning_rate": 7.2577542016823685e-06, - "loss": 0.4203, - "step": 2600 - }, - { - "epoch": 1.2497001679059727, - "grad_norm": 2.265775605763178, - "learning_rate": 7.2452895363097964e-06, - "loss": 0.4422, - "step": 2605 - }, - { - "epoch": 1.2520988246581914, - "grad_norm": 1.8956362962363034, - "learning_rate": 7.232807367942225e-06, - "loss": 0.4711, - "step": 2610 - }, - { - "epoch": 1.2544974814104102, - "grad_norm": 2.3672251257694237, - "learning_rate": 7.220307793883512e-06, - "loss": 0.483, - "step": 2615 - }, - { - "epoch": 1.256896138162629, - "grad_norm": 2.2661312851841737, - "learning_rate": 7.207790911573198e-06, - "loss": 0.4678, - "step": 2620 - }, - { - "epoch": 1.2592947949148476, - "grad_norm": 1.7750731193059077, - "learning_rate": 7.195256818585747e-06, - "loss": 0.4545, - "step": 2625 - }, - { - "epoch": 1.2616934516670664, - "grad_norm": 3.7693900871996577, - "learning_rate": 7.182705612629793e-06, - "loss": 0.4417, - "step": 2630 - }, - { - "epoch": 1.2640921084192853, - "grad_norm": 2.4900331079659117, - "learning_rate": 7.17013739154737e-06, - "loss": 0.4084, - "step": 2635 - }, - { - "epoch": 1.266490765171504, - "grad_norm": 2.7575013032291817, - "learning_rate": 7.157552253313148e-06, - "loss": 0.4247, - "step": 2640 - }, - { - "epoch": 1.2688894219237228, - "grad_norm": 4.7542236375020375, - "learning_rate": 7.144950296033682e-06, - "loss": 0.4535, - "step": 2645 - }, - { - "epoch": 1.2712880786759415, - "grad_norm": 2.270681456582455, - "learning_rate": 7.132331617946634e-06, - "loss": 0.4005, - "step": 2650 - }, - { - "epoch": 1.2736867354281602, - "grad_norm": 2.5862028929190712, - "learning_rate": 7.1196963174200105e-06, - "loss": 0.4171, - "step": 2655 - }, - { - "epoch": 1.276085392180379, - "grad_norm": 2.1393183657416763, - "learning_rate": 7.107044492951398e-06, - "loss": 0.4313, - "step": 2660 - }, - { - "epoch": 1.2784840489325977, - "grad_norm": 1.9123346331650293, - "learning_rate": 7.094376243167197e-06, - "loss": 0.4364, - "step": 2665 - }, - { - "epoch": 1.2808827056848164, - "grad_norm": 2.8631827616417556, - "learning_rate": 7.0816916668218485e-06, - "loss": 0.5317, - "step": 2670 - }, - { - "epoch": 1.2832813624370352, - "grad_norm": 2.5356423623576805, - "learning_rate": 7.068990862797064e-06, - "loss": 0.4507, - "step": 2675 - }, - { - "epoch": 1.285680019189254, - "grad_norm": 2.052684995040051, - "learning_rate": 7.056273930101061e-06, - "loss": 0.4467, - "step": 2680 - }, - { - "epoch": 1.2880786759414729, - "grad_norm": 2.914920449781333, - "learning_rate": 7.043540967867782e-06, - "loss": 0.4526, - "step": 2685 - }, - { - "epoch": 1.2904773326936916, - "grad_norm": 2.2350378228168992, - "learning_rate": 7.03079207535613e-06, - "loss": 0.4565, - "step": 2690 - }, - { - "epoch": 1.2928759894459103, - "grad_norm": 2.9232644902019236, - "learning_rate": 7.01802735194919e-06, - "loss": 0.4327, - "step": 2695 - }, - { - "epoch": 1.295274646198129, - "grad_norm": 2.152229747789746, - "learning_rate": 7.0052468971534574e-06, - "loss": 0.475, - "step": 2700 - }, - { - "epoch": 1.2976733029503478, - "grad_norm": 1.576250476623198, - "learning_rate": 6.992450810598059e-06, - "loss": 0.3645, - "step": 2705 - }, - { - "epoch": 1.3000719597025665, - "grad_norm": 2.47081607319409, - "learning_rate": 6.979639192033977e-06, - "loss": 0.4351, - "step": 2710 - }, - { - "epoch": 1.3024706164547855, - "grad_norm": 2.9694947631286364, - "learning_rate": 6.966812141333273e-06, - "loss": 0.4233, - "step": 2715 - }, - { - "epoch": 1.3048692732070042, - "grad_norm": 2.657085372294027, - "learning_rate": 6.9539697584883116e-06, - "loss": 0.547, - "step": 2720 - }, - { - "epoch": 1.307267929959223, - "grad_norm": 1.9832533039303075, - "learning_rate": 6.941112143610973e-06, - "loss": 0.4364, - "step": 2725 - }, - { - "epoch": 1.3096665867114416, - "grad_norm": 2.523218142090448, - "learning_rate": 6.928239396931882e-06, - "loss": 0.4612, - "step": 2730 - }, - { - "epoch": 1.3120652434636604, - "grad_norm": 2.406726447436041, - "learning_rate": 6.915351618799618e-06, - "loss": 0.4071, - "step": 2735 - }, - { - "epoch": 1.314463900215879, - "grad_norm": 2.926082789168316, - "learning_rate": 6.902448909679942e-06, - "loss": 0.5146, - "step": 2740 - }, - { - "epoch": 1.3168625569680978, - "grad_norm": 1.8386902076079574, - "learning_rate": 6.889531370155004e-06, - "loss": 0.4658, - "step": 2745 - }, - { - "epoch": 1.3192612137203166, - "grad_norm": 2.0797479375790537, - "learning_rate": 6.876599100922566e-06, - "loss": 0.4361, - "step": 2750 - }, - { - "epoch": 1.3216598704725353, - "grad_norm": 3.061445264656408, - "learning_rate": 6.863652202795215e-06, - "loss": 0.4968, - "step": 2755 - }, - { - "epoch": 1.324058527224754, - "grad_norm": 2.687479706556386, - "learning_rate": 6.850690776699574e-06, - "loss": 0.4246, - "step": 2760 - }, - { - "epoch": 1.3264571839769728, - "grad_norm": 1.9168048123882346, - "learning_rate": 6.837714923675516e-06, - "loss": 0.4289, - "step": 2765 - }, - { - "epoch": 1.3288558407291917, - "grad_norm": 2.098881194418661, - "learning_rate": 6.8247247448753866e-06, - "loss": 0.4942, - "step": 2770 - }, - { - "epoch": 1.3312544974814104, - "grad_norm": 2.9646922853585793, - "learning_rate": 6.811720341563197e-06, - "loss": 0.4919, - "step": 2775 - }, - { - "epoch": 1.3336531542336292, - "grad_norm": 2.0101478360020604, - "learning_rate": 6.798701815113851e-06, - "loss": 0.435, - "step": 2780 - }, - { - "epoch": 1.336051810985848, - "grad_norm": 1.992177399616399, - "learning_rate": 6.785669267012346e-06, - "loss": 0.406, - "step": 2785 - }, - { - "epoch": 1.3384504677380666, - "grad_norm": 2.310067473610632, - "learning_rate": 6.7726227988529834e-06, - "loss": 0.427, - "step": 2790 - }, - { - "epoch": 1.3408491244902854, - "grad_norm": 2.1802412610225006, - "learning_rate": 6.759562512338581e-06, - "loss": 0.4949, - "step": 2795 - }, - { - "epoch": 1.3432477812425043, - "grad_norm": 2.1743296833228176, - "learning_rate": 6.74648850927967e-06, - "loss": 0.4758, - "step": 2800 - }, - { - "epoch": 1.345646437994723, - "grad_norm": 2.354140603930052, - "learning_rate": 6.733400891593717e-06, - "loss": 0.4515, - "step": 2805 - }, - { - "epoch": 1.3480450947469418, - "grad_norm": 2.2693267222576283, - "learning_rate": 6.720299761304312e-06, - "loss": 0.4229, - "step": 2810 - }, - { - "epoch": 1.3504437514991605, - "grad_norm": 2.7850031035864338, - "learning_rate": 6.707185220540385e-06, - "loss": 0.4096, - "step": 2815 - }, - { - "epoch": 1.3528424082513792, - "grad_norm": 2.0590276052665417, - "learning_rate": 6.6940573715354075e-06, - "loss": 0.3731, - "step": 2820 - }, - { - "epoch": 1.355241065003598, - "grad_norm": 2.455922368927128, - "learning_rate": 6.6809163166265945e-06, - "loss": 0.4694, - "step": 2825 - }, - { - "epoch": 1.3576397217558167, - "grad_norm": 3.2507513740552443, - "learning_rate": 6.667762158254104e-06, - "loss": 0.4457, - "step": 2830 - }, - { - "epoch": 1.3600383785080354, - "grad_norm": 2.2692392527198755, - "learning_rate": 6.654594998960242e-06, - "loss": 0.4451, - "step": 2835 - }, - { - "epoch": 1.3624370352602542, - "grad_norm": 2.03006411627659, - "learning_rate": 6.641414941388666e-06, - "loss": 0.4626, - "step": 2840 - }, - { - "epoch": 1.364835692012473, - "grad_norm": 2.3940134301391947, - "learning_rate": 6.628222088283574e-06, - "loss": 0.4573, - "step": 2845 - }, - { - "epoch": 1.3672343487646919, - "grad_norm": 2.966132014556057, - "learning_rate": 6.615016542488917e-06, - "loss": 0.4224, - "step": 2850 - }, - { - "epoch": 1.3696330055169106, - "grad_norm": 1.9966562170831368, - "learning_rate": 6.601798406947587e-06, - "loss": 0.4081, - "step": 2855 - }, - { - "epoch": 1.3720316622691293, - "grad_norm": 2.492563728485512, - "learning_rate": 6.588567784700623e-06, - "loss": 0.4479, - "step": 2860 - }, - { - "epoch": 1.374430319021348, - "grad_norm": 2.3512673785706606, - "learning_rate": 6.575324778886399e-06, - "loss": 0.4658, - "step": 2865 - }, - { - "epoch": 1.3768289757735668, - "grad_norm": 2.7381141478453253, - "learning_rate": 6.562069492739824e-06, - "loss": 0.3877, - "step": 2870 - }, - { - "epoch": 1.3792276325257855, - "grad_norm": 2.4848123819551904, - "learning_rate": 6.548802029591542e-06, - "loss": 0.4748, - "step": 2875 - }, - { - "epoch": 1.3816262892780042, - "grad_norm": 1.925224171020017, - "learning_rate": 6.535522492867118e-06, - "loss": 0.4298, - "step": 2880 - }, - { - "epoch": 1.3840249460302232, - "grad_norm": 2.589600737598477, - "learning_rate": 6.522230986086234e-06, - "loss": 0.3791, - "step": 2885 - }, - { - "epoch": 1.386423602782442, - "grad_norm": 2.276724697441552, - "learning_rate": 6.50892761286189e-06, - "loss": 0.4497, - "step": 2890 - }, - { - "epoch": 1.3888222595346607, - "grad_norm": 2.944103381058651, - "learning_rate": 6.4956124768995855e-06, - "loss": 0.4675, - "step": 2895 - }, - { - "epoch": 1.3912209162868794, - "grad_norm": 2.2595645772519055, - "learning_rate": 6.482285681996516e-06, - "loss": 0.4558, - "step": 2900 - }, - { - "epoch": 1.3936195730390981, - "grad_norm": 2.4610301955189735, - "learning_rate": 6.468947332040764e-06, - "loss": 0.4907, - "step": 2905 - }, - { - "epoch": 1.3960182297913168, - "grad_norm": 2.3168770293484857, - "learning_rate": 6.455597531010489e-06, - "loss": 0.4145, - "step": 2910 - }, - { - "epoch": 1.3984168865435356, - "grad_norm": 2.0741330602408166, - "learning_rate": 6.442236382973115e-06, - "loss": 0.423, - "step": 2915 - }, - { - "epoch": 1.4008155432957543, - "grad_norm": 2.9099644962421296, - "learning_rate": 6.428863992084523e-06, - "loss": 0.4216, - "step": 2920 - }, - { - "epoch": 1.403214200047973, - "grad_norm": 2.8760922784455523, - "learning_rate": 6.415480462588235e-06, - "loss": 0.4346, - "step": 2925 - }, - { - "epoch": 1.4056128568001918, - "grad_norm": 2.546333453485272, - "learning_rate": 6.402085898814605e-06, - "loss": 0.5081, - "step": 2930 - }, - { - "epoch": 1.4080115135524107, - "grad_norm": 2.4728709538253337, - "learning_rate": 6.388680405180003e-06, - "loss": 0.4637, - "step": 2935 - }, - { - "epoch": 1.4104101703046295, - "grad_norm": 1.9286327814952449, - "learning_rate": 6.375264086185999e-06, - "loss": 0.4532, - "step": 2940 - }, - { - "epoch": 1.4128088270568482, - "grad_norm": 2.1675714888403066, - "learning_rate": 6.361837046418558e-06, - "loss": 0.4243, - "step": 2945 - }, - { - "epoch": 1.415207483809067, - "grad_norm": 2.859235664034111, - "learning_rate": 6.348399390547213e-06, - "loss": 0.4288, - "step": 2950 - }, - { - "epoch": 1.4176061405612856, - "grad_norm": 1.9916645003102105, - "learning_rate": 6.334951223324254e-06, - "loss": 0.4518, - "step": 2955 - }, - { - "epoch": 1.4200047973135044, - "grad_norm": 3.334082881089859, - "learning_rate": 6.3214926495839135e-06, - "loss": 0.4372, - "step": 2960 - }, - { - "epoch": 1.4224034540657233, - "grad_norm": 2.051225494757854, - "learning_rate": 6.308023774241547e-06, - "loss": 0.4134, - "step": 2965 - }, - { - "epoch": 1.424802110817942, - "grad_norm": 2.2747949811500585, - "learning_rate": 6.294544702292814e-06, - "loss": 0.4694, - "step": 2970 - }, - { - "epoch": 1.4272007675701608, - "grad_norm": 2.195088989988513, - "learning_rate": 6.281055538812861e-06, - "loss": 0.4546, - "step": 2975 - }, - { - "epoch": 1.4295994243223795, - "grad_norm": 1.827059462779726, - "learning_rate": 6.267556388955508e-06, - "loss": 0.4284, - "step": 2980 - }, - { - "epoch": 1.4319980810745983, - "grad_norm": 1.7451018183946225, - "learning_rate": 6.254047357952413e-06, - "loss": 0.452, - "step": 2985 - }, - { - "epoch": 1.434396737826817, - "grad_norm": 2.4438753461794764, - "learning_rate": 6.24052855111227e-06, - "loss": 0.4416, - "step": 2990 - }, - { - "epoch": 1.4367953945790357, - "grad_norm": 1.927460965161698, - "learning_rate": 6.227000073819973e-06, - "loss": 0.3954, - "step": 2995 - }, - { - "epoch": 1.4391940513312544, - "grad_norm": 2.7660502397338047, - "learning_rate": 6.213462031535811e-06, - "loss": 0.4431, - "step": 3000 - }, - { - "epoch": 1.4391940513312544, - "eval_loss": 0.8874278664588928, - "eval_runtime": 746.2797, - "eval_samples_per_second": 9.929, - "eval_steps_per_second": 0.622, - "step": 3000 - }, - { - "epoch": 1.4415927080834732, - "grad_norm": 2.611336223360776, - "learning_rate": 6.199914529794627e-06, - "loss": 0.4706, - "step": 3005 - }, - { - "epoch": 1.443991364835692, - "grad_norm": 2.5667262714384003, - "learning_rate": 6.186357674205008e-06, - "loss": 0.5219, - "step": 3010 - }, - { - "epoch": 1.4463900215879106, - "grad_norm": 2.548110034722319, - "learning_rate": 6.172791570448458e-06, - "loss": 0.4481, - "step": 3015 - }, - { - "epoch": 1.4487886783401296, - "grad_norm": 2.597919097756615, - "learning_rate": 6.159216324278577e-06, - "loss": 0.4718, - "step": 3020 - }, - { - "epoch": 1.4511873350923483, - "grad_norm": 3.674189698827941, - "learning_rate": 6.145632041520229e-06, - "loss": 0.4611, - "step": 3025 - }, - { - "epoch": 1.453585991844567, - "grad_norm": 2.67480821934183, - "learning_rate": 6.132038828068725e-06, - "loss": 0.457, - "step": 3030 - }, - { - "epoch": 1.4559846485967858, - "grad_norm": 2.8863659126445764, - "learning_rate": 6.118436789888995e-06, - "loss": 0.4911, - "step": 3035 - }, - { - "epoch": 1.4583833053490045, - "grad_norm": 2.9651286406517574, - "learning_rate": 6.104826033014762e-06, - "loss": 0.4528, - "step": 3040 - }, - { - "epoch": 1.4607819621012232, - "grad_norm": 2.25092477044882, - "learning_rate": 6.091206663547713e-06, - "loss": 0.4568, - "step": 3045 - }, - { - "epoch": 1.4631806188534422, - "grad_norm": 2.5707376320364337, - "learning_rate": 6.077578787656673e-06, - "loss": 0.4713, - "step": 3050 - }, - { - "epoch": 1.465579275605661, - "grad_norm": 2.631364741417103, - "learning_rate": 6.063942511576782e-06, - "loss": 0.481, - "step": 3055 - }, - { - "epoch": 1.4679779323578797, - "grad_norm": 2.800004593675168, - "learning_rate": 6.050297941608661e-06, - "loss": 0.4427, - "step": 3060 - }, - { - "epoch": 1.4703765891100984, - "grad_norm": 2.95675973317759, - "learning_rate": 6.036645184117585e-06, - "loss": 0.4765, - "step": 3065 - }, - { - "epoch": 1.4727752458623171, - "grad_norm": 2.2729302837721557, - "learning_rate": 6.022984345532654e-06, - "loss": 0.4479, - "step": 3070 - }, - { - "epoch": 1.4751739026145358, - "grad_norm": 2.5945168985243305, - "learning_rate": 6.009315532345964e-06, - "loss": 0.4236, - "step": 3075 - }, - { - "epoch": 1.4775725593667546, - "grad_norm": 2.010953790417192, - "learning_rate": 5.995638851111778e-06, - "loss": 0.4607, - "step": 3080 - }, - { - "epoch": 1.4799712161189733, - "grad_norm": 2.4135304972011773, - "learning_rate": 5.981954408445691e-06, - "loss": 0.3831, - "step": 3085 - }, - { - "epoch": 1.482369872871192, - "grad_norm": 3.5469108380440106, - "learning_rate": 5.968262311023807e-06, - "loss": 0.4975, - "step": 3090 - }, - { - "epoch": 1.4847685296234108, - "grad_norm": 2.4819879566093763, - "learning_rate": 5.954562665581893e-06, - "loss": 0.4859, - "step": 3095 - }, - { - "epoch": 1.4871671863756297, - "grad_norm": 3.092020698716526, - "learning_rate": 5.940855578914564e-06, - "loss": 0.4222, - "step": 3100 - }, - { - "epoch": 1.4895658431278485, - "grad_norm": 2.603907807597384, - "learning_rate": 5.927141157874441e-06, - "loss": 0.5217, - "step": 3105 - }, - { - "epoch": 1.4919644998800672, - "grad_norm": 1.9127228659037592, - "learning_rate": 5.913419509371317e-06, - "loss": 0.4107, - "step": 3110 - }, - { - "epoch": 1.494363156632286, - "grad_norm": 1.9952834925260585, - "learning_rate": 5.899690740371327e-06, - "loss": 0.4634, - "step": 3115 - }, - { - "epoch": 1.4967618133845046, - "grad_norm": 2.3004395086577785, - "learning_rate": 5.885954957896115e-06, - "loss": 0.4243, - "step": 3120 - }, - { - "epoch": 1.4991604701367234, - "grad_norm": 2.712237010357561, - "learning_rate": 5.872212269021996e-06, - "loss": 0.4012, - "step": 3125 - }, - { - "epoch": 1.5015591268889423, - "grad_norm": 1.9707705697898028, - "learning_rate": 5.858462780879127e-06, - "loss": 0.3963, - "step": 3130 - }, - { - "epoch": 1.503957783641161, - "grad_norm": 2.1775128153767516, - "learning_rate": 5.844706600650662e-06, - "loss": 0.4788, - "step": 3135 - }, - { - "epoch": 1.5063564403933798, - "grad_norm": 2.2650398526849296, - "learning_rate": 5.83094383557193e-06, - "loss": 0.442, - "step": 3140 - }, - { - "epoch": 1.5087550971455985, - "grad_norm": 2.3021274108333176, - "learning_rate": 5.817174592929585e-06, - "loss": 0.4619, - "step": 3145 - }, - { - "epoch": 1.5111537538978173, - "grad_norm": 3.0408864818621266, - "learning_rate": 5.803398980060782e-06, - "loss": 0.4848, - "step": 3150 - }, - { - "epoch": 1.513552410650036, - "grad_norm": 2.662432672052024, - "learning_rate": 5.789617104352327e-06, - "loss": 0.4537, - "step": 3155 - }, - { - "epoch": 1.5159510674022547, - "grad_norm": 2.1199816412732826, - "learning_rate": 5.775829073239859e-06, - "loss": 0.4684, - "step": 3160 - }, - { - "epoch": 1.5183497241544734, - "grad_norm": 1.9990361255845992, - "learning_rate": 5.762034994206992e-06, - "loss": 0.4333, - "step": 3165 - }, - { - "epoch": 1.5207483809066922, - "grad_norm": 3.1609456768731614, - "learning_rate": 5.748234974784487e-06, - "loss": 0.4952, - "step": 3170 - }, - { - "epoch": 1.523147037658911, - "grad_norm": 2.4476985735005927, - "learning_rate": 5.734429122549415e-06, - "loss": 0.4948, - "step": 3175 - }, - { - "epoch": 1.5255456944111296, - "grad_norm": 2.148237529847839, - "learning_rate": 5.720617545124319e-06, - "loss": 0.451, - "step": 3180 - }, - { - "epoch": 1.5279443511633484, - "grad_norm": 2.3650819595385397, - "learning_rate": 5.706800350176366e-06, - "loss": 0.5046, - "step": 3185 - }, - { - "epoch": 1.5303430079155673, - "grad_norm": 2.4798863005701026, - "learning_rate": 5.692977645416519e-06, - "loss": 0.4512, - "step": 3190 - }, - { - "epoch": 1.532741664667786, - "grad_norm": 2.5313251041761466, - "learning_rate": 5.679149538598689e-06, - "loss": 0.4968, - "step": 3195 - }, - { - "epoch": 1.5351403214200048, - "grad_norm": 5.398569331104567, - "learning_rate": 5.6653161375189e-06, - "loss": 0.5006, - "step": 3200 - }, - { - "epoch": 1.5375389781722235, - "grad_norm": 2.7840966965275005, - "learning_rate": 5.651477550014447e-06, - "loss": 0.4485, - "step": 3205 - }, - { - "epoch": 1.5399376349244425, - "grad_norm": 3.262453782851082, - "learning_rate": 5.637633883963057e-06, - "loss": 0.3752, - "step": 3210 - }, - { - "epoch": 1.5423362916766612, - "grad_norm": 2.237442214349481, - "learning_rate": 5.623785247282044e-06, - "loss": 0.4064, - "step": 3215 - }, - { - "epoch": 1.54473494842888, - "grad_norm": 2.9173406947413616, - "learning_rate": 5.609931747927469e-06, - "loss": 0.4112, - "step": 3220 - }, - { - "epoch": 1.5471336051810987, - "grad_norm": 3.0776007451762024, - "learning_rate": 5.5960734938933055e-06, - "loss": 0.5295, - "step": 3225 - }, - { - "epoch": 1.5495322619333174, - "grad_norm": 2.4545633340960666, - "learning_rate": 5.582210593210584e-06, - "loss": 0.3961, - "step": 3230 - }, - { - "epoch": 1.5519309186855361, - "grad_norm": 2.009510951840261, - "learning_rate": 5.568343153946563e-06, - "loss": 0.4884, - "step": 3235 - }, - { - "epoch": 1.5543295754377549, - "grad_norm": 2.2007510609373195, - "learning_rate": 5.5544712842038795e-06, - "loss": 0.484, - "step": 3240 - }, - { - "epoch": 1.5567282321899736, - "grad_norm": 2.8520591253973073, - "learning_rate": 5.540595092119709e-06, - "loss": 0.4786, - "step": 3245 - }, - { - "epoch": 1.5591268889421923, - "grad_norm": 1.604583271616001, - "learning_rate": 5.526714685864921e-06, - "loss": 0.3909, - "step": 3250 - }, - { - "epoch": 1.561525545694411, - "grad_norm": 2.6979214589842333, - "learning_rate": 5.5128301736432335e-06, - "loss": 0.4753, - "step": 3255 - }, - { - "epoch": 1.5639242024466298, - "grad_norm": 2.1624025899920074, - "learning_rate": 5.498941663690376e-06, - "loss": 0.3832, - "step": 3260 - }, - { - "epoch": 1.5663228591988485, - "grad_norm": 2.2374635152759357, - "learning_rate": 5.485049264273241e-06, - "loss": 0.4296, - "step": 3265 - }, - { - "epoch": 1.5687215159510672, - "grad_norm": 1.973150872622609, - "learning_rate": 5.471153083689041e-06, - "loss": 0.3775, - "step": 3270 - }, - { - "epoch": 1.5711201727032862, - "grad_norm": 2.0848939395577633, - "learning_rate": 5.457253230264463e-06, - "loss": 0.4023, - "step": 3275 - }, - { - "epoch": 1.573518829455505, - "grad_norm": 2.12845976407844, - "learning_rate": 5.443349812354828e-06, - "loss": 0.426, - "step": 3280 - }, - { - "epoch": 1.5759174862077237, - "grad_norm": 2.2433674944507462, - "learning_rate": 5.429442938343241e-06, - "loss": 0.4825, - "step": 3285 - }, - { - "epoch": 1.5783161429599424, - "grad_norm": 1.8440111398002124, - "learning_rate": 5.41553271663975e-06, - "loss": 0.3807, - "step": 3290 - }, - { - "epoch": 1.5807147997121613, - "grad_norm": 2.8628882707095014, - "learning_rate": 5.401619255680501e-06, - "loss": 0.5037, - "step": 3295 - }, - { - "epoch": 1.58311345646438, - "grad_norm": 2.3477798507731045, - "learning_rate": 5.38770266392689e-06, - "loss": 0.431, - "step": 3300 - }, - { - "epoch": 1.5855121132165988, - "grad_norm": 1.8413235582299343, - "learning_rate": 5.373783049864714e-06, - "loss": 0.4954, - "step": 3305 - }, - { - "epoch": 1.5879107699688175, - "grad_norm": 2.608126509725837, - "learning_rate": 5.3598605220033406e-06, - "loss": 0.5038, - "step": 3310 - }, - { - "epoch": 1.5903094267210363, - "grad_norm": 2.14593361907128, - "learning_rate": 5.345935188874845e-06, - "loss": 0.4741, - "step": 3315 - }, - { - "epoch": 1.592708083473255, - "grad_norm": 2.6895229502835774, - "learning_rate": 5.3320071590331725e-06, - "loss": 0.4673, - "step": 3320 - }, - { - "epoch": 1.5951067402254737, - "grad_norm": 2.3486279127223058, - "learning_rate": 5.318076541053287e-06, - "loss": 0.4204, - "step": 3325 - }, - { - "epoch": 1.5975053969776924, - "grad_norm": 2.3779256132380753, - "learning_rate": 5.304143443530333e-06, - "loss": 0.479, - "step": 3330 - }, - { - "epoch": 1.5999040537299112, - "grad_norm": 2.382506024792259, - "learning_rate": 5.290207975078783e-06, - "loss": 0.4944, - "step": 3335 - }, - { - "epoch": 1.60230271048213, - "grad_norm": 2.324583019078476, - "learning_rate": 5.276270244331589e-06, - "loss": 0.4517, - "step": 3340 - }, - { - "epoch": 1.6047013672343486, - "grad_norm": 1.907971038729415, - "learning_rate": 5.2623303599393385e-06, - "loss": 0.3964, - "step": 3345 - }, - { - "epoch": 1.6071000239865674, - "grad_norm": 2.793523495104258, - "learning_rate": 5.248388430569415e-06, - "loss": 0.4268, - "step": 3350 - }, - { - "epoch": 1.6094986807387863, - "grad_norm": 2.0671079091787576, - "learning_rate": 5.234444564905132e-06, - "loss": 0.4985, - "step": 3355 - }, - { - "epoch": 1.611897337491005, - "grad_norm": 2.9592798078808102, - "learning_rate": 5.220498871644908e-06, - "loss": 0.4388, - "step": 3360 - }, - { - "epoch": 1.6142959942432238, - "grad_norm": 2.1426600987595656, - "learning_rate": 5.206551459501401e-06, - "loss": 0.3915, - "step": 3365 - }, - { - "epoch": 1.6166946509954425, - "grad_norm": 2.7506183229782395, - "learning_rate": 5.192602437200673e-06, - "loss": 0.4848, - "step": 3370 - }, - { - "epoch": 1.6190933077476615, - "grad_norm": 2.8796183674653615, - "learning_rate": 5.178651913481334e-06, - "loss": 0.4312, - "step": 3375 - }, - { - "epoch": 1.6214919644998802, - "grad_norm": 2.4712356902239265, - "learning_rate": 5.164699997093699e-06, - "loss": 0.4644, - "step": 3380 - }, - { - "epoch": 1.623890621252099, - "grad_norm": 2.4895946890030802, - "learning_rate": 5.150746796798946e-06, - "loss": 0.4597, - "step": 3385 - }, - { - "epoch": 1.6262892780043177, - "grad_norm": 2.7280161391548825, - "learning_rate": 5.136792421368251e-06, - "loss": 0.4394, - "step": 3390 - }, - { - "epoch": 1.6286879347565364, - "grad_norm": 2.0191439616934725, - "learning_rate": 5.122836979581958e-06, - "loss": 0.4216, - "step": 3395 - }, - { - "epoch": 1.6310865915087551, - "grad_norm": 2.710615920560287, - "learning_rate": 5.1088805802287215e-06, - "loss": 0.5304, - "step": 3400 - }, - { - "epoch": 1.6334852482609739, - "grad_norm": 2.0928161833734, - "learning_rate": 5.09492333210466e-06, - "loss": 0.4244, - "step": 3405 - }, - { - "epoch": 1.6358839050131926, - "grad_norm": 1.8191682812562566, - "learning_rate": 5.080965344012509e-06, - "loss": 0.426, - "step": 3410 - }, - { - "epoch": 1.6382825617654113, - "grad_norm": 1.915320356764598, - "learning_rate": 5.067006724760772e-06, - "loss": 0.3817, - "step": 3415 - }, - { - "epoch": 1.64068121851763, - "grad_norm": 3.0320685782151062, - "learning_rate": 5.053047583162877e-06, - "loss": 0.5218, - "step": 3420 - }, - { - "epoch": 1.6430798752698488, - "grad_norm": 2.140553134411092, - "learning_rate": 5.039088028036317e-06, - "loss": 0.4807, - "step": 3425 - }, - { - "epoch": 1.6454785320220675, - "grad_norm": 2.410914424743909, - "learning_rate": 5.0251281682018125e-06, - "loss": 0.4184, - "step": 3430 - }, - { - "epoch": 1.6478771887742862, - "grad_norm": 2.6739455394590745, - "learning_rate": 5.011168112482459e-06, - "loss": 0.4477, - "step": 3435 - }, - { - "epoch": 1.6502758455265052, - "grad_norm": 3.163665779437731, - "learning_rate": 4.997207969702879e-06, - "loss": 0.4681, - "step": 3440 - }, - { - "epoch": 1.652674502278724, - "grad_norm": 2.553283311318629, - "learning_rate": 4.98324784868837e-06, - "loss": 0.4042, - "step": 3445 - }, - { - "epoch": 1.6550731590309427, - "grad_norm": 1.677275594926634, - "learning_rate": 4.969287858264068e-06, - "loss": 0.4427, - "step": 3450 - }, - { - "epoch": 1.6574718157831614, - "grad_norm": 3.518235414569066, - "learning_rate": 4.955328107254084e-06, - "loss": 0.4576, - "step": 3455 - }, - { - "epoch": 1.6598704725353803, - "grad_norm": 2.560656886321086, - "learning_rate": 4.941368704480666e-06, - "loss": 0.4832, - "step": 3460 - }, - { - "epoch": 1.662269129287599, - "grad_norm": 1.8114516258560824, - "learning_rate": 4.927409758763343e-06, - "loss": 0.419, - "step": 3465 - }, - { - "epoch": 1.6646677860398178, - "grad_norm": 2.1836726473973416, - "learning_rate": 4.913451378918087e-06, - "loss": 0.5074, - "step": 3470 - }, - { - "epoch": 1.6670664427920365, - "grad_norm": 2.0729685092094905, - "learning_rate": 4.899493673756456e-06, - "loss": 0.3973, - "step": 3475 - }, - { - "epoch": 1.6694650995442553, - "grad_norm": 2.332454402343443, - "learning_rate": 4.885536752084748e-06, - "loss": 0.437, - "step": 3480 - }, - { - "epoch": 1.671863756296474, - "grad_norm": 1.9422102291303713, - "learning_rate": 4.871580722703153e-06, - "loss": 0.4099, - "step": 3485 - }, - { - "epoch": 1.6742624130486927, - "grad_norm": 2.45511686162037, - "learning_rate": 4.857625694404907e-06, - "loss": 0.4417, - "step": 3490 - }, - { - "epoch": 1.6766610698009115, - "grad_norm": 2.477532923554625, - "learning_rate": 4.843671775975441e-06, - "loss": 0.4183, - "step": 3495 - }, - { - "epoch": 1.6790597265531302, - "grad_norm": 2.0134977559736424, - "learning_rate": 4.8297190761915335e-06, - "loss": 0.4511, - "step": 3500 - }, - { - "epoch": 1.681458383305349, - "grad_norm": 2.7722220766364476, - "learning_rate": 4.815767703820466e-06, - "loss": 0.4241, - "step": 3505 - }, - { - "epoch": 1.6838570400575676, - "grad_norm": 2.4222195684237007, - "learning_rate": 4.80181776761917e-06, - "loss": 0.4166, - "step": 3510 - }, - { - "epoch": 1.6862556968097864, - "grad_norm": 1.7497888557935886, - "learning_rate": 4.787869376333381e-06, - "loss": 0.384, - "step": 3515 - }, - { - "epoch": 1.6886543535620053, - "grad_norm": 2.252998156949262, - "learning_rate": 4.773922638696792e-06, - "loss": 0.4858, - "step": 3520 - }, - { - "epoch": 1.691053010314224, - "grad_norm": 2.156538837955339, - "learning_rate": 4.759977663430204e-06, - "loss": 0.3671, - "step": 3525 - }, - { - "epoch": 1.6934516670664428, - "grad_norm": 2.8590215367905345, - "learning_rate": 4.746034559240682e-06, - "loss": 0.455, - "step": 3530 - }, - { - "epoch": 1.6958503238186615, - "grad_norm": 2.774007584511685, - "learning_rate": 4.732093434820703e-06, - "loss": 0.4291, - "step": 3535 - }, - { - "epoch": 1.6982489805708805, - "grad_norm": 2.634909995207892, - "learning_rate": 4.7181543988473116e-06, - "loss": 0.4459, - "step": 3540 - }, - { - "epoch": 1.7006476373230992, - "grad_norm": 2.133691456687859, - "learning_rate": 4.704217559981273e-06, - "loss": 0.3831, - "step": 3545 - }, - { - "epoch": 1.703046294075318, - "grad_norm": 2.2922171922400163, - "learning_rate": 4.690283026866221e-06, - "loss": 0.4505, - "step": 3550 - }, - { - "epoch": 1.7054449508275367, - "grad_norm": 3.7940433061602357, - "learning_rate": 4.6763509081278215e-06, - "loss": 0.4534, - "step": 3555 - }, - { - "epoch": 1.7078436075797554, - "grad_norm": 2.208035399901243, - "learning_rate": 4.6624213123729146e-06, - "loss": 0.4009, - "step": 3560 - }, - { - "epoch": 1.7102422643319741, - "grad_norm": 2.356109324316714, - "learning_rate": 4.648494348188675e-06, - "loss": 0.5101, - "step": 3565 - }, - { - "epoch": 1.7126409210841929, - "grad_norm": 2.5764709432365316, - "learning_rate": 4.634570124141761e-06, - "loss": 0.4425, - "step": 3570 - }, - { - "epoch": 1.7150395778364116, - "grad_norm": 1.9758783243784745, - "learning_rate": 4.620648748777472e-06, - "loss": 0.445, - "step": 3575 - }, - { - "epoch": 1.7174382345886303, - "grad_norm": 2.2527649874877334, - "learning_rate": 4.606730330618899e-06, - "loss": 0.4632, - "step": 3580 - }, - { - "epoch": 1.719836891340849, - "grad_norm": 2.132159506220732, - "learning_rate": 4.592814978166084e-06, - "loss": 0.4718, - "step": 3585 - }, - { - "epoch": 1.7222355480930678, - "grad_norm": 2.8446174142457, - "learning_rate": 4.578902799895165e-06, - "loss": 0.4792, - "step": 3590 - }, - { - "epoch": 1.7246342048452865, - "grad_norm": 2.230569296862582, - "learning_rate": 4.5649939042575415e-06, - "loss": 0.4598, - "step": 3595 - }, - { - "epoch": 1.7270328615975052, - "grad_norm": 3.444809582847871, - "learning_rate": 4.5510883996790175e-06, - "loss": 0.6123, - "step": 3600 - }, - { - "epoch": 1.7294315183497242, - "grad_norm": 1.9684733506440177, - "learning_rate": 4.53718639455897e-06, - "loss": 0.3768, - "step": 3605 - }, - { - "epoch": 1.731830175101943, - "grad_norm": 2.368594527234669, - "learning_rate": 4.523287997269489e-06, - "loss": 0.4619, - "step": 3610 - }, - { - "epoch": 1.7342288318541617, - "grad_norm": 2.104619044987494, - "learning_rate": 4.509393316154545e-06, - "loss": 0.4856, - "step": 3615 - }, - { - "epoch": 1.7366274886063804, - "grad_norm": 2.824474105567847, - "learning_rate": 4.495502459529135e-06, - "loss": 0.4595, - "step": 3620 - }, - { - "epoch": 1.7390261453585993, - "grad_norm": 2.0922330865937235, - "learning_rate": 4.481615535678444e-06, - "loss": 0.4544, - "step": 3625 - }, - { - "epoch": 1.741424802110818, - "grad_norm": 3.699659997300747, - "learning_rate": 4.467732652857003e-06, - "loss": 0.4202, - "step": 3630 - }, - { - "epoch": 1.7438234588630368, - "grad_norm": 2.9983464058160516, - "learning_rate": 4.453853919287836e-06, - "loss": 0.423, - "step": 3635 - }, - { - "epoch": 1.7462221156152555, - "grad_norm": 2.0846769162351366, - "learning_rate": 4.439979443161624e-06, - "loss": 0.388, - "step": 3640 - }, - { - "epoch": 1.7486207723674743, - "grad_norm": 2.477314432741006, - "learning_rate": 4.42610933263586e-06, - "loss": 0.4506, - "step": 3645 - }, - { - "epoch": 1.751019429119693, - "grad_norm": 2.589674357755263, - "learning_rate": 4.4122436958340035e-06, - "loss": 0.3728, - "step": 3650 - }, - { - "epoch": 1.7534180858719117, - "grad_norm": 2.6366312614017122, - "learning_rate": 4.398382640844644e-06, - "loss": 0.479, - "step": 3655 - }, - { - "epoch": 1.7558167426241305, - "grad_norm": 2.014560929851438, - "learning_rate": 4.3845262757206495e-06, - "loss": 0.4408, - "step": 3660 - }, - { - "epoch": 1.7582153993763492, - "grad_norm": 1.7221380583217487, - "learning_rate": 4.370674708478326e-06, - "loss": 0.4317, - "step": 3665 - }, - { - "epoch": 1.760614056128568, - "grad_norm": 2.1638693438484995, - "learning_rate": 4.3568280470965865e-06, - "loss": 0.3856, - "step": 3670 - }, - { - "epoch": 1.7630127128807866, - "grad_norm": 2.6456939593036184, - "learning_rate": 4.342986399516092e-06, - "loss": 0.4169, - "step": 3675 - }, - { - "epoch": 1.7654113696330054, - "grad_norm": 2.585012432731737, - "learning_rate": 4.329149873638423e-06, - "loss": 0.4074, - "step": 3680 - }, - { - "epoch": 1.767810026385224, - "grad_norm": 2.192226239922893, - "learning_rate": 4.315318577325235e-06, - "loss": 0.4805, - "step": 3685 - }, - { - "epoch": 1.770208683137443, - "grad_norm": 2.3384135122928997, - "learning_rate": 4.3014926183974155e-06, - "loss": 0.4866, - "step": 3690 - }, - { - "epoch": 1.7726073398896618, - "grad_norm": 2.365443373467843, - "learning_rate": 4.287672104634241e-06, - "loss": 0.4481, - "step": 3695 - }, - { - "epoch": 1.7750059966418805, - "grad_norm": 2.542894793017788, - "learning_rate": 4.27385714377255e-06, - "loss": 0.4389, - "step": 3700 - }, - { - "epoch": 1.7774046533940993, - "grad_norm": 2.1437152802123096, - "learning_rate": 4.260047843505883e-06, - "loss": 0.4747, - "step": 3705 - }, - { - "epoch": 1.7798033101463182, - "grad_norm": 1.9219033326157453, - "learning_rate": 4.246244311483661e-06, - "loss": 0.4277, - "step": 3710 - }, - { - "epoch": 1.782201966898537, - "grad_norm": 2.5391812417621518, - "learning_rate": 4.232446655310337e-06, - "loss": 0.4594, - "step": 3715 - }, - { - "epoch": 1.7846006236507557, - "grad_norm": 2.6367801196236345, - "learning_rate": 4.218654982544559e-06, - "loss": 0.451, - "step": 3720 - }, - { - "epoch": 1.7869992804029744, - "grad_norm": 1.998666444994321, - "learning_rate": 4.204869400698331e-06, - "loss": 0.4611, - "step": 3725 - }, - { - "epoch": 1.7893979371551931, - "grad_norm": 2.396249588875812, - "learning_rate": 4.191090017236177e-06, - "loss": 0.4102, - "step": 3730 - }, - { - "epoch": 1.7917965939074119, - "grad_norm": 2.0438518913068906, - "learning_rate": 4.177316939574302e-06, - "loss": 0.3945, - "step": 3735 - }, - { - "epoch": 1.7941952506596306, - "grad_norm": 3.3647778164933895, - "learning_rate": 4.163550275079753e-06, - "loss": 0.4793, - "step": 3740 - }, - { - "epoch": 1.7965939074118493, - "grad_norm": 2.0510954320612833, - "learning_rate": 4.149790131069584e-06, - "loss": 0.4312, - "step": 3745 - }, - { - "epoch": 1.798992564164068, - "grad_norm": 2.1133217662374197, - "learning_rate": 4.136036614810022e-06, - "loss": 0.4127, - "step": 3750 - }, - { - "epoch": 1.8013912209162868, - "grad_norm": 2.110615768818984, - "learning_rate": 4.122289833515621e-06, - "loss": 0.4555, - "step": 3755 - }, - { - "epoch": 1.8037898776685055, - "grad_norm": 2.490483001412966, - "learning_rate": 4.1085498943484444e-06, - "loss": 0.4967, - "step": 3760 - }, - { - "epoch": 1.8061885344207242, - "grad_norm": 3.002844293787043, - "learning_rate": 4.094816904417207e-06, - "loss": 0.4706, - "step": 3765 - }, - { - "epoch": 1.8085871911729432, - "grad_norm": 2.183976582582103, - "learning_rate": 4.0810909707764555e-06, - "loss": 0.4638, - "step": 3770 - }, - { - "epoch": 1.810985847925162, - "grad_norm": 2.8167634413211466, - "learning_rate": 4.067372200425732e-06, - "loss": 0.4355, - "step": 3775 - }, - { - "epoch": 1.8133845046773807, - "grad_norm": 3.1466044507478825, - "learning_rate": 4.053660700308734e-06, - "loss": 0.4076, - "step": 3780 - }, - { - "epoch": 1.8157831614295994, - "grad_norm": 2.1976932894705, - "learning_rate": 4.039956577312486e-06, - "loss": 0.4186, - "step": 3785 - }, - { - "epoch": 1.8181818181818183, - "grad_norm": 2.6344969999958625, - "learning_rate": 4.026259938266508e-06, - "loss": 0.4368, - "step": 3790 - }, - { - "epoch": 1.820580474934037, - "grad_norm": 2.463495815471494, - "learning_rate": 4.012570889941973e-06, - "loss": 0.4712, - "step": 3795 - }, - { - "epoch": 1.8229791316862558, - "grad_norm": 2.3535074134581255, - "learning_rate": 3.998889539050884e-06, - "loss": 0.4478, - "step": 3800 - }, - { - "epoch": 1.8253777884384745, - "grad_norm": 1.8539164713669094, - "learning_rate": 3.98521599224524e-06, - "loss": 0.4817, - "step": 3805 - }, - { - "epoch": 1.8277764451906933, - "grad_norm": 2.4488933427042405, - "learning_rate": 3.971550356116205e-06, - "loss": 0.4517, - "step": 3810 - }, - { - "epoch": 1.830175101942912, - "grad_norm": 2.544480168164569, - "learning_rate": 3.957892737193274e-06, - "loss": 0.4154, - "step": 3815 - }, - { - "epoch": 1.8325737586951307, - "grad_norm": 2.3012603822017144, - "learning_rate": 3.944243241943442e-06, - "loss": 0.3977, - "step": 3820 - }, - { - "epoch": 1.8349724154473495, - "grad_norm": 2.5924353524670045, - "learning_rate": 3.930601976770383e-06, - "loss": 0.4526, - "step": 3825 - }, - { - "epoch": 1.8373710721995682, - "grad_norm": 2.0573535751534275, - "learning_rate": 3.916969048013607e-06, - "loss": 0.4115, - "step": 3830 - }, - { - "epoch": 1.839769728951787, - "grad_norm": 2.6580914984639237, - "learning_rate": 3.9033445619476415e-06, - "loss": 0.4362, - "step": 3835 - }, - { - "epoch": 1.8421683857040057, - "grad_norm": 2.1023106414276973, - "learning_rate": 3.8897286247812006e-06, - "loss": 0.4124, - "step": 3840 - }, - { - "epoch": 1.8445670424562244, - "grad_norm": 2.1445435272357445, - "learning_rate": 3.8761213426563546e-06, - "loss": 0.5047, - "step": 3845 - }, - { - "epoch": 1.8469656992084431, - "grad_norm": 2.2160537149648523, - "learning_rate": 3.862522821647702e-06, - "loss": 0.4063, - "step": 3850 - }, - { - "epoch": 1.849364355960662, - "grad_norm": 2.582471907598274, - "learning_rate": 3.848933167761549e-06, - "loss": 0.39, - "step": 3855 - }, - { - "epoch": 1.8517630127128808, - "grad_norm": 2.4913859988764173, - "learning_rate": 3.835352486935076e-06, - "loss": 0.4045, - "step": 3860 - }, - { - "epoch": 1.8541616694650995, - "grad_norm": 2.4347035193394366, - "learning_rate": 3.821780885035516e-06, - "loss": 0.4174, - "step": 3865 - }, - { - "epoch": 1.8565603262173183, - "grad_norm": 2.557816266559845, - "learning_rate": 3.808218467859326e-06, - "loss": 0.4846, - "step": 3870 - }, - { - "epoch": 1.8589589829695372, - "grad_norm": 2.871791500574788, - "learning_rate": 3.794665341131365e-06, - "loss": 0.3993, - "step": 3875 - }, - { - "epoch": 1.861357639721756, - "grad_norm": 4.453945141895481, - "learning_rate": 3.7811216105040705e-06, - "loss": 0.3666, - "step": 3880 - }, - { - "epoch": 1.8637562964739747, - "grad_norm": 2.7539613985240443, - "learning_rate": 3.7675873815566307e-06, - "loss": 0.4947, - "step": 3885 - }, - { - "epoch": 1.8661549532261934, - "grad_norm": 2.757681777789113, - "learning_rate": 3.754062759794165e-06, - "loss": 0.4364, - "step": 3890 - }, - { - "epoch": 1.8685536099784121, - "grad_norm": 1.8453147641515586, - "learning_rate": 3.740547850646902e-06, - "loss": 0.3905, - "step": 3895 - }, - { - "epoch": 1.8709522667306309, - "grad_norm": 4.16401248546863, - "learning_rate": 3.7270427594693565e-06, - "loss": 0.5428, - "step": 3900 - }, - { - "epoch": 1.8733509234828496, - "grad_norm": 2.7452054734057816, - "learning_rate": 3.713547591539504e-06, - "loss": 0.4806, - "step": 3905 - }, - { - "epoch": 1.8757495802350683, - "grad_norm": 2.188982964972483, - "learning_rate": 3.7000624520579677e-06, - "loss": 0.4278, - "step": 3910 - }, - { - "epoch": 1.878148236987287, - "grad_norm": 2.306468705586506, - "learning_rate": 3.6865874461471974e-06, - "loss": 0.4893, - "step": 3915 - }, - { - "epoch": 1.8805468937395058, - "grad_norm": 2.63525840513707, - "learning_rate": 3.6731226788506412e-06, - "loss": 0.4033, - "step": 3920 - }, - { - "epoch": 1.8829455504917245, - "grad_norm": 2.6391854567866573, - "learning_rate": 3.6596682551319347e-06, - "loss": 0.4328, - "step": 3925 - }, - { - "epoch": 1.8853442072439432, - "grad_norm": 2.3123066585387697, - "learning_rate": 3.6462242798740842e-06, - "loss": 0.4837, - "step": 3930 - }, - { - "epoch": 1.8877428639961622, - "grad_norm": 2.5028797266020484, - "learning_rate": 3.632790857878642e-06, - "loss": 0.3742, - "step": 3935 - }, - { - "epoch": 1.890141520748381, - "grad_norm": 2.0753848490152085, - "learning_rate": 3.6193680938648933e-06, - "loss": 0.3936, - "step": 3940 - }, - { - "epoch": 1.8925401775005997, - "grad_norm": 1.5576844963836263, - "learning_rate": 3.605956092469043e-06, - "loss": 0.47, - "step": 3945 - }, - { - "epoch": 1.8949388342528184, - "grad_norm": 1.7508643803207475, - "learning_rate": 3.5925549582433937e-06, - "loss": 0.3943, - "step": 3950 - }, - { - "epoch": 1.8973374910050371, - "grad_norm": 2.504931518208198, - "learning_rate": 3.5791647956555347e-06, - "loss": 0.4571, - "step": 3955 - }, - { - "epoch": 1.899736147757256, - "grad_norm": 1.372851159785718, - "learning_rate": 3.565785709087527e-06, - "loss": 0.3731, - "step": 3960 - }, - { - "epoch": 1.9021348045094748, - "grad_norm": 2.550831447418303, - "learning_rate": 3.5524178028350886e-06, - "loss": 0.4265, - "step": 3965 - }, - { - "epoch": 1.9045334612616935, - "grad_norm": 1.9108344497032907, - "learning_rate": 3.5390611811067844e-06, - "loss": 0.3443, - "step": 3970 - }, - { - "epoch": 1.9069321180139123, - "grad_norm": 2.112501662712423, - "learning_rate": 3.5257159480232085e-06, - "loss": 0.4774, - "step": 3975 - }, - { - "epoch": 1.909330774766131, - "grad_norm": 2.3842938517301517, - "learning_rate": 3.512382207616176e-06, - "loss": 0.5172, - "step": 3980 - }, - { - "epoch": 1.9117294315183497, - "grad_norm": 2.173550770298736, - "learning_rate": 3.4990600638279143e-06, - "loss": 0.4375, - "step": 3985 - }, - { - "epoch": 1.9141280882705685, - "grad_norm": 2.533900150370561, - "learning_rate": 3.4857496205102475e-06, - "loss": 0.4809, - "step": 3990 - }, - { - "epoch": 1.9165267450227872, - "grad_norm": 2.3636952079374325, - "learning_rate": 3.4724509814237894e-06, - "loss": 0.4117, - "step": 3995 - }, - { - "epoch": 1.918925401775006, - "grad_norm": 2.838190159310154, - "learning_rate": 3.4591642502371383e-06, - "loss": 0.4682, - "step": 4000 - }, - { - "epoch": 1.918925401775006, - "eval_loss": 0.8692654967308044, - "eval_runtime": 740.4824, - "eval_samples_per_second": 10.007, - "eval_steps_per_second": 0.627, - "step": 4000 - }, - { - "epoch": 1.9213240585272247, - "grad_norm": 2.1605145377053927, - "learning_rate": 3.445889530526061e-06, - "loss": 0.4229, - "step": 4005 - }, - { - "epoch": 1.9237227152794434, - "grad_norm": 2.2539865850184113, - "learning_rate": 3.4326269257726907e-06, - "loss": 0.4552, - "step": 4010 - }, - { - "epoch": 1.9261213720316621, - "grad_norm": 2.4596434765648127, - "learning_rate": 3.4193765393647217e-06, - "loss": 0.4016, - "step": 4015 - }, - { - "epoch": 1.928520028783881, - "grad_norm": 2.4021095095636045, - "learning_rate": 3.406138474594601e-06, - "loss": 0.4841, - "step": 4020 - }, - { - "epoch": 1.9309186855360998, - "grad_norm": 3.030531595041229, - "learning_rate": 3.3929128346587192e-06, - "loss": 0.4347, - "step": 4025 - }, - { - "epoch": 1.9333173422883185, - "grad_norm": 1.7489998010970578, - "learning_rate": 3.379699722656612e-06, - "loss": 0.4193, - "step": 4030 - }, - { - "epoch": 1.9357159990405373, - "grad_norm": 2.6910202006360446, - "learning_rate": 3.3664992415901575e-06, - "loss": 0.4323, - "step": 4035 - }, - { - "epoch": 1.9381146557927562, - "grad_norm": 3.0994878154800998, - "learning_rate": 3.3533114943627666e-06, - "loss": 0.4332, - "step": 4040 - }, - { - "epoch": 1.940513312544975, - "grad_norm": 2.31042002303757, - "learning_rate": 3.3401365837785828e-06, - "loss": 0.3942, - "step": 4045 - }, - { - "epoch": 1.9429119692971937, - "grad_norm": 2.233640476594205, - "learning_rate": 3.326974612541689e-06, - "loss": 0.4187, - "step": 4050 - }, - { - "epoch": 1.9453106260494124, - "grad_norm": 2.315235530807908, - "learning_rate": 3.313825683255295e-06, - "loss": 0.4322, - "step": 4055 - }, - { - "epoch": 1.9477092828016311, - "grad_norm": 2.453655450256884, - "learning_rate": 3.3006898984209436e-06, - "loss": 0.44, - "step": 4060 - }, - { - "epoch": 1.9501079395538499, - "grad_norm": 1.6854114253515358, - "learning_rate": 3.287567360437712e-06, - "loss": 0.439, - "step": 4065 - }, - { - "epoch": 1.9525065963060686, - "grad_norm": 2.0995266216210258, - "learning_rate": 3.274458171601414e-06, - "loss": 0.4386, - "step": 4070 - }, - { - "epoch": 1.9549052530582873, - "grad_norm": 2.947024685833257, - "learning_rate": 3.2613624341037994e-06, - "loss": 0.4262, - "step": 4075 - }, - { - "epoch": 1.957303909810506, - "grad_norm": 2.055447214443006, - "learning_rate": 3.248280250031758e-06, - "loss": 0.4234, - "step": 4080 - }, - { - "epoch": 1.9597025665627248, - "grad_norm": 2.404771167899297, - "learning_rate": 3.2352117213665247e-06, - "loss": 0.4254, - "step": 4085 - }, - { - "epoch": 1.9621012233149435, - "grad_norm": 2.5779058142571465, - "learning_rate": 3.222156949982889e-06, - "loss": 0.3543, - "step": 4090 - }, - { - "epoch": 1.9644998800671623, - "grad_norm": 2.713609918807594, - "learning_rate": 3.209116037648391e-06, - "loss": 0.4684, - "step": 4095 - }, - { - "epoch": 1.966898536819381, - "grad_norm": 2.479178615500214, - "learning_rate": 3.1960890860225364e-06, - "loss": 0.4405, - "step": 4100 - }, - { - "epoch": 1.9692971935716, - "grad_norm": 2.731958193705481, - "learning_rate": 3.1830761966560013e-06, - "loss": 0.4894, - "step": 4105 - }, - { - "epoch": 1.9716958503238187, - "grad_norm": 1.7562274667575501, - "learning_rate": 3.1700774709898403e-06, - "loss": 0.4007, - "step": 4110 - }, - { - "epoch": 1.9740945070760374, - "grad_norm": 2.3200173678085525, - "learning_rate": 3.157093010354692e-06, - "loss": 0.3679, - "step": 4115 - }, - { - "epoch": 1.9764931638282561, - "grad_norm": 2.4945622359580533, - "learning_rate": 3.1441229159700004e-06, - "loss": 0.4625, - "step": 4120 - }, - { - "epoch": 1.978891820580475, - "grad_norm": 2.488700609342745, - "learning_rate": 3.1311672889432145e-06, - "loss": 0.4021, - "step": 4125 - }, - { - "epoch": 1.9812904773326938, - "grad_norm": 1.3886751642557622, - "learning_rate": 3.1182262302690025e-06, - "loss": 0.3989, - "step": 4130 - }, - { - "epoch": 1.9836891340849125, - "grad_norm": 2.9028776554358866, - "learning_rate": 3.1052998408284664e-06, - "loss": 0.4477, - "step": 4135 - }, - { - "epoch": 1.9860877908371313, - "grad_norm": 2.4419026307903224, - "learning_rate": 3.0923882213883593e-06, - "loss": 0.4037, - "step": 4140 - }, - { - "epoch": 1.98848644758935, - "grad_norm": 2.8596964505015614, - "learning_rate": 3.0794914726002904e-06, - "loss": 0.4253, - "step": 4145 - }, - { - "epoch": 1.9908851043415687, - "grad_norm": 2.3119885246996614, - "learning_rate": 3.0666096949999493e-06, - "loss": 0.4408, - "step": 4150 - }, - { - "epoch": 1.9932837610937875, - "grad_norm": 1.603854256806702, - "learning_rate": 3.0537429890063177e-06, - "loss": 0.4348, - "step": 4155 - }, - { - "epoch": 1.9956824178460062, - "grad_norm": 2.247485386854169, - "learning_rate": 3.0408914549208867e-06, - "loss": 0.4137, - "step": 4160 - }, - { - "epoch": 1.998081074598225, - "grad_norm": 2.323713545683984, - "learning_rate": 3.0280551929268763e-06, - "loss": 0.429, - "step": 4165 - }, - { - "epoch": 2.0004797313504437, - "grad_norm": 1.7492359777642301, - "learning_rate": 3.0152343030884586e-06, - "loss": 0.4368, - "step": 4170 - }, - { - "epoch": 2.0028783881026624, - "grad_norm": 1.2104549778941618, - "learning_rate": 3.002428885349965e-06, - "loss": 0.1384, - "step": 4175 - }, - { - "epoch": 2.005277044854881, - "grad_norm": 1.2533431726821433, - "learning_rate": 2.9896390395351215e-06, - "loss": 0.1307, - "step": 4180 - }, - { - "epoch": 2.0076757016071, - "grad_norm": 1.6153822677232479, - "learning_rate": 2.9768648653462617e-06, - "loss": 0.1509, - "step": 4185 - }, - { - "epoch": 2.0100743583593186, - "grad_norm": 1.9706051369619293, - "learning_rate": 2.964106462363551e-06, - "loss": 0.1407, - "step": 4190 - }, - { - "epoch": 2.0124730151115378, - "grad_norm": 1.5835329547161818, - "learning_rate": 2.9513639300442156e-06, - "loss": 0.1423, - "step": 4195 - }, - { - "epoch": 2.0148716718637565, - "grad_norm": 1.4932494900950153, - "learning_rate": 2.9386373677217583e-06, - "loss": 0.1328, - "step": 4200 - }, - { - "epoch": 2.017270328615975, - "grad_norm": 1.373976549075157, - "learning_rate": 2.925926874605192e-06, - "loss": 0.1239, - "step": 4205 - }, - { - "epoch": 2.019668985368194, - "grad_norm": 2.6678759030139245, - "learning_rate": 2.9132325497782637e-06, - "loss": 0.1576, - "step": 4210 - }, - { - "epoch": 2.0220676421204127, - "grad_norm": 1.8908086442023389, - "learning_rate": 2.9005544921986774e-06, - "loss": 0.1438, - "step": 4215 - }, - { - "epoch": 2.0244662988726314, - "grad_norm": 1.4406396259902392, - "learning_rate": 2.8878928006973326e-06, - "loss": 0.1172, - "step": 4220 - }, - { - "epoch": 2.02686495562485, - "grad_norm": 3.144383893347305, - "learning_rate": 2.8752475739775456e-06, - "loss": 0.1248, - "step": 4225 - }, - { - "epoch": 2.029263612377069, - "grad_norm": 1.5254881527613178, - "learning_rate": 2.862618910614281e-06, - "loss": 0.1166, - "step": 4230 - }, - { - "epoch": 2.0316622691292876, - "grad_norm": 1.5380395388583785, - "learning_rate": 2.8500069090533845e-06, - "loss": 0.122, - "step": 4235 - }, - { - "epoch": 2.0340609258815063, - "grad_norm": 1.8517578832693309, - "learning_rate": 2.837411667610823e-06, - "loss": 0.1264, - "step": 4240 - }, - { - "epoch": 2.036459582633725, - "grad_norm": 1.5611087867626268, - "learning_rate": 2.8248332844719033e-06, - "loss": 0.1273, - "step": 4245 - }, - { - "epoch": 2.038858239385944, - "grad_norm": 1.9829636176688326, - "learning_rate": 2.8122718576905174e-06, - "loss": 0.1211, - "step": 4250 - }, - { - "epoch": 2.0412568961381625, - "grad_norm": 2.336890942831196, - "learning_rate": 2.7997274851883748e-06, - "loss": 0.1329, - "step": 4255 - }, - { - "epoch": 2.0436555528903813, - "grad_norm": 2.123217631063763, - "learning_rate": 2.7872002647542395e-06, - "loss": 0.1294, - "step": 4260 - }, - { - "epoch": 2.0460542096426, - "grad_norm": 1.3603389345629862, - "learning_rate": 2.7746902940431665e-06, - "loss": 0.1183, - "step": 4265 - }, - { - "epoch": 2.0484528663948187, - "grad_norm": 2.9582290350986686, - "learning_rate": 2.7621976705757446e-06, - "loss": 0.1573, - "step": 4270 - }, - { - "epoch": 2.0508515231470374, - "grad_norm": 1.6318991512169887, - "learning_rate": 2.7497224917373355e-06, - "loss": 0.1487, - "step": 4275 - }, - { - "epoch": 2.0532501798992566, - "grad_norm": 1.317567144216617, - "learning_rate": 2.7372648547773063e-06, - "loss": 0.1214, - "step": 4280 - }, - { - "epoch": 2.0556488366514754, - "grad_norm": 1.4968986861944702, - "learning_rate": 2.7248248568082796e-06, - "loss": 0.1311, - "step": 4285 - }, - { - "epoch": 2.058047493403694, - "grad_norm": 2.066601497926396, - "learning_rate": 2.712402594805377e-06, - "loss": 0.132, - "step": 4290 - }, - { - "epoch": 2.060446150155913, - "grad_norm": 2.0874309705799505, - "learning_rate": 2.6999981656054587e-06, - "loss": 0.1406, - "step": 4295 - }, - { - "epoch": 2.0628448069081315, - "grad_norm": 1.2870381784002116, - "learning_rate": 2.687611665906369e-06, - "loss": 0.1385, - "step": 4300 - }, - { - "epoch": 2.0652434636603503, - "grad_norm": 1.5033982222031699, - "learning_rate": 2.675243192266189e-06, - "loss": 0.1082, - "step": 4305 - }, - { - "epoch": 2.067642120412569, - "grad_norm": 1.4861250410016393, - "learning_rate": 2.662892841102474e-06, - "loss": 0.1354, - "step": 4310 - }, - { - "epoch": 2.0700407771647877, - "grad_norm": 1.3891683331641131, - "learning_rate": 2.65056070869151e-06, - "loss": 0.1365, - "step": 4315 - }, - { - "epoch": 2.0724394339170065, - "grad_norm": 1.6768904681198002, - "learning_rate": 2.6382468911675553e-06, - "loss": 0.1319, - "step": 4320 - }, - { - "epoch": 2.074838090669225, - "grad_norm": 2.125719533478666, - "learning_rate": 2.6259514845221045e-06, - "loss": 0.1524, - "step": 4325 - }, - { - "epoch": 2.077236747421444, - "grad_norm": 1.724095586385624, - "learning_rate": 2.6136745846031232e-06, - "loss": 0.1382, - "step": 4330 - }, - { - "epoch": 2.0796354041736627, - "grad_norm": 1.5007407801751065, - "learning_rate": 2.6014162871143123e-06, - "loss": 0.1319, - "step": 4335 - }, - { - "epoch": 2.0820340609258814, - "grad_norm": 1.7606564742094755, - "learning_rate": 2.589176687614357e-06, - "loss": 0.125, - "step": 4340 - }, - { - "epoch": 2.0844327176781, - "grad_norm": 1.8469687804386385, - "learning_rate": 2.5769558815161887e-06, - "loss": 0.1351, - "step": 4345 - }, - { - "epoch": 2.086831374430319, - "grad_norm": 1.8472855123668117, - "learning_rate": 2.5647539640862316e-06, - "loss": 0.1246, - "step": 4350 - }, - { - "epoch": 2.0892300311825376, - "grad_norm": 3.765807007677004, - "learning_rate": 2.5525710304436658e-06, - "loss": 0.1063, - "step": 4355 - }, - { - "epoch": 2.0916286879347563, - "grad_norm": 1.589551356385384, - "learning_rate": 2.540407175559684e-06, - "loss": 0.1129, - "step": 4360 - }, - { - "epoch": 2.0940273446869755, - "grad_norm": 1.7923356652831175, - "learning_rate": 2.5282624942567523e-06, - "loss": 0.119, - "step": 4365 - }, - { - "epoch": 2.096426001439194, - "grad_norm": 2.4656699199229273, - "learning_rate": 2.5161370812078687e-06, - "loss": 0.1119, - "step": 4370 - }, - { - "epoch": 2.098824658191413, - "grad_norm": 3.4413079643746376, - "learning_rate": 2.504031030935832e-06, - "loss": 0.1495, - "step": 4375 - }, - { - "epoch": 2.1012233149436317, - "grad_norm": 1.5898426640404113, - "learning_rate": 2.491944437812497e-06, - "loss": 0.1322, - "step": 4380 - }, - { - "epoch": 2.1036219716958504, - "grad_norm": 1.305705593587179, - "learning_rate": 2.4798773960580403e-06, - "loss": 0.1462, - "step": 4385 - }, - { - "epoch": 2.106020628448069, - "grad_norm": 1.54107319224433, - "learning_rate": 2.4678299997402276e-06, - "loss": 0.1191, - "step": 4390 - }, - { - "epoch": 2.108419285200288, - "grad_norm": 1.4056141128475004, - "learning_rate": 2.4558023427736817e-06, - "loss": 0.0998, - "step": 4395 - }, - { - "epoch": 2.1108179419525066, - "grad_norm": 1.5206415072257935, - "learning_rate": 2.4437945189191464e-06, - "loss": 0.1625, - "step": 4400 - }, - { - "epoch": 2.1132165987047253, - "grad_norm": 1.7861408291312562, - "learning_rate": 2.431806621782758e-06, - "loss": 0.1334, - "step": 4405 - }, - { - "epoch": 2.115615255456944, - "grad_norm": 1.4757372455135827, - "learning_rate": 2.4198387448153205e-06, - "loss": 0.1038, - "step": 4410 - }, - { - "epoch": 2.118013912209163, - "grad_norm": 1.9894165818208251, - "learning_rate": 2.407890981311566e-06, - "loss": 0.1199, - "step": 4415 - }, - { - "epoch": 2.1204125689613815, - "grad_norm": 2.0026704399459336, - "learning_rate": 2.3959634244094336e-06, - "loss": 0.1275, - "step": 4420 - }, - { - "epoch": 2.1228112257136003, - "grad_norm": 1.5674286742154901, - "learning_rate": 2.38405616708935e-06, - "loss": 0.1255, - "step": 4425 - }, - { - "epoch": 2.125209882465819, - "grad_norm": 1.5064556826612898, - "learning_rate": 2.37216930217349e-06, - "loss": 0.1347, - "step": 4430 - }, - { - "epoch": 2.1276085392180377, - "grad_norm": 1.8166945572467463, - "learning_rate": 2.3603029223250657e-06, - "loss": 0.1168, - "step": 4435 - }, - { - "epoch": 2.1300071959702565, - "grad_norm": 1.4920164777188665, - "learning_rate": 2.348457120047597e-06, - "loss": 0.1164, - "step": 4440 - }, - { - "epoch": 2.132405852722475, - "grad_norm": 1.8683865713478673, - "learning_rate": 2.336631987684192e-06, - "loss": 0.1335, - "step": 4445 - }, - { - "epoch": 2.1348045094746944, - "grad_norm": 1.5965976402737696, - "learning_rate": 2.3248276174168333e-06, - "loss": 0.1333, - "step": 4450 - }, - { - "epoch": 2.137203166226913, - "grad_norm": 2.0761993267672527, - "learning_rate": 2.3130441012656485e-06, - "loss": 0.117, - "step": 4455 - }, - { - "epoch": 2.139601822979132, - "grad_norm": 1.9907693084636011, - "learning_rate": 2.3012815310882e-06, - "loss": 0.1529, - "step": 4460 - }, - { - "epoch": 2.1420004797313505, - "grad_norm": 1.427001377831548, - "learning_rate": 2.289539998578768e-06, - "loss": 0.1144, - "step": 4465 - }, - { - "epoch": 2.1443991364835693, - "grad_norm": 1.9701633935879637, - "learning_rate": 2.2778195952676345e-06, - "loss": 0.1346, - "step": 4470 - }, - { - "epoch": 2.146797793235788, - "grad_norm": 1.8539619958760687, - "learning_rate": 2.266120412520369e-06, - "loss": 0.1171, - "step": 4475 - }, - { - "epoch": 2.1491964499880067, - "grad_norm": 1.3067217228012888, - "learning_rate": 2.25444254153712e-06, - "loss": 0.1147, - "step": 4480 - }, - { - "epoch": 2.1515951067402255, - "grad_norm": 2.077001334839875, - "learning_rate": 2.2427860733519034e-06, - "loss": 0.1338, - "step": 4485 - }, - { - "epoch": 2.153993763492444, - "grad_norm": 2.696463098963866, - "learning_rate": 2.2311510988318865e-06, - "loss": 0.1376, - "step": 4490 - }, - { - "epoch": 2.156392420244663, - "grad_norm": 1.4341789258623368, - "learning_rate": 2.2195377086766862e-06, - "loss": 0.1102, - "step": 4495 - }, - { - "epoch": 2.1587910769968817, - "grad_norm": 2.0711628473207466, - "learning_rate": 2.2079459934176596e-06, - "loss": 0.1632, - "step": 4500 - }, - { - "epoch": 2.1611897337491004, - "grad_norm": 1.6336280241127499, - "learning_rate": 2.1963760434172e-06, - "loss": 0.1233, - "step": 4505 - }, - { - "epoch": 2.163588390501319, - "grad_norm": 1.8495280260216118, - "learning_rate": 2.184827948868028e-06, - "loss": 0.1315, - "step": 4510 - }, - { - "epoch": 2.165987047253538, - "grad_norm": 1.9526925374165924, - "learning_rate": 2.173301799792497e-06, - "loss": 0.116, - "step": 4515 - }, - { - "epoch": 2.1683857040057566, - "grad_norm": 3.3827621253228974, - "learning_rate": 2.161797686041882e-06, - "loss": 0.1279, - "step": 4520 - }, - { - "epoch": 2.1707843607579758, - "grad_norm": 1.5109194055996138, - "learning_rate": 2.1503156972956823e-06, - "loss": 0.1154, - "step": 4525 - }, - { - "epoch": 2.1731830175101945, - "grad_norm": 2.2477580547961473, - "learning_rate": 2.1388559230609278e-06, - "loss": 0.1401, - "step": 4530 - }, - { - "epoch": 2.1755816742624132, - "grad_norm": 2.0474896431675096, - "learning_rate": 2.1274184526714733e-06, - "loss": 0.1051, - "step": 4535 - }, - { - "epoch": 2.177980331014632, - "grad_norm": 1.5451984571822854, - "learning_rate": 2.1160033752873053e-06, - "loss": 0.1326, - "step": 4540 - }, - { - "epoch": 2.1803789877668507, - "grad_norm": 2.126613331135665, - "learning_rate": 2.1046107798938493e-06, - "loss": 0.1549, - "step": 4545 - }, - { - "epoch": 2.1827776445190694, - "grad_norm": 1.4317126044877562, - "learning_rate": 2.09324075530127e-06, - "loss": 0.1164, - "step": 4550 - }, - { - "epoch": 2.185176301271288, - "grad_norm": 2.1177926714585587, - "learning_rate": 2.0818933901437905e-06, - "loss": 0.1415, - "step": 4555 - }, - { - "epoch": 2.187574958023507, - "grad_norm": 1.283216594231968, - "learning_rate": 2.0705687728789876e-06, - "loss": 0.1271, - "step": 4560 - }, - { - "epoch": 2.1899736147757256, - "grad_norm": 1.2736682129552483, - "learning_rate": 2.0592669917871093e-06, - "loss": 0.1424, - "step": 4565 - }, - { - "epoch": 2.1923722715279443, - "grad_norm": 1.829115525077864, - "learning_rate": 2.0479881349703885e-06, - "loss": 0.1298, - "step": 4570 - }, - { - "epoch": 2.194770928280163, - "grad_norm": 1.795266905694527, - "learning_rate": 2.0367322903523483e-06, - "loss": 0.1233, - "step": 4575 - }, - { - "epoch": 2.197169585032382, - "grad_norm": 1.2760353352996483, - "learning_rate": 2.0254995456771303e-06, - "loss": 0.1279, - "step": 4580 - }, - { - "epoch": 2.1995682417846005, - "grad_norm": 1.7111968554436063, - "learning_rate": 2.0142899885087937e-06, - "loss": 0.1275, - "step": 4585 - }, - { - "epoch": 2.2019668985368193, - "grad_norm": 1.4094962158008097, - "learning_rate": 2.0031037062306467e-06, - "loss": 0.1219, - "step": 4590 - }, - { - "epoch": 2.204365555289038, - "grad_norm": 1.4066646174683728, - "learning_rate": 1.9919407860445585e-06, - "loss": 0.1231, - "step": 4595 - }, - { - "epoch": 2.2067642120412567, - "grad_norm": 1.6071130525622066, - "learning_rate": 1.9808013149702776e-06, - "loss": 0.1323, - "step": 4600 - }, - { - "epoch": 2.2091628687934755, - "grad_norm": 2.547322876356862, - "learning_rate": 1.9696853798447603e-06, - "loss": 0.127, - "step": 4605 - }, - { - "epoch": 2.2115615255456946, - "grad_norm": 1.5483008900838984, - "learning_rate": 1.9585930673214885e-06, - "loss": 0.1074, - "step": 4610 - }, - { - "epoch": 2.2139601822979134, - "grad_norm": 1.5382419648678334, - "learning_rate": 1.9475244638697943e-06, - "loss": 0.1526, - "step": 4615 - }, - { - "epoch": 2.216358839050132, - "grad_norm": 2.8665464265307534, - "learning_rate": 1.936479655774193e-06, - "loss": 0.1432, - "step": 4620 - }, - { - "epoch": 2.218757495802351, - "grad_norm": 1.3472673683557654, - "learning_rate": 1.9254587291336975e-06, - "loss": 0.1111, - "step": 4625 - }, - { - "epoch": 2.2211561525545696, - "grad_norm": 3.2736752179779467, - "learning_rate": 1.9144617698611616e-06, - "loss": 0.1251, - "step": 4630 - }, - { - "epoch": 2.2235548093067883, - "grad_norm": 1.5309415711503425, - "learning_rate": 1.9034888636825976e-06, - "loss": 0.107, - "step": 4635 - }, - { - "epoch": 2.225953466059007, - "grad_norm": 1.6722342536437953, - "learning_rate": 1.8925400961365164e-06, - "loss": 0.1271, - "step": 4640 - }, - { - "epoch": 2.2283521228112257, - "grad_norm": 1.2072514830382435, - "learning_rate": 1.8816155525732566e-06, - "loss": 0.1161, - "step": 4645 - }, - { - "epoch": 2.2307507795634445, - "grad_norm": 1.8683460607192535, - "learning_rate": 1.870715318154322e-06, - "loss": 0.1153, - "step": 4650 - }, - { - "epoch": 2.233149436315663, - "grad_norm": 1.8005183568624918, - "learning_rate": 1.8598394778517125e-06, - "loss": 0.1312, - "step": 4655 - }, - { - "epoch": 2.235548093067882, - "grad_norm": 1.803493471243067, - "learning_rate": 1.8489881164472722e-06, - "loss": 0.144, - "step": 4660 - }, - { - "epoch": 2.2379467498201007, - "grad_norm": 1.2566663432508494, - "learning_rate": 1.8381613185320158e-06, - "loss": 0.1136, - "step": 4665 - }, - { - "epoch": 2.2403454065723194, - "grad_norm": 1.7624206247680096, - "learning_rate": 1.827359168505477e-06, - "loss": 0.1045, - "step": 4670 - }, - { - "epoch": 2.242744063324538, - "grad_norm": 1.5566145746764684, - "learning_rate": 1.8165817505750483e-06, - "loss": 0.0988, - "step": 4675 - }, - { - "epoch": 2.245142720076757, - "grad_norm": 1.5186784492397856, - "learning_rate": 1.8058291487553243e-06, - "loss": 0.1304, - "step": 4680 - }, - { - "epoch": 2.2475413768289756, - "grad_norm": 1.825383805349096, - "learning_rate": 1.795101446867451e-06, - "loss": 0.1324, - "step": 4685 - }, - { - "epoch": 2.2499400335811943, - "grad_norm": 1.8077386836692138, - "learning_rate": 1.784398728538463e-06, - "loss": 0.1119, - "step": 4690 - }, - { - "epoch": 2.2523386903334135, - "grad_norm": 1.3139307927703525, - "learning_rate": 1.773721077200643e-06, - "loss": 0.1284, - "step": 4695 - }, - { - "epoch": 2.2547373470856322, - "grad_norm": 1.9359776479828592, - "learning_rate": 1.7630685760908623e-06, - "loss": 0.1128, - "step": 4700 - }, - { - "epoch": 2.257136003837851, - "grad_norm": 1.5512067156614446, - "learning_rate": 1.7524413082499354e-06, - "loss": 0.1199, - "step": 4705 - }, - { - "epoch": 2.2595346605900697, - "grad_norm": 2.06959679504393, - "learning_rate": 1.7418393565219737e-06, - "loss": 0.1325, - "step": 4710 - }, - { - "epoch": 2.2619333173422884, - "grad_norm": 2.2066122371220804, - "learning_rate": 1.7312628035537388e-06, - "loss": 0.1191, - "step": 4715 - }, - { - "epoch": 2.264331974094507, - "grad_norm": 2.1618555873923513, - "learning_rate": 1.720711731793996e-06, - "loss": 0.141, - "step": 4720 - }, - { - "epoch": 2.266730630846726, - "grad_norm": 2.2477809764108643, - "learning_rate": 1.710186223492878e-06, - "loss": 0.1607, - "step": 4725 - }, - { - "epoch": 2.2691292875989446, - "grad_norm": 2.445594160005071, - "learning_rate": 1.6996863607012337e-06, - "loss": 0.1241, - "step": 4730 - }, - { - "epoch": 2.2715279443511633, - "grad_norm": 2.2636372479823024, - "learning_rate": 1.6892122252700005e-06, - "loss": 0.1471, - "step": 4735 - }, - { - "epoch": 2.273926601103382, - "grad_norm": 2.165613571335606, - "learning_rate": 1.6787638988495548e-06, - "loss": 0.1171, - "step": 4740 - }, - { - "epoch": 2.276325257855601, - "grad_norm": 1.5501086091890792, - "learning_rate": 1.668341462889083e-06, - "loss": 0.1152, - "step": 4745 - }, - { - "epoch": 2.2787239146078195, - "grad_norm": 2.324272542162646, - "learning_rate": 1.657944998635944e-06, - "loss": 0.1413, - "step": 4750 - }, - { - "epoch": 2.2811225713600383, - "grad_norm": 1.3705751476924688, - "learning_rate": 1.6475745871350356e-06, - "loss": 0.1255, - "step": 4755 - }, - { - "epoch": 2.283521228112257, - "grad_norm": 1.6009570905341144, - "learning_rate": 1.6372303092281616e-06, - "loss": 0.1013, - "step": 4760 - }, - { - "epoch": 2.2859198848644757, - "grad_norm": 2.0897856937356054, - "learning_rate": 1.6269122455534097e-06, - "loss": 0.1121, - "step": 4765 - }, - { - "epoch": 2.2883185416166945, - "grad_norm": 2.0733573382322046, - "learning_rate": 1.6166204765445093e-06, - "loss": 0.1211, - "step": 4770 - }, - { - "epoch": 2.290717198368913, - "grad_norm": 1.6760313776999054, - "learning_rate": 1.6063550824302143e-06, - "loss": 0.147, - "step": 4775 - }, - { - "epoch": 2.2931158551211324, - "grad_norm": 1.4628986967241502, - "learning_rate": 1.5961161432336736e-06, - "loss": 0.1017, - "step": 4780 - }, - { - "epoch": 2.295514511873351, - "grad_norm": 1.5119823167350506, - "learning_rate": 1.585903738771813e-06, - "loss": 0.1186, - "step": 4785 - }, - { - "epoch": 2.29791316862557, - "grad_norm": 2.114617503567997, - "learning_rate": 1.5757179486547043e-06, - "loss": 0.1107, - "step": 4790 - }, - { - "epoch": 2.3003118253777886, - "grad_norm": 2.1299147651150254, - "learning_rate": 1.5655588522849486e-06, - "loss": 0.1228, - "step": 4795 - }, - { - "epoch": 2.3027104821300073, - "grad_norm": 2.0091940656270784, - "learning_rate": 1.555426528857063e-06, - "loss": 0.1328, - "step": 4800 - }, - { - "epoch": 2.305109138882226, - "grad_norm": 2.278463048269149, - "learning_rate": 1.5453210573568523e-06, - "loss": 0.0995, - "step": 4805 - }, - { - "epoch": 2.3075077956344447, - "grad_norm": 2.0911503889315672, - "learning_rate": 1.5352425165608025e-06, - "loss": 0.1281, - "step": 4810 - }, - { - "epoch": 2.3099064523866635, - "grad_norm": 2.2008586407114072, - "learning_rate": 1.5251909850354613e-06, - "loss": 0.1319, - "step": 4815 - }, - { - "epoch": 2.312305109138882, - "grad_norm": 1.7793237602426986, - "learning_rate": 1.5151665411368288e-06, - "loss": 0.1015, - "step": 4820 - }, - { - "epoch": 2.314703765891101, - "grad_norm": 1.892162960562313, - "learning_rate": 1.5051692630097426e-06, - "loss": 0.1213, - "step": 4825 - }, - { - "epoch": 2.3171024226433197, - "grad_norm": 1.454131556874955, - "learning_rate": 1.4951992285872779e-06, - "loss": 0.1135, - "step": 4830 - }, - { - "epoch": 2.3195010793955384, - "grad_norm": 1.9721817974769122, - "learning_rate": 1.4852565155901254e-06, - "loss": 0.1032, - "step": 4835 - }, - { - "epoch": 2.321899736147757, - "grad_norm": 1.5362146521829505, - "learning_rate": 1.4753412015260017e-06, - "loss": 0.15, - "step": 4840 - }, - { - "epoch": 2.324298392899976, - "grad_norm": 1.883056468435468, - "learning_rate": 1.46545336368903e-06, - "loss": 0.1357, - "step": 4845 - }, - { - "epoch": 2.3266970496521946, - "grad_norm": 1.5230339355008213, - "learning_rate": 1.4555930791591483e-06, - "loss": 0.1174, - "step": 4850 - }, - { - "epoch": 2.3290957064044138, - "grad_norm": 2.0634402334235427, - "learning_rate": 1.4457604248015039e-06, - "loss": 0.1594, - "step": 4855 - }, - { - "epoch": 2.331494363156632, - "grad_norm": 2.5978427352784346, - "learning_rate": 1.4359554772658551e-06, - "loss": 0.1453, - "step": 4860 - }, - { - "epoch": 2.3338930199088512, - "grad_norm": 1.4828036272318572, - "learning_rate": 1.4261783129859712e-06, - "loss": 0.1135, - "step": 4865 - }, - { - "epoch": 2.33629167666107, - "grad_norm": 1.6377747205147426, - "learning_rate": 1.4164290081790449e-06, - "loss": 0.131, - "step": 4870 - }, - { - "epoch": 2.3386903334132887, - "grad_norm": 1.3431874234243162, - "learning_rate": 1.406707638845088e-06, - "loss": 0.0933, - "step": 4875 - }, - { - "epoch": 2.3410889901655074, - "grad_norm": 1.7396553876263976, - "learning_rate": 1.3970142807663434e-06, - "loss": 0.1035, - "step": 4880 - }, - { - "epoch": 2.343487646917726, - "grad_norm": 1.038152016301309, - "learning_rate": 1.3873490095066933e-06, - "loss": 0.1312, - "step": 4885 - }, - { - "epoch": 2.345886303669945, - "grad_norm": 1.9332345560468491, - "learning_rate": 1.3777119004110773e-06, - "loss": 0.119, - "step": 4890 - }, - { - "epoch": 2.3482849604221636, - "grad_norm": 1.2020052938888175, - "learning_rate": 1.3681030286048913e-06, - "loss": 0.1247, - "step": 4895 - }, - { - "epoch": 2.3506836171743823, - "grad_norm": 1.3034758430102855, - "learning_rate": 1.3585224689934123e-06, - "loss": 0.1417, - "step": 4900 - }, - { - "epoch": 2.353082273926601, - "grad_norm": 1.6110679516673296, - "learning_rate": 1.348970296261214e-06, - "loss": 0.1232, - "step": 4905 - }, - { - "epoch": 2.35548093067882, - "grad_norm": 1.3235836550393674, - "learning_rate": 1.3394465848715787e-06, - "loss": 0.1143, - "step": 4910 - }, - { - "epoch": 2.3578795874310385, - "grad_norm": 1.7551178365926876, - "learning_rate": 1.3299514090659216e-06, - "loss": 0.1253, - "step": 4915 - }, - { - "epoch": 2.3602782441832573, - "grad_norm": 1.8650654499857218, - "learning_rate": 1.320484842863211e-06, - "loss": 0.1238, - "step": 4920 - }, - { - "epoch": 2.362676900935476, - "grad_norm": 1.8525194939207048, - "learning_rate": 1.311046960059391e-06, - "loss": 0.1579, - "step": 4925 - }, - { - "epoch": 2.3650755576876947, - "grad_norm": 1.5852491550316363, - "learning_rate": 1.301637834226806e-06, - "loss": 0.1429, - "step": 4930 - }, - { - "epoch": 2.3674742144399135, - "grad_norm": 1.429254229854141, - "learning_rate": 1.2922575387136266e-06, - "loss": 0.1468, - "step": 4935 - }, - { - "epoch": 2.3698728711921326, - "grad_norm": 1.4466523780422678, - "learning_rate": 1.2829061466432807e-06, - "loss": 0.1008, - "step": 4940 - }, - { - "epoch": 2.372271527944351, - "grad_norm": 1.5252773310736942, - "learning_rate": 1.2735837309138827e-06, - "loss": 0.1144, - "step": 4945 - }, - { - "epoch": 2.37467018469657, - "grad_norm": 1.5855398691958067, - "learning_rate": 1.2642903641976595e-06, - "loss": 0.1279, - "step": 4950 - }, - { - "epoch": 2.377068841448789, - "grad_norm": 1.9995841417171394, - "learning_rate": 1.2550261189403912e-06, - "loss": 0.1472, - "step": 4955 - }, - { - "epoch": 2.3794674982010076, - "grad_norm": 1.8707045496556725, - "learning_rate": 1.2457910673608431e-06, - "loss": 0.122, - "step": 4960 - }, - { - "epoch": 2.3818661549532263, - "grad_norm": 1.7079603628945372, - "learning_rate": 1.2365852814502044e-06, - "loss": 0.1363, - "step": 4965 - }, - { - "epoch": 2.384264811705445, - "grad_norm": 1.739198799924586, - "learning_rate": 1.2274088329715218e-06, - "loss": 0.114, - "step": 4970 - }, - { - "epoch": 2.3866634684576638, - "grad_norm": 1.7400466578261822, - "learning_rate": 1.2182617934591518e-06, - "loss": 0.1263, - "step": 4975 - }, - { - "epoch": 2.3890621252098825, - "grad_norm": 1.7442631450776291, - "learning_rate": 1.209144234218188e-06, - "loss": 0.1183, - "step": 4980 - }, - { - "epoch": 2.391460781962101, - "grad_norm": 1.453171635685411, - "learning_rate": 1.2000562263239147e-06, - "loss": 0.1229, - "step": 4985 - }, - { - "epoch": 2.39385943871432, - "grad_norm": 1.774275653754628, - "learning_rate": 1.190997840621254e-06, - "loss": 0.1053, - "step": 4990 - }, - { - "epoch": 2.3962580954665387, - "grad_norm": 1.9739289764276322, - "learning_rate": 1.1819691477242069e-06, - "loss": 0.1166, - "step": 4995 - }, - { - "epoch": 2.3986567522187574, - "grad_norm": 1.5591416089972125, - "learning_rate": 1.172970218015307e-06, - "loss": 0.139, - "step": 5000 - }, - { - "epoch": 2.3986567522187574, - "eval_loss": 0.9828721284866333, - "eval_runtime": 739.7974, - "eval_samples_per_second": 10.016, - "eval_steps_per_second": 0.627, - "step": 5000 - }, - { - "epoch": 2.401055408970976, - "grad_norm": 1.6293250795813292, - "learning_rate": 1.164001121645069e-06, - "loss": 0.1341, - "step": 5005 - }, - { - "epoch": 2.403454065723195, - "grad_norm": 1.3132218762272834, - "learning_rate": 1.1550619285314506e-06, - "loss": 0.1555, - "step": 5010 - }, - { - "epoch": 2.4058527224754136, - "grad_norm": 1.9570731793037472, - "learning_rate": 1.146152708359294e-06, - "loss": 0.1604, - "step": 5015 - }, - { - "epoch": 2.4082513792276323, - "grad_norm": 1.5540385866208424, - "learning_rate": 1.1372735305797915e-06, - "loss": 0.13, - "step": 5020 - }, - { - "epoch": 2.4106500359798515, - "grad_norm": 1.5740122868688697, - "learning_rate": 1.1284244644099429e-06, - "loss": 0.1385, - "step": 5025 - }, - { - "epoch": 2.4130486927320702, - "grad_norm": 1.2901353951281462, - "learning_rate": 1.1196055788320148e-06, - "loss": 0.1055, - "step": 5030 - }, - { - "epoch": 2.415447349484289, - "grad_norm": 2.307961658041504, - "learning_rate": 1.1108169425930028e-06, - "loss": 0.1226, - "step": 5035 - }, - { - "epoch": 2.4178460062365077, - "grad_norm": 1.7431624927432852, - "learning_rate": 1.1020586242040943e-06, - "loss": 0.1233, - "step": 5040 - }, - { - "epoch": 2.4202446629887264, - "grad_norm": 1.7139513380658922, - "learning_rate": 1.09333069194014e-06, - "loss": 0.1123, - "step": 5045 - }, - { - "epoch": 2.422643319740945, - "grad_norm": 1.3214268846749446, - "learning_rate": 1.0846332138391158e-06, - "loss": 0.1282, - "step": 5050 - }, - { - "epoch": 2.425041976493164, - "grad_norm": 1.3545971596216881, - "learning_rate": 1.0759662577015934e-06, - "loss": 0.1234, - "step": 5055 - }, - { - "epoch": 2.4274406332453826, - "grad_norm": 1.5840901905123996, - "learning_rate": 1.067329891090213e-06, - "loss": 0.1085, - "step": 5060 - }, - { - "epoch": 2.4298392899976013, - "grad_norm": 1.8510314040612086, - "learning_rate": 1.0587241813291577e-06, - "loss": 0.1241, - "step": 5065 - }, - { - "epoch": 2.43223794674982, - "grad_norm": 1.6443534535540818, - "learning_rate": 1.0501491955036248e-06, - "loss": 0.1283, - "step": 5070 - }, - { - "epoch": 2.434636603502039, - "grad_norm": 1.630490467331264, - "learning_rate": 1.0416050004593064e-06, - "loss": 0.1309, - "step": 5075 - }, - { - "epoch": 2.4370352602542575, - "grad_norm": 1.546896056501047, - "learning_rate": 1.0330916628018706e-06, - "loss": 0.1251, - "step": 5080 - }, - { - "epoch": 2.4394339170064763, - "grad_norm": 2.444462757677636, - "learning_rate": 1.0246092488964338e-06, - "loss": 0.1567, - "step": 5085 - }, - { - "epoch": 2.441832573758695, - "grad_norm": 2.0106589140381788, - "learning_rate": 1.01615782486705e-06, - "loss": 0.1176, - "step": 5090 - }, - { - "epoch": 2.4442312305109137, - "grad_norm": 1.277389727131116, - "learning_rate": 1.0077374565961977e-06, - "loss": 0.1023, - "step": 5095 - }, - { - "epoch": 2.4466298872631325, - "grad_norm": 1.330955589156616, - "learning_rate": 9.993482097242568e-07, - "loss": 0.1302, - "step": 5100 - }, - { - "epoch": 2.449028544015351, - "grad_norm": 1.421052118495812, - "learning_rate": 9.90990149649006e-07, - "loss": 0.1065, - "step": 5105 - }, - { - "epoch": 2.4514272007675704, - "grad_norm": 2.470850062777196, - "learning_rate": 9.826633415251063e-07, - "loss": 0.0987, - "step": 5110 - }, - { - "epoch": 2.453825857519789, - "grad_norm": 1.9901504584119645, - "learning_rate": 9.743678502636028e-07, - "loss": 0.1376, - "step": 5115 - }, - { - "epoch": 2.456224514272008, - "grad_norm": 1.8474285543280216, - "learning_rate": 9.66103740531405e-07, - "loss": 0.143, - "step": 5120 - }, - { - "epoch": 2.4586231710242266, - "grad_norm": 1.7017419659624342, - "learning_rate": 9.578710767507938e-07, - "loss": 0.1094, - "step": 5125 - }, - { - "epoch": 2.4610218277764453, - "grad_norm": 1.3695825075670094, - "learning_rate": 9.496699230989149e-07, - "loss": 0.1131, - "step": 5130 - }, - { - "epoch": 2.463420484528664, - "grad_norm": 2.2153426080080645, - "learning_rate": 9.415003435072778e-07, - "loss": 0.1146, - "step": 5135 - }, - { - "epoch": 2.4658191412808828, - "grad_norm": 3.5120243652823238, - "learning_rate": 9.333624016612586e-07, - "loss": 0.1525, - "step": 5140 - }, - { - "epoch": 2.4682177980331015, - "grad_norm": 1.817355800367923, - "learning_rate": 9.252561609996075e-07, - "loss": 0.1159, - "step": 5145 - }, - { - "epoch": 2.47061645478532, - "grad_norm": 1.9821073225380732, - "learning_rate": 9.171816847139447e-07, - "loss": 0.1041, - "step": 5150 - }, - { - "epoch": 2.473015111537539, - "grad_norm": 1.459100052526109, - "learning_rate": 9.091390357482793e-07, - "loss": 0.1181, - "step": 5155 - }, - { - "epoch": 2.4754137682897577, - "grad_norm": 1.952875712371669, - "learning_rate": 9.011282767985069e-07, - "loss": 0.1098, - "step": 5160 - }, - { - "epoch": 2.4778124250419764, - "grad_norm": 1.534591950431475, - "learning_rate": 8.931494703119309e-07, - "loss": 0.1408, - "step": 5165 - }, - { - "epoch": 2.480211081794195, - "grad_norm": 1.7689562715329574, - "learning_rate": 8.852026784867701e-07, - "loss": 0.1313, - "step": 5170 - }, - { - "epoch": 2.482609738546414, - "grad_norm": 1.7339149389894069, - "learning_rate": 8.772879632716746e-07, - "loss": 0.1092, - "step": 5175 - }, - { - "epoch": 2.4850083952986326, - "grad_norm": 1.6464606444231056, - "learning_rate": 8.694053863652435e-07, - "loss": 0.1196, - "step": 5180 - }, - { - "epoch": 2.4874070520508513, - "grad_norm": 1.6648519910066735, - "learning_rate": 8.615550092155478e-07, - "loss": 0.0914, - "step": 5185 - }, - { - "epoch": 2.48980570880307, - "grad_norm": 1.7874580399443132, - "learning_rate": 8.53736893019641e-07, - "loss": 0.0952, - "step": 5190 - }, - { - "epoch": 2.4922043655552892, - "grad_norm": 1.4683803781623317, - "learning_rate": 8.459510987230957e-07, - "loss": 0.1495, - "step": 5195 - }, - { - "epoch": 2.494603022307508, - "grad_norm": 1.5568299629889224, - "learning_rate": 8.381976870195169e-07, - "loss": 0.1236, - "step": 5200 - }, - { - "epoch": 2.4970016790597267, - "grad_norm": 2.4873939051158263, - "learning_rate": 8.304767183500734e-07, - "loss": 0.1356, - "step": 5205 - }, - { - "epoch": 2.4994003358119454, - "grad_norm": 1.5690964117556039, - "learning_rate": 8.227882529030284e-07, - "loss": 0.1091, - "step": 5210 - }, - { - "epoch": 2.501798992564164, - "grad_norm": 1.6954049205320723, - "learning_rate": 8.151323506132663e-07, - "loss": 0.113, - "step": 5215 - }, - { - "epoch": 2.504197649316383, - "grad_norm": 1.5579032194602551, - "learning_rate": 8.075090711618322e-07, - "loss": 0.1085, - "step": 5220 - }, - { - "epoch": 2.5065963060686016, - "grad_norm": 1.764174191146571, - "learning_rate": 7.999184739754578e-07, - "loss": 0.1096, - "step": 5225 - }, - { - "epoch": 2.5089949628208204, - "grad_norm": 1.951667226426689, - "learning_rate": 7.92360618226104e-07, - "loss": 0.1288, - "step": 5230 - }, - { - "epoch": 2.511393619573039, - "grad_norm": 1.3975084205150126, - "learning_rate": 7.848355628304976e-07, - "loss": 0.1473, - "step": 5235 - }, - { - "epoch": 2.513792276325258, - "grad_norm": 1.4583105685332018, - "learning_rate": 7.773433664496738e-07, - "loss": 0.098, - "step": 5240 - }, - { - "epoch": 2.5161909330774765, - "grad_norm": 2.052173821531935, - "learning_rate": 7.698840874885161e-07, - "loss": 0.1119, - "step": 5245 - }, - { - "epoch": 2.5185895898296953, - "grad_norm": 2.1552887506028506, - "learning_rate": 7.624577840953046e-07, - "loss": 0.1497, - "step": 5250 - }, - { - "epoch": 2.520988246581914, - "grad_norm": 1.6928562717383377, - "learning_rate": 7.550645141612583e-07, - "loss": 0.1153, - "step": 5255 - }, - { - "epoch": 2.5233869033341327, - "grad_norm": 1.532142293479699, - "learning_rate": 7.477043353200891e-07, - "loss": 0.1266, - "step": 5260 - }, - { - "epoch": 2.5257855600863515, - "grad_norm": 1.5075112493817633, - "learning_rate": 7.403773049475477e-07, - "loss": 0.1195, - "step": 5265 - }, - { - "epoch": 2.5281842168385706, - "grad_norm": 1.4691450646957636, - "learning_rate": 7.330834801609782e-07, - "loss": 0.1091, - "step": 5270 - }, - { - "epoch": 2.530582873590789, - "grad_norm": 1.5131542211003923, - "learning_rate": 7.25822917818873e-07, - "loss": 0.1177, - "step": 5275 - }, - { - "epoch": 2.532981530343008, - "grad_norm": 1.3665626938316375, - "learning_rate": 7.185956745204298e-07, - "loss": 0.1369, - "step": 5280 - }, - { - "epoch": 2.535380187095227, - "grad_norm": 1.7874005964681707, - "learning_rate": 7.114018066051098e-07, - "loss": 0.127, - "step": 5285 - }, - { - "epoch": 2.5377788438474456, - "grad_norm": 2.660178675102634, - "learning_rate": 7.042413701522005e-07, - "loss": 0.1121, - "step": 5290 - }, - { - "epoch": 2.5401775005996643, - "grad_norm": 1.8021482036576246, - "learning_rate": 6.971144209803738e-07, - "loss": 0.1203, - "step": 5295 - }, - { - "epoch": 2.542576157351883, - "grad_norm": 3.5023632685823576, - "learning_rate": 6.900210146472564e-07, - "loss": 0.121, - "step": 5300 - }, - { - "epoch": 2.5449748141041018, - "grad_norm": 1.4117798013702134, - "learning_rate": 6.829612064489933e-07, - "loss": 0.1132, - "step": 5305 - }, - { - "epoch": 2.5473734708563205, - "grad_norm": 2.182251500140064, - "learning_rate": 6.759350514198171e-07, - "loss": 0.1195, - "step": 5310 - }, - { - "epoch": 2.549772127608539, - "grad_norm": 1.7117310660076108, - "learning_rate": 6.6894260433162e-07, - "loss": 0.1111, - "step": 5315 - }, - { - "epoch": 2.552170784360758, - "grad_norm": 1.5233014936758835, - "learning_rate": 6.619839196935251e-07, - "loss": 0.1361, - "step": 5320 - }, - { - "epoch": 2.5545694411129767, - "grad_norm": 1.3687130333207045, - "learning_rate": 6.550590517514666e-07, - "loss": 0.109, - "step": 5325 - }, - { - "epoch": 2.5569680978651954, - "grad_norm": 1.8037859341746436, - "learning_rate": 6.481680544877584e-07, - "loss": 0.1316, - "step": 5330 - }, - { - "epoch": 2.559366754617414, - "grad_norm": 2.2640573897300205, - "learning_rate": 6.413109816206803e-07, - "loss": 0.1333, - "step": 5335 - }, - { - "epoch": 2.561765411369633, - "grad_norm": 1.592815519460586, - "learning_rate": 6.344878866040571e-07, - "loss": 0.1116, - "step": 5340 - }, - { - "epoch": 2.5641640681218516, - "grad_norm": 1.9130650669785958, - "learning_rate": 6.276988226268388e-07, - "loss": 0.1327, - "step": 5345 - }, - { - "epoch": 2.5665627248740703, - "grad_norm": 1.7208032444949648, - "learning_rate": 6.209438426126946e-07, - "loss": 0.1693, - "step": 5350 - }, - { - "epoch": 2.5689613816262895, - "grad_norm": 1.5896417871178385, - "learning_rate": 6.142229992195886e-07, - "loss": 0.1156, - "step": 5355 - }, - { - "epoch": 2.571360038378508, - "grad_norm": 1.3231503619333465, - "learning_rate": 6.075363448393778e-07, - "loss": 0.0997, - "step": 5360 - }, - { - "epoch": 2.573758695130727, - "grad_norm": 2.3180058541025224, - "learning_rate": 6.00883931597403e-07, - "loss": 0.1207, - "step": 5365 - }, - { - "epoch": 2.5761573518829457, - "grad_norm": 1.8598922709868437, - "learning_rate": 5.942658113520778e-07, - "loss": 0.1187, - "step": 5370 - }, - { - "epoch": 2.5785560086351644, - "grad_norm": 1.741716946232789, - "learning_rate": 5.876820356944879e-07, - "loss": 0.1782, - "step": 5375 - }, - { - "epoch": 2.580954665387383, - "grad_norm": 2.0613835405768346, - "learning_rate": 5.811326559479885e-07, - "loss": 0.1276, - "step": 5380 - }, - { - "epoch": 2.583353322139602, - "grad_norm": 1.2306263180991488, - "learning_rate": 5.746177231678046e-07, - "loss": 0.1134, - "step": 5385 - }, - { - "epoch": 2.5857519788918206, - "grad_norm": 1.5217893010086523, - "learning_rate": 5.681372881406295e-07, - "loss": 0.1151, - "step": 5390 - }, - { - "epoch": 2.5881506356440394, - "grad_norm": 1.8109437720576, - "learning_rate": 5.616914013842362e-07, - "loss": 0.0977, - "step": 5395 - }, - { - "epoch": 2.590549292396258, - "grad_norm": 1.2622629020176752, - "learning_rate": 5.55280113147077e-07, - "loss": 0.1271, - "step": 5400 - }, - { - "epoch": 2.592947949148477, - "grad_norm": 1.571274143304735, - "learning_rate": 5.489034734078924e-07, - "loss": 0.1192, - "step": 5405 - }, - { - "epoch": 2.5953466059006955, - "grad_norm": 1.9698287341511391, - "learning_rate": 5.425615318753252e-07, - "loss": 0.1057, - "step": 5410 - }, - { - "epoch": 2.5977452626529143, - "grad_norm": 1.9701923754542536, - "learning_rate": 5.36254337987529e-07, - "loss": 0.1282, - "step": 5415 - }, - { - "epoch": 2.600143919405133, - "grad_norm": 2.2686032744150015, - "learning_rate": 5.299819409117857e-07, - "loss": 0.1429, - "step": 5420 - }, - { - "epoch": 2.6025425761573517, - "grad_norm": 1.2611154676332545, - "learning_rate": 5.237443895441213e-07, - "loss": 0.1188, - "step": 5425 - }, - { - "epoch": 2.604941232909571, - "grad_norm": 1.5048543165518988, - "learning_rate": 5.175417325089227e-07, - "loss": 0.1052, - "step": 5430 - }, - { - "epoch": 2.607339889661789, - "grad_norm": 1.5118763112028835, - "learning_rate": 5.113740181585646e-07, - "loss": 0.1258, - "step": 5435 - }, - { - "epoch": 2.6097385464140084, - "grad_norm": 2.3811131252232056, - "learning_rate": 5.05241294573024e-07, - "loss": 0.1081, - "step": 5440 - }, - { - "epoch": 2.6121372031662267, - "grad_norm": 2.454395400405562, - "learning_rate": 4.99143609559512e-07, - "loss": 0.1461, - "step": 5445 - }, - { - "epoch": 2.614535859918446, - "grad_norm": 1.5225164499950974, - "learning_rate": 4.930810106520983e-07, - "loss": 0.126, - "step": 5450 - }, - { - "epoch": 2.6169345166706646, - "grad_norm": 2.9727690647821166, - "learning_rate": 4.870535451113434e-07, - "loss": 0.0866, - "step": 5455 - }, - { - "epoch": 2.6193331734228833, - "grad_norm": 1.8766682156530599, - "learning_rate": 4.81061259923925e-07, - "loss": 0.1161, - "step": 5460 - }, - { - "epoch": 2.621731830175102, - "grad_norm": 1.3747965137374512, - "learning_rate": 4.751042018022761e-07, - "loss": 0.1177, - "step": 5465 - }, - { - "epoch": 2.6241304869273208, - "grad_norm": 2.4902927641849764, - "learning_rate": 4.691824171842219e-07, - "loss": 0.1545, - "step": 5470 - }, - { - "epoch": 2.6265291436795395, - "grad_norm": 2.20678634424907, - "learning_rate": 4.6329595223261193e-07, - "loss": 0.1212, - "step": 5475 - }, - { - "epoch": 2.628927800431758, - "grad_norm": 2.36358294096013, - "learning_rate": 4.574448528349662e-07, - "loss": 0.1337, - "step": 5480 - }, - { - "epoch": 2.631326457183977, - "grad_norm": 2.1209790191903153, - "learning_rate": 4.5162916460311346e-07, - "loss": 0.1327, - "step": 5485 - }, - { - "epoch": 2.6337251139361957, - "grad_norm": 2.1750708889962285, - "learning_rate": 4.4584893287283773e-07, - "loss": 0.1115, - "step": 5490 - }, - { - "epoch": 2.6361237706884144, - "grad_norm": 1.7181125564121318, - "learning_rate": 4.4010420270352463e-07, - "loss": 0.1152, - "step": 5495 - }, - { - "epoch": 2.638522427440633, - "grad_norm": 1.458326231995438, - "learning_rate": 4.3439501887781064e-07, - "loss": 0.1257, - "step": 5500 - }, - { - "epoch": 2.640921084192852, - "grad_norm": 1.349194936754368, - "learning_rate": 4.287214259012329e-07, - "loss": 0.1094, - "step": 5505 - }, - { - "epoch": 2.6433197409450706, - "grad_norm": 1.3226300725693332, - "learning_rate": 4.2308346800188193e-07, - "loss": 0.1287, - "step": 5510 - }, - { - "epoch": 2.64571839769729, - "grad_norm": 1.8303296939894655, - "learning_rate": 4.1748118913005675e-07, - "loss": 0.1119, - "step": 5515 - }, - { - "epoch": 2.648117054449508, - "grad_norm": 2.1148807551463658, - "learning_rate": 4.1191463295792545e-07, - "loss": 0.1229, - "step": 5520 - }, - { - "epoch": 2.6505157112017272, - "grad_norm": 1.8398900807422445, - "learning_rate": 4.063838428791805e-07, - "loss": 0.1479, - "step": 5525 - }, - { - "epoch": 2.6529143679539455, - "grad_norm": 1.9081399715986063, - "learning_rate": 4.0088886200870256e-07, - "loss": 0.1518, - "step": 5530 - }, - { - "epoch": 2.6553130247061647, - "grad_norm": 2.5071133379640123, - "learning_rate": 3.9542973318222376e-07, - "loss": 0.1453, - "step": 5535 - }, - { - "epoch": 2.6577116814583834, - "grad_norm": 1.3617983583001947, - "learning_rate": 3.900064989559965e-07, - "loss": 0.1369, - "step": 5540 - }, - { - "epoch": 2.660110338210602, - "grad_norm": 1.7350007966826908, - "learning_rate": 3.846192016064576e-07, - "loss": 0.1131, - "step": 5545 - }, - { - "epoch": 2.662508994962821, - "grad_norm": 1.4222833594841213, - "learning_rate": 3.7926788312989904e-07, - "loss": 0.1273, - "step": 5550 - }, - { - "epoch": 2.6649076517150396, - "grad_norm": 1.4888034133853167, - "learning_rate": 3.739525852421455e-07, - "loss": 0.1325, - "step": 5555 - }, - { - "epoch": 2.6673063084672584, - "grad_norm": 1.4598038003756197, - "learning_rate": 3.68673349378223e-07, - "loss": 0.1403, - "step": 5560 - }, - { - "epoch": 2.669704965219477, - "grad_norm": 1.595545300083971, - "learning_rate": 3.634302166920395e-07, - "loss": 0.1385, - "step": 5565 - }, - { - "epoch": 2.672103621971696, - "grad_norm": 1.7762121350952307, - "learning_rate": 3.5822322805606267e-07, - "loss": 0.1374, - "step": 5570 - }, - { - "epoch": 2.6745022787239146, - "grad_norm": 1.703923143430848, - "learning_rate": 3.5305242406100395e-07, - "loss": 0.1331, - "step": 5575 - }, - { - "epoch": 2.6769009354761333, - "grad_norm": 1.4304866394510962, - "learning_rate": 3.479178450154974e-07, - "loss": 0.1373, - "step": 5580 - }, - { - "epoch": 2.679299592228352, - "grad_norm": 1.7486370046393058, - "learning_rate": 3.4281953094578877e-07, - "loss": 0.1376, - "step": 5585 - }, - { - "epoch": 2.6816982489805707, - "grad_norm": 1.533651476967988, - "learning_rate": 3.3775752159542375e-07, - "loss": 0.1115, - "step": 5590 - }, - { - "epoch": 2.6840969057327895, - "grad_norm": 1.9305760137068588, - "learning_rate": 3.32731856424936e-07, - "loss": 0.1105, - "step": 5595 - }, - { - "epoch": 2.6864955624850086, - "grad_norm": 1.8129731172858077, - "learning_rate": 3.277425746115398e-07, - "loss": 0.112, - "step": 5600 - }, - { - "epoch": 2.688894219237227, - "grad_norm": 1.7960561922883571, - "learning_rate": 3.227897150488285e-07, - "loss": 0.1174, - "step": 5605 - }, - { - "epoch": 2.691292875989446, - "grad_norm": 2.0259280213774176, - "learning_rate": 3.178733163464676e-07, - "loss": 0.118, - "step": 5610 - }, - { - "epoch": 2.6936915327416644, - "grad_norm": 1.6681393489978669, - "learning_rate": 3.1299341682989313e-07, - "loss": 0.1258, - "step": 5615 - }, - { - "epoch": 2.6960901894938836, - "grad_norm": 1.5698047263231931, - "learning_rate": 3.08150054540014e-07, - "loss": 0.1089, - "step": 5620 - }, - { - "epoch": 2.6984888462461023, - "grad_norm": 1.8806087643705454, - "learning_rate": 3.033432672329173e-07, - "loss": 0.1584, - "step": 5625 - }, - { - "epoch": 2.700887502998321, - "grad_norm": 1.6181003470798663, - "learning_rate": 2.985730923795721e-07, - "loss": 0.1197, - "step": 5630 - }, - { - "epoch": 2.7032861597505398, - "grad_norm": 1.903755595617439, - "learning_rate": 2.93839567165537e-07, - "loss": 0.1082, - "step": 5635 - }, - { - "epoch": 2.7056848165027585, - "grad_norm": 1.4481162983792952, - "learning_rate": 2.891427284906706e-07, - "loss": 0.1235, - "step": 5640 - }, - { - "epoch": 2.7080834732549772, - "grad_norm": 1.3940387482587855, - "learning_rate": 2.8448261296884715e-07, - "loss": 0.1135, - "step": 5645 - }, - { - "epoch": 2.710482130007196, - "grad_norm": 1.601511394808099, - "learning_rate": 2.7985925692766426e-07, - "loss": 0.1317, - "step": 5650 - }, - { - "epoch": 2.7128807867594147, - "grad_norm": 1.3269174628464722, - "learning_rate": 2.7527269640816537e-07, - "loss": 0.1294, - "step": 5655 - }, - { - "epoch": 2.7152794435116334, - "grad_norm": 1.4993241080528643, - "learning_rate": 2.707229671645578e-07, - "loss": 0.1289, - "step": 5660 - }, - { - "epoch": 2.717678100263852, - "grad_norm": 1.779375793335884, - "learning_rate": 2.662101046639326e-07, - "loss": 0.1239, - "step": 5665 - }, - { - "epoch": 2.720076757016071, - "grad_norm": 1.9455875856444815, - "learning_rate": 2.617341440859883e-07, - "loss": 0.1219, - "step": 5670 - }, - { - "epoch": 2.7224754137682896, - "grad_norm": 1.672660061147154, - "learning_rate": 2.572951203227564e-07, - "loss": 0.1306, - "step": 5675 - }, - { - "epoch": 2.7248740705205083, - "grad_norm": 1.6430355652699298, - "learning_rate": 2.528930679783331e-07, - "loss": 0.0924, - "step": 5680 - }, - { - "epoch": 2.7272727272727275, - "grad_norm": 1.52811198720444, - "learning_rate": 2.485280213686031e-07, - "loss": 0.1083, - "step": 5685 - }, - { - "epoch": 2.729671384024946, - "grad_norm": 2.0031601863687545, - "learning_rate": 2.442000145209783e-07, - "loss": 0.1123, - "step": 5690 - }, - { - "epoch": 2.732070040777165, - "grad_norm": 2.183846855918397, - "learning_rate": 2.3990908117412725e-07, - "loss": 0.128, - "step": 5695 - }, - { - "epoch": 2.7344686975293837, - "grad_norm": 1.3334592284309938, - "learning_rate": 2.3565525477771635e-07, - "loss": 0.1044, - "step": 5700 - }, - { - "epoch": 2.7368673542816024, - "grad_norm": 1.7121458933661435, - "learning_rate": 2.3143856849214608e-07, - "loss": 0.1149, - "step": 5705 - }, - { - "epoch": 2.739266011033821, - "grad_norm": 1.6207522141153845, - "learning_rate": 2.2725905518829582e-07, - "loss": 0.1167, - "step": 5710 - }, - { - "epoch": 2.74166466778604, - "grad_norm": 1.2168116030947638, - "learning_rate": 2.231167474472651e-07, - "loss": 0.127, - "step": 5715 - }, - { - "epoch": 2.7440633245382586, - "grad_norm": 1.90993157648795, - "learning_rate": 2.190116775601181e-07, - "loss": 0.1255, - "step": 5720 - }, - { - "epoch": 2.7464619812904774, - "grad_norm": 1.6206812916963553, - "learning_rate": 2.149438775276358e-07, - "loss": 0.1203, - "step": 5725 - }, - { - "epoch": 2.748860638042696, - "grad_norm": 2.2235187423218092, - "learning_rate": 2.109133790600648e-07, - "loss": 0.1578, - "step": 5730 - }, - { - "epoch": 2.751259294794915, - "grad_norm": 1.8139066015122434, - "learning_rate": 2.0692021357686886e-07, - "loss": 0.1352, - "step": 5735 - }, - { - "epoch": 2.7536579515471336, - "grad_norm": 2.031694397609124, - "learning_rate": 2.0296441220648554e-07, - "loss": 0.1373, - "step": 5740 - }, - { - "epoch": 2.7560566082993523, - "grad_norm": 2.2355401023781516, - "learning_rate": 1.990460057860827e-07, - "loss": 0.1212, - "step": 5745 - }, - { - "epoch": 2.758455265051571, - "grad_norm": 1.7755393370077912, - "learning_rate": 1.9516502486132072e-07, - "loss": 0.1027, - "step": 5750 - }, - { - "epoch": 2.7608539218037897, - "grad_norm": 1.6871643573081383, - "learning_rate": 1.9132149968610903e-07, - "loss": 0.1231, - "step": 5755 - }, - { - "epoch": 2.7632525785560085, - "grad_norm": 1.7906723105033933, - "learning_rate": 1.875154602223761e-07, - "loss": 0.1286, - "step": 5760 - }, - { - "epoch": 2.765651235308227, - "grad_norm": 1.4557489056351423, - "learning_rate": 1.8374693613983197e-07, - "loss": 0.1195, - "step": 5765 - }, - { - "epoch": 2.7680498920604464, - "grad_norm": 1.2501535667579526, - "learning_rate": 1.800159568157378e-07, - "loss": 0.122, - "step": 5770 - }, - { - "epoch": 2.7704485488126647, - "grad_norm": 1.9933839036208927, - "learning_rate": 1.7632255133467836e-07, - "loss": 0.1157, - "step": 5775 - }, - { - "epoch": 2.772847205564884, - "grad_norm": 1.8434279839218284, - "learning_rate": 1.7266674848833155e-07, - "loss": 0.0927, - "step": 5780 - }, - { - "epoch": 2.7752458623171026, - "grad_norm": 1.5358444407642786, - "learning_rate": 1.6904857677525145e-07, - "loss": 0.1041, - "step": 5785 - }, - { - "epoch": 2.7776445190693213, - "grad_norm": 1.3531209392466852, - "learning_rate": 1.654680644006368e-07, - "loss": 0.1018, - "step": 5790 - }, - { - "epoch": 2.78004317582154, - "grad_norm": 2.1858517794960783, - "learning_rate": 1.6192523927611835e-07, - "loss": 0.119, - "step": 5795 - }, - { - "epoch": 2.7824418325737588, - "grad_norm": 1.5246185793951936, - "learning_rate": 1.584201290195364e-07, - "loss": 0.1271, - "step": 5800 - }, - { - "epoch": 2.7848404893259775, - "grad_norm": 1.6607987943748377, - "learning_rate": 1.5495276095472967e-07, - "loss": 0.1175, - "step": 5805 - }, - { - "epoch": 2.7872391460781962, - "grad_norm": 1.5511169416761914, - "learning_rate": 1.5152316211131835e-07, - "loss": 0.1189, - "step": 5810 - }, - { - "epoch": 2.789637802830415, - "grad_norm": 1.3636834104396771, - "learning_rate": 1.481313592244976e-07, - "loss": 0.0974, - "step": 5815 - }, - { - "epoch": 2.7920364595826337, - "grad_norm": 1.7780659172726045, - "learning_rate": 1.4477737873482556e-07, - "loss": 0.116, - "step": 5820 - }, - { - "epoch": 2.7944351163348524, - "grad_norm": 1.9201443415453343, - "learning_rate": 1.4146124678801886e-07, - "loss": 0.1048, - "step": 5825 - }, - { - "epoch": 2.796833773087071, - "grad_norm": 1.6060426355740423, - "learning_rate": 1.381829892347486e-07, - "loss": 0.1134, - "step": 5830 - }, - { - "epoch": 2.79923242983929, - "grad_norm": 1.8753892090672297, - "learning_rate": 1.3494263163043752e-07, - "loss": 0.1022, - "step": 5835 - }, - { - "epoch": 2.8016310865915086, - "grad_norm": 1.7909391408691406, - "learning_rate": 1.3174019923506365e-07, - "loss": 0.1083, - "step": 5840 - }, - { - "epoch": 2.804029743343728, - "grad_norm": 1.8615512629565283, - "learning_rate": 1.2857571701296146e-07, - "loss": 0.1248, - "step": 5845 - }, - { - "epoch": 2.806428400095946, - "grad_norm": 1.4977699048787447, - "learning_rate": 1.2544920963262653e-07, - "loss": 0.1261, - "step": 5850 - }, - { - "epoch": 2.8088270568481652, - "grad_norm": 1.8717329377806204, - "learning_rate": 1.2236070146652512e-07, - "loss": 0.1123, - "step": 5855 - }, - { - "epoch": 2.8112257136003835, - "grad_norm": 1.8892706724274941, - "learning_rate": 1.1931021659090325e-07, - "loss": 0.1276, - "step": 5860 - }, - { - "epoch": 2.8136243703526027, - "grad_norm": 1.691896819156689, - "learning_rate": 1.1629777878559956e-07, - "loss": 0.1244, - "step": 5865 - }, - { - "epoch": 2.8160230271048214, - "grad_norm": 1.8682571614765742, - "learning_rate": 1.1332341153385773e-07, - "loss": 0.1355, - "step": 5870 - }, - { - "epoch": 2.81842168385704, - "grad_norm": 1.2268638639315688, - "learning_rate": 1.1038713802214718e-07, - "loss": 0.1225, - "step": 5875 - }, - { - "epoch": 2.820820340609259, - "grad_norm": 2.0069927297948555, - "learning_rate": 1.0748898113997875e-07, - "loss": 0.1195, - "step": 5880 - }, - { - "epoch": 2.8232189973614776, - "grad_norm": 2.308460157379254, - "learning_rate": 1.0462896347972818e-07, - "loss": 0.1174, - "step": 5885 - }, - { - "epoch": 2.8256176541136964, - "grad_norm": 2.0122573238619075, - "learning_rate": 1.0180710733646127e-07, - "loss": 0.1556, - "step": 5890 - }, - { - "epoch": 2.828016310865915, - "grad_norm": 1.3824343924619586, - "learning_rate": 9.902343470775566e-08, - "loss": 0.1403, - "step": 5895 - }, - { - "epoch": 2.830414967618134, - "grad_norm": 2.967278007524201, - "learning_rate": 9.627796729353434e-08, - "loss": 0.1106, - "step": 5900 - }, - { - "epoch": 2.8328136243703526, - "grad_norm": 1.6475498343063206, - "learning_rate": 9.357072649589238e-08, - "loss": 0.1169, - "step": 5905 - }, - { - "epoch": 2.8352122811225713, - "grad_norm": 1.2585469032506666, - "learning_rate": 9.090173341893383e-08, - "loss": 0.1277, - "step": 5910 - }, - { - "epoch": 2.83761093787479, - "grad_norm": 1.1238112527321122, - "learning_rate": 8.827100886860451e-08, - "loss": 0.1043, - "step": 5915 - }, - { - "epoch": 2.8400095946270087, - "grad_norm": 1.488550388006429, - "learning_rate": 8.567857335253005e-08, - "loss": 0.13, - "step": 5920 - }, - { - "epoch": 2.8424082513792275, - "grad_norm": 1.3312873362378537, - "learning_rate": 8.312444707985811e-08, - "loss": 0.1245, - "step": 5925 - }, - { - "epoch": 2.8448069081314467, - "grad_norm": 2.290161387016882, - "learning_rate": 8.060864996109807e-08, - "loss": 0.1307, - "step": 5930 - }, - { - "epoch": 2.847205564883665, - "grad_norm": 1.9033120811547106, - "learning_rate": 7.813120160796772e-08, - "loss": 0.1371, - "step": 5935 - }, - { - "epoch": 2.849604221635884, - "grad_norm": 2.0871629668196676, - "learning_rate": 7.569212133323956e-08, - "loss": 0.1239, - "step": 5940 - }, - { - "epoch": 2.8520028783881024, - "grad_norm": 1.4650768736158892, - "learning_rate": 7.329142815059032e-08, - "loss": 0.119, - "step": 5945 - }, - { - "epoch": 2.8544015351403216, - "grad_norm": 1.1743412799238815, - "learning_rate": 7.092914077445335e-08, - "loss": 0.1018, - "step": 5950 - }, - { - "epoch": 2.8568001918925403, - "grad_norm": 1.5498522487672175, - "learning_rate": 6.860527761987145e-08, - "loss": 0.1197, - "step": 5955 - }, - { - "epoch": 2.859198848644759, - "grad_norm": 1.6056923353016053, - "learning_rate": 6.631985680235487e-08, - "loss": 0.1115, - "step": 5960 - }, - { - "epoch": 2.8615975053969778, - "grad_norm": 1.791424778629989, - "learning_rate": 6.407289613774014e-08, - "loss": 0.1414, - "step": 5965 - }, - { - "epoch": 2.8639961621491965, - "grad_norm": 1.4221448461917148, - "learning_rate": 6.186441314204816e-08, - "loss": 0.1289, - "step": 5970 - }, - { - "epoch": 2.8663948189014152, - "grad_norm": 1.495279144021361, - "learning_rate": 5.969442503135192e-08, - "loss": 0.112, - "step": 5975 - }, - { - "epoch": 2.868793475653634, - "grad_norm": 2.545350173896225, - "learning_rate": 5.756294872163948e-08, - "loss": 0.131, - "step": 5980 - }, - { - "epoch": 2.8711921324058527, - "grad_norm": 1.2008746098462784, - "learning_rate": 5.547000082868237e-08, - "loss": 0.1219, - "step": 5985 - }, - { - "epoch": 2.8735907891580714, - "grad_norm": 1.487751985051309, - "learning_rate": 5.341559766790738e-08, - "loss": 0.1323, - "step": 5990 - }, - { - "epoch": 2.87598944591029, - "grad_norm": 1.3980272709675226, - "learning_rate": 5.139975525426833e-08, - "loss": 0.1467, - "step": 5995 - }, - { - "epoch": 2.878388102662509, - "grad_norm": 2.10880272054709, - "learning_rate": 4.942248930212224e-08, - "loss": 0.146, - "step": 6000 - }, - { - "epoch": 2.878388102662509, - "eval_loss": 0.9810638427734375, - "eval_runtime": 739.6233, - "eval_samples_per_second": 10.019, - "eval_steps_per_second": 0.627, - "step": 6000 - }, - { - "epoch": 2.8807867594147276, - "grad_norm": 1.483009143164286, - "learning_rate": 4.748381522510392e-08, - "loss": 0.1158, - "step": 6005 - }, - { - "epoch": 2.8831854161669463, - "grad_norm": 1.539223427422332, - "learning_rate": 4.558374813601052e-08, - "loss": 0.1336, - "step": 6010 - }, - { - "epoch": 2.8855840729191655, - "grad_norm": 1.622424329571053, - "learning_rate": 4.372230284667878e-08, - "loss": 0.1108, - "step": 6015 - }, - { - "epoch": 2.887982729671384, - "grad_norm": 1.7328410726207495, - "learning_rate": 4.189949386787462e-08, - "loss": 0.1207, - "step": 6020 - }, - { - "epoch": 2.890381386423603, - "grad_norm": 1.954059556795084, - "learning_rate": 4.011533540917489e-08, - "loss": 0.1056, - "step": 6025 - }, - { - "epoch": 2.8927800431758213, - "grad_norm": 2.137802294017243, - "learning_rate": 3.836984137886024e-08, - "loss": 0.1277, - "step": 6030 - }, - { - "epoch": 2.8951786999280404, - "grad_norm": 1.6698284932650442, - "learning_rate": 3.6663025383805174e-08, - "loss": 0.1172, - "step": 6035 - }, - { - "epoch": 2.897577356680259, - "grad_norm": 1.6220510726257216, - "learning_rate": 3.4994900729373173e-08, - "loss": 0.1232, - "step": 6040 - }, - { - "epoch": 2.899976013432478, - "grad_norm": 1.9213808029719575, - "learning_rate": 3.3365480419310646e-08, - "loss": 0.1151, - "step": 6045 - }, - { - "epoch": 2.9023746701846966, - "grad_norm": 1.6501341986627394, - "learning_rate": 3.177477715564869e-08, - "loss": 0.1006, - "step": 6050 - }, - { - "epoch": 2.9047733269369154, - "grad_norm": 1.9659614646400356, - "learning_rate": 3.0222803338600394e-08, - "loss": 0.1014, - "step": 6055 - }, - { - "epoch": 2.907171983689134, - "grad_norm": 2.060843304606416, - "learning_rate": 2.87095710664681e-08, - "loss": 0.1278, - "step": 6060 - }, - { - "epoch": 2.909570640441353, - "grad_norm": 1.4858740242674862, - "learning_rate": 2.723509213554576e-08, - "loss": 0.117, - "step": 6065 - }, - { - "epoch": 2.9119692971935716, - "grad_norm": 1.6542843067890352, - "learning_rate": 2.5799378040030075e-08, - "loss": 0.1243, - "step": 6070 - }, - { - "epoch": 2.9143679539457903, - "grad_norm": 1.2106121391404152, - "learning_rate": 2.4402439971927816e-08, - "loss": 0.1129, - "step": 6075 - }, - { - "epoch": 2.916766610698009, - "grad_norm": 1.637222521799144, - "learning_rate": 2.3044288820969764e-08, - "loss": 0.1147, - "step": 6080 - }, - { - "epoch": 2.9191652674502278, - "grad_norm": 1.4227992719038707, - "learning_rate": 2.1724935174528006e-08, - "loss": 0.1277, - "step": 6085 - }, - { - "epoch": 2.9215639242024465, - "grad_norm": 1.6446009283719198, - "learning_rate": 2.044438931752879e-08, - "loss": 0.1555, - "step": 6090 - }, - { - "epoch": 2.923962580954665, - "grad_norm": 1.8090094335987503, - "learning_rate": 1.9202661232377018e-08, - "loss": 0.1245, - "step": 6095 - }, - { - "epoch": 2.9263612377068844, - "grad_norm": 1.3319652375080722, - "learning_rate": 1.799976059887576e-08, - "loss": 0.1353, - "step": 6100 - }, - { - "epoch": 2.9287598944591027, - "grad_norm": 1.5406269983982313, - "learning_rate": 1.6835696794151312e-08, - "loss": 0.1212, - "step": 6105 - }, - { - "epoch": 2.931158551211322, - "grad_norm": 1.690521455881083, - "learning_rate": 1.5710478892579927e-08, - "loss": 0.1586, - "step": 6110 - }, - { - "epoch": 2.9335572079635406, - "grad_norm": 1.705355483123506, - "learning_rate": 1.4624115665717865e-08, - "loss": 0.1324, - "step": 6115 - }, - { - "epoch": 2.9359558647157593, - "grad_norm": 1.7101043349695386, - "learning_rate": 1.3576615582233666e-08, - "loss": 0.0989, - "step": 6120 - }, - { - "epoch": 2.938354521467978, - "grad_norm": 1.6013100594407221, - "learning_rate": 1.2567986807838217e-08, - "loss": 0.095, - "step": 6125 - }, - { - "epoch": 2.9407531782201968, - "grad_norm": 1.906186669455512, - "learning_rate": 1.1598237205225904e-08, - "loss": 0.1555, - "step": 6130 - }, - { - "epoch": 2.9431518349724155, - "grad_norm": 1.60057625401919, - "learning_rate": 1.0667374334011327e-08, - "loss": 0.1195, - "step": 6135 - }, - { - "epoch": 2.9455504917246342, - "grad_norm": 1.854263325199233, - "learning_rate": 9.77540545066935e-09, - "loss": 0.1251, - "step": 6140 - }, - { - "epoch": 2.947949148476853, - "grad_norm": 1.453650090583012, - "learning_rate": 8.92233750848015e-09, - "loss": 0.1021, - "step": 6145 - }, - { - "epoch": 2.9503478052290717, - "grad_norm": 1.4057914251921044, - "learning_rate": 8.10817715747425e-09, - "loss": 0.1061, - "step": 6150 - }, - { - "epoch": 2.9527464619812904, - "grad_norm": 2.005590745242888, - "learning_rate": 7.332930744380906e-09, - "loss": 0.1232, - "step": 6155 - }, - { - "epoch": 2.955145118733509, - "grad_norm": 1.6052684537774102, - "learning_rate": 6.596604312578136e-09, - "loss": 0.0927, - "step": 6160 - }, - { - "epoch": 2.957543775485728, - "grad_norm": 1.4712133769137725, - "learning_rate": 5.899203602046655e-09, - "loss": 0.1218, - "step": 6165 - }, - { - "epoch": 2.9599424322379466, - "grad_norm": 1.6115906553260235, - "learning_rate": 5.240734049323237e-09, - "loss": 0.1501, - "step": 6170 - }, - { - "epoch": 2.9623410889901653, - "grad_norm": 1.5596040657708192, - "learning_rate": 4.621200787461866e-09, - "loss": 0.1269, - "step": 6175 - }, - { - "epoch": 2.964739745742384, - "grad_norm": 1.2433232808545898, - "learning_rate": 4.0406086459893194e-09, - "loss": 0.1087, - "step": 6180 - }, - { - "epoch": 2.9671384024946033, - "grad_norm": 1.520664803555638, - "learning_rate": 3.4989621508707548e-09, - "loss": 0.1001, - "step": 6185 - }, - { - "epoch": 2.9695370592468215, - "grad_norm": 2.213337556588271, - "learning_rate": 2.996265524472519e-09, - "loss": 0.1257, - "step": 6190 - }, - { - "epoch": 2.9719357159990407, - "grad_norm": 1.551517099633644, - "learning_rate": 2.532522685529948e-09, - "loss": 0.1055, - "step": 6195 - }, - { - "epoch": 2.9743343727512594, - "grad_norm": 1.5250172751119109, - "learning_rate": 2.107737249118502e-09, - "loss": 0.1058, - "step": 6200 - }, - { - "epoch": 2.976733029503478, - "grad_norm": 2.028832683383426, - "learning_rate": 1.7219125266221271e-09, - "loss": 0.1005, - "step": 6205 - }, - { - "epoch": 2.979131686255697, - "grad_norm": 3.659500022563047, - "learning_rate": 1.3750515257104913e-09, - "loss": 0.1084, - "step": 6210 - }, - { - "epoch": 2.9815303430079156, - "grad_norm": 1.8413767130375305, - "learning_rate": 1.0671569503134528e-09, - "loss": 0.1046, - "step": 6215 - }, - { - "epoch": 2.9839289997601344, - "grad_norm": 1.3540342868737936, - "learning_rate": 7.982312006010739e-10, - "loss": 0.1176, - "step": 6220 - }, - { - "epoch": 2.986327656512353, - "grad_norm": 1.0218623424780813, - "learning_rate": 5.68276372965304e-10, - "loss": 0.0957, - "step": 6225 - }, - { - "epoch": 2.988726313264572, - "grad_norm": 1.4590475891432568, - "learning_rate": 3.772942600033247e-10, - "loss": 0.1145, - "step": 6230 - }, - { - "epoch": 2.9911249700167906, - "grad_norm": 1.639227053529018, - "learning_rate": 2.2528635050145243e-10, - "loss": 0.1, - "step": 6235 - }, - { - "epoch": 2.9935236267690093, - "grad_norm": 1.3287877021744292, - "learning_rate": 1.1225382942736673e-10, - "loss": 0.0781, - "step": 6240 - }, - { - "epoch": 2.995922283521228, - "grad_norm": 1.86336909717487, - "learning_rate": 3.819757791734269e-11, - "loss": 0.1027, - "step": 6245 - }, - { - "epoch": 2.9983209402734468, - "grad_norm": 1.1404093519785887, - "learning_rate": 3.1181732718099656e-12, - "loss": 0.1106, - "step": 6250 - }, - { - "epoch": 2.9992804029743345, - "step": 6252, - "total_flos": 3.055078882714583e+17, - "train_loss": 0.20201521048409316, - "train_runtime": 48283.8245, - "train_samples_per_second": 4.144, - "train_steps_per_second": 0.129 + "epoch": 2.9964020148716717, + "step": 2082, + "total_flos": 5.073775214995702e+17, + "train_loss": 0.5172660127031643, + "train_runtime": 62603.1223, + "train_samples_per_second": 3.196, + "train_steps_per_second": 0.033 } ], "logging_steps": 5, - "max_steps": 6252, + "max_steps": 2082, "num_input_tokens_seen": 0, "num_train_epochs": 3, - "save_steps": 1000, + "save_steps": 700, "stateful_callbacks": { "TrainerControl": { "args": { @@ -8977,7 +2955,7 @@ "attributes": {} } }, - "total_flos": 3.055078882714583e+17, + "total_flos": 5.073775214995702e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null