diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4550 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 50.0, + "eval_steps": 500, + "global_step": 322050, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0776276975624903, + "grad_norm": 2.2886219024658203, + "learning_rate": 4.992237230243752e-05, + "loss": 6.7024, + "step": 500 + }, + { + "epoch": 0.1552553951249806, + "grad_norm": 2.4521589279174805, + "learning_rate": 4.9844744604875024e-05, + "loss": 5.9478, + "step": 1000 + }, + { + "epoch": 0.2328830926874709, + "grad_norm": 2.662937641143799, + "learning_rate": 4.976711690731253e-05, + "loss": 5.6144, + "step": 1500 + }, + { + "epoch": 0.3105107902499612, + "grad_norm": 2.6923885345458984, + "learning_rate": 4.968948920975004e-05, + "loss": 5.3722, + "step": 2000 + }, + { + "epoch": 0.38813848781245147, + "grad_norm": 3.0573277473449707, + "learning_rate": 4.9611861512187554e-05, + "loss": 5.2099, + "step": 2500 + }, + { + "epoch": 0.4657661853749418, + "grad_norm": 2.492630958557129, + "learning_rate": 4.953423381462506e-05, + "loss": 5.0823, + "step": 3000 + }, + { + "epoch": 0.5433938829374321, + "grad_norm": 2.724647045135498, + "learning_rate": 4.945660611706257e-05, + "loss": 4.9667, + "step": 3500 + }, + { + "epoch": 0.6210215804999224, + "grad_norm": 2.4619314670562744, + "learning_rate": 4.937897841950008e-05, + "loss": 4.8775, + "step": 4000 + }, + { + "epoch": 0.6986492780624126, + "grad_norm": 2.758463144302368, + "learning_rate": 4.930135072193759e-05, + "loss": 4.7939, + "step": 4500 + }, + { + "epoch": 0.7762769756249029, + "grad_norm": 3.0213730335235596, + "learning_rate": 4.92237230243751e-05, + "loss": 4.7288, + "step": 5000 + }, + { + "epoch": 0.8539046731873933, + "grad_norm": 2.7506508827209473, + "learning_rate": 4.9146095326812606e-05, + "loss": 4.6724, + "step": 5500 + }, + { + "epoch": 0.9315323707498836, + "grad_norm": 3.3897273540496826, + "learning_rate": 4.906846762925012e-05, + "loss": 4.6203, + "step": 6000 + }, + { + "epoch": 1.0091600683123738, + "grad_norm": 3.2612226009368896, + "learning_rate": 4.899083993168763e-05, + "loss": 4.564, + "step": 6500 + }, + { + "epoch": 1.0867877658748641, + "grad_norm": 2.9909706115722656, + "learning_rate": 4.891321223412514e-05, + "loss": 4.4841, + "step": 7000 + }, + { + "epoch": 1.1644154634373545, + "grad_norm": 3.00471830368042, + "learning_rate": 4.883558453656264e-05, + "loss": 4.4436, + "step": 7500 + }, + { + "epoch": 1.2420431609998448, + "grad_norm": 3.588019609451294, + "learning_rate": 4.875795683900016e-05, + "loss": 4.4248, + "step": 8000 + }, + { + "epoch": 1.319670858562335, + "grad_norm": 3.1261277198791504, + "learning_rate": 4.868032914143767e-05, + "loss": 4.3914, + "step": 8500 + }, + { + "epoch": 1.3972985561248255, + "grad_norm": 3.248203754425049, + "learning_rate": 4.860270144387518e-05, + "loss": 4.3539, + "step": 9000 + }, + { + "epoch": 1.4749262536873156, + "grad_norm": 3.6183948516845703, + "learning_rate": 4.8525073746312687e-05, + "loss": 4.3439, + "step": 9500 + }, + { + "epoch": 1.5525539512498059, + "grad_norm": 3.6323795318603516, + "learning_rate": 4.8447446048750194e-05, + "loss": 4.3104, + "step": 10000 + }, + { + "epoch": 1.6301816488122962, + "grad_norm": 3.8555796146392822, + "learning_rate": 4.836981835118771e-05, + "loss": 4.2775, + "step": 10500 + }, + { + "epoch": 
1.7078093463747865, + "grad_norm": 3.804065465927124, + "learning_rate": 4.8292190653625216e-05, + "loss": 4.2645, + "step": 11000 + }, + { + "epoch": 1.7854370439372769, + "grad_norm": 3.5051915645599365, + "learning_rate": 4.8214562956062723e-05, + "loss": 4.2555, + "step": 11500 + }, + { + "epoch": 1.863064741499767, + "grad_norm": 3.28206205368042, + "learning_rate": 4.813693525850024e-05, + "loss": 4.2254, + "step": 12000 + }, + { + "epoch": 1.9406924390622575, + "grad_norm": 3.6532084941864014, + "learning_rate": 4.8059307560937745e-05, + "loss": 4.2142, + "step": 12500 + }, + { + "epoch": 2.0183201366247476, + "grad_norm": 3.8629403114318848, + "learning_rate": 4.798167986337525e-05, + "loss": 4.1695, + "step": 13000 + }, + { + "epoch": 2.095947834187238, + "grad_norm": 3.7742209434509277, + "learning_rate": 4.790405216581276e-05, + "loss": 4.1056, + "step": 13500 + }, + { + "epoch": 2.1735755317497283, + "grad_norm": 3.638509750366211, + "learning_rate": 4.7826424468250275e-05, + "loss": 4.0926, + "step": 14000 + }, + { + "epoch": 2.2512032293122184, + "grad_norm": 3.4432594776153564, + "learning_rate": 4.774879677068778e-05, + "loss": 4.0826, + "step": 14500 + }, + { + "epoch": 2.328830926874709, + "grad_norm": 3.252643346786499, + "learning_rate": 4.76711690731253e-05, + "loss": 4.073, + "step": 15000 + }, + { + "epoch": 2.406458624437199, + "grad_norm": 3.611611843109131, + "learning_rate": 4.7593541375562804e-05, + "loss": 4.0556, + "step": 15500 + }, + { + "epoch": 2.4840863219996896, + "grad_norm": 3.842820644378662, + "learning_rate": 4.751591367800031e-05, + "loss": 4.0538, + "step": 16000 + }, + { + "epoch": 2.5617140195621797, + "grad_norm": 4.127362251281738, + "learning_rate": 4.7438285980437826e-05, + "loss": 4.0186, + "step": 16500 + }, + { + "epoch": 2.63934171712467, + "grad_norm": 3.498431921005249, + "learning_rate": 4.7360658282875334e-05, + "loss": 3.9995, + "step": 17000 + }, + { + "epoch": 2.7169694146871604, + "grad_norm": 3.7191123962402344, + "learning_rate": 4.728303058531284e-05, + "loss": 4.0059, + "step": 17500 + }, + { + "epoch": 2.794597112249651, + "grad_norm": 3.748997688293457, + "learning_rate": 4.720540288775035e-05, + "loss": 3.9807, + "step": 18000 + }, + { + "epoch": 2.872224809812141, + "grad_norm": 3.91758394241333, + "learning_rate": 4.712777519018786e-05, + "loss": 3.9752, + "step": 18500 + }, + { + "epoch": 2.949852507374631, + "grad_norm": 4.286660671234131, + "learning_rate": 4.705014749262537e-05, + "loss": 3.9597, + "step": 19000 + }, + { + "epoch": 3.0274802049371217, + "grad_norm": 4.166433334350586, + "learning_rate": 4.697251979506288e-05, + "loss": 3.9264, + "step": 19500 + }, + { + "epoch": 3.1051079024996118, + "grad_norm": 4.093895435333252, + "learning_rate": 4.689489209750039e-05, + "loss": 3.8771, + "step": 20000 + }, + { + "epoch": 3.1827356000621023, + "grad_norm": 3.8036608695983887, + "learning_rate": 4.68172643999379e-05, + "loss": 3.8691, + "step": 20500 + }, + { + "epoch": 3.2603632976245924, + "grad_norm": 3.8469622135162354, + "learning_rate": 4.6739636702375414e-05, + "loss": 3.8456, + "step": 21000 + }, + { + "epoch": 3.3379909951870825, + "grad_norm": 4.524165630340576, + "learning_rate": 4.6662009004812915e-05, + "loss": 3.8516, + "step": 21500 + }, + { + "epoch": 3.415618692749573, + "grad_norm": 4.203705310821533, + "learning_rate": 4.658438130725043e-05, + "loss": 3.8486, + "step": 22000 + }, + { + "epoch": 3.493246390312063, + "grad_norm": 3.79025936126709, + "learning_rate": 4.650675360968794e-05, + 
"loss": 3.8466, + "step": 22500 + }, + { + "epoch": 3.5708740878745537, + "grad_norm": 4.120058059692383, + "learning_rate": 4.642912591212545e-05, + "loss": 3.8195, + "step": 23000 + }, + { + "epoch": 3.648501785437044, + "grad_norm": 4.125455379486084, + "learning_rate": 4.635149821456296e-05, + "loss": 3.7975, + "step": 23500 + }, + { + "epoch": 3.7261294829995344, + "grad_norm": 4.129229545593262, + "learning_rate": 4.6273870517000466e-05, + "loss": 3.8115, + "step": 24000 + }, + { + "epoch": 3.8037571805620245, + "grad_norm": 4.444260597229004, + "learning_rate": 4.619624281943798e-05, + "loss": 3.8045, + "step": 24500 + }, + { + "epoch": 3.881384878124515, + "grad_norm": 4.36641788482666, + "learning_rate": 4.611861512187549e-05, + "loss": 3.813, + "step": 25000 + }, + { + "epoch": 3.959012575687005, + "grad_norm": 4.3214802742004395, + "learning_rate": 4.6040987424312996e-05, + "loss": 3.7778, + "step": 25500 + }, + { + "epoch": 4.036640273249495, + "grad_norm": 4.101747989654541, + "learning_rate": 4.5963359726750503e-05, + "loss": 3.7416, + "step": 26000 + }, + { + "epoch": 4.114267970811985, + "grad_norm": 4.384554386138916, + "learning_rate": 4.588573202918802e-05, + "loss": 3.7074, + "step": 26500 + }, + { + "epoch": 4.191895668374476, + "grad_norm": 4.370575904846191, + "learning_rate": 4.580810433162553e-05, + "loss": 3.7012, + "step": 27000 + }, + { + "epoch": 4.2695233659369665, + "grad_norm": 4.443875789642334, + "learning_rate": 4.573047663406303e-05, + "loss": 3.691, + "step": 27500 + }, + { + "epoch": 4.347151063499457, + "grad_norm": 4.347660064697266, + "learning_rate": 4.565284893650055e-05, + "loss": 3.6706, + "step": 28000 + }, + { + "epoch": 4.424778761061947, + "grad_norm": 4.289429187774658, + "learning_rate": 4.5575221238938055e-05, + "loss": 3.698, + "step": 28500 + }, + { + "epoch": 4.502406458624437, + "grad_norm": 4.255033016204834, + "learning_rate": 4.549759354137557e-05, + "loss": 3.6576, + "step": 29000 + }, + { + "epoch": 4.580034156186928, + "grad_norm": 4.466300010681152, + "learning_rate": 4.541996584381307e-05, + "loss": 3.6684, + "step": 29500 + }, + { + "epoch": 4.657661853749418, + "grad_norm": 4.410152435302734, + "learning_rate": 4.5342338146250584e-05, + "loss": 3.6477, + "step": 30000 + }, + { + "epoch": 4.735289551311908, + "grad_norm": 4.257645130157471, + "learning_rate": 4.52647104486881e-05, + "loss": 3.6531, + "step": 30500 + }, + { + "epoch": 4.812917248874398, + "grad_norm": 4.475682258605957, + "learning_rate": 4.5187082751125606e-05, + "loss": 3.6587, + "step": 31000 + }, + { + "epoch": 4.890544946436888, + "grad_norm": 4.372265338897705, + "learning_rate": 4.5109455053563114e-05, + "loss": 3.632, + "step": 31500 + }, + { + "epoch": 4.968172643999379, + "grad_norm": 4.2151360511779785, + "learning_rate": 4.503182735600062e-05, + "loss": 3.6336, + "step": 32000 + }, + { + "epoch": 5.045800341561869, + "grad_norm": 4.397316932678223, + "learning_rate": 4.4954199658438135e-05, + "loss": 3.566, + "step": 32500 + }, + { + "epoch": 5.123428039124359, + "grad_norm": 4.471977710723877, + "learning_rate": 4.487657196087564e-05, + "loss": 3.5522, + "step": 33000 + }, + { + "epoch": 5.2010557366868495, + "grad_norm": 4.2865471839904785, + "learning_rate": 4.479894426331315e-05, + "loss": 3.5675, + "step": 33500 + }, + { + "epoch": 5.2786834342493405, + "grad_norm": 4.559909343719482, + "learning_rate": 4.472131656575066e-05, + "loss": 3.54, + "step": 34000 + }, + { + "epoch": 5.356311131811831, + "grad_norm": 4.453431606292725, + 
"learning_rate": 4.464368886818817e-05, + "loss": 3.5392, + "step": 34500 + }, + { + "epoch": 5.433938829374321, + "grad_norm": 4.54495906829834, + "learning_rate": 4.456606117062569e-05, + "loss": 3.5424, + "step": 35000 + }, + { + "epoch": 5.511566526936811, + "grad_norm": 4.494850158691406, + "learning_rate": 4.448843347306319e-05, + "loss": 3.5414, + "step": 35500 + }, + { + "epoch": 5.589194224499302, + "grad_norm": 4.8761162757873535, + "learning_rate": 4.44108057755007e-05, + "loss": 3.525, + "step": 36000 + }, + { + "epoch": 5.666821922061792, + "grad_norm": 4.575265884399414, + "learning_rate": 4.433317807793821e-05, + "loss": 3.5405, + "step": 36500 + }, + { + "epoch": 5.744449619624282, + "grad_norm": 4.597631454467773, + "learning_rate": 4.4255550380375724e-05, + "loss": 3.5259, + "step": 37000 + }, + { + "epoch": 5.822077317186772, + "grad_norm": 4.326088905334473, + "learning_rate": 4.4177922682813225e-05, + "loss": 3.4985, + "step": 37500 + }, + { + "epoch": 5.899705014749262, + "grad_norm": 4.202051639556885, + "learning_rate": 4.410029498525074e-05, + "loss": 3.5087, + "step": 38000 + }, + { + "epoch": 5.977332712311753, + "grad_norm": 4.386417388916016, + "learning_rate": 4.402266728768825e-05, + "loss": 3.4926, + "step": 38500 + }, + { + "epoch": 6.054960409874243, + "grad_norm": 4.612489700317383, + "learning_rate": 4.394503959012576e-05, + "loss": 3.456, + "step": 39000 + }, + { + "epoch": 6.132588107436733, + "grad_norm": 4.2950286865234375, + "learning_rate": 4.386741189256327e-05, + "loss": 3.4195, + "step": 39500 + }, + { + "epoch": 6.2102158049992235, + "grad_norm": 4.728135585784912, + "learning_rate": 4.3789784195000776e-05, + "loss": 3.422, + "step": 40000 + }, + { + "epoch": 6.287843502561714, + "grad_norm": 4.690753936767578, + "learning_rate": 4.371215649743829e-05, + "loss": 3.4147, + "step": 40500 + }, + { + "epoch": 6.365471200124205, + "grad_norm": 4.528134346008301, + "learning_rate": 4.36345287998758e-05, + "loss": 3.4115, + "step": 41000 + }, + { + "epoch": 6.443098897686695, + "grad_norm": 4.323470592498779, + "learning_rate": 4.3556901102313305e-05, + "loss": 3.4058, + "step": 41500 + }, + { + "epoch": 6.520726595249185, + "grad_norm": 4.374230861663818, + "learning_rate": 4.347927340475082e-05, + "loss": 3.4112, + "step": 42000 + }, + { + "epoch": 6.598354292811675, + "grad_norm": 4.312314033508301, + "learning_rate": 4.340164570718833e-05, + "loss": 3.3881, + "step": 42500 + }, + { + "epoch": 6.675981990374165, + "grad_norm": 4.178228378295898, + "learning_rate": 4.332401800962584e-05, + "loss": 3.4044, + "step": 43000 + }, + { + "epoch": 6.753609687936656, + "grad_norm": 4.638906002044678, + "learning_rate": 4.324639031206334e-05, + "loss": 3.3954, + "step": 43500 + }, + { + "epoch": 6.831237385499146, + "grad_norm": 4.238986492156982, + "learning_rate": 4.3168762614500857e-05, + "loss": 3.4013, + "step": 44000 + }, + { + "epoch": 6.908865083061636, + "grad_norm": 4.471828460693359, + "learning_rate": 4.3091134916938364e-05, + "loss": 3.3806, + "step": 44500 + }, + { + "epoch": 6.986492780624126, + "grad_norm": 4.4187912940979, + "learning_rate": 4.301350721937588e-05, + "loss": 3.3834, + "step": 45000 + }, + { + "epoch": 7.064120478186617, + "grad_norm": 5.066268444061279, + "learning_rate": 4.293587952181338e-05, + "loss": 3.3064, + "step": 45500 + }, + { + "epoch": 7.1417481757491075, + "grad_norm": 4.942110538482666, + "learning_rate": 4.2858251824250894e-05, + "loss": 3.2971, + "step": 46000 + }, + { + "epoch": 7.219375873311598, + 
"grad_norm": 5.294034957885742, + "learning_rate": 4.278062412668841e-05, + "loss": 3.2643, + "step": 46500 + }, + { + "epoch": 7.297003570874088, + "grad_norm": 4.650871753692627, + "learning_rate": 4.2702996429125915e-05, + "loss": 3.2768, + "step": 47000 + }, + { + "epoch": 7.374631268436578, + "grad_norm": 5.170124053955078, + "learning_rate": 4.262536873156342e-05, + "loss": 3.2832, + "step": 47500 + }, + { + "epoch": 7.452258965999069, + "grad_norm": 4.852886199951172, + "learning_rate": 4.254774103400093e-05, + "loss": 3.2779, + "step": 48000 + }, + { + "epoch": 7.529886663561559, + "grad_norm": 5.047275543212891, + "learning_rate": 4.2470113336438445e-05, + "loss": 3.273, + "step": 48500 + }, + { + "epoch": 7.607514361124049, + "grad_norm": 4.9860520362854, + "learning_rate": 4.239248563887595e-05, + "loss": 3.2538, + "step": 49000 + }, + { + "epoch": 7.685142058686539, + "grad_norm": 4.9074859619140625, + "learning_rate": 4.231485794131346e-05, + "loss": 3.248, + "step": 49500 + }, + { + "epoch": 7.76276975624903, + "grad_norm": 4.936252593994141, + "learning_rate": 4.2237230243750974e-05, + "loss": 3.2492, + "step": 50000 + }, + { + "epoch": 7.84039745381152, + "grad_norm": 4.652443885803223, + "learning_rate": 4.215960254618848e-05, + "loss": 3.2412, + "step": 50500 + }, + { + "epoch": 7.91802515137401, + "grad_norm": 4.407495021820068, + "learning_rate": 4.2081974848625996e-05, + "loss": 3.2372, + "step": 51000 + }, + { + "epoch": 7.9956528489365, + "grad_norm": 4.413294792175293, + "learning_rate": 4.20043471510635e-05, + "loss": 3.2131, + "step": 51500 + }, + { + "epoch": 8.07328054649899, + "grad_norm": 4.42469596862793, + "learning_rate": 4.192671945350101e-05, + "loss": 3.1377, + "step": 52000 + }, + { + "epoch": 8.150908244061482, + "grad_norm": 4.906301498413086, + "learning_rate": 4.184909175593852e-05, + "loss": 3.1072, + "step": 52500 + }, + { + "epoch": 8.22853594162397, + "grad_norm": 5.0347900390625, + "learning_rate": 4.177146405837603e-05, + "loss": 3.1374, + "step": 53000 + }, + { + "epoch": 8.306163639186462, + "grad_norm": 5.217957496643066, + "learning_rate": 4.169383636081354e-05, + "loss": 3.1124, + "step": 53500 + }, + { + "epoch": 8.383791336748953, + "grad_norm": 4.475755214691162, + "learning_rate": 4.161620866325105e-05, + "loss": 3.1194, + "step": 54000 + }, + { + "epoch": 8.461419034311442, + "grad_norm": 5.22430419921875, + "learning_rate": 4.153858096568856e-05, + "loss": 3.1201, + "step": 54500 + }, + { + "epoch": 8.539046731873933, + "grad_norm": 6.327775955200195, + "learning_rate": 4.146095326812607e-05, + "loss": 3.1031, + "step": 55000 + }, + { + "epoch": 8.616674429436422, + "grad_norm": 4.703291893005371, + "learning_rate": 4.138332557056358e-05, + "loss": 3.1043, + "step": 55500 + }, + { + "epoch": 8.694302126998913, + "grad_norm": 5.288379192352295, + "learning_rate": 4.1305697873001085e-05, + "loss": 3.1024, + "step": 56000 + }, + { + "epoch": 8.771929824561404, + "grad_norm": 4.9670090675354, + "learning_rate": 4.12280701754386e-05, + "loss": 3.0797, + "step": 56500 + }, + { + "epoch": 8.849557522123893, + "grad_norm": 4.910192012786865, + "learning_rate": 4.115044247787611e-05, + "loss": 3.0869, + "step": 57000 + }, + { + "epoch": 8.927185219686384, + "grad_norm": 4.804894924163818, + "learning_rate": 4.1072814780313615e-05, + "loss": 3.0885, + "step": 57500 + }, + { + "epoch": 9.004812917248874, + "grad_norm": 5.052229404449463, + "learning_rate": 4.099518708275113e-05, + "loss": 3.0821, + "step": 58000 + }, + { + "epoch": 
9.082440614811365, + "grad_norm": 5.419916152954102, + "learning_rate": 4.0917559385188637e-05, + "loss": 2.9879, + "step": 58500 + }, + { + "epoch": 9.160068312373856, + "grad_norm": 5.0662078857421875, + "learning_rate": 4.083993168762615e-05, + "loss": 2.9825, + "step": 59000 + }, + { + "epoch": 9.237696009936345, + "grad_norm": 4.776367664337158, + "learning_rate": 4.076230399006365e-05, + "loss": 2.977, + "step": 59500 + }, + { + "epoch": 9.315323707498836, + "grad_norm": 4.7674031257629395, + "learning_rate": 4.0684676292501166e-05, + "loss": 2.9971, + "step": 60000 + }, + { + "epoch": 9.392951405061325, + "grad_norm": 4.947634696960449, + "learning_rate": 4.0607048594938673e-05, + "loss": 2.9651, + "step": 60500 + }, + { + "epoch": 9.470579102623816, + "grad_norm": 4.943103790283203, + "learning_rate": 4.052942089737619e-05, + "loss": 2.9781, + "step": 61000 + }, + { + "epoch": 9.548206800186307, + "grad_norm": 5.14945125579834, + "learning_rate": 4.0451793199813695e-05, + "loss": 2.9702, + "step": 61500 + }, + { + "epoch": 9.625834497748796, + "grad_norm": 5.054744243621826, + "learning_rate": 4.03741655022512e-05, + "loss": 2.9553, + "step": 62000 + }, + { + "epoch": 9.703462195311287, + "grad_norm": 5.338235855102539, + "learning_rate": 4.029653780468872e-05, + "loss": 2.9489, + "step": 62500 + }, + { + "epoch": 9.781089892873778, + "grad_norm": 4.819457530975342, + "learning_rate": 4.0218910107126225e-05, + "loss": 2.9676, + "step": 63000 + }, + { + "epoch": 9.858717590436267, + "grad_norm": 4.814851760864258, + "learning_rate": 4.014128240956373e-05, + "loss": 2.9374, + "step": 63500 + }, + { + "epoch": 9.936345287998758, + "grad_norm": 4.723858833312988, + "learning_rate": 4.006365471200124e-05, + "loss": 2.9474, + "step": 64000 + }, + { + "epoch": 10.013972985561248, + "grad_norm": 4.435904026031494, + "learning_rate": 3.9986027014438754e-05, + "loss": 2.9094, + "step": 64500 + }, + { + "epoch": 10.091600683123739, + "grad_norm": 4.80678129196167, + "learning_rate": 3.990839931687627e-05, + "loss": 2.8467, + "step": 65000 + }, + { + "epoch": 10.16922838068623, + "grad_norm": 5.187747001647949, + "learning_rate": 3.983077161931377e-05, + "loss": 2.8237, + "step": 65500 + }, + { + "epoch": 10.246856078248719, + "grad_norm": 4.363202095031738, + "learning_rate": 3.9753143921751284e-05, + "loss": 2.8334, + "step": 66000 + }, + { + "epoch": 10.32448377581121, + "grad_norm": 5.085516929626465, + "learning_rate": 3.967551622418879e-05, + "loss": 2.8284, + "step": 66500 + }, + { + "epoch": 10.402111473373699, + "grad_norm": 4.973574638366699, + "learning_rate": 3.9597888526626306e-05, + "loss": 2.8194, + "step": 67000 + }, + { + "epoch": 10.47973917093619, + "grad_norm": 4.629599094390869, + "learning_rate": 3.952026082906381e-05, + "loss": 2.8284, + "step": 67500 + }, + { + "epoch": 10.557366868498681, + "grad_norm": 4.970963001251221, + "learning_rate": 3.944263313150132e-05, + "loss": 2.8285, + "step": 68000 + }, + { + "epoch": 10.63499456606117, + "grad_norm": 4.869990348815918, + "learning_rate": 3.936500543393883e-05, + "loss": 2.8048, + "step": 68500 + }, + { + "epoch": 10.712622263623661, + "grad_norm": 5.26320743560791, + "learning_rate": 3.928737773637634e-05, + "loss": 2.803, + "step": 69000 + }, + { + "epoch": 10.79024996118615, + "grad_norm": 4.8318352699279785, + "learning_rate": 3.920975003881385e-05, + "loss": 2.7984, + "step": 69500 + }, + { + "epoch": 10.867877658748641, + "grad_norm": 4.917919158935547, + "learning_rate": 3.913212234125136e-05, + "loss": 2.8091, + 
"step": 70000 + }, + { + "epoch": 10.945505356311132, + "grad_norm": 4.485991954803467, + "learning_rate": 3.905449464368887e-05, + "loss": 2.7917, + "step": 70500 + }, + { + "epoch": 11.023133053873622, + "grad_norm": 4.8984246253967285, + "learning_rate": 3.897686694612638e-05, + "loss": 2.7501, + "step": 71000 + }, + { + "epoch": 11.100760751436113, + "grad_norm": 4.431053161621094, + "learning_rate": 3.889923924856389e-05, + "loss": 2.6896, + "step": 71500 + }, + { + "epoch": 11.178388448998602, + "grad_norm": 4.597928524017334, + "learning_rate": 3.8821611551001395e-05, + "loss": 2.6874, + "step": 72000 + }, + { + "epoch": 11.256016146561093, + "grad_norm": 4.701462268829346, + "learning_rate": 3.874398385343891e-05, + "loss": 2.679, + "step": 72500 + }, + { + "epoch": 11.333643844123584, + "grad_norm": 4.706751346588135, + "learning_rate": 3.866635615587642e-05, + "loss": 2.6799, + "step": 73000 + }, + { + "epoch": 11.411271541686073, + "grad_norm": 4.8909430503845215, + "learning_rate": 3.858872845831393e-05, + "loss": 2.6779, + "step": 73500 + }, + { + "epoch": 11.488899239248564, + "grad_norm": 4.814470291137695, + "learning_rate": 3.851110076075144e-05, + "loss": 2.6723, + "step": 74000 + }, + { + "epoch": 11.566526936811055, + "grad_norm": 4.277644157409668, + "learning_rate": 3.8433473063188946e-05, + "loss": 2.6787, + "step": 74500 + }, + { + "epoch": 11.644154634373544, + "grad_norm": 4.709313869476318, + "learning_rate": 3.835584536562646e-05, + "loss": 2.6672, + "step": 75000 + }, + { + "epoch": 11.721782331936035, + "grad_norm": 4.462389945983887, + "learning_rate": 3.827821766806397e-05, + "loss": 2.66, + "step": 75500 + }, + { + "epoch": 11.799410029498524, + "grad_norm": 4.836484909057617, + "learning_rate": 3.8200589970501475e-05, + "loss": 2.6646, + "step": 76000 + }, + { + "epoch": 11.877037727061015, + "grad_norm": 4.758359909057617, + "learning_rate": 3.812296227293899e-05, + "loss": 2.6561, + "step": 76500 + }, + { + "epoch": 11.954665424623506, + "grad_norm": 4.208640098571777, + "learning_rate": 3.80453345753765e-05, + "loss": 2.6659, + "step": 77000 + }, + { + "epoch": 12.032293122185996, + "grad_norm": 4.91511344909668, + "learning_rate": 3.7967706877814005e-05, + "loss": 2.5897, + "step": 77500 + }, + { + "epoch": 12.109920819748487, + "grad_norm": 4.086484909057617, + "learning_rate": 3.789007918025151e-05, + "loss": 2.5594, + "step": 78000 + }, + { + "epoch": 12.187548517310976, + "grad_norm": 4.583057880401611, + "learning_rate": 3.781245148268903e-05, + "loss": 2.5543, + "step": 78500 + }, + { + "epoch": 12.265176214873467, + "grad_norm": 4.570094585418701, + "learning_rate": 3.7734823785126534e-05, + "loss": 2.5503, + "step": 79000 + }, + { + "epoch": 12.342803912435958, + "grad_norm": 4.889599800109863, + "learning_rate": 3.765719608756404e-05, + "loss": 2.5416, + "step": 79500 + }, + { + "epoch": 12.420431609998447, + "grad_norm": 4.4805426597595215, + "learning_rate": 3.757956839000155e-05, + "loss": 2.5589, + "step": 80000 + }, + { + "epoch": 12.498059307560938, + "grad_norm": 4.407408237457275, + "learning_rate": 3.7501940692439064e-05, + "loss": 2.5315, + "step": 80500 + }, + { + "epoch": 12.575687005123427, + "grad_norm": 4.637092113494873, + "learning_rate": 3.742431299487658e-05, + "loss": 2.5454, + "step": 81000 + }, + { + "epoch": 12.653314702685918, + "grad_norm": 4.7181854248046875, + "learning_rate": 3.7346685297314085e-05, + "loss": 2.5383, + "step": 81500 + }, + { + "epoch": 12.73094240024841, + "grad_norm": 4.588499546051025, + 
"learning_rate": 3.726905759975159e-05, + "loss": 2.5267, + "step": 82000 + }, + { + "epoch": 12.808570097810899, + "grad_norm": 4.137992858886719, + "learning_rate": 3.71914299021891e-05, + "loss": 2.5345, + "step": 82500 + }, + { + "epoch": 12.88619779537339, + "grad_norm": 4.400317668914795, + "learning_rate": 3.7113802204626615e-05, + "loss": 2.5259, + "step": 83000 + }, + { + "epoch": 12.963825492935879, + "grad_norm": 4.139917850494385, + "learning_rate": 3.703617450706412e-05, + "loss": 2.5335, + "step": 83500 + }, + { + "epoch": 13.04145319049837, + "grad_norm": 4.182736396789551, + "learning_rate": 3.695854680950163e-05, + "loss": 2.4574, + "step": 84000 + }, + { + "epoch": 13.11908088806086, + "grad_norm": 4.659245491027832, + "learning_rate": 3.6880919111939144e-05, + "loss": 2.4193, + "step": 84500 + }, + { + "epoch": 13.19670858562335, + "grad_norm": 4.163915157318115, + "learning_rate": 3.680329141437665e-05, + "loss": 2.4169, + "step": 85000 + }, + { + "epoch": 13.274336283185841, + "grad_norm": 4.518395900726318, + "learning_rate": 3.672566371681416e-05, + "loss": 2.4161, + "step": 85500 + }, + { + "epoch": 13.35196398074833, + "grad_norm": 4.277214050292969, + "learning_rate": 3.664803601925167e-05, + "loss": 2.4169, + "step": 86000 + }, + { + "epoch": 13.429591678310821, + "grad_norm": 4.701220989227295, + "learning_rate": 3.657040832168918e-05, + "loss": 2.424, + "step": 86500 + }, + { + "epoch": 13.507219375873312, + "grad_norm": 4.375713348388672, + "learning_rate": 3.649278062412669e-05, + "loss": 2.4193, + "step": 87000 + }, + { + "epoch": 13.584847073435801, + "grad_norm": 4.191773891448975, + "learning_rate": 3.64151529265642e-05, + "loss": 2.4188, + "step": 87500 + }, + { + "epoch": 13.662474770998292, + "grad_norm": 4.385691165924072, + "learning_rate": 3.633752522900171e-05, + "loss": 2.4149, + "step": 88000 + }, + { + "epoch": 13.740102468560782, + "grad_norm": 4.488534927368164, + "learning_rate": 3.625989753143922e-05, + "loss": 2.3998, + "step": 88500 + }, + { + "epoch": 13.817730166123273, + "grad_norm": 4.578937530517578, + "learning_rate": 3.618226983387673e-05, + "loss": 2.4065, + "step": 89000 + }, + { + "epoch": 13.895357863685764, + "grad_norm": 4.423867702484131, + "learning_rate": 3.610464213631424e-05, + "loss": 2.4004, + "step": 89500 + }, + { + "epoch": 13.972985561248253, + "grad_norm": 4.474419116973877, + "learning_rate": 3.602701443875175e-05, + "loss": 2.4044, + "step": 90000 + }, + { + "epoch": 14.050613258810744, + "grad_norm": 4.806559085845947, + "learning_rate": 3.5949386741189255e-05, + "loss": 2.3339, + "step": 90500 + }, + { + "epoch": 14.128240956373235, + "grad_norm": 4.276415824890137, + "learning_rate": 3.587175904362677e-05, + "loss": 2.2801, + "step": 91000 + }, + { + "epoch": 14.205868653935724, + "grad_norm": 4.825454235076904, + "learning_rate": 3.579413134606428e-05, + "loss": 2.297, + "step": 91500 + }, + { + "epoch": 14.283496351498215, + "grad_norm": 4.838090896606445, + "learning_rate": 3.5716503648501785e-05, + "loss": 2.299, + "step": 92000 + }, + { + "epoch": 14.361124049060704, + "grad_norm": 4.015684604644775, + "learning_rate": 3.56388759509393e-05, + "loss": 2.2892, + "step": 92500 + }, + { + "epoch": 14.438751746623195, + "grad_norm": 4.386364459991455, + "learning_rate": 3.5561248253376807e-05, + "loss": 2.3058, + "step": 93000 + }, + { + "epoch": 14.516379444185686, + "grad_norm": 4.3224968910217285, + "learning_rate": 3.548362055581432e-05, + "loss": 2.3027, + "step": 93500 + }, + { + "epoch": 
14.594007141748175, + "grad_norm": 4.265476226806641, + "learning_rate": 3.540599285825182e-05, + "loss": 2.2993, + "step": 94000 + }, + { + "epoch": 14.671634839310666, + "grad_norm": 4.053600311279297, + "learning_rate": 3.5328365160689336e-05, + "loss": 2.2942, + "step": 94500 + }, + { + "epoch": 14.749262536873156, + "grad_norm": 4.602315902709961, + "learning_rate": 3.5250737463126844e-05, + "loss": 2.2906, + "step": 95000 + }, + { + "epoch": 14.826890234435647, + "grad_norm": 4.402678489685059, + "learning_rate": 3.517310976556436e-05, + "loss": 2.2702, + "step": 95500 + }, + { + "epoch": 14.904517931998138, + "grad_norm": 4.164185523986816, + "learning_rate": 3.5095482068001865e-05, + "loss": 2.2815, + "step": 96000 + }, + { + "epoch": 14.982145629560627, + "grad_norm": 3.9488399028778076, + "learning_rate": 3.501785437043937e-05, + "loss": 2.2949, + "step": 96500 + }, + { + "epoch": 15.059773327123118, + "grad_norm": 4.283924102783203, + "learning_rate": 3.494022667287689e-05, + "loss": 2.2053, + "step": 97000 + }, + { + "epoch": 15.137401024685607, + "grad_norm": 4.1038923263549805, + "learning_rate": 3.4862598975314395e-05, + "loss": 2.1718, + "step": 97500 + }, + { + "epoch": 15.215028722248098, + "grad_norm": 3.826446533203125, + "learning_rate": 3.47849712777519e-05, + "loss": 2.1859, + "step": 98000 + }, + { + "epoch": 15.292656419810589, + "grad_norm": 4.282005310058594, + "learning_rate": 3.470734358018941e-05, + "loss": 2.1854, + "step": 98500 + }, + { + "epoch": 15.370284117373078, + "grad_norm": 4.259530067443848, + "learning_rate": 3.4629715882626924e-05, + "loss": 2.188, + "step": 99000 + }, + { + "epoch": 15.44791181493557, + "grad_norm": 4.105893135070801, + "learning_rate": 3.455208818506443e-05, + "loss": 2.1824, + "step": 99500 + }, + { + "epoch": 15.52553951249806, + "grad_norm": 4.21387243270874, + "learning_rate": 3.447446048750194e-05, + "loss": 2.1729, + "step": 100000 + }, + { + "epoch": 15.60316721006055, + "grad_norm": 4.400328636169434, + "learning_rate": 3.4396832789939454e-05, + "loss": 2.1831, + "step": 100500 + }, + { + "epoch": 15.68079490762304, + "grad_norm": 4.224130153656006, + "learning_rate": 3.431920509237696e-05, + "loss": 2.1936, + "step": 101000 + }, + { + "epoch": 15.75842260518553, + "grad_norm": 3.9993326663970947, + "learning_rate": 3.4241577394814476e-05, + "loss": 2.1838, + "step": 101500 + }, + { + "epoch": 15.83605030274802, + "grad_norm": 4.2306671142578125, + "learning_rate": 3.4163949697251976e-05, + "loss": 2.1838, + "step": 102000 + }, + { + "epoch": 15.913678000310512, + "grad_norm": 4.4622368812561035, + "learning_rate": 3.408632199968949e-05, + "loss": 2.1836, + "step": 102500 + }, + { + "epoch": 15.991305697873, + "grad_norm": 4.376685619354248, + "learning_rate": 3.4008694302127005e-05, + "loss": 2.1779, + "step": 103000 + }, + { + "epoch": 16.06893339543549, + "grad_norm": 4.104698657989502, + "learning_rate": 3.393106660456451e-05, + "loss": 2.0854, + "step": 103500 + }, + { + "epoch": 16.14656109299798, + "grad_norm": 3.761953353881836, + "learning_rate": 3.385343890700202e-05, + "loss": 2.0603, + "step": 104000 + }, + { + "epoch": 16.224188790560472, + "grad_norm": 4.365135192871094, + "learning_rate": 3.377581120943953e-05, + "loss": 2.0572, + "step": 104500 + }, + { + "epoch": 16.301816488122963, + "grad_norm": 4.137313365936279, + "learning_rate": 3.369818351187704e-05, + "loss": 2.0691, + "step": 105000 + }, + { + "epoch": 16.379444185685454, + "grad_norm": 4.869952201843262, + "learning_rate": 
3.362055581431455e-05, + "loss": 2.0935, + "step": 105500 + }, + { + "epoch": 16.45707188324794, + "grad_norm": 4.275235652923584, + "learning_rate": 3.354292811675206e-05, + "loss": 2.077, + "step": 106000 + }, + { + "epoch": 16.534699580810432, + "grad_norm": 4.092933177947998, + "learning_rate": 3.3465300419189565e-05, + "loss": 2.0977, + "step": 106500 + }, + { + "epoch": 16.612327278372923, + "grad_norm": 3.9494364261627197, + "learning_rate": 3.338767272162708e-05, + "loss": 2.095, + "step": 107000 + }, + { + "epoch": 16.689954975935414, + "grad_norm": 3.6660993099212646, + "learning_rate": 3.331004502406459e-05, + "loss": 2.0867, + "step": 107500 + }, + { + "epoch": 16.767582673497905, + "grad_norm": 4.6808977127075195, + "learning_rate": 3.3232417326502094e-05, + "loss": 2.0856, + "step": 108000 + }, + { + "epoch": 16.845210371060393, + "grad_norm": 3.951265335083008, + "learning_rate": 3.315478962893961e-05, + "loss": 2.0786, + "step": 108500 + }, + { + "epoch": 16.922838068622884, + "grad_norm": 3.390282392501831, + "learning_rate": 3.3077161931377116e-05, + "loss": 2.0756, + "step": 109000 + }, + { + "epoch": 17.000465766185375, + "grad_norm": 3.9212212562561035, + "learning_rate": 3.299953423381463e-05, + "loss": 2.0858, + "step": 109500 + }, + { + "epoch": 17.078093463747866, + "grad_norm": 4.350470542907715, + "learning_rate": 3.292190653625213e-05, + "loss": 1.969, + "step": 110000 + }, + { + "epoch": 17.155721161310357, + "grad_norm": 4.253689765930176, + "learning_rate": 3.2844278838689645e-05, + "loss": 1.9756, + "step": 110500 + }, + { + "epoch": 17.233348858872844, + "grad_norm": 4.202712059020996, + "learning_rate": 3.276665114112716e-05, + "loss": 1.9793, + "step": 111000 + }, + { + "epoch": 17.310976556435335, + "grad_norm": 4.103579998016357, + "learning_rate": 3.268902344356467e-05, + "loss": 1.9825, + "step": 111500 + }, + { + "epoch": 17.388604253997826, + "grad_norm": 4.335016250610352, + "learning_rate": 3.2611395746002175e-05, + "loss": 1.978, + "step": 112000 + }, + { + "epoch": 17.466231951560317, + "grad_norm": 4.291495323181152, + "learning_rate": 3.253376804843968e-05, + "loss": 1.9884, + "step": 112500 + }, + { + "epoch": 17.54385964912281, + "grad_norm": 4.035206317901611, + "learning_rate": 3.24561403508772e-05, + "loss": 2.0041, + "step": 113000 + }, + { + "epoch": 17.621487346685296, + "grad_norm": 3.9616289138793945, + "learning_rate": 3.2378512653314704e-05, + "loss": 1.9928, + "step": 113500 + }, + { + "epoch": 17.699115044247787, + "grad_norm": 4.101945400238037, + "learning_rate": 3.230088495575221e-05, + "loss": 1.9906, + "step": 114000 + }, + { + "epoch": 17.776742741810278, + "grad_norm": 4.0245490074157715, + "learning_rate": 3.2223257258189726e-05, + "loss": 1.9873, + "step": 114500 + }, + { + "epoch": 17.85437043937277, + "grad_norm": 4.1350908279418945, + "learning_rate": 3.2145629560627234e-05, + "loss": 1.9917, + "step": 115000 + }, + { + "epoch": 17.93199813693526, + "grad_norm": 4.366165637969971, + "learning_rate": 3.206800186306475e-05, + "loss": 1.9897, + "step": 115500 + }, + { + "epoch": 18.009625834497747, + "grad_norm": 4.272118091583252, + "learning_rate": 3.199037416550225e-05, + "loss": 1.9837, + "step": 116000 + }, + { + "epoch": 18.087253532060238, + "grad_norm": 4.427468776702881, + "learning_rate": 3.191274646793976e-05, + "loss": 1.8798, + "step": 116500 + }, + { + "epoch": 18.16488122962273, + "grad_norm": 4.1292033195495605, + "learning_rate": 3.183511877037727e-05, + "loss": 1.8857, + "step": 117000 + }, + { + 
"epoch": 18.24250892718522, + "grad_norm": 4.270112037658691, + "learning_rate": 3.1757491072814785e-05, + "loss": 1.8921, + "step": 117500 + }, + { + "epoch": 18.32013662474771, + "grad_norm": 4.079245567321777, + "learning_rate": 3.1679863375252286e-05, + "loss": 1.8984, + "step": 118000 + }, + { + "epoch": 18.3977643223102, + "grad_norm": 3.783048391342163, + "learning_rate": 3.16022356776898e-05, + "loss": 1.9001, + "step": 118500 + }, + { + "epoch": 18.47539201987269, + "grad_norm": 3.9977831840515137, + "learning_rate": 3.1524607980127314e-05, + "loss": 1.9026, + "step": 119000 + }, + { + "epoch": 18.55301971743518, + "grad_norm": 5.004773139953613, + "learning_rate": 3.144698028256482e-05, + "loss": 1.9027, + "step": 119500 + }, + { + "epoch": 18.63064741499767, + "grad_norm": 4.3422417640686035, + "learning_rate": 3.136935258500233e-05, + "loss": 1.9084, + "step": 120000 + }, + { + "epoch": 18.708275112560163, + "grad_norm": 3.9378857612609863, + "learning_rate": 3.129172488743984e-05, + "loss": 1.9038, + "step": 120500 + }, + { + "epoch": 18.78590281012265, + "grad_norm": 4.138620853424072, + "learning_rate": 3.121409718987735e-05, + "loss": 1.9133, + "step": 121000 + }, + { + "epoch": 18.86353050768514, + "grad_norm": 4.3769659996032715, + "learning_rate": 3.113646949231486e-05, + "loss": 1.9109, + "step": 121500 + }, + { + "epoch": 18.941158205247632, + "grad_norm": 3.955392837524414, + "learning_rate": 3.1058841794752366e-05, + "loss": 1.913, + "step": 122000 + }, + { + "epoch": 19.018785902810123, + "grad_norm": 4.047823905944824, + "learning_rate": 3.098121409718988e-05, + "loss": 1.8897, + "step": 122500 + }, + { + "epoch": 19.096413600372614, + "grad_norm": 4.446326732635498, + "learning_rate": 3.090358639962739e-05, + "loss": 1.7936, + "step": 123000 + }, + { + "epoch": 19.174041297935105, + "grad_norm": 3.9434542655944824, + "learning_rate": 3.08259587020649e-05, + "loss": 1.8065, + "step": 123500 + }, + { + "epoch": 19.251668995497592, + "grad_norm": 4.108802318572998, + "learning_rate": 3.0748331004502403e-05, + "loss": 1.8157, + "step": 124000 + }, + { + "epoch": 19.329296693060083, + "grad_norm": 4.374671459197998, + "learning_rate": 3.067070330693992e-05, + "loss": 1.8276, + "step": 124500 + }, + { + "epoch": 19.406924390622574, + "grad_norm": 3.985368013381958, + "learning_rate": 3.0593075609377425e-05, + "loss": 1.8246, + "step": 125000 + }, + { + "epoch": 19.484552088185065, + "grad_norm": 3.956395149230957, + "learning_rate": 3.0515447911814936e-05, + "loss": 1.8263, + "step": 125500 + }, + { + "epoch": 19.562179785747556, + "grad_norm": 3.358553886413574, + "learning_rate": 3.043782021425245e-05, + "loss": 1.8227, + "step": 126000 + }, + { + "epoch": 19.639807483310044, + "grad_norm": 4.203612804412842, + "learning_rate": 3.0360192516689955e-05, + "loss": 1.8225, + "step": 126500 + }, + { + "epoch": 19.717435180872535, + "grad_norm": 3.790905714035034, + "learning_rate": 3.028256481912747e-05, + "loss": 1.8433, + "step": 127000 + }, + { + "epoch": 19.795062878435026, + "grad_norm": 4.040520191192627, + "learning_rate": 3.0204937121564973e-05, + "loss": 1.8336, + "step": 127500 + }, + { + "epoch": 19.872690575997517, + "grad_norm": 4.027768135070801, + "learning_rate": 3.0127309424002488e-05, + "loss": 1.8314, + "step": 128000 + }, + { + "epoch": 19.950318273560008, + "grad_norm": 3.8109354972839355, + "learning_rate": 3.0049681726439992e-05, + "loss": 1.8425, + "step": 128500 + }, + { + "epoch": 20.027945971122495, + "grad_norm": 3.751999855041504, + 
"learning_rate": 2.9972054028877506e-05, + "loss": 1.7967, + "step": 129000 + }, + { + "epoch": 20.105573668684986, + "grad_norm": 3.9639225006103516, + "learning_rate": 2.9894426331315014e-05, + "loss": 1.7213, + "step": 129500 + }, + { + "epoch": 20.183201366247477, + "grad_norm": 4.027946472167969, + "learning_rate": 2.9816798633752525e-05, + "loss": 1.7408, + "step": 130000 + }, + { + "epoch": 20.260829063809968, + "grad_norm": 4.050852298736572, + "learning_rate": 2.9739170936190035e-05, + "loss": 1.7305, + "step": 130500 + }, + { + "epoch": 20.33845676137246, + "grad_norm": 4.3804216384887695, + "learning_rate": 2.9661543238627543e-05, + "loss": 1.7499, + "step": 131000 + }, + { + "epoch": 20.416084458934947, + "grad_norm": 4.021152019500732, + "learning_rate": 2.9583915541065054e-05, + "loss": 1.7484, + "step": 131500 + }, + { + "epoch": 20.493712156497438, + "grad_norm": 3.7631611824035645, + "learning_rate": 2.950628784350256e-05, + "loss": 1.7531, + "step": 132000 + }, + { + "epoch": 20.57133985405993, + "grad_norm": 4.4973249435424805, + "learning_rate": 2.9428660145940072e-05, + "loss": 1.767, + "step": 132500 + }, + { + "epoch": 20.64896755162242, + "grad_norm": 4.386341571807861, + "learning_rate": 2.935103244837758e-05, + "loss": 1.7621, + "step": 133000 + }, + { + "epoch": 20.72659524918491, + "grad_norm": 4.0129499435424805, + "learning_rate": 2.927340475081509e-05, + "loss": 1.7637, + "step": 133500 + }, + { + "epoch": 20.804222946747398, + "grad_norm": 4.22186279296875, + "learning_rate": 2.9195777053252605e-05, + "loss": 1.7643, + "step": 134000 + }, + { + "epoch": 20.88185064430989, + "grad_norm": 4.511717319488525, + "learning_rate": 2.911814935569011e-05, + "loss": 1.7761, + "step": 134500 + }, + { + "epoch": 20.95947834187238, + "grad_norm": 4.100383281707764, + "learning_rate": 2.9040521658127624e-05, + "loss": 1.7625, + "step": 135000 + }, + { + "epoch": 21.03710603943487, + "grad_norm": 4.241291046142578, + "learning_rate": 2.8962893960565128e-05, + "loss": 1.7083, + "step": 135500 + }, + { + "epoch": 21.114733736997362, + "grad_norm": 3.8240482807159424, + "learning_rate": 2.8885266263002642e-05, + "loss": 1.6514, + "step": 136000 + }, + { + "epoch": 21.19236143455985, + "grad_norm": 3.9241297245025635, + "learning_rate": 2.880763856544015e-05, + "loss": 1.662, + "step": 136500 + }, + { + "epoch": 21.26998913212234, + "grad_norm": 3.836834669113159, + "learning_rate": 2.873001086787766e-05, + "loss": 1.6674, + "step": 137000 + }, + { + "epoch": 21.34761682968483, + "grad_norm": 4.176065921783447, + "learning_rate": 2.865238317031517e-05, + "loss": 1.6754, + "step": 137500 + }, + { + "epoch": 21.425244527247322, + "grad_norm": 4.702647686004639, + "learning_rate": 2.857475547275268e-05, + "loss": 1.6841, + "step": 138000 + }, + { + "epoch": 21.502872224809813, + "grad_norm": 3.71679425239563, + "learning_rate": 2.849712777519019e-05, + "loss": 1.6918, + "step": 138500 + }, + { + "epoch": 21.5804999223723, + "grad_norm": 4.379159450531006, + "learning_rate": 2.8419500077627698e-05, + "loss": 1.6845, + "step": 139000 + }, + { + "epoch": 21.658127619934792, + "grad_norm": 3.984041213989258, + "learning_rate": 2.834187238006521e-05, + "loss": 1.7042, + "step": 139500 + }, + { + "epoch": 21.735755317497283, + "grad_norm": 4.80483865737915, + "learning_rate": 2.8264244682502716e-05, + "loss": 1.7063, + "step": 140000 + }, + { + "epoch": 21.813383015059774, + "grad_norm": 3.897512674331665, + "learning_rate": 2.8186616984940227e-05, + "loss": 1.697, + "step": 140500 + 
}, + { + "epoch": 21.891010712622265, + "grad_norm": 3.8755526542663574, + "learning_rate": 2.8108989287377735e-05, + "loss": 1.6936, + "step": 141000 + }, + { + "epoch": 21.968638410184752, + "grad_norm": 4.30952262878418, + "learning_rate": 2.8031361589815246e-05, + "loss": 1.7112, + "step": 141500 + }, + { + "epoch": 22.046266107747243, + "grad_norm": 4.38576602935791, + "learning_rate": 2.795373389225276e-05, + "loss": 1.644, + "step": 142000 + }, + { + "epoch": 22.123893805309734, + "grad_norm": 4.09429931640625, + "learning_rate": 2.7876106194690264e-05, + "loss": 1.6035, + "step": 142500 + }, + { + "epoch": 22.201521502872225, + "grad_norm": 4.038272857666016, + "learning_rate": 2.779847849712778e-05, + "loss": 1.6024, + "step": 143000 + }, + { + "epoch": 22.279149200434716, + "grad_norm": 4.369879245758057, + "learning_rate": 2.7720850799565286e-05, + "loss": 1.6185, + "step": 143500 + }, + { + "epoch": 22.356776897997204, + "grad_norm": 4.589230537414551, + "learning_rate": 2.7643223102002797e-05, + "loss": 1.6199, + "step": 144000 + }, + { + "epoch": 22.434404595559695, + "grad_norm": 4.705469608306885, + "learning_rate": 2.7565595404440304e-05, + "loss": 1.6101, + "step": 144500 + }, + { + "epoch": 22.512032293122186, + "grad_norm": 4.487303256988525, + "learning_rate": 2.7487967706877815e-05, + "loss": 1.6163, + "step": 145000 + }, + { + "epoch": 22.589659990684677, + "grad_norm": 3.795254945755005, + "learning_rate": 2.7410340009315326e-05, + "loss": 1.6382, + "step": 145500 + }, + { + "epoch": 22.667287688247168, + "grad_norm": 3.8786396980285645, + "learning_rate": 2.7332712311752834e-05, + "loss": 1.6223, + "step": 146000 + }, + { + "epoch": 22.744915385809655, + "grad_norm": 4.308375835418701, + "learning_rate": 2.7255084614190345e-05, + "loss": 1.6447, + "step": 146500 + }, + { + "epoch": 22.822543083372146, + "grad_norm": 4.034188747406006, + "learning_rate": 2.7177456916627852e-05, + "loss": 1.6351, + "step": 147000 + }, + { + "epoch": 22.900170780934637, + "grad_norm": 4.602024555206299, + "learning_rate": 2.7099829219065363e-05, + "loss": 1.6344, + "step": 147500 + }, + { + "epoch": 22.977798478497128, + "grad_norm": 4.131753921508789, + "learning_rate": 2.702220152150287e-05, + "loss": 1.6437, + "step": 148000 + }, + { + "epoch": 23.05542617605962, + "grad_norm": 3.612490177154541, + "learning_rate": 2.6944573823940382e-05, + "loss": 1.5592, + "step": 148500 + }, + { + "epoch": 23.13305387362211, + "grad_norm": 4.134332656860352, + "learning_rate": 2.6866946126377896e-05, + "loss": 1.5415, + "step": 149000 + }, + { + "epoch": 23.210681571184598, + "grad_norm": 4.3021321296691895, + "learning_rate": 2.67893184288154e-05, + "loss": 1.5512, + "step": 149500 + }, + { + "epoch": 23.28830926874709, + "grad_norm": 4.436678886413574, + "learning_rate": 2.6711690731252915e-05, + "loss": 1.5472, + "step": 150000 + }, + { + "epoch": 23.36593696630958, + "grad_norm": 4.172628402709961, + "learning_rate": 2.6634063033690422e-05, + "loss": 1.5494, + "step": 150500 + }, + { + "epoch": 23.44356466387207, + "grad_norm": 4.578736782073975, + "learning_rate": 2.6556435336127933e-05, + "loss": 1.5561, + "step": 151000 + }, + { + "epoch": 23.52119236143456, + "grad_norm": 4.1252336502075195, + "learning_rate": 2.647880763856544e-05, + "loss": 1.5626, + "step": 151500 + }, + { + "epoch": 23.59882005899705, + "grad_norm": 3.929494619369507, + "learning_rate": 2.640117994100295e-05, + "loss": 1.5769, + "step": 152000 + }, + { + "epoch": 23.67644775655954, + "grad_norm": 4.310312271118164, + 
"learning_rate": 2.6323552243440463e-05, + "loss": 1.5716, + "step": 152500 + }, + { + "epoch": 23.75407545412203, + "grad_norm": 3.970519781112671, + "learning_rate": 2.624592454587797e-05, + "loss": 1.5764, + "step": 153000 + }, + { + "epoch": 23.831703151684522, + "grad_norm": 3.880556583404541, + "learning_rate": 2.616829684831548e-05, + "loss": 1.5871, + "step": 153500 + }, + { + "epoch": 23.909330849247013, + "grad_norm": 4.146645545959473, + "learning_rate": 2.609066915075299e-05, + "loss": 1.5869, + "step": 154000 + }, + { + "epoch": 23.9869585468095, + "grad_norm": 4.036287784576416, + "learning_rate": 2.60130414531905e-05, + "loss": 1.583, + "step": 154500 + }, + { + "epoch": 24.06458624437199, + "grad_norm": 4.351132869720459, + "learning_rate": 2.5935413755628007e-05, + "loss": 1.4982, + "step": 155000 + }, + { + "epoch": 24.142213941934482, + "grad_norm": 4.366822242736816, + "learning_rate": 2.5857786058065518e-05, + "loss": 1.4897, + "step": 155500 + }, + { + "epoch": 24.219841639496973, + "grad_norm": 4.432433128356934, + "learning_rate": 2.5780158360503026e-05, + "loss": 1.4969, + "step": 156000 + }, + { + "epoch": 24.297469337059464, + "grad_norm": 4.0283613204956055, + "learning_rate": 2.570253066294054e-05, + "loss": 1.4992, + "step": 156500 + }, + { + "epoch": 24.37509703462195, + "grad_norm": 4.035061359405518, + "learning_rate": 2.562490296537805e-05, + "loss": 1.4968, + "step": 157000 + }, + { + "epoch": 24.452724732184443, + "grad_norm": 3.834836006164551, + "learning_rate": 2.554727526781556e-05, + "loss": 1.5156, + "step": 157500 + }, + { + "epoch": 24.530352429746934, + "grad_norm": 4.057690143585205, + "learning_rate": 2.546964757025307e-05, + "loss": 1.5052, + "step": 158000 + }, + { + "epoch": 24.607980127309425, + "grad_norm": 4.63842248916626, + "learning_rate": 2.5392019872690577e-05, + "loss": 1.5107, + "step": 158500 + }, + { + "epoch": 24.685607824871916, + "grad_norm": 3.624314069747925, + "learning_rate": 2.5314392175128088e-05, + "loss": 1.5185, + "step": 159000 + }, + { + "epoch": 24.763235522434403, + "grad_norm": 4.338582515716553, + "learning_rate": 2.5236764477565595e-05, + "loss": 1.5187, + "step": 159500 + }, + { + "epoch": 24.840863219996894, + "grad_norm": 3.9074742794036865, + "learning_rate": 2.5159136780003106e-05, + "loss": 1.524, + "step": 160000 + }, + { + "epoch": 24.918490917559385, + "grad_norm": 3.97880482673645, + "learning_rate": 2.5081509082440617e-05, + "loss": 1.5278, + "step": 160500 + }, + { + "epoch": 24.996118615121876, + "grad_norm": 4.298096656799316, + "learning_rate": 2.5003881384878125e-05, + "loss": 1.5267, + "step": 161000 + }, + { + "epoch": 25.073746312684367, + "grad_norm": 3.85455322265625, + "learning_rate": 2.4926253687315636e-05, + "loss": 1.442, + "step": 161500 + }, + { + "epoch": 25.151374010246855, + "grad_norm": 3.907085418701172, + "learning_rate": 2.4848625989753147e-05, + "loss": 1.4262, + "step": 162000 + }, + { + "epoch": 25.229001707809346, + "grad_norm": 4.488945484161377, + "learning_rate": 2.4770998292190654e-05, + "loss": 1.4391, + "step": 162500 + }, + { + "epoch": 25.306629405371837, + "grad_norm": 4.565778732299805, + "learning_rate": 2.4693370594628165e-05, + "loss": 1.447, + "step": 163000 + }, + { + "epoch": 25.384257102934328, + "grad_norm": 4.2508015632629395, + "learning_rate": 2.4615742897065676e-05, + "loss": 1.4442, + "step": 163500 + }, + { + "epoch": 25.46188480049682, + "grad_norm": 4.572117328643799, + "learning_rate": 2.4538115199503184e-05, + "loss": 1.4495, + "step": 164000 + 
}, + { + "epoch": 25.539512498059306, + "grad_norm": 4.516686916351318, + "learning_rate": 2.4460487501940695e-05, + "loss": 1.4625, + "step": 164500 + }, + { + "epoch": 25.617140195621797, + "grad_norm": 4.200167655944824, + "learning_rate": 2.4382859804378202e-05, + "loss": 1.4614, + "step": 165000 + }, + { + "epoch": 25.694767893184288, + "grad_norm": 3.777397632598877, + "learning_rate": 2.4305232106815713e-05, + "loss": 1.4632, + "step": 165500 + }, + { + "epoch": 25.77239559074678, + "grad_norm": 4.383970737457275, + "learning_rate": 2.4227604409253224e-05, + "loss": 1.4773, + "step": 166000 + }, + { + "epoch": 25.85002328830927, + "grad_norm": 4.216927528381348, + "learning_rate": 2.4149976711690735e-05, + "loss": 1.4794, + "step": 166500 + }, + { + "epoch": 25.927650985871757, + "grad_norm": 5.53390645980835, + "learning_rate": 2.4072349014128243e-05, + "loss": 1.4685, + "step": 167000 + }, + { + "epoch": 26.00527868343425, + "grad_norm": 3.9746012687683105, + "learning_rate": 2.3994721316565753e-05, + "loss": 1.4873, + "step": 167500 + }, + { + "epoch": 26.08290638099674, + "grad_norm": 4.278408527374268, + "learning_rate": 2.391709361900326e-05, + "loss": 1.3877, + "step": 168000 + }, + { + "epoch": 26.16053407855923, + "grad_norm": 4.082756042480469, + "learning_rate": 2.3839465921440772e-05, + "loss": 1.3938, + "step": 168500 + }, + { + "epoch": 26.23816177612172, + "grad_norm": 3.929353713989258, + "learning_rate": 2.376183822387828e-05, + "loss": 1.3903, + "step": 169000 + }, + { + "epoch": 26.31578947368421, + "grad_norm": 4.400444030761719, + "learning_rate": 2.368421052631579e-05, + "loss": 1.4032, + "step": 169500 + }, + { + "epoch": 26.3934171712467, + "grad_norm": 4.266624450683594, + "learning_rate": 2.36065828287533e-05, + "loss": 1.4028, + "step": 170000 + }, + { + "epoch": 26.47104486880919, + "grad_norm": 4.547267913818359, + "learning_rate": 2.3528955131190812e-05, + "loss": 1.4043, + "step": 170500 + }, + { + "epoch": 26.548672566371682, + "grad_norm": 4.04599666595459, + "learning_rate": 2.345132743362832e-05, + "loss": 1.4047, + "step": 171000 + }, + { + "epoch": 26.626300263934173, + "grad_norm": 4.308363437652588, + "learning_rate": 2.337369973606583e-05, + "loss": 1.4154, + "step": 171500 + }, + { + "epoch": 26.70392796149666, + "grad_norm": 3.774397373199463, + "learning_rate": 2.329607203850334e-05, + "loss": 1.4127, + "step": 172000 + }, + { + "epoch": 26.78155565905915, + "grad_norm": 4.222719669342041, + "learning_rate": 2.321844434094085e-05, + "loss": 1.4149, + "step": 172500 + }, + { + "epoch": 26.859183356621642, + "grad_norm": 4.3920135498046875, + "learning_rate": 2.3140816643378357e-05, + "loss": 1.4238, + "step": 173000 + }, + { + "epoch": 26.936811054184133, + "grad_norm": 4.5161213874816895, + "learning_rate": 2.306318894581587e-05, + "loss": 1.4232, + "step": 173500 + }, + { + "epoch": 27.014438751746624, + "grad_norm": 4.091419696807861, + "learning_rate": 2.298556124825338e-05, + "loss": 1.412, + "step": 174000 + }, + { + "epoch": 27.092066449309115, + "grad_norm": 4.063779830932617, + "learning_rate": 2.290793355069089e-05, + "loss": 1.3344, + "step": 174500 + }, + { + "epoch": 27.169694146871603, + "grad_norm": 4.165656089782715, + "learning_rate": 2.2830305853128397e-05, + "loss": 1.3348, + "step": 175000 + }, + { + "epoch": 27.247321844434094, + "grad_norm": 4.288286209106445, + "learning_rate": 2.2752678155565908e-05, + "loss": 1.3389, + "step": 175500 + }, + { + "epoch": 27.324949541996585, + "grad_norm": 4.2835211753845215, + 
"learning_rate": 2.2675050458003416e-05, + "loss": 1.3493, + "step": 176000 + }, + { + "epoch": 27.402577239559076, + "grad_norm": 4.381802558898926, + "learning_rate": 2.2597422760440927e-05, + "loss": 1.358, + "step": 176500 + }, + { + "epoch": 27.480204937121567, + "grad_norm": 4.263532638549805, + "learning_rate": 2.2519795062878434e-05, + "loss": 1.3632, + "step": 177000 + }, + { + "epoch": 27.557832634684054, + "grad_norm": 4.2341742515563965, + "learning_rate": 2.244216736531595e-05, + "loss": 1.3734, + "step": 177500 + }, + { + "epoch": 27.635460332246545, + "grad_norm": 3.9163522720336914, + "learning_rate": 2.2364539667753456e-05, + "loss": 1.3658, + "step": 178000 + }, + { + "epoch": 27.713088029809036, + "grad_norm": 4.0479841232299805, + "learning_rate": 2.2286911970190967e-05, + "loss": 1.3593, + "step": 178500 + }, + { + "epoch": 27.790715727371527, + "grad_norm": 5.027287483215332, + "learning_rate": 2.2209284272628475e-05, + "loss": 1.3869, + "step": 179000 + }, + { + "epoch": 27.868343424934018, + "grad_norm": 4.199400424957275, + "learning_rate": 2.2131656575065985e-05, + "loss": 1.3882, + "step": 179500 + }, + { + "epoch": 27.945971122496506, + "grad_norm": 3.9147210121154785, + "learning_rate": 2.2054028877503493e-05, + "loss": 1.3781, + "step": 180000 + }, + { + "epoch": 28.023598820058996, + "grad_norm": 4.450961112976074, + "learning_rate": 2.1976401179941004e-05, + "loss": 1.3514, + "step": 180500 + }, + { + "epoch": 28.101226517621487, + "grad_norm": 4.467356204986572, + "learning_rate": 2.189877348237851e-05, + "loss": 1.2839, + "step": 181000 + }, + { + "epoch": 28.17885421518398, + "grad_norm": 4.179466247558594, + "learning_rate": 2.1821145784816026e-05, + "loss": 1.3017, + "step": 181500 + }, + { + "epoch": 28.25648191274647, + "grad_norm": 3.7988483905792236, + "learning_rate": 2.1743518087253533e-05, + "loss": 1.3177, + "step": 182000 + }, + { + "epoch": 28.334109610308957, + "grad_norm": 3.9721014499664307, + "learning_rate": 2.1665890389691044e-05, + "loss": 1.302, + "step": 182500 + }, + { + "epoch": 28.411737307871448, + "grad_norm": 4.474249362945557, + "learning_rate": 2.1588262692128552e-05, + "loss": 1.3053, + "step": 183000 + }, + { + "epoch": 28.48936500543394, + "grad_norm": 4.546684741973877, + "learning_rate": 2.1510634994566063e-05, + "loss": 1.3231, + "step": 183500 + }, + { + "epoch": 28.56699270299643, + "grad_norm": 4.715445518493652, + "learning_rate": 2.143300729700357e-05, + "loss": 1.3305, + "step": 184000 + }, + { + "epoch": 28.64462040055892, + "grad_norm": 4.777371406555176, + "learning_rate": 2.135537959944108e-05, + "loss": 1.3231, + "step": 184500 + }, + { + "epoch": 28.72224809812141, + "grad_norm": 4.404980182647705, + "learning_rate": 2.1277751901878592e-05, + "loss": 1.3266, + "step": 185000 + }, + { + "epoch": 28.7998757956839, + "grad_norm": 4.121158599853516, + "learning_rate": 2.1200124204316103e-05, + "loss": 1.3326, + "step": 185500 + }, + { + "epoch": 28.87750349324639, + "grad_norm": 4.212721824645996, + "learning_rate": 2.112249650675361e-05, + "loss": 1.3239, + "step": 186000 + }, + { + "epoch": 28.95513119080888, + "grad_norm": 3.941192626953125, + "learning_rate": 2.104486880919112e-05, + "loss": 1.337, + "step": 186500 + }, + { + "epoch": 29.032758888371372, + "grad_norm": 4.226070404052734, + "learning_rate": 2.096724111162863e-05, + "loss": 1.2999, + "step": 187000 + }, + { + "epoch": 29.11038658593386, + "grad_norm": 4.37491512298584, + "learning_rate": 2.088961341406614e-05, + "loss": 1.2449, + "step": 187500 
+ }, + { + "epoch": 29.18801428349635, + "grad_norm": 4.1313347816467285, + "learning_rate": 2.0811985716503648e-05, + "loss": 1.2655, + "step": 188000 + }, + { + "epoch": 29.26564198105884, + "grad_norm": 4.144821643829346, + "learning_rate": 2.073435801894116e-05, + "loss": 1.2701, + "step": 188500 + }, + { + "epoch": 29.343269678621333, + "grad_norm": 4.262469291687012, + "learning_rate": 2.065673032137867e-05, + "loss": 1.2671, + "step": 189000 + }, + { + "epoch": 29.420897376183824, + "grad_norm": 4.0824761390686035, + "learning_rate": 2.057910262381618e-05, + "loss": 1.2757, + "step": 189500 + }, + { + "epoch": 29.49852507374631, + "grad_norm": 4.00981330871582, + "learning_rate": 2.0501474926253688e-05, + "loss": 1.275, + "step": 190000 + }, + { + "epoch": 29.576152771308802, + "grad_norm": 4.502607822418213, + "learning_rate": 2.04238472286912e-05, + "loss": 1.278, + "step": 190500 + }, + { + "epoch": 29.653780468871293, + "grad_norm": 4.623337745666504, + "learning_rate": 2.0346219531128707e-05, + "loss": 1.2805, + "step": 191000 + }, + { + "epoch": 29.731408166433784, + "grad_norm": 4.471139430999756, + "learning_rate": 2.0268591833566218e-05, + "loss": 1.2761, + "step": 191500 + }, + { + "epoch": 29.809035863996275, + "grad_norm": 4.283520698547363, + "learning_rate": 2.0190964136003725e-05, + "loss": 1.2907, + "step": 192000 + }, + { + "epoch": 29.886663561558763, + "grad_norm": 4.755760192871094, + "learning_rate": 2.011333643844124e-05, + "loss": 1.2887, + "step": 192500 + }, + { + "epoch": 29.964291259121254, + "grad_norm": 4.386314392089844, + "learning_rate": 2.0035708740878747e-05, + "loss": 1.2949, + "step": 193000 + }, + { + "epoch": 30.041918956683745, + "grad_norm": 4.468728542327881, + "learning_rate": 1.9958081043316258e-05, + "loss": 1.2377, + "step": 193500 + }, + { + "epoch": 30.119546654246236, + "grad_norm": 4.082640171051025, + "learning_rate": 1.9880453345753765e-05, + "loss": 1.2118, + "step": 194000 + }, + { + "epoch": 30.197174351808727, + "grad_norm": 4.6380205154418945, + "learning_rate": 1.9802825648191276e-05, + "loss": 1.2211, + "step": 194500 + }, + { + "epoch": 30.274802049371214, + "grad_norm": 4.422779083251953, + "learning_rate": 1.9725197950628784e-05, + "loss": 1.2255, + "step": 195000 + }, + { + "epoch": 30.352429746933705, + "grad_norm": 4.414443016052246, + "learning_rate": 1.9647570253066295e-05, + "loss": 1.2277, + "step": 195500 + }, + { + "epoch": 30.430057444496196, + "grad_norm": 4.212508201599121, + "learning_rate": 1.9569942555503802e-05, + "loss": 1.236, + "step": 196000 + }, + { + "epoch": 30.507685142058687, + "grad_norm": 4.3478803634643555, + "learning_rate": 1.9492314857941317e-05, + "loss": 1.2387, + "step": 196500 + }, + { + "epoch": 30.585312839621178, + "grad_norm": 5.213949203491211, + "learning_rate": 1.9414687160378824e-05, + "loss": 1.2434, + "step": 197000 + }, + { + "epoch": 30.662940537183665, + "grad_norm": 3.907501459121704, + "learning_rate": 1.9337059462816335e-05, + "loss": 1.2415, + "step": 197500 + }, + { + "epoch": 30.740568234746156, + "grad_norm": 4.092105865478516, + "learning_rate": 1.9259431765253843e-05, + "loss": 1.2515, + "step": 198000 + }, + { + "epoch": 30.818195932308647, + "grad_norm": 4.422701835632324, + "learning_rate": 1.9181804067691354e-05, + "loss": 1.2554, + "step": 198500 + }, + { + "epoch": 30.89582362987114, + "grad_norm": 4.132325172424316, + "learning_rate": 1.910417637012886e-05, + "loss": 1.2607, + "step": 199000 + }, + { + "epoch": 30.97345132743363, + "grad_norm": 
4.294840335845947, + "learning_rate": 1.9026548672566372e-05, + "loss": 1.2457, + "step": 199500 + }, + { + "epoch": 31.05107902499612, + "grad_norm": 4.593545913696289, + "learning_rate": 1.894892097500388e-05, + "loss": 1.1969, + "step": 200000 + }, + { + "epoch": 31.128706722558608, + "grad_norm": 3.965829610824585, + "learning_rate": 1.8871293277441394e-05, + "loss": 1.1812, + "step": 200500 + }, + { + "epoch": 31.2063344201211, + "grad_norm": 4.391860008239746, + "learning_rate": 1.87936655798789e-05, + "loss": 1.1764, + "step": 201000 + }, + { + "epoch": 31.28396211768359, + "grad_norm": 4.370110511779785, + "learning_rate": 1.8716037882316413e-05, + "loss": 1.1804, + "step": 201500 + }, + { + "epoch": 31.36158981524608, + "grad_norm": 4.167665958404541, + "learning_rate": 1.863841018475392e-05, + "loss": 1.1993, + "step": 202000 + }, + { + "epoch": 31.439217512808572, + "grad_norm": 4.17106294631958, + "learning_rate": 1.856078248719143e-05, + "loss": 1.1915, + "step": 202500 + }, + { + "epoch": 31.51684521037106, + "grad_norm": 4.328006267547607, + "learning_rate": 1.848315478962894e-05, + "loss": 1.2023, + "step": 203000 + }, + { + "epoch": 31.59447290793355, + "grad_norm": 4.033382415771484, + "learning_rate": 1.840552709206645e-05, + "loss": 1.2049, + "step": 203500 + }, + { + "epoch": 31.67210060549604, + "grad_norm": 4.497017860412598, + "learning_rate": 1.832789939450396e-05, + "loss": 1.2005, + "step": 204000 + }, + { + "epoch": 31.749728303058532, + "grad_norm": 4.34217643737793, + "learning_rate": 1.825027169694147e-05, + "loss": 1.1972, + "step": 204500 + }, + { + "epoch": 31.827356000621023, + "grad_norm": 4.198293209075928, + "learning_rate": 1.817264399937898e-05, + "loss": 1.2119, + "step": 205000 + }, + { + "epoch": 31.90498369818351, + "grad_norm": 4.584846019744873, + "learning_rate": 1.809501630181649e-05, + "loss": 1.2265, + "step": 205500 + }, + { + "epoch": 31.982611395746, + "grad_norm": 4.147974014282227, + "learning_rate": 1.8017388604253997e-05, + "loss": 1.231, + "step": 206000 + }, + { + "epoch": 32.06023909330849, + "grad_norm": 4.133516311645508, + "learning_rate": 1.793976090669151e-05, + "loss": 1.1624, + "step": 206500 + }, + { + "epoch": 32.13786679087098, + "grad_norm": 3.903019905090332, + "learning_rate": 1.7862133209129016e-05, + "loss": 1.1447, + "step": 207000 + }, + { + "epoch": 32.21549448843347, + "grad_norm": 4.349834442138672, + "learning_rate": 1.7784505511566527e-05, + "loss": 1.1472, + "step": 207500 + }, + { + "epoch": 32.29312218599596, + "grad_norm": 5.044727325439453, + "learning_rate": 1.7706877814004038e-05, + "loss": 1.1497, + "step": 208000 + }, + { + "epoch": 32.37074988355845, + "grad_norm": 4.564863681793213, + "learning_rate": 1.762925011644155e-05, + "loss": 1.1568, + "step": 208500 + }, + { + "epoch": 32.448377581120944, + "grad_norm": 4.659034252166748, + "learning_rate": 1.7551622418879056e-05, + "loss": 1.1652, + "step": 209000 + }, + { + "epoch": 32.526005278683435, + "grad_norm": 4.484036445617676, + "learning_rate": 1.7473994721316567e-05, + "loss": 1.1689, + "step": 209500 + }, + { + "epoch": 32.603632976245926, + "grad_norm": 3.8715898990631104, + "learning_rate": 1.7396367023754075e-05, + "loss": 1.1625, + "step": 210000 + }, + { + "epoch": 32.68126067380842, + "grad_norm": 4.791990280151367, + "learning_rate": 1.7318739326191586e-05, + "loss": 1.1649, + "step": 210500 + }, + { + "epoch": 32.75888837137091, + "grad_norm": 4.657315254211426, + "learning_rate": 1.7241111628629093e-05, + "loss": 1.1658, + "step": 
211000 + }, + { + "epoch": 32.83651606893339, + "grad_norm": 4.780379295349121, + "learning_rate": 1.7163483931066604e-05, + "loss": 1.1789, + "step": 211500 + }, + { + "epoch": 32.91414376649588, + "grad_norm": 4.298798561096191, + "learning_rate": 1.7085856233504115e-05, + "loss": 1.1873, + "step": 212000 + }, + { + "epoch": 32.991771464058374, + "grad_norm": 4.570270538330078, + "learning_rate": 1.7008228535941626e-05, + "loss": 1.1736, + "step": 212500 + }, + { + "epoch": 33.069399161620865, + "grad_norm": 4.421665191650391, + "learning_rate": 1.6930600838379134e-05, + "loss": 1.1079, + "step": 213000 + }, + { + "epoch": 33.147026859183356, + "grad_norm": 4.232321739196777, + "learning_rate": 1.6852973140816645e-05, + "loss": 1.0986, + "step": 213500 + }, + { + "epoch": 33.22465455674585, + "grad_norm": 4.439553737640381, + "learning_rate": 1.6775345443254152e-05, + "loss": 1.114, + "step": 214000 + }, + { + "epoch": 33.30228225430834, + "grad_norm": 3.9282166957855225, + "learning_rate": 1.6697717745691663e-05, + "loss": 1.1229, + "step": 214500 + }, + { + "epoch": 33.37990995187083, + "grad_norm": 4.5075907707214355, + "learning_rate": 1.662009004812917e-05, + "loss": 1.1298, + "step": 215000 + }, + { + "epoch": 33.45753764943332, + "grad_norm": 4.296872138977051, + "learning_rate": 1.6542462350566685e-05, + "loss": 1.1271, + "step": 215500 + }, + { + "epoch": 33.53516534699581, + "grad_norm": 3.8833069801330566, + "learning_rate": 1.6464834653004193e-05, + "loss": 1.1334, + "step": 216000 + }, + { + "epoch": 33.6127930445583, + "grad_norm": 4.518033027648926, + "learning_rate": 1.6387206955441703e-05, + "loss": 1.1251, + "step": 216500 + }, + { + "epoch": 33.690420742120786, + "grad_norm": 4.618717670440674, + "learning_rate": 1.630957925787921e-05, + "loss": 1.137, + "step": 217000 + }, + { + "epoch": 33.76804843968328, + "grad_norm": 4.346001148223877, + "learning_rate": 1.6231951560316722e-05, + "loss": 1.1439, + "step": 217500 + }, + { + "epoch": 33.84567613724577, + "grad_norm": 4.203965663909912, + "learning_rate": 1.615432386275423e-05, + "loss": 1.1424, + "step": 218000 + }, + { + "epoch": 33.92330383480826, + "grad_norm": 4.829082489013672, + "learning_rate": 1.607669616519174e-05, + "loss": 1.1476, + "step": 218500 + }, + { + "epoch": 34.00093153237075, + "grad_norm": 4.414132118225098, + "learning_rate": 1.5999068467629248e-05, + "loss": 1.1452, + "step": 219000 + }, + { + "epoch": 34.07855922993324, + "grad_norm": 4.220102787017822, + "learning_rate": 1.5921440770066762e-05, + "loss": 1.0785, + "step": 219500 + }, + { + "epoch": 34.15618692749573, + "grad_norm": 4.156444549560547, + "learning_rate": 1.584381307250427e-05, + "loss": 1.0781, + "step": 220000 + }, + { + "epoch": 34.23381462505822, + "grad_norm": 3.997420072555542, + "learning_rate": 1.576618537494178e-05, + "loss": 1.0911, + "step": 220500 + }, + { + "epoch": 34.311442322620714, + "grad_norm": 4.4925537109375, + "learning_rate": 1.568855767737929e-05, + "loss": 1.0861, + "step": 221000 + }, + { + "epoch": 34.389070020183205, + "grad_norm": 4.4098615646362305, + "learning_rate": 1.56109299798168e-05, + "loss": 1.0984, + "step": 221500 + }, + { + "epoch": 34.46669771774569, + "grad_norm": 4.235119819641113, + "learning_rate": 1.5533302282254307e-05, + "loss": 1.0945, + "step": 222000 + }, + { + "epoch": 34.54432541530818, + "grad_norm": 4.796499729156494, + "learning_rate": 1.5455674584691818e-05, + "loss": 1.0973, + "step": 222500 + }, + { + "epoch": 34.62195311287067, + "grad_norm": 4.959954261779785, + 
"learning_rate": 1.537804688712933e-05, + "loss": 1.0978, + "step": 223000 + }, + { + "epoch": 34.69958081043316, + "grad_norm": 4.675489902496338, + "learning_rate": 1.530041918956684e-05, + "loss": 1.1047, + "step": 223500 + }, + { + "epoch": 34.77720850799565, + "grad_norm": 4.466859340667725, + "learning_rate": 1.5222791492004349e-05, + "loss": 1.093, + "step": 224000 + }, + { + "epoch": 34.85483620555814, + "grad_norm": 4.607345104217529, + "learning_rate": 1.5145163794441858e-05, + "loss": 1.1098, + "step": 224500 + }, + { + "epoch": 34.932463903120635, + "grad_norm": 3.9733870029449463, + "learning_rate": 1.5067536096879367e-05, + "loss": 1.1199, + "step": 225000 + }, + { + "epoch": 35.010091600683126, + "grad_norm": 4.052885055541992, + "learning_rate": 1.4989908399316877e-05, + "loss": 1.1009, + "step": 225500 + }, + { + "epoch": 35.08771929824562, + "grad_norm": 4.508426189422607, + "learning_rate": 1.4912280701754386e-05, + "loss": 1.0394, + "step": 226000 + }, + { + "epoch": 35.16534699580811, + "grad_norm": 4.186591148376465, + "learning_rate": 1.4834653004191895e-05, + "loss": 1.0526, + "step": 226500 + }, + { + "epoch": 35.24297469337059, + "grad_norm": 4.583897590637207, + "learning_rate": 1.4757025306629408e-05, + "loss": 1.0492, + "step": 227000 + }, + { + "epoch": 35.32060239093308, + "grad_norm": 4.202432155609131, + "learning_rate": 1.4679397609066917e-05, + "loss": 1.0575, + "step": 227500 + }, + { + "epoch": 35.39823008849557, + "grad_norm": 4.248536586761475, + "learning_rate": 1.4601769911504426e-05, + "loss": 1.0694, + "step": 228000 + }, + { + "epoch": 35.475857786058064, + "grad_norm": 4.490120887756348, + "learning_rate": 1.4524142213941935e-05, + "loss": 1.0661, + "step": 228500 + }, + { + "epoch": 35.553485483620555, + "grad_norm": 4.558992862701416, + "learning_rate": 1.4446514516379445e-05, + "loss": 1.0683, + "step": 229000 + }, + { + "epoch": 35.631113181183046, + "grad_norm": 4.340649127960205, + "learning_rate": 1.4368886818816954e-05, + "loss": 1.0733, + "step": 229500 + }, + { + "epoch": 35.70874087874554, + "grad_norm": 4.814639091491699, + "learning_rate": 1.4291259121254463e-05, + "loss": 1.0699, + "step": 230000 + }, + { + "epoch": 35.78636857630803, + "grad_norm": 5.107011795043945, + "learning_rate": 1.4213631423691972e-05, + "loss": 1.0785, + "step": 230500 + }, + { + "epoch": 35.86399627387052, + "grad_norm": 4.92033052444458, + "learning_rate": 1.4136003726129485e-05, + "loss": 1.0779, + "step": 231000 + }, + { + "epoch": 35.94162397143301, + "grad_norm": 5.033237457275391, + "learning_rate": 1.4058376028566994e-05, + "loss": 1.0863, + "step": 231500 + }, + { + "epoch": 36.019251668995494, + "grad_norm": 4.0776591300964355, + "learning_rate": 1.3980748331004504e-05, + "loss": 1.0703, + "step": 232000 + }, + { + "epoch": 36.096879366557985, + "grad_norm": 4.491557598114014, + "learning_rate": 1.3903120633442013e-05, + "loss": 1.0207, + "step": 232500 + }, + { + "epoch": 36.174507064120476, + "grad_norm": 4.444462299346924, + "learning_rate": 1.3825492935879522e-05, + "loss": 1.0357, + "step": 233000 + }, + { + "epoch": 36.25213476168297, + "grad_norm": 4.559656143188477, + "learning_rate": 1.3747865238317031e-05, + "loss": 1.0295, + "step": 233500 + }, + { + "epoch": 36.32976245924546, + "grad_norm": 4.09979248046875, + "learning_rate": 1.367023754075454e-05, + "loss": 1.0142, + "step": 234000 + }, + { + "epoch": 36.40739015680795, + "grad_norm": 4.5045084953308105, + "learning_rate": 1.3592609843192053e-05, + "loss": 1.0292, + "step": 234500 
+ }, + { + "epoch": 36.48501785437044, + "grad_norm": 5.544869422912598, + "learning_rate": 1.3514982145629562e-05, + "loss": 1.0371, + "step": 235000 + }, + { + "epoch": 36.56264555193293, + "grad_norm": 4.618766784667969, + "learning_rate": 1.3437354448067072e-05, + "loss": 1.0376, + "step": 235500 + }, + { + "epoch": 36.64027324949542, + "grad_norm": 4.791065216064453, + "learning_rate": 1.3359726750504581e-05, + "loss": 1.0438, + "step": 236000 + }, + { + "epoch": 36.71790094705791, + "grad_norm": 4.122102737426758, + "learning_rate": 1.328209905294209e-05, + "loss": 1.0462, + "step": 236500 + }, + { + "epoch": 36.7955286446204, + "grad_norm": 4.137369632720947, + "learning_rate": 1.32044713553796e-05, + "loss": 1.0444, + "step": 237000 + }, + { + "epoch": 36.87315634218289, + "grad_norm": 4.59998083114624, + "learning_rate": 1.3126843657817109e-05, + "loss": 1.0508, + "step": 237500 + }, + { + "epoch": 36.95078403974538, + "grad_norm": 4.751966953277588, + "learning_rate": 1.3049215960254618e-05, + "loss": 1.0474, + "step": 238000 + }, + { + "epoch": 37.02841173730787, + "grad_norm": 4.363110065460205, + "learning_rate": 1.297158826269213e-05, + "loss": 1.026, + "step": 238500 + }, + { + "epoch": 37.10603943487036, + "grad_norm": 5.005125045776367, + "learning_rate": 1.289396056512964e-05, + "loss": 0.9971, + "step": 239000 + }, + { + "epoch": 37.18366713243285, + "grad_norm": 4.143869400024414, + "learning_rate": 1.2816332867567149e-05, + "loss": 0.9877, + "step": 239500 + }, + { + "epoch": 37.26129482999534, + "grad_norm": 4.527329444885254, + "learning_rate": 1.2738705170004658e-05, + "loss": 0.9914, + "step": 240000 + }, + { + "epoch": 37.338922527557834, + "grad_norm": 3.8393781185150146, + "learning_rate": 1.2661077472442168e-05, + "loss": 1.0098, + "step": 240500 + }, + { + "epoch": 37.416550225120325, + "grad_norm": 4.1036295890808105, + "learning_rate": 1.2583449774879677e-05, + "loss": 1.0058, + "step": 241000 + }, + { + "epoch": 37.494177922682816, + "grad_norm": 4.97705078125, + "learning_rate": 1.2505822077317186e-05, + "loss": 1.0098, + "step": 241500 + }, + { + "epoch": 37.57180562024531, + "grad_norm": 4.289205074310303, + "learning_rate": 1.2428194379754697e-05, + "loss": 1.0117, + "step": 242000 + }, + { + "epoch": 37.64943331780779, + "grad_norm": 4.353816509246826, + "learning_rate": 1.2350566682192206e-05, + "loss": 1.0162, + "step": 242500 + }, + { + "epoch": 37.72706101537028, + "grad_norm": 4.447281837463379, + "learning_rate": 1.2272938984629717e-05, + "loss": 1.0202, + "step": 243000 + }, + { + "epoch": 37.80468871293277, + "grad_norm": 4.254565715789795, + "learning_rate": 1.2195311287067226e-05, + "loss": 1.0252, + "step": 243500 + }, + { + "epoch": 37.882316410495264, + "grad_norm": 4.382399559020996, + "learning_rate": 1.2117683589504736e-05, + "loss": 1.023, + "step": 244000 + }, + { + "epoch": 37.959944108057755, + "grad_norm": 4.591485977172852, + "learning_rate": 1.2040055891942245e-05, + "loss": 1.024, + "step": 244500 + }, + { + "epoch": 38.037571805620246, + "grad_norm": 4.238889217376709, + "learning_rate": 1.1962428194379756e-05, + "loss": 0.996, + "step": 245000 + }, + { + "epoch": 38.11519950318274, + "grad_norm": 5.276005268096924, + "learning_rate": 1.1884800496817265e-05, + "loss": 0.97, + "step": 245500 + }, + { + "epoch": 38.19282720074523, + "grad_norm": 4.318702697753906, + "learning_rate": 1.1807172799254774e-05, + "loss": 0.9679, + "step": 246000 + }, + { + "epoch": 38.27045489830772, + "grad_norm": 4.6534504890441895, + 
"learning_rate": 1.1729545101692284e-05, + "loss": 0.9754, + "step": 246500 + }, + { + "epoch": 38.34808259587021, + "grad_norm": 4.487671375274658, + "learning_rate": 1.1651917404129794e-05, + "loss": 0.9771, + "step": 247000 + }, + { + "epoch": 38.425710293432694, + "grad_norm": 4.206161975860596, + "learning_rate": 1.1574289706567304e-05, + "loss": 0.9824, + "step": 247500 + }, + { + "epoch": 38.503337990995185, + "grad_norm": 4.533993721008301, + "learning_rate": 1.1496662009004813e-05, + "loss": 0.98, + "step": 248000 + }, + { + "epoch": 38.580965688557676, + "grad_norm": 4.58768892288208, + "learning_rate": 1.1419034311442322e-05, + "loss": 0.9891, + "step": 248500 + }, + { + "epoch": 38.65859338612017, + "grad_norm": 4.578085422515869, + "learning_rate": 1.1341406613879833e-05, + "loss": 0.9912, + "step": 249000 + }, + { + "epoch": 38.73622108368266, + "grad_norm": 4.549184799194336, + "learning_rate": 1.1263778916317342e-05, + "loss": 0.998, + "step": 249500 + }, + { + "epoch": 38.81384878124515, + "grad_norm": 4.277008056640625, + "learning_rate": 1.1186151218754852e-05, + "loss": 0.9872, + "step": 250000 + }, + { + "epoch": 38.89147647880764, + "grad_norm": 4.436850070953369, + "learning_rate": 1.1108523521192361e-05, + "loss": 0.9902, + "step": 250500 + }, + { + "epoch": 38.96910417637013, + "grad_norm": 4.574080944061279, + "learning_rate": 1.1030895823629872e-05, + "loss": 1.0062, + "step": 251000 + }, + { + "epoch": 39.04673187393262, + "grad_norm": 4.431211471557617, + "learning_rate": 1.0953268126067381e-05, + "loss": 0.9653, + "step": 251500 + }, + { + "epoch": 39.12435957149511, + "grad_norm": 4.642630100250244, + "learning_rate": 1.087564042850489e-05, + "loss": 0.9415, + "step": 252000 + }, + { + "epoch": 39.2019872690576, + "grad_norm": 4.911776065826416, + "learning_rate": 1.0798012730942401e-05, + "loss": 0.9479, + "step": 252500 + }, + { + "epoch": 39.27961496662009, + "grad_norm": 4.803096771240234, + "learning_rate": 1.072038503337991e-05, + "loss": 0.9548, + "step": 253000 + }, + { + "epoch": 39.35724266418258, + "grad_norm": 4.382226943969727, + "learning_rate": 1.064275733581742e-05, + "loss": 0.9501, + "step": 253500 + }, + { + "epoch": 39.43487036174507, + "grad_norm": 4.663143634796143, + "learning_rate": 1.0565129638254929e-05, + "loss": 0.9671, + "step": 254000 + }, + { + "epoch": 39.51249805930756, + "grad_norm": 4.334278106689453, + "learning_rate": 1.048750194069244e-05, + "loss": 0.9637, + "step": 254500 + }, + { + "epoch": 39.59012575687005, + "grad_norm": 4.499300956726074, + "learning_rate": 1.040987424312995e-05, + "loss": 0.959, + "step": 255000 + }, + { + "epoch": 39.66775345443254, + "grad_norm": 4.04175329208374, + "learning_rate": 1.0332246545567458e-05, + "loss": 0.9625, + "step": 255500 + }, + { + "epoch": 39.74538115199503, + "grad_norm": 4.483138084411621, + "learning_rate": 1.0254618848004968e-05, + "loss": 0.9654, + "step": 256000 + }, + { + "epoch": 39.823008849557525, + "grad_norm": 4.5711140632629395, + "learning_rate": 1.0176991150442479e-05, + "loss": 0.9705, + "step": 256500 + }, + { + "epoch": 39.900636547120016, + "grad_norm": 4.339575290679932, + "learning_rate": 1.0099363452879988e-05, + "loss": 0.971, + "step": 257000 + }, + { + "epoch": 39.9782642446825, + "grad_norm": 4.528174877166748, + "learning_rate": 1.0021735755317497e-05, + "loss": 0.9714, + "step": 257500 + }, + { + "epoch": 40.05589194224499, + "grad_norm": 4.42559289932251, + "learning_rate": 9.944108057755006e-06, + "loss": 0.9325, + "step": 258000 + }, + { + 
"epoch": 40.13351963980748, + "grad_norm": 4.588589191436768, + "learning_rate": 9.866480360192517e-06, + "loss": 0.9248, + "step": 258500 + }, + { + "epoch": 40.21114733736997, + "grad_norm": 5.253052711486816, + "learning_rate": 9.788852662630027e-06, + "loss": 0.9285, + "step": 259000 + }, + { + "epoch": 40.28877503493246, + "grad_norm": 4.5551042556762695, + "learning_rate": 9.711224965067536e-06, + "loss": 0.9384, + "step": 259500 + }, + { + "epoch": 40.366402732494954, + "grad_norm": 4.9546990394592285, + "learning_rate": 9.633597267505045e-06, + "loss": 0.9332, + "step": 260000 + }, + { + "epoch": 40.444030430057445, + "grad_norm": 4.840395450592041, + "learning_rate": 9.555969569942556e-06, + "loss": 0.9321, + "step": 260500 + }, + { + "epoch": 40.521658127619936, + "grad_norm": 4.765369415283203, + "learning_rate": 9.478341872380065e-06, + "loss": 0.9366, + "step": 261000 + }, + { + "epoch": 40.59928582518243, + "grad_norm": 4.869214057922363, + "learning_rate": 9.400714174817574e-06, + "loss": 0.9419, + "step": 261500 + }, + { + "epoch": 40.67691352274492, + "grad_norm": 4.868770599365234, + "learning_rate": 9.323086477255084e-06, + "loss": 0.9431, + "step": 262000 + }, + { + "epoch": 40.7545412203074, + "grad_norm": 5.142333030700684, + "learning_rate": 9.245458779692595e-06, + "loss": 0.9455, + "step": 262500 + }, + { + "epoch": 40.83216891786989, + "grad_norm": 4.263994216918945, + "learning_rate": 9.167831082130104e-06, + "loss": 0.9497, + "step": 263000 + }, + { + "epoch": 40.909796615432384, + "grad_norm": 4.486149311065674, + "learning_rate": 9.090203384567613e-06, + "loss": 0.9484, + "step": 263500 + }, + { + "epoch": 40.987424312994875, + "grad_norm": 4.359130859375, + "learning_rate": 9.012575687005124e-06, + "loss": 0.9441, + "step": 264000 + }, + { + "epoch": 41.065052010557366, + "grad_norm": 4.38929557800293, + "learning_rate": 8.934947989442633e-06, + "loss": 0.9057, + "step": 264500 + }, + { + "epoch": 41.14267970811986, + "grad_norm": 4.379587650299072, + "learning_rate": 8.857320291880143e-06, + "loss": 0.9024, + "step": 265000 + }, + { + "epoch": 41.22030740568235, + "grad_norm": 4.549973964691162, + "learning_rate": 8.779692594317652e-06, + "loss": 0.9116, + "step": 265500 + }, + { + "epoch": 41.29793510324484, + "grad_norm": 4.387326240539551, + "learning_rate": 8.702064896755163e-06, + "loss": 0.9132, + "step": 266000 + }, + { + "epoch": 41.37556280080733, + "grad_norm": 4.824013710021973, + "learning_rate": 8.624437199192672e-06, + "loss": 0.9048, + "step": 266500 + }, + { + "epoch": 41.45319049836982, + "grad_norm": 4.79560661315918, + "learning_rate": 8.546809501630181e-06, + "loss": 0.9142, + "step": 267000 + }, + { + "epoch": 41.53081819593231, + "grad_norm": 4.503738880157471, + "learning_rate": 8.46918180406769e-06, + "loss": 0.92, + "step": 267500 + }, + { + "epoch": 41.608445893494796, + "grad_norm": 4.430568218231201, + "learning_rate": 8.391554106505201e-06, + "loss": 0.9258, + "step": 268000 + }, + { + "epoch": 41.68607359105729, + "grad_norm": 4.630665302276611, + "learning_rate": 8.31392640894271e-06, + "loss": 0.9226, + "step": 268500 + }, + { + "epoch": 41.76370128861978, + "grad_norm": 4.298410415649414, + "learning_rate": 8.23629871138022e-06, + "loss": 0.9264, + "step": 269000 + }, + { + "epoch": 41.84132898618227, + "grad_norm": 4.575562000274658, + "learning_rate": 8.15867101381773e-06, + "loss": 0.9194, + "step": 269500 + }, + { + "epoch": 41.91895668374476, + "grad_norm": 4.254932880401611, + "learning_rate": 8.08104331625524e-06, + 
"loss": 0.9339, + "step": 270000 + }, + { + "epoch": 41.99658438130725, + "grad_norm": 4.799808502197266, + "learning_rate": 8.00341561869275e-06, + "loss": 0.9262, + "step": 270500 + }, + { + "epoch": 42.07421207886974, + "grad_norm": 4.432214260101318, + "learning_rate": 7.925787921130259e-06, + "loss": 0.8875, + "step": 271000 + }, + { + "epoch": 42.15183977643223, + "grad_norm": 4.276678085327148, + "learning_rate": 7.84816022356777e-06, + "loss": 0.8923, + "step": 271500 + }, + { + "epoch": 42.229467473994724, + "grad_norm": 5.178389072418213, + "learning_rate": 7.770532526005279e-06, + "loss": 0.8835, + "step": 272000 + }, + { + "epoch": 42.307095171557215, + "grad_norm": 4.696712017059326, + "learning_rate": 7.692904828442788e-06, + "loss": 0.8872, + "step": 272500 + }, + { + "epoch": 42.3847228691197, + "grad_norm": 4.507452011108398, + "learning_rate": 7.615277130880298e-06, + "loss": 0.892, + "step": 273000 + }, + { + "epoch": 42.46235056668219, + "grad_norm": 4.397420883178711, + "learning_rate": 7.537649433317809e-06, + "loss": 0.9004, + "step": 273500 + }, + { + "epoch": 42.53997826424468, + "grad_norm": 4.42085599899292, + "learning_rate": 7.460021735755318e-06, + "loss": 0.9006, + "step": 274000 + }, + { + "epoch": 42.61760596180717, + "grad_norm": 4.6971306800842285, + "learning_rate": 7.3823940381928275e-06, + "loss": 0.8923, + "step": 274500 + }, + { + "epoch": 42.69523365936966, + "grad_norm": 4.580519199371338, + "learning_rate": 7.304766340630337e-06, + "loss": 0.8984, + "step": 275000 + }, + { + "epoch": 42.772861356932154, + "grad_norm": 4.263189315795898, + "learning_rate": 7.227138643067848e-06, + "loss": 0.9049, + "step": 275500 + }, + { + "epoch": 42.850489054494645, + "grad_norm": 4.588529586791992, + "learning_rate": 7.149510945505357e-06, + "loss": 0.9078, + "step": 276000 + }, + { + "epoch": 42.928116752057136, + "grad_norm": 4.9102559089660645, + "learning_rate": 7.071883247942866e-06, + "loss": 0.9073, + "step": 276500 + }, + { + "epoch": 43.00574444961963, + "grad_norm": 4.7918853759765625, + "learning_rate": 6.994255550380375e-06, + "loss": 0.9072, + "step": 277000 + }, + { + "epoch": 43.08337214718212, + "grad_norm": 3.824863910675049, + "learning_rate": 6.916627852817886e-06, + "loss": 0.8697, + "step": 277500 + }, + { + "epoch": 43.1609998447446, + "grad_norm": 4.692780017852783, + "learning_rate": 6.839000155255396e-06, + "loss": 0.8758, + "step": 278000 + }, + { + "epoch": 43.23862754230709, + "grad_norm": 5.024048805236816, + "learning_rate": 6.761372457692905e-06, + "loss": 0.8725, + "step": 278500 + }, + { + "epoch": 43.316255239869584, + "grad_norm": 4.9430975914001465, + "learning_rate": 6.683744760130414e-06, + "loss": 0.8739, + "step": 279000 + }, + { + "epoch": 43.393882937432075, + "grad_norm": 4.70835542678833, + "learning_rate": 6.606117062567925e-06, + "loss": 0.8774, + "step": 279500 + }, + { + "epoch": 43.471510634994566, + "grad_norm": 4.474407196044922, + "learning_rate": 6.528489365005434e-06, + "loss": 0.8788, + "step": 280000 + }, + { + "epoch": 43.54913833255706, + "grad_norm": 4.508847713470459, + "learning_rate": 6.4508616674429435e-06, + "loss": 0.8812, + "step": 280500 + }, + { + "epoch": 43.62676603011955, + "grad_norm": 4.584230422973633, + "learning_rate": 6.373233969880453e-06, + "loss": 0.8787, + "step": 281000 + }, + { + "epoch": 43.70439372768204, + "grad_norm": 4.892379283905029, + "learning_rate": 6.295606272317964e-06, + "loss": 0.8883, + "step": 281500 + }, + { + "epoch": 43.78202142524453, + "grad_norm": 
4.759417533874512, + "learning_rate": 6.217978574755473e-06, + "loss": 0.885, + "step": 282000 + }, + { + "epoch": 43.85964912280702, + "grad_norm": 4.658566474914551, + "learning_rate": 6.140350877192982e-06, + "loss": 0.8799, + "step": 282500 + }, + { + "epoch": 43.937276820369505, + "grad_norm": 4.660683631896973, + "learning_rate": 6.062723179630492e-06, + "loss": 0.8899, + "step": 283000 + }, + { + "epoch": 44.014904517931996, + "grad_norm": 4.208764553070068, + "learning_rate": 5.985095482068002e-06, + "loss": 0.8801, + "step": 283500 + }, + { + "epoch": 44.09253221549449, + "grad_norm": 4.277160167694092, + "learning_rate": 5.907467784505512e-06, + "loss": 0.854, + "step": 284000 + }, + { + "epoch": 44.17015991305698, + "grad_norm": 4.98652982711792, + "learning_rate": 5.829840086943022e-06, + "loss": 0.8548, + "step": 284500 + }, + { + "epoch": 44.24778761061947, + "grad_norm": 4.677061557769775, + "learning_rate": 5.752212389380531e-06, + "loss": 0.8661, + "step": 285000 + }, + { + "epoch": 44.32541530818196, + "grad_norm": 4.650174617767334, + "learning_rate": 5.674584691818041e-06, + "loss": 0.8626, + "step": 285500 + }, + { + "epoch": 44.40304300574445, + "grad_norm": 4.145635604858398, + "learning_rate": 5.59695699425555e-06, + "loss": 0.8635, + "step": 286000 + }, + { + "epoch": 44.48067070330694, + "grad_norm": 4.334202766418457, + "learning_rate": 5.51932929669306e-06, + "loss": 0.8633, + "step": 286500 + }, + { + "epoch": 44.55829840086943, + "grad_norm": 4.45126485824585, + "learning_rate": 5.44170159913057e-06, + "loss": 0.863, + "step": 287000 + }, + { + "epoch": 44.63592609843192, + "grad_norm": 4.916016578674316, + "learning_rate": 5.36407390156808e-06, + "loss": 0.8687, + "step": 287500 + }, + { + "epoch": 44.71355379599441, + "grad_norm": 4.656139373779297, + "learning_rate": 5.286446204005589e-06, + "loss": 0.8665, + "step": 288000 + }, + { + "epoch": 44.7911814935569, + "grad_norm": 4.845007419586182, + "learning_rate": 5.208818506443099e-06, + "loss": 0.8681, + "step": 288500 + }, + { + "epoch": 44.86880919111939, + "grad_norm": 4.315593242645264, + "learning_rate": 5.131190808880608e-06, + "loss": 0.863, + "step": 289000 + }, + { + "epoch": 44.94643688868188, + "grad_norm": 4.265692710876465, + "learning_rate": 5.053563111318118e-06, + "loss": 0.8605, + "step": 289500 + }, + { + "epoch": 45.02406458624437, + "grad_norm": 4.859785079956055, + "learning_rate": 4.975935413755628e-06, + "loss": 0.8637, + "step": 290000 + }, + { + "epoch": 45.10169228380686, + "grad_norm": 4.233875751495361, + "learning_rate": 4.898307716193138e-06, + "loss": 0.8408, + "step": 290500 + }, + { + "epoch": 45.17931998136935, + "grad_norm": 4.796300411224365, + "learning_rate": 4.820680018630647e-06, + "loss": 0.85, + "step": 291000 + }, + { + "epoch": 45.256947678931844, + "grad_norm": 4.32379150390625, + "learning_rate": 4.743052321068157e-06, + "loss": 0.8455, + "step": 291500 + }, + { + "epoch": 45.334575376494335, + "grad_norm": 4.826063632965088, + "learning_rate": 4.665424623505667e-06, + "loss": 0.853, + "step": 292000 + }, + { + "epoch": 45.412203074056826, + "grad_norm": 4.197807312011719, + "learning_rate": 4.587796925943176e-06, + "loss": 0.8563, + "step": 292500 + }, + { + "epoch": 45.48983077161931, + "grad_norm": 4.949887275695801, + "learning_rate": 4.5101692283806865e-06, + "loss": 0.8478, + "step": 293000 + }, + { + "epoch": 45.5674584691818, + "grad_norm": 4.073297023773193, + "learning_rate": 4.432541530818196e-06, + "loss": 0.8502, + "step": 293500 + }, + { + 
"epoch": 45.64508616674429, + "grad_norm": 4.890108108520508, + "learning_rate": 4.354913833255706e-06, + "loss": 0.8482, + "step": 294000 + }, + { + "epoch": 45.72271386430678, + "grad_norm": 4.2948079109191895, + "learning_rate": 4.277286135693216e-06, + "loss": 0.847, + "step": 294500 + }, + { + "epoch": 45.800341561869274, + "grad_norm": 4.1356425285339355, + "learning_rate": 4.199658438130725e-06, + "loss": 0.8543, + "step": 295000 + }, + { + "epoch": 45.877969259431765, + "grad_norm": 4.8358001708984375, + "learning_rate": 4.122030740568235e-06, + "loss": 0.8519, + "step": 295500 + }, + { + "epoch": 45.955596956994256, + "grad_norm": 4.316599369049072, + "learning_rate": 4.0444030430057445e-06, + "loss": 0.8518, + "step": 296000 + }, + { + "epoch": 46.03322465455675, + "grad_norm": 5.166982173919678, + "learning_rate": 3.9667753454432546e-06, + "loss": 0.837, + "step": 296500 + }, + { + "epoch": 46.11085235211924, + "grad_norm": 5.095579624176025, + "learning_rate": 3.889147647880765e-06, + "loss": 0.8304, + "step": 297000 + }, + { + "epoch": 46.18848004968173, + "grad_norm": 4.376230716705322, + "learning_rate": 3.8115199503182735e-06, + "loss": 0.8317, + "step": 297500 + }, + { + "epoch": 46.26610774724422, + "grad_norm": 4.394167900085449, + "learning_rate": 3.7338922527557836e-06, + "loss": 0.8334, + "step": 298000 + }, + { + "epoch": 46.343735444806704, + "grad_norm": 4.203426361083984, + "learning_rate": 3.656264555193293e-06, + "loss": 0.8282, + "step": 298500 + }, + { + "epoch": 46.421363142369195, + "grad_norm": 4.700695991516113, + "learning_rate": 3.578636857630803e-06, + "loss": 0.8387, + "step": 299000 + }, + { + "epoch": 46.498990839931686, + "grad_norm": 4.512545585632324, + "learning_rate": 3.501009160068312e-06, + "loss": 0.8371, + "step": 299500 + }, + { + "epoch": 46.57661853749418, + "grad_norm": 4.69306755065918, + "learning_rate": 3.4233814625058222e-06, + "loss": 0.8328, + "step": 300000 + }, + { + "epoch": 46.65424623505667, + "grad_norm": 4.748707294464111, + "learning_rate": 3.3457537649433315e-06, + "loss": 0.8387, + "step": 300500 + }, + { + "epoch": 46.73187393261916, + "grad_norm": 4.850402355194092, + "learning_rate": 3.2681260673808416e-06, + "loss": 0.8433, + "step": 301000 + }, + { + "epoch": 46.80950163018165, + "grad_norm": 4.6922197341918945, + "learning_rate": 3.1904983698183512e-06, + "loss": 0.8437, + "step": 301500 + }, + { + "epoch": 46.88712932774414, + "grad_norm": 4.400567054748535, + "learning_rate": 3.112870672255861e-06, + "loss": 0.8395, + "step": 302000 + }, + { + "epoch": 46.96475702530663, + "grad_norm": 4.891355037689209, + "learning_rate": 3.0352429746933706e-06, + "loss": 0.8376, + "step": 302500 + }, + { + "epoch": 47.04238472286912, + "grad_norm": 4.655758857727051, + "learning_rate": 2.9576152771308803e-06, + "loss": 0.8284, + "step": 303000 + }, + { + "epoch": 47.12001242043161, + "grad_norm": 4.718132972717285, + "learning_rate": 2.87998757956839e-06, + "loss": 0.8187, + "step": 303500 + }, + { + "epoch": 47.1976401179941, + "grad_norm": 4.415502071380615, + "learning_rate": 2.8023598820059e-06, + "loss": 0.8213, + "step": 304000 + }, + { + "epoch": 47.27526781555659, + "grad_norm": 5.419862270355225, + "learning_rate": 2.7247321844434097e-06, + "loss": 0.8256, + "step": 304500 + }, + { + "epoch": 47.35289551311908, + "grad_norm": 4.600099563598633, + "learning_rate": 2.6471044868809193e-06, + "loss": 0.8259, + "step": 305000 + }, + { + "epoch": 47.43052321068157, + "grad_norm": 5.056214332580566, + "learning_rate": 
2.569476789318429e-06, + "loss": 0.8232, + "step": 305500 + }, + { + "epoch": 47.50815090824406, + "grad_norm": 4.458391189575195, + "learning_rate": 2.4918490917559387e-06, + "loss": 0.8297, + "step": 306000 + }, + { + "epoch": 47.58577860580655, + "grad_norm": 4.724514961242676, + "learning_rate": 2.4142213941934484e-06, + "loss": 0.8257, + "step": 306500 + }, + { + "epoch": 47.663406303369044, + "grad_norm": 4.462941646575928, + "learning_rate": 2.336593696630958e-06, + "loss": 0.8265, + "step": 307000 + }, + { + "epoch": 47.741034000931535, + "grad_norm": 4.594760417938232, + "learning_rate": 2.2589659990684677e-06, + "loss": 0.8285, + "step": 307500 + }, + { + "epoch": 47.818661698494026, + "grad_norm": 4.6404032707214355, + "learning_rate": 2.1813383015059778e-06, + "loss": 0.8261, + "step": 308000 + }, + { + "epoch": 47.89628939605651, + "grad_norm": 3.944291830062866, + "learning_rate": 2.1037106039434874e-06, + "loss": 0.834, + "step": 308500 + }, + { + "epoch": 47.973917093619, + "grad_norm": 4.836678504943848, + "learning_rate": 2.026082906380997e-06, + "loss": 0.827, + "step": 309000 + }, + { + "epoch": 48.05154479118149, + "grad_norm": 4.680452823638916, + "learning_rate": 1.9484552088185068e-06, + "loss": 0.8142, + "step": 309500 + }, + { + "epoch": 48.12917248874398, + "grad_norm": 5.229122161865234, + "learning_rate": 1.8708275112560162e-06, + "loss": 0.8151, + "step": 310000 + }, + { + "epoch": 48.206800186306474, + "grad_norm": 4.585724353790283, + "learning_rate": 1.7931998136935261e-06, + "loss": 0.8188, + "step": 310500 + }, + { + "epoch": 48.284427883868965, + "grad_norm": 4.325538158416748, + "learning_rate": 1.7155721161310358e-06, + "loss": 0.8115, + "step": 311000 + }, + { + "epoch": 48.362055581431456, + "grad_norm": 4.884690761566162, + "learning_rate": 1.6379444185685455e-06, + "loss": 0.8105, + "step": 311500 + }, + { + "epoch": 48.43968327899395, + "grad_norm": 4.815389633178711, + "learning_rate": 1.5603167210060551e-06, + "loss": 0.814, + "step": 312000 + }, + { + "epoch": 48.51731097655644, + "grad_norm": 4.258877277374268, + "learning_rate": 1.4826890234435648e-06, + "loss": 0.814, + "step": 312500 + }, + { + "epoch": 48.59493867411893, + "grad_norm": 4.596804618835449, + "learning_rate": 1.4050613258810745e-06, + "loss": 0.8168, + "step": 313000 + }, + { + "epoch": 48.67256637168141, + "grad_norm": 4.754199504852295, + "learning_rate": 1.3274336283185841e-06, + "loss": 0.8205, + "step": 313500 + }, + { + "epoch": 48.7501940692439, + "grad_norm": 4.652686595916748, + "learning_rate": 1.2498059307560938e-06, + "loss": 0.818, + "step": 314000 + }, + { + "epoch": 48.827821766806395, + "grad_norm": 4.778179168701172, + "learning_rate": 1.1721782331936035e-06, + "loss": 0.8215, + "step": 314500 + }, + { + "epoch": 48.905449464368886, + "grad_norm": 4.835714817047119, + "learning_rate": 1.0945505356311131e-06, + "loss": 0.8184, + "step": 315000 + }, + { + "epoch": 48.98307716193138, + "grad_norm": 4.331784725189209, + "learning_rate": 1.016922838068623e-06, + "loss": 0.8149, + "step": 315500 + }, + { + "epoch": 49.06070485949387, + "grad_norm": 4.657207012176514, + "learning_rate": 9.392951405061327e-07, + "loss": 0.8137, + "step": 316000 + }, + { + "epoch": 49.13833255705636, + "grad_norm": 4.450284481048584, + "learning_rate": 8.616674429436423e-07, + "loss": 0.8115, + "step": 316500 + }, + { + "epoch": 49.21596025461885, + "grad_norm": 3.921935558319092, + "learning_rate": 7.84039745381152e-07, + "loss": 0.8102, + "step": 317000 + }, + { + "epoch": 
49.29358795218134, + "grad_norm": 4.742419719696045, + "learning_rate": 7.064120478186618e-07, + "loss": 0.8069, + "step": 317500 + }, + { + "epoch": 49.37121564974383, + "grad_norm": 4.7592387199401855, + "learning_rate": 6.287843502561715e-07, + "loss": 0.8111, + "step": 318000 + }, + { + "epoch": 49.44884334730632, + "grad_norm": 4.364270210266113, + "learning_rate": 5.511566526936811e-07, + "loss": 0.8044, + "step": 318500 + }, + { + "epoch": 49.526471044868806, + "grad_norm": 4.5575337409973145, + "learning_rate": 4.735289551311908e-07, + "loss": 0.8007, + "step": 319000 + }, + { + "epoch": 49.6040987424313, + "grad_norm": 4.399910926818848, + "learning_rate": 3.9590125756870057e-07, + "loss": 0.8097, + "step": 319500 + }, + { + "epoch": 49.68172643999379, + "grad_norm": 4.863783836364746, + "learning_rate": 3.1827356000621023e-07, + "loss": 0.8093, + "step": 320000 + }, + { + "epoch": 49.75935413755628, + "grad_norm": 4.700865745544434, + "learning_rate": 2.4064586244371996e-07, + "loss": 0.812, + "step": 320500 + }, + { + "epoch": 49.83698183511877, + "grad_norm": 4.929879188537598, + "learning_rate": 1.6301816488122962e-07, + "loss": 0.8121, + "step": 321000 + }, + { + "epoch": 49.91460953268126, + "grad_norm": 4.459561347961426, + "learning_rate": 8.539046731873933e-08, + "loss": 0.8108, + "step": 321500 + }, + { + "epoch": 49.99223723024375, + "grad_norm": 4.53715181350708, + "learning_rate": 7.76276975624903e-09, + "loss": 0.8126, + "step": 322000 + }, + { + "epoch": 50.0, + "step": 322050, + "total_flos": 9.94521893679661e+17, + "train_loss": 1.8893472661618176, + "train_runtime": 93675.3384, + "train_samples_per_second": 110.014, + "train_steps_per_second": 3.438 + } + ], + "logging_steps": 500, + "max_steps": 322050, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.94521893679661e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}
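For readers who want to work with this state file rather than scan the raw diff, here is a minimal sketch of pulling the logged loss curve out of `log_history`. It assumes the file is saved under the Trainer's default name `trainer_state.json` in the working directory and that `matplotlib` is installed; it is an illustration, not part of the training run recorded above.

```python
# Sketch: inspect the trainer_state.json added in this diff.
# Every log_history entry except the final summary carries "step" and "loss".
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

steps = [entry["step"] for entry in state["log_history"] if "loss" in entry]
losses = [entry["loss"] for entry in state["log_history"] if "loss" in entry]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title(
    f"{state['num_train_epochs']} epochs, batch size {state['train_batch_size']}"
)
plt.show()
```

The same pattern works for `learning_rate` or `grad_norm`: filter `log_history` on the key of interest and plot it against `step`.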