{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8313847752663029, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 31.813573837280273, "learning_rate": 5e-06, "loss": 9.1179, "step": 5 }, { "epoch": 0.0, "grad_norm": 27.920026779174805, "learning_rate": 1e-05, "loss": 8.6958, "step": 10 }, { "epoch": 0.0, "grad_norm": 153.7402801513672, "learning_rate": 1.5e-05, "loss": 8.1658, "step": 15 }, { "epoch": 0.0, "grad_norm": 7.882206916809082, "learning_rate": 2e-05, "loss": 7.78, "step": 20 }, { "epoch": 0.01, "grad_norm": 8.389701843261719, "learning_rate": 2.5e-05, "loss": 7.5562, "step": 25 }, { "epoch": 0.01, "grad_norm": 5.589534759521484, "learning_rate": 3e-05, "loss": 7.4236, "step": 30 }, { "epoch": 0.01, "grad_norm": 4.589334487915039, "learning_rate": 3.5e-05, "loss": 7.208, "step": 35 }, { "epoch": 0.01, "grad_norm": 53.45998001098633, "learning_rate": 4e-05, "loss": 6.9417, "step": 40 }, { "epoch": 0.01, "grad_norm": 6.914029598236084, "learning_rate": 4.5e-05, "loss": 6.6974, "step": 45 }, { "epoch": 0.01, "grad_norm": 7.271754264831543, "learning_rate": 5e-05, "loss": 6.4774, "step": 50 }, { "epoch": 0.01, "grad_norm": 50.406898498535156, "learning_rate": 5.500000000000001e-05, "loss": 6.2127, "step": 55 }, { "epoch": 0.01, "grad_norm": 5.283674240112305, "learning_rate": 6e-05, "loss": 6.1023, "step": 60 }, { "epoch": 0.01, "grad_norm": 5.251434326171875, "learning_rate": 6.500000000000001e-05, "loss": 5.8973, "step": 65 }, { "epoch": 0.01, "grad_norm": 3.821683406829834, "learning_rate": 7e-05, "loss": 5.6945, "step": 70 }, { "epoch": 0.02, "grad_norm": 3.5881142616271973, "learning_rate": 7.500000000000001e-05, "loss": 5.4818, "step": 75 }, { "epoch": 0.02, "grad_norm": 3.2939279079437256, "learning_rate": 8e-05, "loss": 5.3125, "step": 80 }, { "epoch": 0.02, "grad_norm": 3.72703218460083, "learning_rate": 8.5e-05, "loss": 5.1715, "step": 85 }, { "epoch": 0.02, "grad_norm": 3.7567694187164307, "learning_rate": 9e-05, "loss": 4.9456, "step": 90 }, { "epoch": 0.02, "grad_norm": 3.2102720737457275, "learning_rate": 9.5e-05, "loss": 4.8999, "step": 95 }, { "epoch": 0.02, "grad_norm": 2.9953739643096924, "learning_rate": 0.0001, "loss": 4.699, "step": 100 }, { "epoch": 0.02, "grad_norm": 2.5861263275146484, "learning_rate": 9.999972205865686e-05, "loss": 4.58, "step": 105 }, { "epoch": 0.02, "grad_norm": 3.5365688800811768, "learning_rate": 9.999888823771751e-05, "loss": 4.4493, "step": 110 }, { "epoch": 0.02, "grad_norm": 3.0703377723693848, "learning_rate": 9.999749854645204e-05, "loss": 4.4265, "step": 115 }, { "epoch": 0.02, "grad_norm": 5.441176414489746, "learning_rate": 9.99955530003106e-05, "loss": 4.3361, "step": 120 }, { "epoch": 0.03, "grad_norm": 2.3856356143951416, "learning_rate": 9.99930516209231e-05, "loss": 4.3491, "step": 125 }, { "epoch": 0.03, "grad_norm": 55.84754943847656, "learning_rate": 9.998999443609897e-05, "loss": 4.1683, "step": 130 }, { "epoch": 0.03, "grad_norm": 3.092285633087158, "learning_rate": 9.998638147982696e-05, "loss": 4.1626, "step": 135 }, { "epoch": 0.03, "grad_norm": 2.619375228881836, "learning_rate": 9.998221279227467e-05, "loss": 4.0489, "step": 140 }, { "epoch": 0.03, "grad_norm": 2.414034843444824, "learning_rate": 9.997748841978812e-05, "loss": 3.9887, "step": 145 }, { "epoch": 0.03, "grad_norm": 2.347266435623169, "learning_rate": 9.997220841489122e-05, "loss": 3.8839, "step": 150 }, { "epoch": 0.03, "grad_norm": 2.6054680347442627, "learning_rate": 9.996637283628528e-05, "loss": 3.8374, "step": 155 }, { "epoch": 0.03, "grad_norm": 2.4477622509002686, "learning_rate": 9.995998174884821e-05, "loss": 3.7885, "step": 160 }, { "epoch": 0.03, "grad_norm": 2.374764919281006, "learning_rate": 9.995303522363394e-05, "loss": 3.6678, "step": 165 }, { "epoch": 0.04, "grad_norm": 2.28847599029541, "learning_rate": 9.99455333378715e-05, "loss": 3.7301, "step": 170 }, { "epoch": 0.04, "grad_norm": 2.0600550174713135, "learning_rate": 9.993747617496428e-05, "loss": 3.6988, "step": 175 }, { "epoch": 0.04, "grad_norm": 2.428739309310913, "learning_rate": 9.9928863824489e-05, "loss": 3.6342, "step": 180 }, { "epoch": 0.04, "grad_norm": 2.138685941696167, "learning_rate": 9.99196963821948e-05, "loss": 3.507, "step": 185 }, { "epoch": 0.04, "grad_norm": 2.2313568592071533, "learning_rate": 9.990997395000217e-05, "loss": 3.5209, "step": 190 }, { "epoch": 0.04, "grad_norm": 2.0725700855255127, "learning_rate": 9.989969663600169e-05, "loss": 3.4709, "step": 195 }, { "epoch": 0.04, "grad_norm": 1.8447208404541016, "learning_rate": 9.9888864554453e-05, "loss": 3.4876, "step": 200 }, { "epoch": 0.04, "grad_norm": 2.315260410308838, "learning_rate": 9.987747782578342e-05, "loss": 3.5003, "step": 205 }, { "epoch": 0.04, "grad_norm": 2.2463152408599854, "learning_rate": 9.986553657658668e-05, "loss": 3.4555, "step": 210 }, { "epoch": 0.04, "grad_norm": 2.66461181640625, "learning_rate": 9.985304093962145e-05, "loss": 3.3826, "step": 215 }, { "epoch": 0.05, "grad_norm": 2.163248062133789, "learning_rate": 9.983999105380988e-05, "loss": 3.329, "step": 220 }, { "epoch": 0.05, "grad_norm": 1.7948611974716187, "learning_rate": 9.982638706423608e-05, "loss": 3.3666, "step": 225 }, { "epoch": 0.05, "grad_norm": 1.7669121026992798, "learning_rate": 9.98122291221445e-05, "loss": 3.3379, "step": 230 }, { "epoch": 0.05, "grad_norm": 1.7311700582504272, "learning_rate": 9.979751738493826e-05, "loss": 3.3163, "step": 235 }, { "epoch": 0.05, "grad_norm": 1.7457441091537476, "learning_rate": 9.978225201617732e-05, "loss": 3.292, "step": 240 }, { "epoch": 0.05, "grad_norm": 4.000087738037109, "learning_rate": 9.976643318557678e-05, "loss": 3.2588, "step": 245 }, { "epoch": 0.05, "grad_norm": 2.4156339168548584, "learning_rate": 9.975006106900495e-05, "loss": 3.244, "step": 250 }, { "epoch": 0.05, "grad_norm": 1.8009788990020752, "learning_rate": 9.973313584848132e-05, "loss": 3.2657, "step": 255 }, { "epoch": 0.05, "grad_norm": 2.0066516399383545, "learning_rate": 9.971565771217464e-05, "loss": 3.1446, "step": 260 }, { "epoch": 0.06, "grad_norm": 2.2172601222991943, "learning_rate": 9.969762685440076e-05, "loss": 3.3102, "step": 265 }, { "epoch": 0.06, "grad_norm": 1.6497077941894531, "learning_rate": 9.967904347562054e-05, "loss": 3.1381, "step": 270 }, { "epoch": 0.06, "grad_norm": 1.7990940809249878, "learning_rate": 9.965990778243755e-05, "loss": 3.2215, "step": 275 }, { "epoch": 0.06, "grad_norm": 2.077507734298706, "learning_rate": 9.964021998759577e-05, "loss": 3.2076, "step": 280 }, { "epoch": 0.06, "grad_norm": 1.731491208076477, "learning_rate": 9.961998030997733e-05, "loss": 3.273, "step": 285 }, { "epoch": 0.06, "grad_norm": 1.7491103410720825, "learning_rate": 9.95991889745999e-05, "loss": 3.2319, "step": 290 }, { "epoch": 0.06, "grad_norm": 1.697852373123169, "learning_rate": 9.957784621261441e-05, "loss": 3.1617, "step": 295 }, { "epoch": 0.06, "grad_norm": 1.6772892475128174, "learning_rate": 9.955595226130226e-05, "loss": 3.0351, "step": 300 }, { "epoch": 0.06, "grad_norm": 1.686403512954712, "learning_rate": 9.953350736407282e-05, "loss": 3.1364, "step": 305 }, { "epoch": 0.06, "grad_norm": 1.605745553970337, "learning_rate": 9.951051177046069e-05, "loss": 3.1433, "step": 310 }, { "epoch": 0.07, "grad_norm": 1.5012344121932983, "learning_rate": 9.948696573612292e-05, "loss": 3.1276, "step": 315 }, { "epoch": 0.07, "grad_norm": 1.677473783493042, "learning_rate": 9.946286952283618e-05, "loss": 3.1291, "step": 320 }, { "epoch": 0.07, "grad_norm": 1.7051786184310913, "learning_rate": 9.943822339849381e-05, "loss": 3.0843, "step": 325 }, { "epoch": 0.07, "grad_norm": 1.4949860572814941, "learning_rate": 9.941302763710288e-05, "loss": 3.0474, "step": 330 }, { "epoch": 0.07, "grad_norm": 1.5325379371643066, "learning_rate": 9.938728251878116e-05, "loss": 3.1196, "step": 335 }, { "epoch": 0.07, "grad_norm": 1.5314396619796753, "learning_rate": 9.936098832975393e-05, "loss": 3.0491, "step": 340 }, { "epoch": 0.07, "grad_norm": 1.7919921875, "learning_rate": 9.933414536235091e-05, "loss": 2.9929, "step": 345 }, { "epoch": 0.07, "grad_norm": 1.5717966556549072, "learning_rate": 9.93067539150029e-05, "loss": 3.0304, "step": 350 }, { "epoch": 0.07, "grad_norm": 4.389132976531982, "learning_rate": 9.927881429223853e-05, "loss": 3.0005, "step": 355 }, { "epoch": 0.07, "grad_norm": 1.6020395755767822, "learning_rate": 9.925032680468085e-05, "loss": 3.071, "step": 360 }, { "epoch": 0.08, "grad_norm": 1.7732460498809814, "learning_rate": 9.922129176904388e-05, "loss": 3.0712, "step": 365 }, { "epoch": 0.08, "grad_norm": 1.502739667892456, "learning_rate": 9.919170950812911e-05, "loss": 3.0481, "step": 370 }, { "epoch": 0.08, "grad_norm": 1.4535770416259766, "learning_rate": 9.916158035082184e-05, "loss": 3.0377, "step": 375 }, { "epoch": 0.08, "grad_norm": 1.457908272743225, "learning_rate": 9.913090463208763e-05, "loss": 3.0151, "step": 380 }, { "epoch": 0.08, "grad_norm": 1.3857684135437012, "learning_rate": 9.90996826929685e-05, "loss": 3.0602, "step": 385 }, { "epoch": 0.08, "grad_norm": 1.4621527194976807, "learning_rate": 9.906791488057916e-05, "loss": 2.9867, "step": 390 }, { "epoch": 0.08, "grad_norm": 1.4058239459991455, "learning_rate": 9.903560154810313e-05, "loss": 2.9423, "step": 395 }, { "epoch": 0.08, "grad_norm": 1.3318129777908325, "learning_rate": 9.900274305478887e-05, "loss": 2.9254, "step": 400 }, { "epoch": 0.08, "grad_norm": 1.401201844215393, "learning_rate": 9.896933976594572e-05, "loss": 3.0216, "step": 405 }, { "epoch": 0.09, "grad_norm": 1.4442485570907593, "learning_rate": 9.893539205293989e-05, "loss": 2.9923, "step": 410 }, { "epoch": 0.09, "grad_norm": 1.4190860986709595, "learning_rate": 9.890090029319028e-05, "loss": 2.9611, "step": 415 }, { "epoch": 0.09, "grad_norm": 1.4127558469772339, "learning_rate": 9.886586487016433e-05, "loss": 2.979, "step": 420 }, { "epoch": 0.09, "grad_norm": 1.3861864805221558, "learning_rate": 9.883028617337378e-05, "loss": 2.9438, "step": 425 }, { "epoch": 0.09, "grad_norm": 1.4450087547302246, "learning_rate": 9.879416459837022e-05, "loss": 2.9116, "step": 430 }, { "epoch": 0.09, "grad_norm": 1.407228708267212, "learning_rate": 9.875750054674082e-05, "loss": 2.9427, "step": 435 }, { "epoch": 0.09, "grad_norm": 1.5297348499298096, "learning_rate": 9.872029442610382e-05, "loss": 2.9826, "step": 440 }, { "epoch": 0.09, "grad_norm": 1.481693148612976, "learning_rate": 9.8682546650104e-05, "loss": 2.9041, "step": 445 }, { "epoch": 0.09, "grad_norm": 1.3595296144485474, "learning_rate": 9.864425763840802e-05, "loss": 2.9036, "step": 450 }, { "epoch": 0.09, "grad_norm": 1.3935821056365967, "learning_rate": 9.860542781669988e-05, "loss": 2.9413, "step": 455 }, { "epoch": 0.1, "grad_norm": 1.4735407829284668, "learning_rate": 9.85660576166761e-05, "loss": 2.973, "step": 460 }, { "epoch": 0.1, "grad_norm": 1.3592067956924438, "learning_rate": 9.852614747604093e-05, "loss": 2.9061, "step": 465 }, { "epoch": 0.1, "grad_norm": 1.4290657043457031, "learning_rate": 9.848569783850145e-05, "loss": 2.9136, "step": 470 }, { "epoch": 0.1, "grad_norm": 1.2641561031341553, "learning_rate": 9.844470915376278e-05, "loss": 2.8821, "step": 475 }, { "epoch": 0.1, "grad_norm": 1.3433492183685303, "learning_rate": 9.840318187752292e-05, "loss": 2.9061, "step": 480 }, { "epoch": 0.1, "grad_norm": 1.3268264532089233, "learning_rate": 9.836111647146771e-05, "loss": 2.9067, "step": 485 }, { "epoch": 0.1, "grad_norm": 1.4298189878463745, "learning_rate": 9.831851340326577e-05, "loss": 2.93, "step": 490 }, { "epoch": 0.1, "grad_norm": 1.2761307954788208, "learning_rate": 9.82753731465633e-05, "loss": 2.8199, "step": 495 }, { "epoch": 0.1, "grad_norm": 1.3382200002670288, "learning_rate": 9.823169618097871e-05, "loss": 2.854, "step": 500 }, { "epoch": 0.1, "grad_norm": 1.5193979740142822, "learning_rate": 9.81874829920974e-05, "loss": 2.8094, "step": 505 }, { "epoch": 0.11, "grad_norm": 1.2841696739196777, "learning_rate": 9.814273407146623e-05, "loss": 2.8539, "step": 510 }, { "epoch": 0.11, "grad_norm": 1.268577218055725, "learning_rate": 9.809744991658829e-05, "loss": 2.8192, "step": 515 }, { "epoch": 0.11, "grad_norm": 1.2177411317825317, "learning_rate": 9.805163103091708e-05, "loss": 2.8578, "step": 520 }, { "epoch": 0.11, "grad_norm": 1.3120864629745483, "learning_rate": 9.800527792385112e-05, "loss": 2.8267, "step": 525 }, { "epoch": 0.11, "grad_norm": 1.2718641757965088, "learning_rate": 9.79583911107282e-05, "loss": 2.9266, "step": 530 }, { "epoch": 0.11, "grad_norm": 1.300713300704956, "learning_rate": 9.791097111281968e-05, "loss": 2.8243, "step": 535 }, { "epoch": 0.11, "grad_norm": 1.2401148080825806, "learning_rate": 9.786301845732467e-05, "loss": 2.9235, "step": 540 }, { "epoch": 0.11, "grad_norm": 1.2712082862854004, "learning_rate": 9.781453367736418e-05, "loss": 2.8, "step": 545 }, { "epoch": 0.11, "grad_norm": 1.2234678268432617, "learning_rate": 9.776551731197524e-05, "loss": 2.8874, "step": 550 }, { "epoch": 0.12, "grad_norm": 1.2925634384155273, "learning_rate": 9.771596990610478e-05, "loss": 2.8941, "step": 555 }, { "epoch": 0.12, "grad_norm": 1.286523461341858, "learning_rate": 9.766589201060372e-05, "loss": 2.8159, "step": 560 }, { "epoch": 0.12, "grad_norm": 1.2230117321014404, "learning_rate": 9.761528418222077e-05, "loss": 2.8885, "step": 565 }, { "epoch": 0.12, "grad_norm": 1.2012773752212524, "learning_rate": 9.756414698359624e-05, "loss": 2.8415, "step": 570 }, { "epoch": 0.12, "grad_norm": 1.2559715509414673, "learning_rate": 9.75124809832558e-05, "loss": 2.8911, "step": 575 }, { "epoch": 0.12, "grad_norm": 1.252892255783081, "learning_rate": 9.746028675560413e-05, "loss": 2.8023, "step": 580 }, { "epoch": 0.12, "grad_norm": 1.2187546491622925, "learning_rate": 9.740756488091861e-05, "loss": 2.8552, "step": 585 }, { "epoch": 0.12, "grad_norm": 1.2338809967041016, "learning_rate": 9.735431594534277e-05, "loss": 2.8293, "step": 590 }, { "epoch": 0.12, "grad_norm": 1.2111319303512573, "learning_rate": 9.730054054087983e-05, "loss": 2.7828, "step": 595 }, { "epoch": 0.12, "grad_norm": 1.204712152481079, "learning_rate": 9.724623926538612e-05, "loss": 2.8412, "step": 600 }, { "epoch": 0.13, "grad_norm": 1.2021050453186035, "learning_rate": 9.719141272256443e-05, "loss": 2.7744, "step": 605 }, { "epoch": 0.13, "grad_norm": 1.1791361570358276, "learning_rate": 9.713606152195726e-05, "loss": 2.8102, "step": 610 }, { "epoch": 0.13, "grad_norm": 1.2355018854141235, "learning_rate": 9.708018627894011e-05, "loss": 2.8044, "step": 615 }, { "epoch": 0.13, "grad_norm": 1.9050341844558716, "learning_rate": 9.702378761471456e-05, "loss": 2.8365, "step": 620 }, { "epoch": 0.13, "grad_norm": 1.1793168783187866, "learning_rate": 9.696686615630146e-05, "loss": 2.7307, "step": 625 }, { "epoch": 0.13, "grad_norm": 1.618408441543579, "learning_rate": 9.690942253653385e-05, "loss": 2.7531, "step": 630 }, { "epoch": 0.13, "grad_norm": 1.1792315244674683, "learning_rate": 9.685145739405002e-05, "loss": 2.8224, "step": 635 }, { "epoch": 0.13, "grad_norm": 1.2722834348678589, "learning_rate": 9.679297137328634e-05, "loss": 2.7523, "step": 640 }, { "epoch": 0.13, "grad_norm": 1.2244315147399902, "learning_rate": 9.673396512447013e-05, "loss": 2.7402, "step": 645 }, { "epoch": 0.14, "grad_norm": 1.217962622642517, "learning_rate": 9.667443930361247e-05, "loss": 2.7718, "step": 650 }, { "epoch": 0.14, "grad_norm": 1.1462675333023071, "learning_rate": 9.661439457250076e-05, "loss": 2.7529, "step": 655 }, { "epoch": 0.14, "grad_norm": 1.1446068286895752, "learning_rate": 9.655383159869158e-05, "loss": 2.7839, "step": 660 }, { "epoch": 0.14, "grad_norm": 1.1892576217651367, "learning_rate": 9.649275105550309e-05, "loss": 2.7048, "step": 665 }, { "epoch": 0.14, "grad_norm": 1.1554418802261353, "learning_rate": 9.643115362200762e-05, "loss": 2.7552, "step": 670 }, { "epoch": 0.14, "grad_norm": 1.1442291736602783, "learning_rate": 9.636903998302409e-05, "loss": 2.8037, "step": 675 }, { "epoch": 0.14, "grad_norm": 1.131379246711731, "learning_rate": 9.630641082911045e-05, "loss": 2.6973, "step": 680 }, { "epoch": 0.14, "grad_norm": 1.1673518419265747, "learning_rate": 9.624326685655593e-05, "loss": 2.7727, "step": 685 }, { "epoch": 0.14, "grad_norm": 1.1787452697753906, "learning_rate": 9.617960876737337e-05, "loss": 2.7412, "step": 690 }, { "epoch": 0.14, "grad_norm": 1.1861211061477661, "learning_rate": 9.611543726929134e-05, "loss": 2.7139, "step": 695 }, { "epoch": 0.15, "grad_norm": 1.2275561094284058, "learning_rate": 9.605075307574635e-05, "loss": 2.8026, "step": 700 }, { "epoch": 0.15, "grad_norm": 1.1469069719314575, "learning_rate": 9.598555690587487e-05, "loss": 2.7468, "step": 705 }, { "epoch": 0.15, "grad_norm": 1.1670516729354858, "learning_rate": 9.591984948450532e-05, "loss": 2.7824, "step": 710 }, { "epoch": 0.15, "grad_norm": 1.1783413887023926, "learning_rate": 9.585363154215008e-05, "loss": 2.8328, "step": 715 }, { "epoch": 0.15, "grad_norm": 1.108250379562378, "learning_rate": 9.578690381499728e-05, "loss": 2.7159, "step": 720 }, { "epoch": 0.15, "grad_norm": 1.202819585800171, "learning_rate": 9.571966704490271e-05, "loss": 2.7425, "step": 725 }, { "epoch": 0.15, "grad_norm": 1.1453441381454468, "learning_rate": 9.565192197938148e-05, "loss": 2.7279, "step": 730 }, { "epoch": 0.15, "grad_norm": 1.1307199001312256, "learning_rate": 9.558366937159977e-05, "loss": 2.7687, "step": 735 }, { "epoch": 0.15, "grad_norm": 1.1222865581512451, "learning_rate": 9.551490998036646e-05, "loss": 2.7295, "step": 740 }, { "epoch": 0.15, "grad_norm": 1.175696849822998, "learning_rate": 9.544564457012463e-05, "loss": 2.7889, "step": 745 }, { "epoch": 0.16, "grad_norm": 1.1392408609390259, "learning_rate": 9.537587391094314e-05, "loss": 2.7533, "step": 750 }, { "epoch": 0.16, "grad_norm": 1.142823576927185, "learning_rate": 9.5305598778508e-05, "loss": 2.6836, "step": 755 }, { "epoch": 0.16, "grad_norm": 1.1527529954910278, "learning_rate": 9.52348199541138e-05, "loss": 2.7062, "step": 760 }, { "epoch": 0.16, "grad_norm": 1.2020232677459717, "learning_rate": 9.516353822465504e-05, "loss": 2.7576, "step": 765 }, { "epoch": 0.16, "grad_norm": 1.1423379182815552, "learning_rate": 9.509175438261726e-05, "loss": 2.7109, "step": 770 }, { "epoch": 0.16, "grad_norm": 1.208016037940979, "learning_rate": 9.501946922606838e-05, "loss": 2.774, "step": 775 }, { "epoch": 0.16, "grad_norm": 1.1515254974365234, "learning_rate": 9.494668355864973e-05, "loss": 2.7202, "step": 780 }, { "epoch": 0.16, "grad_norm": 1.1667124032974243, "learning_rate": 9.487339818956716e-05, "loss": 2.6929, "step": 785 }, { "epoch": 0.16, "grad_norm": 1.1528996229171753, "learning_rate": 9.479961393358203e-05, "loss": 2.7247, "step": 790 }, { "epoch": 0.17, "grad_norm": 1.1446163654327393, "learning_rate": 9.472533161100215e-05, "loss": 2.7139, "step": 795 }, { "epoch": 0.17, "grad_norm": 1.1616708040237427, "learning_rate": 9.465055204767265e-05, "loss": 2.7139, "step": 800 }, { "epoch": 0.17, "grad_norm": 1.0954192876815796, "learning_rate": 9.457527607496685e-05, "loss": 2.7367, "step": 805 }, { "epoch": 0.17, "grad_norm": 1.1313517093658447, "learning_rate": 9.44995045297769e-05, "loss": 2.7652, "step": 810 }, { "epoch": 0.17, "grad_norm": 1.102845311164856, "learning_rate": 9.442323825450464e-05, "loss": 2.649, "step": 815 }, { "epoch": 0.17, "grad_norm": 1.1344150304794312, "learning_rate": 9.43464780970521e-05, "loss": 2.6686, "step": 820 }, { "epoch": 0.17, "grad_norm": 1.1737732887268066, "learning_rate": 9.426922491081212e-05, "loss": 2.7243, "step": 825 }, { "epoch": 0.17, "grad_norm": 1.5500872135162354, "learning_rate": 9.419147955465888e-05, "loss": 2.682, "step": 830 }, { "epoch": 0.17, "grad_norm": 1.1714314222335815, "learning_rate": 9.411324289293832e-05, "loss": 2.6775, "step": 835 }, { "epoch": 0.17, "grad_norm": 1.1273174285888672, "learning_rate": 9.403451579545859e-05, "loss": 2.6827, "step": 840 }, { "epoch": 0.18, "grad_norm": 1.0785410404205322, "learning_rate": 9.395529913748025e-05, "loss": 2.6474, "step": 845 }, { "epoch": 0.18, "grad_norm": 1.1376752853393555, "learning_rate": 9.387559379970672e-05, "loss": 2.7519, "step": 850 }, { "epoch": 0.18, "grad_norm": 1.923419713973999, "learning_rate": 9.379540066827431e-05, "loss": 2.6756, "step": 855 }, { "epoch": 0.18, "grad_norm": 1.1519583463668823, "learning_rate": 9.371472063474248e-05, "loss": 2.7166, "step": 860 }, { "epoch": 0.18, "grad_norm": 1.0922170877456665, "learning_rate": 9.363355459608394e-05, "loss": 2.6794, "step": 865 }, { "epoch": 0.18, "grad_norm": 1.2151093482971191, "learning_rate": 9.355190345467457e-05, "loss": 2.6482, "step": 870 }, { "epoch": 0.18, "grad_norm": 1.0864477157592773, "learning_rate": 9.346976811828352e-05, "loss": 2.6631, "step": 875 }, { "epoch": 0.18, "grad_norm": 1.1231117248535156, "learning_rate": 9.338714950006297e-05, "loss": 2.6855, "step": 880 }, { "epoch": 0.18, "grad_norm": 1.0890246629714966, "learning_rate": 9.330404851853817e-05, "loss": 2.7035, "step": 885 }, { "epoch": 0.18, "grad_norm": 1.1322407722473145, "learning_rate": 9.3220466097597e-05, "loss": 2.6365, "step": 890 }, { "epoch": 0.19, "grad_norm": 1.0793037414550781, "learning_rate": 9.313640316647991e-05, "loss": 2.7133, "step": 895 }, { "epoch": 0.19, "grad_norm": 1.0490481853485107, "learning_rate": 9.305186065976945e-05, "loss": 2.6949, "step": 900 }, { "epoch": 0.19, "grad_norm": 1.1165353059768677, "learning_rate": 9.296683951737993e-05, "loss": 2.6312, "step": 905 }, { "epoch": 0.19, "grad_norm": 1.0708872079849243, "learning_rate": 9.288134068454697e-05, "loss": 2.6534, "step": 910 }, { "epoch": 0.19, "grad_norm": 1.0487841367721558, "learning_rate": 9.2795365111817e-05, "loss": 2.6838, "step": 915 }, { "epoch": 0.19, "grad_norm": 1.0860440731048584, "learning_rate": 9.270891375503665e-05, "loss": 2.7283, "step": 920 }, { "epoch": 0.19, "grad_norm": 1.076627254486084, "learning_rate": 9.262198757534218e-05, "loss": 2.6791, "step": 925 }, { "epoch": 0.19, "grad_norm": 1.0581265687942505, "learning_rate": 9.253458753914874e-05, "loss": 2.6602, "step": 930 }, { "epoch": 0.19, "grad_norm": 1.0864975452423096, "learning_rate": 9.244671461813969e-05, "loss": 2.6428, "step": 935 }, { "epoch": 0.2, "grad_norm": 1.1001484394073486, "learning_rate": 9.235836978925572e-05, "loss": 2.7087, "step": 940 }, { "epoch": 0.2, "grad_norm": 1.130135178565979, "learning_rate": 9.226955403468406e-05, "loss": 2.6837, "step": 945 }, { "epoch": 0.2, "grad_norm": 1.0518337488174438, "learning_rate": 9.21802683418475e-05, "loss": 2.6547, "step": 950 }, { "epoch": 0.2, "grad_norm": 1.0764212608337402, "learning_rate": 9.209051370339347e-05, "loss": 2.6823, "step": 955 }, { "epoch": 0.2, "grad_norm": 1.0864237546920776, "learning_rate": 9.200029111718295e-05, "loss": 2.6558, "step": 960 }, { "epoch": 0.2, "grad_norm": 1.0547573566436768, "learning_rate": 9.190960158627941e-05, "loss": 2.6308, "step": 965 }, { "epoch": 0.2, "grad_norm": 1.0916203260421753, "learning_rate": 9.181844611893766e-05, "loss": 2.6403, "step": 970 }, { "epoch": 0.2, "grad_norm": 1.1068625450134277, "learning_rate": 9.172682572859261e-05, "loss": 2.6449, "step": 975 }, { "epoch": 0.2, "grad_norm": 1.0739907026290894, "learning_rate": 9.163474143384806e-05, "loss": 2.6834, "step": 980 }, { "epoch": 0.2, "grad_norm": 1.0589396953582764, "learning_rate": 9.154219425846528e-05, "loss": 2.6742, "step": 985 }, { "epoch": 0.21, "grad_norm": 1.0412026643753052, "learning_rate": 9.144918523135175e-05, "loss": 2.6834, "step": 990 }, { "epoch": 0.21, "grad_norm": 1.122963309288025, "learning_rate": 9.13557153865496e-05, "loss": 2.6321, "step": 995 }, { "epoch": 0.21, "grad_norm": 1.151918888092041, "learning_rate": 9.12617857632242e-05, "loss": 2.6733, "step": 1000 }, { "epoch": 0.21, "grad_norm": 1.0950418710708618, "learning_rate": 9.116739740565259e-05, "loss": 2.6735, "step": 1005 }, { "epoch": 0.21, "grad_norm": 1.0299748182296753, "learning_rate": 9.107255136321184e-05, "loss": 2.639, "step": 1010 }, { "epoch": 0.21, "grad_norm": 1.0754122734069824, "learning_rate": 9.09772486903674e-05, "loss": 2.6601, "step": 1015 }, { "epoch": 0.21, "grad_norm": 1.0586258172988892, "learning_rate": 9.08814904466614e-05, "loss": 2.6266, "step": 1020 }, { "epoch": 0.21, "grad_norm": 1.0555306673049927, "learning_rate": 9.078527769670085e-05, "loss": 2.6555, "step": 1025 }, { "epoch": 0.21, "grad_norm": 1.0900399684906006, "learning_rate": 9.068861151014575e-05, "loss": 2.6175, "step": 1030 }, { "epoch": 0.22, "grad_norm": 1.0734572410583496, "learning_rate": 9.05914929616973e-05, "loss": 2.6134, "step": 1035 }, { "epoch": 0.22, "grad_norm": 1.1126459836959839, "learning_rate": 9.04939231310859e-05, "loss": 2.6445, "step": 1040 }, { "epoch": 0.22, "grad_norm": 1.0274248123168945, "learning_rate": 9.039590310305914e-05, "loss": 2.7062, "step": 1045 }, { "epoch": 0.22, "grad_norm": 1.0258575677871704, "learning_rate": 9.029743396736974e-05, "loss": 2.5811, "step": 1050 }, { "epoch": 0.22, "grad_norm": 1.0055421590805054, "learning_rate": 9.019851681876348e-05, "loss": 2.6982, "step": 1055 }, { "epoch": 0.22, "grad_norm": 1.0245190858840942, "learning_rate": 9.009915275696693e-05, "loss": 2.6102, "step": 1060 }, { "epoch": 0.22, "grad_norm": 1.5464237928390503, "learning_rate": 8.999934288667534e-05, "loss": 2.6405, "step": 1065 }, { "epoch": 0.22, "grad_norm": 1.0202620029449463, "learning_rate": 8.989908831754028e-05, "loss": 2.5989, "step": 1070 }, { "epoch": 0.22, "grad_norm": 1.009238362312317, "learning_rate": 8.979839016415735e-05, "loss": 2.6014, "step": 1075 }, { "epoch": 0.22, "grad_norm": 1.04855215549469, "learning_rate": 8.969724954605373e-05, "loss": 2.6536, "step": 1080 }, { "epoch": 0.23, "grad_norm": 1.0804136991500854, "learning_rate": 8.959566758767581e-05, "loss": 2.6686, "step": 1085 }, { "epoch": 0.23, "grad_norm": 1.0398406982421875, "learning_rate": 8.949364541837661e-05, "loss": 2.616, "step": 1090 }, { "epoch": 0.23, "grad_norm": 1.0304359197616577, "learning_rate": 8.939118417240329e-05, "loss": 2.6174, "step": 1095 }, { "epoch": 0.23, "grad_norm": 1.080388069152832, "learning_rate": 8.92882849888845e-05, "loss": 2.627, "step": 1100 }, { "epoch": 0.23, "grad_norm": 1.0364700555801392, "learning_rate": 8.918494901181773e-05, "loss": 2.6303, "step": 1105 }, { "epoch": 0.23, "grad_norm": 1.0459529161453247, "learning_rate": 8.908117739005659e-05, "loss": 2.5842, "step": 1110 }, { "epoch": 0.23, "grad_norm": 1.0185799598693848, "learning_rate": 8.897697127729805e-05, "loss": 2.6425, "step": 1115 }, { "epoch": 0.23, "grad_norm": 0.9739039540290833, "learning_rate": 8.887233183206957e-05, "loss": 2.5828, "step": 1120 }, { "epoch": 0.23, "grad_norm": 1.059308409690857, "learning_rate": 8.876726021771627e-05, "loss": 2.5769, "step": 1125 }, { "epoch": 0.23, "grad_norm": 1.0338994264602661, "learning_rate": 8.866175760238798e-05, "loss": 2.6329, "step": 1130 }, { "epoch": 0.24, "grad_norm": 1.0227488279342651, "learning_rate": 8.855582515902625e-05, "loss": 2.5905, "step": 1135 }, { "epoch": 0.24, "grad_norm": 0.9925109148025513, "learning_rate": 8.844946406535131e-05, "loss": 2.5524, "step": 1140 }, { "epoch": 0.24, "grad_norm": 1.0282485485076904, "learning_rate": 8.834267550384893e-05, "loss": 2.581, "step": 1145 }, { "epoch": 0.24, "grad_norm": 1.0401309728622437, "learning_rate": 8.823546066175741e-05, "loss": 2.6801, "step": 1150 }, { "epoch": 0.24, "grad_norm": 1.0400043725967407, "learning_rate": 8.81278207310542e-05, "loss": 2.6248, "step": 1155 }, { "epoch": 0.24, "grad_norm": 0.9869978427886963, "learning_rate": 8.801975690844278e-05, "loss": 2.682, "step": 1160 }, { "epoch": 0.24, "grad_norm": 1.0918792486190796, "learning_rate": 8.791127039533934e-05, "loss": 2.5757, "step": 1165 }, { "epoch": 0.24, "grad_norm": 1.0104856491088867, "learning_rate": 8.780236239785935e-05, "loss": 2.58, "step": 1170 }, { "epoch": 0.24, "grad_norm": 1.059522271156311, "learning_rate": 8.76930341268042e-05, "loss": 2.6208, "step": 1175 }, { "epoch": 0.25, "grad_norm": 1.0344668626785278, "learning_rate": 8.758328679764776e-05, "loss": 2.6312, "step": 1180 }, { "epoch": 0.25, "grad_norm": 1.0061168670654297, "learning_rate": 8.747312163052284e-05, "loss": 2.5952, "step": 1185 }, { "epoch": 0.25, "grad_norm": 1.0210862159729004, "learning_rate": 8.736253985020761e-05, "loss": 2.6657, "step": 1190 }, { "epoch": 0.25, "grad_norm": 1.0098751783370972, "learning_rate": 8.725154268611203e-05, "loss": 2.6222, "step": 1195 }, { "epoch": 0.25, "grad_norm": 1.041806936264038, "learning_rate": 8.714013137226411e-05, "loss": 2.5781, "step": 1200 }, { "epoch": 0.25, "grad_norm": 1.0064157247543335, "learning_rate": 8.702830714729628e-05, "loss": 2.5136, "step": 1205 }, { "epoch": 0.25, "grad_norm": 0.9981704354286194, "learning_rate": 8.691607125443153e-05, "loss": 2.611, "step": 1210 }, { "epoch": 0.25, "grad_norm": 1.0080243349075317, "learning_rate": 8.680342494146967e-05, "loss": 2.5375, "step": 1215 }, { "epoch": 0.25, "grad_norm": 1.0076395273208618, "learning_rate": 8.66903694607734e-05, "loss": 2.6269, "step": 1220 }, { "epoch": 0.25, "grad_norm": 0.9885242581367493, "learning_rate": 8.65769060692544e-05, "loss": 2.5712, "step": 1225 }, { "epoch": 0.26, "grad_norm": 1.0156185626983643, "learning_rate": 8.646303602835936e-05, "loss": 2.6381, "step": 1230 }, { "epoch": 0.26, "grad_norm": 0.99817955493927, "learning_rate": 8.634876060405597e-05, "loss": 2.6589, "step": 1235 }, { "epoch": 0.26, "grad_norm": 0.9817842841148376, "learning_rate": 8.623408106681884e-05, "loss": 2.5775, "step": 1240 }, { "epoch": 0.26, "grad_norm": 0.9927291870117188, "learning_rate": 8.611899869161535e-05, "loss": 2.5987, "step": 1245 }, { "epoch": 0.26, "grad_norm": 1.006893515586853, "learning_rate": 8.600351475789147e-05, "loss": 2.565, "step": 1250 }, { "epoch": 0.26, "grad_norm": 1.1008330583572388, "learning_rate": 8.588763054955764e-05, "loss": 2.6247, "step": 1255 }, { "epoch": 0.26, "grad_norm": 1.0435577630996704, "learning_rate": 8.57713473549743e-05, "loss": 2.6331, "step": 1260 }, { "epoch": 0.26, "grad_norm": 1.0769011974334717, "learning_rate": 8.565466646693778e-05, "loss": 2.6576, "step": 1265 }, { "epoch": 0.26, "grad_norm": 1.0972942113876343, "learning_rate": 8.553758918266578e-05, "loss": 2.6234, "step": 1270 }, { "epoch": 0.27, "grad_norm": 0.964900016784668, "learning_rate": 8.5420116803783e-05, "loss": 2.5506, "step": 1275 }, { "epoch": 0.27, "grad_norm": 0.9919426441192627, "learning_rate": 8.530225063630668e-05, "loss": 2.5138, "step": 1280 }, { "epoch": 0.27, "grad_norm": 1.0192852020263672, "learning_rate": 8.518399199063205e-05, "loss": 2.6286, "step": 1285 }, { "epoch": 0.27, "grad_norm": 1.030393362045288, "learning_rate": 8.50653421815178e-05, "loss": 2.5331, "step": 1290 }, { "epoch": 0.27, "grad_norm": 1.0030856132507324, "learning_rate": 8.494630252807138e-05, "loss": 2.6121, "step": 1295 }, { "epoch": 0.27, "grad_norm": 1.058526635169983, "learning_rate": 8.482687435373449e-05, "loss": 2.6006, "step": 1300 }, { "epoch": 0.27, "grad_norm": 0.9560089111328125, "learning_rate": 8.470705898626817e-05, "loss": 2.6244, "step": 1305 }, { "epoch": 0.27, "grad_norm": 0.9747763872146606, "learning_rate": 8.458685775773822e-05, "loss": 2.5592, "step": 1310 }, { "epoch": 0.27, "grad_norm": 1.0110520124435425, "learning_rate": 8.446627200450025e-05, "loss": 2.6079, "step": 1315 }, { "epoch": 0.27, "grad_norm": 0.9854313731193542, "learning_rate": 8.434530306718493e-05, "loss": 2.5953, "step": 1320 }, { "epoch": 0.28, "grad_norm": 0.98088538646698, "learning_rate": 8.4223952290683e-05, "loss": 2.598, "step": 1325 }, { "epoch": 0.28, "grad_norm": 0.9649909734725952, "learning_rate": 8.41022210241304e-05, "loss": 2.5269, "step": 1330 }, { "epoch": 0.28, "grad_norm": 1.0427321195602417, "learning_rate": 8.398011062089316e-05, "loss": 2.5821, "step": 1335 }, { "epoch": 0.28, "grad_norm": 0.9849219918251038, "learning_rate": 8.385762243855249e-05, "loss": 2.5982, "step": 1340 }, { "epoch": 0.28, "grad_norm": 1.015129804611206, "learning_rate": 8.373475783888958e-05, "loss": 2.5903, "step": 1345 }, { "epoch": 0.28, "grad_norm": 0.9560769200325012, "learning_rate": 8.36115181878705e-05, "loss": 2.5121, "step": 1350 }, { "epoch": 0.28, "grad_norm": 0.9451593160629272, "learning_rate": 8.348790485563101e-05, "loss": 2.605, "step": 1355 }, { "epoch": 0.28, "grad_norm": 1.0395113229751587, "learning_rate": 8.336391921646134e-05, "loss": 2.6172, "step": 1360 }, { "epoch": 0.28, "grad_norm": 0.9910452961921692, "learning_rate": 8.323956264879089e-05, "loss": 2.5662, "step": 1365 }, { "epoch": 0.28, "grad_norm": 0.9585237503051758, "learning_rate": 8.311483653517294e-05, "loss": 2.5895, "step": 1370 }, { "epoch": 0.29, "grad_norm": 0.9870867133140564, "learning_rate": 8.298974226226919e-05, "loss": 2.6034, "step": 1375 }, { "epoch": 0.29, "grad_norm": 0.9943922758102417, "learning_rate": 8.28642812208345e-05, "loss": 2.5244, "step": 1380 }, { "epoch": 0.29, "grad_norm": 1.010125756263733, "learning_rate": 8.273845480570123e-05, "loss": 2.6087, "step": 1385 }, { "epoch": 0.29, "grad_norm": 0.9950689077377319, "learning_rate": 8.26122644157639e-05, "loss": 2.5111, "step": 1390 }, { "epoch": 0.29, "grad_norm": 1.0169402360916138, "learning_rate": 8.248571145396362e-05, "loss": 2.5522, "step": 1395 }, { "epoch": 0.29, "grad_norm": 0.9725683927536011, "learning_rate": 8.235879732727236e-05, "loss": 2.5374, "step": 1400 }, { "epoch": 0.29, "grad_norm": 0.9679161310195923, "learning_rate": 8.223152344667745e-05, "loss": 2.5739, "step": 1405 }, { "epoch": 0.29, "grad_norm": 0.9871531128883362, "learning_rate": 8.21038912271658e-05, "loss": 2.5947, "step": 1410 }, { "epoch": 0.29, "grad_norm": 0.9824729561805725, "learning_rate": 8.197590208770824e-05, "loss": 2.5648, "step": 1415 }, { "epoch": 0.3, "grad_norm": 0.9750092625617981, "learning_rate": 8.184755745124371e-05, "loss": 2.5038, "step": 1420 }, { "epoch": 0.3, "grad_norm": 0.9619457721710205, "learning_rate": 8.171885874466342e-05, "loss": 2.5755, "step": 1425 }, { "epoch": 0.3, "grad_norm": 0.993933916091919, "learning_rate": 8.158980739879507e-05, "loss": 2.5726, "step": 1430 }, { "epoch": 0.3, "grad_norm": 0.9767422080039978, "learning_rate": 8.146040484838677e-05, "loss": 2.6105, "step": 1435 }, { "epoch": 0.3, "grad_norm": 1.0110574960708618, "learning_rate": 8.133065253209132e-05, "loss": 2.5235, "step": 1440 }, { "epoch": 0.3, "grad_norm": 0.9937053918838501, "learning_rate": 8.120055189245e-05, "loss": 2.538, "step": 1445 }, { "epoch": 0.3, "grad_norm": 0.9573049545288086, "learning_rate": 8.10701043758767e-05, "loss": 2.524, "step": 1450 }, { "epoch": 0.3, "grad_norm": 0.9540808796882629, "learning_rate": 8.093931143264174e-05, "loss": 2.5675, "step": 1455 }, { "epoch": 0.3, "grad_norm": 1.008131504058838, "learning_rate": 8.080817451685576e-05, "loss": 2.5015, "step": 1460 }, { "epoch": 0.3, "grad_norm": 0.9936932921409607, "learning_rate": 8.067669508645356e-05, "loss": 2.5818, "step": 1465 }, { "epoch": 0.31, "grad_norm": 0.9977306723594666, "learning_rate": 8.054487460317797e-05, "loss": 2.5082, "step": 1470 }, { "epoch": 0.31, "grad_norm": 0.9737963080406189, "learning_rate": 8.041271453256345e-05, "loss": 2.5276, "step": 1475 }, { "epoch": 0.31, "grad_norm": 0.9630906581878662, "learning_rate": 8.02802163439199e-05, "loss": 2.44, "step": 1480 }, { "epoch": 0.31, "grad_norm": 0.9698747396469116, "learning_rate": 8.01473815103163e-05, "loss": 2.5131, "step": 1485 }, { "epoch": 0.31, "grad_norm": 0.9699703454971313, "learning_rate": 8.001421150856434e-05, "loss": 2.4736, "step": 1490 }, { "epoch": 0.31, "grad_norm": 0.9490015506744385, "learning_rate": 7.988070781920197e-05, "loss": 2.4978, "step": 1495 }, { "epoch": 0.31, "grad_norm": 0.9748291969299316, "learning_rate": 7.9746871926477e-05, "loss": 2.5235, "step": 1500 }, { "epoch": 0.31, "grad_norm": 0.9626737236976624, "learning_rate": 7.961270531833052e-05, "loss": 2.4812, "step": 1505 }, { "epoch": 0.31, "grad_norm": 0.9995949864387512, "learning_rate": 7.947820948638045e-05, "loss": 2.558, "step": 1510 }, { "epoch": 0.31, "grad_norm": 0.9817675352096558, "learning_rate": 7.934338592590486e-05, "loss": 2.5591, "step": 1515 }, { "epoch": 0.32, "grad_norm": 0.9922047257423401, "learning_rate": 7.92082361358254e-05, "loss": 2.4919, "step": 1520 }, { "epoch": 0.32, "grad_norm": 0.9521270394325256, "learning_rate": 7.907276161869065e-05, "loss": 2.4929, "step": 1525 }, { "epoch": 0.32, "grad_norm": 0.9504519104957581, "learning_rate": 7.893696388065936e-05, "loss": 2.4731, "step": 1530 }, { "epoch": 0.32, "grad_norm": 1.0164660215377808, "learning_rate": 7.88008444314838e-05, "loss": 2.5618, "step": 1535 }, { "epoch": 0.32, "grad_norm": 0.9919765591621399, "learning_rate": 7.866440478449283e-05, "loss": 2.5814, "step": 1540 }, { "epoch": 0.32, "grad_norm": 0.9793297648429871, "learning_rate": 7.852764645657522e-05, "loss": 2.5629, "step": 1545 }, { "epoch": 0.32, "grad_norm": 0.9306464791297913, "learning_rate": 7.839057096816271e-05, "loss": 2.5159, "step": 1550 }, { "epoch": 0.32, "grad_norm": 0.9663394689559937, "learning_rate": 7.82531798432131e-05, "loss": 2.5165, "step": 1555 }, { "epoch": 0.32, "grad_norm": 0.9504035115242004, "learning_rate": 7.811547460919333e-05, "loss": 2.5199, "step": 1560 }, { "epoch": 0.33, "grad_norm": 0.9541032314300537, "learning_rate": 7.797745679706254e-05, "loss": 2.533, "step": 1565 }, { "epoch": 0.33, "grad_norm": 0.9573409557342529, "learning_rate": 7.783912794125496e-05, "loss": 2.5546, "step": 1570 }, { "epoch": 0.33, "grad_norm": 0.9424445033073425, "learning_rate": 7.770048957966291e-05, "loss": 2.5258, "step": 1575 }, { "epoch": 0.33, "grad_norm": 0.9768951535224915, "learning_rate": 7.756154325361967e-05, "loss": 2.4991, "step": 1580 }, { "epoch": 0.33, "grad_norm": 0.9881746172904968, "learning_rate": 7.74222905078824e-05, "loss": 2.5044, "step": 1585 }, { "epoch": 0.33, "grad_norm": 0.9817689657211304, "learning_rate": 7.728273289061489e-05, "loss": 2.5459, "step": 1590 }, { "epoch": 0.33, "grad_norm": 0.9682257771492004, "learning_rate": 7.714287195337044e-05, "loss": 2.5957, "step": 1595 }, { "epoch": 0.33, "grad_norm": 0.9901173710823059, "learning_rate": 7.700270925107448e-05, "loss": 2.4403, "step": 1600 }, { "epoch": 0.33, "grad_norm": 0.948213517665863, "learning_rate": 7.686224634200742e-05, "loss": 2.4842, "step": 1605 }, { "epoch": 0.33, "grad_norm": 0.9362388849258423, "learning_rate": 7.672148478778722e-05, "loss": 2.4878, "step": 1610 }, { "epoch": 0.34, "grad_norm": 0.9562108516693115, "learning_rate": 7.658042615335212e-05, "loss": 2.4728, "step": 1615 }, { "epoch": 0.34, "grad_norm": 0.9186726212501526, "learning_rate": 7.643907200694318e-05, "loss": 2.4542, "step": 1620 }, { "epoch": 0.34, "grad_norm": 0.9547454714775085, "learning_rate": 7.629742392008684e-05, "loss": 2.5156, "step": 1625 }, { "epoch": 0.34, "grad_norm": 0.9697511196136475, "learning_rate": 7.615548346757749e-05, "loss": 2.5298, "step": 1630 }, { "epoch": 0.34, "grad_norm": 0.95559161901474, "learning_rate": 7.60132522274599e-05, "loss": 2.5401, "step": 1635 }, { "epoch": 0.34, "grad_norm": 0.9713429808616638, "learning_rate": 7.587073178101178e-05, "loss": 2.5195, "step": 1640 }, { "epoch": 0.34, "grad_norm": 0.9706266522407532, "learning_rate": 7.572792371272609e-05, "loss": 2.5245, "step": 1645 }, { "epoch": 0.34, "grad_norm": 0.978418231010437, "learning_rate": 7.55848296102935e-05, "loss": 2.4615, "step": 1650 }, { "epoch": 0.34, "grad_norm": 0.956576406955719, "learning_rate": 7.544145106458465e-05, "loss": 2.524, "step": 1655 }, { "epoch": 0.35, "grad_norm": 0.9353023171424866, "learning_rate": 7.529778966963259e-05, "loss": 2.3879, "step": 1660 }, { "epoch": 0.35, "grad_norm": 1.0076223611831665, "learning_rate": 7.515384702261496e-05, "loss": 2.491, "step": 1665 }, { "epoch": 0.35, "grad_norm": 0.9393713474273682, "learning_rate": 7.500962472383627e-05, "loss": 2.4818, "step": 1670 }, { "epoch": 0.35, "grad_norm": 0.957146942615509, "learning_rate": 7.486512437671011e-05, "loss": 2.5437, "step": 1675 }, { "epoch": 0.35, "grad_norm": 0.9632449746131897, "learning_rate": 7.472034758774128e-05, "loss": 2.5207, "step": 1680 }, { "epoch": 0.35, "grad_norm": 0.9379749894142151, "learning_rate": 7.457529596650797e-05, "loss": 2.5303, "step": 1685 }, { "epoch": 0.35, "grad_norm": 0.9668945074081421, "learning_rate": 7.442997112564392e-05, "loss": 2.5195, "step": 1690 }, { "epoch": 0.35, "grad_norm": 0.9343632459640503, "learning_rate": 7.428437468082037e-05, "loss": 2.4692, "step": 1695 }, { "epoch": 0.35, "grad_norm": 0.9466119408607483, "learning_rate": 7.413850825072817e-05, "loss": 2.5595, "step": 1700 }, { "epoch": 0.35, "grad_norm": 0.9706186056137085, "learning_rate": 7.39923734570598e-05, "loss": 2.5421, "step": 1705 }, { "epoch": 0.36, "grad_norm": 0.9418413639068604, "learning_rate": 7.384597192449126e-05, "loss": 2.5056, "step": 1710 }, { "epoch": 0.36, "grad_norm": 0.9419413805007935, "learning_rate": 7.369930528066412e-05, "loss": 2.5314, "step": 1715 }, { "epoch": 0.36, "grad_norm": 0.9597350358963013, "learning_rate": 7.355237515616732e-05, "loss": 2.4869, "step": 1720 }, { "epoch": 0.36, "grad_norm": 0.9583315849304199, "learning_rate": 7.340518318451914e-05, "loss": 2.5011, "step": 1725 }, { "epoch": 0.36, "grad_norm": 0.9204713702201843, "learning_rate": 7.325773100214893e-05, "loss": 2.5222, "step": 1730 }, { "epoch": 0.36, "grad_norm": 0.9271583557128906, "learning_rate": 7.311002024837899e-05, "loss": 2.5486, "step": 1735 }, { "epoch": 0.36, "grad_norm": 0.9411045908927917, "learning_rate": 7.296205256540633e-05, "loss": 2.4855, "step": 1740 }, { "epoch": 0.36, "grad_norm": 0.9310178756713867, "learning_rate": 7.281382959828443e-05, "loss": 2.594, "step": 1745 }, { "epoch": 0.36, "grad_norm": 0.9783688187599182, "learning_rate": 7.26653529949049e-05, "loss": 2.5684, "step": 1750 }, { "epoch": 0.36, "grad_norm": 0.9621866345405579, "learning_rate": 7.25166244059792e-05, "loss": 2.441, "step": 1755 }, { "epoch": 0.37, "grad_norm": 0.9207419753074646, "learning_rate": 7.236764548502029e-05, "loss": 2.4502, "step": 1760 }, { "epoch": 0.37, "grad_norm": 0.9349749684333801, "learning_rate": 7.221841788832421e-05, "loss": 2.5613, "step": 1765 }, { "epoch": 0.37, "grad_norm": 0.9797950387001038, "learning_rate": 7.206894327495173e-05, "loss": 2.4534, "step": 1770 }, { "epoch": 0.37, "grad_norm": 0.9534410834312439, "learning_rate": 7.191922330670982e-05, "loss": 2.4731, "step": 1775 }, { "epoch": 0.37, "grad_norm": 0.939933717250824, "learning_rate": 7.176925964813326e-05, "loss": 2.4995, "step": 1780 }, { "epoch": 0.37, "grad_norm": 0.9610041379928589, "learning_rate": 7.161905396646607e-05, "loss": 2.4542, "step": 1785 }, { "epoch": 0.37, "grad_norm": 0.9338921904563904, "learning_rate": 7.146860793164299e-05, "loss": 2.504, "step": 1790 }, { "epoch": 0.37, "grad_norm": 0.918339729309082, "learning_rate": 7.131792321627098e-05, "loss": 2.4966, "step": 1795 }, { "epoch": 0.37, "grad_norm": 0.9426106214523315, "learning_rate": 7.116700149561048e-05, "loss": 2.5555, "step": 1800 }, { "epoch": 0.38, "grad_norm": 0.9430469870567322, "learning_rate": 7.101584444755696e-05, "loss": 2.5065, "step": 1805 }, { "epoch": 0.38, "grad_norm": 0.9407751560211182, "learning_rate": 7.086445375262212e-05, "loss": 2.4893, "step": 1810 }, { "epoch": 0.38, "grad_norm": 0.9642773866653442, "learning_rate": 7.071283109391528e-05, "loss": 2.467, "step": 1815 }, { "epoch": 0.38, "grad_norm": 0.9692126512527466, "learning_rate": 7.056097815712466e-05, "loss": 2.5445, "step": 1820 }, { "epoch": 0.38, "grad_norm": 0.9674531817436218, "learning_rate": 7.040889663049862e-05, "loss": 2.4867, "step": 1825 }, { "epoch": 0.38, "grad_norm": 0.9319456219673157, "learning_rate": 7.025658820482693e-05, "loss": 2.4393, "step": 1830 }, { "epoch": 0.38, "grad_norm": 0.9372115135192871, "learning_rate": 7.010405457342192e-05, "loss": 2.5165, "step": 1835 }, { "epoch": 0.38, "grad_norm": 0.9726076126098633, "learning_rate": 6.995129743209967e-05, "loss": 2.4505, "step": 1840 }, { "epoch": 0.38, "grad_norm": 0.9362107515335083, "learning_rate": 6.97983184791612e-05, "loss": 2.4647, "step": 1845 }, { "epoch": 0.38, "grad_norm": 0.9178482890129089, "learning_rate": 6.964511941537355e-05, "loss": 2.4714, "step": 1850 }, { "epoch": 0.39, "grad_norm": 0.9568119645118713, "learning_rate": 6.949170194395083e-05, "loss": 2.5276, "step": 1855 }, { "epoch": 0.39, "grad_norm": 0.9537916779518127, "learning_rate": 6.933806777053536e-05, "loss": 2.4758, "step": 1860 }, { "epoch": 0.39, "grad_norm": 0.9236088991165161, "learning_rate": 6.918421860317872e-05, "loss": 2.4279, "step": 1865 }, { "epoch": 0.39, "grad_norm": 0.9248093962669373, "learning_rate": 6.903015615232263e-05, "loss": 2.4253, "step": 1870 }, { "epoch": 0.39, "grad_norm": 0.9502087831497192, "learning_rate": 6.887588213078012e-05, "loss": 2.4653, "step": 1875 }, { "epoch": 0.39, "grad_norm": 0.9479555487632751, "learning_rate": 6.87213982537163e-05, "loss": 2.4596, "step": 1880 }, { "epoch": 0.39, "grad_norm": 0.9418471455574036, "learning_rate": 6.856670623862943e-05, "loss": 2.5136, "step": 1885 }, { "epoch": 0.39, "grad_norm": 0.9373717308044434, "learning_rate": 6.841180780533179e-05, "loss": 2.4837, "step": 1890 }, { "epoch": 0.39, "grad_norm": 0.9609478116035461, "learning_rate": 6.82567046759305e-05, "loss": 2.422, "step": 1895 }, { "epoch": 0.39, "grad_norm": 0.9436773061752319, "learning_rate": 6.810139857480844e-05, "loss": 2.4756, "step": 1900 }, { "epoch": 0.4, "grad_norm": 0.947167694568634, "learning_rate": 6.794589122860509e-05, "loss": 2.405, "step": 1905 }, { "epoch": 0.4, "grad_norm": 0.9732680916786194, "learning_rate": 6.779018436619725e-05, "loss": 2.4835, "step": 1910 }, { "epoch": 0.4, "grad_norm": 0.950404703617096, "learning_rate": 6.763427971867992e-05, "loss": 2.5281, "step": 1915 }, { "epoch": 0.4, "grad_norm": 0.9311380982398987, "learning_rate": 6.747817901934699e-05, "loss": 2.4992, "step": 1920 }, { "epoch": 0.4, "grad_norm": 0.9213589429855347, "learning_rate": 6.732188400367197e-05, "loss": 2.4271, "step": 1925 }, { "epoch": 0.4, "grad_norm": 0.9262340664863586, "learning_rate": 6.716539640928871e-05, "loss": 2.4826, "step": 1930 }, { "epoch": 0.4, "grad_norm": 0.9462832808494568, "learning_rate": 6.70087179759721e-05, "loss": 2.4917, "step": 1935 }, { "epoch": 0.4, "grad_norm": 0.9576349854469299, "learning_rate": 6.685185044561874e-05, "loss": 2.4312, "step": 1940 }, { "epoch": 0.4, "grad_norm": 0.9459937810897827, "learning_rate": 6.669479556222747e-05, "loss": 2.5343, "step": 1945 }, { "epoch": 0.41, "grad_norm": 0.9218082427978516, "learning_rate": 6.653755507188013e-05, "loss": 2.5057, "step": 1950 }, { "epoch": 0.41, "grad_norm": 0.950690507888794, "learning_rate": 6.638013072272205e-05, "loss": 2.4339, "step": 1955 }, { "epoch": 0.41, "grad_norm": 0.9415127635002136, "learning_rate": 6.622252426494259e-05, "loss": 2.4953, "step": 1960 }, { "epoch": 0.41, "grad_norm": 0.898391604423523, "learning_rate": 6.606473745075581e-05, "loss": 2.4518, "step": 1965 }, { "epoch": 0.41, "grad_norm": 0.9288300275802612, "learning_rate": 6.590677203438084e-05, "loss": 2.4957, "step": 1970 }, { "epoch": 0.41, "grad_norm": 0.92172771692276, "learning_rate": 6.574862977202252e-05, "loss": 2.4942, "step": 1975 }, { "epoch": 0.41, "grad_norm": 0.9391177892684937, "learning_rate": 6.559031242185174e-05, "loss": 2.5103, "step": 1980 }, { "epoch": 0.41, "grad_norm": 0.9494156837463379, "learning_rate": 6.543182174398597e-05, "loss": 2.4935, "step": 1985 }, { "epoch": 0.41, "grad_norm": 0.9366905689239502, "learning_rate": 6.52731595004697e-05, "loss": 2.5083, "step": 1990 }, { "epoch": 0.41, "grad_norm": 0.9476346373558044, "learning_rate": 6.51143274552548e-05, "loss": 2.4404, "step": 1995 }, { "epoch": 0.42, "grad_norm": 0.9403136968612671, "learning_rate": 6.495532737418098e-05, "loss": 2.4399, "step": 2000 }, { "epoch": 0.42, "grad_norm": 0.9438544511795044, "learning_rate": 6.479616102495605e-05, "loss": 2.4434, "step": 2005 }, { "epoch": 0.42, "grad_norm": 0.9154673218727112, "learning_rate": 6.463683017713638e-05, "loss": 2.4772, "step": 2010 }, { "epoch": 0.42, "grad_norm": 0.925992488861084, "learning_rate": 6.447733660210715e-05, "loss": 2.3922, "step": 2015 }, { "epoch": 0.42, "grad_norm": 0.926508903503418, "learning_rate": 6.431768207306272e-05, "loss": 2.4002, "step": 2020 }, { "epoch": 0.42, "grad_norm": 0.9747362732887268, "learning_rate": 6.415786836498684e-05, "loss": 2.4693, "step": 2025 }, { "epoch": 0.42, "grad_norm": 0.918900728225708, "learning_rate": 6.399789725463298e-05, "loss": 2.4109, "step": 2030 }, { "epoch": 0.42, "grad_norm": 0.9118623733520508, "learning_rate": 6.383777052050458e-05, "loss": 2.4303, "step": 2035 }, { "epoch": 0.42, "grad_norm": 0.9325087666511536, "learning_rate": 6.367748994283518e-05, "loss": 2.4623, "step": 2040 }, { "epoch": 0.43, "grad_norm": 0.9554570913314819, "learning_rate": 6.351705730356877e-05, "loss": 2.4295, "step": 2045 }, { "epoch": 0.43, "grad_norm": 0.9043391942977905, "learning_rate": 6.335647438633987e-05, "loss": 2.3834, "step": 2050 }, { "epoch": 0.43, "grad_norm": 0.9207665920257568, "learning_rate": 6.319574297645374e-05, "loss": 2.4103, "step": 2055 }, { "epoch": 0.43, "grad_norm": 0.9363177418708801, "learning_rate": 6.303486486086654e-05, "loss": 2.3831, "step": 2060 }, { "epoch": 0.43, "grad_norm": 0.9334292411804199, "learning_rate": 6.287384182816546e-05, "loss": 2.5353, "step": 2065 }, { "epoch": 0.43, "grad_norm": 0.9631732106208801, "learning_rate": 6.271267566854883e-05, "loss": 2.4478, "step": 2070 }, { "epoch": 0.43, "grad_norm": 0.9069787859916687, "learning_rate": 6.255136817380618e-05, "loss": 2.4026, "step": 2075 }, { "epoch": 0.43, "grad_norm": 0.915317952632904, "learning_rate": 6.23899211372984e-05, "loss": 2.4569, "step": 2080 }, { "epoch": 0.43, "grad_norm": 0.9367572665214539, "learning_rate": 6.222833635393772e-05, "loss": 2.4625, "step": 2085 }, { "epoch": 0.43, "grad_norm": 0.9076709151268005, "learning_rate": 6.206661562016782e-05, "loss": 2.3889, "step": 2090 }, { "epoch": 0.44, "grad_norm": 0.909893810749054, "learning_rate": 6.190476073394382e-05, "loss": 2.4695, "step": 2095 }, { "epoch": 0.44, "grad_norm": 0.9313353896141052, "learning_rate": 6.17427734947123e-05, "loss": 2.5082, "step": 2100 }, { "epoch": 0.44, "grad_norm": 0.9223794937133789, "learning_rate": 6.158065570339127e-05, "loss": 2.4231, "step": 2105 }, { "epoch": 0.44, "grad_norm": 0.9507770538330078, "learning_rate": 6.141840916235021e-05, "loss": 2.4544, "step": 2110 }, { "epoch": 0.44, "grad_norm": 0.9403271675109863, "learning_rate": 6.125603567539001e-05, "loss": 2.4226, "step": 2115 }, { "epoch": 0.44, "grad_norm": 0.9203287363052368, "learning_rate": 6.109353704772284e-05, "loss": 2.419, "step": 2120 }, { "epoch": 0.44, "grad_norm": 0.9149782657623291, "learning_rate": 6.0930915085952164e-05, "loss": 2.4555, "step": 2125 }, { "epoch": 0.44, "grad_norm": 0.910736620426178, "learning_rate": 6.076817159805267e-05, "loss": 2.4473, "step": 2130 }, { "epoch": 0.44, "grad_norm": 0.9063466191291809, "learning_rate": 6.06053083933501e-05, "loss": 2.4813, "step": 2135 }, { "epoch": 0.44, "grad_norm": 0.9377574324607849, "learning_rate": 6.044232728250116e-05, "loss": 2.4192, "step": 2140 }, { "epoch": 0.45, "grad_norm": 0.9170573353767395, "learning_rate": 6.027923007747339e-05, "loss": 2.391, "step": 2145 }, { "epoch": 0.45, "grad_norm": 0.9315197467803955, "learning_rate": 6.011601859152506e-05, "loss": 2.3765, "step": 2150 }, { "epoch": 0.45, "grad_norm": 0.9108867645263672, "learning_rate": 5.995269463918495e-05, "loss": 2.4168, "step": 2155 }, { "epoch": 0.45, "grad_norm": 0.9182575941085815, "learning_rate": 5.97892600362322e-05, "loss": 2.441, "step": 2160 }, { "epoch": 0.45, "grad_norm": 0.9704192876815796, "learning_rate": 5.962571659967614e-05, "loss": 2.4847, "step": 2165 }, { "epoch": 0.45, "grad_norm": 0.9109625220298767, "learning_rate": 5.946206614773606e-05, "loss": 2.4149, "step": 2170 }, { "epoch": 0.45, "grad_norm": 0.949019730091095, "learning_rate": 5.929831049982103e-05, "loss": 2.4266, "step": 2175 }, { "epoch": 0.45, "grad_norm": 0.9033789038658142, "learning_rate": 5.9134451476509633e-05, "loss": 2.4469, "step": 2180 }, { "epoch": 0.45, "grad_norm": 0.9018952250480652, "learning_rate": 5.897049089952974e-05, "loss": 2.4051, "step": 2185 }, { "epoch": 0.46, "grad_norm": 0.941893458366394, "learning_rate": 5.880643059173826e-05, "loss": 2.438, "step": 2190 }, { "epoch": 0.46, "grad_norm": 0.9332031011581421, "learning_rate": 5.864227237710093e-05, "loss": 2.393, "step": 2195 }, { "epoch": 0.46, "grad_norm": 0.9140224456787109, "learning_rate": 5.847801808067189e-05, "loss": 2.4287, "step": 2200 }, { "epoch": 0.46, "grad_norm": 0.9171401858329773, "learning_rate": 5.831366952857357e-05, "loss": 2.4062, "step": 2205 }, { "epoch": 0.46, "grad_norm": 0.9194997549057007, "learning_rate": 5.814922854797622e-05, "loss": 2.4013, "step": 2210 }, { "epoch": 0.46, "grad_norm": 0.9855097532272339, "learning_rate": 5.798469696707775e-05, "loss": 2.4905, "step": 2215 }, { "epoch": 0.46, "grad_norm": 0.946471095085144, "learning_rate": 5.782007661508331e-05, "loss": 2.4176, "step": 2220 }, { "epoch": 0.46, "grad_norm": 0.9259464740753174, "learning_rate": 5.765536932218495e-05, "loss": 2.4195, "step": 2225 }, { "epoch": 0.46, "grad_norm": 0.9670231342315674, "learning_rate": 5.7490576919541315e-05, "loss": 2.433, "step": 2230 }, { "epoch": 0.46, "grad_norm": 0.911496639251709, "learning_rate": 5.732570123925729e-05, "loss": 2.4162, "step": 2235 }, { "epoch": 0.47, "grad_norm": 0.925327718257904, "learning_rate": 5.7160744114363593e-05, "loss": 2.4379, "step": 2240 }, { "epoch": 0.47, "grad_norm": 0.9388567805290222, "learning_rate": 5.699570737879641e-05, "loss": 2.4124, "step": 2245 }, { "epoch": 0.47, "grad_norm": 0.9244475960731506, "learning_rate": 5.683059286737702e-05, "loss": 2.4292, "step": 2250 }, { "epoch": 0.47, "grad_norm": 0.9055370092391968, "learning_rate": 5.666540241579139e-05, "loss": 2.3783, "step": 2255 }, { "epoch": 0.47, "grad_norm": 0.9099582433700562, "learning_rate": 5.6500137860569766e-05, "loss": 2.4469, "step": 2260 }, { "epoch": 0.47, "grad_norm": 0.8946197628974915, "learning_rate": 5.633480103906624e-05, "loss": 2.4016, "step": 2265 }, { "epoch": 0.47, "grad_norm": 0.9014321565628052, "learning_rate": 5.616939378943834e-05, "loss": 2.4056, "step": 2270 }, { "epoch": 0.47, "grad_norm": 0.9044644832611084, "learning_rate": 5.6003917950626595e-05, "loss": 2.3981, "step": 2275 }, { "epoch": 0.47, "grad_norm": 0.94088214635849, "learning_rate": 5.583837536233407e-05, "loss": 2.4835, "step": 2280 }, { "epoch": 0.47, "grad_norm": 0.8926177620887756, "learning_rate": 5.567276786500596e-05, "loss": 2.4517, "step": 2285 }, { "epoch": 0.48, "grad_norm": 0.8916089534759521, "learning_rate": 5.5507097299809054e-05, "loss": 2.3458, "step": 2290 }, { "epoch": 0.48, "grad_norm": 0.937061607837677, "learning_rate": 5.534136550861133e-05, "loss": 2.4264, "step": 2295 }, { "epoch": 0.48, "grad_norm": 0.9375845193862915, "learning_rate": 5.5175574333961465e-05, "loss": 2.4403, "step": 2300 }, { "epoch": 0.48, "grad_norm": 0.9123733639717102, "learning_rate": 5.500972561906832e-05, "loss": 2.4226, "step": 2305 }, { "epoch": 0.48, "grad_norm": 0.9240120053291321, "learning_rate": 5.484382120778048e-05, "loss": 2.436, "step": 2310 }, { "epoch": 0.48, "grad_norm": 0.9178423881530762, "learning_rate": 5.467786294456575e-05, "loss": 2.4285, "step": 2315 }, { "epoch": 0.48, "grad_norm": 0.9010103940963745, "learning_rate": 5.451185267449061e-05, "loss": 2.3652, "step": 2320 }, { "epoch": 0.48, "grad_norm": 0.9071227312088013, "learning_rate": 5.43457922431998e-05, "loss": 2.4407, "step": 2325 }, { "epoch": 0.48, "grad_norm": 0.9325879216194153, "learning_rate": 5.417968349689566e-05, "loss": 2.4164, "step": 2330 }, { "epoch": 0.49, "grad_norm": 0.9494773149490356, "learning_rate": 5.401352828231772e-05, "loss": 2.441, "step": 2335 }, { "epoch": 0.49, "grad_norm": 0.9466855525970459, "learning_rate": 5.384732844672211e-05, "loss": 2.4787, "step": 2340 }, { "epoch": 0.49, "grad_norm": 0.907010555267334, "learning_rate": 5.368108583786107e-05, "loss": 2.3742, "step": 2345 }, { "epoch": 0.49, "grad_norm": 0.93699711561203, "learning_rate": 5.3514802303962344e-05, "loss": 2.3629, "step": 2350 }, { "epoch": 0.49, "grad_norm": 1.030379295349121, "learning_rate": 5.334847969370868e-05, "loss": 2.4036, "step": 2355 }, { "epoch": 0.49, "grad_norm": 0.907166063785553, "learning_rate": 5.3182119856217284e-05, "loss": 2.4725, "step": 2360 }, { "epoch": 0.49, "grad_norm": 0.9387956261634827, "learning_rate": 5.3015724641019214e-05, "loss": 2.3503, "step": 2365 }, { "epoch": 0.49, "grad_norm": 0.9360509514808655, "learning_rate": 5.284929589803884e-05, "loss": 2.4759, "step": 2370 }, { "epoch": 0.49, "grad_norm": 0.9375712871551514, "learning_rate": 5.2682835477573336e-05, "loss": 2.4025, "step": 2375 }, { "epoch": 0.49, "grad_norm": 0.916762113571167, "learning_rate": 5.2516345230271965e-05, "loss": 2.3794, "step": 2380 }, { "epoch": 0.5, "grad_norm": 0.9070376753807068, "learning_rate": 5.234982700711569e-05, "loss": 2.4983, "step": 2385 }, { "epoch": 0.5, "grad_norm": 0.9349552392959595, "learning_rate": 5.218328265939643e-05, "loss": 2.4623, "step": 2390 }, { "epoch": 0.5, "grad_norm": 0.9127172827720642, "learning_rate": 5.201671403869657e-05, "loss": 2.4297, "step": 2395 }, { "epoch": 0.5, "grad_norm": 0.9283477067947388, "learning_rate": 5.1850122996868366e-05, "loss": 2.4022, "step": 2400 }, { "epoch": 0.5, "grad_norm": 0.9371105432510376, "learning_rate": 5.168351138601334e-05, "loss": 2.4772, "step": 2405 }, { "epoch": 0.5, "grad_norm": 0.9107518196105957, "learning_rate": 5.1516881058461675e-05, "loss": 2.3453, "step": 2410 }, { "epoch": 0.5, "grad_norm": 0.9191110730171204, "learning_rate": 5.135023386675166e-05, "loss": 2.4052, "step": 2415 }, { "epoch": 0.5, "grad_norm": 0.904039740562439, "learning_rate": 5.118357166360906e-05, "loss": 2.3583, "step": 2420 }, { "epoch": 0.5, "grad_norm": 0.9445516467094421, "learning_rate": 5.101689630192655e-05, "loss": 2.4346, "step": 2425 }, { "epoch": 0.51, "grad_norm": 0.9454602003097534, "learning_rate": 5.085020963474307e-05, "loss": 2.3718, "step": 2430 }, { "epoch": 0.51, "grad_norm": 0.9205687642097473, "learning_rate": 5.068351351522329e-05, "loss": 2.4452, "step": 2435 }, { "epoch": 0.51, "grad_norm": 0.9437626004219055, "learning_rate": 5.0516809796636935e-05, "loss": 2.4275, "step": 2440 }, { "epoch": 0.51, "grad_norm": 0.9340523481369019, "learning_rate": 5.035010033233821e-05, "loss": 2.4543, "step": 2445 }, { "epoch": 0.51, "grad_norm": 0.9576351046562195, "learning_rate": 5.018338697574523e-05, "loss": 2.3797, "step": 2450 }, { "epoch": 0.51, "grad_norm": 0.888482391834259, "learning_rate": 5.0016671580319354e-05, "loss": 2.4352, "step": 2455 }, { "epoch": 0.51, "grad_norm": 0.9680615663528442, "learning_rate": 4.984995599954461e-05, "loss": 2.4499, "step": 2460 }, { "epoch": 0.51, "grad_norm": 0.9344144463539124, "learning_rate": 4.968324208690712e-05, "loss": 2.4594, "step": 2465 }, { "epoch": 0.51, "grad_norm": 0.9282997846603394, "learning_rate": 4.951653169587441e-05, "loss": 2.3162, "step": 2470 }, { "epoch": 0.51, "grad_norm": 0.8972924947738647, "learning_rate": 4.93498266798749e-05, "loss": 2.428, "step": 2475 }, { "epoch": 0.52, "grad_norm": 0.9192982912063599, "learning_rate": 4.918312889227722e-05, "loss": 2.4126, "step": 2480 }, { "epoch": 0.52, "grad_norm": 0.9322929382324219, "learning_rate": 4.901644018636966e-05, "loss": 2.3799, "step": 2485 }, { "epoch": 0.52, "grad_norm": 0.9455317258834839, "learning_rate": 4.8849762415339526e-05, "loss": 2.4297, "step": 2490 }, { "epoch": 0.52, "grad_norm": 0.9278294444084167, "learning_rate": 4.868309743225256e-05, "loss": 2.3809, "step": 2495 }, { "epoch": 0.52, "grad_norm": 0.921036958694458, "learning_rate": 4.851644709003233e-05, "loss": 2.4302, "step": 2500 }, { "epoch": 0.52, "grad_norm": 0.933717668056488, "learning_rate": 4.834981324143964e-05, "loss": 2.4012, "step": 2505 }, { "epoch": 0.52, "grad_norm": 0.9190297722816467, "learning_rate": 4.818319773905191e-05, "loss": 2.3523, "step": 2510 }, { "epoch": 0.52, "grad_norm": 0.9000710844993591, "learning_rate": 4.801660243524261e-05, "loss": 2.4201, "step": 2515 }, { "epoch": 0.52, "grad_norm": 0.9176121950149536, "learning_rate": 4.7850029182160626e-05, "loss": 2.3806, "step": 2520 }, { "epoch": 0.52, "grad_norm": 0.9202491044998169, "learning_rate": 4.768347983170973e-05, "loss": 2.4076, "step": 2525 }, { "epoch": 0.53, "grad_norm": 0.9020574688911438, "learning_rate": 4.7516956235527884e-05, "loss": 2.4504, "step": 2530 }, { "epoch": 0.53, "grad_norm": 0.894968569278717, "learning_rate": 4.735046024496682e-05, "loss": 2.404, "step": 2535 }, { "epoch": 0.53, "grad_norm": 0.9300612807273865, "learning_rate": 4.7183993711071286e-05, "loss": 2.3997, "step": 2540 }, { "epoch": 0.53, "grad_norm": 0.929145097732544, "learning_rate": 4.7017558484558554e-05, "loss": 2.4052, "step": 2545 }, { "epoch": 0.53, "grad_norm": 0.9086617231369019, "learning_rate": 4.6851156415797844e-05, "loss": 2.402, "step": 2550 }, { "epoch": 0.53, "grad_norm": 0.894952118396759, "learning_rate": 4.6684789354789746e-05, "loss": 2.4397, "step": 2555 }, { "epoch": 0.53, "grad_norm": 0.9135778546333313, "learning_rate": 4.651845915114563e-05, "loss": 2.4014, "step": 2560 }, { "epoch": 0.53, "grad_norm": 0.9323270916938782, "learning_rate": 4.6352167654067095e-05, "loss": 2.4016, "step": 2565 }, { "epoch": 0.53, "grad_norm": 0.911676824092865, "learning_rate": 4.618591671232544e-05, "loss": 2.3717, "step": 2570 }, { "epoch": 0.54, "grad_norm": 0.881301999092102, "learning_rate": 4.601970817424106e-05, "loss": 2.3498, "step": 2575 }, { "epoch": 0.54, "grad_norm": 0.9046157002449036, "learning_rate": 4.585354388766292e-05, "loss": 2.3722, "step": 2580 }, { "epoch": 0.54, "grad_norm": 0.9167553186416626, "learning_rate": 4.568742569994802e-05, "loss": 2.3989, "step": 2585 }, { "epoch": 0.54, "grad_norm": 0.9558010697364807, "learning_rate": 4.552135545794086e-05, "loss": 2.4688, "step": 2590 }, { "epoch": 0.54, "grad_norm": 0.930412232875824, "learning_rate": 4.535533500795288e-05, "loss": 2.4169, "step": 2595 }, { "epoch": 0.54, "grad_norm": 0.95481938123703, "learning_rate": 4.5189366195741953e-05, "loss": 2.4134, "step": 2600 }, { "epoch": 0.54, "grad_norm": 0.8904323577880859, "learning_rate": 4.502345086649186e-05, "loss": 2.3209, "step": 2605 }, { "epoch": 0.54, "grad_norm": 0.9117818474769592, "learning_rate": 4.485759086479179e-05, "loss": 2.3939, "step": 2610 }, { "epoch": 0.54, "grad_norm": 0.9135521650314331, "learning_rate": 4.469178803461579e-05, "loss": 2.3403, "step": 2615 }, { "epoch": 0.54, "grad_norm": 0.8931097984313965, "learning_rate": 4.4526044219302326e-05, "loss": 2.3794, "step": 2620 }, { "epoch": 0.55, "grad_norm": 0.9080528020858765, "learning_rate": 4.4360361261533745e-05, "loss": 2.3682, "step": 2625 }, { "epoch": 0.55, "grad_norm": 0.8922243118286133, "learning_rate": 4.419474100331579e-05, "loss": 2.3913, "step": 2630 }, { "epoch": 0.55, "grad_norm": 0.9073590636253357, "learning_rate": 4.402918528595715e-05, "loss": 2.332, "step": 2635 }, { "epoch": 0.55, "grad_norm": 0.8992304801940918, "learning_rate": 4.386369595004896e-05, "loss": 2.4228, "step": 2640 }, { "epoch": 0.55, "grad_norm": 0.9212525486946106, "learning_rate": 4.3698274835444354e-05, "loss": 2.3724, "step": 2645 }, { "epoch": 0.55, "grad_norm": 0.9063347578048706, "learning_rate": 4.3532923781238e-05, "loss": 2.3823, "step": 2650 }, { "epoch": 0.55, "grad_norm": 0.9009177684783936, "learning_rate": 4.336764462574566e-05, "loss": 2.3531, "step": 2655 }, { "epoch": 0.55, "grad_norm": 0.9231202602386475, "learning_rate": 4.320243920648376e-05, "loss": 2.4197, "step": 2660 }, { "epoch": 0.55, "grad_norm": 0.9398454427719116, "learning_rate": 4.303730936014894e-05, "loss": 2.392, "step": 2665 }, { "epoch": 0.55, "grad_norm": 0.8874997496604919, "learning_rate": 4.287225692259765e-05, "loss": 2.3254, "step": 2670 }, { "epoch": 0.56, "grad_norm": 0.937724232673645, "learning_rate": 4.270728372882575e-05, "loss": 2.416, "step": 2675 }, { "epoch": 0.56, "grad_norm": 0.9113511443138123, "learning_rate": 4.254239161294804e-05, "loss": 2.4305, "step": 2680 }, { "epoch": 0.56, "grad_norm": 0.9297624230384827, "learning_rate": 4.237758240817802e-05, "loss": 2.3571, "step": 2685 }, { "epoch": 0.56, "grad_norm": 0.8927726745605469, "learning_rate": 4.2212857946807336e-05, "loss": 2.3647, "step": 2690 }, { "epoch": 0.56, "grad_norm": 0.9231744408607483, "learning_rate": 4.2048220060185516e-05, "loss": 2.4092, "step": 2695 }, { "epoch": 0.56, "grad_norm": 0.9141883850097656, "learning_rate": 4.188367057869957e-05, "loss": 2.3413, "step": 2700 }, { "epoch": 0.56, "grad_norm": 0.9077816605567932, "learning_rate": 4.171921133175365e-05, "loss": 2.4024, "step": 2705 }, { "epoch": 0.56, "grad_norm": 0.9256351590156555, "learning_rate": 4.155484414774872e-05, "loss": 2.4133, "step": 2710 }, { "epoch": 0.56, "grad_norm": 0.9212350249290466, "learning_rate": 4.139057085406221e-05, "loss": 2.3808, "step": 2715 }, { "epoch": 0.57, "grad_norm": 0.9098197221755981, "learning_rate": 4.1226393277027726e-05, "loss": 2.4289, "step": 2720 }, { "epoch": 0.57, "grad_norm": 0.9494619369506836, "learning_rate": 4.106231324191471e-05, "loss": 2.4195, "step": 2725 }, { "epoch": 0.57, "grad_norm": 0.8974795937538147, "learning_rate": 4.089833257290817e-05, "loss": 2.4012, "step": 2730 }, { "epoch": 0.57, "grad_norm": 0.8910524249076843, "learning_rate": 4.073445309308842e-05, "loss": 2.3182, "step": 2735 }, { "epoch": 0.57, "grad_norm": 0.8898556232452393, "learning_rate": 4.0570676624410756e-05, "loss": 2.4341, "step": 2740 }, { "epoch": 0.57, "grad_norm": 0.9277542233467102, "learning_rate": 4.040700498768525e-05, "loss": 2.387, "step": 2745 }, { "epoch": 0.57, "grad_norm": 0.9058116674423218, "learning_rate": 4.024344000255648e-05, "loss": 2.3898, "step": 2750 }, { "epoch": 0.57, "grad_norm": 0.9045486450195312, "learning_rate": 4.0079983487483313e-05, "loss": 2.3977, "step": 2755 }, { "epoch": 0.57, "grad_norm": 0.9825146794319153, "learning_rate": 3.9916637259718683e-05, "loss": 2.3811, "step": 2760 }, { "epoch": 0.57, "grad_norm": 0.879527747631073, "learning_rate": 3.9753403135289396e-05, "loss": 2.3712, "step": 2765 }, { "epoch": 0.58, "grad_norm": 0.9239062666893005, "learning_rate": 3.9590282928975914e-05, "loss": 2.3904, "step": 2770 }, { "epoch": 0.58, "grad_norm": 0.9276576638221741, "learning_rate": 3.942727845429221e-05, "loss": 2.3147, "step": 2775 }, { "epoch": 0.58, "grad_norm": 0.9070415496826172, "learning_rate": 3.926439152346558e-05, "loss": 2.3752, "step": 2780 }, { "epoch": 0.58, "grad_norm": 0.9174474477767944, "learning_rate": 3.910162394741653e-05, "loss": 2.3734, "step": 2785 }, { "epoch": 0.58, "grad_norm": 0.9264554977416992, "learning_rate": 3.893897753573861e-05, "loss": 2.3948, "step": 2790 }, { "epoch": 0.58, "grad_norm": 0.9267555475234985, "learning_rate": 3.877645409667829e-05, "loss": 2.394, "step": 2795 }, { "epoch": 0.58, "grad_norm": 0.9059020280838013, "learning_rate": 3.861405543711491e-05, "loss": 2.3697, "step": 2800 }, { "epoch": 0.58, "grad_norm": 0.9132973551750183, "learning_rate": 3.8451783362540507e-05, "loss": 2.409, "step": 2805 }, { "epoch": 0.58, "grad_norm": 0.920639157295227, "learning_rate": 3.828963967703983e-05, "loss": 2.3474, "step": 2810 }, { "epoch": 0.59, "grad_norm": 0.9337471723556519, "learning_rate": 3.8127626183270223e-05, "loss": 2.3821, "step": 2815 }, { "epoch": 0.59, "grad_norm": 0.9225865006446838, "learning_rate": 3.796574468244161e-05, "loss": 2.4106, "step": 2820 }, { "epoch": 0.59, "grad_norm": 0.9328856468200684, "learning_rate": 3.7803996974296444e-05, "loss": 2.4665, "step": 2825 }, { "epoch": 0.59, "grad_norm": 0.9074559807777405, "learning_rate": 3.7642384857089776e-05, "loss": 2.3724, "step": 2830 }, { "epoch": 0.59, "grad_norm": 0.9395449161529541, "learning_rate": 3.748091012756915e-05, "loss": 2.3418, "step": 2835 }, { "epoch": 0.59, "grad_norm": 0.9058288931846619, "learning_rate": 3.731957458095467e-05, "loss": 2.3908, "step": 2840 }, { "epoch": 0.59, "grad_norm": 1.0165904760360718, "learning_rate": 3.71583800109191e-05, "loss": 2.3796, "step": 2845 }, { "epoch": 0.59, "grad_norm": 0.9278620481491089, "learning_rate": 3.699732820956784e-05, "loss": 2.329, "step": 2850 }, { "epoch": 0.59, "grad_norm": 0.9070698618888855, "learning_rate": 3.6836420967419057e-05, "loss": 2.4322, "step": 2855 }, { "epoch": 0.59, "grad_norm": 0.9063517451286316, "learning_rate": 3.6675660073383745e-05, "loss": 2.3743, "step": 2860 }, { "epoch": 0.6, "grad_norm": 0.8901442885398865, "learning_rate": 3.6515047314745856e-05, "loss": 2.317, "step": 2865 }, { "epoch": 0.6, "grad_norm": 0.9144355058670044, "learning_rate": 3.6354584477142437e-05, "loss": 2.4009, "step": 2870 }, { "epoch": 0.6, "grad_norm": 0.9156922101974487, "learning_rate": 3.6194273344543736e-05, "loss": 2.3737, "step": 2875 }, { "epoch": 0.6, "grad_norm": 0.9051995277404785, "learning_rate": 3.6034115699233425e-05, "loss": 2.4593, "step": 2880 }, { "epoch": 0.6, "grad_norm": 0.9019202589988708, "learning_rate": 3.5874113321788736e-05, "loss": 2.3608, "step": 2885 }, { "epoch": 0.6, "grad_norm": 0.9227021336555481, "learning_rate": 3.571426799106071e-05, "loss": 2.3829, "step": 2890 }, { "epoch": 0.6, "grad_norm": 0.9123120903968811, "learning_rate": 3.555458148415437e-05, "loss": 2.3874, "step": 2895 }, { "epoch": 0.6, "grad_norm": 0.9297784566879272, "learning_rate": 3.539505557640901e-05, "loss": 2.349, "step": 2900 }, { "epoch": 0.6, "grad_norm": 0.9303633570671082, "learning_rate": 3.523569204137843e-05, "loss": 2.4062, "step": 2905 }, { "epoch": 0.6, "grad_norm": 0.9044543504714966, "learning_rate": 3.5076492650811246e-05, "loss": 2.4351, "step": 2910 }, { "epoch": 0.61, "grad_norm": 0.9213395714759827, "learning_rate": 3.491745917463113e-05, "loss": 2.316, "step": 2915 }, { "epoch": 0.61, "grad_norm": 0.9098991751670837, "learning_rate": 3.475859338091721e-05, "loss": 2.3523, "step": 2920 }, { "epoch": 0.61, "grad_norm": 0.9128162264823914, "learning_rate": 3.4599897035884374e-05, "loss": 2.3663, "step": 2925 }, { "epoch": 0.61, "grad_norm": 0.9543825387954712, "learning_rate": 3.444137190386363e-05, "loss": 2.3587, "step": 2930 }, { "epoch": 0.61, "grad_norm": 0.9522660970687866, "learning_rate": 3.4283019747282514e-05, "loss": 2.3508, "step": 2935 }, { "epoch": 0.61, "grad_norm": 0.8969595432281494, "learning_rate": 3.412484232664545e-05, "loss": 2.3126, "step": 2940 }, { "epoch": 0.61, "grad_norm": 0.9274249076843262, "learning_rate": 3.396684140051424e-05, "loss": 2.3236, "step": 2945 }, { "epoch": 0.61, "grad_norm": 0.8981707692146301, "learning_rate": 3.3809018725488466e-05, "loss": 2.3745, "step": 2950 }, { "epoch": 0.61, "grad_norm": 0.947674036026001, "learning_rate": 3.365137605618598e-05, "loss": 2.3715, "step": 2955 }, { "epoch": 0.62, "grad_norm": 0.891778290271759, "learning_rate": 3.3493915145223395e-05, "loss": 2.3635, "step": 2960 }, { "epoch": 0.62, "grad_norm": 0.9314977526664734, "learning_rate": 3.3336637743196584e-05, "loss": 2.2918, "step": 2965 }, { "epoch": 0.62, "grad_norm": 0.9266039729118347, "learning_rate": 3.317954559866126e-05, "loss": 2.4019, "step": 2970 }, { "epoch": 0.62, "grad_norm": 0.8894385099411011, "learning_rate": 3.302264045811344e-05, "loss": 2.3935, "step": 2975 }, { "epoch": 0.62, "grad_norm": 0.9218305945396423, "learning_rate": 3.286592406597021e-05, "loss": 2.3447, "step": 2980 }, { "epoch": 0.62, "grad_norm": 0.9280163645744324, "learning_rate": 3.270939816455012e-05, "loss": 2.3861, "step": 2985 }, { "epoch": 0.62, "grad_norm": 0.9048585891723633, "learning_rate": 3.255306449405395e-05, "loss": 2.4027, "step": 2990 }, { "epoch": 0.62, "grad_norm": 0.8996835947036743, "learning_rate": 3.2396924792545304e-05, "loss": 2.3135, "step": 2995 }, { "epoch": 0.62, "grad_norm": 0.9637964367866516, "learning_rate": 3.224098079593132e-05, "loss": 2.4027, "step": 3000 }, { "epoch": 0.62, "grad_norm": 0.8991714715957642, "learning_rate": 3.2085234237943354e-05, "loss": 2.3699, "step": 3005 }, { "epoch": 0.63, "grad_norm": 0.9499838948249817, "learning_rate": 3.19296868501177e-05, "loss": 2.3901, "step": 3010 }, { "epoch": 0.63, "grad_norm": 0.9127416014671326, "learning_rate": 3.177434036177636e-05, "loss": 2.4039, "step": 3015 }, { "epoch": 0.63, "grad_norm": 0.9323961734771729, "learning_rate": 3.1619196500007804e-05, "loss": 2.3456, "step": 3020 }, { "epoch": 0.63, "grad_norm": 0.9023792743682861, "learning_rate": 3.146425698964776e-05, "loss": 2.3986, "step": 3025 }, { "epoch": 0.63, "grad_norm": 0.9215497374534607, "learning_rate": 3.1309523553260046e-05, "loss": 2.3066, "step": 3030 }, { "epoch": 0.63, "grad_norm": 0.9214668869972229, "learning_rate": 3.115499791111743e-05, "loss": 2.3897, "step": 3035 }, { "epoch": 0.63, "grad_norm": 0.8906429409980774, "learning_rate": 3.10006817811825e-05, "loss": 2.2407, "step": 3040 }, { "epoch": 0.63, "grad_norm": 0.9099827408790588, "learning_rate": 3.084657687908855e-05, "loss": 2.3565, "step": 3045 }, { "epoch": 0.63, "grad_norm": 0.8930659890174866, "learning_rate": 3.069268491812052e-05, "loss": 2.3377, "step": 3050 }, { "epoch": 0.63, "grad_norm": 0.904613733291626, "learning_rate": 3.0539007609195934e-05, "loss": 2.3979, "step": 3055 }, { "epoch": 0.64, "grad_norm": 0.9138222932815552, "learning_rate": 3.0385546660845908e-05, "loss": 2.341, "step": 3060 }, { "epoch": 0.64, "grad_norm": 0.928210437297821, "learning_rate": 3.0232303779196132e-05, "loss": 2.3547, "step": 3065 }, { "epoch": 0.64, "grad_norm": 0.9129458665847778, "learning_rate": 3.0079280667947885e-05, "loss": 2.3852, "step": 3070 }, { "epoch": 0.64, "grad_norm": 0.9085707664489746, "learning_rate": 2.9926479028359132e-05, "loss": 2.4033, "step": 3075 }, { "epoch": 0.64, "grad_norm": 0.919783890247345, "learning_rate": 2.97739005592256e-05, "loss": 2.3779, "step": 3080 }, { "epoch": 0.64, "grad_norm": 0.9020473957061768, "learning_rate": 2.962154695686187e-05, "loss": 2.311, "step": 3085 }, { "epoch": 0.64, "grad_norm": 0.9025293588638306, "learning_rate": 2.9469419915082536e-05, "loss": 2.3531, "step": 3090 }, { "epoch": 0.64, "grad_norm": 0.9105976223945618, "learning_rate": 2.9317521125183368e-05, "loss": 2.379, "step": 3095 }, { "epoch": 0.64, "grad_norm": 0.9467954039573669, "learning_rate": 2.9165852275922524e-05, "loss": 2.2659, "step": 3100 }, { "epoch": 0.65, "grad_norm": 0.9154301881790161, "learning_rate": 2.901441505350174e-05, "loss": 2.4311, "step": 3105 }, { "epoch": 0.65, "grad_norm": 0.9146058559417725, "learning_rate": 2.886321114154762e-05, "loss": 2.3807, "step": 3110 }, { "epoch": 0.65, "grad_norm": 0.9149680137634277, "learning_rate": 2.87122422210929e-05, "loss": 2.3241, "step": 3115 }, { "epoch": 0.65, "grad_norm": 0.9131379127502441, "learning_rate": 2.8561509970557736e-05, "loss": 2.254, "step": 3120 }, { "epoch": 0.65, "grad_norm": 0.9278967380523682, "learning_rate": 2.8411016065731146e-05, "loss": 2.3627, "step": 3125 }, { "epoch": 0.65, "grad_norm": 0.9049065113067627, "learning_rate": 2.826076217975222e-05, "loss": 2.3892, "step": 3130 }, { "epoch": 0.65, "grad_norm": 0.9476072788238525, "learning_rate": 2.8110749983091632e-05, "loss": 2.3093, "step": 3135 }, { "epoch": 0.65, "grad_norm": 0.9162594079971313, "learning_rate": 2.7960981143533053e-05, "loss": 2.3172, "step": 3140 }, { "epoch": 0.65, "grad_norm": 0.9001609086990356, "learning_rate": 2.781145732615457e-05, "loss": 2.3749, "step": 3145 }, { "epoch": 0.65, "grad_norm": 0.8743308186531067, "learning_rate": 2.7662180193310218e-05, "loss": 2.3167, "step": 3150 }, { "epoch": 0.66, "grad_norm": 0.9399544596672058, "learning_rate": 2.751315140461145e-05, "loss": 2.2892, "step": 3155 }, { "epoch": 0.66, "grad_norm": 0.8993040919303894, "learning_rate": 2.7364372616908744e-05, "loss": 2.4035, "step": 3160 }, { "epoch": 0.66, "grad_norm": 0.8989415168762207, "learning_rate": 2.7215845484273152e-05, "loss": 2.3562, "step": 3165 }, { "epoch": 0.66, "grad_norm": 0.9252132773399353, "learning_rate": 2.7067571657977893e-05, "loss": 2.3413, "step": 3170 }, { "epoch": 0.66, "grad_norm": 0.9166437387466431, "learning_rate": 2.691955278648003e-05, "loss": 2.3385, "step": 3175 }, { "epoch": 0.66, "grad_norm": 0.9225659370422363, "learning_rate": 2.6771790515402112e-05, "loss": 2.369, "step": 3180 }, { "epoch": 0.66, "grad_norm": 0.9323848485946655, "learning_rate": 2.6624286487513916e-05, "loss": 2.3854, "step": 3185 }, { "epoch": 0.66, "grad_norm": 0.9322205185890198, "learning_rate": 2.6477042342714137e-05, "loss": 2.3311, "step": 3190 }, { "epoch": 0.66, "grad_norm": 0.9424782991409302, "learning_rate": 2.633005971801219e-05, "loss": 2.3474, "step": 3195 }, { "epoch": 0.67, "grad_norm": 0.9437265396118164, "learning_rate": 2.6183340247510013e-05, "loss": 2.3294, "step": 3200 }, { "epoch": 0.67, "grad_norm": 0.9069112539291382, "learning_rate": 2.6036885562383856e-05, "loss": 2.3599, "step": 3205 }, { "epoch": 0.67, "grad_norm": 0.9054490327835083, "learning_rate": 2.5890697290866206e-05, "loss": 2.3053, "step": 3210 }, { "epoch": 0.67, "grad_norm": 0.8967999815940857, "learning_rate": 2.5744777058227642e-05, "loss": 2.3403, "step": 3215 }, { "epoch": 0.67, "grad_norm": 0.9110304713249207, "learning_rate": 2.5599126486758777e-05, "loss": 2.2913, "step": 3220 }, { "epoch": 0.67, "grad_norm": 0.9111791849136353, "learning_rate": 2.5453747195752243e-05, "loss": 2.2663, "step": 3225 }, { "epoch": 0.67, "grad_norm": 0.90242999792099, "learning_rate": 2.530864080148464e-05, "loss": 2.3761, "step": 3230 }, { "epoch": 0.67, "grad_norm": 0.9307655692100525, "learning_rate": 2.5163808917198615e-05, "loss": 2.2882, "step": 3235 }, { "epoch": 0.67, "grad_norm": 0.9393335580825806, "learning_rate": 2.501925315308492e-05, "loss": 2.3714, "step": 3240 }, { "epoch": 0.67, "grad_norm": 0.9044473171234131, "learning_rate": 2.4874975116264477e-05, "loss": 2.3814, "step": 3245 }, { "epoch": 0.68, "grad_norm": 0.928254246711731, "learning_rate": 2.4730976410770534e-05, "loss": 2.3812, "step": 3250 }, { "epoch": 0.68, "grad_norm": 0.9389528036117554, "learning_rate": 2.458725863753084e-05, "loss": 2.3669, "step": 3255 }, { "epoch": 0.68, "grad_norm": 0.9334967732429504, "learning_rate": 2.4443823394349834e-05, "loss": 2.3799, "step": 3260 }, { "epoch": 0.68, "grad_norm": 0.9133898019790649, "learning_rate": 2.430067227589088e-05, "loss": 2.4019, "step": 3265 }, { "epoch": 0.68, "grad_norm": 0.8969634771347046, "learning_rate": 2.4157806873658517e-05, "loss": 2.3894, "step": 3270 }, { "epoch": 0.68, "grad_norm": 0.8983045220375061, "learning_rate": 2.401522877598087e-05, "loss": 2.3523, "step": 3275 }, { "epoch": 0.68, "grad_norm": 0.9043636322021484, "learning_rate": 2.3872939567991827e-05, "loss": 2.3632, "step": 3280 }, { "epoch": 0.68, "grad_norm": 0.8893569111824036, "learning_rate": 2.373094083161353e-05, "loss": 2.3254, "step": 3285 }, { "epoch": 0.68, "grad_norm": 0.9358165264129639, "learning_rate": 2.358923414553877e-05, "loss": 2.3605, "step": 3290 }, { "epoch": 0.68, "grad_norm": 0.8925164937973022, "learning_rate": 2.3447821085213405e-05, "loss": 2.3003, "step": 3295 }, { "epoch": 0.69, "grad_norm": 0.9067592024803162, "learning_rate": 2.3306703222818878e-05, "loss": 2.3638, "step": 3300 }, { "epoch": 0.69, "grad_norm": 0.9226784110069275, "learning_rate": 2.3165882127254705e-05, "loss": 2.2942, "step": 3305 }, { "epoch": 0.69, "grad_norm": 0.9251599907875061, "learning_rate": 2.302535936412108e-05, "loss": 2.381, "step": 3310 }, { "epoch": 0.69, "grad_norm": 0.9351403713226318, "learning_rate": 2.2885136495701415e-05, "loss": 2.3022, "step": 3315 }, { "epoch": 0.69, "grad_norm": 0.9234192967414856, "learning_rate": 2.274521508094501e-05, "loss": 2.3531, "step": 3320 }, { "epoch": 0.69, "grad_norm": 0.8894725441932678, "learning_rate": 2.2605596675449698e-05, "loss": 2.3283, "step": 3325 }, { "epoch": 0.69, "grad_norm": 0.8899527192115784, "learning_rate": 2.246628283144457e-05, "loss": 2.3818, "step": 3330 }, { "epoch": 0.69, "grad_norm": 0.9192507863044739, "learning_rate": 2.232727509777269e-05, "loss": 2.341, "step": 3335 }, { "epoch": 0.69, "grad_norm": 0.9314063191413879, "learning_rate": 2.2188575019873932e-05, "loss": 2.3511, "step": 3340 }, { "epoch": 0.7, "grad_norm": 0.9228346347808838, "learning_rate": 2.2050184139767704e-05, "loss": 2.3609, "step": 3345 }, { "epoch": 0.7, "grad_norm": 0.9301695823669434, "learning_rate": 2.191210399603591e-05, "loss": 2.3318, "step": 3350 }, { "epoch": 0.7, "grad_norm": 0.9185614585876465, "learning_rate": 2.1774336123805772e-05, "loss": 2.4049, "step": 3355 }, { "epoch": 0.7, "grad_norm": 0.9196087121963501, "learning_rate": 2.1636882054732776e-05, "loss": 2.3333, "step": 3360 }, { "epoch": 0.7, "grad_norm": 0.921023964881897, "learning_rate": 2.1499743316983684e-05, "loss": 2.3288, "step": 3365 }, { "epoch": 0.7, "grad_norm": 0.9260315299034119, "learning_rate": 2.1362921435219473e-05, "loss": 2.3624, "step": 3370 }, { "epoch": 0.7, "grad_norm": 0.9450960159301758, "learning_rate": 2.1226417930578464e-05, "loss": 2.4221, "step": 3375 }, { "epoch": 0.7, "grad_norm": 0.9169435501098633, "learning_rate": 2.109023432065935e-05, "loss": 2.3724, "step": 3380 }, { "epoch": 0.7, "grad_norm": 0.9358202219009399, "learning_rate": 2.095437211950434e-05, "loss": 2.3792, "step": 3385 }, { "epoch": 0.7, "grad_norm": 0.9272687435150146, "learning_rate": 2.0818832837582352e-05, "loss": 2.3538, "step": 3390 }, { "epoch": 0.71, "grad_norm": 0.9306479096412659, "learning_rate": 2.068361798177218e-05, "loss": 2.3273, "step": 3395 }, { "epoch": 0.71, "grad_norm": 0.9368727207183838, "learning_rate": 2.0548729055345778e-05, "loss": 2.324, "step": 3400 }, { "epoch": 0.71, "grad_norm": 0.9212680459022522, "learning_rate": 2.0414167557951514e-05, "loss": 2.3944, "step": 3405 }, { "epoch": 0.71, "grad_norm": 0.917975902557373, "learning_rate": 2.0279934985597527e-05, "loss": 2.3215, "step": 3410 }, { "epoch": 0.71, "grad_norm": 0.9334181547164917, "learning_rate": 2.0146032830635054e-05, "loss": 2.3373, "step": 3415 }, { "epoch": 0.71, "grad_norm": 0.9223392009735107, "learning_rate": 2.001246258174192e-05, "loss": 2.4093, "step": 3420 }, { "epoch": 0.71, "grad_norm": 0.9029899835586548, "learning_rate": 1.9879225723905886e-05, "loss": 2.3304, "step": 3425 }, { "epoch": 0.71, "grad_norm": 0.9155861735343933, "learning_rate": 1.9746323738408203e-05, "loss": 2.3056, "step": 3430 }, { "epoch": 0.71, "grad_norm": 0.9587869048118591, "learning_rate": 1.9613758102807117e-05, "loss": 2.3085, "step": 3435 }, { "epoch": 0.71, "grad_norm": 0.9450895190238953, "learning_rate": 1.9481530290921474e-05, "loss": 2.3667, "step": 3440 }, { "epoch": 0.72, "grad_norm": 0.9262810945510864, "learning_rate": 1.934964177281428e-05, "loss": 2.2548, "step": 3445 }, { "epoch": 0.72, "grad_norm": 0.9066665768623352, "learning_rate": 1.9218094014776434e-05, "loss": 2.3256, "step": 3450 }, { "epoch": 0.72, "grad_norm": 0.925977349281311, "learning_rate": 1.9086888479310333e-05, "loss": 2.3263, "step": 3455 }, { "epoch": 0.72, "grad_norm": 0.9015002846717834, "learning_rate": 1.895602662511371e-05, "loss": 2.193, "step": 3460 }, { "epoch": 0.72, "grad_norm": 0.900550365447998, "learning_rate": 1.8825509907063327e-05, "loss": 2.3363, "step": 3465 }, { "epoch": 0.72, "grad_norm": 0.932928204536438, "learning_rate": 1.8695339776198872e-05, "loss": 2.3329, "step": 3470 }, { "epoch": 0.72, "grad_norm": 0.9403447508811951, "learning_rate": 1.8565517679706783e-05, "loss": 2.3214, "step": 3475 }, { "epoch": 0.72, "grad_norm": 0.8891700506210327, "learning_rate": 1.8436045060904174e-05, "loss": 2.2526, "step": 3480 }, { "epoch": 0.72, "grad_norm": 0.9113773703575134, "learning_rate": 1.830692335922279e-05, "loss": 2.344, "step": 3485 }, { "epoch": 0.73, "grad_norm": 0.8912283182144165, "learning_rate": 1.8178154010192994e-05, "loss": 2.3739, "step": 3490 }, { "epoch": 0.73, "grad_norm": 0.925250232219696, "learning_rate": 1.8049738445427822e-05, "loss": 2.3043, "step": 3495 }, { "epoch": 0.73, "grad_norm": 0.925449013710022, "learning_rate": 1.7921678092607052e-05, "loss": 2.3278, "step": 3500 }, { "epoch": 0.73, "grad_norm": 0.9105491638183594, "learning_rate": 1.7793974375461352e-05, "loss": 2.2991, "step": 3505 }, { "epoch": 0.73, "grad_norm": 0.949779212474823, "learning_rate": 1.7666628713756417e-05, "loss": 2.3522, "step": 3510 }, { "epoch": 0.73, "grad_norm": 0.944808304309845, "learning_rate": 1.7539642523277228e-05, "loss": 2.3239, "step": 3515 }, { "epoch": 0.73, "grad_norm": 0.9059569239616394, "learning_rate": 1.7413017215812273e-05, "loss": 2.2738, "step": 3520 }, { "epoch": 0.73, "grad_norm": 0.9568001627922058, "learning_rate": 1.728675419913788e-05, "loss": 2.2816, "step": 3525 }, { "epoch": 0.73, "grad_norm": 0.9053666591644287, "learning_rate": 1.716085487700253e-05, "loss": 2.2909, "step": 3530 }, { "epoch": 0.73, "grad_norm": 0.9108089804649353, "learning_rate": 1.703532064911131e-05, "loss": 2.3613, "step": 3535 }, { "epoch": 0.74, "grad_norm": 0.9420007467269897, "learning_rate": 1.6910152911110283e-05, "loss": 2.3443, "step": 3540 }, { "epoch": 0.74, "grad_norm": 0.9437248706817627, "learning_rate": 1.6785353054571024e-05, "loss": 2.3616, "step": 3545 }, { "epoch": 0.74, "grad_norm": 0.9269158840179443, "learning_rate": 1.666092246697512e-05, "loss": 2.3475, "step": 3550 }, { "epoch": 0.74, "grad_norm": 0.9224128723144531, "learning_rate": 1.6536862531698766e-05, "loss": 2.3165, "step": 3555 }, { "epoch": 0.74, "grad_norm": 0.9019190669059753, "learning_rate": 1.6413174627997328e-05, "loss": 2.3543, "step": 3560 }, { "epoch": 0.74, "grad_norm": 1.174714207649231, "learning_rate": 1.6289860130990147e-05, "loss": 2.3201, "step": 3565 }, { "epoch": 0.74, "grad_norm": 0.9111523628234863, "learning_rate": 1.6166920411645064e-05, "loss": 2.3747, "step": 3570 }, { "epoch": 0.74, "grad_norm": 0.913772463798523, "learning_rate": 1.6044356836763315e-05, "loss": 2.3417, "step": 3575 }, { "epoch": 0.74, "grad_norm": 0.9173497557640076, "learning_rate": 1.5922170768964285e-05, "loss": 2.3649, "step": 3580 }, { "epoch": 0.75, "grad_norm": 0.9145777821540833, "learning_rate": 1.5800363566670362e-05, "loss": 2.3156, "step": 3585 }, { "epoch": 0.75, "grad_norm": 0.9274765849113464, "learning_rate": 1.5678936584091852e-05, "loss": 2.2597, "step": 3590 }, { "epoch": 0.75, "grad_norm": 0.9273658394813538, "learning_rate": 1.5557891171211892e-05, "loss": 2.3189, "step": 3595 }, { "epoch": 0.75, "grad_norm": 0.9246422052383423, "learning_rate": 1.5437228673771465e-05, "loss": 2.2589, "step": 3600 }, { "epoch": 0.75, "grad_norm": 0.946138322353363, "learning_rate": 1.5316950433254445e-05, "loss": 2.2714, "step": 3605 }, { "epoch": 0.75, "grad_norm": 0.9329668283462524, "learning_rate": 1.5197057786872649e-05, "loss": 2.2799, "step": 3610 }, { "epoch": 0.75, "grad_norm": 0.9475228786468506, "learning_rate": 1.5077552067551015e-05, "loss": 2.3693, "step": 3615 }, { "epoch": 0.75, "grad_norm": 0.9241766929626465, "learning_rate": 1.4958434603912747e-05, "loss": 2.3682, "step": 3620 }, { "epoch": 0.75, "grad_norm": 0.9258906841278076, "learning_rate": 1.4839706720264546e-05, "loss": 2.2912, "step": 3625 }, { "epoch": 0.75, "grad_norm": 0.9170982837677002, "learning_rate": 1.4721369736581924e-05, "loss": 2.2662, "step": 3630 }, { "epoch": 0.76, "grad_norm": 0.9363173246383667, "learning_rate": 1.4603424968494484e-05, "loss": 2.3692, "step": 3635 }, { "epoch": 0.76, "grad_norm": 0.9334496259689331, "learning_rate": 1.448587372727132e-05, "loss": 2.286, "step": 3640 }, { "epoch": 0.76, "grad_norm": 0.9272920489311218, "learning_rate": 1.4368717319806419e-05, "loss": 2.3078, "step": 3645 }, { "epoch": 0.76, "grad_norm": 0.9488955140113831, "learning_rate": 1.4251957048604152e-05, "loss": 2.3554, "step": 3650 }, { "epoch": 0.76, "grad_norm": 0.918768584728241, "learning_rate": 1.413559421176479e-05, "loss": 2.3845, "step": 3655 }, { "epoch": 0.76, "grad_norm": 0.9137639999389648, "learning_rate": 1.4019630102970056e-05, "loss": 2.3445, "step": 3660 }, { "epoch": 0.76, "grad_norm": 0.9241656064987183, "learning_rate": 1.3904066011468753e-05, "loss": 2.3188, "step": 3665 }, { "epoch": 0.76, "grad_norm": 0.9146557450294495, "learning_rate": 1.3788903222062433e-05, "loss": 2.3191, "step": 3670 }, { "epoch": 0.76, "grad_norm": 0.947729229927063, "learning_rate": 1.3674143015091118e-05, "loss": 2.353, "step": 3675 }, { "epoch": 0.76, "grad_norm": 0.9175273180007935, "learning_rate": 1.355978666641905e-05, "loss": 2.3044, "step": 3680 }, { "epoch": 0.77, "grad_norm": 0.9403489828109741, "learning_rate": 1.3445835447420507e-05, "loss": 2.3539, "step": 3685 }, { "epoch": 0.77, "grad_norm": 0.9238479137420654, "learning_rate": 1.3332290624965688e-05, "loss": 2.2995, "step": 3690 }, { "epoch": 0.77, "grad_norm": 0.9429262280464172, "learning_rate": 1.3219153461406609e-05, "loss": 2.3249, "step": 3695 }, { "epoch": 0.77, "grad_norm": 0.8907161355018616, "learning_rate": 1.3106425214563078e-05, "loss": 2.3333, "step": 3700 }, { "epoch": 0.77, "grad_norm": 0.9309660792350769, "learning_rate": 1.2994107137708716e-05, "loss": 2.3351, "step": 3705 }, { "epoch": 0.77, "grad_norm": 0.9421401619911194, "learning_rate": 1.2882200479556988e-05, "loss": 2.3823, "step": 3710 }, { "epoch": 0.77, "grad_norm": 0.9495819807052612, "learning_rate": 1.2770706484247397e-05, "loss": 2.3302, "step": 3715 }, { "epoch": 0.77, "grad_norm": 0.9074979424476624, "learning_rate": 1.2659626391331564e-05, "loss": 2.2637, "step": 3720 }, { "epoch": 0.77, "grad_norm": 0.9183170199394226, "learning_rate": 1.2548961435759493e-05, "loss": 2.3008, "step": 3725 }, { "epoch": 0.78, "grad_norm": 0.9259270429611206, "learning_rate": 1.2438712847865846e-05, "loss": 2.3748, "step": 3730 }, { "epoch": 0.78, "grad_norm": 0.9168004989624023, "learning_rate": 1.2328881853356244e-05, "loss": 2.2474, "step": 3735 }, { "epoch": 0.78, "grad_norm": 0.8991266489028931, "learning_rate": 1.221946967329365e-05, "loss": 2.346, "step": 3740 }, { "epoch": 0.78, "grad_norm": 0.9321264028549194, "learning_rate": 1.2110477524084796e-05, "loss": 2.3265, "step": 3745 }, { "epoch": 0.78, "grad_norm": 0.9339208602905273, "learning_rate": 1.2001906617466657e-05, "loss": 2.3527, "step": 3750 }, { "epoch": 0.78, "grad_norm": 0.8979222178459167, "learning_rate": 1.1893758160492978e-05, "loss": 2.2731, "step": 3755 }, { "epoch": 0.78, "grad_norm": 0.9186643362045288, "learning_rate": 1.1786033355520859e-05, "loss": 2.368, "step": 3760 }, { "epoch": 0.78, "grad_norm": 0.9406339526176453, "learning_rate": 1.1678733400197373e-05, "loss": 2.3329, "step": 3765 }, { "epoch": 0.78, "grad_norm": 0.9028125405311584, "learning_rate": 1.1571859487446263e-05, "loss": 2.3374, "step": 3770 }, { "epoch": 0.78, "grad_norm": 0.9233968257904053, "learning_rate": 1.1465412805454695e-05, "loss": 2.3105, "step": 3775 }, { "epoch": 0.79, "grad_norm": 0.9305930733680725, "learning_rate": 1.1359394537660011e-05, "loss": 2.3692, "step": 3780 }, { "epoch": 0.79, "grad_norm": 0.9098057746887207, "learning_rate": 1.125380586273661e-05, "loss": 2.3132, "step": 3785 }, { "epoch": 0.79, "grad_norm": 0.9284851551055908, "learning_rate": 1.1148647954582808e-05, "loss": 2.2891, "step": 3790 }, { "epoch": 0.79, "grad_norm": 0.911727249622345, "learning_rate": 1.1043921982307819e-05, "loss": 2.3675, "step": 3795 }, { "epoch": 0.79, "grad_norm": 0.9202545881271362, "learning_rate": 1.0939629110218735e-05, "loss": 2.3074, "step": 3800 }, { "epoch": 0.79, "grad_norm": 0.9392959475517273, "learning_rate": 1.0835770497807596e-05, "loss": 2.3653, "step": 3805 }, { "epoch": 0.79, "grad_norm": 0.9099482297897339, "learning_rate": 1.0732347299738493e-05, "loss": 2.3247, "step": 3810 }, { "epoch": 0.79, "grad_norm": 0.9176100492477417, "learning_rate": 1.0629360665834732e-05, "loss": 2.3071, "step": 3815 }, { "epoch": 0.79, "grad_norm": 0.945218563079834, "learning_rate": 1.052681174106604e-05, "loss": 2.3201, "step": 3820 }, { "epoch": 0.8, "grad_norm": 0.9112182259559631, "learning_rate": 1.0424701665535852e-05, "loss": 2.3579, "step": 3825 }, { "epoch": 0.8, "grad_norm": 0.9128363728523254, "learning_rate": 1.0323031574468638e-05, "loss": 2.2977, "step": 3830 }, { "epoch": 0.8, "grad_norm": 0.9202331304550171, "learning_rate": 1.0221802598197261e-05, "loss": 2.339, "step": 3835 }, { "epoch": 0.8, "grad_norm": 0.9266894459724426, "learning_rate": 1.0121015862150423e-05, "loss": 2.2969, "step": 3840 }, { "epoch": 0.8, "grad_norm": 0.9341091513633728, "learning_rate": 1.0020672486840154e-05, "loss": 2.3475, "step": 3845 }, { "epoch": 0.8, "grad_norm": 0.9220916628837585, "learning_rate": 9.920773587849364e-06, "loss": 2.3231, "step": 3850 }, { "epoch": 0.8, "grad_norm": 0.9296770095825195, "learning_rate": 9.821320275819401e-06, "loss": 2.2958, "step": 3855 }, { "epoch": 0.8, "grad_norm": 0.9489056468009949, "learning_rate": 9.72231365643777e-06, "loss": 2.2493, "step": 3860 }, { "epoch": 0.8, "grad_norm": 0.9161087274551392, "learning_rate": 9.623754830425779e-06, "loss": 2.3089, "step": 3865 }, { "epoch": 0.8, "grad_norm": 0.9204024076461792, "learning_rate": 9.52564489352632e-06, "loss": 2.2413, "step": 3870 }, { "epoch": 0.81, "grad_norm": 0.9348191022872925, "learning_rate": 9.427984936491702e-06, "loss": 2.29, "step": 3875 }, { "epoch": 0.81, "grad_norm": 0.9372574090957642, "learning_rate": 9.330776045071509e-06, "loss": 2.2761, "step": 3880 }, { "epoch": 0.81, "grad_norm": 0.9387868642807007, "learning_rate": 9.23401930000054e-06, "loss": 2.2701, "step": 3885 }, { "epoch": 0.81, "grad_norm": 0.9021495580673218, "learning_rate": 9.137715776986772e-06, "loss": 2.2859, "step": 3890 }, { "epoch": 0.81, "grad_norm": 0.9116939306259155, "learning_rate": 9.041866546699434e-06, "loss": 2.2792, "step": 3895 }, { "epoch": 0.81, "grad_norm": 0.9261936545372009, "learning_rate": 8.946472674757078e-06, "loss": 2.2656, "step": 3900 }, { "epoch": 0.81, "grad_norm": 0.9336845278739929, "learning_rate": 8.851535221715735e-06, "loss": 2.3391, "step": 3905 }, { "epoch": 0.81, "grad_norm": 0.9195213913917542, "learning_rate": 8.757055243057132e-06, "loss": 2.2762, "step": 3910 }, { "epoch": 0.81, "grad_norm": 0.9221490621566772, "learning_rate": 8.663033789176967e-06, "loss": 2.3548, "step": 3915 }, { "epoch": 0.81, "grad_norm": 0.9198946356773376, "learning_rate": 8.5694719053732e-06, "loss": 2.3239, "step": 3920 }, { "epoch": 0.82, "grad_norm": 0.9193469285964966, "learning_rate": 8.476370631834458e-06, "loss": 2.2747, "step": 3925 }, { "epoch": 0.82, "grad_norm": 0.9079307317733765, "learning_rate": 8.383731003628452e-06, "loss": 2.347, "step": 3930 }, { "epoch": 0.82, "grad_norm": 0.9317283630371094, "learning_rate": 8.291554050690508e-06, "loss": 2.2833, "step": 3935 }, { "epoch": 0.82, "grad_norm": 0.9155171513557434, "learning_rate": 8.199840797812058e-06, "loss": 2.3311, "step": 3940 }, { "epoch": 0.82, "grad_norm": 0.9332835674285889, "learning_rate": 8.108592264629295e-06, "loss": 2.3594, "step": 3945 }, { "epoch": 0.82, "grad_norm": 0.9451523423194885, "learning_rate": 8.017809465611803e-06, "loss": 2.2932, "step": 3950 }, { "epoch": 0.82, "grad_norm": 0.9449190497398376, "learning_rate": 7.927493410051324e-06, "loss": 2.3888, "step": 3955 }, { "epoch": 0.82, "grad_norm": 0.9128648042678833, "learning_rate": 7.837645102050473e-06, "loss": 2.2962, "step": 3960 }, { "epoch": 0.82, "grad_norm": 0.9178701639175415, "learning_rate": 7.748265540511635e-06, "loss": 2.3119, "step": 3965 }, { "epoch": 0.83, "grad_norm": 0.9139108061790466, "learning_rate": 7.65935571912582e-06, "loss": 2.259, "step": 3970 }, { "epoch": 0.83, "grad_norm": 0.9180004596710205, "learning_rate": 7.5709166263616405e-06, "loss": 2.2977, "step": 3975 }, { "epoch": 0.83, "grad_norm": 0.9333677887916565, "learning_rate": 7.482949245454302e-06, "loss": 2.2495, "step": 3980 }, { "epoch": 0.83, "grad_norm": 0.9381062984466553, "learning_rate": 7.3954545543946876e-06, "loss": 2.302, "step": 3985 }, { "epoch": 0.83, "grad_norm": 0.9359762072563171, "learning_rate": 7.308433525918468e-06, "loss": 2.2843, "step": 3990 }, { "epoch": 0.83, "grad_norm": 0.9028638005256653, "learning_rate": 7.221887127495313e-06, "loss": 2.23, "step": 3995 }, { "epoch": 0.83, "grad_norm": 0.9264029264450073, "learning_rate": 7.1358163213181114e-06, "loss": 2.3494, "step": 4000 } ], "logging_steps": 5, "max_steps": 4811, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 1.1658574908358656e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }