diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21795 @@ +{ + "best_metric": 2.0305252075195312, + "best_model_checkpoint": "/gpfs/radev/home/ap2853/scratch/c2s_models/train_c2s_full_1b/checkpoint-27900", + "epoch": 3.7449664429530203, + "eval_steps": 100, + "global_step": 27900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006406406406406406, + "grad_norm": 2.9401302337646484, + "learning_rate": 9.99679671984112e-06, + "loss": 3.9489, + "step": 10 + }, + { + "epoch": 0.0012812812812812813, + "grad_norm": 2.2548134326934814, + "learning_rate": 9.993593439682236e-06, + "loss": 3.2882, + "step": 20 + }, + { + "epoch": 0.0019219219219219219, + "grad_norm": 1.4946377277374268, + "learning_rate": 9.990390159523353e-06, + "loss": 2.9585, + "step": 30 + }, + { + "epoch": 0.0025625625625625625, + "grad_norm": 1.859741449356079, + "learning_rate": 9.98718687936447e-06, + "loss": 2.7796, + "step": 40 + }, + { + "epoch": 0.0032032032032032033, + "grad_norm": 1.7827876806259155, + "learning_rate": 9.983983599205588e-06, + "loss": 2.6482, + "step": 50 + }, + { + "epoch": 0.0038438438438438438, + "grad_norm": 2.0380122661590576, + "learning_rate": 9.980780319046704e-06, + "loss": 2.5502, + "step": 60 + }, + { + "epoch": 0.004484484484484484, + "grad_norm": 1.4964377880096436, + "learning_rate": 9.977577038887823e-06, + "loss": 2.4852, + "step": 70 + }, + { + "epoch": 0.005125125125125125, + "grad_norm": 1.9251023530960083, + "learning_rate": 9.97437375872894e-06, + "loss": 2.463, + "step": 80 + }, + { + "epoch": 0.005765765765765766, + "grad_norm": 1.8076153993606567, + "learning_rate": 9.971170478570058e-06, + "loss": 2.4314, + "step": 90 + }, + { + "epoch": 0.006406406406406407, + "grad_norm": 1.5380029678344727, + "learning_rate": 9.967967198411174e-06, + "loss": 2.4312, + "step": 100 + }, + { + "epoch": 0.006406406406406407, + "eval_loss": 2.4192681312561035, + "eval_runtime": 99.5638, + "eval_samples_per_second": 10.044, + "eval_steps_per_second": 5.022, + "step": 100 + }, + { + "epoch": 0.007047047047047047, + "grad_norm": 1.9866392612457275, + "learning_rate": 9.964763918252291e-06, + "loss": 2.394, + "step": 110 + }, + { + "epoch": 0.0076876876876876875, + "grad_norm": 1.4123798608779907, + "learning_rate": 9.961560638093408e-06, + "loss": 2.4009, + "step": 120 + }, + { + "epoch": 0.008328328328328327, + "grad_norm": 1.5345197916030884, + "learning_rate": 9.958357357934526e-06, + "loss": 2.3618, + "step": 130 + }, + { + "epoch": 0.008968968968968968, + "grad_norm": 1.346252202987671, + "learning_rate": 9.955154077775643e-06, + "loss": 2.3489, + "step": 140 + }, + { + "epoch": 0.00960960960960961, + "grad_norm": 1.498039722442627, + "learning_rate": 9.951950797616761e-06, + "loss": 2.3642, + "step": 150 + }, + { + "epoch": 0.01025025025025025, + "grad_norm": 1.2046945095062256, + "learning_rate": 9.948747517457878e-06, + "loss": 2.3346, + "step": 160 + }, + { + "epoch": 0.01089089089089089, + "grad_norm": 1.650439739227295, + "learning_rate": 9.945544237298996e-06, + "loss": 2.3319, + "step": 170 + }, + { + "epoch": 0.011531531531531532, + "grad_norm": 1.819641351699829, + "learning_rate": 9.942340957140113e-06, + "loss": 2.294, + "step": 180 + }, + { + "epoch": 0.012172172172172173, + "grad_norm": 1.5125728845596313, + "learning_rate": 9.93913767698123e-06, + "loss": 2.2876, + "step": 190 + }, + { + "epoch": 0.012812812812812813, + "grad_norm": 1.2440060377120972, + "learning_rate": 9.935934396822346e-06, + "loss": 2.3208, + "step": 200 + }, + { + "epoch": 0.012812812812812813, + "eval_loss": 2.3174383640289307, + "eval_runtime": 98.9774, + "eval_samples_per_second": 10.103, + "eval_steps_per_second": 5.052, + "step": 200 + }, + { + "epoch": 0.013453453453453454, + "grad_norm": 1.1538126468658447, + "learning_rate": 9.932731116663464e-06, + "loss": 2.3283, + "step": 210 + }, + { + "epoch": 0.014094094094094093, + "grad_norm": 1.3118531703948975, + "learning_rate": 9.929527836504581e-06, + "loss": 2.2764, + "step": 220 + }, + { + "epoch": 0.014734734734734734, + "grad_norm": 1.161333680152893, + "learning_rate": 9.9263245563457e-06, + "loss": 2.2471, + "step": 230 + }, + { + "epoch": 0.015375375375375375, + "grad_norm": 1.5464813709259033, + "learning_rate": 9.923121276186816e-06, + "loss": 2.2699, + "step": 240 + }, + { + "epoch": 0.016016016016016016, + "grad_norm": 1.2160066366195679, + "learning_rate": 9.919917996027933e-06, + "loss": 2.2721, + "step": 250 + }, + { + "epoch": 0.016656656656656655, + "grad_norm": 1.4125703573226929, + "learning_rate": 9.91671471586905e-06, + "loss": 2.264, + "step": 260 + }, + { + "epoch": 0.017297297297297298, + "grad_norm": 1.0305856466293335, + "learning_rate": 9.913511435710168e-06, + "loss": 2.2518, + "step": 270 + }, + { + "epoch": 0.017937937937937937, + "grad_norm": 1.1297743320465088, + "learning_rate": 9.910308155551284e-06, + "loss": 2.297, + "step": 280 + }, + { + "epoch": 0.01857857857857858, + "grad_norm": 1.3597170114517212, + "learning_rate": 9.907104875392403e-06, + "loss": 2.2957, + "step": 290 + }, + { + "epoch": 0.01921921921921922, + "grad_norm": 1.4308130741119385, + "learning_rate": 9.903901595233521e-06, + "loss": 2.2373, + "step": 300 + }, + { + "epoch": 0.01921921921921922, + "eval_loss": 2.2671942710876465, + "eval_runtime": 99.1603, + "eval_samples_per_second": 10.085, + "eval_steps_per_second": 5.042, + "step": 300 + }, + { + "epoch": 0.01985985985985986, + "grad_norm": 1.258988618850708, + "learning_rate": 9.900698315074638e-06, + "loss": 2.2485, + "step": 310 + }, + { + "epoch": 0.0205005005005005, + "grad_norm": 1.1126744747161865, + "learning_rate": 9.897495034915755e-06, + "loss": 2.2211, + "step": 320 + }, + { + "epoch": 0.021141141141141143, + "grad_norm": 1.2072731256484985, + "learning_rate": 9.894291754756871e-06, + "loss": 2.2268, + "step": 330 + }, + { + "epoch": 0.02178178178178178, + "grad_norm": 1.6628332138061523, + "learning_rate": 9.89108847459799e-06, + "loss": 2.2493, + "step": 340 + }, + { + "epoch": 0.02242242242242242, + "grad_norm": 1.5348600149154663, + "learning_rate": 9.887885194439106e-06, + "loss": 2.2501, + "step": 350 + }, + { + "epoch": 0.023063063063063063, + "grad_norm": 1.034929633140564, + "learning_rate": 9.884681914280225e-06, + "loss": 2.231, + "step": 360 + }, + { + "epoch": 0.023703703703703703, + "grad_norm": 0.9858831763267517, + "learning_rate": 9.881478634121341e-06, + "loss": 2.2372, + "step": 370 + }, + { + "epoch": 0.024344344344344345, + "grad_norm": 0.974418580532074, + "learning_rate": 9.878275353962458e-06, + "loss": 2.228, + "step": 380 + }, + { + "epoch": 0.024984984984984984, + "grad_norm": 1.2869054079055786, + "learning_rate": 9.875072073803576e-06, + "loss": 2.2242, + "step": 390 + }, + { + "epoch": 0.025625625625625627, + "grad_norm": 1.260129690170288, + "learning_rate": 9.871868793644693e-06, + "loss": 2.2064, + "step": 400 + }, + { + "epoch": 0.025625625625625627, + "eval_loss": 2.239616870880127, + "eval_runtime": 98.8691, + "eval_samples_per_second": 10.114, + "eval_steps_per_second": 5.057, + "step": 400 + }, + { + "epoch": 0.026266266266266266, + "grad_norm": 1.229314923286438, + "learning_rate": 9.86866551348581e-06, + "loss": 2.2123, + "step": 410 + }, + { + "epoch": 0.02690690690690691, + "grad_norm": 1.0697575807571411, + "learning_rate": 9.865462233326928e-06, + "loss": 2.2388, + "step": 420 + }, + { + "epoch": 0.027547547547547548, + "grad_norm": 1.1127859354019165, + "learning_rate": 9.862258953168045e-06, + "loss": 2.2439, + "step": 430 + }, + { + "epoch": 0.028188188188188187, + "grad_norm": 1.0716787576675415, + "learning_rate": 9.859055673009163e-06, + "loss": 2.1971, + "step": 440 + }, + { + "epoch": 0.02882882882882883, + "grad_norm": 1.22200608253479, + "learning_rate": 9.85585239285028e-06, + "loss": 2.2089, + "step": 450 + }, + { + "epoch": 0.02946946946946947, + "grad_norm": 1.3846632242202759, + "learning_rate": 9.852649112691396e-06, + "loss": 2.2086, + "step": 460 + }, + { + "epoch": 0.03011011011011011, + "grad_norm": 1.1426323652267456, + "learning_rate": 9.849445832532513e-06, + "loss": 2.2088, + "step": 470 + }, + { + "epoch": 0.03075075075075075, + "grad_norm": 1.2188793420791626, + "learning_rate": 9.846242552373631e-06, + "loss": 2.1951, + "step": 480 + }, + { + "epoch": 0.03139139139139139, + "grad_norm": 1.2974520921707153, + "learning_rate": 9.843039272214748e-06, + "loss": 2.232, + "step": 490 + }, + { + "epoch": 0.03203203203203203, + "grad_norm": 0.9593750834465027, + "learning_rate": 9.839835992055866e-06, + "loss": 2.2033, + "step": 500 + }, + { + "epoch": 0.03203203203203203, + "eval_loss": 2.2161059379577637, + "eval_runtime": 99.0548, + "eval_samples_per_second": 10.095, + "eval_steps_per_second": 5.048, + "step": 500 + }, + { + "epoch": 0.032672672672672674, + "grad_norm": 1.2013003826141357, + "learning_rate": 9.836632711896983e-06, + "loss": 2.2249, + "step": 510 + }, + { + "epoch": 0.03331331331331331, + "grad_norm": 1.036780595779419, + "learning_rate": 9.833429431738101e-06, + "loss": 2.2022, + "step": 520 + }, + { + "epoch": 0.03395395395395395, + "grad_norm": 0.9950962066650391, + "learning_rate": 9.830226151579218e-06, + "loss": 2.1838, + "step": 530 + }, + { + "epoch": 0.034594594594594595, + "grad_norm": 0.905576229095459, + "learning_rate": 9.827022871420335e-06, + "loss": 2.1785, + "step": 540 + }, + { + "epoch": 0.03523523523523524, + "grad_norm": 0.9075835347175598, + "learning_rate": 9.823819591261451e-06, + "loss": 2.1802, + "step": 550 + }, + { + "epoch": 0.03587587587587587, + "grad_norm": 1.1423232555389404, + "learning_rate": 9.82061631110257e-06, + "loss": 2.216, + "step": 560 + }, + { + "epoch": 0.036516516516516516, + "grad_norm": 0.9267427921295166, + "learning_rate": 9.817413030943686e-06, + "loss": 2.1745, + "step": 570 + }, + { + "epoch": 0.03715715715715716, + "grad_norm": 1.0812605619430542, + "learning_rate": 9.814209750784805e-06, + "loss": 2.1822, + "step": 580 + }, + { + "epoch": 0.0377977977977978, + "grad_norm": 0.9606761932373047, + "learning_rate": 9.811006470625921e-06, + "loss": 2.2007, + "step": 590 + }, + { + "epoch": 0.03843843843843844, + "grad_norm": 1.1749407052993774, + "learning_rate": 9.807803190467038e-06, + "loss": 2.1762, + "step": 600 + }, + { + "epoch": 0.03843843843843844, + "eval_loss": 2.2011826038360596, + "eval_runtime": 99.0889, + "eval_samples_per_second": 10.092, + "eval_steps_per_second": 5.046, + "step": 600 + }, + { + "epoch": 0.03907907907907908, + "grad_norm": 1.5855426788330078, + "learning_rate": 9.804599910308157e-06, + "loss": 2.1653, + "step": 610 + }, + { + "epoch": 0.03971971971971972, + "grad_norm": 1.0931143760681152, + "learning_rate": 9.801396630149273e-06, + "loss": 2.1957, + "step": 620 + }, + { + "epoch": 0.04036036036036036, + "grad_norm": 0.8430980443954468, + "learning_rate": 9.798193349990392e-06, + "loss": 2.1894, + "step": 630 + }, + { + "epoch": 0.041001001001001, + "grad_norm": 0.9702726602554321, + "learning_rate": 9.794990069831508e-06, + "loss": 2.1907, + "step": 640 + }, + { + "epoch": 0.04164164164164164, + "grad_norm": 1.0787761211395264, + "learning_rate": 9.791786789672627e-06, + "loss": 2.1922, + "step": 650 + }, + { + "epoch": 0.042282282282282285, + "grad_norm": 0.9241836071014404, + "learning_rate": 9.788583509513743e-06, + "loss": 2.1866, + "step": 660 + }, + { + "epoch": 0.04292292292292292, + "grad_norm": 0.8241314888000488, + "learning_rate": 9.78538022935486e-06, + "loss": 2.1769, + "step": 670 + }, + { + "epoch": 0.04356356356356356, + "grad_norm": 0.7783017158508301, + "learning_rate": 9.782176949195977e-06, + "loss": 2.1526, + "step": 680 + }, + { + "epoch": 0.044204204204204206, + "grad_norm": 0.8009265065193176, + "learning_rate": 9.778973669037095e-06, + "loss": 2.1566, + "step": 690 + }, + { + "epoch": 0.04484484484484484, + "grad_norm": 0.8154035806655884, + "learning_rate": 9.775770388878212e-06, + "loss": 2.1745, + "step": 700 + }, + { + "epoch": 0.04484484484484484, + "eval_loss": 2.188918352127075, + "eval_runtime": 98.963, + "eval_samples_per_second": 10.105, + "eval_steps_per_second": 5.052, + "step": 700 + }, + { + "epoch": 0.045485485485485484, + "grad_norm": 1.1314467191696167, + "learning_rate": 9.77256710871933e-06, + "loss": 2.1939, + "step": 710 + }, + { + "epoch": 0.04612612612612613, + "grad_norm": 0.9179493188858032, + "learning_rate": 9.769363828560447e-06, + "loss": 2.1498, + "step": 720 + }, + { + "epoch": 0.04676676676676677, + "grad_norm": 0.8480121493339539, + "learning_rate": 9.766160548401565e-06, + "loss": 2.1302, + "step": 730 + }, + { + "epoch": 0.047407407407407405, + "grad_norm": 1.093794822692871, + "learning_rate": 9.762957268242682e-06, + "loss": 2.1542, + "step": 740 + }, + { + "epoch": 0.04804804804804805, + "grad_norm": 1.032202124595642, + "learning_rate": 9.759753988083798e-06, + "loss": 2.1745, + "step": 750 + }, + { + "epoch": 0.04868868868868869, + "grad_norm": 0.754345715045929, + "learning_rate": 9.756550707924915e-06, + "loss": 2.1663, + "step": 760 + }, + { + "epoch": 0.04932932932932933, + "grad_norm": 0.7865972518920898, + "learning_rate": 9.753347427766033e-06, + "loss": 2.192, + "step": 770 + }, + { + "epoch": 0.04996996996996997, + "grad_norm": 0.9923487901687622, + "learning_rate": 9.75014414760715e-06, + "loss": 2.1489, + "step": 780 + }, + { + "epoch": 0.05061061061061061, + "grad_norm": 0.9781584143638611, + "learning_rate": 9.746940867448268e-06, + "loss": 2.141, + "step": 790 + }, + { + "epoch": 0.051251251251251254, + "grad_norm": 0.7666671276092529, + "learning_rate": 9.743737587289385e-06, + "loss": 2.1452, + "step": 800 + }, + { + "epoch": 0.051251251251251254, + "eval_loss": 2.1780078411102295, + "eval_runtime": 99.0698, + "eval_samples_per_second": 10.094, + "eval_steps_per_second": 5.047, + "step": 800 + }, + { + "epoch": 0.05189189189189189, + "grad_norm": 0.8632680177688599, + "learning_rate": 9.740534307130502e-06, + "loss": 2.1417, + "step": 810 + }, + { + "epoch": 0.05253253253253253, + "grad_norm": 0.8318341374397278, + "learning_rate": 9.73733102697162e-06, + "loss": 2.1557, + "step": 820 + }, + { + "epoch": 0.053173173173173174, + "grad_norm": 0.7503217458724976, + "learning_rate": 9.734127746812737e-06, + "loss": 2.141, + "step": 830 + }, + { + "epoch": 0.05381381381381382, + "grad_norm": 0.7540414929389954, + "learning_rate": 9.730924466653853e-06, + "loss": 2.1537, + "step": 840 + }, + { + "epoch": 0.05445445445445445, + "grad_norm": 0.8077926635742188, + "learning_rate": 9.727721186494972e-06, + "loss": 2.1571, + "step": 850 + }, + { + "epoch": 0.055095095095095095, + "grad_norm": 0.7664414048194885, + "learning_rate": 9.72451790633609e-06, + "loss": 2.1454, + "step": 860 + }, + { + "epoch": 0.05573573573573574, + "grad_norm": 0.7282930016517639, + "learning_rate": 9.721314626177207e-06, + "loss": 2.1908, + "step": 870 + }, + { + "epoch": 0.05637637637637637, + "grad_norm": 0.9039175510406494, + "learning_rate": 9.718111346018323e-06, + "loss": 2.1392, + "step": 880 + }, + { + "epoch": 0.057017017017017016, + "grad_norm": 0.8304852247238159, + "learning_rate": 9.71490806585944e-06, + "loss": 2.1635, + "step": 890 + }, + { + "epoch": 0.05765765765765766, + "grad_norm": 0.8299368619918823, + "learning_rate": 9.711704785700559e-06, + "loss": 2.1533, + "step": 900 + }, + { + "epoch": 0.05765765765765766, + "eval_loss": 2.1680660247802734, + "eval_runtime": 98.9216, + "eval_samples_per_second": 10.109, + "eval_steps_per_second": 5.055, + "step": 900 + }, + { + "epoch": 0.0582982982982983, + "grad_norm": 0.8337482213973999, + "learning_rate": 9.708501505541675e-06, + "loss": 2.1754, + "step": 910 + }, + { + "epoch": 0.05893893893893894, + "grad_norm": 0.9944339990615845, + "learning_rate": 9.705298225382794e-06, + "loss": 2.1647, + "step": 920 + }, + { + "epoch": 0.05957957957957958, + "grad_norm": 0.7565041780471802, + "learning_rate": 9.70209494522391e-06, + "loss": 2.1799, + "step": 930 + }, + { + "epoch": 0.06022022022022022, + "grad_norm": 0.7579349279403687, + "learning_rate": 9.698891665065029e-06, + "loss": 2.1281, + "step": 940 + }, + { + "epoch": 0.06086086086086086, + "grad_norm": 0.8098254203796387, + "learning_rate": 9.695688384906145e-06, + "loss": 2.1129, + "step": 950 + }, + { + "epoch": 0.0615015015015015, + "grad_norm": 1.1852097511291504, + "learning_rate": 9.692485104747262e-06, + "loss": 2.1336, + "step": 960 + }, + { + "epoch": 0.06214214214214214, + "grad_norm": 0.8178196549415588, + "learning_rate": 9.689281824588379e-06, + "loss": 2.1323, + "step": 970 + }, + { + "epoch": 0.06278278278278278, + "grad_norm": 0.8084219098091125, + "learning_rate": 9.686078544429497e-06, + "loss": 2.1416, + "step": 980 + }, + { + "epoch": 0.06342342342342343, + "grad_norm": 0.9296208024024963, + "learning_rate": 9.682875264270614e-06, + "loss": 2.1364, + "step": 990 + }, + { + "epoch": 0.06406406406406406, + "grad_norm": 0.8570599555969238, + "learning_rate": 9.679671984111732e-06, + "loss": 2.1607, + "step": 1000 + }, + { + "epoch": 0.06406406406406406, + "eval_loss": 2.1602447032928467, + "eval_runtime": 98.8457, + "eval_samples_per_second": 10.117, + "eval_steps_per_second": 5.058, + "step": 1000 + }, + { + "epoch": 0.0647047047047047, + "grad_norm": 1.0591017007827759, + "learning_rate": 9.676468703952849e-06, + "loss": 2.1344, + "step": 1010 + }, + { + "epoch": 0.06534534534534535, + "grad_norm": 0.9065935611724854, + "learning_rate": 9.673265423793965e-06, + "loss": 2.1669, + "step": 1020 + }, + { + "epoch": 0.06598598598598598, + "grad_norm": 0.7907170653343201, + "learning_rate": 9.670062143635082e-06, + "loss": 2.1159, + "step": 1030 + }, + { + "epoch": 0.06662662662662662, + "grad_norm": 0.8142951726913452, + "learning_rate": 9.6668588634762e-06, + "loss": 2.1436, + "step": 1040 + }, + { + "epoch": 0.06726726726726727, + "grad_norm": 0.8167764544487, + "learning_rate": 9.663655583317317e-06, + "loss": 2.1569, + "step": 1050 + }, + { + "epoch": 0.0679079079079079, + "grad_norm": 0.8406487703323364, + "learning_rate": 9.660452303158435e-06, + "loss": 2.136, + "step": 1060 + }, + { + "epoch": 0.06854854854854855, + "grad_norm": 0.8966623544692993, + "learning_rate": 9.657249022999552e-06, + "loss": 2.1554, + "step": 1070 + }, + { + "epoch": 0.06918918918918919, + "grad_norm": 0.7379564642906189, + "learning_rate": 9.65404574284067e-06, + "loss": 2.1528, + "step": 1080 + }, + { + "epoch": 0.06982982982982983, + "grad_norm": 0.7592926621437073, + "learning_rate": 9.650842462681787e-06, + "loss": 2.1438, + "step": 1090 + }, + { + "epoch": 0.07047047047047048, + "grad_norm": 0.7466913461685181, + "learning_rate": 9.647639182522904e-06, + "loss": 2.1421, + "step": 1100 + }, + { + "epoch": 0.07047047047047048, + "eval_loss": 2.1529006958007812, + "eval_runtime": 98.9445, + "eval_samples_per_second": 10.107, + "eval_steps_per_second": 5.053, + "step": 1100 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 0.8553027510643005, + "learning_rate": 9.64443590236402e-06, + "loss": 2.1415, + "step": 1110 + }, + { + "epoch": 0.07175175175175175, + "grad_norm": 0.7739918231964111, + "learning_rate": 9.641232622205139e-06, + "loss": 2.1368, + "step": 1120 + }, + { + "epoch": 0.0723923923923924, + "grad_norm": 0.7692018747329712, + "learning_rate": 9.638029342046255e-06, + "loss": 2.1217, + "step": 1130 + }, + { + "epoch": 0.07303303303303303, + "grad_norm": 0.7803593873977661, + "learning_rate": 9.634826061887374e-06, + "loss": 2.1641, + "step": 1140 + }, + { + "epoch": 0.07367367367367367, + "grad_norm": 0.7034474015235901, + "learning_rate": 9.63162278172849e-06, + "loss": 2.1358, + "step": 1150 + }, + { + "epoch": 0.07431431431431432, + "grad_norm": 0.9745317697525024, + "learning_rate": 9.628419501569609e-06, + "loss": 2.1226, + "step": 1160 + }, + { + "epoch": 0.07495495495495495, + "grad_norm": 1.0803890228271484, + "learning_rate": 9.625216221410725e-06, + "loss": 2.1216, + "step": 1170 + }, + { + "epoch": 0.0755955955955956, + "grad_norm": 0.8124716281890869, + "learning_rate": 9.622012941251842e-06, + "loss": 2.1258, + "step": 1180 + }, + { + "epoch": 0.07623623623623624, + "grad_norm": 0.9271863698959351, + "learning_rate": 9.61880966109296e-06, + "loss": 2.1545, + "step": 1190 + }, + { + "epoch": 0.07687687687687687, + "grad_norm": 0.7706722617149353, + "learning_rate": 9.615606380934077e-06, + "loss": 2.1358, + "step": 1200 + }, + { + "epoch": 0.07687687687687687, + "eval_loss": 2.1485989093780518, + "eval_runtime": 99.2209, + "eval_samples_per_second": 10.079, + "eval_steps_per_second": 5.039, + "step": 1200 + }, + { + "epoch": 0.07751751751751752, + "grad_norm": 0.8027122616767883, + "learning_rate": 9.612403100775196e-06, + "loss": 2.1441, + "step": 1210 + }, + { + "epoch": 0.07815815815815816, + "grad_norm": 0.7750908136367798, + "learning_rate": 9.609199820616312e-06, + "loss": 2.116, + "step": 1220 + }, + { + "epoch": 0.0787987987987988, + "grad_norm": 0.6906116604804993, + "learning_rate": 9.605996540457429e-06, + "loss": 2.1289, + "step": 1230 + }, + { + "epoch": 0.07943943943943944, + "grad_norm": 0.8581239581108093, + "learning_rate": 9.602793260298546e-06, + "loss": 2.1343, + "step": 1240 + }, + { + "epoch": 0.08008008008008008, + "grad_norm": 1.026809573173523, + "learning_rate": 9.599589980139664e-06, + "loss": 2.1307, + "step": 1250 + }, + { + "epoch": 0.08072072072072072, + "grad_norm": 0.7176423072814941, + "learning_rate": 9.59638669998078e-06, + "loss": 2.115, + "step": 1260 + }, + { + "epoch": 0.08136136136136136, + "grad_norm": 0.7856579422950745, + "learning_rate": 9.593183419821899e-06, + "loss": 2.1171, + "step": 1270 + }, + { + "epoch": 0.082002002002002, + "grad_norm": 0.8396461606025696, + "learning_rate": 9.589980139663016e-06, + "loss": 2.0997, + "step": 1280 + }, + { + "epoch": 0.08264264264264264, + "grad_norm": 0.6931939721107483, + "learning_rate": 9.586776859504134e-06, + "loss": 2.1171, + "step": 1290 + }, + { + "epoch": 0.08328328328328329, + "grad_norm": 0.6357343792915344, + "learning_rate": 9.58357357934525e-06, + "loss": 2.141, + "step": 1300 + }, + { + "epoch": 0.08328328328328329, + "eval_loss": 2.1414124965667725, + "eval_runtime": 98.9922, + "eval_samples_per_second": 10.102, + "eval_steps_per_second": 5.051, + "step": 1300 + }, + { + "epoch": 0.08392392392392392, + "grad_norm": 0.674089789390564, + "learning_rate": 9.580370299186367e-06, + "loss": 2.1417, + "step": 1310 + }, + { + "epoch": 0.08456456456456457, + "grad_norm": 0.8322325348854065, + "learning_rate": 9.577167019027484e-06, + "loss": 2.1386, + "step": 1320 + }, + { + "epoch": 0.0852052052052052, + "grad_norm": 1.3690531253814697, + "learning_rate": 9.573963738868602e-06, + "loss": 2.1126, + "step": 1330 + }, + { + "epoch": 0.08584584584584584, + "grad_norm": 0.8282061219215393, + "learning_rate": 9.570760458709719e-06, + "loss": 2.1163, + "step": 1340 + }, + { + "epoch": 0.08648648648648649, + "grad_norm": 1.1951463222503662, + "learning_rate": 9.567557178550837e-06, + "loss": 2.1205, + "step": 1350 + }, + { + "epoch": 0.08712712712712713, + "grad_norm": 0.8390799760818481, + "learning_rate": 9.564353898391954e-06, + "loss": 2.114, + "step": 1360 + }, + { + "epoch": 0.08776776776776776, + "grad_norm": 0.801324725151062, + "learning_rate": 9.561150618233072e-06, + "loss": 2.1303, + "step": 1370 + }, + { + "epoch": 0.08840840840840841, + "grad_norm": 0.8951178789138794, + "learning_rate": 9.557947338074189e-06, + "loss": 2.1207, + "step": 1380 + }, + { + "epoch": 0.08904904904904905, + "grad_norm": 1.085943579673767, + "learning_rate": 9.554744057915306e-06, + "loss": 2.1105, + "step": 1390 + }, + { + "epoch": 0.08968968968968968, + "grad_norm": 0.7810286283493042, + "learning_rate": 9.551540777756422e-06, + "loss": 2.1337, + "step": 1400 + }, + { + "epoch": 0.08968968968968968, + "eval_loss": 2.137179136276245, + "eval_runtime": 98.8962, + "eval_samples_per_second": 10.112, + "eval_steps_per_second": 5.056, + "step": 1400 + }, + { + "epoch": 0.09033033033033033, + "grad_norm": 0.6787813901901245, + "learning_rate": 9.54833749759754e-06, + "loss": 2.1192, + "step": 1410 + }, + { + "epoch": 0.09097097097097097, + "grad_norm": 0.8344500660896301, + "learning_rate": 9.545134217438657e-06, + "loss": 2.1101, + "step": 1420 + }, + { + "epoch": 0.09161161161161162, + "grad_norm": 0.7459837198257446, + "learning_rate": 9.541930937279776e-06, + "loss": 2.123, + "step": 1430 + }, + { + "epoch": 0.09225225225225225, + "grad_norm": 0.6521297097206116, + "learning_rate": 9.538727657120892e-06, + "loss": 2.0906, + "step": 1440 + }, + { + "epoch": 0.09289289289289289, + "grad_norm": 0.6362139582633972, + "learning_rate": 9.535524376962009e-06, + "loss": 2.1154, + "step": 1450 + }, + { + "epoch": 0.09353353353353354, + "grad_norm": 0.6726361513137817, + "learning_rate": 9.532321096803127e-06, + "loss": 2.1368, + "step": 1460 + }, + { + "epoch": 0.09417417417417417, + "grad_norm": 0.9578641653060913, + "learning_rate": 9.529117816644244e-06, + "loss": 2.1159, + "step": 1470 + }, + { + "epoch": 0.09481481481481481, + "grad_norm": 0.8151896595954895, + "learning_rate": 9.525914536485363e-06, + "loss": 2.1405, + "step": 1480 + }, + { + "epoch": 0.09545545545545546, + "grad_norm": 0.694631814956665, + "learning_rate": 9.52271125632648e-06, + "loss": 2.1126, + "step": 1490 + }, + { + "epoch": 0.0960960960960961, + "grad_norm": 0.7634385824203491, + "learning_rate": 9.519507976167598e-06, + "loss": 2.1297, + "step": 1500 + }, + { + "epoch": 0.0960960960960961, + "eval_loss": 2.132540464401245, + "eval_runtime": 98.7552, + "eval_samples_per_second": 10.126, + "eval_steps_per_second": 5.063, + "step": 1500 + }, + { + "epoch": 0.09673673673673673, + "grad_norm": 0.7835580706596375, + "learning_rate": 9.516304696008714e-06, + "loss": 2.1314, + "step": 1510 + }, + { + "epoch": 0.09737737737737738, + "grad_norm": 0.7323389649391174, + "learning_rate": 9.513101415849831e-06, + "loss": 2.1396, + "step": 1520 + }, + { + "epoch": 0.09801801801801802, + "grad_norm": 0.8194797039031982, + "learning_rate": 9.509898135690948e-06, + "loss": 2.0894, + "step": 1530 + }, + { + "epoch": 0.09865865865865867, + "grad_norm": 0.734104335308075, + "learning_rate": 9.506694855532066e-06, + "loss": 2.112, + "step": 1540 + }, + { + "epoch": 0.0992992992992993, + "grad_norm": 0.8642280101776123, + "learning_rate": 9.503491575373183e-06, + "loss": 2.1098, + "step": 1550 + }, + { + "epoch": 0.09993993993993994, + "grad_norm": 0.7576484084129333, + "learning_rate": 9.500288295214301e-06, + "loss": 2.1053, + "step": 1560 + }, + { + "epoch": 0.10058058058058059, + "grad_norm": 0.6890721321105957, + "learning_rate": 9.497085015055418e-06, + "loss": 2.0883, + "step": 1570 + }, + { + "epoch": 0.10122122122122122, + "grad_norm": 0.6535369753837585, + "learning_rate": 9.493881734896534e-06, + "loss": 2.1248, + "step": 1580 + }, + { + "epoch": 0.10186186186186186, + "grad_norm": 0.7473631501197815, + "learning_rate": 9.490678454737653e-06, + "loss": 2.0969, + "step": 1590 + }, + { + "epoch": 0.10250250250250251, + "grad_norm": 0.6895433664321899, + "learning_rate": 9.48747517457877e-06, + "loss": 2.1226, + "step": 1600 + }, + { + "epoch": 0.10250250250250251, + "eval_loss": 2.128535747528076, + "eval_runtime": 99.166, + "eval_samples_per_second": 10.084, + "eval_steps_per_second": 5.042, + "step": 1600 + }, + { + "epoch": 0.10314314314314314, + "grad_norm": 0.7126756310462952, + "learning_rate": 9.484271894419886e-06, + "loss": 2.09, + "step": 1610 + }, + { + "epoch": 0.10378378378378378, + "grad_norm": 0.7647325992584229, + "learning_rate": 9.481068614261004e-06, + "loss": 2.1306, + "step": 1620 + }, + { + "epoch": 0.10442442442442443, + "grad_norm": 0.7653983235359192, + "learning_rate": 9.477865334102121e-06, + "loss": 2.111, + "step": 1630 + }, + { + "epoch": 0.10506506506506506, + "grad_norm": 0.7532052993774414, + "learning_rate": 9.47466205394324e-06, + "loss": 2.1125, + "step": 1640 + }, + { + "epoch": 0.1057057057057057, + "grad_norm": 0.8752217888832092, + "learning_rate": 9.471458773784356e-06, + "loss": 2.1218, + "step": 1650 + }, + { + "epoch": 0.10634634634634635, + "grad_norm": 0.7363693118095398, + "learning_rate": 9.468255493625473e-06, + "loss": 2.0789, + "step": 1660 + }, + { + "epoch": 0.10698698698698698, + "grad_norm": 0.7645154595375061, + "learning_rate": 9.46505221346659e-06, + "loss": 2.1146, + "step": 1670 + }, + { + "epoch": 0.10762762762762763, + "grad_norm": 0.7656025886535645, + "learning_rate": 9.461848933307708e-06, + "loss": 2.1247, + "step": 1680 + }, + { + "epoch": 0.10826826826826827, + "grad_norm": 0.893290102481842, + "learning_rate": 9.458645653148824e-06, + "loss": 2.1189, + "step": 1690 + }, + { + "epoch": 0.1089089089089089, + "grad_norm": 0.6659897565841675, + "learning_rate": 9.455442372989943e-06, + "loss": 2.0933, + "step": 1700 + }, + { + "epoch": 0.1089089089089089, + "eval_loss": 2.125424861907959, + "eval_runtime": 98.9268, + "eval_samples_per_second": 10.108, + "eval_steps_per_second": 5.054, + "step": 1700 + }, + { + "epoch": 0.10954954954954955, + "grad_norm": 0.6630749106407166, + "learning_rate": 9.45223909283106e-06, + "loss": 2.1147, + "step": 1710 + }, + { + "epoch": 0.11019019019019019, + "grad_norm": 0.7636531591415405, + "learning_rate": 9.449035812672178e-06, + "loss": 2.1104, + "step": 1720 + }, + { + "epoch": 0.11083083083083083, + "grad_norm": 0.6244649887084961, + "learning_rate": 9.445832532513294e-06, + "loss": 2.0957, + "step": 1730 + }, + { + "epoch": 0.11147147147147148, + "grad_norm": 0.8571925163269043, + "learning_rate": 9.442629252354411e-06, + "loss": 2.1127, + "step": 1740 + }, + { + "epoch": 0.11211211211211211, + "grad_norm": 0.6850053071975708, + "learning_rate": 9.43942597219553e-06, + "loss": 2.1045, + "step": 1750 + }, + { + "epoch": 0.11275275275275275, + "grad_norm": 0.716699481010437, + "learning_rate": 9.436222692036646e-06, + "loss": 2.0728, + "step": 1760 + }, + { + "epoch": 0.1133933933933934, + "grad_norm": 0.742249071598053, + "learning_rate": 9.433019411877765e-06, + "loss": 2.0918, + "step": 1770 + }, + { + "epoch": 0.11403403403403403, + "grad_norm": 0.6848533749580383, + "learning_rate": 9.429816131718881e-06, + "loss": 2.0769, + "step": 1780 + }, + { + "epoch": 0.11467467467467468, + "grad_norm": 0.8706116080284119, + "learning_rate": 9.426612851559998e-06, + "loss": 2.1278, + "step": 1790 + }, + { + "epoch": 0.11531531531531532, + "grad_norm": 0.7491025328636169, + "learning_rate": 9.423409571401115e-06, + "loss": 2.1126, + "step": 1800 + }, + { + "epoch": 0.11531531531531532, + "eval_loss": 2.121605157852173, + "eval_runtime": 98.8478, + "eval_samples_per_second": 10.117, + "eval_steps_per_second": 5.058, + "step": 1800 + }, + { + "epoch": 0.11595595595595595, + "grad_norm": 0.8098132014274597, + "learning_rate": 9.420206291242233e-06, + "loss": 2.1015, + "step": 1810 + }, + { + "epoch": 0.1165965965965966, + "grad_norm": 0.9224849343299866, + "learning_rate": 9.41700301108335e-06, + "loss": 2.1215, + "step": 1820 + }, + { + "epoch": 0.11723723723723724, + "grad_norm": 0.6793172359466553, + "learning_rate": 9.413799730924468e-06, + "loss": 2.1115, + "step": 1830 + }, + { + "epoch": 0.11787787787787787, + "grad_norm": 0.7940657138824463, + "learning_rate": 9.410596450765585e-06, + "loss": 2.0879, + "step": 1840 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 0.947806715965271, + "learning_rate": 9.407393170606703e-06, + "loss": 2.1014, + "step": 1850 + }, + { + "epoch": 0.11915915915915916, + "grad_norm": 0.7339703440666199, + "learning_rate": 9.40418989044782e-06, + "loss": 2.1189, + "step": 1860 + }, + { + "epoch": 0.1197997997997998, + "grad_norm": 0.9161699414253235, + "learning_rate": 9.400986610288936e-06, + "loss": 2.0974, + "step": 1870 + }, + { + "epoch": 0.12044044044044044, + "grad_norm": 0.6893654465675354, + "learning_rate": 9.397783330130053e-06, + "loss": 2.096, + "step": 1880 + }, + { + "epoch": 0.12108108108108108, + "grad_norm": 0.6452145576477051, + "learning_rate": 9.394580049971171e-06, + "loss": 2.1264, + "step": 1890 + }, + { + "epoch": 0.12172172172172172, + "grad_norm": 0.8689838647842407, + "learning_rate": 9.391376769812288e-06, + "loss": 2.0991, + "step": 1900 + }, + { + "epoch": 0.12172172172172172, + "eval_loss": 2.120067834854126, + "eval_runtime": 98.9693, + "eval_samples_per_second": 10.104, + "eval_steps_per_second": 5.052, + "step": 1900 + }, + { + "epoch": 0.12236236236236236, + "grad_norm": 0.9230114221572876, + "learning_rate": 9.388173489653406e-06, + "loss": 2.1093, + "step": 1910 + }, + { + "epoch": 0.123003003003003, + "grad_norm": 0.8922761678695679, + "learning_rate": 9.384970209494523e-06, + "loss": 2.1015, + "step": 1920 + }, + { + "epoch": 0.12364364364364365, + "grad_norm": 0.7505314350128174, + "learning_rate": 9.381766929335641e-06, + "loss": 2.1449, + "step": 1930 + }, + { + "epoch": 0.12428428428428429, + "grad_norm": 0.7936354279518127, + "learning_rate": 9.378563649176758e-06, + "loss": 2.1323, + "step": 1940 + }, + { + "epoch": 0.12492492492492492, + "grad_norm": 0.8380959033966064, + "learning_rate": 9.375360369017875e-06, + "loss": 2.1223, + "step": 1950 + }, + { + "epoch": 0.12556556556556556, + "grad_norm": 0.7633472084999084, + "learning_rate": 9.372157088858991e-06, + "loss": 2.1203, + "step": 1960 + }, + { + "epoch": 0.1262062062062062, + "grad_norm": 0.6278607249259949, + "learning_rate": 9.36895380870011e-06, + "loss": 2.0937, + "step": 1970 + }, + { + "epoch": 0.12684684684684686, + "grad_norm": 0.7140986323356628, + "learning_rate": 9.365750528541226e-06, + "loss": 2.118, + "step": 1980 + }, + { + "epoch": 0.12748748748748748, + "grad_norm": 0.8503913283348083, + "learning_rate": 9.362547248382345e-06, + "loss": 2.1064, + "step": 1990 + }, + { + "epoch": 0.12812812812812813, + "grad_norm": 0.8607761263847351, + "learning_rate": 9.359343968223461e-06, + "loss": 2.1054, + "step": 2000 + }, + { + "epoch": 0.12812812812812813, + "eval_loss": 2.116664171218872, + "eval_runtime": 99.0609, + "eval_samples_per_second": 10.095, + "eval_steps_per_second": 5.047, + "step": 2000 + }, + { + "epoch": 0.12876876876876878, + "grad_norm": 0.810498833656311, + "learning_rate": 9.356140688064578e-06, + "loss": 2.1215, + "step": 2010 + }, + { + "epoch": 0.1294094094094094, + "grad_norm": 0.5971576571464539, + "learning_rate": 9.352937407905696e-06, + "loss": 2.0815, + "step": 2020 + }, + { + "epoch": 0.13005005005005005, + "grad_norm": 0.7978019714355469, + "learning_rate": 9.349734127746813e-06, + "loss": 2.0826, + "step": 2030 + }, + { + "epoch": 0.1306906906906907, + "grad_norm": 0.7856429219245911, + "learning_rate": 9.346530847587931e-06, + "loss": 2.1178, + "step": 2040 + }, + { + "epoch": 0.13133133133133132, + "grad_norm": 0.610020101070404, + "learning_rate": 9.343327567429048e-06, + "loss": 2.0959, + "step": 2050 + }, + { + "epoch": 0.13197197197197197, + "grad_norm": 0.8602128028869629, + "learning_rate": 9.340124287270167e-06, + "loss": 2.0855, + "step": 2060 + }, + { + "epoch": 0.13261261261261262, + "grad_norm": 0.7461118698120117, + "learning_rate": 9.336921007111283e-06, + "loss": 2.1051, + "step": 2070 + }, + { + "epoch": 0.13325325325325324, + "grad_norm": 0.6008646488189697, + "learning_rate": 9.3337177269524e-06, + "loss": 2.1097, + "step": 2080 + }, + { + "epoch": 0.1338938938938939, + "grad_norm": 0.7542864084243774, + "learning_rate": 9.330514446793517e-06, + "loss": 2.1057, + "step": 2090 + }, + { + "epoch": 0.13453453453453454, + "grad_norm": 0.6587682962417603, + "learning_rate": 9.327311166634635e-06, + "loss": 2.0717, + "step": 2100 + }, + { + "epoch": 0.13453453453453454, + "eval_loss": 2.113861322402954, + "eval_runtime": 98.8506, + "eval_samples_per_second": 10.116, + "eval_steps_per_second": 5.058, + "step": 2100 + }, + { + "epoch": 0.1351751751751752, + "grad_norm": 0.6886266469955444, + "learning_rate": 9.324107886475752e-06, + "loss": 2.1092, + "step": 2110 + }, + { + "epoch": 0.1358158158158158, + "grad_norm": 0.6924049854278564, + "learning_rate": 9.32090460631687e-06, + "loss": 2.0743, + "step": 2120 + }, + { + "epoch": 0.13645645645645646, + "grad_norm": 0.6305820941925049, + "learning_rate": 9.317701326157987e-06, + "loss": 2.1235, + "step": 2130 + }, + { + "epoch": 0.1370970970970971, + "grad_norm": 0.7048687934875488, + "learning_rate": 9.314498045999105e-06, + "loss": 2.0817, + "step": 2140 + }, + { + "epoch": 0.13773773773773773, + "grad_norm": 0.7016113996505737, + "learning_rate": 9.311294765840222e-06, + "loss": 2.0788, + "step": 2150 + }, + { + "epoch": 0.13837837837837838, + "grad_norm": 0.6380958557128906, + "learning_rate": 9.308091485681338e-06, + "loss": 2.0981, + "step": 2160 + }, + { + "epoch": 0.13901901901901903, + "grad_norm": 0.881716787815094, + "learning_rate": 9.304888205522455e-06, + "loss": 2.1062, + "step": 2170 + }, + { + "epoch": 0.13965965965965965, + "grad_norm": 0.6827107667922974, + "learning_rate": 9.301684925363573e-06, + "loss": 2.1052, + "step": 2180 + }, + { + "epoch": 0.1403003003003003, + "grad_norm": 0.8060847520828247, + "learning_rate": 9.29848164520469e-06, + "loss": 2.1071, + "step": 2190 + }, + { + "epoch": 0.14094094094094095, + "grad_norm": 0.6237013339996338, + "learning_rate": 9.295278365045808e-06, + "loss": 2.1083, + "step": 2200 + }, + { + "epoch": 0.14094094094094095, + "eval_loss": 2.1115634441375732, + "eval_runtime": 99.231, + "eval_samples_per_second": 10.077, + "eval_steps_per_second": 5.039, + "step": 2200 + }, + { + "epoch": 0.14158158158158157, + "grad_norm": 0.5867820978164673, + "learning_rate": 9.292075084886925e-06, + "loss": 2.1005, + "step": 2210 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 0.663402795791626, + "learning_rate": 9.288871804728042e-06, + "loss": 2.1105, + "step": 2220 + }, + { + "epoch": 0.14286286286286287, + "grad_norm": 0.7630754709243774, + "learning_rate": 9.285668524569158e-06, + "loss": 2.0621, + "step": 2230 + }, + { + "epoch": 0.1435035035035035, + "grad_norm": 0.7367675304412842, + "learning_rate": 9.282465244410277e-06, + "loss": 2.1162, + "step": 2240 + }, + { + "epoch": 0.14414414414414414, + "grad_norm": 0.7971994876861572, + "learning_rate": 9.279261964251393e-06, + "loss": 2.0683, + "step": 2250 + }, + { + "epoch": 0.1447847847847848, + "grad_norm": 0.6509613990783691, + "learning_rate": 9.276058684092512e-06, + "loss": 2.0778, + "step": 2260 + }, + { + "epoch": 0.14542542542542541, + "grad_norm": 0.7286920547485352, + "learning_rate": 9.272855403933628e-06, + "loss": 2.0727, + "step": 2270 + }, + { + "epoch": 0.14606606606606606, + "grad_norm": 0.7628468871116638, + "learning_rate": 9.269652123774747e-06, + "loss": 2.0767, + "step": 2280 + }, + { + "epoch": 0.1467067067067067, + "grad_norm": 0.7225992679595947, + "learning_rate": 9.266448843615863e-06, + "loss": 2.0786, + "step": 2290 + }, + { + "epoch": 0.14734734734734733, + "grad_norm": 0.7567510604858398, + "learning_rate": 9.26324556345698e-06, + "loss": 2.0983, + "step": 2300 + }, + { + "epoch": 0.14734734734734733, + "eval_loss": 2.1110680103302, + "eval_runtime": 99.2871, + "eval_samples_per_second": 10.072, + "eval_steps_per_second": 5.036, + "step": 2300 + }, + { + "epoch": 0.14798798798798798, + "grad_norm": 0.809766948223114, + "learning_rate": 9.260042283298098e-06, + "loss": 2.0749, + "step": 2310 + }, + { + "epoch": 0.14862862862862863, + "grad_norm": 0.7898465394973755, + "learning_rate": 9.256839003139215e-06, + "loss": 2.1061, + "step": 2320 + }, + { + "epoch": 0.14926926926926926, + "grad_norm": 0.7167459726333618, + "learning_rate": 9.253635722980333e-06, + "loss": 2.1222, + "step": 2330 + }, + { + "epoch": 0.1499099099099099, + "grad_norm": 0.7634709477424622, + "learning_rate": 9.25043244282145e-06, + "loss": 2.1032, + "step": 2340 + }, + { + "epoch": 0.15055055055055055, + "grad_norm": 0.8492723107337952, + "learning_rate": 9.247229162662567e-06, + "loss": 2.1228, + "step": 2350 + }, + { + "epoch": 0.1511911911911912, + "grad_norm": 0.6447978019714355, + "learning_rate": 9.244025882503685e-06, + "loss": 2.1031, + "step": 2360 + }, + { + "epoch": 0.15183183183183183, + "grad_norm": 0.757310688495636, + "learning_rate": 9.240822602344802e-06, + "loss": 2.1096, + "step": 2370 + }, + { + "epoch": 0.15247247247247248, + "grad_norm": 0.6055355668067932, + "learning_rate": 9.237619322185919e-06, + "loss": 2.1049, + "step": 2380 + }, + { + "epoch": 0.15311311311311313, + "grad_norm": 0.6686123013496399, + "learning_rate": 9.234416042027037e-06, + "loss": 2.0744, + "step": 2390 + }, + { + "epoch": 0.15375375375375375, + "grad_norm": 0.7597561478614807, + "learning_rate": 9.231212761868154e-06, + "loss": 2.0981, + "step": 2400 + }, + { + "epoch": 0.15375375375375375, + "eval_loss": 2.106098175048828, + "eval_runtime": 99.2501, + "eval_samples_per_second": 10.076, + "eval_steps_per_second": 5.038, + "step": 2400 + }, + { + "epoch": 0.1543943943943944, + "grad_norm": 0.635094940662384, + "learning_rate": 9.228009481709272e-06, + "loss": 2.105, + "step": 2410 + }, + { + "epoch": 0.15503503503503505, + "grad_norm": 0.8152109980583191, + "learning_rate": 9.224806201550389e-06, + "loss": 2.0686, + "step": 2420 + }, + { + "epoch": 0.15567567567567567, + "grad_norm": 0.7147831320762634, + "learning_rate": 9.221602921391505e-06, + "loss": 2.0769, + "step": 2430 + }, + { + "epoch": 0.15631631631631632, + "grad_norm": 0.8072251081466675, + "learning_rate": 9.218399641232622e-06, + "loss": 2.0943, + "step": 2440 + }, + { + "epoch": 0.15695695695695697, + "grad_norm": 0.6609693169593811, + "learning_rate": 9.21519636107374e-06, + "loss": 2.0871, + "step": 2450 + }, + { + "epoch": 0.1575975975975976, + "grad_norm": 0.8483486175537109, + "learning_rate": 9.211993080914857e-06, + "loss": 2.0795, + "step": 2460 + }, + { + "epoch": 0.15823823823823824, + "grad_norm": 0.6840553283691406, + "learning_rate": 9.208789800755975e-06, + "loss": 2.0868, + "step": 2470 + }, + { + "epoch": 0.1588788788788789, + "grad_norm": 0.7068288326263428, + "learning_rate": 9.205586520597092e-06, + "loss": 2.0649, + "step": 2480 + }, + { + "epoch": 0.1595195195195195, + "grad_norm": 0.7380810976028442, + "learning_rate": 9.20238324043821e-06, + "loss": 2.0733, + "step": 2490 + }, + { + "epoch": 0.16016016016016016, + "grad_norm": 0.7049792408943176, + "learning_rate": 9.199179960279327e-06, + "loss": 2.0834, + "step": 2500 + }, + { + "epoch": 0.16016016016016016, + "eval_loss": 2.104818105697632, + "eval_runtime": 99.3161, + "eval_samples_per_second": 10.069, + "eval_steps_per_second": 5.034, + "step": 2500 + }, + { + "epoch": 0.1608008008008008, + "grad_norm": 0.6536545157432556, + "learning_rate": 9.195976680120444e-06, + "loss": 2.0919, + "step": 2510 + }, + { + "epoch": 0.16144144144144143, + "grad_norm": 0.7614091038703918, + "learning_rate": 9.19277339996156e-06, + "loss": 2.0966, + "step": 2520 + }, + { + "epoch": 0.16208208208208208, + "grad_norm": 0.696030855178833, + "learning_rate": 9.189570119802679e-06, + "loss": 2.0731, + "step": 2530 + }, + { + "epoch": 0.16272272272272273, + "grad_norm": 0.9704303741455078, + "learning_rate": 9.186366839643795e-06, + "loss": 2.0785, + "step": 2540 + }, + { + "epoch": 0.16336336336336335, + "grad_norm": 0.6941235065460205, + "learning_rate": 9.183163559484914e-06, + "loss": 2.0695, + "step": 2550 + }, + { + "epoch": 0.164004004004004, + "grad_norm": 0.7895737886428833, + "learning_rate": 9.17996027932603e-06, + "loss": 2.0907, + "step": 2560 + }, + { + "epoch": 0.16464464464464465, + "grad_norm": 0.7447530031204224, + "learning_rate": 9.176756999167147e-06, + "loss": 2.0847, + "step": 2570 + }, + { + "epoch": 0.16528528528528527, + "grad_norm": 0.7622038125991821, + "learning_rate": 9.173553719008265e-06, + "loss": 2.052, + "step": 2580 + }, + { + "epoch": 0.16592592592592592, + "grad_norm": 0.7055556178092957, + "learning_rate": 9.170350438849382e-06, + "loss": 2.0979, + "step": 2590 + }, + { + "epoch": 0.16656656656656657, + "grad_norm": 0.6614319682121277, + "learning_rate": 9.1671471586905e-06, + "loss": 2.0952, + "step": 2600 + }, + { + "epoch": 0.16656656656656657, + "eval_loss": 2.1026060581207275, + "eval_runtime": 98.8201, + "eval_samples_per_second": 10.119, + "eval_steps_per_second": 5.06, + "step": 2600 + }, + { + "epoch": 0.16720720720720722, + "grad_norm": 0.6550982594490051, + "learning_rate": 9.163943878531617e-06, + "loss": 2.0745, + "step": 2610 + }, + { + "epoch": 0.16784784784784784, + "grad_norm": 0.6947475671768188, + "learning_rate": 9.160740598372735e-06, + "loss": 2.0862, + "step": 2620 + }, + { + "epoch": 0.1684884884884885, + "grad_norm": 0.6655051112174988, + "learning_rate": 9.157537318213852e-06, + "loss": 2.0813, + "step": 2630 + }, + { + "epoch": 0.16912912912912914, + "grad_norm": 0.5886589884757996, + "learning_rate": 9.154334038054969e-06, + "loss": 2.1122, + "step": 2640 + }, + { + "epoch": 0.16976976976976976, + "grad_norm": 0.6389265060424805, + "learning_rate": 9.151130757896085e-06, + "loss": 2.0789, + "step": 2650 + }, + { + "epoch": 0.1704104104104104, + "grad_norm": 0.6327618360519409, + "learning_rate": 9.147927477737204e-06, + "loss": 2.0449, + "step": 2660 + }, + { + "epoch": 0.17105105105105106, + "grad_norm": 0.7381449341773987, + "learning_rate": 9.14472419757832e-06, + "loss": 2.0934, + "step": 2670 + }, + { + "epoch": 0.17169169169169168, + "grad_norm": 0.6343180537223816, + "learning_rate": 9.141520917419439e-06, + "loss": 2.0768, + "step": 2680 + }, + { + "epoch": 0.17233233233233233, + "grad_norm": 0.6861628293991089, + "learning_rate": 9.138317637260556e-06, + "loss": 2.0827, + "step": 2690 + }, + { + "epoch": 0.17297297297297298, + "grad_norm": 0.7896407842636108, + "learning_rate": 9.135114357101674e-06, + "loss": 2.0736, + "step": 2700 + }, + { + "epoch": 0.17297297297297298, + "eval_loss": 2.101081132888794, + "eval_runtime": 98.7456, + "eval_samples_per_second": 10.127, + "eval_steps_per_second": 5.064, + "step": 2700 + }, + { + "epoch": 0.1736136136136136, + "grad_norm": 0.6029135584831238, + "learning_rate": 9.13191107694279e-06, + "loss": 2.0433, + "step": 2710 + }, + { + "epoch": 0.17425425425425425, + "grad_norm": 0.9753668308258057, + "learning_rate": 9.128707796783907e-06, + "loss": 2.0716, + "step": 2720 + }, + { + "epoch": 0.1748948948948949, + "grad_norm": 0.6830595135688782, + "learning_rate": 9.125504516625024e-06, + "loss": 2.0663, + "step": 2730 + }, + { + "epoch": 0.17553553553553553, + "grad_norm": 0.6865383982658386, + "learning_rate": 9.122301236466142e-06, + "loss": 2.0515, + "step": 2740 + }, + { + "epoch": 0.17617617617617617, + "grad_norm": 0.6678487062454224, + "learning_rate": 9.119097956307259e-06, + "loss": 2.1013, + "step": 2750 + }, + { + "epoch": 0.17681681681681682, + "grad_norm": 0.7546699643135071, + "learning_rate": 9.115894676148377e-06, + "loss": 2.0351, + "step": 2760 + }, + { + "epoch": 0.17745745745745745, + "grad_norm": 0.7077041864395142, + "learning_rate": 9.112691395989494e-06, + "loss": 2.0679, + "step": 2770 + }, + { + "epoch": 0.1780980980980981, + "grad_norm": 0.682655930519104, + "learning_rate": 9.10948811583061e-06, + "loss": 2.0566, + "step": 2780 + }, + { + "epoch": 0.17873873873873874, + "grad_norm": 0.6080804467201233, + "learning_rate": 9.106284835671729e-06, + "loss": 2.0616, + "step": 2790 + }, + { + "epoch": 0.17937937937937937, + "grad_norm": 0.6820564270019531, + "learning_rate": 9.103081555512846e-06, + "loss": 2.0932, + "step": 2800 + }, + { + "epoch": 0.17937937937937937, + "eval_loss": 2.0988144874572754, + "eval_runtime": 99.1774, + "eval_samples_per_second": 10.083, + "eval_steps_per_second": 5.041, + "step": 2800 + }, + { + "epoch": 0.18002002002002002, + "grad_norm": 0.5526257753372192, + "learning_rate": 9.099878275353962e-06, + "loss": 2.1138, + "step": 2810 + }, + { + "epoch": 0.18066066066066067, + "grad_norm": 0.6748570799827576, + "learning_rate": 9.09667499519508e-06, + "loss": 2.0878, + "step": 2820 + }, + { + "epoch": 0.1813013013013013, + "grad_norm": 0.6675301194190979, + "learning_rate": 9.093471715036197e-06, + "loss": 2.0713, + "step": 2830 + }, + { + "epoch": 0.18194194194194194, + "grad_norm": 0.7286090850830078, + "learning_rate": 9.090268434877316e-06, + "loss": 2.0988, + "step": 2840 + }, + { + "epoch": 0.1825825825825826, + "grad_norm": 0.7662502527236938, + "learning_rate": 9.087065154718432e-06, + "loss": 2.075, + "step": 2850 + }, + { + "epoch": 0.18322322322322324, + "grad_norm": 0.6795459389686584, + "learning_rate": 9.083861874559549e-06, + "loss": 2.0893, + "step": 2860 + }, + { + "epoch": 0.18386386386386386, + "grad_norm": 0.6231416463851929, + "learning_rate": 9.080658594400667e-06, + "loss": 2.0917, + "step": 2870 + }, + { + "epoch": 0.1845045045045045, + "grad_norm": 0.7620142698287964, + "learning_rate": 9.077455314241784e-06, + "loss": 2.0307, + "step": 2880 + }, + { + "epoch": 0.18514514514514516, + "grad_norm": 0.6979257464408875, + "learning_rate": 9.074252034082902e-06, + "loss": 2.0662, + "step": 2890 + }, + { + "epoch": 0.18578578578578578, + "grad_norm": 0.5804643630981445, + "learning_rate": 9.071048753924019e-06, + "loss": 2.0613, + "step": 2900 + }, + { + "epoch": 0.18578578578578578, + "eval_loss": 2.0972344875335693, + "eval_runtime": 99.234, + "eval_samples_per_second": 10.077, + "eval_steps_per_second": 5.039, + "step": 2900 + }, + { + "epoch": 0.18642642642642643, + "grad_norm": 0.7419028282165527, + "learning_rate": 9.067845473765137e-06, + "loss": 2.0636, + "step": 2910 + }, + { + "epoch": 0.18706706706706708, + "grad_norm": 0.7618236541748047, + "learning_rate": 9.064642193606254e-06, + "loss": 2.0519, + "step": 2920 + }, + { + "epoch": 0.1877077077077077, + "grad_norm": 0.5812451243400574, + "learning_rate": 9.06143891344737e-06, + "loss": 2.1006, + "step": 2930 + }, + { + "epoch": 0.18834834834834835, + "grad_norm": 0.6834234595298767, + "learning_rate": 9.058235633288487e-06, + "loss": 2.0534, + "step": 2940 + }, + { + "epoch": 0.188988988988989, + "grad_norm": 0.6607987284660339, + "learning_rate": 9.055032353129606e-06, + "loss": 2.0839, + "step": 2950 + }, + { + "epoch": 0.18962962962962962, + "grad_norm": 0.6382490396499634, + "learning_rate": 9.051829072970723e-06, + "loss": 2.085, + "step": 2960 + }, + { + "epoch": 0.19027027027027027, + "grad_norm": 0.6113517880439758, + "learning_rate": 9.048625792811841e-06, + "loss": 2.076, + "step": 2970 + }, + { + "epoch": 0.19091091091091092, + "grad_norm": 0.6453644633293152, + "learning_rate": 9.045422512652958e-06, + "loss": 2.0997, + "step": 2980 + }, + { + "epoch": 0.19155155155155154, + "grad_norm": 0.6111727952957153, + "learning_rate": 9.042219232494074e-06, + "loss": 2.0862, + "step": 2990 + }, + { + "epoch": 0.1921921921921922, + "grad_norm": 0.6452946066856384, + "learning_rate": 9.039015952335191e-06, + "loss": 2.0733, + "step": 3000 + }, + { + "epoch": 0.1921921921921922, + "eval_loss": 2.0955660343170166, + "eval_runtime": 99.267, + "eval_samples_per_second": 10.074, + "eval_steps_per_second": 5.037, + "step": 3000 + }, + { + "epoch": 0.19283283283283284, + "grad_norm": 0.6006574630737305, + "learning_rate": 9.03581267217631e-06, + "loss": 2.063, + "step": 3010 + }, + { + "epoch": 0.19347347347347346, + "grad_norm": 0.6869122982025146, + "learning_rate": 9.032609392017426e-06, + "loss": 2.0952, + "step": 3020 + }, + { + "epoch": 0.1941141141141141, + "grad_norm": 0.5634675621986389, + "learning_rate": 9.029406111858544e-06, + "loss": 2.0956, + "step": 3030 + }, + { + "epoch": 0.19475475475475476, + "grad_norm": 0.6178089380264282, + "learning_rate": 9.026202831699661e-06, + "loss": 2.0201, + "step": 3040 + }, + { + "epoch": 0.19539539539539538, + "grad_norm": 0.663336992263794, + "learning_rate": 9.02299955154078e-06, + "loss": 2.0747, + "step": 3050 + }, + { + "epoch": 0.19603603603603603, + "grad_norm": 0.9335659742355347, + "learning_rate": 9.019796271381896e-06, + "loss": 2.1026, + "step": 3060 + }, + { + "epoch": 0.19667667667667668, + "grad_norm": 0.6373148560523987, + "learning_rate": 9.016592991223013e-06, + "loss": 2.0617, + "step": 3070 + }, + { + "epoch": 0.19731731731731733, + "grad_norm": 0.6642004251480103, + "learning_rate": 9.01338971106413e-06, + "loss": 2.096, + "step": 3080 + }, + { + "epoch": 0.19795795795795795, + "grad_norm": 0.6753519773483276, + "learning_rate": 9.010186430905248e-06, + "loss": 2.077, + "step": 3090 + }, + { + "epoch": 0.1985985985985986, + "grad_norm": 0.569271445274353, + "learning_rate": 9.006983150746364e-06, + "loss": 2.0502, + "step": 3100 + }, + { + "epoch": 0.1985985985985986, + "eval_loss": 2.093749761581421, + "eval_runtime": 99.1455, + "eval_samples_per_second": 10.086, + "eval_steps_per_second": 5.043, + "step": 3100 + }, + { + "epoch": 0.19923923923923925, + "grad_norm": 0.6224071979522705, + "learning_rate": 9.003779870587483e-06, + "loss": 2.0576, + "step": 3110 + }, + { + "epoch": 0.19987987987987987, + "grad_norm": 0.6364945769309998, + "learning_rate": 9.0005765904286e-06, + "loss": 2.0429, + "step": 3120 + }, + { + "epoch": 0.20052052052052052, + "grad_norm": 0.7530544996261597, + "learning_rate": 8.997373310269718e-06, + "loss": 2.0803, + "step": 3130 + }, + { + "epoch": 0.20116116116116117, + "grad_norm": 0.7073140740394592, + "learning_rate": 8.994170030110834e-06, + "loss": 2.057, + "step": 3140 + }, + { + "epoch": 0.2018018018018018, + "grad_norm": 0.631070613861084, + "learning_rate": 8.990966749951951e-06, + "loss": 2.0786, + "step": 3150 + }, + { + "epoch": 0.20244244244244244, + "grad_norm": 0.711856484413147, + "learning_rate": 8.98776346979307e-06, + "loss": 2.0761, + "step": 3160 + }, + { + "epoch": 0.2030830830830831, + "grad_norm": 0.6684698462486267, + "learning_rate": 8.984560189634186e-06, + "loss": 2.07, + "step": 3170 + }, + { + "epoch": 0.20372372372372372, + "grad_norm": 0.6424736380577087, + "learning_rate": 8.981356909475304e-06, + "loss": 2.0619, + "step": 3180 + }, + { + "epoch": 0.20436436436436436, + "grad_norm": 0.6278524994850159, + "learning_rate": 8.978153629316421e-06, + "loss": 2.0607, + "step": 3190 + }, + { + "epoch": 0.20500500500500501, + "grad_norm": 0.6745601296424866, + "learning_rate": 8.974950349157538e-06, + "loss": 2.0979, + "step": 3200 + }, + { + "epoch": 0.20500500500500501, + "eval_loss": 2.092132568359375, + "eval_runtime": 99.2104, + "eval_samples_per_second": 10.08, + "eval_steps_per_second": 5.04, + "step": 3200 + }, + { + "epoch": 0.20564564564564564, + "grad_norm": 0.6170912384986877, + "learning_rate": 8.971747068998654e-06, + "loss": 2.1035, + "step": 3210 + }, + { + "epoch": 0.20628628628628629, + "grad_norm": 0.7534422278404236, + "learning_rate": 8.968543788839773e-06, + "loss": 2.054, + "step": 3220 + }, + { + "epoch": 0.20692692692692694, + "grad_norm": 0.672907829284668, + "learning_rate": 8.96534050868089e-06, + "loss": 2.0634, + "step": 3230 + }, + { + "epoch": 0.20756756756756756, + "grad_norm": 0.6018652319908142, + "learning_rate": 8.962137228522008e-06, + "loss": 2.0809, + "step": 3240 + }, + { + "epoch": 0.2082082082082082, + "grad_norm": 0.6929556727409363, + "learning_rate": 8.958933948363125e-06, + "loss": 2.1188, + "step": 3250 + }, + { + "epoch": 0.20884884884884886, + "grad_norm": 0.6032728552818298, + "learning_rate": 8.955730668204243e-06, + "loss": 2.08, + "step": 3260 + }, + { + "epoch": 0.20948948948948948, + "grad_norm": 0.6318669319152832, + "learning_rate": 8.95252738804536e-06, + "loss": 2.0684, + "step": 3270 + }, + { + "epoch": 0.21013013013013013, + "grad_norm": 0.6870554685592651, + "learning_rate": 8.949324107886476e-06, + "loss": 2.0783, + "step": 3280 + }, + { + "epoch": 0.21077077077077078, + "grad_norm": 0.6430284976959229, + "learning_rate": 8.946120827727593e-06, + "loss": 2.084, + "step": 3290 + }, + { + "epoch": 0.2114114114114114, + "grad_norm": 0.654576301574707, + "learning_rate": 8.942917547568711e-06, + "loss": 2.1042, + "step": 3300 + }, + { + "epoch": 0.2114114114114114, + "eval_loss": 2.0920584201812744, + "eval_runtime": 99.1218, + "eval_samples_per_second": 10.089, + "eval_steps_per_second": 5.044, + "step": 3300 + }, + { + "epoch": 0.21205205205205205, + "grad_norm": 0.7410485744476318, + "learning_rate": 8.939714267409828e-06, + "loss": 2.0787, + "step": 3310 + }, + { + "epoch": 0.2126926926926927, + "grad_norm": 0.8399192690849304, + "learning_rate": 8.936510987250946e-06, + "loss": 2.0877, + "step": 3320 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.6334551572799683, + "learning_rate": 8.933307707092063e-06, + "loss": 2.0447, + "step": 3330 + }, + { + "epoch": 0.21397397397397397, + "grad_norm": 0.8268193006515503, + "learning_rate": 8.930104426933181e-06, + "loss": 2.0678, + "step": 3340 + }, + { + "epoch": 0.21461461461461462, + "grad_norm": 0.7777826189994812, + "learning_rate": 8.926901146774298e-06, + "loss": 2.0719, + "step": 3350 + }, + { + "epoch": 0.21525525525525527, + "grad_norm": 0.6634954214096069, + "learning_rate": 8.923697866615415e-06, + "loss": 2.065, + "step": 3360 + }, + { + "epoch": 0.2158958958958959, + "grad_norm": 0.6769670248031616, + "learning_rate": 8.920494586456531e-06, + "loss": 2.0575, + "step": 3370 + }, + { + "epoch": 0.21653653653653654, + "grad_norm": 0.6782456636428833, + "learning_rate": 8.91729130629765e-06, + "loss": 2.0914, + "step": 3380 + }, + { + "epoch": 0.2171771771771772, + "grad_norm": 0.5194129943847656, + "learning_rate": 8.914088026138766e-06, + "loss": 2.0793, + "step": 3390 + }, + { + "epoch": 0.2178178178178178, + "grad_norm": 0.5972870588302612, + "learning_rate": 8.910884745979885e-06, + "loss": 2.0644, + "step": 3400 + }, + { + "epoch": 0.2178178178178178, + "eval_loss": 2.089409589767456, + "eval_runtime": 99.0707, + "eval_samples_per_second": 10.094, + "eval_steps_per_second": 5.047, + "step": 3400 + }, + { + "epoch": 0.21845845845845846, + "grad_norm": 0.6698557138442993, + "learning_rate": 8.907681465821001e-06, + "loss": 2.0889, + "step": 3410 + }, + { + "epoch": 0.2190990990990991, + "grad_norm": 0.6106517910957336, + "learning_rate": 8.904478185662118e-06, + "loss": 2.0835, + "step": 3420 + }, + { + "epoch": 0.21973973973973973, + "grad_norm": 0.6931442022323608, + "learning_rate": 8.901274905503236e-06, + "loss": 2.0821, + "step": 3430 + }, + { + "epoch": 0.22038038038038038, + "grad_norm": 0.6137683391571045, + "learning_rate": 8.898071625344353e-06, + "loss": 2.0666, + "step": 3440 + }, + { + "epoch": 0.22102102102102103, + "grad_norm": 0.599586546421051, + "learning_rate": 8.894868345185471e-06, + "loss": 2.0867, + "step": 3450 + }, + { + "epoch": 0.22166166166166165, + "grad_norm": 0.6044438481330872, + "learning_rate": 8.891665065026588e-06, + "loss": 2.0751, + "step": 3460 + }, + { + "epoch": 0.2223023023023023, + "grad_norm": 0.6222479343414307, + "learning_rate": 8.888461784867706e-06, + "loss": 2.0877, + "step": 3470 + }, + { + "epoch": 0.22294294294294295, + "grad_norm": 0.5773678421974182, + "learning_rate": 8.885258504708823e-06, + "loss": 2.0502, + "step": 3480 + }, + { + "epoch": 0.22358358358358357, + "grad_norm": 0.5937193036079407, + "learning_rate": 8.88205522454994e-06, + "loss": 2.0945, + "step": 3490 + }, + { + "epoch": 0.22422422422422422, + "grad_norm": 0.6824532747268677, + "learning_rate": 8.878851944391056e-06, + "loss": 2.0528, + "step": 3500 + }, + { + "epoch": 0.22422422422422422, + "eval_loss": 2.0892724990844727, + "eval_runtime": 99.0067, + "eval_samples_per_second": 10.1, + "eval_steps_per_second": 5.05, + "step": 3500 + }, + { + "epoch": 0.22486486486486487, + "grad_norm": 0.6509242057800293, + "learning_rate": 8.875648664232175e-06, + "loss": 2.1105, + "step": 3510 + }, + { + "epoch": 0.2255055055055055, + "grad_norm": 0.6354013085365295, + "learning_rate": 8.872445384073291e-06, + "loss": 2.0607, + "step": 3520 + }, + { + "epoch": 0.22614614614614614, + "grad_norm": 0.6222078204154968, + "learning_rate": 8.86924210391441e-06, + "loss": 2.0638, + "step": 3530 + }, + { + "epoch": 0.2267867867867868, + "grad_norm": 0.5962913632392883, + "learning_rate": 8.866038823755527e-06, + "loss": 2.0623, + "step": 3540 + }, + { + "epoch": 0.22742742742742741, + "grad_norm": 0.6757493615150452, + "learning_rate": 8.862835543596643e-06, + "loss": 2.0916, + "step": 3550 + }, + { + "epoch": 0.22806806806806806, + "grad_norm": 0.5757195353507996, + "learning_rate": 8.859632263437762e-06, + "loss": 2.0587, + "step": 3560 + }, + { + "epoch": 0.2287087087087087, + "grad_norm": 0.6077573299407959, + "learning_rate": 8.856428983278878e-06, + "loss": 2.0703, + "step": 3570 + }, + { + "epoch": 0.22934934934934936, + "grad_norm": 0.6776688694953918, + "learning_rate": 8.853225703119995e-06, + "loss": 2.0932, + "step": 3580 + }, + { + "epoch": 0.22998998998998998, + "grad_norm": 0.7286055088043213, + "learning_rate": 8.850022422961113e-06, + "loss": 2.0642, + "step": 3590 + }, + { + "epoch": 0.23063063063063063, + "grad_norm": 0.5565325617790222, + "learning_rate": 8.84681914280223e-06, + "loss": 2.0548, + "step": 3600 + }, + { + "epoch": 0.23063063063063063, + "eval_loss": 2.086730480194092, + "eval_runtime": 99.0428, + "eval_samples_per_second": 10.097, + "eval_steps_per_second": 5.048, + "step": 3600 + }, + { + "epoch": 0.23127127127127128, + "grad_norm": 0.6085419058799744, + "learning_rate": 8.843615862643348e-06, + "loss": 2.0301, + "step": 3610 + }, + { + "epoch": 0.2319119119119119, + "grad_norm": 0.5839805603027344, + "learning_rate": 8.840412582484465e-06, + "loss": 2.0478, + "step": 3620 + }, + { + "epoch": 0.23255255255255255, + "grad_norm": 0.7211072444915771, + "learning_rate": 8.837209302325582e-06, + "loss": 2.0971, + "step": 3630 + }, + { + "epoch": 0.2331931931931932, + "grad_norm": 0.656085729598999, + "learning_rate": 8.834006022166698e-06, + "loss": 2.0331, + "step": 3640 + }, + { + "epoch": 0.23383383383383383, + "grad_norm": 0.5496543645858765, + "learning_rate": 8.830802742007817e-06, + "loss": 2.067, + "step": 3650 + }, + { + "epoch": 0.23447447447447448, + "grad_norm": 0.5275883078575134, + "learning_rate": 8.827599461848933e-06, + "loss": 2.0737, + "step": 3660 + }, + { + "epoch": 0.23511511511511513, + "grad_norm": 0.5332633852958679, + "learning_rate": 8.824396181690052e-06, + "loss": 2.0562, + "step": 3670 + }, + { + "epoch": 0.23575575575575575, + "grad_norm": 0.7012920379638672, + "learning_rate": 8.821192901531168e-06, + "loss": 2.0704, + "step": 3680 + }, + { + "epoch": 0.2363963963963964, + "grad_norm": 0.570817232131958, + "learning_rate": 8.817989621372287e-06, + "loss": 2.0765, + "step": 3690 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 0.6036481261253357, + "learning_rate": 8.814786341213403e-06, + "loss": 2.0905, + "step": 3700 + }, + { + "epoch": 0.23703703703703705, + "eval_loss": 2.086113452911377, + "eval_runtime": 98.9534, + "eval_samples_per_second": 10.106, + "eval_steps_per_second": 5.053, + "step": 3700 + }, + { + "epoch": 0.23767767767767767, + "grad_norm": 0.5793745517730713, + "learning_rate": 8.81158306105452e-06, + "loss": 2.0549, + "step": 3710 + }, + { + "epoch": 0.23831831831831832, + "grad_norm": 0.566124439239502, + "learning_rate": 8.808379780895638e-06, + "loss": 2.0537, + "step": 3720 + }, + { + "epoch": 0.23895895895895897, + "grad_norm": 0.5538589358329773, + "learning_rate": 8.805176500736755e-06, + "loss": 2.0669, + "step": 3730 + }, + { + "epoch": 0.2395995995995996, + "grad_norm": 0.558520495891571, + "learning_rate": 8.801973220577873e-06, + "loss": 2.1043, + "step": 3740 + }, + { + "epoch": 0.24024024024024024, + "grad_norm": 0.6997453570365906, + "learning_rate": 8.79876994041899e-06, + "loss": 2.0715, + "step": 3750 + }, + { + "epoch": 0.2408808808808809, + "grad_norm": 0.641663134098053, + "learning_rate": 8.795566660260107e-06, + "loss": 2.074, + "step": 3760 + }, + { + "epoch": 0.2415215215215215, + "grad_norm": 0.7044941782951355, + "learning_rate": 8.792363380101223e-06, + "loss": 2.0649, + "step": 3770 + }, + { + "epoch": 0.24216216216216216, + "grad_norm": 0.6711502075195312, + "learning_rate": 8.789160099942342e-06, + "loss": 2.0886, + "step": 3780 + }, + { + "epoch": 0.2428028028028028, + "grad_norm": 0.6159843802452087, + "learning_rate": 8.785956819783458e-06, + "loss": 2.0574, + "step": 3790 + }, + { + "epoch": 0.24344344344344343, + "grad_norm": 0.6314002871513367, + "learning_rate": 8.782753539624577e-06, + "loss": 2.0853, + "step": 3800 + }, + { + "epoch": 0.24344344344344343, + "eval_loss": 2.0856714248657227, + "eval_runtime": 98.9938, + "eval_samples_per_second": 10.102, + "eval_steps_per_second": 5.051, + "step": 3800 + }, + { + "epoch": 0.24408408408408408, + "grad_norm": 0.7047820091247559, + "learning_rate": 8.779550259465693e-06, + "loss": 2.0269, + "step": 3810 + }, + { + "epoch": 0.24472472472472473, + "grad_norm": 0.7113313674926758, + "learning_rate": 8.776346979306812e-06, + "loss": 2.0679, + "step": 3820 + }, + { + "epoch": 0.24536536536536538, + "grad_norm": 0.6536078453063965, + "learning_rate": 8.773143699147929e-06, + "loss": 2.0673, + "step": 3830 + }, + { + "epoch": 0.246006006006006, + "grad_norm": 0.654203474521637, + "learning_rate": 8.769940418989045e-06, + "loss": 2.1041, + "step": 3840 + }, + { + "epoch": 0.24664664664664665, + "grad_norm": 0.6029314398765564, + "learning_rate": 8.766737138830162e-06, + "loss": 2.0688, + "step": 3850 + }, + { + "epoch": 0.2472872872872873, + "grad_norm": 0.6532240509986877, + "learning_rate": 8.76353385867128e-06, + "loss": 2.0567, + "step": 3860 + }, + { + "epoch": 0.24792792792792792, + "grad_norm": 0.5579843521118164, + "learning_rate": 8.760330578512397e-06, + "loss": 2.081, + "step": 3870 + }, + { + "epoch": 0.24856856856856857, + "grad_norm": 0.7852092385292053, + "learning_rate": 8.757127298353515e-06, + "loss": 2.0464, + "step": 3880 + }, + { + "epoch": 0.24920920920920922, + "grad_norm": 0.6831695437431335, + "learning_rate": 8.753924018194632e-06, + "loss": 2.0724, + "step": 3890 + }, + { + "epoch": 0.24984984984984984, + "grad_norm": 0.5026164650917053, + "learning_rate": 8.75072073803575e-06, + "loss": 2.0894, + "step": 3900 + }, + { + "epoch": 0.24984984984984984, + "eval_loss": 2.083714008331299, + "eval_runtime": 98.9301, + "eval_samples_per_second": 10.108, + "eval_steps_per_second": 5.054, + "step": 3900 + }, + { + "epoch": 0.3339204697091006, + "grad_norm": 0.5608721375465393, + "learning_rate": 6.660688359381673e-06, + "loss": 2.047, + "step": 3910 + }, + { + "epoch": 0.3347744862556712, + "grad_norm": 0.5373656749725342, + "learning_rate": 6.652147920403109e-06, + "loss": 2.0301, + "step": 3920 + }, + { + "epoch": 0.33562850280224177, + "grad_norm": 0.5691947340965271, + "learning_rate": 6.643607481424545e-06, + "loss": 2.0573, + "step": 3930 + }, + { + "epoch": 0.3364825193488124, + "grad_norm": 0.63722163438797, + "learning_rate": 6.635067042445981e-06, + "loss": 2.0341, + "step": 3940 + }, + { + "epoch": 0.337336535895383, + "grad_norm": 0.5499609112739563, + "learning_rate": 6.626526603467419e-06, + "loss": 2.0756, + "step": 3950 + }, + { + "epoch": 0.33819055244195356, + "grad_norm": 0.6073453426361084, + "learning_rate": 6.617986164488855e-06, + "loss": 2.0549, + "step": 3960 + }, + { + "epoch": 0.33904456898852414, + "grad_norm": 0.5716825723648071, + "learning_rate": 6.609445725510292e-06, + "loss": 2.0512, + "step": 3970 + }, + { + "epoch": 0.3398985855350947, + "grad_norm": 0.5261934399604797, + "learning_rate": 6.600905286531728e-06, + "loss": 2.0627, + "step": 3980 + }, + { + "epoch": 0.34075260208166536, + "grad_norm": 0.6741161346435547, + "learning_rate": 6.592364847553164e-06, + "loss": 2.0467, + "step": 3990 + }, + { + "epoch": 0.34160661862823594, + "grad_norm": 0.5317021608352661, + "learning_rate": 6.583824408574602e-06, + "loss": 2.0704, + "step": 4000 + }, + { + "epoch": 0.34160661862823594, + "eval_loss": 2.0800442695617676, + "eval_runtime": 99.8576, + "eval_samples_per_second": 10.014, + "eval_steps_per_second": 5.007, + "step": 4000 + }, + { + "epoch": 0.3424606351748065, + "grad_norm": 0.5947962999343872, + "learning_rate": 6.575283969596038e-06, + "loss": 2.0348, + "step": 4010 + }, + { + "epoch": 0.3433146517213771, + "grad_norm": 0.5036411881446838, + "learning_rate": 6.566743530617475e-06, + "loss": 2.0589, + "step": 4020 + }, + { + "epoch": 0.3441686682679477, + "grad_norm": 0.5852782726287842, + "learning_rate": 6.558203091638911e-06, + "loss": 2.0557, + "step": 4030 + }, + { + "epoch": 0.34502268481451825, + "grad_norm": 0.5305678248405457, + "learning_rate": 6.549662652660347e-06, + "loss": 2.0487, + "step": 4040 + }, + { + "epoch": 0.3458767013610889, + "grad_norm": 0.5548572540283203, + "learning_rate": 6.541122213681784e-06, + "loss": 2.0411, + "step": 4050 + }, + { + "epoch": 0.34673071790765947, + "grad_norm": 0.5388315916061401, + "learning_rate": 6.53258177470322e-06, + "loss": 2.0236, + "step": 4060 + }, + { + "epoch": 0.34758473445423005, + "grad_norm": 0.5466106534004211, + "learning_rate": 6.5240413357246564e-06, + "loss": 2.0639, + "step": 4070 + }, + { + "epoch": 0.34843875100080063, + "grad_norm": 0.5653154253959656, + "learning_rate": 6.515500896746093e-06, + "loss": 2.0625, + "step": 4080 + }, + { + "epoch": 0.3492927675473712, + "grad_norm": 0.5568574666976929, + "learning_rate": 6.506960457767529e-06, + "loss": 2.0632, + "step": 4090 + }, + { + "epoch": 0.35014678409394184, + "grad_norm": 0.6117140650749207, + "learning_rate": 6.498420018788967e-06, + "loss": 2.0439, + "step": 4100 + }, + { + "epoch": 0.35014678409394184, + "eval_loss": 2.0791141986846924, + "eval_runtime": 99.4675, + "eval_samples_per_second": 10.054, + "eval_steps_per_second": 5.027, + "step": 4100 + }, + { + "epoch": 0.3510008006405124, + "grad_norm": 0.5750178098678589, + "learning_rate": 6.489879579810403e-06, + "loss": 2.0585, + "step": 4110 + }, + { + "epoch": 0.351854817187083, + "grad_norm": 0.5808545351028442, + "learning_rate": 6.481339140831839e-06, + "loss": 2.0522, + "step": 4120 + }, + { + "epoch": 0.3527088337336536, + "grad_norm": 0.6266270875930786, + "learning_rate": 6.472798701853276e-06, + "loss": 2.0635, + "step": 4130 + }, + { + "epoch": 0.35356285028022416, + "grad_norm": 0.6173312664031982, + "learning_rate": 6.464258262874712e-06, + "loss": 2.0667, + "step": 4140 + }, + { + "epoch": 0.3544168668267948, + "grad_norm": 0.5644112825393677, + "learning_rate": 6.45571782389615e-06, + "loss": 2.0809, + "step": 4150 + }, + { + "epoch": 0.3552708833733654, + "grad_norm": 0.5110538005828857, + "learning_rate": 6.447177384917586e-06, + "loss": 2.0636, + "step": 4160 + }, + { + "epoch": 0.35612489991993596, + "grad_norm": 0.5960603952407837, + "learning_rate": 6.438636945939022e-06, + "loss": 2.0816, + "step": 4170 + }, + { + "epoch": 0.35697891646650654, + "grad_norm": 0.6166285872459412, + "learning_rate": 6.430096506960459e-06, + "loss": 2.0512, + "step": 4180 + }, + { + "epoch": 0.3578329330130771, + "grad_norm": 0.6123340725898743, + "learning_rate": 6.421556067981895e-06, + "loss": 2.0568, + "step": 4190 + }, + { + "epoch": 0.3586869495596477, + "grad_norm": 0.5808490514755249, + "learning_rate": 6.413015629003331e-06, + "loss": 2.0651, + "step": 4200 + }, + { + "epoch": 0.3586869495596477, + "eval_loss": 2.078325033187866, + "eval_runtime": 99.4675, + "eval_samples_per_second": 10.054, + "eval_steps_per_second": 5.027, + "step": 4200 + }, + { + "epoch": 0.35954096610621833, + "grad_norm": 0.550093412399292, + "learning_rate": 6.404475190024768e-06, + "loss": 2.0594, + "step": 4210 + }, + { + "epoch": 0.3603949826527889, + "grad_norm": 0.6824820637702942, + "learning_rate": 6.395934751046204e-06, + "loss": 2.0576, + "step": 4220 + }, + { + "epoch": 0.3612489991993595, + "grad_norm": 0.5822415351867676, + "learning_rate": 6.38739431206764e-06, + "loss": 2.0594, + "step": 4230 + }, + { + "epoch": 0.36210301574593007, + "grad_norm": 0.6032450199127197, + "learning_rate": 6.378853873089077e-06, + "loss": 2.0715, + "step": 4240 + }, + { + "epoch": 0.36295703229250065, + "grad_norm": 0.5853261947631836, + "learning_rate": 6.370313434110513e-06, + "loss": 2.05, + "step": 4250 + }, + { + "epoch": 0.3638110488390713, + "grad_norm": 0.6220340132713318, + "learning_rate": 6.361772995131951e-06, + "loss": 2.1133, + "step": 4260 + }, + { + "epoch": 0.36466506538564186, + "grad_norm": 0.6459780931472778, + "learning_rate": 6.353232556153387e-06, + "loss": 2.0811, + "step": 4270 + }, + { + "epoch": 0.36551908193221244, + "grad_norm": 0.5899947881698608, + "learning_rate": 6.344692117174823e-06, + "loss": 2.0617, + "step": 4280 + }, + { + "epoch": 0.366373098478783, + "grad_norm": 0.6085699796676636, + "learning_rate": 6.33615167819626e-06, + "loss": 2.0656, + "step": 4290 + }, + { + "epoch": 0.3672271150253536, + "grad_norm": 0.6055794954299927, + "learning_rate": 6.327611239217696e-06, + "loss": 2.0569, + "step": 4300 + }, + { + "epoch": 0.3672271150253536, + "eval_loss": 2.077655553817749, + "eval_runtime": 98.8011, + "eval_samples_per_second": 10.121, + "eval_steps_per_second": 5.061, + "step": 4300 + }, + { + "epoch": 0.3680811315719242, + "grad_norm": 0.5629417300224304, + "learning_rate": 6.319070800239133e-06, + "loss": 2.0774, + "step": 4310 + }, + { + "epoch": 0.3689351481184948, + "grad_norm": 0.6046031713485718, + "learning_rate": 6.31053036126057e-06, + "loss": 2.0479, + "step": 4320 + }, + { + "epoch": 0.3697891646650654, + "grad_norm": 0.5327040553092957, + "learning_rate": 6.301989922282006e-06, + "loss": 2.0399, + "step": 4330 + }, + { + "epoch": 0.370643181211636, + "grad_norm": 0.5413053631782532, + "learning_rate": 6.293449483303442e-06, + "loss": 2.0414, + "step": 4340 + }, + { + "epoch": 0.37149719775820655, + "grad_norm": 0.6113752126693726, + "learning_rate": 6.284909044324879e-06, + "loss": 2.0206, + "step": 4350 + }, + { + "epoch": 0.37235121430477713, + "grad_norm": 0.5839834809303284, + "learning_rate": 6.2763686053463154e-06, + "loss": 2.0705, + "step": 4360 + }, + { + "epoch": 0.37320523085134777, + "grad_norm": 0.5761412382125854, + "learning_rate": 6.267828166367752e-06, + "loss": 2.0449, + "step": 4370 + }, + { + "epoch": 0.37405924739791835, + "grad_norm": 0.6302218437194824, + "learning_rate": 6.259287727389188e-06, + "loss": 2.042, + "step": 4380 + }, + { + "epoch": 0.37491326394448893, + "grad_norm": 0.5558602213859558, + "learning_rate": 6.250747288410624e-06, + "loss": 2.0546, + "step": 4390 + }, + { + "epoch": 0.3757672804910595, + "grad_norm": 0.5387922525405884, + "learning_rate": 6.242206849432061e-06, + "loss": 2.0605, + "step": 4400 + }, + { + "epoch": 0.3757672804910595, + "eval_loss": 2.076936721801758, + "eval_runtime": 99.0017, + "eval_samples_per_second": 10.101, + "eval_steps_per_second": 5.05, + "step": 4400 + }, + { + "epoch": 0.3766212970376301, + "grad_norm": 0.6043440103530884, + "learning_rate": 6.233666410453498e-06, + "loss": 2.0524, + "step": 4410 + }, + { + "epoch": 0.37747531358420067, + "grad_norm": 0.5480827689170837, + "learning_rate": 6.225125971474935e-06, + "loss": 2.0423, + "step": 4420 + }, + { + "epoch": 0.3783293301307713, + "grad_norm": 0.5199385285377502, + "learning_rate": 6.216585532496371e-06, + "loss": 2.0346, + "step": 4430 + }, + { + "epoch": 0.3791833466773419, + "grad_norm": 0.5246111154556274, + "learning_rate": 6.208045093517807e-06, + "loss": 2.0688, + "step": 4440 + }, + { + "epoch": 0.38003736322391246, + "grad_norm": 0.5296744704246521, + "learning_rate": 6.199504654539243e-06, + "loss": 2.0552, + "step": 4450 + }, + { + "epoch": 0.38089137977048304, + "grad_norm": 0.5457771420478821, + "learning_rate": 6.190964215560681e-06, + "loss": 2.0334, + "step": 4460 + }, + { + "epoch": 0.3817453963170536, + "grad_norm": 0.5534031987190247, + "learning_rate": 6.182423776582117e-06, + "loss": 2.0416, + "step": 4470 + }, + { + "epoch": 0.38259941286362426, + "grad_norm": 0.5709179043769836, + "learning_rate": 6.173883337603554e-06, + "loss": 2.0795, + "step": 4480 + }, + { + "epoch": 0.38345342941019483, + "grad_norm": 0.5756047368049622, + "learning_rate": 6.16534289862499e-06, + "loss": 2.0876, + "step": 4490 + }, + { + "epoch": 0.3843074459567654, + "grad_norm": 0.5713403820991516, + "learning_rate": 6.156802459646426e-06, + "loss": 2.0673, + "step": 4500 + }, + { + "epoch": 0.3843074459567654, + "eval_loss": 2.0763320922851562, + "eval_runtime": 98.878, + "eval_samples_per_second": 10.113, + "eval_steps_per_second": 5.057, + "step": 4500 + }, + { + "epoch": 0.385161462503336, + "grad_norm": 0.5884702205657959, + "learning_rate": 6.148262020667863e-06, + "loss": 2.0722, + "step": 4510 + }, + { + "epoch": 0.3860154790499066, + "grad_norm": 0.5472369194030762, + "learning_rate": 6.139721581689299e-06, + "loss": 2.0711, + "step": 4520 + }, + { + "epoch": 0.3868694955964772, + "grad_norm": 0.6504119634628296, + "learning_rate": 6.131181142710736e-06, + "loss": 2.0431, + "step": 4530 + }, + { + "epoch": 0.3877235121430478, + "grad_norm": 0.6400245428085327, + "learning_rate": 6.122640703732172e-06, + "loss": 2.0535, + "step": 4540 + }, + { + "epoch": 0.38857752868961837, + "grad_norm": 0.6420487761497498, + "learning_rate": 6.114100264753608e-06, + "loss": 2.0662, + "step": 4550 + }, + { + "epoch": 0.38943154523618895, + "grad_norm": 0.6386040449142456, + "learning_rate": 6.105559825775045e-06, + "loss": 2.0476, + "step": 4560 + }, + { + "epoch": 0.3902855617827595, + "grad_norm": 0.56058669090271, + "learning_rate": 6.097019386796482e-06, + "loss": 2.0601, + "step": 4570 + }, + { + "epoch": 0.3911395783293301, + "grad_norm": 0.8170416355133057, + "learning_rate": 6.088478947817918e-06, + "loss": 2.0618, + "step": 4580 + }, + { + "epoch": 0.39199359487590074, + "grad_norm": 0.7694204449653625, + "learning_rate": 6.079938508839355e-06, + "loss": 2.0863, + "step": 4590 + }, + { + "epoch": 0.3928476114224713, + "grad_norm": 0.693332850933075, + "learning_rate": 6.071398069860791e-06, + "loss": 2.0622, + "step": 4600 + }, + { + "epoch": 0.3928476114224713, + "eval_loss": 2.0757362842559814, + "eval_runtime": 98.8423, + "eval_samples_per_second": 10.117, + "eval_steps_per_second": 5.059, + "step": 4600 + }, + { + "epoch": 0.3937016279690419, + "grad_norm": 0.7094119191169739, + "learning_rate": 6.062857630882227e-06, + "loss": 2.0691, + "step": 4610 + }, + { + "epoch": 0.3945556445156125, + "grad_norm": 0.5764107704162598, + "learning_rate": 6.054317191903665e-06, + "loss": 2.04, + "step": 4620 + }, + { + "epoch": 0.39540966106218306, + "grad_norm": 0.5961339473724365, + "learning_rate": 6.045776752925101e-06, + "loss": 2.035, + "step": 4630 + }, + { + "epoch": 0.3962636776087537, + "grad_norm": 0.6162221431732178, + "learning_rate": 6.037236313946538e-06, + "loss": 2.0548, + "step": 4640 + }, + { + "epoch": 0.3971176941553243, + "grad_norm": 0.64893639087677, + "learning_rate": 6.028695874967974e-06, + "loss": 2.0286, + "step": 4650 + }, + { + "epoch": 0.39797171070189485, + "grad_norm": 0.5995349884033203, + "learning_rate": 6.02015543598941e-06, + "loss": 2.0837, + "step": 4660 + }, + { + "epoch": 0.39882572724846543, + "grad_norm": 0.526799201965332, + "learning_rate": 6.011614997010847e-06, + "loss": 2.0606, + "step": 4670 + }, + { + "epoch": 0.399679743795036, + "grad_norm": 0.570662796497345, + "learning_rate": 6.003074558032283e-06, + "loss": 2.0761, + "step": 4680 + }, + { + "epoch": 0.4005337603416066, + "grad_norm": 0.5964308381080627, + "learning_rate": 5.99453411905372e-06, + "loss": 2.0613, + "step": 4690 + }, + { + "epoch": 0.40138777688817723, + "grad_norm": 0.5859782099723816, + "learning_rate": 5.985993680075156e-06, + "loss": 2.0845, + "step": 4700 + }, + { + "epoch": 0.40138777688817723, + "eval_loss": 2.074390172958374, + "eval_runtime": 98.8514, + "eval_samples_per_second": 10.116, + "eval_steps_per_second": 5.058, + "step": 4700 + }, + { + "epoch": 0.4022417934347478, + "grad_norm": 0.531798243522644, + "learning_rate": 5.977453241096592e-06, + "loss": 2.0398, + "step": 4710 + }, + { + "epoch": 0.4030958099813184, + "grad_norm": 0.5802189111709595, + "learning_rate": 5.96891280211803e-06, + "loss": 2.0344, + "step": 4720 + }, + { + "epoch": 0.40394982652788897, + "grad_norm": 0.6294711232185364, + "learning_rate": 5.960372363139466e-06, + "loss": 2.0789, + "step": 4730 + }, + { + "epoch": 0.40480384307445955, + "grad_norm": 0.5669849514961243, + "learning_rate": 5.951831924160902e-06, + "loss": 2.0539, + "step": 4740 + }, + { + "epoch": 0.4056578596210302, + "grad_norm": 0.6840581297874451, + "learning_rate": 5.943291485182339e-06, + "loss": 2.0784, + "step": 4750 + }, + { + "epoch": 0.40651187616760076, + "grad_norm": 0.6162413954734802, + "learning_rate": 5.934751046203775e-06, + "loss": 2.0694, + "step": 4760 + }, + { + "epoch": 0.40736589271417134, + "grad_norm": 0.5217962265014648, + "learning_rate": 5.926210607225213e-06, + "loss": 2.093, + "step": 4770 + }, + { + "epoch": 0.4082199092607419, + "grad_norm": 0.5702998638153076, + "learning_rate": 5.917670168246649e-06, + "loss": 2.0329, + "step": 4780 + }, + { + "epoch": 0.4090739258073125, + "grad_norm": 0.7252790331840515, + "learning_rate": 5.909129729268085e-06, + "loss": 2.0593, + "step": 4790 + }, + { + "epoch": 0.4099279423538831, + "grad_norm": 0.542995810508728, + "learning_rate": 5.9005892902895216e-06, + "loss": 2.0582, + "step": 4800 + }, + { + "epoch": 0.4099279423538831, + "eval_loss": 2.074084997177124, + "eval_runtime": 98.584, + "eval_samples_per_second": 10.144, + "eval_steps_per_second": 5.072, + "step": 4800 + }, + { + "epoch": 0.4107819589004537, + "grad_norm": 0.5005578994750977, + "learning_rate": 5.892048851310958e-06, + "loss": 2.0291, + "step": 4810 + }, + { + "epoch": 0.4116359754470243, + "grad_norm": 0.5405492186546326, + "learning_rate": 5.883508412332395e-06, + "loss": 2.0288, + "step": 4820 + }, + { + "epoch": 0.4124899919935949, + "grad_norm": 0.487716943025589, + "learning_rate": 5.874967973353831e-06, + "loss": 2.0483, + "step": 4830 + }, + { + "epoch": 0.41334400854016545, + "grad_norm": 0.48800957202911377, + "learning_rate": 5.866427534375267e-06, + "loss": 2.0608, + "step": 4840 + }, + { + "epoch": 0.41419802508673603, + "grad_norm": 0.6357758641242981, + "learning_rate": 5.857887095396703e-06, + "loss": 2.06, + "step": 4850 + }, + { + "epoch": 0.41505204163330667, + "grad_norm": 0.5239138603210449, + "learning_rate": 5.84934665641814e-06, + "loss": 2.0216, + "step": 4860 + }, + { + "epoch": 0.41590605817987725, + "grad_norm": 0.4909784495830536, + "learning_rate": 5.840806217439576e-06, + "loss": 2.0506, + "step": 4870 + }, + { + "epoch": 0.4167600747264478, + "grad_norm": 0.5664153695106506, + "learning_rate": 5.832265778461014e-06, + "loss": 2.0261, + "step": 4880 + }, + { + "epoch": 0.4176140912730184, + "grad_norm": 0.5648866891860962, + "learning_rate": 5.82372533948245e-06, + "loss": 2.0376, + "step": 4890 + }, + { + "epoch": 0.418468107819589, + "grad_norm": 0.5945410132408142, + "learning_rate": 5.815184900503886e-06, + "loss": 2.0555, + "step": 4900 + }, + { + "epoch": 0.418468107819589, + "eval_loss": 2.0723860263824463, + "eval_runtime": 98.7193, + "eval_samples_per_second": 10.13, + "eval_steps_per_second": 5.065, + "step": 4900 + }, + { + "epoch": 0.4193221243661596, + "grad_norm": 0.5736713409423828, + "learning_rate": 5.806644461525323e-06, + "loss": 2.0666, + "step": 4910 + }, + { + "epoch": 0.4201761409127302, + "grad_norm": 0.6498487591743469, + "learning_rate": 5.798104022546759e-06, + "loss": 2.0615, + "step": 4920 + }, + { + "epoch": 0.4210301574593008, + "grad_norm": 0.694471538066864, + "learning_rate": 5.789563583568197e-06, + "loss": 2.0657, + "step": 4930 + }, + { + "epoch": 0.42188417400587136, + "grad_norm": 0.6121436357498169, + "learning_rate": 5.781023144589633e-06, + "loss": 2.0569, + "step": 4940 + }, + { + "epoch": 0.42273819055244194, + "grad_norm": 0.5739856958389282, + "learning_rate": 5.772482705611069e-06, + "loss": 2.0774, + "step": 4950 + }, + { + "epoch": 0.4235922070990125, + "grad_norm": 0.6429994702339172, + "learning_rate": 5.763942266632505e-06, + "loss": 2.0487, + "step": 4960 + }, + { + "epoch": 0.42444622364558315, + "grad_norm": 0.5392478704452515, + "learning_rate": 5.7554018276539416e-06, + "loss": 2.0283, + "step": 4970 + }, + { + "epoch": 0.42530024019215373, + "grad_norm": 0.565315842628479, + "learning_rate": 5.7468613886753784e-06, + "loss": 2.0268, + "step": 4980 + }, + { + "epoch": 0.4261542567387243, + "grad_norm": 0.669299840927124, + "learning_rate": 5.738320949696815e-06, + "loss": 2.0408, + "step": 4990 + }, + { + "epoch": 0.4270082732852949, + "grad_norm": 0.6179564595222473, + "learning_rate": 5.729780510718251e-06, + "loss": 2.0819, + "step": 5000 + }, + { + "epoch": 0.4270082732852949, + "eval_loss": 2.0726468563079834, + "eval_runtime": 98.5294, + "eval_samples_per_second": 10.149, + "eval_steps_per_second": 5.075, + "step": 5000 + }, + { + "epoch": 0.42786228983186547, + "grad_norm": 0.6250094175338745, + "learning_rate": 5.721240071739687e-06, + "loss": 2.0698, + "step": 5010 + }, + { + "epoch": 0.4287163063784361, + "grad_norm": 0.4930519759654999, + "learning_rate": 5.712699632761124e-06, + "loss": 2.0532, + "step": 5020 + }, + { + "epoch": 0.4295703229250067, + "grad_norm": 0.6259885430335999, + "learning_rate": 5.704159193782561e-06, + "loss": 2.0651, + "step": 5030 + }, + { + "epoch": 0.43042433947157727, + "grad_norm": 0.5688281059265137, + "learning_rate": 5.695618754803998e-06, + "loss": 2.057, + "step": 5040 + }, + { + "epoch": 0.43127835601814785, + "grad_norm": 0.5666618943214417, + "learning_rate": 5.687078315825434e-06, + "loss": 2.0392, + "step": 5050 + }, + { + "epoch": 0.4321323725647184, + "grad_norm": 0.7095407247543335, + "learning_rate": 5.67853787684687e-06, + "loss": 2.0294, + "step": 5060 + }, + { + "epoch": 0.432986389111289, + "grad_norm": 0.6068659424781799, + "learning_rate": 5.669997437868307e-06, + "loss": 2.0576, + "step": 5070 + }, + { + "epoch": 0.43384040565785964, + "grad_norm": 0.5697404146194458, + "learning_rate": 5.661456998889744e-06, + "loss": 2.0242, + "step": 5080 + }, + { + "epoch": 0.4346944222044302, + "grad_norm": 0.6200443506240845, + "learning_rate": 5.65291655991118e-06, + "loss": 2.0759, + "step": 5090 + }, + { + "epoch": 0.4355484387510008, + "grad_norm": 0.5611567497253418, + "learning_rate": 5.644376120932617e-06, + "loss": 2.0455, + "step": 5100 + }, + { + "epoch": 0.4355484387510008, + "eval_loss": 2.07175350189209, + "eval_runtime": 98.9289, + "eval_samples_per_second": 10.108, + "eval_steps_per_second": 5.054, + "step": 5100 + }, + { + "epoch": 0.4364024552975714, + "grad_norm": 0.6449030041694641, + "learning_rate": 5.635835681954053e-06, + "loss": 2.0377, + "step": 5110 + }, + { + "epoch": 0.43725647184414196, + "grad_norm": 0.6077715754508972, + "learning_rate": 5.627295242975489e-06, + "loss": 2.0611, + "step": 5120 + }, + { + "epoch": 0.4381104883907126, + "grad_norm": 0.5876298546791077, + "learning_rate": 5.618754803996926e-06, + "loss": 2.0634, + "step": 5130 + }, + { + "epoch": 0.4389645049372832, + "grad_norm": 0.6145151853561401, + "learning_rate": 5.610214365018362e-06, + "loss": 2.0416, + "step": 5140 + }, + { + "epoch": 0.43981852148385375, + "grad_norm": 0.5999618768692017, + "learning_rate": 5.601673926039799e-06, + "loss": 2.0692, + "step": 5150 + }, + { + "epoch": 0.44067253803042433, + "grad_norm": 0.5729487538337708, + "learning_rate": 5.593133487061235e-06, + "loss": 2.0002, + "step": 5160 + }, + { + "epoch": 0.4415265545769949, + "grad_norm": 0.5832698941230774, + "learning_rate": 5.584593048082671e-06, + "loss": 2.0488, + "step": 5170 + }, + { + "epoch": 0.4423805711235655, + "grad_norm": 0.6237705945968628, + "learning_rate": 5.576052609104109e-06, + "loss": 2.0394, + "step": 5180 + }, + { + "epoch": 0.4432345876701361, + "grad_norm": 0.5795707106590271, + "learning_rate": 5.567512170125545e-06, + "loss": 2.0547, + "step": 5190 + }, + { + "epoch": 0.4440886042167067, + "grad_norm": 0.5339071154594421, + "learning_rate": 5.558971731146982e-06, + "loss": 2.0555, + "step": 5200 + }, + { + "epoch": 0.4440886042167067, + "eval_loss": 2.0713462829589844, + "eval_runtime": 99.056, + "eval_samples_per_second": 10.095, + "eval_steps_per_second": 5.048, + "step": 5200 + }, + { + "epoch": 0.4449426207632773, + "grad_norm": 0.5246272087097168, + "learning_rate": 5.550431292168418e-06, + "loss": 2.0571, + "step": 5210 + }, + { + "epoch": 0.44579663730984787, + "grad_norm": 0.5855641961097717, + "learning_rate": 5.541890853189854e-06, + "loss": 2.0425, + "step": 5220 + }, + { + "epoch": 0.44665065385641844, + "grad_norm": 0.6182217001914978, + "learning_rate": 5.53335041421129e-06, + "loss": 2.0631, + "step": 5230 + }, + { + "epoch": 0.4475046704029891, + "grad_norm": 0.5944363474845886, + "learning_rate": 5.524809975232728e-06, + "loss": 2.0234, + "step": 5240 + }, + { + "epoch": 0.44835868694955966, + "grad_norm": 0.5854187607765198, + "learning_rate": 5.516269536254164e-06, + "loss": 2.028, + "step": 5250 + }, + { + "epoch": 0.44921270349613024, + "grad_norm": 0.5205993056297302, + "learning_rate": 5.507729097275601e-06, + "loss": 2.0795, + "step": 5260 + }, + { + "epoch": 0.4500667200427008, + "grad_norm": 0.6661815047264099, + "learning_rate": 5.499188658297037e-06, + "loss": 2.0647, + "step": 5270 + }, + { + "epoch": 0.4509207365892714, + "grad_norm": 0.5682797431945801, + "learning_rate": 5.490648219318473e-06, + "loss": 2.0187, + "step": 5280 + }, + { + "epoch": 0.45177475313584203, + "grad_norm": 0.6261969208717346, + "learning_rate": 5.48210778033991e-06, + "loss": 2.0437, + "step": 5290 + }, + { + "epoch": 0.4526287696824126, + "grad_norm": 0.6430942416191101, + "learning_rate": 5.473567341361346e-06, + "loss": 2.0278, + "step": 5300 + }, + { + "epoch": 0.4526287696824126, + "eval_loss": 2.070575714111328, + "eval_runtime": 98.6678, + "eval_samples_per_second": 10.135, + "eval_steps_per_second": 5.068, + "step": 5300 + }, + { + "epoch": 0.4534827862289832, + "grad_norm": 0.5734032988548279, + "learning_rate": 5.465026902382783e-06, + "loss": 2.055, + "step": 5310 + }, + { + "epoch": 0.45433680277555377, + "grad_norm": 0.5649511218070984, + "learning_rate": 5.456486463404219e-06, + "loss": 2.0769, + "step": 5320 + }, + { + "epoch": 0.45519081932212435, + "grad_norm": 0.5609380006790161, + "learning_rate": 5.447946024425655e-06, + "loss": 2.0287, + "step": 5330 + }, + { + "epoch": 0.45604483586869493, + "grad_norm": 0.5055214166641235, + "learning_rate": 5.439405585447093e-06, + "loss": 2.0595, + "step": 5340 + }, + { + "epoch": 0.45689885241526557, + "grad_norm": 0.6321738362312317, + "learning_rate": 5.430865146468529e-06, + "loss": 2.0208, + "step": 5350 + }, + { + "epoch": 0.45775286896183615, + "grad_norm": 0.7255424857139587, + "learning_rate": 5.422324707489965e-06, + "loss": 2.0044, + "step": 5360 + }, + { + "epoch": 0.4586068855084067, + "grad_norm": 0.6330071091651917, + "learning_rate": 5.413784268511402e-06, + "loss": 2.0307, + "step": 5370 + }, + { + "epoch": 0.4594609020549773, + "grad_norm": 0.49322399497032166, + "learning_rate": 5.405243829532838e-06, + "loss": 2.0655, + "step": 5380 + }, + { + "epoch": 0.4603149186015479, + "grad_norm": 0.5671298503875732, + "learning_rate": 5.396703390554276e-06, + "loss": 1.9937, + "step": 5390 + }, + { + "epoch": 0.4611689351481185, + "grad_norm": 0.6089677810668945, + "learning_rate": 5.388162951575712e-06, + "loss": 2.0664, + "step": 5400 + }, + { + "epoch": 0.4611689351481185, + "eval_loss": 2.0699245929718018, + "eval_runtime": 98.8733, + "eval_samples_per_second": 10.114, + "eval_steps_per_second": 5.057, + "step": 5400 + }, + { + "epoch": 0.4620229516946891, + "grad_norm": 0.6179537177085876, + "learning_rate": 5.379622512597148e-06, + "loss": 2.0713, + "step": 5410 + }, + { + "epoch": 0.4628769682412597, + "grad_norm": 0.5245445966720581, + "learning_rate": 5.3710820736185846e-06, + "loss": 2.0533, + "step": 5420 + }, + { + "epoch": 0.46373098478783026, + "grad_norm": 0.5217622518539429, + "learning_rate": 5.362541634640021e-06, + "loss": 2.069, + "step": 5430 + }, + { + "epoch": 0.46458500133440084, + "grad_norm": 0.561970591545105, + "learning_rate": 5.354001195661458e-06, + "loss": 2.0622, + "step": 5440 + }, + { + "epoch": 0.4654390178809714, + "grad_norm": 0.4817509651184082, + "learning_rate": 5.345460756682894e-06, + "loss": 2.0504, + "step": 5450 + }, + { + "epoch": 0.46629303442754205, + "grad_norm": 0.5928997993469238, + "learning_rate": 5.33692031770433e-06, + "loss": 2.0416, + "step": 5460 + }, + { + "epoch": 0.46714705097411263, + "grad_norm": 0.5909265875816345, + "learning_rate": 5.328379878725766e-06, + "loss": 2.0193, + "step": 5470 + }, + { + "epoch": 0.4680010675206832, + "grad_norm": 0.63572096824646, + "learning_rate": 5.319839439747203e-06, + "loss": 2.0607, + "step": 5480 + }, + { + "epoch": 0.4688550840672538, + "grad_norm": 0.5362561941146851, + "learning_rate": 5.31129900076864e-06, + "loss": 2.0396, + "step": 5490 + }, + { + "epoch": 0.46970910061382437, + "grad_norm": 0.5716733336448669, + "learning_rate": 5.302758561790077e-06, + "loss": 2.0624, + "step": 5500 + }, + { + "epoch": 0.46970910061382437, + "eval_loss": 2.0693435668945312, + "eval_runtime": 98.7515, + "eval_samples_per_second": 10.126, + "eval_steps_per_second": 5.063, + "step": 5500 + }, + { + "epoch": 0.470563117160395, + "grad_norm": 0.5095422863960266, + "learning_rate": 5.294218122811513e-06, + "loss": 2.0081, + "step": 5510 + }, + { + "epoch": 0.4714171337069656, + "grad_norm": 0.6708410382270813, + "learning_rate": 5.285677683832949e-06, + "loss": 2.0545, + "step": 5520 + }, + { + "epoch": 0.47227115025353616, + "grad_norm": 0.6041153073310852, + "learning_rate": 5.277137244854386e-06, + "loss": 2.0504, + "step": 5530 + }, + { + "epoch": 0.47312516680010674, + "grad_norm": 0.5644756555557251, + "learning_rate": 5.268596805875822e-06, + "loss": 2.0685, + "step": 5540 + }, + { + "epoch": 0.4739791833466773, + "grad_norm": 0.5705012679100037, + "learning_rate": 5.26005636689726e-06, + "loss": 2.0169, + "step": 5550 + }, + { + "epoch": 0.47483319989324796, + "grad_norm": 0.5327626466751099, + "learning_rate": 5.251515927918696e-06, + "loss": 2.0637, + "step": 5560 + }, + { + "epoch": 0.47568721643981854, + "grad_norm": 0.6007681488990784, + "learning_rate": 5.242975488940132e-06, + "loss": 2.0452, + "step": 5570 + }, + { + "epoch": 0.4765412329863891, + "grad_norm": 0.5735289454460144, + "learning_rate": 5.2344350499615685e-06, + "loss": 2.0617, + "step": 5580 + }, + { + "epoch": 0.4773952495329597, + "grad_norm": 0.5276105403900146, + "learning_rate": 5.2258946109830046e-06, + "loss": 2.0742, + "step": 5590 + }, + { + "epoch": 0.4782492660795303, + "grad_norm": 0.7130099534988403, + "learning_rate": 5.217354172004441e-06, + "loss": 2.0468, + "step": 5600 + }, + { + "epoch": 0.4782492660795303, + "eval_loss": 2.0682835578918457, + "eval_runtime": 99.1121, + "eval_samples_per_second": 10.09, + "eval_steps_per_second": 5.045, + "step": 5600 + }, + { + "epoch": 0.47910328262610086, + "grad_norm": 0.5376310348510742, + "learning_rate": 5.208813733025878e-06, + "loss": 2.0501, + "step": 5610 + }, + { + "epoch": 0.4799572991726715, + "grad_norm": 0.5130000114440918, + "learning_rate": 5.200273294047314e-06, + "loss": 2.044, + "step": 5620 + }, + { + "epoch": 0.48081131571924207, + "grad_norm": 0.5238226056098938, + "learning_rate": 5.19173285506875e-06, + "loss": 2.048, + "step": 5630 + }, + { + "epoch": 0.48166533226581265, + "grad_norm": 0.5614004135131836, + "learning_rate": 5.183192416090187e-06, + "loss": 2.0333, + "step": 5640 + }, + { + "epoch": 0.48251934881238323, + "grad_norm": 0.5451902747154236, + "learning_rate": 5.174651977111624e-06, + "loss": 2.0219, + "step": 5650 + }, + { + "epoch": 0.4833733653589538, + "grad_norm": 0.6397396326065063, + "learning_rate": 5.166111538133061e-06, + "loss": 2.0264, + "step": 5660 + }, + { + "epoch": 0.48422738190552445, + "grad_norm": 0.517250657081604, + "learning_rate": 5.157571099154497e-06, + "loss": 2.0612, + "step": 5670 + }, + { + "epoch": 0.485081398452095, + "grad_norm": 0.5595155358314514, + "learning_rate": 5.149030660175933e-06, + "loss": 2.0297, + "step": 5680 + }, + { + "epoch": 0.4859354149986656, + "grad_norm": 0.5949190855026245, + "learning_rate": 5.14049022119737e-06, + "loss": 2.0485, + "step": 5690 + }, + { + "epoch": 0.4867894315452362, + "grad_norm": 0.5923450589179993, + "learning_rate": 5.131949782218807e-06, + "loss": 2.0562, + "step": 5700 + }, + { + "epoch": 0.4867894315452362, + "eval_loss": 2.0683388710021973, + "eval_runtime": 98.8272, + "eval_samples_per_second": 10.119, + "eval_steps_per_second": 5.059, + "step": 5700 + }, + { + "epoch": 0.48764344809180676, + "grad_norm": 0.551146924495697, + "learning_rate": 5.123409343240244e-06, + "loss": 2.0274, + "step": 5710 + }, + { + "epoch": 0.48849746463837734, + "grad_norm": 0.6246230602264404, + "learning_rate": 5.11486890426168e-06, + "loss": 2.0715, + "step": 5720 + }, + { + "epoch": 0.489351481184948, + "grad_norm": 0.5968629121780396, + "learning_rate": 5.106328465283116e-06, + "loss": 2.0357, + "step": 5730 + }, + { + "epoch": 0.49020549773151856, + "grad_norm": 0.5812126994132996, + "learning_rate": 5.097788026304552e-06, + "loss": 2.0391, + "step": 5740 + }, + { + "epoch": 0.49105951427808914, + "grad_norm": 0.5794074535369873, + "learning_rate": 5.089247587325989e-06, + "loss": 2.0282, + "step": 5750 + }, + { + "epoch": 0.4919135308246597, + "grad_norm": 0.542427122592926, + "learning_rate": 5.080707148347425e-06, + "loss": 2.0375, + "step": 5760 + }, + { + "epoch": 0.4927675473712303, + "grad_norm": 0.601831316947937, + "learning_rate": 5.072166709368862e-06, + "loss": 2.0626, + "step": 5770 + }, + { + "epoch": 0.49362156391780093, + "grad_norm": 0.5945174098014832, + "learning_rate": 5.063626270390298e-06, + "loss": 2.029, + "step": 5780 + }, + { + "epoch": 0.4944755804643715, + "grad_norm": 0.5705648064613342, + "learning_rate": 5.055085831411734e-06, + "loss": 2.0753, + "step": 5790 + }, + { + "epoch": 0.4953295970109421, + "grad_norm": 0.5841405391693115, + "learning_rate": 5.046545392433172e-06, + "loss": 2.0684, + "step": 5800 + }, + { + "epoch": 0.4953295970109421, + "eval_loss": 2.0675156116485596, + "eval_runtime": 98.7614, + "eval_samples_per_second": 10.125, + "eval_steps_per_second": 5.063, + "step": 5800 + }, + { + "epoch": 0.49618361355751267, + "grad_norm": 0.5943530797958374, + "learning_rate": 5.038004953454608e-06, + "loss": 2.0203, + "step": 5810 + }, + { + "epoch": 0.49703763010408325, + "grad_norm": 0.5854504704475403, + "learning_rate": 5.029464514476045e-06, + "loss": 2.0291, + "step": 5820 + }, + { + "epoch": 0.49789164665065383, + "grad_norm": 0.5930184721946716, + "learning_rate": 5.020924075497481e-06, + "loss": 2.0685, + "step": 5830 + }, + { + "epoch": 0.49874566319722446, + "grad_norm": 0.5872290730476379, + "learning_rate": 5.012383636518917e-06, + "loss": 2.0351, + "step": 5840 + }, + { + "epoch": 0.49959967974379504, + "grad_norm": 0.5630264282226562, + "learning_rate": 5.003843197540355e-06, + "loss": 2.047, + "step": 5850 + }, + { + "epoch": 0.5004536962903656, + "grad_norm": 0.5513464212417603, + "learning_rate": 4.99530275856179e-06, + "loss": 2.044, + "step": 5860 + }, + { + "epoch": 0.5013077128369362, + "grad_norm": 0.6369175314903259, + "learning_rate": 4.986762319583227e-06, + "loss": 2.0288, + "step": 5870 + }, + { + "epoch": 0.5021617293835068, + "grad_norm": 0.5816603899002075, + "learning_rate": 4.978221880604664e-06, + "loss": 2.0566, + "step": 5880 + }, + { + "epoch": 0.5030157459300774, + "grad_norm": 0.5797505974769592, + "learning_rate": 4.9696814416261004e-06, + "loss": 2.0276, + "step": 5890 + }, + { + "epoch": 0.5038697624766479, + "grad_norm": 0.5142800211906433, + "learning_rate": 4.9611410026475365e-06, + "loss": 2.0813, + "step": 5900 + }, + { + "epoch": 0.5038697624766479, + "eval_loss": 2.0671706199645996, + "eval_runtime": 98.8372, + "eval_samples_per_second": 10.118, + "eval_steps_per_second": 5.059, + "step": 5900 + }, + { + "epoch": 0.5047237790232185, + "grad_norm": 0.6141909956932068, + "learning_rate": 4.9526005636689725e-06, + "loss": 2.0213, + "step": 5910 + }, + { + "epoch": 0.5055777955697892, + "grad_norm": 0.5329708456993103, + "learning_rate": 4.944060124690409e-06, + "loss": 2.0479, + "step": 5920 + }, + { + "epoch": 0.5064318121163598, + "grad_norm": 0.5788347125053406, + "learning_rate": 4.935519685711846e-06, + "loss": 2.054, + "step": 5930 + }, + { + "epoch": 0.5072858286629304, + "grad_norm": 0.6542540788650513, + "learning_rate": 4.926979246733283e-06, + "loss": 2.0455, + "step": 5940 + }, + { + "epoch": 0.508139845209501, + "grad_norm": 0.5416275858879089, + "learning_rate": 4.918438807754719e-06, + "loss": 2.0591, + "step": 5950 + }, + { + "epoch": 0.5089938617560715, + "grad_norm": 0.5104756951332092, + "learning_rate": 4.909898368776155e-06, + "loss": 2.0029, + "step": 5960 + }, + { + "epoch": 0.5098478783026421, + "grad_norm": 0.6373957395553589, + "learning_rate": 4.901357929797592e-06, + "loss": 2.0327, + "step": 5970 + }, + { + "epoch": 0.5107018948492127, + "grad_norm": 0.5671820640563965, + "learning_rate": 4.892817490819029e-06, + "loss": 2.0184, + "step": 5980 + }, + { + "epoch": 0.5115559113957833, + "grad_norm": 0.5045920610427856, + "learning_rate": 4.884277051840465e-06, + "loss": 2.0655, + "step": 5990 + }, + { + "epoch": 0.5124099279423538, + "grad_norm": 0.6044976115226746, + "learning_rate": 4.875736612861902e-06, + "loss": 2.0821, + "step": 6000 + }, + { + "epoch": 0.5124099279423538, + "eval_loss": 2.0663814544677734, + "eval_runtime": 99.3571, + "eval_samples_per_second": 10.065, + "eval_steps_per_second": 5.032, + "step": 6000 + }, + { + "epoch": 0.5132639444889244, + "grad_norm": 0.5572605133056641, + "learning_rate": 4.867196173883338e-06, + "loss": 2.0422, + "step": 6010 + }, + { + "epoch": 0.514117961035495, + "grad_norm": 0.5898503065109253, + "learning_rate": 4.858655734904775e-06, + "loss": 2.0607, + "step": 6020 + }, + { + "epoch": 0.5149719775820657, + "grad_norm": 0.5786698460578918, + "learning_rate": 4.850115295926211e-06, + "loss": 2.0438, + "step": 6030 + }, + { + "epoch": 0.5158259941286363, + "grad_norm": 0.5419963002204895, + "learning_rate": 4.8415748569476475e-06, + "loss": 2.0746, + "step": 6040 + }, + { + "epoch": 0.5166800106752069, + "grad_norm": 0.5842531323432922, + "learning_rate": 4.833034417969084e-06, + "loss": 2.038, + "step": 6050 + }, + { + "epoch": 0.5175340272217774, + "grad_norm": 0.6030822396278381, + "learning_rate": 4.8244939789905204e-06, + "loss": 2.0576, + "step": 6060 + }, + { + "epoch": 0.518388043768348, + "grad_norm": 0.5286454558372498, + "learning_rate": 4.815953540011957e-06, + "loss": 2.0291, + "step": 6070 + }, + { + "epoch": 0.5192420603149186, + "grad_norm": 0.5508376955986023, + "learning_rate": 4.807413101033393e-06, + "loss": 2.0423, + "step": 6080 + }, + { + "epoch": 0.5200960768614892, + "grad_norm": 0.5382469296455383, + "learning_rate": 4.79887266205483e-06, + "loss": 2.0574, + "step": 6090 + }, + { + "epoch": 0.5209500934080598, + "grad_norm": 0.5823044776916504, + "learning_rate": 4.790332223076267e-06, + "loss": 2.0456, + "step": 6100 + }, + { + "epoch": 0.5209500934080598, + "eval_loss": 2.0655107498168945, + "eval_runtime": 99.3273, + "eval_samples_per_second": 10.068, + "eval_steps_per_second": 5.034, + "step": 6100 + }, + { + "epoch": 0.5218041099546303, + "grad_norm": 0.5577982664108276, + "learning_rate": 4.781791784097703e-06, + "loss": 2.0618, + "step": 6110 + }, + { + "epoch": 0.5226581265012009, + "grad_norm": 0.5563845634460449, + "learning_rate": 4.77325134511914e-06, + "loss": 2.0557, + "step": 6120 + }, + { + "epoch": 0.5235121430477716, + "grad_norm": 0.6378036737442017, + "learning_rate": 4.764710906140576e-06, + "loss": 2.0338, + "step": 6130 + }, + { + "epoch": 0.5243661595943422, + "grad_norm": 0.6540238857269287, + "learning_rate": 4.756170467162012e-06, + "loss": 2.0695, + "step": 6140 + }, + { + "epoch": 0.5252201761409128, + "grad_norm": 0.5159985423088074, + "learning_rate": 4.747630028183449e-06, + "loss": 2.0505, + "step": 6150 + }, + { + "epoch": 0.5260741926874833, + "grad_norm": 0.5910516381263733, + "learning_rate": 4.739089589204886e-06, + "loss": 2.0328, + "step": 6160 + }, + { + "epoch": 0.5269282092340539, + "grad_norm": 0.6169615983963013, + "learning_rate": 4.730549150226322e-06, + "loss": 2.0317, + "step": 6170 + }, + { + "epoch": 0.5277822257806245, + "grad_norm": 0.5604709982872009, + "learning_rate": 4.722008711247759e-06, + "loss": 2.0395, + "step": 6180 + }, + { + "epoch": 0.5286362423271951, + "grad_norm": 0.5917683839797974, + "learning_rate": 4.713468272269195e-06, + "loss": 2.0571, + "step": 6190 + }, + { + "epoch": 0.5294902588737657, + "grad_norm": 0.6007024645805359, + "learning_rate": 4.7049278332906315e-06, + "loss": 2.0755, + "step": 6200 + }, + { + "epoch": 0.5294902588737657, + "eval_loss": 2.065096616744995, + "eval_runtime": 99.2711, + "eval_samples_per_second": 10.073, + "eval_steps_per_second": 5.037, + "step": 6200 + }, + { + "epoch": 0.5303442754203362, + "grad_norm": 0.6155685782432556, + "learning_rate": 4.696387394312068e-06, + "loss": 2.0576, + "step": 6210 + }, + { + "epoch": 0.5311982919669068, + "grad_norm": 0.5749043226242065, + "learning_rate": 4.687846955333504e-06, + "loss": 2.0291, + "step": 6220 + }, + { + "epoch": 0.5320523085134774, + "grad_norm": 0.6788541674613953, + "learning_rate": 4.679306516354941e-06, + "loss": 2.0164, + "step": 6230 + }, + { + "epoch": 0.5329063250600481, + "grad_norm": 0.5900527834892273, + "learning_rate": 4.670766077376377e-06, + "loss": 2.0543, + "step": 6240 + }, + { + "epoch": 0.5337603416066187, + "grad_norm": 0.5669994354248047, + "learning_rate": 4.662225638397814e-06, + "loss": 2.0272, + "step": 6250 + }, + { + "epoch": 0.5346143581531893, + "grad_norm": 0.5225370526313782, + "learning_rate": 4.65368519941925e-06, + "loss": 2.0891, + "step": 6260 + }, + { + "epoch": 0.5354683746997598, + "grad_norm": 0.5951294898986816, + "learning_rate": 4.645144760440687e-06, + "loss": 2.0397, + "step": 6270 + }, + { + "epoch": 0.5363223912463304, + "grad_norm": 0.5531771183013916, + "learning_rate": 4.636604321462124e-06, + "loss": 2.0515, + "step": 6280 + }, + { + "epoch": 0.537176407792901, + "grad_norm": 0.53160160779953, + "learning_rate": 4.62806388248356e-06, + "loss": 2.0721, + "step": 6290 + }, + { + "epoch": 0.5380304243394716, + "grad_norm": 0.5429141521453857, + "learning_rate": 4.619523443504996e-06, + "loss": 2.0588, + "step": 6300 + }, + { + "epoch": 0.5380304243394716, + "eval_loss": 2.06453275680542, + "eval_runtime": 99.2144, + "eval_samples_per_second": 10.079, + "eval_steps_per_second": 5.04, + "step": 6300 + }, + { + "epoch": 0.5388844408860421, + "grad_norm": 0.4986340403556824, + "learning_rate": 4.610983004526433e-06, + "loss": 2.0166, + "step": 6310 + }, + { + "epoch": 0.5397384574326127, + "grad_norm": 0.5342271327972412, + "learning_rate": 4.60244256554787e-06, + "loss": 2.0259, + "step": 6320 + }, + { + "epoch": 0.5405924739791833, + "grad_norm": 0.6318298578262329, + "learning_rate": 4.5939021265693066e-06, + "loss": 2.0369, + "step": 6330 + }, + { + "epoch": 0.5414464905257539, + "grad_norm": 0.5513337254524231, + "learning_rate": 4.585361687590743e-06, + "loss": 2.0554, + "step": 6340 + }, + { + "epoch": 0.5423005070723246, + "grad_norm": 0.6259679198265076, + "learning_rate": 4.576821248612179e-06, + "loss": 2.0331, + "step": 6350 + }, + { + "epoch": 0.5431545236188952, + "grad_norm": 0.5690695643424988, + "learning_rate": 4.5682808096336155e-06, + "loss": 2.0492, + "step": 6360 + }, + { + "epoch": 0.5440085401654657, + "grad_norm": 0.616737425327301, + "learning_rate": 4.559740370655052e-06, + "loss": 2.0283, + "step": 6370 + }, + { + "epoch": 0.5448625567120363, + "grad_norm": 0.6700045466423035, + "learning_rate": 4.551199931676488e-06, + "loss": 2.0369, + "step": 6380 + }, + { + "epoch": 0.5457165732586069, + "grad_norm": 0.5789743065834045, + "learning_rate": 4.542659492697925e-06, + "loss": 2.0513, + "step": 6390 + }, + { + "epoch": 0.5465705898051775, + "grad_norm": 0.503028929233551, + "learning_rate": 4.534119053719361e-06, + "loss": 2.0433, + "step": 6400 + }, + { + "epoch": 0.5465705898051775, + "eval_loss": 2.0636653900146484, + "eval_runtime": 99.1503, + "eval_samples_per_second": 10.086, + "eval_steps_per_second": 5.043, + "step": 6400 + }, + { + "epoch": 0.547424606351748, + "grad_norm": 0.5215557217597961, + "learning_rate": 4.525578614740798e-06, + "loss": 2.0615, + "step": 6410 + }, + { + "epoch": 0.5482786228983186, + "grad_norm": 0.5647512078285217, + "learning_rate": 4.517038175762234e-06, + "loss": 1.9944, + "step": 6420 + }, + { + "epoch": 0.5491326394448892, + "grad_norm": 0.5839424133300781, + "learning_rate": 4.508497736783671e-06, + "loss": 2.0645, + "step": 6430 + }, + { + "epoch": 0.5499866559914598, + "grad_norm": 0.5427899360656738, + "learning_rate": 4.499957297805108e-06, + "loss": 2.0424, + "step": 6440 + }, + { + "epoch": 0.5508406725380304, + "grad_norm": 0.5012800097465515, + "learning_rate": 4.491416858826544e-06, + "loss": 2.0631, + "step": 6450 + }, + { + "epoch": 0.5516946890846011, + "grad_norm": 0.5525136590003967, + "learning_rate": 4.482876419847981e-06, + "loss": 2.0479, + "step": 6460 + }, + { + "epoch": 0.5525487056311716, + "grad_norm": 0.5929039716720581, + "learning_rate": 4.474335980869417e-06, + "loss": 2.0691, + "step": 6470 + }, + { + "epoch": 0.5534027221777422, + "grad_norm": 0.603726863861084, + "learning_rate": 4.465795541890854e-06, + "loss": 2.0499, + "step": 6480 + }, + { + "epoch": 0.5542567387243128, + "grad_norm": 0.540930449962616, + "learning_rate": 4.4572551029122905e-06, + "loss": 2.0402, + "step": 6490 + }, + { + "epoch": 0.5551107552708834, + "grad_norm": 0.5366859436035156, + "learning_rate": 4.4487146639337266e-06, + "loss": 2.0181, + "step": 6500 + }, + { + "epoch": 0.5551107552708834, + "eval_loss": 2.063614845275879, + "eval_runtime": 99.0169, + "eval_samples_per_second": 10.099, + "eval_steps_per_second": 5.05, + "step": 6500 + }, + { + "epoch": 0.555964771817454, + "grad_norm": 0.5817170143127441, + "learning_rate": 4.4401742249551634e-06, + "loss": 2.0233, + "step": 6510 + }, + { + "epoch": 0.5568187883640245, + "grad_norm": 0.5765772461891174, + "learning_rate": 4.4316337859765995e-06, + "loss": 2.0452, + "step": 6520 + }, + { + "epoch": 0.5576728049105951, + "grad_norm": 0.5237901210784912, + "learning_rate": 4.4230933469980355e-06, + "loss": 2.0444, + "step": 6530 + }, + { + "epoch": 0.5585268214571657, + "grad_norm": 0.5542150735855103, + "learning_rate": 4.414552908019472e-06, + "loss": 2.0409, + "step": 6540 + }, + { + "epoch": 0.5593808380037363, + "grad_norm": 0.5346140265464783, + "learning_rate": 4.406012469040909e-06, + "loss": 2.0505, + "step": 6550 + }, + { + "epoch": 0.5602348545503069, + "grad_norm": 0.5194046497344971, + "learning_rate": 4.397472030062346e-06, + "loss": 2.0303, + "step": 6560 + }, + { + "epoch": 0.5610888710968776, + "grad_norm": 0.6005309820175171, + "learning_rate": 4.388931591083782e-06, + "loss": 2.0443, + "step": 6570 + }, + { + "epoch": 0.5619428876434481, + "grad_norm": 0.5340930223464966, + "learning_rate": 4.380391152105218e-06, + "loss": 2.0599, + "step": 6580 + }, + { + "epoch": 0.5627969041900187, + "grad_norm": 0.5244272947311401, + "learning_rate": 4.371850713126655e-06, + "loss": 2.0311, + "step": 6590 + }, + { + "epoch": 0.5636509207365893, + "grad_norm": 0.5906960368156433, + "learning_rate": 4.363310274148092e-06, + "loss": 2.0282, + "step": 6600 + }, + { + "epoch": 0.5636509207365893, + "eval_loss": 2.062767744064331, + "eval_runtime": 98.8722, + "eval_samples_per_second": 10.114, + "eval_steps_per_second": 5.057, + "step": 6600 + }, + { + "epoch": 0.5645049372831599, + "grad_norm": 0.5671685934066772, + "learning_rate": 4.354769835169529e-06, + "loss": 2.0688, + "step": 6610 + }, + { + "epoch": 0.5653589538297304, + "grad_norm": 0.5608102083206177, + "learning_rate": 4.346229396190965e-06, + "loss": 2.0465, + "step": 6620 + }, + { + "epoch": 0.566212970376301, + "grad_norm": 0.5790672898292542, + "learning_rate": 4.337688957212401e-06, + "loss": 2.0184, + "step": 6630 + }, + { + "epoch": 0.5670669869228716, + "grad_norm": 0.5642273426055908, + "learning_rate": 4.329148518233838e-06, + "loss": 2.0271, + "step": 6640 + }, + { + "epoch": 0.5679210034694422, + "grad_norm": 0.5485665202140808, + "learning_rate": 4.320608079255274e-06, + "loss": 2.0533, + "step": 6650 + }, + { + "epoch": 0.5687750200160128, + "grad_norm": 0.6455518007278442, + "learning_rate": 4.3120676402767105e-06, + "loss": 2.0465, + "step": 6660 + }, + { + "epoch": 0.5696290365625833, + "grad_norm": 0.5416640639305115, + "learning_rate": 4.303527201298147e-06, + "loss": 2.0891, + "step": 6670 + }, + { + "epoch": 0.570483053109154, + "grad_norm": 0.5357679724693298, + "learning_rate": 4.294986762319583e-06, + "loss": 2.0335, + "step": 6680 + }, + { + "epoch": 0.5713370696557246, + "grad_norm": 0.5133402347564697, + "learning_rate": 4.28644632334102e-06, + "loss": 2.04, + "step": 6690 + }, + { + "epoch": 0.5721910862022952, + "grad_norm": 0.5662258267402649, + "learning_rate": 4.277905884362456e-06, + "loss": 2.0373, + "step": 6700 + }, + { + "epoch": 0.5721910862022952, + "eval_loss": 2.0626380443573, + "eval_runtime": 99.1808, + "eval_samples_per_second": 10.083, + "eval_steps_per_second": 5.041, + "step": 6700 + }, + { + "epoch": 0.5730451027488658, + "grad_norm": 0.5616562962532043, + "learning_rate": 4.269365445383893e-06, + "loss": 2.0597, + "step": 6710 + }, + { + "epoch": 0.5738991192954364, + "grad_norm": 0.5565703511238098, + "learning_rate": 4.26082500640533e-06, + "loss": 2.035, + "step": 6720 + }, + { + "epoch": 0.5747531358420069, + "grad_norm": 0.6030809879302979, + "learning_rate": 4.252284567426766e-06, + "loss": 2.0444, + "step": 6730 + }, + { + "epoch": 0.5756071523885775, + "grad_norm": 0.5424668192863464, + "learning_rate": 4.243744128448203e-06, + "loss": 2.0534, + "step": 6740 + }, + { + "epoch": 0.5764611689351481, + "grad_norm": 0.5354906916618347, + "learning_rate": 4.235203689469639e-06, + "loss": 2.033, + "step": 6750 + }, + { + "epoch": 0.5773151854817187, + "grad_norm": 0.5503740906715393, + "learning_rate": 4.226663250491075e-06, + "loss": 2.0309, + "step": 6760 + }, + { + "epoch": 0.5781692020282893, + "grad_norm": 0.600864827632904, + "learning_rate": 4.218122811512512e-06, + "loss": 2.0531, + "step": 6770 + }, + { + "epoch": 0.5790232185748598, + "grad_norm": 0.6190809607505798, + "learning_rate": 4.209582372533949e-06, + "loss": 2.0308, + "step": 6780 + }, + { + "epoch": 0.5798772351214305, + "grad_norm": 0.5197070837020874, + "learning_rate": 4.201041933555386e-06, + "loss": 2.0396, + "step": 6790 + }, + { + "epoch": 0.5807312516680011, + "grad_norm": 0.6103793978691101, + "learning_rate": 4.192501494576822e-06, + "loss": 2.0258, + "step": 6800 + }, + { + "epoch": 0.5807312516680011, + "eval_loss": 2.062330722808838, + "eval_runtime": 99.1236, + "eval_samples_per_second": 10.088, + "eval_steps_per_second": 5.044, + "step": 6800 + }, + { + "epoch": 0.5815852682145717, + "grad_norm": 0.540984570980072, + "learning_rate": 4.183961055598258e-06, + "loss": 2.0359, + "step": 6810 + }, + { + "epoch": 0.5824392847611423, + "grad_norm": 0.6142724752426147, + "learning_rate": 4.1754206166196945e-06, + "loss": 2.0433, + "step": 6820 + }, + { + "epoch": 0.5832933013077128, + "grad_norm": 0.5233826041221619, + "learning_rate": 4.166880177641131e-06, + "loss": 2.0364, + "step": 6830 + }, + { + "epoch": 0.5841473178542834, + "grad_norm": 0.6223013997077942, + "learning_rate": 4.158339738662567e-06, + "loss": 2.0555, + "step": 6840 + }, + { + "epoch": 0.585001334400854, + "grad_norm": 0.6881380081176758, + "learning_rate": 4.149799299684004e-06, + "loss": 2.0529, + "step": 6850 + }, + { + "epoch": 0.5858553509474246, + "grad_norm": 0.60942143201828, + "learning_rate": 4.14125886070544e-06, + "loss": 2.0601, + "step": 6860 + }, + { + "epoch": 0.5867093674939952, + "grad_norm": 0.5089098811149597, + "learning_rate": 4.132718421726877e-06, + "loss": 2.0576, + "step": 6870 + }, + { + "epoch": 0.5875633840405657, + "grad_norm": 0.5346848964691162, + "learning_rate": 4.124177982748314e-06, + "loss": 2.0333, + "step": 6880 + }, + { + "epoch": 0.5884174005871364, + "grad_norm": 0.576016902923584, + "learning_rate": 4.11563754376975e-06, + "loss": 2.0475, + "step": 6890 + }, + { + "epoch": 0.589271417133707, + "grad_norm": 0.5049400925636292, + "learning_rate": 4.107097104791187e-06, + "loss": 2.0424, + "step": 6900 + }, + { + "epoch": 0.589271417133707, + "eval_loss": 2.061659097671509, + "eval_runtime": 99.083, + "eval_samples_per_second": 10.093, + "eval_steps_per_second": 5.046, + "step": 6900 + }, + { + "epoch": 0.5901254336802776, + "grad_norm": 0.538563072681427, + "learning_rate": 4.098556665812623e-06, + "loss": 2.0278, + "step": 6910 + }, + { + "epoch": 0.5909794502268482, + "grad_norm": 0.631867527961731, + "learning_rate": 4.09001622683406e-06, + "loss": 2.0441, + "step": 6920 + }, + { + "epoch": 0.5918334667734187, + "grad_norm": 0.6422947645187378, + "learning_rate": 4.081475787855496e-06, + "loss": 2.0301, + "step": 6930 + }, + { + "epoch": 0.5926874833199893, + "grad_norm": 0.6247473955154419, + "learning_rate": 4.072935348876933e-06, + "loss": 2.0079, + "step": 6940 + }, + { + "epoch": 0.5935414998665599, + "grad_norm": 0.5848476886749268, + "learning_rate": 4.0643949098983696e-06, + "loss": 2.014, + "step": 6950 + }, + { + "epoch": 0.5943955164131305, + "grad_norm": 0.5247405171394348, + "learning_rate": 4.055854470919806e-06, + "loss": 2.0357, + "step": 6960 + }, + { + "epoch": 0.5952495329597011, + "grad_norm": 0.5828875303268433, + "learning_rate": 4.047314031941242e-06, + "loss": 2.0203, + "step": 6970 + }, + { + "epoch": 0.5961035495062716, + "grad_norm": 0.5308778285980225, + "learning_rate": 4.0387735929626785e-06, + "loss": 2.0446, + "step": 6980 + }, + { + "epoch": 0.5969575660528422, + "grad_norm": 0.6658284068107605, + "learning_rate": 4.030233153984115e-06, + "loss": 2.076, + "step": 6990 + }, + { + "epoch": 0.5978115825994129, + "grad_norm": 0.5998433828353882, + "learning_rate": 4.021692715005552e-06, + "loss": 2.0472, + "step": 7000 + }, + { + "epoch": 0.5978115825994129, + "eval_loss": 2.0610265731811523, + "eval_runtime": 99.1916, + "eval_samples_per_second": 10.081, + "eval_steps_per_second": 5.041, + "step": 7000 + }, + { + "epoch": 0.5986655991459835, + "grad_norm": 0.6604376435279846, + "learning_rate": 4.013152276026988e-06, + "loss": 2.0522, + "step": 7010 + }, + { + "epoch": 0.5995196156925541, + "grad_norm": 0.5252701640129089, + "learning_rate": 4.004611837048424e-06, + "loss": 2.0546, + "step": 7020 + }, + { + "epoch": 0.6003736322391247, + "grad_norm": 0.5231185555458069, + "learning_rate": 3.996071398069861e-06, + "loss": 2.0597, + "step": 7030 + }, + { + "epoch": 0.6012276487856952, + "grad_norm": 0.5638464093208313, + "learning_rate": 3.987530959091297e-06, + "loss": 2.0253, + "step": 7040 + }, + { + "epoch": 0.6020816653322658, + "grad_norm": 0.6865386962890625, + "learning_rate": 3.978990520112734e-06, + "loss": 2.0161, + "step": 7050 + }, + { + "epoch": 0.6029356818788364, + "grad_norm": 0.5177867412567139, + "learning_rate": 3.970450081134171e-06, + "loss": 2.0625, + "step": 7060 + }, + { + "epoch": 0.603789698425407, + "grad_norm": 0.505166232585907, + "learning_rate": 3.961909642155607e-06, + "loss": 2.0391, + "step": 7070 + }, + { + "epoch": 0.6046437149719776, + "grad_norm": 0.5872111320495605, + "learning_rate": 3.953369203177044e-06, + "loss": 2.0006, + "step": 7080 + }, + { + "epoch": 0.6054977315185481, + "grad_norm": 0.5561593770980835, + "learning_rate": 3.94482876419848e-06, + "loss": 2.0046, + "step": 7090 + }, + { + "epoch": 0.6063517480651187, + "grad_norm": 0.4805944263935089, + "learning_rate": 3.936288325219917e-06, + "loss": 2.0452, + "step": 7100 + }, + { + "epoch": 0.6063517480651187, + "eval_loss": 2.0605008602142334, + "eval_runtime": 98.9828, + "eval_samples_per_second": 10.103, + "eval_steps_per_second": 5.051, + "step": 7100 + }, + { + "epoch": 0.6072057646116894, + "grad_norm": 0.4770943820476532, + "learning_rate": 3.9277478862413535e-06, + "loss": 2.0427, + "step": 7110 + }, + { + "epoch": 0.60805978115826, + "grad_norm": 0.48157086968421936, + "learning_rate": 3.9192074472627895e-06, + "loss": 2.0401, + "step": 7120 + }, + { + "epoch": 0.6089137977048306, + "grad_norm": 0.48966723680496216, + "learning_rate": 3.910667008284226e-06, + "loss": 2.0389, + "step": 7130 + }, + { + "epoch": 0.6097678142514011, + "grad_norm": 0.5447493195533752, + "learning_rate": 3.9021265693056624e-06, + "loss": 2.0206, + "step": 7140 + }, + { + "epoch": 0.6106218307979717, + "grad_norm": 0.5208780169487, + "learning_rate": 3.8935861303270985e-06, + "loss": 2.0661, + "step": 7150 + }, + { + "epoch": 0.6114758473445423, + "grad_norm": 0.548076868057251, + "learning_rate": 3.885045691348535e-06, + "loss": 2.0497, + "step": 7160 + }, + { + "epoch": 0.6123298638911129, + "grad_norm": 0.5473942160606384, + "learning_rate": 3.876505252369972e-06, + "loss": 2.0206, + "step": 7170 + }, + { + "epoch": 0.6131838804376835, + "grad_norm": 0.6074666976928711, + "learning_rate": 3.867964813391409e-06, + "loss": 2.0296, + "step": 7180 + }, + { + "epoch": 0.614037896984254, + "grad_norm": 0.5882102847099304, + "learning_rate": 3.859424374412845e-06, + "loss": 2.039, + "step": 7190 + }, + { + "epoch": 0.6148919135308246, + "grad_norm": 0.6144769787788391, + "learning_rate": 3.850883935434281e-06, + "loss": 2.058, + "step": 7200 + }, + { + "epoch": 0.6148919135308246, + "eval_loss": 2.060422897338867, + "eval_runtime": 98.8228, + "eval_samples_per_second": 10.119, + "eval_steps_per_second": 5.06, + "step": 7200 + }, + { + "epoch": 0.6157459300773952, + "grad_norm": 0.6333703994750977, + "learning_rate": 3.842343496455718e-06, + "loss": 2.0343, + "step": 7210 + }, + { + "epoch": 0.6165999466239659, + "grad_norm": 0.6200190782546997, + "learning_rate": 3.833803057477155e-06, + "loss": 2.0516, + "step": 7220 + }, + { + "epoch": 0.6174539631705365, + "grad_norm": 0.7211159467697144, + "learning_rate": 3.825262618498592e-06, + "loss": 2.052, + "step": 7230 + }, + { + "epoch": 0.618307979717107, + "grad_norm": 0.6151908040046692, + "learning_rate": 3.816722179520028e-06, + "loss": 2.0628, + "step": 7240 + }, + { + "epoch": 0.6191619962636776, + "grad_norm": 0.5245025157928467, + "learning_rate": 3.8081817405414638e-06, + "loss": 2.0287, + "step": 7250 + }, + { + "epoch": 0.6200160128102482, + "grad_norm": 0.5179617404937744, + "learning_rate": 3.7996413015629006e-06, + "loss": 2.0384, + "step": 7260 + }, + { + "epoch": 0.6208700293568188, + "grad_norm": 0.5305375456809998, + "learning_rate": 3.791100862584337e-06, + "loss": 2.0535, + "step": 7270 + }, + { + "epoch": 0.6217240459033894, + "grad_norm": 0.4980165362358093, + "learning_rate": 3.782560423605774e-06, + "loss": 2.0215, + "step": 7280 + }, + { + "epoch": 0.62257806244996, + "grad_norm": 0.5280580520629883, + "learning_rate": 3.7740199846272104e-06, + "loss": 2.0707, + "step": 7290 + }, + { + "epoch": 0.6234320789965305, + "grad_norm": 0.5952538251876831, + "learning_rate": 3.7654795456486464e-06, + "loss": 2.0395, + "step": 7300 + }, + { + "epoch": 0.6234320789965305, + "eval_loss": 2.059544324874878, + "eval_runtime": 99.0871, + "eval_samples_per_second": 10.092, + "eval_steps_per_second": 5.046, + "step": 7300 + }, + { + "epoch": 0.6242860955431011, + "grad_norm": 0.5330567359924316, + "learning_rate": 3.7569391066700833e-06, + "loss": 2.0756, + "step": 7310 + }, + { + "epoch": 0.6251401120896717, + "grad_norm": 0.6171514987945557, + "learning_rate": 3.7483986676915197e-06, + "loss": 2.03, + "step": 7320 + }, + { + "epoch": 0.6259941286362424, + "grad_norm": 0.5237666964530945, + "learning_rate": 3.7398582287129557e-06, + "loss": 2.0358, + "step": 7330 + }, + { + "epoch": 0.626848145182813, + "grad_norm": 0.49751928448677063, + "learning_rate": 3.7313177897343926e-06, + "loss": 2.0394, + "step": 7340 + }, + { + "epoch": 0.6277021617293835, + "grad_norm": 0.5514543652534485, + "learning_rate": 3.722777350755829e-06, + "loss": 2.0491, + "step": 7350 + }, + { + "epoch": 0.6285561782759541, + "grad_norm": 0.49144166707992554, + "learning_rate": 3.714236911777266e-06, + "loss": 2.0504, + "step": 7360 + }, + { + "epoch": 0.6294101948225247, + "grad_norm": 0.4847368896007538, + "learning_rate": 3.705696472798702e-06, + "loss": 2.0326, + "step": 7370 + }, + { + "epoch": 0.6302642113690953, + "grad_norm": 0.5339867472648621, + "learning_rate": 3.6971560338201384e-06, + "loss": 2.0161, + "step": 7380 + }, + { + "epoch": 0.6311182279156659, + "grad_norm": 0.5152847766876221, + "learning_rate": 3.6886155948415753e-06, + "loss": 2.0773, + "step": 7390 + }, + { + "epoch": 0.6319722444622364, + "grad_norm": 0.5037885308265686, + "learning_rate": 3.6800751558630117e-06, + "loss": 2.0617, + "step": 7400 + }, + { + "epoch": 0.6319722444622364, + "eval_loss": 2.0594868659973145, + "eval_runtime": 98.194, + "eval_samples_per_second": 10.184, + "eval_steps_per_second": 5.092, + "step": 7400 + }, + { + "epoch": 0.632826261008807, + "grad_norm": 0.6314968466758728, + "learning_rate": 3.6715347168844486e-06, + "loss": 2.06, + "step": 7410 + }, + { + "epoch": 0.6336802775553776, + "grad_norm": 0.5687624216079712, + "learning_rate": 3.6629942779058846e-06, + "loss": 2.0328, + "step": 7420 + }, + { + "epoch": 0.6345342941019482, + "grad_norm": 0.5416333079338074, + "learning_rate": 3.654453838927321e-06, + "loss": 2.0212, + "step": 7430 + }, + { + "epoch": 0.6353883106485189, + "grad_norm": 0.5656910538673401, + "learning_rate": 3.645913399948758e-06, + "loss": 2.0601, + "step": 7440 + }, + { + "epoch": 0.6362423271950894, + "grad_norm": 0.5079363584518433, + "learning_rate": 3.637372960970194e-06, + "loss": 2.0135, + "step": 7450 + }, + { + "epoch": 0.63709634374166, + "grad_norm": 0.4980289041996002, + "learning_rate": 3.628832521991631e-06, + "loss": 2.062, + "step": 7460 + }, + { + "epoch": 0.6379503602882306, + "grad_norm": 0.5247538089752197, + "learning_rate": 3.6202920830130672e-06, + "loss": 2.0328, + "step": 7470 + }, + { + "epoch": 0.6388043768348012, + "grad_norm": 0.5211943984031677, + "learning_rate": 3.6117516440345037e-06, + "loss": 2.0173, + "step": 7480 + }, + { + "epoch": 0.6396583933813718, + "grad_norm": 0.5648311376571655, + "learning_rate": 3.6032112050559406e-06, + "loss": 2.0673, + "step": 7490 + }, + { + "epoch": 0.6405124099279423, + "grad_norm": 0.5538136959075928, + "learning_rate": 3.5946707660773766e-06, + "loss": 2.0541, + "step": 7500 + }, + { + "epoch": 0.6405124099279423, + "eval_loss": 2.0588557720184326, + "eval_runtime": 98.1936, + "eval_samples_per_second": 10.184, + "eval_steps_per_second": 5.092, + "step": 7500 + }, + { + "epoch": 0.6413664264745129, + "grad_norm": 0.5176765322685242, + "learning_rate": 3.586130327098813e-06, + "loss": 2.0329, + "step": 7510 + }, + { + "epoch": 0.6422204430210835, + "grad_norm": 0.5251041650772095, + "learning_rate": 3.57758988812025e-06, + "loss": 2.0514, + "step": 7520 + }, + { + "epoch": 0.6430744595676541, + "grad_norm": 0.5257436633110046, + "learning_rate": 3.569049449141686e-06, + "loss": 2.0424, + "step": 7530 + }, + { + "epoch": 0.6439284761142248, + "grad_norm": 0.5508720874786377, + "learning_rate": 3.5605090101631228e-06, + "loss": 2.0182, + "step": 7540 + }, + { + "epoch": 0.6447824926607953, + "grad_norm": 0.620539665222168, + "learning_rate": 3.5519685711845592e-06, + "loss": 2.0259, + "step": 7550 + }, + { + "epoch": 0.6456365092073659, + "grad_norm": 0.5437766909599304, + "learning_rate": 3.5434281322059957e-06, + "loss": 2.0313, + "step": 7560 + }, + { + "epoch": 0.6464905257539365, + "grad_norm": 0.5391945242881775, + "learning_rate": 3.534887693227432e-06, + "loss": 2.0508, + "step": 7570 + }, + { + "epoch": 0.6473445423005071, + "grad_norm": 0.49488940834999084, + "learning_rate": 3.5263472542488686e-06, + "loss": 2.0433, + "step": 7580 + }, + { + "epoch": 0.6481985588470777, + "grad_norm": 0.49338310956954956, + "learning_rate": 3.5178068152703054e-06, + "loss": 2.0393, + "step": 7590 + }, + { + "epoch": 0.6490525753936482, + "grad_norm": 0.546257734298706, + "learning_rate": 3.509266376291742e-06, + "loss": 2.0397, + "step": 7600 + }, + { + "epoch": 0.6490525753936482, + "eval_loss": 2.0585122108459473, + "eval_runtime": 98.1579, + "eval_samples_per_second": 10.188, + "eval_steps_per_second": 5.094, + "step": 7600 + }, + { + "epoch": 0.6499065919402188, + "grad_norm": 0.5638222098350525, + "learning_rate": 3.500725937313178e-06, + "loss": 2.0369, + "step": 7610 + }, + { + "epoch": 0.6507606084867894, + "grad_norm": 0.4783158004283905, + "learning_rate": 3.4921854983346148e-06, + "loss": 2.0689, + "step": 7620 + }, + { + "epoch": 0.65161462503336, + "grad_norm": 0.5572123527526855, + "learning_rate": 3.4836450593560512e-06, + "loss": 2.0489, + "step": 7630 + }, + { + "epoch": 0.6524686415799306, + "grad_norm": 0.5384513735771179, + "learning_rate": 3.4751046203774872e-06, + "loss": 2.0261, + "step": 7640 + }, + { + "epoch": 0.6533226581265013, + "grad_norm": 0.4891786575317383, + "learning_rate": 3.466564181398924e-06, + "loss": 2.022, + "step": 7650 + }, + { + "epoch": 0.6541766746730718, + "grad_norm": 0.499834805727005, + "learning_rate": 3.4580237424203605e-06, + "loss": 2.0493, + "step": 7660 + }, + { + "epoch": 0.6550306912196424, + "grad_norm": 0.5120176672935486, + "learning_rate": 3.4494833034417974e-06, + "loss": 2.026, + "step": 7670 + }, + { + "epoch": 0.655884707766213, + "grad_norm": 0.5497430562973022, + "learning_rate": 3.440942864463234e-06, + "loss": 2.0357, + "step": 7680 + }, + { + "epoch": 0.6567387243127836, + "grad_norm": 0.5623518228530884, + "learning_rate": 3.43240242548467e-06, + "loss": 2.0151, + "step": 7690 + }, + { + "epoch": 0.6575927408593542, + "grad_norm": 0.6008960008621216, + "learning_rate": 3.4238619865061068e-06, + "loss": 2.0443, + "step": 7700 + }, + { + "epoch": 0.6575927408593542, + "eval_loss": 2.058133840560913, + "eval_runtime": 101.1458, + "eval_samples_per_second": 9.887, + "eval_steps_per_second": 4.943, + "step": 7700 + }, + { + "epoch": 0.6584467574059247, + "grad_norm": 0.5446147322654724, + "learning_rate": 3.415321547527543e-06, + "loss": 2.0068, + "step": 7710 + }, + { + "epoch": 0.6593007739524953, + "grad_norm": 0.5315154194831848, + "learning_rate": 3.40678110854898e-06, + "loss": 2.0261, + "step": 7720 + }, + { + "epoch": 0.6601547904990659, + "grad_norm": 0.5347596406936646, + "learning_rate": 3.398240669570416e-06, + "loss": 2.0595, + "step": 7730 + }, + { + "epoch": 0.6610088070456365, + "grad_norm": 0.5882947444915771, + "learning_rate": 3.3897002305918525e-06, + "loss": 2.0016, + "step": 7740 + }, + { + "epoch": 0.661862823592207, + "grad_norm": 0.5320292711257935, + "learning_rate": 3.3811597916132894e-06, + "loss": 2.0349, + "step": 7750 + }, + { + "epoch": 0.6627168401387777, + "grad_norm": 0.4915473759174347, + "learning_rate": 3.3726193526347254e-06, + "loss": 1.9986, + "step": 7760 + }, + { + "epoch": 0.6635708566853483, + "grad_norm": 0.5196160674095154, + "learning_rate": 3.3640789136561623e-06, + "loss": 2.0095, + "step": 7770 + }, + { + "epoch": 0.6644248732319189, + "grad_norm": 0.5164250135421753, + "learning_rate": 3.3555384746775987e-06, + "loss": 2.0241, + "step": 7780 + }, + { + "epoch": 0.6652788897784895, + "grad_norm": 0.5463265180587769, + "learning_rate": 3.346998035699035e-06, + "loss": 2.0216, + "step": 7790 + }, + { + "epoch": 0.6661329063250601, + "grad_norm": 0.5488865971565247, + "learning_rate": 3.338457596720472e-06, + "loss": 2.0307, + "step": 7800 + }, + { + "epoch": 0.6661329063250601, + "eval_loss": 2.0575978755950928, + "eval_runtime": 101.701, + "eval_samples_per_second": 9.833, + "eval_steps_per_second": 4.916, + "step": 7800 + }, + { + "epoch": 1.0000800320128052, + "grad_norm": 0.7740113139152527, + "learning_rate": 4.996796925048046e-06, + "loss": 2.0395, + "step": 7810 + }, + { + "epoch": 1.0013605442176872, + "grad_norm": 0.718994677066803, + "learning_rate": 4.9903907751441385e-06, + "loss": 2.04, + "step": 7820 + }, + { + "epoch": 1.002641056422569, + "grad_norm": 0.5779696702957153, + "learning_rate": 4.983984625240231e-06, + "loss": 2.0225, + "step": 7830 + }, + { + "epoch": 1.003921568627451, + "grad_norm": 0.5766503810882568, + "learning_rate": 4.977578475336323e-06, + "loss": 2.0371, + "step": 7840 + }, + { + "epoch": 1.005202080832333, + "grad_norm": 0.5812973380088806, + "learning_rate": 4.971172325432416e-06, + "loss": 2.0419, + "step": 7850 + }, + { + "epoch": 1.006482593037215, + "grad_norm": 0.623308002948761, + "learning_rate": 4.964766175528508e-06, + "loss": 2.0505, + "step": 7860 + }, + { + "epoch": 1.0077631052420968, + "grad_norm": 0.5263451933860779, + "learning_rate": 4.9583600256246e-06, + "loss": 2.0547, + "step": 7870 + }, + { + "epoch": 1.0090436174469788, + "grad_norm": 0.5198953151702881, + "learning_rate": 4.951953875720693e-06, + "loss": 2.0203, + "step": 7880 + }, + { + "epoch": 1.0103241296518608, + "grad_norm": 0.5853062868118286, + "learning_rate": 4.945547725816784e-06, + "loss": 2.0358, + "step": 7890 + }, + { + "epoch": 1.0116046418567426, + "grad_norm": 0.5690246820449829, + "learning_rate": 4.939141575912877e-06, + "loss": 2.0607, + "step": 7900 + }, + { + "epoch": 1.0116046418567426, + "eval_loss": 2.060058116912842, + "eval_runtime": 99.7073, + "eval_samples_per_second": 10.029, + "eval_steps_per_second": 5.015, + "step": 7900 + }, + { + "epoch": 1.0128851540616246, + "grad_norm": 0.6459298729896545, + "learning_rate": 4.932735426008969e-06, + "loss": 2.0358, + "step": 7910 + }, + { + "epoch": 1.0141656662665066, + "grad_norm": 0.7262232899665833, + "learning_rate": 4.926329276105061e-06, + "loss": 2.0391, + "step": 7920 + }, + { + "epoch": 1.0154461784713886, + "grad_norm": 0.575965940952301, + "learning_rate": 4.919923126201154e-06, + "loss": 2.0571, + "step": 7930 + }, + { + "epoch": 1.0167266906762704, + "grad_norm": 0.6091157793998718, + "learning_rate": 4.913516976297245e-06, + "loss": 2.0116, + "step": 7940 + }, + { + "epoch": 1.0180072028811524, + "grad_norm": 0.5863317847251892, + "learning_rate": 4.9071108263933385e-06, + "loss": 2.0389, + "step": 7950 + }, + { + "epoch": 1.0192877150860344, + "grad_norm": 0.8138238191604614, + "learning_rate": 4.90070467648943e-06, + "loss": 2.0153, + "step": 7960 + }, + { + "epoch": 1.0205682272909165, + "grad_norm": 0.5913819074630737, + "learning_rate": 4.894298526585522e-06, + "loss": 2.0315, + "step": 7970 + }, + { + "epoch": 1.0218487394957982, + "grad_norm": 0.643348217010498, + "learning_rate": 4.887892376681615e-06, + "loss": 2.0161, + "step": 7980 + }, + { + "epoch": 1.0231292517006803, + "grad_norm": 0.6365050673484802, + "learning_rate": 4.881486226777707e-06, + "loss": 2.0395, + "step": 7990 + }, + { + "epoch": 1.0244097639055623, + "grad_norm": 0.628605842590332, + "learning_rate": 4.8750800768737995e-06, + "loss": 2.066, + "step": 8000 + }, + { + "epoch": 1.0244097639055623, + "eval_loss": 2.0600643157958984, + "eval_runtime": 99.495, + "eval_samples_per_second": 10.051, + "eval_steps_per_second": 5.025, + "step": 8000 + }, + { + "epoch": 1.0256902761104443, + "grad_norm": 0.568298876285553, + "learning_rate": 4.868673926969891e-06, + "loss": 2.0526, + "step": 8010 + }, + { + "epoch": 1.026970788315326, + "grad_norm": 0.5762647986412048, + "learning_rate": 4.862267777065983e-06, + "loss": 2.0247, + "step": 8020 + }, + { + "epoch": 1.028251300520208, + "grad_norm": 0.5563777089118958, + "learning_rate": 4.8558616271620766e-06, + "loss": 2.0326, + "step": 8030 + }, + { + "epoch": 1.02953181272509, + "grad_norm": 0.5955259203910828, + "learning_rate": 4.849455477258168e-06, + "loss": 2.0452, + "step": 8040 + }, + { + "epoch": 1.0308123249299719, + "grad_norm": 0.5595470666885376, + "learning_rate": 4.8430493273542605e-06, + "loss": 2.0335, + "step": 8050 + }, + { + "epoch": 1.032092837134854, + "grad_norm": 0.5725470781326294, + "learning_rate": 4.836643177450353e-06, + "loss": 2.0465, + "step": 8060 + }, + { + "epoch": 1.033373349339736, + "grad_norm": 0.58101886510849, + "learning_rate": 4.830237027546445e-06, + "loss": 2.0177, + "step": 8070 + }, + { + "epoch": 1.034653861544618, + "grad_norm": 0.5748034119606018, + "learning_rate": 4.8238308776425376e-06, + "loss": 2.0472, + "step": 8080 + }, + { + "epoch": 1.0359343737494997, + "grad_norm": 0.5992661118507385, + "learning_rate": 4.817424727738629e-06, + "loss": 2.0163, + "step": 8090 + }, + { + "epoch": 1.0372148859543817, + "grad_norm": 0.6099635362625122, + "learning_rate": 4.8110185778347215e-06, + "loss": 2.0443, + "step": 8100 + }, + { + "epoch": 1.0372148859543817, + "eval_loss": 2.0590715408325195, + "eval_runtime": 99.7637, + "eval_samples_per_second": 10.024, + "eval_steps_per_second": 5.012, + "step": 8100 + }, + { + "epoch": 1.0384953981592637, + "grad_norm": 0.6065194010734558, + "learning_rate": 4.804612427930814e-06, + "loss": 2.0325, + "step": 8110 + }, + { + "epoch": 1.0397759103641457, + "grad_norm": 0.5869019031524658, + "learning_rate": 4.798206278026906e-06, + "loss": 2.0056, + "step": 8120 + }, + { + "epoch": 1.0410564225690275, + "grad_norm": 0.6351775527000427, + "learning_rate": 4.7918001281229986e-06, + "loss": 2.045, + "step": 8130 + }, + { + "epoch": 1.0423369347739095, + "grad_norm": 0.6560531258583069, + "learning_rate": 4.785393978219091e-06, + "loss": 2.0568, + "step": 8140 + }, + { + "epoch": 1.0436174469787916, + "grad_norm": 0.6789016127586365, + "learning_rate": 4.778987828315183e-06, + "loss": 2.0492, + "step": 8150 + }, + { + "epoch": 1.0448979591836736, + "grad_norm": 0.6798705458641052, + "learning_rate": 4.772581678411275e-06, + "loss": 2.059, + "step": 8160 + }, + { + "epoch": 1.0461784713885554, + "grad_norm": 0.539069652557373, + "learning_rate": 4.766175528507367e-06, + "loss": 2.019, + "step": 8170 + }, + { + "epoch": 1.0474589835934374, + "grad_norm": 0.5451361536979675, + "learning_rate": 4.7597693786034595e-06, + "loss": 2.0505, + "step": 8180 + }, + { + "epoch": 1.0487394957983194, + "grad_norm": 0.629255473613739, + "learning_rate": 4.753363228699552e-06, + "loss": 2.0216, + "step": 8190 + }, + { + "epoch": 1.0500200080032012, + "grad_norm": 0.6603861451148987, + "learning_rate": 4.746957078795644e-06, + "loss": 2.0266, + "step": 8200 + }, + { + "epoch": 1.0500200080032012, + "eval_loss": 2.0588185787200928, + "eval_runtime": 98.7043, + "eval_samples_per_second": 10.131, + "eval_steps_per_second": 5.066, + "step": 8200 + }, + { + "epoch": 1.0513005202080832, + "grad_norm": 0.6830970644950867, + "learning_rate": 4.740550928891736e-06, + "loss": 2.075, + "step": 8210 + }, + { + "epoch": 1.0525810324129652, + "grad_norm": 0.5211127996444702, + "learning_rate": 4.734144778987829e-06, + "loss": 2.006, + "step": 8220 + }, + { + "epoch": 1.0538615446178472, + "grad_norm": 0.6494899392127991, + "learning_rate": 4.727738629083921e-06, + "loss": 2.0478, + "step": 8230 + }, + { + "epoch": 1.055142056822729, + "grad_norm": 0.5123782753944397, + "learning_rate": 4.721332479180013e-06, + "loss": 2.0282, + "step": 8240 + }, + { + "epoch": 1.056422569027611, + "grad_norm": 0.5879936814308167, + "learning_rate": 4.714926329276105e-06, + "loss": 2.0656, + "step": 8250 + }, + { + "epoch": 1.057703081232493, + "grad_norm": 0.5019848346710205, + "learning_rate": 4.708520179372198e-06, + "loss": 2.0206, + "step": 8260 + }, + { + "epoch": 1.058983593437375, + "grad_norm": 0.54250168800354, + "learning_rate": 4.70211402946829e-06, + "loss": 2.0572, + "step": 8270 + }, + { + "epoch": 1.0602641056422568, + "grad_norm": 0.5240892767906189, + "learning_rate": 4.695707879564382e-06, + "loss": 2.0483, + "step": 8280 + }, + { + "epoch": 1.0615446178471388, + "grad_norm": 0.6275247931480408, + "learning_rate": 4.689301729660474e-06, + "loss": 2.0346, + "step": 8290 + }, + { + "epoch": 1.0628251300520208, + "grad_norm": 0.6213353276252747, + "learning_rate": 4.682895579756567e-06, + "loss": 2.0228, + "step": 8300 + }, + { + "epoch": 1.0628251300520208, + "eval_loss": 2.0585317611694336, + "eval_runtime": 98.584, + "eval_samples_per_second": 10.144, + "eval_steps_per_second": 5.072, + "step": 8300 + }, + { + "epoch": 1.0641056422569029, + "grad_norm": 0.5836710929870605, + "learning_rate": 4.6764894298526595e-06, + "loss": 2.0357, + "step": 8310 + }, + { + "epoch": 1.0653861544617846, + "grad_norm": 0.5407329797744751, + "learning_rate": 4.670083279948751e-06, + "loss": 2.0647, + "step": 8320 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.5546687841415405, + "learning_rate": 4.663677130044843e-06, + "loss": 2.0501, + "step": 8330 + }, + { + "epoch": 1.0679471788715487, + "grad_norm": 0.5902862548828125, + "learning_rate": 4.657270980140936e-06, + "loss": 2.0524, + "step": 8340 + }, + { + "epoch": 1.0692276910764307, + "grad_norm": 0.5358943939208984, + "learning_rate": 4.650864830237028e-06, + "loss": 2.0324, + "step": 8350 + }, + { + "epoch": 1.0705082032813125, + "grad_norm": 0.5571943521499634, + "learning_rate": 4.6444586803331205e-06, + "loss": 2.0092, + "step": 8360 + }, + { + "epoch": 1.0717887154861945, + "grad_norm": 0.53328937292099, + "learning_rate": 4.638052530429212e-06, + "loss": 2.0282, + "step": 8370 + }, + { + "epoch": 1.0730692276910765, + "grad_norm": 0.678426206111908, + "learning_rate": 4.631646380525305e-06, + "loss": 2.0427, + "step": 8380 + }, + { + "epoch": 1.0743497398959585, + "grad_norm": 0.6406043171882629, + "learning_rate": 4.625240230621397e-06, + "loss": 2.0246, + "step": 8390 + }, + { + "epoch": 1.0756302521008403, + "grad_norm": 0.5901826024055481, + "learning_rate": 4.618834080717489e-06, + "loss": 2.0276, + "step": 8400 + }, + { + "epoch": 1.0756302521008403, + "eval_loss": 2.058272361755371, + "eval_runtime": 98.6499, + "eval_samples_per_second": 10.137, + "eval_steps_per_second": 5.068, + "step": 8400 + }, + { + "epoch": 1.0769107643057223, + "grad_norm": 0.5525951385498047, + "learning_rate": 4.6124279308135815e-06, + "loss": 2.0386, + "step": 8410 + }, + { + "epoch": 1.0781912765106043, + "grad_norm": 0.5235487222671509, + "learning_rate": 4.606021780909674e-06, + "loss": 2.0413, + "step": 8420 + }, + { + "epoch": 1.079471788715486, + "grad_norm": 0.5588756203651428, + "learning_rate": 4.599615631005766e-06, + "loss": 2.021, + "step": 8430 + }, + { + "epoch": 1.0807523009203681, + "grad_norm": 0.5304463505744934, + "learning_rate": 4.593209481101858e-06, + "loss": 2.0393, + "step": 8440 + }, + { + "epoch": 1.0820328131252501, + "grad_norm": 0.7028805017471313, + "learning_rate": 4.58680333119795e-06, + "loss": 2.02, + "step": 8450 + }, + { + "epoch": 1.0833133253301321, + "grad_norm": 0.735097348690033, + "learning_rate": 4.5803971812940425e-06, + "loss": 2.0358, + "step": 8460 + }, + { + "epoch": 1.084593837535014, + "grad_norm": 0.6663596034049988, + "learning_rate": 4.573991031390135e-06, + "loss": 2.0365, + "step": 8470 + }, + { + "epoch": 1.085874349739896, + "grad_norm": 0.589572012424469, + "learning_rate": 4.567584881486227e-06, + "loss": 2.0444, + "step": 8480 + }, + { + "epoch": 1.087154861944778, + "grad_norm": 0.5971885323524475, + "learning_rate": 4.5611787315823196e-06, + "loss": 2.0306, + "step": 8490 + }, + { + "epoch": 1.08843537414966, + "grad_norm": 0.5263055562973022, + "learning_rate": 4.554772581678412e-06, + "loss": 2.018, + "step": 8500 + }, + { + "epoch": 1.08843537414966, + "eval_loss": 2.0575432777404785, + "eval_runtime": 98.791, + "eval_samples_per_second": 10.122, + "eval_steps_per_second": 5.061, + "step": 8500 + }, + { + "epoch": 1.0897158863545418, + "grad_norm": 0.5179014801979065, + "learning_rate": 4.548366431774504e-06, + "loss": 2.0413, + "step": 8510 + }, + { + "epoch": 1.0909963985594238, + "grad_norm": 0.5456497073173523, + "learning_rate": 4.541960281870596e-06, + "loss": 2.0544, + "step": 8520 + }, + { + "epoch": 1.0922769107643058, + "grad_norm": 0.6493037939071655, + "learning_rate": 4.535554131966688e-06, + "loss": 2.0687, + "step": 8530 + }, + { + "epoch": 1.0935574229691878, + "grad_norm": 0.5566242933273315, + "learning_rate": 4.5291479820627806e-06, + "loss": 2.0408, + "step": 8540 + }, + { + "epoch": 1.0948379351740696, + "grad_norm": 0.6045375466346741, + "learning_rate": 4.522741832158873e-06, + "loss": 2.0458, + "step": 8550 + }, + { + "epoch": 1.0961184473789516, + "grad_norm": 0.5247881412506104, + "learning_rate": 4.516335682254965e-06, + "loss": 2.0354, + "step": 8560 + }, + { + "epoch": 1.0973989595838336, + "grad_norm": 0.5688020586967468, + "learning_rate": 4.509929532351057e-06, + "loss": 2.0047, + "step": 8570 + }, + { + "epoch": 1.0986794717887154, + "grad_norm": 0.5846471190452576, + "learning_rate": 4.50352338244715e-06, + "loss": 2.0359, + "step": 8580 + }, + { + "epoch": 1.0999599839935974, + "grad_norm": 0.6008567214012146, + "learning_rate": 4.4971172325432416e-06, + "loss": 2.0378, + "step": 8590 + }, + { + "epoch": 1.1012404961984794, + "grad_norm": 0.6397348046302795, + "learning_rate": 4.490711082639334e-06, + "loss": 2.0602, + "step": 8600 + }, + { + "epoch": 1.1012404961984794, + "eval_loss": 2.0577335357666016, + "eval_runtime": 98.7283, + "eval_samples_per_second": 10.129, + "eval_steps_per_second": 5.064, + "step": 8600 + }, + { + "epoch": 1.1025210084033614, + "grad_norm": 0.6789408922195435, + "learning_rate": 4.484304932735426e-06, + "loss": 2.0403, + "step": 8610 + }, + { + "epoch": 1.1038015206082432, + "grad_norm": 0.7066412568092346, + "learning_rate": 4.477898782831519e-06, + "loss": 2.0377, + "step": 8620 + }, + { + "epoch": 1.1050820328131252, + "grad_norm": 0.6749730110168457, + "learning_rate": 4.471492632927611e-06, + "loss": 2.0187, + "step": 8630 + }, + { + "epoch": 1.1063625450180072, + "grad_norm": 0.5682045221328735, + "learning_rate": 4.4650864830237025e-06, + "loss": 2.0206, + "step": 8640 + }, + { + "epoch": 1.1076430572228892, + "grad_norm": 0.5407593846321106, + "learning_rate": 4.458680333119795e-06, + "loss": 2.0246, + "step": 8650 + }, + { + "epoch": 1.108923569427771, + "grad_norm": 0.5678638219833374, + "learning_rate": 4.452274183215888e-06, + "loss": 2.0428, + "step": 8660 + }, + { + "epoch": 1.110204081632653, + "grad_norm": 0.5593268275260925, + "learning_rate": 4.44586803331198e-06, + "loss": 2.0527, + "step": 8670 + }, + { + "epoch": 1.111484593837535, + "grad_norm": 0.6249791383743286, + "learning_rate": 4.439461883408072e-06, + "loss": 2.0137, + "step": 8680 + }, + { + "epoch": 1.112765106042417, + "grad_norm": 0.7263971567153931, + "learning_rate": 4.433055733504164e-06, + "loss": 2.0361, + "step": 8690 + }, + { + "epoch": 1.1140456182472989, + "grad_norm": 0.48830723762512207, + "learning_rate": 4.426649583600257e-06, + "loss": 2.0424, + "step": 8700 + }, + { + "epoch": 1.1140456182472989, + "eval_loss": 2.0569541454315186, + "eval_runtime": 99.0098, + "eval_samples_per_second": 10.1, + "eval_steps_per_second": 5.05, + "step": 8700 + }, + { + "epoch": 1.1153261304521809, + "grad_norm": 0.6187960505485535, + "learning_rate": 4.420243433696349e-06, + "loss": 2.0395, + "step": 8710 + }, + { + "epoch": 1.1166066426570629, + "grad_norm": 0.5017440915107727, + "learning_rate": 4.413837283792441e-06, + "loss": 2.0252, + "step": 8720 + }, + { + "epoch": 1.1178871548619447, + "grad_norm": 0.553180456161499, + "learning_rate": 4.407431133888533e-06, + "loss": 2.0321, + "step": 8730 + }, + { + "epoch": 1.1191676670668267, + "grad_norm": 0.5666632056236267, + "learning_rate": 4.401024983984626e-06, + "loss": 2.0294, + "step": 8740 + }, + { + "epoch": 1.1204481792717087, + "grad_norm": 0.5148674845695496, + "learning_rate": 4.394618834080718e-06, + "loss": 2.0564, + "step": 8750 + }, + { + "epoch": 1.1217286914765907, + "grad_norm": 0.5826708078384399, + "learning_rate": 4.38821268417681e-06, + "loss": 2.0273, + "step": 8760 + }, + { + "epoch": 1.1230092036814725, + "grad_norm": 0.5150731205940247, + "learning_rate": 4.3818065342729025e-06, + "loss": 2.0312, + "step": 8770 + }, + { + "epoch": 1.1242897158863545, + "grad_norm": 0.5426385402679443, + "learning_rate": 4.375400384368995e-06, + "loss": 2.0303, + "step": 8780 + }, + { + "epoch": 1.1255702280912365, + "grad_norm": 0.5782310962677002, + "learning_rate": 4.368994234465087e-06, + "loss": 2.0154, + "step": 8790 + }, + { + "epoch": 1.1268507402961185, + "grad_norm": 0.7119567394256592, + "learning_rate": 4.362588084561179e-06, + "loss": 2.0135, + "step": 8800 + }, + { + "epoch": 1.1268507402961185, + "eval_loss": 2.056509017944336, + "eval_runtime": 99.65, + "eval_samples_per_second": 10.035, + "eval_steps_per_second": 5.018, + "step": 8800 + }, + { + "epoch": 1.1281312525010003, + "grad_norm": 0.640973687171936, + "learning_rate": 4.356181934657271e-06, + "loss": 2.0323, + "step": 8810 + }, + { + "epoch": 1.1294117647058823, + "grad_norm": 0.5564003586769104, + "learning_rate": 4.3497757847533635e-06, + "loss": 2.0404, + "step": 8820 + }, + { + "epoch": 1.1306922769107643, + "grad_norm": 0.5527184009552002, + "learning_rate": 4.343369634849456e-06, + "loss": 2.0297, + "step": 8830 + }, + { + "epoch": 1.1319727891156464, + "grad_norm": 0.7402225136756897, + "learning_rate": 4.336963484945548e-06, + "loss": 2.0241, + "step": 8840 + }, + { + "epoch": 1.1332533013205282, + "grad_norm": 0.6281604766845703, + "learning_rate": 4.330557335041641e-06, + "loss": 2.0446, + "step": 8850 + }, + { + "epoch": 1.1345338135254102, + "grad_norm": 0.5926194787025452, + "learning_rate": 4.324151185137733e-06, + "loss": 2.018, + "step": 8860 + }, + { + "epoch": 1.1358143257302922, + "grad_norm": 0.5378844141960144, + "learning_rate": 4.3177450352338245e-06, + "loss": 2.0394, + "step": 8870 + }, + { + "epoch": 1.137094837935174, + "grad_norm": 0.5743001699447632, + "learning_rate": 4.311338885329917e-06, + "loss": 2.0412, + "step": 8880 + }, + { + "epoch": 1.138375350140056, + "grad_norm": 0.6045801639556885, + "learning_rate": 4.304932735426009e-06, + "loss": 2.04, + "step": 8890 + }, + { + "epoch": 1.139655862344938, + "grad_norm": 0.6552196741104126, + "learning_rate": 4.2985265855221016e-06, + "loss": 2.0216, + "step": 8900 + }, + { + "epoch": 1.139655862344938, + "eval_loss": 2.056703567504883, + "eval_runtime": 99.6537, + "eval_samples_per_second": 10.035, + "eval_steps_per_second": 5.017, + "step": 8900 + }, + { + "epoch": 1.14093637454982, + "grad_norm": 0.7684733867645264, + "learning_rate": 4.292120435618194e-06, + "loss": 2.0079, + "step": 8910 + }, + { + "epoch": 1.1422168867547018, + "grad_norm": 0.6571674942970276, + "learning_rate": 4.2857142857142855e-06, + "loss": 2.0467, + "step": 8920 + }, + { + "epoch": 1.1434973989595838, + "grad_norm": 0.6338903307914734, + "learning_rate": 4.279308135810379e-06, + "loss": 2.0476, + "step": 8930 + }, + { + "epoch": 1.1447779111644658, + "grad_norm": 0.6187698841094971, + "learning_rate": 4.272901985906471e-06, + "loss": 2.031, + "step": 8940 + }, + { + "epoch": 1.1460584233693478, + "grad_norm": 0.5363069772720337, + "learning_rate": 4.2664958360025626e-06, + "loss": 2.0144, + "step": 8950 + }, + { + "epoch": 1.1473389355742296, + "grad_norm": 0.5464866161346436, + "learning_rate": 4.260089686098655e-06, + "loss": 2.0175, + "step": 8960 + }, + { + "epoch": 1.1486194477791116, + "grad_norm": 0.5811886787414551, + "learning_rate": 4.253683536194747e-06, + "loss": 2.0758, + "step": 8970 + }, + { + "epoch": 1.1498999599839936, + "grad_norm": 0.5649928450584412, + "learning_rate": 4.24727738629084e-06, + "loss": 2.0875, + "step": 8980 + }, + { + "epoch": 1.1511804721888756, + "grad_norm": 0.6172623038291931, + "learning_rate": 4.240871236386932e-06, + "loss": 2.0767, + "step": 8990 + }, + { + "epoch": 1.1524609843937574, + "grad_norm": 0.7383103370666504, + "learning_rate": 4.2344650864830236e-06, + "loss": 2.0246, + "step": 9000 + }, + { + "epoch": 1.1524609843937574, + "eval_loss": 2.056033134460449, + "eval_runtime": 99.966, + "eval_samples_per_second": 10.003, + "eval_steps_per_second": 5.002, + "step": 9000 + }, + { + "epoch": 1.1537414965986394, + "grad_norm": 0.5509638786315918, + "learning_rate": 4.228058936579117e-06, + "loss": 2.031, + "step": 9010 + }, + { + "epoch": 1.1550220088035215, + "grad_norm": 0.5470526814460754, + "learning_rate": 4.221652786675208e-06, + "loss": 2.0362, + "step": 9020 + }, + { + "epoch": 1.1563025210084033, + "grad_norm": 0.5429520010948181, + "learning_rate": 4.215246636771301e-06, + "loss": 2.0339, + "step": 9030 + }, + { + "epoch": 1.1575830332132853, + "grad_norm": 0.7113022208213806, + "learning_rate": 4.208840486867393e-06, + "loss": 2.0495, + "step": 9040 + }, + { + "epoch": 1.1588635454181673, + "grad_norm": 0.59480881690979, + "learning_rate": 4.202434336963485e-06, + "loss": 2.0054, + "step": 9050 + }, + { + "epoch": 1.1601440576230493, + "grad_norm": 0.5894181728363037, + "learning_rate": 4.196028187059578e-06, + "loss": 2.0491, + "step": 9060 + }, + { + "epoch": 1.1614245698279313, + "grad_norm": 0.5791051387786865, + "learning_rate": 4.189622037155669e-06, + "loss": 2.0559, + "step": 9070 + }, + { + "epoch": 1.162705082032813, + "grad_norm": 0.6649132966995239, + "learning_rate": 4.183215887251762e-06, + "loss": 2.0723, + "step": 9080 + }, + { + "epoch": 1.163985594237695, + "grad_norm": 0.5229988694190979, + "learning_rate": 4.176809737347854e-06, + "loss": 2.0144, + "step": 9090 + }, + { + "epoch": 1.165266106442577, + "grad_norm": 0.5805575847625732, + "learning_rate": 4.170403587443946e-06, + "loss": 2.0391, + "step": 9100 + }, + { + "epoch": 1.165266106442577, + "eval_loss": 2.0554511547088623, + "eval_runtime": 98.6973, + "eval_samples_per_second": 10.132, + "eval_steps_per_second": 5.066, + "step": 9100 + }, + { + "epoch": 1.166546618647459, + "grad_norm": 0.5783365964889526, + "learning_rate": 4.163997437540039e-06, + "loss": 2.0159, + "step": 9110 + }, + { + "epoch": 1.167827130852341, + "grad_norm": 0.5854288339614868, + "learning_rate": 4.157591287636131e-06, + "loss": 2.0347, + "step": 9120 + }, + { + "epoch": 1.169107643057223, + "grad_norm": 0.5515726208686829, + "learning_rate": 4.1511851377322235e-06, + "loss": 2.0512, + "step": 9130 + }, + { + "epoch": 1.170388155262105, + "grad_norm": 0.5261589288711548, + "learning_rate": 4.144778987828316e-06, + "loss": 2.0588, + "step": 9140 + }, + { + "epoch": 1.1716686674669867, + "grad_norm": 0.5191331505775452, + "learning_rate": 4.138372837924407e-06, + "loss": 2.0423, + "step": 9150 + }, + { + "epoch": 1.1729491796718687, + "grad_norm": 0.6107770204544067, + "learning_rate": 4.1319666880205e-06, + "loss": 2.0076, + "step": 9160 + }, + { + "epoch": 1.1742296918767507, + "grad_norm": 0.5812585949897766, + "learning_rate": 4.125560538116592e-06, + "loss": 2.0123, + "step": 9170 + }, + { + "epoch": 1.1755102040816325, + "grad_norm": 0.5599560141563416, + "learning_rate": 4.1191543882126845e-06, + "loss": 2.0249, + "step": 9180 + }, + { + "epoch": 1.1767907162865145, + "grad_norm": 0.5351222157478333, + "learning_rate": 4.112748238308777e-06, + "loss": 2.0282, + "step": 9190 + }, + { + "epoch": 1.1780712284913966, + "grad_norm": 0.5384876728057861, + "learning_rate": 4.106342088404869e-06, + "loss": 2.0419, + "step": 9200 + }, + { + "epoch": 1.1780712284913966, + "eval_loss": 2.0551235675811768, + "eval_runtime": 99.416, + "eval_samples_per_second": 10.059, + "eval_steps_per_second": 5.029, + "step": 9200 + }, + { + "epoch": 1.1793517406962786, + "grad_norm": 0.5739924311637878, + "learning_rate": 4.099935938500962e-06, + "loss": 2.072, + "step": 9210 + }, + { + "epoch": 1.1806322529011606, + "grad_norm": 0.5525854229927063, + "learning_rate": 4.093529788597054e-06, + "loss": 2.0488, + "step": 9220 + }, + { + "epoch": 1.1819127651060424, + "grad_norm": 0.5480990409851074, + "learning_rate": 4.0871236386931455e-06, + "loss": 2.0233, + "step": 9230 + }, + { + "epoch": 1.1831932773109244, + "grad_norm": 0.5385409593582153, + "learning_rate": 4.080717488789238e-06, + "loss": 2.0448, + "step": 9240 + }, + { + "epoch": 1.1844737895158064, + "grad_norm": 0.5837866067886353, + "learning_rate": 4.07431133888533e-06, + "loss": 2.0387, + "step": 9250 + }, + { + "epoch": 1.1857543017206882, + "grad_norm": 0.5853747129440308, + "learning_rate": 4.067905188981423e-06, + "loss": 2.0584, + "step": 9260 + }, + { + "epoch": 1.1870348139255702, + "grad_norm": 0.5508265495300293, + "learning_rate": 4.061499039077515e-06, + "loss": 2.0211, + "step": 9270 + }, + { + "epoch": 1.1883153261304522, + "grad_norm": 0.5867159962654114, + "learning_rate": 4.0550928891736065e-06, + "loss": 2.0016, + "step": 9280 + }, + { + "epoch": 1.1895958383353342, + "grad_norm": 0.5611317157745361, + "learning_rate": 4.0486867392697e-06, + "loss": 2.0549, + "step": 9290 + }, + { + "epoch": 1.190876350540216, + "grad_norm": 0.5700793266296387, + "learning_rate": 4.042280589365791e-06, + "loss": 2.0517, + "step": 9300 + }, + { + "epoch": 1.190876350540216, + "eval_loss": 2.0553061962127686, + "eval_runtime": 100.0, + "eval_samples_per_second": 10.0, + "eval_steps_per_second": 5.0, + "step": 9300 + }, + { + "epoch": 1.192156862745098, + "grad_norm": 0.6431117653846741, + "learning_rate": 4.0358744394618836e-06, + "loss": 2.0306, + "step": 9310 + }, + { + "epoch": 1.19343737494998, + "grad_norm": 0.5513091683387756, + "learning_rate": 4.029468289557976e-06, + "loss": 2.0356, + "step": 9320 + }, + { + "epoch": 1.1947178871548618, + "grad_norm": 0.561978280544281, + "learning_rate": 4.023062139654068e-06, + "loss": 2.0345, + "step": 9330 + }, + { + "epoch": 1.1959983993597438, + "grad_norm": 0.5052829384803772, + "learning_rate": 4.016655989750161e-06, + "loss": 2.0712, + "step": 9340 + }, + { + "epoch": 1.1972789115646258, + "grad_norm": 0.6181693077087402, + "learning_rate": 4.010249839846252e-06, + "loss": 2.0767, + "step": 9350 + }, + { + "epoch": 1.1985594237695079, + "grad_norm": 0.5558566451072693, + "learning_rate": 4.0038436899423446e-06, + "loss": 2.0337, + "step": 9360 + }, + { + "epoch": 1.1998399359743899, + "grad_norm": 0.5830146074295044, + "learning_rate": 3.997437540038438e-06, + "loss": 2.0225, + "step": 9370 + }, + { + "epoch": 1.2011204481792717, + "grad_norm": 0.5830032229423523, + "learning_rate": 3.991031390134529e-06, + "loss": 2.031, + "step": 9380 + }, + { + "epoch": 1.2024009603841537, + "grad_norm": 0.524630606174469, + "learning_rate": 3.984625240230622e-06, + "loss": 2.0641, + "step": 9390 + }, + { + "epoch": 1.2036814725890357, + "grad_norm": 0.5185086131095886, + "learning_rate": 3.978219090326714e-06, + "loss": 2.0459, + "step": 9400 + }, + { + "epoch": 1.2036814725890357, + "eval_loss": 2.0544145107269287, + "eval_runtime": 99.5316, + "eval_samples_per_second": 10.047, + "eval_steps_per_second": 5.024, + "step": 9400 + }, + { + "epoch": 1.2049619847939175, + "grad_norm": 0.6136828660964966, + "learning_rate": 3.971812940422806e-06, + "loss": 2.0247, + "step": 9410 + }, + { + "epoch": 1.2062424969987995, + "grad_norm": 0.5434430837631226, + "learning_rate": 3.965406790518899e-06, + "loss": 2.0359, + "step": 9420 + }, + { + "epoch": 1.2075230092036815, + "grad_norm": 0.5703685283660889, + "learning_rate": 3.95900064061499e-06, + "loss": 2.0055, + "step": 9430 + }, + { + "epoch": 1.2088035214085635, + "grad_norm": 0.5542050004005432, + "learning_rate": 3.952594490711083e-06, + "loss": 2.0288, + "step": 9440 + }, + { + "epoch": 1.2100840336134453, + "grad_norm": 0.5820972919464111, + "learning_rate": 3.946188340807175e-06, + "loss": 2.0562, + "step": 9450 + }, + { + "epoch": 1.2113645458183273, + "grad_norm": 0.6455557346343994, + "learning_rate": 3.939782190903267e-06, + "loss": 2.0401, + "step": 9460 + }, + { + "epoch": 1.2126450580232093, + "grad_norm": 0.6048575639724731, + "learning_rate": 3.93337604099936e-06, + "loss": 2.0594, + "step": 9470 + }, + { + "epoch": 1.2139255702280913, + "grad_norm": 0.5332948565483093, + "learning_rate": 3.926969891095452e-06, + "loss": 2.0354, + "step": 9480 + }, + { + "epoch": 1.2152060824329731, + "grad_norm": 0.5830275416374207, + "learning_rate": 3.9205637411915445e-06, + "loss": 2.0257, + "step": 9490 + }, + { + "epoch": 1.2164865946378551, + "grad_norm": 0.6083182692527771, + "learning_rate": 3.914157591287637e-06, + "loss": 2.0324, + "step": 9500 + }, + { + "epoch": 1.2164865946378551, + "eval_loss": 2.054752826690674, + "eval_runtime": 99.7348, + "eval_samples_per_second": 10.027, + "eval_steps_per_second": 5.013, + "step": 9500 + }, + { + "epoch": 1.2177671068427371, + "grad_norm": 0.6217626929283142, + "learning_rate": 3.907751441383728e-06, + "loss": 1.9986, + "step": 9510 + }, + { + "epoch": 1.2190476190476192, + "grad_norm": 0.5222347974777222, + "learning_rate": 3.901345291479821e-06, + "loss": 1.9824, + "step": 9520 + }, + { + "epoch": 1.220328131252501, + "grad_norm": 0.5925136804580688, + "learning_rate": 3.894939141575913e-06, + "loss": 2.0384, + "step": 9530 + }, + { + "epoch": 1.221608643457383, + "grad_norm": 0.57725989818573, + "learning_rate": 3.8885329916720055e-06, + "loss": 2.0398, + "step": 9540 + }, + { + "epoch": 1.222889155662265, + "grad_norm": 0.5565349459648132, + "learning_rate": 3.882126841768098e-06, + "loss": 2.0592, + "step": 9550 + }, + { + "epoch": 1.2241696678671468, + "grad_norm": 0.6713798642158508, + "learning_rate": 3.87572069186419e-06, + "loss": 2.0382, + "step": 9560 + }, + { + "epoch": 1.2254501800720288, + "grad_norm": 0.5489569902420044, + "learning_rate": 3.869314541960283e-06, + "loss": 2.0315, + "step": 9570 + }, + { + "epoch": 1.2267306922769108, + "grad_norm": 0.6010563373565674, + "learning_rate": 3.862908392056374e-06, + "loss": 2.0423, + "step": 9580 + }, + { + "epoch": 1.2280112044817928, + "grad_norm": 0.5540730357170105, + "learning_rate": 3.8565022421524665e-06, + "loss": 2.05, + "step": 9590 + }, + { + "epoch": 1.2292917166866746, + "grad_norm": 0.5796175003051758, + "learning_rate": 3.850096092248559e-06, + "loss": 1.995, + "step": 9600 + }, + { + "epoch": 1.2292917166866746, + "eval_loss": 2.054189682006836, + "eval_runtime": 99.3115, + "eval_samples_per_second": 10.069, + "eval_steps_per_second": 5.035, + "step": 9600 + }, + { + "epoch": 1.2305722288915566, + "grad_norm": 0.5845100283622742, + "learning_rate": 3.843689942344651e-06, + "loss": 2.0672, + "step": 9610 + }, + { + "epoch": 1.2318527410964386, + "grad_norm": 0.5468641519546509, + "learning_rate": 3.837283792440744e-06, + "loss": 2.0307, + "step": 9620 + }, + { + "epoch": 1.2331332533013206, + "grad_norm": 0.5301446914672852, + "learning_rate": 3.830877642536835e-06, + "loss": 2.0205, + "step": 9630 + }, + { + "epoch": 1.2344137655062024, + "grad_norm": 0.5915380120277405, + "learning_rate": 3.824471492632928e-06, + "loss": 2.0291, + "step": 9640 + }, + { + "epoch": 1.2356942777110844, + "grad_norm": 0.5630759596824646, + "learning_rate": 3.818065342729021e-06, + "loss": 2.0108, + "step": 9650 + }, + { + "epoch": 1.2369747899159664, + "grad_norm": 0.5415170788764954, + "learning_rate": 3.8116591928251122e-06, + "loss": 2.0513, + "step": 9660 + }, + { + "epoch": 1.2382553021208484, + "grad_norm": 0.6158027052879333, + "learning_rate": 3.8052530429212046e-06, + "loss": 2.0158, + "step": 9670 + }, + { + "epoch": 1.2395358143257302, + "grad_norm": 0.6029166579246521, + "learning_rate": 3.7988468930172965e-06, + "loss": 2.0285, + "step": 9680 + }, + { + "epoch": 1.2408163265306122, + "grad_norm": 0.5941691994667053, + "learning_rate": 3.7924407431133893e-06, + "loss": 2.0606, + "step": 9690 + }, + { + "epoch": 1.2420968387354943, + "grad_norm": 0.5230644941329956, + "learning_rate": 3.7860345932094817e-06, + "loss": 2.0413, + "step": 9700 + }, + { + "epoch": 1.2420968387354943, + "eval_loss": 2.054208993911743, + "eval_runtime": 99.4257, + "eval_samples_per_second": 10.058, + "eval_steps_per_second": 5.029, + "step": 9700 + }, + { + "epoch": 1.243377350940376, + "grad_norm": 0.5665590167045593, + "learning_rate": 3.7796284433055736e-06, + "loss": 2.049, + "step": 9710 + }, + { + "epoch": 1.244657863145258, + "grad_norm": 0.5213637351989746, + "learning_rate": 3.773222293401666e-06, + "loss": 2.0367, + "step": 9720 + }, + { + "epoch": 1.24593837535014, + "grad_norm": 0.5417614579200745, + "learning_rate": 3.766816143497758e-06, + "loss": 2.0392, + "step": 9730 + }, + { + "epoch": 1.247218887555022, + "grad_norm": 0.5841271877288818, + "learning_rate": 3.7604099935938503e-06, + "loss": 2.0346, + "step": 9740 + }, + { + "epoch": 1.2484993997599039, + "grad_norm": 0.5251507759094238, + "learning_rate": 3.7540038436899427e-06, + "loss": 2.0386, + "step": 9750 + }, + { + "epoch": 1.2497799119647859, + "grad_norm": 0.5105574727058411, + "learning_rate": 3.7475976937860346e-06, + "loss": 2.0266, + "step": 9760 + }, + { + "epoch": 1.251060424169668, + "grad_norm": 0.5459333062171936, + "learning_rate": 3.7411915438821274e-06, + "loss": 2.0536, + "step": 9770 + }, + { + "epoch": 1.2523409363745497, + "grad_norm": 0.5234960317611694, + "learning_rate": 3.7347853939782194e-06, + "loss": 2.0312, + "step": 9780 + }, + { + "epoch": 1.2536214485794317, + "grad_norm": 0.5144737958908081, + "learning_rate": 3.7283792440743117e-06, + "loss": 2.0374, + "step": 9790 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 0.587037980556488, + "learning_rate": 3.721973094170404e-06, + "loss": 2.0438, + "step": 9800 + }, + { + "epoch": 1.2549019607843137, + "eval_loss": 2.053664445877075, + "eval_runtime": 99.1442, + "eval_samples_per_second": 10.086, + "eval_steps_per_second": 5.043, + "step": 9800 + }, + { + "epoch": 1.2561824729891957, + "grad_norm": 0.6080870628356934, + "learning_rate": 3.715566944266496e-06, + "loss": 2.0679, + "step": 9810 + }, + { + "epoch": 1.2574629851940777, + "grad_norm": 0.6218572854995728, + "learning_rate": 3.7091607943625884e-06, + "loss": 1.9985, + "step": 9820 + }, + { + "epoch": 1.2587434973989595, + "grad_norm": 0.6245348453521729, + "learning_rate": 3.7027546444586804e-06, + "loss": 2.0422, + "step": 9830 + }, + { + "epoch": 1.2600240096038415, + "grad_norm": 0.5775768756866455, + "learning_rate": 3.6963484945547727e-06, + "loss": 2.0326, + "step": 9840 + }, + { + "epoch": 1.2613045218087235, + "grad_norm": 0.6755483746528625, + "learning_rate": 3.6899423446508655e-06, + "loss": 2.0396, + "step": 9850 + }, + { + "epoch": 1.2625850340136053, + "grad_norm": 0.6291812062263489, + "learning_rate": 3.683536194746957e-06, + "loss": 2.0173, + "step": 9860 + }, + { + "epoch": 1.2638655462184873, + "grad_norm": 0.5336599946022034, + "learning_rate": 3.67713004484305e-06, + "loss": 2.0217, + "step": 9870 + }, + { + "epoch": 1.2651460584233694, + "grad_norm": 0.6076573133468628, + "learning_rate": 3.6707238949391418e-06, + "loss": 2.0252, + "step": 9880 + }, + { + "epoch": 1.2664265706282514, + "grad_norm": 0.5939626097679138, + "learning_rate": 3.664317745035234e-06, + "loss": 2.0559, + "step": 9890 + }, + { + "epoch": 1.2677070828331334, + "grad_norm": 0.552836537361145, + "learning_rate": 3.6579115951313265e-06, + "loss": 2.0234, + "step": 9900 + }, + { + "epoch": 1.2677070828331334, + "eval_loss": 2.0530409812927246, + "eval_runtime": 99.1336, + "eval_samples_per_second": 10.087, + "eval_steps_per_second": 5.044, + "step": 9900 + }, + { + "epoch": 1.2689875950380152, + "grad_norm": 0.5579429268836975, + "learning_rate": 3.6515054452274185e-06, + "loss": 2.0606, + "step": 9910 + }, + { + "epoch": 1.2702681072428972, + "grad_norm": 0.6526882648468018, + "learning_rate": 3.645099295323511e-06, + "loss": 2.0306, + "step": 9920 + }, + { + "epoch": 1.2715486194477792, + "grad_norm": 0.5143726468086243, + "learning_rate": 3.638693145419603e-06, + "loss": 2.0193, + "step": 9930 + }, + { + "epoch": 1.272829131652661, + "grad_norm": 0.5656778812408447, + "learning_rate": 3.632286995515695e-06, + "loss": 2.0027, + "step": 9940 + }, + { + "epoch": 1.274109643857543, + "grad_norm": 0.5471328496932983, + "learning_rate": 3.625880845611788e-06, + "loss": 2.0292, + "step": 9950 + }, + { + "epoch": 1.275390156062425, + "grad_norm": 0.5603408813476562, + "learning_rate": 3.61947469570788e-06, + "loss": 2.0045, + "step": 9960 + }, + { + "epoch": 1.276670668267307, + "grad_norm": 0.47398802638053894, + "learning_rate": 3.6130685458039722e-06, + "loss": 2.0263, + "step": 9970 + }, + { + "epoch": 1.2779511804721888, + "grad_norm": 0.539801299571991, + "learning_rate": 3.6066623959000646e-06, + "loss": 2.0117, + "step": 9980 + }, + { + "epoch": 1.2792316926770708, + "grad_norm": 0.5288535356521606, + "learning_rate": 3.6002562459961565e-06, + "loss": 2.0066, + "step": 9990 + }, + { + "epoch": 1.2805122048819528, + "grad_norm": 0.608700156211853, + "learning_rate": 3.593850096092249e-06, + "loss": 2.0319, + "step": 10000 + }, + { + "epoch": 1.2805122048819528, + "eval_loss": 2.0524823665618896, + "eval_runtime": 99.123, + "eval_samples_per_second": 10.088, + "eval_steps_per_second": 5.044, + "step": 10000 + }, + { + "epoch": 1.2817927170868346, + "grad_norm": 0.54178386926651, + "learning_rate": 3.587443946188341e-06, + "loss": 2.0233, + "step": 10010 + }, + { + "epoch": 1.2830732292917166, + "grad_norm": 0.6180025935173035, + "learning_rate": 3.5810377962844332e-06, + "loss": 2.0276, + "step": 10020 + }, + { + "epoch": 1.2843537414965986, + "grad_norm": 0.5264298915863037, + "learning_rate": 3.574631646380526e-06, + "loss": 2.0516, + "step": 10030 + }, + { + "epoch": 1.2856342537014807, + "grad_norm": 0.604239821434021, + "learning_rate": 3.568225496476618e-06, + "loss": 2.0007, + "step": 10040 + }, + { + "epoch": 1.2869147659063627, + "grad_norm": 0.7111391425132751, + "learning_rate": 3.5618193465727103e-06, + "loss": 2.0468, + "step": 10050 + }, + { + "epoch": 1.2881952781112445, + "grad_norm": 0.6468245387077332, + "learning_rate": 3.5554131966688023e-06, + "loss": 2.0405, + "step": 10060 + }, + { + "epoch": 1.2894757903161265, + "grad_norm": 0.5630547404289246, + "learning_rate": 3.5490070467648946e-06, + "loss": 2.0402, + "step": 10070 + }, + { + "epoch": 1.2907563025210085, + "grad_norm": 0.5179298520088196, + "learning_rate": 3.542600896860987e-06, + "loss": 1.987, + "step": 10080 + }, + { + "epoch": 1.2920368147258903, + "grad_norm": 0.5753514766693115, + "learning_rate": 3.536194746957079e-06, + "loss": 2.0206, + "step": 10090 + }, + { + "epoch": 1.2933173269307723, + "grad_norm": 0.5431790947914124, + "learning_rate": 3.5297885970531713e-06, + "loss": 2.0548, + "step": 10100 + }, + { + "epoch": 1.2933173269307723, + "eval_loss": 2.0521152019500732, + "eval_runtime": 99.0994, + "eval_samples_per_second": 10.091, + "eval_steps_per_second": 5.045, + "step": 10100 + }, + { + "epoch": 1.2945978391356543, + "grad_norm": 0.5497155785560608, + "learning_rate": 3.5233824471492633e-06, + "loss": 2.0285, + "step": 10110 + }, + { + "epoch": 1.2958783513405363, + "grad_norm": 0.6134312748908997, + "learning_rate": 3.5169762972453556e-06, + "loss": 2.0303, + "step": 10120 + }, + { + "epoch": 1.297158863545418, + "grad_norm": 0.5417413711547852, + "learning_rate": 3.5105701473414484e-06, + "loss": 2.0536, + "step": 10130 + }, + { + "epoch": 1.2984393757503, + "grad_norm": 0.5879444479942322, + "learning_rate": 3.5041639974375404e-06, + "loss": 2.038, + "step": 10140 + }, + { + "epoch": 1.2997198879551821, + "grad_norm": 0.5949389338493347, + "learning_rate": 3.4977578475336327e-06, + "loss": 2.0338, + "step": 10150 + }, + { + "epoch": 1.301000400160064, + "grad_norm": 0.5224178433418274, + "learning_rate": 3.4913516976297247e-06, + "loss": 2.0411, + "step": 10160 + }, + { + "epoch": 1.302280912364946, + "grad_norm": 0.5205852389335632, + "learning_rate": 3.484945547725817e-06, + "loss": 2.0258, + "step": 10170 + }, + { + "epoch": 1.303561424569828, + "grad_norm": 0.5723111629486084, + "learning_rate": 3.4785393978219094e-06, + "loss": 2.0226, + "step": 10180 + }, + { + "epoch": 1.30484193677471, + "grad_norm": 0.5134619474411011, + "learning_rate": 3.4721332479180014e-06, + "loss": 2.0444, + "step": 10190 + }, + { + "epoch": 1.306122448979592, + "grad_norm": 0.5348407626152039, + "learning_rate": 3.4657270980140937e-06, + "loss": 2.0145, + "step": 10200 + }, + { + "epoch": 1.306122448979592, + "eval_loss": 2.051806926727295, + "eval_runtime": 99.149, + "eval_samples_per_second": 10.086, + "eval_steps_per_second": 5.043, + "step": 10200 + }, + { + "epoch": 1.3074029611844737, + "grad_norm": 0.5550008416175842, + "learning_rate": 3.4593209481101857e-06, + "loss": 2.0606, + "step": 10210 + }, + { + "epoch": 1.3086834733893558, + "grad_norm": 0.5189396142959595, + "learning_rate": 3.4529147982062785e-06, + "loss": 2.034, + "step": 10220 + }, + { + "epoch": 1.3099639855942378, + "grad_norm": 0.5279061794281006, + "learning_rate": 3.446508648302371e-06, + "loss": 2.0319, + "step": 10230 + }, + { + "epoch": 1.3112444977991196, + "grad_norm": 0.5703163146972656, + "learning_rate": 3.4401024983984628e-06, + "loss": 2.0314, + "step": 10240 + }, + { + "epoch": 1.3125250100040016, + "grad_norm": 0.6252114176750183, + "learning_rate": 3.433696348494555e-06, + "loss": 1.993, + "step": 10250 + }, + { + "epoch": 1.3138055222088836, + "grad_norm": 0.5590977072715759, + "learning_rate": 3.427290198590647e-06, + "loss": 2.0505, + "step": 10260 + }, + { + "epoch": 1.3150860344137656, + "grad_norm": 0.5424992442131042, + "learning_rate": 3.4208840486867395e-06, + "loss": 2.0414, + "step": 10270 + }, + { + "epoch": 1.3163665466186474, + "grad_norm": 0.5849244594573975, + "learning_rate": 3.414477898782832e-06, + "loss": 2.0071, + "step": 10280 + }, + { + "epoch": 1.3176470588235294, + "grad_norm": 0.5255408883094788, + "learning_rate": 3.4080717488789238e-06, + "loss": 2.0389, + "step": 10290 + }, + { + "epoch": 1.3189275710284114, + "grad_norm": 0.4956037104129791, + "learning_rate": 3.4016655989750166e-06, + "loss": 2.0287, + "step": 10300 + }, + { + "epoch": 1.3189275710284114, + "eval_loss": 2.051372528076172, + "eval_runtime": 99.1664, + "eval_samples_per_second": 10.084, + "eval_steps_per_second": 5.042, + "step": 10300 + }, + { + "epoch": 1.3202080832332932, + "grad_norm": 0.4993341863155365, + "learning_rate": 3.395259449071108e-06, + "loss": 2.026, + "step": 10310 + }, + { + "epoch": 1.3214885954381752, + "grad_norm": 0.5299360156059265, + "learning_rate": 3.388853299167201e-06, + "loss": 2.042, + "step": 10320 + }, + { + "epoch": 1.3227691076430572, + "grad_norm": 0.5740581750869751, + "learning_rate": 3.3824471492632932e-06, + "loss": 2.0569, + "step": 10330 + }, + { + "epoch": 1.3240496198479392, + "grad_norm": 0.5364329218864441, + "learning_rate": 3.376040999359385e-06, + "loss": 2.0324, + "step": 10340 + }, + { + "epoch": 1.3253301320528212, + "grad_norm": 0.6236289143562317, + "learning_rate": 3.3696348494554776e-06, + "loss": 2.031, + "step": 10350 + }, + { + "epoch": 1.326610644257703, + "grad_norm": 0.5699196457862854, + "learning_rate": 3.36322869955157e-06, + "loss": 2.0305, + "step": 10360 + }, + { + "epoch": 1.327891156462585, + "grad_norm": 0.5235342383384705, + "learning_rate": 3.356822549647662e-06, + "loss": 2.0462, + "step": 10370 + }, + { + "epoch": 1.329171668667467, + "grad_norm": 0.529191255569458, + "learning_rate": 3.3504163997437542e-06, + "loss": 2.0034, + "step": 10380 + }, + { + "epoch": 1.3304521808723488, + "grad_norm": 0.5550540089607239, + "learning_rate": 3.344010249839846e-06, + "loss": 2.0158, + "step": 10390 + }, + { + "epoch": 1.3317326930772309, + "grad_norm": 0.520746111869812, + "learning_rate": 3.337604099935939e-06, + "loss": 2.0009, + "step": 10400 + }, + { + "epoch": 1.3317326930772309, + "eval_loss": 2.0515472888946533, + "eval_runtime": 99.0648, + "eval_samples_per_second": 10.094, + "eval_steps_per_second": 5.047, + "step": 10400 + }, + { + "epoch": 1.3330132052821129, + "grad_norm": 0.5105487704277039, + "learning_rate": 3.3311979500320313e-06, + "loss": 2.0296, + "step": 10410 + }, + { + "epoch": 1.3342937174869949, + "grad_norm": 0.5224109292030334, + "learning_rate": 3.3247918001281233e-06, + "loss": 2.0376, + "step": 10420 + }, + { + "epoch": 1.3355742296918767, + "grad_norm": 0.5944901704788208, + "learning_rate": 3.3183856502242157e-06, + "loss": 2.059, + "step": 10430 + }, + { + "epoch": 1.3368547418967587, + "grad_norm": 0.5145648121833801, + "learning_rate": 3.3119795003203076e-06, + "loss": 2.0578, + "step": 10440 + }, + { + "epoch": 1.3381352541016407, + "grad_norm": 0.617449939250946, + "learning_rate": 3.3055733504164e-06, + "loss": 2.0399, + "step": 10450 + }, + { + "epoch": 1.3394157663065225, + "grad_norm": 0.5237758159637451, + "learning_rate": 3.2991672005124923e-06, + "loss": 2.0474, + "step": 10460 + }, + { + "epoch": 1.3406962785114045, + "grad_norm": 0.5313665270805359, + "learning_rate": 3.2927610506085843e-06, + "loss": 2.0433, + "step": 10470 + }, + { + "epoch": 1.3419767907162865, + "grad_norm": 0.6537880301475525, + "learning_rate": 3.286354900704677e-06, + "loss": 1.995, + "step": 10480 + }, + { + "epoch": 1.3432573029211685, + "grad_norm": 0.6162858009338379, + "learning_rate": 3.2799487508007686e-06, + "loss": 2.0267, + "step": 10490 + }, + { + "epoch": 1.3445378151260505, + "grad_norm": 0.5398708581924438, + "learning_rate": 3.2735426008968614e-06, + "loss": 2.0481, + "step": 10500 + }, + { + "epoch": 1.3445378151260505, + "eval_loss": 2.0509071350097656, + "eval_runtime": 98.9856, + "eval_samples_per_second": 10.102, + "eval_steps_per_second": 5.051, + "step": 10500 + }, + { + "epoch": 1.3458183273309323, + "grad_norm": 0.5513540506362915, + "learning_rate": 3.2671364509929538e-06, + "loss": 2.0313, + "step": 10510 + }, + { + "epoch": 1.3470988395358143, + "grad_norm": 0.5159798860549927, + "learning_rate": 3.2607303010890457e-06, + "loss": 2.0131, + "step": 10520 + }, + { + "epoch": 1.3483793517406963, + "grad_norm": 0.6065656542778015, + "learning_rate": 3.254324151185138e-06, + "loss": 2.0335, + "step": 10530 + }, + { + "epoch": 1.3496598639455781, + "grad_norm": 0.6260432600975037, + "learning_rate": 3.24791800128123e-06, + "loss": 2.0343, + "step": 10540 + }, + { + "epoch": 1.3509403761504601, + "grad_norm": 0.581193745136261, + "learning_rate": 3.2415118513773224e-06, + "loss": 2.045, + "step": 10550 + }, + { + "epoch": 1.3522208883553422, + "grad_norm": 0.5331265926361084, + "learning_rate": 3.235105701473415e-06, + "loss": 2.0505, + "step": 10560 + }, + { + "epoch": 1.3535014005602242, + "grad_norm": 0.6132077574729919, + "learning_rate": 3.2286995515695067e-06, + "loss": 2.0658, + "step": 10570 + }, + { + "epoch": 1.3547819127651062, + "grad_norm": 0.5657082796096802, + "learning_rate": 3.2222934016655995e-06, + "loss": 2.0355, + "step": 10580 + }, + { + "epoch": 1.356062424969988, + "grad_norm": 0.526736319065094, + "learning_rate": 3.2158872517616914e-06, + "loss": 2.0322, + "step": 10590 + }, + { + "epoch": 1.35734293717487, + "grad_norm": 0.5426225662231445, + "learning_rate": 3.209481101857784e-06, + "loss": 2.0405, + "step": 10600 + }, + { + "epoch": 1.35734293717487, + "eval_loss": 2.0509629249572754, + "eval_runtime": 98.9708, + "eval_samples_per_second": 10.104, + "eval_steps_per_second": 5.052, + "step": 10600 + }, + { + "epoch": 1.3586234493797518, + "grad_norm": 0.5064254403114319, + "learning_rate": 3.203074951953876e-06, + "loss": 2.0416, + "step": 10610 + }, + { + "epoch": 1.3599039615846338, + "grad_norm": 0.5440548062324524, + "learning_rate": 3.196668802049968e-06, + "loss": 2.0526, + "step": 10620 + }, + { + "epoch": 1.3611844737895158, + "grad_norm": 0.58577960729599, + "learning_rate": 3.1902626521460605e-06, + "loss": 2.0292, + "step": 10630 + }, + { + "epoch": 1.3624649859943978, + "grad_norm": 0.5705699324607849, + "learning_rate": 3.1838565022421524e-06, + "loss": 2.042, + "step": 10640 + }, + { + "epoch": 1.3637454981992798, + "grad_norm": 0.5341757535934448, + "learning_rate": 3.1774503523382448e-06, + "loss": 2.0116, + "step": 10650 + }, + { + "epoch": 1.3650260104041616, + "grad_norm": 0.6056538820266724, + "learning_rate": 3.1710442024343376e-06, + "loss": 2.0721, + "step": 10660 + }, + { + "epoch": 1.3663065226090436, + "grad_norm": 0.4826757311820984, + "learning_rate": 3.1646380525304295e-06, + "loss": 2.0463, + "step": 10670 + }, + { + "epoch": 1.3675870348139256, + "grad_norm": 0.5867482423782349, + "learning_rate": 3.158231902626522e-06, + "loss": 2.028, + "step": 10680 + }, + { + "epoch": 1.3688675470188074, + "grad_norm": 0.5410043597221375, + "learning_rate": 3.151825752722614e-06, + "loss": 2.0253, + "step": 10690 + }, + { + "epoch": 1.3701480592236894, + "grad_norm": 0.5709148049354553, + "learning_rate": 3.145419602818706e-06, + "loss": 2.0292, + "step": 10700 + }, + { + "epoch": 1.3701480592236894, + "eval_loss": 2.0503604412078857, + "eval_runtime": 98.9792, + "eval_samples_per_second": 10.103, + "eval_steps_per_second": 5.052, + "step": 10700 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 0.5207972526550293, + "learning_rate": 3.1390134529147986e-06, + "loss": 2.0498, + "step": 10710 + }, + { + "epoch": 1.3727090836334535, + "grad_norm": 0.5460594892501831, + "learning_rate": 3.1326073030108905e-06, + "loss": 2.0332, + "step": 10720 + }, + { + "epoch": 1.3739895958383355, + "grad_norm": 0.5334863066673279, + "learning_rate": 3.126201153106983e-06, + "loss": 2.0316, + "step": 10730 + }, + { + "epoch": 1.3752701080432173, + "grad_norm": 0.537344217300415, + "learning_rate": 3.119795003203075e-06, + "loss": 2.0213, + "step": 10740 + }, + { + "epoch": 1.3765506202480993, + "grad_norm": 0.5321835279464722, + "learning_rate": 3.113388853299167e-06, + "loss": 2.0214, + "step": 10750 + }, + { + "epoch": 1.377831132452981, + "grad_norm": 0.5158102512359619, + "learning_rate": 3.10698270339526e-06, + "loss": 2.0267, + "step": 10760 + }, + { + "epoch": 1.379111644657863, + "grad_norm": 0.5449768900871277, + "learning_rate": 3.100576553491352e-06, + "loss": 2.0605, + "step": 10770 + }, + { + "epoch": 1.380392156862745, + "grad_norm": 0.6002796292304993, + "learning_rate": 3.0941704035874443e-06, + "loss": 2.0518, + "step": 10780 + }, + { + "epoch": 1.381672669067627, + "grad_norm": 0.6357347965240479, + "learning_rate": 3.0877642536835367e-06, + "loss": 2.0349, + "step": 10790 + }, + { + "epoch": 1.382953181272509, + "grad_norm": 0.5326115489006042, + "learning_rate": 3.0813581037796286e-06, + "loss": 2.0338, + "step": 10800 + }, + { + "epoch": 1.382953181272509, + "eval_loss": 2.0504214763641357, + "eval_runtime": 98.5425, + "eval_samples_per_second": 10.148, + "eval_steps_per_second": 5.074, + "step": 10800 + }, + { + "epoch": 1.384233693477391, + "grad_norm": 0.6251921653747559, + "learning_rate": 3.074951953875721e-06, + "loss": 2.0001, + "step": 10810 + }, + { + "epoch": 1.385514205682273, + "grad_norm": 0.5165422558784485, + "learning_rate": 3.068545803971813e-06, + "loss": 2.0127, + "step": 10820 + }, + { + "epoch": 1.386794717887155, + "grad_norm": 0.5653632879257202, + "learning_rate": 3.0621396540679053e-06, + "loss": 2.0206, + "step": 10830 + }, + { + "epoch": 1.3880752300920367, + "grad_norm": 0.6335858106613159, + "learning_rate": 3.055733504163998e-06, + "loss": 2.0242, + "step": 10840 + }, + { + "epoch": 1.3893557422969187, + "grad_norm": 0.5169991254806519, + "learning_rate": 3.04932735426009e-06, + "loss": 1.9994, + "step": 10850 + }, + { + "epoch": 1.3906362545018007, + "grad_norm": 0.5450325012207031, + "learning_rate": 3.0429212043561824e-06, + "loss": 2.0271, + "step": 10860 + }, + { + "epoch": 1.3919167667066827, + "grad_norm": 0.567139744758606, + "learning_rate": 3.0365150544522743e-06, + "loss": 2.0143, + "step": 10870 + }, + { + "epoch": 1.3931972789115648, + "grad_norm": 0.5426153540611267, + "learning_rate": 3.0301089045483667e-06, + "loss": 2.0127, + "step": 10880 + }, + { + "epoch": 1.3944777911164465, + "grad_norm": 0.5378096103668213, + "learning_rate": 3.023702754644459e-06, + "loss": 1.9883, + "step": 10890 + }, + { + "epoch": 1.3957583033213286, + "grad_norm": 0.561470627784729, + "learning_rate": 3.017296604740551e-06, + "loss": 2.023, + "step": 10900 + }, + { + "epoch": 1.3957583033213286, + "eval_loss": 2.049633026123047, + "eval_runtime": 98.5354, + "eval_samples_per_second": 10.149, + "eval_steps_per_second": 5.074, + "step": 10900 + }, + { + "epoch": 1.3970388155262106, + "grad_norm": 0.5516214966773987, + "learning_rate": 3.0108904548366434e-06, + "loss": 2.0643, + "step": 10910 + }, + { + "epoch": 1.3983193277310924, + "grad_norm": 0.4827578365802765, + "learning_rate": 3.0044843049327353e-06, + "loss": 2.0247, + "step": 10920 + }, + { + "epoch": 1.3995998399359744, + "grad_norm": 0.5677744746208191, + "learning_rate": 2.998078155028828e-06, + "loss": 2.0615, + "step": 10930 + }, + { + "epoch": 1.4008803521408564, + "grad_norm": 0.5234193801879883, + "learning_rate": 2.9916720051249205e-06, + "loss": 2.0143, + "step": 10940 + }, + { + "epoch": 1.4021608643457384, + "grad_norm": 0.5785296559333801, + "learning_rate": 2.9852658552210124e-06, + "loss": 2.0158, + "step": 10950 + }, + { + "epoch": 1.4034413765506202, + "grad_norm": 0.5425563454627991, + "learning_rate": 2.978859705317105e-06, + "loss": 2.0083, + "step": 10960 + }, + { + "epoch": 1.4047218887555022, + "grad_norm": 0.5265486240386963, + "learning_rate": 2.9724535554131967e-06, + "loss": 2.0602, + "step": 10970 + }, + { + "epoch": 1.4060024009603842, + "grad_norm": 0.485811710357666, + "learning_rate": 2.966047405509289e-06, + "loss": 1.9982, + "step": 10980 + }, + { + "epoch": 1.407282913165266, + "grad_norm": 0.5507806539535522, + "learning_rate": 2.9596412556053815e-06, + "loss": 2.0149, + "step": 10990 + }, + { + "epoch": 1.408563425370148, + "grad_norm": 0.5665221810340881, + "learning_rate": 2.9532351057014734e-06, + "loss": 2.047, + "step": 11000 + }, + { + "epoch": 1.408563425370148, + "eval_loss": 2.049713134765625, + "eval_runtime": 99.4377, + "eval_samples_per_second": 10.057, + "eval_steps_per_second": 5.028, + "step": 11000 + }, + { + "epoch": 1.40984393757503, + "grad_norm": 0.5265170335769653, + "learning_rate": 2.946828955797566e-06, + "loss": 2.0403, + "step": 11010 + }, + { + "epoch": 1.411124449779912, + "grad_norm": 0.6089005470275879, + "learning_rate": 2.9404228058936577e-06, + "loss": 1.9957, + "step": 11020 + }, + { + "epoch": 1.412404961984794, + "grad_norm": 0.6830588579177856, + "learning_rate": 2.9340166559897505e-06, + "loss": 1.9934, + "step": 11030 + }, + { + "epoch": 1.4136854741896758, + "grad_norm": 0.6724534630775452, + "learning_rate": 2.927610506085843e-06, + "loss": 2.0147, + "step": 11040 + }, + { + "epoch": 1.4149659863945578, + "grad_norm": 0.49919191002845764, + "learning_rate": 2.921204356181935e-06, + "loss": 2.0375, + "step": 11050 + }, + { + "epoch": 1.4162464985994399, + "grad_norm": 0.6525809168815613, + "learning_rate": 2.9147982062780272e-06, + "loss": 2.0364, + "step": 11060 + }, + { + "epoch": 1.4175270108043216, + "grad_norm": 0.5216648578643799, + "learning_rate": 2.908392056374119e-06, + "loss": 2.0526, + "step": 11070 + }, + { + "epoch": 1.4188075230092037, + "grad_norm": 0.5448505282402039, + "learning_rate": 2.9019859064702115e-06, + "loss": 1.9872, + "step": 11080 + }, + { + "epoch": 1.4200880352140857, + "grad_norm": 0.5219195485115051, + "learning_rate": 2.895579756566304e-06, + "loss": 2.0573, + "step": 11090 + }, + { + "epoch": 1.4213685474189677, + "grad_norm": 0.5638669729232788, + "learning_rate": 2.889173606662396e-06, + "loss": 2.0511, + "step": 11100 + }, + { + "epoch": 1.4213685474189677, + "eval_loss": 2.049281597137451, + "eval_runtime": 99.6036, + "eval_samples_per_second": 10.04, + "eval_steps_per_second": 5.02, + "step": 11100 + }, + { + "epoch": 1.4226490596238495, + "grad_norm": 0.513096272945404, + "learning_rate": 2.8827674567584886e-06, + "loss": 2.0564, + "step": 11110 + }, + { + "epoch": 1.4239295718287315, + "grad_norm": 0.5226659774780273, + "learning_rate": 2.8763613068545806e-06, + "loss": 2.0418, + "step": 11120 + }, + { + "epoch": 1.4252100840336135, + "grad_norm": 0.5489218831062317, + "learning_rate": 2.869955156950673e-06, + "loss": 2.0133, + "step": 11130 + }, + { + "epoch": 1.4264905962384953, + "grad_norm": 0.5376583337783813, + "learning_rate": 2.8635490070467653e-06, + "loss": 2.0261, + "step": 11140 + }, + { + "epoch": 1.4277711084433773, + "grad_norm": 0.5678580403327942, + "learning_rate": 2.8571428571428573e-06, + "loss": 2.0399, + "step": 11150 + }, + { + "epoch": 1.4290516206482593, + "grad_norm": 0.541229784488678, + "learning_rate": 2.8507367072389496e-06, + "loss": 2.0245, + "step": 11160 + }, + { + "epoch": 1.4303321328531413, + "grad_norm": 0.5533165335655212, + "learning_rate": 2.8443305573350416e-06, + "loss": 2.0679, + "step": 11170 + }, + { + "epoch": 1.4316126450580233, + "grad_norm": 0.5560323596000671, + "learning_rate": 2.837924407431134e-06, + "loss": 2.04, + "step": 11180 + }, + { + "epoch": 1.4328931572629051, + "grad_norm": 0.4824937880039215, + "learning_rate": 2.8315182575272267e-06, + "loss": 2.0237, + "step": 11190 + }, + { + "epoch": 1.4341736694677871, + "grad_norm": 0.5155817866325378, + "learning_rate": 2.8251121076233182e-06, + "loss": 2.007, + "step": 11200 + }, + { + "epoch": 1.4341736694677871, + "eval_loss": 2.0490078926086426, + "eval_runtime": 99.8932, + "eval_samples_per_second": 10.011, + "eval_steps_per_second": 5.005, + "step": 11200 + }, + { + "epoch": 1.4354541816726691, + "grad_norm": 0.5290130376815796, + "learning_rate": 2.818705957719411e-06, + "loss": 1.9982, + "step": 11210 + }, + { + "epoch": 1.436734693877551, + "grad_norm": 0.49369412660598755, + "learning_rate": 2.8122998078155034e-06, + "loss": 2.0366, + "step": 11220 + }, + { + "epoch": 1.438015206082433, + "grad_norm": 0.5680967569351196, + "learning_rate": 2.8058936579115954e-06, + "loss": 2.0296, + "step": 11230 + }, + { + "epoch": 1.439295718287315, + "grad_norm": 0.5298722386360168, + "learning_rate": 2.7994875080076877e-06, + "loss": 2.0521, + "step": 11240 + }, + { + "epoch": 1.440576230492197, + "grad_norm": 0.49635031819343567, + "learning_rate": 2.7930813581037797e-06, + "loss": 2.0323, + "step": 11250 + }, + { + "epoch": 1.4418567426970788, + "grad_norm": 0.5607994198799133, + "learning_rate": 2.786675208199872e-06, + "loss": 2.0054, + "step": 11260 + }, + { + "epoch": 1.4431372549019608, + "grad_norm": 0.47166672348976135, + "learning_rate": 2.7802690582959644e-06, + "loss": 2.0273, + "step": 11270 + }, + { + "epoch": 1.4444177671068428, + "grad_norm": 0.5480349063873291, + "learning_rate": 2.7738629083920563e-06, + "loss": 2.0768, + "step": 11280 + }, + { + "epoch": 1.4456982793117246, + "grad_norm": 0.5854456424713135, + "learning_rate": 2.767456758488149e-06, + "loss": 2.0255, + "step": 11290 + }, + { + "epoch": 1.4469787915166066, + "grad_norm": 0.4981571137905121, + "learning_rate": 2.761050608584241e-06, + "loss": 2.0187, + "step": 11300 + }, + { + "epoch": 1.4469787915166066, + "eval_loss": 2.048654317855835, + "eval_runtime": 99.9023, + "eval_samples_per_second": 10.01, + "eval_steps_per_second": 5.005, + "step": 11300 + }, + { + "epoch": 1.4482593037214886, + "grad_norm": 0.5446431636810303, + "learning_rate": 2.7546444586803334e-06, + "loss": 2.0112, + "step": 11310 + }, + { + "epoch": 1.4495398159263706, + "grad_norm": 0.634131133556366, + "learning_rate": 2.748238308776426e-06, + "loss": 2.01, + "step": 11320 + }, + { + "epoch": 1.4508203281312526, + "grad_norm": 0.5008243322372437, + "learning_rate": 2.7418321588725178e-06, + "loss": 2.0596, + "step": 11330 + }, + { + "epoch": 1.4521008403361344, + "grad_norm": 0.5615090131759644, + "learning_rate": 2.73542600896861e-06, + "loss": 2.0458, + "step": 11340 + }, + { + "epoch": 1.4533813525410164, + "grad_norm": 0.5126698017120361, + "learning_rate": 2.729019859064702e-06, + "loss": 2.0417, + "step": 11350 + }, + { + "epoch": 1.4546618647458984, + "grad_norm": 0.5614376664161682, + "learning_rate": 2.7226137091607944e-06, + "loss": 2.0055, + "step": 11360 + }, + { + "epoch": 1.4559423769507802, + "grad_norm": 0.5798308849334717, + "learning_rate": 2.7162075592568872e-06, + "loss": 2.0426, + "step": 11370 + }, + { + "epoch": 1.4572228891556622, + "grad_norm": 0.5568918585777283, + "learning_rate": 2.709801409352979e-06, + "loss": 2.0319, + "step": 11380 + }, + { + "epoch": 1.4585034013605442, + "grad_norm": 0.5090416073799133, + "learning_rate": 2.7033952594490715e-06, + "loss": 2.0174, + "step": 11390 + }, + { + "epoch": 1.4597839135654262, + "grad_norm": 0.5406888127326965, + "learning_rate": 2.6969891095451635e-06, + "loss": 2.0338, + "step": 11400 + }, + { + "epoch": 1.4597839135654262, + "eval_loss": 2.0484492778778076, + "eval_runtime": 99.8004, + "eval_samples_per_second": 10.02, + "eval_steps_per_second": 5.01, + "step": 11400 + }, + { + "epoch": 1.4610644257703083, + "grad_norm": 0.5530595183372498, + "learning_rate": 2.690582959641256e-06, + "loss": 2.0025, + "step": 11410 + }, + { + "epoch": 1.46234493797519, + "grad_norm": 0.47053778171539307, + "learning_rate": 2.6841768097373482e-06, + "loss": 2.0359, + "step": 11420 + }, + { + "epoch": 1.463625450180072, + "grad_norm": 0.5242395997047424, + "learning_rate": 2.67777065983344e-06, + "loss": 2.0162, + "step": 11430 + }, + { + "epoch": 1.4649059623849539, + "grad_norm": 0.5642728805541992, + "learning_rate": 2.6713645099295325e-06, + "loss": 2.0372, + "step": 11440 + }, + { + "epoch": 1.4661864745898359, + "grad_norm": 0.5605606436729431, + "learning_rate": 2.6649583600256245e-06, + "loss": 2.0256, + "step": 11450 + }, + { + "epoch": 1.4674669867947179, + "grad_norm": 0.49169278144836426, + "learning_rate": 2.658552210121717e-06, + "loss": 2.0293, + "step": 11460 + }, + { + "epoch": 1.4687474989995999, + "grad_norm": 0.5536981821060181, + "learning_rate": 2.6521460602178096e-06, + "loss": 2.0429, + "step": 11470 + }, + { + "epoch": 1.470028011204482, + "grad_norm": 0.48586413264274597, + "learning_rate": 2.6457399103139016e-06, + "loss": 2.0335, + "step": 11480 + }, + { + "epoch": 1.4713085234093637, + "grad_norm": 0.5224795341491699, + "learning_rate": 2.639333760409994e-06, + "loss": 2.0395, + "step": 11490 + }, + { + "epoch": 1.4725890356142457, + "grad_norm": 0.49194952845573425, + "learning_rate": 2.632927610506086e-06, + "loss": 2.0357, + "step": 11500 + }, + { + "epoch": 1.4725890356142457, + "eval_loss": 2.0482566356658936, + "eval_runtime": 99.9705, + "eval_samples_per_second": 10.003, + "eval_steps_per_second": 5.001, + "step": 11500 + }, + { + "epoch": 1.4738695478191277, + "grad_norm": 0.5626738667488098, + "learning_rate": 2.6265214606021783e-06, + "loss": 2.0273, + "step": 11510 + }, + { + "epoch": 1.4751500600240095, + "grad_norm": 0.5302162170410156, + "learning_rate": 2.6201153106982706e-06, + "loss": 2.0381, + "step": 11520 + }, + { + "epoch": 1.4764305722288915, + "grad_norm": 0.4928155839443207, + "learning_rate": 2.6137091607943626e-06, + "loss": 1.9951, + "step": 11530 + }, + { + "epoch": 1.4777110844337735, + "grad_norm": 0.5598331093788147, + "learning_rate": 2.607303010890455e-06, + "loss": 1.9921, + "step": 11540 + }, + { + "epoch": 1.4789915966386555, + "grad_norm": 0.5367702841758728, + "learning_rate": 2.600896860986547e-06, + "loss": 2.0363, + "step": 11550 + }, + { + "epoch": 1.4802721088435375, + "grad_norm": 0.5699712038040161, + "learning_rate": 2.5944907110826397e-06, + "loss": 2.0159, + "step": 11560 + }, + { + "epoch": 1.4815526210484193, + "grad_norm": 0.5477683544158936, + "learning_rate": 2.588084561178732e-06, + "loss": 2.0371, + "step": 11570 + }, + { + "epoch": 1.4828331332533013, + "grad_norm": 0.558222770690918, + "learning_rate": 2.581678411274824e-06, + "loss": 2.029, + "step": 11580 + }, + { + "epoch": 1.4841136454581831, + "grad_norm": 0.5154983997344971, + "learning_rate": 2.5752722613709164e-06, + "loss": 1.9831, + "step": 11590 + }, + { + "epoch": 1.4853941576630652, + "grad_norm": 0.5119442939758301, + "learning_rate": 2.5688661114670083e-06, + "loss": 2.0158, + "step": 11600 + }, + { + "epoch": 1.4853941576630652, + "eval_loss": 2.0478100776672363, + "eval_runtime": 100.2631, + "eval_samples_per_second": 9.974, + "eval_steps_per_second": 4.987, + "step": 11600 + }, + { + "epoch": 1.4866746698679472, + "grad_norm": 0.5495085716247559, + "learning_rate": 2.5624599615631007e-06, + "loss": 2.0299, + "step": 11610 + }, + { + "epoch": 1.4879551820728292, + "grad_norm": 0.5720316767692566, + "learning_rate": 2.556053811659193e-06, + "loss": 2.0443, + "step": 11620 + }, + { + "epoch": 1.4892356942777112, + "grad_norm": 0.5065346956253052, + "learning_rate": 2.549647661755285e-06, + "loss": 2.0156, + "step": 11630 + }, + { + "epoch": 1.490516206482593, + "grad_norm": 0.553152322769165, + "learning_rate": 2.5432415118513778e-06, + "loss": 2.0176, + "step": 11640 + }, + { + "epoch": 1.491796718687475, + "grad_norm": 0.5240514874458313, + "learning_rate": 2.53683536194747e-06, + "loss": 2.0506, + "step": 11650 + }, + { + "epoch": 1.493077230892357, + "grad_norm": 0.499566912651062, + "learning_rate": 2.530429212043562e-06, + "loss": 1.9855, + "step": 11660 + }, + { + "epoch": 1.4943577430972388, + "grad_norm": 0.5063506960868835, + "learning_rate": 2.5240230621396545e-06, + "loss": 2.0325, + "step": 11670 + }, + { + "epoch": 1.4956382553021208, + "grad_norm": 0.5436708927154541, + "learning_rate": 2.5176169122357464e-06, + "loss": 2.0225, + "step": 11680 + }, + { + "epoch": 1.4969187675070028, + "grad_norm": 0.5449490547180176, + "learning_rate": 2.5112107623318388e-06, + "loss": 2.0252, + "step": 11690 + }, + { + "epoch": 1.4981992797118848, + "grad_norm": 0.4780167043209076, + "learning_rate": 2.504804612427931e-06, + "loss": 2.0297, + "step": 11700 + }, + { + "epoch": 1.4981992797118848, + "eval_loss": 2.047696352005005, + "eval_runtime": 100.4412, + "eval_samples_per_second": 9.956, + "eval_steps_per_second": 4.978, + "step": 11700 + }, + { + "epoch": 1.4994797919167668, + "grad_norm": 0.5443697571754456, + "learning_rate": 2.498398462524023e-06, + "loss": 2.0305, + "step": 11710 + }, + { + "epoch": 1.5007603041216486, + "grad_norm": 0.5622988939285278, + "learning_rate": 2.4919923126201155e-06, + "loss": 2.0363, + "step": 11720 + }, + { + "epoch": 1.5020408163265306, + "grad_norm": 0.5276266932487488, + "learning_rate": 2.485586162716208e-06, + "loss": 2.0153, + "step": 11730 + }, + { + "epoch": 1.5033213285314124, + "grad_norm": 0.576263427734375, + "learning_rate": 2.4791800128123e-06, + "loss": 2.0082, + "step": 11740 + }, + { + "epoch": 1.5046018407362944, + "grad_norm": 0.557447612285614, + "learning_rate": 2.472773862908392e-06, + "loss": 2.023, + "step": 11750 + }, + { + "epoch": 1.5058823529411764, + "grad_norm": 0.5237441062927246, + "learning_rate": 2.4663677130044845e-06, + "loss": 2.0219, + "step": 11760 + }, + { + "epoch": 1.5071628651460585, + "grad_norm": 0.5576190948486328, + "learning_rate": 2.459961563100577e-06, + "loss": 2.0207, + "step": 11770 + }, + { + "epoch": 1.5084433773509405, + "grad_norm": 0.5819739103317261, + "learning_rate": 2.4535554131966692e-06, + "loss": 2.0091, + "step": 11780 + }, + { + "epoch": 1.5097238895558225, + "grad_norm": 0.5563110113143921, + "learning_rate": 2.447149263292761e-06, + "loss": 2.0456, + "step": 11790 + }, + { + "epoch": 1.5110044017607043, + "grad_norm": 0.5227601528167725, + "learning_rate": 2.4407431133888535e-06, + "loss": 2.0574, + "step": 11800 + }, + { + "epoch": 1.5110044017607043, + "eval_loss": 2.0476009845733643, + "eval_runtime": 100.4291, + "eval_samples_per_second": 9.957, + "eval_steps_per_second": 4.979, + "step": 11800 + }, + { + "epoch": 1.5122849139655863, + "grad_norm": 0.5243479609489441, + "learning_rate": 2.4343369634849455e-06, + "loss": 2.0271, + "step": 11810 + }, + { + "epoch": 1.513565426170468, + "grad_norm": 0.5676461458206177, + "learning_rate": 2.4279308135810383e-06, + "loss": 2.03, + "step": 11820 + }, + { + "epoch": 1.51484593837535, + "grad_norm": 0.4777722656726837, + "learning_rate": 2.4215246636771302e-06, + "loss": 2.022, + "step": 11830 + }, + { + "epoch": 1.516126450580232, + "grad_norm": 0.5181421041488647, + "learning_rate": 2.4151185137732226e-06, + "loss": 2.0287, + "step": 11840 + }, + { + "epoch": 1.517406962785114, + "grad_norm": 0.601375937461853, + "learning_rate": 2.4087123638693145e-06, + "loss": 2.0235, + "step": 11850 + }, + { + "epoch": 1.5186874749899961, + "grad_norm": 0.5849976539611816, + "learning_rate": 2.402306213965407e-06, + "loss": 2.012, + "step": 11860 + }, + { + "epoch": 1.519967987194878, + "grad_norm": 0.5280182957649231, + "learning_rate": 2.3959000640614993e-06, + "loss": 2.0154, + "step": 11870 + }, + { + "epoch": 1.52124849939976, + "grad_norm": 0.511968195438385, + "learning_rate": 2.3894939141575916e-06, + "loss": 2.0395, + "step": 11880 + }, + { + "epoch": 1.5225290116046417, + "grad_norm": 0.551898717880249, + "learning_rate": 2.3830877642536836e-06, + "loss": 2.0001, + "step": 11890 + }, + { + "epoch": 1.5238095238095237, + "grad_norm": 0.5360147356987, + "learning_rate": 2.376681614349776e-06, + "loss": 2.0597, + "step": 11900 + }, + { + "epoch": 1.5238095238095237, + "eval_loss": 2.04695987701416, + "eval_runtime": 99.822, + "eval_samples_per_second": 10.018, + "eval_steps_per_second": 5.009, + "step": 11900 + }, + { + "epoch": 1.5250900360144057, + "grad_norm": 0.5133785605430603, + "learning_rate": 2.370275464445868e-06, + "loss": 2.015, + "step": 11910 + }, + { + "epoch": 1.5263705482192877, + "grad_norm": 0.5302312970161438, + "learning_rate": 2.3638693145419607e-06, + "loss": 2.0294, + "step": 11920 + }, + { + "epoch": 1.5276510604241698, + "grad_norm": 0.4700910151004791, + "learning_rate": 2.3574631646380526e-06, + "loss": 2.0163, + "step": 11930 + }, + { + "epoch": 1.5289315726290518, + "grad_norm": 0.5158265829086304, + "learning_rate": 2.351057014734145e-06, + "loss": 2.0465, + "step": 11940 + }, + { + "epoch": 1.5302120848339336, + "grad_norm": 0.5115975141525269, + "learning_rate": 2.344650864830237e-06, + "loss": 2.0268, + "step": 11950 + }, + { + "epoch": 1.5314925970388156, + "grad_norm": 0.5275582671165466, + "learning_rate": 2.3382447149263297e-06, + "loss": 2.0449, + "step": 11960 + }, + { + "epoch": 1.5327731092436974, + "grad_norm": 0.6041198968887329, + "learning_rate": 2.3318385650224217e-06, + "loss": 2.02, + "step": 11970 + }, + { + "epoch": 1.5340536214485794, + "grad_norm": 0.5098729133605957, + "learning_rate": 2.325432415118514e-06, + "loss": 2.0326, + "step": 11980 + }, + { + "epoch": 1.5353341336534614, + "grad_norm": 0.5320337414741516, + "learning_rate": 2.319026265214606e-06, + "loss": 2.0282, + "step": 11990 + }, + { + "epoch": 1.5366146458583434, + "grad_norm": 0.5207633376121521, + "learning_rate": 2.3126201153106984e-06, + "loss": 2.0131, + "step": 12000 + }, + { + "epoch": 1.5366146458583434, + "eval_loss": 2.0469398498535156, + "eval_runtime": 100.2772, + "eval_samples_per_second": 9.972, + "eval_steps_per_second": 4.986, + "step": 12000 + }, + { + "epoch": 1.5378951580632254, + "grad_norm": 0.5872837901115417, + "learning_rate": 2.3062139654067907e-06, + "loss": 2.0213, + "step": 12010 + }, + { + "epoch": 1.5391756702681072, + "grad_norm": 0.5789574384689331, + "learning_rate": 2.299807815502883e-06, + "loss": 2.0385, + "step": 12020 + }, + { + "epoch": 1.5404561824729892, + "grad_norm": 0.525754451751709, + "learning_rate": 2.293401665598975e-06, + "loss": 2.0177, + "step": 12030 + }, + { + "epoch": 1.541736694677871, + "grad_norm": 0.5442928671836853, + "learning_rate": 2.2869955156950674e-06, + "loss": 2.0465, + "step": 12040 + }, + { + "epoch": 1.543017206882753, + "grad_norm": 0.5661875605583191, + "learning_rate": 2.2805893657911598e-06, + "loss": 2.0367, + "step": 12050 + }, + { + "epoch": 1.544297719087635, + "grad_norm": 0.54268479347229, + "learning_rate": 2.274183215887252e-06, + "loss": 2.0191, + "step": 12060 + }, + { + "epoch": 1.545578231292517, + "grad_norm": 0.5016104578971863, + "learning_rate": 2.267777065983344e-06, + "loss": 2.0518, + "step": 12070 + }, + { + "epoch": 1.546858743497399, + "grad_norm": 0.5148719549179077, + "learning_rate": 2.2613709160794365e-06, + "loss": 2.0253, + "step": 12080 + }, + { + "epoch": 1.548139255702281, + "grad_norm": 0.5588960647583008, + "learning_rate": 2.2549647661755284e-06, + "loss": 1.9878, + "step": 12090 + }, + { + "epoch": 1.5494197679071628, + "grad_norm": 0.482328325510025, + "learning_rate": 2.2485586162716208e-06, + "loss": 2.0306, + "step": 12100 + }, + { + "epoch": 1.5494197679071628, + "eval_loss": 2.046799898147583, + "eval_runtime": 100.2859, + "eval_samples_per_second": 9.971, + "eval_steps_per_second": 4.986, + "step": 12100 + }, + { + "epoch": 3.4510774710596617, + "grad_norm": 0.5546593070030212, + "learning_rate": 2.242152466367713e-06, + "loss": 2.0075, + "step": 12110 + }, + { + "epoch": 3.4539269813000892, + "grad_norm": 0.5128650665283203, + "learning_rate": 2.2357463164638055e-06, + "loss": 2.0324, + "step": 12120 + }, + { + "epoch": 3.4567764915405164, + "grad_norm": 0.5232759714126587, + "learning_rate": 2.2293401665598975e-06, + "loss": 2.0405, + "step": 12130 + }, + { + "epoch": 3.459626001780944, + "grad_norm": 0.4983081817626953, + "learning_rate": 2.22293401665599e-06, + "loss": 2.0282, + "step": 12140 + }, + { + "epoch": 3.4624755120213715, + "grad_norm": 0.6100337505340576, + "learning_rate": 2.216527866752082e-06, + "loss": 2.0741, + "step": 12150 + }, + { + "epoch": 3.4653250222617986, + "grad_norm": 0.5565346479415894, + "learning_rate": 2.2101217168481746e-06, + "loss": 2.0402, + "step": 12160 + }, + { + "epoch": 3.468174532502226, + "grad_norm": 0.5013384222984314, + "learning_rate": 2.2037155669442665e-06, + "loss": 2.0275, + "step": 12170 + }, + { + "epoch": 3.4710240427426537, + "grad_norm": 0.5147426724433899, + "learning_rate": 2.197309417040359e-06, + "loss": 2.0196, + "step": 12180 + }, + { + "epoch": 3.473873552983081, + "grad_norm": 0.5874170064926147, + "learning_rate": 2.1909032671364512e-06, + "loss": 2.0267, + "step": 12190 + }, + { + "epoch": 3.4767230632235084, + "grad_norm": 0.535470187664032, + "learning_rate": 2.1844971172325436e-06, + "loss": 2.0293, + "step": 12200 + }, + { + "epoch": 3.4767230632235084, + "eval_loss": 2.0467445850372314, + "eval_runtime": 99.9294, + "eval_samples_per_second": 10.007, + "eval_steps_per_second": 5.004, + "step": 12200 + }, + { + "epoch": 3.479572573463936, + "grad_norm": 0.6096872687339783, + "learning_rate": 2.1780909673286356e-06, + "loss": 2.02, + "step": 12210 + }, + { + "epoch": 3.482422083704363, + "grad_norm": 0.5604642629623413, + "learning_rate": 2.171684817424728e-06, + "loss": 2.0279, + "step": 12220 + }, + { + "epoch": 3.4852715939447907, + "grad_norm": 0.5537389516830444, + "learning_rate": 2.1652786675208203e-06, + "loss": 2.0111, + "step": 12230 + }, + { + "epoch": 3.4881211041852183, + "grad_norm": 0.541696310043335, + "learning_rate": 2.1588725176169122e-06, + "loss": 1.9872, + "step": 12240 + }, + { + "epoch": 3.4909706144256454, + "grad_norm": 0.53986656665802, + "learning_rate": 2.1524663677130046e-06, + "loss": 2.0234, + "step": 12250 + }, + { + "epoch": 3.493820124666073, + "grad_norm": 0.5188933610916138, + "learning_rate": 2.146060217809097e-06, + "loss": 2.0336, + "step": 12260 + }, + { + "epoch": 3.4966696349065005, + "grad_norm": 0.5203558802604675, + "learning_rate": 2.1396540679051893e-06, + "loss": 2.0097, + "step": 12270 + }, + { + "epoch": 3.4995191451469276, + "grad_norm": 0.5333787798881531, + "learning_rate": 2.1332479180012813e-06, + "loss": 2.0643, + "step": 12280 + }, + { + "epoch": 3.502368655387355, + "grad_norm": 0.4590858519077301, + "learning_rate": 2.1268417680973736e-06, + "loss": 2.0481, + "step": 12290 + }, + { + "epoch": 3.5052181656277828, + "grad_norm": 0.5285468697547913, + "learning_rate": 2.120435618193466e-06, + "loss": 2.0291, + "step": 12300 + }, + { + "epoch": 3.5052181656277828, + "eval_loss": 2.0466067790985107, + "eval_runtime": 99.2486, + "eval_samples_per_second": 10.076, + "eval_steps_per_second": 5.038, + "step": 12300 + }, + { + "epoch": 3.50806767586821, + "grad_norm": 0.48412200808525085, + "learning_rate": 2.1140294682895584e-06, + "loss": 2.0402, + "step": 12310 + }, + { + "epoch": 3.5109171861086375, + "grad_norm": 0.6032649278640747, + "learning_rate": 2.1076233183856503e-06, + "loss": 2.0287, + "step": 12320 + }, + { + "epoch": 3.513766696349065, + "grad_norm": 0.61771160364151, + "learning_rate": 2.1012171684817427e-06, + "loss": 2.0319, + "step": 12330 + }, + { + "epoch": 3.5166162065894926, + "grad_norm": 0.542523980140686, + "learning_rate": 2.0948110185778346e-06, + "loss": 2.0293, + "step": 12340 + }, + { + "epoch": 3.5194657168299197, + "grad_norm": 0.5993557572364807, + "learning_rate": 2.088404868673927e-06, + "loss": 2.0295, + "step": 12350 + }, + { + "epoch": 3.5223152270703473, + "grad_norm": 0.5857672691345215, + "learning_rate": 2.0819987187700194e-06, + "loss": 2.0595, + "step": 12360 + }, + { + "epoch": 3.525164737310775, + "grad_norm": 0.511431097984314, + "learning_rate": 2.0755925688661117e-06, + "loss": 2.0043, + "step": 12370 + }, + { + "epoch": 3.5280142475512024, + "grad_norm": 0.5659806132316589, + "learning_rate": 2.0691864189622037e-06, + "loss": 2.0351, + "step": 12380 + }, + { + "epoch": 3.5308637577916295, + "grad_norm": 0.5137180685997009, + "learning_rate": 2.062780269058296e-06, + "loss": 2.0509, + "step": 12390 + }, + { + "epoch": 3.533713268032057, + "grad_norm": 0.5334777235984802, + "learning_rate": 2.0563741191543884e-06, + "loss": 2.048, + "step": 12400 + }, + { + "epoch": 3.533713268032057, + "eval_loss": 2.046497106552124, + "eval_runtime": 99.3964, + "eval_samples_per_second": 10.061, + "eval_steps_per_second": 5.03, + "step": 12400 + }, + { + "epoch": 3.5365627782724847, + "grad_norm": 0.5657955408096313, + "learning_rate": 2.049967969250481e-06, + "loss": 2.038, + "step": 12410 + }, + { + "epoch": 3.539412288512912, + "grad_norm": 0.6064898371696472, + "learning_rate": 2.0435618193465727e-06, + "loss": 2.0117, + "step": 12420 + }, + { + "epoch": 3.5422617987533394, + "grad_norm": 0.5460246205329895, + "learning_rate": 2.037155669442665e-06, + "loss": 2.0349, + "step": 12430 + }, + { + "epoch": 3.545111308993767, + "grad_norm": 0.514192521572113, + "learning_rate": 2.0307495195387575e-06, + "loss": 2.03, + "step": 12440 + }, + { + "epoch": 3.547960819234194, + "grad_norm": 0.5747187733650208, + "learning_rate": 2.02434336963485e-06, + "loss": 2.0259, + "step": 12450 + }, + { + "epoch": 3.5508103294746216, + "grad_norm": 0.5013620853424072, + "learning_rate": 2.0179372197309418e-06, + "loss": 2.0476, + "step": 12460 + }, + { + "epoch": 3.553659839715049, + "grad_norm": 0.5189679861068726, + "learning_rate": 2.011531069827034e-06, + "loss": 1.9952, + "step": 12470 + }, + { + "epoch": 3.5565093499554763, + "grad_norm": 0.5946024656295776, + "learning_rate": 2.005124919923126e-06, + "loss": 2.0115, + "step": 12480 + }, + { + "epoch": 3.559358860195904, + "grad_norm": 0.6720226407051086, + "learning_rate": 1.998718770019219e-06, + "loss": 2.0522, + "step": 12490 + }, + { + "epoch": 3.5622083704363314, + "grad_norm": 0.526604950428009, + "learning_rate": 1.992312620115311e-06, + "loss": 2.058, + "step": 12500 + }, + { + "epoch": 3.5622083704363314, + "eval_loss": 2.0465705394744873, + "eval_runtime": 99.2186, + "eval_samples_per_second": 10.079, + "eval_steps_per_second": 5.039, + "step": 12500 + }, + { + "epoch": 3.5650578806767586, + "grad_norm": 0.49224352836608887, + "learning_rate": 1.985906470211403e-06, + "loss": 2.0419, + "step": 12510 + }, + { + "epoch": 3.567907390917186, + "grad_norm": 0.4674229025840759, + "learning_rate": 1.979500320307495e-06, + "loss": 2.02, + "step": 12520 + }, + { + "epoch": 3.5707569011576137, + "grad_norm": 0.5586433410644531, + "learning_rate": 1.9730941704035875e-06, + "loss": 2.0593, + "step": 12530 + }, + { + "epoch": 3.573606411398041, + "grad_norm": 0.5551003217697144, + "learning_rate": 1.96668802049968e-06, + "loss": 2.0137, + "step": 12540 + }, + { + "epoch": 3.5764559216384684, + "grad_norm": 0.5269743204116821, + "learning_rate": 1.9602818705957723e-06, + "loss": 2.0045, + "step": 12550 + }, + { + "epoch": 3.579305431878896, + "grad_norm": 0.5576637387275696, + "learning_rate": 1.953875720691864e-06, + "loss": 2.0143, + "step": 12560 + }, + { + "epoch": 3.582154942119323, + "grad_norm": 0.5212885737419128, + "learning_rate": 1.9474695707879566e-06, + "loss": 2.0637, + "step": 12570 + }, + { + "epoch": 3.5850044523597506, + "grad_norm": 0.5004526376724243, + "learning_rate": 1.941063420884049e-06, + "loss": 2.0068, + "step": 12580 + }, + { + "epoch": 3.587853962600178, + "grad_norm": 0.5198807120323181, + "learning_rate": 1.9346572709801413e-06, + "loss": 2.024, + "step": 12590 + }, + { + "epoch": 3.5907034728406053, + "grad_norm": 0.5772445201873779, + "learning_rate": 1.9282511210762332e-06, + "loss": 1.9967, + "step": 12600 + }, + { + "epoch": 3.5907034728406053, + "eval_loss": 2.046096086502075, + "eval_runtime": 99.0889, + "eval_samples_per_second": 10.092, + "eval_steps_per_second": 5.046, + "step": 12600 + }, + { + "epoch": 3.593552983081033, + "grad_norm": 0.5501157641410828, + "learning_rate": 1.9218449711723256e-06, + "loss": 2.053, + "step": 12610 + }, + { + "epoch": 3.5964024933214604, + "grad_norm": 0.5690680742263794, + "learning_rate": 1.9154388212684176e-06, + "loss": 2.0267, + "step": 12620 + }, + { + "epoch": 3.5992520035618876, + "grad_norm": 0.5340695381164551, + "learning_rate": 1.9090326713645103e-06, + "loss": 2.0102, + "step": 12630 + }, + { + "epoch": 3.602101513802315, + "grad_norm": 0.5075132846832275, + "learning_rate": 1.9026265214606023e-06, + "loss": 2.0389, + "step": 12640 + }, + { + "epoch": 3.6049510240427427, + "grad_norm": 0.5131998062133789, + "learning_rate": 1.8962203715566947e-06, + "loss": 2.0137, + "step": 12650 + }, + { + "epoch": 3.60780053428317, + "grad_norm": 0.5327854156494141, + "learning_rate": 1.8898142216527868e-06, + "loss": 2.0175, + "step": 12660 + }, + { + "epoch": 3.6106500445235974, + "grad_norm": 0.559907853603363, + "learning_rate": 1.883408071748879e-06, + "loss": 2.0391, + "step": 12670 + }, + { + "epoch": 3.613499554764025, + "grad_norm": 0.5571854710578918, + "learning_rate": 1.8770019218449713e-06, + "loss": 2.0433, + "step": 12680 + }, + { + "epoch": 3.6163490650044525, + "grad_norm": 0.5199988484382629, + "learning_rate": 1.8705957719410637e-06, + "loss": 2.0239, + "step": 12690 + }, + { + "epoch": 3.6191985752448796, + "grad_norm": 0.5144962072372437, + "learning_rate": 1.8641896220371559e-06, + "loss": 2.0293, + "step": 12700 + }, + { + "epoch": 3.6191985752448796, + "eval_loss": 2.0460119247436523, + "eval_runtime": 99.2153, + "eval_samples_per_second": 10.079, + "eval_steps_per_second": 5.04, + "step": 12700 + }, + { + "epoch": 3.622048085485307, + "grad_norm": 0.4945001006126404, + "learning_rate": 1.857783472133248e-06, + "loss": 2.0161, + "step": 12710 + }, + { + "epoch": 3.624897595725735, + "grad_norm": 0.5314003825187683, + "learning_rate": 1.8513773222293402e-06, + "loss": 2.0173, + "step": 12720 + }, + { + "epoch": 3.627747105966162, + "grad_norm": 0.5037127137184143, + "learning_rate": 1.8449711723254328e-06, + "loss": 2.061, + "step": 12730 + }, + { + "epoch": 3.6305966162065895, + "grad_norm": 0.5831037163734436, + "learning_rate": 1.838565022421525e-06, + "loss": 2.0186, + "step": 12740 + }, + { + "epoch": 3.633446126447017, + "grad_norm": 0.5599620342254639, + "learning_rate": 1.832158872517617e-06, + "loss": 2.0394, + "step": 12750 + }, + { + "epoch": 3.6362956366874446, + "grad_norm": 0.543129026889801, + "learning_rate": 1.8257527226137092e-06, + "loss": 2.0262, + "step": 12760 + }, + { + "epoch": 3.6391451469278717, + "grad_norm": 0.5593090653419495, + "learning_rate": 1.8193465727098016e-06, + "loss": 2.0316, + "step": 12770 + }, + { + "epoch": 3.6419946571682993, + "grad_norm": 0.49719634652137756, + "learning_rate": 1.812940422805894e-06, + "loss": 2.0332, + "step": 12780 + }, + { + "epoch": 3.644844167408727, + "grad_norm": 0.5381009578704834, + "learning_rate": 1.8065342729019861e-06, + "loss": 2.0232, + "step": 12790 + }, + { + "epoch": 3.647693677649154, + "grad_norm": 0.4995189905166626, + "learning_rate": 1.8001281229980783e-06, + "loss": 2.0279, + "step": 12800 + }, + { + "epoch": 3.647693677649154, + "eval_loss": 2.0456788539886475, + "eval_runtime": 99.2397, + "eval_samples_per_second": 10.077, + "eval_steps_per_second": 5.038, + "step": 12800 + }, + { + "epoch": 3.6505431878895815, + "grad_norm": 0.5659130215644836, + "learning_rate": 1.7937219730941704e-06, + "loss": 2.0061, + "step": 12810 + }, + { + "epoch": 3.653392698130009, + "grad_norm": 0.5485902428627014, + "learning_rate": 1.787315823190263e-06, + "loss": 2.0367, + "step": 12820 + }, + { + "epoch": 3.6562422083704362, + "grad_norm": 0.5191209316253662, + "learning_rate": 1.7809096732863552e-06, + "loss": 2.0558, + "step": 12830 + }, + { + "epoch": 3.659091718610864, + "grad_norm": 0.5433029532432556, + "learning_rate": 1.7745035233824473e-06, + "loss": 2.0278, + "step": 12840 + }, + { + "epoch": 3.6619412288512914, + "grad_norm": 0.5491988658905029, + "learning_rate": 1.7680973734785395e-06, + "loss": 2.0234, + "step": 12850 + }, + { + "epoch": 3.6647907390917185, + "grad_norm": 0.5946006178855896, + "learning_rate": 1.7616912235746316e-06, + "loss": 2.0259, + "step": 12860 + }, + { + "epoch": 3.667640249332146, + "grad_norm": 0.5561248660087585, + "learning_rate": 1.7552850736707242e-06, + "loss": 2.0301, + "step": 12870 + }, + { + "epoch": 3.6704897595725736, + "grad_norm": 0.5854030847549438, + "learning_rate": 1.7488789237668164e-06, + "loss": 2.0477, + "step": 12880 + }, + { + "epoch": 3.6733392698130007, + "grad_norm": 0.5319736003875732, + "learning_rate": 1.7424727738629085e-06, + "loss": 2.0438, + "step": 12890 + }, + { + "epoch": 3.6761887800534283, + "grad_norm": 0.6675127744674683, + "learning_rate": 1.7360666239590007e-06, + "loss": 1.9977, + "step": 12900 + }, + { + "epoch": 3.6761887800534283, + "eval_loss": 2.045774459838867, + "eval_runtime": 99.3171, + "eval_samples_per_second": 10.069, + "eval_steps_per_second": 5.034, + "step": 12900 + }, + { + "epoch": 3.679038290293856, + "grad_norm": 0.5847189426422119, + "learning_rate": 1.7296604740550928e-06, + "loss": 2.028, + "step": 12910 + }, + { + "epoch": 3.681887800534283, + "grad_norm": 0.4568108022212982, + "learning_rate": 1.7232543241511854e-06, + "loss": 2.0211, + "step": 12920 + }, + { + "epoch": 3.6847373107747106, + "grad_norm": 0.5319178700447083, + "learning_rate": 1.7168481742472776e-06, + "loss": 2.0161, + "step": 12930 + }, + { + "epoch": 3.687586821015138, + "grad_norm": 0.5896403193473816, + "learning_rate": 1.7104420243433697e-06, + "loss": 2.0309, + "step": 12940 + }, + { + "epoch": 3.6904363312555652, + "grad_norm": 0.5127771496772766, + "learning_rate": 1.7040358744394619e-06, + "loss": 2.0223, + "step": 12950 + }, + { + "epoch": 3.693285841495993, + "grad_norm": 0.5874524712562561, + "learning_rate": 1.697629724535554e-06, + "loss": 2.0504, + "step": 12960 + }, + { + "epoch": 3.6961353517364204, + "grad_norm": 0.5469905734062195, + "learning_rate": 1.6912235746316466e-06, + "loss": 2.0001, + "step": 12970 + }, + { + "epoch": 3.6989848619768475, + "grad_norm": 0.5713154673576355, + "learning_rate": 1.6848174247277388e-06, + "loss": 2.0103, + "step": 12980 + }, + { + "epoch": 3.701834372217275, + "grad_norm": 0.5152024626731873, + "learning_rate": 1.678411274823831e-06, + "loss": 2.0357, + "step": 12990 + }, + { + "epoch": 3.7046838824577026, + "grad_norm": 0.5928826928138733, + "learning_rate": 1.672005124919923e-06, + "loss": 2.0149, + "step": 13000 + }, + { + "epoch": 3.7046838824577026, + "eval_loss": 2.04563570022583, + "eval_runtime": 99.3373, + "eval_samples_per_second": 10.067, + "eval_steps_per_second": 5.033, + "step": 13000 + }, + { + "epoch": 3.7075333926981298, + "grad_norm": 0.5253980159759521, + "learning_rate": 1.6655989750160157e-06, + "loss": 2.0268, + "step": 13010 + }, + { + "epoch": 3.7103829029385573, + "grad_norm": 0.4610345661640167, + "learning_rate": 1.6591928251121078e-06, + "loss": 2.0119, + "step": 13020 + }, + { + "epoch": 3.713232413178985, + "grad_norm": 0.5618305206298828, + "learning_rate": 1.6527866752082e-06, + "loss": 2.034, + "step": 13030 + }, + { + "epoch": 3.716081923419412, + "grad_norm": 0.5720108151435852, + "learning_rate": 1.6463805253042921e-06, + "loss": 2.0145, + "step": 13040 + }, + { + "epoch": 3.7189314336598396, + "grad_norm": 0.5844509601593018, + "learning_rate": 1.6399743754003843e-06, + "loss": 2.0254, + "step": 13050 + }, + { + "epoch": 3.721780943900267, + "grad_norm": 0.4931798279285431, + "learning_rate": 1.6335682254964769e-06, + "loss": 1.9958, + "step": 13060 + }, + { + "epoch": 3.7246304541406947, + "grad_norm": 0.5254191160202026, + "learning_rate": 1.627162075592569e-06, + "loss": 2.0289, + "step": 13070 + }, + { + "epoch": 3.727479964381122, + "grad_norm": 0.5591420531272888, + "learning_rate": 1.6207559256886612e-06, + "loss": 2.0336, + "step": 13080 + }, + { + "epoch": 3.7303294746215494, + "grad_norm": 0.5102550983428955, + "learning_rate": 1.6143497757847533e-06, + "loss": 1.9987, + "step": 13090 + }, + { + "epoch": 3.733178984861977, + "grad_norm": 0.5043312907218933, + "learning_rate": 1.6079436258808457e-06, + "loss": 2.0445, + "step": 13100 + }, + { + "epoch": 3.733178984861977, + "eval_loss": 2.0453715324401855, + "eval_runtime": 99.1158, + "eval_samples_per_second": 10.089, + "eval_steps_per_second": 5.045, + "step": 13100 + }, + { + "epoch": 3.7360284951024045, + "grad_norm": 0.5792537927627563, + "learning_rate": 1.601537475976938e-06, + "loss": 2.0013, + "step": 13110 + }, + { + "epoch": 3.7388780053428317, + "grad_norm": 0.5740078687667847, + "learning_rate": 1.5951313260730302e-06, + "loss": 2.0344, + "step": 13120 + }, + { + "epoch": 3.7417275155832592, + "grad_norm": 0.5518689155578613, + "learning_rate": 1.5887251761691224e-06, + "loss": 2.0372, + "step": 13130 + }, + { + "epoch": 3.744577025823687, + "grad_norm": 0.510488748550415, + "learning_rate": 1.5823190262652148e-06, + "loss": 2.0235, + "step": 13140 + }, + { + "epoch": 3.747426536064114, + "grad_norm": 0.5459367632865906, + "learning_rate": 1.575912876361307e-06, + "loss": 2.0174, + "step": 13150 + }, + { + "epoch": 3.7502760463045415, + "grad_norm": 0.5442193746566772, + "learning_rate": 1.5695067264573993e-06, + "loss": 2.0168, + "step": 13160 + }, + { + "epoch": 3.753125556544969, + "grad_norm": 0.5245689749717712, + "learning_rate": 1.5631005765534914e-06, + "loss": 2.0254, + "step": 13170 + }, + { + "epoch": 3.755975066785396, + "grad_norm": 0.5361449122428894, + "learning_rate": 1.5566944266495836e-06, + "loss": 2.0347, + "step": 13180 + }, + { + "epoch": 3.7588245770258237, + "grad_norm": 0.5554986596107483, + "learning_rate": 1.550288276745676e-06, + "loss": 2.0124, + "step": 13190 + }, + { + "epoch": 3.7616740872662513, + "grad_norm": 0.5109909772872925, + "learning_rate": 1.5438821268417683e-06, + "loss": 2.0287, + "step": 13200 + }, + { + "epoch": 3.7616740872662513, + "eval_loss": 2.045180559158325, + "eval_runtime": 98.7424, + "eval_samples_per_second": 10.127, + "eval_steps_per_second": 5.064, + "step": 13200 + }, + { + "epoch": 3.7645235975066784, + "grad_norm": 0.5418664216995239, + "learning_rate": 1.5374759769378605e-06, + "loss": 2.054, + "step": 13210 + }, + { + "epoch": 3.767373107747106, + "grad_norm": 0.5335536003112793, + "learning_rate": 1.5310698270339526e-06, + "loss": 2.0164, + "step": 13220 + }, + { + "epoch": 3.7702226179875336, + "grad_norm": 0.46435827016830444, + "learning_rate": 1.524663677130045e-06, + "loss": 1.9936, + "step": 13230 + }, + { + "epoch": 3.7730721282279607, + "grad_norm": 0.5345796942710876, + "learning_rate": 1.5182575272261372e-06, + "loss": 2.0085, + "step": 13240 + }, + { + "epoch": 3.7759216384683882, + "grad_norm": 0.543218731880188, + "learning_rate": 1.5118513773222295e-06, + "loss": 1.9945, + "step": 13250 + }, + { + "epoch": 3.778771148708816, + "grad_norm": 0.5109902024269104, + "learning_rate": 1.5054452274183217e-06, + "loss": 2.0033, + "step": 13260 + }, + { + "epoch": 3.781620658949243, + "grad_norm": 0.5215564966201782, + "learning_rate": 1.499039077514414e-06, + "loss": 2.0381, + "step": 13270 + }, + { + "epoch": 3.7844701691896705, + "grad_norm": 0.4731573164463043, + "learning_rate": 1.4926329276105062e-06, + "loss": 2.0365, + "step": 13280 + }, + { + "epoch": 3.787319679430098, + "grad_norm": 0.4979744851589203, + "learning_rate": 1.4862267777065984e-06, + "loss": 2.0156, + "step": 13290 + }, + { + "epoch": 3.790169189670525, + "grad_norm": 0.511603057384491, + "learning_rate": 1.4798206278026907e-06, + "loss": 1.9742, + "step": 13300 + }, + { + "epoch": 3.790169189670525, + "eval_loss": 2.044865608215332, + "eval_runtime": 99.2698, + "eval_samples_per_second": 10.074, + "eval_steps_per_second": 5.037, + "step": 13300 + }, + { + "epoch": 3.7930186999109528, + "grad_norm": 0.4679126441478729, + "learning_rate": 1.473414477898783e-06, + "loss": 2.0291, + "step": 13310 + }, + { + "epoch": 3.7958682101513803, + "grad_norm": 0.4647921621799469, + "learning_rate": 1.4670083279948753e-06, + "loss": 1.9977, + "step": 13320 + }, + { + "epoch": 3.7987177203918074, + "grad_norm": 0.5578404664993286, + "learning_rate": 1.4606021780909674e-06, + "loss": 2.0358, + "step": 13330 + }, + { + "epoch": 3.801567230632235, + "grad_norm": 0.5208609104156494, + "learning_rate": 1.4541960281870596e-06, + "loss": 2.0294, + "step": 13340 + }, + { + "epoch": 3.8044167408726626, + "grad_norm": 0.5082564353942871, + "learning_rate": 1.447789878283152e-06, + "loss": 2.0691, + "step": 13350 + }, + { + "epoch": 3.8072662511130897, + "grad_norm": 0.4937707781791687, + "learning_rate": 1.4413837283792443e-06, + "loss": 2.0302, + "step": 13360 + }, + { + "epoch": 3.8101157613535173, + "grad_norm": 0.5191481709480286, + "learning_rate": 1.4349775784753365e-06, + "loss": 1.9866, + "step": 13370 + }, + { + "epoch": 3.812965271593945, + "grad_norm": 0.5355870127677917, + "learning_rate": 1.4285714285714286e-06, + "loss": 2.0613, + "step": 13380 + }, + { + "epoch": 3.815814781834372, + "grad_norm": 0.5211424827575684, + "learning_rate": 1.4221652786675208e-06, + "loss": 1.9967, + "step": 13390 + }, + { + "epoch": 3.8186642920747995, + "grad_norm": 0.4805680811405182, + "learning_rate": 1.4157591287636134e-06, + "loss": 2.0063, + "step": 13400 + }, + { + "epoch": 3.8186642920747995, + "eval_loss": 2.0447990894317627, + "eval_runtime": 99.2866, + "eval_samples_per_second": 10.072, + "eval_steps_per_second": 5.036, + "step": 13400 + }, + { + "epoch": 3.821513802315227, + "grad_norm": 0.6089988350868225, + "learning_rate": 1.4093529788597055e-06, + "loss": 2.0092, + "step": 13410 + }, + { + "epoch": 3.8243633125556546, + "grad_norm": 0.5222851037979126, + "learning_rate": 1.4029468289557977e-06, + "loss": 2.053, + "step": 13420 + }, + { + "epoch": 3.8272128227960818, + "grad_norm": 0.5201483368873596, + "learning_rate": 1.3965406790518898e-06, + "loss": 2.0574, + "step": 13430 + }, + { + "epoch": 3.8300623330365093, + "grad_norm": 0.5058591365814209, + "learning_rate": 1.3901345291479822e-06, + "loss": 2.0141, + "step": 13440 + }, + { + "epoch": 3.832911843276937, + "grad_norm": 0.5813695788383484, + "learning_rate": 1.3837283792440746e-06, + "loss": 2.0406, + "step": 13450 + }, + { + "epoch": 3.8357613535173645, + "grad_norm": 0.5166908502578735, + "learning_rate": 1.3773222293401667e-06, + "loss": 2.0228, + "step": 13460 + }, + { + "epoch": 3.8386108637577916, + "grad_norm": 0.6206843852996826, + "learning_rate": 1.3709160794362589e-06, + "loss": 2.026, + "step": 13470 + }, + { + "epoch": 3.841460373998219, + "grad_norm": 0.4896029233932495, + "learning_rate": 1.364509929532351e-06, + "loss": 2.0333, + "step": 13480 + }, + { + "epoch": 3.8443098842386467, + "grad_norm": 0.5111129283905029, + "learning_rate": 1.3581037796284436e-06, + "loss": 2.0121, + "step": 13490 + }, + { + "epoch": 3.847159394479074, + "grad_norm": 0.5457862615585327, + "learning_rate": 1.3516976297245358e-06, + "loss": 2.0093, + "step": 13500 + }, + { + "epoch": 3.847159394479074, + "eval_loss": 2.04461407661438, + "eval_runtime": 99.1024, + "eval_samples_per_second": 10.091, + "eval_steps_per_second": 5.045, + "step": 13500 + }, + { + "epoch": 3.8500089047195014, + "grad_norm": 0.538062334060669, + "learning_rate": 1.345291479820628e-06, + "loss": 2.0358, + "step": 13510 + }, + { + "epoch": 3.852858414959929, + "grad_norm": 0.5242393612861633, + "learning_rate": 1.33888532991672e-06, + "loss": 2.0331, + "step": 13520 + }, + { + "epoch": 3.855707925200356, + "grad_norm": 0.5289698243141174, + "learning_rate": 1.3324791800128122e-06, + "loss": 2.0078, + "step": 13530 + }, + { + "epoch": 3.8585574354407837, + "grad_norm": 0.5151503086090088, + "learning_rate": 1.3260730301089048e-06, + "loss": 2.0463, + "step": 13540 + }, + { + "epoch": 3.8614069456812112, + "grad_norm": 0.5115975737571716, + "learning_rate": 1.319666880204997e-06, + "loss": 2.0334, + "step": 13550 + }, + { + "epoch": 3.8642564559216384, + "grad_norm": 0.5621274709701538, + "learning_rate": 1.3132607303010891e-06, + "loss": 2.0439, + "step": 13560 + }, + { + "epoch": 3.867105966162066, + "grad_norm": 0.5460280179977417, + "learning_rate": 1.3068545803971813e-06, + "loss": 2.0014, + "step": 13570 + }, + { + "epoch": 3.8699554764024935, + "grad_norm": 0.524300217628479, + "learning_rate": 1.3004484304932734e-06, + "loss": 2.0277, + "step": 13580 + }, + { + "epoch": 3.8728049866429206, + "grad_norm": 0.5772647261619568, + "learning_rate": 1.294042280589366e-06, + "loss": 2.0063, + "step": 13590 + }, + { + "epoch": 3.875654496883348, + "grad_norm": 0.5323388576507568, + "learning_rate": 1.2876361306854582e-06, + "loss": 2.0306, + "step": 13600 + }, + { + "epoch": 3.875654496883348, + "eval_loss": 2.044602155685425, + "eval_runtime": 99.2737, + "eval_samples_per_second": 10.073, + "eval_steps_per_second": 5.037, + "step": 13600 + }, + { + "epoch": 3.8785040071237757, + "grad_norm": 0.57416170835495, + "learning_rate": 1.2812299807815503e-06, + "loss": 2.0369, + "step": 13610 + }, + { + "epoch": 3.881353517364203, + "grad_norm": 0.5117653608322144, + "learning_rate": 1.2748238308776425e-06, + "loss": 1.9994, + "step": 13620 + }, + { + "epoch": 3.8842030276046304, + "grad_norm": 0.4790748357772827, + "learning_rate": 1.268417680973735e-06, + "loss": 2.0061, + "step": 13630 + }, + { + "epoch": 3.887052537845058, + "grad_norm": 0.48775649070739746, + "learning_rate": 1.2620115310698272e-06, + "loss": 2.0527, + "step": 13640 + }, + { + "epoch": 3.889902048085485, + "grad_norm": 0.4819037616252899, + "learning_rate": 1.2556053811659194e-06, + "loss": 2.0403, + "step": 13650 + }, + { + "epoch": 3.8927515583259127, + "grad_norm": 0.5073307156562805, + "learning_rate": 1.2491992312620115e-06, + "loss": 2.0114, + "step": 13660 + }, + { + "epoch": 3.8956010685663403, + "grad_norm": 0.5168735980987549, + "learning_rate": 1.242793081358104e-06, + "loss": 2.0362, + "step": 13670 + }, + { + "epoch": 3.8984505788067674, + "grad_norm": 0.5055269598960876, + "learning_rate": 1.236386931454196e-06, + "loss": 2.0426, + "step": 13680 + }, + { + "epoch": 3.901300089047195, + "grad_norm": 0.5459708571434021, + "learning_rate": 1.2299807815502884e-06, + "loss": 2.011, + "step": 13690 + }, + { + "epoch": 3.9041495992876225, + "grad_norm": 0.5798599123954773, + "learning_rate": 1.2235746316463806e-06, + "loss": 1.999, + "step": 13700 + }, + { + "epoch": 3.9041495992876225, + "eval_loss": 2.044389009475708, + "eval_runtime": 99.4564, + "eval_samples_per_second": 10.055, + "eval_steps_per_second": 5.027, + "step": 13700 + }, + { + "epoch": 3.9069991095280496, + "grad_norm": 0.4718180000782013, + "learning_rate": 1.2171684817424727e-06, + "loss": 2.023, + "step": 13710 + }, + { + "epoch": 3.909848619768477, + "grad_norm": 0.558879017829895, + "learning_rate": 1.2107623318385651e-06, + "loss": 1.996, + "step": 13720 + }, + { + "epoch": 3.9126981300089048, + "grad_norm": 0.5425081253051758, + "learning_rate": 1.2043561819346573e-06, + "loss": 2.0198, + "step": 13730 + }, + { + "epoch": 3.915547640249332, + "grad_norm": 0.516757607460022, + "learning_rate": 1.1979500320307496e-06, + "loss": 2.006, + "step": 13740 + }, + { + "epoch": 3.9183971504897595, + "grad_norm": 0.5674097537994385, + "learning_rate": 1.1915438821268418e-06, + "loss": 2.014, + "step": 13750 + }, + { + "epoch": 3.921246660730187, + "grad_norm": 0.47472038865089417, + "learning_rate": 1.185137732222934e-06, + "loss": 2.0161, + "step": 13760 + }, + { + "epoch": 3.9240961709706146, + "grad_norm": 0.46472105383872986, + "learning_rate": 1.1787315823190263e-06, + "loss": 1.9891, + "step": 13770 + }, + { + "epoch": 3.9269456812110417, + "grad_norm": 0.5644447207450867, + "learning_rate": 1.1723254324151185e-06, + "loss": 2.0327, + "step": 13780 + }, + { + "epoch": 3.9297951914514693, + "grad_norm": 0.49525147676467896, + "learning_rate": 1.1659192825112108e-06, + "loss": 1.9746, + "step": 13790 + }, + { + "epoch": 3.932644701691897, + "grad_norm": 0.5136142373085022, + "learning_rate": 1.159513132607303e-06, + "loss": 2.0308, + "step": 13800 + }, + { + "epoch": 3.932644701691897, + "eval_loss": 2.044067621231079, + "eval_runtime": 98.9728, + "eval_samples_per_second": 10.104, + "eval_steps_per_second": 5.052, + "step": 13800 + }, + { + "epoch": 3.9354942119323244, + "grad_norm": 0.5278817415237427, + "learning_rate": 1.1531069827033954e-06, + "loss": 2.0229, + "step": 13810 + }, + { + "epoch": 3.9383437221727515, + "grad_norm": 0.5376277565956116, + "learning_rate": 1.1467008327994875e-06, + "loss": 2.0198, + "step": 13820 + }, + { + "epoch": 3.941193232413179, + "grad_norm": 0.5397053956985474, + "learning_rate": 1.1402946828955799e-06, + "loss": 2.0582, + "step": 13830 + }, + { + "epoch": 3.9440427426536067, + "grad_norm": 0.5622960925102234, + "learning_rate": 1.133888532991672e-06, + "loss": 2.017, + "step": 13840 + }, + { + "epoch": 3.946892252894034, + "grad_norm": 0.5214881896972656, + "learning_rate": 1.1274823830877642e-06, + "loss": 1.9991, + "step": 13850 + }, + { + "epoch": 3.9497417631344613, + "grad_norm": 0.47489604353904724, + "learning_rate": 1.1210762331838566e-06, + "loss": 2.0538, + "step": 13860 + }, + { + "epoch": 3.952591273374889, + "grad_norm": 0.5163717269897461, + "learning_rate": 1.1146700832799487e-06, + "loss": 2.0365, + "step": 13870 + }, + { + "epoch": 3.955440783615316, + "grad_norm": 0.5314338803291321, + "learning_rate": 1.108263933376041e-06, + "loss": 2.0118, + "step": 13880 + }, + { + "epoch": 3.9582902938557436, + "grad_norm": 0.47694677114486694, + "learning_rate": 1.1018577834721333e-06, + "loss": 2.0093, + "step": 13890 + }, + { + "epoch": 3.961139804096171, + "grad_norm": 0.49023205041885376, + "learning_rate": 1.0954516335682256e-06, + "loss": 2.0123, + "step": 13900 + }, + { + "epoch": 3.961139804096171, + "eval_loss": 2.0438590049743652, + "eval_runtime": 99.2315, + "eval_samples_per_second": 10.077, + "eval_steps_per_second": 5.039, + "step": 13900 + }, + { + "epoch": 3.9639893143365983, + "grad_norm": 0.5529634952545166, + "learning_rate": 1.0890454836643178e-06, + "loss": 2.026, + "step": 13910 + }, + { + "epoch": 3.966838824577026, + "grad_norm": 0.49043890833854675, + "learning_rate": 1.0826393337604101e-06, + "loss": 2.0315, + "step": 13920 + }, + { + "epoch": 3.9696883348174534, + "grad_norm": 0.5095582008361816, + "learning_rate": 1.0762331838565023e-06, + "loss": 1.998, + "step": 13930 + }, + { + "epoch": 3.9725378450578805, + "grad_norm": 0.5738527774810791, + "learning_rate": 1.0698270339525947e-06, + "loss": 2.0027, + "step": 13940 + }, + { + "epoch": 3.975387355298308, + "grad_norm": 0.5268344879150391, + "learning_rate": 1.0634208840486868e-06, + "loss": 2.0028, + "step": 13950 + }, + { + "epoch": 3.9782368655387357, + "grad_norm": 0.4693433344364166, + "learning_rate": 1.0570147341447792e-06, + "loss": 2.049, + "step": 13960 + }, + { + "epoch": 3.981086375779163, + "grad_norm": 0.5160017013549805, + "learning_rate": 1.0506085842408713e-06, + "loss": 2.0355, + "step": 13970 + }, + { + "epoch": 3.9839358860195904, + "grad_norm": 0.6007476449012756, + "learning_rate": 1.0442024343369635e-06, + "loss": 2.0596, + "step": 13980 + }, + { + "epoch": 3.986785396260018, + "grad_norm": 0.5093830227851868, + "learning_rate": 1.0377962844330559e-06, + "loss": 2.041, + "step": 13990 + }, + { + "epoch": 3.989634906500445, + "grad_norm": 0.5473350882530212, + "learning_rate": 1.031390134529148e-06, + "loss": 2.0365, + "step": 14000 + }, + { + "epoch": 3.989634906500445, + "eval_loss": 2.043860673904419, + "eval_runtime": 99.085, + "eval_samples_per_second": 10.092, + "eval_steps_per_second": 5.046, + "step": 14000 + }, + { + "epoch": 3.9924844167408726, + "grad_norm": 0.5005938410758972, + "learning_rate": 1.0249839846252404e-06, + "loss": 2.0238, + "step": 14010 + }, + { + "epoch": 3.9953339269813, + "grad_norm": 0.5401079654693604, + "learning_rate": 1.0185778347213326e-06, + "loss": 2.0377, + "step": 14020 + }, + { + "epoch": 3.9981834372217273, + "grad_norm": 0.5096141695976257, + "learning_rate": 1.012171684817425e-06, + "loss": 2.0088, + "step": 14030 + }, + { + "epoch": 4.001032947462155, + "grad_norm": 0.5210298895835876, + "learning_rate": 1.005765534913517e-06, + "loss": 2.0331, + "step": 14040 + }, + { + "epoch": 4.003882457702582, + "grad_norm": 0.47449570894241333, + "learning_rate": 9.993593850096094e-07, + "loss": 2.0141, + "step": 14050 + }, + { + "epoch": 4.00673196794301, + "grad_norm": 0.5214652419090271, + "learning_rate": 9.929532351057016e-07, + "loss": 2.0099, + "step": 14060 + }, + { + "epoch": 4.009581478183438, + "grad_norm": 0.4921712875366211, + "learning_rate": 9.865470852017938e-07, + "loss": 2.0109, + "step": 14070 + }, + { + "epoch": 4.012430988423865, + "grad_norm": 0.4758373200893402, + "learning_rate": 9.801409352978861e-07, + "loss": 2.0104, + "step": 14080 + }, + { + "epoch": 4.015280498664292, + "grad_norm": 0.5070475935935974, + "learning_rate": 9.737347853939783e-07, + "loss": 2.0215, + "step": 14090 + }, + { + "epoch": 4.01813000890472, + "grad_norm": 0.5147979855537415, + "learning_rate": 9.673286354900707e-07, + "loss": 1.9953, + "step": 14100 + }, + { + "epoch": 4.01813000890472, + "eval_loss": 2.0437448024749756, + "eval_runtime": 99.1205, + "eval_samples_per_second": 10.089, + "eval_steps_per_second": 5.044, + "step": 14100 + }, + { + "epoch": 4.020979519145147, + "grad_norm": 0.5636800527572632, + "learning_rate": 9.609224855861628e-07, + "loss": 2.0432, + "step": 14110 + }, + { + "epoch": 4.023829029385574, + "grad_norm": 0.48588666319847107, + "learning_rate": 9.545163356822552e-07, + "loss": 2.0504, + "step": 14120 + }, + { + "epoch": 4.026678539626002, + "grad_norm": 0.5209534764289856, + "learning_rate": 9.481101857783473e-07, + "loss": 2.0058, + "step": 14130 + }, + { + "epoch": 4.029528049866429, + "grad_norm": 0.4877401888370514, + "learning_rate": 9.417040358744395e-07, + "loss": 2.0261, + "step": 14140 + }, + { + "epoch": 4.032377560106856, + "grad_norm": 0.5681753754615784, + "learning_rate": 9.352978859705319e-07, + "loss": 2.0174, + "step": 14150 + }, + { + "epoch": 4.035227070347284, + "grad_norm": 0.5463777184486389, + "learning_rate": 9.28891736066624e-07, + "loss": 2.0013, + "step": 14160 + }, + { + "epoch": 4.0380765805877115, + "grad_norm": 0.4704899489879608, + "learning_rate": 9.224855861627164e-07, + "loss": 2.0138, + "step": 14170 + }, + { + "epoch": 4.040926090828139, + "grad_norm": 0.5133561491966248, + "learning_rate": 9.160794362588085e-07, + "loss": 2.0229, + "step": 14180 + }, + { + "epoch": 4.043775601068567, + "grad_norm": 0.5174084901809692, + "learning_rate": 9.096732863549008e-07, + "loss": 2.0311, + "step": 14190 + }, + { + "epoch": 4.046625111308994, + "grad_norm": 0.49283725023269653, + "learning_rate": 9.032671364509931e-07, + "loss": 2.0092, + "step": 14200 + }, + { + "epoch": 4.046625111308994, + "eval_loss": 2.043433666229248, + "eval_runtime": 99.0743, + "eval_samples_per_second": 10.093, + "eval_steps_per_second": 5.047, + "step": 14200 + }, + { + "epoch": 4.049474621549421, + "grad_norm": 0.5019715428352356, + "learning_rate": 8.968609865470852e-07, + "loss": 2.0344, + "step": 14210 + }, + { + "epoch": 4.052324131789849, + "grad_norm": 0.5088763236999512, + "learning_rate": 8.904548366431776e-07, + "loss": 2.0216, + "step": 14220 + }, + { + "epoch": 4.055173642030276, + "grad_norm": 0.5045077800750732, + "learning_rate": 8.840486867392697e-07, + "loss": 2.0279, + "step": 14230 + }, + { + "epoch": 4.058023152270703, + "grad_norm": 0.5076019763946533, + "learning_rate": 8.776425368353621e-07, + "loss": 2.0421, + "step": 14240 + }, + { + "epoch": 4.060872662511131, + "grad_norm": 0.49383342266082764, + "learning_rate": 8.712363869314543e-07, + "loss": 2.018, + "step": 14250 + }, + { + "epoch": 4.063722172751558, + "grad_norm": 0.4705515503883362, + "learning_rate": 8.648302370275464e-07, + "loss": 2.0218, + "step": 14260 + }, + { + "epoch": 4.066571682991985, + "grad_norm": 0.5173514485359192, + "learning_rate": 8.584240871236388e-07, + "loss": 2.0092, + "step": 14270 + }, + { + "epoch": 4.069421193232413, + "grad_norm": 0.4916168451309204, + "learning_rate": 8.520179372197309e-07, + "loss": 2.0169, + "step": 14280 + }, + { + "epoch": 4.0722707034728405, + "grad_norm": 0.4872733950614929, + "learning_rate": 8.456117873158233e-07, + "loss": 2.0068, + "step": 14290 + }, + { + "epoch": 4.075120213713268, + "grad_norm": 0.5647644400596619, + "learning_rate": 8.392056374119155e-07, + "loss": 2.0008, + "step": 14300 + }, + { + "epoch": 4.075120213713268, + "eval_loss": 2.043240785598755, + "eval_runtime": 99.2058, + "eval_samples_per_second": 10.08, + "eval_steps_per_second": 5.04, + "step": 14300 + }, + { + "epoch": 4.077969723953696, + "grad_norm": 0.5437302589416504, + "learning_rate": 8.327994875080078e-07, + "loss": 2.0172, + "step": 14310 + }, + { + "epoch": 4.080819234194123, + "grad_norm": 0.5404428243637085, + "learning_rate": 8.263933376041e-07, + "loss": 2.0339, + "step": 14320 + }, + { + "epoch": 4.083668744434551, + "grad_norm": 0.5169433951377869, + "learning_rate": 8.199871877001921e-07, + "loss": 2.03, + "step": 14330 + }, + { + "epoch": 4.086518254674978, + "grad_norm": 0.48950499296188354, + "learning_rate": 8.135810377962845e-07, + "loss": 2.0209, + "step": 14340 + }, + { + "epoch": 4.089367764915405, + "grad_norm": 0.48941537737846375, + "learning_rate": 8.071748878923767e-07, + "loss": 2.0278, + "step": 14350 + }, + { + "epoch": 4.092217275155833, + "grad_norm": 0.4822404682636261, + "learning_rate": 8.00768737988469e-07, + "loss": 1.9925, + "step": 14360 + }, + { + "epoch": 4.09506678539626, + "grad_norm": 0.5627114176750183, + "learning_rate": 7.943625880845612e-07, + "loss": 2.0017, + "step": 14370 + }, + { + "epoch": 4.097916295636687, + "grad_norm": 0.5145583152770996, + "learning_rate": 7.879564381806535e-07, + "loss": 2.014, + "step": 14380 + }, + { + "epoch": 4.100765805877115, + "grad_norm": 0.476077139377594, + "learning_rate": 7.815502882767457e-07, + "loss": 2.0253, + "step": 14390 + }, + { + "epoch": 4.103615316117542, + "grad_norm": 0.4902302026748657, + "learning_rate": 7.75144138372838e-07, + "loss": 2.0256, + "step": 14400 + }, + { + "epoch": 4.103615316117542, + "eval_loss": 2.0430707931518555, + "eval_runtime": 99.116, + "eval_samples_per_second": 10.089, + "eval_steps_per_second": 5.045, + "step": 14400 + }, + { + "epoch": 4.1064648263579695, + "grad_norm": 0.4917190670967102, + "learning_rate": 7.687379884689302e-07, + "loss": 2.0237, + "step": 14410 + }, + { + "epoch": 4.1093143365983975, + "grad_norm": 0.48008298873901367, + "learning_rate": 7.623318385650225e-07, + "loss": 2.0297, + "step": 14420 + }, + { + "epoch": 4.112163846838825, + "grad_norm": 0.4533262252807617, + "learning_rate": 7.559256886611148e-07, + "loss": 2.0198, + "step": 14430 + }, + { + "epoch": 4.115013357079252, + "grad_norm": 0.48702970147132874, + "learning_rate": 7.49519538757207e-07, + "loss": 2.0298, + "step": 14440 + }, + { + "epoch": 4.11786286731968, + "grad_norm": 0.48160573840141296, + "learning_rate": 7.431133888532992e-07, + "loss": 2.0452, + "step": 14450 + }, + { + "epoch": 4.120712377560107, + "grad_norm": 0.48124462366104126, + "learning_rate": 7.367072389493914e-07, + "loss": 2.0156, + "step": 14460 + }, + { + "epoch": 4.123561887800534, + "grad_norm": 0.48351117968559265, + "learning_rate": 7.303010890454837e-07, + "loss": 2.0027, + "step": 14470 + }, + { + "epoch": 4.126411398040962, + "grad_norm": 0.5064377784729004, + "learning_rate": 7.23894939141576e-07, + "loss": 2.0332, + "step": 14480 + }, + { + "epoch": 4.129260908281389, + "grad_norm": 0.4717184901237488, + "learning_rate": 7.174887892376682e-07, + "loss": 2.0374, + "step": 14490 + }, + { + "epoch": 4.132110418521816, + "grad_norm": 0.507052481174469, + "learning_rate": 7.110826393337604e-07, + "loss": 2.0335, + "step": 14500 + }, + { + "epoch": 4.132110418521816, + "eval_loss": 2.0429084300994873, + "eval_runtime": 99.1818, + "eval_samples_per_second": 10.082, + "eval_steps_per_second": 5.041, + "step": 14500 + }, + { + "epoch": 4.134959928762244, + "grad_norm": 0.46373996138572693, + "learning_rate": 7.046764894298528e-07, + "loss": 2.04, + "step": 14510 + }, + { + "epoch": 4.137809439002671, + "grad_norm": 0.500369131565094, + "learning_rate": 6.982703395259449e-07, + "loss": 1.9717, + "step": 14520 + }, + { + "epoch": 4.1406589492430985, + "grad_norm": 0.47627460956573486, + "learning_rate": 6.918641896220373e-07, + "loss": 2.0219, + "step": 14530 + }, + { + "epoch": 4.1435084594835265, + "grad_norm": 0.4577222764492035, + "learning_rate": 6.854580397181294e-07, + "loss": 1.9991, + "step": 14540 + }, + { + "epoch": 4.146357969723954, + "grad_norm": 0.4755384027957916, + "learning_rate": 6.790518898142218e-07, + "loss": 2.0254, + "step": 14550 + }, + { + "epoch": 4.149207479964381, + "grad_norm": 0.4971500337123871, + "learning_rate": 6.72645739910314e-07, + "loss": 2.0144, + "step": 14560 + }, + { + "epoch": 4.152056990204809, + "grad_norm": 0.4956054091453552, + "learning_rate": 6.662395900064061e-07, + "loss": 2.0501, + "step": 14570 + }, + { + "epoch": 4.154906500445236, + "grad_norm": 0.4786563813686371, + "learning_rate": 6.598334401024985e-07, + "loss": 1.9868, + "step": 14580 + }, + { + "epoch": 4.157756010685663, + "grad_norm": 0.4747292101383209, + "learning_rate": 6.534272901985906e-07, + "loss": 2.0442, + "step": 14590 + }, + { + "epoch": 4.160605520926091, + "grad_norm": 0.46925073862075806, + "learning_rate": 6.47021140294683e-07, + "loss": 2.0345, + "step": 14600 + }, + { + "epoch": 4.160605520926091, + "eval_loss": 2.0427427291870117, + "eval_runtime": 99.2559, + "eval_samples_per_second": 10.075, + "eval_steps_per_second": 5.037, + "step": 14600 + }, + { + "epoch": 4.163455031166518, + "grad_norm": 0.4990875720977783, + "learning_rate": 6.406149903907752e-07, + "loss": 2.0123, + "step": 14610 + }, + { + "epoch": 4.166304541406945, + "grad_norm": 0.46079766750335693, + "learning_rate": 6.342088404868675e-07, + "loss": 2.0184, + "step": 14620 + }, + { + "epoch": 4.169154051647373, + "grad_norm": 0.5010029077529907, + "learning_rate": 6.278026905829597e-07, + "loss": 2.0524, + "step": 14630 + }, + { + "epoch": 4.1720035618878, + "grad_norm": 0.4925595223903656, + "learning_rate": 6.21396540679052e-07, + "loss": 2.0467, + "step": 14640 + }, + { + "epoch": 4.1748530721282275, + "grad_norm": 0.4509677588939667, + "learning_rate": 6.149903907751442e-07, + "loss": 2.0128, + "step": 14650 + }, + { + "epoch": 4.1777025823686555, + "grad_norm": 0.4880792796611786, + "learning_rate": 6.085842408712364e-07, + "loss": 1.9989, + "step": 14660 + }, + { + "epoch": 4.180552092609083, + "grad_norm": 0.4964626133441925, + "learning_rate": 6.021780909673286e-07, + "loss": 2.0136, + "step": 14670 + }, + { + "epoch": 4.183401602849511, + "grad_norm": 0.46766600012779236, + "learning_rate": 5.957719410634209e-07, + "loss": 2.0002, + "step": 14680 + }, + { + "epoch": 4.186251113089938, + "grad_norm": 0.48704707622528076, + "learning_rate": 5.893657911595132e-07, + "loss": 2.0394, + "step": 14690 + }, + { + "epoch": 4.189100623330365, + "grad_norm": 0.5206152200698853, + "learning_rate": 5.829596412556054e-07, + "loss": 2.0207, + "step": 14700 + }, + { + "epoch": 4.189100623330365, + "eval_loss": 2.042611837387085, + "eval_runtime": 99.2584, + "eval_samples_per_second": 10.075, + "eval_steps_per_second": 5.037, + "step": 14700 + }, + { + "epoch": 4.191950133570793, + "grad_norm": 0.4617534875869751, + "learning_rate": 5.765534913516977e-07, + "loss": 2.0204, + "step": 14710 + }, + { + "epoch": 4.19479964381122, + "grad_norm": 0.4623759388923645, + "learning_rate": 5.701473414477899e-07, + "loss": 2.0188, + "step": 14720 + }, + { + "epoch": 4.197649154051647, + "grad_norm": 0.4455204904079437, + "learning_rate": 5.637411915438821e-07, + "loss": 2.0427, + "step": 14730 + }, + { + "epoch": 4.200498664292075, + "grad_norm": 0.5701692700386047, + "learning_rate": 5.573350416399744e-07, + "loss": 2.0028, + "step": 14740 + }, + { + "epoch": 4.203348174532502, + "grad_norm": 0.48934704065322876, + "learning_rate": 5.509288917360666e-07, + "loss": 1.9963, + "step": 14750 + }, + { + "epoch": 4.206197684772929, + "grad_norm": 0.4875647723674774, + "learning_rate": 5.445227418321589e-07, + "loss": 2.0354, + "step": 14760 + }, + { + "epoch": 4.2090471950133574, + "grad_norm": 0.4656578004360199, + "learning_rate": 5.381165919282512e-07, + "loss": 2.006, + "step": 14770 + }, + { + "epoch": 4.211896705253785, + "grad_norm": 0.5381175875663757, + "learning_rate": 5.317104420243434e-07, + "loss": 2.0257, + "step": 14780 + }, + { + "epoch": 4.214746215494212, + "grad_norm": 0.477753609418869, + "learning_rate": 5.253042921204357e-07, + "loss": 2.0294, + "step": 14790 + }, + { + "epoch": 4.21759572573464, + "grad_norm": 0.4912634789943695, + "learning_rate": 5.188981422165279e-07, + "loss": 2.0379, + "step": 14800 + }, + { + "epoch": 4.21759572573464, + "eval_loss": 2.042499542236328, + "eval_runtime": 99.2392, + "eval_samples_per_second": 10.077, + "eval_steps_per_second": 5.038, + "step": 14800 + }, + { + "epoch": 4.220445235975067, + "grad_norm": 0.45377182960510254, + "learning_rate": 5.124919923126202e-07, + "loss": 2.0226, + "step": 14810 + }, + { + "epoch": 4.223294746215494, + "grad_norm": 0.4863813817501068, + "learning_rate": 5.060858424087125e-07, + "loss": 2.027, + "step": 14820 + }, + { + "epoch": 4.226144256455922, + "grad_norm": 0.4592478573322296, + "learning_rate": 4.996796925048047e-07, + "loss": 2.0243, + "step": 14830 + }, + { + "epoch": 4.228993766696349, + "grad_norm": 0.4891451895236969, + "learning_rate": 4.932735426008969e-07, + "loss": 2.0361, + "step": 14840 + }, + { + "epoch": 4.231843276936776, + "grad_norm": 0.4831571578979492, + "learning_rate": 4.868673926969891e-07, + "loss": 2.0453, + "step": 14850 + }, + { + "epoch": 4.234692787177204, + "grad_norm": 0.46735844016075134, + "learning_rate": 4.804612427930814e-07, + "loss": 2.0351, + "step": 14860 + }, + { + "epoch": 4.237542297417631, + "grad_norm": 0.5098645091056824, + "learning_rate": 4.7405509288917367e-07, + "loss": 2.0323, + "step": 14870 + }, + { + "epoch": 4.2403918076580585, + "grad_norm": 0.4800586700439453, + "learning_rate": 4.6764894298526593e-07, + "loss": 2.0298, + "step": 14880 + }, + { + "epoch": 4.2432413178984865, + "grad_norm": 0.4520800709724426, + "learning_rate": 4.612427930813582e-07, + "loss": 2.0181, + "step": 14890 + }, + { + "epoch": 4.246090828138914, + "grad_norm": 0.5034991502761841, + "learning_rate": 4.548366431774504e-07, + "loss": 2.0216, + "step": 14900 + }, + { + "epoch": 4.246090828138914, + "eval_loss": 2.0423622131347656, + "eval_runtime": 99.1542, + "eval_samples_per_second": 10.085, + "eval_steps_per_second": 5.043, + "step": 14900 + }, + { + "epoch": 4.248940338379341, + "grad_norm": 0.456511527299881, + "learning_rate": 4.484304932735426e-07, + "loss": 1.9933, + "step": 14910 + }, + { + "epoch": 4.251789848619769, + "grad_norm": 0.4512493908405304, + "learning_rate": 4.4202434336963487e-07, + "loss": 2.0269, + "step": 14920 + }, + { + "epoch": 4.254639358860196, + "grad_norm": 0.49248650670051575, + "learning_rate": 4.3561819346572713e-07, + "loss": 1.9984, + "step": 14930 + }, + { + "epoch": 4.257488869100623, + "grad_norm": 0.46570587158203125, + "learning_rate": 4.292120435618194e-07, + "loss": 2.0029, + "step": 14940 + }, + { + "epoch": 4.260338379341051, + "grad_norm": 0.5560067892074585, + "learning_rate": 4.2280589365791166e-07, + "loss": 2.0418, + "step": 14950 + }, + { + "epoch": 4.263187889581478, + "grad_norm": 0.48884886503219604, + "learning_rate": 4.163997437540039e-07, + "loss": 1.9966, + "step": 14960 + }, + { + "epoch": 4.266037399821905, + "grad_norm": 0.4981346130371094, + "learning_rate": 4.099935938500961e-07, + "loss": 2.0336, + "step": 14970 + }, + { + "epoch": 4.268886910062333, + "grad_norm": 0.48836010694503784, + "learning_rate": 4.0358744394618834e-07, + "loss": 2.0336, + "step": 14980 + }, + { + "epoch": 4.27173642030276, + "grad_norm": 0.5348659753799438, + "learning_rate": 3.971812940422806e-07, + "loss": 2.0482, + "step": 14990 + }, + { + "epoch": 4.2745859305431875, + "grad_norm": 0.49484753608703613, + "learning_rate": 3.9077514413837286e-07, + "loss": 2.0119, + "step": 15000 + }, + { + "epoch": 4.2745859305431875, + "eval_loss": 2.042267084121704, + "eval_runtime": 99.4064, + "eval_samples_per_second": 10.06, + "eval_steps_per_second": 5.03, + "step": 15000 + }, + { + "epoch": 4.2774354407836155, + "grad_norm": 0.4720345139503479, + "learning_rate": 3.843689942344651e-07, + "loss": 2.0184, + "step": 15010 + }, + { + "epoch": 4.280284951024043, + "grad_norm": 0.47287288308143616, + "learning_rate": 3.779628443305574e-07, + "loss": 2.0244, + "step": 15020 + }, + { + "epoch": 4.283134461264471, + "grad_norm": 0.45684701204299927, + "learning_rate": 3.715566944266496e-07, + "loss": 2.0464, + "step": 15030 + }, + { + "epoch": 4.285983971504898, + "grad_norm": 0.4994913339614868, + "learning_rate": 3.6515054452274186e-07, + "loss": 2.023, + "step": 15040 + }, + { + "epoch": 4.288833481745325, + "grad_norm": 0.4420747756958008, + "learning_rate": 3.587443946188341e-07, + "loss": 2.0245, + "step": 15050 + }, + { + "epoch": 4.291682991985752, + "grad_norm": 0.4551266133785248, + "learning_rate": 3.523382447149264e-07, + "loss": 1.9831, + "step": 15060 + }, + { + "epoch": 4.29453250222618, + "grad_norm": 0.4731452763080597, + "learning_rate": 3.4593209481101864e-07, + "loss": 2.0185, + "step": 15070 + }, + { + "epoch": 4.297382012466607, + "grad_norm": 0.45530498027801514, + "learning_rate": 3.395259449071109e-07, + "loss": 2.0228, + "step": 15080 + }, + { + "epoch": 4.300231522707035, + "grad_norm": 0.44847631454467773, + "learning_rate": 3.3311979500320306e-07, + "loss": 2.0489, + "step": 15090 + }, + { + "epoch": 4.303081032947462, + "grad_norm": 0.5331618189811707, + "learning_rate": 3.267136450992953e-07, + "loss": 2.0163, + "step": 15100 + }, + { + "epoch": 4.303081032947462, + "eval_loss": 2.042123317718506, + "eval_runtime": 99.3275, + "eval_samples_per_second": 10.068, + "eval_steps_per_second": 5.034, + "step": 15100 + }, + { + "epoch": 4.305930543187889, + "grad_norm": 0.46870335936546326, + "learning_rate": 3.203074951953876e-07, + "loss": 2.0225, + "step": 15110 + }, + { + "epoch": 4.308780053428317, + "grad_norm": 0.49238336086273193, + "learning_rate": 3.1390134529147985e-07, + "loss": 2.0763, + "step": 15120 + }, + { + "epoch": 4.3116295636687445, + "grad_norm": 0.487042635679245, + "learning_rate": 3.074951953875721e-07, + "loss": 2.044, + "step": 15130 + }, + { + "epoch": 4.314479073909172, + "grad_norm": 0.4582352042198181, + "learning_rate": 3.010890454836643e-07, + "loss": 2.0615, + "step": 15140 + }, + { + "epoch": 4.3173285841496, + "grad_norm": 0.47779443860054016, + "learning_rate": 2.946828955797566e-07, + "loss": 2.0154, + "step": 15150 + }, + { + "epoch": 4.320178094390027, + "grad_norm": 0.48991677165031433, + "learning_rate": 2.8827674567584884e-07, + "loss": 2.0161, + "step": 15160 + }, + { + "epoch": 4.323027604630454, + "grad_norm": 0.49436208605766296, + "learning_rate": 2.8187059577194105e-07, + "loss": 2.0358, + "step": 15170 + }, + { + "epoch": 4.325877114870882, + "grad_norm": 0.4528005123138428, + "learning_rate": 2.754644458680333e-07, + "loss": 2.0148, + "step": 15180 + }, + { + "epoch": 4.328726625111309, + "grad_norm": 0.4622327983379364, + "learning_rate": 2.690582959641256e-07, + "loss": 2.0151, + "step": 15190 + }, + { + "epoch": 4.331576135351736, + "grad_norm": 0.4738125503063202, + "learning_rate": 2.6265214606021784e-07, + "loss": 2.0272, + "step": 15200 + }, + { + "epoch": 4.331576135351736, + "eval_loss": 2.0419702529907227, + "eval_runtime": 99.4017, + "eval_samples_per_second": 10.06, + "eval_steps_per_second": 5.03, + "step": 15200 + }, + { + "epoch": 4.334425645592164, + "grad_norm": 0.4631751477718353, + "learning_rate": 2.562459961563101e-07, + "loss": 2.0151, + "step": 15210 + }, + { + "epoch": 4.337275155832591, + "grad_norm": 0.49889975786209106, + "learning_rate": 2.4983984625240236e-07, + "loss": 2.0478, + "step": 15220 + }, + { + "epoch": 4.340124666073018, + "grad_norm": 0.45168715715408325, + "learning_rate": 2.4343369634849457e-07, + "loss": 2.0292, + "step": 15230 + }, + { + "epoch": 4.342974176313446, + "grad_norm": 0.5072616934776306, + "learning_rate": 2.3702754644458683e-07, + "loss": 2.0183, + "step": 15240 + }, + { + "epoch": 4.3458236865538735, + "grad_norm": 0.5032421946525574, + "learning_rate": 2.306213965406791e-07, + "loss": 2.0098, + "step": 15250 + }, + { + "epoch": 4.348673196794301, + "grad_norm": 0.4905742406845093, + "learning_rate": 2.242152466367713e-07, + "loss": 2.0349, + "step": 15260 + }, + { + "epoch": 4.351522707034729, + "grad_norm": 0.5213619470596313, + "learning_rate": 2.1780909673286357e-07, + "loss": 2.0246, + "step": 15270 + }, + { + "epoch": 4.354372217275156, + "grad_norm": 0.47906991839408875, + "learning_rate": 2.1140294682895583e-07, + "loss": 2.0212, + "step": 15280 + }, + { + "epoch": 4.357221727515583, + "grad_norm": 0.4957728087902069, + "learning_rate": 2.0499679692504804e-07, + "loss": 2.021, + "step": 15290 + }, + { + "epoch": 4.360071237756011, + "grad_norm": 0.44690993428230286, + "learning_rate": 1.985906470211403e-07, + "loss": 2.0376, + "step": 15300 + }, + { + "epoch": 4.360071237756011, + "eval_loss": 2.041898488998413, + "eval_runtime": 99.5154, + "eval_samples_per_second": 10.049, + "eval_steps_per_second": 5.024, + "step": 15300 + }, + { + "epoch": 4.362920747996438, + "grad_norm": 0.446969598531723, + "learning_rate": 1.9218449711723256e-07, + "loss": 2.0322, + "step": 15310 + }, + { + "epoch": 4.365770258236865, + "grad_norm": 0.4475710690021515, + "learning_rate": 1.857783472133248e-07, + "loss": 2.0318, + "step": 15320 + }, + { + "epoch": 4.368619768477293, + "grad_norm": 0.45626747608184814, + "learning_rate": 1.7937219730941706e-07, + "loss": 2.0228, + "step": 15330 + }, + { + "epoch": 4.37146927871772, + "grad_norm": 0.42936715483665466, + "learning_rate": 1.7296604740550932e-07, + "loss": 2.0457, + "step": 15340 + }, + { + "epoch": 4.374318788958147, + "grad_norm": 0.43782496452331543, + "learning_rate": 1.6655989750160153e-07, + "loss": 1.9622, + "step": 15350 + }, + { + "epoch": 4.377168299198575, + "grad_norm": 0.48435845971107483, + "learning_rate": 1.601537475976938e-07, + "loss": 2.0286, + "step": 15360 + }, + { + "epoch": 4.3800178094390025, + "grad_norm": 0.47565901279449463, + "learning_rate": 1.5374759769378605e-07, + "loss": 2.0014, + "step": 15370 + }, + { + "epoch": 4.3828673196794306, + "grad_norm": 0.4597352147102356, + "learning_rate": 1.473414477898783e-07, + "loss": 2.0069, + "step": 15380 + }, + { + "epoch": 4.385716829919858, + "grad_norm": 0.4486582279205322, + "learning_rate": 1.4093529788597053e-07, + "loss": 2.0413, + "step": 15390 + }, + { + "epoch": 4.388566340160285, + "grad_norm": 0.455665647983551, + "learning_rate": 1.345291479820628e-07, + "loss": 2.0162, + "step": 15400 + }, + { + "epoch": 4.388566340160285, + "eval_loss": 2.0417721271514893, + "eval_runtime": 99.1826, + "eval_samples_per_second": 10.082, + "eval_steps_per_second": 5.041, + "step": 15400 + }, + { + "epoch": 4.391415850400712, + "grad_norm": 0.45837944746017456, + "learning_rate": 1.2812299807815505e-07, + "loss": 1.9732, + "step": 15410 + }, + { + "epoch": 4.39426536064114, + "grad_norm": 0.4252510368824005, + "learning_rate": 1.2171684817424729e-07, + "loss": 2.0138, + "step": 15420 + }, + { + "epoch": 4.397114870881567, + "grad_norm": 0.49600866436958313, + "learning_rate": 1.1531069827033955e-07, + "loss": 2.0193, + "step": 15430 + }, + { + "epoch": 4.399964381121995, + "grad_norm": 0.42285075783729553, + "learning_rate": 1.0890454836643178e-07, + "loss": 2.0225, + "step": 15440 + }, + { + "epoch": 4.402813891362422, + "grad_norm": 0.5210908055305481, + "learning_rate": 1.0249839846252402e-07, + "loss": 2.0294, + "step": 15450 + }, + { + "epoch": 4.405663401602849, + "grad_norm": 0.448207825422287, + "learning_rate": 9.609224855861628e-08, + "loss": 1.9902, + "step": 15460 + }, + { + "epoch": 4.408512911843277, + "grad_norm": 0.45898616313934326, + "learning_rate": 8.968609865470853e-08, + "loss": 2.0227, + "step": 15470 + }, + { + "epoch": 4.411362422083704, + "grad_norm": 0.4249895215034485, + "learning_rate": 8.327994875080077e-08, + "loss": 2.0255, + "step": 15480 + }, + { + "epoch": 4.414211932324132, + "grad_norm": 0.44755247235298157, + "learning_rate": 7.687379884689303e-08, + "loss": 2.0284, + "step": 15490 + }, + { + "epoch": 4.41706144256456, + "grad_norm": 0.467061847448349, + "learning_rate": 7.046764894298526e-08, + "loss": 2.0549, + "step": 15500 + }, + { + "epoch": 4.41706144256456, + "eval_loss": 2.0417046546936035, + "eval_runtime": 99.2091, + "eval_samples_per_second": 10.08, + "eval_steps_per_second": 5.04, + "step": 15500 + }, + { + "epoch": 4.419910952804987, + "grad_norm": 0.4667827785015106, + "learning_rate": 6.406149903907752e-08, + "loss": 2.0094, + "step": 15510 + }, + { + "epoch": 4.422760463045414, + "grad_norm": 0.4421955943107605, + "learning_rate": 5.7655349135169774e-08, + "loss": 2.0206, + "step": 15520 + }, + { + "epoch": 4.425609973285842, + "grad_norm": 0.4796900749206543, + "learning_rate": 5.124919923126201e-08, + "loss": 2.0234, + "step": 15530 + }, + { + "epoch": 4.428459483526269, + "grad_norm": 0.46110549569129944, + "learning_rate": 4.4843049327354265e-08, + "loss": 2.0719, + "step": 15540 + }, + { + "epoch": 4.431308993766696, + "grad_norm": 0.43615448474884033, + "learning_rate": 3.8436899423446514e-08, + "loss": 2.004, + "step": 15550 + }, + { + "epoch": 4.434158504007124, + "grad_norm": 0.4552094340324402, + "learning_rate": 3.203074951953876e-08, + "loss": 2.018, + "step": 15560 + }, + { + "epoch": 4.437008014247551, + "grad_norm": 0.4268529415130615, + "learning_rate": 2.5624599615631005e-08, + "loss": 2.0226, + "step": 15570 + }, + { + "epoch": 4.439857524487978, + "grad_norm": 0.4385978579521179, + "learning_rate": 1.9218449711723257e-08, + "loss": 2.0217, + "step": 15580 + }, + { + "epoch": 4.442707034728406, + "grad_norm": 0.48627611994743347, + "learning_rate": 1.2812299807815502e-08, + "loss": 2.0048, + "step": 15590 + }, + { + "epoch": 4.4455565449688335, + "grad_norm": 0.42732298374176025, + "learning_rate": 6.406149903907751e-09, + "loss": 2.0082, + "step": 15600 + }, + { + "epoch": 4.4455565449688335, + "eval_loss": 2.0416414737701416, + "eval_runtime": 99.0939, + "eval_samples_per_second": 10.091, + "eval_steps_per_second": 5.046, + "step": 15600 + }, + { + "epoch": 4.448406055209261, + "grad_norm": 0.4420681297779083, + "learning_rate": 0.0, + "loss": 2.0069, + "step": 15610 + }, + { + "epoch": 0.99968, + "grad_norm": 0.7863059639930725, + "learning_rate": 4.999199615815592e-06, + "loss": 2.0359, + "step": 15620 + }, + { + "epoch": 1.00032, + "grad_norm": 0.8261616230010986, + "learning_rate": 4.995998079077958e-06, + "loss": 2.0272, + "step": 15630 + }, + { + "epoch": 1.00096, + "grad_norm": 1.2386047840118408, + "learning_rate": 4.992796542340324e-06, + "loss": 2.0275, + "step": 15640 + }, + { + "epoch": 1.0016, + "grad_norm": 1.005250096321106, + "learning_rate": 4.98959500560269e-06, + "loss": 2.0273, + "step": 15650 + }, + { + "epoch": 1.00224, + "grad_norm": 0.7347936630249023, + "learning_rate": 4.986393468865056e-06, + "loss": 2.0332, + "step": 15660 + }, + { + "epoch": 1.00288, + "grad_norm": 0.8586557507514954, + "learning_rate": 4.983191932127422e-06, + "loss": 2.0431, + "step": 15670 + }, + { + "epoch": 1.00352, + "grad_norm": 0.66364985704422, + "learning_rate": 4.979990395389787e-06, + "loss": 2.0106, + "step": 15680 + }, + { + "epoch": 1.00416, + "grad_norm": 0.6150959134101868, + "learning_rate": 4.976788858652153e-06, + "loss": 2.0459, + "step": 15690 + }, + { + "epoch": 1.0048, + "grad_norm": 0.7407456636428833, + "learning_rate": 4.973587321914519e-06, + "loss": 2.0127, + "step": 15700 + }, + { + "epoch": 1.0048, + "eval_loss": 2.050377607345581, + "eval_runtime": 99.449, + "eval_samples_per_second": 10.055, + "eval_steps_per_second": 5.028, + "step": 15700 + }, + { + "epoch": 1.0054400000000001, + "grad_norm": 0.6863277554512024, + "learning_rate": 4.970385785176885e-06, + "loss": 2.0219, + "step": 15710 + }, + { + "epoch": 1.00608, + "grad_norm": 0.736400842666626, + "learning_rate": 4.967184248439251e-06, + "loss": 2.0259, + "step": 15720 + }, + { + "epoch": 1.00672, + "grad_norm": 0.7324177026748657, + "learning_rate": 4.963982711701617e-06, + "loss": 2.0163, + "step": 15730 + }, + { + "epoch": 1.00736, + "grad_norm": 0.7272530198097229, + "learning_rate": 4.960781174963983e-06, + "loss": 2.0265, + "step": 15740 + }, + { + "epoch": 1.008, + "grad_norm": 0.682123601436615, + "learning_rate": 4.957579638226349e-06, + "loss": 2.0479, + "step": 15750 + }, + { + "epoch": 1.00864, + "grad_norm": 0.774432897567749, + "learning_rate": 4.954378101488715e-06, + "loss": 2.0273, + "step": 15760 + }, + { + "epoch": 1.00928, + "grad_norm": 0.6753953695297241, + "learning_rate": 4.9511765647510805e-06, + "loss": 2.0358, + "step": 15770 + }, + { + "epoch": 1.00992, + "grad_norm": 0.7278711795806885, + "learning_rate": 4.947975028013447e-06, + "loss": 2.0327, + "step": 15780 + }, + { + "epoch": 1.01056, + "grad_norm": 0.6097889542579651, + "learning_rate": 4.944773491275813e-06, + "loss": 2.0118, + "step": 15790 + }, + { + "epoch": 1.0112, + "grad_norm": 0.9159688353538513, + "learning_rate": 4.9415719545381784e-06, + "loss": 2.0241, + "step": 15800 + }, + { + "epoch": 1.0112, + "eval_loss": 2.049624443054199, + "eval_runtime": 99.2626, + "eval_samples_per_second": 10.074, + "eval_steps_per_second": 5.037, + "step": 15800 + }, + { + "epoch": 1.01184, + "grad_norm": 0.689122200012207, + "learning_rate": 4.938370417800545e-06, + "loss": 2.0432, + "step": 15810 + }, + { + "epoch": 1.01248, + "grad_norm": 0.733984112739563, + "learning_rate": 4.935168881062911e-06, + "loss": 2.0437, + "step": 15820 + }, + { + "epoch": 1.01312, + "grad_norm": 0.6271092295646667, + "learning_rate": 4.931967344325276e-06, + "loss": 2.044, + "step": 15830 + }, + { + "epoch": 1.01376, + "grad_norm": 0.6616642475128174, + "learning_rate": 4.928765807587643e-06, + "loss": 2.015, + "step": 15840 + }, + { + "epoch": 1.0144, + "grad_norm": 0.6139689087867737, + "learning_rate": 4.925564270850009e-06, + "loss": 2.0044, + "step": 15850 + }, + { + "epoch": 1.01504, + "grad_norm": 0.6150696873664856, + "learning_rate": 4.922362734112374e-06, + "loss": 2.0086, + "step": 15860 + }, + { + "epoch": 1.01568, + "grad_norm": 0.6278733611106873, + "learning_rate": 4.91916119737474e-06, + "loss": 2.0221, + "step": 15870 + }, + { + "epoch": 1.01632, + "grad_norm": 0.6292057633399963, + "learning_rate": 4.915959660637107e-06, + "loss": 2.0128, + "step": 15880 + }, + { + "epoch": 1.01696, + "grad_norm": 0.6009802222251892, + "learning_rate": 4.912758123899472e-06, + "loss": 2.0146, + "step": 15890 + }, + { + "epoch": 1.0176, + "grad_norm": 0.736315131187439, + "learning_rate": 4.909556587161838e-06, + "loss": 2.0409, + "step": 15900 + }, + { + "epoch": 1.0176, + "eval_loss": 2.049333095550537, + "eval_runtime": 99.3102, + "eval_samples_per_second": 10.069, + "eval_steps_per_second": 5.035, + "step": 15900 + }, + { + "epoch": 1.01824, + "grad_norm": 0.7083340287208557, + "learning_rate": 4.906355050424204e-06, + "loss": 2.0488, + "step": 15910 + }, + { + "epoch": 1.01888, + "grad_norm": 0.7024874687194824, + "learning_rate": 4.9031535136865695e-06, + "loss": 2.05, + "step": 15920 + }, + { + "epoch": 1.01952, + "grad_norm": 0.60423344373703, + "learning_rate": 4.899951976948936e-06, + "loss": 1.9965, + "step": 15930 + }, + { + "epoch": 1.02016, + "grad_norm": 0.709780216217041, + "learning_rate": 4.896750440211302e-06, + "loss": 2.0311, + "step": 15940 + }, + { + "epoch": 1.0208, + "grad_norm": 0.7139421701431274, + "learning_rate": 4.8935489034736675e-06, + "loss": 2.0122, + "step": 15950 + }, + { + "epoch": 1.02144, + "grad_norm": 0.6455684304237366, + "learning_rate": 4.890347366736033e-06, + "loss": 2.0438, + "step": 15960 + }, + { + "epoch": 1.02208, + "grad_norm": 0.6643734574317932, + "learning_rate": 4.8871458299984e-06, + "loss": 2.0006, + "step": 15970 + }, + { + "epoch": 1.02272, + "grad_norm": 0.8344186544418335, + "learning_rate": 4.8839442932607654e-06, + "loss": 2.0017, + "step": 15980 + }, + { + "epoch": 1.02336, + "grad_norm": 0.6340540647506714, + "learning_rate": 4.880742756523131e-06, + "loss": 2.0021, + "step": 15990 + }, + { + "epoch": 1.024, + "grad_norm": 0.6304895281791687, + "learning_rate": 4.877541219785498e-06, + "loss": 2.0278, + "step": 16000 + }, + { + "epoch": 1.024, + "eval_loss": 2.0487542152404785, + "eval_runtime": 99.1634, + "eval_samples_per_second": 10.084, + "eval_steps_per_second": 5.042, + "step": 16000 + }, + { + "epoch": 1.02464, + "grad_norm": 0.5626567006111145, + "learning_rate": 4.874339683047863e-06, + "loss": 2.0197, + "step": 16010 + }, + { + "epoch": 1.02528, + "grad_norm": 0.5974437594413757, + "learning_rate": 4.871138146310229e-06, + "loss": 2.0158, + "step": 16020 + }, + { + "epoch": 1.02592, + "grad_norm": 0.5794137716293335, + "learning_rate": 4.867936609572596e-06, + "loss": 2.0045, + "step": 16030 + }, + { + "epoch": 1.02656, + "grad_norm": 0.6216360926628113, + "learning_rate": 4.864735072834961e-06, + "loss": 2.0391, + "step": 16040 + }, + { + "epoch": 1.0272, + "grad_norm": 0.7104542255401611, + "learning_rate": 4.861533536097327e-06, + "loss": 2.0457, + "step": 16050 + }, + { + "epoch": 1.02784, + "grad_norm": 0.6782556772232056, + "learning_rate": 4.858331999359693e-06, + "loss": 1.9936, + "step": 16060 + }, + { + "epoch": 1.02848, + "grad_norm": 0.7673866748809814, + "learning_rate": 4.855130462622059e-06, + "loss": 2.0346, + "step": 16070 + }, + { + "epoch": 1.02912, + "grad_norm": 0.6345120668411255, + "learning_rate": 4.851928925884425e-06, + "loss": 2.049, + "step": 16080 + }, + { + "epoch": 1.02976, + "grad_norm": 0.6022641658782959, + "learning_rate": 4.848727389146791e-06, + "loss": 2.0472, + "step": 16090 + }, + { + "epoch": 1.0304, + "grad_norm": 0.6524453163146973, + "learning_rate": 4.845525852409157e-06, + "loss": 2.0457, + "step": 16100 + }, + { + "epoch": 1.0304, + "eval_loss": 2.049445867538452, + "eval_runtime": 99.2294, + "eval_samples_per_second": 10.078, + "eval_steps_per_second": 5.039, + "step": 16100 + }, + { + "epoch": 1.03104, + "grad_norm": 0.6599230170249939, + "learning_rate": 4.842324315671523e-06, + "loss": 2.041, + "step": 16110 + }, + { + "epoch": 1.03168, + "grad_norm": 0.6208943724632263, + "learning_rate": 4.839122778933889e-06, + "loss": 2.0523, + "step": 16120 + }, + { + "epoch": 1.03232, + "grad_norm": 0.6133295297622681, + "learning_rate": 4.8359212421962545e-06, + "loss": 2.0007, + "step": 16130 + }, + { + "epoch": 1.03296, + "grad_norm": 0.6858235001564026, + "learning_rate": 4.83271970545862e-06, + "loss": 2.0315, + "step": 16140 + }, + { + "epoch": 1.0336, + "grad_norm": 0.6205055117607117, + "learning_rate": 4.829518168720987e-06, + "loss": 2.0114, + "step": 16150 + }, + { + "epoch": 1.03424, + "grad_norm": 0.6582651138305664, + "learning_rate": 4.8263166319833525e-06, + "loss": 2.0176, + "step": 16160 + }, + { + "epoch": 1.03488, + "grad_norm": 0.6905097961425781, + "learning_rate": 4.823115095245718e-06, + "loss": 2.0338, + "step": 16170 + }, + { + "epoch": 1.03552, + "grad_norm": 0.6507758498191833, + "learning_rate": 4.819913558508084e-06, + "loss": 2.0382, + "step": 16180 + }, + { + "epoch": 1.03616, + "grad_norm": 0.6980578899383545, + "learning_rate": 4.81671202177045e-06, + "loss": 2.027, + "step": 16190 + }, + { + "epoch": 1.0368, + "grad_norm": 0.6131721138954163, + "learning_rate": 4.813510485032816e-06, + "loss": 2.0456, + "step": 16200 + }, + { + "epoch": 1.0368, + "eval_loss": 2.048192024230957, + "eval_runtime": 99.2342, + "eval_samples_per_second": 10.077, + "eval_steps_per_second": 5.039, + "step": 16200 + }, + { + "epoch": 1.03744, + "grad_norm": 0.6967636346817017, + "learning_rate": 4.810308948295182e-06, + "loss": 2.0024, + "step": 16210 + }, + { + "epoch": 1.03808, + "grad_norm": 0.6524723172187805, + "learning_rate": 4.8071074115575476e-06, + "loss": 2.0663, + "step": 16220 + }, + { + "epoch": 1.03872, + "grad_norm": 0.7181164026260376, + "learning_rate": 4.803905874819914e-06, + "loss": 2.0482, + "step": 16230 + }, + { + "epoch": 1.03936, + "grad_norm": 0.6536914706230164, + "learning_rate": 4.80070433808228e-06, + "loss": 2.0432, + "step": 16240 + }, + { + "epoch": 1.04, + "grad_norm": 0.682151734828949, + "learning_rate": 4.7975028013446455e-06, + "loss": 2.0246, + "step": 16250 + }, + { + "epoch": 1.04064, + "grad_norm": 0.5902345180511475, + "learning_rate": 4.794301264607012e-06, + "loss": 2.0352, + "step": 16260 + }, + { + "epoch": 1.04128, + "grad_norm": 0.6463891267776489, + "learning_rate": 4.791099727869378e-06, + "loss": 2.0132, + "step": 16270 + }, + { + "epoch": 1.04192, + "grad_norm": 0.5589125752449036, + "learning_rate": 4.7878981911317435e-06, + "loss": 2.0283, + "step": 16280 + }, + { + "epoch": 1.04256, + "grad_norm": 0.6977293491363525, + "learning_rate": 4.78469665439411e-06, + "loss": 2.0297, + "step": 16290 + }, + { + "epoch": 1.0432, + "grad_norm": 0.6409101486206055, + "learning_rate": 4.781495117656476e-06, + "loss": 2.0402, + "step": 16300 + }, + { + "epoch": 1.0432, + "eval_loss": 2.0480778217315674, + "eval_runtime": 99.1509, + "eval_samples_per_second": 10.086, + "eval_steps_per_second": 5.043, + "step": 16300 + }, + { + "epoch": 1.04384, + "grad_norm": 0.8234467506408691, + "learning_rate": 4.7782935809188415e-06, + "loss": 2.0179, + "step": 16310 + }, + { + "epoch": 1.04448, + "grad_norm": 0.6057077050209045, + "learning_rate": 4.775092044181208e-06, + "loss": 2.0022, + "step": 16320 + }, + { + "epoch": 1.04512, + "grad_norm": 0.5955457091331482, + "learning_rate": 4.771890507443574e-06, + "loss": 2.0085, + "step": 16330 + }, + { + "epoch": 1.04576, + "grad_norm": 0.6484575271606445, + "learning_rate": 4.7686889707059395e-06, + "loss": 2.03, + "step": 16340 + }, + { + "epoch": 1.0464, + "grad_norm": 0.7228317856788635, + "learning_rate": 4.765487433968305e-06, + "loss": 2.0155, + "step": 16350 + }, + { + "epoch": 1.04704, + "grad_norm": 0.735862135887146, + "learning_rate": 4.762285897230671e-06, + "loss": 2.0442, + "step": 16360 + }, + { + "epoch": 1.04768, + "grad_norm": 0.7349162697792053, + "learning_rate": 4.759084360493037e-06, + "loss": 2.033, + "step": 16370 + }, + { + "epoch": 1.04832, + "grad_norm": 0.6394819617271423, + "learning_rate": 4.755882823755403e-06, + "loss": 2.0079, + "step": 16380 + }, + { + "epoch": 1.04896, + "grad_norm": 0.6491113901138306, + "learning_rate": 4.752681287017769e-06, + "loss": 2.0325, + "step": 16390 + }, + { + "epoch": 1.0496, + "grad_norm": 0.6450079083442688, + "learning_rate": 4.749479750280135e-06, + "loss": 2.0339, + "step": 16400 + }, + { + "epoch": 1.0496, + "eval_loss": 2.047717332839966, + "eval_runtime": 99.3795, + "eval_samples_per_second": 10.062, + "eval_steps_per_second": 5.031, + "step": 16400 + }, + { + "epoch": 1.05024, + "grad_norm": 0.5671599507331848, + "learning_rate": 4.7462782135425e-06, + "loss": 2.066, + "step": 16410 + }, + { + "epoch": 1.05088, + "grad_norm": 0.6196804046630859, + "learning_rate": 4.743076676804867e-06, + "loss": 2.0442, + "step": 16420 + }, + { + "epoch": 1.05152, + "grad_norm": 0.5986396074295044, + "learning_rate": 4.7398751400672326e-06, + "loss": 2.0049, + "step": 16430 + }, + { + "epoch": 1.05216, + "grad_norm": 0.7346808314323425, + "learning_rate": 4.736673603329598e-06, + "loss": 1.9991, + "step": 16440 + }, + { + "epoch": 1.0528, + "grad_norm": 0.631497859954834, + "learning_rate": 4.733472066591965e-06, + "loss": 2.0414, + "step": 16450 + }, + { + "epoch": 1.05344, + "grad_norm": 0.6475789546966553, + "learning_rate": 4.7302705298543305e-06, + "loss": 2.0289, + "step": 16460 + }, + { + "epoch": 1.05408, + "grad_norm": 0.7186465263366699, + "learning_rate": 4.727068993116696e-06, + "loss": 2.0203, + "step": 16470 + }, + { + "epoch": 1.05472, + "grad_norm": 1.0701770782470703, + "learning_rate": 4.723867456379062e-06, + "loss": 2.0291, + "step": 16480 + }, + { + "epoch": 1.05536, + "grad_norm": 0.7437950968742371, + "learning_rate": 4.7206659196414285e-06, + "loss": 2.0146, + "step": 16490 + }, + { + "epoch": 1.056, + "grad_norm": 0.7150045037269592, + "learning_rate": 4.717464382903794e-06, + "loss": 1.9934, + "step": 16500 + }, + { + "epoch": 1.056, + "eval_loss": 2.0487072467803955, + "eval_runtime": 99.389, + "eval_samples_per_second": 10.061, + "eval_steps_per_second": 5.031, + "step": 16500 + }, + { + "epoch": 1.05664, + "grad_norm": 0.6178598999977112, + "learning_rate": 4.71426284616616e-06, + "loss": 2.0244, + "step": 16510 + }, + { + "epoch": 1.05728, + "grad_norm": 0.6366320848464966, + "learning_rate": 4.7110613094285265e-06, + "loss": 2.0105, + "step": 16520 + }, + { + "epoch": 1.05792, + "grad_norm": 0.5788693428039551, + "learning_rate": 4.707859772690892e-06, + "loss": 2.0542, + "step": 16530 + }, + { + "epoch": 1.05856, + "grad_norm": 0.5591841340065002, + "learning_rate": 4.704658235953258e-06, + "loss": 2.0441, + "step": 16540 + }, + { + "epoch": 1.0592, + "grad_norm": 0.679841935634613, + "learning_rate": 4.7014566992156245e-06, + "loss": 2.0394, + "step": 16550 + }, + { + "epoch": 1.05984, + "grad_norm": 0.6606016755104065, + "learning_rate": 4.69825516247799e-06, + "loss": 2.0118, + "step": 16560 + }, + { + "epoch": 1.06048, + "grad_norm": 0.561490535736084, + "learning_rate": 4.695053625740356e-06, + "loss": 1.9913, + "step": 16570 + }, + { + "epoch": 1.06112, + "grad_norm": 0.7106103897094727, + "learning_rate": 4.691852089002722e-06, + "loss": 2.04, + "step": 16580 + }, + { + "epoch": 1.06176, + "grad_norm": 0.6973317861557007, + "learning_rate": 4.688650552265087e-06, + "loss": 2.0272, + "step": 16590 + }, + { + "epoch": 1.0624, + "grad_norm": 0.6442530155181885, + "learning_rate": 4.685449015527454e-06, + "loss": 2.0355, + "step": 16600 + }, + { + "epoch": 1.0624, + "eval_loss": 2.047421932220459, + "eval_runtime": 99.1752, + "eval_samples_per_second": 10.083, + "eval_steps_per_second": 5.042, + "step": 16600 + }, + { + "epoch": 1.06304, + "grad_norm": 0.6881089806556702, + "learning_rate": 4.6822474787898196e-06, + "loss": 2.0436, + "step": 16610 + }, + { + "epoch": 1.06368, + "grad_norm": 0.6413591504096985, + "learning_rate": 4.679045942052185e-06, + "loss": 2.0438, + "step": 16620 + }, + { + "epoch": 1.06432, + "grad_norm": 0.5700530409812927, + "learning_rate": 4.675844405314551e-06, + "loss": 2.0283, + "step": 16630 + }, + { + "epoch": 1.06496, + "grad_norm": 0.6032475233078003, + "learning_rate": 4.672642868576917e-06, + "loss": 2.0294, + "step": 16640 + }, + { + "epoch": 1.0656, + "grad_norm": 0.6175091862678528, + "learning_rate": 4.669441331839283e-06, + "loss": 2.0272, + "step": 16650 + }, + { + "epoch": 1.06624, + "grad_norm": 0.6341057419776917, + "learning_rate": 4.666239795101649e-06, + "loss": 2.0302, + "step": 16660 + }, + { + "epoch": 1.06688, + "grad_norm": 0.6054055094718933, + "learning_rate": 4.663038258364015e-06, + "loss": 2.0211, + "step": 16670 + }, + { + "epoch": 1.06752, + "grad_norm": 0.5983806848526001, + "learning_rate": 4.659836721626381e-06, + "loss": 2.0211, + "step": 16680 + }, + { + "epoch": 1.06816, + "grad_norm": 0.6351720690727234, + "learning_rate": 4.656635184888747e-06, + "loss": 2.0615, + "step": 16690 + }, + { + "epoch": 1.0688, + "grad_norm": 0.6418411135673523, + "learning_rate": 4.653433648151113e-06, + "loss": 2.0036, + "step": 16700 + }, + { + "epoch": 1.0688, + "eval_loss": 2.0469391345977783, + "eval_runtime": 99.2513, + "eval_samples_per_second": 10.075, + "eval_steps_per_second": 5.038, + "step": 16700 + }, + { + "epoch": 1.06944, + "grad_norm": 0.5719327926635742, + "learning_rate": 4.650232111413479e-06, + "loss": 2.0489, + "step": 16710 + }, + { + "epoch": 1.07008, + "grad_norm": 0.632648229598999, + "learning_rate": 4.647030574675845e-06, + "loss": 2.0198, + "step": 16720 + }, + { + "epoch": 1.0707200000000001, + "grad_norm": 0.5791489481925964, + "learning_rate": 4.643829037938211e-06, + "loss": 2.0042, + "step": 16730 + }, + { + "epoch": 1.07136, + "grad_norm": 0.696101725101471, + "learning_rate": 4.640627501200577e-06, + "loss": 2.0353, + "step": 16740 + }, + { + "epoch": 1.072, + "grad_norm": 0.6109420657157898, + "learning_rate": 4.637425964462943e-06, + "loss": 2.0301, + "step": 16750 + }, + { + "epoch": 1.07264, + "grad_norm": 0.6553672552108765, + "learning_rate": 4.634224427725309e-06, + "loss": 2.0321, + "step": 16760 + }, + { + "epoch": 1.07328, + "grad_norm": 0.5798039436340332, + "learning_rate": 4.631022890987674e-06, + "loss": 2.0336, + "step": 16770 + }, + { + "epoch": 1.07392, + "grad_norm": 0.5964285731315613, + "learning_rate": 4.627821354250041e-06, + "loss": 2.0314, + "step": 16780 + }, + { + "epoch": 1.07456, + "grad_norm": 0.6674721240997314, + "learning_rate": 4.624619817512407e-06, + "loss": 2.0391, + "step": 16790 + }, + { + "epoch": 1.0752, + "grad_norm": 0.5824782848358154, + "learning_rate": 4.621418280774772e-06, + "loss": 2.0247, + "step": 16800 + }, + { + "epoch": 1.0752, + "eval_loss": 2.047208547592163, + "eval_runtime": 99.1467, + "eval_samples_per_second": 10.086, + "eval_steps_per_second": 5.043, + "step": 16800 + }, + { + "epoch": 1.07584, + "grad_norm": 0.6081963181495667, + "learning_rate": 4.618216744037138e-06, + "loss": 2.0421, + "step": 16810 + }, + { + "epoch": 1.07648, + "grad_norm": 0.5982031226158142, + "learning_rate": 4.615015207299504e-06, + "loss": 2.0377, + "step": 16820 + }, + { + "epoch": 1.07712, + "grad_norm": 0.6536882519721985, + "learning_rate": 4.61181367056187e-06, + "loss": 2.0466, + "step": 16830 + }, + { + "epoch": 1.07776, + "grad_norm": 0.714253842830658, + "learning_rate": 4.608612133824236e-06, + "loss": 2.0226, + "step": 16840 + }, + { + "epoch": 1.0784, + "grad_norm": 0.6436517834663391, + "learning_rate": 4.605410597086602e-06, + "loss": 2.0467, + "step": 16850 + }, + { + "epoch": 1.07904, + "grad_norm": 0.6904466152191162, + "learning_rate": 4.602209060348967e-06, + "loss": 2.0219, + "step": 16860 + }, + { + "epoch": 1.07968, + "grad_norm": 0.617428183555603, + "learning_rate": 4.599007523611334e-06, + "loss": 2.0212, + "step": 16870 + }, + { + "epoch": 1.08032, + "grad_norm": 0.7247819900512695, + "learning_rate": 4.5958059868737e-06, + "loss": 2.0235, + "step": 16880 + }, + { + "epoch": 1.08096, + "grad_norm": 0.5652709007263184, + "learning_rate": 4.592604450136065e-06, + "loss": 2.0022, + "step": 16890 + }, + { + "epoch": 1.0816, + "grad_norm": 0.6366179585456848, + "learning_rate": 4.589402913398431e-06, + "loss": 2.0255, + "step": 16900 + }, + { + "epoch": 1.0816, + "eval_loss": 2.0470311641693115, + "eval_runtime": 99.227, + "eval_samples_per_second": 10.078, + "eval_steps_per_second": 5.039, + "step": 16900 + }, + { + "epoch": 1.08224, + "grad_norm": 0.6273727416992188, + "learning_rate": 4.586201376660798e-06, + "loss": 2.0166, + "step": 16910 + }, + { + "epoch": 1.08288, + "grad_norm": 0.8277556300163269, + "learning_rate": 4.582999839923163e-06, + "loss": 2.0075, + "step": 16920 + }, + { + "epoch": 1.08352, + "grad_norm": 0.6830505132675171, + "learning_rate": 4.579798303185529e-06, + "loss": 2.0105, + "step": 16930 + }, + { + "epoch": 1.08416, + "grad_norm": 0.5989285707473755, + "learning_rate": 4.576596766447896e-06, + "loss": 2.0225, + "step": 16940 + }, + { + "epoch": 1.0848, + "grad_norm": 0.5858156085014343, + "learning_rate": 4.573395229710261e-06, + "loss": 1.9978, + "step": 16950 + }, + { + "epoch": 1.08544, + "grad_norm": 0.5735730528831482, + "learning_rate": 4.570193692972627e-06, + "loss": 2.0444, + "step": 16960 + }, + { + "epoch": 1.08608, + "grad_norm": 0.5746780037879944, + "learning_rate": 4.566992156234994e-06, + "loss": 2.0497, + "step": 16970 + }, + { + "epoch": 1.08672, + "grad_norm": 0.7411326766014099, + "learning_rate": 4.563790619497359e-06, + "loss": 1.9788, + "step": 16980 + }, + { + "epoch": 1.0873599999999999, + "grad_norm": 0.5445652008056641, + "learning_rate": 4.560589082759725e-06, + "loss": 1.9972, + "step": 16990 + }, + { + "epoch": 1.088, + "grad_norm": 0.6129888892173767, + "learning_rate": 4.557387546022092e-06, + "loss": 2.0206, + "step": 17000 + }, + { + "epoch": 1.088, + "eval_loss": 2.0461783409118652, + "eval_runtime": 99.2517, + "eval_samples_per_second": 10.075, + "eval_steps_per_second": 5.038, + "step": 17000 + }, + { + "epoch": 1.08864, + "grad_norm": 0.5471550822257996, + "learning_rate": 4.554186009284457e-06, + "loss": 2.0684, + "step": 17010 + }, + { + "epoch": 1.08928, + "grad_norm": 0.6158989667892456, + "learning_rate": 4.550984472546823e-06, + "loss": 2.0286, + "step": 17020 + }, + { + "epoch": 1.08992, + "grad_norm": 0.7108532190322876, + "learning_rate": 4.547782935809189e-06, + "loss": 2.0193, + "step": 17030 + }, + { + "epoch": 1.09056, + "grad_norm": 0.5638531446456909, + "learning_rate": 4.5445813990715544e-06, + "loss": 2.0425, + "step": 17040 + }, + { + "epoch": 1.0912, + "grad_norm": 0.6152315735816956, + "learning_rate": 4.541379862333921e-06, + "loss": 2.0392, + "step": 17050 + }, + { + "epoch": 1.09184, + "grad_norm": 0.5938286185264587, + "learning_rate": 4.538178325596287e-06, + "loss": 2.0407, + "step": 17060 + }, + { + "epoch": 1.0924800000000001, + "grad_norm": 0.5730686783790588, + "learning_rate": 4.534976788858652e-06, + "loss": 2.0411, + "step": 17070 + }, + { + "epoch": 1.09312, + "grad_norm": 0.6255854964256287, + "learning_rate": 4.531775252121018e-06, + "loss": 2.0084, + "step": 17080 + }, + { + "epoch": 1.09376, + "grad_norm": 0.567584216594696, + "learning_rate": 4.528573715383384e-06, + "loss": 2.0253, + "step": 17090 + }, + { + "epoch": 1.0944, + "grad_norm": 0.7046324610710144, + "learning_rate": 4.52537217864575e-06, + "loss": 2.0325, + "step": 17100 + }, + { + "epoch": 1.0944, + "eval_loss": 2.0462424755096436, + "eval_runtime": 99.1134, + "eval_samples_per_second": 10.089, + "eval_steps_per_second": 5.045, + "step": 17100 + }, + { + "epoch": 1.09504, + "grad_norm": 0.5818910002708435, + "learning_rate": 4.522170641908116e-06, + "loss": 2.0228, + "step": 17110 + }, + { + "epoch": 1.09568, + "grad_norm": 0.6222846508026123, + "learning_rate": 4.518969105170482e-06, + "loss": 2.0297, + "step": 17120 + }, + { + "epoch": 1.09632, + "grad_norm": 0.5957484841346741, + "learning_rate": 4.515767568432848e-06, + "loss": 1.9729, + "step": 17130 + }, + { + "epoch": 1.09696, + "grad_norm": 0.6309909224510193, + "learning_rate": 4.512566031695214e-06, + "loss": 2.0163, + "step": 17140 + }, + { + "epoch": 1.0976, + "grad_norm": 0.7151498794555664, + "learning_rate": 4.50936449495758e-06, + "loss": 2.024, + "step": 17150 + }, + { + "epoch": 1.09824, + "grad_norm": 0.6055840253829956, + "learning_rate": 4.506162958219946e-06, + "loss": 1.9837, + "step": 17160 + }, + { + "epoch": 1.09888, + "grad_norm": 0.8446348905563354, + "learning_rate": 4.502961421482312e-06, + "loss": 1.9953, + "step": 17170 + }, + { + "epoch": 1.09952, + "grad_norm": 0.6151704788208008, + "learning_rate": 4.499759884744678e-06, + "loss": 2.042, + "step": 17180 + }, + { + "epoch": 1.10016, + "grad_norm": 0.5726914405822754, + "learning_rate": 4.4965583480070435e-06, + "loss": 2.0041, + "step": 17190 + }, + { + "epoch": 1.1008, + "grad_norm": 0.6289142966270447, + "learning_rate": 4.49335681126941e-06, + "loss": 2.0215, + "step": 17200 + }, + { + "epoch": 1.1008, + "eval_loss": 2.0456502437591553, + "eval_runtime": 99.2892, + "eval_samples_per_second": 10.072, + "eval_steps_per_second": 5.036, + "step": 17200 + }, + { + "epoch": 1.10144, + "grad_norm": 0.6238797903060913, + "learning_rate": 4.490155274531776e-06, + "loss": 2.0265, + "step": 17210 + }, + { + "epoch": 1.10208, + "grad_norm": 0.6181656122207642, + "learning_rate": 4.4869537377941414e-06, + "loss": 2.052, + "step": 17220 + }, + { + "epoch": 1.10272, + "grad_norm": 0.6757475137710571, + "learning_rate": 4.483752201056508e-06, + "loss": 2.0402, + "step": 17230 + }, + { + "epoch": 1.10336, + "grad_norm": 0.5690998435020447, + "learning_rate": 4.480550664318874e-06, + "loss": 2.0102, + "step": 17240 + }, + { + "epoch": 1.104, + "grad_norm": 0.6137502193450928, + "learning_rate": 4.477349127581239e-06, + "loss": 2.0199, + "step": 17250 + }, + { + "epoch": 1.10464, + "grad_norm": 0.5718172192573547, + "learning_rate": 4.474147590843605e-06, + "loss": 2.015, + "step": 17260 + }, + { + "epoch": 1.10528, + "grad_norm": 0.5788451433181763, + "learning_rate": 4.470946054105972e-06, + "loss": 1.9953, + "step": 17270 + }, + { + "epoch": 1.10592, + "grad_norm": 0.5897586941719055, + "learning_rate": 4.467744517368337e-06, + "loss": 2.0303, + "step": 17280 + }, + { + "epoch": 1.10656, + "grad_norm": 0.6481188535690308, + "learning_rate": 4.464542980630703e-06, + "loss": 1.9928, + "step": 17290 + }, + { + "epoch": 1.1072, + "grad_norm": 0.5920864939689636, + "learning_rate": 4.461341443893069e-06, + "loss": 2.0279, + "step": 17300 + }, + { + "epoch": 1.1072, + "eval_loss": 2.045930862426758, + "eval_runtime": 99.2881, + "eval_samples_per_second": 10.072, + "eval_steps_per_second": 5.036, + "step": 17300 + }, + { + "epoch": 1.10784, + "grad_norm": 0.6080455183982849, + "learning_rate": 4.4581399071554345e-06, + "loss": 2.0236, + "step": 17310 + }, + { + "epoch": 1.10848, + "grad_norm": 0.7448667883872986, + "learning_rate": 4.4549383704178e-06, + "loss": 2.0498, + "step": 17320 + }, + { + "epoch": 1.1091199999999999, + "grad_norm": 0.5533308386802673, + "learning_rate": 4.451736833680167e-06, + "loss": 2.0094, + "step": 17330 + }, + { + "epoch": 1.10976, + "grad_norm": 0.6973090767860413, + "learning_rate": 4.4485352969425325e-06, + "loss": 2.0248, + "step": 17340 + }, + { + "epoch": 1.1104, + "grad_norm": 0.6721564531326294, + "learning_rate": 4.445333760204898e-06, + "loss": 2.0131, + "step": 17350 + }, + { + "epoch": 1.11104, + "grad_norm": 0.8893942832946777, + "learning_rate": 4.442132223467265e-06, + "loss": 1.994, + "step": 17360 + }, + { + "epoch": 1.11168, + "grad_norm": 0.581801176071167, + "learning_rate": 4.4389306867296305e-06, + "loss": 2.0202, + "step": 17370 + }, + { + "epoch": 1.11232, + "grad_norm": 0.5315964818000793, + "learning_rate": 4.435729149991996e-06, + "loss": 2.0164, + "step": 17380 + }, + { + "epoch": 1.11296, + "grad_norm": 0.6118794679641724, + "learning_rate": 4.432527613254363e-06, + "loss": 2.0312, + "step": 17390 + }, + { + "epoch": 1.1136, + "grad_norm": 0.5874173641204834, + "learning_rate": 4.4293260765167285e-06, + "loss": 2.0616, + "step": 17400 + }, + { + "epoch": 1.1136, + "eval_loss": 2.0455470085144043, + "eval_runtime": 99.1356, + "eval_samples_per_second": 10.087, + "eval_steps_per_second": 5.044, + "step": 17400 + }, + { + "epoch": 1.11424, + "grad_norm": 0.6851205229759216, + "learning_rate": 4.426124539779094e-06, + "loss": 2.0143, + "step": 17410 + }, + { + "epoch": 1.11488, + "grad_norm": 0.5862815976142883, + "learning_rate": 4.422923003041461e-06, + "loss": 2.0224, + "step": 17420 + }, + { + "epoch": 1.11552, + "grad_norm": 0.542276918888092, + "learning_rate": 4.4197214663038264e-06, + "loss": 1.9984, + "step": 17430 + }, + { + "epoch": 1.11616, + "grad_norm": 0.5916332602500916, + "learning_rate": 4.416519929566192e-06, + "loss": 2.0272, + "step": 17440 + }, + { + "epoch": 1.1168, + "grad_norm": 0.5869584679603577, + "learning_rate": 4.413318392828559e-06, + "loss": 2.0239, + "step": 17450 + }, + { + "epoch": 1.11744, + "grad_norm": 0.6191988587379456, + "learning_rate": 4.410116856090924e-06, + "loss": 2.0072, + "step": 17460 + }, + { + "epoch": 1.11808, + "grad_norm": 0.556609034538269, + "learning_rate": 4.40691531935329e-06, + "loss": 2.0464, + "step": 17470 + }, + { + "epoch": 1.11872, + "grad_norm": 0.5710023045539856, + "learning_rate": 4.403713782615656e-06, + "loss": 2.0525, + "step": 17480 + }, + { + "epoch": 1.11936, + "grad_norm": 0.8187467455863953, + "learning_rate": 4.4005122458780215e-06, + "loss": 2.0271, + "step": 17490 + }, + { + "epoch": 1.12, + "grad_norm": 0.5508573055267334, + "learning_rate": 4.397310709140388e-06, + "loss": 2.0339, + "step": 17500 + }, + { + "epoch": 1.12, + "eval_loss": 2.0453834533691406, + "eval_runtime": 99.29, + "eval_samples_per_second": 10.072, + "eval_steps_per_second": 5.036, + "step": 17500 + }, + { + "epoch": 1.12064, + "grad_norm": 0.6992635726928711, + "learning_rate": 4.394109172402754e-06, + "loss": 2.0474, + "step": 17510 + }, + { + "epoch": 1.12128, + "grad_norm": 0.5587980151176453, + "learning_rate": 4.3909076356651195e-06, + "loss": 2.0261, + "step": 17520 + }, + { + "epoch": 1.12192, + "grad_norm": 0.5831376314163208, + "learning_rate": 4.387706098927485e-06, + "loss": 2.0556, + "step": 17530 + }, + { + "epoch": 1.12256, + "grad_norm": 0.6915487051010132, + "learning_rate": 4.384504562189851e-06, + "loss": 2.0218, + "step": 17540 + }, + { + "epoch": 1.1232, + "grad_norm": 0.588444173336029, + "learning_rate": 4.3813030254522175e-06, + "loss": 2.0461, + "step": 17550 + }, + { + "epoch": 1.12384, + "grad_norm": 0.6518751978874207, + "learning_rate": 4.378101488714583e-06, + "loss": 2.0289, + "step": 17560 + }, + { + "epoch": 1.12448, + "grad_norm": 0.5470808148384094, + "learning_rate": 4.374899951976949e-06, + "loss": 2.0294, + "step": 17570 + }, + { + "epoch": 1.12512, + "grad_norm": 0.6305654048919678, + "learning_rate": 4.3716984152393155e-06, + "loss": 1.9871, + "step": 17580 + }, + { + "epoch": 1.12576, + "grad_norm": 0.553517758846283, + "learning_rate": 4.368496878501681e-06, + "loss": 2.0324, + "step": 17590 + }, + { + "epoch": 1.1264, + "grad_norm": 0.7479193806648254, + "learning_rate": 4.365295341764047e-06, + "loss": 2.032, + "step": 17600 + }, + { + "epoch": 1.1264, + "eval_loss": 2.045273542404175, + "eval_runtime": 99.2574, + "eval_samples_per_second": 10.075, + "eval_steps_per_second": 5.037, + "step": 17600 + }, + { + "epoch": 1.12704, + "grad_norm": 0.6123260259628296, + "learning_rate": 4.362093805026413e-06, + "loss": 2.0483, + "step": 17610 + }, + { + "epoch": 1.12768, + "grad_norm": 0.6270941495895386, + "learning_rate": 4.358892268288779e-06, + "loss": 2.0537, + "step": 17620 + }, + { + "epoch": 1.12832, + "grad_norm": 0.6077694892883301, + "learning_rate": 4.355690731551145e-06, + "loss": 2.0137, + "step": 17630 + }, + { + "epoch": 1.12896, + "grad_norm": 0.7045969367027283, + "learning_rate": 4.352489194813511e-06, + "loss": 2.0382, + "step": 17640 + }, + { + "epoch": 1.1296, + "grad_norm": 0.575238049030304, + "learning_rate": 4.349287658075877e-06, + "loss": 2.0437, + "step": 17650 + }, + { + "epoch": 1.13024, + "grad_norm": 0.6163949370384216, + "learning_rate": 4.346086121338243e-06, + "loss": 1.9799, + "step": 17660 + }, + { + "epoch": 1.1308799999999999, + "grad_norm": 0.6812820434570312, + "learning_rate": 4.3428845846006086e-06, + "loss": 1.9935, + "step": 17670 + }, + { + "epoch": 1.13152, + "grad_norm": 0.5811446905136108, + "learning_rate": 4.339683047862975e-06, + "loss": 2.0257, + "step": 17680 + }, + { + "epoch": 1.13216, + "grad_norm": 0.5752651691436768, + "learning_rate": 4.336481511125341e-06, + "loss": 2.0468, + "step": 17690 + }, + { + "epoch": 1.1328, + "grad_norm": 0.538728654384613, + "learning_rate": 4.3332799743877065e-06, + "loss": 2.0496, + "step": 17700 + }, + { + "epoch": 1.1328, + "eval_loss": 2.0443480014801025, + "eval_runtime": 99.2029, + "eval_samples_per_second": 10.08, + "eval_steps_per_second": 5.04, + "step": 17700 + }, + { + "epoch": 1.13344, + "grad_norm": 0.6354817748069763, + "learning_rate": 4.330078437650072e-06, + "loss": 2.0264, + "step": 17710 + }, + { + "epoch": 1.13408, + "grad_norm": 0.5710617899894714, + "learning_rate": 4.326876900912439e-06, + "loss": 2.0434, + "step": 17720 + }, + { + "epoch": 1.13472, + "grad_norm": 0.7051773071289062, + "learning_rate": 4.3236753641748045e-06, + "loss": 2.0114, + "step": 17730 + }, + { + "epoch": 1.13536, + "grad_norm": 0.6036627888679504, + "learning_rate": 4.32047382743717e-06, + "loss": 2.0226, + "step": 17740 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.6783179640769958, + "learning_rate": 4.317272290699536e-06, + "loss": 2.0085, + "step": 17750 + }, + { + "epoch": 1.13664, + "grad_norm": 0.6049875020980835, + "learning_rate": 4.314070753961902e-06, + "loss": 2.036, + "step": 17760 + }, + { + "epoch": 1.13728, + "grad_norm": 0.5788288712501526, + "learning_rate": 4.310869217224267e-06, + "loss": 2.0514, + "step": 17770 + }, + { + "epoch": 1.13792, + "grad_norm": 0.633959174156189, + "learning_rate": 4.307667680486634e-06, + "loss": 2.0439, + "step": 17780 + }, + { + "epoch": 1.13856, + "grad_norm": 0.620322048664093, + "learning_rate": 4.304466143749e-06, + "loss": 2.0219, + "step": 17790 + }, + { + "epoch": 1.1392, + "grad_norm": 0.558303713798523, + "learning_rate": 4.301264607011365e-06, + "loss": 2.0239, + "step": 17800 + }, + { + "epoch": 1.1392, + "eval_loss": 2.044430732727051, + "eval_runtime": 99.4766, + "eval_samples_per_second": 10.053, + "eval_steps_per_second": 5.026, + "step": 17800 + }, + { + "epoch": 1.13984, + "grad_norm": 0.5895607471466064, + "learning_rate": 4.298063070273732e-06, + "loss": 2.0546, + "step": 17810 + }, + { + "epoch": 1.14048, + "grad_norm": 0.6369735598564148, + "learning_rate": 4.294861533536098e-06, + "loss": 2.0283, + "step": 17820 + }, + { + "epoch": 1.14112, + "grad_norm": 0.6027755737304688, + "learning_rate": 4.291659996798463e-06, + "loss": 2.0156, + "step": 17830 + }, + { + "epoch": 1.14176, + "grad_norm": 0.6100727915763855, + "learning_rate": 4.28845846006083e-06, + "loss": 2.0275, + "step": 17840 + }, + { + "epoch": 1.1424, + "grad_norm": 0.5747309923171997, + "learning_rate": 4.2852569233231956e-06, + "loss": 2.0235, + "step": 17850 + }, + { + "epoch": 1.14304, + "grad_norm": 0.5191932320594788, + "learning_rate": 4.282055386585561e-06, + "loss": 1.9881, + "step": 17860 + }, + { + "epoch": 1.14368, + "grad_norm": 0.5874210596084595, + "learning_rate": 4.278853849847928e-06, + "loss": 2.0062, + "step": 17870 + }, + { + "epoch": 1.14432, + "grad_norm": 0.5625083446502686, + "learning_rate": 4.2756523131102936e-06, + "loss": 2.0174, + "step": 17880 + }, + { + "epoch": 1.14496, + "grad_norm": 0.6093165874481201, + "learning_rate": 4.272450776372659e-06, + "loss": 2.0219, + "step": 17890 + }, + { + "epoch": 1.1456, + "grad_norm": 0.5750223398208618, + "learning_rate": 4.269249239635025e-06, + "loss": 2.025, + "step": 17900 + }, + { + "epoch": 1.1456, + "eval_loss": 2.0441596508026123, + "eval_runtime": 99.0998, + "eval_samples_per_second": 10.091, + "eval_steps_per_second": 5.045, + "step": 17900 + }, + { + "epoch": 1.14624, + "grad_norm": 0.6374318599700928, + "learning_rate": 4.2660477028973915e-06, + "loss": 2.0259, + "step": 17910 + }, + { + "epoch": 1.14688, + "grad_norm": 0.5993691682815552, + "learning_rate": 4.262846166159757e-06, + "loss": 2.0514, + "step": 17920 + }, + { + "epoch": 1.14752, + "grad_norm": 0.6802266836166382, + "learning_rate": 4.259644629422123e-06, + "loss": 1.9933, + "step": 17930 + }, + { + "epoch": 1.14816, + "grad_norm": 0.5811522006988525, + "learning_rate": 4.256443092684489e-06, + "loss": 2.0169, + "step": 17940 + }, + { + "epoch": 1.1488, + "grad_norm": 0.6651666164398193, + "learning_rate": 4.253241555946855e-06, + "loss": 2.0495, + "step": 17950 + }, + { + "epoch": 1.14944, + "grad_norm": 0.6029697060585022, + "learning_rate": 4.250040019209221e-06, + "loss": 2.0321, + "step": 17960 + }, + { + "epoch": 1.15008, + "grad_norm": 0.614733099937439, + "learning_rate": 4.246838482471587e-06, + "loss": 1.9961, + "step": 17970 + }, + { + "epoch": 1.15072, + "grad_norm": 0.5310514569282532, + "learning_rate": 4.243636945733952e-06, + "loss": 2.0148, + "step": 17980 + }, + { + "epoch": 1.15136, + "grad_norm": 0.6348665356636047, + "learning_rate": 4.240435408996318e-06, + "loss": 2.05, + "step": 17990 + }, + { + "epoch": 1.152, + "grad_norm": 0.6719682812690735, + "learning_rate": 4.237233872258685e-06, + "loss": 2.0116, + "step": 18000 + }, + { + "epoch": 1.152, + "eval_loss": 2.0433847904205322, + "eval_runtime": 99.1507, + "eval_samples_per_second": 10.086, + "eval_steps_per_second": 5.043, + "step": 18000 + }, + { + "epoch": 1.1526399999999999, + "grad_norm": 0.6079267263412476, + "learning_rate": 4.23403233552105e-06, + "loss": 2.0296, + "step": 18010 + }, + { + "epoch": 1.15328, + "grad_norm": 0.6186202168464661, + "learning_rate": 4.230830798783416e-06, + "loss": 2.0179, + "step": 18020 + }, + { + "epoch": 1.15392, + "grad_norm": 0.6316864490509033, + "learning_rate": 4.227629262045782e-06, + "loss": 2.0382, + "step": 18030 + }, + { + "epoch": 1.15456, + "grad_norm": 0.5883438587188721, + "learning_rate": 4.224427725308148e-06, + "loss": 2.0198, + "step": 18040 + }, + { + "epoch": 1.1552, + "grad_norm": 0.5984814167022705, + "learning_rate": 4.221226188570514e-06, + "loss": 2.0217, + "step": 18050 + }, + { + "epoch": 1.15584, + "grad_norm": 0.5831133127212524, + "learning_rate": 4.21802465183288e-06, + "loss": 2.0159, + "step": 18060 + }, + { + "epoch": 1.15648, + "grad_norm": 0.5060235261917114, + "learning_rate": 4.214823115095246e-06, + "loss": 2.0348, + "step": 18070 + }, + { + "epoch": 1.15712, + "grad_norm": 0.6391846537590027, + "learning_rate": 4.211621578357612e-06, + "loss": 2.008, + "step": 18080 + }, + { + "epoch": 1.1577600000000001, + "grad_norm": 0.5386248230934143, + "learning_rate": 4.208420041619978e-06, + "loss": 2.0194, + "step": 18090 + }, + { + "epoch": 1.1584, + "grad_norm": 0.5904210805892944, + "learning_rate": 4.205218504882344e-06, + "loss": 1.9955, + "step": 18100 + }, + { + "epoch": 1.1584, + "eval_loss": 2.043471097946167, + "eval_runtime": 99.0461, + "eval_samples_per_second": 10.096, + "eval_steps_per_second": 5.048, + "step": 18100 + }, + { + "epoch": 1.15904, + "grad_norm": 0.6032049655914307, + "learning_rate": 4.20201696814471e-06, + "loss": 2.0155, + "step": 18110 + }, + { + "epoch": 1.15968, + "grad_norm": 0.5975533723831177, + "learning_rate": 4.198815431407076e-06, + "loss": 2.0507, + "step": 18120 + }, + { + "epoch": 1.16032, + "grad_norm": 0.6408397555351257, + "learning_rate": 4.195613894669442e-06, + "loss": 2.0152, + "step": 18130 + }, + { + "epoch": 1.16096, + "grad_norm": 0.5729820728302002, + "learning_rate": 4.192412357931808e-06, + "loss": 1.9997, + "step": 18140 + }, + { + "epoch": 1.1616, + "grad_norm": 0.5984222888946533, + "learning_rate": 4.189210821194174e-06, + "loss": 2.0213, + "step": 18150 + }, + { + "epoch": 1.16224, + "grad_norm": 0.5945577621459961, + "learning_rate": 4.186009284456539e-06, + "loss": 2.0073, + "step": 18160 + }, + { + "epoch": 1.16288, + "grad_norm": 0.5897391438484192, + "learning_rate": 4.182807747718906e-06, + "loss": 2.0437, + "step": 18170 + }, + { + "epoch": 1.16352, + "grad_norm": 0.6737760901451111, + "learning_rate": 4.179606210981272e-06, + "loss": 2.0403, + "step": 18180 + }, + { + "epoch": 1.16416, + "grad_norm": 0.5872124433517456, + "learning_rate": 4.176404674243637e-06, + "loss": 2.0306, + "step": 18190 + }, + { + "epoch": 1.1648, + "grad_norm": 0.6742432713508606, + "learning_rate": 4.173203137506003e-06, + "loss": 2.0373, + "step": 18200 + }, + { + "epoch": 1.1648, + "eval_loss": 2.0434553623199463, + "eval_runtime": 99.1351, + "eval_samples_per_second": 10.087, + "eval_steps_per_second": 5.044, + "step": 18200 + }, + { + "epoch": 1.16544, + "grad_norm": 0.6591572761535645, + "learning_rate": 4.170001600768369e-06, + "loss": 2.008, + "step": 18210 + }, + { + "epoch": 1.16608, + "grad_norm": 0.5591892004013062, + "learning_rate": 4.166800064030735e-06, + "loss": 2.053, + "step": 18220 + }, + { + "epoch": 1.16672, + "grad_norm": 0.56505286693573, + "learning_rate": 4.163598527293101e-06, + "loss": 2.0204, + "step": 18230 + }, + { + "epoch": 1.16736, + "grad_norm": 0.6225974559783936, + "learning_rate": 4.160396990555467e-06, + "loss": 2.034, + "step": 18240 + }, + { + "epoch": 1.168, + "grad_norm": 0.7203903794288635, + "learning_rate": 4.1571954538178324e-06, + "loss": 2.0219, + "step": 18250 + }, + { + "epoch": 1.16864, + "grad_norm": 0.6796202659606934, + "learning_rate": 4.153993917080199e-06, + "loss": 2.011, + "step": 18260 + }, + { + "epoch": 1.16928, + "grad_norm": 0.5384121537208557, + "learning_rate": 4.150792380342565e-06, + "loss": 1.9911, + "step": 18270 + }, + { + "epoch": 1.16992, + "grad_norm": 0.6490262746810913, + "learning_rate": 4.14759084360493e-06, + "loss": 2.0012, + "step": 18280 + }, + { + "epoch": 1.17056, + "grad_norm": 0.6402068138122559, + "learning_rate": 4.144389306867297e-06, + "loss": 2.0179, + "step": 18290 + }, + { + "epoch": 1.1712, + "grad_norm": 0.5560231804847717, + "learning_rate": 4.141187770129663e-06, + "loss": 1.9948, + "step": 18300 + }, + { + "epoch": 1.1712, + "eval_loss": 2.043088674545288, + "eval_runtime": 99.2195, + "eval_samples_per_second": 10.079, + "eval_steps_per_second": 5.039, + "step": 18300 + }, + { + "epoch": 1.17184, + "grad_norm": 0.5684863924980164, + "learning_rate": 4.137986233392028e-06, + "loss": 1.9933, + "step": 18310 + }, + { + "epoch": 1.17248, + "grad_norm": 0.6014339327812195, + "learning_rate": 4.134784696654394e-06, + "loss": 2.0161, + "step": 18320 + }, + { + "epoch": 1.17312, + "grad_norm": 0.6450905203819275, + "learning_rate": 4.131583159916761e-06, + "loss": 2.0292, + "step": 18330 + }, + { + "epoch": 1.17376, + "grad_norm": 0.5547463297843933, + "learning_rate": 4.128381623179126e-06, + "loss": 2.0318, + "step": 18340 + }, + { + "epoch": 1.1743999999999999, + "grad_norm": 0.493149071931839, + "learning_rate": 4.125180086441492e-06, + "loss": 2.0142, + "step": 18350 + }, + { + "epoch": 1.17504, + "grad_norm": 0.546998918056488, + "learning_rate": 4.121978549703859e-06, + "loss": 2.0102, + "step": 18360 + }, + { + "epoch": 1.17568, + "grad_norm": 0.5504551529884338, + "learning_rate": 4.118777012966224e-06, + "loss": 2.0329, + "step": 18370 + }, + { + "epoch": 1.17632, + "grad_norm": 0.5663371086120605, + "learning_rate": 4.11557547622859e-06, + "loss": 2.0638, + "step": 18380 + }, + { + "epoch": 1.17696, + "grad_norm": 0.5327328443527222, + "learning_rate": 4.112373939490957e-06, + "loss": 2.0131, + "step": 18390 + }, + { + "epoch": 1.1776, + "grad_norm": 0.702136218547821, + "learning_rate": 4.109172402753322e-06, + "loss": 2.0052, + "step": 18400 + }, + { + "epoch": 1.1776, + "eval_loss": 2.0431933403015137, + "eval_runtime": 99.1385, + "eval_samples_per_second": 10.087, + "eval_steps_per_second": 5.043, + "step": 18400 + }, + { + "epoch": 1.17824, + "grad_norm": 0.678486168384552, + "learning_rate": 4.105970866015688e-06, + "loss": 2.0035, + "step": 18410 + }, + { + "epoch": 1.17888, + "grad_norm": 0.6072514057159424, + "learning_rate": 4.102769329278054e-06, + "loss": 2.0067, + "step": 18420 + }, + { + "epoch": 1.1795200000000001, + "grad_norm": 0.6381948590278625, + "learning_rate": 4.0995677925404195e-06, + "loss": 2.0069, + "step": 18430 + }, + { + "epoch": 1.1801599999999999, + "grad_norm": 0.5768085718154907, + "learning_rate": 4.096366255802785e-06, + "loss": 2.0085, + "step": 18440 + }, + { + "epoch": 1.1808, + "grad_norm": 0.6085562705993652, + "learning_rate": 4.093164719065152e-06, + "loss": 2.0133, + "step": 18450 + }, + { + "epoch": 1.18144, + "grad_norm": 0.6448004841804504, + "learning_rate": 4.0899631823275174e-06, + "loss": 2.0595, + "step": 18460 + }, + { + "epoch": 1.18208, + "grad_norm": 0.5848154425621033, + "learning_rate": 4.086761645589883e-06, + "loss": 2.0215, + "step": 18470 + }, + { + "epoch": 1.18272, + "grad_norm": 0.5383066534996033, + "learning_rate": 4.083560108852249e-06, + "loss": 2.0033, + "step": 18480 + }, + { + "epoch": 1.18336, + "grad_norm": 0.632030189037323, + "learning_rate": 4.080358572114615e-06, + "loss": 2.0296, + "step": 18490 + }, + { + "epoch": 1.184, + "grad_norm": 0.6205241084098816, + "learning_rate": 4.077157035376981e-06, + "loss": 2.0178, + "step": 18500 + }, + { + "epoch": 1.184, + "eval_loss": 2.0423781871795654, + "eval_runtime": 99.0963, + "eval_samples_per_second": 10.091, + "eval_steps_per_second": 5.046, + "step": 18500 + }, + { + "epoch": 1.18464, + "grad_norm": 0.6120121479034424, + "learning_rate": 4.073955498639347e-06, + "loss": 2.0074, + "step": 18510 + }, + { + "epoch": 1.1852800000000001, + "grad_norm": 0.5563983917236328, + "learning_rate": 4.070753961901713e-06, + "loss": 2.0258, + "step": 18520 + }, + { + "epoch": 1.18592, + "grad_norm": 0.5512414574623108, + "learning_rate": 4.067552425164079e-06, + "loss": 2.058, + "step": 18530 + }, + { + "epoch": 1.18656, + "grad_norm": 0.5608769059181213, + "learning_rate": 4.064350888426445e-06, + "loss": 2.0202, + "step": 18540 + }, + { + "epoch": 1.1872, + "grad_norm": 0.5823011994361877, + "learning_rate": 4.061149351688811e-06, + "loss": 2.0224, + "step": 18550 + }, + { + "epoch": 1.18784, + "grad_norm": 0.6108050346374512, + "learning_rate": 4.057947814951177e-06, + "loss": 2.0147, + "step": 18560 + }, + { + "epoch": 1.18848, + "grad_norm": 0.5405247211456299, + "learning_rate": 4.054746278213543e-06, + "loss": 2.0505, + "step": 18570 + }, + { + "epoch": 1.18912, + "grad_norm": 0.5540168285369873, + "learning_rate": 4.051544741475909e-06, + "loss": 2.0591, + "step": 18580 + }, + { + "epoch": 1.18976, + "grad_norm": 0.6209017634391785, + "learning_rate": 4.048343204738275e-06, + "loss": 2.0195, + "step": 18590 + }, + { + "epoch": 1.1904, + "grad_norm": 0.5977944135665894, + "learning_rate": 4.045141668000641e-06, + "loss": 2.0125, + "step": 18600 + }, + { + "epoch": 1.1904, + "eval_loss": 2.042717456817627, + "eval_runtime": 99.1827, + "eval_samples_per_second": 10.082, + "eval_steps_per_second": 5.041, + "step": 18600 + }, + { + "epoch": 1.19104, + "grad_norm": 0.6067842245101929, + "learning_rate": 4.0419401312630065e-06, + "loss": 2.0236, + "step": 18610 + }, + { + "epoch": 1.19168, + "grad_norm": 0.6149134635925293, + "learning_rate": 4.038738594525373e-06, + "loss": 2.0205, + "step": 18620 + }, + { + "epoch": 1.19232, + "grad_norm": 0.5659533739089966, + "learning_rate": 4.035537057787739e-06, + "loss": 2.0257, + "step": 18630 + }, + { + "epoch": 1.19296, + "grad_norm": 0.5780056715011597, + "learning_rate": 4.0323355210501045e-06, + "loss": 2.0414, + "step": 18640 + }, + { + "epoch": 1.1936, + "grad_norm": 0.6140852570533752, + "learning_rate": 4.02913398431247e-06, + "loss": 2.0164, + "step": 18650 + }, + { + "epoch": 1.19424, + "grad_norm": 0.5738794207572937, + "learning_rate": 4.025932447574836e-06, + "loss": 2.0332, + "step": 18660 + }, + { + "epoch": 1.19488, + "grad_norm": 0.6487182974815369, + "learning_rate": 4.0227309108372024e-06, + "loss": 2.0085, + "step": 18670 + }, + { + "epoch": 1.19552, + "grad_norm": 0.5968878269195557, + "learning_rate": 4.019529374099568e-06, + "loss": 2.0128, + "step": 18680 + }, + { + "epoch": 1.19616, + "grad_norm": 0.7557638883590698, + "learning_rate": 4.016327837361934e-06, + "loss": 2.0416, + "step": 18690 + }, + { + "epoch": 1.1968, + "grad_norm": 0.6604118943214417, + "learning_rate": 4.0131263006242996e-06, + "loss": 2.0175, + "step": 18700 + }, + { + "epoch": 1.1968, + "eval_loss": 2.042259454727173, + "eval_runtime": 99.1, + "eval_samples_per_second": 10.091, + "eval_steps_per_second": 5.045, + "step": 18700 + }, + { + "epoch": 1.19744, + "grad_norm": 0.6026879549026489, + "learning_rate": 4.009924763886666e-06, + "loss": 1.9974, + "step": 18710 + }, + { + "epoch": 1.19808, + "grad_norm": 0.6138709187507629, + "learning_rate": 4.006723227149032e-06, + "loss": 2.019, + "step": 18720 + }, + { + "epoch": 1.19872, + "grad_norm": 0.5770138502120972, + "learning_rate": 4.0035216904113975e-06, + "loss": 2.0165, + "step": 18730 + }, + { + "epoch": 1.19936, + "grad_norm": 0.6799411177635193, + "learning_rate": 4.000320153673763e-06, + "loss": 2.0124, + "step": 18740 + }, + { + "epoch": 1.2, + "grad_norm": 0.715839684009552, + "learning_rate": 3.99711861693613e-06, + "loss": 1.9994, + "step": 18750 + }, + { + "epoch": 1.20064, + "grad_norm": 0.6663697361946106, + "learning_rate": 3.9939170801984955e-06, + "loss": 1.9953, + "step": 18760 + }, + { + "epoch": 1.20128, + "grad_norm": 0.5951536893844604, + "learning_rate": 3.990715543460861e-06, + "loss": 2.0368, + "step": 18770 + }, + { + "epoch": 1.2019199999999999, + "grad_norm": 0.6461049914360046, + "learning_rate": 3.987514006723228e-06, + "loss": 2.0049, + "step": 18780 + }, + { + "epoch": 1.20256, + "grad_norm": 0.6078142523765564, + "learning_rate": 3.9843124699855935e-06, + "loss": 2.0297, + "step": 18790 + }, + { + "epoch": 1.2032, + "grad_norm": 0.5475664734840393, + "learning_rate": 3.981110933247959e-06, + "loss": 2.0222, + "step": 18800 + }, + { + "epoch": 1.2032, + "eval_loss": 2.042280912399292, + "eval_runtime": 98.9381, + "eval_samples_per_second": 10.107, + "eval_steps_per_second": 5.054, + "step": 18800 + }, + { + "epoch": 1.20384, + "grad_norm": 0.6041310429573059, + "learning_rate": 3.977909396510326e-06, + "loss": 2.0824, + "step": 18810 + }, + { + "epoch": 1.20448, + "grad_norm": 0.5782486796379089, + "learning_rate": 3.9747078597726915e-06, + "loss": 2.0067, + "step": 18820 + }, + { + "epoch": 1.20512, + "grad_norm": 0.5963742733001709, + "learning_rate": 3.971506323035057e-06, + "loss": 2.0014, + "step": 18830 + }, + { + "epoch": 1.20576, + "grad_norm": 0.5914783477783203, + "learning_rate": 3.968304786297424e-06, + "loss": 2.0082, + "step": 18840 + }, + { + "epoch": 1.2064, + "grad_norm": 0.6149041056632996, + "learning_rate": 3.9651032495597894e-06, + "loss": 2.0269, + "step": 18850 + }, + { + "epoch": 1.2070400000000001, + "grad_norm": 0.6183750629425049, + "learning_rate": 3.961901712822155e-06, + "loss": 2.0448, + "step": 18860 + }, + { + "epoch": 1.20768, + "grad_norm": 0.6572140455245972, + "learning_rate": 3.958700176084521e-06, + "loss": 2.0438, + "step": 18870 + }, + { + "epoch": 1.20832, + "grad_norm": 0.5987107753753662, + "learning_rate": 3.955498639346887e-06, + "loss": 2.0234, + "step": 18880 + }, + { + "epoch": 1.20896, + "grad_norm": 0.6036151051521301, + "learning_rate": 3.952297102609252e-06, + "loss": 2.0204, + "step": 18890 + }, + { + "epoch": 1.2096, + "grad_norm": 0.5558540225028992, + "learning_rate": 3.949095565871619e-06, + "loss": 1.9918, + "step": 18900 + }, + { + "epoch": 1.2096, + "eval_loss": 2.041818141937256, + "eval_runtime": 98.93, + "eval_samples_per_second": 10.108, + "eval_steps_per_second": 5.054, + "step": 18900 + }, + { + "epoch": 1.21024, + "grad_norm": 0.56846684217453, + "learning_rate": 3.9458940291339846e-06, + "loss": 2.0231, + "step": 18910 + }, + { + "epoch": 1.21088, + "grad_norm": 0.6935490965843201, + "learning_rate": 3.94269249239635e-06, + "loss": 2.0144, + "step": 18920 + }, + { + "epoch": 1.21152, + "grad_norm": 0.5987290740013123, + "learning_rate": 3.939490955658716e-06, + "loss": 2.0177, + "step": 18930 + }, + { + "epoch": 1.21216, + "grad_norm": 0.5699126124382019, + "learning_rate": 3.9362894189210825e-06, + "loss": 2.0104, + "step": 18940 + }, + { + "epoch": 1.2128, + "grad_norm": 0.5563395023345947, + "learning_rate": 3.933087882183448e-06, + "loss": 1.9968, + "step": 18950 + }, + { + "epoch": 1.21344, + "grad_norm": 0.560488224029541, + "learning_rate": 3.929886345445814e-06, + "loss": 2.0503, + "step": 18960 + }, + { + "epoch": 1.21408, + "grad_norm": 0.6256830096244812, + "learning_rate": 3.9266848087081805e-06, + "loss": 2.0331, + "step": 18970 + }, + { + "epoch": 1.21472, + "grad_norm": 0.5946688055992126, + "learning_rate": 3.923483271970546e-06, + "loss": 1.9896, + "step": 18980 + }, + { + "epoch": 1.21536, + "grad_norm": 0.6053398251533508, + "learning_rate": 3.920281735232912e-06, + "loss": 1.998, + "step": 18990 + }, + { + "epoch": 1.216, + "grad_norm": 0.6324140429496765, + "learning_rate": 3.9170801984952785e-06, + "loss": 2.005, + "step": 19000 + }, + { + "epoch": 1.216, + "eval_loss": 2.041613817214966, + "eval_runtime": 98.9226, + "eval_samples_per_second": 10.109, + "eval_steps_per_second": 5.054, + "step": 19000 + }, + { + "epoch": 1.21664, + "grad_norm": 0.5376859307289124, + "learning_rate": 3.913878661757644e-06, + "loss": 2.0345, + "step": 19010 + }, + { + "epoch": 1.21728, + "grad_norm": 0.5733827352523804, + "learning_rate": 3.91067712502001e-06, + "loss": 2.0347, + "step": 19020 + }, + { + "epoch": 1.21792, + "grad_norm": 0.5942328572273254, + "learning_rate": 3.907475588282376e-06, + "loss": 2.03, + "step": 19030 + }, + { + "epoch": 1.21856, + "grad_norm": 0.5893325209617615, + "learning_rate": 3.904274051544742e-06, + "loss": 2.0436, + "step": 19040 + }, + { + "epoch": 1.2192, + "grad_norm": 0.5583692789077759, + "learning_rate": 3.901072514807108e-06, + "loss": 2.0058, + "step": 19050 + }, + { + "epoch": 1.21984, + "grad_norm": 0.5637010335922241, + "learning_rate": 3.897870978069474e-06, + "loss": 2.0126, + "step": 19060 + }, + { + "epoch": 1.22048, + "grad_norm": 0.5166860222816467, + "learning_rate": 3.89466944133184e-06, + "loss": 2.0226, + "step": 19070 + }, + { + "epoch": 1.22112, + "grad_norm": 0.6455560922622681, + "learning_rate": 3.891467904594206e-06, + "loss": 2.0101, + "step": 19080 + }, + { + "epoch": 1.22176, + "grad_norm": 0.5828796029090881, + "learning_rate": 3.8882663678565716e-06, + "loss": 2.0506, + "step": 19090 + }, + { + "epoch": 1.2224, + "grad_norm": 0.6479492783546448, + "learning_rate": 3.885064831118937e-06, + "loss": 2.0183, + "step": 19100 + }, + { + "epoch": 1.2224, + "eval_loss": 2.041250228881836, + "eval_runtime": 99.0015, + "eval_samples_per_second": 10.101, + "eval_steps_per_second": 5.05, + "step": 19100 + }, + { + "epoch": 1.22304, + "grad_norm": 0.591534435749054, + "learning_rate": 3.881863294381303e-06, + "loss": 2.0026, + "step": 19110 + }, + { + "epoch": 1.2236799999999999, + "grad_norm": 0.7330079674720764, + "learning_rate": 3.8786617576436695e-06, + "loss": 2.0242, + "step": 19120 + }, + { + "epoch": 1.22432, + "grad_norm": 0.6045952439308167, + "learning_rate": 3.875460220906035e-06, + "loss": 2.0236, + "step": 19130 + }, + { + "epoch": 1.22496, + "grad_norm": 0.5437217950820923, + "learning_rate": 3.872258684168401e-06, + "loss": 2.0217, + "step": 19140 + }, + { + "epoch": 1.2256, + "grad_norm": 0.5526325106620789, + "learning_rate": 3.869057147430767e-06, + "loss": 2.0221, + "step": 19150 + }, + { + "epoch": 1.22624, + "grad_norm": 0.6211245656013489, + "learning_rate": 3.865855610693132e-06, + "loss": 2.0326, + "step": 19160 + }, + { + "epoch": 1.22688, + "grad_norm": 0.5961929559707642, + "learning_rate": 3.862654073955499e-06, + "loss": 2.0135, + "step": 19170 + }, + { + "epoch": 1.22752, + "grad_norm": 0.5817451477050781, + "learning_rate": 3.859452537217865e-06, + "loss": 2.0257, + "step": 19180 + }, + { + "epoch": 1.22816, + "grad_norm": 0.5692046284675598, + "learning_rate": 3.85625100048023e-06, + "loss": 2.0062, + "step": 19190 + }, + { + "epoch": 1.2288000000000001, + "grad_norm": 0.5758373141288757, + "learning_rate": 3.853049463742597e-06, + "loss": 2.028, + "step": 19200 + }, + { + "epoch": 1.2288000000000001, + "eval_loss": 2.0411949157714844, + "eval_runtime": 98.9362, + "eval_samples_per_second": 10.108, + "eval_steps_per_second": 5.054, + "step": 19200 + }, + { + "epoch": 1.22944, + "grad_norm": 0.6386515498161316, + "learning_rate": 3.849847927004963e-06, + "loss": 2.0233, + "step": 19210 + }, + { + "epoch": 1.23008, + "grad_norm": 0.6881201863288879, + "learning_rate": 3.846646390267328e-06, + "loss": 2.0164, + "step": 19220 + }, + { + "epoch": 1.23072, + "grad_norm": 0.6185692548751831, + "learning_rate": 3.843444853529695e-06, + "loss": 2.0323, + "step": 19230 + }, + { + "epoch": 1.23136, + "grad_norm": 0.6359484791755676, + "learning_rate": 3.840243316792061e-06, + "loss": 2.0251, + "step": 19240 + }, + { + "epoch": 1.232, + "grad_norm": 0.675705075263977, + "learning_rate": 3.837041780054426e-06, + "loss": 2.0037, + "step": 19250 + }, + { + "epoch": 1.23264, + "grad_norm": 0.6411193013191223, + "learning_rate": 3.833840243316793e-06, + "loss": 2.0238, + "step": 19260 + }, + { + "epoch": 1.23328, + "grad_norm": 0.5814663767814636, + "learning_rate": 3.830638706579159e-06, + "loss": 2.0344, + "step": 19270 + }, + { + "epoch": 1.23392, + "grad_norm": 0.6048383116722107, + "learning_rate": 3.827437169841524e-06, + "loss": 1.9835, + "step": 19280 + }, + { + "epoch": 1.23456, + "grad_norm": 0.6306132674217224, + "learning_rate": 3.824235633103891e-06, + "loss": 2.013, + "step": 19290 + }, + { + "epoch": 1.2352, + "grad_norm": 0.5310275554656982, + "learning_rate": 3.8210340963662566e-06, + "loss": 2.0315, + "step": 19300 + }, + { + "epoch": 1.2352, + "eval_loss": 2.041146755218506, + "eval_runtime": 98.9899, + "eval_samples_per_second": 10.102, + "eval_steps_per_second": 5.051, + "step": 19300 + }, + { + "epoch": 1.23584, + "grad_norm": 0.539045512676239, + "learning_rate": 3.817832559628622e-06, + "loss": 2.0175, + "step": 19310 + }, + { + "epoch": 1.23648, + "grad_norm": 0.5589911937713623, + "learning_rate": 3.8146310228909876e-06, + "loss": 2.0288, + "step": 19320 + }, + { + "epoch": 1.23712, + "grad_norm": 0.5046820640563965, + "learning_rate": 3.811429486153354e-06, + "loss": 2.032, + "step": 19330 + }, + { + "epoch": 1.23776, + "grad_norm": 0.5820785164833069, + "learning_rate": 3.80822794941572e-06, + "loss": 1.9774, + "step": 19340 + }, + { + "epoch": 1.2384, + "grad_norm": 0.6001272797584534, + "learning_rate": 3.8050264126780855e-06, + "loss": 2.0126, + "step": 19350 + }, + { + "epoch": 1.23904, + "grad_norm": 0.567148745059967, + "learning_rate": 3.801824875940452e-06, + "loss": 2.0103, + "step": 19360 + }, + { + "epoch": 1.23968, + "grad_norm": 0.5717241764068604, + "learning_rate": 3.798623339202818e-06, + "loss": 2.0294, + "step": 19370 + }, + { + "epoch": 1.24032, + "grad_norm": 0.5383074283599854, + "learning_rate": 3.7954218024651835e-06, + "loss": 2.0216, + "step": 19380 + }, + { + "epoch": 1.24096, + "grad_norm": 0.6128333210945129, + "learning_rate": 3.7922202657275496e-06, + "loss": 2.0299, + "step": 19390 + }, + { + "epoch": 1.2416, + "grad_norm": 0.5616031885147095, + "learning_rate": 3.7890187289899154e-06, + "loss": 1.9918, + "step": 19400 + }, + { + "epoch": 1.2416, + "eval_loss": 2.0412375926971436, + "eval_runtime": 98.8709, + "eval_samples_per_second": 10.114, + "eval_steps_per_second": 5.057, + "step": 19400 + }, + { + "epoch": 1.24224, + "grad_norm": 0.5566725730895996, + "learning_rate": 3.7858171922522815e-06, + "loss": 2.0488, + "step": 19410 + }, + { + "epoch": 1.24288, + "grad_norm": 0.6032694578170776, + "learning_rate": 3.7826156555146476e-06, + "loss": 2.0278, + "step": 19420 + }, + { + "epoch": 1.24352, + "grad_norm": 0.5486621260643005, + "learning_rate": 3.7794141187770133e-06, + "loss": 2.0174, + "step": 19430 + }, + { + "epoch": 1.24416, + "grad_norm": 0.5362090468406677, + "learning_rate": 3.776212582039379e-06, + "loss": 2.0443, + "step": 19440 + }, + { + "epoch": 1.2448, + "grad_norm": 0.5653091669082642, + "learning_rate": 3.7730110453017448e-06, + "loss": 1.984, + "step": 19450 + }, + { + "epoch": 1.2454399999999999, + "grad_norm": 0.5224658250808716, + "learning_rate": 3.7698095085641113e-06, + "loss": 2.033, + "step": 19460 + }, + { + "epoch": 1.24608, + "grad_norm": 0.5859370827674866, + "learning_rate": 3.766607971826477e-06, + "loss": 2.0043, + "step": 19470 + }, + { + "epoch": 1.24672, + "grad_norm": 0.5430259108543396, + "learning_rate": 3.7634064350888427e-06, + "loss": 2.0233, + "step": 19480 + }, + { + "epoch": 1.24736, + "grad_norm": 0.5666323900222778, + "learning_rate": 3.760204898351209e-06, + "loss": 2.0392, + "step": 19490 + }, + { + "epoch": 1.248, + "grad_norm": 0.6931468844413757, + "learning_rate": 3.757003361613575e-06, + "loss": 2.0193, + "step": 19500 + }, + { + "epoch": 1.248, + "eval_loss": 2.0411226749420166, + "eval_runtime": 98.7097, + "eval_samples_per_second": 10.131, + "eval_steps_per_second": 5.065, + "step": 19500 + }, + { + "epoch": 1.24864, + "grad_norm": 0.5994915962219238, + "learning_rate": 3.7538018248759407e-06, + "loss": 2.0416, + "step": 19510 + }, + { + "epoch": 1.24928, + "grad_norm": 0.5854692459106445, + "learning_rate": 3.750600288138307e-06, + "loss": 2.0547, + "step": 19520 + }, + { + "epoch": 1.24992, + "grad_norm": 0.58211749792099, + "learning_rate": 3.7473987514006726e-06, + "loss": 2.0424, + "step": 19530 + }, + { + "epoch": 1.2505600000000001, + "grad_norm": 0.6145838499069214, + "learning_rate": 3.7441972146630383e-06, + "loss": 2.0197, + "step": 19540 + }, + { + "epoch": 1.2511999999999999, + "grad_norm": 0.5879749655723572, + "learning_rate": 3.740995677925405e-06, + "loss": 2.0284, + "step": 19550 + }, + { + "epoch": 1.25184, + "grad_norm": 0.5488758683204651, + "learning_rate": 3.7377941411877705e-06, + "loss": 2.027, + "step": 19560 + }, + { + "epoch": 1.25248, + "grad_norm": 0.5636323690414429, + "learning_rate": 3.7345926044501362e-06, + "loss": 2.0064, + "step": 19570 + }, + { + "epoch": 1.25312, + "grad_norm": 0.6053061485290527, + "learning_rate": 3.731391067712502e-06, + "loss": 2.0148, + "step": 19580 + }, + { + "epoch": 1.25376, + "grad_norm": 0.5649168491363525, + "learning_rate": 3.7281895309748685e-06, + "loss": 2.0224, + "step": 19590 + }, + { + "epoch": 1.2544, + "grad_norm": 0.5852454900741577, + "learning_rate": 3.7249879942372342e-06, + "loss": 2.0244, + "step": 19600 + }, + { + "epoch": 1.2544, + "eval_loss": 2.0403201580047607, + "eval_runtime": 98.8341, + "eval_samples_per_second": 10.118, + "eval_steps_per_second": 5.059, + "step": 19600 + }, + { + "epoch": 1.25504, + "grad_norm": 0.5796120762825012, + "learning_rate": 3.7217864574996e-06, + "loss": 2.042, + "step": 19610 + }, + { + "epoch": 1.25568, + "grad_norm": 0.6857411861419678, + "learning_rate": 3.718584920761966e-06, + "loss": 1.9767, + "step": 19620 + }, + { + "epoch": 1.25632, + "grad_norm": 0.7955313324928284, + "learning_rate": 3.7153833840243318e-06, + "loss": 2.0303, + "step": 19630 + }, + { + "epoch": 1.25696, + "grad_norm": 0.6094045639038086, + "learning_rate": 3.712181847286698e-06, + "loss": 2.0435, + "step": 19640 + }, + { + "epoch": 1.2576, + "grad_norm": 0.574123740196228, + "learning_rate": 3.708980310549064e-06, + "loss": 1.9984, + "step": 19650 + }, + { + "epoch": 1.25824, + "grad_norm": 0.5539616346359253, + "learning_rate": 3.7057787738114297e-06, + "loss": 2.029, + "step": 19660 + }, + { + "epoch": 1.25888, + "grad_norm": 0.6223415732383728, + "learning_rate": 3.7025772370737955e-06, + "loss": 2.0268, + "step": 19670 + }, + { + "epoch": 1.25952, + "grad_norm": 0.5583721995353699, + "learning_rate": 3.699375700336162e-06, + "loss": 2.0131, + "step": 19680 + }, + { + "epoch": 1.26016, + "grad_norm": 0.6118350625038147, + "learning_rate": 3.6961741635985277e-06, + "loss": 2.0043, + "step": 19690 + }, + { + "epoch": 1.2608, + "grad_norm": 0.5769184827804565, + "learning_rate": 3.6929726268608934e-06, + "loss": 2.0199, + "step": 19700 + }, + { + "epoch": 1.2608, + "eval_loss": 2.0401883125305176, + "eval_runtime": 98.7506, + "eval_samples_per_second": 10.127, + "eval_steps_per_second": 5.063, + "step": 19700 + }, + { + "epoch": 1.7064935064935065, + "grad_norm": 0.5702980160713196, + "learning_rate": 3.6897710901232596e-06, + "loss": 2.011, + "step": 19710 + }, + { + "epoch": 1.7073593073593074, + "grad_norm": 0.7027571201324463, + "learning_rate": 3.6865695533856257e-06, + "loss": 2.027, + "step": 19720 + }, + { + "epoch": 1.708225108225108, + "grad_norm": 0.6393783092498779, + "learning_rate": 3.6833680166479914e-06, + "loss": 2.0066, + "step": 19730 + }, + { + "epoch": 1.709090909090909, + "grad_norm": 0.548751950263977, + "learning_rate": 3.680166479910357e-06, + "loss": 2.0248, + "step": 19740 + }, + { + "epoch": 1.70995670995671, + "grad_norm": 0.5644584894180298, + "learning_rate": 3.6769649431727233e-06, + "loss": 2.0273, + "step": 19750 + }, + { + "epoch": 1.7108225108225108, + "grad_norm": 0.6028965711593628, + "learning_rate": 3.673763406435089e-06, + "loss": 2.0544, + "step": 19760 + }, + { + "epoch": 1.7116883116883117, + "grad_norm": 0.5295466780662537, + "learning_rate": 3.6705618696974547e-06, + "loss": 2.0201, + "step": 19770 + }, + { + "epoch": 1.7125541125541126, + "grad_norm": 0.5618656873703003, + "learning_rate": 3.6673603329598212e-06, + "loss": 2.0543, + "step": 19780 + }, + { + "epoch": 1.7134199134199135, + "grad_norm": 0.5360836982727051, + "learning_rate": 3.664158796222187e-06, + "loss": 2.0327, + "step": 19790 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.5506560206413269, + "learning_rate": 3.6609572594845527e-06, + "loss": 2.0258, + "step": 19800 + }, + { + "epoch": 1.7142857142857144, + "eval_loss": 2.0400662422180176, + "eval_runtime": 100.1215, + "eval_samples_per_second": 9.988, + "eval_steps_per_second": 4.994, + "step": 19800 + }, + { + "epoch": 1.7151515151515153, + "grad_norm": 0.6489508748054504, + "learning_rate": 3.657755722746919e-06, + "loss": 2.0223, + "step": 19810 + }, + { + "epoch": 1.716017316017316, + "grad_norm": 0.5744234919548035, + "learning_rate": 3.654554186009285e-06, + "loss": 2.0207, + "step": 19820 + }, + { + "epoch": 1.716883116883117, + "grad_norm": 0.5877808332443237, + "learning_rate": 3.6513526492716506e-06, + "loss": 2.0092, + "step": 19830 + }, + { + "epoch": 1.7177489177489178, + "grad_norm": 0.7264795899391174, + "learning_rate": 3.6481511125340168e-06, + "loss": 2.0123, + "step": 19840 + }, + { + "epoch": 1.7186147186147185, + "grad_norm": 0.5963069200515747, + "learning_rate": 3.6449495757963825e-06, + "loss": 2.0067, + "step": 19850 + }, + { + "epoch": 1.7194805194805194, + "grad_norm": 0.6109662055969238, + "learning_rate": 3.6417480390587486e-06, + "loss": 2.0255, + "step": 19860 + }, + { + "epoch": 1.7203463203463203, + "grad_norm": 0.6088973879814148, + "learning_rate": 3.6385465023211143e-06, + "loss": 2.0263, + "step": 19870 + }, + { + "epoch": 1.7212121212121212, + "grad_norm": 0.5885077714920044, + "learning_rate": 3.6353449655834805e-06, + "loss": 2.0018, + "step": 19880 + }, + { + "epoch": 1.722077922077922, + "grad_norm": 0.583109974861145, + "learning_rate": 3.632143428845846e-06, + "loss": 2.0085, + "step": 19890 + }, + { + "epoch": 1.722943722943723, + "grad_norm": 0.5861423015594482, + "learning_rate": 3.628941892108212e-06, + "loss": 2.0149, + "step": 19900 + }, + { + "epoch": 1.722943722943723, + "eval_loss": 2.0403454303741455, + "eval_runtime": 99.8602, + "eval_samples_per_second": 10.014, + "eval_steps_per_second": 5.007, + "step": 19900 + }, + { + "epoch": 1.723809523809524, + "grad_norm": 0.6617264747619629, + "learning_rate": 3.6257403553705784e-06, + "loss": 2.0527, + "step": 19910 + }, + { + "epoch": 1.7246753246753248, + "grad_norm": 0.5962138772010803, + "learning_rate": 3.622538818632944e-06, + "loss": 2.0027, + "step": 19920 + }, + { + "epoch": 1.7255411255411255, + "grad_norm": 0.5586407780647278, + "learning_rate": 3.61933728189531e-06, + "loss": 2.048, + "step": 19930 + }, + { + "epoch": 1.7264069264069264, + "grad_norm": 0.6327878832817078, + "learning_rate": 3.616135745157676e-06, + "loss": 2.0102, + "step": 19940 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 0.6179520487785339, + "learning_rate": 3.612934208420042e-06, + "loss": 2.0093, + "step": 19950 + }, + { + "epoch": 1.728138528138528, + "grad_norm": 0.5419082641601562, + "learning_rate": 3.609732671682408e-06, + "loss": 2.0106, + "step": 19960 + }, + { + "epoch": 1.7290043290043289, + "grad_norm": 0.6235204339027405, + "learning_rate": 3.606531134944774e-06, + "loss": 2.0257, + "step": 19970 + }, + { + "epoch": 1.7298701298701298, + "grad_norm": 0.5981021523475647, + "learning_rate": 3.6033295982071397e-06, + "loss": 2.0155, + "step": 19980 + }, + { + "epoch": 1.7307359307359307, + "grad_norm": 0.5662804841995239, + "learning_rate": 3.6001280614695054e-06, + "loss": 2.0286, + "step": 19990 + }, + { + "epoch": 1.7316017316017316, + "grad_norm": 0.5968201160430908, + "learning_rate": 3.5969265247318715e-06, + "loss": 2.0087, + "step": 20000 + }, + { + "epoch": 1.7316017316017316, + "eval_loss": 2.0399529933929443, + "eval_runtime": 100.0106, + "eval_samples_per_second": 9.999, + "eval_steps_per_second": 4.999, + "step": 20000 + }, + { + "epoch": 1.7324675324675325, + "grad_norm": 0.7249771952629089, + "learning_rate": 3.5937249879942376e-06, + "loss": 2.0221, + "step": 20010 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.6328686475753784, + "learning_rate": 3.5905234512566034e-06, + "loss": 2.0242, + "step": 20020 + }, + { + "epoch": 1.7341991341991343, + "grad_norm": 0.5426235795021057, + "learning_rate": 3.587321914518969e-06, + "loss": 2.0397, + "step": 20030 + }, + { + "epoch": 1.7350649350649352, + "grad_norm": 0.5799441337585449, + "learning_rate": 3.5841203777813356e-06, + "loss": 1.9964, + "step": 20040 + }, + { + "epoch": 1.7359307359307359, + "grad_norm": 0.6211297512054443, + "learning_rate": 3.5809188410437013e-06, + "loss": 1.9779, + "step": 20050 + }, + { + "epoch": 1.7367965367965368, + "grad_norm": 0.583142876625061, + "learning_rate": 3.577717304306067e-06, + "loss": 2.0101, + "step": 20060 + }, + { + "epoch": 1.7376623376623377, + "grad_norm": 0.7151252031326294, + "learning_rate": 3.574515767568433e-06, + "loss": 2.0254, + "step": 20070 + }, + { + "epoch": 1.7385281385281384, + "grad_norm": 0.6126326322555542, + "learning_rate": 3.571314230830799e-06, + "loss": 2.0136, + "step": 20080 + }, + { + "epoch": 1.7393939393939393, + "grad_norm": 0.5718016028404236, + "learning_rate": 3.568112694093165e-06, + "loss": 2.043, + "step": 20090 + }, + { + "epoch": 1.7402597402597402, + "grad_norm": 0.5921180844306946, + "learning_rate": 3.564911157355531e-06, + "loss": 2.012, + "step": 20100 + }, + { + "epoch": 1.7402597402597402, + "eval_loss": 2.039822816848755, + "eval_runtime": 99.8405, + "eval_samples_per_second": 10.016, + "eval_steps_per_second": 5.008, + "step": 20100 + }, + { + "epoch": 1.741125541125541, + "grad_norm": 0.519929051399231, + "learning_rate": 3.561709620617897e-06, + "loss": 2.0345, + "step": 20110 + }, + { + "epoch": 1.741991341991342, + "grad_norm": 0.5680990219116211, + "learning_rate": 3.5585080838802626e-06, + "loss": 2.0114, + "step": 20120 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 0.5650716423988342, + "learning_rate": 3.555306547142629e-06, + "loss": 1.9992, + "step": 20130 + }, + { + "epoch": 1.7437229437229438, + "grad_norm": 0.5756558179855347, + "learning_rate": 3.552105010404995e-06, + "loss": 2.0144, + "step": 20140 + }, + { + "epoch": 1.7445887445887447, + "grad_norm": 0.5543674826622009, + "learning_rate": 3.5489034736673606e-06, + "loss": 2.0499, + "step": 20150 + }, + { + "epoch": 1.7454545454545456, + "grad_norm": 0.6690876483917236, + "learning_rate": 3.5457019369297263e-06, + "loss": 1.9965, + "step": 20160 + }, + { + "epoch": 1.7463203463203463, + "grad_norm": 0.5855807662010193, + "learning_rate": 3.542500400192093e-06, + "loss": 2.0421, + "step": 20170 + }, + { + "epoch": 1.7471861471861472, + "grad_norm": 0.5694772601127625, + "learning_rate": 3.5392988634544585e-06, + "loss": 2.0244, + "step": 20180 + }, + { + "epoch": 1.748051948051948, + "grad_norm": 0.5997563004493713, + "learning_rate": 3.5360973267168242e-06, + "loss": 1.9869, + "step": 20190 + }, + { + "epoch": 1.7489177489177488, + "grad_norm": 0.6609987616539001, + "learning_rate": 3.5328957899791904e-06, + "loss": 2.0407, + "step": 20200 + }, + { + "epoch": 1.7489177489177488, + "eval_loss": 2.039726972579956, + "eval_runtime": 99.685, + "eval_samples_per_second": 10.032, + "eval_steps_per_second": 5.016, + "step": 20200 + }, + { + "epoch": 1.7497835497835497, + "grad_norm": 0.5179834365844727, + "learning_rate": 3.529694253241556e-06, + "loss": 1.9891, + "step": 20210 + }, + { + "epoch": 1.7506493506493506, + "grad_norm": 0.6105602979660034, + "learning_rate": 3.526492716503922e-06, + "loss": 2.0039, + "step": 20220 + }, + { + "epoch": 1.7515151515151515, + "grad_norm": 0.610821545124054, + "learning_rate": 3.5232911797662883e-06, + "loss": 1.9976, + "step": 20230 + }, + { + "epoch": 1.7523809523809524, + "grad_norm": 0.6439399123191833, + "learning_rate": 3.520089643028654e-06, + "loss": 1.9953, + "step": 20240 + }, + { + "epoch": 1.7532467532467533, + "grad_norm": 0.5386427044868469, + "learning_rate": 3.5168881062910198e-06, + "loss": 2.0095, + "step": 20250 + }, + { + "epoch": 1.7541125541125542, + "grad_norm": 0.6169285178184509, + "learning_rate": 3.5136865695533863e-06, + "loss": 2.0514, + "step": 20260 + }, + { + "epoch": 1.754978354978355, + "grad_norm": 0.6283642649650574, + "learning_rate": 3.510485032815752e-06, + "loss": 1.9985, + "step": 20270 + }, + { + "epoch": 1.755844155844156, + "grad_norm": 0.5926968455314636, + "learning_rate": 3.5072834960781177e-06, + "loss": 2.0144, + "step": 20280 + }, + { + "epoch": 1.7567099567099567, + "grad_norm": 0.5889548063278198, + "learning_rate": 3.5040819593404835e-06, + "loss": 2.0257, + "step": 20290 + }, + { + "epoch": 1.7575757575757576, + "grad_norm": 0.5215528011322021, + "learning_rate": 3.5008804226028496e-06, + "loss": 2.0168, + "step": 20300 + }, + { + "epoch": 1.7575757575757576, + "eval_loss": 2.039764642715454, + "eval_runtime": 99.6287, + "eval_samples_per_second": 10.037, + "eval_steps_per_second": 5.019, + "step": 20300 + }, + { + "epoch": 1.7584415584415585, + "grad_norm": 0.5776051878929138, + "learning_rate": 3.4976788858652157e-06, + "loss": 2.0154, + "step": 20310 + }, + { + "epoch": 1.7593073593073592, + "grad_norm": 0.5756100416183472, + "learning_rate": 3.4944773491275814e-06, + "loss": 2.0076, + "step": 20320 + }, + { + "epoch": 1.76017316017316, + "grad_norm": 0.6546595096588135, + "learning_rate": 3.4912758123899476e-06, + "loss": 2.0007, + "step": 20330 + }, + { + "epoch": 1.761038961038961, + "grad_norm": 0.5286757946014404, + "learning_rate": 3.4880742756523133e-06, + "loss": 1.995, + "step": 20340 + }, + { + "epoch": 1.7619047619047619, + "grad_norm": 0.5665123462677002, + "learning_rate": 3.484872738914679e-06, + "loss": 1.9941, + "step": 20350 + }, + { + "epoch": 1.7627705627705628, + "grad_norm": 0.6835453510284424, + "learning_rate": 3.4816712021770455e-06, + "loss": 2.035, + "step": 20360 + }, + { + "epoch": 1.7636363636363637, + "grad_norm": 0.6274100542068481, + "learning_rate": 3.4784696654394113e-06, + "loss": 1.9852, + "step": 20370 + }, + { + "epoch": 1.7645021645021646, + "grad_norm": 0.5485489368438721, + "learning_rate": 3.475268128701777e-06, + "loss": 2.0197, + "step": 20380 + }, + { + "epoch": 1.7653679653679655, + "grad_norm": 0.6467198729515076, + "learning_rate": 3.472066591964143e-06, + "loss": 2.0234, + "step": 20390 + }, + { + "epoch": 1.7662337662337664, + "grad_norm": 0.6412429809570312, + "learning_rate": 3.4688650552265092e-06, + "loss": 2.0303, + "step": 20400 + }, + { + "epoch": 1.7662337662337664, + "eval_loss": 2.039860963821411, + "eval_runtime": 99.6419, + "eval_samples_per_second": 10.036, + "eval_steps_per_second": 5.018, + "step": 20400 + }, + { + "epoch": 1.767099567099567, + "grad_norm": 0.544453501701355, + "learning_rate": 3.465663518488875e-06, + "loss": 2.0253, + "step": 20410 + }, + { + "epoch": 1.767965367965368, + "grad_norm": 0.6093266010284424, + "learning_rate": 3.462461981751241e-06, + "loss": 2.0106, + "step": 20420 + }, + { + "epoch": 1.7688311688311689, + "grad_norm": 0.6506460309028625, + "learning_rate": 3.4592604450136068e-06, + "loss": 2.0196, + "step": 20430 + }, + { + "epoch": 1.7696969696969695, + "grad_norm": 0.6249178647994995, + "learning_rate": 3.4560589082759725e-06, + "loss": 2.0308, + "step": 20440 + }, + { + "epoch": 1.7705627705627704, + "grad_norm": 0.5533695220947266, + "learning_rate": 3.4528573715383386e-06, + "loss": 2.0006, + "step": 20450 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 0.5766792893409729, + "learning_rate": 3.4496558348007048e-06, + "loss": 1.9828, + "step": 20460 + }, + { + "epoch": 1.7722943722943723, + "grad_norm": 0.623255729675293, + "learning_rate": 3.4464542980630705e-06, + "loss": 2.0092, + "step": 20470 + }, + { + "epoch": 1.7731601731601732, + "grad_norm": 0.6529678106307983, + "learning_rate": 3.443252761325436e-06, + "loss": 2.0247, + "step": 20480 + }, + { + "epoch": 1.774025974025974, + "grad_norm": 0.5301777720451355, + "learning_rate": 3.4400512245878027e-06, + "loss": 2.001, + "step": 20490 + }, + { + "epoch": 1.774891774891775, + "grad_norm": 0.5703863501548767, + "learning_rate": 3.4368496878501684e-06, + "loss": 2.0127, + "step": 20500 + }, + { + "epoch": 1.774891774891775, + "eval_loss": 2.039661169052124, + "eval_runtime": 99.803, + "eval_samples_per_second": 10.02, + "eval_steps_per_second": 5.01, + "step": 20500 + }, + { + "epoch": 1.7757575757575759, + "grad_norm": 0.5548557639122009, + "learning_rate": 3.433648151112534e-06, + "loss": 2.0469, + "step": 20510 + }, + { + "epoch": 1.7766233766233768, + "grad_norm": 0.6673153042793274, + "learning_rate": 3.4304466143749003e-06, + "loss": 1.9994, + "step": 20520 + }, + { + "epoch": 1.7774891774891775, + "grad_norm": 0.5282227993011475, + "learning_rate": 3.4272450776372664e-06, + "loss": 2.0299, + "step": 20530 + }, + { + "epoch": 1.7783549783549784, + "grad_norm": 0.5832075476646423, + "learning_rate": 3.424043540899632e-06, + "loss": 2.0186, + "step": 20540 + }, + { + "epoch": 1.7792207792207793, + "grad_norm": 0.585724413394928, + "learning_rate": 3.4208420041619983e-06, + "loss": 2.0172, + "step": 20550 + }, + { + "epoch": 1.78008658008658, + "grad_norm": 0.6119825839996338, + "learning_rate": 3.417640467424364e-06, + "loss": 1.9985, + "step": 20560 + }, + { + "epoch": 1.7809523809523808, + "grad_norm": 0.5499480366706848, + "learning_rate": 3.4144389306867297e-06, + "loss": 2.0324, + "step": 20570 + }, + { + "epoch": 1.7818181818181817, + "grad_norm": 0.5651506185531616, + "learning_rate": 3.4112373939490954e-06, + "loss": 2.0494, + "step": 20580 + }, + { + "epoch": 1.7826839826839826, + "grad_norm": 0.6963779330253601, + "learning_rate": 3.408035857211462e-06, + "loss": 2.0276, + "step": 20590 + }, + { + "epoch": 1.7835497835497836, + "grad_norm": 0.6034294366836548, + "learning_rate": 3.4048343204738277e-06, + "loss": 2.0035, + "step": 20600 + }, + { + "epoch": 1.7835497835497836, + "eval_loss": 2.0393102169036865, + "eval_runtime": 99.6842, + "eval_samples_per_second": 10.032, + "eval_steps_per_second": 5.016, + "step": 20600 + }, + { + "epoch": 1.7844155844155845, + "grad_norm": 0.6472154855728149, + "learning_rate": 3.4016327837361934e-06, + "loss": 1.9977, + "step": 20610 + }, + { + "epoch": 1.7852813852813854, + "grad_norm": 0.5757026076316833, + "learning_rate": 3.39843124699856e-06, + "loss": 2.0111, + "step": 20620 + }, + { + "epoch": 1.7861471861471863, + "grad_norm": 0.5563897490501404, + "learning_rate": 3.3952297102609256e-06, + "loss": 2.0213, + "step": 20630 + }, + { + "epoch": 1.7870129870129872, + "grad_norm": 0.707472026348114, + "learning_rate": 3.3920281735232914e-06, + "loss": 2.014, + "step": 20640 + }, + { + "epoch": 1.7878787878787878, + "grad_norm": 0.6294756531715393, + "learning_rate": 3.3888266367856575e-06, + "loss": 2.0185, + "step": 20650 + }, + { + "epoch": 1.7887445887445887, + "grad_norm": 0.5562750101089478, + "learning_rate": 3.385625100048023e-06, + "loss": 2.0367, + "step": 20660 + }, + { + "epoch": 1.7896103896103897, + "grad_norm": 0.5640225410461426, + "learning_rate": 3.3824235633103893e-06, + "loss": 2.029, + "step": 20670 + }, + { + "epoch": 1.7904761904761903, + "grad_norm": 0.6213473081588745, + "learning_rate": 3.3792220265727555e-06, + "loss": 2.016, + "step": 20680 + }, + { + "epoch": 1.7913419913419912, + "grad_norm": 0.5477250218391418, + "learning_rate": 3.376020489835121e-06, + "loss": 2.0073, + "step": 20690 + }, + { + "epoch": 1.7922077922077921, + "grad_norm": 0.539556086063385, + "learning_rate": 3.372818953097487e-06, + "loss": 2.0243, + "step": 20700 + }, + { + "epoch": 1.7922077922077921, + "eval_loss": 2.0392966270446777, + "eval_runtime": 98.9726, + "eval_samples_per_second": 10.104, + "eval_steps_per_second": 5.052, + "step": 20700 + }, + { + "epoch": 1.793073593073593, + "grad_norm": 0.527823805809021, + "learning_rate": 3.3696174163598526e-06, + "loss": 2.0367, + "step": 20710 + }, + { + "epoch": 1.793939393939394, + "grad_norm": 0.6325352191925049, + "learning_rate": 3.366415879622219e-06, + "loss": 2.0059, + "step": 20720 + }, + { + "epoch": 1.7948051948051948, + "grad_norm": 0.6825228333473206, + "learning_rate": 3.363214342884585e-06, + "loss": 2.0062, + "step": 20730 + }, + { + "epoch": 1.7956709956709958, + "grad_norm": 0.7011380195617676, + "learning_rate": 3.3600128061469506e-06, + "loss": 2.0427, + "step": 20740 + }, + { + "epoch": 1.7965367965367967, + "grad_norm": 0.5550771355628967, + "learning_rate": 3.3568112694093167e-06, + "loss": 2.0133, + "step": 20750 + }, + { + "epoch": 1.7974025974025976, + "grad_norm": 0.6346076130867004, + "learning_rate": 3.353609732671683e-06, + "loss": 2.0068, + "step": 20760 + }, + { + "epoch": 1.7982683982683982, + "grad_norm": 0.6987687349319458, + "learning_rate": 3.3504081959340485e-06, + "loss": 2.0234, + "step": 20770 + }, + { + "epoch": 1.7991341991341991, + "grad_norm": 0.5877545475959778, + "learning_rate": 3.3472066591964147e-06, + "loss": 1.9782, + "step": 20780 + }, + { + "epoch": 1.8, + "grad_norm": 0.59261554479599, + "learning_rate": 3.3440051224587804e-06, + "loss": 1.9872, + "step": 20790 + }, + { + "epoch": 1.8008658008658007, + "grad_norm": 0.6158283352851868, + "learning_rate": 3.340803585721146e-06, + "loss": 2.0009, + "step": 20800 + }, + { + "epoch": 1.8008658008658007, + "eval_loss": 2.039147138595581, + "eval_runtime": 98.9251, + "eval_samples_per_second": 10.109, + "eval_steps_per_second": 5.054, + "step": 20800 + }, + { + "epoch": 1.8017316017316016, + "grad_norm": 0.6955145597457886, + "learning_rate": 3.3376020489835127e-06, + "loss": 2.0368, + "step": 20810 + }, + { + "epoch": 1.8025974025974025, + "grad_norm": 0.6685484051704407, + "learning_rate": 3.3344005122458784e-06, + "loss": 2.0246, + "step": 20820 + }, + { + "epoch": 1.8034632034632034, + "grad_norm": 0.6130850315093994, + "learning_rate": 3.331198975508244e-06, + "loss": 2.0063, + "step": 20830 + }, + { + "epoch": 1.8043290043290043, + "grad_norm": 0.6293763518333435, + "learning_rate": 3.3279974387706106e-06, + "loss": 2.0222, + "step": 20840 + }, + { + "epoch": 1.8051948051948052, + "grad_norm": 0.5538076758384705, + "learning_rate": 3.3247959020329763e-06, + "loss": 2.009, + "step": 20850 + }, + { + "epoch": 1.8060606060606061, + "grad_norm": 0.5926476120948792, + "learning_rate": 3.321594365295342e-06, + "loss": 2.045, + "step": 20860 + }, + { + "epoch": 1.806926406926407, + "grad_norm": 0.5465952157974243, + "learning_rate": 3.3183928285577078e-06, + "loss": 2.0329, + "step": 20870 + }, + { + "epoch": 1.807792207792208, + "grad_norm": 0.5225395560264587, + "learning_rate": 3.315191291820074e-06, + "loss": 1.9831, + "step": 20880 + }, + { + "epoch": 1.8086580086580086, + "grad_norm": 0.5721254348754883, + "learning_rate": 3.3119897550824396e-06, + "loss": 2.0125, + "step": 20890 + }, + { + "epoch": 1.8095238095238095, + "grad_norm": 0.49629688262939453, + "learning_rate": 3.3087882183448057e-06, + "loss": 2.0262, + "step": 20900 + }, + { + "epoch": 1.8095238095238095, + "eval_loss": 2.0388810634613037, + "eval_runtime": 98.8941, + "eval_samples_per_second": 10.112, + "eval_steps_per_second": 5.056, + "step": 20900 + }, + { + "epoch": 1.8103896103896104, + "grad_norm": 0.5626356601715088, + "learning_rate": 3.305586681607172e-06, + "loss": 2.0369, + "step": 20910 + }, + { + "epoch": 1.8112554112554111, + "grad_norm": 0.5846545696258545, + "learning_rate": 3.3023851448695376e-06, + "loss": 2.0192, + "step": 20920 + }, + { + "epoch": 1.812121212121212, + "grad_norm": 0.5293322205543518, + "learning_rate": 3.2991836081319033e-06, + "loss": 2.0366, + "step": 20930 + }, + { + "epoch": 1.812987012987013, + "grad_norm": 0.5870088934898376, + "learning_rate": 3.29598207139427e-06, + "loss": 2.0465, + "step": 20940 + }, + { + "epoch": 1.8138528138528138, + "grad_norm": 0.6359837651252747, + "learning_rate": 3.2927805346566356e-06, + "loss": 1.9992, + "step": 20950 + }, + { + "epoch": 1.8147186147186147, + "grad_norm": 0.577740490436554, + "learning_rate": 3.2895789979190013e-06, + "loss": 1.9785, + "step": 20960 + }, + { + "epoch": 1.8155844155844156, + "grad_norm": 0.6007850766181946, + "learning_rate": 3.2863774611813674e-06, + "loss": 1.9854, + "step": 20970 + }, + { + "epoch": 1.8164502164502165, + "grad_norm": 0.6609432697296143, + "learning_rate": 3.2831759244437335e-06, + "loss": 2.0204, + "step": 20980 + }, + { + "epoch": 1.8173160173160174, + "grad_norm": 0.664561927318573, + "learning_rate": 3.2799743877060993e-06, + "loss": 1.9866, + "step": 20990 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.5524464845657349, + "learning_rate": 3.276772850968465e-06, + "loss": 1.9935, + "step": 21000 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 2.038954496383667, + "eval_runtime": 99.5448, + "eval_samples_per_second": 10.046, + "eval_steps_per_second": 5.023, + "step": 21000 + }, + { + "epoch": 1.819047619047619, + "grad_norm": 0.5762661695480347, + "learning_rate": 3.273571314230831e-06, + "loss": 2.0434, + "step": 21010 + }, + { + "epoch": 1.81991341991342, + "grad_norm": 0.6803544163703918, + "learning_rate": 3.270369777493197e-06, + "loss": 2.0162, + "step": 21020 + }, + { + "epoch": 1.8207792207792208, + "grad_norm": 0.5953033566474915, + "learning_rate": 3.2671682407555625e-06, + "loss": 2.0268, + "step": 21030 + }, + { + "epoch": 1.8216450216450215, + "grad_norm": 0.5579327940940857, + "learning_rate": 3.263966704017929e-06, + "loss": 2.0112, + "step": 21040 + }, + { + "epoch": 1.8225108225108224, + "grad_norm": 0.5896299481391907, + "learning_rate": 3.2607651672802948e-06, + "loss": 1.9973, + "step": 21050 + }, + { + "epoch": 1.8233766233766233, + "grad_norm": 0.5517136454582214, + "learning_rate": 3.2575636305426605e-06, + "loss": 2.0676, + "step": 21060 + }, + { + "epoch": 1.8242424242424242, + "grad_norm": 0.5696994662284851, + "learning_rate": 3.254362093805027e-06, + "loss": 2.0096, + "step": 21070 + }, + { + "epoch": 1.8251082251082251, + "grad_norm": 0.602047324180603, + "learning_rate": 3.2511605570673928e-06, + "loss": 2.024, + "step": 21080 + }, + { + "epoch": 1.825974025974026, + "grad_norm": 0.531102180480957, + "learning_rate": 3.2479590203297585e-06, + "loss": 2.0221, + "step": 21090 + }, + { + "epoch": 1.826839826839827, + "grad_norm": 0.5972487330436707, + "learning_rate": 3.2447574835921246e-06, + "loss": 2.0167, + "step": 21100 + }, + { + "epoch": 1.826839826839827, + "eval_loss": 2.038703680038452, + "eval_runtime": 99.4301, + "eval_samples_per_second": 10.057, + "eval_steps_per_second": 5.029, + "step": 21100 + }, + { + "epoch": 1.8277056277056278, + "grad_norm": 0.6066187620162964, + "learning_rate": 3.2415559468544903e-06, + "loss": 2.0174, + "step": 21110 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 0.5727007985115051, + "learning_rate": 3.2383544101168564e-06, + "loss": 2.0065, + "step": 21120 + }, + { + "epoch": 1.8294372294372294, + "grad_norm": 0.6033422946929932, + "learning_rate": 3.235152873379222e-06, + "loss": 2.0072, + "step": 21130 + }, + { + "epoch": 1.8303030303030303, + "grad_norm": 0.5761565566062927, + "learning_rate": 3.2319513366415883e-06, + "loss": 2.0455, + "step": 21140 + }, + { + "epoch": 1.8311688311688312, + "grad_norm": 0.5772866606712341, + "learning_rate": 3.228749799903954e-06, + "loss": 2.0235, + "step": 21150 + }, + { + "epoch": 1.832034632034632, + "grad_norm": 0.5313910841941833, + "learning_rate": 3.2255482631663197e-06, + "loss": 1.9966, + "step": 21160 + }, + { + "epoch": 1.8329004329004328, + "grad_norm": 0.5462523698806763, + "learning_rate": 3.2223467264286863e-06, + "loss": 2.0213, + "step": 21170 + }, + { + "epoch": 1.8337662337662337, + "grad_norm": 0.5453023314476013, + "learning_rate": 3.219145189691052e-06, + "loss": 2.0008, + "step": 21180 + }, + { + "epoch": 1.8346320346320346, + "grad_norm": 0.5670868158340454, + "learning_rate": 3.2159436529534177e-06, + "loss": 2.0036, + "step": 21190 + }, + { + "epoch": 1.8354978354978355, + "grad_norm": 0.6061764359474182, + "learning_rate": 3.212742116215784e-06, + "loss": 2.0033, + "step": 21200 + }, + { + "epoch": 1.8354978354978355, + "eval_loss": 2.038498640060425, + "eval_runtime": 99.564, + "eval_samples_per_second": 10.044, + "eval_steps_per_second": 5.022, + "step": 21200 + }, + { + "epoch": 1.8363636363636364, + "grad_norm": 0.5856644511222839, + "learning_rate": 3.20954057947815e-06, + "loss": 2.0251, + "step": 21210 + }, + { + "epoch": 1.8372294372294373, + "grad_norm": 0.5439422726631165, + "learning_rate": 3.2063390427405157e-06, + "loss": 2.0237, + "step": 21220 + }, + { + "epoch": 1.8380952380952382, + "grad_norm": 0.585919976234436, + "learning_rate": 3.203137506002882e-06, + "loss": 2.021, + "step": 21230 + }, + { + "epoch": 1.838961038961039, + "grad_norm": 0.6343615055084229, + "learning_rate": 3.1999359692652475e-06, + "loss": 2.0149, + "step": 21240 + }, + { + "epoch": 1.8398268398268398, + "grad_norm": 0.6096217036247253, + "learning_rate": 3.1967344325276132e-06, + "loss": 2.0069, + "step": 21250 + }, + { + "epoch": 1.8406926406926407, + "grad_norm": 0.5958551168441772, + "learning_rate": 3.1935328957899798e-06, + "loss": 2.004, + "step": 21260 + }, + { + "epoch": 1.8415584415584414, + "grad_norm": 0.6105349659919739, + "learning_rate": 3.1903313590523455e-06, + "loss": 1.9986, + "step": 21270 + }, + { + "epoch": 1.8424242424242423, + "grad_norm": 0.6545379757881165, + "learning_rate": 3.187129822314711e-06, + "loss": 2.0178, + "step": 21280 + }, + { + "epoch": 1.8432900432900432, + "grad_norm": 0.6139872670173645, + "learning_rate": 3.183928285577077e-06, + "loss": 2.0106, + "step": 21290 + }, + { + "epoch": 1.844155844155844, + "grad_norm": 0.5510513782501221, + "learning_rate": 3.1807267488394435e-06, + "loss": 2.0068, + "step": 21300 + }, + { + "epoch": 1.844155844155844, + "eval_loss": 2.038245439529419, + "eval_runtime": 99.354, + "eval_samples_per_second": 10.065, + "eval_steps_per_second": 5.033, + "step": 21300 + }, + { + "epoch": 1.845021645021645, + "grad_norm": 0.6139474511146545, + "learning_rate": 3.177525212101809e-06, + "loss": 2.0397, + "step": 21310 + }, + { + "epoch": 1.845887445887446, + "grad_norm": 0.5427426099777222, + "learning_rate": 3.174323675364175e-06, + "loss": 2.0191, + "step": 21320 + }, + { + "epoch": 1.8467532467532468, + "grad_norm": 0.5988373756408691, + "learning_rate": 3.171122138626541e-06, + "loss": 2.0261, + "step": 21330 + }, + { + "epoch": 1.8476190476190477, + "grad_norm": 0.5978485941886902, + "learning_rate": 3.1679206018889067e-06, + "loss": 1.9954, + "step": 21340 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.5510175228118896, + "learning_rate": 3.164719065151273e-06, + "loss": 2.0011, + "step": 21350 + }, + { + "epoch": 1.8493506493506493, + "grad_norm": 0.589741051197052, + "learning_rate": 3.161517528413639e-06, + "loss": 2.0222, + "step": 21360 + }, + { + "epoch": 1.8502164502164502, + "grad_norm": 0.6140208840370178, + "learning_rate": 3.1583159916760047e-06, + "loss": 2.0232, + "step": 21370 + }, + { + "epoch": 1.851082251082251, + "grad_norm": 0.6576170921325684, + "learning_rate": 3.1551144549383704e-06, + "loss": 2.039, + "step": 21380 + }, + { + "epoch": 1.8519480519480518, + "grad_norm": 0.7040467858314514, + "learning_rate": 3.151912918200737e-06, + "loss": 2.0246, + "step": 21390 + }, + { + "epoch": 1.8528138528138527, + "grad_norm": 0.6736012697219849, + "learning_rate": 3.1487113814631027e-06, + "loss": 2.0045, + "step": 21400 + }, + { + "epoch": 1.8528138528138527, + "eval_loss": 2.038057327270508, + "eval_runtime": 99.3538, + "eval_samples_per_second": 10.065, + "eval_steps_per_second": 5.033, + "step": 21400 + }, + { + "epoch": 1.8536796536796536, + "grad_norm": 0.536457359790802, + "learning_rate": 3.1455098447254684e-06, + "loss": 1.9998, + "step": 21410 + }, + { + "epoch": 1.8545454545454545, + "grad_norm": 0.5989015698432922, + "learning_rate": 3.142308307987834e-06, + "loss": 2.0318, + "step": 21420 + }, + { + "epoch": 1.8554112554112554, + "grad_norm": 0.5948752760887146, + "learning_rate": 3.1391067712502007e-06, + "loss": 1.9903, + "step": 21430 + }, + { + "epoch": 1.8562770562770563, + "grad_norm": 0.6903519034385681, + "learning_rate": 3.1359052345125664e-06, + "loss": 1.9955, + "step": 21440 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.5571711659431458, + "learning_rate": 3.132703697774932e-06, + "loss": 2.0192, + "step": 21450 + }, + { + "epoch": 1.858008658008658, + "grad_norm": 0.5514909029006958, + "learning_rate": 3.129502161037298e-06, + "loss": 1.9942, + "step": 21460 + }, + { + "epoch": 1.858874458874459, + "grad_norm": 0.6278780698776245, + "learning_rate": 3.126300624299664e-06, + "loss": 2.0145, + "step": 21470 + }, + { + "epoch": 1.8597402597402597, + "grad_norm": 0.587446928024292, + "learning_rate": 3.1230990875620296e-06, + "loss": 2.0016, + "step": 21480 + }, + { + "epoch": 1.8606060606060606, + "grad_norm": 0.6872239708900452, + "learning_rate": 3.119897550824396e-06, + "loss": 2.0476, + "step": 21490 + }, + { + "epoch": 1.8614718614718615, + "grad_norm": 0.5873289108276367, + "learning_rate": 3.116696014086762e-06, + "loss": 2.0128, + "step": 21500 + }, + { + "epoch": 1.8614718614718615, + "eval_loss": 2.037977457046509, + "eval_runtime": 99.2661, + "eval_samples_per_second": 10.074, + "eval_steps_per_second": 5.037, + "step": 21500 + }, + { + "epoch": 1.8623376623376622, + "grad_norm": 0.631047785282135, + "learning_rate": 3.1134944773491276e-06, + "loss": 1.984, + "step": 21510 + }, + { + "epoch": 1.863203463203463, + "grad_norm": 0.5543110966682434, + "learning_rate": 3.110292940611494e-06, + "loss": 1.9967, + "step": 21520 + }, + { + "epoch": 1.864069264069264, + "grad_norm": 0.5351025462150574, + "learning_rate": 3.10709140387386e-06, + "loss": 2.0258, + "step": 21530 + }, + { + "epoch": 1.864935064935065, + "grad_norm": 0.596958339214325, + "learning_rate": 3.1038898671362256e-06, + "loss": 2.0264, + "step": 21540 + }, + { + "epoch": 1.8658008658008658, + "grad_norm": 0.6210545301437378, + "learning_rate": 3.1006883303985917e-06, + "loss": 2.0105, + "step": 21550 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.6117931008338928, + "learning_rate": 3.0974867936609574e-06, + "loss": 2.0459, + "step": 21560 + }, + { + "epoch": 1.8675324675324676, + "grad_norm": 0.5893979072570801, + "learning_rate": 3.0942852569233236e-06, + "loss": 2.015, + "step": 21570 + }, + { + "epoch": 1.8683982683982685, + "grad_norm": 0.6391885280609131, + "learning_rate": 3.0910837201856893e-06, + "loss": 2.0035, + "step": 21580 + }, + { + "epoch": 1.8692640692640694, + "grad_norm": 0.5598975419998169, + "learning_rate": 3.0878821834480554e-06, + "loss": 2.0405, + "step": 21590 + }, + { + "epoch": 1.87012987012987, + "grad_norm": 0.5230623483657837, + "learning_rate": 3.084680646710421e-06, + "loss": 2.0182, + "step": 21600 + }, + { + "epoch": 1.87012987012987, + "eval_loss": 2.037860870361328, + "eval_runtime": 99.3825, + "eval_samples_per_second": 10.062, + "eval_steps_per_second": 5.031, + "step": 21600 + }, + { + "epoch": 1.870995670995671, + "grad_norm": 0.6047036647796631, + "learning_rate": 3.081479109972787e-06, + "loss": 2.0015, + "step": 21610 + }, + { + "epoch": 1.871861471861472, + "grad_norm": 0.573807954788208, + "learning_rate": 3.0782775732351534e-06, + "loss": 2.003, + "step": 21620 + }, + { + "epoch": 1.8727272727272726, + "grad_norm": 0.5762017965316772, + "learning_rate": 3.075076036497519e-06, + "loss": 2.0432, + "step": 21630 + }, + { + "epoch": 1.8735930735930735, + "grad_norm": 0.5771380662918091, + "learning_rate": 3.071874499759885e-06, + "loss": 1.996, + "step": 21640 + }, + { + "epoch": 1.8744588744588744, + "grad_norm": 0.6185895204544067, + "learning_rate": 3.0686729630222514e-06, + "loss": 1.9864, + "step": 21650 + }, + { + "epoch": 1.8753246753246753, + "grad_norm": 0.5651230812072754, + "learning_rate": 3.065471426284617e-06, + "loss": 2.0112, + "step": 21660 + }, + { + "epoch": 1.8761904761904762, + "grad_norm": 0.5942379236221313, + "learning_rate": 3.0622698895469828e-06, + "loss": 1.9949, + "step": 21670 + }, + { + "epoch": 1.877056277056277, + "grad_norm": 0.6241719126701355, + "learning_rate": 3.059068352809349e-06, + "loss": 2.0335, + "step": 21680 + }, + { + "epoch": 1.877922077922078, + "grad_norm": 0.5679668188095093, + "learning_rate": 3.0558668160717146e-06, + "loss": 2.0457, + "step": 21690 + }, + { + "epoch": 1.878787878787879, + "grad_norm": 0.555386483669281, + "learning_rate": 3.0526652793340803e-06, + "loss": 2.0202, + "step": 21700 + }, + { + "epoch": 1.878787878787879, + "eval_loss": 2.037943124771118, + "eval_runtime": 98.9297, + "eval_samples_per_second": 10.108, + "eval_steps_per_second": 5.054, + "step": 21700 + }, + { + "epoch": 1.8796536796536798, + "grad_norm": 0.5467822551727295, + "learning_rate": 3.0494637425964465e-06, + "loss": 2.0007, + "step": 21710 + }, + { + "epoch": 1.8805194805194805, + "grad_norm": 0.5980138778686523, + "learning_rate": 3.0462622058588126e-06, + "loss": 2.0131, + "step": 21720 + }, + { + "epoch": 1.8813852813852814, + "grad_norm": 0.5898545980453491, + "learning_rate": 3.0430606691211783e-06, + "loss": 2.0008, + "step": 21730 + }, + { + "epoch": 1.8822510822510823, + "grad_norm": 0.5178360342979431, + "learning_rate": 3.039859132383544e-06, + "loss": 2.0268, + "step": 21740 + }, + { + "epoch": 1.883116883116883, + "grad_norm": 0.5681174397468567, + "learning_rate": 3.0366575956459106e-06, + "loss": 2.0395, + "step": 21750 + }, + { + "epoch": 1.8839826839826839, + "grad_norm": 0.5684530138969421, + "learning_rate": 3.0334560589082763e-06, + "loss": 2.0093, + "step": 21760 + }, + { + "epoch": 1.8848484848484848, + "grad_norm": 0.5369515419006348, + "learning_rate": 3.030254522170642e-06, + "loss": 2.0208, + "step": 21770 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 0.596129834651947, + "learning_rate": 3.027052985433008e-06, + "loss": 2.0012, + "step": 21780 + }, + { + "epoch": 1.8865800865800866, + "grad_norm": 0.5867626667022705, + "learning_rate": 3.0238514486953743e-06, + "loss": 2.0238, + "step": 21790 + }, + { + "epoch": 1.8874458874458875, + "grad_norm": 0.5560017824172974, + "learning_rate": 3.02064991195774e-06, + "loss": 2.0307, + "step": 21800 + }, + { + "epoch": 1.8874458874458875, + "eval_loss": 2.0375423431396484, + "eval_runtime": 99.0735, + "eval_samples_per_second": 10.094, + "eval_steps_per_second": 5.047, + "step": 21800 + }, + { + "epoch": 1.8883116883116884, + "grad_norm": 0.5294777154922485, + "learning_rate": 3.017448375220106e-06, + "loss": 2.0005, + "step": 21810 + }, + { + "epoch": 1.8891774891774893, + "grad_norm": 0.5869369506835938, + "learning_rate": 3.014246838482472e-06, + "loss": 2.0027, + "step": 21820 + }, + { + "epoch": 1.8900432900432902, + "grad_norm": 0.6573461294174194, + "learning_rate": 3.0110453017448375e-06, + "loss": 2.0265, + "step": 21830 + }, + { + "epoch": 1.8909090909090909, + "grad_norm": 0.6510856747627258, + "learning_rate": 3.0078437650072032e-06, + "loss": 2.0231, + "step": 21840 + }, + { + "epoch": 1.8917748917748918, + "grad_norm": 0.5690562129020691, + "learning_rate": 3.00464222826957e-06, + "loss": 2.0043, + "step": 21850 + }, + { + "epoch": 1.8926406926406927, + "grad_norm": 0.5478546619415283, + "learning_rate": 3.0014406915319355e-06, + "loss": 2.0262, + "step": 21860 + }, + { + "epoch": 1.8935064935064934, + "grad_norm": 0.6657856106758118, + "learning_rate": 2.9982391547943012e-06, + "loss": 1.9971, + "step": 21870 + }, + { + "epoch": 1.8943722943722943, + "grad_norm": 0.5432700514793396, + "learning_rate": 2.9950376180566678e-06, + "loss": 2.0038, + "step": 21880 + }, + { + "epoch": 1.8952380952380952, + "grad_norm": 0.6394188404083252, + "learning_rate": 2.9918360813190335e-06, + "loss": 2.012, + "step": 21890 + }, + { + "epoch": 1.896103896103896, + "grad_norm": 0.5569697618484497, + "learning_rate": 2.988634544581399e-06, + "loss": 2.0136, + "step": 21900 + }, + { + "epoch": 1.896103896103896, + "eval_loss": 2.0372016429901123, + "eval_runtime": 99.5333, + "eval_samples_per_second": 10.047, + "eval_steps_per_second": 5.023, + "step": 21900 + }, + { + "epoch": 1.896969696969697, + "grad_norm": 0.5097722411155701, + "learning_rate": 2.9854330078437653e-06, + "loss": 2.0326, + "step": 21910 + }, + { + "epoch": 1.8978354978354979, + "grad_norm": 0.6028870344161987, + "learning_rate": 2.982231471106131e-06, + "loss": 2.0245, + "step": 21920 + }, + { + "epoch": 1.8987012987012988, + "grad_norm": 0.6038254499435425, + "learning_rate": 2.979029934368497e-06, + "loss": 2.0398, + "step": 21930 + }, + { + "epoch": 1.8995670995670997, + "grad_norm": 0.5593138933181763, + "learning_rate": 2.9758283976308633e-06, + "loss": 1.9825, + "step": 21940 + }, + { + "epoch": 1.9004329004329006, + "grad_norm": 0.6215075254440308, + "learning_rate": 2.972626860893229e-06, + "loss": 2.0137, + "step": 21950 + }, + { + "epoch": 1.9012987012987013, + "grad_norm": 0.59126877784729, + "learning_rate": 2.9694253241555947e-06, + "loss": 2.0392, + "step": 21960 + }, + { + "epoch": 1.9021645021645022, + "grad_norm": 0.6107071042060852, + "learning_rate": 2.9662237874179613e-06, + "loss": 2.0244, + "step": 21970 + }, + { + "epoch": 1.903030303030303, + "grad_norm": 0.5983561873435974, + "learning_rate": 2.963022250680327e-06, + "loss": 2.0302, + "step": 21980 + }, + { + "epoch": 1.9038961038961038, + "grad_norm": 0.6307891607284546, + "learning_rate": 2.9598207139426927e-06, + "loss": 1.9965, + "step": 21990 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.623501181602478, + "learning_rate": 2.9566191772050584e-06, + "loss": 2.0181, + "step": 22000 + }, + { + "epoch": 1.9047619047619047, + "eval_loss": 2.0373153686523438, + "eval_runtime": 99.5258, + "eval_samples_per_second": 10.048, + "eval_steps_per_second": 5.024, + "step": 22000 + }, + { + "epoch": 1.9056277056277056, + "grad_norm": 0.5510867238044739, + "learning_rate": 2.9534176404674245e-06, + "loss": 2.0109, + "step": 22010 + }, + { + "epoch": 1.9064935064935065, + "grad_norm": 0.6466839909553528, + "learning_rate": 2.9502161037297907e-06, + "loss": 2.0275, + "step": 22020 + }, + { + "epoch": 1.9073593073593074, + "grad_norm": 0.547214925289154, + "learning_rate": 2.9470145669921564e-06, + "loss": 2.0352, + "step": 22030 + }, + { + "epoch": 1.9082251082251083, + "grad_norm": 0.6152887344360352, + "learning_rate": 2.9438130302545225e-06, + "loss": 1.9707, + "step": 22040 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.6470698714256287, + "learning_rate": 2.9406114935168882e-06, + "loss": 2.0254, + "step": 22050 + }, + { + "epoch": 1.90995670995671, + "grad_norm": 0.5699059367179871, + "learning_rate": 2.937409956779254e-06, + "loss": 1.9882, + "step": 22060 + }, + { + "epoch": 1.910822510822511, + "grad_norm": 0.625361979007721, + "learning_rate": 2.9342084200416205e-06, + "loss": 2.0048, + "step": 22070 + }, + { + "epoch": 1.9116883116883117, + "grad_norm": 0.5219143629074097, + "learning_rate": 2.931006883303986e-06, + "loss": 2.0008, + "step": 22080 + }, + { + "epoch": 1.9125541125541126, + "grad_norm": 0.5905846953392029, + "learning_rate": 2.927805346566352e-06, + "loss": 1.9883, + "step": 22090 + }, + { + "epoch": 1.9134199134199135, + "grad_norm": 0.5829472541809082, + "learning_rate": 2.9246038098287185e-06, + "loss": 2.0223, + "step": 22100 + }, + { + "epoch": 1.9134199134199135, + "eval_loss": 2.0372154712677, + "eval_runtime": 99.6277, + "eval_samples_per_second": 10.037, + "eval_steps_per_second": 5.019, + "step": 22100 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.5397132635116577, + "learning_rate": 2.921402273091084e-06, + "loss": 2.0253, + "step": 22110 + }, + { + "epoch": 1.915151515151515, + "grad_norm": 0.5382157564163208, + "learning_rate": 2.91820073635345e-06, + "loss": 2.0102, + "step": 22120 + }, + { + "epoch": 1.916017316017316, + "grad_norm": 0.5529114007949829, + "learning_rate": 2.9149991996158156e-06, + "loss": 2.0336, + "step": 22130 + }, + { + "epoch": 1.9168831168831169, + "grad_norm": 0.5019690990447998, + "learning_rate": 2.9117976628781817e-06, + "loss": 2.0327, + "step": 22140 + }, + { + "epoch": 1.9177489177489178, + "grad_norm": 0.5692863464355469, + "learning_rate": 2.9085961261405475e-06, + "loss": 2.013, + "step": 22150 + }, + { + "epoch": 1.9186147186147187, + "grad_norm": 0.5849686861038208, + "learning_rate": 2.9053945894029136e-06, + "loss": 2.0447, + "step": 22160 + }, + { + "epoch": 1.9194805194805196, + "grad_norm": 0.6239033341407776, + "learning_rate": 2.9021930526652797e-06, + "loss": 2.0144, + "step": 22170 + }, + { + "epoch": 1.9203463203463205, + "grad_norm": 0.558525800704956, + "learning_rate": 2.8989915159276454e-06, + "loss": 2.0499, + "step": 22180 + }, + { + "epoch": 1.9212121212121214, + "grad_norm": 0.5736129283905029, + "learning_rate": 2.895789979190011e-06, + "loss": 2.024, + "step": 22190 + }, + { + "epoch": 1.922077922077922, + "grad_norm": 0.5700423717498779, + "learning_rate": 2.8925884424523777e-06, + "loss": 2.0072, + "step": 22200 + }, + { + "epoch": 1.922077922077922, + "eval_loss": 2.036369562149048, + "eval_runtime": 99.4116, + "eval_samples_per_second": 10.059, + "eval_steps_per_second": 5.03, + "step": 22200 + }, + { + "epoch": 1.922943722943723, + "grad_norm": 0.6457517743110657, + "learning_rate": 2.8893869057147434e-06, + "loss": 2.0013, + "step": 22210 + }, + { + "epoch": 1.9238095238095239, + "grad_norm": 0.6279447674751282, + "learning_rate": 2.886185368977109e-06, + "loss": 1.9793, + "step": 22220 + }, + { + "epoch": 1.9246753246753245, + "grad_norm": 0.6363939642906189, + "learning_rate": 2.8829838322394752e-06, + "loss": 2.0159, + "step": 22230 + }, + { + "epoch": 1.9255411255411254, + "grad_norm": 0.6827042102813721, + "learning_rate": 2.8797822955018414e-06, + "loss": 2.0152, + "step": 22240 + }, + { + "epoch": 1.9264069264069263, + "grad_norm": 0.6012581586837769, + "learning_rate": 2.876580758764207e-06, + "loss": 2.024, + "step": 22250 + }, + { + "epoch": 1.9272727272727272, + "grad_norm": 0.5410102009773254, + "learning_rate": 2.8733792220265732e-06, + "loss": 2.0303, + "step": 22260 + }, + { + "epoch": 1.9281385281385282, + "grad_norm": 0.5607244372367859, + "learning_rate": 2.870177685288939e-06, + "loss": 2.0185, + "step": 22270 + }, + { + "epoch": 1.929004329004329, + "grad_norm": 0.5890989899635315, + "learning_rate": 2.8669761485513046e-06, + "loss": 1.9857, + "step": 22280 + }, + { + "epoch": 1.92987012987013, + "grad_norm": 0.5300803780555725, + "learning_rate": 2.8637746118136704e-06, + "loss": 2.0256, + "step": 22290 + }, + { + "epoch": 1.9307359307359309, + "grad_norm": 0.5861728191375732, + "learning_rate": 2.860573075076037e-06, + "loss": 2.0206, + "step": 22300 + }, + { + "epoch": 1.9307359307359309, + "eval_loss": 2.036616802215576, + "eval_runtime": 99.5185, + "eval_samples_per_second": 10.048, + "eval_steps_per_second": 5.024, + "step": 22300 + }, + { + "epoch": 1.9316017316017318, + "grad_norm": 0.5773259401321411, + "learning_rate": 2.8573715383384026e-06, + "loss": 2.0219, + "step": 22310 + }, + { + "epoch": 1.9324675324675324, + "grad_norm": 0.723914623260498, + "learning_rate": 2.8541700016007683e-06, + "loss": 2.0343, + "step": 22320 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.5671261548995972, + "learning_rate": 2.850968464863135e-06, + "loss": 2.0309, + "step": 22330 + }, + { + "epoch": 1.9341991341991343, + "grad_norm": 0.566591203212738, + "learning_rate": 2.8477669281255006e-06, + "loss": 2.0093, + "step": 22340 + }, + { + "epoch": 1.935064935064935, + "grad_norm": 0.5599043965339661, + "learning_rate": 2.8445653913878663e-06, + "loss": 2.0006, + "step": 22350 + }, + { + "epoch": 1.9359307359307358, + "grad_norm": 0.5229726433753967, + "learning_rate": 2.8413638546502324e-06, + "loss": 2.0483, + "step": 22360 + }, + { + "epoch": 1.9367965367965367, + "grad_norm": 0.5757012367248535, + "learning_rate": 2.838162317912598e-06, + "loss": 2.0317, + "step": 22370 + }, + { + "epoch": 1.9376623376623376, + "grad_norm": 0.5612809658050537, + "learning_rate": 2.8349607811749643e-06, + "loss": 2.0107, + "step": 22380 + }, + { + "epoch": 1.9385281385281385, + "grad_norm": 0.6425288319587708, + "learning_rate": 2.8317592444373304e-06, + "loss": 1.9869, + "step": 22390 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 0.5596022605895996, + "learning_rate": 2.828557707699696e-06, + "loss": 1.9866, + "step": 22400 + }, + { + "epoch": 1.9393939393939394, + "eval_loss": 2.0361886024475098, + "eval_runtime": 99.3361, + "eval_samples_per_second": 10.067, + "eval_steps_per_second": 5.033, + "step": 22400 + }, + { + "epoch": 1.9402597402597404, + "grad_norm": 0.5739807486534119, + "learning_rate": 2.825356170962062e-06, + "loss": 1.9978, + "step": 22410 + }, + { + "epoch": 1.9411255411255413, + "grad_norm": 0.5819233059883118, + "learning_rate": 2.8221546342244276e-06, + "loss": 1.9841, + "step": 22420 + }, + { + "epoch": 1.9419913419913422, + "grad_norm": 0.5564745664596558, + "learning_rate": 2.818953097486794e-06, + "loss": 2.0194, + "step": 22430 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 0.5947351455688477, + "learning_rate": 2.81575156074916e-06, + "loss": 1.9965, + "step": 22440 + }, + { + "epoch": 1.9437229437229437, + "grad_norm": 0.6048354506492615, + "learning_rate": 2.8125500240115255e-06, + "loss": 2.0291, + "step": 22450 + }, + { + "epoch": 1.9445887445887444, + "grad_norm": 0.5926880836486816, + "learning_rate": 2.8093484872738917e-06, + "loss": 2.0123, + "step": 22460 + }, + { + "epoch": 1.9454545454545453, + "grad_norm": 0.5721975564956665, + "learning_rate": 2.806146950536258e-06, + "loss": 2.0127, + "step": 22470 + }, + { + "epoch": 1.9463203463203462, + "grad_norm": 0.5987154245376587, + "learning_rate": 2.8029454137986235e-06, + "loss": 2.003, + "step": 22480 + }, + { + "epoch": 1.9471861471861471, + "grad_norm": 0.6043938994407654, + "learning_rate": 2.7997438770609896e-06, + "loss": 1.9983, + "step": 22490 + }, + { + "epoch": 1.948051948051948, + "grad_norm": 0.5845421552658081, + "learning_rate": 2.7965423403233553e-06, + "loss": 2.0467, + "step": 22500 + }, + { + "epoch": 1.948051948051948, + "eval_loss": 2.036294460296631, + "eval_runtime": 99.4558, + "eval_samples_per_second": 10.055, + "eval_steps_per_second": 5.027, + "step": 22500 + }, + { + "epoch": 1.948917748917749, + "grad_norm": 0.5815349221229553, + "learning_rate": 2.793340803585721e-06, + "loss": 2.0169, + "step": 22510 + }, + { + "epoch": 1.9497835497835498, + "grad_norm": 0.5612464547157288, + "learning_rate": 2.7901392668480876e-06, + "loss": 2.0052, + "step": 22520 + }, + { + "epoch": 1.9506493506493507, + "grad_norm": 0.5927168726921082, + "learning_rate": 2.7869377301104533e-06, + "loss": 1.998, + "step": 22530 + }, + { + "epoch": 1.9515151515151516, + "grad_norm": 0.5546810030937195, + "learning_rate": 2.783736193372819e-06, + "loss": 2.0265, + "step": 22540 + }, + { + "epoch": 1.9523809523809523, + "grad_norm": 0.5768939256668091, + "learning_rate": 2.7805346566351847e-06, + "loss": 2.0087, + "step": 22550 + }, + { + "epoch": 1.9532467532467532, + "grad_norm": 0.5690239071846008, + "learning_rate": 2.7773331198975513e-06, + "loss": 1.9983, + "step": 22560 + }, + { + "epoch": 1.9541125541125541, + "grad_norm": 0.5805119276046753, + "learning_rate": 2.774131583159917e-06, + "loss": 2.0343, + "step": 22570 + }, + { + "epoch": 1.9549783549783548, + "grad_norm": 0.5649641156196594, + "learning_rate": 2.7709300464222827e-06, + "loss": 2.0062, + "step": 22580 + }, + { + "epoch": 1.9558441558441557, + "grad_norm": 0.5249608159065247, + "learning_rate": 2.767728509684649e-06, + "loss": 2.0005, + "step": 22590 + }, + { + "epoch": 1.9567099567099566, + "grad_norm": 0.5877872705459595, + "learning_rate": 2.7645269729470146e-06, + "loss": 1.9854, + "step": 22600 + }, + { + "epoch": 1.9567099567099566, + "eval_loss": 2.0360538959503174, + "eval_runtime": 99.515, + "eval_samples_per_second": 10.049, + "eval_steps_per_second": 5.024, + "step": 22600 + }, + { + "epoch": 1.9575757575757575, + "grad_norm": 0.5853782892227173, + "learning_rate": 2.7613254362093807e-06, + "loss": 2.0381, + "step": 22610 + }, + { + "epoch": 1.9584415584415584, + "grad_norm": 0.61262446641922, + "learning_rate": 2.758123899471747e-06, + "loss": 2.0193, + "step": 22620 + }, + { + "epoch": 1.9593073593073593, + "grad_norm": 0.6083556413650513, + "learning_rate": 2.7549223627341125e-06, + "loss": 2.0001, + "step": 22630 + }, + { + "epoch": 1.9601731601731602, + "grad_norm": 0.5128453373908997, + "learning_rate": 2.7517208259964783e-06, + "loss": 2.0166, + "step": 22640 + }, + { + "epoch": 1.9610389610389611, + "grad_norm": 0.5564128160476685, + "learning_rate": 2.748519289258845e-06, + "loss": 2.0321, + "step": 22650 + }, + { + "epoch": 1.961904761904762, + "grad_norm": 0.5347721576690674, + "learning_rate": 2.7453177525212105e-06, + "loss": 2.0296, + "step": 22660 + }, + { + "epoch": 1.9627705627705627, + "grad_norm": 0.5394149422645569, + "learning_rate": 2.7421162157835762e-06, + "loss": 2.0007, + "step": 22670 + }, + { + "epoch": 1.9636363636363636, + "grad_norm": 0.6041572093963623, + "learning_rate": 2.7389146790459424e-06, + "loss": 2.011, + "step": 22680 + }, + { + "epoch": 1.9645021645021645, + "grad_norm": 0.5431060194969177, + "learning_rate": 2.7357131423083085e-06, + "loss": 1.9783, + "step": 22690 + }, + { + "epoch": 1.9653679653679652, + "grad_norm": 0.6576969027519226, + "learning_rate": 2.732511605570674e-06, + "loss": 2.0401, + "step": 22700 + }, + { + "epoch": 1.9653679653679652, + "eval_loss": 2.036024332046509, + "eval_runtime": 99.6045, + "eval_samples_per_second": 10.04, + "eval_steps_per_second": 5.02, + "step": 22700 + }, + { + "epoch": 1.9662337662337661, + "grad_norm": 0.5169672966003418, + "learning_rate": 2.72931006883304e-06, + "loss": 2.0285, + "step": 22710 + }, + { + "epoch": 1.967099567099567, + "grad_norm": 0.5584462881088257, + "learning_rate": 2.726108532095406e-06, + "loss": 2.007, + "step": 22720 + }, + { + "epoch": 1.967965367965368, + "grad_norm": 0.5199535489082336, + "learning_rate": 2.7229069953577718e-06, + "loss": 2.0332, + "step": 22730 + }, + { + "epoch": 1.9688311688311688, + "grad_norm": 0.5056544542312622, + "learning_rate": 2.719705458620138e-06, + "loss": 2.0138, + "step": 22740 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.5921425819396973, + "learning_rate": 2.716503921882504e-06, + "loss": 2.0403, + "step": 22750 + }, + { + "epoch": 1.9705627705627706, + "grad_norm": 0.5671403408050537, + "learning_rate": 2.7133023851448697e-06, + "loss": 2.0586, + "step": 22760 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 0.5604325532913208, + "learning_rate": 2.7101008484072354e-06, + "loss": 2.0047, + "step": 22770 + }, + { + "epoch": 1.9722943722943724, + "grad_norm": 0.6269016265869141, + "learning_rate": 2.706899311669602e-06, + "loss": 2.0121, + "step": 22780 + }, + { + "epoch": 1.9731601731601731, + "grad_norm": 0.5508336424827576, + "learning_rate": 2.7036977749319677e-06, + "loss": 2.0172, + "step": 22790 + }, + { + "epoch": 1.974025974025974, + "grad_norm": 0.5560627579689026, + "learning_rate": 2.7004962381943334e-06, + "loss": 2.02, + "step": 22800 + }, + { + "epoch": 1.974025974025974, + "eval_loss": 2.03597354888916, + "eval_runtime": 99.2692, + "eval_samples_per_second": 10.074, + "eval_steps_per_second": 5.037, + "step": 22800 + }, + { + "epoch": 1.974891774891775, + "grad_norm": 0.5486915111541748, + "learning_rate": 2.6972947014566996e-06, + "loss": 2.0256, + "step": 22810 + }, + { + "epoch": 1.9757575757575756, + "grad_norm": 0.5568284392356873, + "learning_rate": 2.6940931647190653e-06, + "loss": 1.9966, + "step": 22820 + }, + { + "epoch": 1.9766233766233765, + "grad_norm": 0.6057902574539185, + "learning_rate": 2.6908916279814314e-06, + "loss": 2.0174, + "step": 22830 + }, + { + "epoch": 1.9774891774891774, + "grad_norm": 0.5547239184379578, + "learning_rate": 2.687690091243797e-06, + "loss": 2.0172, + "step": 22840 + }, + { + "epoch": 1.9783549783549783, + "grad_norm": 0.6098464727401733, + "learning_rate": 2.6844885545061632e-06, + "loss": 2.0149, + "step": 22850 + }, + { + "epoch": 1.9792207792207792, + "grad_norm": 0.6356939077377319, + "learning_rate": 2.681287017768529e-06, + "loss": 2.0072, + "step": 22860 + }, + { + "epoch": 1.9800865800865801, + "grad_norm": 0.6061809062957764, + "learning_rate": 2.6780854810308947e-06, + "loss": 2.0173, + "step": 22870 + }, + { + "epoch": 1.980952380952381, + "grad_norm": 0.5689536929130554, + "learning_rate": 2.6748839442932612e-06, + "loss": 2.0075, + "step": 22880 + }, + { + "epoch": 1.981818181818182, + "grad_norm": 0.5928916931152344, + "learning_rate": 2.671682407555627e-06, + "loss": 2.0023, + "step": 22890 + }, + { + "epoch": 1.9826839826839828, + "grad_norm": 0.5256547331809998, + "learning_rate": 2.6684808708179926e-06, + "loss": 1.9976, + "step": 22900 + }, + { + "epoch": 1.9826839826839828, + "eval_loss": 2.0356850624084473, + "eval_runtime": 99.3113, + "eval_samples_per_second": 10.069, + "eval_steps_per_second": 5.035, + "step": 22900 + }, + { + "epoch": 1.9835497835497835, + "grad_norm": 0.6071939468383789, + "learning_rate": 2.665279334080359e-06, + "loss": 2.0187, + "step": 22910 + }, + { + "epoch": 1.9844155844155844, + "grad_norm": 0.5771980285644531, + "learning_rate": 2.662077797342725e-06, + "loss": 2.0217, + "step": 22920 + }, + { + "epoch": 1.9852813852813853, + "grad_norm": 0.5437718033790588, + "learning_rate": 2.6588762606050906e-06, + "loss": 1.9815, + "step": 22930 + }, + { + "epoch": 1.986147186147186, + "grad_norm": 0.531406819820404, + "learning_rate": 2.6556747238674568e-06, + "loss": 1.9978, + "step": 22940 + }, + { + "epoch": 1.987012987012987, + "grad_norm": 0.5711106657981873, + "learning_rate": 2.6524731871298225e-06, + "loss": 2.0179, + "step": 22950 + }, + { + "epoch": 1.9878787878787878, + "grad_norm": 0.6144083738327026, + "learning_rate": 2.649271650392188e-06, + "loss": 2.0312, + "step": 22960 + }, + { + "epoch": 1.9887445887445887, + "grad_norm": 0.6573773622512817, + "learning_rate": 2.6460701136545543e-06, + "loss": 2.005, + "step": 22970 + }, + { + "epoch": 1.9896103896103896, + "grad_norm": 0.5698711276054382, + "learning_rate": 2.6428685769169204e-06, + "loss": 2.028, + "step": 22980 + }, + { + "epoch": 1.9904761904761905, + "grad_norm": 0.5695561766624451, + "learning_rate": 2.639667040179286e-06, + "loss": 2.0223, + "step": 22990 + }, + { + "epoch": 1.9913419913419914, + "grad_norm": 0.5652351379394531, + "learning_rate": 2.636465503441652e-06, + "loss": 1.9957, + "step": 23000 + }, + { + "epoch": 1.9913419913419914, + "eval_loss": 2.035548210144043, + "eval_runtime": 99.4804, + "eval_samples_per_second": 10.052, + "eval_steps_per_second": 5.026, + "step": 23000 + }, + { + "epoch": 1.9922077922077923, + "grad_norm": 0.5072441697120667, + "learning_rate": 2.6332639667040184e-06, + "loss": 2.0121, + "step": 23010 + }, + { + "epoch": 1.9930735930735932, + "grad_norm": 0.5116727352142334, + "learning_rate": 2.630062429966384e-06, + "loss": 2.0439, + "step": 23020 + }, + { + "epoch": 1.993939393939394, + "grad_norm": 0.5237976908683777, + "learning_rate": 2.62686089322875e-06, + "loss": 2.0119, + "step": 23030 + }, + { + "epoch": 1.9948051948051948, + "grad_norm": 0.5784240961074829, + "learning_rate": 2.623659356491116e-06, + "loss": 2.0212, + "step": 23040 + }, + { + "epoch": 1.9956709956709957, + "grad_norm": 0.5490781664848328, + "learning_rate": 2.620457819753482e-06, + "loss": 2.0406, + "step": 23050 + }, + { + "epoch": 1.9965367965367964, + "grad_norm": 0.5990797281265259, + "learning_rate": 2.617256283015848e-06, + "loss": 2.0322, + "step": 23060 + }, + { + "epoch": 1.9974025974025973, + "grad_norm": 0.6267635226249695, + "learning_rate": 2.614054746278214e-06, + "loss": 2.013, + "step": 23070 + }, + { + "epoch": 1.9982683982683982, + "grad_norm": 0.5935016870498657, + "learning_rate": 2.6108532095405797e-06, + "loss": 2.0138, + "step": 23080 + }, + { + "epoch": 1.999134199134199, + "grad_norm": 0.597457766532898, + "learning_rate": 2.6076516728029454e-06, + "loss": 2.0064, + "step": 23090 + }, + { + "epoch": 2.0, + "grad_norm": 0.5814897418022156, + "learning_rate": 2.604450136065312e-06, + "loss": 2.0201, + "step": 23100 + }, + { + "epoch": 2.0, + "eval_loss": 2.0357136726379395, + "eval_runtime": 99.4873, + "eval_samples_per_second": 10.052, + "eval_steps_per_second": 5.026, + "step": 23100 + }, + { + "epoch": 2.000865800865801, + "grad_norm": 0.6136175394058228, + "learning_rate": 2.6012485993276776e-06, + "loss": 2.0017, + "step": 23110 + }, + { + "epoch": 2.001731601731602, + "grad_norm": 0.5407949090003967, + "learning_rate": 2.5980470625900433e-06, + "loss": 2.0178, + "step": 23120 + }, + { + "epoch": 2.0025974025974027, + "grad_norm": 0.5387251377105713, + "learning_rate": 2.594845525852409e-06, + "loss": 1.9983, + "step": 23130 + }, + { + "epoch": 2.0034632034632036, + "grad_norm": 0.5647053122520447, + "learning_rate": 2.5916439891147756e-06, + "loss": 2.0078, + "step": 23140 + }, + { + "epoch": 2.0043290043290045, + "grad_norm": 0.619827151298523, + "learning_rate": 2.5884424523771413e-06, + "loss": 2.03, + "step": 23150 + }, + { + "epoch": 2.005194805194805, + "grad_norm": 0.5853438973426819, + "learning_rate": 2.585240915639507e-06, + "loss": 2.0294, + "step": 23160 + }, + { + "epoch": 2.006060606060606, + "grad_norm": 0.5789020657539368, + "learning_rate": 2.582039378901873e-06, + "loss": 2.0126, + "step": 23170 + }, + { + "epoch": 2.006926406926407, + "grad_norm": 0.5304809808731079, + "learning_rate": 2.578837842164239e-06, + "loss": 2.0435, + "step": 23180 + }, + { + "epoch": 2.0077922077922077, + "grad_norm": 0.5407286286354065, + "learning_rate": 2.575636305426605e-06, + "loss": 2.015, + "step": 23190 + }, + { + "epoch": 2.0086580086580086, + "grad_norm": 0.5430712699890137, + "learning_rate": 2.572434768688971e-06, + "loss": 2.0227, + "step": 23200 + }, + { + "epoch": 2.0086580086580086, + "eval_loss": 2.035465955734253, + "eval_runtime": 99.5051, + "eval_samples_per_second": 10.05, + "eval_steps_per_second": 5.025, + "step": 23200 + }, + { + "epoch": 2.0095238095238095, + "grad_norm": 0.5268262624740601, + "learning_rate": 2.569233231951337e-06, + "loss": 2.0069, + "step": 23210 + }, + { + "epoch": 2.0103896103896104, + "grad_norm": 0.6594340205192566, + "learning_rate": 2.5660316952137026e-06, + "loss": 2.041, + "step": 23220 + }, + { + "epoch": 2.0112554112554113, + "grad_norm": 0.5596958994865417, + "learning_rate": 2.562830158476069e-06, + "loss": 2.025, + "step": 23230 + }, + { + "epoch": 2.012121212121212, + "grad_norm": 0.5416747331619263, + "learning_rate": 2.559628621738435e-06, + "loss": 1.9938, + "step": 23240 + }, + { + "epoch": 2.012987012987013, + "grad_norm": 0.5677267909049988, + "learning_rate": 2.5564270850008005e-06, + "loss": 1.9824, + "step": 23250 + }, + { + "epoch": 2.013852813852814, + "grad_norm": 0.5407697558403015, + "learning_rate": 2.5532255482631663e-06, + "loss": 2.0303, + "step": 23260 + }, + { + "epoch": 2.014718614718615, + "grad_norm": 0.6268892884254456, + "learning_rate": 2.5500240115255324e-06, + "loss": 2.0254, + "step": 23270 + }, + { + "epoch": 2.0155844155844154, + "grad_norm": 0.5445701479911804, + "learning_rate": 2.5468224747878985e-06, + "loss": 2.0285, + "step": 23280 + }, + { + "epoch": 2.0164502164502163, + "grad_norm": 0.6353744268417358, + "learning_rate": 2.5436209380502642e-06, + "loss": 1.9719, + "step": 23290 + }, + { + "epoch": 2.017316017316017, + "grad_norm": 0.7125324606895447, + "learning_rate": 2.5404194013126304e-06, + "loss": 2.0303, + "step": 23300 + }, + { + "epoch": 2.017316017316017, + "eval_loss": 2.0351850986480713, + "eval_runtime": 99.7756, + "eval_samples_per_second": 10.022, + "eval_steps_per_second": 5.011, + "step": 23300 + }, + { + "epoch": 2.018181818181818, + "grad_norm": 0.588187038898468, + "learning_rate": 2.537217864574996e-06, + "loss": 1.9909, + "step": 23310 + }, + { + "epoch": 2.019047619047619, + "grad_norm": 0.5819240808486938, + "learning_rate": 2.5340163278373618e-06, + "loss": 2.033, + "step": 23320 + }, + { + "epoch": 2.01991341991342, + "grad_norm": 0.5587559342384338, + "learning_rate": 2.5308147910997283e-06, + "loss": 2.0403, + "step": 23330 + }, + { + "epoch": 2.020779220779221, + "grad_norm": 0.5237162113189697, + "learning_rate": 2.527613254362094e-06, + "loss": 2.0311, + "step": 23340 + }, + { + "epoch": 2.0216450216450217, + "grad_norm": 0.5738016366958618, + "learning_rate": 2.5244117176244598e-06, + "loss": 2.0086, + "step": 23350 + }, + { + "epoch": 2.0225108225108226, + "grad_norm": 0.6247041821479797, + "learning_rate": 2.5212101808868263e-06, + "loss": 2.0163, + "step": 23360 + }, + { + "epoch": 2.0233766233766235, + "grad_norm": 0.5295172929763794, + "learning_rate": 2.518008644149192e-06, + "loss": 2.0048, + "step": 23370 + }, + { + "epoch": 2.0242424242424244, + "grad_norm": 0.5387830138206482, + "learning_rate": 2.5148071074115577e-06, + "loss": 2.0413, + "step": 23380 + }, + { + "epoch": 2.0251082251082253, + "grad_norm": 0.5304757952690125, + "learning_rate": 2.511605570673924e-06, + "loss": 1.9883, + "step": 23390 + }, + { + "epoch": 2.0259740259740258, + "grad_norm": 0.6326545476913452, + "learning_rate": 2.5084040339362896e-06, + "loss": 2.0005, + "step": 23400 + }, + { + "epoch": 2.0259740259740258, + "eval_loss": 2.0352110862731934, + "eval_runtime": 99.8194, + "eval_samples_per_second": 10.018, + "eval_steps_per_second": 5.009, + "step": 23400 + }, + { + "epoch": 2.0268398268398267, + "grad_norm": 0.604609489440918, + "learning_rate": 2.5052024971986553e-06, + "loss": 1.9793, + "step": 23410 + }, + { + "epoch": 2.0277056277056276, + "grad_norm": 0.6415339112281799, + "learning_rate": 2.5020009604610214e-06, + "loss": 1.9915, + "step": 23420 + }, + { + "epoch": 2.0285714285714285, + "grad_norm": 0.5685520768165588, + "learning_rate": 2.498799423723387e-06, + "loss": 1.992, + "step": 23430 + }, + { + "epoch": 2.0294372294372294, + "grad_norm": 0.6610993146896362, + "learning_rate": 2.4955978869857533e-06, + "loss": 1.9936, + "step": 23440 + }, + { + "epoch": 2.0303030303030303, + "grad_norm": 0.5498647093772888, + "learning_rate": 2.4923963502481194e-06, + "loss": 2.0087, + "step": 23450 + }, + { + "epoch": 2.031168831168831, + "grad_norm": 0.5626619458198547, + "learning_rate": 2.489194813510485e-06, + "loss": 2.057, + "step": 23460 + }, + { + "epoch": 2.032034632034632, + "grad_norm": 0.5277206897735596, + "learning_rate": 2.4859932767728512e-06, + "loss": 2.0326, + "step": 23470 + }, + { + "epoch": 2.032900432900433, + "grad_norm": 0.5527384877204895, + "learning_rate": 2.4827917400352174e-06, + "loss": 1.9978, + "step": 23480 + }, + { + "epoch": 2.033766233766234, + "grad_norm": 0.5400198698043823, + "learning_rate": 2.479590203297583e-06, + "loss": 2.0197, + "step": 23490 + }, + { + "epoch": 2.034632034632035, + "grad_norm": 0.6280678510665894, + "learning_rate": 2.4763886665599492e-06, + "loss": 2.0244, + "step": 23500 + }, + { + "epoch": 2.034632034632035, + "eval_loss": 2.034968852996826, + "eval_runtime": 99.5755, + "eval_samples_per_second": 10.043, + "eval_steps_per_second": 5.021, + "step": 23500 + }, + { + "epoch": 2.0354978354978357, + "grad_norm": 0.6585070490837097, + "learning_rate": 2.473187129822315e-06, + "loss": 2.0265, + "step": 23510 + }, + { + "epoch": 2.036363636363636, + "grad_norm": 0.5453212261199951, + "learning_rate": 2.4699855930846806e-06, + "loss": 2.0263, + "step": 23520 + }, + { + "epoch": 2.037229437229437, + "grad_norm": 0.5460197329521179, + "learning_rate": 2.4667840563470468e-06, + "loss": 2.0178, + "step": 23530 + }, + { + "epoch": 2.038095238095238, + "grad_norm": 0.5552266836166382, + "learning_rate": 2.4635825196094125e-06, + "loss": 2.0367, + "step": 23540 + }, + { + "epoch": 2.038961038961039, + "grad_norm": 0.5523292422294617, + "learning_rate": 2.4603809828717786e-06, + "loss": 2.0116, + "step": 23550 + }, + { + "epoch": 2.0398268398268398, + "grad_norm": 0.6269523501396179, + "learning_rate": 2.4571794461341448e-06, + "loss": 2.023, + "step": 23560 + }, + { + "epoch": 2.0406926406926407, + "grad_norm": 0.6676937937736511, + "learning_rate": 2.4539779093965105e-06, + "loss": 1.9792, + "step": 23570 + }, + { + "epoch": 2.0415584415584416, + "grad_norm": 0.6345266699790955, + "learning_rate": 2.4507763726588766e-06, + "loss": 2.0327, + "step": 23580 + }, + { + "epoch": 2.0424242424242425, + "grad_norm": 0.5686797499656677, + "learning_rate": 2.4475748359212423e-06, + "loss": 2.0012, + "step": 23590 + }, + { + "epoch": 2.0432900432900434, + "grad_norm": 0.5607196092605591, + "learning_rate": 2.4443732991836084e-06, + "loss": 2.0059, + "step": 23600 + }, + { + "epoch": 2.0432900432900434, + "eval_loss": 2.0349371433258057, + "eval_runtime": 99.6082, + "eval_samples_per_second": 10.039, + "eval_steps_per_second": 5.02, + "step": 23600 + }, + { + "epoch": 2.0441558441558443, + "grad_norm": 0.6047848463058472, + "learning_rate": 2.4411717624459746e-06, + "loss": 2.0386, + "step": 23610 + }, + { + "epoch": 2.045021645021645, + "grad_norm": 0.5824304819107056, + "learning_rate": 2.4379702257083403e-06, + "loss": 2.0394, + "step": 23620 + }, + { + "epoch": 2.045887445887446, + "grad_norm": 0.567637026309967, + "learning_rate": 2.434768688970706e-06, + "loss": 2.0106, + "step": 23630 + }, + { + "epoch": 2.0467532467532465, + "grad_norm": 0.5690438151359558, + "learning_rate": 2.431567152233072e-06, + "loss": 2.0424, + "step": 23640 + }, + { + "epoch": 2.0476190476190474, + "grad_norm": 0.5576907396316528, + "learning_rate": 2.428365615495438e-06, + "loss": 2.0108, + "step": 23650 + }, + { + "epoch": 2.0484848484848484, + "grad_norm": 0.568816602230072, + "learning_rate": 2.425164078757804e-06, + "loss": 1.9905, + "step": 23660 + }, + { + "epoch": 2.0493506493506493, + "grad_norm": 0.567893922328949, + "learning_rate": 2.4219625420201697e-06, + "loss": 2.0397, + "step": 23670 + }, + { + "epoch": 2.05021645021645, + "grad_norm": 0.5759986639022827, + "learning_rate": 2.418761005282536e-06, + "loss": 2.013, + "step": 23680 + }, + { + "epoch": 2.051082251082251, + "grad_norm": 0.5438066124916077, + "learning_rate": 2.415559468544902e-06, + "loss": 2.0041, + "step": 23690 + }, + { + "epoch": 2.051948051948052, + "grad_norm": 0.5716127157211304, + "learning_rate": 2.4123579318072677e-06, + "loss": 2.0247, + "step": 23700 + }, + { + "epoch": 2.051948051948052, + "eval_loss": 2.0347187519073486, + "eval_runtime": 99.6831, + "eval_samples_per_second": 10.032, + "eval_steps_per_second": 5.016, + "step": 23700 + }, + { + "epoch": 2.052813852813853, + "grad_norm": 0.5692997574806213, + "learning_rate": 2.409156395069634e-06, + "loss": 2.0034, + "step": 23710 + }, + { + "epoch": 2.0536796536796538, + "grad_norm": 0.5693305730819702, + "learning_rate": 2.4059548583319995e-06, + "loss": 2.0117, + "step": 23720 + }, + { + "epoch": 2.0545454545454547, + "grad_norm": 0.6145777106285095, + "learning_rate": 2.4027533215943656e-06, + "loss": 2.0224, + "step": 23730 + }, + { + "epoch": 2.0554112554112556, + "grad_norm": 0.5658998489379883, + "learning_rate": 2.3995517848567313e-06, + "loss": 1.9992, + "step": 23740 + }, + { + "epoch": 2.0562770562770565, + "grad_norm": 0.568269670009613, + "learning_rate": 2.3963502481190975e-06, + "loss": 1.9953, + "step": 23750 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 0.5881268382072449, + "learning_rate": 2.393148711381463e-06, + "loss": 2.021, + "step": 23760 + }, + { + "epoch": 2.058008658008658, + "grad_norm": 0.6150349974632263, + "learning_rate": 2.3899471746438293e-06, + "loss": 2.0009, + "step": 23770 + }, + { + "epoch": 2.0588744588744587, + "grad_norm": 0.515796422958374, + "learning_rate": 2.386745637906195e-06, + "loss": 1.9927, + "step": 23780 + }, + { + "epoch": 2.0597402597402596, + "grad_norm": 0.5475239157676697, + "learning_rate": 2.383544101168561e-06, + "loss": 2.0204, + "step": 23790 + }, + { + "epoch": 2.0606060606060606, + "grad_norm": 0.6885071396827698, + "learning_rate": 2.380342564430927e-06, + "loss": 2.012, + "step": 23800 + }, + { + "epoch": 2.0606060606060606, + "eval_loss": 2.034194231033325, + "eval_runtime": 99.5096, + "eval_samples_per_second": 10.049, + "eval_steps_per_second": 5.025, + "step": 23800 + }, + { + "epoch": 3.195973154362416, + "grad_norm": 0.5709320306777954, + "learning_rate": 2.377141027693293e-06, + "loss": 1.995, + "step": 23810 + }, + { + "epoch": 3.1973154362416105, + "grad_norm": 0.5705162882804871, + "learning_rate": 2.373939490955659e-06, + "loss": 2.0146, + "step": 23820 + }, + { + "epoch": 3.1986577181208053, + "grad_norm": 0.5645247101783752, + "learning_rate": 2.370737954218025e-06, + "loss": 2.0212, + "step": 23830 + }, + { + "epoch": 3.2, + "grad_norm": 0.6989596486091614, + "learning_rate": 2.367536417480391e-06, + "loss": 2.0094, + "step": 23840 + }, + { + "epoch": 3.2013422818791946, + "grad_norm": 0.6202380657196045, + "learning_rate": 2.3643348807427567e-06, + "loss": 1.9898, + "step": 23850 + }, + { + "epoch": 3.2026845637583894, + "grad_norm": 0.5662652850151062, + "learning_rate": 2.361133344005123e-06, + "loss": 2.0212, + "step": 23860 + }, + { + "epoch": 3.2040268456375838, + "grad_norm": 0.5594598054885864, + "learning_rate": 2.3579318072674885e-06, + "loss": 2.0415, + "step": 23870 + }, + { + "epoch": 3.2053691275167786, + "grad_norm": 0.5457987189292908, + "learning_rate": 2.3547302705298543e-06, + "loss": 2.0357, + "step": 23880 + }, + { + "epoch": 3.206711409395973, + "grad_norm": 0.5731124877929688, + "learning_rate": 2.3515287337922204e-06, + "loss": 2.0174, + "step": 23890 + }, + { + "epoch": 3.208053691275168, + "grad_norm": 0.5775108337402344, + "learning_rate": 2.3483271970545865e-06, + "loss": 2.0033, + "step": 23900 + }, + { + "epoch": 3.208053691275168, + "eval_loss": 2.034665584564209, + "eval_runtime": 99.7956, + "eval_samples_per_second": 10.02, + "eval_steps_per_second": 5.01, + "step": 23900 + }, + { + "epoch": 3.209395973154362, + "grad_norm": 0.6452018022537231, + "learning_rate": 2.3451256603169522e-06, + "loss": 2.0048, + "step": 23910 + }, + { + "epoch": 3.210738255033557, + "grad_norm": 0.5958179235458374, + "learning_rate": 2.3419241235793184e-06, + "loss": 2.0017, + "step": 23920 + }, + { + "epoch": 3.212080536912752, + "grad_norm": 0.6147748231887817, + "learning_rate": 2.338722586841684e-06, + "loss": 2.0273, + "step": 23930 + }, + { + "epoch": 3.2134228187919462, + "grad_norm": 0.5603628754615784, + "learning_rate": 2.33552105010405e-06, + "loss": 2.0264, + "step": 23940 + }, + { + "epoch": 3.214765100671141, + "grad_norm": 0.5753715634346008, + "learning_rate": 2.3323195133664163e-06, + "loss": 2.0444, + "step": 23950 + }, + { + "epoch": 3.2161073825503355, + "grad_norm": 0.5697400569915771, + "learning_rate": 2.329117976628782e-06, + "loss": 1.9961, + "step": 23960 + }, + { + "epoch": 3.2174496644295303, + "grad_norm": 0.5242548584938049, + "learning_rate": 2.3259164398911478e-06, + "loss": 2.0079, + "step": 23970 + }, + { + "epoch": 3.2187919463087247, + "grad_norm": 0.5654411315917969, + "learning_rate": 2.322714903153514e-06, + "loss": 2.0124, + "step": 23980 + }, + { + "epoch": 3.2201342281879195, + "grad_norm": 0.5427534580230713, + "learning_rate": 2.3195133664158796e-06, + "loss": 1.9617, + "step": 23990 + }, + { + "epoch": 3.221476510067114, + "grad_norm": 0.5504390597343445, + "learning_rate": 2.3163118296782457e-06, + "loss": 1.9851, + "step": 24000 + }, + { + "epoch": 3.221476510067114, + "eval_loss": 2.0347743034362793, + "eval_runtime": 99.05, + "eval_samples_per_second": 10.096, + "eval_steps_per_second": 5.048, + "step": 24000 + }, + { + "epoch": 3.2228187919463087, + "grad_norm": 0.5600200295448303, + "learning_rate": 2.3131102929406114e-06, + "loss": 2.0122, + "step": 24010 + }, + { + "epoch": 3.2241610738255035, + "grad_norm": 0.6022480726242065, + "learning_rate": 2.3099087562029776e-06, + "loss": 1.998, + "step": 24020 + }, + { + "epoch": 3.225503355704698, + "grad_norm": 0.7161531448364258, + "learning_rate": 2.3067072194653437e-06, + "loss": 2.0005, + "step": 24030 + }, + { + "epoch": 3.2268456375838928, + "grad_norm": 0.5841786861419678, + "learning_rate": 2.3035056827277094e-06, + "loss": 2.0338, + "step": 24040 + }, + { + "epoch": 3.228187919463087, + "grad_norm": 0.525364100933075, + "learning_rate": 2.3003041459900756e-06, + "loss": 2.0457, + "step": 24050 + }, + { + "epoch": 3.229530201342282, + "grad_norm": 0.5649173259735107, + "learning_rate": 2.2971026092524417e-06, + "loss": 2.011, + "step": 24060 + }, + { + "epoch": 3.2308724832214764, + "grad_norm": 0.5048159956932068, + "learning_rate": 2.2939010725148074e-06, + "loss": 2.0205, + "step": 24070 + }, + { + "epoch": 3.232214765100671, + "grad_norm": 0.5814887881278992, + "learning_rate": 2.290699535777173e-06, + "loss": 2.0189, + "step": 24080 + }, + { + "epoch": 3.2335570469798656, + "grad_norm": 0.6833152770996094, + "learning_rate": 2.2874979990395392e-06, + "loss": 2.0038, + "step": 24090 + }, + { + "epoch": 3.2348993288590604, + "grad_norm": 0.6447194218635559, + "learning_rate": 2.284296462301905e-06, + "loss": 2.0058, + "step": 24100 + }, + { + "epoch": 3.2348993288590604, + "eval_loss": 2.0349581241607666, + "eval_runtime": 98.9765, + "eval_samples_per_second": 10.103, + "eval_steps_per_second": 5.052, + "step": 24100 + }, + { + "epoch": 3.2362416107382552, + "grad_norm": 0.6656696796417236, + "learning_rate": 2.281094925564271e-06, + "loss": 2.0291, + "step": 24110 + }, + { + "epoch": 3.2375838926174496, + "grad_norm": 0.6009058952331543, + "learning_rate": 2.277893388826637e-06, + "loss": 1.9914, + "step": 24120 + }, + { + "epoch": 3.2389261744966444, + "grad_norm": 0.5890768766403198, + "learning_rate": 2.274691852089003e-06, + "loss": 2.0448, + "step": 24130 + }, + { + "epoch": 3.240268456375839, + "grad_norm": 0.563474714756012, + "learning_rate": 2.2714903153513686e-06, + "loss": 2.0105, + "step": 24140 + }, + { + "epoch": 3.2416107382550337, + "grad_norm": 0.5375024080276489, + "learning_rate": 2.2682887786137348e-06, + "loss": 2.0336, + "step": 24150 + }, + { + "epoch": 3.242953020134228, + "grad_norm": 0.5433012843132019, + "learning_rate": 2.265087241876101e-06, + "loss": 2.0475, + "step": 24160 + }, + { + "epoch": 3.244295302013423, + "grad_norm": 0.5040486454963684, + "learning_rate": 2.2618857051384666e-06, + "loss": 2.0288, + "step": 24170 + }, + { + "epoch": 3.2456375838926173, + "grad_norm": 0.5953966975212097, + "learning_rate": 2.2586841684008328e-06, + "loss": 2.0048, + "step": 24180 + }, + { + "epoch": 3.246979865771812, + "grad_norm": 0.5979200005531311, + "learning_rate": 2.2554826316631985e-06, + "loss": 1.9992, + "step": 24190 + }, + { + "epoch": 3.248322147651007, + "grad_norm": 0.5424688458442688, + "learning_rate": 2.2522810949255646e-06, + "loss": 1.9983, + "step": 24200 + }, + { + "epoch": 3.248322147651007, + "eval_loss": 2.034461498260498, + "eval_runtime": 98.9765, + "eval_samples_per_second": 10.103, + "eval_steps_per_second": 5.052, + "step": 24200 + }, + { + "epoch": 3.2496644295302013, + "grad_norm": 0.5857418179512024, + "learning_rate": 2.2490795581879303e-06, + "loss": 1.9997, + "step": 24210 + }, + { + "epoch": 3.251006711409396, + "grad_norm": 0.5468854904174805, + "learning_rate": 2.245878021450296e-06, + "loss": 2.0305, + "step": 24220 + }, + { + "epoch": 3.2523489932885905, + "grad_norm": 0.5583387613296509, + "learning_rate": 2.242676484712662e-06, + "loss": 2.0155, + "step": 24230 + }, + { + "epoch": 3.2536912751677853, + "grad_norm": 0.5511802434921265, + "learning_rate": 2.2394749479750283e-06, + "loss": 2.0018, + "step": 24240 + }, + { + "epoch": 3.2550335570469797, + "grad_norm": 0.7055875062942505, + "learning_rate": 2.236273411237394e-06, + "loss": 2.0143, + "step": 24250 + }, + { + "epoch": 3.2563758389261745, + "grad_norm": 0.6099456548690796, + "learning_rate": 2.23307187449976e-06, + "loss": 1.9877, + "step": 24260 + }, + { + "epoch": 3.257718120805369, + "grad_norm": 0.5808700919151306, + "learning_rate": 2.2298703377621263e-06, + "loss": 2.0161, + "step": 24270 + }, + { + "epoch": 3.2590604026845638, + "grad_norm": 0.5520883202552795, + "learning_rate": 2.226668801024492e-06, + "loss": 1.9884, + "step": 24280 + }, + { + "epoch": 3.2604026845637586, + "grad_norm": 0.5096262097358704, + "learning_rate": 2.223467264286858e-06, + "loss": 2.0242, + "step": 24290 + }, + { + "epoch": 3.261744966442953, + "grad_norm": 0.5912474989891052, + "learning_rate": 2.220265727549224e-06, + "loss": 1.9842, + "step": 24300 + }, + { + "epoch": 3.261744966442953, + "eval_loss": 2.0343332290649414, + "eval_runtime": 98.8495, + "eval_samples_per_second": 10.116, + "eval_steps_per_second": 5.058, + "step": 24300 + }, + { + "epoch": 3.263087248322148, + "grad_norm": 0.5764116644859314, + "learning_rate": 2.21706419081159e-06, + "loss": 2.0633, + "step": 24310 + }, + { + "epoch": 3.264429530201342, + "grad_norm": 0.6952981948852539, + "learning_rate": 2.2138626540739557e-06, + "loss": 1.9669, + "step": 24320 + }, + { + "epoch": 3.265771812080537, + "grad_norm": 0.6251304745674133, + "learning_rate": 2.2106611173363214e-06, + "loss": 2.0096, + "step": 24330 + }, + { + "epoch": 3.2671140939597314, + "grad_norm": 0.5688126087188721, + "learning_rate": 2.2074595805986875e-06, + "loss": 1.9922, + "step": 24340 + }, + { + "epoch": 3.2684563758389262, + "grad_norm": 0.5293943285942078, + "learning_rate": 2.204258043861053e-06, + "loss": 2.0349, + "step": 24350 + }, + { + "epoch": 3.2697986577181206, + "grad_norm": 0.5495185256004333, + "learning_rate": 2.2010565071234193e-06, + "loss": 2.0283, + "step": 24360 + }, + { + "epoch": 3.2711409395973154, + "grad_norm": 0.6728076934814453, + "learning_rate": 2.1978549703857855e-06, + "loss": 2.0335, + "step": 24370 + }, + { + "epoch": 3.2724832214765103, + "grad_norm": 0.5396303534507751, + "learning_rate": 2.194653433648151e-06, + "loss": 2.0144, + "step": 24380 + }, + { + "epoch": 3.2738255033557047, + "grad_norm": 0.5711934566497803, + "learning_rate": 2.1914518969105173e-06, + "loss": 2.0112, + "step": 24390 + }, + { + "epoch": 3.2751677852348995, + "grad_norm": 0.5564073920249939, + "learning_rate": 2.1882503601728835e-06, + "loss": 2.0288, + "step": 24400 + }, + { + "epoch": 3.2751677852348995, + "eval_loss": 2.0342624187469482, + "eval_runtime": 99.5861, + "eval_samples_per_second": 10.042, + "eval_steps_per_second": 5.021, + "step": 24400 + }, + { + "epoch": 3.276510067114094, + "grad_norm": 0.5944424271583557, + "learning_rate": 2.185048823435249e-06, + "loss": 2.0196, + "step": 24410 + }, + { + "epoch": 3.2778523489932887, + "grad_norm": 0.6209209561347961, + "learning_rate": 2.181847286697615e-06, + "loss": 2.0087, + "step": 24420 + }, + { + "epoch": 3.279194630872483, + "grad_norm": 0.6335627436637878, + "learning_rate": 2.178645749959981e-06, + "loss": 2.0274, + "step": 24430 + }, + { + "epoch": 3.280536912751678, + "grad_norm": 0.5886103510856628, + "learning_rate": 2.1754442132223467e-06, + "loss": 2.0106, + "step": 24440 + }, + { + "epoch": 3.2818791946308723, + "grad_norm": 0.5783700346946716, + "learning_rate": 2.172242676484713e-06, + "loss": 2.0025, + "step": 24450 + }, + { + "epoch": 3.283221476510067, + "grad_norm": 0.5945323705673218, + "learning_rate": 2.1690411397470786e-06, + "loss": 2.0192, + "step": 24460 + }, + { + "epoch": 3.284563758389262, + "grad_norm": 0.5845428109169006, + "learning_rate": 2.1658396030094447e-06, + "loss": 2.0084, + "step": 24470 + }, + { + "epoch": 3.2859060402684563, + "grad_norm": 0.5515161156654358, + "learning_rate": 2.162638066271811e-06, + "loss": 2.0013, + "step": 24480 + }, + { + "epoch": 3.287248322147651, + "grad_norm": 0.5515275001525879, + "learning_rate": 2.1594365295341765e-06, + "loss": 1.992, + "step": 24490 + }, + { + "epoch": 3.2885906040268456, + "grad_norm": 0.5798812508583069, + "learning_rate": 2.1562349927965427e-06, + "loss": 2.0224, + "step": 24500 + }, + { + "epoch": 3.2885906040268456, + "eval_loss": 2.034242868423462, + "eval_runtime": 98.8346, + "eval_samples_per_second": 10.118, + "eval_steps_per_second": 5.059, + "step": 24500 + }, + { + "epoch": 3.2899328859060404, + "grad_norm": 0.7159120440483093, + "learning_rate": 2.1530334560589084e-06, + "loss": 2.0158, + "step": 24510 + }, + { + "epoch": 3.2912751677852348, + "grad_norm": 0.5328459143638611, + "learning_rate": 2.1498319193212745e-06, + "loss": 2.0135, + "step": 24520 + }, + { + "epoch": 3.2926174496644296, + "grad_norm": 0.5350387692451477, + "learning_rate": 2.1466303825836402e-06, + "loss": 2.0315, + "step": 24530 + }, + { + "epoch": 3.293959731543624, + "grad_norm": 0.5860224962234497, + "learning_rate": 2.1434288458460064e-06, + "loss": 2.0145, + "step": 24540 + }, + { + "epoch": 3.295302013422819, + "grad_norm": 0.5541955828666687, + "learning_rate": 2.140227309108372e-06, + "loss": 2.0272, + "step": 24550 + }, + { + "epoch": 3.2966442953020136, + "grad_norm": 0.6375378966331482, + "learning_rate": 2.137025772370738e-06, + "loss": 2.0212, + "step": 24560 + }, + { + "epoch": 3.297986577181208, + "grad_norm": 0.7046541571617126, + "learning_rate": 2.133824235633104e-06, + "loss": 1.9943, + "step": 24570 + }, + { + "epoch": 3.299328859060403, + "grad_norm": 0.5820409655570984, + "learning_rate": 2.13062269889547e-06, + "loss": 2.0394, + "step": 24580 + }, + { + "epoch": 3.3006711409395972, + "grad_norm": 0.5739086866378784, + "learning_rate": 2.1274211621578358e-06, + "loss": 2.0043, + "step": 24590 + }, + { + "epoch": 3.302013422818792, + "grad_norm": 0.5671262741088867, + "learning_rate": 2.124219625420202e-06, + "loss": 1.9913, + "step": 24600 + }, + { + "epoch": 3.302013422818792, + "eval_loss": 2.0339884757995605, + "eval_runtime": 99.4671, + "eval_samples_per_second": 10.054, + "eval_steps_per_second": 5.027, + "step": 24600 + }, + { + "epoch": 3.3033557046979865, + "grad_norm": 0.6186826825141907, + "learning_rate": 2.121018088682568e-06, + "loss": 2.0004, + "step": 24610 + }, + { + "epoch": 3.3046979865771813, + "grad_norm": 0.5595492124557495, + "learning_rate": 2.1178165519449337e-06, + "loss": 1.9985, + "step": 24620 + }, + { + "epoch": 3.3060402684563757, + "grad_norm": 0.609688401222229, + "learning_rate": 2.1146150152073e-06, + "loss": 2.0024, + "step": 24630 + }, + { + "epoch": 3.3073825503355705, + "grad_norm": 0.5851048231124878, + "learning_rate": 2.1114134784696656e-06, + "loss": 2.0059, + "step": 24640 + }, + { + "epoch": 3.3087248322147653, + "grad_norm": 0.5259600877761841, + "learning_rate": 2.1082119417320317e-06, + "loss": 2.0294, + "step": 24650 + }, + { + "epoch": 3.3100671140939597, + "grad_norm": 0.5880383849143982, + "learning_rate": 2.1050104049943974e-06, + "loss": 2.0324, + "step": 24660 + }, + { + "epoch": 3.3114093959731545, + "grad_norm": 0.5593898296356201, + "learning_rate": 2.101808868256763e-06, + "loss": 2.025, + "step": 24670 + }, + { + "epoch": 3.312751677852349, + "grad_norm": 0.5315661430358887, + "learning_rate": 2.0986073315191293e-06, + "loss": 2.0221, + "step": 24680 + }, + { + "epoch": 3.3140939597315437, + "grad_norm": 0.6451142430305481, + "learning_rate": 2.0954057947814954e-06, + "loss": 2.0162, + "step": 24690 + }, + { + "epoch": 3.315436241610738, + "grad_norm": 0.6757481694221497, + "learning_rate": 2.092204258043861e-06, + "loss": 1.9826, + "step": 24700 + }, + { + "epoch": 3.315436241610738, + "eval_loss": 2.0339531898498535, + "eval_runtime": 99.0126, + "eval_samples_per_second": 10.1, + "eval_steps_per_second": 5.05, + "step": 24700 + }, + { + "epoch": 3.316778523489933, + "grad_norm": 0.5286951065063477, + "learning_rate": 2.0890027213062272e-06, + "loss": 1.9789, + "step": 24710 + }, + { + "epoch": 3.3181208053691273, + "grad_norm": 0.6284973621368408, + "learning_rate": 2.085801184568593e-06, + "loss": 2.0031, + "step": 24720 + }, + { + "epoch": 3.319463087248322, + "grad_norm": 0.5724067091941833, + "learning_rate": 2.082599647830959e-06, + "loss": 2.0282, + "step": 24730 + }, + { + "epoch": 3.320805369127517, + "grad_norm": 0.5950222015380859, + "learning_rate": 2.0793981110933252e-06, + "loss": 1.9978, + "step": 24740 + }, + { + "epoch": 3.3221476510067114, + "grad_norm": 0.5722485184669495, + "learning_rate": 2.076196574355691e-06, + "loss": 2.0349, + "step": 24750 + }, + { + "epoch": 3.323489932885906, + "grad_norm": 0.5992030501365662, + "learning_rate": 2.072995037618057e-06, + "loss": 2.0462, + "step": 24760 + }, + { + "epoch": 3.3248322147651006, + "grad_norm": 0.5604340434074402, + "learning_rate": 2.0697935008804228e-06, + "loss": 1.9957, + "step": 24770 + }, + { + "epoch": 3.3261744966442954, + "grad_norm": 0.6042061448097229, + "learning_rate": 2.0665919641427885e-06, + "loss": 1.9757, + "step": 24780 + }, + { + "epoch": 3.32751677852349, + "grad_norm": 0.5957545042037964, + "learning_rate": 2.0633904274051546e-06, + "loss": 2.014, + "step": 24790 + }, + { + "epoch": 3.3288590604026846, + "grad_norm": 0.5731126666069031, + "learning_rate": 2.0601888906675203e-06, + "loss": 1.9967, + "step": 24800 + }, + { + "epoch": 3.3288590604026846, + "eval_loss": 2.0340585708618164, + "eval_runtime": 99.1874, + "eval_samples_per_second": 10.082, + "eval_steps_per_second": 5.041, + "step": 24800 + }, + { + "epoch": 3.330201342281879, + "grad_norm": 0.6661995053291321, + "learning_rate": 2.0569873539298865e-06, + "loss": 2.0049, + "step": 24810 + }, + { + "epoch": 3.331543624161074, + "grad_norm": 0.6591014862060547, + "learning_rate": 2.0537858171922526e-06, + "loss": 2.0198, + "step": 24820 + }, + { + "epoch": 3.3328859060402687, + "grad_norm": 0.6858102679252625, + "learning_rate": 2.0505842804546183e-06, + "loss": 2.0049, + "step": 24830 + }, + { + "epoch": 3.334228187919463, + "grad_norm": 0.5600757002830505, + "learning_rate": 2.0473827437169844e-06, + "loss": 2.0191, + "step": 24840 + }, + { + "epoch": 3.335570469798658, + "grad_norm": 0.632583498954773, + "learning_rate": 2.04418120697935e-06, + "loss": 2.0108, + "step": 24850 + }, + { + "epoch": 3.3369127516778523, + "grad_norm": 0.5636388659477234, + "learning_rate": 2.0409796702417163e-06, + "loss": 2.011, + "step": 24860 + }, + { + "epoch": 3.338255033557047, + "grad_norm": 0.6372313499450684, + "learning_rate": 2.0377781335040824e-06, + "loss": 2.0202, + "step": 24870 + }, + { + "epoch": 3.3395973154362415, + "grad_norm": 0.5647403001785278, + "learning_rate": 2.034576596766448e-06, + "loss": 2.0111, + "step": 24880 + }, + { + "epoch": 3.3409395973154363, + "grad_norm": 0.5417010188102722, + "learning_rate": 2.031375060028814e-06, + "loss": 2.0318, + "step": 24890 + }, + { + "epoch": 3.3422818791946307, + "grad_norm": 0.5448442697525024, + "learning_rate": 2.02817352329118e-06, + "loss": 2.0175, + "step": 24900 + }, + { + "epoch": 3.3422818791946307, + "eval_loss": 2.033846855163574, + "eval_runtime": 99.6231, + "eval_samples_per_second": 10.038, + "eval_steps_per_second": 5.019, + "step": 24900 + }, + { + "epoch": 3.3436241610738255, + "grad_norm": 0.6390599012374878, + "learning_rate": 2.0249719865535457e-06, + "loss": 2.0158, + "step": 24910 + }, + { + "epoch": 3.3449664429530204, + "grad_norm": 0.6264297962188721, + "learning_rate": 2.021770449815912e-06, + "loss": 1.9945, + "step": 24920 + }, + { + "epoch": 3.3463087248322148, + "grad_norm": 0.5326258540153503, + "learning_rate": 2.0185689130782775e-06, + "loss": 2.0242, + "step": 24930 + }, + { + "epoch": 3.3476510067114096, + "grad_norm": 0.6191580295562744, + "learning_rate": 2.0153673763406437e-06, + "loss": 2.01, + "step": 24940 + }, + { + "epoch": 3.348993288590604, + "grad_norm": 0.5405324697494507, + "learning_rate": 2.0121658396030098e-06, + "loss": 2.0089, + "step": 24950 + }, + { + "epoch": 3.350335570469799, + "grad_norm": 0.6203567981719971, + "learning_rate": 2.0089643028653755e-06, + "loss": 1.9956, + "step": 24960 + }, + { + "epoch": 3.351677852348993, + "grad_norm": 0.6961588263511658, + "learning_rate": 2.0057627661277416e-06, + "loss": 2.0056, + "step": 24970 + }, + { + "epoch": 3.353020134228188, + "grad_norm": 0.5940682291984558, + "learning_rate": 2.0025612293901073e-06, + "loss": 2.009, + "step": 24980 + }, + { + "epoch": 3.3543624161073824, + "grad_norm": 0.5571852922439575, + "learning_rate": 1.9993596926524735e-06, + "loss": 1.9921, + "step": 24990 + }, + { + "epoch": 3.3557046979865772, + "grad_norm": 0.515678882598877, + "learning_rate": 1.996158155914839e-06, + "loss": 2.0536, + "step": 25000 + }, + { + "epoch": 3.3557046979865772, + "eval_loss": 2.033797264099121, + "eval_runtime": 102.1044, + "eval_samples_per_second": 9.794, + "eval_steps_per_second": 4.897, + "step": 25000 + }, + { + "epoch": 3.357046979865772, + "grad_norm": 0.5705350637435913, + "learning_rate": 1.9929566191772053e-06, + "loss": 2.0119, + "step": 25010 + }, + { + "epoch": 3.3583892617449664, + "grad_norm": 0.6690366864204407, + "learning_rate": 1.989755082439571e-06, + "loss": 2.0099, + "step": 25020 + }, + { + "epoch": 3.3597315436241613, + "grad_norm": 0.6348695755004883, + "learning_rate": 1.986553545701937e-06, + "loss": 2.0011, + "step": 25030 + }, + { + "epoch": 3.3610738255033556, + "grad_norm": 0.604619562625885, + "learning_rate": 1.983352008964303e-06, + "loss": 2.0025, + "step": 25040 + }, + { + "epoch": 3.3624161073825505, + "grad_norm": 0.5843760371208191, + "learning_rate": 1.980150472226669e-06, + "loss": 2.0191, + "step": 25050 + }, + { + "epoch": 3.363758389261745, + "grad_norm": 0.6079941987991333, + "learning_rate": 1.9769489354890347e-06, + "loss": 2.0135, + "step": 25060 + }, + { + "epoch": 3.3651006711409397, + "grad_norm": 0.5454660654067993, + "learning_rate": 1.973747398751401e-06, + "loss": 2.0175, + "step": 25070 + }, + { + "epoch": 3.366442953020134, + "grad_norm": 0.5789991021156311, + "learning_rate": 1.970545862013767e-06, + "loss": 1.9941, + "step": 25080 + }, + { + "epoch": 3.367785234899329, + "grad_norm": 0.5079576373100281, + "learning_rate": 1.9673443252761327e-06, + "loss": 2.031, + "step": 25090 + }, + { + "epoch": 3.3691275167785237, + "grad_norm": 0.5263907313346863, + "learning_rate": 1.964142788538499e-06, + "loss": 2.0079, + "step": 25100 + }, + { + "epoch": 3.3691275167785237, + "eval_loss": 2.033698081970215, + "eval_runtime": 99.4346, + "eval_samples_per_second": 10.057, + "eval_steps_per_second": 5.028, + "step": 25100 + }, + { + "epoch": 3.370469798657718, + "grad_norm": 0.5881233811378479, + "learning_rate": 1.9609412518008645e-06, + "loss": 2.0269, + "step": 25110 + }, + { + "epoch": 3.3718120805369125, + "grad_norm": 0.551294207572937, + "learning_rate": 1.9577397150632307e-06, + "loss": 2.0289, + "step": 25120 + }, + { + "epoch": 3.3731543624161073, + "grad_norm": 0.5218905210494995, + "learning_rate": 1.9545381783255964e-06, + "loss": 2.0029, + "step": 25130 + }, + { + "epoch": 3.374496644295302, + "grad_norm": 0.5961599349975586, + "learning_rate": 1.951336641587962e-06, + "loss": 2.0059, + "step": 25140 + }, + { + "epoch": 3.3758389261744965, + "grad_norm": 0.5878936052322388, + "learning_rate": 1.9481351048503282e-06, + "loss": 2.0329, + "step": 25150 + }, + { + "epoch": 3.3771812080536914, + "grad_norm": 0.5031041502952576, + "learning_rate": 1.9449335681126944e-06, + "loss": 2.0187, + "step": 25160 + }, + { + "epoch": 3.3785234899328858, + "grad_norm": 0.5873698592185974, + "learning_rate": 1.94173203137506e-06, + "loss": 2.0103, + "step": 25170 + }, + { + "epoch": 3.3798657718120806, + "grad_norm": 0.6120571494102478, + "learning_rate": 1.938530494637426e-06, + "loss": 2.0029, + "step": 25180 + }, + { + "epoch": 3.3812080536912754, + "grad_norm": 0.6189383864402771, + "learning_rate": 1.9353289578997923e-06, + "loss": 2.0328, + "step": 25190 + }, + { + "epoch": 3.38255033557047, + "grad_norm": 0.5471286177635193, + "learning_rate": 1.932127421162158e-06, + "loss": 2.0107, + "step": 25200 + }, + { + "epoch": 3.38255033557047, + "eval_loss": 2.0336575508117676, + "eval_runtime": 99.6661, + "eval_samples_per_second": 10.033, + "eval_steps_per_second": 5.017, + "step": 25200 + }, + { + "epoch": 3.383892617449664, + "grad_norm": 0.6131651997566223, + "learning_rate": 1.928925884424524e-06, + "loss": 2.0046, + "step": 25210 + }, + { + "epoch": 3.385234899328859, + "grad_norm": 0.501825213432312, + "learning_rate": 1.92572434768689e-06, + "loss": 2.0169, + "step": 25220 + }, + { + "epoch": 3.386577181208054, + "grad_norm": 0.6550677418708801, + "learning_rate": 1.9225228109492556e-06, + "loss": 2.0105, + "step": 25230 + }, + { + "epoch": 3.3879194630872482, + "grad_norm": 0.590255081653595, + "learning_rate": 1.9193212742116217e-06, + "loss": 2.0032, + "step": 25240 + }, + { + "epoch": 3.389261744966443, + "grad_norm": 0.5433794856071472, + "learning_rate": 1.9161197374739874e-06, + "loss": 2.0134, + "step": 25250 + }, + { + "epoch": 3.3906040268456374, + "grad_norm": 0.5783948302268982, + "learning_rate": 1.9129182007363536e-06, + "loss": 1.9641, + "step": 25260 + }, + { + "epoch": 3.3919463087248323, + "grad_norm": 0.5930677056312561, + "learning_rate": 1.9097166639987193e-06, + "loss": 1.9944, + "step": 25270 + }, + { + "epoch": 3.3932885906040267, + "grad_norm": 0.5659170150756836, + "learning_rate": 1.9065151272610854e-06, + "loss": 2.016, + "step": 25280 + }, + { + "epoch": 3.3946308724832215, + "grad_norm": 0.5566468238830566, + "learning_rate": 1.9033135905234516e-06, + "loss": 2.0061, + "step": 25290 + }, + { + "epoch": 3.395973154362416, + "grad_norm": 0.5847912430763245, + "learning_rate": 1.9001120537858173e-06, + "loss": 1.9786, + "step": 25300 + }, + { + "epoch": 3.395973154362416, + "eval_loss": 2.033474922180176, + "eval_runtime": 99.6194, + "eval_samples_per_second": 10.038, + "eval_steps_per_second": 5.019, + "step": 25300 + }, + { + "epoch": 3.3973154362416107, + "grad_norm": 0.5895040035247803, + "learning_rate": 1.8969105170481834e-06, + "loss": 1.9955, + "step": 25310 + }, + { + "epoch": 3.3986577181208055, + "grad_norm": 0.6539040207862854, + "learning_rate": 1.8937089803105493e-06, + "loss": 1.9979, + "step": 25320 + }, + { + "epoch": 3.4, + "grad_norm": 0.5976948142051697, + "learning_rate": 1.890507443572915e-06, + "loss": 2.0185, + "step": 25330 + }, + { + "epoch": 3.4013422818791947, + "grad_norm": 0.53128981590271, + "learning_rate": 1.8873059068352812e-06, + "loss": 2.0292, + "step": 25340 + }, + { + "epoch": 3.402684563758389, + "grad_norm": 0.5859795808792114, + "learning_rate": 1.8841043700976469e-06, + "loss": 2.0067, + "step": 25350 + }, + { + "epoch": 3.404026845637584, + "grad_norm": 0.5527817010879517, + "learning_rate": 1.880902833360013e-06, + "loss": 2.0164, + "step": 25360 + }, + { + "epoch": 3.4053691275167783, + "grad_norm": 0.5099530220031738, + "learning_rate": 1.877701296622379e-06, + "loss": 2.0122, + "step": 25370 + }, + { + "epoch": 3.406711409395973, + "grad_norm": 0.5217568278312683, + "learning_rate": 1.8744997598847448e-06, + "loss": 1.9672, + "step": 25380 + }, + { + "epoch": 3.4080536912751676, + "grad_norm": 0.569888174533844, + "learning_rate": 1.8712982231471108e-06, + "loss": 2.0249, + "step": 25390 + }, + { + "epoch": 3.4093959731543624, + "grad_norm": 0.5308755040168762, + "learning_rate": 1.868096686409477e-06, + "loss": 2.0134, + "step": 25400 + }, + { + "epoch": 3.4093959731543624, + "eval_loss": 2.0333244800567627, + "eval_runtime": 99.7654, + "eval_samples_per_second": 10.024, + "eval_steps_per_second": 5.012, + "step": 25400 + }, + { + "epoch": 3.410738255033557, + "grad_norm": 0.5577428936958313, + "learning_rate": 1.8648951496718426e-06, + "loss": 1.9833, + "step": 25410 + }, + { + "epoch": 3.4120805369127516, + "grad_norm": 0.5679981708526611, + "learning_rate": 1.8616936129342085e-06, + "loss": 2.0226, + "step": 25420 + }, + { + "epoch": 3.4134228187919464, + "grad_norm": 0.5294861197471619, + "learning_rate": 1.8584920761965745e-06, + "loss": 1.9814, + "step": 25430 + }, + { + "epoch": 3.414765100671141, + "grad_norm": 0.5780696868896484, + "learning_rate": 1.8552905394589404e-06, + "loss": 2.0479, + "step": 25440 + }, + { + "epoch": 3.4161073825503356, + "grad_norm": 0.5870608687400818, + "learning_rate": 1.8520890027213065e-06, + "loss": 2.0535, + "step": 25450 + }, + { + "epoch": 3.41744966442953, + "grad_norm": 0.6202998757362366, + "learning_rate": 1.8488874659836722e-06, + "loss": 2.0074, + "step": 25460 + }, + { + "epoch": 3.418791946308725, + "grad_norm": 0.5340785980224609, + "learning_rate": 1.8456859292460384e-06, + "loss": 1.9816, + "step": 25470 + }, + { + "epoch": 3.4201342281879192, + "grad_norm": 0.5752537250518799, + "learning_rate": 1.842484392508404e-06, + "loss": 2.0156, + "step": 25480 + }, + { + "epoch": 3.421476510067114, + "grad_norm": 0.533042311668396, + "learning_rate": 1.83928285577077e-06, + "loss": 1.9993, + "step": 25490 + }, + { + "epoch": 3.422818791946309, + "grad_norm": 0.5413929224014282, + "learning_rate": 1.8360813190331361e-06, + "loss": 2.0008, + "step": 25500 + }, + { + "epoch": 3.422818791946309, + "eval_loss": 2.033494234085083, + "eval_runtime": 99.8012, + "eval_samples_per_second": 10.02, + "eval_steps_per_second": 5.01, + "step": 25500 + }, + { + "epoch": 3.4241610738255033, + "grad_norm": 0.613568902015686, + "learning_rate": 1.8328797822955018e-06, + "loss": 1.9857, + "step": 25510 + }, + { + "epoch": 3.425503355704698, + "grad_norm": 0.5847415924072266, + "learning_rate": 1.829678245557868e-06, + "loss": 2.005, + "step": 25520 + }, + { + "epoch": 3.4268456375838925, + "grad_norm": 0.6314620971679688, + "learning_rate": 1.8264767088202339e-06, + "loss": 2.0095, + "step": 25530 + }, + { + "epoch": 3.4281879194630873, + "grad_norm": 0.5585344433784485, + "learning_rate": 1.8232751720825998e-06, + "loss": 1.9953, + "step": 25540 + }, + { + "epoch": 3.4295302013422817, + "grad_norm": 0.5491799712181091, + "learning_rate": 1.8200736353449657e-06, + "loss": 1.9806, + "step": 25550 + }, + { + "epoch": 3.4308724832214765, + "grad_norm": 0.5185725688934326, + "learning_rate": 1.8168720986073317e-06, + "loss": 1.9981, + "step": 25560 + }, + { + "epoch": 3.432214765100671, + "grad_norm": 0.5704931020736694, + "learning_rate": 1.8136705618696976e-06, + "loss": 2.0389, + "step": 25570 + }, + { + "epoch": 3.4335570469798657, + "grad_norm": 0.6064590811729431, + "learning_rate": 1.8104690251320637e-06, + "loss": 2.0434, + "step": 25580 + }, + { + "epoch": 3.4348993288590606, + "grad_norm": 0.5624018907546997, + "learning_rate": 1.8072674883944294e-06, + "loss": 2.0115, + "step": 25590 + }, + { + "epoch": 3.436241610738255, + "grad_norm": 0.6257191896438599, + "learning_rate": 1.8040659516567953e-06, + "loss": 2.0321, + "step": 25600 + }, + { + "epoch": 3.436241610738255, + "eval_loss": 2.033203601837158, + "eval_runtime": 99.6708, + "eval_samples_per_second": 10.033, + "eval_steps_per_second": 5.017, + "step": 25600 + }, + { + "epoch": 3.43758389261745, + "grad_norm": 0.564278781414032, + "learning_rate": 1.8008644149191615e-06, + "loss": 2.004, + "step": 25610 + }, + { + "epoch": 3.438926174496644, + "grad_norm": 0.5536876320838928, + "learning_rate": 1.7976628781815272e-06, + "loss": 2.0119, + "step": 25620 + }, + { + "epoch": 3.440268456375839, + "grad_norm": 0.5710294246673584, + "learning_rate": 1.7944613414438933e-06, + "loss": 2.0375, + "step": 25630 + }, + { + "epoch": 3.4416107382550334, + "grad_norm": 0.5698024034500122, + "learning_rate": 1.791259804706259e-06, + "loss": 2.0173, + "step": 25640 + }, + { + "epoch": 3.442953020134228, + "grad_norm": 0.5126367211341858, + "learning_rate": 1.7880582679686252e-06, + "loss": 2.0378, + "step": 25650 + }, + { + "epoch": 3.4442953020134226, + "grad_norm": 0.6241849064826965, + "learning_rate": 1.784856731230991e-06, + "loss": 2.0066, + "step": 25660 + }, + { + "epoch": 3.4456375838926174, + "grad_norm": 0.5671263933181763, + "learning_rate": 1.7816551944933568e-06, + "loss": 2.0082, + "step": 25670 + }, + { + "epoch": 3.4469798657718123, + "grad_norm": 0.6242393851280212, + "learning_rate": 1.778453657755723e-06, + "loss": 1.9936, + "step": 25680 + }, + { + "epoch": 3.4483221476510066, + "grad_norm": 0.5984540581703186, + "learning_rate": 1.7752521210180886e-06, + "loss": 2.0275, + "step": 25690 + }, + { + "epoch": 3.4496644295302015, + "grad_norm": 0.5671818256378174, + "learning_rate": 1.7720505842804548e-06, + "loss": 1.9987, + "step": 25700 + }, + { + "epoch": 3.4496644295302015, + "eval_loss": 2.03314208984375, + "eval_runtime": 99.758, + "eval_samples_per_second": 10.024, + "eval_steps_per_second": 5.012, + "step": 25700 + }, + { + "epoch": 3.451006711409396, + "grad_norm": 0.550834596157074, + "learning_rate": 1.7688490475428207e-06, + "loss": 2.0123, + "step": 25710 + }, + { + "epoch": 3.4523489932885907, + "grad_norm": 0.5404412150382996, + "learning_rate": 1.7656475108051866e-06, + "loss": 1.9957, + "step": 25720 + }, + { + "epoch": 3.453691275167785, + "grad_norm": 0.576654851436615, + "learning_rate": 1.7624459740675525e-06, + "loss": 2.0206, + "step": 25730 + }, + { + "epoch": 3.45503355704698, + "grad_norm": 0.5848711729049683, + "learning_rate": 1.7592444373299187e-06, + "loss": 2.0004, + "step": 25740 + }, + { + "epoch": 3.4563758389261743, + "grad_norm": 0.525976300239563, + "learning_rate": 1.7560429005922844e-06, + "loss": 2.0218, + "step": 25750 + }, + { + "epoch": 3.457718120805369, + "grad_norm": 0.5736172795295715, + "learning_rate": 1.7528413638546505e-06, + "loss": 2.0345, + "step": 25760 + }, + { + "epoch": 3.459060402684564, + "grad_norm": 0.6559216380119324, + "learning_rate": 1.7496398271170162e-06, + "loss": 1.978, + "step": 25770 + }, + { + "epoch": 3.4604026845637583, + "grad_norm": 0.5698836445808411, + "learning_rate": 1.7464382903793821e-06, + "loss": 2.0003, + "step": 25780 + }, + { + "epoch": 3.461744966442953, + "grad_norm": 0.6036633253097534, + "learning_rate": 1.7432367536417483e-06, + "loss": 2.0183, + "step": 25790 + }, + { + "epoch": 3.4630872483221475, + "grad_norm": 0.5477694869041443, + "learning_rate": 1.740035216904114e-06, + "loss": 1.9895, + "step": 25800 + }, + { + "epoch": 3.4630872483221475, + "eval_loss": 2.0331811904907227, + "eval_runtime": 99.6707, + "eval_samples_per_second": 10.033, + "eval_steps_per_second": 5.017, + "step": 25800 + }, + { + "epoch": 3.4644295302013424, + "grad_norm": 0.664413571357727, + "learning_rate": 1.7368336801664801e-06, + "loss": 2.0055, + "step": 25810 + }, + { + "epoch": 3.4657718120805368, + "grad_norm": 0.5614915490150452, + "learning_rate": 1.733632143428846e-06, + "loss": 2.017, + "step": 25820 + }, + { + "epoch": 3.4671140939597316, + "grad_norm": 0.6014454364776611, + "learning_rate": 1.730430606691212e-06, + "loss": 2.0008, + "step": 25830 + }, + { + "epoch": 3.468456375838926, + "grad_norm": 0.5873821377754211, + "learning_rate": 1.7272290699535779e-06, + "loss": 1.9929, + "step": 25840 + }, + { + "epoch": 3.469798657718121, + "grad_norm": 0.5503237247467041, + "learning_rate": 1.7240275332159436e-06, + "loss": 2.045, + "step": 25850 + }, + { + "epoch": 3.4711409395973156, + "grad_norm": 0.5496537685394287, + "learning_rate": 1.7208259964783097e-06, + "loss": 2.0364, + "step": 25860 + }, + { + "epoch": 3.47248322147651, + "grad_norm": 0.5419268608093262, + "learning_rate": 1.7176244597406759e-06, + "loss": 2.0333, + "step": 25870 + }, + { + "epoch": 3.473825503355705, + "grad_norm": 0.5395207405090332, + "learning_rate": 1.7144229230030416e-06, + "loss": 1.9939, + "step": 25880 + }, + { + "epoch": 3.475167785234899, + "grad_norm": 0.5907343029975891, + "learning_rate": 1.7112213862654075e-06, + "loss": 2.032, + "step": 25890 + }, + { + "epoch": 3.476510067114094, + "grad_norm": 0.5549110174179077, + "learning_rate": 1.7080198495277734e-06, + "loss": 2.0245, + "step": 25900 + }, + { + "epoch": 3.476510067114094, + "eval_loss": 2.0327773094177246, + "eval_runtime": 99.7139, + "eval_samples_per_second": 10.029, + "eval_steps_per_second": 5.014, + "step": 25900 + }, + { + "epoch": 3.4778523489932884, + "grad_norm": 0.5323525667190552, + "learning_rate": 1.7048183127901393e-06, + "loss": 2.0095, + "step": 25910 + }, + { + "epoch": 3.4791946308724833, + "grad_norm": 0.5647071599960327, + "learning_rate": 1.7016167760525055e-06, + "loss": 2.0093, + "step": 25920 + }, + { + "epoch": 3.4805369127516776, + "grad_norm": 0.5616557598114014, + "learning_rate": 1.6984152393148712e-06, + "loss": 2.0069, + "step": 25930 + }, + { + "epoch": 3.4818791946308725, + "grad_norm": 0.5042036175727844, + "learning_rate": 1.6952137025772373e-06, + "loss": 1.9895, + "step": 25940 + }, + { + "epoch": 3.4832214765100673, + "grad_norm": 0.6743494868278503, + "learning_rate": 1.6920121658396032e-06, + "loss": 2.0078, + "step": 25950 + }, + { + "epoch": 3.4845637583892617, + "grad_norm": 0.5964071154594421, + "learning_rate": 1.688810629101969e-06, + "loss": 2.0144, + "step": 25960 + }, + { + "epoch": 3.4859060402684565, + "grad_norm": 0.5817059278488159, + "learning_rate": 1.685609092364335e-06, + "loss": 1.9803, + "step": 25970 + }, + { + "epoch": 3.487248322147651, + "grad_norm": 0.5435609221458435, + "learning_rate": 1.6824075556267008e-06, + "loss": 1.9754, + "step": 25980 + }, + { + "epoch": 3.4885906040268457, + "grad_norm": 0.5652287006378174, + "learning_rate": 1.679206018889067e-06, + "loss": 1.9967, + "step": 25990 + }, + { + "epoch": 3.48993288590604, + "grad_norm": 0.5400235056877136, + "learning_rate": 1.6760044821514328e-06, + "loss": 2.0134, + "step": 26000 + }, + { + "epoch": 3.48993288590604, + "eval_loss": 2.0326852798461914, + "eval_runtime": 99.736, + "eval_samples_per_second": 10.026, + "eval_steps_per_second": 5.013, + "step": 26000 + }, + { + "epoch": 3.491275167785235, + "grad_norm": 0.5601316690444946, + "learning_rate": 1.6728029454137988e-06, + "loss": 1.9638, + "step": 26010 + }, + { + "epoch": 3.4926174496644293, + "grad_norm": 0.594646692276001, + "learning_rate": 1.6696014086761647e-06, + "loss": 2.0044, + "step": 26020 + }, + { + "epoch": 3.493959731543624, + "grad_norm": 0.5239794254302979, + "learning_rate": 1.6663998719385308e-06, + "loss": 1.9931, + "step": 26030 + }, + { + "epoch": 3.495302013422819, + "grad_norm": 0.5149232745170593, + "learning_rate": 1.6631983352008965e-06, + "loss": 1.9846, + "step": 26040 + }, + { + "epoch": 3.4966442953020134, + "grad_norm": 0.5560303926467896, + "learning_rate": 1.6599967984632625e-06, + "loss": 2.0282, + "step": 26050 + }, + { + "epoch": 3.497986577181208, + "grad_norm": 0.5595155358314514, + "learning_rate": 1.6567952617256284e-06, + "loss": 2.0358, + "step": 26060 + }, + { + "epoch": 3.4993288590604026, + "grad_norm": 0.5482628345489502, + "learning_rate": 1.6535937249879943e-06, + "loss": 2.0165, + "step": 26070 + }, + { + "epoch": 3.5006711409395974, + "grad_norm": 0.5926582217216492, + "learning_rate": 1.6503921882503604e-06, + "loss": 2.0109, + "step": 26080 + }, + { + "epoch": 3.502013422818792, + "grad_norm": 0.5720061659812927, + "learning_rate": 1.6471906515127261e-06, + "loss": 2.0206, + "step": 26090 + }, + { + "epoch": 3.5033557046979866, + "grad_norm": 0.6203693747520447, + "learning_rate": 1.6439891147750923e-06, + "loss": 2.0082, + "step": 26100 + }, + { + "epoch": 3.5033557046979866, + "eval_loss": 2.0324747562408447, + "eval_runtime": 99.7144, + "eval_samples_per_second": 10.029, + "eval_steps_per_second": 5.014, + "step": 26100 + }, + { + "epoch": 3.504697986577181, + "grad_norm": 0.519006609916687, + "learning_rate": 1.6407875780374582e-06, + "loss": 2.0433, + "step": 26110 + }, + { + "epoch": 3.506040268456376, + "grad_norm": 0.5979138612747192, + "learning_rate": 1.637586041299824e-06, + "loss": 1.9624, + "step": 26120 + }, + { + "epoch": 3.5073825503355707, + "grad_norm": 0.6031221151351929, + "learning_rate": 1.63438450456219e-06, + "loss": 2.0151, + "step": 26130 + }, + { + "epoch": 3.508724832214765, + "grad_norm": 0.5102588534355164, + "learning_rate": 1.6311829678245558e-06, + "loss": 2.012, + "step": 26140 + }, + { + "epoch": 3.51006711409396, + "grad_norm": 0.5204235315322876, + "learning_rate": 1.6279814310869219e-06, + "loss": 2.0186, + "step": 26150 + }, + { + "epoch": 3.5114093959731543, + "grad_norm": 0.5003209710121155, + "learning_rate": 1.6247798943492878e-06, + "loss": 2.039, + "step": 26160 + }, + { + "epoch": 3.512751677852349, + "grad_norm": 0.5853362679481506, + "learning_rate": 1.6215783576116537e-06, + "loss": 2.0229, + "step": 26170 + }, + { + "epoch": 3.5140939597315435, + "grad_norm": 0.5810410976409912, + "learning_rate": 1.6183768208740197e-06, + "loss": 1.9996, + "step": 26180 + }, + { + "epoch": 3.5154362416107383, + "grad_norm": 0.5772440433502197, + "learning_rate": 1.6151752841363856e-06, + "loss": 2.0179, + "step": 26190 + }, + { + "epoch": 3.5167785234899327, + "grad_norm": 0.5555768609046936, + "learning_rate": 1.6119737473987515e-06, + "loss": 2.028, + "step": 26200 + }, + { + "epoch": 3.5167785234899327, + "eval_loss": 2.0322515964508057, + "eval_runtime": 99.7838, + "eval_samples_per_second": 10.022, + "eval_steps_per_second": 5.011, + "step": 26200 + }, + { + "epoch": 3.5181208053691275, + "grad_norm": 0.5459762215614319, + "learning_rate": 1.6087722106611176e-06, + "loss": 2.0294, + "step": 26210 + }, + { + "epoch": 3.5194630872483224, + "grad_norm": 0.5496200919151306, + "learning_rate": 1.6055706739234833e-06, + "loss": 2.0058, + "step": 26220 + }, + { + "epoch": 3.5208053691275167, + "grad_norm": 0.5605970025062561, + "learning_rate": 1.6023691371858493e-06, + "loss": 2.0198, + "step": 26230 + }, + { + "epoch": 3.5221476510067116, + "grad_norm": 0.47437921166419983, + "learning_rate": 1.5991676004482154e-06, + "loss": 2.0556, + "step": 26240 + }, + { + "epoch": 3.523489932885906, + "grad_norm": 0.50983726978302, + "learning_rate": 1.595966063710581e-06, + "loss": 2.0272, + "step": 26250 + }, + { + "epoch": 3.524832214765101, + "grad_norm": 0.566801130771637, + "learning_rate": 1.5927645269729472e-06, + "loss": 2.0002, + "step": 26260 + }, + { + "epoch": 3.526174496644295, + "grad_norm": 0.5266052484512329, + "learning_rate": 1.589562990235313e-06, + "loss": 2.0261, + "step": 26270 + }, + { + "epoch": 3.52751677852349, + "grad_norm": 0.5181676745414734, + "learning_rate": 1.586361453497679e-06, + "loss": 1.9843, + "step": 26280 + }, + { + "epoch": 3.5288590604026844, + "grad_norm": 0.5289013385772705, + "learning_rate": 1.583159916760045e-06, + "loss": 2.0023, + "step": 26290 + }, + { + "epoch": 3.530201342281879, + "grad_norm": 0.5178347826004028, + "learning_rate": 1.5799583800224107e-06, + "loss": 2.0372, + "step": 26300 + }, + { + "epoch": 3.530201342281879, + "eval_loss": 2.0322930812835693, + "eval_runtime": 99.3132, + "eval_samples_per_second": 10.069, + "eval_steps_per_second": 5.035, + "step": 26300 + }, + { + "epoch": 3.531543624161074, + "grad_norm": 0.5697055459022522, + "learning_rate": 1.5767568432847768e-06, + "loss": 2.0017, + "step": 26310 + }, + { + "epoch": 3.5328859060402684, + "grad_norm": 0.5801728963851929, + "learning_rate": 1.573555306547143e-06, + "loss": 1.9787, + "step": 26320 + }, + { + "epoch": 3.5342281879194632, + "grad_norm": 0.5830363631248474, + "learning_rate": 1.5703537698095087e-06, + "loss": 1.9798, + "step": 26330 + }, + { + "epoch": 3.5355704697986576, + "grad_norm": 0.5186100602149963, + "learning_rate": 1.5671522330718746e-06, + "loss": 2.0331, + "step": 26340 + }, + { + "epoch": 3.5369127516778525, + "grad_norm": 0.5778900384902954, + "learning_rate": 1.5639506963342405e-06, + "loss": 2.0245, + "step": 26350 + }, + { + "epoch": 3.538255033557047, + "grad_norm": 0.5799062848091125, + "learning_rate": 1.5607491595966065e-06, + "loss": 2.0297, + "step": 26360 + }, + { + "epoch": 3.5395973154362417, + "grad_norm": 0.60067218542099, + "learning_rate": 1.5575476228589726e-06, + "loss": 2.0386, + "step": 26370 + }, + { + "epoch": 3.540939597315436, + "grad_norm": 0.8203257322311401, + "learning_rate": 1.5543460861213383e-06, + "loss": 2.0141, + "step": 26380 + }, + { + "epoch": 3.542281879194631, + "grad_norm": 0.5709810256958008, + "learning_rate": 1.5511445493837044e-06, + "loss": 1.9943, + "step": 26390 + }, + { + "epoch": 3.5436241610738257, + "grad_norm": 0.5866687893867493, + "learning_rate": 1.5479430126460701e-06, + "loss": 1.9832, + "step": 26400 + }, + { + "epoch": 3.5436241610738257, + "eval_loss": 2.0319128036499023, + "eval_runtime": 99.0141, + "eval_samples_per_second": 10.1, + "eval_steps_per_second": 5.05, + "step": 26400 + }, + { + "epoch": 3.54496644295302, + "grad_norm": 0.5029199123382568, + "learning_rate": 1.544741475908436e-06, + "loss": 1.9856, + "step": 26410 + }, + { + "epoch": 3.546308724832215, + "grad_norm": 0.5404402613639832, + "learning_rate": 1.5415399391708022e-06, + "loss": 2.0053, + "step": 26420 + }, + { + "epoch": 3.5476510067114093, + "grad_norm": 0.5300592184066772, + "learning_rate": 1.538338402433168e-06, + "loss": 2.012, + "step": 26430 + }, + { + "epoch": 3.548993288590604, + "grad_norm": 0.6518120169639587, + "learning_rate": 1.535136865695534e-06, + "loss": 1.9945, + "step": 26440 + }, + { + "epoch": 3.5503355704697985, + "grad_norm": 0.5931301116943359, + "learning_rate": 1.5319353289579e-06, + "loss": 2.0299, + "step": 26450 + }, + { + "epoch": 3.5516778523489934, + "grad_norm": 0.594891369342804, + "learning_rate": 1.5287337922202659e-06, + "loss": 2.0179, + "step": 26460 + }, + { + "epoch": 3.5530201342281877, + "grad_norm": 0.567933201789856, + "learning_rate": 1.5255322554826318e-06, + "loss": 2.0182, + "step": 26470 + }, + { + "epoch": 3.5543624161073826, + "grad_norm": 0.5732447504997253, + "learning_rate": 1.5223307187449975e-06, + "loss": 2.002, + "step": 26480 + }, + { + "epoch": 3.5557046979865774, + "grad_norm": 0.5601099729537964, + "learning_rate": 1.5191291820073636e-06, + "loss": 2.0019, + "step": 26490 + }, + { + "epoch": 3.557046979865772, + "grad_norm": 0.5106274485588074, + "learning_rate": 1.5159276452697298e-06, + "loss": 1.9877, + "step": 26500 + }, + { + "epoch": 3.557046979865772, + "eval_loss": 2.032021999359131, + "eval_runtime": 99.2164, + "eval_samples_per_second": 10.079, + "eval_steps_per_second": 5.039, + "step": 26500 + }, + { + "epoch": 3.558389261744966, + "grad_norm": 0.5745659470558167, + "learning_rate": 1.5127261085320955e-06, + "loss": 2.0172, + "step": 26510 + }, + { + "epoch": 3.559731543624161, + "grad_norm": 0.6014290452003479, + "learning_rate": 1.5095245717944614e-06, + "loss": 1.987, + "step": 26520 + }, + { + "epoch": 3.561073825503356, + "grad_norm": 0.5782161951065063, + "learning_rate": 1.5063230350568275e-06, + "loss": 2.0174, + "step": 26530 + }, + { + "epoch": 3.56241610738255, + "grad_norm": 0.6540932059288025, + "learning_rate": 1.5031214983191933e-06, + "loss": 1.9996, + "step": 26540 + }, + { + "epoch": 3.563758389261745, + "grad_norm": 0.5923947095870972, + "learning_rate": 1.4999199615815594e-06, + "loss": 1.9941, + "step": 26550 + }, + { + "epoch": 3.5651006711409394, + "grad_norm": 0.5839203000068665, + "learning_rate": 1.496718424843925e-06, + "loss": 1.9726, + "step": 26560 + }, + { + "epoch": 3.5664429530201343, + "grad_norm": 0.5613699555397034, + "learning_rate": 1.4935168881062912e-06, + "loss": 1.9925, + "step": 26570 + }, + { + "epoch": 3.567785234899329, + "grad_norm": 0.5361098051071167, + "learning_rate": 1.4903153513686572e-06, + "loss": 2.0277, + "step": 26580 + }, + { + "epoch": 3.5691275167785235, + "grad_norm": 0.5905805826187134, + "learning_rate": 1.4871138146310229e-06, + "loss": 2.0459, + "step": 26590 + }, + { + "epoch": 3.570469798657718, + "grad_norm": 0.5785099267959595, + "learning_rate": 1.483912277893389e-06, + "loss": 2.0073, + "step": 26600 + }, + { + "epoch": 3.570469798657718, + "eval_loss": 2.0318801403045654, + "eval_runtime": 99.1857, + "eval_samples_per_second": 10.082, + "eval_steps_per_second": 5.041, + "step": 26600 + }, + { + "epoch": 3.5718120805369127, + "grad_norm": 0.5500389933586121, + "learning_rate": 1.4807107411557547e-06, + "loss": 2.0281, + "step": 26610 + }, + { + "epoch": 3.5731543624161075, + "grad_norm": 0.5361244678497314, + "learning_rate": 1.4775092044181208e-06, + "loss": 1.9826, + "step": 26620 + }, + { + "epoch": 3.574496644295302, + "grad_norm": 0.5269361138343811, + "learning_rate": 1.4743076676804868e-06, + "loss": 2.0206, + "step": 26630 + }, + { + "epoch": 3.5758389261744967, + "grad_norm": 0.5010485053062439, + "learning_rate": 1.4711061309428527e-06, + "loss": 2.0153, + "step": 26640 + }, + { + "epoch": 3.577181208053691, + "grad_norm": 0.5887482166290283, + "learning_rate": 1.4679045942052186e-06, + "loss": 2.0069, + "step": 26650 + }, + { + "epoch": 3.578523489932886, + "grad_norm": 0.5067187547683716, + "learning_rate": 1.4647030574675847e-06, + "loss": 2.0108, + "step": 26660 + }, + { + "epoch": 3.5798657718120808, + "grad_norm": 0.5837522745132446, + "learning_rate": 1.4615015207299505e-06, + "loss": 2.0401, + "step": 26670 + }, + { + "epoch": 3.581208053691275, + "grad_norm": 0.5169274210929871, + "learning_rate": 1.4582999839923164e-06, + "loss": 2.0129, + "step": 26680 + }, + { + "epoch": 3.5825503355704695, + "grad_norm": 0.5982518792152405, + "learning_rate": 1.4550984472546823e-06, + "loss": 2.0098, + "step": 26690 + }, + { + "epoch": 3.5838926174496644, + "grad_norm": 0.5763619542121887, + "learning_rate": 1.4518969105170482e-06, + "loss": 2.0456, + "step": 26700 + }, + { + "epoch": 3.5838926174496644, + "eval_loss": 2.0316596031188965, + "eval_runtime": 99.2723, + "eval_samples_per_second": 10.073, + "eval_steps_per_second": 5.037, + "step": 26700 + }, + { + "epoch": 3.585234899328859, + "grad_norm": 0.5087338089942932, + "learning_rate": 1.4486953737794144e-06, + "loss": 2.0014, + "step": 26710 + }, + { + "epoch": 3.5865771812080536, + "grad_norm": 0.5658003687858582, + "learning_rate": 1.44549383704178e-06, + "loss": 1.993, + "step": 26720 + }, + { + "epoch": 3.5879194630872484, + "grad_norm": 0.5506657958030701, + "learning_rate": 1.4422923003041462e-06, + "loss": 1.9865, + "step": 26730 + }, + { + "epoch": 3.589261744966443, + "grad_norm": 0.5712320804595947, + "learning_rate": 1.4390907635665121e-06, + "loss": 1.9918, + "step": 26740 + }, + { + "epoch": 3.5906040268456376, + "grad_norm": 0.5391896367073059, + "learning_rate": 1.435889226828878e-06, + "loss": 2.0186, + "step": 26750 + }, + { + "epoch": 3.5919463087248324, + "grad_norm": 0.5626962780952454, + "learning_rate": 1.432687690091244e-06, + "loss": 2.0359, + "step": 26760 + }, + { + "epoch": 3.593288590604027, + "grad_norm": 0.5323194265365601, + "learning_rate": 1.4294861533536097e-06, + "loss": 2.0044, + "step": 26770 + }, + { + "epoch": 3.594630872483221, + "grad_norm": 0.49520260095596313, + "learning_rate": 1.4262846166159758e-06, + "loss": 2.0125, + "step": 26780 + }, + { + "epoch": 3.595973154362416, + "grad_norm": 0.5849247574806213, + "learning_rate": 1.4230830798783417e-06, + "loss": 2.0276, + "step": 26790 + }, + { + "epoch": 3.597315436241611, + "grad_norm": 0.5351284146308899, + "learning_rate": 1.4198815431407076e-06, + "loss": 2.0037, + "step": 26800 + }, + { + "epoch": 3.597315436241611, + "eval_loss": 2.0315005779266357, + "eval_runtime": 99.2241, + "eval_samples_per_second": 10.078, + "eval_steps_per_second": 5.039, + "step": 26800 + }, + { + "epoch": 3.5986577181208053, + "grad_norm": 0.5691702365875244, + "learning_rate": 1.4166800064030736e-06, + "loss": 1.9961, + "step": 26810 + }, + { + "epoch": 3.6, + "grad_norm": 0.5791316032409668, + "learning_rate": 1.4134784696654395e-06, + "loss": 1.9893, + "step": 26820 + }, + { + "epoch": 3.6013422818791945, + "grad_norm": 0.577229380607605, + "learning_rate": 1.4102769329278054e-06, + "loss": 2.0102, + "step": 26830 + }, + { + "epoch": 3.6026845637583893, + "grad_norm": 0.5284247398376465, + "learning_rate": 1.4070753961901715e-06, + "loss": 2.0143, + "step": 26840 + }, + { + "epoch": 3.604026845637584, + "grad_norm": 0.5774010419845581, + "learning_rate": 1.4038738594525373e-06, + "loss": 2.0377, + "step": 26850 + }, + { + "epoch": 3.6053691275167785, + "grad_norm": 0.5636426210403442, + "learning_rate": 1.4006723227149032e-06, + "loss": 2.0085, + "step": 26860 + }, + { + "epoch": 3.606711409395973, + "grad_norm": 0.5386518239974976, + "learning_rate": 1.3974707859772693e-06, + "loss": 2.0103, + "step": 26870 + }, + { + "epoch": 3.6080536912751677, + "grad_norm": 0.5856292247772217, + "learning_rate": 1.394269249239635e-06, + "loss": 2.0179, + "step": 26880 + }, + { + "epoch": 3.6093959731543626, + "grad_norm": 0.5678332448005676, + "learning_rate": 1.3910677125020012e-06, + "loss": 2.0337, + "step": 26890 + }, + { + "epoch": 3.610738255033557, + "grad_norm": 0.5619184374809265, + "learning_rate": 1.3878661757643669e-06, + "loss": 1.9718, + "step": 26900 + }, + { + "epoch": 3.610738255033557, + "eval_loss": 2.031461715698242, + "eval_runtime": 99.0243, + "eval_samples_per_second": 10.099, + "eval_steps_per_second": 5.049, + "step": 26900 + }, + { + "epoch": 3.6120805369127518, + "grad_norm": 0.5686038136482239, + "learning_rate": 1.384664639026733e-06, + "loss": 2.0198, + "step": 26910 + }, + { + "epoch": 3.613422818791946, + "grad_norm": 0.547531008720398, + "learning_rate": 1.381463102289099e-06, + "loss": 1.9721, + "step": 26920 + }, + { + "epoch": 3.614765100671141, + "grad_norm": 0.6028099656105042, + "learning_rate": 1.3782615655514646e-06, + "loss": 2.0103, + "step": 26930 + }, + { + "epoch": 3.616107382550336, + "grad_norm": 0.5436133742332458, + "learning_rate": 1.3750600288138308e-06, + "loss": 2.0334, + "step": 26940 + }, + { + "epoch": 3.61744966442953, + "grad_norm": 0.6020119190216064, + "learning_rate": 1.371858492076197e-06, + "loss": 1.9932, + "step": 26950 + }, + { + "epoch": 3.6187919463087246, + "grad_norm": 0.5240419507026672, + "learning_rate": 1.3686569553385626e-06, + "loss": 1.9908, + "step": 26960 + }, + { + "epoch": 3.6201342281879194, + "grad_norm": 0.5781471729278564, + "learning_rate": 1.3654554186009285e-06, + "loss": 1.9913, + "step": 26970 + }, + { + "epoch": 3.6214765100671142, + "grad_norm": 0.5228403806686401, + "learning_rate": 1.3622538818632945e-06, + "loss": 2.0105, + "step": 26980 + }, + { + "epoch": 3.6228187919463086, + "grad_norm": 0.5725923776626587, + "learning_rate": 1.3590523451256604e-06, + "loss": 2.0124, + "step": 26990 + }, + { + "epoch": 3.6241610738255035, + "grad_norm": 0.5426804423332214, + "learning_rate": 1.3558508083880265e-06, + "loss": 2.0344, + "step": 27000 + }, + { + "epoch": 3.6241610738255035, + "eval_loss": 2.0313711166381836, + "eval_runtime": 99.1271, + "eval_samples_per_second": 10.088, + "eval_steps_per_second": 5.044, + "step": 27000 + }, + { + "epoch": 3.625503355704698, + "grad_norm": 0.5412684082984924, + "learning_rate": 1.3526492716503922e-06, + "loss": 2.0272, + "step": 27010 + }, + { + "epoch": 3.6268456375838927, + "grad_norm": 0.5994455814361572, + "learning_rate": 1.3494477349127584e-06, + "loss": 2.0083, + "step": 27020 + }, + { + "epoch": 3.6281879194630875, + "grad_norm": 0.5473735928535461, + "learning_rate": 1.3462461981751243e-06, + "loss": 2.0363, + "step": 27030 + }, + { + "epoch": 3.629530201342282, + "grad_norm": 0.5572862029075623, + "learning_rate": 1.34304466143749e-06, + "loss": 1.9947, + "step": 27040 + }, + { + "epoch": 3.6308724832214763, + "grad_norm": 0.6075792908668518, + "learning_rate": 1.3398431246998561e-06, + "loss": 1.9904, + "step": 27050 + }, + { + "epoch": 3.632214765100671, + "grad_norm": 0.6825592517852783, + "learning_rate": 1.3366415879622218e-06, + "loss": 2.0219, + "step": 27060 + }, + { + "epoch": 3.633557046979866, + "grad_norm": 0.8327385783195496, + "learning_rate": 1.333440051224588e-06, + "loss": 2.0099, + "step": 27070 + }, + { + "epoch": 3.6348993288590603, + "grad_norm": 0.6000455021858215, + "learning_rate": 1.3302385144869539e-06, + "loss": 2.0201, + "step": 27080 + }, + { + "epoch": 3.636241610738255, + "grad_norm": 0.5002001523971558, + "learning_rate": 1.3270369777493198e-06, + "loss": 2.0008, + "step": 27090 + }, + { + "epoch": 3.6375838926174495, + "grad_norm": 0.5344942808151245, + "learning_rate": 1.3238354410116857e-06, + "loss": 2.0393, + "step": 27100 + }, + { + "epoch": 3.6375838926174495, + "eval_loss": 2.0311882495880127, + "eval_runtime": 99.2431, + "eval_samples_per_second": 10.076, + "eval_steps_per_second": 5.038, + "step": 27100 + }, + { + "epoch": 3.6389261744966444, + "grad_norm": 0.5355333685874939, + "learning_rate": 1.3206339042740514e-06, + "loss": 2.0083, + "step": 27110 + }, + { + "epoch": 3.640268456375839, + "grad_norm": 0.5520380735397339, + "learning_rate": 1.3174323675364176e-06, + "loss": 2.0107, + "step": 27120 + }, + { + "epoch": 3.6416107382550336, + "grad_norm": 0.4758807420730591, + "learning_rate": 1.3142308307987837e-06, + "loss": 2.011, + "step": 27130 + }, + { + "epoch": 3.642953020134228, + "grad_norm": 0.5604168176651001, + "learning_rate": 1.3110292940611494e-06, + "loss": 1.9704, + "step": 27140 + }, + { + "epoch": 3.6442953020134228, + "grad_norm": 0.4999609887599945, + "learning_rate": 1.3078277573235153e-06, + "loss": 1.9859, + "step": 27150 + }, + { + "epoch": 3.6456375838926176, + "grad_norm": 0.5146457552909851, + "learning_rate": 1.3046262205858815e-06, + "loss": 2.0297, + "step": 27160 + }, + { + "epoch": 3.646979865771812, + "grad_norm": 0.5436716079711914, + "learning_rate": 1.3014246838482472e-06, + "loss": 2.0202, + "step": 27170 + }, + { + "epoch": 3.648322147651007, + "grad_norm": 0.5256075263023376, + "learning_rate": 1.2982231471106133e-06, + "loss": 2.0319, + "step": 27180 + }, + { + "epoch": 3.649664429530201, + "grad_norm": 0.5203789472579956, + "learning_rate": 1.295021610372979e-06, + "loss": 2.0115, + "step": 27190 + }, + { + "epoch": 3.651006711409396, + "grad_norm": 0.5632807612419128, + "learning_rate": 1.2918200736353452e-06, + "loss": 2.0053, + "step": 27200 + }, + { + "epoch": 3.651006711409396, + "eval_loss": 2.0313007831573486, + "eval_runtime": 99.3281, + "eval_samples_per_second": 10.068, + "eval_steps_per_second": 5.034, + "step": 27200 + }, + { + "epoch": 3.652348993288591, + "grad_norm": 0.5029721856117249, + "learning_rate": 1.288618536897711e-06, + "loss": 2.0194, + "step": 27210 + }, + { + "epoch": 3.6536912751677852, + "grad_norm": 0.49627238512039185, + "learning_rate": 1.2854170001600768e-06, + "loss": 2.0026, + "step": 27220 + }, + { + "epoch": 3.6550335570469796, + "grad_norm": 0.5915135741233826, + "learning_rate": 1.282215463422443e-06, + "loss": 1.9974, + "step": 27230 + }, + { + "epoch": 3.6563758389261745, + "grad_norm": 0.5953642129898071, + "learning_rate": 1.2790139266848088e-06, + "loss": 2.0172, + "step": 27240 + }, + { + "epoch": 3.6577181208053693, + "grad_norm": 0.6336215734481812, + "learning_rate": 1.2758123899471748e-06, + "loss": 1.9817, + "step": 27250 + }, + { + "epoch": 3.6590604026845637, + "grad_norm": 0.5529477000236511, + "learning_rate": 1.2726108532095407e-06, + "loss": 1.9971, + "step": 27260 + }, + { + "epoch": 3.6604026845637585, + "grad_norm": 0.5943782329559326, + "learning_rate": 1.2694093164719066e-06, + "loss": 1.9931, + "step": 27270 + }, + { + "epoch": 3.661744966442953, + "grad_norm": 0.5412973761558533, + "learning_rate": 1.2662077797342725e-06, + "loss": 2.0315, + "step": 27280 + }, + { + "epoch": 3.6630872483221477, + "grad_norm": 0.5867758989334106, + "learning_rate": 1.2630062429966387e-06, + "loss": 1.9939, + "step": 27290 + }, + { + "epoch": 3.6644295302013425, + "grad_norm": 0.5666806697845459, + "learning_rate": 1.2598047062590044e-06, + "loss": 2.0057, + "step": 27300 + }, + { + "epoch": 3.6644295302013425, + "eval_loss": 2.0311965942382812, + "eval_runtime": 99.1017, + "eval_samples_per_second": 10.091, + "eval_steps_per_second": 5.045, + "step": 27300 + }, + { + "epoch": 3.665771812080537, + "grad_norm": 0.586299479007721, + "learning_rate": 1.2566031695213705e-06, + "loss": 1.9945, + "step": 27310 + }, + { + "epoch": 3.6671140939597313, + "grad_norm": 0.5768749713897705, + "learning_rate": 1.2534016327837362e-06, + "loss": 2.0298, + "step": 27320 + }, + { + "epoch": 3.668456375838926, + "grad_norm": 0.6095951795578003, + "learning_rate": 1.2502000960461021e-06, + "loss": 2.0274, + "step": 27330 + }, + { + "epoch": 3.669798657718121, + "grad_norm": 0.5211044549942017, + "learning_rate": 1.246998559308468e-06, + "loss": 2.0195, + "step": 27340 + }, + { + "epoch": 3.6711409395973154, + "grad_norm": 0.5351852774620056, + "learning_rate": 1.2437970225708342e-06, + "loss": 1.9969, + "step": 27350 + }, + { + "epoch": 3.67248322147651, + "grad_norm": 0.5246982574462891, + "learning_rate": 1.2405954858332001e-06, + "loss": 1.9887, + "step": 27360 + }, + { + "epoch": 3.6738255033557046, + "grad_norm": 0.5135153532028198, + "learning_rate": 1.237393949095566e-06, + "loss": 2.0068, + "step": 27370 + }, + { + "epoch": 3.6751677852348994, + "grad_norm": 0.5587418675422668, + "learning_rate": 1.234192412357932e-06, + "loss": 1.9971, + "step": 27380 + }, + { + "epoch": 3.6765100671140942, + "grad_norm": 0.4823431968688965, + "learning_rate": 1.2309908756202979e-06, + "loss": 2.026, + "step": 27390 + }, + { + "epoch": 3.6778523489932886, + "grad_norm": 0.5298497080802917, + "learning_rate": 1.2277893388826638e-06, + "loss": 2.0186, + "step": 27400 + }, + { + "epoch": 3.6778523489932886, + "eval_loss": 2.030952215194702, + "eval_runtime": 99.2174, + "eval_samples_per_second": 10.079, + "eval_steps_per_second": 5.039, + "step": 27400 + }, + { + "epoch": 3.679194630872483, + "grad_norm": 0.5753053426742554, + "learning_rate": 1.2245878021450297e-06, + "loss": 2.011, + "step": 27410 + }, + { + "epoch": 3.680536912751678, + "grad_norm": 0.5194422006607056, + "learning_rate": 1.2213862654073956e-06, + "loss": 2.0115, + "step": 27420 + }, + { + "epoch": 3.6818791946308727, + "grad_norm": 0.5648922920227051, + "learning_rate": 1.2181847286697616e-06, + "loss": 2.0051, + "step": 27430 + }, + { + "epoch": 3.683221476510067, + "grad_norm": 0.5649812817573547, + "learning_rate": 1.2149831919321275e-06, + "loss": 2.0234, + "step": 27440 + }, + { + "epoch": 3.684563758389262, + "grad_norm": 0.5129743218421936, + "learning_rate": 1.2117816551944934e-06, + "loss": 2.0035, + "step": 27450 + }, + { + "epoch": 3.6859060402684563, + "grad_norm": 0.5572558641433716, + "learning_rate": 1.2085801184568593e-06, + "loss": 2.0131, + "step": 27460 + }, + { + "epoch": 3.687248322147651, + "grad_norm": 0.5094574689865112, + "learning_rate": 1.2053785817192255e-06, + "loss": 1.9998, + "step": 27470 + }, + { + "epoch": 3.688590604026846, + "grad_norm": 0.6388832926750183, + "learning_rate": 1.2021770449815914e-06, + "loss": 1.9845, + "step": 27480 + }, + { + "epoch": 3.6899328859060403, + "grad_norm": 0.5876774191856384, + "learning_rate": 1.198975508243957e-06, + "loss": 2.0151, + "step": 27490 + }, + { + "epoch": 3.6912751677852347, + "grad_norm": 0.5422551035881042, + "learning_rate": 1.195773971506323e-06, + "loss": 2.0027, + "step": 27500 + }, + { + "epoch": 3.6912751677852347, + "eval_loss": 2.0309927463531494, + "eval_runtime": 99.2731, + "eval_samples_per_second": 10.073, + "eval_steps_per_second": 5.037, + "step": 27500 + }, + { + "epoch": 3.6926174496644295, + "grad_norm": 0.5409743189811707, + "learning_rate": 1.192572434768689e-06, + "loss": 2.0396, + "step": 27510 + }, + { + "epoch": 3.6939597315436243, + "grad_norm": 0.5549281239509583, + "learning_rate": 1.189370898031055e-06, + "loss": 2.011, + "step": 27520 + }, + { + "epoch": 3.6953020134228187, + "grad_norm": 0.5758858919143677, + "learning_rate": 1.186169361293421e-06, + "loss": 1.9923, + "step": 27530 + }, + { + "epoch": 3.6966442953020135, + "grad_norm": 0.5227702856063843, + "learning_rate": 1.182967824555787e-06, + "loss": 2.0063, + "step": 27540 + }, + { + "epoch": 3.697986577181208, + "grad_norm": 0.49413546919822693, + "learning_rate": 1.1797662878181528e-06, + "loss": 2.0086, + "step": 27550 + }, + { + "epoch": 3.6993288590604028, + "grad_norm": 0.6663057208061218, + "learning_rate": 1.1765647510805188e-06, + "loss": 1.9953, + "step": 27560 + }, + { + "epoch": 3.7006711409395976, + "grad_norm": 0.5428208112716675, + "learning_rate": 1.1733632143428847e-06, + "loss": 2.0421, + "step": 27570 + }, + { + "epoch": 3.702013422818792, + "grad_norm": 0.5529650449752808, + "learning_rate": 1.1701616776052506e-06, + "loss": 1.9915, + "step": 27580 + }, + { + "epoch": 3.7033557046979864, + "grad_norm": 0.5597283840179443, + "learning_rate": 1.1669601408676165e-06, + "loss": 2.0052, + "step": 27590 + }, + { + "epoch": 3.704697986577181, + "grad_norm": 0.5497382283210754, + "learning_rate": 1.1637586041299825e-06, + "loss": 1.9972, + "step": 27600 + }, + { + "epoch": 3.704697986577181, + "eval_loss": 2.0307700634002686, + "eval_runtime": 99.1358, + "eval_samples_per_second": 10.087, + "eval_steps_per_second": 5.044, + "step": 27600 + }, + { + "epoch": 3.706040268456376, + "grad_norm": 0.5258267521858215, + "learning_rate": 1.1605570673923484e-06, + "loss": 2.0057, + "step": 27610 + }, + { + "epoch": 3.7073825503355704, + "grad_norm": 0.5696093440055847, + "learning_rate": 1.1573555306547143e-06, + "loss": 1.9995, + "step": 27620 + }, + { + "epoch": 3.7087248322147652, + "grad_norm": 0.5222363471984863, + "learning_rate": 1.1541539939170802e-06, + "loss": 2.0124, + "step": 27630 + }, + { + "epoch": 3.7100671140939596, + "grad_norm": 0.517994225025177, + "learning_rate": 1.1509524571794464e-06, + "loss": 1.9919, + "step": 27640 + }, + { + "epoch": 3.7114093959731544, + "grad_norm": 0.5691993236541748, + "learning_rate": 1.1477509204418123e-06, + "loss": 1.9868, + "step": 27650 + }, + { + "epoch": 3.712751677852349, + "grad_norm": 0.5910984873771667, + "learning_rate": 1.1445493837041782e-06, + "loss": 1.9906, + "step": 27660 + }, + { + "epoch": 3.7140939597315437, + "grad_norm": 0.5375342965126038, + "learning_rate": 1.141347846966544e-06, + "loss": 1.9855, + "step": 27670 + }, + { + "epoch": 3.715436241610738, + "grad_norm": 0.5049592852592468, + "learning_rate": 1.13814631022891e-06, + "loss": 2.0079, + "step": 27680 + }, + { + "epoch": 3.716778523489933, + "grad_norm": 0.5630091428756714, + "learning_rate": 1.134944773491276e-06, + "loss": 2.024, + "step": 27690 + }, + { + "epoch": 3.7181208053691277, + "grad_norm": 0.524662971496582, + "learning_rate": 1.1317432367536419e-06, + "loss": 2.0066, + "step": 27700 + }, + { + "epoch": 3.7181208053691277, + "eval_loss": 2.0309386253356934, + "eval_runtime": 99.1082, + "eval_samples_per_second": 10.09, + "eval_steps_per_second": 5.045, + "step": 27700 + }, + { + "epoch": 3.719463087248322, + "grad_norm": 0.5959316492080688, + "learning_rate": 1.1285417000160078e-06, + "loss": 2.029, + "step": 27710 + }, + { + "epoch": 3.720805369127517, + "grad_norm": 0.5438879728317261, + "learning_rate": 1.1253401632783737e-06, + "loss": 2.0206, + "step": 27720 + }, + { + "epoch": 3.7221476510067113, + "grad_norm": 0.5142372846603394, + "learning_rate": 1.1221386265407396e-06, + "loss": 2.0049, + "step": 27730 + }, + { + "epoch": 3.723489932885906, + "grad_norm": 0.5625041723251343, + "learning_rate": 1.1189370898031056e-06, + "loss": 1.9966, + "step": 27740 + }, + { + "epoch": 3.7248322147651005, + "grad_norm": 0.6374936103820801, + "learning_rate": 1.1157355530654715e-06, + "loss": 2.0287, + "step": 27750 + }, + { + "epoch": 3.7261744966442953, + "grad_norm": 0.5368462204933167, + "learning_rate": 1.1125340163278374e-06, + "loss": 2.002, + "step": 27760 + }, + { + "epoch": 3.7275167785234897, + "grad_norm": 0.6157576441764832, + "learning_rate": 1.1093324795902033e-06, + "loss": 2.0182, + "step": 27770 + }, + { + "epoch": 3.7288590604026846, + "grad_norm": 0.5298171639442444, + "learning_rate": 1.1061309428525693e-06, + "loss": 2.0054, + "step": 27780 + }, + { + "epoch": 3.7302013422818794, + "grad_norm": 0.5143840312957764, + "learning_rate": 1.1029294061149352e-06, + "loss": 1.985, + "step": 27790 + }, + { + "epoch": 3.7315436241610738, + "grad_norm": 0.5257755517959595, + "learning_rate": 1.099727869377301e-06, + "loss": 2.0042, + "step": 27800 + }, + { + "epoch": 3.7315436241610738, + "eval_loss": 2.030616521835327, + "eval_runtime": 98.9775, + "eval_samples_per_second": 10.103, + "eval_steps_per_second": 5.052, + "step": 27800 + }, + { + "epoch": 3.7328859060402686, + "grad_norm": 0.5499988794326782, + "learning_rate": 1.0965263326396672e-06, + "loss": 1.9837, + "step": 27810 + }, + { + "epoch": 3.734228187919463, + "grad_norm": 0.5791357755661011, + "learning_rate": 1.0933247959020332e-06, + "loss": 2.0305, + "step": 27820 + }, + { + "epoch": 3.735570469798658, + "grad_norm": 0.5184398293495178, + "learning_rate": 1.090123259164399e-06, + "loss": 1.9948, + "step": 27830 + }, + { + "epoch": 3.736912751677852, + "grad_norm": 0.4910694360733032, + "learning_rate": 1.0869217224267648e-06, + "loss": 2.0057, + "step": 27840 + }, + { + "epoch": 3.738255033557047, + "grad_norm": 0.5194987058639526, + "learning_rate": 1.083720185689131e-06, + "loss": 2.0186, + "step": 27850 + }, + { + "epoch": 3.7395973154362414, + "grad_norm": 0.5067945122718811, + "learning_rate": 1.0805186489514968e-06, + "loss": 1.9798, + "step": 27860 + }, + { + "epoch": 3.7409395973154362, + "grad_norm": 0.5868769288063049, + "learning_rate": 1.0773171122138628e-06, + "loss": 1.9939, + "step": 27870 + }, + { + "epoch": 3.742281879194631, + "grad_norm": 0.540751039981842, + "learning_rate": 1.0741155754762287e-06, + "loss": 1.9762, + "step": 27880 + }, + { + "epoch": 3.7436241610738255, + "grad_norm": 0.5037314295768738, + "learning_rate": 1.0709140387385946e-06, + "loss": 2.0152, + "step": 27890 + }, + { + "epoch": 3.7449664429530203, + "grad_norm": 0.5138669610023499, + "learning_rate": 1.0677125020009605e-06, + "loss": 2.0025, + "step": 27900 + }, + { + "epoch": 3.7449664429530203, + "eval_loss": 2.0305252075195312, + "eval_runtime": 99.0138, + "eval_samples_per_second": 10.1, + "eval_steps_per_second": 5.05, + "step": 27900 + } + ], + "logging_steps": 10, + "max_steps": 31235, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.410926724972765e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}