{ "best_metric": 2.0305252075195312, "best_model_checkpoint": "/gpfs/radev/home/ap2853/scratch/c2s_models/train_c2s_full_1b/checkpoint-27900", "epoch": 3.7449664429530203, "eval_steps": 100, "global_step": 27900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006406406406406406, "grad_norm": 2.9401302337646484, "learning_rate": 9.99679671984112e-06, "loss": 3.9489, "step": 10 }, { "epoch": 0.0012812812812812813, "grad_norm": 2.2548134326934814, "learning_rate": 9.993593439682236e-06, "loss": 3.2882, "step": 20 }, { "epoch": 0.0019219219219219219, "grad_norm": 1.4946377277374268, "learning_rate": 9.990390159523353e-06, "loss": 2.9585, "step": 30 }, { "epoch": 0.0025625625625625625, "grad_norm": 1.859741449356079, "learning_rate": 9.98718687936447e-06, "loss": 2.7796, "step": 40 }, { "epoch": 0.0032032032032032033, "grad_norm": 1.7827876806259155, "learning_rate": 9.983983599205588e-06, "loss": 2.6482, "step": 50 }, { "epoch": 0.0038438438438438438, "grad_norm": 2.0380122661590576, "learning_rate": 9.980780319046704e-06, "loss": 2.5502, "step": 60 }, { "epoch": 0.004484484484484484, "grad_norm": 1.4964377880096436, "learning_rate": 9.977577038887823e-06, "loss": 2.4852, "step": 70 }, { "epoch": 0.005125125125125125, "grad_norm": 1.9251023530960083, "learning_rate": 9.97437375872894e-06, "loss": 2.463, "step": 80 }, { "epoch": 0.005765765765765766, "grad_norm": 1.8076153993606567, "learning_rate": 9.971170478570058e-06, "loss": 2.4314, "step": 90 }, { "epoch": 0.006406406406406407, "grad_norm": 1.5380029678344727, "learning_rate": 9.967967198411174e-06, "loss": 2.4312, "step": 100 }, { "epoch": 0.006406406406406407, "eval_loss": 2.4192681312561035, "eval_runtime": 99.5638, "eval_samples_per_second": 10.044, "eval_steps_per_second": 5.022, "step": 100 }, { "epoch": 0.007047047047047047, "grad_norm": 1.9866392612457275, "learning_rate": 9.964763918252291e-06, "loss": 2.394, "step": 110 }, { "epoch": 0.0076876876876876875, "grad_norm": 1.4123798608779907, "learning_rate": 9.961560638093408e-06, "loss": 2.4009, "step": 120 }, { "epoch": 0.008328328328328327, "grad_norm": 1.5345197916030884, "learning_rate": 9.958357357934526e-06, "loss": 2.3618, "step": 130 }, { "epoch": 0.008968968968968968, "grad_norm": 1.346252202987671, "learning_rate": 9.955154077775643e-06, "loss": 2.3489, "step": 140 }, { "epoch": 0.00960960960960961, "grad_norm": 1.498039722442627, "learning_rate": 9.951950797616761e-06, "loss": 2.3642, "step": 150 }, { "epoch": 0.01025025025025025, "grad_norm": 1.2046945095062256, "learning_rate": 9.948747517457878e-06, "loss": 2.3346, "step": 160 }, { "epoch": 0.01089089089089089, "grad_norm": 1.650439739227295, "learning_rate": 9.945544237298996e-06, "loss": 2.3319, "step": 170 }, { "epoch": 0.011531531531531532, "grad_norm": 1.819641351699829, "learning_rate": 9.942340957140113e-06, "loss": 2.294, "step": 180 }, { "epoch": 0.012172172172172173, "grad_norm": 1.5125728845596313, "learning_rate": 9.93913767698123e-06, "loss": 2.2876, "step": 190 }, { "epoch": 0.012812812812812813, "grad_norm": 1.2440060377120972, "learning_rate": 9.935934396822346e-06, "loss": 2.3208, "step": 200 }, { "epoch": 0.012812812812812813, "eval_loss": 2.3174383640289307, "eval_runtime": 98.9774, "eval_samples_per_second": 10.103, "eval_steps_per_second": 5.052, "step": 200 }, { "epoch": 0.013453453453453454, "grad_norm": 1.1538126468658447, "learning_rate": 9.932731116663464e-06, "loss": 2.3283, "step": 210 }, { "epoch": 0.014094094094094093, "grad_norm": 1.3118531703948975, "learning_rate": 9.929527836504581e-06, "loss": 2.2764, "step": 220 }, { "epoch": 0.014734734734734734, "grad_norm": 1.161333680152893, "learning_rate": 9.9263245563457e-06, "loss": 2.2471, "step": 230 }, { "epoch": 0.015375375375375375, "grad_norm": 1.5464813709259033, "learning_rate": 9.923121276186816e-06, "loss": 2.2699, "step": 240 }, { "epoch": 0.016016016016016016, "grad_norm": 1.2160066366195679, "learning_rate": 9.919917996027933e-06, "loss": 2.2721, "step": 250 }, { "epoch": 0.016656656656656655, "grad_norm": 1.4125703573226929, "learning_rate": 9.91671471586905e-06, "loss": 2.264, "step": 260 }, { "epoch": 0.017297297297297298, "grad_norm": 1.0305856466293335, "learning_rate": 9.913511435710168e-06, "loss": 2.2518, "step": 270 }, { "epoch": 0.017937937937937937, "grad_norm": 1.1297743320465088, "learning_rate": 9.910308155551284e-06, "loss": 2.297, "step": 280 }, { "epoch": 0.01857857857857858, "grad_norm": 1.3597170114517212, "learning_rate": 9.907104875392403e-06, "loss": 2.2957, "step": 290 }, { "epoch": 0.01921921921921922, "grad_norm": 1.4308130741119385, "learning_rate": 9.903901595233521e-06, "loss": 2.2373, "step": 300 }, { "epoch": 0.01921921921921922, "eval_loss": 2.2671942710876465, "eval_runtime": 99.1603, "eval_samples_per_second": 10.085, "eval_steps_per_second": 5.042, "step": 300 }, { "epoch": 0.01985985985985986, "grad_norm": 1.258988618850708, "learning_rate": 9.900698315074638e-06, "loss": 2.2485, "step": 310 }, { "epoch": 0.0205005005005005, "grad_norm": 1.1126744747161865, "learning_rate": 9.897495034915755e-06, "loss": 2.2211, "step": 320 }, { "epoch": 0.021141141141141143, "grad_norm": 1.2072731256484985, "learning_rate": 9.894291754756871e-06, "loss": 2.2268, "step": 330 }, { "epoch": 0.02178178178178178, "grad_norm": 1.6628332138061523, "learning_rate": 9.89108847459799e-06, "loss": 2.2493, "step": 340 }, { "epoch": 0.02242242242242242, "grad_norm": 1.5348600149154663, "learning_rate": 9.887885194439106e-06, "loss": 2.2501, "step": 350 }, { "epoch": 0.023063063063063063, "grad_norm": 1.034929633140564, "learning_rate": 9.884681914280225e-06, "loss": 2.231, "step": 360 }, { "epoch": 0.023703703703703703, "grad_norm": 0.9858831763267517, "learning_rate": 9.881478634121341e-06, "loss": 2.2372, "step": 370 }, { "epoch": 0.024344344344344345, "grad_norm": 0.974418580532074, "learning_rate": 9.878275353962458e-06, "loss": 2.228, "step": 380 }, { "epoch": 0.024984984984984984, "grad_norm": 1.2869054079055786, "learning_rate": 9.875072073803576e-06, "loss": 2.2242, "step": 390 }, { "epoch": 0.025625625625625627, "grad_norm": 1.260129690170288, "learning_rate": 9.871868793644693e-06, "loss": 2.2064, "step": 400 }, { "epoch": 0.025625625625625627, "eval_loss": 2.239616870880127, "eval_runtime": 98.8691, "eval_samples_per_second": 10.114, "eval_steps_per_second": 5.057, "step": 400 }, { "epoch": 0.026266266266266266, "grad_norm": 1.229314923286438, "learning_rate": 9.86866551348581e-06, "loss": 2.2123, "step": 410 }, { "epoch": 0.02690690690690691, "grad_norm": 1.0697575807571411, "learning_rate": 9.865462233326928e-06, "loss": 2.2388, "step": 420 }, { "epoch": 0.027547547547547548, "grad_norm": 1.1127859354019165, "learning_rate": 9.862258953168045e-06, "loss": 2.2439, "step": 430 }, { "epoch": 0.028188188188188187, "grad_norm": 1.0716787576675415, "learning_rate": 9.859055673009163e-06, "loss": 2.1971, "step": 440 }, { "epoch": 0.02882882882882883, "grad_norm": 1.22200608253479, "learning_rate": 9.85585239285028e-06, "loss": 2.2089, "step": 450 }, { "epoch": 0.02946946946946947, "grad_norm": 1.3846632242202759, "learning_rate": 9.852649112691396e-06, "loss": 2.2086, "step": 460 }, { "epoch": 0.03011011011011011, "grad_norm": 1.1426323652267456, "learning_rate": 9.849445832532513e-06, "loss": 2.2088, "step": 470 }, { "epoch": 0.03075075075075075, "grad_norm": 1.2188793420791626, "learning_rate": 9.846242552373631e-06, "loss": 2.1951, "step": 480 }, { "epoch": 0.03139139139139139, "grad_norm": 1.2974520921707153, "learning_rate": 9.843039272214748e-06, "loss": 2.232, "step": 490 }, { "epoch": 0.03203203203203203, "grad_norm": 0.9593750834465027, "learning_rate": 9.839835992055866e-06, "loss": 2.2033, "step": 500 }, { "epoch": 0.03203203203203203, "eval_loss": 2.2161059379577637, "eval_runtime": 99.0548, "eval_samples_per_second": 10.095, "eval_steps_per_second": 5.048, "step": 500 }, { "epoch": 0.032672672672672674, "grad_norm": 1.2013003826141357, "learning_rate": 9.836632711896983e-06, "loss": 2.2249, "step": 510 }, { "epoch": 0.03331331331331331, "grad_norm": 1.036780595779419, "learning_rate": 9.833429431738101e-06, "loss": 2.2022, "step": 520 }, { "epoch": 0.03395395395395395, "grad_norm": 0.9950962066650391, "learning_rate": 9.830226151579218e-06, "loss": 2.1838, "step": 530 }, { "epoch": 0.034594594594594595, "grad_norm": 0.905576229095459, "learning_rate": 9.827022871420335e-06, "loss": 2.1785, "step": 540 }, { "epoch": 0.03523523523523524, "grad_norm": 0.9075835347175598, "learning_rate": 9.823819591261451e-06, "loss": 2.1802, "step": 550 }, { "epoch": 0.03587587587587587, "grad_norm": 1.1423232555389404, "learning_rate": 9.82061631110257e-06, "loss": 2.216, "step": 560 }, { "epoch": 0.036516516516516516, "grad_norm": 0.9267427921295166, "learning_rate": 9.817413030943686e-06, "loss": 2.1745, "step": 570 }, { "epoch": 0.03715715715715716, "grad_norm": 1.0812605619430542, "learning_rate": 9.814209750784805e-06, "loss": 2.1822, "step": 580 }, { "epoch": 0.0377977977977978, "grad_norm": 0.9606761932373047, "learning_rate": 9.811006470625921e-06, "loss": 2.2007, "step": 590 }, { "epoch": 0.03843843843843844, "grad_norm": 1.1749407052993774, "learning_rate": 9.807803190467038e-06, "loss": 2.1762, "step": 600 }, { "epoch": 0.03843843843843844, "eval_loss": 2.2011826038360596, "eval_runtime": 99.0889, "eval_samples_per_second": 10.092, "eval_steps_per_second": 5.046, "step": 600 }, { "epoch": 0.03907907907907908, "grad_norm": 1.5855426788330078, "learning_rate": 9.804599910308157e-06, "loss": 2.1653, "step": 610 }, { "epoch": 0.03971971971971972, "grad_norm": 1.0931143760681152, "learning_rate": 9.801396630149273e-06, "loss": 2.1957, "step": 620 }, { "epoch": 0.04036036036036036, "grad_norm": 0.8430980443954468, "learning_rate": 9.798193349990392e-06, "loss": 2.1894, "step": 630 }, { "epoch": 0.041001001001001, "grad_norm": 0.9702726602554321, "learning_rate": 9.794990069831508e-06, "loss": 2.1907, "step": 640 }, { "epoch": 0.04164164164164164, "grad_norm": 1.0787761211395264, "learning_rate": 9.791786789672627e-06, "loss": 2.1922, "step": 650 }, { "epoch": 0.042282282282282285, "grad_norm": 0.9241836071014404, "learning_rate": 9.788583509513743e-06, "loss": 2.1866, "step": 660 }, { "epoch": 0.04292292292292292, "grad_norm": 0.8241314888000488, "learning_rate": 9.78538022935486e-06, "loss": 2.1769, "step": 670 }, { "epoch": 0.04356356356356356, "grad_norm": 0.7783017158508301, "learning_rate": 9.782176949195977e-06, "loss": 2.1526, "step": 680 }, { "epoch": 0.044204204204204206, "grad_norm": 0.8009265065193176, "learning_rate": 9.778973669037095e-06, "loss": 2.1566, "step": 690 }, { "epoch": 0.04484484484484484, "grad_norm": 0.8154035806655884, "learning_rate": 9.775770388878212e-06, "loss": 2.1745, "step": 700 }, { "epoch": 0.04484484484484484, "eval_loss": 2.188918352127075, "eval_runtime": 98.963, "eval_samples_per_second": 10.105, "eval_steps_per_second": 5.052, "step": 700 }, { "epoch": 0.045485485485485484, "grad_norm": 1.1314467191696167, "learning_rate": 9.77256710871933e-06, "loss": 2.1939, "step": 710 }, { "epoch": 0.04612612612612613, "grad_norm": 0.9179493188858032, "learning_rate": 9.769363828560447e-06, "loss": 2.1498, "step": 720 }, { "epoch": 0.04676676676676677, "grad_norm": 0.8480121493339539, "learning_rate": 9.766160548401565e-06, "loss": 2.1302, "step": 730 }, { "epoch": 0.047407407407407405, "grad_norm": 1.093794822692871, "learning_rate": 9.762957268242682e-06, "loss": 2.1542, "step": 740 }, { "epoch": 0.04804804804804805, "grad_norm": 1.032202124595642, "learning_rate": 9.759753988083798e-06, "loss": 2.1745, "step": 750 }, { "epoch": 0.04868868868868869, "grad_norm": 0.754345715045929, "learning_rate": 9.756550707924915e-06, "loss": 2.1663, "step": 760 }, { "epoch": 0.04932932932932933, "grad_norm": 0.7865972518920898, "learning_rate": 9.753347427766033e-06, "loss": 2.192, "step": 770 }, { "epoch": 0.04996996996996997, "grad_norm": 0.9923487901687622, "learning_rate": 9.75014414760715e-06, "loss": 2.1489, "step": 780 }, { "epoch": 0.05061061061061061, "grad_norm": 0.9781584143638611, "learning_rate": 9.746940867448268e-06, "loss": 2.141, "step": 790 }, { "epoch": 0.051251251251251254, "grad_norm": 0.7666671276092529, "learning_rate": 9.743737587289385e-06, "loss": 2.1452, "step": 800 }, { "epoch": 0.051251251251251254, "eval_loss": 2.1780078411102295, "eval_runtime": 99.0698, "eval_samples_per_second": 10.094, "eval_steps_per_second": 5.047, "step": 800 }, { "epoch": 0.05189189189189189, "grad_norm": 0.8632680177688599, "learning_rate": 9.740534307130502e-06, "loss": 2.1417, "step": 810 }, { "epoch": 0.05253253253253253, "grad_norm": 0.8318341374397278, "learning_rate": 9.73733102697162e-06, "loss": 2.1557, "step": 820 }, { "epoch": 0.053173173173173174, "grad_norm": 0.7503217458724976, "learning_rate": 9.734127746812737e-06, "loss": 2.141, "step": 830 }, { "epoch": 0.05381381381381382, "grad_norm": 0.7540414929389954, "learning_rate": 9.730924466653853e-06, "loss": 2.1537, "step": 840 }, { "epoch": 0.05445445445445445, "grad_norm": 0.8077926635742188, "learning_rate": 9.727721186494972e-06, "loss": 2.1571, "step": 850 }, { "epoch": 0.055095095095095095, "grad_norm": 0.7664414048194885, "learning_rate": 9.72451790633609e-06, "loss": 2.1454, "step": 860 }, { "epoch": 0.05573573573573574, "grad_norm": 0.7282930016517639, "learning_rate": 9.721314626177207e-06, "loss": 2.1908, "step": 870 }, { "epoch": 0.05637637637637637, "grad_norm": 0.9039175510406494, "learning_rate": 9.718111346018323e-06, "loss": 2.1392, "step": 880 }, { "epoch": 0.057017017017017016, "grad_norm": 0.8304852247238159, "learning_rate": 9.71490806585944e-06, "loss": 2.1635, "step": 890 }, { "epoch": 0.05765765765765766, "grad_norm": 0.8299368619918823, "learning_rate": 9.711704785700559e-06, "loss": 2.1533, "step": 900 }, { "epoch": 0.05765765765765766, "eval_loss": 2.1680660247802734, "eval_runtime": 98.9216, "eval_samples_per_second": 10.109, "eval_steps_per_second": 5.055, "step": 900 }, { "epoch": 0.0582982982982983, "grad_norm": 0.8337482213973999, "learning_rate": 9.708501505541675e-06, "loss": 2.1754, "step": 910 }, { "epoch": 0.05893893893893894, "grad_norm": 0.9944339990615845, "learning_rate": 9.705298225382794e-06, "loss": 2.1647, "step": 920 }, { "epoch": 0.05957957957957958, "grad_norm": 0.7565041780471802, "learning_rate": 9.70209494522391e-06, "loss": 2.1799, "step": 930 }, { "epoch": 0.06022022022022022, "grad_norm": 0.7579349279403687, "learning_rate": 9.698891665065029e-06, "loss": 2.1281, "step": 940 }, { "epoch": 0.06086086086086086, "grad_norm": 0.8098254203796387, "learning_rate": 9.695688384906145e-06, "loss": 2.1129, "step": 950 }, { "epoch": 0.0615015015015015, "grad_norm": 1.1852097511291504, "learning_rate": 9.692485104747262e-06, "loss": 2.1336, "step": 960 }, { "epoch": 0.06214214214214214, "grad_norm": 0.8178196549415588, "learning_rate": 9.689281824588379e-06, "loss": 2.1323, "step": 970 }, { "epoch": 0.06278278278278278, "grad_norm": 0.8084219098091125, "learning_rate": 9.686078544429497e-06, "loss": 2.1416, "step": 980 }, { "epoch": 0.06342342342342343, "grad_norm": 0.9296208024024963, "learning_rate": 9.682875264270614e-06, "loss": 2.1364, "step": 990 }, { "epoch": 0.06406406406406406, "grad_norm": 0.8570599555969238, "learning_rate": 9.679671984111732e-06, "loss": 2.1607, "step": 1000 }, { "epoch": 0.06406406406406406, "eval_loss": 2.1602447032928467, "eval_runtime": 98.8457, "eval_samples_per_second": 10.117, "eval_steps_per_second": 5.058, "step": 1000 }, { "epoch": 0.0647047047047047, "grad_norm": 1.0591017007827759, "learning_rate": 9.676468703952849e-06, "loss": 2.1344, "step": 1010 }, { "epoch": 0.06534534534534535, "grad_norm": 0.9065935611724854, "learning_rate": 9.673265423793965e-06, "loss": 2.1669, "step": 1020 }, { "epoch": 0.06598598598598598, "grad_norm": 0.7907170653343201, "learning_rate": 9.670062143635082e-06, "loss": 2.1159, "step": 1030 }, { "epoch": 0.06662662662662662, "grad_norm": 0.8142951726913452, "learning_rate": 9.6668588634762e-06, "loss": 2.1436, "step": 1040 }, { "epoch": 0.06726726726726727, "grad_norm": 0.8167764544487, "learning_rate": 9.663655583317317e-06, "loss": 2.1569, "step": 1050 }, { "epoch": 0.0679079079079079, "grad_norm": 0.8406487703323364, "learning_rate": 9.660452303158435e-06, "loss": 2.136, "step": 1060 }, { "epoch": 0.06854854854854855, "grad_norm": 0.8966623544692993, "learning_rate": 9.657249022999552e-06, "loss": 2.1554, "step": 1070 }, { "epoch": 0.06918918918918919, "grad_norm": 0.7379564642906189, "learning_rate": 9.65404574284067e-06, "loss": 2.1528, "step": 1080 }, { "epoch": 0.06982982982982983, "grad_norm": 0.7592926621437073, "learning_rate": 9.650842462681787e-06, "loss": 2.1438, "step": 1090 }, { "epoch": 0.07047047047047048, "grad_norm": 0.7466913461685181, "learning_rate": 9.647639182522904e-06, "loss": 2.1421, "step": 1100 }, { "epoch": 0.07047047047047048, "eval_loss": 2.1529006958007812, "eval_runtime": 98.9445, "eval_samples_per_second": 10.107, "eval_steps_per_second": 5.053, "step": 1100 }, { "epoch": 0.07111111111111111, "grad_norm": 0.8553027510643005, "learning_rate": 9.64443590236402e-06, "loss": 2.1415, "step": 1110 }, { "epoch": 0.07175175175175175, "grad_norm": 0.7739918231964111, "learning_rate": 9.641232622205139e-06, "loss": 2.1368, "step": 1120 }, { "epoch": 0.0723923923923924, "grad_norm": 0.7692018747329712, "learning_rate": 9.638029342046255e-06, "loss": 2.1217, "step": 1130 }, { "epoch": 0.07303303303303303, "grad_norm": 0.7803593873977661, "learning_rate": 9.634826061887374e-06, "loss": 2.1641, "step": 1140 }, { "epoch": 0.07367367367367367, "grad_norm": 0.7034474015235901, "learning_rate": 9.63162278172849e-06, "loss": 2.1358, "step": 1150 }, { "epoch": 0.07431431431431432, "grad_norm": 0.9745317697525024, "learning_rate": 9.628419501569609e-06, "loss": 2.1226, "step": 1160 }, { "epoch": 0.07495495495495495, "grad_norm": 1.0803890228271484, "learning_rate": 9.625216221410725e-06, "loss": 2.1216, "step": 1170 }, { "epoch": 0.0755955955955956, "grad_norm": 0.8124716281890869, "learning_rate": 9.622012941251842e-06, "loss": 2.1258, "step": 1180 }, { "epoch": 0.07623623623623624, "grad_norm": 0.9271863698959351, "learning_rate": 9.61880966109296e-06, "loss": 2.1545, "step": 1190 }, { "epoch": 0.07687687687687687, "grad_norm": 0.7706722617149353, "learning_rate": 9.615606380934077e-06, "loss": 2.1358, "step": 1200 }, { "epoch": 0.07687687687687687, "eval_loss": 2.1485989093780518, "eval_runtime": 99.2209, "eval_samples_per_second": 10.079, "eval_steps_per_second": 5.039, "step": 1200 }, { "epoch": 0.07751751751751752, "grad_norm": 0.8027122616767883, "learning_rate": 9.612403100775196e-06, "loss": 2.1441, "step": 1210 }, { "epoch": 0.07815815815815816, "grad_norm": 0.7750908136367798, "learning_rate": 9.609199820616312e-06, "loss": 2.116, "step": 1220 }, { "epoch": 0.0787987987987988, "grad_norm": 0.6906116604804993, "learning_rate": 9.605996540457429e-06, "loss": 2.1289, "step": 1230 }, { "epoch": 0.07943943943943944, "grad_norm": 0.8581239581108093, "learning_rate": 9.602793260298546e-06, "loss": 2.1343, "step": 1240 }, { "epoch": 0.08008008008008008, "grad_norm": 1.026809573173523, "learning_rate": 9.599589980139664e-06, "loss": 2.1307, "step": 1250 }, { "epoch": 0.08072072072072072, "grad_norm": 0.7176423072814941, "learning_rate": 9.59638669998078e-06, "loss": 2.115, "step": 1260 }, { "epoch": 0.08136136136136136, "grad_norm": 0.7856579422950745, "learning_rate": 9.593183419821899e-06, "loss": 2.1171, "step": 1270 }, { "epoch": 0.082002002002002, "grad_norm": 0.8396461606025696, "learning_rate": 9.589980139663016e-06, "loss": 2.0997, "step": 1280 }, { "epoch": 0.08264264264264264, "grad_norm": 0.6931939721107483, "learning_rate": 9.586776859504134e-06, "loss": 2.1171, "step": 1290 }, { "epoch": 0.08328328328328329, "grad_norm": 0.6357343792915344, "learning_rate": 9.58357357934525e-06, "loss": 2.141, "step": 1300 }, { "epoch": 0.08328328328328329, "eval_loss": 2.1414124965667725, "eval_runtime": 98.9922, "eval_samples_per_second": 10.102, "eval_steps_per_second": 5.051, "step": 1300 }, { "epoch": 0.08392392392392392, "grad_norm": 0.674089789390564, "learning_rate": 9.580370299186367e-06, "loss": 2.1417, "step": 1310 }, { "epoch": 0.08456456456456457, "grad_norm": 0.8322325348854065, "learning_rate": 9.577167019027484e-06, "loss": 2.1386, "step": 1320 }, { "epoch": 0.0852052052052052, "grad_norm": 1.3690531253814697, "learning_rate": 9.573963738868602e-06, "loss": 2.1126, "step": 1330 }, { "epoch": 0.08584584584584584, "grad_norm": 0.8282061219215393, "learning_rate": 9.570760458709719e-06, "loss": 2.1163, "step": 1340 }, { "epoch": 0.08648648648648649, "grad_norm": 1.1951463222503662, "learning_rate": 9.567557178550837e-06, "loss": 2.1205, "step": 1350 }, { "epoch": 0.08712712712712713, "grad_norm": 0.8390799760818481, "learning_rate": 9.564353898391954e-06, "loss": 2.114, "step": 1360 }, { "epoch": 0.08776776776776776, "grad_norm": 0.801324725151062, "learning_rate": 9.561150618233072e-06, "loss": 2.1303, "step": 1370 }, { "epoch": 0.08840840840840841, "grad_norm": 0.8951178789138794, "learning_rate": 9.557947338074189e-06, "loss": 2.1207, "step": 1380 }, { "epoch": 0.08904904904904905, "grad_norm": 1.085943579673767, "learning_rate": 9.554744057915306e-06, "loss": 2.1105, "step": 1390 }, { "epoch": 0.08968968968968968, "grad_norm": 0.7810286283493042, "learning_rate": 9.551540777756422e-06, "loss": 2.1337, "step": 1400 }, { "epoch": 0.08968968968968968, "eval_loss": 2.137179136276245, "eval_runtime": 98.8962, "eval_samples_per_second": 10.112, "eval_steps_per_second": 5.056, "step": 1400 }, { "epoch": 0.09033033033033033, "grad_norm": 0.6787813901901245, "learning_rate": 9.54833749759754e-06, "loss": 2.1192, "step": 1410 }, { "epoch": 0.09097097097097097, "grad_norm": 0.8344500660896301, "learning_rate": 9.545134217438657e-06, "loss": 2.1101, "step": 1420 }, { "epoch": 0.09161161161161162, "grad_norm": 0.7459837198257446, "learning_rate": 9.541930937279776e-06, "loss": 2.123, "step": 1430 }, { "epoch": 0.09225225225225225, "grad_norm": 0.6521297097206116, "learning_rate": 9.538727657120892e-06, "loss": 2.0906, "step": 1440 }, { "epoch": 0.09289289289289289, "grad_norm": 0.6362139582633972, "learning_rate": 9.535524376962009e-06, "loss": 2.1154, "step": 1450 }, { "epoch": 0.09353353353353354, "grad_norm": 0.6726361513137817, "learning_rate": 9.532321096803127e-06, "loss": 2.1368, "step": 1460 }, { "epoch": 0.09417417417417417, "grad_norm": 0.9578641653060913, "learning_rate": 9.529117816644244e-06, "loss": 2.1159, "step": 1470 }, { "epoch": 0.09481481481481481, "grad_norm": 0.8151896595954895, "learning_rate": 9.525914536485363e-06, "loss": 2.1405, "step": 1480 }, { "epoch": 0.09545545545545546, "grad_norm": 0.694631814956665, "learning_rate": 9.52271125632648e-06, "loss": 2.1126, "step": 1490 }, { "epoch": 0.0960960960960961, "grad_norm": 0.7634385824203491, "learning_rate": 9.519507976167598e-06, "loss": 2.1297, "step": 1500 }, { "epoch": 0.0960960960960961, "eval_loss": 2.132540464401245, "eval_runtime": 98.7552, "eval_samples_per_second": 10.126, "eval_steps_per_second": 5.063, "step": 1500 }, { "epoch": 0.09673673673673673, "grad_norm": 0.7835580706596375, "learning_rate": 9.516304696008714e-06, "loss": 2.1314, "step": 1510 }, { "epoch": 0.09737737737737738, "grad_norm": 0.7323389649391174, "learning_rate": 9.513101415849831e-06, "loss": 2.1396, "step": 1520 }, { "epoch": 0.09801801801801802, "grad_norm": 0.8194797039031982, "learning_rate": 9.509898135690948e-06, "loss": 2.0894, "step": 1530 }, { "epoch": 0.09865865865865867, "grad_norm": 0.734104335308075, "learning_rate": 9.506694855532066e-06, "loss": 2.112, "step": 1540 }, { "epoch": 0.0992992992992993, "grad_norm": 0.8642280101776123, "learning_rate": 9.503491575373183e-06, "loss": 2.1098, "step": 1550 }, { "epoch": 0.09993993993993994, "grad_norm": 0.7576484084129333, "learning_rate": 9.500288295214301e-06, "loss": 2.1053, "step": 1560 }, { "epoch": 0.10058058058058059, "grad_norm": 0.6890721321105957, "learning_rate": 9.497085015055418e-06, "loss": 2.0883, "step": 1570 }, { "epoch": 0.10122122122122122, "grad_norm": 0.6535369753837585, "learning_rate": 9.493881734896534e-06, "loss": 2.1248, "step": 1580 }, { "epoch": 0.10186186186186186, "grad_norm": 0.7473631501197815, "learning_rate": 9.490678454737653e-06, "loss": 2.0969, "step": 1590 }, { "epoch": 0.10250250250250251, "grad_norm": 0.6895433664321899, "learning_rate": 9.48747517457877e-06, "loss": 2.1226, "step": 1600 }, { "epoch": 0.10250250250250251, "eval_loss": 2.128535747528076, "eval_runtime": 99.166, "eval_samples_per_second": 10.084, "eval_steps_per_second": 5.042, "step": 1600 }, { "epoch": 0.10314314314314314, "grad_norm": 0.7126756310462952, "learning_rate": 9.484271894419886e-06, "loss": 2.09, "step": 1610 }, { "epoch": 0.10378378378378378, "grad_norm": 0.7647325992584229, "learning_rate": 9.481068614261004e-06, "loss": 2.1306, "step": 1620 }, { "epoch": 0.10442442442442443, "grad_norm": 0.7653983235359192, "learning_rate": 9.477865334102121e-06, "loss": 2.111, "step": 1630 }, { "epoch": 0.10506506506506506, "grad_norm": 0.7532052993774414, "learning_rate": 9.47466205394324e-06, "loss": 2.1125, "step": 1640 }, { "epoch": 0.1057057057057057, "grad_norm": 0.8752217888832092, "learning_rate": 9.471458773784356e-06, "loss": 2.1218, "step": 1650 }, { "epoch": 0.10634634634634635, "grad_norm": 0.7363693118095398, "learning_rate": 9.468255493625473e-06, "loss": 2.0789, "step": 1660 }, { "epoch": 0.10698698698698698, "grad_norm": 0.7645154595375061, "learning_rate": 9.46505221346659e-06, "loss": 2.1146, "step": 1670 }, { "epoch": 0.10762762762762763, "grad_norm": 0.7656025886535645, "learning_rate": 9.461848933307708e-06, "loss": 2.1247, "step": 1680 }, { "epoch": 0.10826826826826827, "grad_norm": 0.893290102481842, "learning_rate": 9.458645653148824e-06, "loss": 2.1189, "step": 1690 }, { "epoch": 0.1089089089089089, "grad_norm": 0.6659897565841675, "learning_rate": 9.455442372989943e-06, "loss": 2.0933, "step": 1700 }, { "epoch": 0.1089089089089089, "eval_loss": 2.125424861907959, "eval_runtime": 98.9268, "eval_samples_per_second": 10.108, "eval_steps_per_second": 5.054, "step": 1700 }, { "epoch": 0.10954954954954955, "grad_norm": 0.6630749106407166, "learning_rate": 9.45223909283106e-06, "loss": 2.1147, "step": 1710 }, { "epoch": 0.11019019019019019, "grad_norm": 0.7636531591415405, "learning_rate": 9.449035812672178e-06, "loss": 2.1104, "step": 1720 }, { "epoch": 0.11083083083083083, "grad_norm": 0.6244649887084961, "learning_rate": 9.445832532513294e-06, "loss": 2.0957, "step": 1730 }, { "epoch": 0.11147147147147148, "grad_norm": 0.8571925163269043, "learning_rate": 9.442629252354411e-06, "loss": 2.1127, "step": 1740 }, { "epoch": 0.11211211211211211, "grad_norm": 0.6850053071975708, "learning_rate": 9.43942597219553e-06, "loss": 2.1045, "step": 1750 }, { "epoch": 0.11275275275275275, "grad_norm": 0.716699481010437, "learning_rate": 9.436222692036646e-06, "loss": 2.0728, "step": 1760 }, { "epoch": 0.1133933933933934, "grad_norm": 0.742249071598053, "learning_rate": 9.433019411877765e-06, "loss": 2.0918, "step": 1770 }, { "epoch": 0.11403403403403403, "grad_norm": 0.6848533749580383, "learning_rate": 9.429816131718881e-06, "loss": 2.0769, "step": 1780 }, { "epoch": 0.11467467467467468, "grad_norm": 0.8706116080284119, "learning_rate": 9.426612851559998e-06, "loss": 2.1278, "step": 1790 }, { "epoch": 0.11531531531531532, "grad_norm": 0.7491025328636169, "learning_rate": 9.423409571401115e-06, "loss": 2.1126, "step": 1800 }, { "epoch": 0.11531531531531532, "eval_loss": 2.121605157852173, "eval_runtime": 98.8478, "eval_samples_per_second": 10.117, "eval_steps_per_second": 5.058, "step": 1800 }, { "epoch": 0.11595595595595595, "grad_norm": 0.8098132014274597, "learning_rate": 9.420206291242233e-06, "loss": 2.1015, "step": 1810 }, { "epoch": 0.1165965965965966, "grad_norm": 0.9224849343299866, "learning_rate": 9.41700301108335e-06, "loss": 2.1215, "step": 1820 }, { "epoch": 0.11723723723723724, "grad_norm": 0.6793172359466553, "learning_rate": 9.413799730924468e-06, "loss": 2.1115, "step": 1830 }, { "epoch": 0.11787787787787787, "grad_norm": 0.7940657138824463, "learning_rate": 9.410596450765585e-06, "loss": 2.0879, "step": 1840 }, { "epoch": 0.11851851851851852, "grad_norm": 0.947806715965271, "learning_rate": 9.407393170606703e-06, "loss": 2.1014, "step": 1850 }, { "epoch": 0.11915915915915916, "grad_norm": 0.7339703440666199, "learning_rate": 9.40418989044782e-06, "loss": 2.1189, "step": 1860 }, { "epoch": 0.1197997997997998, "grad_norm": 0.9161699414253235, "learning_rate": 9.400986610288936e-06, "loss": 2.0974, "step": 1870 }, { "epoch": 0.12044044044044044, "grad_norm": 0.6893654465675354, "learning_rate": 9.397783330130053e-06, "loss": 2.096, "step": 1880 }, { "epoch": 0.12108108108108108, "grad_norm": 0.6452145576477051, "learning_rate": 9.394580049971171e-06, "loss": 2.1264, "step": 1890 }, { "epoch": 0.12172172172172172, "grad_norm": 0.8689838647842407, "learning_rate": 9.391376769812288e-06, "loss": 2.0991, "step": 1900 }, { "epoch": 0.12172172172172172, "eval_loss": 2.120067834854126, "eval_runtime": 98.9693, "eval_samples_per_second": 10.104, "eval_steps_per_second": 5.052, "step": 1900 }, { "epoch": 0.12236236236236236, "grad_norm": 0.9230114221572876, "learning_rate": 9.388173489653406e-06, "loss": 2.1093, "step": 1910 }, { "epoch": 0.123003003003003, "grad_norm": 0.8922761678695679, "learning_rate": 9.384970209494523e-06, "loss": 2.1015, "step": 1920 }, { "epoch": 0.12364364364364365, "grad_norm": 0.7505314350128174, "learning_rate": 9.381766929335641e-06, "loss": 2.1449, "step": 1930 }, { "epoch": 0.12428428428428429, "grad_norm": 0.7936354279518127, "learning_rate": 9.378563649176758e-06, "loss": 2.1323, "step": 1940 }, { "epoch": 0.12492492492492492, "grad_norm": 0.8380959033966064, "learning_rate": 9.375360369017875e-06, "loss": 2.1223, "step": 1950 }, { "epoch": 0.12556556556556556, "grad_norm": 0.7633472084999084, "learning_rate": 9.372157088858991e-06, "loss": 2.1203, "step": 1960 }, { "epoch": 0.1262062062062062, "grad_norm": 0.6278607249259949, "learning_rate": 9.36895380870011e-06, "loss": 2.0937, "step": 1970 }, { "epoch": 0.12684684684684686, "grad_norm": 0.7140986323356628, "learning_rate": 9.365750528541226e-06, "loss": 2.118, "step": 1980 }, { "epoch": 0.12748748748748748, "grad_norm": 0.8503913283348083, "learning_rate": 9.362547248382345e-06, "loss": 2.1064, "step": 1990 }, { "epoch": 0.12812812812812813, "grad_norm": 0.8607761263847351, "learning_rate": 9.359343968223461e-06, "loss": 2.1054, "step": 2000 }, { "epoch": 0.12812812812812813, "eval_loss": 2.116664171218872, "eval_runtime": 99.0609, "eval_samples_per_second": 10.095, "eval_steps_per_second": 5.047, "step": 2000 }, { "epoch": 0.12876876876876878, "grad_norm": 0.810498833656311, "learning_rate": 9.356140688064578e-06, "loss": 2.1215, "step": 2010 }, { "epoch": 0.1294094094094094, "grad_norm": 0.5971576571464539, "learning_rate": 9.352937407905696e-06, "loss": 2.0815, "step": 2020 }, { "epoch": 0.13005005005005005, "grad_norm": 0.7978019714355469, "learning_rate": 9.349734127746813e-06, "loss": 2.0826, "step": 2030 }, { "epoch": 0.1306906906906907, "grad_norm": 0.7856429219245911, "learning_rate": 9.346530847587931e-06, "loss": 2.1178, "step": 2040 }, { "epoch": 0.13133133133133132, "grad_norm": 0.610020101070404, "learning_rate": 9.343327567429048e-06, "loss": 2.0959, "step": 2050 }, { "epoch": 0.13197197197197197, "grad_norm": 0.8602128028869629, "learning_rate": 9.340124287270167e-06, "loss": 2.0855, "step": 2060 }, { "epoch": 0.13261261261261262, "grad_norm": 0.7461118698120117, "learning_rate": 9.336921007111283e-06, "loss": 2.1051, "step": 2070 }, { "epoch": 0.13325325325325324, "grad_norm": 0.6008646488189697, "learning_rate": 9.3337177269524e-06, "loss": 2.1097, "step": 2080 }, { "epoch": 0.1338938938938939, "grad_norm": 0.7542864084243774, "learning_rate": 9.330514446793517e-06, "loss": 2.1057, "step": 2090 }, { "epoch": 0.13453453453453454, "grad_norm": 0.6587682962417603, "learning_rate": 9.327311166634635e-06, "loss": 2.0717, "step": 2100 }, { "epoch": 0.13453453453453454, "eval_loss": 2.113861322402954, "eval_runtime": 98.8506, "eval_samples_per_second": 10.116, "eval_steps_per_second": 5.058, "step": 2100 }, { "epoch": 0.1351751751751752, "grad_norm": 0.6886266469955444, "learning_rate": 9.324107886475752e-06, "loss": 2.1092, "step": 2110 }, { "epoch": 0.1358158158158158, "grad_norm": 0.6924049854278564, "learning_rate": 9.32090460631687e-06, "loss": 2.0743, "step": 2120 }, { "epoch": 0.13645645645645646, "grad_norm": 0.6305820941925049, "learning_rate": 9.317701326157987e-06, "loss": 2.1235, "step": 2130 }, { "epoch": 0.1370970970970971, "grad_norm": 0.7048687934875488, "learning_rate": 9.314498045999105e-06, "loss": 2.0817, "step": 2140 }, { "epoch": 0.13773773773773773, "grad_norm": 0.7016113996505737, "learning_rate": 9.311294765840222e-06, "loss": 2.0788, "step": 2150 }, { "epoch": 0.13837837837837838, "grad_norm": 0.6380958557128906, "learning_rate": 9.308091485681338e-06, "loss": 2.0981, "step": 2160 }, { "epoch": 0.13901901901901903, "grad_norm": 0.881716787815094, "learning_rate": 9.304888205522455e-06, "loss": 2.1062, "step": 2170 }, { "epoch": 0.13965965965965965, "grad_norm": 0.6827107667922974, "learning_rate": 9.301684925363573e-06, "loss": 2.1052, "step": 2180 }, { "epoch": 0.1403003003003003, "grad_norm": 0.8060847520828247, "learning_rate": 9.29848164520469e-06, "loss": 2.1071, "step": 2190 }, { "epoch": 0.14094094094094095, "grad_norm": 0.6237013339996338, "learning_rate": 9.295278365045808e-06, "loss": 2.1083, "step": 2200 }, { "epoch": 0.14094094094094095, "eval_loss": 2.1115634441375732, "eval_runtime": 99.231, "eval_samples_per_second": 10.077, "eval_steps_per_second": 5.039, "step": 2200 }, { "epoch": 0.14158158158158157, "grad_norm": 0.5867820978164673, "learning_rate": 9.292075084886925e-06, "loss": 2.1005, "step": 2210 }, { "epoch": 0.14222222222222222, "grad_norm": 0.663402795791626, "learning_rate": 9.288871804728042e-06, "loss": 2.1105, "step": 2220 }, { "epoch": 0.14286286286286287, "grad_norm": 0.7630754709243774, "learning_rate": 9.285668524569158e-06, "loss": 2.0621, "step": 2230 }, { "epoch": 0.1435035035035035, "grad_norm": 0.7367675304412842, "learning_rate": 9.282465244410277e-06, "loss": 2.1162, "step": 2240 }, { "epoch": 0.14414414414414414, "grad_norm": 0.7971994876861572, "learning_rate": 9.279261964251393e-06, "loss": 2.0683, "step": 2250 }, { "epoch": 0.1447847847847848, "grad_norm": 0.6509613990783691, "learning_rate": 9.276058684092512e-06, "loss": 2.0778, "step": 2260 }, { "epoch": 0.14542542542542541, "grad_norm": 0.7286920547485352, "learning_rate": 9.272855403933628e-06, "loss": 2.0727, "step": 2270 }, { "epoch": 0.14606606606606606, "grad_norm": 0.7628468871116638, "learning_rate": 9.269652123774747e-06, "loss": 2.0767, "step": 2280 }, { "epoch": 0.1467067067067067, "grad_norm": 0.7225992679595947, "learning_rate": 9.266448843615863e-06, "loss": 2.0786, "step": 2290 }, { "epoch": 0.14734734734734733, "grad_norm": 0.7567510604858398, "learning_rate": 9.26324556345698e-06, "loss": 2.0983, "step": 2300 }, { "epoch": 0.14734734734734733, "eval_loss": 2.1110680103302, "eval_runtime": 99.2871, "eval_samples_per_second": 10.072, "eval_steps_per_second": 5.036, "step": 2300 }, { "epoch": 0.14798798798798798, "grad_norm": 0.809766948223114, "learning_rate": 9.260042283298098e-06, "loss": 2.0749, "step": 2310 }, { "epoch": 0.14862862862862863, "grad_norm": 0.7898465394973755, "learning_rate": 9.256839003139215e-06, "loss": 2.1061, "step": 2320 }, { "epoch": 0.14926926926926926, "grad_norm": 0.7167459726333618, "learning_rate": 9.253635722980333e-06, "loss": 2.1222, "step": 2330 }, { "epoch": 0.1499099099099099, "grad_norm": 0.7634709477424622, "learning_rate": 9.25043244282145e-06, "loss": 2.1032, "step": 2340 }, { "epoch": 0.15055055055055055, "grad_norm": 0.8492723107337952, "learning_rate": 9.247229162662567e-06, "loss": 2.1228, "step": 2350 }, { "epoch": 0.1511911911911912, "grad_norm": 0.6447978019714355, "learning_rate": 9.244025882503685e-06, "loss": 2.1031, "step": 2360 }, { "epoch": 0.15183183183183183, "grad_norm": 0.757310688495636, "learning_rate": 9.240822602344802e-06, "loss": 2.1096, "step": 2370 }, { "epoch": 0.15247247247247248, "grad_norm": 0.6055355668067932, "learning_rate": 9.237619322185919e-06, "loss": 2.1049, "step": 2380 }, { "epoch": 0.15311311311311313, "grad_norm": 0.6686123013496399, "learning_rate": 9.234416042027037e-06, "loss": 2.0744, "step": 2390 }, { "epoch": 0.15375375375375375, "grad_norm": 0.7597561478614807, "learning_rate": 9.231212761868154e-06, "loss": 2.0981, "step": 2400 }, { "epoch": 0.15375375375375375, "eval_loss": 2.106098175048828, "eval_runtime": 99.2501, "eval_samples_per_second": 10.076, "eval_steps_per_second": 5.038, "step": 2400 }, { "epoch": 0.1543943943943944, "grad_norm": 0.635094940662384, "learning_rate": 9.228009481709272e-06, "loss": 2.105, "step": 2410 }, { "epoch": 0.15503503503503505, "grad_norm": 0.8152109980583191, "learning_rate": 9.224806201550389e-06, "loss": 2.0686, "step": 2420 }, { "epoch": 0.15567567567567567, "grad_norm": 0.7147831320762634, "learning_rate": 9.221602921391505e-06, "loss": 2.0769, "step": 2430 }, { "epoch": 0.15631631631631632, "grad_norm": 0.8072251081466675, "learning_rate": 9.218399641232622e-06, "loss": 2.0943, "step": 2440 }, { "epoch": 0.15695695695695697, "grad_norm": 0.6609693169593811, "learning_rate": 9.21519636107374e-06, "loss": 2.0871, "step": 2450 }, { "epoch": 0.1575975975975976, "grad_norm": 0.8483486175537109, "learning_rate": 9.211993080914857e-06, "loss": 2.0795, "step": 2460 }, { "epoch": 0.15823823823823824, "grad_norm": 0.6840553283691406, "learning_rate": 9.208789800755975e-06, "loss": 2.0868, "step": 2470 }, { "epoch": 0.1588788788788789, "grad_norm": 0.7068288326263428, "learning_rate": 9.205586520597092e-06, "loss": 2.0649, "step": 2480 }, { "epoch": 0.1595195195195195, "grad_norm": 0.7380810976028442, "learning_rate": 9.20238324043821e-06, "loss": 2.0733, "step": 2490 }, { "epoch": 0.16016016016016016, "grad_norm": 0.7049792408943176, "learning_rate": 9.199179960279327e-06, "loss": 2.0834, "step": 2500 }, { "epoch": 0.16016016016016016, "eval_loss": 2.104818105697632, "eval_runtime": 99.3161, "eval_samples_per_second": 10.069, "eval_steps_per_second": 5.034, "step": 2500 }, { "epoch": 0.1608008008008008, "grad_norm": 0.6536545157432556, "learning_rate": 9.195976680120444e-06, "loss": 2.0919, "step": 2510 }, { "epoch": 0.16144144144144143, "grad_norm": 0.7614091038703918, "learning_rate": 9.19277339996156e-06, "loss": 2.0966, "step": 2520 }, { "epoch": 0.16208208208208208, "grad_norm": 0.696030855178833, "learning_rate": 9.189570119802679e-06, "loss": 2.0731, "step": 2530 }, { "epoch": 0.16272272272272273, "grad_norm": 0.9704303741455078, "learning_rate": 9.186366839643795e-06, "loss": 2.0785, "step": 2540 }, { "epoch": 0.16336336336336335, "grad_norm": 0.6941235065460205, "learning_rate": 9.183163559484914e-06, "loss": 2.0695, "step": 2550 }, { "epoch": 0.164004004004004, "grad_norm": 0.7895737886428833, "learning_rate": 9.17996027932603e-06, "loss": 2.0907, "step": 2560 }, { "epoch": 0.16464464464464465, "grad_norm": 0.7447530031204224, "learning_rate": 9.176756999167147e-06, "loss": 2.0847, "step": 2570 }, { "epoch": 0.16528528528528527, "grad_norm": 0.7622038125991821, "learning_rate": 9.173553719008265e-06, "loss": 2.052, "step": 2580 }, { "epoch": 0.16592592592592592, "grad_norm": 0.7055556178092957, "learning_rate": 9.170350438849382e-06, "loss": 2.0979, "step": 2590 }, { "epoch": 0.16656656656656657, "grad_norm": 0.6614319682121277, "learning_rate": 9.1671471586905e-06, "loss": 2.0952, "step": 2600 }, { "epoch": 0.16656656656656657, "eval_loss": 2.1026060581207275, "eval_runtime": 98.8201, "eval_samples_per_second": 10.119, "eval_steps_per_second": 5.06, "step": 2600 }, { "epoch": 0.16720720720720722, "grad_norm": 0.6550982594490051, "learning_rate": 9.163943878531617e-06, "loss": 2.0745, "step": 2610 }, { "epoch": 0.16784784784784784, "grad_norm": 0.6947475671768188, "learning_rate": 9.160740598372735e-06, "loss": 2.0862, "step": 2620 }, { "epoch": 0.1684884884884885, "grad_norm": 0.6655051112174988, "learning_rate": 9.157537318213852e-06, "loss": 2.0813, "step": 2630 }, { "epoch": 0.16912912912912914, "grad_norm": 0.5886589884757996, "learning_rate": 9.154334038054969e-06, "loss": 2.1122, "step": 2640 }, { "epoch": 0.16976976976976976, "grad_norm": 0.6389265060424805, "learning_rate": 9.151130757896085e-06, "loss": 2.0789, "step": 2650 }, { "epoch": 0.1704104104104104, "grad_norm": 0.6327618360519409, "learning_rate": 9.147927477737204e-06, "loss": 2.0449, "step": 2660 }, { "epoch": 0.17105105105105106, "grad_norm": 0.7381449341773987, "learning_rate": 9.14472419757832e-06, "loss": 2.0934, "step": 2670 }, { "epoch": 0.17169169169169168, "grad_norm": 0.6343180537223816, "learning_rate": 9.141520917419439e-06, "loss": 2.0768, "step": 2680 }, { "epoch": 0.17233233233233233, "grad_norm": 0.6861628293991089, "learning_rate": 9.138317637260556e-06, "loss": 2.0827, "step": 2690 }, { "epoch": 0.17297297297297298, "grad_norm": 0.7896407842636108, "learning_rate": 9.135114357101674e-06, "loss": 2.0736, "step": 2700 }, { "epoch": 0.17297297297297298, "eval_loss": 2.101081132888794, "eval_runtime": 98.7456, "eval_samples_per_second": 10.127, "eval_steps_per_second": 5.064, "step": 2700 }, { "epoch": 0.1736136136136136, "grad_norm": 0.6029135584831238, "learning_rate": 9.13191107694279e-06, "loss": 2.0433, "step": 2710 }, { "epoch": 0.17425425425425425, "grad_norm": 0.9753668308258057, "learning_rate": 9.128707796783907e-06, "loss": 2.0716, "step": 2720 }, { "epoch": 0.1748948948948949, "grad_norm": 0.6830595135688782, "learning_rate": 9.125504516625024e-06, "loss": 2.0663, "step": 2730 }, { "epoch": 0.17553553553553553, "grad_norm": 0.6865383982658386, "learning_rate": 9.122301236466142e-06, "loss": 2.0515, "step": 2740 }, { "epoch": 0.17617617617617617, "grad_norm": 0.6678487062454224, "learning_rate": 9.119097956307259e-06, "loss": 2.1013, "step": 2750 }, { "epoch": 0.17681681681681682, "grad_norm": 0.7546699643135071, "learning_rate": 9.115894676148377e-06, "loss": 2.0351, "step": 2760 }, { "epoch": 0.17745745745745745, "grad_norm": 0.7077041864395142, "learning_rate": 9.112691395989494e-06, "loss": 2.0679, "step": 2770 }, { "epoch": 0.1780980980980981, "grad_norm": 0.682655930519104, "learning_rate": 9.10948811583061e-06, "loss": 2.0566, "step": 2780 }, { "epoch": 0.17873873873873874, "grad_norm": 0.6080804467201233, "learning_rate": 9.106284835671729e-06, "loss": 2.0616, "step": 2790 }, { "epoch": 0.17937937937937937, "grad_norm": 0.6820564270019531, "learning_rate": 9.103081555512846e-06, "loss": 2.0932, "step": 2800 }, { "epoch": 0.17937937937937937, "eval_loss": 2.0988144874572754, "eval_runtime": 99.1774, "eval_samples_per_second": 10.083, "eval_steps_per_second": 5.041, "step": 2800 }, { "epoch": 0.18002002002002002, "grad_norm": 0.5526257753372192, "learning_rate": 9.099878275353962e-06, "loss": 2.1138, "step": 2810 }, { "epoch": 0.18066066066066067, "grad_norm": 0.6748570799827576, "learning_rate": 9.09667499519508e-06, "loss": 2.0878, "step": 2820 }, { "epoch": 0.1813013013013013, "grad_norm": 0.6675301194190979, "learning_rate": 9.093471715036197e-06, "loss": 2.0713, "step": 2830 }, { "epoch": 0.18194194194194194, "grad_norm": 0.7286090850830078, "learning_rate": 9.090268434877316e-06, "loss": 2.0988, "step": 2840 }, { "epoch": 0.1825825825825826, "grad_norm": 0.7662502527236938, "learning_rate": 9.087065154718432e-06, "loss": 2.075, "step": 2850 }, { "epoch": 0.18322322322322324, "grad_norm": 0.6795459389686584, "learning_rate": 9.083861874559549e-06, "loss": 2.0893, "step": 2860 }, { "epoch": 0.18386386386386386, "grad_norm": 0.6231416463851929, "learning_rate": 9.080658594400667e-06, "loss": 2.0917, "step": 2870 }, { "epoch": 0.1845045045045045, "grad_norm": 0.7620142698287964, "learning_rate": 9.077455314241784e-06, "loss": 2.0307, "step": 2880 }, { "epoch": 0.18514514514514516, "grad_norm": 0.6979257464408875, "learning_rate": 9.074252034082902e-06, "loss": 2.0662, "step": 2890 }, { "epoch": 0.18578578578578578, "grad_norm": 0.5804643630981445, "learning_rate": 9.071048753924019e-06, "loss": 2.0613, "step": 2900 }, { "epoch": 0.18578578578578578, "eval_loss": 2.0972344875335693, "eval_runtime": 99.234, "eval_samples_per_second": 10.077, "eval_steps_per_second": 5.039, "step": 2900 }, { "epoch": 0.18642642642642643, "grad_norm": 0.7419028282165527, "learning_rate": 9.067845473765137e-06, "loss": 2.0636, "step": 2910 }, { "epoch": 0.18706706706706708, "grad_norm": 0.7618236541748047, "learning_rate": 9.064642193606254e-06, "loss": 2.0519, "step": 2920 }, { "epoch": 0.1877077077077077, "grad_norm": 0.5812451243400574, "learning_rate": 9.06143891344737e-06, "loss": 2.1006, "step": 2930 }, { "epoch": 0.18834834834834835, "grad_norm": 0.6834234595298767, "learning_rate": 9.058235633288487e-06, "loss": 2.0534, "step": 2940 }, { "epoch": 0.188988988988989, "grad_norm": 0.6607987284660339, "learning_rate": 9.055032353129606e-06, "loss": 2.0839, "step": 2950 }, { "epoch": 0.18962962962962962, "grad_norm": 0.6382490396499634, "learning_rate": 9.051829072970723e-06, "loss": 2.085, "step": 2960 }, { "epoch": 0.19027027027027027, "grad_norm": 0.6113517880439758, "learning_rate": 9.048625792811841e-06, "loss": 2.076, "step": 2970 }, { "epoch": 0.19091091091091092, "grad_norm": 0.6453644633293152, "learning_rate": 9.045422512652958e-06, "loss": 2.0997, "step": 2980 }, { "epoch": 0.19155155155155154, "grad_norm": 0.6111727952957153, "learning_rate": 9.042219232494074e-06, "loss": 2.0862, "step": 2990 }, { "epoch": 0.1921921921921922, "grad_norm": 0.6452946066856384, "learning_rate": 9.039015952335191e-06, "loss": 2.0733, "step": 3000 }, { "epoch": 0.1921921921921922, "eval_loss": 2.0955660343170166, "eval_runtime": 99.267, "eval_samples_per_second": 10.074, "eval_steps_per_second": 5.037, "step": 3000 }, { "epoch": 0.19283283283283284, "grad_norm": 0.6006574630737305, "learning_rate": 9.03581267217631e-06, "loss": 2.063, "step": 3010 }, { "epoch": 0.19347347347347346, "grad_norm": 0.6869122982025146, "learning_rate": 9.032609392017426e-06, "loss": 2.0952, "step": 3020 }, { "epoch": 0.1941141141141141, "grad_norm": 0.5634675621986389, "learning_rate": 9.029406111858544e-06, "loss": 2.0956, "step": 3030 }, { "epoch": 0.19475475475475476, "grad_norm": 0.6178089380264282, "learning_rate": 9.026202831699661e-06, "loss": 2.0201, "step": 3040 }, { "epoch": 0.19539539539539538, "grad_norm": 0.663336992263794, "learning_rate": 9.02299955154078e-06, "loss": 2.0747, "step": 3050 }, { "epoch": 0.19603603603603603, "grad_norm": 0.9335659742355347, "learning_rate": 9.019796271381896e-06, "loss": 2.1026, "step": 3060 }, { "epoch": 0.19667667667667668, "grad_norm": 0.6373148560523987, "learning_rate": 9.016592991223013e-06, "loss": 2.0617, "step": 3070 }, { "epoch": 0.19731731731731733, "grad_norm": 0.6642004251480103, "learning_rate": 9.01338971106413e-06, "loss": 2.096, "step": 3080 }, { "epoch": 0.19795795795795795, "grad_norm": 0.6753519773483276, "learning_rate": 9.010186430905248e-06, "loss": 2.077, "step": 3090 }, { "epoch": 0.1985985985985986, "grad_norm": 0.569271445274353, "learning_rate": 9.006983150746364e-06, "loss": 2.0502, "step": 3100 }, { "epoch": 0.1985985985985986, "eval_loss": 2.093749761581421, "eval_runtime": 99.1455, "eval_samples_per_second": 10.086, "eval_steps_per_second": 5.043, "step": 3100 }, { "epoch": 0.19923923923923925, "grad_norm": 0.6224071979522705, "learning_rate": 9.003779870587483e-06, "loss": 2.0576, "step": 3110 }, { "epoch": 0.19987987987987987, "grad_norm": 0.6364945769309998, "learning_rate": 9.0005765904286e-06, "loss": 2.0429, "step": 3120 }, { "epoch": 0.20052052052052052, "grad_norm": 0.7530544996261597, "learning_rate": 8.997373310269718e-06, "loss": 2.0803, "step": 3130 }, { "epoch": 0.20116116116116117, "grad_norm": 0.7073140740394592, "learning_rate": 8.994170030110834e-06, "loss": 2.057, "step": 3140 }, { "epoch": 0.2018018018018018, "grad_norm": 0.631070613861084, "learning_rate": 8.990966749951951e-06, "loss": 2.0786, "step": 3150 }, { "epoch": 0.20244244244244244, "grad_norm": 0.711856484413147, "learning_rate": 8.98776346979307e-06, "loss": 2.0761, "step": 3160 }, { "epoch": 0.2030830830830831, "grad_norm": 0.6684698462486267, "learning_rate": 8.984560189634186e-06, "loss": 2.07, "step": 3170 }, { "epoch": 0.20372372372372372, "grad_norm": 0.6424736380577087, "learning_rate": 8.981356909475304e-06, "loss": 2.0619, "step": 3180 }, { "epoch": 0.20436436436436436, "grad_norm": 0.6278524994850159, "learning_rate": 8.978153629316421e-06, "loss": 2.0607, "step": 3190 }, { "epoch": 0.20500500500500501, "grad_norm": 0.6745601296424866, "learning_rate": 8.974950349157538e-06, "loss": 2.0979, "step": 3200 }, { "epoch": 0.20500500500500501, "eval_loss": 2.092132568359375, "eval_runtime": 99.2104, "eval_samples_per_second": 10.08, "eval_steps_per_second": 5.04, "step": 3200 }, { "epoch": 0.20564564564564564, "grad_norm": 0.6170912384986877, "learning_rate": 8.971747068998654e-06, "loss": 2.1035, "step": 3210 }, { "epoch": 0.20628628628628629, "grad_norm": 0.7534422278404236, "learning_rate": 8.968543788839773e-06, "loss": 2.054, "step": 3220 }, { "epoch": 0.20692692692692694, "grad_norm": 0.672907829284668, "learning_rate": 8.96534050868089e-06, "loss": 2.0634, "step": 3230 }, { "epoch": 0.20756756756756756, "grad_norm": 0.6018652319908142, "learning_rate": 8.962137228522008e-06, "loss": 2.0809, "step": 3240 }, { "epoch": 0.2082082082082082, "grad_norm": 0.6929556727409363, "learning_rate": 8.958933948363125e-06, "loss": 2.1188, "step": 3250 }, { "epoch": 0.20884884884884886, "grad_norm": 0.6032728552818298, "learning_rate": 8.955730668204243e-06, "loss": 2.08, "step": 3260 }, { "epoch": 0.20948948948948948, "grad_norm": 0.6318669319152832, "learning_rate": 8.95252738804536e-06, "loss": 2.0684, "step": 3270 }, { "epoch": 0.21013013013013013, "grad_norm": 0.6870554685592651, "learning_rate": 8.949324107886476e-06, "loss": 2.0783, "step": 3280 }, { "epoch": 0.21077077077077078, "grad_norm": 0.6430284976959229, "learning_rate": 8.946120827727593e-06, "loss": 2.084, "step": 3290 }, { "epoch": 0.2114114114114114, "grad_norm": 0.654576301574707, "learning_rate": 8.942917547568711e-06, "loss": 2.1042, "step": 3300 }, { "epoch": 0.2114114114114114, "eval_loss": 2.0920584201812744, "eval_runtime": 99.1218, "eval_samples_per_second": 10.089, "eval_steps_per_second": 5.044, "step": 3300 }, { "epoch": 0.21205205205205205, "grad_norm": 0.7410485744476318, "learning_rate": 8.939714267409828e-06, "loss": 2.0787, "step": 3310 }, { "epoch": 0.2126926926926927, "grad_norm": 0.8399192690849304, "learning_rate": 8.936510987250946e-06, "loss": 2.0877, "step": 3320 }, { "epoch": 0.21333333333333335, "grad_norm": 0.6334551572799683, "learning_rate": 8.933307707092063e-06, "loss": 2.0447, "step": 3330 }, { "epoch": 0.21397397397397397, "grad_norm": 0.8268193006515503, "learning_rate": 8.930104426933181e-06, "loss": 2.0678, "step": 3340 }, { "epoch": 0.21461461461461462, "grad_norm": 0.7777826189994812, "learning_rate": 8.926901146774298e-06, "loss": 2.0719, "step": 3350 }, { "epoch": 0.21525525525525527, "grad_norm": 0.6634954214096069, "learning_rate": 8.923697866615415e-06, "loss": 2.065, "step": 3360 }, { "epoch": 0.2158958958958959, "grad_norm": 0.6769670248031616, "learning_rate": 8.920494586456531e-06, "loss": 2.0575, "step": 3370 }, { "epoch": 0.21653653653653654, "grad_norm": 0.6782456636428833, "learning_rate": 8.91729130629765e-06, "loss": 2.0914, "step": 3380 }, { "epoch": 0.2171771771771772, "grad_norm": 0.5194129943847656, "learning_rate": 8.914088026138766e-06, "loss": 2.0793, "step": 3390 }, { "epoch": 0.2178178178178178, "grad_norm": 0.5972870588302612, "learning_rate": 8.910884745979885e-06, "loss": 2.0644, "step": 3400 }, { "epoch": 0.2178178178178178, "eval_loss": 2.089409589767456, "eval_runtime": 99.0707, "eval_samples_per_second": 10.094, "eval_steps_per_second": 5.047, "step": 3400 }, { "epoch": 0.21845845845845846, "grad_norm": 0.6698557138442993, "learning_rate": 8.907681465821001e-06, "loss": 2.0889, "step": 3410 }, { "epoch": 0.2190990990990991, "grad_norm": 0.6106517910957336, "learning_rate": 8.904478185662118e-06, "loss": 2.0835, "step": 3420 }, { "epoch": 0.21973973973973973, "grad_norm": 0.6931442022323608, "learning_rate": 8.901274905503236e-06, "loss": 2.0821, "step": 3430 }, { "epoch": 0.22038038038038038, "grad_norm": 0.6137683391571045, "learning_rate": 8.898071625344353e-06, "loss": 2.0666, "step": 3440 }, { "epoch": 0.22102102102102103, "grad_norm": 0.599586546421051, "learning_rate": 8.894868345185471e-06, "loss": 2.0867, "step": 3450 }, { "epoch": 0.22166166166166165, "grad_norm": 0.6044438481330872, "learning_rate": 8.891665065026588e-06, "loss": 2.0751, "step": 3460 }, { "epoch": 0.2223023023023023, "grad_norm": 0.6222479343414307, "learning_rate": 8.888461784867706e-06, "loss": 2.0877, "step": 3470 }, { "epoch": 0.22294294294294295, "grad_norm": 0.5773678421974182, "learning_rate": 8.885258504708823e-06, "loss": 2.0502, "step": 3480 }, { "epoch": 0.22358358358358357, "grad_norm": 0.5937193036079407, "learning_rate": 8.88205522454994e-06, "loss": 2.0945, "step": 3490 }, { "epoch": 0.22422422422422422, "grad_norm": 0.6824532747268677, "learning_rate": 8.878851944391056e-06, "loss": 2.0528, "step": 3500 }, { "epoch": 0.22422422422422422, "eval_loss": 2.0892724990844727, "eval_runtime": 99.0067, "eval_samples_per_second": 10.1, "eval_steps_per_second": 5.05, "step": 3500 }, { "epoch": 0.22486486486486487, "grad_norm": 0.6509242057800293, "learning_rate": 8.875648664232175e-06, "loss": 2.1105, "step": 3510 }, { "epoch": 0.2255055055055055, "grad_norm": 0.6354013085365295, "learning_rate": 8.872445384073291e-06, "loss": 2.0607, "step": 3520 }, { "epoch": 0.22614614614614614, "grad_norm": 0.6222078204154968, "learning_rate": 8.86924210391441e-06, "loss": 2.0638, "step": 3530 }, { "epoch": 0.2267867867867868, "grad_norm": 0.5962913632392883, "learning_rate": 8.866038823755527e-06, "loss": 2.0623, "step": 3540 }, { "epoch": 0.22742742742742741, "grad_norm": 0.6757493615150452, "learning_rate": 8.862835543596643e-06, "loss": 2.0916, "step": 3550 }, { "epoch": 0.22806806806806806, "grad_norm": 0.5757195353507996, "learning_rate": 8.859632263437762e-06, "loss": 2.0587, "step": 3560 }, { "epoch": 0.2287087087087087, "grad_norm": 0.6077573299407959, "learning_rate": 8.856428983278878e-06, "loss": 2.0703, "step": 3570 }, { "epoch": 0.22934934934934936, "grad_norm": 0.6776688694953918, "learning_rate": 8.853225703119995e-06, "loss": 2.0932, "step": 3580 }, { "epoch": 0.22998998998998998, "grad_norm": 0.7286055088043213, "learning_rate": 8.850022422961113e-06, "loss": 2.0642, "step": 3590 }, { "epoch": 0.23063063063063063, "grad_norm": 0.5565325617790222, "learning_rate": 8.84681914280223e-06, "loss": 2.0548, "step": 3600 }, { "epoch": 0.23063063063063063, "eval_loss": 2.086730480194092, "eval_runtime": 99.0428, "eval_samples_per_second": 10.097, "eval_steps_per_second": 5.048, "step": 3600 }, { "epoch": 0.23127127127127128, "grad_norm": 0.6085419058799744, "learning_rate": 8.843615862643348e-06, "loss": 2.0301, "step": 3610 }, { "epoch": 0.2319119119119119, "grad_norm": 0.5839805603027344, "learning_rate": 8.840412582484465e-06, "loss": 2.0478, "step": 3620 }, { "epoch": 0.23255255255255255, "grad_norm": 0.7211072444915771, "learning_rate": 8.837209302325582e-06, "loss": 2.0971, "step": 3630 }, { "epoch": 0.2331931931931932, "grad_norm": 0.656085729598999, "learning_rate": 8.834006022166698e-06, "loss": 2.0331, "step": 3640 }, { "epoch": 0.23383383383383383, "grad_norm": 0.5496543645858765, "learning_rate": 8.830802742007817e-06, "loss": 2.067, "step": 3650 }, { "epoch": 0.23447447447447448, "grad_norm": 0.5275883078575134, "learning_rate": 8.827599461848933e-06, "loss": 2.0737, "step": 3660 }, { "epoch": 0.23511511511511513, "grad_norm": 0.5332633852958679, "learning_rate": 8.824396181690052e-06, "loss": 2.0562, "step": 3670 }, { "epoch": 0.23575575575575575, "grad_norm": 0.7012920379638672, "learning_rate": 8.821192901531168e-06, "loss": 2.0704, "step": 3680 }, { "epoch": 0.2363963963963964, "grad_norm": 0.570817232131958, "learning_rate": 8.817989621372287e-06, "loss": 2.0765, "step": 3690 }, { "epoch": 0.23703703703703705, "grad_norm": 0.6036481261253357, "learning_rate": 8.814786341213403e-06, "loss": 2.0905, "step": 3700 }, { "epoch": 0.23703703703703705, "eval_loss": 2.086113452911377, "eval_runtime": 98.9534, "eval_samples_per_second": 10.106, "eval_steps_per_second": 5.053, "step": 3700 }, { "epoch": 0.23767767767767767, "grad_norm": 0.5793745517730713, "learning_rate": 8.81158306105452e-06, "loss": 2.0549, "step": 3710 }, { "epoch": 0.23831831831831832, "grad_norm": 0.566124439239502, "learning_rate": 8.808379780895638e-06, "loss": 2.0537, "step": 3720 }, { "epoch": 0.23895895895895897, "grad_norm": 0.5538589358329773, "learning_rate": 8.805176500736755e-06, "loss": 2.0669, "step": 3730 }, { "epoch": 0.2395995995995996, "grad_norm": 0.558520495891571, "learning_rate": 8.801973220577873e-06, "loss": 2.1043, "step": 3740 }, { "epoch": 0.24024024024024024, "grad_norm": 0.6997453570365906, "learning_rate": 8.79876994041899e-06, "loss": 2.0715, "step": 3750 }, { "epoch": 0.2408808808808809, "grad_norm": 0.641663134098053, "learning_rate": 8.795566660260107e-06, "loss": 2.074, "step": 3760 }, { "epoch": 0.2415215215215215, "grad_norm": 0.7044941782951355, "learning_rate": 8.792363380101223e-06, "loss": 2.0649, "step": 3770 }, { "epoch": 0.24216216216216216, "grad_norm": 0.6711502075195312, "learning_rate": 8.789160099942342e-06, "loss": 2.0886, "step": 3780 }, { "epoch": 0.2428028028028028, "grad_norm": 0.6159843802452087, "learning_rate": 8.785956819783458e-06, "loss": 2.0574, "step": 3790 }, { "epoch": 0.24344344344344343, "grad_norm": 0.6314002871513367, "learning_rate": 8.782753539624577e-06, "loss": 2.0853, "step": 3800 }, { "epoch": 0.24344344344344343, "eval_loss": 2.0856714248657227, "eval_runtime": 98.9938, "eval_samples_per_second": 10.102, "eval_steps_per_second": 5.051, "step": 3800 }, { "epoch": 0.24408408408408408, "grad_norm": 0.7047820091247559, "learning_rate": 8.779550259465693e-06, "loss": 2.0269, "step": 3810 }, { "epoch": 0.24472472472472473, "grad_norm": 0.7113313674926758, "learning_rate": 8.776346979306812e-06, "loss": 2.0679, "step": 3820 }, { "epoch": 0.24536536536536538, "grad_norm": 0.6536078453063965, "learning_rate": 8.773143699147929e-06, "loss": 2.0673, "step": 3830 }, { "epoch": 0.246006006006006, "grad_norm": 0.654203474521637, "learning_rate": 8.769940418989045e-06, "loss": 2.1041, "step": 3840 }, { "epoch": 0.24664664664664665, "grad_norm": 0.6029314398765564, "learning_rate": 8.766737138830162e-06, "loss": 2.0688, "step": 3850 }, { "epoch": 0.2472872872872873, "grad_norm": 0.6532240509986877, "learning_rate": 8.76353385867128e-06, "loss": 2.0567, "step": 3860 }, { "epoch": 0.24792792792792792, "grad_norm": 0.5579843521118164, "learning_rate": 8.760330578512397e-06, "loss": 2.081, "step": 3870 }, { "epoch": 0.24856856856856857, "grad_norm": 0.7852092385292053, "learning_rate": 8.757127298353515e-06, "loss": 2.0464, "step": 3880 }, { "epoch": 0.24920920920920922, "grad_norm": 0.6831695437431335, "learning_rate": 8.753924018194632e-06, "loss": 2.0724, "step": 3890 }, { "epoch": 0.24984984984984984, "grad_norm": 0.5026164650917053, "learning_rate": 8.75072073803575e-06, "loss": 2.0894, "step": 3900 }, { "epoch": 0.24984984984984984, "eval_loss": 2.083714008331299, "eval_runtime": 98.9301, "eval_samples_per_second": 10.108, "eval_steps_per_second": 5.054, "step": 3900 }, { "epoch": 0.3339204697091006, "grad_norm": 0.5608721375465393, "learning_rate": 6.660688359381673e-06, "loss": 2.047, "step": 3910 }, { "epoch": 0.3347744862556712, "grad_norm": 0.5373656749725342, "learning_rate": 6.652147920403109e-06, "loss": 2.0301, "step": 3920 }, { "epoch": 0.33562850280224177, "grad_norm": 0.5691947340965271, "learning_rate": 6.643607481424545e-06, "loss": 2.0573, "step": 3930 }, { "epoch": 0.3364825193488124, "grad_norm": 0.63722163438797, "learning_rate": 6.635067042445981e-06, "loss": 2.0341, "step": 3940 }, { "epoch": 0.337336535895383, "grad_norm": 0.5499609112739563, "learning_rate": 6.626526603467419e-06, "loss": 2.0756, "step": 3950 }, { "epoch": 0.33819055244195356, "grad_norm": 0.6073453426361084, "learning_rate": 6.617986164488855e-06, "loss": 2.0549, "step": 3960 }, { "epoch": 0.33904456898852414, "grad_norm": 0.5716825723648071, "learning_rate": 6.609445725510292e-06, "loss": 2.0512, "step": 3970 }, { "epoch": 0.3398985855350947, "grad_norm": 0.5261934399604797, "learning_rate": 6.600905286531728e-06, "loss": 2.0627, "step": 3980 }, { "epoch": 0.34075260208166536, "grad_norm": 0.6741161346435547, "learning_rate": 6.592364847553164e-06, "loss": 2.0467, "step": 3990 }, { "epoch": 0.34160661862823594, "grad_norm": 0.5317021608352661, "learning_rate": 6.583824408574602e-06, "loss": 2.0704, "step": 4000 }, { "epoch": 0.34160661862823594, "eval_loss": 2.0800442695617676, "eval_runtime": 99.8576, "eval_samples_per_second": 10.014, "eval_steps_per_second": 5.007, "step": 4000 }, { "epoch": 0.3424606351748065, "grad_norm": 0.5947962999343872, "learning_rate": 6.575283969596038e-06, "loss": 2.0348, "step": 4010 }, { "epoch": 0.3433146517213771, "grad_norm": 0.5036411881446838, "learning_rate": 6.566743530617475e-06, "loss": 2.0589, "step": 4020 }, { "epoch": 0.3441686682679477, "grad_norm": 0.5852782726287842, "learning_rate": 6.558203091638911e-06, "loss": 2.0557, "step": 4030 }, { "epoch": 0.34502268481451825, "grad_norm": 0.5305678248405457, "learning_rate": 6.549662652660347e-06, "loss": 2.0487, "step": 4040 }, { "epoch": 0.3458767013610889, "grad_norm": 0.5548572540283203, "learning_rate": 6.541122213681784e-06, "loss": 2.0411, "step": 4050 }, { "epoch": 0.34673071790765947, "grad_norm": 0.5388315916061401, "learning_rate": 6.53258177470322e-06, "loss": 2.0236, "step": 4060 }, { "epoch": 0.34758473445423005, "grad_norm": 0.5466106534004211, "learning_rate": 6.5240413357246564e-06, "loss": 2.0639, "step": 4070 }, { "epoch": 0.34843875100080063, "grad_norm": 0.5653154253959656, "learning_rate": 6.515500896746093e-06, "loss": 2.0625, "step": 4080 }, { "epoch": 0.3492927675473712, "grad_norm": 0.5568574666976929, "learning_rate": 6.506960457767529e-06, "loss": 2.0632, "step": 4090 }, { "epoch": 0.35014678409394184, "grad_norm": 0.6117140650749207, "learning_rate": 6.498420018788967e-06, "loss": 2.0439, "step": 4100 }, { "epoch": 0.35014678409394184, "eval_loss": 2.0791141986846924, "eval_runtime": 99.4675, "eval_samples_per_second": 10.054, "eval_steps_per_second": 5.027, "step": 4100 }, { "epoch": 0.3510008006405124, "grad_norm": 0.5750178098678589, "learning_rate": 6.489879579810403e-06, "loss": 2.0585, "step": 4110 }, { "epoch": 0.351854817187083, "grad_norm": 0.5808545351028442, "learning_rate": 6.481339140831839e-06, "loss": 2.0522, "step": 4120 }, { "epoch": 0.3527088337336536, "grad_norm": 0.6266270875930786, "learning_rate": 6.472798701853276e-06, "loss": 2.0635, "step": 4130 }, { "epoch": 0.35356285028022416, "grad_norm": 0.6173312664031982, "learning_rate": 6.464258262874712e-06, "loss": 2.0667, "step": 4140 }, { "epoch": 0.3544168668267948, "grad_norm": 0.5644112825393677, "learning_rate": 6.45571782389615e-06, "loss": 2.0809, "step": 4150 }, { "epoch": 0.3552708833733654, "grad_norm": 0.5110538005828857, "learning_rate": 6.447177384917586e-06, "loss": 2.0636, "step": 4160 }, { "epoch": 0.35612489991993596, "grad_norm": 0.5960603952407837, "learning_rate": 6.438636945939022e-06, "loss": 2.0816, "step": 4170 }, { "epoch": 0.35697891646650654, "grad_norm": 0.6166285872459412, "learning_rate": 6.430096506960459e-06, "loss": 2.0512, "step": 4180 }, { "epoch": 0.3578329330130771, "grad_norm": 0.6123340725898743, "learning_rate": 6.421556067981895e-06, "loss": 2.0568, "step": 4190 }, { "epoch": 0.3586869495596477, "grad_norm": 0.5808490514755249, "learning_rate": 6.413015629003331e-06, "loss": 2.0651, "step": 4200 }, { "epoch": 0.3586869495596477, "eval_loss": 2.078325033187866, "eval_runtime": 99.4675, "eval_samples_per_second": 10.054, "eval_steps_per_second": 5.027, "step": 4200 }, { "epoch": 0.35954096610621833, "grad_norm": 0.550093412399292, "learning_rate": 6.404475190024768e-06, "loss": 2.0594, "step": 4210 }, { "epoch": 0.3603949826527889, "grad_norm": 0.6824820637702942, "learning_rate": 6.395934751046204e-06, "loss": 2.0576, "step": 4220 }, { "epoch": 0.3612489991993595, "grad_norm": 0.5822415351867676, "learning_rate": 6.38739431206764e-06, "loss": 2.0594, "step": 4230 }, { "epoch": 0.36210301574593007, "grad_norm": 0.6032450199127197, "learning_rate": 6.378853873089077e-06, "loss": 2.0715, "step": 4240 }, { "epoch": 0.36295703229250065, "grad_norm": 0.5853261947631836, "learning_rate": 6.370313434110513e-06, "loss": 2.05, "step": 4250 }, { "epoch": 0.3638110488390713, "grad_norm": 0.6220340132713318, "learning_rate": 6.361772995131951e-06, "loss": 2.1133, "step": 4260 }, { "epoch": 0.36466506538564186, "grad_norm": 0.6459780931472778, "learning_rate": 6.353232556153387e-06, "loss": 2.0811, "step": 4270 }, { "epoch": 0.36551908193221244, "grad_norm": 0.5899947881698608, "learning_rate": 6.344692117174823e-06, "loss": 2.0617, "step": 4280 }, { "epoch": 0.366373098478783, "grad_norm": 0.6085699796676636, "learning_rate": 6.33615167819626e-06, "loss": 2.0656, "step": 4290 }, { "epoch": 0.3672271150253536, "grad_norm": 0.6055794954299927, "learning_rate": 6.327611239217696e-06, "loss": 2.0569, "step": 4300 }, { "epoch": 0.3672271150253536, "eval_loss": 2.077655553817749, "eval_runtime": 98.8011, "eval_samples_per_second": 10.121, "eval_steps_per_second": 5.061, "step": 4300 }, { "epoch": 0.3680811315719242, "grad_norm": 0.5629417300224304, "learning_rate": 6.319070800239133e-06, "loss": 2.0774, "step": 4310 }, { "epoch": 0.3689351481184948, "grad_norm": 0.6046031713485718, "learning_rate": 6.31053036126057e-06, "loss": 2.0479, "step": 4320 }, { "epoch": 0.3697891646650654, "grad_norm": 0.5327040553092957, "learning_rate": 6.301989922282006e-06, "loss": 2.0399, "step": 4330 }, { "epoch": 0.370643181211636, "grad_norm": 0.5413053631782532, "learning_rate": 6.293449483303442e-06, "loss": 2.0414, "step": 4340 }, { "epoch": 0.37149719775820655, "grad_norm": 0.6113752126693726, "learning_rate": 6.284909044324879e-06, "loss": 2.0206, "step": 4350 }, { "epoch": 0.37235121430477713, "grad_norm": 0.5839834809303284, "learning_rate": 6.2763686053463154e-06, "loss": 2.0705, "step": 4360 }, { "epoch": 0.37320523085134777, "grad_norm": 0.5761412382125854, "learning_rate": 6.267828166367752e-06, "loss": 2.0449, "step": 4370 }, { "epoch": 0.37405924739791835, "grad_norm": 0.6302218437194824, "learning_rate": 6.259287727389188e-06, "loss": 2.042, "step": 4380 }, { "epoch": 0.37491326394448893, "grad_norm": 0.5558602213859558, "learning_rate": 6.250747288410624e-06, "loss": 2.0546, "step": 4390 }, { "epoch": 0.3757672804910595, "grad_norm": 0.5387922525405884, "learning_rate": 6.242206849432061e-06, "loss": 2.0605, "step": 4400 }, { "epoch": 0.3757672804910595, "eval_loss": 2.076936721801758, "eval_runtime": 99.0017, "eval_samples_per_second": 10.101, "eval_steps_per_second": 5.05, "step": 4400 }, { "epoch": 0.3766212970376301, "grad_norm": 0.6043440103530884, "learning_rate": 6.233666410453498e-06, "loss": 2.0524, "step": 4410 }, { "epoch": 0.37747531358420067, "grad_norm": 0.5480827689170837, "learning_rate": 6.225125971474935e-06, "loss": 2.0423, "step": 4420 }, { "epoch": 0.3783293301307713, "grad_norm": 0.5199385285377502, "learning_rate": 6.216585532496371e-06, "loss": 2.0346, "step": 4430 }, { "epoch": 0.3791833466773419, "grad_norm": 0.5246111154556274, "learning_rate": 6.208045093517807e-06, "loss": 2.0688, "step": 4440 }, { "epoch": 0.38003736322391246, "grad_norm": 0.5296744704246521, "learning_rate": 6.199504654539243e-06, "loss": 2.0552, "step": 4450 }, { "epoch": 0.38089137977048304, "grad_norm": 0.5457771420478821, "learning_rate": 6.190964215560681e-06, "loss": 2.0334, "step": 4460 }, { "epoch": 0.3817453963170536, "grad_norm": 0.5534031987190247, "learning_rate": 6.182423776582117e-06, "loss": 2.0416, "step": 4470 }, { "epoch": 0.38259941286362426, "grad_norm": 0.5709179043769836, "learning_rate": 6.173883337603554e-06, "loss": 2.0795, "step": 4480 }, { "epoch": 0.38345342941019483, "grad_norm": 0.5756047368049622, "learning_rate": 6.16534289862499e-06, "loss": 2.0876, "step": 4490 }, { "epoch": 0.3843074459567654, "grad_norm": 0.5713403820991516, "learning_rate": 6.156802459646426e-06, "loss": 2.0673, "step": 4500 }, { "epoch": 0.3843074459567654, "eval_loss": 2.0763320922851562, "eval_runtime": 98.878, "eval_samples_per_second": 10.113, "eval_steps_per_second": 5.057, "step": 4500 }, { "epoch": 0.385161462503336, "grad_norm": 0.5884702205657959, "learning_rate": 6.148262020667863e-06, "loss": 2.0722, "step": 4510 }, { "epoch": 0.3860154790499066, "grad_norm": 0.5472369194030762, "learning_rate": 6.139721581689299e-06, "loss": 2.0711, "step": 4520 }, { "epoch": 0.3868694955964772, "grad_norm": 0.6504119634628296, "learning_rate": 6.131181142710736e-06, "loss": 2.0431, "step": 4530 }, { "epoch": 0.3877235121430478, "grad_norm": 0.6400245428085327, "learning_rate": 6.122640703732172e-06, "loss": 2.0535, "step": 4540 }, { "epoch": 0.38857752868961837, "grad_norm": 0.6420487761497498, "learning_rate": 6.114100264753608e-06, "loss": 2.0662, "step": 4550 }, { "epoch": 0.38943154523618895, "grad_norm": 0.6386040449142456, "learning_rate": 6.105559825775045e-06, "loss": 2.0476, "step": 4560 }, { "epoch": 0.3902855617827595, "grad_norm": 0.56058669090271, "learning_rate": 6.097019386796482e-06, "loss": 2.0601, "step": 4570 }, { "epoch": 0.3911395783293301, "grad_norm": 0.8170416355133057, "learning_rate": 6.088478947817918e-06, "loss": 2.0618, "step": 4580 }, { "epoch": 0.39199359487590074, "grad_norm": 0.7694204449653625, "learning_rate": 6.079938508839355e-06, "loss": 2.0863, "step": 4590 }, { "epoch": 0.3928476114224713, "grad_norm": 0.693332850933075, "learning_rate": 6.071398069860791e-06, "loss": 2.0622, "step": 4600 }, { "epoch": 0.3928476114224713, "eval_loss": 2.0757362842559814, "eval_runtime": 98.8423, "eval_samples_per_second": 10.117, "eval_steps_per_second": 5.059, "step": 4600 }, { "epoch": 0.3937016279690419, "grad_norm": 0.7094119191169739, "learning_rate": 6.062857630882227e-06, "loss": 2.0691, "step": 4610 }, { "epoch": 0.3945556445156125, "grad_norm": 0.5764107704162598, "learning_rate": 6.054317191903665e-06, "loss": 2.04, "step": 4620 }, { "epoch": 0.39540966106218306, "grad_norm": 0.5961339473724365, "learning_rate": 6.045776752925101e-06, "loss": 2.035, "step": 4630 }, { "epoch": 0.3962636776087537, "grad_norm": 0.6162221431732178, "learning_rate": 6.037236313946538e-06, "loss": 2.0548, "step": 4640 }, { "epoch": 0.3971176941553243, "grad_norm": 0.64893639087677, "learning_rate": 6.028695874967974e-06, "loss": 2.0286, "step": 4650 }, { "epoch": 0.39797171070189485, "grad_norm": 0.5995349884033203, "learning_rate": 6.02015543598941e-06, "loss": 2.0837, "step": 4660 }, { "epoch": 0.39882572724846543, "grad_norm": 0.526799201965332, "learning_rate": 6.011614997010847e-06, "loss": 2.0606, "step": 4670 }, { "epoch": 0.399679743795036, "grad_norm": 0.570662796497345, "learning_rate": 6.003074558032283e-06, "loss": 2.0761, "step": 4680 }, { "epoch": 0.4005337603416066, "grad_norm": 0.5964308381080627, "learning_rate": 5.99453411905372e-06, "loss": 2.0613, "step": 4690 }, { "epoch": 0.40138777688817723, "grad_norm": 0.5859782099723816, "learning_rate": 5.985993680075156e-06, "loss": 2.0845, "step": 4700 }, { "epoch": 0.40138777688817723, "eval_loss": 2.074390172958374, "eval_runtime": 98.8514, "eval_samples_per_second": 10.116, "eval_steps_per_second": 5.058, "step": 4700 }, { "epoch": 0.4022417934347478, "grad_norm": 0.531798243522644, "learning_rate": 5.977453241096592e-06, "loss": 2.0398, "step": 4710 }, { "epoch": 0.4030958099813184, "grad_norm": 0.5802189111709595, "learning_rate": 5.96891280211803e-06, "loss": 2.0344, "step": 4720 }, { "epoch": 0.40394982652788897, "grad_norm": 0.6294711232185364, "learning_rate": 5.960372363139466e-06, "loss": 2.0789, "step": 4730 }, { "epoch": 0.40480384307445955, "grad_norm": 0.5669849514961243, "learning_rate": 5.951831924160902e-06, "loss": 2.0539, "step": 4740 }, { "epoch": 0.4056578596210302, "grad_norm": 0.6840581297874451, "learning_rate": 5.943291485182339e-06, "loss": 2.0784, "step": 4750 }, { "epoch": 0.40651187616760076, "grad_norm": 0.6162413954734802, "learning_rate": 5.934751046203775e-06, "loss": 2.0694, "step": 4760 }, { "epoch": 0.40736589271417134, "grad_norm": 0.5217962265014648, "learning_rate": 5.926210607225213e-06, "loss": 2.093, "step": 4770 }, { "epoch": 0.4082199092607419, "grad_norm": 0.5702998638153076, "learning_rate": 5.917670168246649e-06, "loss": 2.0329, "step": 4780 }, { "epoch": 0.4090739258073125, "grad_norm": 0.7252790331840515, "learning_rate": 5.909129729268085e-06, "loss": 2.0593, "step": 4790 }, { "epoch": 0.4099279423538831, "grad_norm": 0.542995810508728, "learning_rate": 5.9005892902895216e-06, "loss": 2.0582, "step": 4800 }, { "epoch": 0.4099279423538831, "eval_loss": 2.074084997177124, "eval_runtime": 98.584, "eval_samples_per_second": 10.144, "eval_steps_per_second": 5.072, "step": 4800 }, { "epoch": 0.4107819589004537, "grad_norm": 0.5005578994750977, "learning_rate": 5.892048851310958e-06, "loss": 2.0291, "step": 4810 }, { "epoch": 0.4116359754470243, "grad_norm": 0.5405492186546326, "learning_rate": 5.883508412332395e-06, "loss": 2.0288, "step": 4820 }, { "epoch": 0.4124899919935949, "grad_norm": 0.487716943025589, "learning_rate": 5.874967973353831e-06, "loss": 2.0483, "step": 4830 }, { "epoch": 0.41334400854016545, "grad_norm": 0.48800957202911377, "learning_rate": 5.866427534375267e-06, "loss": 2.0608, "step": 4840 }, { "epoch": 0.41419802508673603, "grad_norm": 0.6357758641242981, "learning_rate": 5.857887095396703e-06, "loss": 2.06, "step": 4850 }, { "epoch": 0.41505204163330667, "grad_norm": 0.5239138603210449, "learning_rate": 5.84934665641814e-06, "loss": 2.0216, "step": 4860 }, { "epoch": 0.41590605817987725, "grad_norm": 0.4909784495830536, "learning_rate": 5.840806217439576e-06, "loss": 2.0506, "step": 4870 }, { "epoch": 0.4167600747264478, "grad_norm": 0.5664153695106506, "learning_rate": 5.832265778461014e-06, "loss": 2.0261, "step": 4880 }, { "epoch": 0.4176140912730184, "grad_norm": 0.5648866891860962, "learning_rate": 5.82372533948245e-06, "loss": 2.0376, "step": 4890 }, { "epoch": 0.418468107819589, "grad_norm": 0.5945410132408142, "learning_rate": 5.815184900503886e-06, "loss": 2.0555, "step": 4900 }, { "epoch": 0.418468107819589, "eval_loss": 2.0723860263824463, "eval_runtime": 98.7193, "eval_samples_per_second": 10.13, "eval_steps_per_second": 5.065, "step": 4900 }, { "epoch": 0.4193221243661596, "grad_norm": 0.5736713409423828, "learning_rate": 5.806644461525323e-06, "loss": 2.0666, "step": 4910 }, { "epoch": 0.4201761409127302, "grad_norm": 0.6498487591743469, "learning_rate": 5.798104022546759e-06, "loss": 2.0615, "step": 4920 }, { "epoch": 0.4210301574593008, "grad_norm": 0.694471538066864, "learning_rate": 5.789563583568197e-06, "loss": 2.0657, "step": 4930 }, { "epoch": 0.42188417400587136, "grad_norm": 0.6121436357498169, "learning_rate": 5.781023144589633e-06, "loss": 2.0569, "step": 4940 }, { "epoch": 0.42273819055244194, "grad_norm": 0.5739856958389282, "learning_rate": 5.772482705611069e-06, "loss": 2.0774, "step": 4950 }, { "epoch": 0.4235922070990125, "grad_norm": 0.6429994702339172, "learning_rate": 5.763942266632505e-06, "loss": 2.0487, "step": 4960 }, { "epoch": 0.42444622364558315, "grad_norm": 0.5392478704452515, "learning_rate": 5.7554018276539416e-06, "loss": 2.0283, "step": 4970 }, { "epoch": 0.42530024019215373, "grad_norm": 0.565315842628479, "learning_rate": 5.7468613886753784e-06, "loss": 2.0268, "step": 4980 }, { "epoch": 0.4261542567387243, "grad_norm": 0.669299840927124, "learning_rate": 5.738320949696815e-06, "loss": 2.0408, "step": 4990 }, { "epoch": 0.4270082732852949, "grad_norm": 0.6179564595222473, "learning_rate": 5.729780510718251e-06, "loss": 2.0819, "step": 5000 }, { "epoch": 0.4270082732852949, "eval_loss": 2.0726468563079834, "eval_runtime": 98.5294, "eval_samples_per_second": 10.149, "eval_steps_per_second": 5.075, "step": 5000 }, { "epoch": 0.42786228983186547, "grad_norm": 0.6250094175338745, "learning_rate": 5.721240071739687e-06, "loss": 2.0698, "step": 5010 }, { "epoch": 0.4287163063784361, "grad_norm": 0.4930519759654999, "learning_rate": 5.712699632761124e-06, "loss": 2.0532, "step": 5020 }, { "epoch": 0.4295703229250067, "grad_norm": 0.6259885430335999, "learning_rate": 5.704159193782561e-06, "loss": 2.0651, "step": 5030 }, { "epoch": 0.43042433947157727, "grad_norm": 0.5688281059265137, "learning_rate": 5.695618754803998e-06, "loss": 2.057, "step": 5040 }, { "epoch": 0.43127835601814785, "grad_norm": 0.5666618943214417, "learning_rate": 5.687078315825434e-06, "loss": 2.0392, "step": 5050 }, { "epoch": 0.4321323725647184, "grad_norm": 0.7095407247543335, "learning_rate": 5.67853787684687e-06, "loss": 2.0294, "step": 5060 }, { "epoch": 0.432986389111289, "grad_norm": 0.6068659424781799, "learning_rate": 5.669997437868307e-06, "loss": 2.0576, "step": 5070 }, { "epoch": 0.43384040565785964, "grad_norm": 0.5697404146194458, "learning_rate": 5.661456998889744e-06, "loss": 2.0242, "step": 5080 }, { "epoch": 0.4346944222044302, "grad_norm": 0.6200443506240845, "learning_rate": 5.65291655991118e-06, "loss": 2.0759, "step": 5090 }, { "epoch": 0.4355484387510008, "grad_norm": 0.5611567497253418, "learning_rate": 5.644376120932617e-06, "loss": 2.0455, "step": 5100 }, { "epoch": 0.4355484387510008, "eval_loss": 2.07175350189209, "eval_runtime": 98.9289, "eval_samples_per_second": 10.108, "eval_steps_per_second": 5.054, "step": 5100 }, { "epoch": 0.4364024552975714, "grad_norm": 0.6449030041694641, "learning_rate": 5.635835681954053e-06, "loss": 2.0377, "step": 5110 }, { "epoch": 0.43725647184414196, "grad_norm": 0.6077715754508972, "learning_rate": 5.627295242975489e-06, "loss": 2.0611, "step": 5120 }, { "epoch": 0.4381104883907126, "grad_norm": 0.5876298546791077, "learning_rate": 5.618754803996926e-06, "loss": 2.0634, "step": 5130 }, { "epoch": 0.4389645049372832, "grad_norm": 0.6145151853561401, "learning_rate": 5.610214365018362e-06, "loss": 2.0416, "step": 5140 }, { "epoch": 0.43981852148385375, "grad_norm": 0.5999618768692017, "learning_rate": 5.601673926039799e-06, "loss": 2.0692, "step": 5150 }, { "epoch": 0.44067253803042433, "grad_norm": 0.5729487538337708, "learning_rate": 5.593133487061235e-06, "loss": 2.0002, "step": 5160 }, { "epoch": 0.4415265545769949, "grad_norm": 0.5832698941230774, "learning_rate": 5.584593048082671e-06, "loss": 2.0488, "step": 5170 }, { "epoch": 0.4423805711235655, "grad_norm": 0.6237705945968628, "learning_rate": 5.576052609104109e-06, "loss": 2.0394, "step": 5180 }, { "epoch": 0.4432345876701361, "grad_norm": 0.5795707106590271, "learning_rate": 5.567512170125545e-06, "loss": 2.0547, "step": 5190 }, { "epoch": 0.4440886042167067, "grad_norm": 0.5339071154594421, "learning_rate": 5.558971731146982e-06, "loss": 2.0555, "step": 5200 }, { "epoch": 0.4440886042167067, "eval_loss": 2.0713462829589844, "eval_runtime": 99.056, "eval_samples_per_second": 10.095, "eval_steps_per_second": 5.048, "step": 5200 }, { "epoch": 0.4449426207632773, "grad_norm": 0.5246272087097168, "learning_rate": 5.550431292168418e-06, "loss": 2.0571, "step": 5210 }, { "epoch": 0.44579663730984787, "grad_norm": 0.5855641961097717, "learning_rate": 5.541890853189854e-06, "loss": 2.0425, "step": 5220 }, { "epoch": 0.44665065385641844, "grad_norm": 0.6182217001914978, "learning_rate": 5.53335041421129e-06, "loss": 2.0631, "step": 5230 }, { "epoch": 0.4475046704029891, "grad_norm": 0.5944363474845886, "learning_rate": 5.524809975232728e-06, "loss": 2.0234, "step": 5240 }, { "epoch": 0.44835868694955966, "grad_norm": 0.5854187607765198, "learning_rate": 5.516269536254164e-06, "loss": 2.028, "step": 5250 }, { "epoch": 0.44921270349613024, "grad_norm": 0.5205993056297302, "learning_rate": 5.507729097275601e-06, "loss": 2.0795, "step": 5260 }, { "epoch": 0.4500667200427008, "grad_norm": 0.6661815047264099, "learning_rate": 5.499188658297037e-06, "loss": 2.0647, "step": 5270 }, { "epoch": 0.4509207365892714, "grad_norm": 0.5682797431945801, "learning_rate": 5.490648219318473e-06, "loss": 2.0187, "step": 5280 }, { "epoch": 0.45177475313584203, "grad_norm": 0.6261969208717346, "learning_rate": 5.48210778033991e-06, "loss": 2.0437, "step": 5290 }, { "epoch": 0.4526287696824126, "grad_norm": 0.6430942416191101, "learning_rate": 5.473567341361346e-06, "loss": 2.0278, "step": 5300 }, { "epoch": 0.4526287696824126, "eval_loss": 2.070575714111328, "eval_runtime": 98.6678, "eval_samples_per_second": 10.135, "eval_steps_per_second": 5.068, "step": 5300 }, { "epoch": 0.4534827862289832, "grad_norm": 0.5734032988548279, "learning_rate": 5.465026902382783e-06, "loss": 2.055, "step": 5310 }, { "epoch": 0.45433680277555377, "grad_norm": 0.5649511218070984, "learning_rate": 5.456486463404219e-06, "loss": 2.0769, "step": 5320 }, { "epoch": 0.45519081932212435, "grad_norm": 0.5609380006790161, "learning_rate": 5.447946024425655e-06, "loss": 2.0287, "step": 5330 }, { "epoch": 0.45604483586869493, "grad_norm": 0.5055214166641235, "learning_rate": 5.439405585447093e-06, "loss": 2.0595, "step": 5340 }, { "epoch": 0.45689885241526557, "grad_norm": 0.6321738362312317, "learning_rate": 5.430865146468529e-06, "loss": 2.0208, "step": 5350 }, { "epoch": 0.45775286896183615, "grad_norm": 0.7255424857139587, "learning_rate": 5.422324707489965e-06, "loss": 2.0044, "step": 5360 }, { "epoch": 0.4586068855084067, "grad_norm": 0.6330071091651917, "learning_rate": 5.413784268511402e-06, "loss": 2.0307, "step": 5370 }, { "epoch": 0.4594609020549773, "grad_norm": 0.49322399497032166, "learning_rate": 5.405243829532838e-06, "loss": 2.0655, "step": 5380 }, { "epoch": 0.4603149186015479, "grad_norm": 0.5671298503875732, "learning_rate": 5.396703390554276e-06, "loss": 1.9937, "step": 5390 }, { "epoch": 0.4611689351481185, "grad_norm": 0.6089677810668945, "learning_rate": 5.388162951575712e-06, "loss": 2.0664, "step": 5400 }, { "epoch": 0.4611689351481185, "eval_loss": 2.0699245929718018, "eval_runtime": 98.8733, "eval_samples_per_second": 10.114, "eval_steps_per_second": 5.057, "step": 5400 }, { "epoch": 0.4620229516946891, "grad_norm": 0.6179537177085876, "learning_rate": 5.379622512597148e-06, "loss": 2.0713, "step": 5410 }, { "epoch": 0.4628769682412597, "grad_norm": 0.5245445966720581, "learning_rate": 5.3710820736185846e-06, "loss": 2.0533, "step": 5420 }, { "epoch": 0.46373098478783026, "grad_norm": 0.5217622518539429, "learning_rate": 5.362541634640021e-06, "loss": 2.069, "step": 5430 }, { "epoch": 0.46458500133440084, "grad_norm": 0.561970591545105, "learning_rate": 5.354001195661458e-06, "loss": 2.0622, "step": 5440 }, { "epoch": 0.4654390178809714, "grad_norm": 0.4817509651184082, "learning_rate": 5.345460756682894e-06, "loss": 2.0504, "step": 5450 }, { "epoch": 0.46629303442754205, "grad_norm": 0.5928997993469238, "learning_rate": 5.33692031770433e-06, "loss": 2.0416, "step": 5460 }, { "epoch": 0.46714705097411263, "grad_norm": 0.5909265875816345, "learning_rate": 5.328379878725766e-06, "loss": 2.0193, "step": 5470 }, { "epoch": 0.4680010675206832, "grad_norm": 0.63572096824646, "learning_rate": 5.319839439747203e-06, "loss": 2.0607, "step": 5480 }, { "epoch": 0.4688550840672538, "grad_norm": 0.5362561941146851, "learning_rate": 5.31129900076864e-06, "loss": 2.0396, "step": 5490 }, { "epoch": 0.46970910061382437, "grad_norm": 0.5716733336448669, "learning_rate": 5.302758561790077e-06, "loss": 2.0624, "step": 5500 }, { "epoch": 0.46970910061382437, "eval_loss": 2.0693435668945312, "eval_runtime": 98.7515, "eval_samples_per_second": 10.126, "eval_steps_per_second": 5.063, "step": 5500 }, { "epoch": 0.470563117160395, "grad_norm": 0.5095422863960266, "learning_rate": 5.294218122811513e-06, "loss": 2.0081, "step": 5510 }, { "epoch": 0.4714171337069656, "grad_norm": 0.6708410382270813, "learning_rate": 5.285677683832949e-06, "loss": 2.0545, "step": 5520 }, { "epoch": 0.47227115025353616, "grad_norm": 0.6041153073310852, "learning_rate": 5.277137244854386e-06, "loss": 2.0504, "step": 5530 }, { "epoch": 0.47312516680010674, "grad_norm": 0.5644756555557251, "learning_rate": 5.268596805875822e-06, "loss": 2.0685, "step": 5540 }, { "epoch": 0.4739791833466773, "grad_norm": 0.5705012679100037, "learning_rate": 5.26005636689726e-06, "loss": 2.0169, "step": 5550 }, { "epoch": 0.47483319989324796, "grad_norm": 0.5327626466751099, "learning_rate": 5.251515927918696e-06, "loss": 2.0637, "step": 5560 }, { "epoch": 0.47568721643981854, "grad_norm": 0.6007681488990784, "learning_rate": 5.242975488940132e-06, "loss": 2.0452, "step": 5570 }, { "epoch": 0.4765412329863891, "grad_norm": 0.5735289454460144, "learning_rate": 5.2344350499615685e-06, "loss": 2.0617, "step": 5580 }, { "epoch": 0.4773952495329597, "grad_norm": 0.5276105403900146, "learning_rate": 5.2258946109830046e-06, "loss": 2.0742, "step": 5590 }, { "epoch": 0.4782492660795303, "grad_norm": 0.7130099534988403, "learning_rate": 5.217354172004441e-06, "loss": 2.0468, "step": 5600 }, { "epoch": 0.4782492660795303, "eval_loss": 2.0682835578918457, "eval_runtime": 99.1121, "eval_samples_per_second": 10.09, "eval_steps_per_second": 5.045, "step": 5600 }, { "epoch": 0.47910328262610086, "grad_norm": 0.5376310348510742, "learning_rate": 5.208813733025878e-06, "loss": 2.0501, "step": 5610 }, { "epoch": 0.4799572991726715, "grad_norm": 0.5130000114440918, "learning_rate": 5.200273294047314e-06, "loss": 2.044, "step": 5620 }, { "epoch": 0.48081131571924207, "grad_norm": 0.5238226056098938, "learning_rate": 5.19173285506875e-06, "loss": 2.048, "step": 5630 }, { "epoch": 0.48166533226581265, "grad_norm": 0.5614004135131836, "learning_rate": 5.183192416090187e-06, "loss": 2.0333, "step": 5640 }, { "epoch": 0.48251934881238323, "grad_norm": 0.5451902747154236, "learning_rate": 5.174651977111624e-06, "loss": 2.0219, "step": 5650 }, { "epoch": 0.4833733653589538, "grad_norm": 0.6397396326065063, "learning_rate": 5.166111538133061e-06, "loss": 2.0264, "step": 5660 }, { "epoch": 0.48422738190552445, "grad_norm": 0.517250657081604, "learning_rate": 5.157571099154497e-06, "loss": 2.0612, "step": 5670 }, { "epoch": 0.485081398452095, "grad_norm": 0.5595155358314514, "learning_rate": 5.149030660175933e-06, "loss": 2.0297, "step": 5680 }, { "epoch": 0.4859354149986656, "grad_norm": 0.5949190855026245, "learning_rate": 5.14049022119737e-06, "loss": 2.0485, "step": 5690 }, { "epoch": 0.4867894315452362, "grad_norm": 0.5923450589179993, "learning_rate": 5.131949782218807e-06, "loss": 2.0562, "step": 5700 }, { "epoch": 0.4867894315452362, "eval_loss": 2.0683388710021973, "eval_runtime": 98.8272, "eval_samples_per_second": 10.119, "eval_steps_per_second": 5.059, "step": 5700 }, { "epoch": 0.48764344809180676, "grad_norm": 0.551146924495697, "learning_rate": 5.123409343240244e-06, "loss": 2.0274, "step": 5710 }, { "epoch": 0.48849746463837734, "grad_norm": 0.6246230602264404, "learning_rate": 5.11486890426168e-06, "loss": 2.0715, "step": 5720 }, { "epoch": 0.489351481184948, "grad_norm": 0.5968629121780396, "learning_rate": 5.106328465283116e-06, "loss": 2.0357, "step": 5730 }, { "epoch": 0.49020549773151856, "grad_norm": 0.5812126994132996, "learning_rate": 5.097788026304552e-06, "loss": 2.0391, "step": 5740 }, { "epoch": 0.49105951427808914, "grad_norm": 0.5794074535369873, "learning_rate": 5.089247587325989e-06, "loss": 2.0282, "step": 5750 }, { "epoch": 0.4919135308246597, "grad_norm": 0.542427122592926, "learning_rate": 5.080707148347425e-06, "loss": 2.0375, "step": 5760 }, { "epoch": 0.4927675473712303, "grad_norm": 0.601831316947937, "learning_rate": 5.072166709368862e-06, "loss": 2.0626, "step": 5770 }, { "epoch": 0.49362156391780093, "grad_norm": 0.5945174098014832, "learning_rate": 5.063626270390298e-06, "loss": 2.029, "step": 5780 }, { "epoch": 0.4944755804643715, "grad_norm": 0.5705648064613342, "learning_rate": 5.055085831411734e-06, "loss": 2.0753, "step": 5790 }, { "epoch": 0.4953295970109421, "grad_norm": 0.5841405391693115, "learning_rate": 5.046545392433172e-06, "loss": 2.0684, "step": 5800 }, { "epoch": 0.4953295970109421, "eval_loss": 2.0675156116485596, "eval_runtime": 98.7614, "eval_samples_per_second": 10.125, "eval_steps_per_second": 5.063, "step": 5800 }, { "epoch": 0.49618361355751267, "grad_norm": 0.5943530797958374, "learning_rate": 5.038004953454608e-06, "loss": 2.0203, "step": 5810 }, { "epoch": 0.49703763010408325, "grad_norm": 0.5854504704475403, "learning_rate": 5.029464514476045e-06, "loss": 2.0291, "step": 5820 }, { "epoch": 0.49789164665065383, "grad_norm": 0.5930184721946716, "learning_rate": 5.020924075497481e-06, "loss": 2.0685, "step": 5830 }, { "epoch": 0.49874566319722446, "grad_norm": 0.5872290730476379, "learning_rate": 5.012383636518917e-06, "loss": 2.0351, "step": 5840 }, { "epoch": 0.49959967974379504, "grad_norm": 0.5630264282226562, "learning_rate": 5.003843197540355e-06, "loss": 2.047, "step": 5850 }, { "epoch": 0.5004536962903656, "grad_norm": 0.5513464212417603, "learning_rate": 4.99530275856179e-06, "loss": 2.044, "step": 5860 }, { "epoch": 0.5013077128369362, "grad_norm": 0.6369175314903259, "learning_rate": 4.986762319583227e-06, "loss": 2.0288, "step": 5870 }, { "epoch": 0.5021617293835068, "grad_norm": 0.5816603899002075, "learning_rate": 4.978221880604664e-06, "loss": 2.0566, "step": 5880 }, { "epoch": 0.5030157459300774, "grad_norm": 0.5797505974769592, "learning_rate": 4.9696814416261004e-06, "loss": 2.0276, "step": 5890 }, { "epoch": 0.5038697624766479, "grad_norm": 0.5142800211906433, "learning_rate": 4.9611410026475365e-06, "loss": 2.0813, "step": 5900 }, { "epoch": 0.5038697624766479, "eval_loss": 2.0671706199645996, "eval_runtime": 98.8372, "eval_samples_per_second": 10.118, "eval_steps_per_second": 5.059, "step": 5900 }, { "epoch": 0.5047237790232185, "grad_norm": 0.6141909956932068, "learning_rate": 4.9526005636689725e-06, "loss": 2.0213, "step": 5910 }, { "epoch": 0.5055777955697892, "grad_norm": 0.5329708456993103, "learning_rate": 4.944060124690409e-06, "loss": 2.0479, "step": 5920 }, { "epoch": 0.5064318121163598, "grad_norm": 0.5788347125053406, "learning_rate": 4.935519685711846e-06, "loss": 2.054, "step": 5930 }, { "epoch": 0.5072858286629304, "grad_norm": 0.6542540788650513, "learning_rate": 4.926979246733283e-06, "loss": 2.0455, "step": 5940 }, { "epoch": 0.508139845209501, "grad_norm": 0.5416275858879089, "learning_rate": 4.918438807754719e-06, "loss": 2.0591, "step": 5950 }, { "epoch": 0.5089938617560715, "grad_norm": 0.5104756951332092, "learning_rate": 4.909898368776155e-06, "loss": 2.0029, "step": 5960 }, { "epoch": 0.5098478783026421, "grad_norm": 0.6373957395553589, "learning_rate": 4.901357929797592e-06, "loss": 2.0327, "step": 5970 }, { "epoch": 0.5107018948492127, "grad_norm": 0.5671820640563965, "learning_rate": 4.892817490819029e-06, "loss": 2.0184, "step": 5980 }, { "epoch": 0.5115559113957833, "grad_norm": 0.5045920610427856, "learning_rate": 4.884277051840465e-06, "loss": 2.0655, "step": 5990 }, { "epoch": 0.5124099279423538, "grad_norm": 0.6044976115226746, "learning_rate": 4.875736612861902e-06, "loss": 2.0821, "step": 6000 }, { "epoch": 0.5124099279423538, "eval_loss": 2.0663814544677734, "eval_runtime": 99.3571, "eval_samples_per_second": 10.065, "eval_steps_per_second": 5.032, "step": 6000 }, { "epoch": 0.5132639444889244, "grad_norm": 0.5572605133056641, "learning_rate": 4.867196173883338e-06, "loss": 2.0422, "step": 6010 }, { "epoch": 0.514117961035495, "grad_norm": 0.5898503065109253, "learning_rate": 4.858655734904775e-06, "loss": 2.0607, "step": 6020 }, { "epoch": 0.5149719775820657, "grad_norm": 0.5786698460578918, "learning_rate": 4.850115295926211e-06, "loss": 2.0438, "step": 6030 }, { "epoch": 0.5158259941286363, "grad_norm": 0.5419963002204895, "learning_rate": 4.8415748569476475e-06, "loss": 2.0746, "step": 6040 }, { "epoch": 0.5166800106752069, "grad_norm": 0.5842531323432922, "learning_rate": 4.833034417969084e-06, "loss": 2.038, "step": 6050 }, { "epoch": 0.5175340272217774, "grad_norm": 0.6030822396278381, "learning_rate": 4.8244939789905204e-06, "loss": 2.0576, "step": 6060 }, { "epoch": 0.518388043768348, "grad_norm": 0.5286454558372498, "learning_rate": 4.815953540011957e-06, "loss": 2.0291, "step": 6070 }, { "epoch": 0.5192420603149186, "grad_norm": 0.5508376955986023, "learning_rate": 4.807413101033393e-06, "loss": 2.0423, "step": 6080 }, { "epoch": 0.5200960768614892, "grad_norm": 0.5382469296455383, "learning_rate": 4.79887266205483e-06, "loss": 2.0574, "step": 6090 }, { "epoch": 0.5209500934080598, "grad_norm": 0.5823044776916504, "learning_rate": 4.790332223076267e-06, "loss": 2.0456, "step": 6100 }, { "epoch": 0.5209500934080598, "eval_loss": 2.0655107498168945, "eval_runtime": 99.3273, "eval_samples_per_second": 10.068, "eval_steps_per_second": 5.034, "step": 6100 }, { "epoch": 0.5218041099546303, "grad_norm": 0.5577982664108276, "learning_rate": 4.781791784097703e-06, "loss": 2.0618, "step": 6110 }, { "epoch": 0.5226581265012009, "grad_norm": 0.5563845634460449, "learning_rate": 4.77325134511914e-06, "loss": 2.0557, "step": 6120 }, { "epoch": 0.5235121430477716, "grad_norm": 0.6378036737442017, "learning_rate": 4.764710906140576e-06, "loss": 2.0338, "step": 6130 }, { "epoch": 0.5243661595943422, "grad_norm": 0.6540238857269287, "learning_rate": 4.756170467162012e-06, "loss": 2.0695, "step": 6140 }, { "epoch": 0.5252201761409128, "grad_norm": 0.5159985423088074, "learning_rate": 4.747630028183449e-06, "loss": 2.0505, "step": 6150 }, { "epoch": 0.5260741926874833, "grad_norm": 0.5910516381263733, "learning_rate": 4.739089589204886e-06, "loss": 2.0328, "step": 6160 }, { "epoch": 0.5269282092340539, "grad_norm": 0.6169615983963013, "learning_rate": 4.730549150226322e-06, "loss": 2.0317, "step": 6170 }, { "epoch": 0.5277822257806245, "grad_norm": 0.5604709982872009, "learning_rate": 4.722008711247759e-06, "loss": 2.0395, "step": 6180 }, { "epoch": 0.5286362423271951, "grad_norm": 0.5917683839797974, "learning_rate": 4.713468272269195e-06, "loss": 2.0571, "step": 6190 }, { "epoch": 0.5294902588737657, "grad_norm": 0.6007024645805359, "learning_rate": 4.7049278332906315e-06, "loss": 2.0755, "step": 6200 }, { "epoch": 0.5294902588737657, "eval_loss": 2.065096616744995, "eval_runtime": 99.2711, "eval_samples_per_second": 10.073, "eval_steps_per_second": 5.037, "step": 6200 }, { "epoch": 0.5303442754203362, "grad_norm": 0.6155685782432556, "learning_rate": 4.696387394312068e-06, "loss": 2.0576, "step": 6210 }, { "epoch": 0.5311982919669068, "grad_norm": 0.5749043226242065, "learning_rate": 4.687846955333504e-06, "loss": 2.0291, "step": 6220 }, { "epoch": 0.5320523085134774, "grad_norm": 0.6788541674613953, "learning_rate": 4.679306516354941e-06, "loss": 2.0164, "step": 6230 }, { "epoch": 0.5329063250600481, "grad_norm": 0.5900527834892273, "learning_rate": 4.670766077376377e-06, "loss": 2.0543, "step": 6240 }, { "epoch": 0.5337603416066187, "grad_norm": 0.5669994354248047, "learning_rate": 4.662225638397814e-06, "loss": 2.0272, "step": 6250 }, { "epoch": 0.5346143581531893, "grad_norm": 0.5225370526313782, "learning_rate": 4.65368519941925e-06, "loss": 2.0891, "step": 6260 }, { "epoch": 0.5354683746997598, "grad_norm": 0.5951294898986816, "learning_rate": 4.645144760440687e-06, "loss": 2.0397, "step": 6270 }, { "epoch": 0.5363223912463304, "grad_norm": 0.5531771183013916, "learning_rate": 4.636604321462124e-06, "loss": 2.0515, "step": 6280 }, { "epoch": 0.537176407792901, "grad_norm": 0.53160160779953, "learning_rate": 4.62806388248356e-06, "loss": 2.0721, "step": 6290 }, { "epoch": 0.5380304243394716, "grad_norm": 0.5429141521453857, "learning_rate": 4.619523443504996e-06, "loss": 2.0588, "step": 6300 }, { "epoch": 0.5380304243394716, "eval_loss": 2.06453275680542, "eval_runtime": 99.2144, "eval_samples_per_second": 10.079, "eval_steps_per_second": 5.04, "step": 6300 }, { "epoch": 0.5388844408860421, "grad_norm": 0.4986340403556824, "learning_rate": 4.610983004526433e-06, "loss": 2.0166, "step": 6310 }, { "epoch": 0.5397384574326127, "grad_norm": 0.5342271327972412, "learning_rate": 4.60244256554787e-06, "loss": 2.0259, "step": 6320 }, { "epoch": 0.5405924739791833, "grad_norm": 0.6318298578262329, "learning_rate": 4.5939021265693066e-06, "loss": 2.0369, "step": 6330 }, { "epoch": 0.5414464905257539, "grad_norm": 0.5513337254524231, "learning_rate": 4.585361687590743e-06, "loss": 2.0554, "step": 6340 }, { "epoch": 0.5423005070723246, "grad_norm": 0.6259679198265076, "learning_rate": 4.576821248612179e-06, "loss": 2.0331, "step": 6350 }, { "epoch": 0.5431545236188952, "grad_norm": 0.5690695643424988, "learning_rate": 4.5682808096336155e-06, "loss": 2.0492, "step": 6360 }, { "epoch": 0.5440085401654657, "grad_norm": 0.616737425327301, "learning_rate": 4.559740370655052e-06, "loss": 2.0283, "step": 6370 }, { "epoch": 0.5448625567120363, "grad_norm": 0.6700045466423035, "learning_rate": 4.551199931676488e-06, "loss": 2.0369, "step": 6380 }, { "epoch": 0.5457165732586069, "grad_norm": 0.5789743065834045, "learning_rate": 4.542659492697925e-06, "loss": 2.0513, "step": 6390 }, { "epoch": 0.5465705898051775, "grad_norm": 0.503028929233551, "learning_rate": 4.534119053719361e-06, "loss": 2.0433, "step": 6400 }, { "epoch": 0.5465705898051775, "eval_loss": 2.0636653900146484, "eval_runtime": 99.1503, "eval_samples_per_second": 10.086, "eval_steps_per_second": 5.043, "step": 6400 }, { "epoch": 0.547424606351748, "grad_norm": 0.5215557217597961, "learning_rate": 4.525578614740798e-06, "loss": 2.0615, "step": 6410 }, { "epoch": 0.5482786228983186, "grad_norm": 0.5647512078285217, "learning_rate": 4.517038175762234e-06, "loss": 1.9944, "step": 6420 }, { "epoch": 0.5491326394448892, "grad_norm": 0.5839424133300781, "learning_rate": 4.508497736783671e-06, "loss": 2.0645, "step": 6430 }, { "epoch": 0.5499866559914598, "grad_norm": 0.5427899360656738, "learning_rate": 4.499957297805108e-06, "loss": 2.0424, "step": 6440 }, { "epoch": 0.5508406725380304, "grad_norm": 0.5012800097465515, "learning_rate": 4.491416858826544e-06, "loss": 2.0631, "step": 6450 }, { "epoch": 0.5516946890846011, "grad_norm": 0.5525136590003967, "learning_rate": 4.482876419847981e-06, "loss": 2.0479, "step": 6460 }, { "epoch": 0.5525487056311716, "grad_norm": 0.5929039716720581, "learning_rate": 4.474335980869417e-06, "loss": 2.0691, "step": 6470 }, { "epoch": 0.5534027221777422, "grad_norm": 0.603726863861084, "learning_rate": 4.465795541890854e-06, "loss": 2.0499, "step": 6480 }, { "epoch": 0.5542567387243128, "grad_norm": 0.540930449962616, "learning_rate": 4.4572551029122905e-06, "loss": 2.0402, "step": 6490 }, { "epoch": 0.5551107552708834, "grad_norm": 0.5366859436035156, "learning_rate": 4.4487146639337266e-06, "loss": 2.0181, "step": 6500 }, { "epoch": 0.5551107552708834, "eval_loss": 2.063614845275879, "eval_runtime": 99.0169, "eval_samples_per_second": 10.099, "eval_steps_per_second": 5.05, "step": 6500 }, { "epoch": 0.555964771817454, "grad_norm": 0.5817170143127441, "learning_rate": 4.4401742249551634e-06, "loss": 2.0233, "step": 6510 }, { "epoch": 0.5568187883640245, "grad_norm": 0.5765772461891174, "learning_rate": 4.4316337859765995e-06, "loss": 2.0452, "step": 6520 }, { "epoch": 0.5576728049105951, "grad_norm": 0.5237901210784912, "learning_rate": 4.4230933469980355e-06, "loss": 2.0444, "step": 6530 }, { "epoch": 0.5585268214571657, "grad_norm": 0.5542150735855103, "learning_rate": 4.414552908019472e-06, "loss": 2.0409, "step": 6540 }, { "epoch": 0.5593808380037363, "grad_norm": 0.5346140265464783, "learning_rate": 4.406012469040909e-06, "loss": 2.0505, "step": 6550 }, { "epoch": 0.5602348545503069, "grad_norm": 0.5194046497344971, "learning_rate": 4.397472030062346e-06, "loss": 2.0303, "step": 6560 }, { "epoch": 0.5610888710968776, "grad_norm": 0.6005309820175171, "learning_rate": 4.388931591083782e-06, "loss": 2.0443, "step": 6570 }, { "epoch": 0.5619428876434481, "grad_norm": 0.5340930223464966, "learning_rate": 4.380391152105218e-06, "loss": 2.0599, "step": 6580 }, { "epoch": 0.5627969041900187, "grad_norm": 0.5244272947311401, "learning_rate": 4.371850713126655e-06, "loss": 2.0311, "step": 6590 }, { "epoch": 0.5636509207365893, "grad_norm": 0.5906960368156433, "learning_rate": 4.363310274148092e-06, "loss": 2.0282, "step": 6600 }, { "epoch": 0.5636509207365893, "eval_loss": 2.062767744064331, "eval_runtime": 98.8722, "eval_samples_per_second": 10.114, "eval_steps_per_second": 5.057, "step": 6600 }, { "epoch": 0.5645049372831599, "grad_norm": 0.5671685934066772, "learning_rate": 4.354769835169529e-06, "loss": 2.0688, "step": 6610 }, { "epoch": 0.5653589538297304, "grad_norm": 0.5608102083206177, "learning_rate": 4.346229396190965e-06, "loss": 2.0465, "step": 6620 }, { "epoch": 0.566212970376301, "grad_norm": 0.5790672898292542, "learning_rate": 4.337688957212401e-06, "loss": 2.0184, "step": 6630 }, { "epoch": 0.5670669869228716, "grad_norm": 0.5642273426055908, "learning_rate": 4.329148518233838e-06, "loss": 2.0271, "step": 6640 }, { "epoch": 0.5679210034694422, "grad_norm": 0.5485665202140808, "learning_rate": 4.320608079255274e-06, "loss": 2.0533, "step": 6650 }, { "epoch": 0.5687750200160128, "grad_norm": 0.6455518007278442, "learning_rate": 4.3120676402767105e-06, "loss": 2.0465, "step": 6660 }, { "epoch": 0.5696290365625833, "grad_norm": 0.5416640639305115, "learning_rate": 4.303527201298147e-06, "loss": 2.0891, "step": 6670 }, { "epoch": 0.570483053109154, "grad_norm": 0.5357679724693298, "learning_rate": 4.294986762319583e-06, "loss": 2.0335, "step": 6680 }, { "epoch": 0.5713370696557246, "grad_norm": 0.5133402347564697, "learning_rate": 4.28644632334102e-06, "loss": 2.04, "step": 6690 }, { "epoch": 0.5721910862022952, "grad_norm": 0.5662258267402649, "learning_rate": 4.277905884362456e-06, "loss": 2.0373, "step": 6700 }, { "epoch": 0.5721910862022952, "eval_loss": 2.0626380443573, "eval_runtime": 99.1808, "eval_samples_per_second": 10.083, "eval_steps_per_second": 5.041, "step": 6700 }, { "epoch": 0.5730451027488658, "grad_norm": 0.5616562962532043, "learning_rate": 4.269365445383893e-06, "loss": 2.0597, "step": 6710 }, { "epoch": 0.5738991192954364, "grad_norm": 0.5565703511238098, "learning_rate": 4.26082500640533e-06, "loss": 2.035, "step": 6720 }, { "epoch": 0.5747531358420069, "grad_norm": 0.6030809879302979, "learning_rate": 4.252284567426766e-06, "loss": 2.0444, "step": 6730 }, { "epoch": 0.5756071523885775, "grad_norm": 0.5424668192863464, "learning_rate": 4.243744128448203e-06, "loss": 2.0534, "step": 6740 }, { "epoch": 0.5764611689351481, "grad_norm": 0.5354906916618347, "learning_rate": 4.235203689469639e-06, "loss": 2.033, "step": 6750 }, { "epoch": 0.5773151854817187, "grad_norm": 0.5503740906715393, "learning_rate": 4.226663250491075e-06, "loss": 2.0309, "step": 6760 }, { "epoch": 0.5781692020282893, "grad_norm": 0.600864827632904, "learning_rate": 4.218122811512512e-06, "loss": 2.0531, "step": 6770 }, { "epoch": 0.5790232185748598, "grad_norm": 0.6190809607505798, "learning_rate": 4.209582372533949e-06, "loss": 2.0308, "step": 6780 }, { "epoch": 0.5798772351214305, "grad_norm": 0.5197070837020874, "learning_rate": 4.201041933555386e-06, "loss": 2.0396, "step": 6790 }, { "epoch": 0.5807312516680011, "grad_norm": 0.6103793978691101, "learning_rate": 4.192501494576822e-06, "loss": 2.0258, "step": 6800 }, { "epoch": 0.5807312516680011, "eval_loss": 2.062330722808838, "eval_runtime": 99.1236, "eval_samples_per_second": 10.088, "eval_steps_per_second": 5.044, "step": 6800 }, { "epoch": 0.5815852682145717, "grad_norm": 0.540984570980072, "learning_rate": 4.183961055598258e-06, "loss": 2.0359, "step": 6810 }, { "epoch": 0.5824392847611423, "grad_norm": 0.6142724752426147, "learning_rate": 4.1754206166196945e-06, "loss": 2.0433, "step": 6820 }, { "epoch": 0.5832933013077128, "grad_norm": 0.5233826041221619, "learning_rate": 4.166880177641131e-06, "loss": 2.0364, "step": 6830 }, { "epoch": 0.5841473178542834, "grad_norm": 0.6223013997077942, "learning_rate": 4.158339738662567e-06, "loss": 2.0555, "step": 6840 }, { "epoch": 0.585001334400854, "grad_norm": 0.6881380081176758, "learning_rate": 4.149799299684004e-06, "loss": 2.0529, "step": 6850 }, { "epoch": 0.5858553509474246, "grad_norm": 0.60942143201828, "learning_rate": 4.14125886070544e-06, "loss": 2.0601, "step": 6860 }, { "epoch": 0.5867093674939952, "grad_norm": 0.5089098811149597, "learning_rate": 4.132718421726877e-06, "loss": 2.0576, "step": 6870 }, { "epoch": 0.5875633840405657, "grad_norm": 0.5346848964691162, "learning_rate": 4.124177982748314e-06, "loss": 2.0333, "step": 6880 }, { "epoch": 0.5884174005871364, "grad_norm": 0.576016902923584, "learning_rate": 4.11563754376975e-06, "loss": 2.0475, "step": 6890 }, { "epoch": 0.589271417133707, "grad_norm": 0.5049400925636292, "learning_rate": 4.107097104791187e-06, "loss": 2.0424, "step": 6900 }, { "epoch": 0.589271417133707, "eval_loss": 2.061659097671509, "eval_runtime": 99.083, "eval_samples_per_second": 10.093, "eval_steps_per_second": 5.046, "step": 6900 }, { "epoch": 0.5901254336802776, "grad_norm": 0.538563072681427, "learning_rate": 4.098556665812623e-06, "loss": 2.0278, "step": 6910 }, { "epoch": 0.5909794502268482, "grad_norm": 0.631867527961731, "learning_rate": 4.09001622683406e-06, "loss": 2.0441, "step": 6920 }, { "epoch": 0.5918334667734187, "grad_norm": 0.6422947645187378, "learning_rate": 4.081475787855496e-06, "loss": 2.0301, "step": 6930 }, { "epoch": 0.5926874833199893, "grad_norm": 0.6247473955154419, "learning_rate": 4.072935348876933e-06, "loss": 2.0079, "step": 6940 }, { "epoch": 0.5935414998665599, "grad_norm": 0.5848476886749268, "learning_rate": 4.0643949098983696e-06, "loss": 2.014, "step": 6950 }, { "epoch": 0.5943955164131305, "grad_norm": 0.5247405171394348, "learning_rate": 4.055854470919806e-06, "loss": 2.0357, "step": 6960 }, { "epoch": 0.5952495329597011, "grad_norm": 0.5828875303268433, "learning_rate": 4.047314031941242e-06, "loss": 2.0203, "step": 6970 }, { "epoch": 0.5961035495062716, "grad_norm": 0.5308778285980225, "learning_rate": 4.0387735929626785e-06, "loss": 2.0446, "step": 6980 }, { "epoch": 0.5969575660528422, "grad_norm": 0.6658284068107605, "learning_rate": 4.030233153984115e-06, "loss": 2.076, "step": 6990 }, { "epoch": 0.5978115825994129, "grad_norm": 0.5998433828353882, "learning_rate": 4.021692715005552e-06, "loss": 2.0472, "step": 7000 }, { "epoch": 0.5978115825994129, "eval_loss": 2.0610265731811523, "eval_runtime": 99.1916, "eval_samples_per_second": 10.081, "eval_steps_per_second": 5.041, "step": 7000 }, { "epoch": 0.5986655991459835, "grad_norm": 0.6604376435279846, "learning_rate": 4.013152276026988e-06, "loss": 2.0522, "step": 7010 }, { "epoch": 0.5995196156925541, "grad_norm": 0.5252701640129089, "learning_rate": 4.004611837048424e-06, "loss": 2.0546, "step": 7020 }, { "epoch": 0.6003736322391247, "grad_norm": 0.5231185555458069, "learning_rate": 3.996071398069861e-06, "loss": 2.0597, "step": 7030 }, { "epoch": 0.6012276487856952, "grad_norm": 0.5638464093208313, "learning_rate": 3.987530959091297e-06, "loss": 2.0253, "step": 7040 }, { "epoch": 0.6020816653322658, "grad_norm": 0.6865386962890625, "learning_rate": 3.978990520112734e-06, "loss": 2.0161, "step": 7050 }, { "epoch": 0.6029356818788364, "grad_norm": 0.5177867412567139, "learning_rate": 3.970450081134171e-06, "loss": 2.0625, "step": 7060 }, { "epoch": 0.603789698425407, "grad_norm": 0.505166232585907, "learning_rate": 3.961909642155607e-06, "loss": 2.0391, "step": 7070 }, { "epoch": 0.6046437149719776, "grad_norm": 0.5872111320495605, "learning_rate": 3.953369203177044e-06, "loss": 2.0006, "step": 7080 }, { "epoch": 0.6054977315185481, "grad_norm": 0.5561593770980835, "learning_rate": 3.94482876419848e-06, "loss": 2.0046, "step": 7090 }, { "epoch": 0.6063517480651187, "grad_norm": 0.4805944263935089, "learning_rate": 3.936288325219917e-06, "loss": 2.0452, "step": 7100 }, { "epoch": 0.6063517480651187, "eval_loss": 2.0605008602142334, "eval_runtime": 98.9828, "eval_samples_per_second": 10.103, "eval_steps_per_second": 5.051, "step": 7100 }, { "epoch": 0.6072057646116894, "grad_norm": 0.4770943820476532, "learning_rate": 3.9277478862413535e-06, "loss": 2.0427, "step": 7110 }, { "epoch": 0.60805978115826, "grad_norm": 0.48157086968421936, "learning_rate": 3.9192074472627895e-06, "loss": 2.0401, "step": 7120 }, { "epoch": 0.6089137977048306, "grad_norm": 0.48966723680496216, "learning_rate": 3.910667008284226e-06, "loss": 2.0389, "step": 7130 }, { "epoch": 0.6097678142514011, "grad_norm": 0.5447493195533752, "learning_rate": 3.9021265693056624e-06, "loss": 2.0206, "step": 7140 }, { "epoch": 0.6106218307979717, "grad_norm": 0.5208780169487, "learning_rate": 3.8935861303270985e-06, "loss": 2.0661, "step": 7150 }, { "epoch": 0.6114758473445423, "grad_norm": 0.548076868057251, "learning_rate": 3.885045691348535e-06, "loss": 2.0497, "step": 7160 }, { "epoch": 0.6123298638911129, "grad_norm": 0.5473942160606384, "learning_rate": 3.876505252369972e-06, "loss": 2.0206, "step": 7170 }, { "epoch": 0.6131838804376835, "grad_norm": 0.6074666976928711, "learning_rate": 3.867964813391409e-06, "loss": 2.0296, "step": 7180 }, { "epoch": 0.614037896984254, "grad_norm": 0.5882102847099304, "learning_rate": 3.859424374412845e-06, "loss": 2.039, "step": 7190 }, { "epoch": 0.6148919135308246, "grad_norm": 0.6144769787788391, "learning_rate": 3.850883935434281e-06, "loss": 2.058, "step": 7200 }, { "epoch": 0.6148919135308246, "eval_loss": 2.060422897338867, "eval_runtime": 98.8228, "eval_samples_per_second": 10.119, "eval_steps_per_second": 5.06, "step": 7200 }, { "epoch": 0.6157459300773952, "grad_norm": 0.6333703994750977, "learning_rate": 3.842343496455718e-06, "loss": 2.0343, "step": 7210 }, { "epoch": 0.6165999466239659, "grad_norm": 0.6200190782546997, "learning_rate": 3.833803057477155e-06, "loss": 2.0516, "step": 7220 }, { "epoch": 0.6174539631705365, "grad_norm": 0.7211159467697144, "learning_rate": 3.825262618498592e-06, "loss": 2.052, "step": 7230 }, { "epoch": 0.618307979717107, "grad_norm": 0.6151908040046692, "learning_rate": 3.816722179520028e-06, "loss": 2.0628, "step": 7240 }, { "epoch": 0.6191619962636776, "grad_norm": 0.5245025157928467, "learning_rate": 3.8081817405414638e-06, "loss": 2.0287, "step": 7250 }, { "epoch": 0.6200160128102482, "grad_norm": 0.5179617404937744, "learning_rate": 3.7996413015629006e-06, "loss": 2.0384, "step": 7260 }, { "epoch": 0.6208700293568188, "grad_norm": 0.5305375456809998, "learning_rate": 3.791100862584337e-06, "loss": 2.0535, "step": 7270 }, { "epoch": 0.6217240459033894, "grad_norm": 0.4980165362358093, "learning_rate": 3.782560423605774e-06, "loss": 2.0215, "step": 7280 }, { "epoch": 0.62257806244996, "grad_norm": 0.5280580520629883, "learning_rate": 3.7740199846272104e-06, "loss": 2.0707, "step": 7290 }, { "epoch": 0.6234320789965305, "grad_norm": 0.5952538251876831, "learning_rate": 3.7654795456486464e-06, "loss": 2.0395, "step": 7300 }, { "epoch": 0.6234320789965305, "eval_loss": 2.059544324874878, "eval_runtime": 99.0871, "eval_samples_per_second": 10.092, "eval_steps_per_second": 5.046, "step": 7300 }, { "epoch": 0.6242860955431011, "grad_norm": 0.5330567359924316, "learning_rate": 3.7569391066700833e-06, "loss": 2.0756, "step": 7310 }, { "epoch": 0.6251401120896717, "grad_norm": 0.6171514987945557, "learning_rate": 3.7483986676915197e-06, "loss": 2.03, "step": 7320 }, { "epoch": 0.6259941286362424, "grad_norm": 0.5237666964530945, "learning_rate": 3.7398582287129557e-06, "loss": 2.0358, "step": 7330 }, { "epoch": 0.626848145182813, "grad_norm": 0.49751928448677063, "learning_rate": 3.7313177897343926e-06, "loss": 2.0394, "step": 7340 }, { "epoch": 0.6277021617293835, "grad_norm": 0.5514543652534485, "learning_rate": 3.722777350755829e-06, "loss": 2.0491, "step": 7350 }, { "epoch": 0.6285561782759541, "grad_norm": 0.49144166707992554, "learning_rate": 3.714236911777266e-06, "loss": 2.0504, "step": 7360 }, { "epoch": 0.6294101948225247, "grad_norm": 0.4847368896007538, "learning_rate": 3.705696472798702e-06, "loss": 2.0326, "step": 7370 }, { "epoch": 0.6302642113690953, "grad_norm": 0.5339867472648621, "learning_rate": 3.6971560338201384e-06, "loss": 2.0161, "step": 7380 }, { "epoch": 0.6311182279156659, "grad_norm": 0.5152847766876221, "learning_rate": 3.6886155948415753e-06, "loss": 2.0773, "step": 7390 }, { "epoch": 0.6319722444622364, "grad_norm": 0.5037885308265686, "learning_rate": 3.6800751558630117e-06, "loss": 2.0617, "step": 7400 }, { "epoch": 0.6319722444622364, "eval_loss": 2.0594868659973145, "eval_runtime": 98.194, "eval_samples_per_second": 10.184, "eval_steps_per_second": 5.092, "step": 7400 }, { "epoch": 0.632826261008807, "grad_norm": 0.6314968466758728, "learning_rate": 3.6715347168844486e-06, "loss": 2.06, "step": 7410 }, { "epoch": 0.6336802775553776, "grad_norm": 0.5687624216079712, "learning_rate": 3.6629942779058846e-06, "loss": 2.0328, "step": 7420 }, { "epoch": 0.6345342941019482, "grad_norm": 0.5416333079338074, "learning_rate": 3.654453838927321e-06, "loss": 2.0212, "step": 7430 }, { "epoch": 0.6353883106485189, "grad_norm": 0.5656910538673401, "learning_rate": 3.645913399948758e-06, "loss": 2.0601, "step": 7440 }, { "epoch": 0.6362423271950894, "grad_norm": 0.5079363584518433, "learning_rate": 3.637372960970194e-06, "loss": 2.0135, "step": 7450 }, { "epoch": 0.63709634374166, "grad_norm": 0.4980289041996002, "learning_rate": 3.628832521991631e-06, "loss": 2.062, "step": 7460 }, { "epoch": 0.6379503602882306, "grad_norm": 0.5247538089752197, "learning_rate": 3.6202920830130672e-06, "loss": 2.0328, "step": 7470 }, { "epoch": 0.6388043768348012, "grad_norm": 0.5211943984031677, "learning_rate": 3.6117516440345037e-06, "loss": 2.0173, "step": 7480 }, { "epoch": 0.6396583933813718, "grad_norm": 0.5648311376571655, "learning_rate": 3.6032112050559406e-06, "loss": 2.0673, "step": 7490 }, { "epoch": 0.6405124099279423, "grad_norm": 0.5538136959075928, "learning_rate": 3.5946707660773766e-06, "loss": 2.0541, "step": 7500 }, { "epoch": 0.6405124099279423, "eval_loss": 2.0588557720184326, "eval_runtime": 98.1936, "eval_samples_per_second": 10.184, "eval_steps_per_second": 5.092, "step": 7500 }, { "epoch": 0.6413664264745129, "grad_norm": 0.5176765322685242, "learning_rate": 3.586130327098813e-06, "loss": 2.0329, "step": 7510 }, { "epoch": 0.6422204430210835, "grad_norm": 0.5251041650772095, "learning_rate": 3.57758988812025e-06, "loss": 2.0514, "step": 7520 }, { "epoch": 0.6430744595676541, "grad_norm": 0.5257436633110046, "learning_rate": 3.569049449141686e-06, "loss": 2.0424, "step": 7530 }, { "epoch": 0.6439284761142248, "grad_norm": 0.5508720874786377, "learning_rate": 3.5605090101631228e-06, "loss": 2.0182, "step": 7540 }, { "epoch": 0.6447824926607953, "grad_norm": 0.620539665222168, "learning_rate": 3.5519685711845592e-06, "loss": 2.0259, "step": 7550 }, { "epoch": 0.6456365092073659, "grad_norm": 0.5437766909599304, "learning_rate": 3.5434281322059957e-06, "loss": 2.0313, "step": 7560 }, { "epoch": 0.6464905257539365, "grad_norm": 0.5391945242881775, "learning_rate": 3.534887693227432e-06, "loss": 2.0508, "step": 7570 }, { "epoch": 0.6473445423005071, "grad_norm": 0.49488940834999084, "learning_rate": 3.5263472542488686e-06, "loss": 2.0433, "step": 7580 }, { "epoch": 0.6481985588470777, "grad_norm": 0.49338310956954956, "learning_rate": 3.5178068152703054e-06, "loss": 2.0393, "step": 7590 }, { "epoch": 0.6490525753936482, "grad_norm": 0.546257734298706, "learning_rate": 3.509266376291742e-06, "loss": 2.0397, "step": 7600 }, { "epoch": 0.6490525753936482, "eval_loss": 2.0585122108459473, "eval_runtime": 98.1579, "eval_samples_per_second": 10.188, "eval_steps_per_second": 5.094, "step": 7600 }, { "epoch": 0.6499065919402188, "grad_norm": 0.5638222098350525, "learning_rate": 3.500725937313178e-06, "loss": 2.0369, "step": 7610 }, { "epoch": 0.6507606084867894, "grad_norm": 0.4783158004283905, "learning_rate": 3.4921854983346148e-06, "loss": 2.0689, "step": 7620 }, { "epoch": 0.65161462503336, "grad_norm": 0.5572123527526855, "learning_rate": 3.4836450593560512e-06, "loss": 2.0489, "step": 7630 }, { "epoch": 0.6524686415799306, "grad_norm": 0.5384513735771179, "learning_rate": 3.4751046203774872e-06, "loss": 2.0261, "step": 7640 }, { "epoch": 0.6533226581265013, "grad_norm": 0.4891786575317383, "learning_rate": 3.466564181398924e-06, "loss": 2.022, "step": 7650 }, { "epoch": 0.6541766746730718, "grad_norm": 0.499834805727005, "learning_rate": 3.4580237424203605e-06, "loss": 2.0493, "step": 7660 }, { "epoch": 0.6550306912196424, "grad_norm": 0.5120176672935486, "learning_rate": 3.4494833034417974e-06, "loss": 2.026, "step": 7670 }, { "epoch": 0.655884707766213, "grad_norm": 0.5497430562973022, "learning_rate": 3.440942864463234e-06, "loss": 2.0357, "step": 7680 }, { "epoch": 0.6567387243127836, "grad_norm": 0.5623518228530884, "learning_rate": 3.43240242548467e-06, "loss": 2.0151, "step": 7690 }, { "epoch": 0.6575927408593542, "grad_norm": 0.6008960008621216, "learning_rate": 3.4238619865061068e-06, "loss": 2.0443, "step": 7700 }, { "epoch": 0.6575927408593542, "eval_loss": 2.058133840560913, "eval_runtime": 101.1458, "eval_samples_per_second": 9.887, "eval_steps_per_second": 4.943, "step": 7700 }, { "epoch": 0.6584467574059247, "grad_norm": 0.5446147322654724, "learning_rate": 3.415321547527543e-06, "loss": 2.0068, "step": 7710 }, { "epoch": 0.6593007739524953, "grad_norm": 0.5315154194831848, "learning_rate": 3.40678110854898e-06, "loss": 2.0261, "step": 7720 }, { "epoch": 0.6601547904990659, "grad_norm": 0.5347596406936646, "learning_rate": 3.398240669570416e-06, "loss": 2.0595, "step": 7730 }, { "epoch": 0.6610088070456365, "grad_norm": 0.5882947444915771, "learning_rate": 3.3897002305918525e-06, "loss": 2.0016, "step": 7740 }, { "epoch": 0.661862823592207, "grad_norm": 0.5320292711257935, "learning_rate": 3.3811597916132894e-06, "loss": 2.0349, "step": 7750 }, { "epoch": 0.6627168401387777, "grad_norm": 0.4915473759174347, "learning_rate": 3.3726193526347254e-06, "loss": 1.9986, "step": 7760 }, { "epoch": 0.6635708566853483, "grad_norm": 0.5196160674095154, "learning_rate": 3.3640789136561623e-06, "loss": 2.0095, "step": 7770 }, { "epoch": 0.6644248732319189, "grad_norm": 0.5164250135421753, "learning_rate": 3.3555384746775987e-06, "loss": 2.0241, "step": 7780 }, { "epoch": 0.6652788897784895, "grad_norm": 0.5463265180587769, "learning_rate": 3.346998035699035e-06, "loss": 2.0216, "step": 7790 }, { "epoch": 0.6661329063250601, "grad_norm": 0.5488865971565247, "learning_rate": 3.338457596720472e-06, "loss": 2.0307, "step": 7800 }, { "epoch": 0.6661329063250601, "eval_loss": 2.0575978755950928, "eval_runtime": 101.701, "eval_samples_per_second": 9.833, "eval_steps_per_second": 4.916, "step": 7800 }, { "epoch": 1.0000800320128052, "grad_norm": 0.7740113139152527, "learning_rate": 4.996796925048046e-06, "loss": 2.0395, "step": 7810 }, { "epoch": 1.0013605442176872, "grad_norm": 0.718994677066803, "learning_rate": 4.9903907751441385e-06, "loss": 2.04, "step": 7820 }, { "epoch": 1.002641056422569, "grad_norm": 0.5779696702957153, "learning_rate": 4.983984625240231e-06, "loss": 2.0225, "step": 7830 }, { "epoch": 1.003921568627451, "grad_norm": 0.5766503810882568, "learning_rate": 4.977578475336323e-06, "loss": 2.0371, "step": 7840 }, { "epoch": 1.005202080832333, "grad_norm": 0.5812973380088806, "learning_rate": 4.971172325432416e-06, "loss": 2.0419, "step": 7850 }, { "epoch": 1.006482593037215, "grad_norm": 0.623308002948761, "learning_rate": 4.964766175528508e-06, "loss": 2.0505, "step": 7860 }, { "epoch": 1.0077631052420968, "grad_norm": 0.5263451933860779, "learning_rate": 4.9583600256246e-06, "loss": 2.0547, "step": 7870 }, { "epoch": 1.0090436174469788, "grad_norm": 0.5198953151702881, "learning_rate": 4.951953875720693e-06, "loss": 2.0203, "step": 7880 }, { "epoch": 1.0103241296518608, "grad_norm": 0.5853062868118286, "learning_rate": 4.945547725816784e-06, "loss": 2.0358, "step": 7890 }, { "epoch": 1.0116046418567426, "grad_norm": 0.5690246820449829, "learning_rate": 4.939141575912877e-06, "loss": 2.0607, "step": 7900 }, { "epoch": 1.0116046418567426, "eval_loss": 2.060058116912842, "eval_runtime": 99.7073, "eval_samples_per_second": 10.029, "eval_steps_per_second": 5.015, "step": 7900 }, { "epoch": 1.0128851540616246, "grad_norm": 0.6459298729896545, "learning_rate": 4.932735426008969e-06, "loss": 2.0358, "step": 7910 }, { "epoch": 1.0141656662665066, "grad_norm": 0.7262232899665833, "learning_rate": 4.926329276105061e-06, "loss": 2.0391, "step": 7920 }, { "epoch": 1.0154461784713886, "grad_norm": 0.575965940952301, "learning_rate": 4.919923126201154e-06, "loss": 2.0571, "step": 7930 }, { "epoch": 1.0167266906762704, "grad_norm": 0.6091157793998718, "learning_rate": 4.913516976297245e-06, "loss": 2.0116, "step": 7940 }, { "epoch": 1.0180072028811524, "grad_norm": 0.5863317847251892, "learning_rate": 4.9071108263933385e-06, "loss": 2.0389, "step": 7950 }, { "epoch": 1.0192877150860344, "grad_norm": 0.8138238191604614, "learning_rate": 4.90070467648943e-06, "loss": 2.0153, "step": 7960 }, { "epoch": 1.0205682272909165, "grad_norm": 0.5913819074630737, "learning_rate": 4.894298526585522e-06, "loss": 2.0315, "step": 7970 }, { "epoch": 1.0218487394957982, "grad_norm": 0.643348217010498, "learning_rate": 4.887892376681615e-06, "loss": 2.0161, "step": 7980 }, { "epoch": 1.0231292517006803, "grad_norm": 0.6365050673484802, "learning_rate": 4.881486226777707e-06, "loss": 2.0395, "step": 7990 }, { "epoch": 1.0244097639055623, "grad_norm": 0.628605842590332, "learning_rate": 4.8750800768737995e-06, "loss": 2.066, "step": 8000 }, { "epoch": 1.0244097639055623, "eval_loss": 2.0600643157958984, "eval_runtime": 99.495, "eval_samples_per_second": 10.051, "eval_steps_per_second": 5.025, "step": 8000 }, { "epoch": 1.0256902761104443, "grad_norm": 0.568298876285553, "learning_rate": 4.868673926969891e-06, "loss": 2.0526, "step": 8010 }, { "epoch": 1.026970788315326, "grad_norm": 0.5762647986412048, "learning_rate": 4.862267777065983e-06, "loss": 2.0247, "step": 8020 }, { "epoch": 1.028251300520208, "grad_norm": 0.5563777089118958, "learning_rate": 4.8558616271620766e-06, "loss": 2.0326, "step": 8030 }, { "epoch": 1.02953181272509, "grad_norm": 0.5955259203910828, "learning_rate": 4.849455477258168e-06, "loss": 2.0452, "step": 8040 }, { "epoch": 1.0308123249299719, "grad_norm": 0.5595470666885376, "learning_rate": 4.8430493273542605e-06, "loss": 2.0335, "step": 8050 }, { "epoch": 1.032092837134854, "grad_norm": 0.5725470781326294, "learning_rate": 4.836643177450353e-06, "loss": 2.0465, "step": 8060 }, { "epoch": 1.033373349339736, "grad_norm": 0.58101886510849, "learning_rate": 4.830237027546445e-06, "loss": 2.0177, "step": 8070 }, { "epoch": 1.034653861544618, "grad_norm": 0.5748034119606018, "learning_rate": 4.8238308776425376e-06, "loss": 2.0472, "step": 8080 }, { "epoch": 1.0359343737494997, "grad_norm": 0.5992661118507385, "learning_rate": 4.817424727738629e-06, "loss": 2.0163, "step": 8090 }, { "epoch": 1.0372148859543817, "grad_norm": 0.6099635362625122, "learning_rate": 4.8110185778347215e-06, "loss": 2.0443, "step": 8100 }, { "epoch": 1.0372148859543817, "eval_loss": 2.0590715408325195, "eval_runtime": 99.7637, "eval_samples_per_second": 10.024, "eval_steps_per_second": 5.012, "step": 8100 }, { "epoch": 1.0384953981592637, "grad_norm": 0.6065194010734558, "learning_rate": 4.804612427930814e-06, "loss": 2.0325, "step": 8110 }, { "epoch": 1.0397759103641457, "grad_norm": 0.5869019031524658, "learning_rate": 4.798206278026906e-06, "loss": 2.0056, "step": 8120 }, { "epoch": 1.0410564225690275, "grad_norm": 0.6351775527000427, "learning_rate": 4.7918001281229986e-06, "loss": 2.045, "step": 8130 }, { "epoch": 1.0423369347739095, "grad_norm": 0.6560531258583069, "learning_rate": 4.785393978219091e-06, "loss": 2.0568, "step": 8140 }, { "epoch": 1.0436174469787916, "grad_norm": 0.6789016127586365, "learning_rate": 4.778987828315183e-06, "loss": 2.0492, "step": 8150 }, { "epoch": 1.0448979591836736, "grad_norm": 0.6798705458641052, "learning_rate": 4.772581678411275e-06, "loss": 2.059, "step": 8160 }, { "epoch": 1.0461784713885554, "grad_norm": 0.539069652557373, "learning_rate": 4.766175528507367e-06, "loss": 2.019, "step": 8170 }, { "epoch": 1.0474589835934374, "grad_norm": 0.5451361536979675, "learning_rate": 4.7597693786034595e-06, "loss": 2.0505, "step": 8180 }, { "epoch": 1.0487394957983194, "grad_norm": 0.629255473613739, "learning_rate": 4.753363228699552e-06, "loss": 2.0216, "step": 8190 }, { "epoch": 1.0500200080032012, "grad_norm": 0.6603861451148987, "learning_rate": 4.746957078795644e-06, "loss": 2.0266, "step": 8200 }, { "epoch": 1.0500200080032012, "eval_loss": 2.0588185787200928, "eval_runtime": 98.7043, "eval_samples_per_second": 10.131, "eval_steps_per_second": 5.066, "step": 8200 }, { "epoch": 1.0513005202080832, "grad_norm": 0.6830970644950867, "learning_rate": 4.740550928891736e-06, "loss": 2.075, "step": 8210 }, { "epoch": 1.0525810324129652, "grad_norm": 0.5211127996444702, "learning_rate": 4.734144778987829e-06, "loss": 2.006, "step": 8220 }, { "epoch": 1.0538615446178472, "grad_norm": 0.6494899392127991, "learning_rate": 4.727738629083921e-06, "loss": 2.0478, "step": 8230 }, { "epoch": 1.055142056822729, "grad_norm": 0.5123782753944397, "learning_rate": 4.721332479180013e-06, "loss": 2.0282, "step": 8240 }, { "epoch": 1.056422569027611, "grad_norm": 0.5879936814308167, "learning_rate": 4.714926329276105e-06, "loss": 2.0656, "step": 8250 }, { "epoch": 1.057703081232493, "grad_norm": 0.5019848346710205, "learning_rate": 4.708520179372198e-06, "loss": 2.0206, "step": 8260 }, { "epoch": 1.058983593437375, "grad_norm": 0.54250168800354, "learning_rate": 4.70211402946829e-06, "loss": 2.0572, "step": 8270 }, { "epoch": 1.0602641056422568, "grad_norm": 0.5240892767906189, "learning_rate": 4.695707879564382e-06, "loss": 2.0483, "step": 8280 }, { "epoch": 1.0615446178471388, "grad_norm": 0.6275247931480408, "learning_rate": 4.689301729660474e-06, "loss": 2.0346, "step": 8290 }, { "epoch": 1.0628251300520208, "grad_norm": 0.6213353276252747, "learning_rate": 4.682895579756567e-06, "loss": 2.0228, "step": 8300 }, { "epoch": 1.0628251300520208, "eval_loss": 2.0585317611694336, "eval_runtime": 98.584, "eval_samples_per_second": 10.144, "eval_steps_per_second": 5.072, "step": 8300 }, { "epoch": 1.0641056422569029, "grad_norm": 0.5836710929870605, "learning_rate": 4.6764894298526595e-06, "loss": 2.0357, "step": 8310 }, { "epoch": 1.0653861544617846, "grad_norm": 0.5407329797744751, "learning_rate": 4.670083279948751e-06, "loss": 2.0647, "step": 8320 }, { "epoch": 1.0666666666666667, "grad_norm": 0.5546687841415405, "learning_rate": 4.663677130044843e-06, "loss": 2.0501, "step": 8330 }, { "epoch": 1.0679471788715487, "grad_norm": 0.5902862548828125, "learning_rate": 4.657270980140936e-06, "loss": 2.0524, "step": 8340 }, { "epoch": 1.0692276910764307, "grad_norm": 0.5358943939208984, "learning_rate": 4.650864830237028e-06, "loss": 2.0324, "step": 8350 }, { "epoch": 1.0705082032813125, "grad_norm": 0.5571943521499634, "learning_rate": 4.6444586803331205e-06, "loss": 2.0092, "step": 8360 }, { "epoch": 1.0717887154861945, "grad_norm": 0.53328937292099, "learning_rate": 4.638052530429212e-06, "loss": 2.0282, "step": 8370 }, { "epoch": 1.0730692276910765, "grad_norm": 0.678426206111908, "learning_rate": 4.631646380525305e-06, "loss": 2.0427, "step": 8380 }, { "epoch": 1.0743497398959585, "grad_norm": 0.6406043171882629, "learning_rate": 4.625240230621397e-06, "loss": 2.0246, "step": 8390 }, { "epoch": 1.0756302521008403, "grad_norm": 0.5901826024055481, "learning_rate": 4.618834080717489e-06, "loss": 2.0276, "step": 8400 }, { "epoch": 1.0756302521008403, "eval_loss": 2.058272361755371, "eval_runtime": 98.6499, "eval_samples_per_second": 10.137, "eval_steps_per_second": 5.068, "step": 8400 }, { "epoch": 1.0769107643057223, "grad_norm": 0.5525951385498047, "learning_rate": 4.6124279308135815e-06, "loss": 2.0386, "step": 8410 }, { "epoch": 1.0781912765106043, "grad_norm": 0.5235487222671509, "learning_rate": 4.606021780909674e-06, "loss": 2.0413, "step": 8420 }, { "epoch": 1.079471788715486, "grad_norm": 0.5588756203651428, "learning_rate": 4.599615631005766e-06, "loss": 2.021, "step": 8430 }, { "epoch": 1.0807523009203681, "grad_norm": 0.5304463505744934, "learning_rate": 4.593209481101858e-06, "loss": 2.0393, "step": 8440 }, { "epoch": 1.0820328131252501, "grad_norm": 0.7028805017471313, "learning_rate": 4.58680333119795e-06, "loss": 2.02, "step": 8450 }, { "epoch": 1.0833133253301321, "grad_norm": 0.735097348690033, "learning_rate": 4.5803971812940425e-06, "loss": 2.0358, "step": 8460 }, { "epoch": 1.084593837535014, "grad_norm": 0.6663596034049988, "learning_rate": 4.573991031390135e-06, "loss": 2.0365, "step": 8470 }, { "epoch": 1.085874349739896, "grad_norm": 0.589572012424469, "learning_rate": 4.567584881486227e-06, "loss": 2.0444, "step": 8480 }, { "epoch": 1.087154861944778, "grad_norm": 0.5971885323524475, "learning_rate": 4.5611787315823196e-06, "loss": 2.0306, "step": 8490 }, { "epoch": 1.08843537414966, "grad_norm": 0.5263055562973022, "learning_rate": 4.554772581678412e-06, "loss": 2.018, "step": 8500 }, { "epoch": 1.08843537414966, "eval_loss": 2.0575432777404785, "eval_runtime": 98.791, "eval_samples_per_second": 10.122, "eval_steps_per_second": 5.061, "step": 8500 }, { "epoch": 1.0897158863545418, "grad_norm": 0.5179014801979065, "learning_rate": 4.548366431774504e-06, "loss": 2.0413, "step": 8510 }, { "epoch": 1.0909963985594238, "grad_norm": 0.5456497073173523, "learning_rate": 4.541960281870596e-06, "loss": 2.0544, "step": 8520 }, { "epoch": 1.0922769107643058, "grad_norm": 0.6493037939071655, "learning_rate": 4.535554131966688e-06, "loss": 2.0687, "step": 8530 }, { "epoch": 1.0935574229691878, "grad_norm": 0.5566242933273315, "learning_rate": 4.5291479820627806e-06, "loss": 2.0408, "step": 8540 }, { "epoch": 1.0948379351740696, "grad_norm": 0.6045375466346741, "learning_rate": 4.522741832158873e-06, "loss": 2.0458, "step": 8550 }, { "epoch": 1.0961184473789516, "grad_norm": 0.5247881412506104, "learning_rate": 4.516335682254965e-06, "loss": 2.0354, "step": 8560 }, { "epoch": 1.0973989595838336, "grad_norm": 0.5688020586967468, "learning_rate": 4.509929532351057e-06, "loss": 2.0047, "step": 8570 }, { "epoch": 1.0986794717887154, "grad_norm": 0.5846471190452576, "learning_rate": 4.50352338244715e-06, "loss": 2.0359, "step": 8580 }, { "epoch": 1.0999599839935974, "grad_norm": 0.6008567214012146, "learning_rate": 4.4971172325432416e-06, "loss": 2.0378, "step": 8590 }, { "epoch": 1.1012404961984794, "grad_norm": 0.6397348046302795, "learning_rate": 4.490711082639334e-06, "loss": 2.0602, "step": 8600 }, { "epoch": 1.1012404961984794, "eval_loss": 2.0577335357666016, "eval_runtime": 98.7283, "eval_samples_per_second": 10.129, "eval_steps_per_second": 5.064, "step": 8600 }, { "epoch": 1.1025210084033614, "grad_norm": 0.6789408922195435, "learning_rate": 4.484304932735426e-06, "loss": 2.0403, "step": 8610 }, { "epoch": 1.1038015206082432, "grad_norm": 0.7066412568092346, "learning_rate": 4.477898782831519e-06, "loss": 2.0377, "step": 8620 }, { "epoch": 1.1050820328131252, "grad_norm": 0.6749730110168457, "learning_rate": 4.471492632927611e-06, "loss": 2.0187, "step": 8630 }, { "epoch": 1.1063625450180072, "grad_norm": 0.5682045221328735, "learning_rate": 4.4650864830237025e-06, "loss": 2.0206, "step": 8640 }, { "epoch": 1.1076430572228892, "grad_norm": 0.5407593846321106, "learning_rate": 4.458680333119795e-06, "loss": 2.0246, "step": 8650 }, { "epoch": 1.108923569427771, "grad_norm": 0.5678638219833374, "learning_rate": 4.452274183215888e-06, "loss": 2.0428, "step": 8660 }, { "epoch": 1.110204081632653, "grad_norm": 0.5593268275260925, "learning_rate": 4.44586803331198e-06, "loss": 2.0527, "step": 8670 }, { "epoch": 1.111484593837535, "grad_norm": 0.6249791383743286, "learning_rate": 4.439461883408072e-06, "loss": 2.0137, "step": 8680 }, { "epoch": 1.112765106042417, "grad_norm": 0.7263971567153931, "learning_rate": 4.433055733504164e-06, "loss": 2.0361, "step": 8690 }, { "epoch": 1.1140456182472989, "grad_norm": 0.48830723762512207, "learning_rate": 4.426649583600257e-06, "loss": 2.0424, "step": 8700 }, { "epoch": 1.1140456182472989, "eval_loss": 2.0569541454315186, "eval_runtime": 99.0098, "eval_samples_per_second": 10.1, "eval_steps_per_second": 5.05, "step": 8700 }, { "epoch": 1.1153261304521809, "grad_norm": 0.6187960505485535, "learning_rate": 4.420243433696349e-06, "loss": 2.0395, "step": 8710 }, { "epoch": 1.1166066426570629, "grad_norm": 0.5017440915107727, "learning_rate": 4.413837283792441e-06, "loss": 2.0252, "step": 8720 }, { "epoch": 1.1178871548619447, "grad_norm": 0.553180456161499, "learning_rate": 4.407431133888533e-06, "loss": 2.0321, "step": 8730 }, { "epoch": 1.1191676670668267, "grad_norm": 0.5666632056236267, "learning_rate": 4.401024983984626e-06, "loss": 2.0294, "step": 8740 }, { "epoch": 1.1204481792717087, "grad_norm": 0.5148674845695496, "learning_rate": 4.394618834080718e-06, "loss": 2.0564, "step": 8750 }, { "epoch": 1.1217286914765907, "grad_norm": 0.5826708078384399, "learning_rate": 4.38821268417681e-06, "loss": 2.0273, "step": 8760 }, { "epoch": 1.1230092036814725, "grad_norm": 0.5150731205940247, "learning_rate": 4.3818065342729025e-06, "loss": 2.0312, "step": 8770 }, { "epoch": 1.1242897158863545, "grad_norm": 0.5426385402679443, "learning_rate": 4.375400384368995e-06, "loss": 2.0303, "step": 8780 }, { "epoch": 1.1255702280912365, "grad_norm": 0.5782310962677002, "learning_rate": 4.368994234465087e-06, "loss": 2.0154, "step": 8790 }, { "epoch": 1.1268507402961185, "grad_norm": 0.7119567394256592, "learning_rate": 4.362588084561179e-06, "loss": 2.0135, "step": 8800 }, { "epoch": 1.1268507402961185, "eval_loss": 2.056509017944336, "eval_runtime": 99.65, "eval_samples_per_second": 10.035, "eval_steps_per_second": 5.018, "step": 8800 }, { "epoch": 1.1281312525010003, "grad_norm": 0.640973687171936, "learning_rate": 4.356181934657271e-06, "loss": 2.0323, "step": 8810 }, { "epoch": 1.1294117647058823, "grad_norm": 0.5564003586769104, "learning_rate": 4.3497757847533635e-06, "loss": 2.0404, "step": 8820 }, { "epoch": 1.1306922769107643, "grad_norm": 0.5527184009552002, "learning_rate": 4.343369634849456e-06, "loss": 2.0297, "step": 8830 }, { "epoch": 1.1319727891156464, "grad_norm": 0.7402225136756897, "learning_rate": 4.336963484945548e-06, "loss": 2.0241, "step": 8840 }, { "epoch": 1.1332533013205282, "grad_norm": 0.6281604766845703, "learning_rate": 4.330557335041641e-06, "loss": 2.0446, "step": 8850 }, { "epoch": 1.1345338135254102, "grad_norm": 0.5926194787025452, "learning_rate": 4.324151185137733e-06, "loss": 2.018, "step": 8860 }, { "epoch": 1.1358143257302922, "grad_norm": 0.5378844141960144, "learning_rate": 4.3177450352338245e-06, "loss": 2.0394, "step": 8870 }, { "epoch": 1.137094837935174, "grad_norm": 0.5743001699447632, "learning_rate": 4.311338885329917e-06, "loss": 2.0412, "step": 8880 }, { "epoch": 1.138375350140056, "grad_norm": 0.6045801639556885, "learning_rate": 4.304932735426009e-06, "loss": 2.04, "step": 8890 }, { "epoch": 1.139655862344938, "grad_norm": 0.6552196741104126, "learning_rate": 4.2985265855221016e-06, "loss": 2.0216, "step": 8900 }, { "epoch": 1.139655862344938, "eval_loss": 2.056703567504883, "eval_runtime": 99.6537, "eval_samples_per_second": 10.035, "eval_steps_per_second": 5.017, "step": 8900 }, { "epoch": 1.14093637454982, "grad_norm": 0.7684733867645264, "learning_rate": 4.292120435618194e-06, "loss": 2.0079, "step": 8910 }, { "epoch": 1.1422168867547018, "grad_norm": 0.6571674942970276, "learning_rate": 4.2857142857142855e-06, "loss": 2.0467, "step": 8920 }, { "epoch": 1.1434973989595838, "grad_norm": 0.6338903307914734, "learning_rate": 4.279308135810379e-06, "loss": 2.0476, "step": 8930 }, { "epoch": 1.1447779111644658, "grad_norm": 0.6187698841094971, "learning_rate": 4.272901985906471e-06, "loss": 2.031, "step": 8940 }, { "epoch": 1.1460584233693478, "grad_norm": 0.5363069772720337, "learning_rate": 4.2664958360025626e-06, "loss": 2.0144, "step": 8950 }, { "epoch": 1.1473389355742296, "grad_norm": 0.5464866161346436, "learning_rate": 4.260089686098655e-06, "loss": 2.0175, "step": 8960 }, { "epoch": 1.1486194477791116, "grad_norm": 0.5811886787414551, "learning_rate": 4.253683536194747e-06, "loss": 2.0758, "step": 8970 }, { "epoch": 1.1498999599839936, "grad_norm": 0.5649928450584412, "learning_rate": 4.24727738629084e-06, "loss": 2.0875, "step": 8980 }, { "epoch": 1.1511804721888756, "grad_norm": 0.6172623038291931, "learning_rate": 4.240871236386932e-06, "loss": 2.0767, "step": 8990 }, { "epoch": 1.1524609843937574, "grad_norm": 0.7383103370666504, "learning_rate": 4.2344650864830236e-06, "loss": 2.0246, "step": 9000 }, { "epoch": 1.1524609843937574, "eval_loss": 2.056033134460449, "eval_runtime": 99.966, "eval_samples_per_second": 10.003, "eval_steps_per_second": 5.002, "step": 9000 }, { "epoch": 1.1537414965986394, "grad_norm": 0.5509638786315918, "learning_rate": 4.228058936579117e-06, "loss": 2.031, "step": 9010 }, { "epoch": 1.1550220088035215, "grad_norm": 0.5470526814460754, "learning_rate": 4.221652786675208e-06, "loss": 2.0362, "step": 9020 }, { "epoch": 1.1563025210084033, "grad_norm": 0.5429520010948181, "learning_rate": 4.215246636771301e-06, "loss": 2.0339, "step": 9030 }, { "epoch": 1.1575830332132853, "grad_norm": 0.7113022208213806, "learning_rate": 4.208840486867393e-06, "loss": 2.0495, "step": 9040 }, { "epoch": 1.1588635454181673, "grad_norm": 0.59480881690979, "learning_rate": 4.202434336963485e-06, "loss": 2.0054, "step": 9050 }, { "epoch": 1.1601440576230493, "grad_norm": 0.5894181728363037, "learning_rate": 4.196028187059578e-06, "loss": 2.0491, "step": 9060 }, { "epoch": 1.1614245698279313, "grad_norm": 0.5791051387786865, "learning_rate": 4.189622037155669e-06, "loss": 2.0559, "step": 9070 }, { "epoch": 1.162705082032813, "grad_norm": 0.6649132966995239, "learning_rate": 4.183215887251762e-06, "loss": 2.0723, "step": 9080 }, { "epoch": 1.163985594237695, "grad_norm": 0.5229988694190979, "learning_rate": 4.176809737347854e-06, "loss": 2.0144, "step": 9090 }, { "epoch": 1.165266106442577, "grad_norm": 0.5805575847625732, "learning_rate": 4.170403587443946e-06, "loss": 2.0391, "step": 9100 }, { "epoch": 1.165266106442577, "eval_loss": 2.0554511547088623, "eval_runtime": 98.6973, "eval_samples_per_second": 10.132, "eval_steps_per_second": 5.066, "step": 9100 }, { "epoch": 1.166546618647459, "grad_norm": 0.5783365964889526, "learning_rate": 4.163997437540039e-06, "loss": 2.0159, "step": 9110 }, { "epoch": 1.167827130852341, "grad_norm": 0.5854288339614868, "learning_rate": 4.157591287636131e-06, "loss": 2.0347, "step": 9120 }, { "epoch": 1.169107643057223, "grad_norm": 0.5515726208686829, "learning_rate": 4.1511851377322235e-06, "loss": 2.0512, "step": 9130 }, { "epoch": 1.170388155262105, "grad_norm": 0.5261589288711548, "learning_rate": 4.144778987828316e-06, "loss": 2.0588, "step": 9140 }, { "epoch": 1.1716686674669867, "grad_norm": 0.5191331505775452, "learning_rate": 4.138372837924407e-06, "loss": 2.0423, "step": 9150 }, { "epoch": 1.1729491796718687, "grad_norm": 0.6107770204544067, "learning_rate": 4.1319666880205e-06, "loss": 2.0076, "step": 9160 }, { "epoch": 1.1742296918767507, "grad_norm": 0.5812585949897766, "learning_rate": 4.125560538116592e-06, "loss": 2.0123, "step": 9170 }, { "epoch": 1.1755102040816325, "grad_norm": 0.5599560141563416, "learning_rate": 4.1191543882126845e-06, "loss": 2.0249, "step": 9180 }, { "epoch": 1.1767907162865145, "grad_norm": 0.5351222157478333, "learning_rate": 4.112748238308777e-06, "loss": 2.0282, "step": 9190 }, { "epoch": 1.1780712284913966, "grad_norm": 0.5384876728057861, "learning_rate": 4.106342088404869e-06, "loss": 2.0419, "step": 9200 }, { "epoch": 1.1780712284913966, "eval_loss": 2.0551235675811768, "eval_runtime": 99.416, "eval_samples_per_second": 10.059, "eval_steps_per_second": 5.029, "step": 9200 }, { "epoch": 1.1793517406962786, "grad_norm": 0.5739924311637878, "learning_rate": 4.099935938500962e-06, "loss": 2.072, "step": 9210 }, { "epoch": 1.1806322529011606, "grad_norm": 0.5525854229927063, "learning_rate": 4.093529788597054e-06, "loss": 2.0488, "step": 9220 }, { "epoch": 1.1819127651060424, "grad_norm": 0.5480990409851074, "learning_rate": 4.0871236386931455e-06, "loss": 2.0233, "step": 9230 }, { "epoch": 1.1831932773109244, "grad_norm": 0.5385409593582153, "learning_rate": 4.080717488789238e-06, "loss": 2.0448, "step": 9240 }, { "epoch": 1.1844737895158064, "grad_norm": 0.5837866067886353, "learning_rate": 4.07431133888533e-06, "loss": 2.0387, "step": 9250 }, { "epoch": 1.1857543017206882, "grad_norm": 0.5853747129440308, "learning_rate": 4.067905188981423e-06, "loss": 2.0584, "step": 9260 }, { "epoch": 1.1870348139255702, "grad_norm": 0.5508265495300293, "learning_rate": 4.061499039077515e-06, "loss": 2.0211, "step": 9270 }, { "epoch": 1.1883153261304522, "grad_norm": 0.5867159962654114, "learning_rate": 4.0550928891736065e-06, "loss": 2.0016, "step": 9280 }, { "epoch": 1.1895958383353342, "grad_norm": 0.5611317157745361, "learning_rate": 4.0486867392697e-06, "loss": 2.0549, "step": 9290 }, { "epoch": 1.190876350540216, "grad_norm": 0.5700793266296387, "learning_rate": 4.042280589365791e-06, "loss": 2.0517, "step": 9300 }, { "epoch": 1.190876350540216, "eval_loss": 2.0553061962127686, "eval_runtime": 100.0, "eval_samples_per_second": 10.0, "eval_steps_per_second": 5.0, "step": 9300 }, { "epoch": 1.192156862745098, "grad_norm": 0.6431117653846741, "learning_rate": 4.0358744394618836e-06, "loss": 2.0306, "step": 9310 }, { "epoch": 1.19343737494998, "grad_norm": 0.5513091683387756, "learning_rate": 4.029468289557976e-06, "loss": 2.0356, "step": 9320 }, { "epoch": 1.1947178871548618, "grad_norm": 0.561978280544281, "learning_rate": 4.023062139654068e-06, "loss": 2.0345, "step": 9330 }, { "epoch": 1.1959983993597438, "grad_norm": 0.5052829384803772, "learning_rate": 4.016655989750161e-06, "loss": 2.0712, "step": 9340 }, { "epoch": 1.1972789115646258, "grad_norm": 0.6181693077087402, "learning_rate": 4.010249839846252e-06, "loss": 2.0767, "step": 9350 }, { "epoch": 1.1985594237695079, "grad_norm": 0.5558566451072693, "learning_rate": 4.0038436899423446e-06, "loss": 2.0337, "step": 9360 }, { "epoch": 1.1998399359743899, "grad_norm": 0.5830146074295044, "learning_rate": 3.997437540038438e-06, "loss": 2.0225, "step": 9370 }, { "epoch": 1.2011204481792717, "grad_norm": 0.5830032229423523, "learning_rate": 3.991031390134529e-06, "loss": 2.031, "step": 9380 }, { "epoch": 1.2024009603841537, "grad_norm": 0.524630606174469, "learning_rate": 3.984625240230622e-06, "loss": 2.0641, "step": 9390 }, { "epoch": 1.2036814725890357, "grad_norm": 0.5185086131095886, "learning_rate": 3.978219090326714e-06, "loss": 2.0459, "step": 9400 }, { "epoch": 1.2036814725890357, "eval_loss": 2.0544145107269287, "eval_runtime": 99.5316, "eval_samples_per_second": 10.047, "eval_steps_per_second": 5.024, "step": 9400 }, { "epoch": 1.2049619847939175, "grad_norm": 0.6136828660964966, "learning_rate": 3.971812940422806e-06, "loss": 2.0247, "step": 9410 }, { "epoch": 1.2062424969987995, "grad_norm": 0.5434430837631226, "learning_rate": 3.965406790518899e-06, "loss": 2.0359, "step": 9420 }, { "epoch": 1.2075230092036815, "grad_norm": 0.5703685283660889, "learning_rate": 3.95900064061499e-06, "loss": 2.0055, "step": 9430 }, { "epoch": 1.2088035214085635, "grad_norm": 0.5542050004005432, "learning_rate": 3.952594490711083e-06, "loss": 2.0288, "step": 9440 }, { "epoch": 1.2100840336134453, "grad_norm": 0.5820972919464111, "learning_rate": 3.946188340807175e-06, "loss": 2.0562, "step": 9450 }, { "epoch": 1.2113645458183273, "grad_norm": 0.6455557346343994, "learning_rate": 3.939782190903267e-06, "loss": 2.0401, "step": 9460 }, { "epoch": 1.2126450580232093, "grad_norm": 0.6048575639724731, "learning_rate": 3.93337604099936e-06, "loss": 2.0594, "step": 9470 }, { "epoch": 1.2139255702280913, "grad_norm": 0.5332948565483093, "learning_rate": 3.926969891095452e-06, "loss": 2.0354, "step": 9480 }, { "epoch": 1.2152060824329731, "grad_norm": 0.5830275416374207, "learning_rate": 3.9205637411915445e-06, "loss": 2.0257, "step": 9490 }, { "epoch": 1.2164865946378551, "grad_norm": 0.6083182692527771, "learning_rate": 3.914157591287637e-06, "loss": 2.0324, "step": 9500 }, { "epoch": 1.2164865946378551, "eval_loss": 2.054752826690674, "eval_runtime": 99.7348, "eval_samples_per_second": 10.027, "eval_steps_per_second": 5.013, "step": 9500 }, { "epoch": 1.2177671068427371, "grad_norm": 0.6217626929283142, "learning_rate": 3.907751441383728e-06, "loss": 1.9986, "step": 9510 }, { "epoch": 1.2190476190476192, "grad_norm": 0.5222347974777222, "learning_rate": 3.901345291479821e-06, "loss": 1.9824, "step": 9520 }, { "epoch": 1.220328131252501, "grad_norm": 0.5925136804580688, "learning_rate": 3.894939141575913e-06, "loss": 2.0384, "step": 9530 }, { "epoch": 1.221608643457383, "grad_norm": 0.57725989818573, "learning_rate": 3.8885329916720055e-06, "loss": 2.0398, "step": 9540 }, { "epoch": 1.222889155662265, "grad_norm": 0.5565349459648132, "learning_rate": 3.882126841768098e-06, "loss": 2.0592, "step": 9550 }, { "epoch": 1.2241696678671468, "grad_norm": 0.6713798642158508, "learning_rate": 3.87572069186419e-06, "loss": 2.0382, "step": 9560 }, { "epoch": 1.2254501800720288, "grad_norm": 0.5489569902420044, "learning_rate": 3.869314541960283e-06, "loss": 2.0315, "step": 9570 }, { "epoch": 1.2267306922769108, "grad_norm": 0.6010563373565674, "learning_rate": 3.862908392056374e-06, "loss": 2.0423, "step": 9580 }, { "epoch": 1.2280112044817928, "grad_norm": 0.5540730357170105, "learning_rate": 3.8565022421524665e-06, "loss": 2.05, "step": 9590 }, { "epoch": 1.2292917166866746, "grad_norm": 0.5796175003051758, "learning_rate": 3.850096092248559e-06, "loss": 1.995, "step": 9600 }, { "epoch": 1.2292917166866746, "eval_loss": 2.054189682006836, "eval_runtime": 99.3115, "eval_samples_per_second": 10.069, "eval_steps_per_second": 5.035, "step": 9600 }, { "epoch": 1.2305722288915566, "grad_norm": 0.5845100283622742, "learning_rate": 3.843689942344651e-06, "loss": 2.0672, "step": 9610 }, { "epoch": 1.2318527410964386, "grad_norm": 0.5468641519546509, "learning_rate": 3.837283792440744e-06, "loss": 2.0307, "step": 9620 }, { "epoch": 1.2331332533013206, "grad_norm": 0.5301446914672852, "learning_rate": 3.830877642536835e-06, "loss": 2.0205, "step": 9630 }, { "epoch": 1.2344137655062024, "grad_norm": 0.5915380120277405, "learning_rate": 3.824471492632928e-06, "loss": 2.0291, "step": 9640 }, { "epoch": 1.2356942777110844, "grad_norm": 0.5630759596824646, "learning_rate": 3.818065342729021e-06, "loss": 2.0108, "step": 9650 }, { "epoch": 1.2369747899159664, "grad_norm": 0.5415170788764954, "learning_rate": 3.8116591928251122e-06, "loss": 2.0513, "step": 9660 }, { "epoch": 1.2382553021208484, "grad_norm": 0.6158027052879333, "learning_rate": 3.8052530429212046e-06, "loss": 2.0158, "step": 9670 }, { "epoch": 1.2395358143257302, "grad_norm": 0.6029166579246521, "learning_rate": 3.7988468930172965e-06, "loss": 2.0285, "step": 9680 }, { "epoch": 1.2408163265306122, "grad_norm": 0.5941691994667053, "learning_rate": 3.7924407431133893e-06, "loss": 2.0606, "step": 9690 }, { "epoch": 1.2420968387354943, "grad_norm": 0.5230644941329956, "learning_rate": 3.7860345932094817e-06, "loss": 2.0413, "step": 9700 }, { "epoch": 1.2420968387354943, "eval_loss": 2.054208993911743, "eval_runtime": 99.4257, "eval_samples_per_second": 10.058, "eval_steps_per_second": 5.029, "step": 9700 }, { "epoch": 1.243377350940376, "grad_norm": 0.5665590167045593, "learning_rate": 3.7796284433055736e-06, "loss": 2.049, "step": 9710 }, { "epoch": 1.244657863145258, "grad_norm": 0.5213637351989746, "learning_rate": 3.773222293401666e-06, "loss": 2.0367, "step": 9720 }, { "epoch": 1.24593837535014, "grad_norm": 0.5417614579200745, "learning_rate": 3.766816143497758e-06, "loss": 2.0392, "step": 9730 }, { "epoch": 1.247218887555022, "grad_norm": 0.5841271877288818, "learning_rate": 3.7604099935938503e-06, "loss": 2.0346, "step": 9740 }, { "epoch": 1.2484993997599039, "grad_norm": 0.5251507759094238, "learning_rate": 3.7540038436899427e-06, "loss": 2.0386, "step": 9750 }, { "epoch": 1.2497799119647859, "grad_norm": 0.5105574727058411, "learning_rate": 3.7475976937860346e-06, "loss": 2.0266, "step": 9760 }, { "epoch": 1.251060424169668, "grad_norm": 0.5459333062171936, "learning_rate": 3.7411915438821274e-06, "loss": 2.0536, "step": 9770 }, { "epoch": 1.2523409363745497, "grad_norm": 0.5234960317611694, "learning_rate": 3.7347853939782194e-06, "loss": 2.0312, "step": 9780 }, { "epoch": 1.2536214485794317, "grad_norm": 0.5144737958908081, "learning_rate": 3.7283792440743117e-06, "loss": 2.0374, "step": 9790 }, { "epoch": 1.2549019607843137, "grad_norm": 0.587037980556488, "learning_rate": 3.721973094170404e-06, "loss": 2.0438, "step": 9800 }, { "epoch": 1.2549019607843137, "eval_loss": 2.053664445877075, "eval_runtime": 99.1442, "eval_samples_per_second": 10.086, "eval_steps_per_second": 5.043, "step": 9800 }, { "epoch": 1.2561824729891957, "grad_norm": 0.6080870628356934, "learning_rate": 3.715566944266496e-06, "loss": 2.0679, "step": 9810 }, { "epoch": 1.2574629851940777, "grad_norm": 0.6218572854995728, "learning_rate": 3.7091607943625884e-06, "loss": 1.9985, "step": 9820 }, { "epoch": 1.2587434973989595, "grad_norm": 0.6245348453521729, "learning_rate": 3.7027546444586804e-06, "loss": 2.0422, "step": 9830 }, { "epoch": 1.2600240096038415, "grad_norm": 0.5775768756866455, "learning_rate": 3.6963484945547727e-06, "loss": 2.0326, "step": 9840 }, { "epoch": 1.2613045218087235, "grad_norm": 0.6755483746528625, "learning_rate": 3.6899423446508655e-06, "loss": 2.0396, "step": 9850 }, { "epoch": 1.2625850340136053, "grad_norm": 0.6291812062263489, "learning_rate": 3.683536194746957e-06, "loss": 2.0173, "step": 9860 }, { "epoch": 1.2638655462184873, "grad_norm": 0.5336599946022034, "learning_rate": 3.67713004484305e-06, "loss": 2.0217, "step": 9870 }, { "epoch": 1.2651460584233694, "grad_norm": 0.6076573133468628, "learning_rate": 3.6707238949391418e-06, "loss": 2.0252, "step": 9880 }, { "epoch": 1.2664265706282514, "grad_norm": 0.5939626097679138, "learning_rate": 3.664317745035234e-06, "loss": 2.0559, "step": 9890 }, { "epoch": 1.2677070828331334, "grad_norm": 0.552836537361145, "learning_rate": 3.6579115951313265e-06, "loss": 2.0234, "step": 9900 }, { "epoch": 1.2677070828331334, "eval_loss": 2.0530409812927246, "eval_runtime": 99.1336, "eval_samples_per_second": 10.087, "eval_steps_per_second": 5.044, "step": 9900 }, { "epoch": 1.2689875950380152, "grad_norm": 0.5579429268836975, "learning_rate": 3.6515054452274185e-06, "loss": 2.0606, "step": 9910 }, { "epoch": 1.2702681072428972, "grad_norm": 0.6526882648468018, "learning_rate": 3.645099295323511e-06, "loss": 2.0306, "step": 9920 }, { "epoch": 1.2715486194477792, "grad_norm": 0.5143726468086243, "learning_rate": 3.638693145419603e-06, "loss": 2.0193, "step": 9930 }, { "epoch": 1.272829131652661, "grad_norm": 0.5656778812408447, "learning_rate": 3.632286995515695e-06, "loss": 2.0027, "step": 9940 }, { "epoch": 1.274109643857543, "grad_norm": 0.5471328496932983, "learning_rate": 3.625880845611788e-06, "loss": 2.0292, "step": 9950 }, { "epoch": 1.275390156062425, "grad_norm": 0.5603408813476562, "learning_rate": 3.61947469570788e-06, "loss": 2.0045, "step": 9960 }, { "epoch": 1.276670668267307, "grad_norm": 0.47398802638053894, "learning_rate": 3.6130685458039722e-06, "loss": 2.0263, "step": 9970 }, { "epoch": 1.2779511804721888, "grad_norm": 0.539801299571991, "learning_rate": 3.6066623959000646e-06, "loss": 2.0117, "step": 9980 }, { "epoch": 1.2792316926770708, "grad_norm": 0.5288535356521606, "learning_rate": 3.6002562459961565e-06, "loss": 2.0066, "step": 9990 }, { "epoch": 1.2805122048819528, "grad_norm": 0.608700156211853, "learning_rate": 3.593850096092249e-06, "loss": 2.0319, "step": 10000 }, { "epoch": 1.2805122048819528, "eval_loss": 2.0524823665618896, "eval_runtime": 99.123, "eval_samples_per_second": 10.088, "eval_steps_per_second": 5.044, "step": 10000 }, { "epoch": 1.2817927170868346, "grad_norm": 0.54178386926651, "learning_rate": 3.587443946188341e-06, "loss": 2.0233, "step": 10010 }, { "epoch": 1.2830732292917166, "grad_norm": 0.6180025935173035, "learning_rate": 3.5810377962844332e-06, "loss": 2.0276, "step": 10020 }, { "epoch": 1.2843537414965986, "grad_norm": 0.5264298915863037, "learning_rate": 3.574631646380526e-06, "loss": 2.0516, "step": 10030 }, { "epoch": 1.2856342537014807, "grad_norm": 0.604239821434021, "learning_rate": 3.568225496476618e-06, "loss": 2.0007, "step": 10040 }, { "epoch": 1.2869147659063627, "grad_norm": 0.7111391425132751, "learning_rate": 3.5618193465727103e-06, "loss": 2.0468, "step": 10050 }, { "epoch": 1.2881952781112445, "grad_norm": 0.6468245387077332, "learning_rate": 3.5554131966688023e-06, "loss": 2.0405, "step": 10060 }, { "epoch": 1.2894757903161265, "grad_norm": 0.5630547404289246, "learning_rate": 3.5490070467648946e-06, "loss": 2.0402, "step": 10070 }, { "epoch": 1.2907563025210085, "grad_norm": 0.5179298520088196, "learning_rate": 3.542600896860987e-06, "loss": 1.987, "step": 10080 }, { "epoch": 1.2920368147258903, "grad_norm": 0.5753514766693115, "learning_rate": 3.536194746957079e-06, "loss": 2.0206, "step": 10090 }, { "epoch": 1.2933173269307723, "grad_norm": 0.5431790947914124, "learning_rate": 3.5297885970531713e-06, "loss": 2.0548, "step": 10100 }, { "epoch": 1.2933173269307723, "eval_loss": 2.0521152019500732, "eval_runtime": 99.0994, "eval_samples_per_second": 10.091, "eval_steps_per_second": 5.045, "step": 10100 }, { "epoch": 1.2945978391356543, "grad_norm": 0.5497155785560608, "learning_rate": 3.5233824471492633e-06, "loss": 2.0285, "step": 10110 }, { "epoch": 1.2958783513405363, "grad_norm": 0.6134312748908997, "learning_rate": 3.5169762972453556e-06, "loss": 2.0303, "step": 10120 }, { "epoch": 1.297158863545418, "grad_norm": 0.5417413711547852, "learning_rate": 3.5105701473414484e-06, "loss": 2.0536, "step": 10130 }, { "epoch": 1.2984393757503, "grad_norm": 0.5879444479942322, "learning_rate": 3.5041639974375404e-06, "loss": 2.038, "step": 10140 }, { "epoch": 1.2997198879551821, "grad_norm": 0.5949389338493347, "learning_rate": 3.4977578475336327e-06, "loss": 2.0338, "step": 10150 }, { "epoch": 1.301000400160064, "grad_norm": 0.5224178433418274, "learning_rate": 3.4913516976297247e-06, "loss": 2.0411, "step": 10160 }, { "epoch": 1.302280912364946, "grad_norm": 0.5205852389335632, "learning_rate": 3.484945547725817e-06, "loss": 2.0258, "step": 10170 }, { "epoch": 1.303561424569828, "grad_norm": 0.5723111629486084, "learning_rate": 3.4785393978219094e-06, "loss": 2.0226, "step": 10180 }, { "epoch": 1.30484193677471, "grad_norm": 0.5134619474411011, "learning_rate": 3.4721332479180014e-06, "loss": 2.0444, "step": 10190 }, { "epoch": 1.306122448979592, "grad_norm": 0.5348407626152039, "learning_rate": 3.4657270980140937e-06, "loss": 2.0145, "step": 10200 }, { "epoch": 1.306122448979592, "eval_loss": 2.051806926727295, "eval_runtime": 99.149, "eval_samples_per_second": 10.086, "eval_steps_per_second": 5.043, "step": 10200 }, { "epoch": 1.3074029611844737, "grad_norm": 0.5550008416175842, "learning_rate": 3.4593209481101857e-06, "loss": 2.0606, "step": 10210 }, { "epoch": 1.3086834733893558, "grad_norm": 0.5189396142959595, "learning_rate": 3.4529147982062785e-06, "loss": 2.034, "step": 10220 }, { "epoch": 1.3099639855942378, "grad_norm": 0.5279061794281006, "learning_rate": 3.446508648302371e-06, "loss": 2.0319, "step": 10230 }, { "epoch": 1.3112444977991196, "grad_norm": 0.5703163146972656, "learning_rate": 3.4401024983984628e-06, "loss": 2.0314, "step": 10240 }, { "epoch": 1.3125250100040016, "grad_norm": 0.6252114176750183, "learning_rate": 3.433696348494555e-06, "loss": 1.993, "step": 10250 }, { "epoch": 1.3138055222088836, "grad_norm": 0.5590977072715759, "learning_rate": 3.427290198590647e-06, "loss": 2.0505, "step": 10260 }, { "epoch": 1.3150860344137656, "grad_norm": 0.5424992442131042, "learning_rate": 3.4208840486867395e-06, "loss": 2.0414, "step": 10270 }, { "epoch": 1.3163665466186474, "grad_norm": 0.5849244594573975, "learning_rate": 3.414477898782832e-06, "loss": 2.0071, "step": 10280 }, { "epoch": 1.3176470588235294, "grad_norm": 0.5255408883094788, "learning_rate": 3.4080717488789238e-06, "loss": 2.0389, "step": 10290 }, { "epoch": 1.3189275710284114, "grad_norm": 0.4956037104129791, "learning_rate": 3.4016655989750166e-06, "loss": 2.0287, "step": 10300 }, { "epoch": 1.3189275710284114, "eval_loss": 2.051372528076172, "eval_runtime": 99.1664, "eval_samples_per_second": 10.084, "eval_steps_per_second": 5.042, "step": 10300 }, { "epoch": 1.3202080832332932, "grad_norm": 0.4993341863155365, "learning_rate": 3.395259449071108e-06, "loss": 2.026, "step": 10310 }, { "epoch": 1.3214885954381752, "grad_norm": 0.5299360156059265, "learning_rate": 3.388853299167201e-06, "loss": 2.042, "step": 10320 }, { "epoch": 1.3227691076430572, "grad_norm": 0.5740581750869751, "learning_rate": 3.3824471492632932e-06, "loss": 2.0569, "step": 10330 }, { "epoch": 1.3240496198479392, "grad_norm": 0.5364329218864441, "learning_rate": 3.376040999359385e-06, "loss": 2.0324, "step": 10340 }, { "epoch": 1.3253301320528212, "grad_norm": 0.6236289143562317, "learning_rate": 3.3696348494554776e-06, "loss": 2.031, "step": 10350 }, { "epoch": 1.326610644257703, "grad_norm": 0.5699196457862854, "learning_rate": 3.36322869955157e-06, "loss": 2.0305, "step": 10360 }, { "epoch": 1.327891156462585, "grad_norm": 0.5235342383384705, "learning_rate": 3.356822549647662e-06, "loss": 2.0462, "step": 10370 }, { "epoch": 1.329171668667467, "grad_norm": 0.529191255569458, "learning_rate": 3.3504163997437542e-06, "loss": 2.0034, "step": 10380 }, { "epoch": 1.3304521808723488, "grad_norm": 0.5550540089607239, "learning_rate": 3.344010249839846e-06, "loss": 2.0158, "step": 10390 }, { "epoch": 1.3317326930772309, "grad_norm": 0.520746111869812, "learning_rate": 3.337604099935939e-06, "loss": 2.0009, "step": 10400 }, { "epoch": 1.3317326930772309, "eval_loss": 2.0515472888946533, "eval_runtime": 99.0648, "eval_samples_per_second": 10.094, "eval_steps_per_second": 5.047, "step": 10400 }, { "epoch": 1.3330132052821129, "grad_norm": 0.5105487704277039, "learning_rate": 3.3311979500320313e-06, "loss": 2.0296, "step": 10410 }, { "epoch": 1.3342937174869949, "grad_norm": 0.5224109292030334, "learning_rate": 3.3247918001281233e-06, "loss": 2.0376, "step": 10420 }, { "epoch": 1.3355742296918767, "grad_norm": 0.5944901704788208, "learning_rate": 3.3183856502242157e-06, "loss": 2.059, "step": 10430 }, { "epoch": 1.3368547418967587, "grad_norm": 0.5145648121833801, "learning_rate": 3.3119795003203076e-06, "loss": 2.0578, "step": 10440 }, { "epoch": 1.3381352541016407, "grad_norm": 0.617449939250946, "learning_rate": 3.3055733504164e-06, "loss": 2.0399, "step": 10450 }, { "epoch": 1.3394157663065225, "grad_norm": 0.5237758159637451, "learning_rate": 3.2991672005124923e-06, "loss": 2.0474, "step": 10460 }, { "epoch": 1.3406962785114045, "grad_norm": 0.5313665270805359, "learning_rate": 3.2927610506085843e-06, "loss": 2.0433, "step": 10470 }, { "epoch": 1.3419767907162865, "grad_norm": 0.6537880301475525, "learning_rate": 3.286354900704677e-06, "loss": 1.995, "step": 10480 }, { "epoch": 1.3432573029211685, "grad_norm": 0.6162858009338379, "learning_rate": 3.2799487508007686e-06, "loss": 2.0267, "step": 10490 }, { "epoch": 1.3445378151260505, "grad_norm": 0.5398708581924438, "learning_rate": 3.2735426008968614e-06, "loss": 2.0481, "step": 10500 }, { "epoch": 1.3445378151260505, "eval_loss": 2.0509071350097656, "eval_runtime": 98.9856, "eval_samples_per_second": 10.102, "eval_steps_per_second": 5.051, "step": 10500 }, { "epoch": 1.3458183273309323, "grad_norm": 0.5513540506362915, "learning_rate": 3.2671364509929538e-06, "loss": 2.0313, "step": 10510 }, { "epoch": 1.3470988395358143, "grad_norm": 0.5159798860549927, "learning_rate": 3.2607303010890457e-06, "loss": 2.0131, "step": 10520 }, { "epoch": 1.3483793517406963, "grad_norm": 0.6065656542778015, "learning_rate": 3.254324151185138e-06, "loss": 2.0335, "step": 10530 }, { "epoch": 1.3496598639455781, "grad_norm": 0.6260432600975037, "learning_rate": 3.24791800128123e-06, "loss": 2.0343, "step": 10540 }, { "epoch": 1.3509403761504601, "grad_norm": 0.581193745136261, "learning_rate": 3.2415118513773224e-06, "loss": 2.045, "step": 10550 }, { "epoch": 1.3522208883553422, "grad_norm": 0.5331265926361084, "learning_rate": 3.235105701473415e-06, "loss": 2.0505, "step": 10560 }, { "epoch": 1.3535014005602242, "grad_norm": 0.6132077574729919, "learning_rate": 3.2286995515695067e-06, "loss": 2.0658, "step": 10570 }, { "epoch": 1.3547819127651062, "grad_norm": 0.5657082796096802, "learning_rate": 3.2222934016655995e-06, "loss": 2.0355, "step": 10580 }, { "epoch": 1.356062424969988, "grad_norm": 0.526736319065094, "learning_rate": 3.2158872517616914e-06, "loss": 2.0322, "step": 10590 }, { "epoch": 1.35734293717487, "grad_norm": 0.5426225662231445, "learning_rate": 3.209481101857784e-06, "loss": 2.0405, "step": 10600 }, { "epoch": 1.35734293717487, "eval_loss": 2.0509629249572754, "eval_runtime": 98.9708, "eval_samples_per_second": 10.104, "eval_steps_per_second": 5.052, "step": 10600 }, { "epoch": 1.3586234493797518, "grad_norm": 0.5064254403114319, "learning_rate": 3.203074951953876e-06, "loss": 2.0416, "step": 10610 }, { "epoch": 1.3599039615846338, "grad_norm": 0.5440548062324524, "learning_rate": 3.196668802049968e-06, "loss": 2.0526, "step": 10620 }, { "epoch": 1.3611844737895158, "grad_norm": 0.58577960729599, "learning_rate": 3.1902626521460605e-06, "loss": 2.0292, "step": 10630 }, { "epoch": 1.3624649859943978, "grad_norm": 0.5705699324607849, "learning_rate": 3.1838565022421524e-06, "loss": 2.042, "step": 10640 }, { "epoch": 1.3637454981992798, "grad_norm": 0.5341757535934448, "learning_rate": 3.1774503523382448e-06, "loss": 2.0116, "step": 10650 }, { "epoch": 1.3650260104041616, "grad_norm": 0.6056538820266724, "learning_rate": 3.1710442024343376e-06, "loss": 2.0721, "step": 10660 }, { "epoch": 1.3663065226090436, "grad_norm": 0.4826757311820984, "learning_rate": 3.1646380525304295e-06, "loss": 2.0463, "step": 10670 }, { "epoch": 1.3675870348139256, "grad_norm": 0.5867482423782349, "learning_rate": 3.158231902626522e-06, "loss": 2.028, "step": 10680 }, { "epoch": 1.3688675470188074, "grad_norm": 0.5410043597221375, "learning_rate": 3.151825752722614e-06, "loss": 2.0253, "step": 10690 }, { "epoch": 1.3701480592236894, "grad_norm": 0.5709148049354553, "learning_rate": 3.145419602818706e-06, "loss": 2.0292, "step": 10700 }, { "epoch": 1.3701480592236894, "eval_loss": 2.0503604412078857, "eval_runtime": 98.9792, "eval_samples_per_second": 10.103, "eval_steps_per_second": 5.052, "step": 10700 }, { "epoch": 1.3714285714285714, "grad_norm": 0.5207972526550293, "learning_rate": 3.1390134529147986e-06, "loss": 2.0498, "step": 10710 }, { "epoch": 1.3727090836334535, "grad_norm": 0.5460594892501831, "learning_rate": 3.1326073030108905e-06, "loss": 2.0332, "step": 10720 }, { "epoch": 1.3739895958383355, "grad_norm": 0.5334863066673279, "learning_rate": 3.126201153106983e-06, "loss": 2.0316, "step": 10730 }, { "epoch": 1.3752701080432173, "grad_norm": 0.537344217300415, "learning_rate": 3.119795003203075e-06, "loss": 2.0213, "step": 10740 }, { "epoch": 1.3765506202480993, "grad_norm": 0.5321835279464722, "learning_rate": 3.113388853299167e-06, "loss": 2.0214, "step": 10750 }, { "epoch": 1.377831132452981, "grad_norm": 0.5158102512359619, "learning_rate": 3.10698270339526e-06, "loss": 2.0267, "step": 10760 }, { "epoch": 1.379111644657863, "grad_norm": 0.5449768900871277, "learning_rate": 3.100576553491352e-06, "loss": 2.0605, "step": 10770 }, { "epoch": 1.380392156862745, "grad_norm": 0.6002796292304993, "learning_rate": 3.0941704035874443e-06, "loss": 2.0518, "step": 10780 }, { "epoch": 1.381672669067627, "grad_norm": 0.6357347965240479, "learning_rate": 3.0877642536835367e-06, "loss": 2.0349, "step": 10790 }, { "epoch": 1.382953181272509, "grad_norm": 0.5326115489006042, "learning_rate": 3.0813581037796286e-06, "loss": 2.0338, "step": 10800 }, { "epoch": 1.382953181272509, "eval_loss": 2.0504214763641357, "eval_runtime": 98.5425, "eval_samples_per_second": 10.148, "eval_steps_per_second": 5.074, "step": 10800 }, { "epoch": 1.384233693477391, "grad_norm": 0.6251921653747559, "learning_rate": 3.074951953875721e-06, "loss": 2.0001, "step": 10810 }, { "epoch": 1.385514205682273, "grad_norm": 0.5165422558784485, "learning_rate": 3.068545803971813e-06, "loss": 2.0127, "step": 10820 }, { "epoch": 1.386794717887155, "grad_norm": 0.5653632879257202, "learning_rate": 3.0621396540679053e-06, "loss": 2.0206, "step": 10830 }, { "epoch": 1.3880752300920367, "grad_norm": 0.6335858106613159, "learning_rate": 3.055733504163998e-06, "loss": 2.0242, "step": 10840 }, { "epoch": 1.3893557422969187, "grad_norm": 0.5169991254806519, "learning_rate": 3.04932735426009e-06, "loss": 1.9994, "step": 10850 }, { "epoch": 1.3906362545018007, "grad_norm": 0.5450325012207031, "learning_rate": 3.0429212043561824e-06, "loss": 2.0271, "step": 10860 }, { "epoch": 1.3919167667066827, "grad_norm": 0.567139744758606, "learning_rate": 3.0365150544522743e-06, "loss": 2.0143, "step": 10870 }, { "epoch": 1.3931972789115648, "grad_norm": 0.5426153540611267, "learning_rate": 3.0301089045483667e-06, "loss": 2.0127, "step": 10880 }, { "epoch": 1.3944777911164465, "grad_norm": 0.5378096103668213, "learning_rate": 3.023702754644459e-06, "loss": 1.9883, "step": 10890 }, { "epoch": 1.3957583033213286, "grad_norm": 0.561470627784729, "learning_rate": 3.017296604740551e-06, "loss": 2.023, "step": 10900 }, { "epoch": 1.3957583033213286, "eval_loss": 2.049633026123047, "eval_runtime": 98.5354, "eval_samples_per_second": 10.149, "eval_steps_per_second": 5.074, "step": 10900 }, { "epoch": 1.3970388155262106, "grad_norm": 0.5516214966773987, "learning_rate": 3.0108904548366434e-06, "loss": 2.0643, "step": 10910 }, { "epoch": 1.3983193277310924, "grad_norm": 0.4827578365802765, "learning_rate": 3.0044843049327353e-06, "loss": 2.0247, "step": 10920 }, { "epoch": 1.3995998399359744, "grad_norm": 0.5677744746208191, "learning_rate": 2.998078155028828e-06, "loss": 2.0615, "step": 10930 }, { "epoch": 1.4008803521408564, "grad_norm": 0.5234193801879883, "learning_rate": 2.9916720051249205e-06, "loss": 2.0143, "step": 10940 }, { "epoch": 1.4021608643457384, "grad_norm": 0.5785296559333801, "learning_rate": 2.9852658552210124e-06, "loss": 2.0158, "step": 10950 }, { "epoch": 1.4034413765506202, "grad_norm": 0.5425563454627991, "learning_rate": 2.978859705317105e-06, "loss": 2.0083, "step": 10960 }, { "epoch": 1.4047218887555022, "grad_norm": 0.5265486240386963, "learning_rate": 2.9724535554131967e-06, "loss": 2.0602, "step": 10970 }, { "epoch": 1.4060024009603842, "grad_norm": 0.485811710357666, "learning_rate": 2.966047405509289e-06, "loss": 1.9982, "step": 10980 }, { "epoch": 1.407282913165266, "grad_norm": 0.5507806539535522, "learning_rate": 2.9596412556053815e-06, "loss": 2.0149, "step": 10990 }, { "epoch": 1.408563425370148, "grad_norm": 0.5665221810340881, "learning_rate": 2.9532351057014734e-06, "loss": 2.047, "step": 11000 }, { "epoch": 1.408563425370148, "eval_loss": 2.049713134765625, "eval_runtime": 99.4377, "eval_samples_per_second": 10.057, "eval_steps_per_second": 5.028, "step": 11000 }, { "epoch": 1.40984393757503, "grad_norm": 0.5265170335769653, "learning_rate": 2.946828955797566e-06, "loss": 2.0403, "step": 11010 }, { "epoch": 1.411124449779912, "grad_norm": 0.6089005470275879, "learning_rate": 2.9404228058936577e-06, "loss": 1.9957, "step": 11020 }, { "epoch": 1.412404961984794, "grad_norm": 0.6830588579177856, "learning_rate": 2.9340166559897505e-06, "loss": 1.9934, "step": 11030 }, { "epoch": 1.4136854741896758, "grad_norm": 0.6724534630775452, "learning_rate": 2.927610506085843e-06, "loss": 2.0147, "step": 11040 }, { "epoch": 1.4149659863945578, "grad_norm": 0.49919191002845764, "learning_rate": 2.921204356181935e-06, "loss": 2.0375, "step": 11050 }, { "epoch": 1.4162464985994399, "grad_norm": 0.6525809168815613, "learning_rate": 2.9147982062780272e-06, "loss": 2.0364, "step": 11060 }, { "epoch": 1.4175270108043216, "grad_norm": 0.5216648578643799, "learning_rate": 2.908392056374119e-06, "loss": 2.0526, "step": 11070 }, { "epoch": 1.4188075230092037, "grad_norm": 0.5448505282402039, "learning_rate": 2.9019859064702115e-06, "loss": 1.9872, "step": 11080 }, { "epoch": 1.4200880352140857, "grad_norm": 0.5219195485115051, "learning_rate": 2.895579756566304e-06, "loss": 2.0573, "step": 11090 }, { "epoch": 1.4213685474189677, "grad_norm": 0.5638669729232788, "learning_rate": 2.889173606662396e-06, "loss": 2.0511, "step": 11100 }, { "epoch": 1.4213685474189677, "eval_loss": 2.049281597137451, "eval_runtime": 99.6036, "eval_samples_per_second": 10.04, "eval_steps_per_second": 5.02, "step": 11100 }, { "epoch": 1.4226490596238495, "grad_norm": 0.513096272945404, "learning_rate": 2.8827674567584886e-06, "loss": 2.0564, "step": 11110 }, { "epoch": 1.4239295718287315, "grad_norm": 0.5226659774780273, "learning_rate": 2.8763613068545806e-06, "loss": 2.0418, "step": 11120 }, { "epoch": 1.4252100840336135, "grad_norm": 0.5489218831062317, "learning_rate": 2.869955156950673e-06, "loss": 2.0133, "step": 11130 }, { "epoch": 1.4264905962384953, "grad_norm": 0.5376583337783813, "learning_rate": 2.8635490070467653e-06, "loss": 2.0261, "step": 11140 }, { "epoch": 1.4277711084433773, "grad_norm": 0.5678580403327942, "learning_rate": 2.8571428571428573e-06, "loss": 2.0399, "step": 11150 }, { "epoch": 1.4290516206482593, "grad_norm": 0.541229784488678, "learning_rate": 2.8507367072389496e-06, "loss": 2.0245, "step": 11160 }, { "epoch": 1.4303321328531413, "grad_norm": 0.5533165335655212, "learning_rate": 2.8443305573350416e-06, "loss": 2.0679, "step": 11170 }, { "epoch": 1.4316126450580233, "grad_norm": 0.5560323596000671, "learning_rate": 2.837924407431134e-06, "loss": 2.04, "step": 11180 }, { "epoch": 1.4328931572629051, "grad_norm": 0.4824937880039215, "learning_rate": 2.8315182575272267e-06, "loss": 2.0237, "step": 11190 }, { "epoch": 1.4341736694677871, "grad_norm": 0.5155817866325378, "learning_rate": 2.8251121076233182e-06, "loss": 2.007, "step": 11200 }, { "epoch": 1.4341736694677871, "eval_loss": 2.0490078926086426, "eval_runtime": 99.8932, "eval_samples_per_second": 10.011, "eval_steps_per_second": 5.005, "step": 11200 }, { "epoch": 1.4354541816726691, "grad_norm": 0.5290130376815796, "learning_rate": 2.818705957719411e-06, "loss": 1.9982, "step": 11210 }, { "epoch": 1.436734693877551, "grad_norm": 0.49369412660598755, "learning_rate": 2.8122998078155034e-06, "loss": 2.0366, "step": 11220 }, { "epoch": 1.438015206082433, "grad_norm": 0.5680967569351196, "learning_rate": 2.8058936579115954e-06, "loss": 2.0296, "step": 11230 }, { "epoch": 1.439295718287315, "grad_norm": 0.5298722386360168, "learning_rate": 2.7994875080076877e-06, "loss": 2.0521, "step": 11240 }, { "epoch": 1.440576230492197, "grad_norm": 0.49635031819343567, "learning_rate": 2.7930813581037797e-06, "loss": 2.0323, "step": 11250 }, { "epoch": 1.4418567426970788, "grad_norm": 0.5607994198799133, "learning_rate": 2.786675208199872e-06, "loss": 2.0054, "step": 11260 }, { "epoch": 1.4431372549019608, "grad_norm": 0.47166672348976135, "learning_rate": 2.7802690582959644e-06, "loss": 2.0273, "step": 11270 }, { "epoch": 1.4444177671068428, "grad_norm": 0.5480349063873291, "learning_rate": 2.7738629083920563e-06, "loss": 2.0768, "step": 11280 }, { "epoch": 1.4456982793117246, "grad_norm": 0.5854456424713135, "learning_rate": 2.767456758488149e-06, "loss": 2.0255, "step": 11290 }, { "epoch": 1.4469787915166066, "grad_norm": 0.4981571137905121, "learning_rate": 2.761050608584241e-06, "loss": 2.0187, "step": 11300 }, { "epoch": 1.4469787915166066, "eval_loss": 2.048654317855835, "eval_runtime": 99.9023, "eval_samples_per_second": 10.01, "eval_steps_per_second": 5.005, "step": 11300 }, { "epoch": 1.4482593037214886, "grad_norm": 0.5446431636810303, "learning_rate": 2.7546444586803334e-06, "loss": 2.0112, "step": 11310 }, { "epoch": 1.4495398159263706, "grad_norm": 0.634131133556366, "learning_rate": 2.748238308776426e-06, "loss": 2.01, "step": 11320 }, { "epoch": 1.4508203281312526, "grad_norm": 0.5008243322372437, "learning_rate": 2.7418321588725178e-06, "loss": 2.0596, "step": 11330 }, { "epoch": 1.4521008403361344, "grad_norm": 0.5615090131759644, "learning_rate": 2.73542600896861e-06, "loss": 2.0458, "step": 11340 }, { "epoch": 1.4533813525410164, "grad_norm": 0.5126698017120361, "learning_rate": 2.729019859064702e-06, "loss": 2.0417, "step": 11350 }, { "epoch": 1.4546618647458984, "grad_norm": 0.5614376664161682, "learning_rate": 2.7226137091607944e-06, "loss": 2.0055, "step": 11360 }, { "epoch": 1.4559423769507802, "grad_norm": 0.5798308849334717, "learning_rate": 2.7162075592568872e-06, "loss": 2.0426, "step": 11370 }, { "epoch": 1.4572228891556622, "grad_norm": 0.5568918585777283, "learning_rate": 2.709801409352979e-06, "loss": 2.0319, "step": 11380 }, { "epoch": 1.4585034013605442, "grad_norm": 0.5090416073799133, "learning_rate": 2.7033952594490715e-06, "loss": 2.0174, "step": 11390 }, { "epoch": 1.4597839135654262, "grad_norm": 0.5406888127326965, "learning_rate": 2.6969891095451635e-06, "loss": 2.0338, "step": 11400 }, { "epoch": 1.4597839135654262, "eval_loss": 2.0484492778778076, "eval_runtime": 99.8004, "eval_samples_per_second": 10.02, "eval_steps_per_second": 5.01, "step": 11400 }, { "epoch": 1.4610644257703083, "grad_norm": 0.5530595183372498, "learning_rate": 2.690582959641256e-06, "loss": 2.0025, "step": 11410 }, { "epoch": 1.46234493797519, "grad_norm": 0.47053778171539307, "learning_rate": 2.6841768097373482e-06, "loss": 2.0359, "step": 11420 }, { "epoch": 1.463625450180072, "grad_norm": 0.5242395997047424, "learning_rate": 2.67777065983344e-06, "loss": 2.0162, "step": 11430 }, { "epoch": 1.4649059623849539, "grad_norm": 0.5642728805541992, "learning_rate": 2.6713645099295325e-06, "loss": 2.0372, "step": 11440 }, { "epoch": 1.4661864745898359, "grad_norm": 0.5605606436729431, "learning_rate": 2.6649583600256245e-06, "loss": 2.0256, "step": 11450 }, { "epoch": 1.4674669867947179, "grad_norm": 0.49169278144836426, "learning_rate": 2.658552210121717e-06, "loss": 2.0293, "step": 11460 }, { "epoch": 1.4687474989995999, "grad_norm": 0.5536981821060181, "learning_rate": 2.6521460602178096e-06, "loss": 2.0429, "step": 11470 }, { "epoch": 1.470028011204482, "grad_norm": 0.48586413264274597, "learning_rate": 2.6457399103139016e-06, "loss": 2.0335, "step": 11480 }, { "epoch": 1.4713085234093637, "grad_norm": 0.5224795341491699, "learning_rate": 2.639333760409994e-06, "loss": 2.0395, "step": 11490 }, { "epoch": 1.4725890356142457, "grad_norm": 0.49194952845573425, "learning_rate": 2.632927610506086e-06, "loss": 2.0357, "step": 11500 }, { "epoch": 1.4725890356142457, "eval_loss": 2.0482566356658936, "eval_runtime": 99.9705, "eval_samples_per_second": 10.003, "eval_steps_per_second": 5.001, "step": 11500 }, { "epoch": 1.4738695478191277, "grad_norm": 0.5626738667488098, "learning_rate": 2.6265214606021783e-06, "loss": 2.0273, "step": 11510 }, { "epoch": 1.4751500600240095, "grad_norm": 0.5302162170410156, "learning_rate": 2.6201153106982706e-06, "loss": 2.0381, "step": 11520 }, { "epoch": 1.4764305722288915, "grad_norm": 0.4928155839443207, "learning_rate": 2.6137091607943626e-06, "loss": 1.9951, "step": 11530 }, { "epoch": 1.4777110844337735, "grad_norm": 0.5598331093788147, "learning_rate": 2.607303010890455e-06, "loss": 1.9921, "step": 11540 }, { "epoch": 1.4789915966386555, "grad_norm": 0.5367702841758728, "learning_rate": 2.600896860986547e-06, "loss": 2.0363, "step": 11550 }, { "epoch": 1.4802721088435375, "grad_norm": 0.5699712038040161, "learning_rate": 2.5944907110826397e-06, "loss": 2.0159, "step": 11560 }, { "epoch": 1.4815526210484193, "grad_norm": 0.5477683544158936, "learning_rate": 2.588084561178732e-06, "loss": 2.0371, "step": 11570 }, { "epoch": 1.4828331332533013, "grad_norm": 0.558222770690918, "learning_rate": 2.581678411274824e-06, "loss": 2.029, "step": 11580 }, { "epoch": 1.4841136454581831, "grad_norm": 0.5154983997344971, "learning_rate": 2.5752722613709164e-06, "loss": 1.9831, "step": 11590 }, { "epoch": 1.4853941576630652, "grad_norm": 0.5119442939758301, "learning_rate": 2.5688661114670083e-06, "loss": 2.0158, "step": 11600 }, { "epoch": 1.4853941576630652, "eval_loss": 2.0478100776672363, "eval_runtime": 100.2631, "eval_samples_per_second": 9.974, "eval_steps_per_second": 4.987, "step": 11600 }, { "epoch": 1.4866746698679472, "grad_norm": 0.5495085716247559, "learning_rate": 2.5624599615631007e-06, "loss": 2.0299, "step": 11610 }, { "epoch": 1.4879551820728292, "grad_norm": 0.5720316767692566, "learning_rate": 2.556053811659193e-06, "loss": 2.0443, "step": 11620 }, { "epoch": 1.4892356942777112, "grad_norm": 0.5065346956253052, "learning_rate": 2.549647661755285e-06, "loss": 2.0156, "step": 11630 }, { "epoch": 1.490516206482593, "grad_norm": 0.553152322769165, "learning_rate": 2.5432415118513778e-06, "loss": 2.0176, "step": 11640 }, { "epoch": 1.491796718687475, "grad_norm": 0.5240514874458313, "learning_rate": 2.53683536194747e-06, "loss": 2.0506, "step": 11650 }, { "epoch": 1.493077230892357, "grad_norm": 0.499566912651062, "learning_rate": 2.530429212043562e-06, "loss": 1.9855, "step": 11660 }, { "epoch": 1.4943577430972388, "grad_norm": 0.5063506960868835, "learning_rate": 2.5240230621396545e-06, "loss": 2.0325, "step": 11670 }, { "epoch": 1.4956382553021208, "grad_norm": 0.5436708927154541, "learning_rate": 2.5176169122357464e-06, "loss": 2.0225, "step": 11680 }, { "epoch": 1.4969187675070028, "grad_norm": 0.5449490547180176, "learning_rate": 2.5112107623318388e-06, "loss": 2.0252, "step": 11690 }, { "epoch": 1.4981992797118848, "grad_norm": 0.4780167043209076, "learning_rate": 2.504804612427931e-06, "loss": 2.0297, "step": 11700 }, { "epoch": 1.4981992797118848, "eval_loss": 2.047696352005005, "eval_runtime": 100.4412, "eval_samples_per_second": 9.956, "eval_steps_per_second": 4.978, "step": 11700 }, { "epoch": 1.4994797919167668, "grad_norm": 0.5443697571754456, "learning_rate": 2.498398462524023e-06, "loss": 2.0305, "step": 11710 }, { "epoch": 1.5007603041216486, "grad_norm": 0.5622988939285278, "learning_rate": 2.4919923126201155e-06, "loss": 2.0363, "step": 11720 }, { "epoch": 1.5020408163265306, "grad_norm": 0.5276266932487488, "learning_rate": 2.485586162716208e-06, "loss": 2.0153, "step": 11730 }, { "epoch": 1.5033213285314124, "grad_norm": 0.576263427734375, "learning_rate": 2.4791800128123e-06, "loss": 2.0082, "step": 11740 }, { "epoch": 1.5046018407362944, "grad_norm": 0.557447612285614, "learning_rate": 2.472773862908392e-06, "loss": 2.023, "step": 11750 }, { "epoch": 1.5058823529411764, "grad_norm": 0.5237441062927246, "learning_rate": 2.4663677130044845e-06, "loss": 2.0219, "step": 11760 }, { "epoch": 1.5071628651460585, "grad_norm": 0.5576190948486328, "learning_rate": 2.459961563100577e-06, "loss": 2.0207, "step": 11770 }, { "epoch": 1.5084433773509405, "grad_norm": 0.5819739103317261, "learning_rate": 2.4535554131966692e-06, "loss": 2.0091, "step": 11780 }, { "epoch": 1.5097238895558225, "grad_norm": 0.5563110113143921, "learning_rate": 2.447149263292761e-06, "loss": 2.0456, "step": 11790 }, { "epoch": 1.5110044017607043, "grad_norm": 0.5227601528167725, "learning_rate": 2.4407431133888535e-06, "loss": 2.0574, "step": 11800 }, { "epoch": 1.5110044017607043, "eval_loss": 2.0476009845733643, "eval_runtime": 100.4291, "eval_samples_per_second": 9.957, "eval_steps_per_second": 4.979, "step": 11800 }, { "epoch": 1.5122849139655863, "grad_norm": 0.5243479609489441, "learning_rate": 2.4343369634849455e-06, "loss": 2.0271, "step": 11810 }, { "epoch": 1.513565426170468, "grad_norm": 0.5676461458206177, "learning_rate": 2.4279308135810383e-06, "loss": 2.03, "step": 11820 }, { "epoch": 1.51484593837535, "grad_norm": 0.4777722656726837, "learning_rate": 2.4215246636771302e-06, "loss": 2.022, "step": 11830 }, { "epoch": 1.516126450580232, "grad_norm": 0.5181421041488647, "learning_rate": 2.4151185137732226e-06, "loss": 2.0287, "step": 11840 }, { "epoch": 1.517406962785114, "grad_norm": 0.601375937461853, "learning_rate": 2.4087123638693145e-06, "loss": 2.0235, "step": 11850 }, { "epoch": 1.5186874749899961, "grad_norm": 0.5849976539611816, "learning_rate": 2.402306213965407e-06, "loss": 2.012, "step": 11860 }, { "epoch": 1.519967987194878, "grad_norm": 0.5280182957649231, "learning_rate": 2.3959000640614993e-06, "loss": 2.0154, "step": 11870 }, { "epoch": 1.52124849939976, "grad_norm": 0.511968195438385, "learning_rate": 2.3894939141575916e-06, "loss": 2.0395, "step": 11880 }, { "epoch": 1.5225290116046417, "grad_norm": 0.551898717880249, "learning_rate": 2.3830877642536836e-06, "loss": 2.0001, "step": 11890 }, { "epoch": 1.5238095238095237, "grad_norm": 0.5360147356987, "learning_rate": 2.376681614349776e-06, "loss": 2.0597, "step": 11900 }, { "epoch": 1.5238095238095237, "eval_loss": 2.04695987701416, "eval_runtime": 99.822, "eval_samples_per_second": 10.018, "eval_steps_per_second": 5.009, "step": 11900 }, { "epoch": 1.5250900360144057, "grad_norm": 0.5133785605430603, "learning_rate": 2.370275464445868e-06, "loss": 2.015, "step": 11910 }, { "epoch": 1.5263705482192877, "grad_norm": 0.5302312970161438, "learning_rate": 2.3638693145419607e-06, "loss": 2.0294, "step": 11920 }, { "epoch": 1.5276510604241698, "grad_norm": 0.4700910151004791, "learning_rate": 2.3574631646380526e-06, "loss": 2.0163, "step": 11930 }, { "epoch": 1.5289315726290518, "grad_norm": 0.5158265829086304, "learning_rate": 2.351057014734145e-06, "loss": 2.0465, "step": 11940 }, { "epoch": 1.5302120848339336, "grad_norm": 0.5115975141525269, "learning_rate": 2.344650864830237e-06, "loss": 2.0268, "step": 11950 }, { "epoch": 1.5314925970388156, "grad_norm": 0.5275582671165466, "learning_rate": 2.3382447149263297e-06, "loss": 2.0449, "step": 11960 }, { "epoch": 1.5327731092436974, "grad_norm": 0.6041198968887329, "learning_rate": 2.3318385650224217e-06, "loss": 2.02, "step": 11970 }, { "epoch": 1.5340536214485794, "grad_norm": 0.5098729133605957, "learning_rate": 2.325432415118514e-06, "loss": 2.0326, "step": 11980 }, { "epoch": 1.5353341336534614, "grad_norm": 0.5320337414741516, "learning_rate": 2.319026265214606e-06, "loss": 2.0282, "step": 11990 }, { "epoch": 1.5366146458583434, "grad_norm": 0.5207633376121521, "learning_rate": 2.3126201153106984e-06, "loss": 2.0131, "step": 12000 }, { "epoch": 1.5366146458583434, "eval_loss": 2.0469398498535156, "eval_runtime": 100.2772, "eval_samples_per_second": 9.972, "eval_steps_per_second": 4.986, "step": 12000 }, { "epoch": 1.5378951580632254, "grad_norm": 0.5872837901115417, "learning_rate": 2.3062139654067907e-06, "loss": 2.0213, "step": 12010 }, { "epoch": 1.5391756702681072, "grad_norm": 0.5789574384689331, "learning_rate": 2.299807815502883e-06, "loss": 2.0385, "step": 12020 }, { "epoch": 1.5404561824729892, "grad_norm": 0.525754451751709, "learning_rate": 2.293401665598975e-06, "loss": 2.0177, "step": 12030 }, { "epoch": 1.541736694677871, "grad_norm": 0.5442928671836853, "learning_rate": 2.2869955156950674e-06, "loss": 2.0465, "step": 12040 }, { "epoch": 1.543017206882753, "grad_norm": 0.5661875605583191, "learning_rate": 2.2805893657911598e-06, "loss": 2.0367, "step": 12050 }, { "epoch": 1.544297719087635, "grad_norm": 0.54268479347229, "learning_rate": 2.274183215887252e-06, "loss": 2.0191, "step": 12060 }, { "epoch": 1.545578231292517, "grad_norm": 0.5016104578971863, "learning_rate": 2.267777065983344e-06, "loss": 2.0518, "step": 12070 }, { "epoch": 1.546858743497399, "grad_norm": 0.5148719549179077, "learning_rate": 2.2613709160794365e-06, "loss": 2.0253, "step": 12080 }, { "epoch": 1.548139255702281, "grad_norm": 0.5588960647583008, "learning_rate": 2.2549647661755284e-06, "loss": 1.9878, "step": 12090 }, { "epoch": 1.5494197679071628, "grad_norm": 0.482328325510025, "learning_rate": 2.2485586162716208e-06, "loss": 2.0306, "step": 12100 }, { "epoch": 1.5494197679071628, "eval_loss": 2.046799898147583, "eval_runtime": 100.2859, "eval_samples_per_second": 9.971, "eval_steps_per_second": 4.986, "step": 12100 }, { "epoch": 3.4510774710596617, "grad_norm": 0.5546593070030212, "learning_rate": 2.242152466367713e-06, "loss": 2.0075, "step": 12110 }, { "epoch": 3.4539269813000892, "grad_norm": 0.5128650665283203, "learning_rate": 2.2357463164638055e-06, "loss": 2.0324, "step": 12120 }, { "epoch": 3.4567764915405164, "grad_norm": 0.5232759714126587, "learning_rate": 2.2293401665598975e-06, "loss": 2.0405, "step": 12130 }, { "epoch": 3.459626001780944, "grad_norm": 0.4983081817626953, "learning_rate": 2.22293401665599e-06, "loss": 2.0282, "step": 12140 }, { "epoch": 3.4624755120213715, "grad_norm": 0.6100337505340576, "learning_rate": 2.216527866752082e-06, "loss": 2.0741, "step": 12150 }, { "epoch": 3.4653250222617986, "grad_norm": 0.5565346479415894, "learning_rate": 2.2101217168481746e-06, "loss": 2.0402, "step": 12160 }, { "epoch": 3.468174532502226, "grad_norm": 0.5013384222984314, "learning_rate": 2.2037155669442665e-06, "loss": 2.0275, "step": 12170 }, { "epoch": 3.4710240427426537, "grad_norm": 0.5147426724433899, "learning_rate": 2.197309417040359e-06, "loss": 2.0196, "step": 12180 }, { "epoch": 3.473873552983081, "grad_norm": 0.5874170064926147, "learning_rate": 2.1909032671364512e-06, "loss": 2.0267, "step": 12190 }, { "epoch": 3.4767230632235084, "grad_norm": 0.535470187664032, "learning_rate": 2.1844971172325436e-06, "loss": 2.0293, "step": 12200 }, { "epoch": 3.4767230632235084, "eval_loss": 2.0467445850372314, "eval_runtime": 99.9294, "eval_samples_per_second": 10.007, "eval_steps_per_second": 5.004, "step": 12200 }, { "epoch": 3.479572573463936, "grad_norm": 0.6096872687339783, "learning_rate": 2.1780909673286356e-06, "loss": 2.02, "step": 12210 }, { "epoch": 3.482422083704363, "grad_norm": 0.5604642629623413, "learning_rate": 2.171684817424728e-06, "loss": 2.0279, "step": 12220 }, { "epoch": 3.4852715939447907, "grad_norm": 0.5537389516830444, "learning_rate": 2.1652786675208203e-06, "loss": 2.0111, "step": 12230 }, { "epoch": 3.4881211041852183, "grad_norm": 0.541696310043335, "learning_rate": 2.1588725176169122e-06, "loss": 1.9872, "step": 12240 }, { "epoch": 3.4909706144256454, "grad_norm": 0.53986656665802, "learning_rate": 2.1524663677130046e-06, "loss": 2.0234, "step": 12250 }, { "epoch": 3.493820124666073, "grad_norm": 0.5188933610916138, "learning_rate": 2.146060217809097e-06, "loss": 2.0336, "step": 12260 }, { "epoch": 3.4966696349065005, "grad_norm": 0.5203558802604675, "learning_rate": 2.1396540679051893e-06, "loss": 2.0097, "step": 12270 }, { "epoch": 3.4995191451469276, "grad_norm": 0.5333787798881531, "learning_rate": 2.1332479180012813e-06, "loss": 2.0643, "step": 12280 }, { "epoch": 3.502368655387355, "grad_norm": 0.4590858519077301, "learning_rate": 2.1268417680973736e-06, "loss": 2.0481, "step": 12290 }, { "epoch": 3.5052181656277828, "grad_norm": 0.5285468697547913, "learning_rate": 2.120435618193466e-06, "loss": 2.0291, "step": 12300 }, { "epoch": 3.5052181656277828, "eval_loss": 2.0466067790985107, "eval_runtime": 99.2486, "eval_samples_per_second": 10.076, "eval_steps_per_second": 5.038, "step": 12300 }, { "epoch": 3.50806767586821, "grad_norm": 0.48412200808525085, "learning_rate": 2.1140294682895584e-06, "loss": 2.0402, "step": 12310 }, { "epoch": 3.5109171861086375, "grad_norm": 0.6032649278640747, "learning_rate": 2.1076233183856503e-06, "loss": 2.0287, "step": 12320 }, { "epoch": 3.513766696349065, "grad_norm": 0.61771160364151, "learning_rate": 2.1012171684817427e-06, "loss": 2.0319, "step": 12330 }, { "epoch": 3.5166162065894926, "grad_norm": 0.542523980140686, "learning_rate": 2.0948110185778346e-06, "loss": 2.0293, "step": 12340 }, { "epoch": 3.5194657168299197, "grad_norm": 0.5993557572364807, "learning_rate": 2.088404868673927e-06, "loss": 2.0295, "step": 12350 }, { "epoch": 3.5223152270703473, "grad_norm": 0.5857672691345215, "learning_rate": 2.0819987187700194e-06, "loss": 2.0595, "step": 12360 }, { "epoch": 3.525164737310775, "grad_norm": 0.511431097984314, "learning_rate": 2.0755925688661117e-06, "loss": 2.0043, "step": 12370 }, { "epoch": 3.5280142475512024, "grad_norm": 0.5659806132316589, "learning_rate": 2.0691864189622037e-06, "loss": 2.0351, "step": 12380 }, { "epoch": 3.5308637577916295, "grad_norm": 0.5137180685997009, "learning_rate": 2.062780269058296e-06, "loss": 2.0509, "step": 12390 }, { "epoch": 3.533713268032057, "grad_norm": 0.5334777235984802, "learning_rate": 2.0563741191543884e-06, "loss": 2.048, "step": 12400 }, { "epoch": 3.533713268032057, "eval_loss": 2.046497106552124, "eval_runtime": 99.3964, "eval_samples_per_second": 10.061, "eval_steps_per_second": 5.03, "step": 12400 }, { "epoch": 3.5365627782724847, "grad_norm": 0.5657955408096313, "learning_rate": 2.049967969250481e-06, "loss": 2.038, "step": 12410 }, { "epoch": 3.539412288512912, "grad_norm": 0.6064898371696472, "learning_rate": 2.0435618193465727e-06, "loss": 2.0117, "step": 12420 }, { "epoch": 3.5422617987533394, "grad_norm": 0.5460246205329895, "learning_rate": 2.037155669442665e-06, "loss": 2.0349, "step": 12430 }, { "epoch": 3.545111308993767, "grad_norm": 0.514192521572113, "learning_rate": 2.0307495195387575e-06, "loss": 2.03, "step": 12440 }, { "epoch": 3.547960819234194, "grad_norm": 0.5747187733650208, "learning_rate": 2.02434336963485e-06, "loss": 2.0259, "step": 12450 }, { "epoch": 3.5508103294746216, "grad_norm": 0.5013620853424072, "learning_rate": 2.0179372197309418e-06, "loss": 2.0476, "step": 12460 }, { "epoch": 3.553659839715049, "grad_norm": 0.5189679861068726, "learning_rate": 2.011531069827034e-06, "loss": 1.9952, "step": 12470 }, { "epoch": 3.5565093499554763, "grad_norm": 0.5946024656295776, "learning_rate": 2.005124919923126e-06, "loss": 2.0115, "step": 12480 }, { "epoch": 3.559358860195904, "grad_norm": 0.6720226407051086, "learning_rate": 1.998718770019219e-06, "loss": 2.0522, "step": 12490 }, { "epoch": 3.5622083704363314, "grad_norm": 0.526604950428009, "learning_rate": 1.992312620115311e-06, "loss": 2.058, "step": 12500 }, { "epoch": 3.5622083704363314, "eval_loss": 2.0465705394744873, "eval_runtime": 99.2186, "eval_samples_per_second": 10.079, "eval_steps_per_second": 5.039, "step": 12500 }, { "epoch": 3.5650578806767586, "grad_norm": 0.49224352836608887, "learning_rate": 1.985906470211403e-06, "loss": 2.0419, "step": 12510 }, { "epoch": 3.567907390917186, "grad_norm": 0.4674229025840759, "learning_rate": 1.979500320307495e-06, "loss": 2.02, "step": 12520 }, { "epoch": 3.5707569011576137, "grad_norm": 0.5586433410644531, "learning_rate": 1.9730941704035875e-06, "loss": 2.0593, "step": 12530 }, { "epoch": 3.573606411398041, "grad_norm": 0.5551003217697144, "learning_rate": 1.96668802049968e-06, "loss": 2.0137, "step": 12540 }, { "epoch": 3.5764559216384684, "grad_norm": 0.5269743204116821, "learning_rate": 1.9602818705957723e-06, "loss": 2.0045, "step": 12550 }, { "epoch": 3.579305431878896, "grad_norm": 0.5576637387275696, "learning_rate": 1.953875720691864e-06, "loss": 2.0143, "step": 12560 }, { "epoch": 3.582154942119323, "grad_norm": 0.5212885737419128, "learning_rate": 1.9474695707879566e-06, "loss": 2.0637, "step": 12570 }, { "epoch": 3.5850044523597506, "grad_norm": 0.5004526376724243, "learning_rate": 1.941063420884049e-06, "loss": 2.0068, "step": 12580 }, { "epoch": 3.587853962600178, "grad_norm": 0.5198807120323181, "learning_rate": 1.9346572709801413e-06, "loss": 2.024, "step": 12590 }, { "epoch": 3.5907034728406053, "grad_norm": 0.5772445201873779, "learning_rate": 1.9282511210762332e-06, "loss": 1.9967, "step": 12600 }, { "epoch": 3.5907034728406053, "eval_loss": 2.046096086502075, "eval_runtime": 99.0889, "eval_samples_per_second": 10.092, "eval_steps_per_second": 5.046, "step": 12600 }, { "epoch": 3.593552983081033, "grad_norm": 0.5501157641410828, "learning_rate": 1.9218449711723256e-06, "loss": 2.053, "step": 12610 }, { "epoch": 3.5964024933214604, "grad_norm": 0.5690680742263794, "learning_rate": 1.9154388212684176e-06, "loss": 2.0267, "step": 12620 }, { "epoch": 3.5992520035618876, "grad_norm": 0.5340695381164551, "learning_rate": 1.9090326713645103e-06, "loss": 2.0102, "step": 12630 }, { "epoch": 3.602101513802315, "grad_norm": 0.5075132846832275, "learning_rate": 1.9026265214606023e-06, "loss": 2.0389, "step": 12640 }, { "epoch": 3.6049510240427427, "grad_norm": 0.5131998062133789, "learning_rate": 1.8962203715566947e-06, "loss": 2.0137, "step": 12650 }, { "epoch": 3.60780053428317, "grad_norm": 0.5327854156494141, "learning_rate": 1.8898142216527868e-06, "loss": 2.0175, "step": 12660 }, { "epoch": 3.6106500445235974, "grad_norm": 0.559907853603363, "learning_rate": 1.883408071748879e-06, "loss": 2.0391, "step": 12670 }, { "epoch": 3.613499554764025, "grad_norm": 0.5571854710578918, "learning_rate": 1.8770019218449713e-06, "loss": 2.0433, "step": 12680 }, { "epoch": 3.6163490650044525, "grad_norm": 0.5199988484382629, "learning_rate": 1.8705957719410637e-06, "loss": 2.0239, "step": 12690 }, { "epoch": 3.6191985752448796, "grad_norm": 0.5144962072372437, "learning_rate": 1.8641896220371559e-06, "loss": 2.0293, "step": 12700 }, { "epoch": 3.6191985752448796, "eval_loss": 2.0460119247436523, "eval_runtime": 99.2153, "eval_samples_per_second": 10.079, "eval_steps_per_second": 5.04, "step": 12700 }, { "epoch": 3.622048085485307, "grad_norm": 0.4945001006126404, "learning_rate": 1.857783472133248e-06, "loss": 2.0161, "step": 12710 }, { "epoch": 3.624897595725735, "grad_norm": 0.5314003825187683, "learning_rate": 1.8513773222293402e-06, "loss": 2.0173, "step": 12720 }, { "epoch": 3.627747105966162, "grad_norm": 0.5037127137184143, "learning_rate": 1.8449711723254328e-06, "loss": 2.061, "step": 12730 }, { "epoch": 3.6305966162065895, "grad_norm": 0.5831037163734436, "learning_rate": 1.838565022421525e-06, "loss": 2.0186, "step": 12740 }, { "epoch": 3.633446126447017, "grad_norm": 0.5599620342254639, "learning_rate": 1.832158872517617e-06, "loss": 2.0394, "step": 12750 }, { "epoch": 3.6362956366874446, "grad_norm": 0.543129026889801, "learning_rate": 1.8257527226137092e-06, "loss": 2.0262, "step": 12760 }, { "epoch": 3.6391451469278717, "grad_norm": 0.5593090653419495, "learning_rate": 1.8193465727098016e-06, "loss": 2.0316, "step": 12770 }, { "epoch": 3.6419946571682993, "grad_norm": 0.49719634652137756, "learning_rate": 1.812940422805894e-06, "loss": 2.0332, "step": 12780 }, { "epoch": 3.644844167408727, "grad_norm": 0.5381009578704834, "learning_rate": 1.8065342729019861e-06, "loss": 2.0232, "step": 12790 }, { "epoch": 3.647693677649154, "grad_norm": 0.4995189905166626, "learning_rate": 1.8001281229980783e-06, "loss": 2.0279, "step": 12800 }, { "epoch": 3.647693677649154, "eval_loss": 2.0456788539886475, "eval_runtime": 99.2397, "eval_samples_per_second": 10.077, "eval_steps_per_second": 5.038, "step": 12800 }, { "epoch": 3.6505431878895815, "grad_norm": 0.5659130215644836, "learning_rate": 1.7937219730941704e-06, "loss": 2.0061, "step": 12810 }, { "epoch": 3.653392698130009, "grad_norm": 0.5485902428627014, "learning_rate": 1.787315823190263e-06, "loss": 2.0367, "step": 12820 }, { "epoch": 3.6562422083704362, "grad_norm": 0.5191209316253662, "learning_rate": 1.7809096732863552e-06, "loss": 2.0558, "step": 12830 }, { "epoch": 3.659091718610864, "grad_norm": 0.5433029532432556, "learning_rate": 1.7745035233824473e-06, "loss": 2.0278, "step": 12840 }, { "epoch": 3.6619412288512914, "grad_norm": 0.5491988658905029, "learning_rate": 1.7680973734785395e-06, "loss": 2.0234, "step": 12850 }, { "epoch": 3.6647907390917185, "grad_norm": 0.5946006178855896, "learning_rate": 1.7616912235746316e-06, "loss": 2.0259, "step": 12860 }, { "epoch": 3.667640249332146, "grad_norm": 0.5561248660087585, "learning_rate": 1.7552850736707242e-06, "loss": 2.0301, "step": 12870 }, { "epoch": 3.6704897595725736, "grad_norm": 0.5854030847549438, "learning_rate": 1.7488789237668164e-06, "loss": 2.0477, "step": 12880 }, { "epoch": 3.6733392698130007, "grad_norm": 0.5319736003875732, "learning_rate": 1.7424727738629085e-06, "loss": 2.0438, "step": 12890 }, { "epoch": 3.6761887800534283, "grad_norm": 0.6675127744674683, "learning_rate": 1.7360666239590007e-06, "loss": 1.9977, "step": 12900 }, { "epoch": 3.6761887800534283, "eval_loss": 2.045774459838867, "eval_runtime": 99.3171, "eval_samples_per_second": 10.069, "eval_steps_per_second": 5.034, "step": 12900 }, { "epoch": 3.679038290293856, "grad_norm": 0.5847189426422119, "learning_rate": 1.7296604740550928e-06, "loss": 2.028, "step": 12910 }, { "epoch": 3.681887800534283, "grad_norm": 0.4568108022212982, "learning_rate": 1.7232543241511854e-06, "loss": 2.0211, "step": 12920 }, { "epoch": 3.6847373107747106, "grad_norm": 0.5319178700447083, "learning_rate": 1.7168481742472776e-06, "loss": 2.0161, "step": 12930 }, { "epoch": 3.687586821015138, "grad_norm": 0.5896403193473816, "learning_rate": 1.7104420243433697e-06, "loss": 2.0309, "step": 12940 }, { "epoch": 3.6904363312555652, "grad_norm": 0.5127771496772766, "learning_rate": 1.7040358744394619e-06, "loss": 2.0223, "step": 12950 }, { "epoch": 3.693285841495993, "grad_norm": 0.5874524712562561, "learning_rate": 1.697629724535554e-06, "loss": 2.0504, "step": 12960 }, { "epoch": 3.6961353517364204, "grad_norm": 0.5469905734062195, "learning_rate": 1.6912235746316466e-06, "loss": 2.0001, "step": 12970 }, { "epoch": 3.6989848619768475, "grad_norm": 0.5713154673576355, "learning_rate": 1.6848174247277388e-06, "loss": 2.0103, "step": 12980 }, { "epoch": 3.701834372217275, "grad_norm": 0.5152024626731873, "learning_rate": 1.678411274823831e-06, "loss": 2.0357, "step": 12990 }, { "epoch": 3.7046838824577026, "grad_norm": 0.5928826928138733, "learning_rate": 1.672005124919923e-06, "loss": 2.0149, "step": 13000 }, { "epoch": 3.7046838824577026, "eval_loss": 2.04563570022583, "eval_runtime": 99.3373, "eval_samples_per_second": 10.067, "eval_steps_per_second": 5.033, "step": 13000 }, { "epoch": 3.7075333926981298, "grad_norm": 0.5253980159759521, "learning_rate": 1.6655989750160157e-06, "loss": 2.0268, "step": 13010 }, { "epoch": 3.7103829029385573, "grad_norm": 0.4610345661640167, "learning_rate": 1.6591928251121078e-06, "loss": 2.0119, "step": 13020 }, { "epoch": 3.713232413178985, "grad_norm": 0.5618305206298828, "learning_rate": 1.6527866752082e-06, "loss": 2.034, "step": 13030 }, { "epoch": 3.716081923419412, "grad_norm": 0.5720108151435852, "learning_rate": 1.6463805253042921e-06, "loss": 2.0145, "step": 13040 }, { "epoch": 3.7189314336598396, "grad_norm": 0.5844509601593018, "learning_rate": 1.6399743754003843e-06, "loss": 2.0254, "step": 13050 }, { "epoch": 3.721780943900267, "grad_norm": 0.4931798279285431, "learning_rate": 1.6335682254964769e-06, "loss": 1.9958, "step": 13060 }, { "epoch": 3.7246304541406947, "grad_norm": 0.5254191160202026, "learning_rate": 1.627162075592569e-06, "loss": 2.0289, "step": 13070 }, { "epoch": 3.727479964381122, "grad_norm": 0.5591420531272888, "learning_rate": 1.6207559256886612e-06, "loss": 2.0336, "step": 13080 }, { "epoch": 3.7303294746215494, "grad_norm": 0.5102550983428955, "learning_rate": 1.6143497757847533e-06, "loss": 1.9987, "step": 13090 }, { "epoch": 3.733178984861977, "grad_norm": 0.5043312907218933, "learning_rate": 1.6079436258808457e-06, "loss": 2.0445, "step": 13100 }, { "epoch": 3.733178984861977, "eval_loss": 2.0453715324401855, "eval_runtime": 99.1158, "eval_samples_per_second": 10.089, "eval_steps_per_second": 5.045, "step": 13100 }, { "epoch": 3.7360284951024045, "grad_norm": 0.5792537927627563, "learning_rate": 1.601537475976938e-06, "loss": 2.0013, "step": 13110 }, { "epoch": 3.7388780053428317, "grad_norm": 0.5740078687667847, "learning_rate": 1.5951313260730302e-06, "loss": 2.0344, "step": 13120 }, { "epoch": 3.7417275155832592, "grad_norm": 0.5518689155578613, "learning_rate": 1.5887251761691224e-06, "loss": 2.0372, "step": 13130 }, { "epoch": 3.744577025823687, "grad_norm": 0.510488748550415, "learning_rate": 1.5823190262652148e-06, "loss": 2.0235, "step": 13140 }, { "epoch": 3.747426536064114, "grad_norm": 0.5459367632865906, "learning_rate": 1.575912876361307e-06, "loss": 2.0174, "step": 13150 }, { "epoch": 3.7502760463045415, "grad_norm": 0.5442193746566772, "learning_rate": 1.5695067264573993e-06, "loss": 2.0168, "step": 13160 }, { "epoch": 3.753125556544969, "grad_norm": 0.5245689749717712, "learning_rate": 1.5631005765534914e-06, "loss": 2.0254, "step": 13170 }, { "epoch": 3.755975066785396, "grad_norm": 0.5361449122428894, "learning_rate": 1.5566944266495836e-06, "loss": 2.0347, "step": 13180 }, { "epoch": 3.7588245770258237, "grad_norm": 0.5554986596107483, "learning_rate": 1.550288276745676e-06, "loss": 2.0124, "step": 13190 }, { "epoch": 3.7616740872662513, "grad_norm": 0.5109909772872925, "learning_rate": 1.5438821268417683e-06, "loss": 2.0287, "step": 13200 }, { "epoch": 3.7616740872662513, "eval_loss": 2.045180559158325, "eval_runtime": 98.7424, "eval_samples_per_second": 10.127, "eval_steps_per_second": 5.064, "step": 13200 }, { "epoch": 3.7645235975066784, "grad_norm": 0.5418664216995239, "learning_rate": 1.5374759769378605e-06, "loss": 2.054, "step": 13210 }, { "epoch": 3.767373107747106, "grad_norm": 0.5335536003112793, "learning_rate": 1.5310698270339526e-06, "loss": 2.0164, "step": 13220 }, { "epoch": 3.7702226179875336, "grad_norm": 0.46435827016830444, "learning_rate": 1.524663677130045e-06, "loss": 1.9936, "step": 13230 }, { "epoch": 3.7730721282279607, "grad_norm": 0.5345796942710876, "learning_rate": 1.5182575272261372e-06, "loss": 2.0085, "step": 13240 }, { "epoch": 3.7759216384683882, "grad_norm": 0.543218731880188, "learning_rate": 1.5118513773222295e-06, "loss": 1.9945, "step": 13250 }, { "epoch": 3.778771148708816, "grad_norm": 0.5109902024269104, "learning_rate": 1.5054452274183217e-06, "loss": 2.0033, "step": 13260 }, { "epoch": 3.781620658949243, "grad_norm": 0.5215564966201782, "learning_rate": 1.499039077514414e-06, "loss": 2.0381, "step": 13270 }, { "epoch": 3.7844701691896705, "grad_norm": 0.4731573164463043, "learning_rate": 1.4926329276105062e-06, "loss": 2.0365, "step": 13280 }, { "epoch": 3.787319679430098, "grad_norm": 0.4979744851589203, "learning_rate": 1.4862267777065984e-06, "loss": 2.0156, "step": 13290 }, { "epoch": 3.790169189670525, "grad_norm": 0.511603057384491, "learning_rate": 1.4798206278026907e-06, "loss": 1.9742, "step": 13300 }, { "epoch": 3.790169189670525, "eval_loss": 2.044865608215332, "eval_runtime": 99.2698, "eval_samples_per_second": 10.074, "eval_steps_per_second": 5.037, "step": 13300 }, { "epoch": 3.7930186999109528, "grad_norm": 0.4679126441478729, "learning_rate": 1.473414477898783e-06, "loss": 2.0291, "step": 13310 }, { "epoch": 3.7958682101513803, "grad_norm": 0.4647921621799469, "learning_rate": 1.4670083279948753e-06, "loss": 1.9977, "step": 13320 }, { "epoch": 3.7987177203918074, "grad_norm": 0.5578404664993286, "learning_rate": 1.4606021780909674e-06, "loss": 2.0358, "step": 13330 }, { "epoch": 3.801567230632235, "grad_norm": 0.5208609104156494, "learning_rate": 1.4541960281870596e-06, "loss": 2.0294, "step": 13340 }, { "epoch": 3.8044167408726626, "grad_norm": 0.5082564353942871, "learning_rate": 1.447789878283152e-06, "loss": 2.0691, "step": 13350 }, { "epoch": 3.8072662511130897, "grad_norm": 0.4937707781791687, "learning_rate": 1.4413837283792443e-06, "loss": 2.0302, "step": 13360 }, { "epoch": 3.8101157613535173, "grad_norm": 0.5191481709480286, "learning_rate": 1.4349775784753365e-06, "loss": 1.9866, "step": 13370 }, { "epoch": 3.812965271593945, "grad_norm": 0.5355870127677917, "learning_rate": 1.4285714285714286e-06, "loss": 2.0613, "step": 13380 }, { "epoch": 3.815814781834372, "grad_norm": 0.5211424827575684, "learning_rate": 1.4221652786675208e-06, "loss": 1.9967, "step": 13390 }, { "epoch": 3.8186642920747995, "grad_norm": 0.4805680811405182, "learning_rate": 1.4157591287636134e-06, "loss": 2.0063, "step": 13400 }, { "epoch": 3.8186642920747995, "eval_loss": 2.0447990894317627, "eval_runtime": 99.2866, "eval_samples_per_second": 10.072, "eval_steps_per_second": 5.036, "step": 13400 }, { "epoch": 3.821513802315227, "grad_norm": 0.6089988350868225, "learning_rate": 1.4093529788597055e-06, "loss": 2.0092, "step": 13410 }, { "epoch": 3.8243633125556546, "grad_norm": 0.5222851037979126, "learning_rate": 1.4029468289557977e-06, "loss": 2.053, "step": 13420 }, { "epoch": 3.8272128227960818, "grad_norm": 0.5201483368873596, "learning_rate": 1.3965406790518898e-06, "loss": 2.0574, "step": 13430 }, { "epoch": 3.8300623330365093, "grad_norm": 0.5058591365814209, "learning_rate": 1.3901345291479822e-06, "loss": 2.0141, "step": 13440 }, { "epoch": 3.832911843276937, "grad_norm": 0.5813695788383484, "learning_rate": 1.3837283792440746e-06, "loss": 2.0406, "step": 13450 }, { "epoch": 3.8357613535173645, "grad_norm": 0.5166908502578735, "learning_rate": 1.3773222293401667e-06, "loss": 2.0228, "step": 13460 }, { "epoch": 3.8386108637577916, "grad_norm": 0.6206843852996826, "learning_rate": 1.3709160794362589e-06, "loss": 2.026, "step": 13470 }, { "epoch": 3.841460373998219, "grad_norm": 0.4896029233932495, "learning_rate": 1.364509929532351e-06, "loss": 2.0333, "step": 13480 }, { "epoch": 3.8443098842386467, "grad_norm": 0.5111129283905029, "learning_rate": 1.3581037796284436e-06, "loss": 2.0121, "step": 13490 }, { "epoch": 3.847159394479074, "grad_norm": 0.5457862615585327, "learning_rate": 1.3516976297245358e-06, "loss": 2.0093, "step": 13500 }, { "epoch": 3.847159394479074, "eval_loss": 2.04461407661438, "eval_runtime": 99.1024, "eval_samples_per_second": 10.091, "eval_steps_per_second": 5.045, "step": 13500 }, { "epoch": 3.8500089047195014, "grad_norm": 0.538062334060669, "learning_rate": 1.345291479820628e-06, "loss": 2.0358, "step": 13510 }, { "epoch": 3.852858414959929, "grad_norm": 0.5242393612861633, "learning_rate": 1.33888532991672e-06, "loss": 2.0331, "step": 13520 }, { "epoch": 3.855707925200356, "grad_norm": 0.5289698243141174, "learning_rate": 1.3324791800128122e-06, "loss": 2.0078, "step": 13530 }, { "epoch": 3.8585574354407837, "grad_norm": 0.5151503086090088, "learning_rate": 1.3260730301089048e-06, "loss": 2.0463, "step": 13540 }, { "epoch": 3.8614069456812112, "grad_norm": 0.5115975737571716, "learning_rate": 1.319666880204997e-06, "loss": 2.0334, "step": 13550 }, { "epoch": 3.8642564559216384, "grad_norm": 0.5621274709701538, "learning_rate": 1.3132607303010891e-06, "loss": 2.0439, "step": 13560 }, { "epoch": 3.867105966162066, "grad_norm": 0.5460280179977417, "learning_rate": 1.3068545803971813e-06, "loss": 2.0014, "step": 13570 }, { "epoch": 3.8699554764024935, "grad_norm": 0.524300217628479, "learning_rate": 1.3004484304932734e-06, "loss": 2.0277, "step": 13580 }, { "epoch": 3.8728049866429206, "grad_norm": 0.5772647261619568, "learning_rate": 1.294042280589366e-06, "loss": 2.0063, "step": 13590 }, { "epoch": 3.875654496883348, "grad_norm": 0.5323388576507568, "learning_rate": 1.2876361306854582e-06, "loss": 2.0306, "step": 13600 }, { "epoch": 3.875654496883348, "eval_loss": 2.044602155685425, "eval_runtime": 99.2737, "eval_samples_per_second": 10.073, "eval_steps_per_second": 5.037, "step": 13600 }, { "epoch": 3.8785040071237757, "grad_norm": 0.57416170835495, "learning_rate": 1.2812299807815503e-06, "loss": 2.0369, "step": 13610 }, { "epoch": 3.881353517364203, "grad_norm": 0.5117653608322144, "learning_rate": 1.2748238308776425e-06, "loss": 1.9994, "step": 13620 }, { "epoch": 3.8842030276046304, "grad_norm": 0.4790748357772827, "learning_rate": 1.268417680973735e-06, "loss": 2.0061, "step": 13630 }, { "epoch": 3.887052537845058, "grad_norm": 0.48775649070739746, "learning_rate": 1.2620115310698272e-06, "loss": 2.0527, "step": 13640 }, { "epoch": 3.889902048085485, "grad_norm": 0.4819037616252899, "learning_rate": 1.2556053811659194e-06, "loss": 2.0403, "step": 13650 }, { "epoch": 3.8927515583259127, "grad_norm": 0.5073307156562805, "learning_rate": 1.2491992312620115e-06, "loss": 2.0114, "step": 13660 }, { "epoch": 3.8956010685663403, "grad_norm": 0.5168735980987549, "learning_rate": 1.242793081358104e-06, "loss": 2.0362, "step": 13670 }, { "epoch": 3.8984505788067674, "grad_norm": 0.5055269598960876, "learning_rate": 1.236386931454196e-06, "loss": 2.0426, "step": 13680 }, { "epoch": 3.901300089047195, "grad_norm": 0.5459708571434021, "learning_rate": 1.2299807815502884e-06, "loss": 2.011, "step": 13690 }, { "epoch": 3.9041495992876225, "grad_norm": 0.5798599123954773, "learning_rate": 1.2235746316463806e-06, "loss": 1.999, "step": 13700 }, { "epoch": 3.9041495992876225, "eval_loss": 2.044389009475708, "eval_runtime": 99.4564, "eval_samples_per_second": 10.055, "eval_steps_per_second": 5.027, "step": 13700 }, { "epoch": 3.9069991095280496, "grad_norm": 0.4718180000782013, "learning_rate": 1.2171684817424727e-06, "loss": 2.023, "step": 13710 }, { "epoch": 3.909848619768477, "grad_norm": 0.558879017829895, "learning_rate": 1.2107623318385651e-06, "loss": 1.996, "step": 13720 }, { "epoch": 3.9126981300089048, "grad_norm": 0.5425081253051758, "learning_rate": 1.2043561819346573e-06, "loss": 2.0198, "step": 13730 }, { "epoch": 3.915547640249332, "grad_norm": 0.516757607460022, "learning_rate": 1.1979500320307496e-06, "loss": 2.006, "step": 13740 }, { "epoch": 3.9183971504897595, "grad_norm": 0.5674097537994385, "learning_rate": 1.1915438821268418e-06, "loss": 2.014, "step": 13750 }, { "epoch": 3.921246660730187, "grad_norm": 0.47472038865089417, "learning_rate": 1.185137732222934e-06, "loss": 2.0161, "step": 13760 }, { "epoch": 3.9240961709706146, "grad_norm": 0.46472105383872986, "learning_rate": 1.1787315823190263e-06, "loss": 1.9891, "step": 13770 }, { "epoch": 3.9269456812110417, "grad_norm": 0.5644447207450867, "learning_rate": 1.1723254324151185e-06, "loss": 2.0327, "step": 13780 }, { "epoch": 3.9297951914514693, "grad_norm": 0.49525147676467896, "learning_rate": 1.1659192825112108e-06, "loss": 1.9746, "step": 13790 }, { "epoch": 3.932644701691897, "grad_norm": 0.5136142373085022, "learning_rate": 1.159513132607303e-06, "loss": 2.0308, "step": 13800 }, { "epoch": 3.932644701691897, "eval_loss": 2.044067621231079, "eval_runtime": 98.9728, "eval_samples_per_second": 10.104, "eval_steps_per_second": 5.052, "step": 13800 }, { "epoch": 3.9354942119323244, "grad_norm": 0.5278817415237427, "learning_rate": 1.1531069827033954e-06, "loss": 2.0229, "step": 13810 }, { "epoch": 3.9383437221727515, "grad_norm": 0.5376277565956116, "learning_rate": 1.1467008327994875e-06, "loss": 2.0198, "step": 13820 }, { "epoch": 3.941193232413179, "grad_norm": 0.5397053956985474, "learning_rate": 1.1402946828955799e-06, "loss": 2.0582, "step": 13830 }, { "epoch": 3.9440427426536067, "grad_norm": 0.5622960925102234, "learning_rate": 1.133888532991672e-06, "loss": 2.017, "step": 13840 }, { "epoch": 3.946892252894034, "grad_norm": 0.5214881896972656, "learning_rate": 1.1274823830877642e-06, "loss": 1.9991, "step": 13850 }, { "epoch": 3.9497417631344613, "grad_norm": 0.47489604353904724, "learning_rate": 1.1210762331838566e-06, "loss": 2.0538, "step": 13860 }, { "epoch": 3.952591273374889, "grad_norm": 0.5163717269897461, "learning_rate": 1.1146700832799487e-06, "loss": 2.0365, "step": 13870 }, { "epoch": 3.955440783615316, "grad_norm": 0.5314338803291321, "learning_rate": 1.108263933376041e-06, "loss": 2.0118, "step": 13880 }, { "epoch": 3.9582902938557436, "grad_norm": 0.47694677114486694, "learning_rate": 1.1018577834721333e-06, "loss": 2.0093, "step": 13890 }, { "epoch": 3.961139804096171, "grad_norm": 0.49023205041885376, "learning_rate": 1.0954516335682256e-06, "loss": 2.0123, "step": 13900 }, { "epoch": 3.961139804096171, "eval_loss": 2.0438590049743652, "eval_runtime": 99.2315, "eval_samples_per_second": 10.077, "eval_steps_per_second": 5.039, "step": 13900 }, { "epoch": 3.9639893143365983, "grad_norm": 0.5529634952545166, "learning_rate": 1.0890454836643178e-06, "loss": 2.026, "step": 13910 }, { "epoch": 3.966838824577026, "grad_norm": 0.49043890833854675, "learning_rate": 1.0826393337604101e-06, "loss": 2.0315, "step": 13920 }, { "epoch": 3.9696883348174534, "grad_norm": 0.5095582008361816, "learning_rate": 1.0762331838565023e-06, "loss": 1.998, "step": 13930 }, { "epoch": 3.9725378450578805, "grad_norm": 0.5738527774810791, "learning_rate": 1.0698270339525947e-06, "loss": 2.0027, "step": 13940 }, { "epoch": 3.975387355298308, "grad_norm": 0.5268344879150391, "learning_rate": 1.0634208840486868e-06, "loss": 2.0028, "step": 13950 }, { "epoch": 3.9782368655387357, "grad_norm": 0.4693433344364166, "learning_rate": 1.0570147341447792e-06, "loss": 2.049, "step": 13960 }, { "epoch": 3.981086375779163, "grad_norm": 0.5160017013549805, "learning_rate": 1.0506085842408713e-06, "loss": 2.0355, "step": 13970 }, { "epoch": 3.9839358860195904, "grad_norm": 0.6007476449012756, "learning_rate": 1.0442024343369635e-06, "loss": 2.0596, "step": 13980 }, { "epoch": 3.986785396260018, "grad_norm": 0.5093830227851868, "learning_rate": 1.0377962844330559e-06, "loss": 2.041, "step": 13990 }, { "epoch": 3.989634906500445, "grad_norm": 0.5473350882530212, "learning_rate": 1.031390134529148e-06, "loss": 2.0365, "step": 14000 }, { "epoch": 3.989634906500445, "eval_loss": 2.043860673904419, "eval_runtime": 99.085, "eval_samples_per_second": 10.092, "eval_steps_per_second": 5.046, "step": 14000 }, { "epoch": 3.9924844167408726, "grad_norm": 0.5005938410758972, "learning_rate": 1.0249839846252404e-06, "loss": 2.0238, "step": 14010 }, { "epoch": 3.9953339269813, "grad_norm": 0.5401079654693604, "learning_rate": 1.0185778347213326e-06, "loss": 2.0377, "step": 14020 }, { "epoch": 3.9981834372217273, "grad_norm": 0.5096141695976257, "learning_rate": 1.012171684817425e-06, "loss": 2.0088, "step": 14030 }, { "epoch": 4.001032947462155, "grad_norm": 0.5210298895835876, "learning_rate": 1.005765534913517e-06, "loss": 2.0331, "step": 14040 }, { "epoch": 4.003882457702582, "grad_norm": 0.47449570894241333, "learning_rate": 9.993593850096094e-07, "loss": 2.0141, "step": 14050 }, { "epoch": 4.00673196794301, "grad_norm": 0.5214652419090271, "learning_rate": 9.929532351057016e-07, "loss": 2.0099, "step": 14060 }, { "epoch": 4.009581478183438, "grad_norm": 0.4921712875366211, "learning_rate": 9.865470852017938e-07, "loss": 2.0109, "step": 14070 }, { "epoch": 4.012430988423865, "grad_norm": 0.4758373200893402, "learning_rate": 9.801409352978861e-07, "loss": 2.0104, "step": 14080 }, { "epoch": 4.015280498664292, "grad_norm": 0.5070475935935974, "learning_rate": 9.737347853939783e-07, "loss": 2.0215, "step": 14090 }, { "epoch": 4.01813000890472, "grad_norm": 0.5147979855537415, "learning_rate": 9.673286354900707e-07, "loss": 1.9953, "step": 14100 }, { "epoch": 4.01813000890472, "eval_loss": 2.0437448024749756, "eval_runtime": 99.1205, "eval_samples_per_second": 10.089, "eval_steps_per_second": 5.044, "step": 14100 }, { "epoch": 4.020979519145147, "grad_norm": 0.5636800527572632, "learning_rate": 9.609224855861628e-07, "loss": 2.0432, "step": 14110 }, { "epoch": 4.023829029385574, "grad_norm": 0.48588666319847107, "learning_rate": 9.545163356822552e-07, "loss": 2.0504, "step": 14120 }, { "epoch": 4.026678539626002, "grad_norm": 0.5209534764289856, "learning_rate": 9.481101857783473e-07, "loss": 2.0058, "step": 14130 }, { "epoch": 4.029528049866429, "grad_norm": 0.4877401888370514, "learning_rate": 9.417040358744395e-07, "loss": 2.0261, "step": 14140 }, { "epoch": 4.032377560106856, "grad_norm": 0.5681753754615784, "learning_rate": 9.352978859705319e-07, "loss": 2.0174, "step": 14150 }, { "epoch": 4.035227070347284, "grad_norm": 0.5463777184486389, "learning_rate": 9.28891736066624e-07, "loss": 2.0013, "step": 14160 }, { "epoch": 4.0380765805877115, "grad_norm": 0.4704899489879608, "learning_rate": 9.224855861627164e-07, "loss": 2.0138, "step": 14170 }, { "epoch": 4.040926090828139, "grad_norm": 0.5133561491966248, "learning_rate": 9.160794362588085e-07, "loss": 2.0229, "step": 14180 }, { "epoch": 4.043775601068567, "grad_norm": 0.5174084901809692, "learning_rate": 9.096732863549008e-07, "loss": 2.0311, "step": 14190 }, { "epoch": 4.046625111308994, "grad_norm": 0.49283725023269653, "learning_rate": 9.032671364509931e-07, "loss": 2.0092, "step": 14200 }, { "epoch": 4.046625111308994, "eval_loss": 2.043433666229248, "eval_runtime": 99.0743, "eval_samples_per_second": 10.093, "eval_steps_per_second": 5.047, "step": 14200 }, { "epoch": 4.049474621549421, "grad_norm": 0.5019715428352356, "learning_rate": 8.968609865470852e-07, "loss": 2.0344, "step": 14210 }, { "epoch": 4.052324131789849, "grad_norm": 0.5088763236999512, "learning_rate": 8.904548366431776e-07, "loss": 2.0216, "step": 14220 }, { "epoch": 4.055173642030276, "grad_norm": 0.5045077800750732, "learning_rate": 8.840486867392697e-07, "loss": 2.0279, "step": 14230 }, { "epoch": 4.058023152270703, "grad_norm": 0.5076019763946533, "learning_rate": 8.776425368353621e-07, "loss": 2.0421, "step": 14240 }, { "epoch": 4.060872662511131, "grad_norm": 0.49383342266082764, "learning_rate": 8.712363869314543e-07, "loss": 2.018, "step": 14250 }, { "epoch": 4.063722172751558, "grad_norm": 0.4705515503883362, "learning_rate": 8.648302370275464e-07, "loss": 2.0218, "step": 14260 }, { "epoch": 4.066571682991985, "grad_norm": 0.5173514485359192, "learning_rate": 8.584240871236388e-07, "loss": 2.0092, "step": 14270 }, { "epoch": 4.069421193232413, "grad_norm": 0.4916168451309204, "learning_rate": 8.520179372197309e-07, "loss": 2.0169, "step": 14280 }, { "epoch": 4.0722707034728405, "grad_norm": 0.4872733950614929, "learning_rate": 8.456117873158233e-07, "loss": 2.0068, "step": 14290 }, { "epoch": 4.075120213713268, "grad_norm": 0.5647644400596619, "learning_rate": 8.392056374119155e-07, "loss": 2.0008, "step": 14300 }, { "epoch": 4.075120213713268, "eval_loss": 2.043240785598755, "eval_runtime": 99.2058, "eval_samples_per_second": 10.08, "eval_steps_per_second": 5.04, "step": 14300 }, { "epoch": 4.077969723953696, "grad_norm": 0.5437302589416504, "learning_rate": 8.327994875080078e-07, "loss": 2.0172, "step": 14310 }, { "epoch": 4.080819234194123, "grad_norm": 0.5404428243637085, "learning_rate": 8.263933376041e-07, "loss": 2.0339, "step": 14320 }, { "epoch": 4.083668744434551, "grad_norm": 0.5169433951377869, "learning_rate": 8.199871877001921e-07, "loss": 2.03, "step": 14330 }, { "epoch": 4.086518254674978, "grad_norm": 0.48950499296188354, "learning_rate": 8.135810377962845e-07, "loss": 2.0209, "step": 14340 }, { "epoch": 4.089367764915405, "grad_norm": 0.48941537737846375, "learning_rate": 8.071748878923767e-07, "loss": 2.0278, "step": 14350 }, { "epoch": 4.092217275155833, "grad_norm": 0.4822404682636261, "learning_rate": 8.00768737988469e-07, "loss": 1.9925, "step": 14360 }, { "epoch": 4.09506678539626, "grad_norm": 0.5627114176750183, "learning_rate": 7.943625880845612e-07, "loss": 2.0017, "step": 14370 }, { "epoch": 4.097916295636687, "grad_norm": 0.5145583152770996, "learning_rate": 7.879564381806535e-07, "loss": 2.014, "step": 14380 }, { "epoch": 4.100765805877115, "grad_norm": 0.476077139377594, "learning_rate": 7.815502882767457e-07, "loss": 2.0253, "step": 14390 }, { "epoch": 4.103615316117542, "grad_norm": 0.4902302026748657, "learning_rate": 7.75144138372838e-07, "loss": 2.0256, "step": 14400 }, { "epoch": 4.103615316117542, "eval_loss": 2.0430707931518555, "eval_runtime": 99.116, "eval_samples_per_second": 10.089, "eval_steps_per_second": 5.045, "step": 14400 }, { "epoch": 4.1064648263579695, "grad_norm": 0.4917190670967102, "learning_rate": 7.687379884689302e-07, "loss": 2.0237, "step": 14410 }, { "epoch": 4.1093143365983975, "grad_norm": 0.48008298873901367, "learning_rate": 7.623318385650225e-07, "loss": 2.0297, "step": 14420 }, { "epoch": 4.112163846838825, "grad_norm": 0.4533262252807617, "learning_rate": 7.559256886611148e-07, "loss": 2.0198, "step": 14430 }, { "epoch": 4.115013357079252, "grad_norm": 0.48702970147132874, "learning_rate": 7.49519538757207e-07, "loss": 2.0298, "step": 14440 }, { "epoch": 4.11786286731968, "grad_norm": 0.48160573840141296, "learning_rate": 7.431133888532992e-07, "loss": 2.0452, "step": 14450 }, { "epoch": 4.120712377560107, "grad_norm": 0.48124462366104126, "learning_rate": 7.367072389493914e-07, "loss": 2.0156, "step": 14460 }, { "epoch": 4.123561887800534, "grad_norm": 0.48351117968559265, "learning_rate": 7.303010890454837e-07, "loss": 2.0027, "step": 14470 }, { "epoch": 4.126411398040962, "grad_norm": 0.5064377784729004, "learning_rate": 7.23894939141576e-07, "loss": 2.0332, "step": 14480 }, { "epoch": 4.129260908281389, "grad_norm": 0.4717184901237488, "learning_rate": 7.174887892376682e-07, "loss": 2.0374, "step": 14490 }, { "epoch": 4.132110418521816, "grad_norm": 0.507052481174469, "learning_rate": 7.110826393337604e-07, "loss": 2.0335, "step": 14500 }, { "epoch": 4.132110418521816, "eval_loss": 2.0429084300994873, "eval_runtime": 99.1818, "eval_samples_per_second": 10.082, "eval_steps_per_second": 5.041, "step": 14500 }, { "epoch": 4.134959928762244, "grad_norm": 0.46373996138572693, "learning_rate": 7.046764894298528e-07, "loss": 2.04, "step": 14510 }, { "epoch": 4.137809439002671, "grad_norm": 0.500369131565094, "learning_rate": 6.982703395259449e-07, "loss": 1.9717, "step": 14520 }, { "epoch": 4.1406589492430985, "grad_norm": 0.47627460956573486, "learning_rate": 6.918641896220373e-07, "loss": 2.0219, "step": 14530 }, { "epoch": 4.1435084594835265, "grad_norm": 0.4577222764492035, "learning_rate": 6.854580397181294e-07, "loss": 1.9991, "step": 14540 }, { "epoch": 4.146357969723954, "grad_norm": 0.4755384027957916, "learning_rate": 6.790518898142218e-07, "loss": 2.0254, "step": 14550 }, { "epoch": 4.149207479964381, "grad_norm": 0.4971500337123871, "learning_rate": 6.72645739910314e-07, "loss": 2.0144, "step": 14560 }, { "epoch": 4.152056990204809, "grad_norm": 0.4956054091453552, "learning_rate": 6.662395900064061e-07, "loss": 2.0501, "step": 14570 }, { "epoch": 4.154906500445236, "grad_norm": 0.4786563813686371, "learning_rate": 6.598334401024985e-07, "loss": 1.9868, "step": 14580 }, { "epoch": 4.157756010685663, "grad_norm": 0.4747292101383209, "learning_rate": 6.534272901985906e-07, "loss": 2.0442, "step": 14590 }, { "epoch": 4.160605520926091, "grad_norm": 0.46925073862075806, "learning_rate": 6.47021140294683e-07, "loss": 2.0345, "step": 14600 }, { "epoch": 4.160605520926091, "eval_loss": 2.0427427291870117, "eval_runtime": 99.2559, "eval_samples_per_second": 10.075, "eval_steps_per_second": 5.037, "step": 14600 }, { "epoch": 4.163455031166518, "grad_norm": 0.4990875720977783, "learning_rate": 6.406149903907752e-07, "loss": 2.0123, "step": 14610 }, { "epoch": 4.166304541406945, "grad_norm": 0.46079766750335693, "learning_rate": 6.342088404868675e-07, "loss": 2.0184, "step": 14620 }, { "epoch": 4.169154051647373, "grad_norm": 0.5010029077529907, "learning_rate": 6.278026905829597e-07, "loss": 2.0524, "step": 14630 }, { "epoch": 4.1720035618878, "grad_norm": 0.4925595223903656, "learning_rate": 6.21396540679052e-07, "loss": 2.0467, "step": 14640 }, { "epoch": 4.1748530721282275, "grad_norm": 0.4509677588939667, "learning_rate": 6.149903907751442e-07, "loss": 2.0128, "step": 14650 }, { "epoch": 4.1777025823686555, "grad_norm": 0.4880792796611786, "learning_rate": 6.085842408712364e-07, "loss": 1.9989, "step": 14660 }, { "epoch": 4.180552092609083, "grad_norm": 0.4964626133441925, "learning_rate": 6.021780909673286e-07, "loss": 2.0136, "step": 14670 }, { "epoch": 4.183401602849511, "grad_norm": 0.46766600012779236, "learning_rate": 5.957719410634209e-07, "loss": 2.0002, "step": 14680 }, { "epoch": 4.186251113089938, "grad_norm": 0.48704707622528076, "learning_rate": 5.893657911595132e-07, "loss": 2.0394, "step": 14690 }, { "epoch": 4.189100623330365, "grad_norm": 0.5206152200698853, "learning_rate": 5.829596412556054e-07, "loss": 2.0207, "step": 14700 }, { "epoch": 4.189100623330365, "eval_loss": 2.042611837387085, "eval_runtime": 99.2584, "eval_samples_per_second": 10.075, "eval_steps_per_second": 5.037, "step": 14700 }, { "epoch": 4.191950133570793, "grad_norm": 0.4617534875869751, "learning_rate": 5.765534913516977e-07, "loss": 2.0204, "step": 14710 }, { "epoch": 4.19479964381122, "grad_norm": 0.4623759388923645, "learning_rate": 5.701473414477899e-07, "loss": 2.0188, "step": 14720 }, { "epoch": 4.197649154051647, "grad_norm": 0.4455204904079437, "learning_rate": 5.637411915438821e-07, "loss": 2.0427, "step": 14730 }, { "epoch": 4.200498664292075, "grad_norm": 0.5701692700386047, "learning_rate": 5.573350416399744e-07, "loss": 2.0028, "step": 14740 }, { "epoch": 4.203348174532502, "grad_norm": 0.48934704065322876, "learning_rate": 5.509288917360666e-07, "loss": 1.9963, "step": 14750 }, { "epoch": 4.206197684772929, "grad_norm": 0.4875647723674774, "learning_rate": 5.445227418321589e-07, "loss": 2.0354, "step": 14760 }, { "epoch": 4.2090471950133574, "grad_norm": 0.4656578004360199, "learning_rate": 5.381165919282512e-07, "loss": 2.006, "step": 14770 }, { "epoch": 4.211896705253785, "grad_norm": 0.5381175875663757, "learning_rate": 5.317104420243434e-07, "loss": 2.0257, "step": 14780 }, { "epoch": 4.214746215494212, "grad_norm": 0.477753609418869, "learning_rate": 5.253042921204357e-07, "loss": 2.0294, "step": 14790 }, { "epoch": 4.21759572573464, "grad_norm": 0.4912634789943695, "learning_rate": 5.188981422165279e-07, "loss": 2.0379, "step": 14800 }, { "epoch": 4.21759572573464, "eval_loss": 2.042499542236328, "eval_runtime": 99.2392, "eval_samples_per_second": 10.077, "eval_steps_per_second": 5.038, "step": 14800 }, { "epoch": 4.220445235975067, "grad_norm": 0.45377182960510254, "learning_rate": 5.124919923126202e-07, "loss": 2.0226, "step": 14810 }, { "epoch": 4.223294746215494, "grad_norm": 0.4863813817501068, "learning_rate": 5.060858424087125e-07, "loss": 2.027, "step": 14820 }, { "epoch": 4.226144256455922, "grad_norm": 0.4592478573322296, "learning_rate": 4.996796925048047e-07, "loss": 2.0243, "step": 14830 }, { "epoch": 4.228993766696349, "grad_norm": 0.4891451895236969, "learning_rate": 4.932735426008969e-07, "loss": 2.0361, "step": 14840 }, { "epoch": 4.231843276936776, "grad_norm": 0.4831571578979492, "learning_rate": 4.868673926969891e-07, "loss": 2.0453, "step": 14850 }, { "epoch": 4.234692787177204, "grad_norm": 0.46735844016075134, "learning_rate": 4.804612427930814e-07, "loss": 2.0351, "step": 14860 }, { "epoch": 4.237542297417631, "grad_norm": 0.5098645091056824, "learning_rate": 4.7405509288917367e-07, "loss": 2.0323, "step": 14870 }, { "epoch": 4.2403918076580585, "grad_norm": 0.4800586700439453, "learning_rate": 4.6764894298526593e-07, "loss": 2.0298, "step": 14880 }, { "epoch": 4.2432413178984865, "grad_norm": 0.4520800709724426, "learning_rate": 4.612427930813582e-07, "loss": 2.0181, "step": 14890 }, { "epoch": 4.246090828138914, "grad_norm": 0.5034991502761841, "learning_rate": 4.548366431774504e-07, "loss": 2.0216, "step": 14900 }, { "epoch": 4.246090828138914, "eval_loss": 2.0423622131347656, "eval_runtime": 99.1542, "eval_samples_per_second": 10.085, "eval_steps_per_second": 5.043, "step": 14900 }, { "epoch": 4.248940338379341, "grad_norm": 0.456511527299881, "learning_rate": 4.484304932735426e-07, "loss": 1.9933, "step": 14910 }, { "epoch": 4.251789848619769, "grad_norm": 0.4512493908405304, "learning_rate": 4.4202434336963487e-07, "loss": 2.0269, "step": 14920 }, { "epoch": 4.254639358860196, "grad_norm": 0.49248650670051575, "learning_rate": 4.3561819346572713e-07, "loss": 1.9984, "step": 14930 }, { "epoch": 4.257488869100623, "grad_norm": 0.46570587158203125, "learning_rate": 4.292120435618194e-07, "loss": 2.0029, "step": 14940 }, { "epoch": 4.260338379341051, "grad_norm": 0.5560067892074585, "learning_rate": 4.2280589365791166e-07, "loss": 2.0418, "step": 14950 }, { "epoch": 4.263187889581478, "grad_norm": 0.48884886503219604, "learning_rate": 4.163997437540039e-07, "loss": 1.9966, "step": 14960 }, { "epoch": 4.266037399821905, "grad_norm": 0.4981346130371094, "learning_rate": 4.099935938500961e-07, "loss": 2.0336, "step": 14970 }, { "epoch": 4.268886910062333, "grad_norm": 0.48836010694503784, "learning_rate": 4.0358744394618834e-07, "loss": 2.0336, "step": 14980 }, { "epoch": 4.27173642030276, "grad_norm": 0.5348659753799438, "learning_rate": 3.971812940422806e-07, "loss": 2.0482, "step": 14990 }, { "epoch": 4.2745859305431875, "grad_norm": 0.49484753608703613, "learning_rate": 3.9077514413837286e-07, "loss": 2.0119, "step": 15000 }, { "epoch": 4.2745859305431875, "eval_loss": 2.042267084121704, "eval_runtime": 99.4064, "eval_samples_per_second": 10.06, "eval_steps_per_second": 5.03, "step": 15000 }, { "epoch": 4.2774354407836155, "grad_norm": 0.4720345139503479, "learning_rate": 3.843689942344651e-07, "loss": 2.0184, "step": 15010 }, { "epoch": 4.280284951024043, "grad_norm": 0.47287288308143616, "learning_rate": 3.779628443305574e-07, "loss": 2.0244, "step": 15020 }, { "epoch": 4.283134461264471, "grad_norm": 0.45684701204299927, "learning_rate": 3.715566944266496e-07, "loss": 2.0464, "step": 15030 }, { "epoch": 4.285983971504898, "grad_norm": 0.4994913339614868, "learning_rate": 3.6515054452274186e-07, "loss": 2.023, "step": 15040 }, { "epoch": 4.288833481745325, "grad_norm": 0.4420747756958008, "learning_rate": 3.587443946188341e-07, "loss": 2.0245, "step": 15050 }, { "epoch": 4.291682991985752, "grad_norm": 0.4551266133785248, "learning_rate": 3.523382447149264e-07, "loss": 1.9831, "step": 15060 }, { "epoch": 4.29453250222618, "grad_norm": 0.4731452763080597, "learning_rate": 3.4593209481101864e-07, "loss": 2.0185, "step": 15070 }, { "epoch": 4.297382012466607, "grad_norm": 0.45530498027801514, "learning_rate": 3.395259449071109e-07, "loss": 2.0228, "step": 15080 }, { "epoch": 4.300231522707035, "grad_norm": 0.44847631454467773, "learning_rate": 3.3311979500320306e-07, "loss": 2.0489, "step": 15090 }, { "epoch": 4.303081032947462, "grad_norm": 0.5331618189811707, "learning_rate": 3.267136450992953e-07, "loss": 2.0163, "step": 15100 }, { "epoch": 4.303081032947462, "eval_loss": 2.042123317718506, "eval_runtime": 99.3275, "eval_samples_per_second": 10.068, "eval_steps_per_second": 5.034, "step": 15100 }, { "epoch": 4.305930543187889, "grad_norm": 0.46870335936546326, "learning_rate": 3.203074951953876e-07, "loss": 2.0225, "step": 15110 }, { "epoch": 4.308780053428317, "grad_norm": 0.49238336086273193, "learning_rate": 3.1390134529147985e-07, "loss": 2.0763, "step": 15120 }, { "epoch": 4.3116295636687445, "grad_norm": 0.487042635679245, "learning_rate": 3.074951953875721e-07, "loss": 2.044, "step": 15130 }, { "epoch": 4.314479073909172, "grad_norm": 0.4582352042198181, "learning_rate": 3.010890454836643e-07, "loss": 2.0615, "step": 15140 }, { "epoch": 4.3173285841496, "grad_norm": 0.47779443860054016, "learning_rate": 2.946828955797566e-07, "loss": 2.0154, "step": 15150 }, { "epoch": 4.320178094390027, "grad_norm": 0.48991677165031433, "learning_rate": 2.8827674567584884e-07, "loss": 2.0161, "step": 15160 }, { "epoch": 4.323027604630454, "grad_norm": 0.49436208605766296, "learning_rate": 2.8187059577194105e-07, "loss": 2.0358, "step": 15170 }, { "epoch": 4.325877114870882, "grad_norm": 0.4528005123138428, "learning_rate": 2.754644458680333e-07, "loss": 2.0148, "step": 15180 }, { "epoch": 4.328726625111309, "grad_norm": 0.4622327983379364, "learning_rate": 2.690582959641256e-07, "loss": 2.0151, "step": 15190 }, { "epoch": 4.331576135351736, "grad_norm": 0.4738125503063202, "learning_rate": 2.6265214606021784e-07, "loss": 2.0272, "step": 15200 }, { "epoch": 4.331576135351736, "eval_loss": 2.0419702529907227, "eval_runtime": 99.4017, "eval_samples_per_second": 10.06, "eval_steps_per_second": 5.03, "step": 15200 }, { "epoch": 4.334425645592164, "grad_norm": 0.4631751477718353, "learning_rate": 2.562459961563101e-07, "loss": 2.0151, "step": 15210 }, { "epoch": 4.337275155832591, "grad_norm": 0.49889975786209106, "learning_rate": 2.4983984625240236e-07, "loss": 2.0478, "step": 15220 }, { "epoch": 4.340124666073018, "grad_norm": 0.45168715715408325, "learning_rate": 2.4343369634849457e-07, "loss": 2.0292, "step": 15230 }, { "epoch": 4.342974176313446, "grad_norm": 0.5072616934776306, "learning_rate": 2.3702754644458683e-07, "loss": 2.0183, "step": 15240 }, { "epoch": 4.3458236865538735, "grad_norm": 0.5032421946525574, "learning_rate": 2.306213965406791e-07, "loss": 2.0098, "step": 15250 }, { "epoch": 4.348673196794301, "grad_norm": 0.4905742406845093, "learning_rate": 2.242152466367713e-07, "loss": 2.0349, "step": 15260 }, { "epoch": 4.351522707034729, "grad_norm": 0.5213619470596313, "learning_rate": 2.1780909673286357e-07, "loss": 2.0246, "step": 15270 }, { "epoch": 4.354372217275156, "grad_norm": 0.47906991839408875, "learning_rate": 2.1140294682895583e-07, "loss": 2.0212, "step": 15280 }, { "epoch": 4.357221727515583, "grad_norm": 0.4957728087902069, "learning_rate": 2.0499679692504804e-07, "loss": 2.021, "step": 15290 }, { "epoch": 4.360071237756011, "grad_norm": 0.44690993428230286, "learning_rate": 1.985906470211403e-07, "loss": 2.0376, "step": 15300 }, { "epoch": 4.360071237756011, "eval_loss": 2.041898488998413, "eval_runtime": 99.5154, "eval_samples_per_second": 10.049, "eval_steps_per_second": 5.024, "step": 15300 }, { "epoch": 4.362920747996438, "grad_norm": 0.446969598531723, "learning_rate": 1.9218449711723256e-07, "loss": 2.0322, "step": 15310 }, { "epoch": 4.365770258236865, "grad_norm": 0.4475710690021515, "learning_rate": 1.857783472133248e-07, "loss": 2.0318, "step": 15320 }, { "epoch": 4.368619768477293, "grad_norm": 0.45626747608184814, "learning_rate": 1.7937219730941706e-07, "loss": 2.0228, "step": 15330 }, { "epoch": 4.37146927871772, "grad_norm": 0.42936715483665466, "learning_rate": 1.7296604740550932e-07, "loss": 2.0457, "step": 15340 }, { "epoch": 4.374318788958147, "grad_norm": 0.43782496452331543, "learning_rate": 1.6655989750160153e-07, "loss": 1.9622, "step": 15350 }, { "epoch": 4.377168299198575, "grad_norm": 0.48435845971107483, "learning_rate": 1.601537475976938e-07, "loss": 2.0286, "step": 15360 }, { "epoch": 4.3800178094390025, "grad_norm": 0.47565901279449463, "learning_rate": 1.5374759769378605e-07, "loss": 2.0014, "step": 15370 }, { "epoch": 4.3828673196794306, "grad_norm": 0.4597352147102356, "learning_rate": 1.473414477898783e-07, "loss": 2.0069, "step": 15380 }, { "epoch": 4.385716829919858, "grad_norm": 0.4486582279205322, "learning_rate": 1.4093529788597053e-07, "loss": 2.0413, "step": 15390 }, { "epoch": 4.388566340160285, "grad_norm": 0.455665647983551, "learning_rate": 1.345291479820628e-07, "loss": 2.0162, "step": 15400 }, { "epoch": 4.388566340160285, "eval_loss": 2.0417721271514893, "eval_runtime": 99.1826, "eval_samples_per_second": 10.082, "eval_steps_per_second": 5.041, "step": 15400 }, { "epoch": 4.391415850400712, "grad_norm": 0.45837944746017456, "learning_rate": 1.2812299807815505e-07, "loss": 1.9732, "step": 15410 }, { "epoch": 4.39426536064114, "grad_norm": 0.4252510368824005, "learning_rate": 1.2171684817424729e-07, "loss": 2.0138, "step": 15420 }, { "epoch": 4.397114870881567, "grad_norm": 0.49600866436958313, "learning_rate": 1.1531069827033955e-07, "loss": 2.0193, "step": 15430 }, { "epoch": 4.399964381121995, "grad_norm": 0.42285075783729553, "learning_rate": 1.0890454836643178e-07, "loss": 2.0225, "step": 15440 }, { "epoch": 4.402813891362422, "grad_norm": 0.5210908055305481, "learning_rate": 1.0249839846252402e-07, "loss": 2.0294, "step": 15450 }, { "epoch": 4.405663401602849, "grad_norm": 0.448207825422287, "learning_rate": 9.609224855861628e-08, "loss": 1.9902, "step": 15460 }, { "epoch": 4.408512911843277, "grad_norm": 0.45898616313934326, "learning_rate": 8.968609865470853e-08, "loss": 2.0227, "step": 15470 }, { "epoch": 4.411362422083704, "grad_norm": 0.4249895215034485, "learning_rate": 8.327994875080077e-08, "loss": 2.0255, "step": 15480 }, { "epoch": 4.414211932324132, "grad_norm": 0.44755247235298157, "learning_rate": 7.687379884689303e-08, "loss": 2.0284, "step": 15490 }, { "epoch": 4.41706144256456, "grad_norm": 0.467061847448349, "learning_rate": 7.046764894298526e-08, "loss": 2.0549, "step": 15500 }, { "epoch": 4.41706144256456, "eval_loss": 2.0417046546936035, "eval_runtime": 99.2091, "eval_samples_per_second": 10.08, "eval_steps_per_second": 5.04, "step": 15500 }, { "epoch": 4.419910952804987, "grad_norm": 0.4667827785015106, "learning_rate": 6.406149903907752e-08, "loss": 2.0094, "step": 15510 }, { "epoch": 4.422760463045414, "grad_norm": 0.4421955943107605, "learning_rate": 5.7655349135169774e-08, "loss": 2.0206, "step": 15520 }, { "epoch": 4.425609973285842, "grad_norm": 0.4796900749206543, "learning_rate": 5.124919923126201e-08, "loss": 2.0234, "step": 15530 }, { "epoch": 4.428459483526269, "grad_norm": 0.46110549569129944, "learning_rate": 4.4843049327354265e-08, "loss": 2.0719, "step": 15540 }, { "epoch": 4.431308993766696, "grad_norm": 0.43615448474884033, "learning_rate": 3.8436899423446514e-08, "loss": 2.004, "step": 15550 }, { "epoch": 4.434158504007124, "grad_norm": 0.4552094340324402, "learning_rate": 3.203074951953876e-08, "loss": 2.018, "step": 15560 }, { "epoch": 4.437008014247551, "grad_norm": 0.4268529415130615, "learning_rate": 2.5624599615631005e-08, "loss": 2.0226, "step": 15570 }, { "epoch": 4.439857524487978, "grad_norm": 0.4385978579521179, "learning_rate": 1.9218449711723257e-08, "loss": 2.0217, "step": 15580 }, { "epoch": 4.442707034728406, "grad_norm": 0.48627611994743347, "learning_rate": 1.2812299807815502e-08, "loss": 2.0048, "step": 15590 }, { "epoch": 4.4455565449688335, "grad_norm": 0.42732298374176025, "learning_rate": 6.406149903907751e-09, "loss": 2.0082, "step": 15600 }, { "epoch": 4.4455565449688335, "eval_loss": 2.0416414737701416, "eval_runtime": 99.0939, "eval_samples_per_second": 10.091, "eval_steps_per_second": 5.046, "step": 15600 }, { "epoch": 4.448406055209261, "grad_norm": 0.4420681297779083, "learning_rate": 0.0, "loss": 2.0069, "step": 15610 }, { "epoch": 0.99968, "grad_norm": 0.7863059639930725, "learning_rate": 4.999199615815592e-06, "loss": 2.0359, "step": 15620 }, { "epoch": 1.00032, "grad_norm": 0.8261616230010986, "learning_rate": 4.995998079077958e-06, "loss": 2.0272, "step": 15630 }, { "epoch": 1.00096, "grad_norm": 1.2386047840118408, "learning_rate": 4.992796542340324e-06, "loss": 2.0275, "step": 15640 }, { "epoch": 1.0016, "grad_norm": 1.005250096321106, "learning_rate": 4.98959500560269e-06, "loss": 2.0273, "step": 15650 }, { "epoch": 1.00224, "grad_norm": 0.7347936630249023, "learning_rate": 4.986393468865056e-06, "loss": 2.0332, "step": 15660 }, { "epoch": 1.00288, "grad_norm": 0.8586557507514954, "learning_rate": 4.983191932127422e-06, "loss": 2.0431, "step": 15670 }, { "epoch": 1.00352, "grad_norm": 0.66364985704422, "learning_rate": 4.979990395389787e-06, "loss": 2.0106, "step": 15680 }, { "epoch": 1.00416, "grad_norm": 0.6150959134101868, "learning_rate": 4.976788858652153e-06, "loss": 2.0459, "step": 15690 }, { "epoch": 1.0048, "grad_norm": 0.7407456636428833, "learning_rate": 4.973587321914519e-06, "loss": 2.0127, "step": 15700 }, { "epoch": 1.0048, "eval_loss": 2.050377607345581, "eval_runtime": 99.449, "eval_samples_per_second": 10.055, "eval_steps_per_second": 5.028, "step": 15700 }, { "epoch": 1.0054400000000001, "grad_norm": 0.6863277554512024, "learning_rate": 4.970385785176885e-06, "loss": 2.0219, "step": 15710 }, { "epoch": 1.00608, "grad_norm": 0.736400842666626, "learning_rate": 4.967184248439251e-06, "loss": 2.0259, "step": 15720 }, { "epoch": 1.00672, "grad_norm": 0.7324177026748657, "learning_rate": 4.963982711701617e-06, "loss": 2.0163, "step": 15730 }, { "epoch": 1.00736, "grad_norm": 0.7272530198097229, "learning_rate": 4.960781174963983e-06, "loss": 2.0265, "step": 15740 }, { "epoch": 1.008, "grad_norm": 0.682123601436615, "learning_rate": 4.957579638226349e-06, "loss": 2.0479, "step": 15750 }, { "epoch": 1.00864, "grad_norm": 0.774432897567749, "learning_rate": 4.954378101488715e-06, "loss": 2.0273, "step": 15760 }, { "epoch": 1.00928, "grad_norm": 0.6753953695297241, "learning_rate": 4.9511765647510805e-06, "loss": 2.0358, "step": 15770 }, { "epoch": 1.00992, "grad_norm": 0.7278711795806885, "learning_rate": 4.947975028013447e-06, "loss": 2.0327, "step": 15780 }, { "epoch": 1.01056, "grad_norm": 0.6097889542579651, "learning_rate": 4.944773491275813e-06, "loss": 2.0118, "step": 15790 }, { "epoch": 1.0112, "grad_norm": 0.9159688353538513, "learning_rate": 4.9415719545381784e-06, "loss": 2.0241, "step": 15800 }, { "epoch": 1.0112, "eval_loss": 2.049624443054199, "eval_runtime": 99.2626, "eval_samples_per_second": 10.074, "eval_steps_per_second": 5.037, "step": 15800 }, { "epoch": 1.01184, "grad_norm": 0.689122200012207, "learning_rate": 4.938370417800545e-06, "loss": 2.0432, "step": 15810 }, { "epoch": 1.01248, "grad_norm": 0.733984112739563, "learning_rate": 4.935168881062911e-06, "loss": 2.0437, "step": 15820 }, { "epoch": 1.01312, "grad_norm": 0.6271092295646667, "learning_rate": 4.931967344325276e-06, "loss": 2.044, "step": 15830 }, { "epoch": 1.01376, "grad_norm": 0.6616642475128174, "learning_rate": 4.928765807587643e-06, "loss": 2.015, "step": 15840 }, { "epoch": 1.0144, "grad_norm": 0.6139689087867737, "learning_rate": 4.925564270850009e-06, "loss": 2.0044, "step": 15850 }, { "epoch": 1.01504, "grad_norm": 0.6150696873664856, "learning_rate": 4.922362734112374e-06, "loss": 2.0086, "step": 15860 }, { "epoch": 1.01568, "grad_norm": 0.6278733611106873, "learning_rate": 4.91916119737474e-06, "loss": 2.0221, "step": 15870 }, { "epoch": 1.01632, "grad_norm": 0.6292057633399963, "learning_rate": 4.915959660637107e-06, "loss": 2.0128, "step": 15880 }, { "epoch": 1.01696, "grad_norm": 0.6009802222251892, "learning_rate": 4.912758123899472e-06, "loss": 2.0146, "step": 15890 }, { "epoch": 1.0176, "grad_norm": 0.736315131187439, "learning_rate": 4.909556587161838e-06, "loss": 2.0409, "step": 15900 }, { "epoch": 1.0176, "eval_loss": 2.049333095550537, "eval_runtime": 99.3102, "eval_samples_per_second": 10.069, "eval_steps_per_second": 5.035, "step": 15900 }, { "epoch": 1.01824, "grad_norm": 0.7083340287208557, "learning_rate": 4.906355050424204e-06, "loss": 2.0488, "step": 15910 }, { "epoch": 1.01888, "grad_norm": 0.7024874687194824, "learning_rate": 4.9031535136865695e-06, "loss": 2.05, "step": 15920 }, { "epoch": 1.01952, "grad_norm": 0.60423344373703, "learning_rate": 4.899951976948936e-06, "loss": 1.9965, "step": 15930 }, { "epoch": 1.02016, "grad_norm": 0.709780216217041, "learning_rate": 4.896750440211302e-06, "loss": 2.0311, "step": 15940 }, { "epoch": 1.0208, "grad_norm": 0.7139421701431274, "learning_rate": 4.8935489034736675e-06, "loss": 2.0122, "step": 15950 }, { "epoch": 1.02144, "grad_norm": 0.6455684304237366, "learning_rate": 4.890347366736033e-06, "loss": 2.0438, "step": 15960 }, { "epoch": 1.02208, "grad_norm": 0.6643734574317932, "learning_rate": 4.8871458299984e-06, "loss": 2.0006, "step": 15970 }, { "epoch": 1.02272, "grad_norm": 0.8344186544418335, "learning_rate": 4.8839442932607654e-06, "loss": 2.0017, "step": 15980 }, { "epoch": 1.02336, "grad_norm": 0.6340540647506714, "learning_rate": 4.880742756523131e-06, "loss": 2.0021, "step": 15990 }, { "epoch": 1.024, "grad_norm": 0.6304895281791687, "learning_rate": 4.877541219785498e-06, "loss": 2.0278, "step": 16000 }, { "epoch": 1.024, "eval_loss": 2.0487542152404785, "eval_runtime": 99.1634, "eval_samples_per_second": 10.084, "eval_steps_per_second": 5.042, "step": 16000 }, { "epoch": 1.02464, "grad_norm": 0.5626567006111145, "learning_rate": 4.874339683047863e-06, "loss": 2.0197, "step": 16010 }, { "epoch": 1.02528, "grad_norm": 0.5974437594413757, "learning_rate": 4.871138146310229e-06, "loss": 2.0158, "step": 16020 }, { "epoch": 1.02592, "grad_norm": 0.5794137716293335, "learning_rate": 4.867936609572596e-06, "loss": 2.0045, "step": 16030 }, { "epoch": 1.02656, "grad_norm": 0.6216360926628113, "learning_rate": 4.864735072834961e-06, "loss": 2.0391, "step": 16040 }, { "epoch": 1.0272, "grad_norm": 0.7104542255401611, "learning_rate": 4.861533536097327e-06, "loss": 2.0457, "step": 16050 }, { "epoch": 1.02784, "grad_norm": 0.6782556772232056, "learning_rate": 4.858331999359693e-06, "loss": 1.9936, "step": 16060 }, { "epoch": 1.02848, "grad_norm": 0.7673866748809814, "learning_rate": 4.855130462622059e-06, "loss": 2.0346, "step": 16070 }, { "epoch": 1.02912, "grad_norm": 0.6345120668411255, "learning_rate": 4.851928925884425e-06, "loss": 2.049, "step": 16080 }, { "epoch": 1.02976, "grad_norm": 0.6022641658782959, "learning_rate": 4.848727389146791e-06, "loss": 2.0472, "step": 16090 }, { "epoch": 1.0304, "grad_norm": 0.6524453163146973, "learning_rate": 4.845525852409157e-06, "loss": 2.0457, "step": 16100 }, { "epoch": 1.0304, "eval_loss": 2.049445867538452, "eval_runtime": 99.2294, "eval_samples_per_second": 10.078, "eval_steps_per_second": 5.039, "step": 16100 }, { "epoch": 1.03104, "grad_norm": 0.6599230170249939, "learning_rate": 4.842324315671523e-06, "loss": 2.041, "step": 16110 }, { "epoch": 1.03168, "grad_norm": 0.6208943724632263, "learning_rate": 4.839122778933889e-06, "loss": 2.0523, "step": 16120 }, { "epoch": 1.03232, "grad_norm": 0.6133295297622681, "learning_rate": 4.8359212421962545e-06, "loss": 2.0007, "step": 16130 }, { "epoch": 1.03296, "grad_norm": 0.6858235001564026, "learning_rate": 4.83271970545862e-06, "loss": 2.0315, "step": 16140 }, { "epoch": 1.0336, "grad_norm": 0.6205055117607117, "learning_rate": 4.829518168720987e-06, "loss": 2.0114, "step": 16150 }, { "epoch": 1.03424, "grad_norm": 0.6582651138305664, "learning_rate": 4.8263166319833525e-06, "loss": 2.0176, "step": 16160 }, { "epoch": 1.03488, "grad_norm": 0.6905097961425781, "learning_rate": 4.823115095245718e-06, "loss": 2.0338, "step": 16170 }, { "epoch": 1.03552, "grad_norm": 0.6507758498191833, "learning_rate": 4.819913558508084e-06, "loss": 2.0382, "step": 16180 }, { "epoch": 1.03616, "grad_norm": 0.6980578899383545, "learning_rate": 4.81671202177045e-06, "loss": 2.027, "step": 16190 }, { "epoch": 1.0368, "grad_norm": 0.6131721138954163, "learning_rate": 4.813510485032816e-06, "loss": 2.0456, "step": 16200 }, { "epoch": 1.0368, "eval_loss": 2.048192024230957, "eval_runtime": 99.2342, "eval_samples_per_second": 10.077, "eval_steps_per_second": 5.039, "step": 16200 }, { "epoch": 1.03744, "grad_norm": 0.6967636346817017, "learning_rate": 4.810308948295182e-06, "loss": 2.0024, "step": 16210 }, { "epoch": 1.03808, "grad_norm": 0.6524723172187805, "learning_rate": 4.8071074115575476e-06, "loss": 2.0663, "step": 16220 }, { "epoch": 1.03872, "grad_norm": 0.7181164026260376, "learning_rate": 4.803905874819914e-06, "loss": 2.0482, "step": 16230 }, { "epoch": 1.03936, "grad_norm": 0.6536914706230164, "learning_rate": 4.80070433808228e-06, "loss": 2.0432, "step": 16240 }, { "epoch": 1.04, "grad_norm": 0.682151734828949, "learning_rate": 4.7975028013446455e-06, "loss": 2.0246, "step": 16250 }, { "epoch": 1.04064, "grad_norm": 0.5902345180511475, "learning_rate": 4.794301264607012e-06, "loss": 2.0352, "step": 16260 }, { "epoch": 1.04128, "grad_norm": 0.6463891267776489, "learning_rate": 4.791099727869378e-06, "loss": 2.0132, "step": 16270 }, { "epoch": 1.04192, "grad_norm": 0.5589125752449036, "learning_rate": 4.7878981911317435e-06, "loss": 2.0283, "step": 16280 }, { "epoch": 1.04256, "grad_norm": 0.6977293491363525, "learning_rate": 4.78469665439411e-06, "loss": 2.0297, "step": 16290 }, { "epoch": 1.0432, "grad_norm": 0.6409101486206055, "learning_rate": 4.781495117656476e-06, "loss": 2.0402, "step": 16300 }, { "epoch": 1.0432, "eval_loss": 2.0480778217315674, "eval_runtime": 99.1509, "eval_samples_per_second": 10.086, "eval_steps_per_second": 5.043, "step": 16300 }, { "epoch": 1.04384, "grad_norm": 0.8234467506408691, "learning_rate": 4.7782935809188415e-06, "loss": 2.0179, "step": 16310 }, { "epoch": 1.04448, "grad_norm": 0.6057077050209045, "learning_rate": 4.775092044181208e-06, "loss": 2.0022, "step": 16320 }, { "epoch": 1.04512, "grad_norm": 0.5955457091331482, "learning_rate": 4.771890507443574e-06, "loss": 2.0085, "step": 16330 }, { "epoch": 1.04576, "grad_norm": 0.6484575271606445, "learning_rate": 4.7686889707059395e-06, "loss": 2.03, "step": 16340 }, { "epoch": 1.0464, "grad_norm": 0.7228317856788635, "learning_rate": 4.765487433968305e-06, "loss": 2.0155, "step": 16350 }, { "epoch": 1.04704, "grad_norm": 0.735862135887146, "learning_rate": 4.762285897230671e-06, "loss": 2.0442, "step": 16360 }, { "epoch": 1.04768, "grad_norm": 0.7349162697792053, "learning_rate": 4.759084360493037e-06, "loss": 2.033, "step": 16370 }, { "epoch": 1.04832, "grad_norm": 0.6394819617271423, "learning_rate": 4.755882823755403e-06, "loss": 2.0079, "step": 16380 }, { "epoch": 1.04896, "grad_norm": 0.6491113901138306, "learning_rate": 4.752681287017769e-06, "loss": 2.0325, "step": 16390 }, { "epoch": 1.0496, "grad_norm": 0.6450079083442688, "learning_rate": 4.749479750280135e-06, "loss": 2.0339, "step": 16400 }, { "epoch": 1.0496, "eval_loss": 2.047717332839966, "eval_runtime": 99.3795, "eval_samples_per_second": 10.062, "eval_steps_per_second": 5.031, "step": 16400 }, { "epoch": 1.05024, "grad_norm": 0.5671599507331848, "learning_rate": 4.7462782135425e-06, "loss": 2.066, "step": 16410 }, { "epoch": 1.05088, "grad_norm": 0.6196804046630859, "learning_rate": 4.743076676804867e-06, "loss": 2.0442, "step": 16420 }, { "epoch": 1.05152, "grad_norm": 0.5986396074295044, "learning_rate": 4.7398751400672326e-06, "loss": 2.0049, "step": 16430 }, { "epoch": 1.05216, "grad_norm": 0.7346808314323425, "learning_rate": 4.736673603329598e-06, "loss": 1.9991, "step": 16440 }, { "epoch": 1.0528, "grad_norm": 0.631497859954834, "learning_rate": 4.733472066591965e-06, "loss": 2.0414, "step": 16450 }, { "epoch": 1.05344, "grad_norm": 0.6475789546966553, "learning_rate": 4.7302705298543305e-06, "loss": 2.0289, "step": 16460 }, { "epoch": 1.05408, "grad_norm": 0.7186465263366699, "learning_rate": 4.727068993116696e-06, "loss": 2.0203, "step": 16470 }, { "epoch": 1.05472, "grad_norm": 1.0701770782470703, "learning_rate": 4.723867456379062e-06, "loss": 2.0291, "step": 16480 }, { "epoch": 1.05536, "grad_norm": 0.7437950968742371, "learning_rate": 4.7206659196414285e-06, "loss": 2.0146, "step": 16490 }, { "epoch": 1.056, "grad_norm": 0.7150045037269592, "learning_rate": 4.717464382903794e-06, "loss": 1.9934, "step": 16500 }, { "epoch": 1.056, "eval_loss": 2.0487072467803955, "eval_runtime": 99.389, "eval_samples_per_second": 10.061, "eval_steps_per_second": 5.031, "step": 16500 }, { "epoch": 1.05664, "grad_norm": 0.6178598999977112, "learning_rate": 4.71426284616616e-06, "loss": 2.0244, "step": 16510 }, { "epoch": 1.05728, "grad_norm": 0.6366320848464966, "learning_rate": 4.7110613094285265e-06, "loss": 2.0105, "step": 16520 }, { "epoch": 1.05792, "grad_norm": 0.5788693428039551, "learning_rate": 4.707859772690892e-06, "loss": 2.0542, "step": 16530 }, { "epoch": 1.05856, "grad_norm": 0.5591841340065002, "learning_rate": 4.704658235953258e-06, "loss": 2.0441, "step": 16540 }, { "epoch": 1.0592, "grad_norm": 0.679841935634613, "learning_rate": 4.7014566992156245e-06, "loss": 2.0394, "step": 16550 }, { "epoch": 1.05984, "grad_norm": 0.6606016755104065, "learning_rate": 4.69825516247799e-06, "loss": 2.0118, "step": 16560 }, { "epoch": 1.06048, "grad_norm": 0.561490535736084, "learning_rate": 4.695053625740356e-06, "loss": 1.9913, "step": 16570 }, { "epoch": 1.06112, "grad_norm": 0.7106103897094727, "learning_rate": 4.691852089002722e-06, "loss": 2.04, "step": 16580 }, { "epoch": 1.06176, "grad_norm": 0.6973317861557007, "learning_rate": 4.688650552265087e-06, "loss": 2.0272, "step": 16590 }, { "epoch": 1.0624, "grad_norm": 0.6442530155181885, "learning_rate": 4.685449015527454e-06, "loss": 2.0355, "step": 16600 }, { "epoch": 1.0624, "eval_loss": 2.047421932220459, "eval_runtime": 99.1752, "eval_samples_per_second": 10.083, "eval_steps_per_second": 5.042, "step": 16600 }, { "epoch": 1.06304, "grad_norm": 0.6881089806556702, "learning_rate": 4.6822474787898196e-06, "loss": 2.0436, "step": 16610 }, { "epoch": 1.06368, "grad_norm": 0.6413591504096985, "learning_rate": 4.679045942052185e-06, "loss": 2.0438, "step": 16620 }, { "epoch": 1.06432, "grad_norm": 0.5700530409812927, "learning_rate": 4.675844405314551e-06, "loss": 2.0283, "step": 16630 }, { "epoch": 1.06496, "grad_norm": 0.6032475233078003, "learning_rate": 4.672642868576917e-06, "loss": 2.0294, "step": 16640 }, { "epoch": 1.0656, "grad_norm": 0.6175091862678528, "learning_rate": 4.669441331839283e-06, "loss": 2.0272, "step": 16650 }, { "epoch": 1.06624, "grad_norm": 0.6341057419776917, "learning_rate": 4.666239795101649e-06, "loss": 2.0302, "step": 16660 }, { "epoch": 1.06688, "grad_norm": 0.6054055094718933, "learning_rate": 4.663038258364015e-06, "loss": 2.0211, "step": 16670 }, { "epoch": 1.06752, "grad_norm": 0.5983806848526001, "learning_rate": 4.659836721626381e-06, "loss": 2.0211, "step": 16680 }, { "epoch": 1.06816, "grad_norm": 0.6351720690727234, "learning_rate": 4.656635184888747e-06, "loss": 2.0615, "step": 16690 }, { "epoch": 1.0688, "grad_norm": 0.6418411135673523, "learning_rate": 4.653433648151113e-06, "loss": 2.0036, "step": 16700 }, { "epoch": 1.0688, "eval_loss": 2.0469391345977783, "eval_runtime": 99.2513, "eval_samples_per_second": 10.075, "eval_steps_per_second": 5.038, "step": 16700 }, { "epoch": 1.06944, "grad_norm": 0.5719327926635742, "learning_rate": 4.650232111413479e-06, "loss": 2.0489, "step": 16710 }, { "epoch": 1.07008, "grad_norm": 0.632648229598999, "learning_rate": 4.647030574675845e-06, "loss": 2.0198, "step": 16720 }, { "epoch": 1.0707200000000001, "grad_norm": 0.5791489481925964, "learning_rate": 4.643829037938211e-06, "loss": 2.0042, "step": 16730 }, { "epoch": 1.07136, "grad_norm": 0.696101725101471, "learning_rate": 4.640627501200577e-06, "loss": 2.0353, "step": 16740 }, { "epoch": 1.072, "grad_norm": 0.6109420657157898, "learning_rate": 4.637425964462943e-06, "loss": 2.0301, "step": 16750 }, { "epoch": 1.07264, "grad_norm": 0.6553672552108765, "learning_rate": 4.634224427725309e-06, "loss": 2.0321, "step": 16760 }, { "epoch": 1.07328, "grad_norm": 0.5798039436340332, "learning_rate": 4.631022890987674e-06, "loss": 2.0336, "step": 16770 }, { "epoch": 1.07392, "grad_norm": 0.5964285731315613, "learning_rate": 4.627821354250041e-06, "loss": 2.0314, "step": 16780 }, { "epoch": 1.07456, "grad_norm": 0.6674721240997314, "learning_rate": 4.624619817512407e-06, "loss": 2.0391, "step": 16790 }, { "epoch": 1.0752, "grad_norm": 0.5824782848358154, "learning_rate": 4.621418280774772e-06, "loss": 2.0247, "step": 16800 }, { "epoch": 1.0752, "eval_loss": 2.047208547592163, "eval_runtime": 99.1467, "eval_samples_per_second": 10.086, "eval_steps_per_second": 5.043, "step": 16800 }, { "epoch": 1.07584, "grad_norm": 0.6081963181495667, "learning_rate": 4.618216744037138e-06, "loss": 2.0421, "step": 16810 }, { "epoch": 1.07648, "grad_norm": 0.5982031226158142, "learning_rate": 4.615015207299504e-06, "loss": 2.0377, "step": 16820 }, { "epoch": 1.07712, "grad_norm": 0.6536882519721985, "learning_rate": 4.61181367056187e-06, "loss": 2.0466, "step": 16830 }, { "epoch": 1.07776, "grad_norm": 0.714253842830658, "learning_rate": 4.608612133824236e-06, "loss": 2.0226, "step": 16840 }, { "epoch": 1.0784, "grad_norm": 0.6436517834663391, "learning_rate": 4.605410597086602e-06, "loss": 2.0467, "step": 16850 }, { "epoch": 1.07904, "grad_norm": 0.6904466152191162, "learning_rate": 4.602209060348967e-06, "loss": 2.0219, "step": 16860 }, { "epoch": 1.07968, "grad_norm": 0.617428183555603, "learning_rate": 4.599007523611334e-06, "loss": 2.0212, "step": 16870 }, { "epoch": 1.08032, "grad_norm": 0.7247819900512695, "learning_rate": 4.5958059868737e-06, "loss": 2.0235, "step": 16880 }, { "epoch": 1.08096, "grad_norm": 0.5652709007263184, "learning_rate": 4.592604450136065e-06, "loss": 2.0022, "step": 16890 }, { "epoch": 1.0816, "grad_norm": 0.6366179585456848, "learning_rate": 4.589402913398431e-06, "loss": 2.0255, "step": 16900 }, { "epoch": 1.0816, "eval_loss": 2.0470311641693115, "eval_runtime": 99.227, "eval_samples_per_second": 10.078, "eval_steps_per_second": 5.039, "step": 16900 }, { "epoch": 1.08224, "grad_norm": 0.6273727416992188, "learning_rate": 4.586201376660798e-06, "loss": 2.0166, "step": 16910 }, { "epoch": 1.08288, "grad_norm": 0.8277556300163269, "learning_rate": 4.582999839923163e-06, "loss": 2.0075, "step": 16920 }, { "epoch": 1.08352, "grad_norm": 0.6830505132675171, "learning_rate": 4.579798303185529e-06, "loss": 2.0105, "step": 16930 }, { "epoch": 1.08416, "grad_norm": 0.5989285707473755, "learning_rate": 4.576596766447896e-06, "loss": 2.0225, "step": 16940 }, { "epoch": 1.0848, "grad_norm": 0.5858156085014343, "learning_rate": 4.573395229710261e-06, "loss": 1.9978, "step": 16950 }, { "epoch": 1.08544, "grad_norm": 0.5735730528831482, "learning_rate": 4.570193692972627e-06, "loss": 2.0444, "step": 16960 }, { "epoch": 1.08608, "grad_norm": 0.5746780037879944, "learning_rate": 4.566992156234994e-06, "loss": 2.0497, "step": 16970 }, { "epoch": 1.08672, "grad_norm": 0.7411326766014099, "learning_rate": 4.563790619497359e-06, "loss": 1.9788, "step": 16980 }, { "epoch": 1.0873599999999999, "grad_norm": 0.5445652008056641, "learning_rate": 4.560589082759725e-06, "loss": 1.9972, "step": 16990 }, { "epoch": 1.088, "grad_norm": 0.6129888892173767, "learning_rate": 4.557387546022092e-06, "loss": 2.0206, "step": 17000 }, { "epoch": 1.088, "eval_loss": 2.0461783409118652, "eval_runtime": 99.2517, "eval_samples_per_second": 10.075, "eval_steps_per_second": 5.038, "step": 17000 }, { "epoch": 1.08864, "grad_norm": 0.5471550822257996, "learning_rate": 4.554186009284457e-06, "loss": 2.0684, "step": 17010 }, { "epoch": 1.08928, "grad_norm": 0.6158989667892456, "learning_rate": 4.550984472546823e-06, "loss": 2.0286, "step": 17020 }, { "epoch": 1.08992, "grad_norm": 0.7108532190322876, "learning_rate": 4.547782935809189e-06, "loss": 2.0193, "step": 17030 }, { "epoch": 1.09056, "grad_norm": 0.5638531446456909, "learning_rate": 4.5445813990715544e-06, "loss": 2.0425, "step": 17040 }, { "epoch": 1.0912, "grad_norm": 0.6152315735816956, "learning_rate": 4.541379862333921e-06, "loss": 2.0392, "step": 17050 }, { "epoch": 1.09184, "grad_norm": 0.5938286185264587, "learning_rate": 4.538178325596287e-06, "loss": 2.0407, "step": 17060 }, { "epoch": 1.0924800000000001, "grad_norm": 0.5730686783790588, "learning_rate": 4.534976788858652e-06, "loss": 2.0411, "step": 17070 }, { "epoch": 1.09312, "grad_norm": 0.6255854964256287, "learning_rate": 4.531775252121018e-06, "loss": 2.0084, "step": 17080 }, { "epoch": 1.09376, "grad_norm": 0.567584216594696, "learning_rate": 4.528573715383384e-06, "loss": 2.0253, "step": 17090 }, { "epoch": 1.0944, "grad_norm": 0.7046324610710144, "learning_rate": 4.52537217864575e-06, "loss": 2.0325, "step": 17100 }, { "epoch": 1.0944, "eval_loss": 2.0462424755096436, "eval_runtime": 99.1134, "eval_samples_per_second": 10.089, "eval_steps_per_second": 5.045, "step": 17100 }, { "epoch": 1.09504, "grad_norm": 0.5818910002708435, "learning_rate": 4.522170641908116e-06, "loss": 2.0228, "step": 17110 }, { "epoch": 1.09568, "grad_norm": 0.6222846508026123, "learning_rate": 4.518969105170482e-06, "loss": 2.0297, "step": 17120 }, { "epoch": 1.09632, "grad_norm": 0.5957484841346741, "learning_rate": 4.515767568432848e-06, "loss": 1.9729, "step": 17130 }, { "epoch": 1.09696, "grad_norm": 0.6309909224510193, "learning_rate": 4.512566031695214e-06, "loss": 2.0163, "step": 17140 }, { "epoch": 1.0976, "grad_norm": 0.7151498794555664, "learning_rate": 4.50936449495758e-06, "loss": 2.024, "step": 17150 }, { "epoch": 1.09824, "grad_norm": 0.6055840253829956, "learning_rate": 4.506162958219946e-06, "loss": 1.9837, "step": 17160 }, { "epoch": 1.09888, "grad_norm": 0.8446348905563354, "learning_rate": 4.502961421482312e-06, "loss": 1.9953, "step": 17170 }, { "epoch": 1.09952, "grad_norm": 0.6151704788208008, "learning_rate": 4.499759884744678e-06, "loss": 2.042, "step": 17180 }, { "epoch": 1.10016, "grad_norm": 0.5726914405822754, "learning_rate": 4.4965583480070435e-06, "loss": 2.0041, "step": 17190 }, { "epoch": 1.1008, "grad_norm": 0.6289142966270447, "learning_rate": 4.49335681126941e-06, "loss": 2.0215, "step": 17200 }, { "epoch": 1.1008, "eval_loss": 2.0456502437591553, "eval_runtime": 99.2892, "eval_samples_per_second": 10.072, "eval_steps_per_second": 5.036, "step": 17200 }, { "epoch": 1.10144, "grad_norm": 0.6238797903060913, "learning_rate": 4.490155274531776e-06, "loss": 2.0265, "step": 17210 }, { "epoch": 1.10208, "grad_norm": 0.6181656122207642, "learning_rate": 4.4869537377941414e-06, "loss": 2.052, "step": 17220 }, { "epoch": 1.10272, "grad_norm": 0.6757475137710571, "learning_rate": 4.483752201056508e-06, "loss": 2.0402, "step": 17230 }, { "epoch": 1.10336, "grad_norm": 0.5690998435020447, "learning_rate": 4.480550664318874e-06, "loss": 2.0102, "step": 17240 }, { "epoch": 1.104, "grad_norm": 0.6137502193450928, "learning_rate": 4.477349127581239e-06, "loss": 2.0199, "step": 17250 }, { "epoch": 1.10464, "grad_norm": 0.5718172192573547, "learning_rate": 4.474147590843605e-06, "loss": 2.015, "step": 17260 }, { "epoch": 1.10528, "grad_norm": 0.5788451433181763, "learning_rate": 4.470946054105972e-06, "loss": 1.9953, "step": 17270 }, { "epoch": 1.10592, "grad_norm": 0.5897586941719055, "learning_rate": 4.467744517368337e-06, "loss": 2.0303, "step": 17280 }, { "epoch": 1.10656, "grad_norm": 0.6481188535690308, "learning_rate": 4.464542980630703e-06, "loss": 1.9928, "step": 17290 }, { "epoch": 1.1072, "grad_norm": 0.5920864939689636, "learning_rate": 4.461341443893069e-06, "loss": 2.0279, "step": 17300 }, { "epoch": 1.1072, "eval_loss": 2.045930862426758, "eval_runtime": 99.2881, "eval_samples_per_second": 10.072, "eval_steps_per_second": 5.036, "step": 17300 }, { "epoch": 1.10784, "grad_norm": 0.6080455183982849, "learning_rate": 4.4581399071554345e-06, "loss": 2.0236, "step": 17310 }, { "epoch": 1.10848, "grad_norm": 0.7448667883872986, "learning_rate": 4.4549383704178e-06, "loss": 2.0498, "step": 17320 }, { "epoch": 1.1091199999999999, "grad_norm": 0.5533308386802673, "learning_rate": 4.451736833680167e-06, "loss": 2.0094, "step": 17330 }, { "epoch": 1.10976, "grad_norm": 0.6973090767860413, "learning_rate": 4.4485352969425325e-06, "loss": 2.0248, "step": 17340 }, { "epoch": 1.1104, "grad_norm": 0.6721564531326294, "learning_rate": 4.445333760204898e-06, "loss": 2.0131, "step": 17350 }, { "epoch": 1.11104, "grad_norm": 0.8893942832946777, "learning_rate": 4.442132223467265e-06, "loss": 1.994, "step": 17360 }, { "epoch": 1.11168, "grad_norm": 0.581801176071167, "learning_rate": 4.4389306867296305e-06, "loss": 2.0202, "step": 17370 }, { "epoch": 1.11232, "grad_norm": 0.5315964818000793, "learning_rate": 4.435729149991996e-06, "loss": 2.0164, "step": 17380 }, { "epoch": 1.11296, "grad_norm": 0.6118794679641724, "learning_rate": 4.432527613254363e-06, "loss": 2.0312, "step": 17390 }, { "epoch": 1.1136, "grad_norm": 0.5874173641204834, "learning_rate": 4.4293260765167285e-06, "loss": 2.0616, "step": 17400 }, { "epoch": 1.1136, "eval_loss": 2.0455470085144043, "eval_runtime": 99.1356, "eval_samples_per_second": 10.087, "eval_steps_per_second": 5.044, "step": 17400 }, { "epoch": 1.11424, "grad_norm": 0.6851205229759216, "learning_rate": 4.426124539779094e-06, "loss": 2.0143, "step": 17410 }, { "epoch": 1.11488, "grad_norm": 0.5862815976142883, "learning_rate": 4.422923003041461e-06, "loss": 2.0224, "step": 17420 }, { "epoch": 1.11552, "grad_norm": 0.542276918888092, "learning_rate": 4.4197214663038264e-06, "loss": 1.9984, "step": 17430 }, { "epoch": 1.11616, "grad_norm": 0.5916332602500916, "learning_rate": 4.416519929566192e-06, "loss": 2.0272, "step": 17440 }, { "epoch": 1.1168, "grad_norm": 0.5869584679603577, "learning_rate": 4.413318392828559e-06, "loss": 2.0239, "step": 17450 }, { "epoch": 1.11744, "grad_norm": 0.6191988587379456, "learning_rate": 4.410116856090924e-06, "loss": 2.0072, "step": 17460 }, { "epoch": 1.11808, "grad_norm": 0.556609034538269, "learning_rate": 4.40691531935329e-06, "loss": 2.0464, "step": 17470 }, { "epoch": 1.11872, "grad_norm": 0.5710023045539856, "learning_rate": 4.403713782615656e-06, "loss": 2.0525, "step": 17480 }, { "epoch": 1.11936, "grad_norm": 0.8187467455863953, "learning_rate": 4.4005122458780215e-06, "loss": 2.0271, "step": 17490 }, { "epoch": 1.12, "grad_norm": 0.5508573055267334, "learning_rate": 4.397310709140388e-06, "loss": 2.0339, "step": 17500 }, { "epoch": 1.12, "eval_loss": 2.0453834533691406, "eval_runtime": 99.29, "eval_samples_per_second": 10.072, "eval_steps_per_second": 5.036, "step": 17500 }, { "epoch": 1.12064, "grad_norm": 0.6992635726928711, "learning_rate": 4.394109172402754e-06, "loss": 2.0474, "step": 17510 }, { "epoch": 1.12128, "grad_norm": 0.5587980151176453, "learning_rate": 4.3909076356651195e-06, "loss": 2.0261, "step": 17520 }, { "epoch": 1.12192, "grad_norm": 0.5831376314163208, "learning_rate": 4.387706098927485e-06, "loss": 2.0556, "step": 17530 }, { "epoch": 1.12256, "grad_norm": 0.6915487051010132, "learning_rate": 4.384504562189851e-06, "loss": 2.0218, "step": 17540 }, { "epoch": 1.1232, "grad_norm": 0.588444173336029, "learning_rate": 4.3813030254522175e-06, "loss": 2.0461, "step": 17550 }, { "epoch": 1.12384, "grad_norm": 0.6518751978874207, "learning_rate": 4.378101488714583e-06, "loss": 2.0289, "step": 17560 }, { "epoch": 1.12448, "grad_norm": 0.5470808148384094, "learning_rate": 4.374899951976949e-06, "loss": 2.0294, "step": 17570 }, { "epoch": 1.12512, "grad_norm": 0.6305654048919678, "learning_rate": 4.3716984152393155e-06, "loss": 1.9871, "step": 17580 }, { "epoch": 1.12576, "grad_norm": 0.553517758846283, "learning_rate": 4.368496878501681e-06, "loss": 2.0324, "step": 17590 }, { "epoch": 1.1264, "grad_norm": 0.7479193806648254, "learning_rate": 4.365295341764047e-06, "loss": 2.032, "step": 17600 }, { "epoch": 1.1264, "eval_loss": 2.045273542404175, "eval_runtime": 99.2574, "eval_samples_per_second": 10.075, "eval_steps_per_second": 5.037, "step": 17600 }, { "epoch": 1.12704, "grad_norm": 0.6123260259628296, "learning_rate": 4.362093805026413e-06, "loss": 2.0483, "step": 17610 }, { "epoch": 1.12768, "grad_norm": 0.6270941495895386, "learning_rate": 4.358892268288779e-06, "loss": 2.0537, "step": 17620 }, { "epoch": 1.12832, "grad_norm": 0.6077694892883301, "learning_rate": 4.355690731551145e-06, "loss": 2.0137, "step": 17630 }, { "epoch": 1.12896, "grad_norm": 0.7045969367027283, "learning_rate": 4.352489194813511e-06, "loss": 2.0382, "step": 17640 }, { "epoch": 1.1296, "grad_norm": 0.575238049030304, "learning_rate": 4.349287658075877e-06, "loss": 2.0437, "step": 17650 }, { "epoch": 1.13024, "grad_norm": 0.6163949370384216, "learning_rate": 4.346086121338243e-06, "loss": 1.9799, "step": 17660 }, { "epoch": 1.1308799999999999, "grad_norm": 0.6812820434570312, "learning_rate": 4.3428845846006086e-06, "loss": 1.9935, "step": 17670 }, { "epoch": 1.13152, "grad_norm": 0.5811446905136108, "learning_rate": 4.339683047862975e-06, "loss": 2.0257, "step": 17680 }, { "epoch": 1.13216, "grad_norm": 0.5752651691436768, "learning_rate": 4.336481511125341e-06, "loss": 2.0468, "step": 17690 }, { "epoch": 1.1328, "grad_norm": 0.538728654384613, "learning_rate": 4.3332799743877065e-06, "loss": 2.0496, "step": 17700 }, { "epoch": 1.1328, "eval_loss": 2.0443480014801025, "eval_runtime": 99.2029, "eval_samples_per_second": 10.08, "eval_steps_per_second": 5.04, "step": 17700 }, { "epoch": 1.13344, "grad_norm": 0.6354817748069763, "learning_rate": 4.330078437650072e-06, "loss": 2.0264, "step": 17710 }, { "epoch": 1.13408, "grad_norm": 0.5710617899894714, "learning_rate": 4.326876900912439e-06, "loss": 2.0434, "step": 17720 }, { "epoch": 1.13472, "grad_norm": 0.7051773071289062, "learning_rate": 4.3236753641748045e-06, "loss": 2.0114, "step": 17730 }, { "epoch": 1.13536, "grad_norm": 0.6036627888679504, "learning_rate": 4.32047382743717e-06, "loss": 2.0226, "step": 17740 }, { "epoch": 1.1360000000000001, "grad_norm": 0.6783179640769958, "learning_rate": 4.317272290699536e-06, "loss": 2.0085, "step": 17750 }, { "epoch": 1.13664, "grad_norm": 0.6049875020980835, "learning_rate": 4.314070753961902e-06, "loss": 2.036, "step": 17760 }, { "epoch": 1.13728, "grad_norm": 0.5788288712501526, "learning_rate": 4.310869217224267e-06, "loss": 2.0514, "step": 17770 }, { "epoch": 1.13792, "grad_norm": 0.633959174156189, "learning_rate": 4.307667680486634e-06, "loss": 2.0439, "step": 17780 }, { "epoch": 1.13856, "grad_norm": 0.620322048664093, "learning_rate": 4.304466143749e-06, "loss": 2.0219, "step": 17790 }, { "epoch": 1.1392, "grad_norm": 0.558303713798523, "learning_rate": 4.301264607011365e-06, "loss": 2.0239, "step": 17800 }, { "epoch": 1.1392, "eval_loss": 2.044430732727051, "eval_runtime": 99.4766, "eval_samples_per_second": 10.053, "eval_steps_per_second": 5.026, "step": 17800 }, { "epoch": 1.13984, "grad_norm": 0.5895607471466064, "learning_rate": 4.298063070273732e-06, "loss": 2.0546, "step": 17810 }, { "epoch": 1.14048, "grad_norm": 0.6369735598564148, "learning_rate": 4.294861533536098e-06, "loss": 2.0283, "step": 17820 }, { "epoch": 1.14112, "grad_norm": 0.6027755737304688, "learning_rate": 4.291659996798463e-06, "loss": 2.0156, "step": 17830 }, { "epoch": 1.14176, "grad_norm": 0.6100727915763855, "learning_rate": 4.28845846006083e-06, "loss": 2.0275, "step": 17840 }, { "epoch": 1.1424, "grad_norm": 0.5747309923171997, "learning_rate": 4.2852569233231956e-06, "loss": 2.0235, "step": 17850 }, { "epoch": 1.14304, "grad_norm": 0.5191932320594788, "learning_rate": 4.282055386585561e-06, "loss": 1.9881, "step": 17860 }, { "epoch": 1.14368, "grad_norm": 0.5874210596084595, "learning_rate": 4.278853849847928e-06, "loss": 2.0062, "step": 17870 }, { "epoch": 1.14432, "grad_norm": 0.5625083446502686, "learning_rate": 4.2756523131102936e-06, "loss": 2.0174, "step": 17880 }, { "epoch": 1.14496, "grad_norm": 0.6093165874481201, "learning_rate": 4.272450776372659e-06, "loss": 2.0219, "step": 17890 }, { "epoch": 1.1456, "grad_norm": 0.5750223398208618, "learning_rate": 4.269249239635025e-06, "loss": 2.025, "step": 17900 }, { "epoch": 1.1456, "eval_loss": 2.0441596508026123, "eval_runtime": 99.0998, "eval_samples_per_second": 10.091, "eval_steps_per_second": 5.045, "step": 17900 }, { "epoch": 1.14624, "grad_norm": 0.6374318599700928, "learning_rate": 4.2660477028973915e-06, "loss": 2.0259, "step": 17910 }, { "epoch": 1.14688, "grad_norm": 0.5993691682815552, "learning_rate": 4.262846166159757e-06, "loss": 2.0514, "step": 17920 }, { "epoch": 1.14752, "grad_norm": 0.6802266836166382, "learning_rate": 4.259644629422123e-06, "loss": 1.9933, "step": 17930 }, { "epoch": 1.14816, "grad_norm": 0.5811522006988525, "learning_rate": 4.256443092684489e-06, "loss": 2.0169, "step": 17940 }, { "epoch": 1.1488, "grad_norm": 0.6651666164398193, "learning_rate": 4.253241555946855e-06, "loss": 2.0495, "step": 17950 }, { "epoch": 1.14944, "grad_norm": 0.6029697060585022, "learning_rate": 4.250040019209221e-06, "loss": 2.0321, "step": 17960 }, { "epoch": 1.15008, "grad_norm": 0.614733099937439, "learning_rate": 4.246838482471587e-06, "loss": 1.9961, "step": 17970 }, { "epoch": 1.15072, "grad_norm": 0.5310514569282532, "learning_rate": 4.243636945733952e-06, "loss": 2.0148, "step": 17980 }, { "epoch": 1.15136, "grad_norm": 0.6348665356636047, "learning_rate": 4.240435408996318e-06, "loss": 2.05, "step": 17990 }, { "epoch": 1.152, "grad_norm": 0.6719682812690735, "learning_rate": 4.237233872258685e-06, "loss": 2.0116, "step": 18000 }, { "epoch": 1.152, "eval_loss": 2.0433847904205322, "eval_runtime": 99.1507, "eval_samples_per_second": 10.086, "eval_steps_per_second": 5.043, "step": 18000 }, { "epoch": 1.1526399999999999, "grad_norm": 0.6079267263412476, "learning_rate": 4.23403233552105e-06, "loss": 2.0296, "step": 18010 }, { "epoch": 1.15328, "grad_norm": 0.6186202168464661, "learning_rate": 4.230830798783416e-06, "loss": 2.0179, "step": 18020 }, { "epoch": 1.15392, "grad_norm": 0.6316864490509033, "learning_rate": 4.227629262045782e-06, "loss": 2.0382, "step": 18030 }, { "epoch": 1.15456, "grad_norm": 0.5883438587188721, "learning_rate": 4.224427725308148e-06, "loss": 2.0198, "step": 18040 }, { "epoch": 1.1552, "grad_norm": 0.5984814167022705, "learning_rate": 4.221226188570514e-06, "loss": 2.0217, "step": 18050 }, { "epoch": 1.15584, "grad_norm": 0.5831133127212524, "learning_rate": 4.21802465183288e-06, "loss": 2.0159, "step": 18060 }, { "epoch": 1.15648, "grad_norm": 0.5060235261917114, "learning_rate": 4.214823115095246e-06, "loss": 2.0348, "step": 18070 }, { "epoch": 1.15712, "grad_norm": 0.6391846537590027, "learning_rate": 4.211621578357612e-06, "loss": 2.008, "step": 18080 }, { "epoch": 1.1577600000000001, "grad_norm": 0.5386248230934143, "learning_rate": 4.208420041619978e-06, "loss": 2.0194, "step": 18090 }, { "epoch": 1.1584, "grad_norm": 0.5904210805892944, "learning_rate": 4.205218504882344e-06, "loss": 1.9955, "step": 18100 }, { "epoch": 1.1584, "eval_loss": 2.043471097946167, "eval_runtime": 99.0461, "eval_samples_per_second": 10.096, "eval_steps_per_second": 5.048, "step": 18100 }, { "epoch": 1.15904, "grad_norm": 0.6032049655914307, "learning_rate": 4.20201696814471e-06, "loss": 2.0155, "step": 18110 }, { "epoch": 1.15968, "grad_norm": 0.5975533723831177, "learning_rate": 4.198815431407076e-06, "loss": 2.0507, "step": 18120 }, { "epoch": 1.16032, "grad_norm": 0.6408397555351257, "learning_rate": 4.195613894669442e-06, "loss": 2.0152, "step": 18130 }, { "epoch": 1.16096, "grad_norm": 0.5729820728302002, "learning_rate": 4.192412357931808e-06, "loss": 1.9997, "step": 18140 }, { "epoch": 1.1616, "grad_norm": 0.5984222888946533, "learning_rate": 4.189210821194174e-06, "loss": 2.0213, "step": 18150 }, { "epoch": 1.16224, "grad_norm": 0.5945577621459961, "learning_rate": 4.186009284456539e-06, "loss": 2.0073, "step": 18160 }, { "epoch": 1.16288, "grad_norm": 0.5897391438484192, "learning_rate": 4.182807747718906e-06, "loss": 2.0437, "step": 18170 }, { "epoch": 1.16352, "grad_norm": 0.6737760901451111, "learning_rate": 4.179606210981272e-06, "loss": 2.0403, "step": 18180 }, { "epoch": 1.16416, "grad_norm": 0.5872124433517456, "learning_rate": 4.176404674243637e-06, "loss": 2.0306, "step": 18190 }, { "epoch": 1.1648, "grad_norm": 0.6742432713508606, "learning_rate": 4.173203137506003e-06, "loss": 2.0373, "step": 18200 }, { "epoch": 1.1648, "eval_loss": 2.0434553623199463, "eval_runtime": 99.1351, "eval_samples_per_second": 10.087, "eval_steps_per_second": 5.044, "step": 18200 }, { "epoch": 1.16544, "grad_norm": 0.6591572761535645, "learning_rate": 4.170001600768369e-06, "loss": 2.008, "step": 18210 }, { "epoch": 1.16608, "grad_norm": 0.5591892004013062, "learning_rate": 4.166800064030735e-06, "loss": 2.053, "step": 18220 }, { "epoch": 1.16672, "grad_norm": 0.56505286693573, "learning_rate": 4.163598527293101e-06, "loss": 2.0204, "step": 18230 }, { "epoch": 1.16736, "grad_norm": 0.6225974559783936, "learning_rate": 4.160396990555467e-06, "loss": 2.034, "step": 18240 }, { "epoch": 1.168, "grad_norm": 0.7203903794288635, "learning_rate": 4.1571954538178324e-06, "loss": 2.0219, "step": 18250 }, { "epoch": 1.16864, "grad_norm": 0.6796202659606934, "learning_rate": 4.153993917080199e-06, "loss": 2.011, "step": 18260 }, { "epoch": 1.16928, "grad_norm": 0.5384121537208557, "learning_rate": 4.150792380342565e-06, "loss": 1.9911, "step": 18270 }, { "epoch": 1.16992, "grad_norm": 0.6490262746810913, "learning_rate": 4.14759084360493e-06, "loss": 2.0012, "step": 18280 }, { "epoch": 1.17056, "grad_norm": 0.6402068138122559, "learning_rate": 4.144389306867297e-06, "loss": 2.0179, "step": 18290 }, { "epoch": 1.1712, "grad_norm": 0.5560231804847717, "learning_rate": 4.141187770129663e-06, "loss": 1.9948, "step": 18300 }, { "epoch": 1.1712, "eval_loss": 2.043088674545288, "eval_runtime": 99.2195, "eval_samples_per_second": 10.079, "eval_steps_per_second": 5.039, "step": 18300 }, { "epoch": 1.17184, "grad_norm": 0.5684863924980164, "learning_rate": 4.137986233392028e-06, "loss": 1.9933, "step": 18310 }, { "epoch": 1.17248, "grad_norm": 0.6014339327812195, "learning_rate": 4.134784696654394e-06, "loss": 2.0161, "step": 18320 }, { "epoch": 1.17312, "grad_norm": 0.6450905203819275, "learning_rate": 4.131583159916761e-06, "loss": 2.0292, "step": 18330 }, { "epoch": 1.17376, "grad_norm": 0.5547463297843933, "learning_rate": 4.128381623179126e-06, "loss": 2.0318, "step": 18340 }, { "epoch": 1.1743999999999999, "grad_norm": 0.493149071931839, "learning_rate": 4.125180086441492e-06, "loss": 2.0142, "step": 18350 }, { "epoch": 1.17504, "grad_norm": 0.546998918056488, "learning_rate": 4.121978549703859e-06, "loss": 2.0102, "step": 18360 }, { "epoch": 1.17568, "grad_norm": 0.5504551529884338, "learning_rate": 4.118777012966224e-06, "loss": 2.0329, "step": 18370 }, { "epoch": 1.17632, "grad_norm": 0.5663371086120605, "learning_rate": 4.11557547622859e-06, "loss": 2.0638, "step": 18380 }, { "epoch": 1.17696, "grad_norm": 0.5327328443527222, "learning_rate": 4.112373939490957e-06, "loss": 2.0131, "step": 18390 }, { "epoch": 1.1776, "grad_norm": 0.702136218547821, "learning_rate": 4.109172402753322e-06, "loss": 2.0052, "step": 18400 }, { "epoch": 1.1776, "eval_loss": 2.0431933403015137, "eval_runtime": 99.1385, "eval_samples_per_second": 10.087, "eval_steps_per_second": 5.043, "step": 18400 }, { "epoch": 1.17824, "grad_norm": 0.678486168384552, "learning_rate": 4.105970866015688e-06, "loss": 2.0035, "step": 18410 }, { "epoch": 1.17888, "grad_norm": 0.6072514057159424, "learning_rate": 4.102769329278054e-06, "loss": 2.0067, "step": 18420 }, { "epoch": 1.1795200000000001, "grad_norm": 0.6381948590278625, "learning_rate": 4.0995677925404195e-06, "loss": 2.0069, "step": 18430 }, { "epoch": 1.1801599999999999, "grad_norm": 0.5768085718154907, "learning_rate": 4.096366255802785e-06, "loss": 2.0085, "step": 18440 }, { "epoch": 1.1808, "grad_norm": 0.6085562705993652, "learning_rate": 4.093164719065152e-06, "loss": 2.0133, "step": 18450 }, { "epoch": 1.18144, "grad_norm": 0.6448004841804504, "learning_rate": 4.0899631823275174e-06, "loss": 2.0595, "step": 18460 }, { "epoch": 1.18208, "grad_norm": 0.5848154425621033, "learning_rate": 4.086761645589883e-06, "loss": 2.0215, "step": 18470 }, { "epoch": 1.18272, "grad_norm": 0.5383066534996033, "learning_rate": 4.083560108852249e-06, "loss": 2.0033, "step": 18480 }, { "epoch": 1.18336, "grad_norm": 0.632030189037323, "learning_rate": 4.080358572114615e-06, "loss": 2.0296, "step": 18490 }, { "epoch": 1.184, "grad_norm": 0.6205241084098816, "learning_rate": 4.077157035376981e-06, "loss": 2.0178, "step": 18500 }, { "epoch": 1.184, "eval_loss": 2.0423781871795654, "eval_runtime": 99.0963, "eval_samples_per_second": 10.091, "eval_steps_per_second": 5.046, "step": 18500 }, { "epoch": 1.18464, "grad_norm": 0.6120121479034424, "learning_rate": 4.073955498639347e-06, "loss": 2.0074, "step": 18510 }, { "epoch": 1.1852800000000001, "grad_norm": 0.5563983917236328, "learning_rate": 4.070753961901713e-06, "loss": 2.0258, "step": 18520 }, { "epoch": 1.18592, "grad_norm": 0.5512414574623108, "learning_rate": 4.067552425164079e-06, "loss": 2.058, "step": 18530 }, { "epoch": 1.18656, "grad_norm": 0.5608769059181213, "learning_rate": 4.064350888426445e-06, "loss": 2.0202, "step": 18540 }, { "epoch": 1.1872, "grad_norm": 0.5823011994361877, "learning_rate": 4.061149351688811e-06, "loss": 2.0224, "step": 18550 }, { "epoch": 1.18784, "grad_norm": 0.6108050346374512, "learning_rate": 4.057947814951177e-06, "loss": 2.0147, "step": 18560 }, { "epoch": 1.18848, "grad_norm": 0.5405247211456299, "learning_rate": 4.054746278213543e-06, "loss": 2.0505, "step": 18570 }, { "epoch": 1.18912, "grad_norm": 0.5540168285369873, "learning_rate": 4.051544741475909e-06, "loss": 2.0591, "step": 18580 }, { "epoch": 1.18976, "grad_norm": 0.6209017634391785, "learning_rate": 4.048343204738275e-06, "loss": 2.0195, "step": 18590 }, { "epoch": 1.1904, "grad_norm": 0.5977944135665894, "learning_rate": 4.045141668000641e-06, "loss": 2.0125, "step": 18600 }, { "epoch": 1.1904, "eval_loss": 2.042717456817627, "eval_runtime": 99.1827, "eval_samples_per_second": 10.082, "eval_steps_per_second": 5.041, "step": 18600 }, { "epoch": 1.19104, "grad_norm": 0.6067842245101929, "learning_rate": 4.0419401312630065e-06, "loss": 2.0236, "step": 18610 }, { "epoch": 1.19168, "grad_norm": 0.6149134635925293, "learning_rate": 4.038738594525373e-06, "loss": 2.0205, "step": 18620 }, { "epoch": 1.19232, "grad_norm": 0.5659533739089966, "learning_rate": 4.035537057787739e-06, "loss": 2.0257, "step": 18630 }, { "epoch": 1.19296, "grad_norm": 0.5780056715011597, "learning_rate": 4.0323355210501045e-06, "loss": 2.0414, "step": 18640 }, { "epoch": 1.1936, "grad_norm": 0.6140852570533752, "learning_rate": 4.02913398431247e-06, "loss": 2.0164, "step": 18650 }, { "epoch": 1.19424, "grad_norm": 0.5738794207572937, "learning_rate": 4.025932447574836e-06, "loss": 2.0332, "step": 18660 }, { "epoch": 1.19488, "grad_norm": 0.6487182974815369, "learning_rate": 4.0227309108372024e-06, "loss": 2.0085, "step": 18670 }, { "epoch": 1.19552, "grad_norm": 0.5968878269195557, "learning_rate": 4.019529374099568e-06, "loss": 2.0128, "step": 18680 }, { "epoch": 1.19616, "grad_norm": 0.7557638883590698, "learning_rate": 4.016327837361934e-06, "loss": 2.0416, "step": 18690 }, { "epoch": 1.1968, "grad_norm": 0.6604118943214417, "learning_rate": 4.0131263006242996e-06, "loss": 2.0175, "step": 18700 }, { "epoch": 1.1968, "eval_loss": 2.042259454727173, "eval_runtime": 99.1, "eval_samples_per_second": 10.091, "eval_steps_per_second": 5.045, "step": 18700 }, { "epoch": 1.19744, "grad_norm": 0.6026879549026489, "learning_rate": 4.009924763886666e-06, "loss": 1.9974, "step": 18710 }, { "epoch": 1.19808, "grad_norm": 0.6138709187507629, "learning_rate": 4.006723227149032e-06, "loss": 2.019, "step": 18720 }, { "epoch": 1.19872, "grad_norm": 0.5770138502120972, "learning_rate": 4.0035216904113975e-06, "loss": 2.0165, "step": 18730 }, { "epoch": 1.19936, "grad_norm": 0.6799411177635193, "learning_rate": 4.000320153673763e-06, "loss": 2.0124, "step": 18740 }, { "epoch": 1.2, "grad_norm": 0.715839684009552, "learning_rate": 3.99711861693613e-06, "loss": 1.9994, "step": 18750 }, { "epoch": 1.20064, "grad_norm": 0.6663697361946106, "learning_rate": 3.9939170801984955e-06, "loss": 1.9953, "step": 18760 }, { "epoch": 1.20128, "grad_norm": 0.5951536893844604, "learning_rate": 3.990715543460861e-06, "loss": 2.0368, "step": 18770 }, { "epoch": 1.2019199999999999, "grad_norm": 0.6461049914360046, "learning_rate": 3.987514006723228e-06, "loss": 2.0049, "step": 18780 }, { "epoch": 1.20256, "grad_norm": 0.6078142523765564, "learning_rate": 3.9843124699855935e-06, "loss": 2.0297, "step": 18790 }, { "epoch": 1.2032, "grad_norm": 0.5475664734840393, "learning_rate": 3.981110933247959e-06, "loss": 2.0222, "step": 18800 }, { "epoch": 1.2032, "eval_loss": 2.042280912399292, "eval_runtime": 98.9381, "eval_samples_per_second": 10.107, "eval_steps_per_second": 5.054, "step": 18800 }, { "epoch": 1.20384, "grad_norm": 0.6041310429573059, "learning_rate": 3.977909396510326e-06, "loss": 2.0824, "step": 18810 }, { "epoch": 1.20448, "grad_norm": 0.5782486796379089, "learning_rate": 3.9747078597726915e-06, "loss": 2.0067, "step": 18820 }, { "epoch": 1.20512, "grad_norm": 0.5963742733001709, "learning_rate": 3.971506323035057e-06, "loss": 2.0014, "step": 18830 }, { "epoch": 1.20576, "grad_norm": 0.5914783477783203, "learning_rate": 3.968304786297424e-06, "loss": 2.0082, "step": 18840 }, { "epoch": 1.2064, "grad_norm": 0.6149041056632996, "learning_rate": 3.9651032495597894e-06, "loss": 2.0269, "step": 18850 }, { "epoch": 1.2070400000000001, "grad_norm": 0.6183750629425049, "learning_rate": 3.961901712822155e-06, "loss": 2.0448, "step": 18860 }, { "epoch": 1.20768, "grad_norm": 0.6572140455245972, "learning_rate": 3.958700176084521e-06, "loss": 2.0438, "step": 18870 }, { "epoch": 1.20832, "grad_norm": 0.5987107753753662, "learning_rate": 3.955498639346887e-06, "loss": 2.0234, "step": 18880 }, { "epoch": 1.20896, "grad_norm": 0.6036151051521301, "learning_rate": 3.952297102609252e-06, "loss": 2.0204, "step": 18890 }, { "epoch": 1.2096, "grad_norm": 0.5558540225028992, "learning_rate": 3.949095565871619e-06, "loss": 1.9918, "step": 18900 }, { "epoch": 1.2096, "eval_loss": 2.041818141937256, "eval_runtime": 98.93, "eval_samples_per_second": 10.108, "eval_steps_per_second": 5.054, "step": 18900 }, { "epoch": 1.21024, "grad_norm": 0.56846684217453, "learning_rate": 3.9458940291339846e-06, "loss": 2.0231, "step": 18910 }, { "epoch": 1.21088, "grad_norm": 0.6935490965843201, "learning_rate": 3.94269249239635e-06, "loss": 2.0144, "step": 18920 }, { "epoch": 1.21152, "grad_norm": 0.5987290740013123, "learning_rate": 3.939490955658716e-06, "loss": 2.0177, "step": 18930 }, { "epoch": 1.21216, "grad_norm": 0.5699126124382019, "learning_rate": 3.9362894189210825e-06, "loss": 2.0104, "step": 18940 }, { "epoch": 1.2128, "grad_norm": 0.5563395023345947, "learning_rate": 3.933087882183448e-06, "loss": 1.9968, "step": 18950 }, { "epoch": 1.21344, "grad_norm": 0.560488224029541, "learning_rate": 3.929886345445814e-06, "loss": 2.0503, "step": 18960 }, { "epoch": 1.21408, "grad_norm": 0.6256830096244812, "learning_rate": 3.9266848087081805e-06, "loss": 2.0331, "step": 18970 }, { "epoch": 1.21472, "grad_norm": 0.5946688055992126, "learning_rate": 3.923483271970546e-06, "loss": 1.9896, "step": 18980 }, { "epoch": 1.21536, "grad_norm": 0.6053398251533508, "learning_rate": 3.920281735232912e-06, "loss": 1.998, "step": 18990 }, { "epoch": 1.216, "grad_norm": 0.6324140429496765, "learning_rate": 3.9170801984952785e-06, "loss": 2.005, "step": 19000 }, { "epoch": 1.216, "eval_loss": 2.041613817214966, "eval_runtime": 98.9226, "eval_samples_per_second": 10.109, "eval_steps_per_second": 5.054, "step": 19000 }, { "epoch": 1.21664, "grad_norm": 0.5376859307289124, "learning_rate": 3.913878661757644e-06, "loss": 2.0345, "step": 19010 }, { "epoch": 1.21728, "grad_norm": 0.5733827352523804, "learning_rate": 3.91067712502001e-06, "loss": 2.0347, "step": 19020 }, { "epoch": 1.21792, "grad_norm": 0.5942328572273254, "learning_rate": 3.907475588282376e-06, "loss": 2.03, "step": 19030 }, { "epoch": 1.21856, "grad_norm": 0.5893325209617615, "learning_rate": 3.904274051544742e-06, "loss": 2.0436, "step": 19040 }, { "epoch": 1.2192, "grad_norm": 0.5583692789077759, "learning_rate": 3.901072514807108e-06, "loss": 2.0058, "step": 19050 }, { "epoch": 1.21984, "grad_norm": 0.5637010335922241, "learning_rate": 3.897870978069474e-06, "loss": 2.0126, "step": 19060 }, { "epoch": 1.22048, "grad_norm": 0.5166860222816467, "learning_rate": 3.89466944133184e-06, "loss": 2.0226, "step": 19070 }, { "epoch": 1.22112, "grad_norm": 0.6455560922622681, "learning_rate": 3.891467904594206e-06, "loss": 2.0101, "step": 19080 }, { "epoch": 1.22176, "grad_norm": 0.5828796029090881, "learning_rate": 3.8882663678565716e-06, "loss": 2.0506, "step": 19090 }, { "epoch": 1.2224, "grad_norm": 0.6479492783546448, "learning_rate": 3.885064831118937e-06, "loss": 2.0183, "step": 19100 }, { "epoch": 1.2224, "eval_loss": 2.041250228881836, "eval_runtime": 99.0015, "eval_samples_per_second": 10.101, "eval_steps_per_second": 5.05, "step": 19100 }, { "epoch": 1.22304, "grad_norm": 0.591534435749054, "learning_rate": 3.881863294381303e-06, "loss": 2.0026, "step": 19110 }, { "epoch": 1.2236799999999999, "grad_norm": 0.7330079674720764, "learning_rate": 3.8786617576436695e-06, "loss": 2.0242, "step": 19120 }, { "epoch": 1.22432, "grad_norm": 0.6045952439308167, "learning_rate": 3.875460220906035e-06, "loss": 2.0236, "step": 19130 }, { "epoch": 1.22496, "grad_norm": 0.5437217950820923, "learning_rate": 3.872258684168401e-06, "loss": 2.0217, "step": 19140 }, { "epoch": 1.2256, "grad_norm": 0.5526325106620789, "learning_rate": 3.869057147430767e-06, "loss": 2.0221, "step": 19150 }, { "epoch": 1.22624, "grad_norm": 0.6211245656013489, "learning_rate": 3.865855610693132e-06, "loss": 2.0326, "step": 19160 }, { "epoch": 1.22688, "grad_norm": 0.5961929559707642, "learning_rate": 3.862654073955499e-06, "loss": 2.0135, "step": 19170 }, { "epoch": 1.22752, "grad_norm": 0.5817451477050781, "learning_rate": 3.859452537217865e-06, "loss": 2.0257, "step": 19180 }, { "epoch": 1.22816, "grad_norm": 0.5692046284675598, "learning_rate": 3.85625100048023e-06, "loss": 2.0062, "step": 19190 }, { "epoch": 1.2288000000000001, "grad_norm": 0.5758373141288757, "learning_rate": 3.853049463742597e-06, "loss": 2.028, "step": 19200 }, { "epoch": 1.2288000000000001, "eval_loss": 2.0411949157714844, "eval_runtime": 98.9362, "eval_samples_per_second": 10.108, "eval_steps_per_second": 5.054, "step": 19200 }, { "epoch": 1.22944, "grad_norm": 0.6386515498161316, "learning_rate": 3.849847927004963e-06, "loss": 2.0233, "step": 19210 }, { "epoch": 1.23008, "grad_norm": 0.6881201863288879, "learning_rate": 3.846646390267328e-06, "loss": 2.0164, "step": 19220 }, { "epoch": 1.23072, "grad_norm": 0.6185692548751831, "learning_rate": 3.843444853529695e-06, "loss": 2.0323, "step": 19230 }, { "epoch": 1.23136, "grad_norm": 0.6359484791755676, "learning_rate": 3.840243316792061e-06, "loss": 2.0251, "step": 19240 }, { "epoch": 1.232, "grad_norm": 0.675705075263977, "learning_rate": 3.837041780054426e-06, "loss": 2.0037, "step": 19250 }, { "epoch": 1.23264, "grad_norm": 0.6411193013191223, "learning_rate": 3.833840243316793e-06, "loss": 2.0238, "step": 19260 }, { "epoch": 1.23328, "grad_norm": 0.5814663767814636, "learning_rate": 3.830638706579159e-06, "loss": 2.0344, "step": 19270 }, { "epoch": 1.23392, "grad_norm": 0.6048383116722107, "learning_rate": 3.827437169841524e-06, "loss": 1.9835, "step": 19280 }, { "epoch": 1.23456, "grad_norm": 0.6306132674217224, "learning_rate": 3.824235633103891e-06, "loss": 2.013, "step": 19290 }, { "epoch": 1.2352, "grad_norm": 0.5310275554656982, "learning_rate": 3.8210340963662566e-06, "loss": 2.0315, "step": 19300 }, { "epoch": 1.2352, "eval_loss": 2.041146755218506, "eval_runtime": 98.9899, "eval_samples_per_second": 10.102, "eval_steps_per_second": 5.051, "step": 19300 }, { "epoch": 1.23584, "grad_norm": 0.539045512676239, "learning_rate": 3.817832559628622e-06, "loss": 2.0175, "step": 19310 }, { "epoch": 1.23648, "grad_norm": 0.5589911937713623, "learning_rate": 3.8146310228909876e-06, "loss": 2.0288, "step": 19320 }, { "epoch": 1.23712, "grad_norm": 0.5046820640563965, "learning_rate": 3.811429486153354e-06, "loss": 2.032, "step": 19330 }, { "epoch": 1.23776, "grad_norm": 0.5820785164833069, "learning_rate": 3.80822794941572e-06, "loss": 1.9774, "step": 19340 }, { "epoch": 1.2384, "grad_norm": 0.6001272797584534, "learning_rate": 3.8050264126780855e-06, "loss": 2.0126, "step": 19350 }, { "epoch": 1.23904, "grad_norm": 0.567148745059967, "learning_rate": 3.801824875940452e-06, "loss": 2.0103, "step": 19360 }, { "epoch": 1.23968, "grad_norm": 0.5717241764068604, "learning_rate": 3.798623339202818e-06, "loss": 2.0294, "step": 19370 }, { "epoch": 1.24032, "grad_norm": 0.5383074283599854, "learning_rate": 3.7954218024651835e-06, "loss": 2.0216, "step": 19380 }, { "epoch": 1.24096, "grad_norm": 0.6128333210945129, "learning_rate": 3.7922202657275496e-06, "loss": 2.0299, "step": 19390 }, { "epoch": 1.2416, "grad_norm": 0.5616031885147095, "learning_rate": 3.7890187289899154e-06, "loss": 1.9918, "step": 19400 }, { "epoch": 1.2416, "eval_loss": 2.0412375926971436, "eval_runtime": 98.8709, "eval_samples_per_second": 10.114, "eval_steps_per_second": 5.057, "step": 19400 }, { "epoch": 1.24224, "grad_norm": 0.5566725730895996, "learning_rate": 3.7858171922522815e-06, "loss": 2.0488, "step": 19410 }, { "epoch": 1.24288, "grad_norm": 0.6032694578170776, "learning_rate": 3.7826156555146476e-06, "loss": 2.0278, "step": 19420 }, { "epoch": 1.24352, "grad_norm": 0.5486621260643005, "learning_rate": 3.7794141187770133e-06, "loss": 2.0174, "step": 19430 }, { "epoch": 1.24416, "grad_norm": 0.5362090468406677, "learning_rate": 3.776212582039379e-06, "loss": 2.0443, "step": 19440 }, { "epoch": 1.2448, "grad_norm": 0.5653091669082642, "learning_rate": 3.7730110453017448e-06, "loss": 1.984, "step": 19450 }, { "epoch": 1.2454399999999999, "grad_norm": 0.5224658250808716, "learning_rate": 3.7698095085641113e-06, "loss": 2.033, "step": 19460 }, { "epoch": 1.24608, "grad_norm": 0.5859370827674866, "learning_rate": 3.766607971826477e-06, "loss": 2.0043, "step": 19470 }, { "epoch": 1.24672, "grad_norm": 0.5430259108543396, "learning_rate": 3.7634064350888427e-06, "loss": 2.0233, "step": 19480 }, { "epoch": 1.24736, "grad_norm": 0.5666323900222778, "learning_rate": 3.760204898351209e-06, "loss": 2.0392, "step": 19490 }, { "epoch": 1.248, "grad_norm": 0.6931468844413757, "learning_rate": 3.757003361613575e-06, "loss": 2.0193, "step": 19500 }, { "epoch": 1.248, "eval_loss": 2.0411226749420166, "eval_runtime": 98.7097, "eval_samples_per_second": 10.131, "eval_steps_per_second": 5.065, "step": 19500 }, { "epoch": 1.24864, "grad_norm": 0.5994915962219238, "learning_rate": 3.7538018248759407e-06, "loss": 2.0416, "step": 19510 }, { "epoch": 1.24928, "grad_norm": 0.5854692459106445, "learning_rate": 3.750600288138307e-06, "loss": 2.0547, "step": 19520 }, { "epoch": 1.24992, "grad_norm": 0.58211749792099, "learning_rate": 3.7473987514006726e-06, "loss": 2.0424, "step": 19530 }, { "epoch": 1.2505600000000001, "grad_norm": 0.6145838499069214, "learning_rate": 3.7441972146630383e-06, "loss": 2.0197, "step": 19540 }, { "epoch": 1.2511999999999999, "grad_norm": 0.5879749655723572, "learning_rate": 3.740995677925405e-06, "loss": 2.0284, "step": 19550 }, { "epoch": 1.25184, "grad_norm": 0.5488758683204651, "learning_rate": 3.7377941411877705e-06, "loss": 2.027, "step": 19560 }, { "epoch": 1.25248, "grad_norm": 0.5636323690414429, "learning_rate": 3.7345926044501362e-06, "loss": 2.0064, "step": 19570 }, { "epoch": 1.25312, "grad_norm": 0.6053061485290527, "learning_rate": 3.731391067712502e-06, "loss": 2.0148, "step": 19580 }, { "epoch": 1.25376, "grad_norm": 0.5649168491363525, "learning_rate": 3.7281895309748685e-06, "loss": 2.0224, "step": 19590 }, { "epoch": 1.2544, "grad_norm": 0.5852454900741577, "learning_rate": 3.7249879942372342e-06, "loss": 2.0244, "step": 19600 }, { "epoch": 1.2544, "eval_loss": 2.0403201580047607, "eval_runtime": 98.8341, "eval_samples_per_second": 10.118, "eval_steps_per_second": 5.059, "step": 19600 }, { "epoch": 1.25504, "grad_norm": 0.5796120762825012, "learning_rate": 3.7217864574996e-06, "loss": 2.042, "step": 19610 }, { "epoch": 1.25568, "grad_norm": 0.6857411861419678, "learning_rate": 3.718584920761966e-06, "loss": 1.9767, "step": 19620 }, { "epoch": 1.25632, "grad_norm": 0.7955313324928284, "learning_rate": 3.7153833840243318e-06, "loss": 2.0303, "step": 19630 }, { "epoch": 1.25696, "grad_norm": 0.6094045639038086, "learning_rate": 3.712181847286698e-06, "loss": 2.0435, "step": 19640 }, { "epoch": 1.2576, "grad_norm": 0.574123740196228, "learning_rate": 3.708980310549064e-06, "loss": 1.9984, "step": 19650 }, { "epoch": 1.25824, "grad_norm": 0.5539616346359253, "learning_rate": 3.7057787738114297e-06, "loss": 2.029, "step": 19660 }, { "epoch": 1.25888, "grad_norm": 0.6223415732383728, "learning_rate": 3.7025772370737955e-06, "loss": 2.0268, "step": 19670 }, { "epoch": 1.25952, "grad_norm": 0.5583721995353699, "learning_rate": 3.699375700336162e-06, "loss": 2.0131, "step": 19680 }, { "epoch": 1.26016, "grad_norm": 0.6118350625038147, "learning_rate": 3.6961741635985277e-06, "loss": 2.0043, "step": 19690 }, { "epoch": 1.2608, "grad_norm": 0.5769184827804565, "learning_rate": 3.6929726268608934e-06, "loss": 2.0199, "step": 19700 }, { "epoch": 1.2608, "eval_loss": 2.0401883125305176, "eval_runtime": 98.7506, "eval_samples_per_second": 10.127, "eval_steps_per_second": 5.063, "step": 19700 }, { "epoch": 1.7064935064935065, "grad_norm": 0.5702980160713196, "learning_rate": 3.6897710901232596e-06, "loss": 2.011, "step": 19710 }, { "epoch": 1.7073593073593074, "grad_norm": 0.7027571201324463, "learning_rate": 3.6865695533856257e-06, "loss": 2.027, "step": 19720 }, { "epoch": 1.708225108225108, "grad_norm": 0.6393783092498779, "learning_rate": 3.6833680166479914e-06, "loss": 2.0066, "step": 19730 }, { "epoch": 1.709090909090909, "grad_norm": 0.548751950263977, "learning_rate": 3.680166479910357e-06, "loss": 2.0248, "step": 19740 }, { "epoch": 1.70995670995671, "grad_norm": 0.5644584894180298, "learning_rate": 3.6769649431727233e-06, "loss": 2.0273, "step": 19750 }, { "epoch": 1.7108225108225108, "grad_norm": 0.6028965711593628, "learning_rate": 3.673763406435089e-06, "loss": 2.0544, "step": 19760 }, { "epoch": 1.7116883116883117, "grad_norm": 0.5295466780662537, "learning_rate": 3.6705618696974547e-06, "loss": 2.0201, "step": 19770 }, { "epoch": 1.7125541125541126, "grad_norm": 0.5618656873703003, "learning_rate": 3.6673603329598212e-06, "loss": 2.0543, "step": 19780 }, { "epoch": 1.7134199134199135, "grad_norm": 0.5360836982727051, "learning_rate": 3.664158796222187e-06, "loss": 2.0327, "step": 19790 }, { "epoch": 1.7142857142857144, "grad_norm": 0.5506560206413269, "learning_rate": 3.6609572594845527e-06, "loss": 2.0258, "step": 19800 }, { "epoch": 1.7142857142857144, "eval_loss": 2.0400662422180176, "eval_runtime": 100.1215, "eval_samples_per_second": 9.988, "eval_steps_per_second": 4.994, "step": 19800 }, { "epoch": 1.7151515151515153, "grad_norm": 0.6489508748054504, "learning_rate": 3.657755722746919e-06, "loss": 2.0223, "step": 19810 }, { "epoch": 1.716017316017316, "grad_norm": 0.5744234919548035, "learning_rate": 3.654554186009285e-06, "loss": 2.0207, "step": 19820 }, { "epoch": 1.716883116883117, "grad_norm": 0.5877808332443237, "learning_rate": 3.6513526492716506e-06, "loss": 2.0092, "step": 19830 }, { "epoch": 1.7177489177489178, "grad_norm": 0.7264795899391174, "learning_rate": 3.6481511125340168e-06, "loss": 2.0123, "step": 19840 }, { "epoch": 1.7186147186147185, "grad_norm": 0.5963069200515747, "learning_rate": 3.6449495757963825e-06, "loss": 2.0067, "step": 19850 }, { "epoch": 1.7194805194805194, "grad_norm": 0.6109662055969238, "learning_rate": 3.6417480390587486e-06, "loss": 2.0255, "step": 19860 }, { "epoch": 1.7203463203463203, "grad_norm": 0.6088973879814148, "learning_rate": 3.6385465023211143e-06, "loss": 2.0263, "step": 19870 }, { "epoch": 1.7212121212121212, "grad_norm": 0.5885077714920044, "learning_rate": 3.6353449655834805e-06, "loss": 2.0018, "step": 19880 }, { "epoch": 1.722077922077922, "grad_norm": 0.583109974861145, "learning_rate": 3.632143428845846e-06, "loss": 2.0085, "step": 19890 }, { "epoch": 1.722943722943723, "grad_norm": 0.5861423015594482, "learning_rate": 3.628941892108212e-06, "loss": 2.0149, "step": 19900 }, { "epoch": 1.722943722943723, "eval_loss": 2.0403454303741455, "eval_runtime": 99.8602, "eval_samples_per_second": 10.014, "eval_steps_per_second": 5.007, "step": 19900 }, { "epoch": 1.723809523809524, "grad_norm": 0.6617264747619629, "learning_rate": 3.6257403553705784e-06, "loss": 2.0527, "step": 19910 }, { "epoch": 1.7246753246753248, "grad_norm": 0.5962138772010803, "learning_rate": 3.622538818632944e-06, "loss": 2.0027, "step": 19920 }, { "epoch": 1.7255411255411255, "grad_norm": 0.5586407780647278, "learning_rate": 3.61933728189531e-06, "loss": 2.048, "step": 19930 }, { "epoch": 1.7264069264069264, "grad_norm": 0.6327878832817078, "learning_rate": 3.616135745157676e-06, "loss": 2.0102, "step": 19940 }, { "epoch": 1.7272727272727273, "grad_norm": 0.6179520487785339, "learning_rate": 3.612934208420042e-06, "loss": 2.0093, "step": 19950 }, { "epoch": 1.728138528138528, "grad_norm": 0.5419082641601562, "learning_rate": 3.609732671682408e-06, "loss": 2.0106, "step": 19960 }, { "epoch": 1.7290043290043289, "grad_norm": 0.6235204339027405, "learning_rate": 3.606531134944774e-06, "loss": 2.0257, "step": 19970 }, { "epoch": 1.7298701298701298, "grad_norm": 0.5981021523475647, "learning_rate": 3.6033295982071397e-06, "loss": 2.0155, "step": 19980 }, { "epoch": 1.7307359307359307, "grad_norm": 0.5662804841995239, "learning_rate": 3.6001280614695054e-06, "loss": 2.0286, "step": 19990 }, { "epoch": 1.7316017316017316, "grad_norm": 0.5968201160430908, "learning_rate": 3.5969265247318715e-06, "loss": 2.0087, "step": 20000 }, { "epoch": 1.7316017316017316, "eval_loss": 2.0399529933929443, "eval_runtime": 100.0106, "eval_samples_per_second": 9.999, "eval_steps_per_second": 4.999, "step": 20000 }, { "epoch": 1.7324675324675325, "grad_norm": 0.7249771952629089, "learning_rate": 3.5937249879942376e-06, "loss": 2.0221, "step": 20010 }, { "epoch": 1.7333333333333334, "grad_norm": 0.6328686475753784, "learning_rate": 3.5905234512566034e-06, "loss": 2.0242, "step": 20020 }, { "epoch": 1.7341991341991343, "grad_norm": 0.5426235795021057, "learning_rate": 3.587321914518969e-06, "loss": 2.0397, "step": 20030 }, { "epoch": 1.7350649350649352, "grad_norm": 0.5799441337585449, "learning_rate": 3.5841203777813356e-06, "loss": 1.9964, "step": 20040 }, { "epoch": 1.7359307359307359, "grad_norm": 0.6211297512054443, "learning_rate": 3.5809188410437013e-06, "loss": 1.9779, "step": 20050 }, { "epoch": 1.7367965367965368, "grad_norm": 0.583142876625061, "learning_rate": 3.577717304306067e-06, "loss": 2.0101, "step": 20060 }, { "epoch": 1.7376623376623377, "grad_norm": 0.7151252031326294, "learning_rate": 3.574515767568433e-06, "loss": 2.0254, "step": 20070 }, { "epoch": 1.7385281385281384, "grad_norm": 0.6126326322555542, "learning_rate": 3.571314230830799e-06, "loss": 2.0136, "step": 20080 }, { "epoch": 1.7393939393939393, "grad_norm": 0.5718016028404236, "learning_rate": 3.568112694093165e-06, "loss": 2.043, "step": 20090 }, { "epoch": 1.7402597402597402, "grad_norm": 0.5921180844306946, "learning_rate": 3.564911157355531e-06, "loss": 2.012, "step": 20100 }, { "epoch": 1.7402597402597402, "eval_loss": 2.039822816848755, "eval_runtime": 99.8405, "eval_samples_per_second": 10.016, "eval_steps_per_second": 5.008, "step": 20100 }, { "epoch": 1.741125541125541, "grad_norm": 0.519929051399231, "learning_rate": 3.561709620617897e-06, "loss": 2.0345, "step": 20110 }, { "epoch": 1.741991341991342, "grad_norm": 0.5680990219116211, "learning_rate": 3.5585080838802626e-06, "loss": 2.0114, "step": 20120 }, { "epoch": 1.7428571428571429, "grad_norm": 0.5650716423988342, "learning_rate": 3.555306547142629e-06, "loss": 1.9992, "step": 20130 }, { "epoch": 1.7437229437229438, "grad_norm": 0.5756558179855347, "learning_rate": 3.552105010404995e-06, "loss": 2.0144, "step": 20140 }, { "epoch": 1.7445887445887447, "grad_norm": 0.5543674826622009, "learning_rate": 3.5489034736673606e-06, "loss": 2.0499, "step": 20150 }, { "epoch": 1.7454545454545456, "grad_norm": 0.6690876483917236, "learning_rate": 3.5457019369297263e-06, "loss": 1.9965, "step": 20160 }, { "epoch": 1.7463203463203463, "grad_norm": 0.5855807662010193, "learning_rate": 3.542500400192093e-06, "loss": 2.0421, "step": 20170 }, { "epoch": 1.7471861471861472, "grad_norm": 0.5694772601127625, "learning_rate": 3.5392988634544585e-06, "loss": 2.0244, "step": 20180 }, { "epoch": 1.748051948051948, "grad_norm": 0.5997563004493713, "learning_rate": 3.5360973267168242e-06, "loss": 1.9869, "step": 20190 }, { "epoch": 1.7489177489177488, "grad_norm": 0.6609987616539001, "learning_rate": 3.5328957899791904e-06, "loss": 2.0407, "step": 20200 }, { "epoch": 1.7489177489177488, "eval_loss": 2.039726972579956, "eval_runtime": 99.685, "eval_samples_per_second": 10.032, "eval_steps_per_second": 5.016, "step": 20200 }, { "epoch": 1.7497835497835497, "grad_norm": 0.5179834365844727, "learning_rate": 3.529694253241556e-06, "loss": 1.9891, "step": 20210 }, { "epoch": 1.7506493506493506, "grad_norm": 0.6105602979660034, "learning_rate": 3.526492716503922e-06, "loss": 2.0039, "step": 20220 }, { "epoch": 1.7515151515151515, "grad_norm": 0.610821545124054, "learning_rate": 3.5232911797662883e-06, "loss": 1.9976, "step": 20230 }, { "epoch": 1.7523809523809524, "grad_norm": 0.6439399123191833, "learning_rate": 3.520089643028654e-06, "loss": 1.9953, "step": 20240 }, { "epoch": 1.7532467532467533, "grad_norm": 0.5386427044868469, "learning_rate": 3.5168881062910198e-06, "loss": 2.0095, "step": 20250 }, { "epoch": 1.7541125541125542, "grad_norm": 0.6169285178184509, "learning_rate": 3.5136865695533863e-06, "loss": 2.0514, "step": 20260 }, { "epoch": 1.754978354978355, "grad_norm": 0.6283642649650574, "learning_rate": 3.510485032815752e-06, "loss": 1.9985, "step": 20270 }, { "epoch": 1.755844155844156, "grad_norm": 0.5926968455314636, "learning_rate": 3.5072834960781177e-06, "loss": 2.0144, "step": 20280 }, { "epoch": 1.7567099567099567, "grad_norm": 0.5889548063278198, "learning_rate": 3.5040819593404835e-06, "loss": 2.0257, "step": 20290 }, { "epoch": 1.7575757575757576, "grad_norm": 0.5215528011322021, "learning_rate": 3.5008804226028496e-06, "loss": 2.0168, "step": 20300 }, { "epoch": 1.7575757575757576, "eval_loss": 2.039764642715454, "eval_runtime": 99.6287, "eval_samples_per_second": 10.037, "eval_steps_per_second": 5.019, "step": 20300 }, { "epoch": 1.7584415584415585, "grad_norm": 0.5776051878929138, "learning_rate": 3.4976788858652157e-06, "loss": 2.0154, "step": 20310 }, { "epoch": 1.7593073593073592, "grad_norm": 0.5756100416183472, "learning_rate": 3.4944773491275814e-06, "loss": 2.0076, "step": 20320 }, { "epoch": 1.76017316017316, "grad_norm": 0.6546595096588135, "learning_rate": 3.4912758123899476e-06, "loss": 2.0007, "step": 20330 }, { "epoch": 1.761038961038961, "grad_norm": 0.5286757946014404, "learning_rate": 3.4880742756523133e-06, "loss": 1.995, "step": 20340 }, { "epoch": 1.7619047619047619, "grad_norm": 0.5665123462677002, "learning_rate": 3.484872738914679e-06, "loss": 1.9941, "step": 20350 }, { "epoch": 1.7627705627705628, "grad_norm": 0.6835453510284424, "learning_rate": 3.4816712021770455e-06, "loss": 2.035, "step": 20360 }, { "epoch": 1.7636363636363637, "grad_norm": 0.6274100542068481, "learning_rate": 3.4784696654394113e-06, "loss": 1.9852, "step": 20370 }, { "epoch": 1.7645021645021646, "grad_norm": 0.5485489368438721, "learning_rate": 3.475268128701777e-06, "loss": 2.0197, "step": 20380 }, { "epoch": 1.7653679653679655, "grad_norm": 0.6467198729515076, "learning_rate": 3.472066591964143e-06, "loss": 2.0234, "step": 20390 }, { "epoch": 1.7662337662337664, "grad_norm": 0.6412429809570312, "learning_rate": 3.4688650552265092e-06, "loss": 2.0303, "step": 20400 }, { "epoch": 1.7662337662337664, "eval_loss": 2.039860963821411, "eval_runtime": 99.6419, "eval_samples_per_second": 10.036, "eval_steps_per_second": 5.018, "step": 20400 }, { "epoch": 1.767099567099567, "grad_norm": 0.544453501701355, "learning_rate": 3.465663518488875e-06, "loss": 2.0253, "step": 20410 }, { "epoch": 1.767965367965368, "grad_norm": 0.6093266010284424, "learning_rate": 3.462461981751241e-06, "loss": 2.0106, "step": 20420 }, { "epoch": 1.7688311688311689, "grad_norm": 0.6506460309028625, "learning_rate": 3.4592604450136068e-06, "loss": 2.0196, "step": 20430 }, { "epoch": 1.7696969696969695, "grad_norm": 0.6249178647994995, "learning_rate": 3.4560589082759725e-06, "loss": 2.0308, "step": 20440 }, { "epoch": 1.7705627705627704, "grad_norm": 0.5533695220947266, "learning_rate": 3.4528573715383386e-06, "loss": 2.0006, "step": 20450 }, { "epoch": 1.7714285714285714, "grad_norm": 0.5766792893409729, "learning_rate": 3.4496558348007048e-06, "loss": 1.9828, "step": 20460 }, { "epoch": 1.7722943722943723, "grad_norm": 0.623255729675293, "learning_rate": 3.4464542980630705e-06, "loss": 2.0092, "step": 20470 }, { "epoch": 1.7731601731601732, "grad_norm": 0.6529678106307983, "learning_rate": 3.443252761325436e-06, "loss": 2.0247, "step": 20480 }, { "epoch": 1.774025974025974, "grad_norm": 0.5301777720451355, "learning_rate": 3.4400512245878027e-06, "loss": 2.001, "step": 20490 }, { "epoch": 1.774891774891775, "grad_norm": 0.5703863501548767, "learning_rate": 3.4368496878501684e-06, "loss": 2.0127, "step": 20500 }, { "epoch": 1.774891774891775, "eval_loss": 2.039661169052124, "eval_runtime": 99.803, "eval_samples_per_second": 10.02, "eval_steps_per_second": 5.01, "step": 20500 }, { "epoch": 1.7757575757575759, "grad_norm": 0.5548557639122009, "learning_rate": 3.433648151112534e-06, "loss": 2.0469, "step": 20510 }, { "epoch": 1.7766233766233768, "grad_norm": 0.6673153042793274, "learning_rate": 3.4304466143749003e-06, "loss": 1.9994, "step": 20520 }, { "epoch": 1.7774891774891775, "grad_norm": 0.5282227993011475, "learning_rate": 3.4272450776372664e-06, "loss": 2.0299, "step": 20530 }, { "epoch": 1.7783549783549784, "grad_norm": 0.5832075476646423, "learning_rate": 3.424043540899632e-06, "loss": 2.0186, "step": 20540 }, { "epoch": 1.7792207792207793, "grad_norm": 0.585724413394928, "learning_rate": 3.4208420041619983e-06, "loss": 2.0172, "step": 20550 }, { "epoch": 1.78008658008658, "grad_norm": 0.6119825839996338, "learning_rate": 3.417640467424364e-06, "loss": 1.9985, "step": 20560 }, { "epoch": 1.7809523809523808, "grad_norm": 0.5499480366706848, "learning_rate": 3.4144389306867297e-06, "loss": 2.0324, "step": 20570 }, { "epoch": 1.7818181818181817, "grad_norm": 0.5651506185531616, "learning_rate": 3.4112373939490954e-06, "loss": 2.0494, "step": 20580 }, { "epoch": 1.7826839826839826, "grad_norm": 0.6963779330253601, "learning_rate": 3.408035857211462e-06, "loss": 2.0276, "step": 20590 }, { "epoch": 1.7835497835497836, "grad_norm": 0.6034294366836548, "learning_rate": 3.4048343204738277e-06, "loss": 2.0035, "step": 20600 }, { "epoch": 1.7835497835497836, "eval_loss": 2.0393102169036865, "eval_runtime": 99.6842, "eval_samples_per_second": 10.032, "eval_steps_per_second": 5.016, "step": 20600 }, { "epoch": 1.7844155844155845, "grad_norm": 0.6472154855728149, "learning_rate": 3.4016327837361934e-06, "loss": 1.9977, "step": 20610 }, { "epoch": 1.7852813852813854, "grad_norm": 0.5757026076316833, "learning_rate": 3.39843124699856e-06, "loss": 2.0111, "step": 20620 }, { "epoch": 1.7861471861471863, "grad_norm": 0.5563897490501404, "learning_rate": 3.3952297102609256e-06, "loss": 2.0213, "step": 20630 }, { "epoch": 1.7870129870129872, "grad_norm": 0.707472026348114, "learning_rate": 3.3920281735232914e-06, "loss": 2.014, "step": 20640 }, { "epoch": 1.7878787878787878, "grad_norm": 0.6294756531715393, "learning_rate": 3.3888266367856575e-06, "loss": 2.0185, "step": 20650 }, { "epoch": 1.7887445887445887, "grad_norm": 0.5562750101089478, "learning_rate": 3.385625100048023e-06, "loss": 2.0367, "step": 20660 }, { "epoch": 1.7896103896103897, "grad_norm": 0.5640225410461426, "learning_rate": 3.3824235633103893e-06, "loss": 2.029, "step": 20670 }, { "epoch": 1.7904761904761903, "grad_norm": 0.6213473081588745, "learning_rate": 3.3792220265727555e-06, "loss": 2.016, "step": 20680 }, { "epoch": 1.7913419913419912, "grad_norm": 0.5477250218391418, "learning_rate": 3.376020489835121e-06, "loss": 2.0073, "step": 20690 }, { "epoch": 1.7922077922077921, "grad_norm": 0.539556086063385, "learning_rate": 3.372818953097487e-06, "loss": 2.0243, "step": 20700 }, { "epoch": 1.7922077922077921, "eval_loss": 2.0392966270446777, "eval_runtime": 98.9726, "eval_samples_per_second": 10.104, "eval_steps_per_second": 5.052, "step": 20700 }, { "epoch": 1.793073593073593, "grad_norm": 0.527823805809021, "learning_rate": 3.3696174163598526e-06, "loss": 2.0367, "step": 20710 }, { "epoch": 1.793939393939394, "grad_norm": 0.6325352191925049, "learning_rate": 3.366415879622219e-06, "loss": 2.0059, "step": 20720 }, { "epoch": 1.7948051948051948, "grad_norm": 0.6825228333473206, "learning_rate": 3.363214342884585e-06, "loss": 2.0062, "step": 20730 }, { "epoch": 1.7956709956709958, "grad_norm": 0.7011380195617676, "learning_rate": 3.3600128061469506e-06, "loss": 2.0427, "step": 20740 }, { "epoch": 1.7965367965367967, "grad_norm": 0.5550771355628967, "learning_rate": 3.3568112694093167e-06, "loss": 2.0133, "step": 20750 }, { "epoch": 1.7974025974025976, "grad_norm": 0.6346076130867004, "learning_rate": 3.353609732671683e-06, "loss": 2.0068, "step": 20760 }, { "epoch": 1.7982683982683982, "grad_norm": 0.6987687349319458, "learning_rate": 3.3504081959340485e-06, "loss": 2.0234, "step": 20770 }, { "epoch": 1.7991341991341991, "grad_norm": 0.5877545475959778, "learning_rate": 3.3472066591964147e-06, "loss": 1.9782, "step": 20780 }, { "epoch": 1.8, "grad_norm": 0.59261554479599, "learning_rate": 3.3440051224587804e-06, "loss": 1.9872, "step": 20790 }, { "epoch": 1.8008658008658007, "grad_norm": 0.6158283352851868, "learning_rate": 3.340803585721146e-06, "loss": 2.0009, "step": 20800 }, { "epoch": 1.8008658008658007, "eval_loss": 2.039147138595581, "eval_runtime": 98.9251, "eval_samples_per_second": 10.109, "eval_steps_per_second": 5.054, "step": 20800 }, { "epoch": 1.8017316017316016, "grad_norm": 0.6955145597457886, "learning_rate": 3.3376020489835127e-06, "loss": 2.0368, "step": 20810 }, { "epoch": 1.8025974025974025, "grad_norm": 0.6685484051704407, "learning_rate": 3.3344005122458784e-06, "loss": 2.0246, "step": 20820 }, { "epoch": 1.8034632034632034, "grad_norm": 0.6130850315093994, "learning_rate": 3.331198975508244e-06, "loss": 2.0063, "step": 20830 }, { "epoch": 1.8043290043290043, "grad_norm": 0.6293763518333435, "learning_rate": 3.3279974387706106e-06, "loss": 2.0222, "step": 20840 }, { "epoch": 1.8051948051948052, "grad_norm": 0.5538076758384705, "learning_rate": 3.3247959020329763e-06, "loss": 2.009, "step": 20850 }, { "epoch": 1.8060606060606061, "grad_norm": 0.5926476120948792, "learning_rate": 3.321594365295342e-06, "loss": 2.045, "step": 20860 }, { "epoch": 1.806926406926407, "grad_norm": 0.5465952157974243, "learning_rate": 3.3183928285577078e-06, "loss": 2.0329, "step": 20870 }, { "epoch": 1.807792207792208, "grad_norm": 0.5225395560264587, "learning_rate": 3.315191291820074e-06, "loss": 1.9831, "step": 20880 }, { "epoch": 1.8086580086580086, "grad_norm": 0.5721254348754883, "learning_rate": 3.3119897550824396e-06, "loss": 2.0125, "step": 20890 }, { "epoch": 1.8095238095238095, "grad_norm": 0.49629688262939453, "learning_rate": 3.3087882183448057e-06, "loss": 2.0262, "step": 20900 }, { "epoch": 1.8095238095238095, "eval_loss": 2.0388810634613037, "eval_runtime": 98.8941, "eval_samples_per_second": 10.112, "eval_steps_per_second": 5.056, "step": 20900 }, { "epoch": 1.8103896103896104, "grad_norm": 0.5626356601715088, "learning_rate": 3.305586681607172e-06, "loss": 2.0369, "step": 20910 }, { "epoch": 1.8112554112554111, "grad_norm": 0.5846545696258545, "learning_rate": 3.3023851448695376e-06, "loss": 2.0192, "step": 20920 }, { "epoch": 1.812121212121212, "grad_norm": 0.5293322205543518, "learning_rate": 3.2991836081319033e-06, "loss": 2.0366, "step": 20930 }, { "epoch": 1.812987012987013, "grad_norm": 0.5870088934898376, "learning_rate": 3.29598207139427e-06, "loss": 2.0465, "step": 20940 }, { "epoch": 1.8138528138528138, "grad_norm": 0.6359837651252747, "learning_rate": 3.2927805346566356e-06, "loss": 1.9992, "step": 20950 }, { "epoch": 1.8147186147186147, "grad_norm": 0.577740490436554, "learning_rate": 3.2895789979190013e-06, "loss": 1.9785, "step": 20960 }, { "epoch": 1.8155844155844156, "grad_norm": 0.6007850766181946, "learning_rate": 3.2863774611813674e-06, "loss": 1.9854, "step": 20970 }, { "epoch": 1.8164502164502165, "grad_norm": 0.6609432697296143, "learning_rate": 3.2831759244437335e-06, "loss": 2.0204, "step": 20980 }, { "epoch": 1.8173160173160174, "grad_norm": 0.664561927318573, "learning_rate": 3.2799743877060993e-06, "loss": 1.9866, "step": 20990 }, { "epoch": 1.8181818181818183, "grad_norm": 0.5524464845657349, "learning_rate": 3.276772850968465e-06, "loss": 1.9935, "step": 21000 }, { "epoch": 1.8181818181818183, "eval_loss": 2.038954496383667, "eval_runtime": 99.5448, "eval_samples_per_second": 10.046, "eval_steps_per_second": 5.023, "step": 21000 }, { "epoch": 1.819047619047619, "grad_norm": 0.5762661695480347, "learning_rate": 3.273571314230831e-06, "loss": 2.0434, "step": 21010 }, { "epoch": 1.81991341991342, "grad_norm": 0.6803544163703918, "learning_rate": 3.270369777493197e-06, "loss": 2.0162, "step": 21020 }, { "epoch": 1.8207792207792208, "grad_norm": 0.5953033566474915, "learning_rate": 3.2671682407555625e-06, "loss": 2.0268, "step": 21030 }, { "epoch": 1.8216450216450215, "grad_norm": 0.5579327940940857, "learning_rate": 3.263966704017929e-06, "loss": 2.0112, "step": 21040 }, { "epoch": 1.8225108225108224, "grad_norm": 0.5896299481391907, "learning_rate": 3.2607651672802948e-06, "loss": 1.9973, "step": 21050 }, { "epoch": 1.8233766233766233, "grad_norm": 0.5517136454582214, "learning_rate": 3.2575636305426605e-06, "loss": 2.0676, "step": 21060 }, { "epoch": 1.8242424242424242, "grad_norm": 0.5696994662284851, "learning_rate": 3.254362093805027e-06, "loss": 2.0096, "step": 21070 }, { "epoch": 1.8251082251082251, "grad_norm": 0.602047324180603, "learning_rate": 3.2511605570673928e-06, "loss": 2.024, "step": 21080 }, { "epoch": 1.825974025974026, "grad_norm": 0.531102180480957, "learning_rate": 3.2479590203297585e-06, "loss": 2.0221, "step": 21090 }, { "epoch": 1.826839826839827, "grad_norm": 0.5972487330436707, "learning_rate": 3.2447574835921246e-06, "loss": 2.0167, "step": 21100 }, { "epoch": 1.826839826839827, "eval_loss": 2.038703680038452, "eval_runtime": 99.4301, "eval_samples_per_second": 10.057, "eval_steps_per_second": 5.029, "step": 21100 }, { "epoch": 1.8277056277056278, "grad_norm": 0.6066187620162964, "learning_rate": 3.2415559468544903e-06, "loss": 2.0174, "step": 21110 }, { "epoch": 1.8285714285714287, "grad_norm": 0.5727007985115051, "learning_rate": 3.2383544101168564e-06, "loss": 2.0065, "step": 21120 }, { "epoch": 1.8294372294372294, "grad_norm": 0.6033422946929932, "learning_rate": 3.235152873379222e-06, "loss": 2.0072, "step": 21130 }, { "epoch": 1.8303030303030303, "grad_norm": 0.5761565566062927, "learning_rate": 3.2319513366415883e-06, "loss": 2.0455, "step": 21140 }, { "epoch": 1.8311688311688312, "grad_norm": 0.5772866606712341, "learning_rate": 3.228749799903954e-06, "loss": 2.0235, "step": 21150 }, { "epoch": 1.832034632034632, "grad_norm": 0.5313910841941833, "learning_rate": 3.2255482631663197e-06, "loss": 1.9966, "step": 21160 }, { "epoch": 1.8329004329004328, "grad_norm": 0.5462523698806763, "learning_rate": 3.2223467264286863e-06, "loss": 2.0213, "step": 21170 }, { "epoch": 1.8337662337662337, "grad_norm": 0.5453023314476013, "learning_rate": 3.219145189691052e-06, "loss": 2.0008, "step": 21180 }, { "epoch": 1.8346320346320346, "grad_norm": 0.5670868158340454, "learning_rate": 3.2159436529534177e-06, "loss": 2.0036, "step": 21190 }, { "epoch": 1.8354978354978355, "grad_norm": 0.6061764359474182, "learning_rate": 3.212742116215784e-06, "loss": 2.0033, "step": 21200 }, { "epoch": 1.8354978354978355, "eval_loss": 2.038498640060425, "eval_runtime": 99.564, "eval_samples_per_second": 10.044, "eval_steps_per_second": 5.022, "step": 21200 }, { "epoch": 1.8363636363636364, "grad_norm": 0.5856644511222839, "learning_rate": 3.20954057947815e-06, "loss": 2.0251, "step": 21210 }, { "epoch": 1.8372294372294373, "grad_norm": 0.5439422726631165, "learning_rate": 3.2063390427405157e-06, "loss": 2.0237, "step": 21220 }, { "epoch": 1.8380952380952382, "grad_norm": 0.585919976234436, "learning_rate": 3.203137506002882e-06, "loss": 2.021, "step": 21230 }, { "epoch": 1.838961038961039, "grad_norm": 0.6343615055084229, "learning_rate": 3.1999359692652475e-06, "loss": 2.0149, "step": 21240 }, { "epoch": 1.8398268398268398, "grad_norm": 0.6096217036247253, "learning_rate": 3.1967344325276132e-06, "loss": 2.0069, "step": 21250 }, { "epoch": 1.8406926406926407, "grad_norm": 0.5958551168441772, "learning_rate": 3.1935328957899798e-06, "loss": 2.004, "step": 21260 }, { "epoch": 1.8415584415584414, "grad_norm": 0.6105349659919739, "learning_rate": 3.1903313590523455e-06, "loss": 1.9986, "step": 21270 }, { "epoch": 1.8424242424242423, "grad_norm": 0.6545379757881165, "learning_rate": 3.187129822314711e-06, "loss": 2.0178, "step": 21280 }, { "epoch": 1.8432900432900432, "grad_norm": 0.6139872670173645, "learning_rate": 3.183928285577077e-06, "loss": 2.0106, "step": 21290 }, { "epoch": 1.844155844155844, "grad_norm": 0.5510513782501221, "learning_rate": 3.1807267488394435e-06, "loss": 2.0068, "step": 21300 }, { "epoch": 1.844155844155844, "eval_loss": 2.038245439529419, "eval_runtime": 99.354, "eval_samples_per_second": 10.065, "eval_steps_per_second": 5.033, "step": 21300 }, { "epoch": 1.845021645021645, "grad_norm": 0.6139474511146545, "learning_rate": 3.177525212101809e-06, "loss": 2.0397, "step": 21310 }, { "epoch": 1.845887445887446, "grad_norm": 0.5427426099777222, "learning_rate": 3.174323675364175e-06, "loss": 2.0191, "step": 21320 }, { "epoch": 1.8467532467532468, "grad_norm": 0.5988373756408691, "learning_rate": 3.171122138626541e-06, "loss": 2.0261, "step": 21330 }, { "epoch": 1.8476190476190477, "grad_norm": 0.5978485941886902, "learning_rate": 3.1679206018889067e-06, "loss": 1.9954, "step": 21340 }, { "epoch": 1.8484848484848486, "grad_norm": 0.5510175228118896, "learning_rate": 3.164719065151273e-06, "loss": 2.0011, "step": 21350 }, { "epoch": 1.8493506493506493, "grad_norm": 0.589741051197052, "learning_rate": 3.161517528413639e-06, "loss": 2.0222, "step": 21360 }, { "epoch": 1.8502164502164502, "grad_norm": 0.6140208840370178, "learning_rate": 3.1583159916760047e-06, "loss": 2.0232, "step": 21370 }, { "epoch": 1.851082251082251, "grad_norm": 0.6576170921325684, "learning_rate": 3.1551144549383704e-06, "loss": 2.039, "step": 21380 }, { "epoch": 1.8519480519480518, "grad_norm": 0.7040467858314514, "learning_rate": 3.151912918200737e-06, "loss": 2.0246, "step": 21390 }, { "epoch": 1.8528138528138527, "grad_norm": 0.6736012697219849, "learning_rate": 3.1487113814631027e-06, "loss": 2.0045, "step": 21400 }, { "epoch": 1.8528138528138527, "eval_loss": 2.038057327270508, "eval_runtime": 99.3538, "eval_samples_per_second": 10.065, "eval_steps_per_second": 5.033, "step": 21400 }, { "epoch": 1.8536796536796536, "grad_norm": 0.536457359790802, "learning_rate": 3.1455098447254684e-06, "loss": 1.9998, "step": 21410 }, { "epoch": 1.8545454545454545, "grad_norm": 0.5989015698432922, "learning_rate": 3.142308307987834e-06, "loss": 2.0318, "step": 21420 }, { "epoch": 1.8554112554112554, "grad_norm": 0.5948752760887146, "learning_rate": 3.1391067712502007e-06, "loss": 1.9903, "step": 21430 }, { "epoch": 1.8562770562770563, "grad_norm": 0.6903519034385681, "learning_rate": 3.1359052345125664e-06, "loss": 1.9955, "step": 21440 }, { "epoch": 1.8571428571428572, "grad_norm": 0.5571711659431458, "learning_rate": 3.132703697774932e-06, "loss": 2.0192, "step": 21450 }, { "epoch": 1.858008658008658, "grad_norm": 0.5514909029006958, "learning_rate": 3.129502161037298e-06, "loss": 1.9942, "step": 21460 }, { "epoch": 1.858874458874459, "grad_norm": 0.6278780698776245, "learning_rate": 3.126300624299664e-06, "loss": 2.0145, "step": 21470 }, { "epoch": 1.8597402597402597, "grad_norm": 0.587446928024292, "learning_rate": 3.1230990875620296e-06, "loss": 2.0016, "step": 21480 }, { "epoch": 1.8606060606060606, "grad_norm": 0.6872239708900452, "learning_rate": 3.119897550824396e-06, "loss": 2.0476, "step": 21490 }, { "epoch": 1.8614718614718615, "grad_norm": 0.5873289108276367, "learning_rate": 3.116696014086762e-06, "loss": 2.0128, "step": 21500 }, { "epoch": 1.8614718614718615, "eval_loss": 2.037977457046509, "eval_runtime": 99.2661, "eval_samples_per_second": 10.074, "eval_steps_per_second": 5.037, "step": 21500 }, { "epoch": 1.8623376623376622, "grad_norm": 0.631047785282135, "learning_rate": 3.1134944773491276e-06, "loss": 1.984, "step": 21510 }, { "epoch": 1.863203463203463, "grad_norm": 0.5543110966682434, "learning_rate": 3.110292940611494e-06, "loss": 1.9967, "step": 21520 }, { "epoch": 1.864069264069264, "grad_norm": 0.5351025462150574, "learning_rate": 3.10709140387386e-06, "loss": 2.0258, "step": 21530 }, { "epoch": 1.864935064935065, "grad_norm": 0.596958339214325, "learning_rate": 3.1038898671362256e-06, "loss": 2.0264, "step": 21540 }, { "epoch": 1.8658008658008658, "grad_norm": 0.6210545301437378, "learning_rate": 3.1006883303985917e-06, "loss": 2.0105, "step": 21550 }, { "epoch": 1.8666666666666667, "grad_norm": 0.6117931008338928, "learning_rate": 3.0974867936609574e-06, "loss": 2.0459, "step": 21560 }, { "epoch": 1.8675324675324676, "grad_norm": 0.5893979072570801, "learning_rate": 3.0942852569233236e-06, "loss": 2.015, "step": 21570 }, { "epoch": 1.8683982683982685, "grad_norm": 0.6391885280609131, "learning_rate": 3.0910837201856893e-06, "loss": 2.0035, "step": 21580 }, { "epoch": 1.8692640692640694, "grad_norm": 0.5598975419998169, "learning_rate": 3.0878821834480554e-06, "loss": 2.0405, "step": 21590 }, { "epoch": 1.87012987012987, "grad_norm": 0.5230623483657837, "learning_rate": 3.084680646710421e-06, "loss": 2.0182, "step": 21600 }, { "epoch": 1.87012987012987, "eval_loss": 2.037860870361328, "eval_runtime": 99.3825, "eval_samples_per_second": 10.062, "eval_steps_per_second": 5.031, "step": 21600 }, { "epoch": 1.870995670995671, "grad_norm": 0.6047036647796631, "learning_rate": 3.081479109972787e-06, "loss": 2.0015, "step": 21610 }, { "epoch": 1.871861471861472, "grad_norm": 0.573807954788208, "learning_rate": 3.0782775732351534e-06, "loss": 2.003, "step": 21620 }, { "epoch": 1.8727272727272726, "grad_norm": 0.5762017965316772, "learning_rate": 3.075076036497519e-06, "loss": 2.0432, "step": 21630 }, { "epoch": 1.8735930735930735, "grad_norm": 0.5771380662918091, "learning_rate": 3.071874499759885e-06, "loss": 1.996, "step": 21640 }, { "epoch": 1.8744588744588744, "grad_norm": 0.6185895204544067, "learning_rate": 3.0686729630222514e-06, "loss": 1.9864, "step": 21650 }, { "epoch": 1.8753246753246753, "grad_norm": 0.5651230812072754, "learning_rate": 3.065471426284617e-06, "loss": 2.0112, "step": 21660 }, { "epoch": 1.8761904761904762, "grad_norm": 0.5942379236221313, "learning_rate": 3.0622698895469828e-06, "loss": 1.9949, "step": 21670 }, { "epoch": 1.877056277056277, "grad_norm": 0.6241719126701355, "learning_rate": 3.059068352809349e-06, "loss": 2.0335, "step": 21680 }, { "epoch": 1.877922077922078, "grad_norm": 0.5679668188095093, "learning_rate": 3.0558668160717146e-06, "loss": 2.0457, "step": 21690 }, { "epoch": 1.878787878787879, "grad_norm": 0.555386483669281, "learning_rate": 3.0526652793340803e-06, "loss": 2.0202, "step": 21700 }, { "epoch": 1.878787878787879, "eval_loss": 2.037943124771118, "eval_runtime": 98.9297, "eval_samples_per_second": 10.108, "eval_steps_per_second": 5.054, "step": 21700 }, { "epoch": 1.8796536796536798, "grad_norm": 0.5467822551727295, "learning_rate": 3.0494637425964465e-06, "loss": 2.0007, "step": 21710 }, { "epoch": 1.8805194805194805, "grad_norm": 0.5980138778686523, "learning_rate": 3.0462622058588126e-06, "loss": 2.0131, "step": 21720 }, { "epoch": 1.8813852813852814, "grad_norm": 0.5898545980453491, "learning_rate": 3.0430606691211783e-06, "loss": 2.0008, "step": 21730 }, { "epoch": 1.8822510822510823, "grad_norm": 0.5178360342979431, "learning_rate": 3.039859132383544e-06, "loss": 2.0268, "step": 21740 }, { "epoch": 1.883116883116883, "grad_norm": 0.5681174397468567, "learning_rate": 3.0366575956459106e-06, "loss": 2.0395, "step": 21750 }, { "epoch": 1.8839826839826839, "grad_norm": 0.5684530138969421, "learning_rate": 3.0334560589082763e-06, "loss": 2.0093, "step": 21760 }, { "epoch": 1.8848484848484848, "grad_norm": 0.5369515419006348, "learning_rate": 3.030254522170642e-06, "loss": 2.0208, "step": 21770 }, { "epoch": 1.8857142857142857, "grad_norm": 0.596129834651947, "learning_rate": 3.027052985433008e-06, "loss": 2.0012, "step": 21780 }, { "epoch": 1.8865800865800866, "grad_norm": 0.5867626667022705, "learning_rate": 3.0238514486953743e-06, "loss": 2.0238, "step": 21790 }, { "epoch": 1.8874458874458875, "grad_norm": 0.5560017824172974, "learning_rate": 3.02064991195774e-06, "loss": 2.0307, "step": 21800 }, { "epoch": 1.8874458874458875, "eval_loss": 2.0375423431396484, "eval_runtime": 99.0735, "eval_samples_per_second": 10.094, "eval_steps_per_second": 5.047, "step": 21800 }, { "epoch": 1.8883116883116884, "grad_norm": 0.5294777154922485, "learning_rate": 3.017448375220106e-06, "loss": 2.0005, "step": 21810 }, { "epoch": 1.8891774891774893, "grad_norm": 0.5869369506835938, "learning_rate": 3.014246838482472e-06, "loss": 2.0027, "step": 21820 }, { "epoch": 1.8900432900432902, "grad_norm": 0.6573461294174194, "learning_rate": 3.0110453017448375e-06, "loss": 2.0265, "step": 21830 }, { "epoch": 1.8909090909090909, "grad_norm": 0.6510856747627258, "learning_rate": 3.0078437650072032e-06, "loss": 2.0231, "step": 21840 }, { "epoch": 1.8917748917748918, "grad_norm": 0.5690562129020691, "learning_rate": 3.00464222826957e-06, "loss": 2.0043, "step": 21850 }, { "epoch": 1.8926406926406927, "grad_norm": 0.5478546619415283, "learning_rate": 3.0014406915319355e-06, "loss": 2.0262, "step": 21860 }, { "epoch": 1.8935064935064934, "grad_norm": 0.6657856106758118, "learning_rate": 2.9982391547943012e-06, "loss": 1.9971, "step": 21870 }, { "epoch": 1.8943722943722943, "grad_norm": 0.5432700514793396, "learning_rate": 2.9950376180566678e-06, "loss": 2.0038, "step": 21880 }, { "epoch": 1.8952380952380952, "grad_norm": 0.6394188404083252, "learning_rate": 2.9918360813190335e-06, "loss": 2.012, "step": 21890 }, { "epoch": 1.896103896103896, "grad_norm": 0.5569697618484497, "learning_rate": 2.988634544581399e-06, "loss": 2.0136, "step": 21900 }, { "epoch": 1.896103896103896, "eval_loss": 2.0372016429901123, "eval_runtime": 99.5333, "eval_samples_per_second": 10.047, "eval_steps_per_second": 5.023, "step": 21900 }, { "epoch": 1.896969696969697, "grad_norm": 0.5097722411155701, "learning_rate": 2.9854330078437653e-06, "loss": 2.0326, "step": 21910 }, { "epoch": 1.8978354978354979, "grad_norm": 0.6028870344161987, "learning_rate": 2.982231471106131e-06, "loss": 2.0245, "step": 21920 }, { "epoch": 1.8987012987012988, "grad_norm": 0.6038254499435425, "learning_rate": 2.979029934368497e-06, "loss": 2.0398, "step": 21930 }, { "epoch": 1.8995670995670997, "grad_norm": 0.5593138933181763, "learning_rate": 2.9758283976308633e-06, "loss": 1.9825, "step": 21940 }, { "epoch": 1.9004329004329006, "grad_norm": 0.6215075254440308, "learning_rate": 2.972626860893229e-06, "loss": 2.0137, "step": 21950 }, { "epoch": 1.9012987012987013, "grad_norm": 0.59126877784729, "learning_rate": 2.9694253241555947e-06, "loss": 2.0392, "step": 21960 }, { "epoch": 1.9021645021645022, "grad_norm": 0.6107071042060852, "learning_rate": 2.9662237874179613e-06, "loss": 2.0244, "step": 21970 }, { "epoch": 1.903030303030303, "grad_norm": 0.5983561873435974, "learning_rate": 2.963022250680327e-06, "loss": 2.0302, "step": 21980 }, { "epoch": 1.9038961038961038, "grad_norm": 0.6307891607284546, "learning_rate": 2.9598207139426927e-06, "loss": 1.9965, "step": 21990 }, { "epoch": 1.9047619047619047, "grad_norm": 0.623501181602478, "learning_rate": 2.9566191772050584e-06, "loss": 2.0181, "step": 22000 }, { "epoch": 1.9047619047619047, "eval_loss": 2.0373153686523438, "eval_runtime": 99.5258, "eval_samples_per_second": 10.048, "eval_steps_per_second": 5.024, "step": 22000 }, { "epoch": 1.9056277056277056, "grad_norm": 0.5510867238044739, "learning_rate": 2.9534176404674245e-06, "loss": 2.0109, "step": 22010 }, { "epoch": 1.9064935064935065, "grad_norm": 0.6466839909553528, "learning_rate": 2.9502161037297907e-06, "loss": 2.0275, "step": 22020 }, { "epoch": 1.9073593073593074, "grad_norm": 0.547214925289154, "learning_rate": 2.9470145669921564e-06, "loss": 2.0352, "step": 22030 }, { "epoch": 1.9082251082251083, "grad_norm": 0.6152887344360352, "learning_rate": 2.9438130302545225e-06, "loss": 1.9707, "step": 22040 }, { "epoch": 1.9090909090909092, "grad_norm": 0.6470698714256287, "learning_rate": 2.9406114935168882e-06, "loss": 2.0254, "step": 22050 }, { "epoch": 1.90995670995671, "grad_norm": 0.5699059367179871, "learning_rate": 2.937409956779254e-06, "loss": 1.9882, "step": 22060 }, { "epoch": 1.910822510822511, "grad_norm": 0.625361979007721, "learning_rate": 2.9342084200416205e-06, "loss": 2.0048, "step": 22070 }, { "epoch": 1.9116883116883117, "grad_norm": 0.5219143629074097, "learning_rate": 2.931006883303986e-06, "loss": 2.0008, "step": 22080 }, { "epoch": 1.9125541125541126, "grad_norm": 0.5905846953392029, "learning_rate": 2.927805346566352e-06, "loss": 1.9883, "step": 22090 }, { "epoch": 1.9134199134199135, "grad_norm": 0.5829472541809082, "learning_rate": 2.9246038098287185e-06, "loss": 2.0223, "step": 22100 }, { "epoch": 1.9134199134199135, "eval_loss": 2.0372154712677, "eval_runtime": 99.6277, "eval_samples_per_second": 10.037, "eval_steps_per_second": 5.019, "step": 22100 }, { "epoch": 1.9142857142857141, "grad_norm": 0.5397132635116577, "learning_rate": 2.921402273091084e-06, "loss": 2.0253, "step": 22110 }, { "epoch": 1.915151515151515, "grad_norm": 0.5382157564163208, "learning_rate": 2.91820073635345e-06, "loss": 2.0102, "step": 22120 }, { "epoch": 1.916017316017316, "grad_norm": 0.5529114007949829, "learning_rate": 2.9149991996158156e-06, "loss": 2.0336, "step": 22130 }, { "epoch": 1.9168831168831169, "grad_norm": 0.5019690990447998, "learning_rate": 2.9117976628781817e-06, "loss": 2.0327, "step": 22140 }, { "epoch": 1.9177489177489178, "grad_norm": 0.5692863464355469, "learning_rate": 2.9085961261405475e-06, "loss": 2.013, "step": 22150 }, { "epoch": 1.9186147186147187, "grad_norm": 0.5849686861038208, "learning_rate": 2.9053945894029136e-06, "loss": 2.0447, "step": 22160 }, { "epoch": 1.9194805194805196, "grad_norm": 0.6239033341407776, "learning_rate": 2.9021930526652797e-06, "loss": 2.0144, "step": 22170 }, { "epoch": 1.9203463203463205, "grad_norm": 0.558525800704956, "learning_rate": 2.8989915159276454e-06, "loss": 2.0499, "step": 22180 }, { "epoch": 1.9212121212121214, "grad_norm": 0.5736129283905029, "learning_rate": 2.895789979190011e-06, "loss": 2.024, "step": 22190 }, { "epoch": 1.922077922077922, "grad_norm": 0.5700423717498779, "learning_rate": 2.8925884424523777e-06, "loss": 2.0072, "step": 22200 }, { "epoch": 1.922077922077922, "eval_loss": 2.036369562149048, "eval_runtime": 99.4116, "eval_samples_per_second": 10.059, "eval_steps_per_second": 5.03, "step": 22200 }, { "epoch": 1.922943722943723, "grad_norm": 0.6457517743110657, "learning_rate": 2.8893869057147434e-06, "loss": 2.0013, "step": 22210 }, { "epoch": 1.9238095238095239, "grad_norm": 0.6279447674751282, "learning_rate": 2.886185368977109e-06, "loss": 1.9793, "step": 22220 }, { "epoch": 1.9246753246753245, "grad_norm": 0.6363939642906189, "learning_rate": 2.8829838322394752e-06, "loss": 2.0159, "step": 22230 }, { "epoch": 1.9255411255411254, "grad_norm": 0.6827042102813721, "learning_rate": 2.8797822955018414e-06, "loss": 2.0152, "step": 22240 }, { "epoch": 1.9264069264069263, "grad_norm": 0.6012581586837769, "learning_rate": 2.876580758764207e-06, "loss": 2.024, "step": 22250 }, { "epoch": 1.9272727272727272, "grad_norm": 0.5410102009773254, "learning_rate": 2.8733792220265732e-06, "loss": 2.0303, "step": 22260 }, { "epoch": 1.9281385281385282, "grad_norm": 0.5607244372367859, "learning_rate": 2.870177685288939e-06, "loss": 2.0185, "step": 22270 }, { "epoch": 1.929004329004329, "grad_norm": 0.5890989899635315, "learning_rate": 2.8669761485513046e-06, "loss": 1.9857, "step": 22280 }, { "epoch": 1.92987012987013, "grad_norm": 0.5300803780555725, "learning_rate": 2.8637746118136704e-06, "loss": 2.0256, "step": 22290 }, { "epoch": 1.9307359307359309, "grad_norm": 0.5861728191375732, "learning_rate": 2.860573075076037e-06, "loss": 2.0206, "step": 22300 }, { "epoch": 1.9307359307359309, "eval_loss": 2.036616802215576, "eval_runtime": 99.5185, "eval_samples_per_second": 10.048, "eval_steps_per_second": 5.024, "step": 22300 }, { "epoch": 1.9316017316017318, "grad_norm": 0.5773259401321411, "learning_rate": 2.8573715383384026e-06, "loss": 2.0219, "step": 22310 }, { "epoch": 1.9324675324675324, "grad_norm": 0.723914623260498, "learning_rate": 2.8541700016007683e-06, "loss": 2.0343, "step": 22320 }, { "epoch": 1.9333333333333333, "grad_norm": 0.5671261548995972, "learning_rate": 2.850968464863135e-06, "loss": 2.0309, "step": 22330 }, { "epoch": 1.9341991341991343, "grad_norm": 0.566591203212738, "learning_rate": 2.8477669281255006e-06, "loss": 2.0093, "step": 22340 }, { "epoch": 1.935064935064935, "grad_norm": 0.5599043965339661, "learning_rate": 2.8445653913878663e-06, "loss": 2.0006, "step": 22350 }, { "epoch": 1.9359307359307358, "grad_norm": 0.5229726433753967, "learning_rate": 2.8413638546502324e-06, "loss": 2.0483, "step": 22360 }, { "epoch": 1.9367965367965367, "grad_norm": 0.5757012367248535, "learning_rate": 2.838162317912598e-06, "loss": 2.0317, "step": 22370 }, { "epoch": 1.9376623376623376, "grad_norm": 0.5612809658050537, "learning_rate": 2.8349607811749643e-06, "loss": 2.0107, "step": 22380 }, { "epoch": 1.9385281385281385, "grad_norm": 0.6425288319587708, "learning_rate": 2.8317592444373304e-06, "loss": 1.9869, "step": 22390 }, { "epoch": 1.9393939393939394, "grad_norm": 0.5596022605895996, "learning_rate": 2.828557707699696e-06, "loss": 1.9866, "step": 22400 }, { "epoch": 1.9393939393939394, "eval_loss": 2.0361886024475098, "eval_runtime": 99.3361, "eval_samples_per_second": 10.067, "eval_steps_per_second": 5.033, "step": 22400 }, { "epoch": 1.9402597402597404, "grad_norm": 0.5739807486534119, "learning_rate": 2.825356170962062e-06, "loss": 1.9978, "step": 22410 }, { "epoch": 1.9411255411255413, "grad_norm": 0.5819233059883118, "learning_rate": 2.8221546342244276e-06, "loss": 1.9841, "step": 22420 }, { "epoch": 1.9419913419913422, "grad_norm": 0.5564745664596558, "learning_rate": 2.818953097486794e-06, "loss": 2.0194, "step": 22430 }, { "epoch": 1.9428571428571428, "grad_norm": 0.5947351455688477, "learning_rate": 2.81575156074916e-06, "loss": 1.9965, "step": 22440 }, { "epoch": 1.9437229437229437, "grad_norm": 0.6048354506492615, "learning_rate": 2.8125500240115255e-06, "loss": 2.0291, "step": 22450 }, { "epoch": 1.9445887445887444, "grad_norm": 0.5926880836486816, "learning_rate": 2.8093484872738917e-06, "loss": 2.0123, "step": 22460 }, { "epoch": 1.9454545454545453, "grad_norm": 0.5721975564956665, "learning_rate": 2.806146950536258e-06, "loss": 2.0127, "step": 22470 }, { "epoch": 1.9463203463203462, "grad_norm": 0.5987154245376587, "learning_rate": 2.8029454137986235e-06, "loss": 2.003, "step": 22480 }, { "epoch": 1.9471861471861471, "grad_norm": 0.6043938994407654, "learning_rate": 2.7997438770609896e-06, "loss": 1.9983, "step": 22490 }, { "epoch": 1.948051948051948, "grad_norm": 0.5845421552658081, "learning_rate": 2.7965423403233553e-06, "loss": 2.0467, "step": 22500 }, { "epoch": 1.948051948051948, "eval_loss": 2.036294460296631, "eval_runtime": 99.4558, "eval_samples_per_second": 10.055, "eval_steps_per_second": 5.027, "step": 22500 }, { "epoch": 1.948917748917749, "grad_norm": 0.5815349221229553, "learning_rate": 2.793340803585721e-06, "loss": 2.0169, "step": 22510 }, { "epoch": 1.9497835497835498, "grad_norm": 0.5612464547157288, "learning_rate": 2.7901392668480876e-06, "loss": 2.0052, "step": 22520 }, { "epoch": 1.9506493506493507, "grad_norm": 0.5927168726921082, "learning_rate": 2.7869377301104533e-06, "loss": 1.998, "step": 22530 }, { "epoch": 1.9515151515151516, "grad_norm": 0.5546810030937195, "learning_rate": 2.783736193372819e-06, "loss": 2.0265, "step": 22540 }, { "epoch": 1.9523809523809523, "grad_norm": 0.5768939256668091, "learning_rate": 2.7805346566351847e-06, "loss": 2.0087, "step": 22550 }, { "epoch": 1.9532467532467532, "grad_norm": 0.5690239071846008, "learning_rate": 2.7773331198975513e-06, "loss": 1.9983, "step": 22560 }, { "epoch": 1.9541125541125541, "grad_norm": 0.5805119276046753, "learning_rate": 2.774131583159917e-06, "loss": 2.0343, "step": 22570 }, { "epoch": 1.9549783549783548, "grad_norm": 0.5649641156196594, "learning_rate": 2.7709300464222827e-06, "loss": 2.0062, "step": 22580 }, { "epoch": 1.9558441558441557, "grad_norm": 0.5249608159065247, "learning_rate": 2.767728509684649e-06, "loss": 2.0005, "step": 22590 }, { "epoch": 1.9567099567099566, "grad_norm": 0.5877872705459595, "learning_rate": 2.7645269729470146e-06, "loss": 1.9854, "step": 22600 }, { "epoch": 1.9567099567099566, "eval_loss": 2.0360538959503174, "eval_runtime": 99.515, "eval_samples_per_second": 10.049, "eval_steps_per_second": 5.024, "step": 22600 }, { "epoch": 1.9575757575757575, "grad_norm": 0.5853782892227173, "learning_rate": 2.7613254362093807e-06, "loss": 2.0381, "step": 22610 }, { "epoch": 1.9584415584415584, "grad_norm": 0.61262446641922, "learning_rate": 2.758123899471747e-06, "loss": 2.0193, "step": 22620 }, { "epoch": 1.9593073593073593, "grad_norm": 0.6083556413650513, "learning_rate": 2.7549223627341125e-06, "loss": 2.0001, "step": 22630 }, { "epoch": 1.9601731601731602, "grad_norm": 0.5128453373908997, "learning_rate": 2.7517208259964783e-06, "loss": 2.0166, "step": 22640 }, { "epoch": 1.9610389610389611, "grad_norm": 0.5564128160476685, "learning_rate": 2.748519289258845e-06, "loss": 2.0321, "step": 22650 }, { "epoch": 1.961904761904762, "grad_norm": 0.5347721576690674, "learning_rate": 2.7453177525212105e-06, "loss": 2.0296, "step": 22660 }, { "epoch": 1.9627705627705627, "grad_norm": 0.5394149422645569, "learning_rate": 2.7421162157835762e-06, "loss": 2.0007, "step": 22670 }, { "epoch": 1.9636363636363636, "grad_norm": 0.6041572093963623, "learning_rate": 2.7389146790459424e-06, "loss": 2.011, "step": 22680 }, { "epoch": 1.9645021645021645, "grad_norm": 0.5431060194969177, "learning_rate": 2.7357131423083085e-06, "loss": 1.9783, "step": 22690 }, { "epoch": 1.9653679653679652, "grad_norm": 0.6576969027519226, "learning_rate": 2.732511605570674e-06, "loss": 2.0401, "step": 22700 }, { "epoch": 1.9653679653679652, "eval_loss": 2.036024332046509, "eval_runtime": 99.6045, "eval_samples_per_second": 10.04, "eval_steps_per_second": 5.02, "step": 22700 }, { "epoch": 1.9662337662337661, "grad_norm": 0.5169672966003418, "learning_rate": 2.72931006883304e-06, "loss": 2.0285, "step": 22710 }, { "epoch": 1.967099567099567, "grad_norm": 0.5584462881088257, "learning_rate": 2.726108532095406e-06, "loss": 2.007, "step": 22720 }, { "epoch": 1.967965367965368, "grad_norm": 0.5199535489082336, "learning_rate": 2.7229069953577718e-06, "loss": 2.0332, "step": 22730 }, { "epoch": 1.9688311688311688, "grad_norm": 0.5056544542312622, "learning_rate": 2.719705458620138e-06, "loss": 2.0138, "step": 22740 }, { "epoch": 1.9696969696969697, "grad_norm": 0.5921425819396973, "learning_rate": 2.716503921882504e-06, "loss": 2.0403, "step": 22750 }, { "epoch": 1.9705627705627706, "grad_norm": 0.5671403408050537, "learning_rate": 2.7133023851448697e-06, "loss": 2.0586, "step": 22760 }, { "epoch": 1.9714285714285715, "grad_norm": 0.5604325532913208, "learning_rate": 2.7101008484072354e-06, "loss": 2.0047, "step": 22770 }, { "epoch": 1.9722943722943724, "grad_norm": 0.6269016265869141, "learning_rate": 2.706899311669602e-06, "loss": 2.0121, "step": 22780 }, { "epoch": 1.9731601731601731, "grad_norm": 0.5508336424827576, "learning_rate": 2.7036977749319677e-06, "loss": 2.0172, "step": 22790 }, { "epoch": 1.974025974025974, "grad_norm": 0.5560627579689026, "learning_rate": 2.7004962381943334e-06, "loss": 2.02, "step": 22800 }, { "epoch": 1.974025974025974, "eval_loss": 2.03597354888916, "eval_runtime": 99.2692, "eval_samples_per_second": 10.074, "eval_steps_per_second": 5.037, "step": 22800 }, { "epoch": 1.974891774891775, "grad_norm": 0.5486915111541748, "learning_rate": 2.6972947014566996e-06, "loss": 2.0256, "step": 22810 }, { "epoch": 1.9757575757575756, "grad_norm": 0.5568284392356873, "learning_rate": 2.6940931647190653e-06, "loss": 1.9966, "step": 22820 }, { "epoch": 1.9766233766233765, "grad_norm": 0.6057902574539185, "learning_rate": 2.6908916279814314e-06, "loss": 2.0174, "step": 22830 }, { "epoch": 1.9774891774891774, "grad_norm": 0.5547239184379578, "learning_rate": 2.687690091243797e-06, "loss": 2.0172, "step": 22840 }, { "epoch": 1.9783549783549783, "grad_norm": 0.6098464727401733, "learning_rate": 2.6844885545061632e-06, "loss": 2.0149, "step": 22850 }, { "epoch": 1.9792207792207792, "grad_norm": 0.6356939077377319, "learning_rate": 2.681287017768529e-06, "loss": 2.0072, "step": 22860 }, { "epoch": 1.9800865800865801, "grad_norm": 0.6061809062957764, "learning_rate": 2.6780854810308947e-06, "loss": 2.0173, "step": 22870 }, { "epoch": 1.980952380952381, "grad_norm": 0.5689536929130554, "learning_rate": 2.6748839442932612e-06, "loss": 2.0075, "step": 22880 }, { "epoch": 1.981818181818182, "grad_norm": 0.5928916931152344, "learning_rate": 2.671682407555627e-06, "loss": 2.0023, "step": 22890 }, { "epoch": 1.9826839826839828, "grad_norm": 0.5256547331809998, "learning_rate": 2.6684808708179926e-06, "loss": 1.9976, "step": 22900 }, { "epoch": 1.9826839826839828, "eval_loss": 2.0356850624084473, "eval_runtime": 99.3113, "eval_samples_per_second": 10.069, "eval_steps_per_second": 5.035, "step": 22900 }, { "epoch": 1.9835497835497835, "grad_norm": 0.6071939468383789, "learning_rate": 2.665279334080359e-06, "loss": 2.0187, "step": 22910 }, { "epoch": 1.9844155844155844, "grad_norm": 0.5771980285644531, "learning_rate": 2.662077797342725e-06, "loss": 2.0217, "step": 22920 }, { "epoch": 1.9852813852813853, "grad_norm": 0.5437718033790588, "learning_rate": 2.6588762606050906e-06, "loss": 1.9815, "step": 22930 }, { "epoch": 1.986147186147186, "grad_norm": 0.531406819820404, "learning_rate": 2.6556747238674568e-06, "loss": 1.9978, "step": 22940 }, { "epoch": 1.987012987012987, "grad_norm": 0.5711106657981873, "learning_rate": 2.6524731871298225e-06, "loss": 2.0179, "step": 22950 }, { "epoch": 1.9878787878787878, "grad_norm": 0.6144083738327026, "learning_rate": 2.649271650392188e-06, "loss": 2.0312, "step": 22960 }, { "epoch": 1.9887445887445887, "grad_norm": 0.6573773622512817, "learning_rate": 2.6460701136545543e-06, "loss": 2.005, "step": 22970 }, { "epoch": 1.9896103896103896, "grad_norm": 0.5698711276054382, "learning_rate": 2.6428685769169204e-06, "loss": 2.028, "step": 22980 }, { "epoch": 1.9904761904761905, "grad_norm": 0.5695561766624451, "learning_rate": 2.639667040179286e-06, "loss": 2.0223, "step": 22990 }, { "epoch": 1.9913419913419914, "grad_norm": 0.5652351379394531, "learning_rate": 2.636465503441652e-06, "loss": 1.9957, "step": 23000 }, { "epoch": 1.9913419913419914, "eval_loss": 2.035548210144043, "eval_runtime": 99.4804, "eval_samples_per_second": 10.052, "eval_steps_per_second": 5.026, "step": 23000 }, { "epoch": 1.9922077922077923, "grad_norm": 0.5072441697120667, "learning_rate": 2.6332639667040184e-06, "loss": 2.0121, "step": 23010 }, { "epoch": 1.9930735930735932, "grad_norm": 0.5116727352142334, "learning_rate": 2.630062429966384e-06, "loss": 2.0439, "step": 23020 }, { "epoch": 1.993939393939394, "grad_norm": 0.5237976908683777, "learning_rate": 2.62686089322875e-06, "loss": 2.0119, "step": 23030 }, { "epoch": 1.9948051948051948, "grad_norm": 0.5784240961074829, "learning_rate": 2.623659356491116e-06, "loss": 2.0212, "step": 23040 }, { "epoch": 1.9956709956709957, "grad_norm": 0.5490781664848328, "learning_rate": 2.620457819753482e-06, "loss": 2.0406, "step": 23050 }, { "epoch": 1.9965367965367964, "grad_norm": 0.5990797281265259, "learning_rate": 2.617256283015848e-06, "loss": 2.0322, "step": 23060 }, { "epoch": 1.9974025974025973, "grad_norm": 0.6267635226249695, "learning_rate": 2.614054746278214e-06, "loss": 2.013, "step": 23070 }, { "epoch": 1.9982683982683982, "grad_norm": 0.5935016870498657, "learning_rate": 2.6108532095405797e-06, "loss": 2.0138, "step": 23080 }, { "epoch": 1.999134199134199, "grad_norm": 0.597457766532898, "learning_rate": 2.6076516728029454e-06, "loss": 2.0064, "step": 23090 }, { "epoch": 2.0, "grad_norm": 0.5814897418022156, "learning_rate": 2.604450136065312e-06, "loss": 2.0201, "step": 23100 }, { "epoch": 2.0, "eval_loss": 2.0357136726379395, "eval_runtime": 99.4873, "eval_samples_per_second": 10.052, "eval_steps_per_second": 5.026, "step": 23100 }, { "epoch": 2.000865800865801, "grad_norm": 0.6136175394058228, "learning_rate": 2.6012485993276776e-06, "loss": 2.0017, "step": 23110 }, { "epoch": 2.001731601731602, "grad_norm": 0.5407949090003967, "learning_rate": 2.5980470625900433e-06, "loss": 2.0178, "step": 23120 }, { "epoch": 2.0025974025974027, "grad_norm": 0.5387251377105713, "learning_rate": 2.594845525852409e-06, "loss": 1.9983, "step": 23130 }, { "epoch": 2.0034632034632036, "grad_norm": 0.5647053122520447, "learning_rate": 2.5916439891147756e-06, "loss": 2.0078, "step": 23140 }, { "epoch": 2.0043290043290045, "grad_norm": 0.619827151298523, "learning_rate": 2.5884424523771413e-06, "loss": 2.03, "step": 23150 }, { "epoch": 2.005194805194805, "grad_norm": 0.5853438973426819, "learning_rate": 2.585240915639507e-06, "loss": 2.0294, "step": 23160 }, { "epoch": 2.006060606060606, "grad_norm": 0.5789020657539368, "learning_rate": 2.582039378901873e-06, "loss": 2.0126, "step": 23170 }, { "epoch": 2.006926406926407, "grad_norm": 0.5304809808731079, "learning_rate": 2.578837842164239e-06, "loss": 2.0435, "step": 23180 }, { "epoch": 2.0077922077922077, "grad_norm": 0.5407286286354065, "learning_rate": 2.575636305426605e-06, "loss": 2.015, "step": 23190 }, { "epoch": 2.0086580086580086, "grad_norm": 0.5430712699890137, "learning_rate": 2.572434768688971e-06, "loss": 2.0227, "step": 23200 }, { "epoch": 2.0086580086580086, "eval_loss": 2.035465955734253, "eval_runtime": 99.5051, "eval_samples_per_second": 10.05, "eval_steps_per_second": 5.025, "step": 23200 }, { "epoch": 2.0095238095238095, "grad_norm": 0.5268262624740601, "learning_rate": 2.569233231951337e-06, "loss": 2.0069, "step": 23210 }, { "epoch": 2.0103896103896104, "grad_norm": 0.6594340205192566, "learning_rate": 2.5660316952137026e-06, "loss": 2.041, "step": 23220 }, { "epoch": 2.0112554112554113, "grad_norm": 0.5596958994865417, "learning_rate": 2.562830158476069e-06, "loss": 2.025, "step": 23230 }, { "epoch": 2.012121212121212, "grad_norm": 0.5416747331619263, "learning_rate": 2.559628621738435e-06, "loss": 1.9938, "step": 23240 }, { "epoch": 2.012987012987013, "grad_norm": 0.5677267909049988, "learning_rate": 2.5564270850008005e-06, "loss": 1.9824, "step": 23250 }, { "epoch": 2.013852813852814, "grad_norm": 0.5407697558403015, "learning_rate": 2.5532255482631663e-06, "loss": 2.0303, "step": 23260 }, { "epoch": 2.014718614718615, "grad_norm": 0.6268892884254456, "learning_rate": 2.5500240115255324e-06, "loss": 2.0254, "step": 23270 }, { "epoch": 2.0155844155844154, "grad_norm": 0.5445701479911804, "learning_rate": 2.5468224747878985e-06, "loss": 2.0285, "step": 23280 }, { "epoch": 2.0164502164502163, "grad_norm": 0.6353744268417358, "learning_rate": 2.5436209380502642e-06, "loss": 1.9719, "step": 23290 }, { "epoch": 2.017316017316017, "grad_norm": 0.7125324606895447, "learning_rate": 2.5404194013126304e-06, "loss": 2.0303, "step": 23300 }, { "epoch": 2.017316017316017, "eval_loss": 2.0351850986480713, "eval_runtime": 99.7756, "eval_samples_per_second": 10.022, "eval_steps_per_second": 5.011, "step": 23300 }, { "epoch": 2.018181818181818, "grad_norm": 0.588187038898468, "learning_rate": 2.537217864574996e-06, "loss": 1.9909, "step": 23310 }, { "epoch": 2.019047619047619, "grad_norm": 0.5819240808486938, "learning_rate": 2.5340163278373618e-06, "loss": 2.033, "step": 23320 }, { "epoch": 2.01991341991342, "grad_norm": 0.5587559342384338, "learning_rate": 2.5308147910997283e-06, "loss": 2.0403, "step": 23330 }, { "epoch": 2.020779220779221, "grad_norm": 0.5237162113189697, "learning_rate": 2.527613254362094e-06, "loss": 2.0311, "step": 23340 }, { "epoch": 2.0216450216450217, "grad_norm": 0.5738016366958618, "learning_rate": 2.5244117176244598e-06, "loss": 2.0086, "step": 23350 }, { "epoch": 2.0225108225108226, "grad_norm": 0.6247041821479797, "learning_rate": 2.5212101808868263e-06, "loss": 2.0163, "step": 23360 }, { "epoch": 2.0233766233766235, "grad_norm": 0.5295172929763794, "learning_rate": 2.518008644149192e-06, "loss": 2.0048, "step": 23370 }, { "epoch": 2.0242424242424244, "grad_norm": 0.5387830138206482, "learning_rate": 2.5148071074115577e-06, "loss": 2.0413, "step": 23380 }, { "epoch": 2.0251082251082253, "grad_norm": 0.5304757952690125, "learning_rate": 2.511605570673924e-06, "loss": 1.9883, "step": 23390 }, { "epoch": 2.0259740259740258, "grad_norm": 0.6326545476913452, "learning_rate": 2.5084040339362896e-06, "loss": 2.0005, "step": 23400 }, { "epoch": 2.0259740259740258, "eval_loss": 2.0352110862731934, "eval_runtime": 99.8194, "eval_samples_per_second": 10.018, "eval_steps_per_second": 5.009, "step": 23400 }, { "epoch": 2.0268398268398267, "grad_norm": 0.604609489440918, "learning_rate": 2.5052024971986553e-06, "loss": 1.9793, "step": 23410 }, { "epoch": 2.0277056277056276, "grad_norm": 0.6415339112281799, "learning_rate": 2.5020009604610214e-06, "loss": 1.9915, "step": 23420 }, { "epoch": 2.0285714285714285, "grad_norm": 0.5685520768165588, "learning_rate": 2.498799423723387e-06, "loss": 1.992, "step": 23430 }, { "epoch": 2.0294372294372294, "grad_norm": 0.6610993146896362, "learning_rate": 2.4955978869857533e-06, "loss": 1.9936, "step": 23440 }, { "epoch": 2.0303030303030303, "grad_norm": 0.5498647093772888, "learning_rate": 2.4923963502481194e-06, "loss": 2.0087, "step": 23450 }, { "epoch": 2.031168831168831, "grad_norm": 0.5626619458198547, "learning_rate": 2.489194813510485e-06, "loss": 2.057, "step": 23460 }, { "epoch": 2.032034632034632, "grad_norm": 0.5277206897735596, "learning_rate": 2.4859932767728512e-06, "loss": 2.0326, "step": 23470 }, { "epoch": 2.032900432900433, "grad_norm": 0.5527384877204895, "learning_rate": 2.4827917400352174e-06, "loss": 1.9978, "step": 23480 }, { "epoch": 2.033766233766234, "grad_norm": 0.5400198698043823, "learning_rate": 2.479590203297583e-06, "loss": 2.0197, "step": 23490 }, { "epoch": 2.034632034632035, "grad_norm": 0.6280678510665894, "learning_rate": 2.4763886665599492e-06, "loss": 2.0244, "step": 23500 }, { "epoch": 2.034632034632035, "eval_loss": 2.034968852996826, "eval_runtime": 99.5755, "eval_samples_per_second": 10.043, "eval_steps_per_second": 5.021, "step": 23500 }, { "epoch": 2.0354978354978357, "grad_norm": 0.6585070490837097, "learning_rate": 2.473187129822315e-06, "loss": 2.0265, "step": 23510 }, { "epoch": 2.036363636363636, "grad_norm": 0.5453212261199951, "learning_rate": 2.4699855930846806e-06, "loss": 2.0263, "step": 23520 }, { "epoch": 2.037229437229437, "grad_norm": 0.5460197329521179, "learning_rate": 2.4667840563470468e-06, "loss": 2.0178, "step": 23530 }, { "epoch": 2.038095238095238, "grad_norm": 0.5552266836166382, "learning_rate": 2.4635825196094125e-06, "loss": 2.0367, "step": 23540 }, { "epoch": 2.038961038961039, "grad_norm": 0.5523292422294617, "learning_rate": 2.4603809828717786e-06, "loss": 2.0116, "step": 23550 }, { "epoch": 2.0398268398268398, "grad_norm": 0.6269523501396179, "learning_rate": 2.4571794461341448e-06, "loss": 2.023, "step": 23560 }, { "epoch": 2.0406926406926407, "grad_norm": 0.6676937937736511, "learning_rate": 2.4539779093965105e-06, "loss": 1.9792, "step": 23570 }, { "epoch": 2.0415584415584416, "grad_norm": 0.6345266699790955, "learning_rate": 2.4507763726588766e-06, "loss": 2.0327, "step": 23580 }, { "epoch": 2.0424242424242425, "grad_norm": 0.5686797499656677, "learning_rate": 2.4475748359212423e-06, "loss": 2.0012, "step": 23590 }, { "epoch": 2.0432900432900434, "grad_norm": 0.5607196092605591, "learning_rate": 2.4443732991836084e-06, "loss": 2.0059, "step": 23600 }, { "epoch": 2.0432900432900434, "eval_loss": 2.0349371433258057, "eval_runtime": 99.6082, "eval_samples_per_second": 10.039, "eval_steps_per_second": 5.02, "step": 23600 }, { "epoch": 2.0441558441558443, "grad_norm": 0.6047848463058472, "learning_rate": 2.4411717624459746e-06, "loss": 2.0386, "step": 23610 }, { "epoch": 2.045021645021645, "grad_norm": 0.5824304819107056, "learning_rate": 2.4379702257083403e-06, "loss": 2.0394, "step": 23620 }, { "epoch": 2.045887445887446, "grad_norm": 0.567637026309967, "learning_rate": 2.434768688970706e-06, "loss": 2.0106, "step": 23630 }, { "epoch": 2.0467532467532465, "grad_norm": 0.5690438151359558, "learning_rate": 2.431567152233072e-06, "loss": 2.0424, "step": 23640 }, { "epoch": 2.0476190476190474, "grad_norm": 0.5576907396316528, "learning_rate": 2.428365615495438e-06, "loss": 2.0108, "step": 23650 }, { "epoch": 2.0484848484848484, "grad_norm": 0.568816602230072, "learning_rate": 2.425164078757804e-06, "loss": 1.9905, "step": 23660 }, { "epoch": 2.0493506493506493, "grad_norm": 0.567893922328949, "learning_rate": 2.4219625420201697e-06, "loss": 2.0397, "step": 23670 }, { "epoch": 2.05021645021645, "grad_norm": 0.5759986639022827, "learning_rate": 2.418761005282536e-06, "loss": 2.013, "step": 23680 }, { "epoch": 2.051082251082251, "grad_norm": 0.5438066124916077, "learning_rate": 2.415559468544902e-06, "loss": 2.0041, "step": 23690 }, { "epoch": 2.051948051948052, "grad_norm": 0.5716127157211304, "learning_rate": 2.4123579318072677e-06, "loss": 2.0247, "step": 23700 }, { "epoch": 2.051948051948052, "eval_loss": 2.0347187519073486, "eval_runtime": 99.6831, "eval_samples_per_second": 10.032, "eval_steps_per_second": 5.016, "step": 23700 }, { "epoch": 2.052813852813853, "grad_norm": 0.5692997574806213, "learning_rate": 2.409156395069634e-06, "loss": 2.0034, "step": 23710 }, { "epoch": 2.0536796536796538, "grad_norm": 0.5693305730819702, "learning_rate": 2.4059548583319995e-06, "loss": 2.0117, "step": 23720 }, { "epoch": 2.0545454545454547, "grad_norm": 0.6145777106285095, "learning_rate": 2.4027533215943656e-06, "loss": 2.0224, "step": 23730 }, { "epoch": 2.0554112554112556, "grad_norm": 0.5658998489379883, "learning_rate": 2.3995517848567313e-06, "loss": 1.9992, "step": 23740 }, { "epoch": 2.0562770562770565, "grad_norm": 0.568269670009613, "learning_rate": 2.3963502481190975e-06, "loss": 1.9953, "step": 23750 }, { "epoch": 2.057142857142857, "grad_norm": 0.5881268382072449, "learning_rate": 2.393148711381463e-06, "loss": 2.021, "step": 23760 }, { "epoch": 2.058008658008658, "grad_norm": 0.6150349974632263, "learning_rate": 2.3899471746438293e-06, "loss": 2.0009, "step": 23770 }, { "epoch": 2.0588744588744587, "grad_norm": 0.515796422958374, "learning_rate": 2.386745637906195e-06, "loss": 1.9927, "step": 23780 }, { "epoch": 2.0597402597402596, "grad_norm": 0.5475239157676697, "learning_rate": 2.383544101168561e-06, "loss": 2.0204, "step": 23790 }, { "epoch": 2.0606060606060606, "grad_norm": 0.6885071396827698, "learning_rate": 2.380342564430927e-06, "loss": 2.012, "step": 23800 }, { "epoch": 2.0606060606060606, "eval_loss": 2.034194231033325, "eval_runtime": 99.5096, "eval_samples_per_second": 10.049, "eval_steps_per_second": 5.025, "step": 23800 }, { "epoch": 3.195973154362416, "grad_norm": 0.5709320306777954, "learning_rate": 2.377141027693293e-06, "loss": 1.995, "step": 23810 }, { "epoch": 3.1973154362416105, "grad_norm": 0.5705162882804871, "learning_rate": 2.373939490955659e-06, "loss": 2.0146, "step": 23820 }, { "epoch": 3.1986577181208053, "grad_norm": 0.5645247101783752, "learning_rate": 2.370737954218025e-06, "loss": 2.0212, "step": 23830 }, { "epoch": 3.2, "grad_norm": 0.6989596486091614, "learning_rate": 2.367536417480391e-06, "loss": 2.0094, "step": 23840 }, { "epoch": 3.2013422818791946, "grad_norm": 0.6202380657196045, "learning_rate": 2.3643348807427567e-06, "loss": 1.9898, "step": 23850 }, { "epoch": 3.2026845637583894, "grad_norm": 0.5662652850151062, "learning_rate": 2.361133344005123e-06, "loss": 2.0212, "step": 23860 }, { "epoch": 3.2040268456375838, "grad_norm": 0.5594598054885864, "learning_rate": 2.3579318072674885e-06, "loss": 2.0415, "step": 23870 }, { "epoch": 3.2053691275167786, "grad_norm": 0.5457987189292908, "learning_rate": 2.3547302705298543e-06, "loss": 2.0357, "step": 23880 }, { "epoch": 3.206711409395973, "grad_norm": 0.5731124877929688, "learning_rate": 2.3515287337922204e-06, "loss": 2.0174, "step": 23890 }, { "epoch": 3.208053691275168, "grad_norm": 0.5775108337402344, "learning_rate": 2.3483271970545865e-06, "loss": 2.0033, "step": 23900 }, { "epoch": 3.208053691275168, "eval_loss": 2.034665584564209, "eval_runtime": 99.7956, "eval_samples_per_second": 10.02, "eval_steps_per_second": 5.01, "step": 23900 }, { "epoch": 3.209395973154362, "grad_norm": 0.6452018022537231, "learning_rate": 2.3451256603169522e-06, "loss": 2.0048, "step": 23910 }, { "epoch": 3.210738255033557, "grad_norm": 0.5958179235458374, "learning_rate": 2.3419241235793184e-06, "loss": 2.0017, "step": 23920 }, { "epoch": 3.212080536912752, "grad_norm": 0.6147748231887817, "learning_rate": 2.338722586841684e-06, "loss": 2.0273, "step": 23930 }, { "epoch": 3.2134228187919462, "grad_norm": 0.5603628754615784, "learning_rate": 2.33552105010405e-06, "loss": 2.0264, "step": 23940 }, { "epoch": 3.214765100671141, "grad_norm": 0.5753715634346008, "learning_rate": 2.3323195133664163e-06, "loss": 2.0444, "step": 23950 }, { "epoch": 3.2161073825503355, "grad_norm": 0.5697400569915771, "learning_rate": 2.329117976628782e-06, "loss": 1.9961, "step": 23960 }, { "epoch": 3.2174496644295303, "grad_norm": 0.5242548584938049, "learning_rate": 2.3259164398911478e-06, "loss": 2.0079, "step": 23970 }, { "epoch": 3.2187919463087247, "grad_norm": 0.5654411315917969, "learning_rate": 2.322714903153514e-06, "loss": 2.0124, "step": 23980 }, { "epoch": 3.2201342281879195, "grad_norm": 0.5427534580230713, "learning_rate": 2.3195133664158796e-06, "loss": 1.9617, "step": 23990 }, { "epoch": 3.221476510067114, "grad_norm": 0.5504390597343445, "learning_rate": 2.3163118296782457e-06, "loss": 1.9851, "step": 24000 }, { "epoch": 3.221476510067114, "eval_loss": 2.0347743034362793, "eval_runtime": 99.05, "eval_samples_per_second": 10.096, "eval_steps_per_second": 5.048, "step": 24000 }, { "epoch": 3.2228187919463087, "grad_norm": 0.5600200295448303, "learning_rate": 2.3131102929406114e-06, "loss": 2.0122, "step": 24010 }, { "epoch": 3.2241610738255035, "grad_norm": 0.6022480726242065, "learning_rate": 2.3099087562029776e-06, "loss": 1.998, "step": 24020 }, { "epoch": 3.225503355704698, "grad_norm": 0.7161531448364258, "learning_rate": 2.3067072194653437e-06, "loss": 2.0005, "step": 24030 }, { "epoch": 3.2268456375838928, "grad_norm": 0.5841786861419678, "learning_rate": 2.3035056827277094e-06, "loss": 2.0338, "step": 24040 }, { "epoch": 3.228187919463087, "grad_norm": 0.525364100933075, "learning_rate": 2.3003041459900756e-06, "loss": 2.0457, "step": 24050 }, { "epoch": 3.229530201342282, "grad_norm": 0.5649173259735107, "learning_rate": 2.2971026092524417e-06, "loss": 2.011, "step": 24060 }, { "epoch": 3.2308724832214764, "grad_norm": 0.5048159956932068, "learning_rate": 2.2939010725148074e-06, "loss": 2.0205, "step": 24070 }, { "epoch": 3.232214765100671, "grad_norm": 0.5814887881278992, "learning_rate": 2.290699535777173e-06, "loss": 2.0189, "step": 24080 }, { "epoch": 3.2335570469798656, "grad_norm": 0.6833152770996094, "learning_rate": 2.2874979990395392e-06, "loss": 2.0038, "step": 24090 }, { "epoch": 3.2348993288590604, "grad_norm": 0.6447194218635559, "learning_rate": 2.284296462301905e-06, "loss": 2.0058, "step": 24100 }, { "epoch": 3.2348993288590604, "eval_loss": 2.0349581241607666, "eval_runtime": 98.9765, "eval_samples_per_second": 10.103, "eval_steps_per_second": 5.052, "step": 24100 }, { "epoch": 3.2362416107382552, "grad_norm": 0.6656696796417236, "learning_rate": 2.281094925564271e-06, "loss": 2.0291, "step": 24110 }, { "epoch": 3.2375838926174496, "grad_norm": 0.6009058952331543, "learning_rate": 2.277893388826637e-06, "loss": 1.9914, "step": 24120 }, { "epoch": 3.2389261744966444, "grad_norm": 0.5890768766403198, "learning_rate": 2.274691852089003e-06, "loss": 2.0448, "step": 24130 }, { "epoch": 3.240268456375839, "grad_norm": 0.563474714756012, "learning_rate": 2.2714903153513686e-06, "loss": 2.0105, "step": 24140 }, { "epoch": 3.2416107382550337, "grad_norm": 0.5375024080276489, "learning_rate": 2.2682887786137348e-06, "loss": 2.0336, "step": 24150 }, { "epoch": 3.242953020134228, "grad_norm": 0.5433012843132019, "learning_rate": 2.265087241876101e-06, "loss": 2.0475, "step": 24160 }, { "epoch": 3.244295302013423, "grad_norm": 0.5040486454963684, "learning_rate": 2.2618857051384666e-06, "loss": 2.0288, "step": 24170 }, { "epoch": 3.2456375838926173, "grad_norm": 0.5953966975212097, "learning_rate": 2.2586841684008328e-06, "loss": 2.0048, "step": 24180 }, { "epoch": 3.246979865771812, "grad_norm": 0.5979200005531311, "learning_rate": 2.2554826316631985e-06, "loss": 1.9992, "step": 24190 }, { "epoch": 3.248322147651007, "grad_norm": 0.5424688458442688, "learning_rate": 2.2522810949255646e-06, "loss": 1.9983, "step": 24200 }, { "epoch": 3.248322147651007, "eval_loss": 2.034461498260498, "eval_runtime": 98.9765, "eval_samples_per_second": 10.103, "eval_steps_per_second": 5.052, "step": 24200 }, { "epoch": 3.2496644295302013, "grad_norm": 0.5857418179512024, "learning_rate": 2.2490795581879303e-06, "loss": 1.9997, "step": 24210 }, { "epoch": 3.251006711409396, "grad_norm": 0.5468854904174805, "learning_rate": 2.245878021450296e-06, "loss": 2.0305, "step": 24220 }, { "epoch": 3.2523489932885905, "grad_norm": 0.5583387613296509, "learning_rate": 2.242676484712662e-06, "loss": 2.0155, "step": 24230 }, { "epoch": 3.2536912751677853, "grad_norm": 0.5511802434921265, "learning_rate": 2.2394749479750283e-06, "loss": 2.0018, "step": 24240 }, { "epoch": 3.2550335570469797, "grad_norm": 0.7055875062942505, "learning_rate": 2.236273411237394e-06, "loss": 2.0143, "step": 24250 }, { "epoch": 3.2563758389261745, "grad_norm": 0.6099456548690796, "learning_rate": 2.23307187449976e-06, "loss": 1.9877, "step": 24260 }, { "epoch": 3.257718120805369, "grad_norm": 0.5808700919151306, "learning_rate": 2.2298703377621263e-06, "loss": 2.0161, "step": 24270 }, { "epoch": 3.2590604026845638, "grad_norm": 0.5520883202552795, "learning_rate": 2.226668801024492e-06, "loss": 1.9884, "step": 24280 }, { "epoch": 3.2604026845637586, "grad_norm": 0.5096262097358704, "learning_rate": 2.223467264286858e-06, "loss": 2.0242, "step": 24290 }, { "epoch": 3.261744966442953, "grad_norm": 0.5912474989891052, "learning_rate": 2.220265727549224e-06, "loss": 1.9842, "step": 24300 }, { "epoch": 3.261744966442953, "eval_loss": 2.0343332290649414, "eval_runtime": 98.8495, "eval_samples_per_second": 10.116, "eval_steps_per_second": 5.058, "step": 24300 }, { "epoch": 3.263087248322148, "grad_norm": 0.5764116644859314, "learning_rate": 2.21706419081159e-06, "loss": 2.0633, "step": 24310 }, { "epoch": 3.264429530201342, "grad_norm": 0.6952981948852539, "learning_rate": 2.2138626540739557e-06, "loss": 1.9669, "step": 24320 }, { "epoch": 3.265771812080537, "grad_norm": 0.6251304745674133, "learning_rate": 2.2106611173363214e-06, "loss": 2.0096, "step": 24330 }, { "epoch": 3.2671140939597314, "grad_norm": 0.5688126087188721, "learning_rate": 2.2074595805986875e-06, "loss": 1.9922, "step": 24340 }, { "epoch": 3.2684563758389262, "grad_norm": 0.5293943285942078, "learning_rate": 2.204258043861053e-06, "loss": 2.0349, "step": 24350 }, { "epoch": 3.2697986577181206, "grad_norm": 0.5495185256004333, "learning_rate": 2.2010565071234193e-06, "loss": 2.0283, "step": 24360 }, { "epoch": 3.2711409395973154, "grad_norm": 0.6728076934814453, "learning_rate": 2.1978549703857855e-06, "loss": 2.0335, "step": 24370 }, { "epoch": 3.2724832214765103, "grad_norm": 0.5396303534507751, "learning_rate": 2.194653433648151e-06, "loss": 2.0144, "step": 24380 }, { "epoch": 3.2738255033557047, "grad_norm": 0.5711934566497803, "learning_rate": 2.1914518969105173e-06, "loss": 2.0112, "step": 24390 }, { "epoch": 3.2751677852348995, "grad_norm": 0.5564073920249939, "learning_rate": 2.1882503601728835e-06, "loss": 2.0288, "step": 24400 }, { "epoch": 3.2751677852348995, "eval_loss": 2.0342624187469482, "eval_runtime": 99.5861, "eval_samples_per_second": 10.042, "eval_steps_per_second": 5.021, "step": 24400 }, { "epoch": 3.276510067114094, "grad_norm": 0.5944424271583557, "learning_rate": 2.185048823435249e-06, "loss": 2.0196, "step": 24410 }, { "epoch": 3.2778523489932887, "grad_norm": 0.6209209561347961, "learning_rate": 2.181847286697615e-06, "loss": 2.0087, "step": 24420 }, { "epoch": 3.279194630872483, "grad_norm": 0.6335627436637878, "learning_rate": 2.178645749959981e-06, "loss": 2.0274, "step": 24430 }, { "epoch": 3.280536912751678, "grad_norm": 0.5886103510856628, "learning_rate": 2.1754442132223467e-06, "loss": 2.0106, "step": 24440 }, { "epoch": 3.2818791946308723, "grad_norm": 0.5783700346946716, "learning_rate": 2.172242676484713e-06, "loss": 2.0025, "step": 24450 }, { "epoch": 3.283221476510067, "grad_norm": 0.5945323705673218, "learning_rate": 2.1690411397470786e-06, "loss": 2.0192, "step": 24460 }, { "epoch": 3.284563758389262, "grad_norm": 0.5845428109169006, "learning_rate": 2.1658396030094447e-06, "loss": 2.0084, "step": 24470 }, { "epoch": 3.2859060402684563, "grad_norm": 0.5515161156654358, "learning_rate": 2.162638066271811e-06, "loss": 2.0013, "step": 24480 }, { "epoch": 3.287248322147651, "grad_norm": 0.5515275001525879, "learning_rate": 2.1594365295341765e-06, "loss": 1.992, "step": 24490 }, { "epoch": 3.2885906040268456, "grad_norm": 0.5798812508583069, "learning_rate": 2.1562349927965427e-06, "loss": 2.0224, "step": 24500 }, { "epoch": 3.2885906040268456, "eval_loss": 2.034242868423462, "eval_runtime": 98.8346, "eval_samples_per_second": 10.118, "eval_steps_per_second": 5.059, "step": 24500 }, { "epoch": 3.2899328859060404, "grad_norm": 0.7159120440483093, "learning_rate": 2.1530334560589084e-06, "loss": 2.0158, "step": 24510 }, { "epoch": 3.2912751677852348, "grad_norm": 0.5328459143638611, "learning_rate": 2.1498319193212745e-06, "loss": 2.0135, "step": 24520 }, { "epoch": 3.2926174496644296, "grad_norm": 0.5350387692451477, "learning_rate": 2.1466303825836402e-06, "loss": 2.0315, "step": 24530 }, { "epoch": 3.293959731543624, "grad_norm": 0.5860224962234497, "learning_rate": 2.1434288458460064e-06, "loss": 2.0145, "step": 24540 }, { "epoch": 3.295302013422819, "grad_norm": 0.5541955828666687, "learning_rate": 2.140227309108372e-06, "loss": 2.0272, "step": 24550 }, { "epoch": 3.2966442953020136, "grad_norm": 0.6375378966331482, "learning_rate": 2.137025772370738e-06, "loss": 2.0212, "step": 24560 }, { "epoch": 3.297986577181208, "grad_norm": 0.7046541571617126, "learning_rate": 2.133824235633104e-06, "loss": 1.9943, "step": 24570 }, { "epoch": 3.299328859060403, "grad_norm": 0.5820409655570984, "learning_rate": 2.13062269889547e-06, "loss": 2.0394, "step": 24580 }, { "epoch": 3.3006711409395972, "grad_norm": 0.5739086866378784, "learning_rate": 2.1274211621578358e-06, "loss": 2.0043, "step": 24590 }, { "epoch": 3.302013422818792, "grad_norm": 0.5671262741088867, "learning_rate": 2.124219625420202e-06, "loss": 1.9913, "step": 24600 }, { "epoch": 3.302013422818792, "eval_loss": 2.0339884757995605, "eval_runtime": 99.4671, "eval_samples_per_second": 10.054, "eval_steps_per_second": 5.027, "step": 24600 }, { "epoch": 3.3033557046979865, "grad_norm": 0.6186826825141907, "learning_rate": 2.121018088682568e-06, "loss": 2.0004, "step": 24610 }, { "epoch": 3.3046979865771813, "grad_norm": 0.5595492124557495, "learning_rate": 2.1178165519449337e-06, "loss": 1.9985, "step": 24620 }, { "epoch": 3.3060402684563757, "grad_norm": 0.609688401222229, "learning_rate": 2.1146150152073e-06, "loss": 2.0024, "step": 24630 }, { "epoch": 3.3073825503355705, "grad_norm": 0.5851048231124878, "learning_rate": 2.1114134784696656e-06, "loss": 2.0059, "step": 24640 }, { "epoch": 3.3087248322147653, "grad_norm": 0.5259600877761841, "learning_rate": 2.1082119417320317e-06, "loss": 2.0294, "step": 24650 }, { "epoch": 3.3100671140939597, "grad_norm": 0.5880383849143982, "learning_rate": 2.1050104049943974e-06, "loss": 2.0324, "step": 24660 }, { "epoch": 3.3114093959731545, "grad_norm": 0.5593898296356201, "learning_rate": 2.101808868256763e-06, "loss": 2.025, "step": 24670 }, { "epoch": 3.312751677852349, "grad_norm": 0.5315661430358887, "learning_rate": 2.0986073315191293e-06, "loss": 2.0221, "step": 24680 }, { "epoch": 3.3140939597315437, "grad_norm": 0.6451142430305481, "learning_rate": 2.0954057947814954e-06, "loss": 2.0162, "step": 24690 }, { "epoch": 3.315436241610738, "grad_norm": 0.6757481694221497, "learning_rate": 2.092204258043861e-06, "loss": 1.9826, "step": 24700 }, { "epoch": 3.315436241610738, "eval_loss": 2.0339531898498535, "eval_runtime": 99.0126, "eval_samples_per_second": 10.1, "eval_steps_per_second": 5.05, "step": 24700 }, { "epoch": 3.316778523489933, "grad_norm": 0.5286951065063477, "learning_rate": 2.0890027213062272e-06, "loss": 1.9789, "step": 24710 }, { "epoch": 3.3181208053691273, "grad_norm": 0.6284973621368408, "learning_rate": 2.085801184568593e-06, "loss": 2.0031, "step": 24720 }, { "epoch": 3.319463087248322, "grad_norm": 0.5724067091941833, "learning_rate": 2.082599647830959e-06, "loss": 2.0282, "step": 24730 }, { "epoch": 3.320805369127517, "grad_norm": 0.5950222015380859, "learning_rate": 2.0793981110933252e-06, "loss": 1.9978, "step": 24740 }, { "epoch": 3.3221476510067114, "grad_norm": 0.5722485184669495, "learning_rate": 2.076196574355691e-06, "loss": 2.0349, "step": 24750 }, { "epoch": 3.323489932885906, "grad_norm": 0.5992030501365662, "learning_rate": 2.072995037618057e-06, "loss": 2.0462, "step": 24760 }, { "epoch": 3.3248322147651006, "grad_norm": 0.5604340434074402, "learning_rate": 2.0697935008804228e-06, "loss": 1.9957, "step": 24770 }, { "epoch": 3.3261744966442954, "grad_norm": 0.6042061448097229, "learning_rate": 2.0665919641427885e-06, "loss": 1.9757, "step": 24780 }, { "epoch": 3.32751677852349, "grad_norm": 0.5957545042037964, "learning_rate": 2.0633904274051546e-06, "loss": 2.014, "step": 24790 }, { "epoch": 3.3288590604026846, "grad_norm": 0.5731126666069031, "learning_rate": 2.0601888906675203e-06, "loss": 1.9967, "step": 24800 }, { "epoch": 3.3288590604026846, "eval_loss": 2.0340585708618164, "eval_runtime": 99.1874, "eval_samples_per_second": 10.082, "eval_steps_per_second": 5.041, "step": 24800 }, { "epoch": 3.330201342281879, "grad_norm": 0.6661995053291321, "learning_rate": 2.0569873539298865e-06, "loss": 2.0049, "step": 24810 }, { "epoch": 3.331543624161074, "grad_norm": 0.6591014862060547, "learning_rate": 2.0537858171922526e-06, "loss": 2.0198, "step": 24820 }, { "epoch": 3.3328859060402687, "grad_norm": 0.6858102679252625, "learning_rate": 2.0505842804546183e-06, "loss": 2.0049, "step": 24830 }, { "epoch": 3.334228187919463, "grad_norm": 0.5600757002830505, "learning_rate": 2.0473827437169844e-06, "loss": 2.0191, "step": 24840 }, { "epoch": 3.335570469798658, "grad_norm": 0.632583498954773, "learning_rate": 2.04418120697935e-06, "loss": 2.0108, "step": 24850 }, { "epoch": 3.3369127516778523, "grad_norm": 0.5636388659477234, "learning_rate": 2.0409796702417163e-06, "loss": 2.011, "step": 24860 }, { "epoch": 3.338255033557047, "grad_norm": 0.6372313499450684, "learning_rate": 2.0377781335040824e-06, "loss": 2.0202, "step": 24870 }, { "epoch": 3.3395973154362415, "grad_norm": 0.5647403001785278, "learning_rate": 2.034576596766448e-06, "loss": 2.0111, "step": 24880 }, { "epoch": 3.3409395973154363, "grad_norm": 0.5417010188102722, "learning_rate": 2.031375060028814e-06, "loss": 2.0318, "step": 24890 }, { "epoch": 3.3422818791946307, "grad_norm": 0.5448442697525024, "learning_rate": 2.02817352329118e-06, "loss": 2.0175, "step": 24900 }, { "epoch": 3.3422818791946307, "eval_loss": 2.033846855163574, "eval_runtime": 99.6231, "eval_samples_per_second": 10.038, "eval_steps_per_second": 5.019, "step": 24900 }, { "epoch": 3.3436241610738255, "grad_norm": 0.6390599012374878, "learning_rate": 2.0249719865535457e-06, "loss": 2.0158, "step": 24910 }, { "epoch": 3.3449664429530204, "grad_norm": 0.6264297962188721, "learning_rate": 2.021770449815912e-06, "loss": 1.9945, "step": 24920 }, { "epoch": 3.3463087248322148, "grad_norm": 0.5326258540153503, "learning_rate": 2.0185689130782775e-06, "loss": 2.0242, "step": 24930 }, { "epoch": 3.3476510067114096, "grad_norm": 0.6191580295562744, "learning_rate": 2.0153673763406437e-06, "loss": 2.01, "step": 24940 }, { "epoch": 3.348993288590604, "grad_norm": 0.5405324697494507, "learning_rate": 2.0121658396030098e-06, "loss": 2.0089, "step": 24950 }, { "epoch": 3.350335570469799, "grad_norm": 0.6203567981719971, "learning_rate": 2.0089643028653755e-06, "loss": 1.9956, "step": 24960 }, { "epoch": 3.351677852348993, "grad_norm": 0.6961588263511658, "learning_rate": 2.0057627661277416e-06, "loss": 2.0056, "step": 24970 }, { "epoch": 3.353020134228188, "grad_norm": 0.5940682291984558, "learning_rate": 2.0025612293901073e-06, "loss": 2.009, "step": 24980 }, { "epoch": 3.3543624161073824, "grad_norm": 0.5571852922439575, "learning_rate": 1.9993596926524735e-06, "loss": 1.9921, "step": 24990 }, { "epoch": 3.3557046979865772, "grad_norm": 0.515678882598877, "learning_rate": 1.996158155914839e-06, "loss": 2.0536, "step": 25000 }, { "epoch": 3.3557046979865772, "eval_loss": 2.033797264099121, "eval_runtime": 102.1044, "eval_samples_per_second": 9.794, "eval_steps_per_second": 4.897, "step": 25000 }, { "epoch": 3.357046979865772, "grad_norm": 0.5705350637435913, "learning_rate": 1.9929566191772053e-06, "loss": 2.0119, "step": 25010 }, { "epoch": 3.3583892617449664, "grad_norm": 0.6690366864204407, "learning_rate": 1.989755082439571e-06, "loss": 2.0099, "step": 25020 }, { "epoch": 3.3597315436241613, "grad_norm": 0.6348695755004883, "learning_rate": 1.986553545701937e-06, "loss": 2.0011, "step": 25030 }, { "epoch": 3.3610738255033556, "grad_norm": 0.604619562625885, "learning_rate": 1.983352008964303e-06, "loss": 2.0025, "step": 25040 }, { "epoch": 3.3624161073825505, "grad_norm": 0.5843760371208191, "learning_rate": 1.980150472226669e-06, "loss": 2.0191, "step": 25050 }, { "epoch": 3.363758389261745, "grad_norm": 0.6079941987991333, "learning_rate": 1.9769489354890347e-06, "loss": 2.0135, "step": 25060 }, { "epoch": 3.3651006711409397, "grad_norm": 0.5454660654067993, "learning_rate": 1.973747398751401e-06, "loss": 2.0175, "step": 25070 }, { "epoch": 3.366442953020134, "grad_norm": 0.5789991021156311, "learning_rate": 1.970545862013767e-06, "loss": 1.9941, "step": 25080 }, { "epoch": 3.367785234899329, "grad_norm": 0.5079576373100281, "learning_rate": 1.9673443252761327e-06, "loss": 2.031, "step": 25090 }, { "epoch": 3.3691275167785237, "grad_norm": 0.5263907313346863, "learning_rate": 1.964142788538499e-06, "loss": 2.0079, "step": 25100 }, { "epoch": 3.3691275167785237, "eval_loss": 2.033698081970215, "eval_runtime": 99.4346, "eval_samples_per_second": 10.057, "eval_steps_per_second": 5.028, "step": 25100 }, { "epoch": 3.370469798657718, "grad_norm": 0.5881233811378479, "learning_rate": 1.9609412518008645e-06, "loss": 2.0269, "step": 25110 }, { "epoch": 3.3718120805369125, "grad_norm": 0.551294207572937, "learning_rate": 1.9577397150632307e-06, "loss": 2.0289, "step": 25120 }, { "epoch": 3.3731543624161073, "grad_norm": 0.5218905210494995, "learning_rate": 1.9545381783255964e-06, "loss": 2.0029, "step": 25130 }, { "epoch": 3.374496644295302, "grad_norm": 0.5961599349975586, "learning_rate": 1.951336641587962e-06, "loss": 2.0059, "step": 25140 }, { "epoch": 3.3758389261744965, "grad_norm": 0.5878936052322388, "learning_rate": 1.9481351048503282e-06, "loss": 2.0329, "step": 25150 }, { "epoch": 3.3771812080536914, "grad_norm": 0.5031041502952576, "learning_rate": 1.9449335681126944e-06, "loss": 2.0187, "step": 25160 }, { "epoch": 3.3785234899328858, "grad_norm": 0.5873698592185974, "learning_rate": 1.94173203137506e-06, "loss": 2.0103, "step": 25170 }, { "epoch": 3.3798657718120806, "grad_norm": 0.6120571494102478, "learning_rate": 1.938530494637426e-06, "loss": 2.0029, "step": 25180 }, { "epoch": 3.3812080536912754, "grad_norm": 0.6189383864402771, "learning_rate": 1.9353289578997923e-06, "loss": 2.0328, "step": 25190 }, { "epoch": 3.38255033557047, "grad_norm": 0.5471286177635193, "learning_rate": 1.932127421162158e-06, "loss": 2.0107, "step": 25200 }, { "epoch": 3.38255033557047, "eval_loss": 2.0336575508117676, "eval_runtime": 99.6661, "eval_samples_per_second": 10.033, "eval_steps_per_second": 5.017, "step": 25200 }, { "epoch": 3.383892617449664, "grad_norm": 0.6131651997566223, "learning_rate": 1.928925884424524e-06, "loss": 2.0046, "step": 25210 }, { "epoch": 3.385234899328859, "grad_norm": 0.501825213432312, "learning_rate": 1.92572434768689e-06, "loss": 2.0169, "step": 25220 }, { "epoch": 3.386577181208054, "grad_norm": 0.6550677418708801, "learning_rate": 1.9225228109492556e-06, "loss": 2.0105, "step": 25230 }, { "epoch": 3.3879194630872482, "grad_norm": 0.590255081653595, "learning_rate": 1.9193212742116217e-06, "loss": 2.0032, "step": 25240 }, { "epoch": 3.389261744966443, "grad_norm": 0.5433794856071472, "learning_rate": 1.9161197374739874e-06, "loss": 2.0134, "step": 25250 }, { "epoch": 3.3906040268456374, "grad_norm": 0.5783948302268982, "learning_rate": 1.9129182007363536e-06, "loss": 1.9641, "step": 25260 }, { "epoch": 3.3919463087248323, "grad_norm": 0.5930677056312561, "learning_rate": 1.9097166639987193e-06, "loss": 1.9944, "step": 25270 }, { "epoch": 3.3932885906040267, "grad_norm": 0.5659170150756836, "learning_rate": 1.9065151272610854e-06, "loss": 2.016, "step": 25280 }, { "epoch": 3.3946308724832215, "grad_norm": 0.5566468238830566, "learning_rate": 1.9033135905234516e-06, "loss": 2.0061, "step": 25290 }, { "epoch": 3.395973154362416, "grad_norm": 0.5847912430763245, "learning_rate": 1.9001120537858173e-06, "loss": 1.9786, "step": 25300 }, { "epoch": 3.395973154362416, "eval_loss": 2.033474922180176, "eval_runtime": 99.6194, "eval_samples_per_second": 10.038, "eval_steps_per_second": 5.019, "step": 25300 }, { "epoch": 3.3973154362416107, "grad_norm": 0.5895040035247803, "learning_rate": 1.8969105170481834e-06, "loss": 1.9955, "step": 25310 }, { "epoch": 3.3986577181208055, "grad_norm": 0.6539040207862854, "learning_rate": 1.8937089803105493e-06, "loss": 1.9979, "step": 25320 }, { "epoch": 3.4, "grad_norm": 0.5976948142051697, "learning_rate": 1.890507443572915e-06, "loss": 2.0185, "step": 25330 }, { "epoch": 3.4013422818791947, "grad_norm": 0.53128981590271, "learning_rate": 1.8873059068352812e-06, "loss": 2.0292, "step": 25340 }, { "epoch": 3.402684563758389, "grad_norm": 0.5859795808792114, "learning_rate": 1.8841043700976469e-06, "loss": 2.0067, "step": 25350 }, { "epoch": 3.404026845637584, "grad_norm": 0.5527817010879517, "learning_rate": 1.880902833360013e-06, "loss": 2.0164, "step": 25360 }, { "epoch": 3.4053691275167783, "grad_norm": 0.5099530220031738, "learning_rate": 1.877701296622379e-06, "loss": 2.0122, "step": 25370 }, { "epoch": 3.406711409395973, "grad_norm": 0.5217568278312683, "learning_rate": 1.8744997598847448e-06, "loss": 1.9672, "step": 25380 }, { "epoch": 3.4080536912751676, "grad_norm": 0.569888174533844, "learning_rate": 1.8712982231471108e-06, "loss": 2.0249, "step": 25390 }, { "epoch": 3.4093959731543624, "grad_norm": 0.5308755040168762, "learning_rate": 1.868096686409477e-06, "loss": 2.0134, "step": 25400 }, { "epoch": 3.4093959731543624, "eval_loss": 2.0333244800567627, "eval_runtime": 99.7654, "eval_samples_per_second": 10.024, "eval_steps_per_second": 5.012, "step": 25400 }, { "epoch": 3.410738255033557, "grad_norm": 0.5577428936958313, "learning_rate": 1.8648951496718426e-06, "loss": 1.9833, "step": 25410 }, { "epoch": 3.4120805369127516, "grad_norm": 0.5679981708526611, "learning_rate": 1.8616936129342085e-06, "loss": 2.0226, "step": 25420 }, { "epoch": 3.4134228187919464, "grad_norm": 0.5294861197471619, "learning_rate": 1.8584920761965745e-06, "loss": 1.9814, "step": 25430 }, { "epoch": 3.414765100671141, "grad_norm": 0.5780696868896484, "learning_rate": 1.8552905394589404e-06, "loss": 2.0479, "step": 25440 }, { "epoch": 3.4161073825503356, "grad_norm": 0.5870608687400818, "learning_rate": 1.8520890027213065e-06, "loss": 2.0535, "step": 25450 }, { "epoch": 3.41744966442953, "grad_norm": 0.6202998757362366, "learning_rate": 1.8488874659836722e-06, "loss": 2.0074, "step": 25460 }, { "epoch": 3.418791946308725, "grad_norm": 0.5340785980224609, "learning_rate": 1.8456859292460384e-06, "loss": 1.9816, "step": 25470 }, { "epoch": 3.4201342281879192, "grad_norm": 0.5752537250518799, "learning_rate": 1.842484392508404e-06, "loss": 2.0156, "step": 25480 }, { "epoch": 3.421476510067114, "grad_norm": 0.533042311668396, "learning_rate": 1.83928285577077e-06, "loss": 1.9993, "step": 25490 }, { "epoch": 3.422818791946309, "grad_norm": 0.5413929224014282, "learning_rate": 1.8360813190331361e-06, "loss": 2.0008, "step": 25500 }, { "epoch": 3.422818791946309, "eval_loss": 2.033494234085083, "eval_runtime": 99.8012, "eval_samples_per_second": 10.02, "eval_steps_per_second": 5.01, "step": 25500 }, { "epoch": 3.4241610738255033, "grad_norm": 0.613568902015686, "learning_rate": 1.8328797822955018e-06, "loss": 1.9857, "step": 25510 }, { "epoch": 3.425503355704698, "grad_norm": 0.5847415924072266, "learning_rate": 1.829678245557868e-06, "loss": 2.005, "step": 25520 }, { "epoch": 3.4268456375838925, "grad_norm": 0.6314620971679688, "learning_rate": 1.8264767088202339e-06, "loss": 2.0095, "step": 25530 }, { "epoch": 3.4281879194630873, "grad_norm": 0.5585344433784485, "learning_rate": 1.8232751720825998e-06, "loss": 1.9953, "step": 25540 }, { "epoch": 3.4295302013422817, "grad_norm": 0.5491799712181091, "learning_rate": 1.8200736353449657e-06, "loss": 1.9806, "step": 25550 }, { "epoch": 3.4308724832214765, "grad_norm": 0.5185725688934326, "learning_rate": 1.8168720986073317e-06, "loss": 1.9981, "step": 25560 }, { "epoch": 3.432214765100671, "grad_norm": 0.5704931020736694, "learning_rate": 1.8136705618696976e-06, "loss": 2.0389, "step": 25570 }, { "epoch": 3.4335570469798657, "grad_norm": 0.6064590811729431, "learning_rate": 1.8104690251320637e-06, "loss": 2.0434, "step": 25580 }, { "epoch": 3.4348993288590606, "grad_norm": 0.5624018907546997, "learning_rate": 1.8072674883944294e-06, "loss": 2.0115, "step": 25590 }, { "epoch": 3.436241610738255, "grad_norm": 0.6257191896438599, "learning_rate": 1.8040659516567953e-06, "loss": 2.0321, "step": 25600 }, { "epoch": 3.436241610738255, "eval_loss": 2.033203601837158, "eval_runtime": 99.6708, "eval_samples_per_second": 10.033, "eval_steps_per_second": 5.017, "step": 25600 }, { "epoch": 3.43758389261745, "grad_norm": 0.564278781414032, "learning_rate": 1.8008644149191615e-06, "loss": 2.004, "step": 25610 }, { "epoch": 3.438926174496644, "grad_norm": 0.5536876320838928, "learning_rate": 1.7976628781815272e-06, "loss": 2.0119, "step": 25620 }, { "epoch": 3.440268456375839, "grad_norm": 0.5710294246673584, "learning_rate": 1.7944613414438933e-06, "loss": 2.0375, "step": 25630 }, { "epoch": 3.4416107382550334, "grad_norm": 0.5698024034500122, "learning_rate": 1.791259804706259e-06, "loss": 2.0173, "step": 25640 }, { "epoch": 3.442953020134228, "grad_norm": 0.5126367211341858, "learning_rate": 1.7880582679686252e-06, "loss": 2.0378, "step": 25650 }, { "epoch": 3.4442953020134226, "grad_norm": 0.6241849064826965, "learning_rate": 1.784856731230991e-06, "loss": 2.0066, "step": 25660 }, { "epoch": 3.4456375838926174, "grad_norm": 0.5671263933181763, "learning_rate": 1.7816551944933568e-06, "loss": 2.0082, "step": 25670 }, { "epoch": 3.4469798657718123, "grad_norm": 0.6242393851280212, "learning_rate": 1.778453657755723e-06, "loss": 1.9936, "step": 25680 }, { "epoch": 3.4483221476510066, "grad_norm": 0.5984540581703186, "learning_rate": 1.7752521210180886e-06, "loss": 2.0275, "step": 25690 }, { "epoch": 3.4496644295302015, "grad_norm": 0.5671818256378174, "learning_rate": 1.7720505842804548e-06, "loss": 1.9987, "step": 25700 }, { "epoch": 3.4496644295302015, "eval_loss": 2.03314208984375, "eval_runtime": 99.758, "eval_samples_per_second": 10.024, "eval_steps_per_second": 5.012, "step": 25700 }, { "epoch": 3.451006711409396, "grad_norm": 0.550834596157074, "learning_rate": 1.7688490475428207e-06, "loss": 2.0123, "step": 25710 }, { "epoch": 3.4523489932885907, "grad_norm": 0.5404412150382996, "learning_rate": 1.7656475108051866e-06, "loss": 1.9957, "step": 25720 }, { "epoch": 3.453691275167785, "grad_norm": 0.576654851436615, "learning_rate": 1.7624459740675525e-06, "loss": 2.0206, "step": 25730 }, { "epoch": 3.45503355704698, "grad_norm": 0.5848711729049683, "learning_rate": 1.7592444373299187e-06, "loss": 2.0004, "step": 25740 }, { "epoch": 3.4563758389261743, "grad_norm": 0.525976300239563, "learning_rate": 1.7560429005922844e-06, "loss": 2.0218, "step": 25750 }, { "epoch": 3.457718120805369, "grad_norm": 0.5736172795295715, "learning_rate": 1.7528413638546505e-06, "loss": 2.0345, "step": 25760 }, { "epoch": 3.459060402684564, "grad_norm": 0.6559216380119324, "learning_rate": 1.7496398271170162e-06, "loss": 1.978, "step": 25770 }, { "epoch": 3.4604026845637583, "grad_norm": 0.5698836445808411, "learning_rate": 1.7464382903793821e-06, "loss": 2.0003, "step": 25780 }, { "epoch": 3.461744966442953, "grad_norm": 0.6036633253097534, "learning_rate": 1.7432367536417483e-06, "loss": 2.0183, "step": 25790 }, { "epoch": 3.4630872483221475, "grad_norm": 0.5477694869041443, "learning_rate": 1.740035216904114e-06, "loss": 1.9895, "step": 25800 }, { "epoch": 3.4630872483221475, "eval_loss": 2.0331811904907227, "eval_runtime": 99.6707, "eval_samples_per_second": 10.033, "eval_steps_per_second": 5.017, "step": 25800 }, { "epoch": 3.4644295302013424, "grad_norm": 0.664413571357727, "learning_rate": 1.7368336801664801e-06, "loss": 2.0055, "step": 25810 }, { "epoch": 3.4657718120805368, "grad_norm": 0.5614915490150452, "learning_rate": 1.733632143428846e-06, "loss": 2.017, "step": 25820 }, { "epoch": 3.4671140939597316, "grad_norm": 0.6014454364776611, "learning_rate": 1.730430606691212e-06, "loss": 2.0008, "step": 25830 }, { "epoch": 3.468456375838926, "grad_norm": 0.5873821377754211, "learning_rate": 1.7272290699535779e-06, "loss": 1.9929, "step": 25840 }, { "epoch": 3.469798657718121, "grad_norm": 0.5503237247467041, "learning_rate": 1.7240275332159436e-06, "loss": 2.045, "step": 25850 }, { "epoch": 3.4711409395973156, "grad_norm": 0.5496537685394287, "learning_rate": 1.7208259964783097e-06, "loss": 2.0364, "step": 25860 }, { "epoch": 3.47248322147651, "grad_norm": 0.5419268608093262, "learning_rate": 1.7176244597406759e-06, "loss": 2.0333, "step": 25870 }, { "epoch": 3.473825503355705, "grad_norm": 0.5395207405090332, "learning_rate": 1.7144229230030416e-06, "loss": 1.9939, "step": 25880 }, { "epoch": 3.475167785234899, "grad_norm": 0.5907343029975891, "learning_rate": 1.7112213862654075e-06, "loss": 2.032, "step": 25890 }, { "epoch": 3.476510067114094, "grad_norm": 0.5549110174179077, "learning_rate": 1.7080198495277734e-06, "loss": 2.0245, "step": 25900 }, { "epoch": 3.476510067114094, "eval_loss": 2.0327773094177246, "eval_runtime": 99.7139, "eval_samples_per_second": 10.029, "eval_steps_per_second": 5.014, "step": 25900 }, { "epoch": 3.4778523489932884, "grad_norm": 0.5323525667190552, "learning_rate": 1.7048183127901393e-06, "loss": 2.0095, "step": 25910 }, { "epoch": 3.4791946308724833, "grad_norm": 0.5647071599960327, "learning_rate": 1.7016167760525055e-06, "loss": 2.0093, "step": 25920 }, { "epoch": 3.4805369127516776, "grad_norm": 0.5616557598114014, "learning_rate": 1.6984152393148712e-06, "loss": 2.0069, "step": 25930 }, { "epoch": 3.4818791946308725, "grad_norm": 0.5042036175727844, "learning_rate": 1.6952137025772373e-06, "loss": 1.9895, "step": 25940 }, { "epoch": 3.4832214765100673, "grad_norm": 0.6743494868278503, "learning_rate": 1.6920121658396032e-06, "loss": 2.0078, "step": 25950 }, { "epoch": 3.4845637583892617, "grad_norm": 0.5964071154594421, "learning_rate": 1.688810629101969e-06, "loss": 2.0144, "step": 25960 }, { "epoch": 3.4859060402684565, "grad_norm": 0.5817059278488159, "learning_rate": 1.685609092364335e-06, "loss": 1.9803, "step": 25970 }, { "epoch": 3.487248322147651, "grad_norm": 0.5435609221458435, "learning_rate": 1.6824075556267008e-06, "loss": 1.9754, "step": 25980 }, { "epoch": 3.4885906040268457, "grad_norm": 0.5652287006378174, "learning_rate": 1.679206018889067e-06, "loss": 1.9967, "step": 25990 }, { "epoch": 3.48993288590604, "grad_norm": 0.5400235056877136, "learning_rate": 1.6760044821514328e-06, "loss": 2.0134, "step": 26000 }, { "epoch": 3.48993288590604, "eval_loss": 2.0326852798461914, "eval_runtime": 99.736, "eval_samples_per_second": 10.026, "eval_steps_per_second": 5.013, "step": 26000 }, { "epoch": 3.491275167785235, "grad_norm": 0.5601316690444946, "learning_rate": 1.6728029454137988e-06, "loss": 1.9638, "step": 26010 }, { "epoch": 3.4926174496644293, "grad_norm": 0.594646692276001, "learning_rate": 1.6696014086761647e-06, "loss": 2.0044, "step": 26020 }, { "epoch": 3.493959731543624, "grad_norm": 0.5239794254302979, "learning_rate": 1.6663998719385308e-06, "loss": 1.9931, "step": 26030 }, { "epoch": 3.495302013422819, "grad_norm": 0.5149232745170593, "learning_rate": 1.6631983352008965e-06, "loss": 1.9846, "step": 26040 }, { "epoch": 3.4966442953020134, "grad_norm": 0.5560303926467896, "learning_rate": 1.6599967984632625e-06, "loss": 2.0282, "step": 26050 }, { "epoch": 3.497986577181208, "grad_norm": 0.5595155358314514, "learning_rate": 1.6567952617256284e-06, "loss": 2.0358, "step": 26060 }, { "epoch": 3.4993288590604026, "grad_norm": 0.5482628345489502, "learning_rate": 1.6535937249879943e-06, "loss": 2.0165, "step": 26070 }, { "epoch": 3.5006711409395974, "grad_norm": 0.5926582217216492, "learning_rate": 1.6503921882503604e-06, "loss": 2.0109, "step": 26080 }, { "epoch": 3.502013422818792, "grad_norm": 0.5720061659812927, "learning_rate": 1.6471906515127261e-06, "loss": 2.0206, "step": 26090 }, { "epoch": 3.5033557046979866, "grad_norm": 0.6203693747520447, "learning_rate": 1.6439891147750923e-06, "loss": 2.0082, "step": 26100 }, { "epoch": 3.5033557046979866, "eval_loss": 2.0324747562408447, "eval_runtime": 99.7144, "eval_samples_per_second": 10.029, "eval_steps_per_second": 5.014, "step": 26100 }, { "epoch": 3.504697986577181, "grad_norm": 0.519006609916687, "learning_rate": 1.6407875780374582e-06, "loss": 2.0433, "step": 26110 }, { "epoch": 3.506040268456376, "grad_norm": 0.5979138612747192, "learning_rate": 1.637586041299824e-06, "loss": 1.9624, "step": 26120 }, { "epoch": 3.5073825503355707, "grad_norm": 0.6031221151351929, "learning_rate": 1.63438450456219e-06, "loss": 2.0151, "step": 26130 }, { "epoch": 3.508724832214765, "grad_norm": 0.5102588534355164, "learning_rate": 1.6311829678245558e-06, "loss": 2.012, "step": 26140 }, { "epoch": 3.51006711409396, "grad_norm": 0.5204235315322876, "learning_rate": 1.6279814310869219e-06, "loss": 2.0186, "step": 26150 }, { "epoch": 3.5114093959731543, "grad_norm": 0.5003209710121155, "learning_rate": 1.6247798943492878e-06, "loss": 2.039, "step": 26160 }, { "epoch": 3.512751677852349, "grad_norm": 0.5853362679481506, "learning_rate": 1.6215783576116537e-06, "loss": 2.0229, "step": 26170 }, { "epoch": 3.5140939597315435, "grad_norm": 0.5810410976409912, "learning_rate": 1.6183768208740197e-06, "loss": 1.9996, "step": 26180 }, { "epoch": 3.5154362416107383, "grad_norm": 0.5772440433502197, "learning_rate": 1.6151752841363856e-06, "loss": 2.0179, "step": 26190 }, { "epoch": 3.5167785234899327, "grad_norm": 0.5555768609046936, "learning_rate": 1.6119737473987515e-06, "loss": 2.028, "step": 26200 }, { "epoch": 3.5167785234899327, "eval_loss": 2.0322515964508057, "eval_runtime": 99.7838, "eval_samples_per_second": 10.022, "eval_steps_per_second": 5.011, "step": 26200 }, { "epoch": 3.5181208053691275, "grad_norm": 0.5459762215614319, "learning_rate": 1.6087722106611176e-06, "loss": 2.0294, "step": 26210 }, { "epoch": 3.5194630872483224, "grad_norm": 0.5496200919151306, "learning_rate": 1.6055706739234833e-06, "loss": 2.0058, "step": 26220 }, { "epoch": 3.5208053691275167, "grad_norm": 0.5605970025062561, "learning_rate": 1.6023691371858493e-06, "loss": 2.0198, "step": 26230 }, { "epoch": 3.5221476510067116, "grad_norm": 0.47437921166419983, "learning_rate": 1.5991676004482154e-06, "loss": 2.0556, "step": 26240 }, { "epoch": 3.523489932885906, "grad_norm": 0.50983726978302, "learning_rate": 1.595966063710581e-06, "loss": 2.0272, "step": 26250 }, { "epoch": 3.524832214765101, "grad_norm": 0.566801130771637, "learning_rate": 1.5927645269729472e-06, "loss": 2.0002, "step": 26260 }, { "epoch": 3.526174496644295, "grad_norm": 0.5266052484512329, "learning_rate": 1.589562990235313e-06, "loss": 2.0261, "step": 26270 }, { "epoch": 3.52751677852349, "grad_norm": 0.5181676745414734, "learning_rate": 1.586361453497679e-06, "loss": 1.9843, "step": 26280 }, { "epoch": 3.5288590604026844, "grad_norm": 0.5289013385772705, "learning_rate": 1.583159916760045e-06, "loss": 2.0023, "step": 26290 }, { "epoch": 3.530201342281879, "grad_norm": 0.5178347826004028, "learning_rate": 1.5799583800224107e-06, "loss": 2.0372, "step": 26300 }, { "epoch": 3.530201342281879, "eval_loss": 2.0322930812835693, "eval_runtime": 99.3132, "eval_samples_per_second": 10.069, "eval_steps_per_second": 5.035, "step": 26300 }, { "epoch": 3.531543624161074, "grad_norm": 0.5697055459022522, "learning_rate": 1.5767568432847768e-06, "loss": 2.0017, "step": 26310 }, { "epoch": 3.5328859060402684, "grad_norm": 0.5801728963851929, "learning_rate": 1.573555306547143e-06, "loss": 1.9787, "step": 26320 }, { "epoch": 3.5342281879194632, "grad_norm": 0.5830363631248474, "learning_rate": 1.5703537698095087e-06, "loss": 1.9798, "step": 26330 }, { "epoch": 3.5355704697986576, "grad_norm": 0.5186100602149963, "learning_rate": 1.5671522330718746e-06, "loss": 2.0331, "step": 26340 }, { "epoch": 3.5369127516778525, "grad_norm": 0.5778900384902954, "learning_rate": 1.5639506963342405e-06, "loss": 2.0245, "step": 26350 }, { "epoch": 3.538255033557047, "grad_norm": 0.5799062848091125, "learning_rate": 1.5607491595966065e-06, "loss": 2.0297, "step": 26360 }, { "epoch": 3.5395973154362417, "grad_norm": 0.60067218542099, "learning_rate": 1.5575476228589726e-06, "loss": 2.0386, "step": 26370 }, { "epoch": 3.540939597315436, "grad_norm": 0.8203257322311401, "learning_rate": 1.5543460861213383e-06, "loss": 2.0141, "step": 26380 }, { "epoch": 3.542281879194631, "grad_norm": 0.5709810256958008, "learning_rate": 1.5511445493837044e-06, "loss": 1.9943, "step": 26390 }, { "epoch": 3.5436241610738257, "grad_norm": 0.5866687893867493, "learning_rate": 1.5479430126460701e-06, "loss": 1.9832, "step": 26400 }, { "epoch": 3.5436241610738257, "eval_loss": 2.0319128036499023, "eval_runtime": 99.0141, "eval_samples_per_second": 10.1, "eval_steps_per_second": 5.05, "step": 26400 }, { "epoch": 3.54496644295302, "grad_norm": 0.5029199123382568, "learning_rate": 1.544741475908436e-06, "loss": 1.9856, "step": 26410 }, { "epoch": 3.546308724832215, "grad_norm": 0.5404402613639832, "learning_rate": 1.5415399391708022e-06, "loss": 2.0053, "step": 26420 }, { "epoch": 3.5476510067114093, "grad_norm": 0.5300592184066772, "learning_rate": 1.538338402433168e-06, "loss": 2.012, "step": 26430 }, { "epoch": 3.548993288590604, "grad_norm": 0.6518120169639587, "learning_rate": 1.535136865695534e-06, "loss": 1.9945, "step": 26440 }, { "epoch": 3.5503355704697985, "grad_norm": 0.5931301116943359, "learning_rate": 1.5319353289579e-06, "loss": 2.0299, "step": 26450 }, { "epoch": 3.5516778523489934, "grad_norm": 0.594891369342804, "learning_rate": 1.5287337922202659e-06, "loss": 2.0179, "step": 26460 }, { "epoch": 3.5530201342281877, "grad_norm": 0.567933201789856, "learning_rate": 1.5255322554826318e-06, "loss": 2.0182, "step": 26470 }, { "epoch": 3.5543624161073826, "grad_norm": 0.5732447504997253, "learning_rate": 1.5223307187449975e-06, "loss": 2.002, "step": 26480 }, { "epoch": 3.5557046979865774, "grad_norm": 0.5601099729537964, "learning_rate": 1.5191291820073636e-06, "loss": 2.0019, "step": 26490 }, { "epoch": 3.557046979865772, "grad_norm": 0.5106274485588074, "learning_rate": 1.5159276452697298e-06, "loss": 1.9877, "step": 26500 }, { "epoch": 3.557046979865772, "eval_loss": 2.032021999359131, "eval_runtime": 99.2164, "eval_samples_per_second": 10.079, "eval_steps_per_second": 5.039, "step": 26500 }, { "epoch": 3.558389261744966, "grad_norm": 0.5745659470558167, "learning_rate": 1.5127261085320955e-06, "loss": 2.0172, "step": 26510 }, { "epoch": 3.559731543624161, "grad_norm": 0.6014290452003479, "learning_rate": 1.5095245717944614e-06, "loss": 1.987, "step": 26520 }, { "epoch": 3.561073825503356, "grad_norm": 0.5782161951065063, "learning_rate": 1.5063230350568275e-06, "loss": 2.0174, "step": 26530 }, { "epoch": 3.56241610738255, "grad_norm": 0.6540932059288025, "learning_rate": 1.5031214983191933e-06, "loss": 1.9996, "step": 26540 }, { "epoch": 3.563758389261745, "grad_norm": 0.5923947095870972, "learning_rate": 1.4999199615815594e-06, "loss": 1.9941, "step": 26550 }, { "epoch": 3.5651006711409394, "grad_norm": 0.5839203000068665, "learning_rate": 1.496718424843925e-06, "loss": 1.9726, "step": 26560 }, { "epoch": 3.5664429530201343, "grad_norm": 0.5613699555397034, "learning_rate": 1.4935168881062912e-06, "loss": 1.9925, "step": 26570 }, { "epoch": 3.567785234899329, "grad_norm": 0.5361098051071167, "learning_rate": 1.4903153513686572e-06, "loss": 2.0277, "step": 26580 }, { "epoch": 3.5691275167785235, "grad_norm": 0.5905805826187134, "learning_rate": 1.4871138146310229e-06, "loss": 2.0459, "step": 26590 }, { "epoch": 3.570469798657718, "grad_norm": 0.5785099267959595, "learning_rate": 1.483912277893389e-06, "loss": 2.0073, "step": 26600 }, { "epoch": 3.570469798657718, "eval_loss": 2.0318801403045654, "eval_runtime": 99.1857, "eval_samples_per_second": 10.082, "eval_steps_per_second": 5.041, "step": 26600 }, { "epoch": 3.5718120805369127, "grad_norm": 0.5500389933586121, "learning_rate": 1.4807107411557547e-06, "loss": 2.0281, "step": 26610 }, { "epoch": 3.5731543624161075, "grad_norm": 0.5361244678497314, "learning_rate": 1.4775092044181208e-06, "loss": 1.9826, "step": 26620 }, { "epoch": 3.574496644295302, "grad_norm": 0.5269361138343811, "learning_rate": 1.4743076676804868e-06, "loss": 2.0206, "step": 26630 }, { "epoch": 3.5758389261744967, "grad_norm": 0.5010485053062439, "learning_rate": 1.4711061309428527e-06, "loss": 2.0153, "step": 26640 }, { "epoch": 3.577181208053691, "grad_norm": 0.5887482166290283, "learning_rate": 1.4679045942052186e-06, "loss": 2.0069, "step": 26650 }, { "epoch": 3.578523489932886, "grad_norm": 0.5067187547683716, "learning_rate": 1.4647030574675847e-06, "loss": 2.0108, "step": 26660 }, { "epoch": 3.5798657718120808, "grad_norm": 0.5837522745132446, "learning_rate": 1.4615015207299505e-06, "loss": 2.0401, "step": 26670 }, { "epoch": 3.581208053691275, "grad_norm": 0.5169274210929871, "learning_rate": 1.4582999839923164e-06, "loss": 2.0129, "step": 26680 }, { "epoch": 3.5825503355704695, "grad_norm": 0.5982518792152405, "learning_rate": 1.4550984472546823e-06, "loss": 2.0098, "step": 26690 }, { "epoch": 3.5838926174496644, "grad_norm": 0.5763619542121887, "learning_rate": 1.4518969105170482e-06, "loss": 2.0456, "step": 26700 }, { "epoch": 3.5838926174496644, "eval_loss": 2.0316596031188965, "eval_runtime": 99.2723, "eval_samples_per_second": 10.073, "eval_steps_per_second": 5.037, "step": 26700 }, { "epoch": 3.585234899328859, "grad_norm": 0.5087338089942932, "learning_rate": 1.4486953737794144e-06, "loss": 2.0014, "step": 26710 }, { "epoch": 3.5865771812080536, "grad_norm": 0.5658003687858582, "learning_rate": 1.44549383704178e-06, "loss": 1.993, "step": 26720 }, { "epoch": 3.5879194630872484, "grad_norm": 0.5506657958030701, "learning_rate": 1.4422923003041462e-06, "loss": 1.9865, "step": 26730 }, { "epoch": 3.589261744966443, "grad_norm": 0.5712320804595947, "learning_rate": 1.4390907635665121e-06, "loss": 1.9918, "step": 26740 }, { "epoch": 3.5906040268456376, "grad_norm": 0.5391896367073059, "learning_rate": 1.435889226828878e-06, "loss": 2.0186, "step": 26750 }, { "epoch": 3.5919463087248324, "grad_norm": 0.5626962780952454, "learning_rate": 1.432687690091244e-06, "loss": 2.0359, "step": 26760 }, { "epoch": 3.593288590604027, "grad_norm": 0.5323194265365601, "learning_rate": 1.4294861533536097e-06, "loss": 2.0044, "step": 26770 }, { "epoch": 3.594630872483221, "grad_norm": 0.49520260095596313, "learning_rate": 1.4262846166159758e-06, "loss": 2.0125, "step": 26780 }, { "epoch": 3.595973154362416, "grad_norm": 0.5849247574806213, "learning_rate": 1.4230830798783417e-06, "loss": 2.0276, "step": 26790 }, { "epoch": 3.597315436241611, "grad_norm": 0.5351284146308899, "learning_rate": 1.4198815431407076e-06, "loss": 2.0037, "step": 26800 }, { "epoch": 3.597315436241611, "eval_loss": 2.0315005779266357, "eval_runtime": 99.2241, "eval_samples_per_second": 10.078, "eval_steps_per_second": 5.039, "step": 26800 }, { "epoch": 3.5986577181208053, "grad_norm": 0.5691702365875244, "learning_rate": 1.4166800064030736e-06, "loss": 1.9961, "step": 26810 }, { "epoch": 3.6, "grad_norm": 0.5791316032409668, "learning_rate": 1.4134784696654395e-06, "loss": 1.9893, "step": 26820 }, { "epoch": 3.6013422818791945, "grad_norm": 0.577229380607605, "learning_rate": 1.4102769329278054e-06, "loss": 2.0102, "step": 26830 }, { "epoch": 3.6026845637583893, "grad_norm": 0.5284247398376465, "learning_rate": 1.4070753961901715e-06, "loss": 2.0143, "step": 26840 }, { "epoch": 3.604026845637584, "grad_norm": 0.5774010419845581, "learning_rate": 1.4038738594525373e-06, "loss": 2.0377, "step": 26850 }, { "epoch": 3.6053691275167785, "grad_norm": 0.5636426210403442, "learning_rate": 1.4006723227149032e-06, "loss": 2.0085, "step": 26860 }, { "epoch": 3.606711409395973, "grad_norm": 0.5386518239974976, "learning_rate": 1.3974707859772693e-06, "loss": 2.0103, "step": 26870 }, { "epoch": 3.6080536912751677, "grad_norm": 0.5856292247772217, "learning_rate": 1.394269249239635e-06, "loss": 2.0179, "step": 26880 }, { "epoch": 3.6093959731543626, "grad_norm": 0.5678332448005676, "learning_rate": 1.3910677125020012e-06, "loss": 2.0337, "step": 26890 }, { "epoch": 3.610738255033557, "grad_norm": 0.5619184374809265, "learning_rate": 1.3878661757643669e-06, "loss": 1.9718, "step": 26900 }, { "epoch": 3.610738255033557, "eval_loss": 2.031461715698242, "eval_runtime": 99.0243, "eval_samples_per_second": 10.099, "eval_steps_per_second": 5.049, "step": 26900 }, { "epoch": 3.6120805369127518, "grad_norm": 0.5686038136482239, "learning_rate": 1.384664639026733e-06, "loss": 2.0198, "step": 26910 }, { "epoch": 3.613422818791946, "grad_norm": 0.547531008720398, "learning_rate": 1.381463102289099e-06, "loss": 1.9721, "step": 26920 }, { "epoch": 3.614765100671141, "grad_norm": 0.6028099656105042, "learning_rate": 1.3782615655514646e-06, "loss": 2.0103, "step": 26930 }, { "epoch": 3.616107382550336, "grad_norm": 0.5436133742332458, "learning_rate": 1.3750600288138308e-06, "loss": 2.0334, "step": 26940 }, { "epoch": 3.61744966442953, "grad_norm": 0.6020119190216064, "learning_rate": 1.371858492076197e-06, "loss": 1.9932, "step": 26950 }, { "epoch": 3.6187919463087246, "grad_norm": 0.5240419507026672, "learning_rate": 1.3686569553385626e-06, "loss": 1.9908, "step": 26960 }, { "epoch": 3.6201342281879194, "grad_norm": 0.5781471729278564, "learning_rate": 1.3654554186009285e-06, "loss": 1.9913, "step": 26970 }, { "epoch": 3.6214765100671142, "grad_norm": 0.5228403806686401, "learning_rate": 1.3622538818632945e-06, "loss": 2.0105, "step": 26980 }, { "epoch": 3.6228187919463086, "grad_norm": 0.5725923776626587, "learning_rate": 1.3590523451256604e-06, "loss": 2.0124, "step": 26990 }, { "epoch": 3.6241610738255035, "grad_norm": 0.5426804423332214, "learning_rate": 1.3558508083880265e-06, "loss": 2.0344, "step": 27000 }, { "epoch": 3.6241610738255035, "eval_loss": 2.0313711166381836, "eval_runtime": 99.1271, "eval_samples_per_second": 10.088, "eval_steps_per_second": 5.044, "step": 27000 }, { "epoch": 3.625503355704698, "grad_norm": 0.5412684082984924, "learning_rate": 1.3526492716503922e-06, "loss": 2.0272, "step": 27010 }, { "epoch": 3.6268456375838927, "grad_norm": 0.5994455814361572, "learning_rate": 1.3494477349127584e-06, "loss": 2.0083, "step": 27020 }, { "epoch": 3.6281879194630875, "grad_norm": 0.5473735928535461, "learning_rate": 1.3462461981751243e-06, "loss": 2.0363, "step": 27030 }, { "epoch": 3.629530201342282, "grad_norm": 0.5572862029075623, "learning_rate": 1.34304466143749e-06, "loss": 1.9947, "step": 27040 }, { "epoch": 3.6308724832214763, "grad_norm": 0.6075792908668518, "learning_rate": 1.3398431246998561e-06, "loss": 1.9904, "step": 27050 }, { "epoch": 3.632214765100671, "grad_norm": 0.6825592517852783, "learning_rate": 1.3366415879622218e-06, "loss": 2.0219, "step": 27060 }, { "epoch": 3.633557046979866, "grad_norm": 0.8327385783195496, "learning_rate": 1.333440051224588e-06, "loss": 2.0099, "step": 27070 }, { "epoch": 3.6348993288590603, "grad_norm": 0.6000455021858215, "learning_rate": 1.3302385144869539e-06, "loss": 2.0201, "step": 27080 }, { "epoch": 3.636241610738255, "grad_norm": 0.5002001523971558, "learning_rate": 1.3270369777493198e-06, "loss": 2.0008, "step": 27090 }, { "epoch": 3.6375838926174495, "grad_norm": 0.5344942808151245, "learning_rate": 1.3238354410116857e-06, "loss": 2.0393, "step": 27100 }, { "epoch": 3.6375838926174495, "eval_loss": 2.0311882495880127, "eval_runtime": 99.2431, "eval_samples_per_second": 10.076, "eval_steps_per_second": 5.038, "step": 27100 }, { "epoch": 3.6389261744966444, "grad_norm": 0.5355333685874939, "learning_rate": 1.3206339042740514e-06, "loss": 2.0083, "step": 27110 }, { "epoch": 3.640268456375839, "grad_norm": 0.5520380735397339, "learning_rate": 1.3174323675364176e-06, "loss": 2.0107, "step": 27120 }, { "epoch": 3.6416107382550336, "grad_norm": 0.4758807420730591, "learning_rate": 1.3142308307987837e-06, "loss": 2.011, "step": 27130 }, { "epoch": 3.642953020134228, "grad_norm": 0.5604168176651001, "learning_rate": 1.3110292940611494e-06, "loss": 1.9704, "step": 27140 }, { "epoch": 3.6442953020134228, "grad_norm": 0.4999609887599945, "learning_rate": 1.3078277573235153e-06, "loss": 1.9859, "step": 27150 }, { "epoch": 3.6456375838926176, "grad_norm": 0.5146457552909851, "learning_rate": 1.3046262205858815e-06, "loss": 2.0297, "step": 27160 }, { "epoch": 3.646979865771812, "grad_norm": 0.5436716079711914, "learning_rate": 1.3014246838482472e-06, "loss": 2.0202, "step": 27170 }, { "epoch": 3.648322147651007, "grad_norm": 0.5256075263023376, "learning_rate": 1.2982231471106133e-06, "loss": 2.0319, "step": 27180 }, { "epoch": 3.649664429530201, "grad_norm": 0.5203789472579956, "learning_rate": 1.295021610372979e-06, "loss": 2.0115, "step": 27190 }, { "epoch": 3.651006711409396, "grad_norm": 0.5632807612419128, "learning_rate": 1.2918200736353452e-06, "loss": 2.0053, "step": 27200 }, { "epoch": 3.651006711409396, "eval_loss": 2.0313007831573486, "eval_runtime": 99.3281, "eval_samples_per_second": 10.068, "eval_steps_per_second": 5.034, "step": 27200 }, { "epoch": 3.652348993288591, "grad_norm": 0.5029721856117249, "learning_rate": 1.288618536897711e-06, "loss": 2.0194, "step": 27210 }, { "epoch": 3.6536912751677852, "grad_norm": 0.49627238512039185, "learning_rate": 1.2854170001600768e-06, "loss": 2.0026, "step": 27220 }, { "epoch": 3.6550335570469796, "grad_norm": 0.5915135741233826, "learning_rate": 1.282215463422443e-06, "loss": 1.9974, "step": 27230 }, { "epoch": 3.6563758389261745, "grad_norm": 0.5953642129898071, "learning_rate": 1.2790139266848088e-06, "loss": 2.0172, "step": 27240 }, { "epoch": 3.6577181208053693, "grad_norm": 0.6336215734481812, "learning_rate": 1.2758123899471748e-06, "loss": 1.9817, "step": 27250 }, { "epoch": 3.6590604026845637, "grad_norm": 0.5529477000236511, "learning_rate": 1.2726108532095407e-06, "loss": 1.9971, "step": 27260 }, { "epoch": 3.6604026845637585, "grad_norm": 0.5943782329559326, "learning_rate": 1.2694093164719066e-06, "loss": 1.9931, "step": 27270 }, { "epoch": 3.661744966442953, "grad_norm": 0.5412973761558533, "learning_rate": 1.2662077797342725e-06, "loss": 2.0315, "step": 27280 }, { "epoch": 3.6630872483221477, "grad_norm": 0.5867758989334106, "learning_rate": 1.2630062429966387e-06, "loss": 1.9939, "step": 27290 }, { "epoch": 3.6644295302013425, "grad_norm": 0.5666806697845459, "learning_rate": 1.2598047062590044e-06, "loss": 2.0057, "step": 27300 }, { "epoch": 3.6644295302013425, "eval_loss": 2.0311965942382812, "eval_runtime": 99.1017, "eval_samples_per_second": 10.091, "eval_steps_per_second": 5.045, "step": 27300 }, { "epoch": 3.665771812080537, "grad_norm": 0.586299479007721, "learning_rate": 1.2566031695213705e-06, "loss": 1.9945, "step": 27310 }, { "epoch": 3.6671140939597313, "grad_norm": 0.5768749713897705, "learning_rate": 1.2534016327837362e-06, "loss": 2.0298, "step": 27320 }, { "epoch": 3.668456375838926, "grad_norm": 0.6095951795578003, "learning_rate": 1.2502000960461021e-06, "loss": 2.0274, "step": 27330 }, { "epoch": 3.669798657718121, "grad_norm": 0.5211044549942017, "learning_rate": 1.246998559308468e-06, "loss": 2.0195, "step": 27340 }, { "epoch": 3.6711409395973154, "grad_norm": 0.5351852774620056, "learning_rate": 1.2437970225708342e-06, "loss": 1.9969, "step": 27350 }, { "epoch": 3.67248322147651, "grad_norm": 0.5246982574462891, "learning_rate": 1.2405954858332001e-06, "loss": 1.9887, "step": 27360 }, { "epoch": 3.6738255033557046, "grad_norm": 0.5135153532028198, "learning_rate": 1.237393949095566e-06, "loss": 2.0068, "step": 27370 }, { "epoch": 3.6751677852348994, "grad_norm": 0.5587418675422668, "learning_rate": 1.234192412357932e-06, "loss": 1.9971, "step": 27380 }, { "epoch": 3.6765100671140942, "grad_norm": 0.4823431968688965, "learning_rate": 1.2309908756202979e-06, "loss": 2.026, "step": 27390 }, { "epoch": 3.6778523489932886, "grad_norm": 0.5298497080802917, "learning_rate": 1.2277893388826638e-06, "loss": 2.0186, "step": 27400 }, { "epoch": 3.6778523489932886, "eval_loss": 2.030952215194702, "eval_runtime": 99.2174, "eval_samples_per_second": 10.079, "eval_steps_per_second": 5.039, "step": 27400 }, { "epoch": 3.679194630872483, "grad_norm": 0.5753053426742554, "learning_rate": 1.2245878021450297e-06, "loss": 2.011, "step": 27410 }, { "epoch": 3.680536912751678, "grad_norm": 0.5194422006607056, "learning_rate": 1.2213862654073956e-06, "loss": 2.0115, "step": 27420 }, { "epoch": 3.6818791946308727, "grad_norm": 0.5648922920227051, "learning_rate": 1.2181847286697616e-06, "loss": 2.0051, "step": 27430 }, { "epoch": 3.683221476510067, "grad_norm": 0.5649812817573547, "learning_rate": 1.2149831919321275e-06, "loss": 2.0234, "step": 27440 }, { "epoch": 3.684563758389262, "grad_norm": 0.5129743218421936, "learning_rate": 1.2117816551944934e-06, "loss": 2.0035, "step": 27450 }, { "epoch": 3.6859060402684563, "grad_norm": 0.5572558641433716, "learning_rate": 1.2085801184568593e-06, "loss": 2.0131, "step": 27460 }, { "epoch": 3.687248322147651, "grad_norm": 0.5094574689865112, "learning_rate": 1.2053785817192255e-06, "loss": 1.9998, "step": 27470 }, { "epoch": 3.688590604026846, "grad_norm": 0.6388832926750183, "learning_rate": 1.2021770449815914e-06, "loss": 1.9845, "step": 27480 }, { "epoch": 3.6899328859060403, "grad_norm": 0.5876774191856384, "learning_rate": 1.198975508243957e-06, "loss": 2.0151, "step": 27490 }, { "epoch": 3.6912751677852347, "grad_norm": 0.5422551035881042, "learning_rate": 1.195773971506323e-06, "loss": 2.0027, "step": 27500 }, { "epoch": 3.6912751677852347, "eval_loss": 2.0309927463531494, "eval_runtime": 99.2731, "eval_samples_per_second": 10.073, "eval_steps_per_second": 5.037, "step": 27500 }, { "epoch": 3.6926174496644295, "grad_norm": 0.5409743189811707, "learning_rate": 1.192572434768689e-06, "loss": 2.0396, "step": 27510 }, { "epoch": 3.6939597315436243, "grad_norm": 0.5549281239509583, "learning_rate": 1.189370898031055e-06, "loss": 2.011, "step": 27520 }, { "epoch": 3.6953020134228187, "grad_norm": 0.5758858919143677, "learning_rate": 1.186169361293421e-06, "loss": 1.9923, "step": 27530 }, { "epoch": 3.6966442953020135, "grad_norm": 0.5227702856063843, "learning_rate": 1.182967824555787e-06, "loss": 2.0063, "step": 27540 }, { "epoch": 3.697986577181208, "grad_norm": 0.49413546919822693, "learning_rate": 1.1797662878181528e-06, "loss": 2.0086, "step": 27550 }, { "epoch": 3.6993288590604028, "grad_norm": 0.6663057208061218, "learning_rate": 1.1765647510805188e-06, "loss": 1.9953, "step": 27560 }, { "epoch": 3.7006711409395976, "grad_norm": 0.5428208112716675, "learning_rate": 1.1733632143428847e-06, "loss": 2.0421, "step": 27570 }, { "epoch": 3.702013422818792, "grad_norm": 0.5529650449752808, "learning_rate": 1.1701616776052506e-06, "loss": 1.9915, "step": 27580 }, { "epoch": 3.7033557046979864, "grad_norm": 0.5597283840179443, "learning_rate": 1.1669601408676165e-06, "loss": 2.0052, "step": 27590 }, { "epoch": 3.704697986577181, "grad_norm": 0.5497382283210754, "learning_rate": 1.1637586041299825e-06, "loss": 1.9972, "step": 27600 }, { "epoch": 3.704697986577181, "eval_loss": 2.0307700634002686, "eval_runtime": 99.1358, "eval_samples_per_second": 10.087, "eval_steps_per_second": 5.044, "step": 27600 }, { "epoch": 3.706040268456376, "grad_norm": 0.5258267521858215, "learning_rate": 1.1605570673923484e-06, "loss": 2.0057, "step": 27610 }, { "epoch": 3.7073825503355704, "grad_norm": 0.5696093440055847, "learning_rate": 1.1573555306547143e-06, "loss": 1.9995, "step": 27620 }, { "epoch": 3.7087248322147652, "grad_norm": 0.5222363471984863, "learning_rate": 1.1541539939170802e-06, "loss": 2.0124, "step": 27630 }, { "epoch": 3.7100671140939596, "grad_norm": 0.517994225025177, "learning_rate": 1.1509524571794464e-06, "loss": 1.9919, "step": 27640 }, { "epoch": 3.7114093959731544, "grad_norm": 0.5691993236541748, "learning_rate": 1.1477509204418123e-06, "loss": 1.9868, "step": 27650 }, { "epoch": 3.712751677852349, "grad_norm": 0.5910984873771667, "learning_rate": 1.1445493837041782e-06, "loss": 1.9906, "step": 27660 }, { "epoch": 3.7140939597315437, "grad_norm": 0.5375342965126038, "learning_rate": 1.141347846966544e-06, "loss": 1.9855, "step": 27670 }, { "epoch": 3.715436241610738, "grad_norm": 0.5049592852592468, "learning_rate": 1.13814631022891e-06, "loss": 2.0079, "step": 27680 }, { "epoch": 3.716778523489933, "grad_norm": 0.5630091428756714, "learning_rate": 1.134944773491276e-06, "loss": 2.024, "step": 27690 }, { "epoch": 3.7181208053691277, "grad_norm": 0.524662971496582, "learning_rate": 1.1317432367536419e-06, "loss": 2.0066, "step": 27700 }, { "epoch": 3.7181208053691277, "eval_loss": 2.0309386253356934, "eval_runtime": 99.1082, "eval_samples_per_second": 10.09, "eval_steps_per_second": 5.045, "step": 27700 }, { "epoch": 3.719463087248322, "grad_norm": 0.5959316492080688, "learning_rate": 1.1285417000160078e-06, "loss": 2.029, "step": 27710 }, { "epoch": 3.720805369127517, "grad_norm": 0.5438879728317261, "learning_rate": 1.1253401632783737e-06, "loss": 2.0206, "step": 27720 }, { "epoch": 3.7221476510067113, "grad_norm": 0.5142372846603394, "learning_rate": 1.1221386265407396e-06, "loss": 2.0049, "step": 27730 }, { "epoch": 3.723489932885906, "grad_norm": 0.5625041723251343, "learning_rate": 1.1189370898031056e-06, "loss": 1.9966, "step": 27740 }, { "epoch": 3.7248322147651005, "grad_norm": 0.6374936103820801, "learning_rate": 1.1157355530654715e-06, "loss": 2.0287, "step": 27750 }, { "epoch": 3.7261744966442953, "grad_norm": 0.5368462204933167, "learning_rate": 1.1125340163278374e-06, "loss": 2.002, "step": 27760 }, { "epoch": 3.7275167785234897, "grad_norm": 0.6157576441764832, "learning_rate": 1.1093324795902033e-06, "loss": 2.0182, "step": 27770 }, { "epoch": 3.7288590604026846, "grad_norm": 0.5298171639442444, "learning_rate": 1.1061309428525693e-06, "loss": 2.0054, "step": 27780 }, { "epoch": 3.7302013422818794, "grad_norm": 0.5143840312957764, "learning_rate": 1.1029294061149352e-06, "loss": 1.985, "step": 27790 }, { "epoch": 3.7315436241610738, "grad_norm": 0.5257755517959595, "learning_rate": 1.099727869377301e-06, "loss": 2.0042, "step": 27800 }, { "epoch": 3.7315436241610738, "eval_loss": 2.030616521835327, "eval_runtime": 98.9775, "eval_samples_per_second": 10.103, "eval_steps_per_second": 5.052, "step": 27800 }, { "epoch": 3.7328859060402686, "grad_norm": 0.5499988794326782, "learning_rate": 1.0965263326396672e-06, "loss": 1.9837, "step": 27810 }, { "epoch": 3.734228187919463, "grad_norm": 0.5791357755661011, "learning_rate": 1.0933247959020332e-06, "loss": 2.0305, "step": 27820 }, { "epoch": 3.735570469798658, "grad_norm": 0.5184398293495178, "learning_rate": 1.090123259164399e-06, "loss": 1.9948, "step": 27830 }, { "epoch": 3.736912751677852, "grad_norm": 0.4910694360733032, "learning_rate": 1.0869217224267648e-06, "loss": 2.0057, "step": 27840 }, { "epoch": 3.738255033557047, "grad_norm": 0.5194987058639526, "learning_rate": 1.083720185689131e-06, "loss": 2.0186, "step": 27850 }, { "epoch": 3.7395973154362414, "grad_norm": 0.5067945122718811, "learning_rate": 1.0805186489514968e-06, "loss": 1.9798, "step": 27860 }, { "epoch": 3.7409395973154362, "grad_norm": 0.5868769288063049, "learning_rate": 1.0773171122138628e-06, "loss": 1.9939, "step": 27870 }, { "epoch": 3.742281879194631, "grad_norm": 0.540751039981842, "learning_rate": 1.0741155754762287e-06, "loss": 1.9762, "step": 27880 }, { "epoch": 3.7436241610738255, "grad_norm": 0.5037314295768738, "learning_rate": 1.0709140387385946e-06, "loss": 2.0152, "step": 27890 }, { "epoch": 3.7449664429530203, "grad_norm": 0.5138669610023499, "learning_rate": 1.0677125020009605e-06, "loss": 2.0025, "step": 27900 }, { "epoch": 3.7449664429530203, "eval_loss": 2.0305252075195312, "eval_runtime": 99.0138, "eval_samples_per_second": 10.1, "eval_steps_per_second": 5.05, "step": 27900 } ], "logging_steps": 10, "max_steps": 31235, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.410926724972765e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }