{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 500, "global_step": 34520, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023177656738903696, "grad_norm": 17.894062042236328, "learning_rate": 3.4749034749034746e-07, "loss": 5.7034, "mean_token_accuracy": 0.3196992427110672, "num_tokens": 161658.0, "step": 10 }, { "epoch": 0.004635531347780739, "grad_norm": 15.612476348876953, "learning_rate": 7.335907335907336e-07, "loss": 5.593, "mean_token_accuracy": 0.3260790541768074, "num_tokens": 323183.0, "step": 20 }, { "epoch": 0.006953297021671109, "grad_norm": 12.087387084960938, "learning_rate": 1.1196911196911197e-06, "loss": 5.3856, "mean_token_accuracy": 0.3406873792409897, "num_tokens": 483375.0, "step": 30 }, { "epoch": 0.009271062695561478, "grad_norm": 8.148137092590332, "learning_rate": 1.505791505791506e-06, "loss": 5.1441, "mean_token_accuracy": 0.3562857091426849, "num_tokens": 644210.0, "step": 40 }, { "epoch": 0.011588828369451848, "grad_norm": 5.097574234008789, "learning_rate": 1.891891891891892e-06, "loss": 4.9161, "mean_token_accuracy": 0.3665184356272221, "num_tokens": 804992.0, "step": 50 }, { "epoch": 0.013906594043342218, "grad_norm": 3.7244396209716797, "learning_rate": 2.2779922779922782e-06, "loss": 4.6486, "mean_token_accuracy": 0.3827472202479839, "num_tokens": 967048.0, "step": 60 }, { "epoch": 0.016224359717232587, "grad_norm": 3.2032999992370605, "learning_rate": 2.6640926640926642e-06, "loss": 4.4449, "mean_token_accuracy": 0.396025874465704, "num_tokens": 1127887.0, "step": 70 }, { "epoch": 0.018542125391122957, "grad_norm": 2.717254877090454, "learning_rate": 3.0501930501930503e-06, "loss": 4.1988, "mean_token_accuracy": 0.41127921268343925, "num_tokens": 1288554.0, "step": 80 }, { "epoch": 0.020859891065013327, "grad_norm": 2.216557264328003, "learning_rate": 3.4362934362934363e-06, "loss": 3.9877, "mean_token_accuracy": 0.4272708624601364, "num_tokens": 1449894.0, "step": 90 }, { "epoch": 0.023177656738903697, "grad_norm": 2.2230777740478516, "learning_rate": 3.822393822393822e-06, "loss": 3.7781, "mean_token_accuracy": 0.44815572947263715, "num_tokens": 1611409.0, "step": 100 }, { "epoch": 0.025495422412794067, "grad_norm": 2.2106094360351562, "learning_rate": 4.208494208494209e-06, "loss": 3.5866, "mean_token_accuracy": 0.4678606018424034, "num_tokens": 1771988.0, "step": 110 }, { "epoch": 0.027813188086684437, "grad_norm": 2.1260807514190674, "learning_rate": 4.594594594594595e-06, "loss": 3.4561, "mean_token_accuracy": 0.4756400555372238, "num_tokens": 1932379.0, "step": 120 }, { "epoch": 0.030130953760574807, "grad_norm": 1.9311332702636719, "learning_rate": 4.980694980694981e-06, "loss": 3.3109, "mean_token_accuracy": 0.4875908061861992, "num_tokens": 2094352.0, "step": 130 }, { "epoch": 0.03244871943446517, "grad_norm": 2.1242077350616455, "learning_rate": 5.366795366795367e-06, "loss": 3.213, "mean_token_accuracy": 0.49783681333065033, "num_tokens": 2256446.0, "step": 140 }, { "epoch": 0.034766485108355547, "grad_norm": 1.9190019369125366, "learning_rate": 5.752895752895753e-06, "loss": 3.1082, "mean_token_accuracy": 0.5071425855159759, "num_tokens": 2418352.0, "step": 150 }, { "epoch": 0.03708425078224591, "grad_norm": 3.2198374271392822, "learning_rate": 6.138996138996139e-06, "loss": 3.0446, "mean_token_accuracy": 0.5142567045986652, "num_tokens": 2578505.0, "step": 160 }, { "epoch": 0.039402016456136287, "grad_norm": 2.2282063961029053, "learning_rate": 6.525096525096526e-06, "loss": 2.9943, "mean_token_accuracy": 0.5179550401866436, "num_tokens": 2739891.0, "step": 170 }, { "epoch": 0.04171978213002665, "grad_norm": 2.454672336578369, "learning_rate": 6.911196911196911e-06, "loss": 2.9096, "mean_token_accuracy": 0.5246538534760475, "num_tokens": 2900487.0, "step": 180 }, { "epoch": 0.044037547803917027, "grad_norm": 2.844451427459717, "learning_rate": 7.297297297297298e-06, "loss": 2.8447, "mean_token_accuracy": 0.529235951602459, "num_tokens": 3061968.0, "step": 190 }, { "epoch": 0.04635531347780739, "grad_norm": 2.334955930709839, "learning_rate": 7.683397683397683e-06, "loss": 2.8446, "mean_token_accuracy": 0.528924684226513, "num_tokens": 3223711.0, "step": 200 }, { "epoch": 0.048673079151697766, "grad_norm": 2.502856492996216, "learning_rate": 8.06949806949807e-06, "loss": 2.7652, "mean_token_accuracy": 0.5422907933592797, "num_tokens": 3384820.0, "step": 210 }, { "epoch": 0.05099084482558813, "grad_norm": 3.2274768352508545, "learning_rate": 8.455598455598455e-06, "loss": 2.7052, "mean_token_accuracy": 0.5477661579847336, "num_tokens": 3546057.0, "step": 220 }, { "epoch": 0.0533086104994785, "grad_norm": 2.669459819793701, "learning_rate": 8.841698841698842e-06, "loss": 2.6898, "mean_token_accuracy": 0.5491460785269737, "num_tokens": 3706789.0, "step": 230 }, { "epoch": 0.05562637617336887, "grad_norm": 3.1272873878479004, "learning_rate": 9.227799227799227e-06, "loss": 2.6609, "mean_token_accuracy": 0.5495775461196899, "num_tokens": 3867336.0, "step": 240 }, { "epoch": 0.05794414184725924, "grad_norm": 2.624591827392578, "learning_rate": 9.613899613899614e-06, "loss": 2.6203, "mean_token_accuracy": 0.557054840028286, "num_tokens": 4028273.0, "step": 250 }, { "epoch": 0.06026190752114961, "grad_norm": 3.2321643829345703, "learning_rate": 1e-05, "loss": 2.6097, "mean_token_accuracy": 0.5582983300089837, "num_tokens": 4189986.0, "step": 260 }, { "epoch": 0.06257967319503999, "grad_norm": 1.9396703243255615, "learning_rate": 1.0386100386100386e-05, "loss": 2.5942, "mean_token_accuracy": 0.5580702036619186, "num_tokens": 4351588.0, "step": 270 }, { "epoch": 0.06489743886893035, "grad_norm": 2.1074259281158447, "learning_rate": 1.0772200772200773e-05, "loss": 2.5596, "mean_token_accuracy": 0.5645368278026581, "num_tokens": 4512728.0, "step": 280 }, { "epoch": 0.06721520454282072, "grad_norm": 2.8373234272003174, "learning_rate": 1.1158301158301158e-05, "loss": 2.5428, "mean_token_accuracy": 0.5663235753774643, "num_tokens": 4673603.0, "step": 290 }, { "epoch": 0.06953297021671109, "grad_norm": 2.7382004261016846, "learning_rate": 1.1544401544401545e-05, "loss": 2.5144, "mean_token_accuracy": 0.5688980147242546, "num_tokens": 4834486.0, "step": 300 }, { "epoch": 0.07185073589060147, "grad_norm": 3.4352259635925293, "learning_rate": 1.193050193050193e-05, "loss": 2.4888, "mean_token_accuracy": 0.570370252430439, "num_tokens": 4996033.0, "step": 310 }, { "epoch": 0.07416850156449183, "grad_norm": 2.4446895122528076, "learning_rate": 1.2316602316602317e-05, "loss": 2.4771, "mean_token_accuracy": 0.5736857205629349, "num_tokens": 5157589.0, "step": 320 }, { "epoch": 0.0764862672383822, "grad_norm": 2.3635342121124268, "learning_rate": 1.2702702702702704e-05, "loss": 2.469, "mean_token_accuracy": 0.573961754143238, "num_tokens": 5318586.0, "step": 330 }, { "epoch": 0.07880403291227257, "grad_norm": 2.765320301055908, "learning_rate": 1.3088803088803089e-05, "loss": 2.4542, "mean_token_accuracy": 0.5757471770048141, "num_tokens": 5479436.0, "step": 340 }, { "epoch": 0.08112179858616293, "grad_norm": 4.682990074157715, "learning_rate": 1.3474903474903474e-05, "loss": 2.4379, "mean_token_accuracy": 0.5790262222290039, "num_tokens": 5639471.0, "step": 350 }, { "epoch": 0.0834395642600533, "grad_norm": 2.488096237182617, "learning_rate": 1.3861003861003863e-05, "loss": 2.4203, "mean_token_accuracy": 0.5818719446659089, "num_tokens": 5800395.0, "step": 360 }, { "epoch": 0.08575732993394368, "grad_norm": 3.1795806884765625, "learning_rate": 1.4247104247104248e-05, "loss": 2.3934, "mean_token_accuracy": 0.5840843468904495, "num_tokens": 5961780.0, "step": 370 }, { "epoch": 0.08807509560783405, "grad_norm": 2.761369228363037, "learning_rate": 1.4633204633204633e-05, "loss": 2.3744, "mean_token_accuracy": 0.5851527094841004, "num_tokens": 6123514.0, "step": 380 }, { "epoch": 0.09039286128172441, "grad_norm": 3.063748359680176, "learning_rate": 1.5019305019305018e-05, "loss": 2.3787, "mean_token_accuracy": 0.5843465492129326, "num_tokens": 6285549.0, "step": 390 }, { "epoch": 0.09271062695561479, "grad_norm": 3.2026376724243164, "learning_rate": 1.540540540540541e-05, "loss": 2.328, "mean_token_accuracy": 0.5920817986130714, "num_tokens": 6447178.0, "step": 400 }, { "epoch": 0.09502839262950516, "grad_norm": 3.360424757003784, "learning_rate": 1.5791505791505794e-05, "loss": 2.3572, "mean_token_accuracy": 0.589422681927681, "num_tokens": 6608952.0, "step": 410 }, { "epoch": 0.09734615830339553, "grad_norm": 2.62567400932312, "learning_rate": 1.617760617760618e-05, "loss": 2.3365, "mean_token_accuracy": 0.5904348254203796, "num_tokens": 6768984.0, "step": 420 }, { "epoch": 0.09966392397728589, "grad_norm": 2.7034912109375, "learning_rate": 1.6563706563706567e-05, "loss": 2.327, "mean_token_accuracy": 0.5917965903878212, "num_tokens": 6930946.0, "step": 430 }, { "epoch": 0.10198168965117627, "grad_norm": 2.708073616027832, "learning_rate": 1.6949806949806953e-05, "loss": 2.304, "mean_token_accuracy": 0.5943972274661065, "num_tokens": 7092287.0, "step": 440 }, { "epoch": 0.10429945532506664, "grad_norm": 3.0076394081115723, "learning_rate": 1.7335907335907338e-05, "loss": 2.2743, "mean_token_accuracy": 0.5988315954804421, "num_tokens": 7253522.0, "step": 450 }, { "epoch": 0.106617220998957, "grad_norm": 2.46193790435791, "learning_rate": 1.7722007722007723e-05, "loss": 2.2977, "mean_token_accuracy": 0.5950200334191322, "num_tokens": 7413306.0, "step": 460 }, { "epoch": 0.10893498667284737, "grad_norm": 2.9363443851470947, "learning_rate": 1.810810810810811e-05, "loss": 2.2622, "mean_token_accuracy": 0.5994308307766915, "num_tokens": 7574407.0, "step": 470 }, { "epoch": 0.11125275234673775, "grad_norm": 2.910444498062134, "learning_rate": 1.8494208494208497e-05, "loss": 2.2521, "mean_token_accuracy": 0.6001504391431809, "num_tokens": 7734222.0, "step": 480 }, { "epoch": 0.11357051802062812, "grad_norm": 2.472151279449463, "learning_rate": 1.8880308880308882e-05, "loss": 2.2803, "mean_token_accuracy": 0.5982357397675514, "num_tokens": 7893723.0, "step": 490 }, { "epoch": 0.11588828369451848, "grad_norm": 2.797504425048828, "learning_rate": 1.9266409266409267e-05, "loss": 2.2366, "mean_token_accuracy": 0.602298018336296, "num_tokens": 8055391.0, "step": 500 }, { "epoch": 0.11820604936840885, "grad_norm": 2.9060018062591553, "learning_rate": 1.9652509652509656e-05, "loss": 2.2554, "mean_token_accuracy": 0.6004772841930389, "num_tokens": 8215821.0, "step": 510 }, { "epoch": 0.12052381504229923, "grad_norm": 3.2472403049468994, "learning_rate": 2.003861003861004e-05, "loss": 2.2352, "mean_token_accuracy": 0.6029785186052322, "num_tokens": 8377640.0, "step": 520 }, { "epoch": 0.1228415807161896, "grad_norm": 2.743255376815796, "learning_rate": 2.0424710424710426e-05, "loss": 2.2352, "mean_token_accuracy": 0.6026115983724594, "num_tokens": 8539373.0, "step": 530 }, { "epoch": 0.12515934639007997, "grad_norm": 2.890150308609009, "learning_rate": 2.0810810810810815e-05, "loss": 2.2371, "mean_token_accuracy": 0.6009327247738838, "num_tokens": 8700312.0, "step": 540 }, { "epoch": 0.12747711206397033, "grad_norm": 2.570850372314453, "learning_rate": 2.11969111969112e-05, "loss": 2.225, "mean_token_accuracy": 0.6045730233192443, "num_tokens": 8861889.0, "step": 550 }, { "epoch": 0.1297948777378607, "grad_norm": 3.533282995223999, "learning_rate": 2.1583011583011585e-05, "loss": 2.2049, "mean_token_accuracy": 0.6066286504268646, "num_tokens": 9023093.0, "step": 560 }, { "epoch": 0.13211264341175108, "grad_norm": 2.8368372917175293, "learning_rate": 2.196911196911197e-05, "loss": 2.1899, "mean_token_accuracy": 0.6071818187832833, "num_tokens": 9183272.0, "step": 570 }, { "epoch": 0.13443040908564144, "grad_norm": 2.768383502960205, "learning_rate": 2.235521235521236e-05, "loss": 2.1997, "mean_token_accuracy": 0.6062324836850166, "num_tokens": 9344184.0, "step": 580 }, { "epoch": 0.1367481747595318, "grad_norm": 2.7252895832061768, "learning_rate": 2.2741312741312744e-05, "loss": 2.1754, "mean_token_accuracy": 0.6083667874336243, "num_tokens": 9504964.0, "step": 590 }, { "epoch": 0.13906594043342219, "grad_norm": 2.7383980751037598, "learning_rate": 2.312741312741313e-05, "loss": 2.1869, "mean_token_accuracy": 0.6072710052132606, "num_tokens": 9666932.0, "step": 600 }, { "epoch": 0.14138370610731255, "grad_norm": 2.6337738037109375, "learning_rate": 2.3513513513513518e-05, "loss": 2.1667, "mean_token_accuracy": 0.6101355910301208, "num_tokens": 9828101.0, "step": 610 }, { "epoch": 0.14370147178120293, "grad_norm": 3.1017556190490723, "learning_rate": 2.3899613899613903e-05, "loss": 2.1627, "mean_token_accuracy": 0.6097298428416252, "num_tokens": 9989370.0, "step": 620 }, { "epoch": 0.1460192374550933, "grad_norm": 2.5814712047576904, "learning_rate": 2.4285714285714288e-05, "loss": 2.1695, "mean_token_accuracy": 0.6110931381583213, "num_tokens": 10151295.0, "step": 630 }, { "epoch": 0.14833700312898365, "grad_norm": 3.3343594074249268, "learning_rate": 2.4671814671814673e-05, "loss": 2.1463, "mean_token_accuracy": 0.613611313700676, "num_tokens": 10312292.0, "step": 640 }, { "epoch": 0.15065476880287404, "grad_norm": 2.7114105224609375, "learning_rate": 2.505791505791506e-05, "loss": 2.1368, "mean_token_accuracy": 0.6126717463135719, "num_tokens": 10473159.0, "step": 650 }, { "epoch": 0.1529725344767644, "grad_norm": 3.4572370052337646, "learning_rate": 2.5444015444015447e-05, "loss": 2.1326, "mean_token_accuracy": 0.6155589848756791, "num_tokens": 10635118.0, "step": 660 }, { "epoch": 0.15529030015065476, "grad_norm": 2.101663589477539, "learning_rate": 2.5830115830115832e-05, "loss": 2.132, "mean_token_accuracy": 0.614087063074112, "num_tokens": 10796881.0, "step": 670 }, { "epoch": 0.15760806582454515, "grad_norm": 2.810567855834961, "learning_rate": 2.6216216216216217e-05, "loss": 2.134, "mean_token_accuracy": 0.6125252619385719, "num_tokens": 10958972.0, "step": 680 }, { "epoch": 0.1599258314984355, "grad_norm": 2.773970365524292, "learning_rate": 2.6602316602316602e-05, "loss": 2.1314, "mean_token_accuracy": 0.6144662827253342, "num_tokens": 11120824.0, "step": 690 }, { "epoch": 0.16224359717232587, "grad_norm": 3.687438488006592, "learning_rate": 2.698841698841699e-05, "loss": 2.1206, "mean_token_accuracy": 0.6151857882738113, "num_tokens": 11282419.0, "step": 700 }, { "epoch": 0.16456136284621625, "grad_norm": 3.091953992843628, "learning_rate": 2.7374517374517376e-05, "loss": 2.1281, "mean_token_accuracy": 0.6147716239094734, "num_tokens": 11444166.0, "step": 710 }, { "epoch": 0.1668791285201066, "grad_norm": 2.8364405632019043, "learning_rate": 2.7760617760617765e-05, "loss": 2.1134, "mean_token_accuracy": 0.6162003487348556, "num_tokens": 11605796.0, "step": 720 }, { "epoch": 0.169196894193997, "grad_norm": 2.6339120864868164, "learning_rate": 2.814671814671815e-05, "loss": 2.1057, "mean_token_accuracy": 0.6165803253650666, "num_tokens": 11767312.0, "step": 730 }, { "epoch": 0.17151465986788736, "grad_norm": 3.09688401222229, "learning_rate": 2.8532818532818535e-05, "loss": 2.0973, "mean_token_accuracy": 0.6162187606096268, "num_tokens": 11928797.0, "step": 740 }, { "epoch": 0.17383242554177772, "grad_norm": 2.3416366577148438, "learning_rate": 2.891891891891892e-05, "loss": 2.1158, "mean_token_accuracy": 0.6159212291240692, "num_tokens": 12088802.0, "step": 750 }, { "epoch": 0.1761501912156681, "grad_norm": 3.5401194095611572, "learning_rate": 2.9305019305019305e-05, "loss": 2.1046, "mean_token_accuracy": 0.6168718814849854, "num_tokens": 12250295.0, "step": 760 }, { "epoch": 0.17846795688955847, "grad_norm": 2.134984254837036, "learning_rate": 2.969111969111969e-05, "loss": 2.1127, "mean_token_accuracy": 0.6152158454060555, "num_tokens": 12412187.0, "step": 770 }, { "epoch": 0.18078572256344883, "grad_norm": 3.166926622390747, "learning_rate": 3.007722007722008e-05, "loss": 2.0818, "mean_token_accuracy": 0.6198478862643242, "num_tokens": 12573905.0, "step": 780 }, { "epoch": 0.1831034882373392, "grad_norm": 3.0857746601104736, "learning_rate": 3.0463320463320468e-05, "loss": 2.0924, "mean_token_accuracy": 0.6168564721941948, "num_tokens": 12735539.0, "step": 790 }, { "epoch": 0.18542125391122957, "grad_norm": 3.0504462718963623, "learning_rate": 3.084942084942085e-05, "loss": 2.0889, "mean_token_accuracy": 0.6171769425272942, "num_tokens": 12896042.0, "step": 800 }, { "epoch": 0.18773901958511993, "grad_norm": 2.2158491611480713, "learning_rate": 3.123552123552124e-05, "loss": 2.1096, "mean_token_accuracy": 0.6144651144742965, "num_tokens": 13057156.0, "step": 810 }, { "epoch": 0.19005678525901032, "grad_norm": 3.263890504837036, "learning_rate": 3.162162162162162e-05, "loss": 2.0787, "mean_token_accuracy": 0.620747198164463, "num_tokens": 13217501.0, "step": 820 }, { "epoch": 0.19237455093290068, "grad_norm": 2.383279323577881, "learning_rate": 3.200772200772201e-05, "loss": 2.0677, "mean_token_accuracy": 0.6205677390098572, "num_tokens": 13378448.0, "step": 830 }, { "epoch": 0.19469231660679107, "grad_norm": 1.9628933668136597, "learning_rate": 3.2393822393822394e-05, "loss": 2.0743, "mean_token_accuracy": 0.6199361652135849, "num_tokens": 13538952.0, "step": 840 }, { "epoch": 0.19701008228068143, "grad_norm": 2.551349401473999, "learning_rate": 3.2779922779922786e-05, "loss": 2.0788, "mean_token_accuracy": 0.6195822462439537, "num_tokens": 13699593.0, "step": 850 }, { "epoch": 0.19932784795457179, "grad_norm": 2.9565794467926025, "learning_rate": 3.316602316602317e-05, "loss": 2.0672, "mean_token_accuracy": 0.6205028340220451, "num_tokens": 13861197.0, "step": 860 }, { "epoch": 0.20164561362846217, "grad_norm": 2.1605026721954346, "learning_rate": 3.3552123552123556e-05, "loss": 2.0744, "mean_token_accuracy": 0.6191812247037888, "num_tokens": 14022860.0, "step": 870 }, { "epoch": 0.20396337930235253, "grad_norm": 2.6521265506744385, "learning_rate": 3.393822393822394e-05, "loss": 2.0606, "mean_token_accuracy": 0.6217024847865105, "num_tokens": 14183190.0, "step": 880 }, { "epoch": 0.2062811449762429, "grad_norm": 2.3585920333862305, "learning_rate": 3.4324324324324326e-05, "loss": 2.0535, "mean_token_accuracy": 0.6224743768572807, "num_tokens": 14345163.0, "step": 890 }, { "epoch": 0.20859891065013328, "grad_norm": 2.46527361869812, "learning_rate": 3.471042471042471e-05, "loss": 2.0498, "mean_token_accuracy": 0.6219742521643639, "num_tokens": 14506967.0, "step": 900 }, { "epoch": 0.21091667632402364, "grad_norm": 2.4162466526031494, "learning_rate": 3.50965250965251e-05, "loss": 2.0475, "mean_token_accuracy": 0.6220772638916969, "num_tokens": 14668521.0, "step": 910 }, { "epoch": 0.213234441997914, "grad_norm": 2.821624279022217, "learning_rate": 3.548262548262549e-05, "loss": 2.0553, "mean_token_accuracy": 0.6201831638813019, "num_tokens": 14829077.0, "step": 920 }, { "epoch": 0.21555220767180439, "grad_norm": 3.000847339630127, "learning_rate": 3.5868725868725874e-05, "loss": 2.0467, "mean_token_accuracy": 0.624513928592205, "num_tokens": 14990423.0, "step": 930 }, { "epoch": 0.21786997334569475, "grad_norm": 2.933260440826416, "learning_rate": 3.625482625482626e-05, "loss": 2.0365, "mean_token_accuracy": 0.6214005410671234, "num_tokens": 15151916.0, "step": 940 }, { "epoch": 0.22018773901958513, "grad_norm": 2.452296495437622, "learning_rate": 3.6640926640926644e-05, "loss": 2.0359, "mean_token_accuracy": 0.624190154671669, "num_tokens": 15313899.0, "step": 950 }, { "epoch": 0.2225055046934755, "grad_norm": 2.3679802417755127, "learning_rate": 3.702702702702703e-05, "loss": 2.0526, "mean_token_accuracy": 0.6225566834211349, "num_tokens": 15475435.0, "step": 960 }, { "epoch": 0.22482327036736585, "grad_norm": 2.1546075344085693, "learning_rate": 3.7413127413127414e-05, "loss": 2.0076, "mean_token_accuracy": 0.6269759923219681, "num_tokens": 15636355.0, "step": 970 }, { "epoch": 0.22714103604125624, "grad_norm": 2.518970251083374, "learning_rate": 3.77992277992278e-05, "loss": 2.0403, "mean_token_accuracy": 0.6239780411124229, "num_tokens": 15797274.0, "step": 980 }, { "epoch": 0.2294588017151466, "grad_norm": 2.788369655609131, "learning_rate": 3.818532818532819e-05, "loss": 2.0492, "mean_token_accuracy": 0.6211053118109703, "num_tokens": 15958285.0, "step": 990 }, { "epoch": 0.23177656738903696, "grad_norm": 2.3481032848358154, "learning_rate": 3.857142857142858e-05, "loss": 2.022, "mean_token_accuracy": 0.6261083573102951, "num_tokens": 16119745.0, "step": 1000 }, { "epoch": 0.23409433306292735, "grad_norm": 1.9556487798690796, "learning_rate": 3.895752895752896e-05, "loss": 2.0114, "mean_token_accuracy": 0.627188217639923, "num_tokens": 16280121.0, "step": 1010 }, { "epoch": 0.2364120987368177, "grad_norm": 1.9742565155029297, "learning_rate": 3.934362934362935e-05, "loss": 2.018, "mean_token_accuracy": 0.6237287357449531, "num_tokens": 16441289.0, "step": 1020 }, { "epoch": 0.23872986441070806, "grad_norm": 2.252917528152466, "learning_rate": 3.972972972972973e-05, "loss": 1.988, "mean_token_accuracy": 0.628923487663269, "num_tokens": 16602298.0, "step": 1030 }, { "epoch": 0.24104763008459845, "grad_norm": 2.125159740447998, "learning_rate": 4.011583011583012e-05, "loss": 2.004, "mean_token_accuracy": 0.6271463707089424, "num_tokens": 16762962.0, "step": 1040 }, { "epoch": 0.2433653957584888, "grad_norm": 2.4846150875091553, "learning_rate": 4.05019305019305e-05, "loss": 1.9965, "mean_token_accuracy": 0.6292179197072982, "num_tokens": 16924322.0, "step": 1050 }, { "epoch": 0.2456831614323792, "grad_norm": 2.258348226547241, "learning_rate": 4.0888030888030895e-05, "loss": 2.0182, "mean_token_accuracy": 0.6261778011918068, "num_tokens": 17084982.0, "step": 1060 }, { "epoch": 0.24800092710626956, "grad_norm": 3.067295551300049, "learning_rate": 4.127413127413128e-05, "loss": 2.0093, "mean_token_accuracy": 0.6274378567934036, "num_tokens": 17245771.0, "step": 1070 }, { "epoch": 0.25031869278015995, "grad_norm": 1.8076977729797363, "learning_rate": 4.1660231660231665e-05, "loss": 1.9827, "mean_token_accuracy": 0.6292496040463448, "num_tokens": 17406666.0, "step": 1080 }, { "epoch": 0.2526364584540503, "grad_norm": 2.139723777770996, "learning_rate": 4.204633204633205e-05, "loss": 2.0246, "mean_token_accuracy": 0.6262366771697998, "num_tokens": 17567246.0, "step": 1090 }, { "epoch": 0.25495422412794067, "grad_norm": 1.764849305152893, "learning_rate": 4.2432432432432435e-05, "loss": 1.9823, "mean_token_accuracy": 0.6277291804552079, "num_tokens": 17728176.0, "step": 1100 }, { "epoch": 0.257271989801831, "grad_norm": 1.870922327041626, "learning_rate": 4.281853281853282e-05, "loss": 1.982, "mean_token_accuracy": 0.6300041258335114, "num_tokens": 17889494.0, "step": 1110 }, { "epoch": 0.2595897554757214, "grad_norm": 2.440463066101074, "learning_rate": 4.3204633204633206e-05, "loss": 1.9681, "mean_token_accuracy": 0.6317617386579514, "num_tokens": 18050952.0, "step": 1120 }, { "epoch": 0.2619075211496118, "grad_norm": 2.0300073623657227, "learning_rate": 4.359073359073359e-05, "loss": 1.9929, "mean_token_accuracy": 0.6287112057209014, "num_tokens": 18212445.0, "step": 1130 }, { "epoch": 0.26422528682350216, "grad_norm": 2.5505902767181396, "learning_rate": 4.397683397683398e-05, "loss": 1.995, "mean_token_accuracy": 0.6281368225812912, "num_tokens": 18373620.0, "step": 1140 }, { "epoch": 0.2665430524973925, "grad_norm": 1.84709894657135, "learning_rate": 4.436293436293437e-05, "loss": 1.9684, "mean_token_accuracy": 0.6318689227104187, "num_tokens": 18534429.0, "step": 1150 }, { "epoch": 0.2688608181712829, "grad_norm": 2.470418930053711, "learning_rate": 4.474903474903475e-05, "loss": 1.991, "mean_token_accuracy": 0.6278605565428734, "num_tokens": 18695473.0, "step": 1160 }, { "epoch": 0.27117858384517324, "grad_norm": 1.9627760648727417, "learning_rate": 4.513513513513514e-05, "loss": 1.9826, "mean_token_accuracy": 0.6303821548819541, "num_tokens": 18857141.0, "step": 1170 }, { "epoch": 0.2734963495190636, "grad_norm": 1.7161710262298584, "learning_rate": 4.5521235521235524e-05, "loss": 1.9756, "mean_token_accuracy": 0.629201129078865, "num_tokens": 19018263.0, "step": 1180 }, { "epoch": 0.275814115192954, "grad_norm": 1.9838098287582397, "learning_rate": 4.590733590733591e-05, "loss": 1.9813, "mean_token_accuracy": 0.6299731224775315, "num_tokens": 19180408.0, "step": 1190 }, { "epoch": 0.27813188086684437, "grad_norm": 2.0741536617279053, "learning_rate": 4.6293436293436294e-05, "loss": 1.9715, "mean_token_accuracy": 0.6296976760029793, "num_tokens": 19340765.0, "step": 1200 }, { "epoch": 0.28044964654073473, "grad_norm": 1.6929048299789429, "learning_rate": 4.6679536679536686e-05, "loss": 1.9687, "mean_token_accuracy": 0.6325523748993873, "num_tokens": 19502387.0, "step": 1210 }, { "epoch": 0.2827674122146251, "grad_norm": 1.9009760618209839, "learning_rate": 4.706563706563707e-05, "loss": 1.9599, "mean_token_accuracy": 0.6326333746314049, "num_tokens": 19663402.0, "step": 1220 }, { "epoch": 0.28508517788851545, "grad_norm": 2.345518112182617, "learning_rate": 4.7451737451737456e-05, "loss": 1.9665, "mean_token_accuracy": 0.6318219244480133, "num_tokens": 19824910.0, "step": 1230 }, { "epoch": 0.28740294356240587, "grad_norm": 1.7701102495193481, "learning_rate": 4.783783783783784e-05, "loss": 1.962, "mean_token_accuracy": 0.6322108402848243, "num_tokens": 19986530.0, "step": 1240 }, { "epoch": 0.2897207092362962, "grad_norm": 1.9724475145339966, "learning_rate": 4.8223938223938227e-05, "loss": 1.9943, "mean_token_accuracy": 0.627080324292183, "num_tokens": 20147770.0, "step": 1250 }, { "epoch": 0.2920384749101866, "grad_norm": 1.5367251634597778, "learning_rate": 4.861003861003861e-05, "loss": 1.9803, "mean_token_accuracy": 0.6306385934352875, "num_tokens": 20308801.0, "step": 1260 }, { "epoch": 0.29435624058407694, "grad_norm": 1.7661594152450562, "learning_rate": 4.8996138996139e-05, "loss": 1.9674, "mean_token_accuracy": 0.6320006594061851, "num_tokens": 20470126.0, "step": 1270 }, { "epoch": 0.2966740062579673, "grad_norm": 1.8147454261779785, "learning_rate": 4.938223938223939e-05, "loss": 1.9589, "mean_token_accuracy": 0.6314522087574005, "num_tokens": 20631865.0, "step": 1280 }, { "epoch": 0.29899177193185766, "grad_norm": 3.2721054553985596, "learning_rate": 4.9768339768339774e-05, "loss": 1.9481, "mean_token_accuracy": 0.6328243985772133, "num_tokens": 20793483.0, "step": 1290 }, { "epoch": 0.3013095376057481, "grad_norm": 2.034388542175293, "learning_rate": 4.99999988732306e-05, "loss": 1.9527, "mean_token_accuracy": 0.6314484283328057, "num_tokens": 20954341.0, "step": 1300 }, { "epoch": 0.30362730327963844, "grad_norm": 1.8776136636734009, "learning_rate": 4.999998619707599e-05, "loss": 1.9239, "mean_token_accuracy": 0.6360997810959816, "num_tokens": 21114646.0, "step": 1310 }, { "epoch": 0.3059450689535288, "grad_norm": 1.5118343830108643, "learning_rate": 4.999995943631217e-05, "loss": 1.9446, "mean_token_accuracy": 0.6352547079324722, "num_tokens": 21275113.0, "step": 1320 }, { "epoch": 0.30826283462741916, "grad_norm": 1.7091580629348755, "learning_rate": 4.999991859095423e-05, "loss": 1.9352, "mean_token_accuracy": 0.6343237787485123, "num_tokens": 21436106.0, "step": 1330 }, { "epoch": 0.3105806003013095, "grad_norm": 1.7310645580291748, "learning_rate": 4.999986366102517e-05, "loss": 1.9353, "mean_token_accuracy": 0.6359612554311752, "num_tokens": 21597081.0, "step": 1340 }, { "epoch": 0.31289836597519993, "grad_norm": 1.8860507011413574, "learning_rate": 4.9999794646555955e-05, "loss": 1.9543, "mean_token_accuracy": 0.6317195475101471, "num_tokens": 21758530.0, "step": 1350 }, { "epoch": 0.3152161316490903, "grad_norm": 1.9822536706924438, "learning_rate": 4.999971154758544e-05, "loss": 1.9445, "mean_token_accuracy": 0.634228678047657, "num_tokens": 21919320.0, "step": 1360 }, { "epoch": 0.31753389732298065, "grad_norm": 2.243898391723633, "learning_rate": 4.9999614364160463e-05, "loss": 1.9227, "mean_token_accuracy": 0.6352636322379113, "num_tokens": 22081063.0, "step": 1370 }, { "epoch": 0.319851662996871, "grad_norm": 1.8585615158081055, "learning_rate": 4.999950309633577e-05, "loss": 1.938, "mean_token_accuracy": 0.6337199211120605, "num_tokens": 22242549.0, "step": 1380 }, { "epoch": 0.32216942867076137, "grad_norm": 2.4262335300445557, "learning_rate": 4.999937774417405e-05, "loss": 1.9632, "mean_token_accuracy": 0.6302136555314064, "num_tokens": 22401852.0, "step": 1390 }, { "epoch": 0.32448719434465173, "grad_norm": 1.8295425176620483, "learning_rate": 4.999923830774591e-05, "loss": 1.9357, "mean_token_accuracy": 0.6363146975636482, "num_tokens": 22562906.0, "step": 1400 }, { "epoch": 0.32680496001854215, "grad_norm": 1.7400521039962769, "learning_rate": 4.999908478712993e-05, "loss": 1.9187, "mean_token_accuracy": 0.636882996559143, "num_tokens": 22724235.0, "step": 1410 }, { "epoch": 0.3291227256924325, "grad_norm": 1.619806170463562, "learning_rate": 4.999891718241258e-05, "loss": 1.9022, "mean_token_accuracy": 0.6390747547149658, "num_tokens": 22886229.0, "step": 1420 }, { "epoch": 0.33144049136632286, "grad_norm": 1.846305251121521, "learning_rate": 4.999873549368829e-05, "loss": 1.9096, "mean_token_accuracy": 0.6382807478308677, "num_tokens": 23047898.0, "step": 1430 }, { "epoch": 0.3337582570402132, "grad_norm": 1.5274845361709595, "learning_rate": 4.9998539721059435e-05, "loss": 1.9166, "mean_token_accuracy": 0.6356395587325097, "num_tokens": 23208312.0, "step": 1440 }, { "epoch": 0.3360760227141036, "grad_norm": 1.372841238975525, "learning_rate": 4.9998329864636296e-05, "loss": 1.9104, "mean_token_accuracy": 0.6393186882138252, "num_tokens": 23369818.0, "step": 1450 }, { "epoch": 0.338393788387994, "grad_norm": 1.788949966430664, "learning_rate": 4.999810592453711e-05, "loss": 1.9145, "mean_token_accuracy": 0.636710125207901, "num_tokens": 23531038.0, "step": 1460 }, { "epoch": 0.34071155406188436, "grad_norm": 1.3993631601333618, "learning_rate": 4.999786790088803e-05, "loss": 1.8727, "mean_token_accuracy": 0.6403887659311295, "num_tokens": 23691420.0, "step": 1470 }, { "epoch": 0.3430293197357747, "grad_norm": 1.6193249225616455, "learning_rate": 4.999761579382317e-05, "loss": 1.9015, "mean_token_accuracy": 0.637080043554306, "num_tokens": 23852256.0, "step": 1480 }, { "epoch": 0.3453470854096651, "grad_norm": 1.4080313444137573, "learning_rate": 4.999734960348456e-05, "loss": 1.9, "mean_token_accuracy": 0.6401629343628883, "num_tokens": 24013271.0, "step": 1490 }, { "epoch": 0.34766485108355544, "grad_norm": 2.023298978805542, "learning_rate": 4.999706933002216e-05, "loss": 1.9016, "mean_token_accuracy": 0.6390696242451668, "num_tokens": 24174504.0, "step": 1500 }, { "epoch": 0.3499826167574458, "grad_norm": 2.1554718017578125, "learning_rate": 4.999677497359388e-05, "loss": 1.8912, "mean_token_accuracy": 0.640379510819912, "num_tokens": 24335125.0, "step": 1510 }, { "epoch": 0.3523003824313362, "grad_norm": 1.818528413772583, "learning_rate": 4.999646653436555e-05, "loss": 1.8966, "mean_token_accuracy": 0.6394944846630096, "num_tokens": 24496769.0, "step": 1520 }, { "epoch": 0.35461814810522657, "grad_norm": 1.5238648653030396, "learning_rate": 4.9996144012510935e-05, "loss": 1.8949, "mean_token_accuracy": 0.640499472618103, "num_tokens": 24657743.0, "step": 1530 }, { "epoch": 0.35693591377911693, "grad_norm": 1.4448144435882568, "learning_rate": 4.999580740821175e-05, "loss": 1.899, "mean_token_accuracy": 0.6380876749753952, "num_tokens": 24819698.0, "step": 1540 }, { "epoch": 0.3592536794530073, "grad_norm": 1.4283281564712524, "learning_rate": 4.999545672165763e-05, "loss": 1.8704, "mean_token_accuracy": 0.6409377381205559, "num_tokens": 24981423.0, "step": 1550 }, { "epoch": 0.36157144512689765, "grad_norm": 1.3715801239013672, "learning_rate": 4.999509195304615e-05, "loss": 1.8885, "mean_token_accuracy": 0.6391847386956215, "num_tokens": 25142205.0, "step": 1560 }, { "epoch": 0.36388921080078807, "grad_norm": 1.4633738994598389, "learning_rate": 4.9994713102582805e-05, "loss": 1.8988, "mean_token_accuracy": 0.640008395910263, "num_tokens": 25303140.0, "step": 1570 }, { "epoch": 0.3662069764746784, "grad_norm": 1.818905234336853, "learning_rate": 4.999432017048103e-05, "loss": 1.8783, "mean_token_accuracy": 0.6411171838641166, "num_tokens": 25463659.0, "step": 1580 }, { "epoch": 0.3685247421485688, "grad_norm": 1.7161966562271118, "learning_rate": 4.9993913156962223e-05, "loss": 1.9034, "mean_token_accuracy": 0.6370707347989082, "num_tokens": 25624444.0, "step": 1590 }, { "epoch": 0.37084250782245914, "grad_norm": 1.3690422773361206, "learning_rate": 4.999349206225567e-05, "loss": 1.8858, "mean_token_accuracy": 0.6399022758007049, "num_tokens": 25785913.0, "step": 1600 }, { "epoch": 0.3731602734963495, "grad_norm": 1.4552398920059204, "learning_rate": 4.9993056886598595e-05, "loss": 1.8825, "mean_token_accuracy": 0.6411809146404266, "num_tokens": 25946545.0, "step": 1610 }, { "epoch": 0.37547803917023986, "grad_norm": 1.4672073125839233, "learning_rate": 4.999260763023619e-05, "loss": 1.8783, "mean_token_accuracy": 0.6397744581103325, "num_tokens": 26107804.0, "step": 1620 }, { "epoch": 0.3777958048441303, "grad_norm": 1.2996481657028198, "learning_rate": 4.999214429342156e-05, "loss": 1.8884, "mean_token_accuracy": 0.6401124358177185, "num_tokens": 26267790.0, "step": 1630 }, { "epoch": 0.38011357051802064, "grad_norm": 1.3476495742797852, "learning_rate": 4.999166687641573e-05, "loss": 1.878, "mean_token_accuracy": 0.6414383992552757, "num_tokens": 26428615.0, "step": 1640 }, { "epoch": 0.382431336191911, "grad_norm": 1.2584129571914673, "learning_rate": 4.999117537948769e-05, "loss": 1.8641, "mean_token_accuracy": 0.6423551321029664, "num_tokens": 26589355.0, "step": 1650 }, { "epoch": 0.38474910186580136, "grad_norm": 1.4108871221542358, "learning_rate": 4.999066980291431e-05, "loss": 1.8632, "mean_token_accuracy": 0.6409617647528648, "num_tokens": 26750561.0, "step": 1660 }, { "epoch": 0.3870668675396917, "grad_norm": 1.2753043174743652, "learning_rate": 4.9990150146980455e-05, "loss": 1.8755, "mean_token_accuracy": 0.6409595653414726, "num_tokens": 26912718.0, "step": 1670 }, { "epoch": 0.38938463321358213, "grad_norm": 1.408045768737793, "learning_rate": 4.9989616411978865e-05, "loss": 1.8615, "mean_token_accuracy": 0.6435681149363518, "num_tokens": 27074392.0, "step": 1680 }, { "epoch": 0.3917023988874725, "grad_norm": 1.3851354122161865, "learning_rate": 4.998906859821027e-05, "loss": 1.8761, "mean_token_accuracy": 0.6401672676205635, "num_tokens": 27235662.0, "step": 1690 }, { "epoch": 0.39402016456136285, "grad_norm": 1.9568464756011963, "learning_rate": 4.998850670598326e-05, "loss": 1.875, "mean_token_accuracy": 0.641376368701458, "num_tokens": 27395781.0, "step": 1700 }, { "epoch": 0.3963379302352532, "grad_norm": 1.5728007555007935, "learning_rate": 4.998793073561443e-05, "loss": 1.867, "mean_token_accuracy": 0.6427549034357071, "num_tokens": 27557515.0, "step": 1710 }, { "epoch": 0.39865569590914357, "grad_norm": 1.4253342151641846, "learning_rate": 4.9987340687428255e-05, "loss": 1.8749, "mean_token_accuracy": 0.641077670454979, "num_tokens": 27717637.0, "step": 1720 }, { "epoch": 0.40097346158303393, "grad_norm": 1.283916711807251, "learning_rate": 4.998673656175716e-05, "loss": 1.8825, "mean_token_accuracy": 0.6409965708851815, "num_tokens": 27878142.0, "step": 1730 }, { "epoch": 0.40329122725692435, "grad_norm": 1.3802934885025024, "learning_rate": 4.9986118358941516e-05, "loss": 1.8512, "mean_token_accuracy": 0.6427737548947334, "num_tokens": 28039828.0, "step": 1740 }, { "epoch": 0.4056089929308147, "grad_norm": 1.3293272256851196, "learning_rate": 4.998548607932959e-05, "loss": 1.851, "mean_token_accuracy": 0.6422316864132881, "num_tokens": 28201516.0, "step": 1750 }, { "epoch": 0.40792675860470506, "grad_norm": 1.4651212692260742, "learning_rate": 4.998483972327761e-05, "loss": 1.8521, "mean_token_accuracy": 0.6461771473288536, "num_tokens": 28363057.0, "step": 1760 }, { "epoch": 0.4102445242785954, "grad_norm": 1.5236210823059082, "learning_rate": 4.998417929114971e-05, "loss": 1.8746, "mean_token_accuracy": 0.6410267069935799, "num_tokens": 28524187.0, "step": 1770 }, { "epoch": 0.4125622899524858, "grad_norm": 1.313223958015442, "learning_rate": 4.998350478331799e-05, "loss": 1.8544, "mean_token_accuracy": 0.6428179815411568, "num_tokens": 28683808.0, "step": 1780 }, { "epoch": 0.4148800556263762, "grad_norm": 1.4754985570907593, "learning_rate": 4.998281620016243e-05, "loss": 1.8503, "mean_token_accuracy": 0.6441743224859238, "num_tokens": 28845504.0, "step": 1790 }, { "epoch": 0.41719782130026656, "grad_norm": 1.436733603477478, "learning_rate": 4.9982113542071e-05, "loss": 1.857, "mean_token_accuracy": 0.6423680290579796, "num_tokens": 29006214.0, "step": 1800 }, { "epoch": 0.4195155869741569, "grad_norm": 1.3749688863754272, "learning_rate": 4.998139680943953e-05, "loss": 1.8428, "mean_token_accuracy": 0.6437972038984299, "num_tokens": 29167201.0, "step": 1810 }, { "epoch": 0.4218333526480473, "grad_norm": 1.4325748682022095, "learning_rate": 4.998066600267184e-05, "loss": 1.8342, "mean_token_accuracy": 0.6447791233658791, "num_tokens": 29329220.0, "step": 1820 }, { "epoch": 0.42415111832193764, "grad_norm": 1.6521497964859009, "learning_rate": 4.9979921122179654e-05, "loss": 1.8412, "mean_token_accuracy": 0.6441050633788109, "num_tokens": 29489432.0, "step": 1830 }, { "epoch": 0.426468883995828, "grad_norm": 1.292902946472168, "learning_rate": 4.997916216838261e-05, "loss": 1.8369, "mean_token_accuracy": 0.6456946566700935, "num_tokens": 29650316.0, "step": 1840 }, { "epoch": 0.4287866496697184, "grad_norm": 1.3509552478790283, "learning_rate": 4.997838914170831e-05, "loss": 1.8286, "mean_token_accuracy": 0.6450114071369171, "num_tokens": 29811427.0, "step": 1850 }, { "epoch": 0.43110441534360877, "grad_norm": 1.3762990236282349, "learning_rate": 4.997760204259226e-05, "loss": 1.842, "mean_token_accuracy": 0.6457752421498298, "num_tokens": 29972395.0, "step": 1860 }, { "epoch": 0.43342218101749913, "grad_norm": 1.1928985118865967, "learning_rate": 4.997680087147789e-05, "loss": 1.8264, "mean_token_accuracy": 0.6465903967618942, "num_tokens": 30134237.0, "step": 1870 }, { "epoch": 0.4357399466913895, "grad_norm": 1.334893822669983, "learning_rate": 4.997598562881658e-05, "loss": 1.8454, "mean_token_accuracy": 0.6438649952411651, "num_tokens": 30294604.0, "step": 1880 }, { "epoch": 0.43805771236527985, "grad_norm": 1.3709943294525146, "learning_rate": 4.997515631506763e-05, "loss": 1.8252, "mean_token_accuracy": 0.6459429666399956, "num_tokens": 30455239.0, "step": 1890 }, { "epoch": 0.44037547803917027, "grad_norm": 1.5541445016860962, "learning_rate": 4.997431293069824e-05, "loss": 1.8269, "mean_token_accuracy": 0.647772628068924, "num_tokens": 30616374.0, "step": 1900 }, { "epoch": 0.4426932437130606, "grad_norm": 1.3806477785110474, "learning_rate": 4.9973455476183575e-05, "loss": 1.8432, "mean_token_accuracy": 0.6451797544956207, "num_tokens": 30776512.0, "step": 1910 }, { "epoch": 0.445011009386951, "grad_norm": 1.2107192277908325, "learning_rate": 4.997258395200671e-05, "loss": 1.8376, "mean_token_accuracy": 0.6465131267905235, "num_tokens": 30937249.0, "step": 1920 }, { "epoch": 0.44732877506084134, "grad_norm": 1.3950155973434448, "learning_rate": 4.997169835865865e-05, "loss": 1.8253, "mean_token_accuracy": 0.648149473965168, "num_tokens": 31098501.0, "step": 1930 }, { "epoch": 0.4496465407347317, "grad_norm": 1.6820474863052368, "learning_rate": 4.9970798696638325e-05, "loss": 1.841, "mean_token_accuracy": 0.6438210234045982, "num_tokens": 31260205.0, "step": 1940 }, { "epoch": 0.45196430640862206, "grad_norm": 1.4521360397338867, "learning_rate": 4.9969884966452586e-05, "loss": 1.8365, "mean_token_accuracy": 0.6448712572455406, "num_tokens": 31420132.0, "step": 1950 }, { "epoch": 0.4542820720825125, "grad_norm": 1.2954416275024414, "learning_rate": 4.996895716861622e-05, "loss": 1.8308, "mean_token_accuracy": 0.6462687224149704, "num_tokens": 31581700.0, "step": 1960 }, { "epoch": 0.45659983775640284, "grad_norm": 1.3404178619384766, "learning_rate": 4.9968015303651914e-05, "loss": 1.8055, "mean_token_accuracy": 0.6502536803483963, "num_tokens": 31742549.0, "step": 1970 }, { "epoch": 0.4589176034302932, "grad_norm": 1.298412799835205, "learning_rate": 4.996705937209034e-05, "loss": 1.8227, "mean_token_accuracy": 0.6485438525676728, "num_tokens": 31903729.0, "step": 1980 }, { "epoch": 0.46123536910418356, "grad_norm": 1.2438616752624512, "learning_rate": 4.996608937447001e-05, "loss": 1.8295, "mean_token_accuracy": 0.6465036660432816, "num_tokens": 32064937.0, "step": 1990 }, { "epoch": 0.4635531347780739, "grad_norm": 1.3428300619125366, "learning_rate": 4.996510531133745e-05, "loss": 1.8264, "mean_token_accuracy": 0.6465498238801957, "num_tokens": 32226723.0, "step": 2000 }, { "epoch": 0.46587090045196433, "grad_norm": 1.29853093624115, "learning_rate": 4.996410718324703e-05, "loss": 1.8386, "mean_token_accuracy": 0.6440900847315788, "num_tokens": 32387037.0, "step": 2010 }, { "epoch": 0.4681886661258547, "grad_norm": 1.1978315114974976, "learning_rate": 4.99630949907611e-05, "loss": 1.7981, "mean_token_accuracy": 0.651172150671482, "num_tokens": 32548549.0, "step": 2020 }, { "epoch": 0.47050643179974505, "grad_norm": 1.3055459260940552, "learning_rate": 4.9962068734449915e-05, "loss": 1.8349, "mean_token_accuracy": 0.6453921094536781, "num_tokens": 32710488.0, "step": 2030 }, { "epoch": 0.4728241974736354, "grad_norm": 1.1710385084152222, "learning_rate": 4.996102841489164e-05, "loss": 1.8369, "mean_token_accuracy": 0.6456437513232232, "num_tokens": 32870369.0, "step": 2040 }, { "epoch": 0.47514196314752577, "grad_norm": 1.2161914110183716, "learning_rate": 4.995997403267238e-05, "loss": 1.8267, "mean_token_accuracy": 0.6461189821362495, "num_tokens": 33032000.0, "step": 2050 }, { "epoch": 0.47745972882141613, "grad_norm": 1.3030792474746704, "learning_rate": 4.9958905588386166e-05, "loss": 1.8174, "mean_token_accuracy": 0.647930084168911, "num_tokens": 33193856.0, "step": 2060 }, { "epoch": 0.47977749449530654, "grad_norm": 1.40851628780365, "learning_rate": 4.995782308263492e-05, "loss": 1.822, "mean_token_accuracy": 0.6464095324277878, "num_tokens": 33354169.0, "step": 2070 }, { "epoch": 0.4820952601691969, "grad_norm": 1.2720558643341064, "learning_rate": 4.995672651602853e-05, "loss": 1.8304, "mean_token_accuracy": 0.6451669082045555, "num_tokens": 33515954.0, "step": 2080 }, { "epoch": 0.48441302584308726, "grad_norm": 1.1803936958312988, "learning_rate": 4.9955615889184784e-05, "loss": 1.8228, "mean_token_accuracy": 0.6469815775752068, "num_tokens": 33676064.0, "step": 2090 }, { "epoch": 0.4867307915169776, "grad_norm": 1.2207764387130737, "learning_rate": 4.995449120272939e-05, "loss": 1.8261, "mean_token_accuracy": 0.6462548539042473, "num_tokens": 33837500.0, "step": 2100 }, { "epoch": 0.489048557190868, "grad_norm": 1.4788895845413208, "learning_rate": 4.995335245729597e-05, "loss": 1.8053, "mean_token_accuracy": 0.6500621229410172, "num_tokens": 33998649.0, "step": 2110 }, { "epoch": 0.4913663228647584, "grad_norm": 1.4980946779251099, "learning_rate": 4.9952199653526085e-05, "loss": 1.7987, "mean_token_accuracy": 0.6494337037205696, "num_tokens": 34160245.0, "step": 2120 }, { "epoch": 0.49368408853864876, "grad_norm": 1.224846363067627, "learning_rate": 4.9951032792069194e-05, "loss": 1.8325, "mean_token_accuracy": 0.6463965609669685, "num_tokens": 34321806.0, "step": 2130 }, { "epoch": 0.4960018542125391, "grad_norm": 1.194200038909912, "learning_rate": 4.9949851873582714e-05, "loss": 1.8047, "mean_token_accuracy": 0.64946728348732, "num_tokens": 34483411.0, "step": 2140 }, { "epoch": 0.4983196198864295, "grad_norm": 1.3484511375427246, "learning_rate": 4.994865689873194e-05, "loss": 1.8119, "mean_token_accuracy": 0.6480161309242248, "num_tokens": 34643124.0, "step": 2150 }, { "epoch": 0.5006373855603199, "grad_norm": 1.330817461013794, "learning_rate": 4.99474478681901e-05, "loss": 1.8036, "mean_token_accuracy": 0.6494914516806602, "num_tokens": 34804919.0, "step": 2160 }, { "epoch": 0.5029551512342102, "grad_norm": 1.2171236276626587, "learning_rate": 4.9946224782638343e-05, "loss": 1.7926, "mean_token_accuracy": 0.6504859492182732, "num_tokens": 34966752.0, "step": 2170 }, { "epoch": 0.5052729169081006, "grad_norm": 1.4082651138305664, "learning_rate": 4.994498764276575e-05, "loss": 1.8037, "mean_token_accuracy": 0.6486766472458839, "num_tokens": 35126765.0, "step": 2180 }, { "epoch": 0.5075906825819909, "grad_norm": 1.6300028562545776, "learning_rate": 4.99437364492693e-05, "loss": 1.7927, "mean_token_accuracy": 0.6507325500249863, "num_tokens": 35287430.0, "step": 2190 }, { "epoch": 0.5099084482558813, "grad_norm": 1.142637848854065, "learning_rate": 4.994247120285388e-05, "loss": 1.7898, "mean_token_accuracy": 0.6521765917539597, "num_tokens": 35449458.0, "step": 2200 }, { "epoch": 0.5122262139297717, "grad_norm": 1.1656745672225952, "learning_rate": 4.9941191904232334e-05, "loss": 1.7931, "mean_token_accuracy": 0.6513628751039505, "num_tokens": 35610854.0, "step": 2210 }, { "epoch": 0.514543979603662, "grad_norm": 1.243106484413147, "learning_rate": 4.9939898554125384e-05, "loss": 1.7951, "mean_token_accuracy": 0.6502505838871002, "num_tokens": 35772077.0, "step": 2220 }, { "epoch": 0.5168617452775525, "grad_norm": 1.2600574493408203, "learning_rate": 4.993859115326169e-05, "loss": 1.801, "mean_token_accuracy": 0.6496979624032975, "num_tokens": 35933656.0, "step": 2230 }, { "epoch": 0.5191795109514428, "grad_norm": 1.2119355201721191, "learning_rate": 4.993726970237783e-05, "loss": 1.8, "mean_token_accuracy": 0.6506540104746819, "num_tokens": 36095306.0, "step": 2240 }, { "epoch": 0.5214972766253332, "grad_norm": 1.0696052312850952, "learning_rate": 4.993593420221828e-05, "loss": 1.792, "mean_token_accuracy": 0.6503913417458534, "num_tokens": 36256309.0, "step": 2250 }, { "epoch": 0.5238150422992236, "grad_norm": 1.1840111017227173, "learning_rate": 4.993458465353543e-05, "loss": 1.7984, "mean_token_accuracy": 0.6502600029110909, "num_tokens": 36417026.0, "step": 2260 }, { "epoch": 0.5261328079731139, "grad_norm": 1.2211843729019165, "learning_rate": 4.993322105708961e-05, "loss": 1.8064, "mean_token_accuracy": 0.6502588808536529, "num_tokens": 36578418.0, "step": 2270 }, { "epoch": 0.5284505736470043, "grad_norm": 1.2912896871566772, "learning_rate": 4.993184341364905e-05, "loss": 1.7997, "mean_token_accuracy": 0.6493040397763252, "num_tokens": 36739789.0, "step": 2280 }, { "epoch": 0.5307683393208946, "grad_norm": 1.2933979034423828, "learning_rate": 4.993045172398988e-05, "loss": 1.7881, "mean_token_accuracy": 0.6514629051089287, "num_tokens": 36900407.0, "step": 2290 }, { "epoch": 0.533086104994785, "grad_norm": 1.3533680438995361, "learning_rate": 4.9929045988896165e-05, "loss": 1.8011, "mean_token_accuracy": 0.6488009244203568, "num_tokens": 37061232.0, "step": 2300 }, { "epoch": 0.5354038706686755, "grad_norm": 1.3179523944854736, "learning_rate": 4.992762620915988e-05, "loss": 1.7909, "mean_token_accuracy": 0.649954816699028, "num_tokens": 37222828.0, "step": 2310 }, { "epoch": 0.5377216363425658, "grad_norm": 1.233285665512085, "learning_rate": 4.99261923855809e-05, "loss": 1.7979, "mean_token_accuracy": 0.6499597236514092, "num_tokens": 37384792.0, "step": 2320 }, { "epoch": 0.5400394020164562, "grad_norm": 1.2906845808029175, "learning_rate": 4.992474451896702e-05, "loss": 1.7989, "mean_token_accuracy": 0.6512679532170296, "num_tokens": 37546517.0, "step": 2330 }, { "epoch": 0.5423571676903465, "grad_norm": 1.1491575241088867, "learning_rate": 4.992328261013395e-05, "loss": 1.7931, "mean_token_accuracy": 0.6500511258840561, "num_tokens": 37707581.0, "step": 2340 }, { "epoch": 0.5446749333642369, "grad_norm": 1.1850157976150513, "learning_rate": 4.9921806659905294e-05, "loss": 1.8026, "mean_token_accuracy": 0.6490116715431213, "num_tokens": 37869206.0, "step": 2350 }, { "epoch": 0.5469926990381272, "grad_norm": 1.1337497234344482, "learning_rate": 4.992031666911259e-05, "loss": 1.7779, "mean_token_accuracy": 0.6530553370714187, "num_tokens": 38030619.0, "step": 2360 }, { "epoch": 0.5493104647120176, "grad_norm": 1.3335121870040894, "learning_rate": 4.991881263859528e-05, "loss": 1.7842, "mean_token_accuracy": 0.6514137059450149, "num_tokens": 38192360.0, "step": 2370 }, { "epoch": 0.551628230385908, "grad_norm": 1.3550134897232056, "learning_rate": 4.9917294569200714e-05, "loss": 1.7853, "mean_token_accuracy": 0.6505600795149803, "num_tokens": 38354232.0, "step": 2380 }, { "epoch": 0.5539459960597983, "grad_norm": 1.1735466718673706, "learning_rate": 4.991576246178413e-05, "loss": 1.7941, "mean_token_accuracy": 0.6508138298988342, "num_tokens": 38516000.0, "step": 2390 }, { "epoch": 0.5562637617336887, "grad_norm": 1.2093802690505981, "learning_rate": 4.991421631720872e-05, "loss": 1.8107, "mean_token_accuracy": 0.6488280117511749, "num_tokens": 38678051.0, "step": 2400 }, { "epoch": 0.558581527407579, "grad_norm": 1.109484076499939, "learning_rate": 4.991265613634553e-05, "loss": 1.7563, "mean_token_accuracy": 0.6549599155783653, "num_tokens": 38839021.0, "step": 2410 }, { "epoch": 0.5608992930814695, "grad_norm": 1.2432451248168945, "learning_rate": 4.9911081920073566e-05, "loss": 1.7801, "mean_token_accuracy": 0.6502027124166488, "num_tokens": 39000390.0, "step": 2420 }, { "epoch": 0.5632170587553599, "grad_norm": 1.2306640148162842, "learning_rate": 4.99094936692797e-05, "loss": 1.7621, "mean_token_accuracy": 0.6544852972030639, "num_tokens": 39161210.0, "step": 2430 }, { "epoch": 0.5655348244292502, "grad_norm": 1.2192769050598145, "learning_rate": 4.990789138485874e-05, "loss": 1.7775, "mean_token_accuracy": 0.651050665974617, "num_tokens": 39322629.0, "step": 2440 }, { "epoch": 0.5678525901031406, "grad_norm": 1.290901780128479, "learning_rate": 4.9906275067713384e-05, "loss": 1.7676, "mean_token_accuracy": 0.652652969956398, "num_tokens": 39482324.0, "step": 2450 }, { "epoch": 0.5701703557770309, "grad_norm": 1.2228672504425049, "learning_rate": 4.9904644718754236e-05, "loss": 1.7902, "mean_token_accuracy": 0.6513749271631241, "num_tokens": 39643111.0, "step": 2460 }, { "epoch": 0.5724881214509213, "grad_norm": 1.0864702463150024, "learning_rate": 4.990300033889982e-05, "loss": 1.771, "mean_token_accuracy": 0.6528406798839569, "num_tokens": 39804459.0, "step": 2470 }, { "epoch": 0.5748058871248117, "grad_norm": 1.1518689393997192, "learning_rate": 4.990134192907654e-05, "loss": 1.7838, "mean_token_accuracy": 0.6524649575352669, "num_tokens": 39965834.0, "step": 2480 }, { "epoch": 0.577123652798702, "grad_norm": 1.297217845916748, "learning_rate": 4.9899669490218734e-05, "loss": 1.7737, "mean_token_accuracy": 0.6526654109358787, "num_tokens": 40126521.0, "step": 2490 }, { "epoch": 0.5794414184725925, "grad_norm": 1.1065510511398315, "learning_rate": 4.989798302326862e-05, "loss": 1.7635, "mean_token_accuracy": 0.6538629919290543, "num_tokens": 40287616.0, "step": 2500 }, { "epoch": 0.5817591841464828, "grad_norm": 1.0550730228424072, "learning_rate": 4.989628252917633e-05, "loss": 1.7795, "mean_token_accuracy": 0.6509060606360435, "num_tokens": 40449406.0, "step": 2510 }, { "epoch": 0.5840769498203732, "grad_norm": 1.3630030155181885, "learning_rate": 4.98945680088999e-05, "loss": 1.7751, "mean_token_accuracy": 0.6543081298470497, "num_tokens": 40610974.0, "step": 2520 }, { "epoch": 0.5863947154942636, "grad_norm": 1.1745381355285645, "learning_rate": 4.989283946340525e-05, "loss": 1.7561, "mean_token_accuracy": 0.6549885541200637, "num_tokens": 40772657.0, "step": 2530 }, { "epoch": 0.5887124811681539, "grad_norm": 1.0969319343566895, "learning_rate": 4.9891096893666234e-05, "loss": 1.7779, "mean_token_accuracy": 0.651214775443077, "num_tokens": 40933754.0, "step": 2540 }, { "epoch": 0.5910302468420443, "grad_norm": 1.1451414823532104, "learning_rate": 4.9889340300664576e-05, "loss": 1.7868, "mean_token_accuracy": 0.6512230083346366, "num_tokens": 41095050.0, "step": 2550 }, { "epoch": 0.5933480125159346, "grad_norm": 1.1274573802947998, "learning_rate": 4.9887569685389925e-05, "loss": 1.7677, "mean_token_accuracy": 0.6547203674912453, "num_tokens": 41256385.0, "step": 2560 }, { "epoch": 0.595665778189825, "grad_norm": 1.1763322353363037, "learning_rate": 4.9885785048839806e-05, "loss": 1.7642, "mean_token_accuracy": 0.6561498239636421, "num_tokens": 41417950.0, "step": 2570 }, { "epoch": 0.5979835438637153, "grad_norm": 1.0554320812225342, "learning_rate": 4.988398639201967e-05, "loss": 1.759, "mean_token_accuracy": 0.653768227994442, "num_tokens": 41579436.0, "step": 2580 }, { "epoch": 0.6003013095376057, "grad_norm": 1.0405648946762085, "learning_rate": 4.9882173715942836e-05, "loss": 1.7517, "mean_token_accuracy": 0.6555889502167702, "num_tokens": 41741307.0, "step": 2590 }, { "epoch": 0.6026190752114962, "grad_norm": 1.0603960752487183, "learning_rate": 4.9880347021630544e-05, "loss": 1.7752, "mean_token_accuracy": 0.6523320704698563, "num_tokens": 41902551.0, "step": 2600 }, { "epoch": 0.6049368408853865, "grad_norm": 1.3559843301773071, "learning_rate": 4.987850631011194e-05, "loss": 1.7823, "mean_token_accuracy": 0.6516653686761856, "num_tokens": 42063965.0, "step": 2610 }, { "epoch": 0.6072546065592769, "grad_norm": 1.1409677267074585, "learning_rate": 4.987665158242403e-05, "loss": 1.7688, "mean_token_accuracy": 0.6540316551923752, "num_tokens": 42225294.0, "step": 2620 }, { "epoch": 0.6095723722331672, "grad_norm": 1.0073792934417725, "learning_rate": 4.987478283961176e-05, "loss": 1.7872, "mean_token_accuracy": 0.6507670432329178, "num_tokens": 42386866.0, "step": 2630 }, { "epoch": 0.6118901379070576, "grad_norm": 1.1738861799240112, "learning_rate": 4.9872900082727934e-05, "loss": 1.759, "mean_token_accuracy": 0.653783792257309, "num_tokens": 42546750.0, "step": 2640 }, { "epoch": 0.614207903580948, "grad_norm": 1.0095278024673462, "learning_rate": 4.987100331283328e-05, "loss": 1.7511, "mean_token_accuracy": 0.6556258171796798, "num_tokens": 42707971.0, "step": 2650 }, { "epoch": 0.6165256692548383, "grad_norm": 1.0292046070098877, "learning_rate": 4.986909253099641e-05, "loss": 1.7822, "mean_token_accuracy": 0.6523023322224617, "num_tokens": 42869177.0, "step": 2660 }, { "epoch": 0.6188434349287287, "grad_norm": 1.0657143592834473, "learning_rate": 4.986716773829381e-05, "loss": 1.7509, "mean_token_accuracy": 0.655777807533741, "num_tokens": 43029732.0, "step": 2670 }, { "epoch": 0.621161200602619, "grad_norm": 1.061148762702942, "learning_rate": 4.98652289358099e-05, "loss": 1.7789, "mean_token_accuracy": 0.6514153137803078, "num_tokens": 43190764.0, "step": 2680 }, { "epoch": 0.6234789662765094, "grad_norm": 0.9902697205543518, "learning_rate": 4.986327612463696e-05, "loss": 1.7388, "mean_token_accuracy": 0.6562409579753876, "num_tokens": 43351789.0, "step": 2690 }, { "epoch": 0.6257967319503999, "grad_norm": 1.3972876071929932, "learning_rate": 4.986130930587519e-05, "loss": 1.7758, "mean_token_accuracy": 0.6521974816918373, "num_tokens": 43512946.0, "step": 2700 }, { "epoch": 0.6281144976242902, "grad_norm": 1.178807020187378, "learning_rate": 4.9859328480632646e-05, "loss": 1.7516, "mean_token_accuracy": 0.6554417118430138, "num_tokens": 43674499.0, "step": 2710 }, { "epoch": 0.6304322632981806, "grad_norm": 1.073844075202942, "learning_rate": 4.98573336500253e-05, "loss": 1.7574, "mean_token_accuracy": 0.6526891931891441, "num_tokens": 43832677.0, "step": 2720 }, { "epoch": 0.6327500289720709, "grad_norm": 1.0595623254776, "learning_rate": 4.9855324815177016e-05, "loss": 1.7606, "mean_token_accuracy": 0.6553361788392067, "num_tokens": 43994236.0, "step": 2730 }, { "epoch": 0.6350677946459613, "grad_norm": 1.2667583227157593, "learning_rate": 4.9853301977219535e-05, "loss": 1.7578, "mean_token_accuracy": 0.655476139485836, "num_tokens": 44155677.0, "step": 2740 }, { "epoch": 0.6373855603198517, "grad_norm": 1.0695858001708984, "learning_rate": 4.985126513729249e-05, "loss": 1.7638, "mean_token_accuracy": 0.6536165803670884, "num_tokens": 44317330.0, "step": 2750 }, { "epoch": 0.639703325993742, "grad_norm": 1.1521931886672974, "learning_rate": 4.984921429654341e-05, "loss": 1.7608, "mean_token_accuracy": 0.654318344593048, "num_tokens": 44479064.0, "step": 2760 }, { "epoch": 0.6420210916676324, "grad_norm": 1.0316860675811768, "learning_rate": 4.984714945612771e-05, "loss": 1.7577, "mean_token_accuracy": 0.6551008567214012, "num_tokens": 44641329.0, "step": 2770 }, { "epoch": 0.6443388573415227, "grad_norm": 1.0926828384399414, "learning_rate": 4.984507061720869e-05, "loss": 1.7468, "mean_token_accuracy": 0.6553538665175438, "num_tokens": 44802911.0, "step": 2780 }, { "epoch": 0.6466566230154132, "grad_norm": 1.084883451461792, "learning_rate": 4.984297778095752e-05, "loss": 1.7223, "mean_token_accuracy": 0.6591030791401863, "num_tokens": 44963914.0, "step": 2790 }, { "epoch": 0.6489743886893035, "grad_norm": 1.0893343687057495, "learning_rate": 4.98408709485533e-05, "loss": 1.7585, "mean_token_accuracy": 0.6552063181996346, "num_tokens": 45125220.0, "step": 2800 }, { "epoch": 0.6512921543631939, "grad_norm": 1.0550068616867065, "learning_rate": 4.983875012118296e-05, "loss": 1.7711, "mean_token_accuracy": 0.6517155855894089, "num_tokens": 45286884.0, "step": 2810 }, { "epoch": 0.6536099200370843, "grad_norm": 1.3651378154754639, "learning_rate": 4.9836615300041355e-05, "loss": 1.7333, "mean_token_accuracy": 0.6563895136117935, "num_tokens": 45448646.0, "step": 2820 }, { "epoch": 0.6559276857109746, "grad_norm": 1.0840108394622803, "learning_rate": 4.98344664863312e-05, "loss": 1.7623, "mean_token_accuracy": 0.6538578152656556, "num_tokens": 45610441.0, "step": 2830 }, { "epoch": 0.658245451384865, "grad_norm": 0.9841294884681702, "learning_rate": 4.9832303681263126e-05, "loss": 1.7584, "mean_token_accuracy": 0.6540656849741936, "num_tokens": 45772514.0, "step": 2840 }, { "epoch": 0.6605632170587553, "grad_norm": 1.1170574426651, "learning_rate": 4.9830126886055606e-05, "loss": 1.7553, "mean_token_accuracy": 0.6544242754578591, "num_tokens": 45934613.0, "step": 2850 }, { "epoch": 0.6628809827326457, "grad_norm": 1.2113828659057617, "learning_rate": 4.982793610193501e-05, "loss": 1.7623, "mean_token_accuracy": 0.6534338384866715, "num_tokens": 46094698.0, "step": 2860 }, { "epoch": 0.6651987484065361, "grad_norm": 1.053471565246582, "learning_rate": 4.982573133013561e-05, "loss": 1.7635, "mean_token_accuracy": 0.6529383212327957, "num_tokens": 46256231.0, "step": 2870 }, { "epoch": 0.6675165140804264, "grad_norm": 1.108306884765625, "learning_rate": 4.9823512571899523e-05, "loss": 1.7461, "mean_token_accuracy": 0.6563647076487541, "num_tokens": 46417186.0, "step": 2880 }, { "epoch": 0.6698342797543169, "grad_norm": 1.1083530187606812, "learning_rate": 4.982127982847677e-05, "loss": 1.7422, "mean_token_accuracy": 0.6558579817414284, "num_tokens": 46577821.0, "step": 2890 }, { "epoch": 0.6721520454282072, "grad_norm": 1.027117133140564, "learning_rate": 4.981903310112525e-05, "loss": 1.7368, "mean_token_accuracy": 0.6547025322914124, "num_tokens": 46739265.0, "step": 2900 }, { "epoch": 0.6744698111020976, "grad_norm": 1.0534621477127075, "learning_rate": 4.981677239111073e-05, "loss": 1.741, "mean_token_accuracy": 0.6552992403507233, "num_tokens": 46900848.0, "step": 2910 }, { "epoch": 0.676787576775988, "grad_norm": 1.050523281097412, "learning_rate": 4.981449769970686e-05, "loss": 1.7421, "mean_token_accuracy": 0.656470088660717, "num_tokens": 47061575.0, "step": 2920 }, { "epoch": 0.6791053424498783, "grad_norm": 1.2064077854156494, "learning_rate": 4.981220902819516e-05, "loss": 1.7328, "mean_token_accuracy": 0.6562667399644851, "num_tokens": 47222558.0, "step": 2930 }, { "epoch": 0.6814231081237687, "grad_norm": 1.0565195083618164, "learning_rate": 4.9809906377865045e-05, "loss": 1.7538, "mean_token_accuracy": 0.6542763963341713, "num_tokens": 47383767.0, "step": 2940 }, { "epoch": 0.683740873797659, "grad_norm": 1.0721921920776367, "learning_rate": 4.9807589750013784e-05, "loss": 1.7498, "mean_token_accuracy": 0.6544047385454178, "num_tokens": 47544832.0, "step": 2950 }, { "epoch": 0.6860586394715494, "grad_norm": 1.0613431930541992, "learning_rate": 4.9805259145946525e-05, "loss": 1.7597, "mean_token_accuracy": 0.6541865423321724, "num_tokens": 47705924.0, "step": 2960 }, { "epoch": 0.6883764051454399, "grad_norm": 1.1610240936279297, "learning_rate": 4.9802914566976305e-05, "loss": 1.7432, "mean_token_accuracy": 0.655061948299408, "num_tokens": 47867191.0, "step": 2970 }, { "epoch": 0.6906941708193302, "grad_norm": 1.1195889711380005, "learning_rate": 4.9800556014424014e-05, "loss": 1.737, "mean_token_accuracy": 0.6570713475346566, "num_tokens": 48028318.0, "step": 2980 }, { "epoch": 0.6930119364932206, "grad_norm": 1.1215170621871948, "learning_rate": 4.9798183489618444e-05, "loss": 1.7402, "mean_token_accuracy": 0.656301173567772, "num_tokens": 48189550.0, "step": 2990 }, { "epoch": 0.6953297021671109, "grad_norm": 1.0468863248825073, "learning_rate": 4.979579699389621e-05, "loss": 1.7228, "mean_token_accuracy": 0.657524025440216, "num_tokens": 48350983.0, "step": 3000 }, { "epoch": 0.6976474678410013, "grad_norm": 0.9623467922210693, "learning_rate": 4.9793396528601853e-05, "loss": 1.7423, "mean_token_accuracy": 0.6556651800870895, "num_tokens": 48511939.0, "step": 3010 }, { "epoch": 0.6999652335148916, "grad_norm": 1.0096664428710938, "learning_rate": 4.9790982095087745e-05, "loss": 1.7398, "mean_token_accuracy": 0.6562199622392655, "num_tokens": 48672993.0, "step": 3020 }, { "epoch": 0.702282999188782, "grad_norm": 1.0396407842636108, "learning_rate": 4.978855369471415e-05, "loss": 1.7455, "mean_token_accuracy": 0.6562417715787887, "num_tokens": 48834131.0, "step": 3030 }, { "epoch": 0.7046007648626724, "grad_norm": 1.1060670614242554, "learning_rate": 4.9786111328849176e-05, "loss": 1.7227, "mean_token_accuracy": 0.6588175401091576, "num_tokens": 48993810.0, "step": 3040 }, { "epoch": 0.7069185305365627, "grad_norm": 1.1163461208343506, "learning_rate": 4.978365499886883e-05, "loss": 1.7421, "mean_token_accuracy": 0.656818887591362, "num_tokens": 49155551.0, "step": 3050 }, { "epoch": 0.7092362962104531, "grad_norm": 1.056733250617981, "learning_rate": 4.9781184706156955e-05, "loss": 1.7318, "mean_token_accuracy": 0.6592550337314605, "num_tokens": 49317346.0, "step": 3060 }, { "epoch": 0.7115540618843434, "grad_norm": 1.0052881240844727, "learning_rate": 4.97787004521053e-05, "loss": 1.7419, "mean_token_accuracy": 0.6572806775569916, "num_tokens": 49478986.0, "step": 3070 }, { "epoch": 0.7138718275582339, "grad_norm": 1.0049275159835815, "learning_rate": 4.9776202238113435e-05, "loss": 1.7295, "mean_token_accuracy": 0.6585634425282478, "num_tokens": 49640482.0, "step": 3080 }, { "epoch": 0.7161895932321243, "grad_norm": 1.0569453239440918, "learning_rate": 4.977369006558882e-05, "loss": 1.7504, "mean_token_accuracy": 0.6552115693688393, "num_tokens": 49801964.0, "step": 3090 }, { "epoch": 0.7185073589060146, "grad_norm": 1.0047682523727417, "learning_rate": 4.9771163935946776e-05, "loss": 1.753, "mean_token_accuracy": 0.6554709360003471, "num_tokens": 49963195.0, "step": 3100 }, { "epoch": 0.720825124579905, "grad_norm": 1.0652652978897095, "learning_rate": 4.976862385061048e-05, "loss": 1.731, "mean_token_accuracy": 0.6564961552619935, "num_tokens": 50124451.0, "step": 3110 }, { "epoch": 0.7231428902537953, "grad_norm": 1.067233920097351, "learning_rate": 4.9766069811010994e-05, "loss": 1.7433, "mean_token_accuracy": 0.657318389415741, "num_tokens": 50286162.0, "step": 3120 }, { "epoch": 0.7254606559276857, "grad_norm": 1.0034873485565186, "learning_rate": 4.976350181858721e-05, "loss": 1.7412, "mean_token_accuracy": 0.6560275480151176, "num_tokens": 50446844.0, "step": 3130 }, { "epoch": 0.7277784216015761, "grad_norm": 1.0705958604812622, "learning_rate": 4.9760919874785906e-05, "loss": 1.74, "mean_token_accuracy": 0.6557386577129364, "num_tokens": 50608660.0, "step": 3140 }, { "epoch": 0.7300961872754664, "grad_norm": 1.0406018495559692, "learning_rate": 4.975832398106169e-05, "loss": 1.7438, "mean_token_accuracy": 0.6558197692036629, "num_tokens": 50770464.0, "step": 3150 }, { "epoch": 0.7324139529493569, "grad_norm": 1.0642038583755493, "learning_rate": 4.9755714138877076e-05, "loss": 1.7405, "mean_token_accuracy": 0.6566336616873741, "num_tokens": 50930939.0, "step": 3160 }, { "epoch": 0.7347317186232472, "grad_norm": 1.1188726425170898, "learning_rate": 4.975309034970238e-05, "loss": 1.7159, "mean_token_accuracy": 0.6580009996891022, "num_tokens": 51092146.0, "step": 3170 }, { "epoch": 0.7370494842971376, "grad_norm": 0.9789296984672546, "learning_rate": 4.975045261501583e-05, "loss": 1.741, "mean_token_accuracy": 0.654839476943016, "num_tokens": 51253106.0, "step": 3180 }, { "epoch": 0.739367249971028, "grad_norm": 1.0472869873046875, "learning_rate": 4.9747800936303476e-05, "loss": 1.7385, "mean_token_accuracy": 0.6587568700313569, "num_tokens": 51414897.0, "step": 3190 }, { "epoch": 0.7416850156449183, "grad_norm": 1.0026291608810425, "learning_rate": 4.974513531505922e-05, "loss": 1.7408, "mean_token_accuracy": 0.6558206290006637, "num_tokens": 51575337.0, "step": 3200 }, { "epoch": 0.7440027813188087, "grad_norm": 1.103634238243103, "learning_rate": 4.9742455752784855e-05, "loss": 1.7017, "mean_token_accuracy": 0.661705331504345, "num_tokens": 51737114.0, "step": 3210 }, { "epoch": 0.746320546992699, "grad_norm": 1.0470640659332275, "learning_rate": 4.9739762250989995e-05, "loss": 1.7258, "mean_token_accuracy": 0.657838049530983, "num_tokens": 51897053.0, "step": 3220 }, { "epoch": 0.7486383126665894, "grad_norm": 1.102329134941101, "learning_rate": 4.973705481119212e-05, "loss": 1.6974, "mean_token_accuracy": 0.6620575085282325, "num_tokens": 52058509.0, "step": 3230 }, { "epoch": 0.7509560783404797, "grad_norm": 1.0934652090072632, "learning_rate": 4.973433343491655e-05, "loss": 1.7368, "mean_token_accuracy": 0.6568454846739769, "num_tokens": 52220537.0, "step": 3240 }, { "epoch": 0.7532738440143701, "grad_norm": 1.0909183025360107, "learning_rate": 4.9731598123696475e-05, "loss": 1.7263, "mean_token_accuracy": 0.6597494572401047, "num_tokens": 52382083.0, "step": 3250 }, { "epoch": 0.7555916096882606, "grad_norm": 0.9793383479118347, "learning_rate": 4.9728848879072926e-05, "loss": 1.7261, "mean_token_accuracy": 0.6593211308121681, "num_tokens": 52542950.0, "step": 3260 }, { "epoch": 0.7579093753621509, "grad_norm": 1.0654027462005615, "learning_rate": 4.9726085702594786e-05, "loss": 1.7303, "mean_token_accuracy": 0.6586936429142952, "num_tokens": 52704156.0, "step": 3270 }, { "epoch": 0.7602271410360413, "grad_norm": 0.9579721689224243, "learning_rate": 4.9723308595818785e-05, "loss": 1.7036, "mean_token_accuracy": 0.6599122405052185, "num_tokens": 52865776.0, "step": 3280 }, { "epoch": 0.7625449067099316, "grad_norm": 1.0924031734466553, "learning_rate": 4.972051756030951e-05, "loss": 1.7367, "mean_token_accuracy": 0.6576773419976234, "num_tokens": 53026281.0, "step": 3290 }, { "epoch": 0.764862672383822, "grad_norm": 1.0000312328338623, "learning_rate": 4.9717712597639376e-05, "loss": 1.7414, "mean_token_accuracy": 0.6564709693193436, "num_tokens": 53188313.0, "step": 3300 }, { "epoch": 0.7671804380577124, "grad_norm": 1.0209730863571167, "learning_rate": 4.9714893709388665e-05, "loss": 1.7234, "mean_token_accuracy": 0.6582284942269325, "num_tokens": 53349826.0, "step": 3310 }, { "epoch": 0.7694982037316027, "grad_norm": 1.151604413986206, "learning_rate": 4.971206089714548e-05, "loss": 1.7208, "mean_token_accuracy": 0.6588966816663742, "num_tokens": 53511517.0, "step": 3320 }, { "epoch": 0.7718159694054931, "grad_norm": 1.0564467906951904, "learning_rate": 4.970921416250581e-05, "loss": 1.7294, "mean_token_accuracy": 0.657782818377018, "num_tokens": 53673016.0, "step": 3330 }, { "epoch": 0.7741337350793834, "grad_norm": 1.0037518739700317, "learning_rate": 4.970635350707344e-05, "loss": 1.72, "mean_token_accuracy": 0.6602640211582184, "num_tokens": 53833021.0, "step": 3340 }, { "epoch": 0.7764515007532738, "grad_norm": 1.0817556381225586, "learning_rate": 4.970347893246003e-05, "loss": 1.7201, "mean_token_accuracy": 0.6596139445900917, "num_tokens": 53994753.0, "step": 3350 }, { "epoch": 0.7787692664271643, "grad_norm": 1.0298349857330322, "learning_rate": 4.9700590440285065e-05, "loss": 1.7155, "mean_token_accuracy": 0.6571404695510864, "num_tokens": 54155856.0, "step": 3360 }, { "epoch": 0.7810870321010546, "grad_norm": 1.1728031635284424, "learning_rate": 4.969768803217588e-05, "loss": 1.7239, "mean_token_accuracy": 0.6582241475582122, "num_tokens": 54316444.0, "step": 3370 }, { "epoch": 0.783404797774945, "grad_norm": 1.0229969024658203, "learning_rate": 4.969477170976765e-05, "loss": 1.7347, "mean_token_accuracy": 0.6567343845963478, "num_tokens": 54478220.0, "step": 3380 }, { "epoch": 0.7857225634488353, "grad_norm": 0.9726830124855042, "learning_rate": 4.969184147470338e-05, "loss": 1.6941, "mean_token_accuracy": 0.661408805847168, "num_tokens": 54639881.0, "step": 3390 }, { "epoch": 0.7880403291227257, "grad_norm": 1.0143760442733765, "learning_rate": 4.968889732863393e-05, "loss": 1.713, "mean_token_accuracy": 0.6587746948003769, "num_tokens": 54801544.0, "step": 3400 }, { "epoch": 0.7903580947966161, "grad_norm": 0.9625095725059509, "learning_rate": 4.9685939273217977e-05, "loss": 1.7282, "mean_token_accuracy": 0.6576461717486382, "num_tokens": 54963360.0, "step": 3410 }, { "epoch": 0.7926758604705064, "grad_norm": 1.0391517877578735, "learning_rate": 4.968296731012205e-05, "loss": 1.7263, "mean_token_accuracy": 0.6561090916395187, "num_tokens": 55123845.0, "step": 3420 }, { "epoch": 0.7949936261443968, "grad_norm": 1.1197253465652466, "learning_rate": 4.9679981441020504e-05, "loss": 1.7219, "mean_token_accuracy": 0.6562957033514977, "num_tokens": 55284219.0, "step": 3430 }, { "epoch": 0.7973113918182871, "grad_norm": 1.166800856590271, "learning_rate": 4.967698166759552e-05, "loss": 1.7278, "mean_token_accuracy": 0.6561817929148674, "num_tokens": 55445119.0, "step": 3440 }, { "epoch": 0.7996291574921776, "grad_norm": 1.0956605672836304, "learning_rate": 4.967396799153715e-05, "loss": 1.7193, "mean_token_accuracy": 0.6575034201145172, "num_tokens": 55607091.0, "step": 3450 }, { "epoch": 0.8019469231660679, "grad_norm": 0.9844366908073425, "learning_rate": 4.9670940414543246e-05, "loss": 1.7166, "mean_token_accuracy": 0.6587544068694114, "num_tokens": 55767355.0, "step": 3460 }, { "epoch": 0.8042646888399583, "grad_norm": 1.06368088722229, "learning_rate": 4.966789893831948e-05, "loss": 1.7101, "mean_token_accuracy": 0.6592752948403359, "num_tokens": 55928300.0, "step": 3470 }, { "epoch": 0.8065824545138487, "grad_norm": 1.2256017923355103, "learning_rate": 4.9664843564579386e-05, "loss": 1.7268, "mean_token_accuracy": 0.6578044950962066, "num_tokens": 56090100.0, "step": 3480 }, { "epoch": 0.808900220187739, "grad_norm": 1.0474401712417603, "learning_rate": 4.9661774295044325e-05, "loss": 1.7032, "mean_token_accuracy": 0.659090431034565, "num_tokens": 56251284.0, "step": 3490 }, { "epoch": 0.8112179858616294, "grad_norm": 0.9689825773239136, "learning_rate": 4.9658691131443455e-05, "loss": 1.7192, "mean_token_accuracy": 0.6582483321428299, "num_tokens": 56412907.0, "step": 3500 }, { "epoch": 0.8135357515355197, "grad_norm": 1.2496188879013062, "learning_rate": 4.9655594075513803e-05, "loss": 1.7121, "mean_token_accuracy": 0.6597615405917168, "num_tokens": 56574516.0, "step": 3510 }, { "epoch": 0.8158535172094101, "grad_norm": 0.9240866899490356, "learning_rate": 4.965248312900018e-05, "loss": 1.7218, "mean_token_accuracy": 0.6591229125857353, "num_tokens": 56735599.0, "step": 3520 }, { "epoch": 0.8181712828833005, "grad_norm": 0.9534657597541809, "learning_rate": 4.964935829365527e-05, "loss": 1.716, "mean_token_accuracy": 0.6602185219526291, "num_tokens": 56896209.0, "step": 3530 }, { "epoch": 0.8204890485571908, "grad_norm": 1.0886791944503784, "learning_rate": 4.9646219571239546e-05, "loss": 1.7139, "mean_token_accuracy": 0.6574513569474221, "num_tokens": 57057514.0, "step": 3540 }, { "epoch": 0.8228068142310813, "grad_norm": 1.0175065994262695, "learning_rate": 4.964306696352131e-05, "loss": 1.7164, "mean_token_accuracy": 0.6594159111380578, "num_tokens": 57218926.0, "step": 3550 }, { "epoch": 0.8251245799049716, "grad_norm": 1.1032217741012573, "learning_rate": 4.963990047227671e-05, "loss": 1.7537, "mean_token_accuracy": 0.6556964546442032, "num_tokens": 57380537.0, "step": 3560 }, { "epoch": 0.827442345578862, "grad_norm": 0.9860242009162903, "learning_rate": 4.9636720099289676e-05, "loss": 1.6995, "mean_token_accuracy": 0.6588382825255394, "num_tokens": 57541309.0, "step": 3570 }, { "epoch": 0.8297601112527524, "grad_norm": 0.973425567150116, "learning_rate": 4.963352584635201e-05, "loss": 1.7097, "mean_token_accuracy": 0.6597781330347061, "num_tokens": 57702761.0, "step": 3580 }, { "epoch": 0.8320778769266427, "grad_norm": 0.9753161668777466, "learning_rate": 4.9630317715263275e-05, "loss": 1.7093, "mean_token_accuracy": 0.6583984896540642, "num_tokens": 57864414.0, "step": 3590 }, { "epoch": 0.8343956426005331, "grad_norm": 1.1334505081176758, "learning_rate": 4.9627095707830906e-05, "loss": 1.7041, "mean_token_accuracy": 0.6599982872605323, "num_tokens": 58025034.0, "step": 3600 }, { "epoch": 0.8367134082744234, "grad_norm": 1.027525544166565, "learning_rate": 4.962385982587012e-05, "loss": 1.7305, "mean_token_accuracy": 0.658175240457058, "num_tokens": 58186466.0, "step": 3610 }, { "epoch": 0.8390311739483138, "grad_norm": 0.97569739818573, "learning_rate": 4.9620610071203966e-05, "loss": 1.7133, "mean_token_accuracy": 0.6597214609384536, "num_tokens": 58347973.0, "step": 3620 }, { "epoch": 0.8413489396222043, "grad_norm": 0.9698459506034851, "learning_rate": 4.9617346445663305e-05, "loss": 1.7134, "mean_token_accuracy": 0.6582682296633721, "num_tokens": 58509545.0, "step": 3630 }, { "epoch": 0.8436667052960946, "grad_norm": 0.9100245237350464, "learning_rate": 4.961406895108682e-05, "loss": 1.7221, "mean_token_accuracy": 0.6575820907950402, "num_tokens": 58670319.0, "step": 3640 }, { "epoch": 0.845984470969985, "grad_norm": 1.0162261724472046, "learning_rate": 4.9610777589321e-05, "loss": 1.7143, "mean_token_accuracy": 0.659430094063282, "num_tokens": 58830323.0, "step": 3650 }, { "epoch": 0.8483022366438753, "grad_norm": 0.9866462349891663, "learning_rate": 4.9607472362220134e-05, "loss": 1.7086, "mean_token_accuracy": 0.6625803455710411, "num_tokens": 58991291.0, "step": 3660 }, { "epoch": 0.8506200023177657, "grad_norm": 1.0676451921463013, "learning_rate": 4.960415327164635e-05, "loss": 1.703, "mean_token_accuracy": 0.6601762369275093, "num_tokens": 59153490.0, "step": 3670 }, { "epoch": 0.852937767991656, "grad_norm": 1.0011926889419556, "learning_rate": 4.960082031946958e-05, "loss": 1.6946, "mean_token_accuracy": 0.6608894422650338, "num_tokens": 59315524.0, "step": 3680 }, { "epoch": 0.8552555336655464, "grad_norm": 0.9858991503715515, "learning_rate": 4.959747350756753e-05, "loss": 1.7058, "mean_token_accuracy": 0.6605504557490349, "num_tokens": 59477151.0, "step": 3690 }, { "epoch": 0.8575732993394368, "grad_norm": 1.00888991355896, "learning_rate": 4.9594112837825766e-05, "loss": 1.7009, "mean_token_accuracy": 0.6601909264922142, "num_tokens": 59637438.0, "step": 3700 }, { "epoch": 0.8598910650133271, "grad_norm": 1.0119386911392212, "learning_rate": 4.959073831213764e-05, "loss": 1.7088, "mean_token_accuracy": 0.6602195829153061, "num_tokens": 59799097.0, "step": 3710 }, { "epoch": 0.8622088306872175, "grad_norm": 1.0309172868728638, "learning_rate": 4.95873499324043e-05, "loss": 1.7053, "mean_token_accuracy": 0.6605506196618081, "num_tokens": 59960921.0, "step": 3720 }, { "epoch": 0.8645265963611078, "grad_norm": 1.074751615524292, "learning_rate": 4.95839477005347e-05, "loss": 1.7057, "mean_token_accuracy": 0.6599774181842804, "num_tokens": 60121029.0, "step": 3730 }, { "epoch": 0.8668443620349983, "grad_norm": 1.0253074169158936, "learning_rate": 4.958053161844561e-05, "loss": 1.7112, "mean_token_accuracy": 0.659799163043499, "num_tokens": 60283205.0, "step": 3740 }, { "epoch": 0.8691621277088887, "grad_norm": 0.9912146329879761, "learning_rate": 4.9577101688061613e-05, "loss": 1.6935, "mean_token_accuracy": 0.661335577070713, "num_tokens": 60443874.0, "step": 3750 }, { "epoch": 0.871479893382779, "grad_norm": 1.0861930847167969, "learning_rate": 4.957365791131506e-05, "loss": 1.7091, "mean_token_accuracy": 0.6590238869190216, "num_tokens": 60604829.0, "step": 3760 }, { "epoch": 0.8737976590566694, "grad_norm": 1.0001258850097656, "learning_rate": 4.957020029014613e-05, "loss": 1.7278, "mean_token_accuracy": 0.6565095722675324, "num_tokens": 60766612.0, "step": 3770 }, { "epoch": 0.8761154247305597, "grad_norm": 1.0131498575210571, "learning_rate": 4.95667288265028e-05, "loss": 1.6903, "mean_token_accuracy": 0.6612862706184387, "num_tokens": 60927738.0, "step": 3780 }, { "epoch": 0.8784331904044501, "grad_norm": 0.9991037845611572, "learning_rate": 4.956324352234082e-05, "loss": 1.6945, "mean_token_accuracy": 0.6616118937730789, "num_tokens": 61087948.0, "step": 3790 }, { "epoch": 0.8807509560783405, "grad_norm": 1.0000249147415161, "learning_rate": 4.955974437962378e-05, "loss": 1.7049, "mean_token_accuracy": 0.6571160241961479, "num_tokens": 61249118.0, "step": 3800 }, { "epoch": 0.8830687217522308, "grad_norm": 1.0517035722732544, "learning_rate": 4.955623140032303e-05, "loss": 1.7151, "mean_token_accuracy": 0.6572216212749481, "num_tokens": 61407799.0, "step": 3810 }, { "epoch": 0.8853864874261212, "grad_norm": 0.9211824536323547, "learning_rate": 4.955270458641773e-05, "loss": 1.6821, "mean_token_accuracy": 0.6629683747887611, "num_tokens": 61569149.0, "step": 3820 }, { "epoch": 0.8877042531000116, "grad_norm": 1.0865659713745117, "learning_rate": 4.954916393989484e-05, "loss": 1.689, "mean_token_accuracy": 0.6608955562114716, "num_tokens": 61729315.0, "step": 3830 }, { "epoch": 0.890022018773902, "grad_norm": 1.004725456237793, "learning_rate": 4.954560946274909e-05, "loss": 1.7078, "mean_token_accuracy": 0.6594462037086487, "num_tokens": 61888808.0, "step": 3840 }, { "epoch": 0.8923397844477924, "grad_norm": 1.0836572647094727, "learning_rate": 4.9542041156983035e-05, "loss": 1.6923, "mean_token_accuracy": 0.6638524606823921, "num_tokens": 62049902.0, "step": 3850 }, { "epoch": 0.8946575501216827, "grad_norm": 1.1714822053909302, "learning_rate": 4.9538459024607e-05, "loss": 1.7098, "mean_token_accuracy": 0.6583287253975868, "num_tokens": 62210642.0, "step": 3860 }, { "epoch": 0.8969753157955731, "grad_norm": 1.0555225610733032, "learning_rate": 4.9534863067639095e-05, "loss": 1.7072, "mean_token_accuracy": 0.6609332337975502, "num_tokens": 62371256.0, "step": 3870 }, { "epoch": 0.8992930814694634, "grad_norm": 1.204321265220642, "learning_rate": 4.9531253288105236e-05, "loss": 1.7005, "mean_token_accuracy": 0.6596424236893654, "num_tokens": 62533150.0, "step": 3880 }, { "epoch": 0.9016108471433538, "grad_norm": 0.9545002579689026, "learning_rate": 4.952762968803911e-05, "loss": 1.7138, "mean_token_accuracy": 0.6581745222210884, "num_tokens": 62695219.0, "step": 3890 }, { "epoch": 0.9039286128172441, "grad_norm": 0.9552336931228638, "learning_rate": 4.952399226948221e-05, "loss": 1.6979, "mean_token_accuracy": 0.66222964823246, "num_tokens": 62856919.0, "step": 3900 }, { "epoch": 0.9062463784911345, "grad_norm": 1.0208659172058105, "learning_rate": 4.952034103448379e-05, "loss": 1.6806, "mean_token_accuracy": 0.662202799320221, "num_tokens": 63017550.0, "step": 3910 }, { "epoch": 0.908564144165025, "grad_norm": 1.0594830513000488, "learning_rate": 4.95166759851009e-05, "loss": 1.6932, "mean_token_accuracy": 0.6629514917731285, "num_tokens": 63179824.0, "step": 3920 }, { "epoch": 0.9108819098389153, "grad_norm": 1.0876712799072266, "learning_rate": 4.9512997123398386e-05, "loss": 1.6953, "mean_token_accuracy": 0.6601560920476913, "num_tokens": 63341766.0, "step": 3930 }, { "epoch": 0.9131996755128057, "grad_norm": 1.1610174179077148, "learning_rate": 4.950930445144884e-05, "loss": 1.6846, "mean_token_accuracy": 0.6612243697047233, "num_tokens": 63503060.0, "step": 3940 }, { "epoch": 0.915517441186696, "grad_norm": 1.0349533557891846, "learning_rate": 4.950559797133268e-05, "loss": 1.694, "mean_token_accuracy": 0.6626518309116364, "num_tokens": 63663864.0, "step": 3950 }, { "epoch": 0.9178352068605864, "grad_norm": 1.03456449508667, "learning_rate": 4.950187768513807e-05, "loss": 1.6878, "mean_token_accuracy": 0.6615640729665756, "num_tokens": 63825841.0, "step": 3960 }, { "epoch": 0.9201529725344768, "grad_norm": 1.0091434717178345, "learning_rate": 4.9498143594960954e-05, "loss": 1.6957, "mean_token_accuracy": 0.6614128023386001, "num_tokens": 63987637.0, "step": 3970 }, { "epoch": 0.9224707382083671, "grad_norm": 0.9903602004051208, "learning_rate": 4.949439570290508e-05, "loss": 1.6852, "mean_token_accuracy": 0.6617074936628342, "num_tokens": 64149769.0, "step": 3980 }, { "epoch": 0.9247885038822575, "grad_norm": 1.0532004833221436, "learning_rate": 4.949063401108194e-05, "loss": 1.6798, "mean_token_accuracy": 0.6637041717767715, "num_tokens": 64311569.0, "step": 3990 }, { "epoch": 0.9271062695561478, "grad_norm": 1.0296556949615479, "learning_rate": 4.9486858521610804e-05, "loss": 1.6944, "mean_token_accuracy": 0.6608454063534737, "num_tokens": 64471578.0, "step": 4000 }, { "epoch": 0.9294240352300382, "grad_norm": 0.962782084941864, "learning_rate": 4.948306923661875e-05, "loss": 1.7034, "mean_token_accuracy": 0.658486907184124, "num_tokens": 64632028.0, "step": 4010 }, { "epoch": 0.9317418009039287, "grad_norm": 1.0573170185089111, "learning_rate": 4.947926615824058e-05, "loss": 1.6838, "mean_token_accuracy": 0.6639728397130966, "num_tokens": 64791921.0, "step": 4020 }, { "epoch": 0.934059566577819, "grad_norm": 1.0388057231903076, "learning_rate": 4.9475449288618906e-05, "loss": 1.6749, "mean_token_accuracy": 0.6622364521026611, "num_tokens": 64952586.0, "step": 4030 }, { "epoch": 0.9363773322517094, "grad_norm": 0.9471749663352966, "learning_rate": 4.9471618629904086e-05, "loss": 1.682, "mean_token_accuracy": 0.662259279191494, "num_tokens": 65114440.0, "step": 4040 }, { "epoch": 0.9386950979255997, "grad_norm": 1.0125908851623535, "learning_rate": 4.9467774184254254e-05, "loss": 1.6798, "mean_token_accuracy": 0.6635805308818817, "num_tokens": 65275592.0, "step": 4050 }, { "epoch": 0.9410128635994901, "grad_norm": 0.9500680565834045, "learning_rate": 4.946391595383532e-05, "loss": 1.6939, "mean_token_accuracy": 0.661236061155796, "num_tokens": 65436716.0, "step": 4060 }, { "epoch": 0.9433306292733805, "grad_norm": 0.9293331503868103, "learning_rate": 4.946004394082094e-05, "loss": 1.6912, "mean_token_accuracy": 0.661682802438736, "num_tokens": 65598492.0, "step": 4070 }, { "epoch": 0.9456483949472708, "grad_norm": 0.9278277158737183, "learning_rate": 4.9456158147392557e-05, "loss": 1.6879, "mean_token_accuracy": 0.661865445971489, "num_tokens": 65760173.0, "step": 4080 }, { "epoch": 0.9479661606211612, "grad_norm": 1.016579031944275, "learning_rate": 4.9452258575739366e-05, "loss": 1.688, "mean_token_accuracy": 0.6617168411612511, "num_tokens": 65921623.0, "step": 4090 }, { "epoch": 0.9502839262950515, "grad_norm": 1.1469446420669556, "learning_rate": 4.944834522805831e-05, "loss": 1.6938, "mean_token_accuracy": 0.6610754072666168, "num_tokens": 66082407.0, "step": 4100 }, { "epoch": 0.952601691968942, "grad_norm": 0.9419703483581543, "learning_rate": 4.9444418106554136e-05, "loss": 1.6878, "mean_token_accuracy": 0.6626044467091561, "num_tokens": 66243073.0, "step": 4110 }, { "epoch": 0.9549194576428323, "grad_norm": 0.9756712913513184, "learning_rate": 4.9440477213439305e-05, "loss": 1.7027, "mean_token_accuracy": 0.6603279501199723, "num_tokens": 66404761.0, "step": 4120 }, { "epoch": 0.9572372233167227, "grad_norm": 1.078514814376831, "learning_rate": 4.943652255093406e-05, "loss": 1.6981, "mean_token_accuracy": 0.6618581518530846, "num_tokens": 66566538.0, "step": 4130 }, { "epoch": 0.9595549889906131, "grad_norm": 1.0204819440841675, "learning_rate": 4.943255412126639e-05, "loss": 1.6887, "mean_token_accuracy": 0.6605068475008011, "num_tokens": 66728712.0, "step": 4140 }, { "epoch": 0.9618727546645034, "grad_norm": 0.943292498588562, "learning_rate": 4.942857192667207e-05, "loss": 1.6923, "mean_token_accuracy": 0.6614650607109069, "num_tokens": 66889788.0, "step": 4150 }, { "epoch": 0.9641905203383938, "grad_norm": 0.9697903990745544, "learning_rate": 4.9424575969394594e-05, "loss": 1.6932, "mean_token_accuracy": 0.6604887142777442, "num_tokens": 67050950.0, "step": 4160 }, { "epoch": 0.9665082860122841, "grad_norm": 1.102819800376892, "learning_rate": 4.9420566251685215e-05, "loss": 1.6864, "mean_token_accuracy": 0.661095455288887, "num_tokens": 67210962.0, "step": 4170 }, { "epoch": 0.9688260516861745, "grad_norm": 1.0385510921478271, "learning_rate": 4.941654277580296e-05, "loss": 1.7045, "mean_token_accuracy": 0.6586248457431794, "num_tokens": 67372194.0, "step": 4180 }, { "epoch": 0.9711438173600649, "grad_norm": 1.077050805091858, "learning_rate": 4.9412505544014576e-05, "loss": 1.6903, "mean_token_accuracy": 0.6618388518691063, "num_tokens": 67533943.0, "step": 4190 }, { "epoch": 0.9734615830339552, "grad_norm": 0.9360863566398621, "learning_rate": 4.9408454558594594e-05, "loss": 1.6799, "mean_token_accuracy": 0.6634792193770409, "num_tokens": 67695555.0, "step": 4200 }, { "epoch": 0.9757793487078457, "grad_norm": 0.9144700765609741, "learning_rate": 4.940438982182528e-05, "loss": 1.6722, "mean_token_accuracy": 0.6651823431253433, "num_tokens": 67857006.0, "step": 4210 }, { "epoch": 0.978097114381736, "grad_norm": 1.006689429283142, "learning_rate": 4.940031133599663e-05, "loss": 1.6862, "mean_token_accuracy": 0.6625020816922188, "num_tokens": 68018901.0, "step": 4220 }, { "epoch": 0.9804148800556264, "grad_norm": 0.9824639558792114, "learning_rate": 4.9396219103406404e-05, "loss": 1.6958, "mean_token_accuracy": 0.6614641293883323, "num_tokens": 68179749.0, "step": 4230 }, { "epoch": 0.9827326457295168, "grad_norm": 1.004931092262268, "learning_rate": 4.939211312636012e-05, "loss": 1.6983, "mean_token_accuracy": 0.6613583013415336, "num_tokens": 68341168.0, "step": 4240 }, { "epoch": 0.9850504114034071, "grad_norm": 1.0371170043945312, "learning_rate": 4.9387993407171e-05, "loss": 1.6807, "mean_token_accuracy": 0.663859347999096, "num_tokens": 68500206.0, "step": 4250 }, { "epoch": 0.9873681770772975, "grad_norm": 0.9106301069259644, "learning_rate": 4.9383859948160036e-05, "loss": 1.6992, "mean_token_accuracy": 0.6603253424167633, "num_tokens": 68661947.0, "step": 4260 }, { "epoch": 0.9896859427511878, "grad_norm": 0.9682409763336182, "learning_rate": 4.937971275165596e-05, "loss": 1.6875, "mean_token_accuracy": 0.6607443749904632, "num_tokens": 68823387.0, "step": 4270 }, { "epoch": 0.9920037084250782, "grad_norm": 0.9830387830734253, "learning_rate": 4.937555181999524e-05, "loss": 1.6964, "mean_token_accuracy": 0.6600877240300178, "num_tokens": 68984299.0, "step": 4280 }, { "epoch": 0.9943214740989686, "grad_norm": 0.8748356699943542, "learning_rate": 4.9371377155522076e-05, "loss": 1.6918, "mean_token_accuracy": 0.663083216547966, "num_tokens": 69146218.0, "step": 4290 }, { "epoch": 0.996639239772859, "grad_norm": 0.9656722545623779, "learning_rate": 4.9367188760588415e-05, "loss": 1.6886, "mean_token_accuracy": 0.6621182098984718, "num_tokens": 69307193.0, "step": 4300 }, { "epoch": 0.9989570054467494, "grad_norm": 1.0498154163360596, "learning_rate": 4.936298663755393e-05, "loss": 1.684, "mean_token_accuracy": 0.6617154404520988, "num_tokens": 69468058.0, "step": 4310 }, { "epoch": 1.001158882836945, "grad_norm": 1.0515093803405762, "learning_rate": 4.9358770788786046e-05, "loss": 1.6937, "mean_token_accuracy": 0.6610775244863409, "num_tokens": 69620913.0, "step": 4320 }, { "epoch": 1.0034766485108355, "grad_norm": 1.1904125213623047, "learning_rate": 4.935454121665989e-05, "loss": 1.6927, "mean_token_accuracy": 0.6609912693500519, "num_tokens": 69782877.0, "step": 4330 }, { "epoch": 1.005794414184726, "grad_norm": 1.0945911407470703, "learning_rate": 4.935029792355834e-05, "loss": 1.6856, "mean_token_accuracy": 0.6632763147354126, "num_tokens": 69943051.0, "step": 4340 }, { "epoch": 1.0081121798586163, "grad_norm": 0.9298327565193176, "learning_rate": 4.934604091187201e-05, "loss": 1.6673, "mean_token_accuracy": 0.6658756494522095, "num_tokens": 70104405.0, "step": 4350 }, { "epoch": 1.0104299455325068, "grad_norm": 0.9255250096321106, "learning_rate": 4.934177018399924e-05, "loss": 1.6878, "mean_token_accuracy": 0.6603480175137519, "num_tokens": 70265501.0, "step": 4360 }, { "epoch": 1.012747711206397, "grad_norm": 1.0104544162750244, "learning_rate": 4.933748574234608e-05, "loss": 1.6899, "mean_token_accuracy": 0.6622277975082398, "num_tokens": 70427082.0, "step": 4370 }, { "epoch": 1.0150654768802874, "grad_norm": 1.1360876560211182, "learning_rate": 4.9333187589326326e-05, "loss": 1.6814, "mean_token_accuracy": 0.6632625430822372, "num_tokens": 70587940.0, "step": 4380 }, { "epoch": 1.0173832425541778, "grad_norm": 0.9656396508216858, "learning_rate": 4.9328875727361476e-05, "loss": 1.6588, "mean_token_accuracy": 0.6641913115978241, "num_tokens": 70748866.0, "step": 4390 }, { "epoch": 1.0197010082280682, "grad_norm": 1.0103038549423218, "learning_rate": 4.93245501588808e-05, "loss": 1.6748, "mean_token_accuracy": 0.6632545292377472, "num_tokens": 70910346.0, "step": 4400 }, { "epoch": 1.0220187739019586, "grad_norm": 0.9978703260421753, "learning_rate": 4.9320210886321225e-05, "loss": 1.6635, "mean_token_accuracy": 0.6645608738064765, "num_tokens": 71071768.0, "step": 4410 }, { "epoch": 1.0243365395758488, "grad_norm": 0.9729065895080566, "learning_rate": 4.931585791212745e-05, "loss": 1.6582, "mean_token_accuracy": 0.6651962861418724, "num_tokens": 71232647.0, "step": 4420 }, { "epoch": 1.0266543052497392, "grad_norm": 1.0049227476119995, "learning_rate": 4.931149123875186e-05, "loss": 1.6682, "mean_token_accuracy": 0.6641142413020134, "num_tokens": 71393409.0, "step": 4430 }, { "epoch": 1.0289720709236296, "grad_norm": 0.9610632658004761, "learning_rate": 4.930711086865459e-05, "loss": 1.6799, "mean_token_accuracy": 0.66262087225914, "num_tokens": 71554929.0, "step": 4440 }, { "epoch": 1.03128983659752, "grad_norm": 0.9846996068954468, "learning_rate": 4.930271680430345e-05, "loss": 1.6632, "mean_token_accuracy": 0.6650021746754646, "num_tokens": 71715209.0, "step": 4450 }, { "epoch": 1.0336076022714105, "grad_norm": 1.013278603553772, "learning_rate": 4.929830904817401e-05, "loss": 1.6636, "mean_token_accuracy": 0.6654136821627616, "num_tokens": 71876877.0, "step": 4460 }, { "epoch": 1.0359253679453007, "grad_norm": 1.0065717697143555, "learning_rate": 4.929388760274952e-05, "loss": 1.6755, "mean_token_accuracy": 0.6637661412358284, "num_tokens": 72038389.0, "step": 4470 }, { "epoch": 1.038243133619191, "grad_norm": 0.9513846635818481, "learning_rate": 4.928945247052096e-05, "loss": 1.675, "mean_token_accuracy": 0.6647187098860741, "num_tokens": 72199343.0, "step": 4480 }, { "epoch": 1.0405608992930815, "grad_norm": 1.0433385372161865, "learning_rate": 4.928500365398701e-05, "loss": 1.6813, "mean_token_accuracy": 0.6620920479297638, "num_tokens": 72360812.0, "step": 4490 }, { "epoch": 1.042878664966972, "grad_norm": 0.9538586735725403, "learning_rate": 4.928054115565407e-05, "loss": 1.665, "mean_token_accuracy": 0.6658876433968544, "num_tokens": 72522549.0, "step": 4500 }, { "epoch": 1.0451964306408623, "grad_norm": 1.0233635902404785, "learning_rate": 4.927606497803624e-05, "loss": 1.6402, "mean_token_accuracy": 0.6668902680277824, "num_tokens": 72684465.0, "step": 4510 }, { "epoch": 1.0475141963147525, "grad_norm": 0.8790856003761292, "learning_rate": 4.9271575123655334e-05, "loss": 1.6782, "mean_token_accuracy": 0.663810633122921, "num_tokens": 72845311.0, "step": 4520 }, { "epoch": 1.049831961988643, "grad_norm": 1.0868940353393555, "learning_rate": 4.926707159504087e-05, "loss": 1.6576, "mean_token_accuracy": 0.6639130622148514, "num_tokens": 73005068.0, "step": 4530 }, { "epoch": 1.0521497276625333, "grad_norm": 1.0004290342330933, "learning_rate": 4.926255439473005e-05, "loss": 1.6783, "mean_token_accuracy": 0.6626141130924225, "num_tokens": 73163887.0, "step": 4540 }, { "epoch": 1.0544674933364238, "grad_norm": 0.9336773753166199, "learning_rate": 4.9258023525267815e-05, "loss": 1.6613, "mean_token_accuracy": 0.664112888276577, "num_tokens": 73325039.0, "step": 4550 }, { "epoch": 1.0567852590103142, "grad_norm": 0.9268376231193542, "learning_rate": 4.925347898920678e-05, "loss": 1.6609, "mean_token_accuracy": 0.6660724878311157, "num_tokens": 73486226.0, "step": 4560 }, { "epoch": 1.0591030246842044, "grad_norm": 1.012199878692627, "learning_rate": 4.924892078910727e-05, "loss": 1.6694, "mean_token_accuracy": 0.6627550095319747, "num_tokens": 73646740.0, "step": 4570 }, { "epoch": 1.0614207903580948, "grad_norm": 0.9599702954292297, "learning_rate": 4.924434892753729e-05, "loss": 1.6634, "mean_token_accuracy": 0.662474849820137, "num_tokens": 73808442.0, "step": 4580 }, { "epoch": 1.0637385560319852, "grad_norm": 0.9486956596374512, "learning_rate": 4.923976340707258e-05, "loss": 1.6495, "mean_token_accuracy": 0.6654542639851571, "num_tokens": 73970254.0, "step": 4590 }, { "epoch": 1.0660563217058756, "grad_norm": 0.9542300701141357, "learning_rate": 4.923516423029653e-05, "loss": 1.6603, "mean_token_accuracy": 0.664971086382866, "num_tokens": 74132517.0, "step": 4600 }, { "epoch": 1.0683740873797658, "grad_norm": 0.9054281711578369, "learning_rate": 4.9230551399800265e-05, "loss": 1.6511, "mean_token_accuracy": 0.6658335328102112, "num_tokens": 74293526.0, "step": 4610 }, { "epoch": 1.0706918530536562, "grad_norm": 0.8988864421844482, "learning_rate": 4.922592491818257e-05, "loss": 1.6609, "mean_token_accuracy": 0.6642599329352379, "num_tokens": 74454366.0, "step": 4620 }, { "epoch": 1.0730096187275466, "grad_norm": 0.9684102535247803, "learning_rate": 4.9221284788049935e-05, "loss": 1.6672, "mean_token_accuracy": 0.6633868679404259, "num_tokens": 74615103.0, "step": 4630 }, { "epoch": 1.075327384401437, "grad_norm": 1.0616976022720337, "learning_rate": 4.921663101201655e-05, "loss": 1.6568, "mean_token_accuracy": 0.666366671025753, "num_tokens": 74775605.0, "step": 4640 }, { "epoch": 1.0776451500753275, "grad_norm": 0.9311758875846863, "learning_rate": 4.9211963592704266e-05, "loss": 1.6496, "mean_token_accuracy": 0.6672915756702423, "num_tokens": 74937124.0, "step": 4650 }, { "epoch": 1.0799629157492177, "grad_norm": 1.0469322204589844, "learning_rate": 4.920728253274264e-05, "loss": 1.6466, "mean_token_accuracy": 0.6669893294572831, "num_tokens": 75098481.0, "step": 4660 }, { "epoch": 1.082280681423108, "grad_norm": 1.0864574909210205, "learning_rate": 4.920258783476892e-05, "loss": 1.6648, "mean_token_accuracy": 0.662703862786293, "num_tokens": 75260068.0, "step": 4670 }, { "epoch": 1.0845984470969985, "grad_norm": 1.0105825662612915, "learning_rate": 4.9197879501428016e-05, "loss": 1.6648, "mean_token_accuracy": 0.6647584453225136, "num_tokens": 75419356.0, "step": 4680 }, { "epoch": 1.086916212770889, "grad_norm": 1.073385238647461, "learning_rate": 4.9193157535372525e-05, "loss": 1.6597, "mean_token_accuracy": 0.6637428209185601, "num_tokens": 75580558.0, "step": 4690 }, { "epoch": 1.0892339784447793, "grad_norm": 0.9048916101455688, "learning_rate": 4.9188421939262755e-05, "loss": 1.6595, "mean_token_accuracy": 0.664665125310421, "num_tokens": 75742178.0, "step": 4700 }, { "epoch": 1.0915517441186695, "grad_norm": 0.9425325989723206, "learning_rate": 4.918367271576664e-05, "loss": 1.6525, "mean_token_accuracy": 0.6659875556826591, "num_tokens": 75903522.0, "step": 4710 }, { "epoch": 1.09386950979256, "grad_norm": 0.9496796727180481, "learning_rate": 4.917890986755983e-05, "loss": 1.6541, "mean_token_accuracy": 0.6672275304794312, "num_tokens": 76065208.0, "step": 4720 }, { "epoch": 1.0961872754664503, "grad_norm": 0.9647900462150574, "learning_rate": 4.9174133397325637e-05, "loss": 1.6624, "mean_token_accuracy": 0.6647691279649734, "num_tokens": 76227538.0, "step": 4730 }, { "epoch": 1.0985050411403408, "grad_norm": 1.0522147417068481, "learning_rate": 4.9169343307755064e-05, "loss": 1.6727, "mean_token_accuracy": 0.6621431320905685, "num_tokens": 76387982.0, "step": 4740 }, { "epoch": 1.1008228068142312, "grad_norm": 1.1349430084228516, "learning_rate": 4.916453960154678e-05, "loss": 1.6542, "mean_token_accuracy": 0.6645429208874702, "num_tokens": 76549282.0, "step": 4750 }, { "epoch": 1.1031405724881214, "grad_norm": 0.9350016117095947, "learning_rate": 4.915972228140708e-05, "loss": 1.6534, "mean_token_accuracy": 0.6674652606248855, "num_tokens": 76709888.0, "step": 4760 }, { "epoch": 1.1054583381620118, "grad_norm": 0.9078601598739624, "learning_rate": 4.9154891350050014e-05, "loss": 1.6585, "mean_token_accuracy": 0.6653171926736832, "num_tokens": 76872113.0, "step": 4770 }, { "epoch": 1.1077761038359022, "grad_norm": 0.945285975933075, "learning_rate": 4.9150046810197234e-05, "loss": 1.6586, "mean_token_accuracy": 0.6650816813111305, "num_tokens": 77034006.0, "step": 4780 }, { "epoch": 1.1100938695097926, "grad_norm": 0.9602475762367249, "learning_rate": 4.9145188664578065e-05, "loss": 1.6594, "mean_token_accuracy": 0.6628764003515244, "num_tokens": 77194817.0, "step": 4790 }, { "epoch": 1.112411635183683, "grad_norm": 0.970971405506134, "learning_rate": 4.914031691592955e-05, "loss": 1.662, "mean_token_accuracy": 0.6642777621746063, "num_tokens": 77354916.0, "step": 4800 }, { "epoch": 1.1147294008575732, "grad_norm": 0.9668848514556885, "learning_rate": 4.9135431566996315e-05, "loss": 1.6644, "mean_token_accuracy": 0.6637920081615448, "num_tokens": 77515329.0, "step": 4810 }, { "epoch": 1.1170471665314636, "grad_norm": 0.9721421003341675, "learning_rate": 4.913053262053072e-05, "loss": 1.6571, "mean_token_accuracy": 0.6666308373212815, "num_tokens": 77676446.0, "step": 4820 }, { "epoch": 1.119364932205354, "grad_norm": 0.9774282574653625, "learning_rate": 4.9125620079292744e-05, "loss": 1.6601, "mean_token_accuracy": 0.663867573440075, "num_tokens": 77837852.0, "step": 4830 }, { "epoch": 1.1216826978792445, "grad_norm": 1.019399642944336, "learning_rate": 4.912069394605005e-05, "loss": 1.6525, "mean_token_accuracy": 0.6664171740412712, "num_tokens": 77999750.0, "step": 4840 }, { "epoch": 1.1240004635531349, "grad_norm": 0.9663134813308716, "learning_rate": 4.911575422357793e-05, "loss": 1.6635, "mean_token_accuracy": 0.6637378469109535, "num_tokens": 78161094.0, "step": 4850 }, { "epoch": 1.126318229227025, "grad_norm": 1.0030871629714966, "learning_rate": 4.911080091465935e-05, "loss": 1.657, "mean_token_accuracy": 0.663893823325634, "num_tokens": 78322675.0, "step": 4860 }, { "epoch": 1.1286359949009155, "grad_norm": 0.9700556993484497, "learning_rate": 4.9105834022084934e-05, "loss": 1.6532, "mean_token_accuracy": 0.6658656045794487, "num_tokens": 78483891.0, "step": 4870 }, { "epoch": 1.130953760574806, "grad_norm": 0.9963045120239258, "learning_rate": 4.910085354865296e-05, "loss": 1.6577, "mean_token_accuracy": 0.6652840316295624, "num_tokens": 78645583.0, "step": 4880 }, { "epoch": 1.1332715262486963, "grad_norm": 1.0034971237182617, "learning_rate": 4.9095859497169336e-05, "loss": 1.6544, "mean_token_accuracy": 0.6659222140908241, "num_tokens": 78806321.0, "step": 4890 }, { "epoch": 1.1355892919225865, "grad_norm": 0.9346729516983032, "learning_rate": 4.909085187044764e-05, "loss": 1.6586, "mean_token_accuracy": 0.6636552065610886, "num_tokens": 78968324.0, "step": 4900 }, { "epoch": 1.137907057596477, "grad_norm": 0.9121778607368469, "learning_rate": 4.90858306713091e-05, "loss": 1.6705, "mean_token_accuracy": 0.6626597955822945, "num_tokens": 79129325.0, "step": 4910 }, { "epoch": 1.1402248232703673, "grad_norm": 0.9799475073814392, "learning_rate": 4.9080795902582566e-05, "loss": 1.6701, "mean_token_accuracy": 0.6632930710911751, "num_tokens": 79290455.0, "step": 4920 }, { "epoch": 1.1425425889442578, "grad_norm": 0.9645478129386902, "learning_rate": 4.907574756710456e-05, "loss": 1.656, "mean_token_accuracy": 0.6650773584842682, "num_tokens": 79451108.0, "step": 4930 }, { "epoch": 1.1448603546181482, "grad_norm": 0.9186272621154785, "learning_rate": 4.907068566771924e-05, "loss": 1.6545, "mean_token_accuracy": 0.6657266199588776, "num_tokens": 79612487.0, "step": 4940 }, { "epoch": 1.1471781202920384, "grad_norm": 0.9513791799545288, "learning_rate": 4.9065610207278395e-05, "loss": 1.6472, "mean_token_accuracy": 0.6664933621883392, "num_tokens": 79774188.0, "step": 4950 }, { "epoch": 1.1494958859659288, "grad_norm": 0.9489573836326599, "learning_rate": 4.9060521188641474e-05, "loss": 1.6461, "mean_token_accuracy": 0.665320272743702, "num_tokens": 79936201.0, "step": 4960 }, { "epoch": 1.1518136516398192, "grad_norm": 0.9905589818954468, "learning_rate": 4.905541861467553e-05, "loss": 1.6569, "mean_token_accuracy": 0.6640210971236229, "num_tokens": 80097177.0, "step": 4970 }, { "epoch": 1.1541314173137096, "grad_norm": 0.9735917448997498, "learning_rate": 4.9050302488255296e-05, "loss": 1.6623, "mean_token_accuracy": 0.6649672299623489, "num_tokens": 80258641.0, "step": 4980 }, { "epoch": 1.1564491829876, "grad_norm": 1.008385181427002, "learning_rate": 4.9045172812263105e-05, "loss": 1.6668, "mean_token_accuracy": 0.6638745322823525, "num_tokens": 80419562.0, "step": 4990 }, { "epoch": 1.1587669486614902, "grad_norm": 0.9643353819847107, "learning_rate": 4.9040029589588956e-05, "loss": 1.6485, "mean_token_accuracy": 0.6666832432150841, "num_tokens": 80580492.0, "step": 5000 }, { "epoch": 1.1610847143353806, "grad_norm": 0.957999050617218, "learning_rate": 4.903487282313044e-05, "loss": 1.6656, "mean_token_accuracy": 0.6640524357557297, "num_tokens": 80742163.0, "step": 5010 }, { "epoch": 1.163402480009271, "grad_norm": 0.9236865639686584, "learning_rate": 4.902970251579282e-05, "loss": 1.6563, "mean_token_accuracy": 0.6657476246356964, "num_tokens": 80902138.0, "step": 5020 }, { "epoch": 1.1657202456831615, "grad_norm": 0.9595335721969604, "learning_rate": 4.902451867048895e-05, "loss": 1.6802, "mean_token_accuracy": 0.661811476945877, "num_tokens": 81063847.0, "step": 5030 }, { "epoch": 1.1680380113570519, "grad_norm": 1.109161376953125, "learning_rate": 4.901932129013934e-05, "loss": 1.647, "mean_token_accuracy": 0.6652707844972611, "num_tokens": 81225411.0, "step": 5040 }, { "epoch": 1.170355777030942, "grad_norm": 0.9565542936325073, "learning_rate": 4.901411037767211e-05, "loss": 1.6881, "mean_token_accuracy": 0.6608491241931915, "num_tokens": 81387251.0, "step": 5050 }, { "epoch": 1.1726735427048325, "grad_norm": 0.9034549593925476, "learning_rate": 4.900888593602301e-05, "loss": 1.662, "mean_token_accuracy": 0.6644912838935852, "num_tokens": 81547899.0, "step": 5060 }, { "epoch": 1.174991308378723, "grad_norm": 0.9144630432128906, "learning_rate": 4.900364796813541e-05, "loss": 1.6784, "mean_token_accuracy": 0.663722088932991, "num_tokens": 81709889.0, "step": 5070 }, { "epoch": 1.1773090740526133, "grad_norm": 1.035597562789917, "learning_rate": 4.8998396476960306e-05, "loss": 1.6655, "mean_token_accuracy": 0.6649868160486221, "num_tokens": 81870930.0, "step": 5080 }, { "epoch": 1.1796268397265037, "grad_norm": 0.916426956653595, "learning_rate": 4.89931314654563e-05, "loss": 1.6692, "mean_token_accuracy": 0.6646684840321541, "num_tokens": 82032603.0, "step": 5090 }, { "epoch": 1.181944605400394, "grad_norm": 0.9233450293540955, "learning_rate": 4.898785293658963e-05, "loss": 1.6479, "mean_token_accuracy": 0.665992633998394, "num_tokens": 82194202.0, "step": 5100 }, { "epoch": 1.1842623710742843, "grad_norm": 0.9819741249084473, "learning_rate": 4.898256089333412e-05, "loss": 1.6585, "mean_token_accuracy": 0.663990917801857, "num_tokens": 82355841.0, "step": 5110 }, { "epoch": 1.1865801367481748, "grad_norm": 0.9883556365966797, "learning_rate": 4.8977255338671236e-05, "loss": 1.6447, "mean_token_accuracy": 0.6660318434238434, "num_tokens": 82516354.0, "step": 5120 }, { "epoch": 1.1888979024220652, "grad_norm": 1.007706642150879, "learning_rate": 4.8971936275590046e-05, "loss": 1.6569, "mean_token_accuracy": 0.6657413244247437, "num_tokens": 82677969.0, "step": 5130 }, { "epoch": 1.1912156680959556, "grad_norm": 0.9211971759796143, "learning_rate": 4.8966603707087236e-05, "loss": 1.6602, "mean_token_accuracy": 0.6639321729540825, "num_tokens": 82839994.0, "step": 5140 }, { "epoch": 1.1935334337698458, "grad_norm": 0.9324185252189636, "learning_rate": 4.896125763616708e-05, "loss": 1.6493, "mean_token_accuracy": 0.6674002662301064, "num_tokens": 83001165.0, "step": 5150 }, { "epoch": 1.1958511994437362, "grad_norm": 0.9896000623703003, "learning_rate": 4.895589806584148e-05, "loss": 1.673, "mean_token_accuracy": 0.663814227283001, "num_tokens": 83162081.0, "step": 5160 }, { "epoch": 1.1981689651176266, "grad_norm": 0.8888491988182068, "learning_rate": 4.8950524999129935e-05, "loss": 1.6697, "mean_token_accuracy": 0.6634126737713814, "num_tokens": 83323813.0, "step": 5170 }, { "epoch": 1.200486730791517, "grad_norm": 0.8875452876091003, "learning_rate": 4.894513843905955e-05, "loss": 1.6562, "mean_token_accuracy": 0.6654169708490372, "num_tokens": 83485605.0, "step": 5180 }, { "epoch": 1.2028044964654074, "grad_norm": 1.1466624736785889, "learning_rate": 4.893973838866503e-05, "loss": 1.6733, "mean_token_accuracy": 0.6630445048213005, "num_tokens": 83647022.0, "step": 5190 }, { "epoch": 1.2051222621392976, "grad_norm": 0.9161199331283569, "learning_rate": 4.8934324850988675e-05, "loss": 1.6318, "mean_token_accuracy": 0.6674504369497299, "num_tokens": 83808463.0, "step": 5200 }, { "epoch": 1.207440027813188, "grad_norm": 0.9291313886642456, "learning_rate": 4.89288978290804e-05, "loss": 1.6366, "mean_token_accuracy": 0.6682101085782051, "num_tokens": 83969951.0, "step": 5210 }, { "epoch": 1.2097577934870785, "grad_norm": 0.944803774356842, "learning_rate": 4.892345732599769e-05, "loss": 1.6447, "mean_token_accuracy": 0.6658159613609314, "num_tokens": 84131691.0, "step": 5220 }, { "epoch": 1.2120755591609689, "grad_norm": 1.005571961402893, "learning_rate": 4.8918003344805656e-05, "loss": 1.6361, "mean_token_accuracy": 0.6672273024916648, "num_tokens": 84293447.0, "step": 5230 }, { "epoch": 1.2143933248348593, "grad_norm": 1.006261944770813, "learning_rate": 4.8912535888576985e-05, "loss": 1.6794, "mean_token_accuracy": 0.6639830216765403, "num_tokens": 84452818.0, "step": 5240 }, { "epoch": 1.2167110905087495, "grad_norm": 0.9233111143112183, "learning_rate": 4.890705496039195e-05, "loss": 1.6538, "mean_token_accuracy": 0.6657785683870315, "num_tokens": 84614416.0, "step": 5250 }, { "epoch": 1.21902885618264, "grad_norm": 0.9534965753555298, "learning_rate": 4.890156056333842e-05, "loss": 1.6416, "mean_token_accuracy": 0.6662712231278419, "num_tokens": 84775694.0, "step": 5260 }, { "epoch": 1.2213466218565303, "grad_norm": 0.9511992931365967, "learning_rate": 4.8896052700511876e-05, "loss": 1.6631, "mean_token_accuracy": 0.6636038526892662, "num_tokens": 84936476.0, "step": 5270 }, { "epoch": 1.2236643875304207, "grad_norm": 1.0809130668640137, "learning_rate": 4.889053137501534e-05, "loss": 1.6427, "mean_token_accuracy": 0.6663650691509246, "num_tokens": 85097147.0, "step": 5280 }, { "epoch": 1.2259821532043111, "grad_norm": 1.0130774974822998, "learning_rate": 4.888499658995945e-05, "loss": 1.6456, "mean_token_accuracy": 0.6668738886713982, "num_tokens": 85259061.0, "step": 5290 }, { "epoch": 1.2282999188782013, "grad_norm": 1.1234930753707886, "learning_rate": 4.8879448348462425e-05, "loss": 1.6487, "mean_token_accuracy": 0.6653995826840401, "num_tokens": 85419647.0, "step": 5300 }, { "epoch": 1.2306176845520918, "grad_norm": 1.0698226690292358, "learning_rate": 4.8873886653650055e-05, "loss": 1.6549, "mean_token_accuracy": 0.6658605858683586, "num_tokens": 85581527.0, "step": 5310 }, { "epoch": 1.2329354502259822, "grad_norm": 0.9586246013641357, "learning_rate": 4.8868311508655703e-05, "loss": 1.6608, "mean_token_accuracy": 0.6662013843655586, "num_tokens": 85742880.0, "step": 5320 }, { "epoch": 1.2352532158998726, "grad_norm": 0.9322565197944641, "learning_rate": 4.886272291662034e-05, "loss": 1.62, "mean_token_accuracy": 0.6701038584113121, "num_tokens": 85904060.0, "step": 5330 }, { "epoch": 1.237570981573763, "grad_norm": 0.958730161190033, "learning_rate": 4.885712088069248e-05, "loss": 1.6447, "mean_token_accuracy": 0.6674300909042359, "num_tokens": 86065625.0, "step": 5340 }, { "epoch": 1.2398887472476532, "grad_norm": 0.974582314491272, "learning_rate": 4.885150540402823e-05, "loss": 1.6575, "mean_token_accuracy": 0.6641270875930786, "num_tokens": 86227173.0, "step": 5350 }, { "epoch": 1.2422065129215436, "grad_norm": 0.9755936861038208, "learning_rate": 4.884587648979125e-05, "loss": 1.6394, "mean_token_accuracy": 0.6666323572397232, "num_tokens": 86388213.0, "step": 5360 }, { "epoch": 1.244524278595434, "grad_norm": 1.0326364040374756, "learning_rate": 4.884023414115281e-05, "loss": 1.6391, "mean_token_accuracy": 0.6667574241757392, "num_tokens": 86549115.0, "step": 5370 }, { "epoch": 1.2468420442693244, "grad_norm": 1.00269615650177, "learning_rate": 4.88345783612917e-05, "loss": 1.6409, "mean_token_accuracy": 0.6676496669650078, "num_tokens": 86710987.0, "step": 5380 }, { "epoch": 1.2491598099432148, "grad_norm": 0.9981412887573242, "learning_rate": 4.8828909153394306e-05, "loss": 1.6474, "mean_token_accuracy": 0.6662119820713996, "num_tokens": 86871604.0, "step": 5390 }, { "epoch": 1.251477575617105, "grad_norm": 0.8933107256889343, "learning_rate": 4.882322652065458e-05, "loss": 1.6407, "mean_token_accuracy": 0.6670059651136399, "num_tokens": 87033114.0, "step": 5400 }, { "epoch": 1.2537953412909955, "grad_norm": 0.8743984699249268, "learning_rate": 4.881753046627402e-05, "loss": 1.644, "mean_token_accuracy": 0.6665025517344475, "num_tokens": 87194820.0, "step": 5410 }, { "epoch": 1.2561131069648859, "grad_norm": 0.9008446335792542, "learning_rate": 4.88118209934617e-05, "loss": 1.6575, "mean_token_accuracy": 0.663758285343647, "num_tokens": 87356166.0, "step": 5420 }, { "epoch": 1.2584308726387763, "grad_norm": 0.9519490003585815, "learning_rate": 4.880609810543426e-05, "loss": 1.6565, "mean_token_accuracy": 0.6651030972599983, "num_tokens": 87517044.0, "step": 5430 }, { "epoch": 1.2607486383126667, "grad_norm": 0.9847290515899658, "learning_rate": 4.880036180541586e-05, "loss": 1.6394, "mean_token_accuracy": 0.6661131381988525, "num_tokens": 87679100.0, "step": 5440 }, { "epoch": 1.263066403986557, "grad_norm": 0.9998654127120972, "learning_rate": 4.8794612096638275e-05, "loss": 1.6477, "mean_token_accuracy": 0.6664488911628723, "num_tokens": 87840237.0, "step": 5450 }, { "epoch": 1.2653841696604473, "grad_norm": 0.9471983909606934, "learning_rate": 4.878884898234078e-05, "loss": 1.6352, "mean_token_accuracy": 0.6677572414278984, "num_tokens": 88001722.0, "step": 5460 }, { "epoch": 1.2677019353343377, "grad_norm": 0.9498677849769592, "learning_rate": 4.878307246577024e-05, "loss": 1.6371, "mean_token_accuracy": 0.6657526612281799, "num_tokens": 88163258.0, "step": 5470 }, { "epoch": 1.2700197010082281, "grad_norm": 0.9544923901557922, "learning_rate": 4.8777282550181044e-05, "loss": 1.6317, "mean_token_accuracy": 0.6679775536060333, "num_tokens": 88325008.0, "step": 5480 }, { "epoch": 1.2723374666821186, "grad_norm": 0.9385226964950562, "learning_rate": 4.877147923883516e-05, "loss": 1.6573, "mean_token_accuracy": 0.6638476386666298, "num_tokens": 88486409.0, "step": 5490 }, { "epoch": 1.2746552323560087, "grad_norm": 0.8877605199813843, "learning_rate": 4.876566253500206e-05, "loss": 1.6425, "mean_token_accuracy": 0.6677944898605347, "num_tokens": 88647763.0, "step": 5500 }, { "epoch": 1.2769729980298992, "grad_norm": 0.9195212721824646, "learning_rate": 4.87598324419588e-05, "loss": 1.6487, "mean_token_accuracy": 0.6678121864795685, "num_tokens": 88809631.0, "step": 5510 }, { "epoch": 1.2792907637037896, "grad_norm": 0.9537209272384644, "learning_rate": 4.8753988962989964e-05, "loss": 1.6449, "mean_token_accuracy": 0.6672055900096894, "num_tokens": 88971113.0, "step": 5520 }, { "epoch": 1.28160852937768, "grad_norm": 0.9744460582733154, "learning_rate": 4.874813210138768e-05, "loss": 1.6417, "mean_token_accuracy": 0.6661201909184455, "num_tokens": 89132173.0, "step": 5530 }, { "epoch": 1.2839262950515704, "grad_norm": 1.0296579599380493, "learning_rate": 4.874226186045161e-05, "loss": 1.6308, "mean_token_accuracy": 0.6681009098887444, "num_tokens": 89293765.0, "step": 5540 }, { "epoch": 1.2862440607254606, "grad_norm": 0.9648062586784363, "learning_rate": 4.873637824348897e-05, "loss": 1.6581, "mean_token_accuracy": 0.6634981393814087, "num_tokens": 89455141.0, "step": 5550 }, { "epoch": 1.288561826399351, "grad_norm": 0.9969829320907593, "learning_rate": 4.873048125381448e-05, "loss": 1.6426, "mean_token_accuracy": 0.6680165380239487, "num_tokens": 89616119.0, "step": 5560 }, { "epoch": 1.2908795920732414, "grad_norm": 0.916713535785675, "learning_rate": 4.8724570894750415e-05, "loss": 1.6666, "mean_token_accuracy": 0.6641782253980637, "num_tokens": 89777458.0, "step": 5570 }, { "epoch": 1.2931973577471318, "grad_norm": 0.9532466530799866, "learning_rate": 4.8718647169626594e-05, "loss": 1.6482, "mean_token_accuracy": 0.6643812134861946, "num_tokens": 89939287.0, "step": 5580 }, { "epoch": 1.2955151234210223, "grad_norm": 0.8886408805847168, "learning_rate": 4.871271008178035e-05, "loss": 1.6256, "mean_token_accuracy": 0.6688634067773819, "num_tokens": 90100158.0, "step": 5590 }, { "epoch": 1.2978328890949125, "grad_norm": 0.9268935918807983, "learning_rate": 4.8706759634556556e-05, "loss": 1.6432, "mean_token_accuracy": 0.6668284296989441, "num_tokens": 90261501.0, "step": 5600 }, { "epoch": 1.3001506547688029, "grad_norm": 1.0413192510604858, "learning_rate": 4.8700795831307577e-05, "loss": 1.6396, "mean_token_accuracy": 0.6696178302168846, "num_tokens": 90420497.0, "step": 5610 }, { "epoch": 1.3024684204426933, "grad_norm": 0.9574744701385498, "learning_rate": 4.8694818675393344e-05, "loss": 1.6317, "mean_token_accuracy": 0.667943499982357, "num_tokens": 90580170.0, "step": 5620 }, { "epoch": 1.3047861861165835, "grad_norm": 0.9870886206626892, "learning_rate": 4.86888281701813e-05, "loss": 1.6392, "mean_token_accuracy": 0.6658263504505157, "num_tokens": 90742021.0, "step": 5630 }, { "epoch": 1.3071039517904741, "grad_norm": 1.0014625787734985, "learning_rate": 4.868282431904639e-05, "loss": 1.6261, "mean_token_accuracy": 0.6695694342255593, "num_tokens": 90903301.0, "step": 5640 }, { "epoch": 1.3094217174643643, "grad_norm": 0.9070147275924683, "learning_rate": 4.8676807125371116e-05, "loss": 1.6375, "mean_token_accuracy": 0.6672870814800262, "num_tokens": 91063149.0, "step": 5650 }, { "epoch": 1.3117394831382547, "grad_norm": 0.9792453646659851, "learning_rate": 4.8670776592545445e-05, "loss": 1.6209, "mean_token_accuracy": 0.6696529299020767, "num_tokens": 91222992.0, "step": 5660 }, { "epoch": 1.3140572488121451, "grad_norm": 0.8630582690238953, "learning_rate": 4.8664732723966896e-05, "loss": 1.6335, "mean_token_accuracy": 0.6675295457243919, "num_tokens": 91384892.0, "step": 5670 }, { "epoch": 1.3163750144860353, "grad_norm": 0.9712023735046387, "learning_rate": 4.86586755230405e-05, "loss": 1.6463, "mean_token_accuracy": 0.666418980062008, "num_tokens": 91546054.0, "step": 5680 }, { "epoch": 1.318692780159926, "grad_norm": 1.0619256496429443, "learning_rate": 4.8652604993178785e-05, "loss": 1.6408, "mean_token_accuracy": 0.6677900061011315, "num_tokens": 91706216.0, "step": 5690 }, { "epoch": 1.3210105458338162, "grad_norm": 1.005686640739441, "learning_rate": 4.864652113780179e-05, "loss": 1.6486, "mean_token_accuracy": 0.666243264079094, "num_tokens": 91867871.0, "step": 5700 }, { "epoch": 1.3233283115077066, "grad_norm": 0.9257636666297913, "learning_rate": 4.864042396033708e-05, "loss": 1.6288, "mean_token_accuracy": 0.6682513311505318, "num_tokens": 92029881.0, "step": 5710 }, { "epoch": 1.325646077181597, "grad_norm": 0.9223511219024658, "learning_rate": 4.863431346421969e-05, "loss": 1.6377, "mean_token_accuracy": 0.6667949452996254, "num_tokens": 92191486.0, "step": 5720 }, { "epoch": 1.3279638428554872, "grad_norm": 0.9525748491287231, "learning_rate": 4.862818965289221e-05, "loss": 1.6442, "mean_token_accuracy": 0.6663282036781311, "num_tokens": 92352848.0, "step": 5730 }, { "epoch": 1.3302816085293776, "grad_norm": 0.8697435855865479, "learning_rate": 4.862205252980467e-05, "loss": 1.6499, "mean_token_accuracy": 0.6655889213085174, "num_tokens": 92513957.0, "step": 5740 }, { "epoch": 1.332599374203268, "grad_norm": 0.9856190085411072, "learning_rate": 4.8615902098414646e-05, "loss": 1.6169, "mean_token_accuracy": 0.6681730628013611, "num_tokens": 92675412.0, "step": 5750 }, { "epoch": 1.3349171398771584, "grad_norm": 0.9283172488212585, "learning_rate": 4.8609738362187196e-05, "loss": 1.6558, "mean_token_accuracy": 0.6634155467152596, "num_tokens": 92837070.0, "step": 5760 }, { "epoch": 1.3372349055510488, "grad_norm": 0.9126998782157898, "learning_rate": 4.860356132459488e-05, "loss": 1.6535, "mean_token_accuracy": 0.6656754553318024, "num_tokens": 92998362.0, "step": 5770 }, { "epoch": 1.339552671224939, "grad_norm": 0.8932685852050781, "learning_rate": 4.859737098911774e-05, "loss": 1.6332, "mean_token_accuracy": 0.6674070686101914, "num_tokens": 93159055.0, "step": 5780 }, { "epoch": 1.3418704368988295, "grad_norm": 0.9072970747947693, "learning_rate": 4.859116735924331e-05, "loss": 1.6448, "mean_token_accuracy": 0.6662386372685433, "num_tokens": 93319850.0, "step": 5790 }, { "epoch": 1.3441882025727199, "grad_norm": 1.0460636615753174, "learning_rate": 4.858495043846663e-05, "loss": 1.6478, "mean_token_accuracy": 0.6668492197990418, "num_tokens": 93481496.0, "step": 5800 }, { "epoch": 1.3465059682466103, "grad_norm": 0.912661075592041, "learning_rate": 4.857872023029022e-05, "loss": 1.6316, "mean_token_accuracy": 0.6674600511789321, "num_tokens": 93642425.0, "step": 5810 }, { "epoch": 1.3488237339205007, "grad_norm": 0.9570991396903992, "learning_rate": 4.857247673822407e-05, "loss": 1.6542, "mean_token_accuracy": 0.6665983214974404, "num_tokens": 93803104.0, "step": 5820 }, { "epoch": 1.351141499594391, "grad_norm": 0.8594950437545776, "learning_rate": 4.856621996578568e-05, "loss": 1.6302, "mean_token_accuracy": 0.6679940894246101, "num_tokens": 93964243.0, "step": 5830 }, { "epoch": 1.3534592652682813, "grad_norm": 0.9848896861076355, "learning_rate": 4.855994991650001e-05, "loss": 1.639, "mean_token_accuracy": 0.6666417628526687, "num_tokens": 94125772.0, "step": 5840 }, { "epoch": 1.3557770309421717, "grad_norm": 0.986595869064331, "learning_rate": 4.8553666593899525e-05, "loss": 1.6352, "mean_token_accuracy": 0.6676309287548066, "num_tokens": 94287346.0, "step": 5850 }, { "epoch": 1.3580947966160621, "grad_norm": 0.8985373377799988, "learning_rate": 4.8547370001524144e-05, "loss": 1.6235, "mean_token_accuracy": 0.6689436495304107, "num_tokens": 94447991.0, "step": 5860 }, { "epoch": 1.3604125622899526, "grad_norm": 0.9458909630775452, "learning_rate": 4.854106014292127e-05, "loss": 1.6266, "mean_token_accuracy": 0.6674213081598281, "num_tokens": 94609661.0, "step": 5870 }, { "epoch": 1.3627303279638427, "grad_norm": 0.8799587488174438, "learning_rate": 4.8534737021645774e-05, "loss": 1.6349, "mean_token_accuracy": 0.6678979560732842, "num_tokens": 94770462.0, "step": 5880 }, { "epoch": 1.3650480936377332, "grad_norm": 1.0458879470825195, "learning_rate": 4.8528400641260016e-05, "loss": 1.6344, "mean_token_accuracy": 0.6684871286153793, "num_tokens": 94932044.0, "step": 5890 }, { "epoch": 1.3673658593116236, "grad_norm": 0.9595792889595032, "learning_rate": 4.8522051005333814e-05, "loss": 1.631, "mean_token_accuracy": 0.6685238540172577, "num_tokens": 95092632.0, "step": 5900 }, { "epoch": 1.369683624985514, "grad_norm": 0.9156360030174255, "learning_rate": 4.8515688117444446e-05, "loss": 1.6325, "mean_token_accuracy": 0.6671541944146157, "num_tokens": 95252802.0, "step": 5910 }, { "epoch": 1.3720013906594044, "grad_norm": 0.8675534725189209, "learning_rate": 4.8509311981176684e-05, "loss": 1.639, "mean_token_accuracy": 0.6661180153489112, "num_tokens": 95413239.0, "step": 5920 }, { "epoch": 1.3743191563332946, "grad_norm": 0.9752662777900696, "learning_rate": 4.850292260012273e-05, "loss": 1.6132, "mean_token_accuracy": 0.6695626765489578, "num_tokens": 95574553.0, "step": 5930 }, { "epoch": 1.376636922007185, "grad_norm": 0.8787292242050171, "learning_rate": 4.849651997788226e-05, "loss": 1.6595, "mean_token_accuracy": 0.6646679982542991, "num_tokens": 95735863.0, "step": 5940 }, { "epoch": 1.3789546876810754, "grad_norm": 0.965189516544342, "learning_rate": 4.8490104118062437e-05, "loss": 1.6197, "mean_token_accuracy": 0.6686425775289535, "num_tokens": 95896662.0, "step": 5950 }, { "epoch": 1.3812724533549658, "grad_norm": 0.9070329666137695, "learning_rate": 4.848367502427783e-05, "loss": 1.641, "mean_token_accuracy": 0.6666617587208747, "num_tokens": 96057967.0, "step": 5960 }, { "epoch": 1.3835902190288563, "grad_norm": 0.9834482073783875, "learning_rate": 4.847723270015051e-05, "loss": 1.6326, "mean_token_accuracy": 0.6679441601037979, "num_tokens": 96219551.0, "step": 5970 }, { "epoch": 1.3859079847027465, "grad_norm": 0.9405391812324524, "learning_rate": 4.8470777149309974e-05, "loss": 1.6433, "mean_token_accuracy": 0.6668954610824585, "num_tokens": 96380727.0, "step": 5980 }, { "epoch": 1.3882257503766369, "grad_norm": 0.9828850626945496, "learning_rate": 4.846430837539319e-05, "loss": 1.6408, "mean_token_accuracy": 0.6673390477895736, "num_tokens": 96542416.0, "step": 5990 }, { "epoch": 1.3905435160505273, "grad_norm": 0.921890139579773, "learning_rate": 4.8457826382044555e-05, "loss": 1.62, "mean_token_accuracy": 0.66998850107193, "num_tokens": 96703466.0, "step": 6000 }, { "epoch": 1.3928612817244177, "grad_norm": 0.9669054746627808, "learning_rate": 4.8451331172915935e-05, "loss": 1.6494, "mean_token_accuracy": 0.6652275204658509, "num_tokens": 96863890.0, "step": 6010 }, { "epoch": 1.3951790473983081, "grad_norm": 0.883021354675293, "learning_rate": 4.844482275166663e-05, "loss": 1.6488, "mean_token_accuracy": 0.6679476574063301, "num_tokens": 97025848.0, "step": 6020 }, { "epoch": 1.3974968130721983, "grad_norm": 0.8874942064285278, "learning_rate": 4.8438301121963376e-05, "loss": 1.6552, "mean_token_accuracy": 0.6639342173933983, "num_tokens": 97187813.0, "step": 6030 }, { "epoch": 1.3998145787460887, "grad_norm": 0.9215936660766602, "learning_rate": 4.843176628748036e-05, "loss": 1.6289, "mean_token_accuracy": 0.6668382287025452, "num_tokens": 97349800.0, "step": 6040 }, { "epoch": 1.4021323444199791, "grad_norm": 0.9176316857337952, "learning_rate": 4.8425218251899224e-05, "loss": 1.6389, "mean_token_accuracy": 0.666423662006855, "num_tokens": 97510398.0, "step": 6050 }, { "epoch": 1.4044501100938696, "grad_norm": 0.8882095217704773, "learning_rate": 4.8418657018909015e-05, "loss": 1.6121, "mean_token_accuracy": 0.6717051193118095, "num_tokens": 97671834.0, "step": 6060 }, { "epoch": 1.40676787576776, "grad_norm": 0.959479808807373, "learning_rate": 4.841208259220623e-05, "loss": 1.6252, "mean_token_accuracy": 0.6701214283704757, "num_tokens": 97833900.0, "step": 6070 }, { "epoch": 1.4090856414416502, "grad_norm": 0.9820235371589661, "learning_rate": 4.840549497549481e-05, "loss": 1.6269, "mean_token_accuracy": 0.6689907357096672, "num_tokens": 97994124.0, "step": 6080 }, { "epoch": 1.4114034071155406, "grad_norm": 1.0177104473114014, "learning_rate": 4.839889417248612e-05, "loss": 1.6299, "mean_token_accuracy": 0.6682976081967353, "num_tokens": 98156246.0, "step": 6090 }, { "epoch": 1.413721172789431, "grad_norm": 0.9674437046051025, "learning_rate": 4.839228018689893e-05, "loss": 1.6324, "mean_token_accuracy": 0.6689804494380951, "num_tokens": 98317956.0, "step": 6100 }, { "epoch": 1.4160389384633214, "grad_norm": 0.8684967160224915, "learning_rate": 4.838565302245949e-05, "loss": 1.6117, "mean_token_accuracy": 0.6693948701024055, "num_tokens": 98479872.0, "step": 6110 }, { "epoch": 1.4183567041372118, "grad_norm": 0.9032576680183411, "learning_rate": 4.8379012682901406e-05, "loss": 1.6298, "mean_token_accuracy": 0.6687612786889077, "num_tokens": 98641638.0, "step": 6120 }, { "epoch": 1.420674469811102, "grad_norm": 0.9680318832397461, "learning_rate": 4.837235917196577e-05, "loss": 1.6201, "mean_token_accuracy": 0.6696533784270287, "num_tokens": 98802164.0, "step": 6130 }, { "epoch": 1.4229922354849924, "grad_norm": 1.0233021974563599, "learning_rate": 4.836569249340107e-05, "loss": 1.6457, "mean_token_accuracy": 0.6666064664721489, "num_tokens": 98962861.0, "step": 6140 }, { "epoch": 1.4253100011588828, "grad_norm": 0.9192615151405334, "learning_rate": 4.83590126509632e-05, "loss": 1.6367, "mean_token_accuracy": 0.6667204901576043, "num_tokens": 99124191.0, "step": 6150 }, { "epoch": 1.4276277668327733, "grad_norm": 1.029840111732483, "learning_rate": 4.835231964841548e-05, "loss": 1.6343, "mean_token_accuracy": 0.6683445855975151, "num_tokens": 99284006.0, "step": 6160 }, { "epoch": 1.4299455325066637, "grad_norm": 0.8898642063140869, "learning_rate": 4.834561348952864e-05, "loss": 1.6246, "mean_token_accuracy": 0.6671500071883202, "num_tokens": 99445727.0, "step": 6170 }, { "epoch": 1.4322632981805539, "grad_norm": 0.9864247441291809, "learning_rate": 4.833889417808084e-05, "loss": 1.6263, "mean_token_accuracy": 0.6670430228114128, "num_tokens": 99607028.0, "step": 6180 }, { "epoch": 1.4345810638544443, "grad_norm": 0.9432833194732666, "learning_rate": 4.8332161717857636e-05, "loss": 1.6485, "mean_token_accuracy": 0.6671121656894684, "num_tokens": 99768499.0, "step": 6190 }, { "epoch": 1.4368988295283347, "grad_norm": 0.9403245449066162, "learning_rate": 4.832541611265198e-05, "loss": 1.6254, "mean_token_accuracy": 0.6683650329709053, "num_tokens": 99929791.0, "step": 6200 }, { "epoch": 1.4392165952022251, "grad_norm": 0.896673321723938, "learning_rate": 4.831865736626426e-05, "loss": 1.6397, "mean_token_accuracy": 0.6662025466561318, "num_tokens": 100091337.0, "step": 6210 }, { "epoch": 1.4415343608761155, "grad_norm": 0.9501383304595947, "learning_rate": 4.831188548250224e-05, "loss": 1.6328, "mean_token_accuracy": 0.6674139991402626, "num_tokens": 100252799.0, "step": 6220 }, { "epoch": 1.4438521265500057, "grad_norm": 0.9422773718833923, "learning_rate": 4.8305100465181095e-05, "loss": 1.6276, "mean_token_accuracy": 0.6679717615246773, "num_tokens": 100413913.0, "step": 6230 }, { "epoch": 1.4461698922238961, "grad_norm": 0.9697166681289673, "learning_rate": 4.829830231812341e-05, "loss": 1.6204, "mean_token_accuracy": 0.6692275196313858, "num_tokens": 100575377.0, "step": 6240 }, { "epoch": 1.4484876578977866, "grad_norm": 0.8706114292144775, "learning_rate": 4.829149104515914e-05, "loss": 1.6055, "mean_token_accuracy": 0.6698480248451233, "num_tokens": 100736008.0, "step": 6250 }, { "epoch": 1.450805423571677, "grad_norm": 0.8743609189987183, "learning_rate": 4.828466665012567e-05, "loss": 1.6252, "mean_token_accuracy": 0.6686190500855446, "num_tokens": 100897531.0, "step": 6260 }, { "epoch": 1.4531231892455674, "grad_norm": 0.9364244937896729, "learning_rate": 4.827782913686774e-05, "loss": 1.6312, "mean_token_accuracy": 0.6677579134702682, "num_tokens": 101059064.0, "step": 6270 }, { "epoch": 1.4554409549194576, "grad_norm": 0.9550907015800476, "learning_rate": 4.827097850923751e-05, "loss": 1.6303, "mean_token_accuracy": 0.667827595770359, "num_tokens": 101220707.0, "step": 6280 }, { "epoch": 1.457758720593348, "grad_norm": 0.9894869923591614, "learning_rate": 4.826411477109453e-05, "loss": 1.6302, "mean_token_accuracy": 0.6704351305961609, "num_tokens": 101381581.0, "step": 6290 }, { "epoch": 1.4600764862672384, "grad_norm": 0.9603965282440186, "learning_rate": 4.82572379263057e-05, "loss": 1.6475, "mean_token_accuracy": 0.6648008480668068, "num_tokens": 101543265.0, "step": 6300 }, { "epoch": 1.4623942519411288, "grad_norm": 0.9128243327140808, "learning_rate": 4.825034797874536e-05, "loss": 1.6154, "mean_token_accuracy": 0.6690594330430031, "num_tokens": 101705061.0, "step": 6310 }, { "epoch": 1.4647120176150192, "grad_norm": 0.9246515035629272, "learning_rate": 4.824344493229517e-05, "loss": 1.6215, "mean_token_accuracy": 0.6682519108057022, "num_tokens": 101866850.0, "step": 6320 }, { "epoch": 1.4670297832889094, "grad_norm": 0.9296827912330627, "learning_rate": 4.823652879084422e-05, "loss": 1.6141, "mean_token_accuracy": 0.6700446680188179, "num_tokens": 102027920.0, "step": 6330 }, { "epoch": 1.4693475489627998, "grad_norm": 0.9537080526351929, "learning_rate": 4.822959955828895e-05, "loss": 1.6151, "mean_token_accuracy": 0.6709495976567268, "num_tokens": 102188336.0, "step": 6340 }, { "epoch": 1.4716653146366903, "grad_norm": 0.8644818663597107, "learning_rate": 4.8222657238533196e-05, "loss": 1.617, "mean_token_accuracy": 0.6701490402221679, "num_tokens": 102349445.0, "step": 6350 }, { "epoch": 1.4739830803105807, "grad_norm": 0.9287856221199036, "learning_rate": 4.8215701835488135e-05, "loss": 1.6348, "mean_token_accuracy": 0.6670252770185471, "num_tokens": 102510379.0, "step": 6360 }, { "epoch": 1.476300845984471, "grad_norm": 0.8648366332054138, "learning_rate": 4.820873335307235e-05, "loss": 1.6299, "mean_token_accuracy": 0.6681587010622024, "num_tokens": 102671875.0, "step": 6370 }, { "epoch": 1.4786186116583613, "grad_norm": 0.9385184645652771, "learning_rate": 4.820175179521177e-05, "loss": 1.6251, "mean_token_accuracy": 0.6688551783561707, "num_tokens": 102832558.0, "step": 6380 }, { "epoch": 1.4809363773322517, "grad_norm": 0.908819854259491, "learning_rate": 4.81947571658397e-05, "loss": 1.6261, "mean_token_accuracy": 0.6686033919453621, "num_tokens": 102993017.0, "step": 6390 }, { "epoch": 1.4832541430061421, "grad_norm": 0.9855405688285828, "learning_rate": 4.8187749468896806e-05, "loss": 1.6473, "mean_token_accuracy": 0.6634609118103981, "num_tokens": 103153936.0, "step": 6400 }, { "epoch": 1.4855719086800325, "grad_norm": 0.9256932735443115, "learning_rate": 4.818072870833111e-05, "loss": 1.6379, "mean_token_accuracy": 0.667057740688324, "num_tokens": 103315597.0, "step": 6410 }, { "epoch": 1.487889674353923, "grad_norm": 0.903532087802887, "learning_rate": 4.817369488809802e-05, "loss": 1.6572, "mean_token_accuracy": 0.6651875436306, "num_tokens": 103477285.0, "step": 6420 }, { "epoch": 1.4902074400278131, "grad_norm": 0.8866155743598938, "learning_rate": 4.816664801216027e-05, "loss": 1.6234, "mean_token_accuracy": 0.6685026943683624, "num_tokens": 103638819.0, "step": 6430 }, { "epoch": 1.4925252057017036, "grad_norm": 0.9270355701446533, "learning_rate": 4.815958808448796e-05, "loss": 1.6547, "mean_token_accuracy": 0.6640980660915374, "num_tokens": 103799923.0, "step": 6440 }, { "epoch": 1.494842971375594, "grad_norm": 0.9117171168327332, "learning_rate": 4.815251510905854e-05, "loss": 1.6196, "mean_token_accuracy": 0.6691268384456635, "num_tokens": 103961332.0, "step": 6450 }, { "epoch": 1.4971607370494844, "grad_norm": 0.882796049118042, "learning_rate": 4.8145429089856834e-05, "loss": 1.6187, "mean_token_accuracy": 0.6701616570353508, "num_tokens": 104122677.0, "step": 6460 }, { "epoch": 1.4994785027233748, "grad_norm": 0.8882198929786682, "learning_rate": 4.8138330030874976e-05, "loss": 1.6308, "mean_token_accuracy": 0.6668872475624085, "num_tokens": 104281927.0, "step": 6470 }, { "epoch": 1.501796268397265, "grad_norm": 0.9241828918457031, "learning_rate": 4.813121793611249e-05, "loss": 1.6287, "mean_token_accuracy": 0.6684737518429756, "num_tokens": 104442027.0, "step": 6480 }, { "epoch": 1.5041140340711554, "grad_norm": 0.9198806881904602, "learning_rate": 4.81240928095762e-05, "loss": 1.6226, "mean_token_accuracy": 0.669235198199749, "num_tokens": 104603011.0, "step": 6490 }, { "epoch": 1.5064317997450458, "grad_norm": 0.9480087161064148, "learning_rate": 4.811695465528031e-05, "loss": 1.6418, "mean_token_accuracy": 0.6670579552650452, "num_tokens": 104763887.0, "step": 6500 }, { "epoch": 1.508749565418936, "grad_norm": 0.8957732915878296, "learning_rate": 4.810980347724634e-05, "loss": 1.6271, "mean_token_accuracy": 0.6680972725152969, "num_tokens": 104924847.0, "step": 6510 }, { "epoch": 1.5110673310928266, "grad_norm": 0.8428501486778259, "learning_rate": 4.810263927950315e-05, "loss": 1.6265, "mean_token_accuracy": 0.667701967060566, "num_tokens": 105086661.0, "step": 6520 }, { "epoch": 1.5133850967667168, "grad_norm": 0.9000861644744873, "learning_rate": 4.809546206608694e-05, "loss": 1.6549, "mean_token_accuracy": 0.6637429192662239, "num_tokens": 105247526.0, "step": 6530 }, { "epoch": 1.5157028624406073, "grad_norm": 0.9398950934410095, "learning_rate": 4.8088271841041245e-05, "loss": 1.6182, "mean_token_accuracy": 0.6700411602854729, "num_tokens": 105408238.0, "step": 6540 }, { "epoch": 1.5180206281144977, "grad_norm": 0.8709012269973755, "learning_rate": 4.8081068608416924e-05, "loss": 1.6302, "mean_token_accuracy": 0.6673229068517685, "num_tokens": 105569077.0, "step": 6550 }, { "epoch": 1.5203383937883879, "grad_norm": 0.918907642364502, "learning_rate": 4.8073852372272176e-05, "loss": 1.63, "mean_token_accuracy": 0.6687179133296013, "num_tokens": 105730381.0, "step": 6560 }, { "epoch": 1.5226561594622785, "grad_norm": 0.9010257124900818, "learning_rate": 4.8066623136672514e-05, "loss": 1.641, "mean_token_accuracy": 0.6657818362116814, "num_tokens": 105892148.0, "step": 6570 }, { "epoch": 1.5249739251361687, "grad_norm": 0.8913388252258301, "learning_rate": 4.805938090569077e-05, "loss": 1.6071, "mean_token_accuracy": 0.6698646947741509, "num_tokens": 106052971.0, "step": 6580 }, { "epoch": 1.527291690810059, "grad_norm": 0.9111267924308777, "learning_rate": 4.8052125683407116e-05, "loss": 1.6021, "mean_token_accuracy": 0.6711113676428795, "num_tokens": 106214093.0, "step": 6590 }, { "epoch": 1.5296094564839495, "grad_norm": 0.9237767457962036, "learning_rate": 4.804485747390903e-05, "loss": 1.6208, "mean_token_accuracy": 0.6697884902358056, "num_tokens": 106374762.0, "step": 6600 }, { "epoch": 1.5319272221578397, "grad_norm": 0.9697799682617188, "learning_rate": 4.803757628129132e-05, "loss": 1.63, "mean_token_accuracy": 0.6694128900766373, "num_tokens": 106535899.0, "step": 6610 }, { "epoch": 1.5342449878317304, "grad_norm": 0.8706051707267761, "learning_rate": 4.8030282109656076e-05, "loss": 1.6271, "mean_token_accuracy": 0.668670754134655, "num_tokens": 106696830.0, "step": 6620 }, { "epoch": 1.5365627535056205, "grad_norm": 0.9834458231925964, "learning_rate": 4.802297496311274e-05, "loss": 1.6371, "mean_token_accuracy": 0.6663829952478408, "num_tokens": 106858475.0, "step": 6630 }, { "epoch": 1.538880519179511, "grad_norm": 0.929908037185669, "learning_rate": 4.801565484577804e-05, "loss": 1.5996, "mean_token_accuracy": 0.6707342803478241, "num_tokens": 107019616.0, "step": 6640 }, { "epoch": 1.5411982848534014, "grad_norm": 0.9493139982223511, "learning_rate": 4.800832176177602e-05, "loss": 1.633, "mean_token_accuracy": 0.66839699447155, "num_tokens": 107181847.0, "step": 6650 }, { "epoch": 1.5435160505272916, "grad_norm": 0.9254579544067383, "learning_rate": 4.800097571523803e-05, "loss": 1.6309, "mean_token_accuracy": 0.6690337061882019, "num_tokens": 107343373.0, "step": 6660 }, { "epoch": 1.5458338162011822, "grad_norm": 0.9536603093147278, "learning_rate": 4.799361671030271e-05, "loss": 1.6463, "mean_token_accuracy": 0.6649050727486611, "num_tokens": 107503714.0, "step": 6670 }, { "epoch": 1.5481515818750724, "grad_norm": 0.9013090133666992, "learning_rate": 4.7986244751116026e-05, "loss": 1.6289, "mean_token_accuracy": 0.6666588515043259, "num_tokens": 107665407.0, "step": 6680 }, { "epoch": 1.5504693475489628, "grad_norm": 0.8715863227844238, "learning_rate": 4.797885984183121e-05, "loss": 1.6264, "mean_token_accuracy": 0.6696239024400711, "num_tokens": 107826094.0, "step": 6690 }, { "epoch": 1.5527871132228532, "grad_norm": 0.9761693477630615, "learning_rate": 4.7971461986608825e-05, "loss": 1.6291, "mean_token_accuracy": 0.6667206779122352, "num_tokens": 107987375.0, "step": 6700 }, { "epoch": 1.5551048788967434, "grad_norm": 0.9413326978683472, "learning_rate": 4.796405118961669e-05, "loss": 1.6339, "mean_token_accuracy": 0.6676578938961029, "num_tokens": 108147609.0, "step": 6710 }, { "epoch": 1.557422644570634, "grad_norm": 0.9243894219398499, "learning_rate": 4.795662745502994e-05, "loss": 1.6252, "mean_token_accuracy": 0.6672645628452301, "num_tokens": 108309172.0, "step": 6720 }, { "epoch": 1.5597404102445243, "grad_norm": 0.9139487147331238, "learning_rate": 4.7949190787031015e-05, "loss": 1.6302, "mean_token_accuracy": 0.6673160463571548, "num_tokens": 108470666.0, "step": 6730 }, { "epoch": 1.5620581759184147, "grad_norm": 0.8579688668251038, "learning_rate": 4.794174118980959e-05, "loss": 1.6176, "mean_token_accuracy": 0.6688989892601966, "num_tokens": 108631978.0, "step": 6740 }, { "epoch": 1.564375941592305, "grad_norm": 0.9512479901313782, "learning_rate": 4.7934278667562664e-05, "loss": 1.6308, "mean_token_accuracy": 0.6665326625108718, "num_tokens": 108793554.0, "step": 6750 }, { "epoch": 1.5666937072661953, "grad_norm": 0.8428590297698975, "learning_rate": 4.7926803224494514e-05, "loss": 1.6389, "mean_token_accuracy": 0.6661309406161309, "num_tokens": 108954936.0, "step": 6760 }, { "epoch": 1.569011472940086, "grad_norm": 0.8564085364341736, "learning_rate": 4.791931486481669e-05, "loss": 1.6105, "mean_token_accuracy": 0.6705759927630425, "num_tokens": 109115671.0, "step": 6770 }, { "epoch": 1.571329238613976, "grad_norm": 0.9621458053588867, "learning_rate": 4.7911813592748e-05, "loss": 1.6137, "mean_token_accuracy": 0.6682850286364556, "num_tokens": 109276879.0, "step": 6780 }, { "epoch": 1.5736470042878665, "grad_norm": 0.909195601940155, "learning_rate": 4.790429941251458e-05, "loss": 1.6202, "mean_token_accuracy": 0.6697250992059708, "num_tokens": 109437607.0, "step": 6790 }, { "epoch": 1.575964769961757, "grad_norm": 0.8866643309593201, "learning_rate": 4.789677232834977e-05, "loss": 1.6216, "mean_token_accuracy": 0.6672923266887665, "num_tokens": 109597677.0, "step": 6800 }, { "epoch": 1.5782825356356471, "grad_norm": 0.9039297103881836, "learning_rate": 4.7889232344494225e-05, "loss": 1.61, "mean_token_accuracy": 0.6703900068998336, "num_tokens": 109757676.0, "step": 6810 }, { "epoch": 1.5806003013095378, "grad_norm": 0.884681224822998, "learning_rate": 4.788167946519587e-05, "loss": 1.6139, "mean_token_accuracy": 0.6677000522613525, "num_tokens": 109919688.0, "step": 6820 }, { "epoch": 1.582918066983428, "grad_norm": 0.9282922148704529, "learning_rate": 4.787411369470988e-05, "loss": 1.6282, "mean_token_accuracy": 0.66787591278553, "num_tokens": 110080762.0, "step": 6830 }, { "epoch": 1.5852358326573184, "grad_norm": 0.8886680603027344, "learning_rate": 4.7866535037298664e-05, "loss": 1.624, "mean_token_accuracy": 0.6674341112375259, "num_tokens": 110239268.0, "step": 6840 }, { "epoch": 1.5875535983312088, "grad_norm": 0.9401371479034424, "learning_rate": 4.7858943497231956e-05, "loss": 1.6442, "mean_token_accuracy": 0.6667677119374276, "num_tokens": 110399976.0, "step": 6850 }, { "epoch": 1.589871364005099, "grad_norm": 1.0371129512786865, "learning_rate": 4.78513390787867e-05, "loss": 1.6226, "mean_token_accuracy": 0.6685519143939018, "num_tokens": 110561769.0, "step": 6860 }, { "epoch": 1.5921891296789894, "grad_norm": 0.9031786322593689, "learning_rate": 4.7843721786247107e-05, "loss": 1.6285, "mean_token_accuracy": 0.6684768259525299, "num_tokens": 110722475.0, "step": 6870 }, { "epoch": 1.5945068953528798, "grad_norm": 0.8674361705780029, "learning_rate": 4.7836091623904645e-05, "loss": 1.6458, "mean_token_accuracy": 0.6652881562709808, "num_tokens": 110883980.0, "step": 6880 }, { "epoch": 1.5968246610267702, "grad_norm": 0.89284348487854, "learning_rate": 4.782844859605803e-05, "loss": 1.6131, "mean_token_accuracy": 0.6688280209898949, "num_tokens": 111045681.0, "step": 6890 }, { "epoch": 1.5991424267006606, "grad_norm": 0.9640328288078308, "learning_rate": 4.782079270701323e-05, "loss": 1.6273, "mean_token_accuracy": 0.6677613034844398, "num_tokens": 111206536.0, "step": 6900 }, { "epoch": 1.6014601923745508, "grad_norm": 0.9486966729164124, "learning_rate": 4.781312396108345e-05, "loss": 1.6146, "mean_token_accuracy": 0.6696506202220917, "num_tokens": 111368358.0, "step": 6910 }, { "epoch": 1.6037779580484413, "grad_norm": 0.9358534216880798, "learning_rate": 4.7805442362589146e-05, "loss": 1.626, "mean_token_accuracy": 0.6685999140143395, "num_tokens": 111529680.0, "step": 6920 }, { "epoch": 1.6060957237223317, "grad_norm": 0.8874921202659607, "learning_rate": 4.779774791585801e-05, "loss": 1.6128, "mean_token_accuracy": 0.6709885537624359, "num_tokens": 111691273.0, "step": 6930 }, { "epoch": 1.608413489396222, "grad_norm": 0.9925693273544312, "learning_rate": 4.7790040625224994e-05, "loss": 1.636, "mean_token_accuracy": 0.6672026321291924, "num_tokens": 111851674.0, "step": 6940 }, { "epoch": 1.6107312550701125, "grad_norm": 0.8326547741889954, "learning_rate": 4.7782320495032244e-05, "loss": 1.6164, "mean_token_accuracy": 0.6703977197408676, "num_tokens": 112013065.0, "step": 6950 }, { "epoch": 1.6130490207440027, "grad_norm": 0.9244637489318848, "learning_rate": 4.7774587529629176e-05, "loss": 1.6292, "mean_token_accuracy": 0.6675394728779793, "num_tokens": 112174926.0, "step": 6960 }, { "epoch": 1.615366786417893, "grad_norm": 0.8845772743225098, "learning_rate": 4.776684173337241e-05, "loss": 1.6074, "mean_token_accuracy": 0.6699056699872017, "num_tokens": 112334898.0, "step": 6970 }, { "epoch": 1.6176845520917835, "grad_norm": 0.8960733413696289, "learning_rate": 4.775908311062583e-05, "loss": 1.6264, "mean_token_accuracy": 0.6678862512111664, "num_tokens": 112494594.0, "step": 6980 }, { "epoch": 1.620002317765674, "grad_norm": 0.9485416412353516, "learning_rate": 4.775131166576051e-05, "loss": 1.6162, "mean_token_accuracy": 0.6689054265618324, "num_tokens": 112654996.0, "step": 6990 }, { "epoch": 1.6223200834395644, "grad_norm": 0.8952045440673828, "learning_rate": 4.7743527403154766e-05, "loss": 1.6443, "mean_token_accuracy": 0.6644425526261329, "num_tokens": 112816528.0, "step": 7000 }, { "epoch": 1.6246378491134545, "grad_norm": 0.9659868478775024, "learning_rate": 4.7735730327194136e-05, "loss": 1.616, "mean_token_accuracy": 0.6695608794689178, "num_tokens": 112977564.0, "step": 7010 }, { "epoch": 1.626955614787345, "grad_norm": 0.9140159487724304, "learning_rate": 4.772792044227137e-05, "loss": 1.6209, "mean_token_accuracy": 0.6684606775641442, "num_tokens": 113137210.0, "step": 7020 }, { "epoch": 1.6292733804612354, "grad_norm": 0.9852905869483948, "learning_rate": 4.772009775278643e-05, "loss": 1.6347, "mean_token_accuracy": 0.6662852138280868, "num_tokens": 113298827.0, "step": 7030 }, { "epoch": 1.6315911461351258, "grad_norm": 0.950326681137085, "learning_rate": 4.7712262263146516e-05, "loss": 1.6053, "mean_token_accuracy": 0.6707319155335426, "num_tokens": 113459874.0, "step": 7040 }, { "epoch": 1.6339089118090162, "grad_norm": 0.9844124913215637, "learning_rate": 4.770441397776602e-05, "loss": 1.6324, "mean_token_accuracy": 0.6662065118551255, "num_tokens": 113620469.0, "step": 7050 }, { "epoch": 1.6362266774829064, "grad_norm": 0.9131007790565491, "learning_rate": 4.769655290106653e-05, "loss": 1.6185, "mean_token_accuracy": 0.6683483973145485, "num_tokens": 113781611.0, "step": 7060 }, { "epoch": 1.6385444431567968, "grad_norm": 0.8897318243980408, "learning_rate": 4.768867903747687e-05, "loss": 1.6139, "mean_token_accuracy": 0.6698591887950898, "num_tokens": 113943250.0, "step": 7070 }, { "epoch": 1.6408622088306872, "grad_norm": 0.9325919151306152, "learning_rate": 4.768079239143305e-05, "loss": 1.6102, "mean_token_accuracy": 0.6693206280469894, "num_tokens": 114104232.0, "step": 7080 }, { "epoch": 1.6431799745045774, "grad_norm": 0.8528140187263489, "learning_rate": 4.767289296737828e-05, "loss": 1.6115, "mean_token_accuracy": 0.6715900465846062, "num_tokens": 114265599.0, "step": 7090 }, { "epoch": 1.645497740178468, "grad_norm": 0.8792181015014648, "learning_rate": 4.766498076976299e-05, "loss": 1.6151, "mean_token_accuracy": 0.6700529962778091, "num_tokens": 114426686.0, "step": 7100 }, { "epoch": 1.6478155058523583, "grad_norm": 0.8788661360740662, "learning_rate": 4.765705580304478e-05, "loss": 1.6125, "mean_token_accuracy": 0.6693087935447692, "num_tokens": 114588438.0, "step": 7110 }, { "epoch": 1.6501332715262487, "grad_norm": 0.9197399020195007, "learning_rate": 4.764911807168845e-05, "loss": 1.6333, "mean_token_accuracy": 0.6662102788686752, "num_tokens": 114750196.0, "step": 7120 }, { "epoch": 1.652451037200139, "grad_norm": 0.9226119518280029, "learning_rate": 4.7641167580166005e-05, "loss": 1.6046, "mean_token_accuracy": 0.6699122101068496, "num_tokens": 114911563.0, "step": 7130 }, { "epoch": 1.6547688028740293, "grad_norm": 0.9239480495452881, "learning_rate": 4.7633204332956625e-05, "loss": 1.6277, "mean_token_accuracy": 0.6685487881302834, "num_tokens": 115072805.0, "step": 7140 }, { "epoch": 1.65708656854792, "grad_norm": 0.8682395219802856, "learning_rate": 4.762522833454669e-05, "loss": 1.6257, "mean_token_accuracy": 0.6683442562818527, "num_tokens": 115234594.0, "step": 7150 }, { "epoch": 1.65940433422181, "grad_norm": 0.9213774800300598, "learning_rate": 4.761723958942975e-05, "loss": 1.6171, "mean_token_accuracy": 0.6697243094444275, "num_tokens": 115394943.0, "step": 7160 }, { "epoch": 1.6617220998957005, "grad_norm": 0.9166555404663086, "learning_rate": 4.760923810210654e-05, "loss": 1.6331, "mean_token_accuracy": 0.6673653855919838, "num_tokens": 115556537.0, "step": 7170 }, { "epoch": 1.664039865569591, "grad_norm": 0.8811573386192322, "learning_rate": 4.760122387708497e-05, "loss": 1.6127, "mean_token_accuracy": 0.6697865471243858, "num_tokens": 115718150.0, "step": 7180 }, { "epoch": 1.6663576312434811, "grad_norm": 0.8674602508544922, "learning_rate": 4.7593196918880145e-05, "loss": 1.629, "mean_token_accuracy": 0.6667844265699386, "num_tokens": 115879682.0, "step": 7190 }, { "epoch": 1.6686753969173718, "grad_norm": 0.9818097352981567, "learning_rate": 4.758515723201432e-05, "loss": 1.6153, "mean_token_accuracy": 0.6689330577850342, "num_tokens": 116040729.0, "step": 7200 }, { "epoch": 1.670993162591262, "grad_norm": 0.8923401832580566, "learning_rate": 4.757710482101693e-05, "loss": 1.6182, "mean_token_accuracy": 0.6687459513545037, "num_tokens": 116201584.0, "step": 7210 }, { "epoch": 1.6733109282651524, "grad_norm": 0.9418288469314575, "learning_rate": 4.756903969042459e-05, "loss": 1.626, "mean_token_accuracy": 0.6675539761781693, "num_tokens": 116362229.0, "step": 7220 }, { "epoch": 1.6756286939390428, "grad_norm": 0.9835395812988281, "learning_rate": 4.756096184478107e-05, "loss": 1.6052, "mean_token_accuracy": 0.6708303153514862, "num_tokens": 116523392.0, "step": 7230 }, { "epoch": 1.677946459612933, "grad_norm": 0.8755620121955872, "learning_rate": 4.755287128863729e-05, "loss": 1.622, "mean_token_accuracy": 0.6672364965081214, "num_tokens": 116684110.0, "step": 7240 }, { "epoch": 1.6802642252868236, "grad_norm": 0.9865872859954834, "learning_rate": 4.754476802655136e-05, "loss": 1.5915, "mean_token_accuracy": 0.6730170652270318, "num_tokens": 116845270.0, "step": 7250 }, { "epoch": 1.6825819909607138, "grad_norm": 0.9473416209220886, "learning_rate": 4.753665206308853e-05, "loss": 1.6231, "mean_token_accuracy": 0.6675307020545006, "num_tokens": 117006860.0, "step": 7260 }, { "epoch": 1.6848997566346042, "grad_norm": 0.9493829607963562, "learning_rate": 4.752852340282121e-05, "loss": 1.6097, "mean_token_accuracy": 0.6679642543196678, "num_tokens": 117168467.0, "step": 7270 }, { "epoch": 1.6872175223084946, "grad_norm": 0.9553330540657043, "learning_rate": 4.7520382050328954e-05, "loss": 1.6007, "mean_token_accuracy": 0.6725415647029876, "num_tokens": 117328910.0, "step": 7280 }, { "epoch": 1.6895352879823848, "grad_norm": 0.9469290375709534, "learning_rate": 4.751222801019849e-05, "loss": 1.6143, "mean_token_accuracy": 0.6717975825071335, "num_tokens": 117490163.0, "step": 7290 }, { "epoch": 1.6918530536562755, "grad_norm": 0.9674245715141296, "learning_rate": 4.750406128702367e-05, "loss": 1.5944, "mean_token_accuracy": 0.6707197293639183, "num_tokens": 117651132.0, "step": 7300 }, { "epoch": 1.6941708193301657, "grad_norm": 0.9178436398506165, "learning_rate": 4.74958818854055e-05, "loss": 1.6162, "mean_token_accuracy": 0.669378112256527, "num_tokens": 117812990.0, "step": 7310 }, { "epoch": 1.696488585004056, "grad_norm": 0.8957508206367493, "learning_rate": 4.748768980995213e-05, "loss": 1.62, "mean_token_accuracy": 0.6686677679419517, "num_tokens": 117974701.0, "step": 7320 }, { "epoch": 1.6988063506779465, "grad_norm": 0.870993971824646, "learning_rate": 4.747948506527886e-05, "loss": 1.6162, "mean_token_accuracy": 0.6681779339909554, "num_tokens": 118135723.0, "step": 7330 }, { "epoch": 1.7011241163518367, "grad_norm": 0.9091371893882751, "learning_rate": 4.74712676560081e-05, "loss": 1.6249, "mean_token_accuracy": 0.668052251636982, "num_tokens": 118296597.0, "step": 7340 }, { "epoch": 1.7034418820257273, "grad_norm": 0.8581207990646362, "learning_rate": 4.7463037586769434e-05, "loss": 1.6198, "mean_token_accuracy": 0.668710532784462, "num_tokens": 118457087.0, "step": 7350 }, { "epoch": 1.7057596476996175, "grad_norm": 1.0410507917404175, "learning_rate": 4.745479486219953e-05, "loss": 1.607, "mean_token_accuracy": 0.6709647968411445, "num_tokens": 118616665.0, "step": 7360 }, { "epoch": 1.708077413373508, "grad_norm": 0.8890935778617859, "learning_rate": 4.744653948694224e-05, "loss": 1.6256, "mean_token_accuracy": 0.6664139062166214, "num_tokens": 118778303.0, "step": 7370 }, { "epoch": 1.7103951790473984, "grad_norm": 0.8973677158355713, "learning_rate": 4.74382714656485e-05, "loss": 1.6247, "mean_token_accuracy": 0.6689519852399826, "num_tokens": 118939338.0, "step": 7380 }, { "epoch": 1.7127129447212885, "grad_norm": 1.0404648780822754, "learning_rate": 4.742999080297638e-05, "loss": 1.6382, "mean_token_accuracy": 0.6675389617681503, "num_tokens": 119099313.0, "step": 7390 }, { "epoch": 1.7150307103951792, "grad_norm": 0.8880589604377747, "learning_rate": 4.74216975035911e-05, "loss": 1.6246, "mean_token_accuracy": 0.670115552842617, "num_tokens": 119261259.0, "step": 7400 }, { "epoch": 1.7173484760690694, "grad_norm": 0.9255965948104858, "learning_rate": 4.741339157216496e-05, "loss": 1.6016, "mean_token_accuracy": 0.6694790169596672, "num_tokens": 119423040.0, "step": 7410 }, { "epoch": 1.7196662417429598, "grad_norm": 0.9160863161087036, "learning_rate": 4.7405073013377405e-05, "loss": 1.617, "mean_token_accuracy": 0.6686131224036217, "num_tokens": 119584053.0, "step": 7420 }, { "epoch": 1.7219840074168502, "grad_norm": 0.9173243641853333, "learning_rate": 4.739674183191497e-05, "loss": 1.6203, "mean_token_accuracy": 0.6675078019499778, "num_tokens": 119744066.0, "step": 7430 }, { "epoch": 1.7243017730907404, "grad_norm": 0.9561407566070557, "learning_rate": 4.7388398032471336e-05, "loss": 1.6138, "mean_token_accuracy": 0.6680781871080399, "num_tokens": 119904783.0, "step": 7440 }, { "epoch": 1.726619538764631, "grad_norm": 0.8587433099746704, "learning_rate": 4.738004161974726e-05, "loss": 1.6014, "mean_token_accuracy": 0.6708362400531769, "num_tokens": 120066237.0, "step": 7450 }, { "epoch": 1.7289373044385212, "grad_norm": 0.9618345499038696, "learning_rate": 4.737167259845061e-05, "loss": 1.6232, "mean_token_accuracy": 0.6682398721575737, "num_tokens": 120227239.0, "step": 7460 }, { "epoch": 1.7312550701124116, "grad_norm": 0.9621495008468628, "learning_rate": 4.736329097329637e-05, "loss": 1.6009, "mean_token_accuracy": 0.6701546832919121, "num_tokens": 120389052.0, "step": 7470 }, { "epoch": 1.733572835786302, "grad_norm": 0.9155102968215942, "learning_rate": 4.735489674900663e-05, "loss": 1.6137, "mean_token_accuracy": 0.6699761226773262, "num_tokens": 120550797.0, "step": 7480 }, { "epoch": 1.7358906014601923, "grad_norm": 1.0102802515029907, "learning_rate": 4.734648993031055e-05, "loss": 1.6187, "mean_token_accuracy": 0.6701601982116699, "num_tokens": 120711571.0, "step": 7490 }, { "epoch": 1.7382083671340829, "grad_norm": 0.8922771215438843, "learning_rate": 4.733807052194441e-05, "loss": 1.6084, "mean_token_accuracy": 0.6688319519162178, "num_tokens": 120872958.0, "step": 7500 }, { "epoch": 1.740526132807973, "grad_norm": 0.9439284205436707, "learning_rate": 4.7329638528651584e-05, "loss": 1.6166, "mean_token_accuracy": 0.669372309744358, "num_tokens": 121034577.0, "step": 7510 }, { "epoch": 1.7428438984818635, "grad_norm": 0.9044144749641418, "learning_rate": 4.732119395518251e-05, "loss": 1.6069, "mean_token_accuracy": 0.6702992334961891, "num_tokens": 121196400.0, "step": 7520 }, { "epoch": 1.745161664155754, "grad_norm": 0.9516335725784302, "learning_rate": 4.731273680629475e-05, "loss": 1.6215, "mean_token_accuracy": 0.6683030381798745, "num_tokens": 121355531.0, "step": 7530 }, { "epoch": 1.747479429829644, "grad_norm": 0.9044315218925476, "learning_rate": 4.730426708675292e-05, "loss": 1.6139, "mean_token_accuracy": 0.6695613518357277, "num_tokens": 121517160.0, "step": 7540 }, { "epoch": 1.7497971955035347, "grad_norm": 0.9213445782661438, "learning_rate": 4.7295784801328735e-05, "loss": 1.6154, "mean_token_accuracy": 0.6688265457749367, "num_tokens": 121678409.0, "step": 7550 }, { "epoch": 1.752114961177425, "grad_norm": 0.8478663563728333, "learning_rate": 4.728728995480098e-05, "loss": 1.6156, "mean_token_accuracy": 0.6683713406324386, "num_tokens": 121840335.0, "step": 7560 }, { "epoch": 1.7544327268513153, "grad_norm": 0.8912850618362427, "learning_rate": 4.727878255195554e-05, "loss": 1.6124, "mean_token_accuracy": 0.6686562106013298, "num_tokens": 122002107.0, "step": 7570 }, { "epoch": 1.7567504925252058, "grad_norm": 0.8828921318054199, "learning_rate": 4.7270262597585325e-05, "loss": 1.6291, "mean_token_accuracy": 0.6666677996516228, "num_tokens": 122163444.0, "step": 7580 }, { "epoch": 1.759068258199096, "grad_norm": 0.888204038143158, "learning_rate": 4.7261730096490365e-05, "loss": 1.6085, "mean_token_accuracy": 0.670141077041626, "num_tokens": 122324982.0, "step": 7590 }, { "epoch": 1.7613860238729866, "grad_norm": 1.0214720964431763, "learning_rate": 4.725318505347773e-05, "loss": 1.6156, "mean_token_accuracy": 0.6693902537226677, "num_tokens": 122485130.0, "step": 7600 }, { "epoch": 1.7637037895468768, "grad_norm": 0.8661583662033081, "learning_rate": 4.724462747336159e-05, "loss": 1.616, "mean_token_accuracy": 0.6670769900083542, "num_tokens": 122647005.0, "step": 7610 }, { "epoch": 1.7660215552207672, "grad_norm": 0.8832915425300598, "learning_rate": 4.723605736096313e-05, "loss": 1.6189, "mean_token_accuracy": 0.6683069914579391, "num_tokens": 122808068.0, "step": 7620 }, { "epoch": 1.7683393208946576, "grad_norm": 0.8824859261512756, "learning_rate": 4.722747472111063e-05, "loss": 1.6223, "mean_token_accuracy": 0.669134970009327, "num_tokens": 122969747.0, "step": 7630 }, { "epoch": 1.7706570865685478, "grad_norm": 0.9127689599990845, "learning_rate": 4.721887955863941e-05, "loss": 1.5924, "mean_token_accuracy": 0.6705520495772361, "num_tokens": 123129177.0, "step": 7640 }, { "epoch": 1.7729748522424384, "grad_norm": 0.908750057220459, "learning_rate": 4.721027187839187e-05, "loss": 1.6289, "mean_token_accuracy": 0.6693118453025818, "num_tokens": 123289867.0, "step": 7650 }, { "epoch": 1.7752926179163286, "grad_norm": 1.0143109560012817, "learning_rate": 4.720165168521743e-05, "loss": 1.6102, "mean_token_accuracy": 0.6698835641145706, "num_tokens": 123451719.0, "step": 7660 }, { "epoch": 1.777610383590219, "grad_norm": 0.7949471473693848, "learning_rate": 4.719301898397257e-05, "loss": 1.6106, "mean_token_accuracy": 0.669491009414196, "num_tokens": 123612533.0, "step": 7670 }, { "epoch": 1.7799281492641095, "grad_norm": 0.927652895450592, "learning_rate": 4.718437377952084e-05, "loss": 1.6009, "mean_token_accuracy": 0.669826865196228, "num_tokens": 123774512.0, "step": 7680 }, { "epoch": 1.7822459149379997, "grad_norm": 0.9157851934432983, "learning_rate": 4.71757160767328e-05, "loss": 1.5947, "mean_token_accuracy": 0.6731186181306839, "num_tokens": 123935120.0, "step": 7690 }, { "epoch": 1.7845636806118903, "grad_norm": 0.9501652717590332, "learning_rate": 4.716704588048608e-05, "loss": 1.6132, "mean_token_accuracy": 0.6689582943916321, "num_tokens": 124096619.0, "step": 7700 }, { "epoch": 1.7868814462857805, "grad_norm": 0.9490858316421509, "learning_rate": 4.715836319566533e-05, "loss": 1.6136, "mean_token_accuracy": 0.6693569347262383, "num_tokens": 124258428.0, "step": 7710 }, { "epoch": 1.789199211959671, "grad_norm": 0.8902377486228943, "learning_rate": 4.714966802716224e-05, "loss": 1.5929, "mean_token_accuracy": 0.6728615939617157, "num_tokens": 124420167.0, "step": 7720 }, { "epoch": 1.7915169776335613, "grad_norm": 0.9374500513076782, "learning_rate": 4.714096037987553e-05, "loss": 1.6208, "mean_token_accuracy": 0.6690667107701301, "num_tokens": 124581559.0, "step": 7730 }, { "epoch": 1.7938347433074515, "grad_norm": 0.9391505718231201, "learning_rate": 4.713224025871096e-05, "loss": 1.6052, "mean_token_accuracy": 0.670231944322586, "num_tokens": 124743120.0, "step": 7740 }, { "epoch": 1.796152508981342, "grad_norm": 0.940070390701294, "learning_rate": 4.7123507668581316e-05, "loss": 1.6029, "mean_token_accuracy": 0.6697575554251671, "num_tokens": 124904453.0, "step": 7750 }, { "epoch": 1.7984702746552323, "grad_norm": 0.9259909391403198, "learning_rate": 4.71147626144064e-05, "loss": 1.6181, "mean_token_accuracy": 0.6678547531366348, "num_tokens": 125066162.0, "step": 7760 }, { "epoch": 1.8007880403291228, "grad_norm": 0.8889830708503723, "learning_rate": 4.7106005101113047e-05, "loss": 1.6094, "mean_token_accuracy": 0.6700221598148346, "num_tokens": 125226817.0, "step": 7770 }, { "epoch": 1.8031058060030132, "grad_norm": 0.9607876539230347, "learning_rate": 4.709723513363511e-05, "loss": 1.6247, "mean_token_accuracy": 0.6679604411125183, "num_tokens": 125388377.0, "step": 7780 }, { "epoch": 1.8054235716769034, "grad_norm": 0.8747990727424622, "learning_rate": 4.708845271691344e-05, "loss": 1.6021, "mean_token_accuracy": 0.6705572590231895, "num_tokens": 125549665.0, "step": 7790 }, { "epoch": 1.8077413373507938, "grad_norm": 0.9474254846572876, "learning_rate": 4.7079657855895926e-05, "loss": 1.6049, "mean_token_accuracy": 0.6702522099018097, "num_tokens": 125710693.0, "step": 7800 }, { "epoch": 1.8100591030246842, "grad_norm": 0.9404877424240112, "learning_rate": 4.707085055553745e-05, "loss": 1.5882, "mean_token_accuracy": 0.6726245433092117, "num_tokens": 125870529.0, "step": 7810 }, { "epoch": 1.8123768686985746, "grad_norm": 0.9841725826263428, "learning_rate": 4.7062030820799917e-05, "loss": 1.6248, "mean_token_accuracy": 0.6662049517035484, "num_tokens": 126030871.0, "step": 7820 }, { "epoch": 1.814694634372465, "grad_norm": 0.8711299300193787, "learning_rate": 4.7053198656652236e-05, "loss": 1.6213, "mean_token_accuracy": 0.6682845562696457, "num_tokens": 126191978.0, "step": 7830 }, { "epoch": 1.8170124000463552, "grad_norm": 0.9045615196228027, "learning_rate": 4.70443540680703e-05, "loss": 1.6148, "mean_token_accuracy": 0.6684085428714752, "num_tokens": 126352812.0, "step": 7840 }, { "epoch": 1.8193301657202456, "grad_norm": 0.9022089838981628, "learning_rate": 4.7035497060037023e-05, "loss": 1.6137, "mean_token_accuracy": 0.6688717573881149, "num_tokens": 126513805.0, "step": 7850 }, { "epoch": 1.821647931394136, "grad_norm": 0.8096802234649658, "learning_rate": 4.70266276375423e-05, "loss": 1.6106, "mean_token_accuracy": 0.6694563701748848, "num_tokens": 126673692.0, "step": 7860 }, { "epoch": 1.8239656970680265, "grad_norm": 0.9231085181236267, "learning_rate": 4.7017745805583036e-05, "loss": 1.6119, "mean_token_accuracy": 0.6692959576845169, "num_tokens": 126835602.0, "step": 7870 }, { "epoch": 1.8262834627419169, "grad_norm": 0.9498635530471802, "learning_rate": 4.700885156916312e-05, "loss": 1.6072, "mean_token_accuracy": 0.6698481902480126, "num_tokens": 126997141.0, "step": 7880 }, { "epoch": 1.828601228415807, "grad_norm": 0.9700915217399597, "learning_rate": 4.6999944933293425e-05, "loss": 1.616, "mean_token_accuracy": 0.6676223605871201, "num_tokens": 127158179.0, "step": 7890 }, { "epoch": 1.8309189940896975, "grad_norm": 0.8849897384643555, "learning_rate": 4.6991025902991806e-05, "loss": 1.6134, "mean_token_accuracy": 0.6684589594602585, "num_tokens": 127319179.0, "step": 7900 }, { "epoch": 1.833236759763588, "grad_norm": 0.8370596170425415, "learning_rate": 4.6982094483283114e-05, "loss": 1.6087, "mean_token_accuracy": 0.6676624625921249, "num_tokens": 127480226.0, "step": 7910 }, { "epoch": 1.8355545254374783, "grad_norm": 0.8665776252746582, "learning_rate": 4.697315067919918e-05, "loss": 1.6107, "mean_token_accuracy": 0.6666179165244103, "num_tokens": 127642008.0, "step": 7920 }, { "epoch": 1.8378722911113687, "grad_norm": 0.9484449028968811, "learning_rate": 4.69641944957788e-05, "loss": 1.6108, "mean_token_accuracy": 0.6688623517751694, "num_tokens": 127803619.0, "step": 7930 }, { "epoch": 1.840190056785259, "grad_norm": 0.9054911732673645, "learning_rate": 4.695522593806775e-05, "loss": 1.6143, "mean_token_accuracy": 0.6690058633685112, "num_tokens": 127963822.0, "step": 7940 }, { "epoch": 1.8425078224591493, "grad_norm": 0.9806380271911621, "learning_rate": 4.694624501111878e-05, "loss": 1.6086, "mean_token_accuracy": 0.6691657528281212, "num_tokens": 128125604.0, "step": 7950 }, { "epoch": 1.8448255881330398, "grad_norm": 0.9119485020637512, "learning_rate": 4.6937251719991606e-05, "loss": 1.6093, "mean_token_accuracy": 0.6709865614771843, "num_tokens": 128286314.0, "step": 7960 }, { "epoch": 1.84714335380693, "grad_norm": 0.8906190395355225, "learning_rate": 4.6928246069752914e-05, "loss": 1.5865, "mean_token_accuracy": 0.6712403103709221, "num_tokens": 128447738.0, "step": 7970 }, { "epoch": 1.8494611194808206, "grad_norm": 0.8790930509567261, "learning_rate": 4.691922806547634e-05, "loss": 1.5909, "mean_token_accuracy": 0.6708359733223915, "num_tokens": 128607847.0, "step": 7980 }, { "epoch": 1.8517788851547108, "grad_norm": 0.8376166224479675, "learning_rate": 4.6910197712242496e-05, "loss": 1.6134, "mean_token_accuracy": 0.6701876208186149, "num_tokens": 128769805.0, "step": 7990 }, { "epoch": 1.8540966508286012, "grad_norm": 0.891890823841095, "learning_rate": 4.6901155015138946e-05, "loss": 1.6031, "mean_token_accuracy": 0.670122143626213, "num_tokens": 128931658.0, "step": 8000 }, { "epoch": 1.8564144165024916, "grad_norm": 0.9075875878334045, "learning_rate": 4.6892099979260206e-05, "loss": 1.6039, "mean_token_accuracy": 0.6715435341000557, "num_tokens": 129092301.0, "step": 8010 }, { "epoch": 1.8587321821763818, "grad_norm": 0.9262542128562927, "learning_rate": 4.6883032609707745e-05, "loss": 1.6185, "mean_token_accuracy": 0.668101304769516, "num_tokens": 129253634.0, "step": 8020 }, { "epoch": 1.8610499478502724, "grad_norm": 0.9247434139251709, "learning_rate": 4.687395291158997e-05, "loss": 1.6037, "mean_token_accuracy": 0.6697940349578857, "num_tokens": 129414464.0, "step": 8030 }, { "epoch": 1.8633677135241626, "grad_norm": 0.8741483688354492, "learning_rate": 4.686486089002226e-05, "loss": 1.6098, "mean_token_accuracy": 0.6690848156809807, "num_tokens": 129576333.0, "step": 8040 }, { "epoch": 1.865685479198053, "grad_norm": 0.8215022683143616, "learning_rate": 4.685575655012691e-05, "loss": 1.6122, "mean_token_accuracy": 0.6696516618132591, "num_tokens": 129737072.0, "step": 8050 }, { "epoch": 1.8680032448719435, "grad_norm": 0.8229594230651855, "learning_rate": 4.684663989703317e-05, "loss": 1.5962, "mean_token_accuracy": 0.6701862677931786, "num_tokens": 129898874.0, "step": 8060 }, { "epoch": 1.8703210105458337, "grad_norm": 0.8706668615341187, "learning_rate": 4.6837510935877224e-05, "loss": 1.5954, "mean_token_accuracy": 0.6703700348734856, "num_tokens": 130059702.0, "step": 8070 }, { "epoch": 1.8726387762197243, "grad_norm": 0.9248679280281067, "learning_rate": 4.682836967180219e-05, "loss": 1.6094, "mean_token_accuracy": 0.6703350841999054, "num_tokens": 130221647.0, "step": 8080 }, { "epoch": 1.8749565418936145, "grad_norm": 0.9780643582344055, "learning_rate": 4.6819216109958095e-05, "loss": 1.5946, "mean_token_accuracy": 0.6706664860248566, "num_tokens": 130383659.0, "step": 8090 }, { "epoch": 1.877274307567505, "grad_norm": 0.861741840839386, "learning_rate": 4.681005025550195e-05, "loss": 1.6184, "mean_token_accuracy": 0.668680077791214, "num_tokens": 130544173.0, "step": 8100 }, { "epoch": 1.8795920732413953, "grad_norm": 0.9291035532951355, "learning_rate": 4.6800872113597626e-05, "loss": 1.6036, "mean_token_accuracy": 0.6706070512533188, "num_tokens": 130704864.0, "step": 8110 }, { "epoch": 1.8819098389152855, "grad_norm": 0.8844566941261292, "learning_rate": 4.6791681689415975e-05, "loss": 1.6144, "mean_token_accuracy": 0.6688745513558387, "num_tokens": 130866737.0, "step": 8120 }, { "epoch": 1.8842276045891762, "grad_norm": 0.9059598445892334, "learning_rate": 4.678247898813472e-05, "loss": 1.5977, "mean_token_accuracy": 0.6701202660799026, "num_tokens": 131028167.0, "step": 8130 }, { "epoch": 1.8865453702630663, "grad_norm": 0.8422067761421204, "learning_rate": 4.6773264014938534e-05, "loss": 1.6085, "mean_token_accuracy": 0.6698824003338814, "num_tokens": 131189119.0, "step": 8140 }, { "epoch": 1.8888631359369568, "grad_norm": 0.8430538177490234, "learning_rate": 4.6764036775018985e-05, "loss": 1.6111, "mean_token_accuracy": 0.6697848707437515, "num_tokens": 131351234.0, "step": 8150 }, { "epoch": 1.8911809016108472, "grad_norm": 0.9311884641647339, "learning_rate": 4.675479727357456e-05, "loss": 1.5999, "mean_token_accuracy": 0.6719934821128846, "num_tokens": 131513075.0, "step": 8160 }, { "epoch": 1.8934986672847374, "grad_norm": 0.8319401741027832, "learning_rate": 4.674554551581065e-05, "loss": 1.5912, "mean_token_accuracy": 0.6710139870643616, "num_tokens": 131674632.0, "step": 8170 }, { "epoch": 1.895816432958628, "grad_norm": 0.8793332576751709, "learning_rate": 4.673628150693956e-05, "loss": 1.5949, "mean_token_accuracy": 0.671785244345665, "num_tokens": 131834857.0, "step": 8180 }, { "epoch": 1.8981341986325182, "grad_norm": 0.895401120185852, "learning_rate": 4.672700525218048e-05, "loss": 1.5988, "mean_token_accuracy": 0.6706750065088272, "num_tokens": 131995844.0, "step": 8190 }, { "epoch": 1.9004519643064086, "grad_norm": 0.9823587536811829, "learning_rate": 4.6717716756759535e-05, "loss": 1.5844, "mean_token_accuracy": 0.670220422744751, "num_tokens": 132157645.0, "step": 8200 }, { "epoch": 1.902769729980299, "grad_norm": 0.9929152131080627, "learning_rate": 4.6708416025909686e-05, "loss": 1.6067, "mean_token_accuracy": 0.668541744351387, "num_tokens": 132317634.0, "step": 8210 }, { "epoch": 1.9050874956541892, "grad_norm": 0.8959836363792419, "learning_rate": 4.669910306487085e-05, "loss": 1.6133, "mean_token_accuracy": 0.6681461498141289, "num_tokens": 132478476.0, "step": 8220 }, { "epoch": 1.9074052613280799, "grad_norm": 0.909214973449707, "learning_rate": 4.6689777878889794e-05, "loss": 1.6007, "mean_token_accuracy": 0.6707750082015991, "num_tokens": 132639261.0, "step": 8230 }, { "epoch": 1.90972302700197, "grad_norm": 0.897346556186676, "learning_rate": 4.668044047322018e-05, "loss": 1.6014, "mean_token_accuracy": 0.6697087466716767, "num_tokens": 132799561.0, "step": 8240 }, { "epoch": 1.9120407926758605, "grad_norm": 0.9030346274375916, "learning_rate": 4.667109085312258e-05, "loss": 1.5915, "mean_token_accuracy": 0.6712193995714187, "num_tokens": 132960378.0, "step": 8250 }, { "epoch": 1.9143585583497509, "grad_norm": 0.8719942569732666, "learning_rate": 4.6661729023864394e-05, "loss": 1.6084, "mean_token_accuracy": 0.6694616466760636, "num_tokens": 133121016.0, "step": 8260 }, { "epoch": 1.916676324023641, "grad_norm": 0.8864296674728394, "learning_rate": 4.665235499071997e-05, "loss": 1.591, "mean_token_accuracy": 0.6710841566324234, "num_tokens": 133282821.0, "step": 8270 }, { "epoch": 1.9189940896975317, "grad_norm": 0.8621924519538879, "learning_rate": 4.6642968758970475e-05, "loss": 1.6201, "mean_token_accuracy": 0.6692341238260269, "num_tokens": 133444774.0, "step": 8280 }, { "epoch": 1.921311855371422, "grad_norm": 0.9768622517585754, "learning_rate": 4.6633570333903964e-05, "loss": 1.6139, "mean_token_accuracy": 0.6688206911087036, "num_tokens": 133605603.0, "step": 8290 }, { "epoch": 1.9236296210453123, "grad_norm": 0.841989278793335, "learning_rate": 4.6624159720815376e-05, "loss": 1.5968, "mean_token_accuracy": 0.6709762528538704, "num_tokens": 133767329.0, "step": 8300 }, { "epoch": 1.9259473867192027, "grad_norm": 0.8798831701278687, "learning_rate": 4.6614736925006495e-05, "loss": 1.6314, "mean_token_accuracy": 0.6672559514641762, "num_tokens": 133928893.0, "step": 8310 }, { "epoch": 1.928265152393093, "grad_norm": 0.9010440111160278, "learning_rate": 4.660530195178599e-05, "loss": 1.593, "mean_token_accuracy": 0.6716331899166107, "num_tokens": 134089521.0, "step": 8320 }, { "epoch": 1.9305829180669836, "grad_norm": 0.9161326289176941, "learning_rate": 4.659585480646938e-05, "loss": 1.5801, "mean_token_accuracy": 0.6723697453737258, "num_tokens": 134249372.0, "step": 8330 }, { "epoch": 1.9329006837408738, "grad_norm": 0.8433426022529602, "learning_rate": 4.658639549437904e-05, "loss": 1.5934, "mean_token_accuracy": 0.6722540333867073, "num_tokens": 134411143.0, "step": 8340 }, { "epoch": 1.9352184494147642, "grad_norm": 0.9253641366958618, "learning_rate": 4.657692402084419e-05, "loss": 1.5916, "mean_token_accuracy": 0.6721230670809746, "num_tokens": 134572950.0, "step": 8350 }, { "epoch": 1.9375362150886546, "grad_norm": 0.8425189256668091, "learning_rate": 4.6567440391200934e-05, "loss": 1.6119, "mean_token_accuracy": 0.6688619941473007, "num_tokens": 134734636.0, "step": 8360 }, { "epoch": 1.9398539807625448, "grad_norm": 0.8707441687583923, "learning_rate": 4.655794461079219e-05, "loss": 1.5992, "mean_token_accuracy": 0.671531443297863, "num_tokens": 134896208.0, "step": 8370 }, { "epoch": 1.9421717464364354, "grad_norm": 0.9243817329406738, "learning_rate": 4.654843668496774e-05, "loss": 1.6073, "mean_token_accuracy": 0.6685223087668419, "num_tokens": 135057588.0, "step": 8380 }, { "epoch": 1.9444895121103256, "grad_norm": 0.8784475922584534, "learning_rate": 4.6538916619084204e-05, "loss": 1.5904, "mean_token_accuracy": 0.6716140180826187, "num_tokens": 135219163.0, "step": 8390 }, { "epoch": 1.946807277784216, "grad_norm": 0.861449122428894, "learning_rate": 4.652938441850504e-05, "loss": 1.6163, "mean_token_accuracy": 0.669672504067421, "num_tokens": 135378891.0, "step": 8400 }, { "epoch": 1.9491250434581064, "grad_norm": 0.8925347328186035, "learning_rate": 4.651984008860054e-05, "loss": 1.5918, "mean_token_accuracy": 0.6720804363489151, "num_tokens": 135540385.0, "step": 8410 }, { "epoch": 1.9514428091319966, "grad_norm": 0.8501108288764954, "learning_rate": 4.6510283634747843e-05, "loss": 1.6068, "mean_token_accuracy": 0.6704746499657631, "num_tokens": 135702703.0, "step": 8420 }, { "epoch": 1.9537605748058873, "grad_norm": 0.8683401346206665, "learning_rate": 4.65007150623309e-05, "loss": 1.5759, "mean_token_accuracy": 0.674729211628437, "num_tokens": 135863966.0, "step": 8430 }, { "epoch": 1.9560783404797775, "grad_norm": 0.9049983620643616, "learning_rate": 4.6491134376740494e-05, "loss": 1.5949, "mean_token_accuracy": 0.6703096374869346, "num_tokens": 136025537.0, "step": 8440 }, { "epoch": 1.9583961061536679, "grad_norm": 0.8575712442398071, "learning_rate": 4.648154158337424e-05, "loss": 1.5606, "mean_token_accuracy": 0.6760611936450005, "num_tokens": 136187649.0, "step": 8450 }, { "epoch": 1.9607138718275583, "grad_norm": 0.9125415682792664, "learning_rate": 4.647193668763658e-05, "loss": 1.5988, "mean_token_accuracy": 0.669968643784523, "num_tokens": 136349309.0, "step": 8460 }, { "epoch": 1.9630316375014485, "grad_norm": 1.0087320804595947, "learning_rate": 4.646231969493876e-05, "loss": 1.5913, "mean_token_accuracy": 0.6711204081773758, "num_tokens": 136509768.0, "step": 8470 }, { "epoch": 1.9653494031753391, "grad_norm": 0.8960956931114197, "learning_rate": 4.6452690610698845e-05, "loss": 1.6019, "mean_token_accuracy": 0.6696556895971298, "num_tokens": 136671360.0, "step": 8480 }, { "epoch": 1.9676671688492293, "grad_norm": 0.8781436085700989, "learning_rate": 4.6443049440341716e-05, "loss": 1.6018, "mean_token_accuracy": 0.6722475975751877, "num_tokens": 136832495.0, "step": 8490 }, { "epoch": 1.9699849345231197, "grad_norm": 0.8082851767539978, "learning_rate": 4.643339618929905e-05, "loss": 1.5977, "mean_token_accuracy": 0.672105847299099, "num_tokens": 136993196.0, "step": 8500 }, { "epoch": 1.9723027001970102, "grad_norm": 0.9402576684951782, "learning_rate": 4.642373086300936e-05, "loss": 1.6042, "mean_token_accuracy": 0.668976466357708, "num_tokens": 137155005.0, "step": 8510 }, { "epoch": 1.9746204658709003, "grad_norm": 0.8802392482757568, "learning_rate": 4.6414053466917925e-05, "loss": 1.6092, "mean_token_accuracy": 0.6694738417863846, "num_tokens": 137316780.0, "step": 8520 }, { "epoch": 1.976938231544791, "grad_norm": 0.9032849073410034, "learning_rate": 4.640436400647684e-05, "loss": 1.6241, "mean_token_accuracy": 0.6681222707033158, "num_tokens": 137476801.0, "step": 8530 }, { "epoch": 1.9792559972186812, "grad_norm": 0.9464198350906372, "learning_rate": 4.639466248714503e-05, "loss": 1.601, "mean_token_accuracy": 0.6691314190626144, "num_tokens": 137638357.0, "step": 8540 }, { "epoch": 1.9815737628925716, "grad_norm": 0.913307249546051, "learning_rate": 4.6384948914388135e-05, "loss": 1.6144, "mean_token_accuracy": 0.668762032687664, "num_tokens": 137799345.0, "step": 8550 }, { "epoch": 1.983891528566462, "grad_norm": 0.8345120549201965, "learning_rate": 4.637522329367867e-05, "loss": 1.6123, "mean_token_accuracy": 0.6692528739571572, "num_tokens": 137960970.0, "step": 8560 }, { "epoch": 1.9862092942403522, "grad_norm": 0.8589303493499756, "learning_rate": 4.636548563049589e-05, "loss": 1.6037, "mean_token_accuracy": 0.6691944852471352, "num_tokens": 138122567.0, "step": 8570 }, { "epoch": 1.9885270599142428, "grad_norm": 0.9135364890098572, "learning_rate": 4.6355735930325834e-05, "loss": 1.6034, "mean_token_accuracy": 0.6710450500249863, "num_tokens": 138283673.0, "step": 8580 }, { "epoch": 1.990844825588133, "grad_norm": 0.9528611898422241, "learning_rate": 4.634597419866135e-05, "loss": 1.5886, "mean_token_accuracy": 0.6727985203266144, "num_tokens": 138444037.0, "step": 8590 }, { "epoch": 1.9931625912620234, "grad_norm": 0.8959801197052002, "learning_rate": 4.633620044100204e-05, "loss": 1.6084, "mean_token_accuracy": 0.6707574963569641, "num_tokens": 138603792.0, "step": 8600 }, { "epoch": 1.9954803569359139, "grad_norm": 0.8508427739143372, "learning_rate": 4.632641466285429e-05, "loss": 1.5954, "mean_token_accuracy": 0.6711290180683136, "num_tokens": 138765407.0, "step": 8610 }, { "epoch": 1.997798122609804, "grad_norm": 0.9134041666984558, "learning_rate": 4.6316616869731255e-05, "loss": 1.5913, "mean_token_accuracy": 0.6716921493411064, "num_tokens": 138926988.0, "step": 8620 }, { "epoch": 2.0, "grad_norm": 1.2644245624542236, "learning_rate": 4.630680706715287e-05, "loss": 1.599, "mean_token_accuracy": 0.670348164282347, "num_tokens": 139079508.0, "step": 8630 }, { "epoch": 2.00231776567389, "grad_norm": 0.8996263146400452, "learning_rate": 4.629698526064582e-05, "loss": 1.5845, "mean_token_accuracy": 0.6727187648415566, "num_tokens": 139239854.0, "step": 8640 }, { "epoch": 2.004635531347781, "grad_norm": 0.891111433506012, "learning_rate": 4.628715145574356e-05, "loss": 1.6091, "mean_token_accuracy": 0.669684387743473, "num_tokens": 139400295.0, "step": 8650 }, { "epoch": 2.006953297021671, "grad_norm": 0.8430413007736206, "learning_rate": 4.627730565798631e-05, "loss": 1.5792, "mean_token_accuracy": 0.6732317790389061, "num_tokens": 139561396.0, "step": 8660 }, { "epoch": 2.0092710626955617, "grad_norm": 0.8967174887657166, "learning_rate": 4.626744787292104e-05, "loss": 1.5657, "mean_token_accuracy": 0.676105011999607, "num_tokens": 139722746.0, "step": 8670 }, { "epoch": 2.011588828369452, "grad_norm": 0.9133883118629456, "learning_rate": 4.625757810610147e-05, "loss": 1.589, "mean_token_accuracy": 0.6710388794541359, "num_tokens": 139883833.0, "step": 8680 }, { "epoch": 2.013906594043342, "grad_norm": 0.951022744178772, "learning_rate": 4.6247696363088076e-05, "loss": 1.5875, "mean_token_accuracy": 0.6725278928875923, "num_tokens": 140044648.0, "step": 8690 }, { "epoch": 2.0162243597172327, "grad_norm": 0.9825413227081299, "learning_rate": 4.623780264944809e-05, "loss": 1.6013, "mean_token_accuracy": 0.6697019144892693, "num_tokens": 140206141.0, "step": 8700 }, { "epoch": 2.018542125391123, "grad_norm": 0.939804196357727, "learning_rate": 4.622789697075548e-05, "loss": 1.5824, "mean_token_accuracy": 0.673361350595951, "num_tokens": 140367319.0, "step": 8710 }, { "epoch": 2.0208598910650135, "grad_norm": 0.8785629868507385, "learning_rate": 4.6217979332590935e-05, "loss": 1.6033, "mean_token_accuracy": 0.6702727496623992, "num_tokens": 140528914.0, "step": 8720 }, { "epoch": 2.0231776567389037, "grad_norm": 0.9180231094360352, "learning_rate": 4.6208049740541915e-05, "loss": 1.5864, "mean_token_accuracy": 0.6711162343621254, "num_tokens": 140690662.0, "step": 8730 }, { "epoch": 2.025495422412794, "grad_norm": 0.8380365371704102, "learning_rate": 4.6198108200202596e-05, "loss": 1.5802, "mean_token_accuracy": 0.6728779941797256, "num_tokens": 140852104.0, "step": 8740 }, { "epoch": 2.0278131880866845, "grad_norm": 0.8953895568847656, "learning_rate": 4.618815471717389e-05, "loss": 1.5804, "mean_token_accuracy": 0.6724657028913498, "num_tokens": 141013829.0, "step": 8750 }, { "epoch": 2.0301309537605747, "grad_norm": 0.9125880002975464, "learning_rate": 4.617818929706344e-05, "loss": 1.5912, "mean_token_accuracy": 0.6731867447495461, "num_tokens": 141175761.0, "step": 8760 }, { "epoch": 2.0324487194344654, "grad_norm": 0.8922501802444458, "learning_rate": 4.6168211945485606e-05, "loss": 1.5889, "mean_token_accuracy": 0.6705489918589592, "num_tokens": 141337345.0, "step": 8770 }, { "epoch": 2.0347664851083556, "grad_norm": 0.8999000787734985, "learning_rate": 4.615822266806148e-05, "loss": 1.5758, "mean_token_accuracy": 0.6724272698163987, "num_tokens": 141498995.0, "step": 8780 }, { "epoch": 2.0370842507822458, "grad_norm": 0.9072485566139221, "learning_rate": 4.614822147041886e-05, "loss": 1.5833, "mean_token_accuracy": 0.6718513697385788, "num_tokens": 141659815.0, "step": 8790 }, { "epoch": 2.0394020164561364, "grad_norm": 0.8817551732063293, "learning_rate": 4.613820835819228e-05, "loss": 1.5812, "mean_token_accuracy": 0.6724171906709671, "num_tokens": 141821445.0, "step": 8800 }, { "epoch": 2.0417197821300266, "grad_norm": 0.9432797431945801, "learning_rate": 4.6128183337022955e-05, "loss": 1.5847, "mean_token_accuracy": 0.6728482812643051, "num_tokens": 141983174.0, "step": 8810 }, { "epoch": 2.044037547803917, "grad_norm": 0.8969302773475647, "learning_rate": 4.611814641255885e-05, "loss": 1.578, "mean_token_accuracy": 0.6722259074449539, "num_tokens": 142143543.0, "step": 8820 }, { "epoch": 2.0463553134778074, "grad_norm": 0.8967535495758057, "learning_rate": 4.6108097590454604e-05, "loss": 1.5832, "mean_token_accuracy": 0.6719130650162697, "num_tokens": 142304978.0, "step": 8830 }, { "epoch": 2.0486730791516976, "grad_norm": 0.855783998966217, "learning_rate": 4.609803687637156e-05, "loss": 1.5812, "mean_token_accuracy": 0.672028774023056, "num_tokens": 142463790.0, "step": 8840 }, { "epoch": 2.0509908448255882, "grad_norm": 0.9611756205558777, "learning_rate": 4.608796427597779e-05, "loss": 1.5907, "mean_token_accuracy": 0.6709662154316902, "num_tokens": 142625339.0, "step": 8850 }, { "epoch": 2.0533086104994784, "grad_norm": 0.916819155216217, "learning_rate": 4.607787979494803e-05, "loss": 1.5781, "mean_token_accuracy": 0.6726617634296417, "num_tokens": 142786044.0, "step": 8860 }, { "epoch": 2.055626376173369, "grad_norm": 0.8920705914497375, "learning_rate": 4.6067783438963725e-05, "loss": 1.6064, "mean_token_accuracy": 0.6706503987312317, "num_tokens": 142947051.0, "step": 8870 }, { "epoch": 2.0579441418472593, "grad_norm": 0.8812915682792664, "learning_rate": 4.6057675213713e-05, "loss": 1.5855, "mean_token_accuracy": 0.6719685330986976, "num_tokens": 143108080.0, "step": 8880 }, { "epoch": 2.0602619075211495, "grad_norm": 0.8450952172279358, "learning_rate": 4.60475551248907e-05, "loss": 1.5765, "mean_token_accuracy": 0.6732410997152328, "num_tokens": 143269464.0, "step": 8890 }, { "epoch": 2.06257967319504, "grad_norm": 0.9043715596199036, "learning_rate": 4.6037423178198294e-05, "loss": 1.5609, "mean_token_accuracy": 0.6748866870999336, "num_tokens": 143431084.0, "step": 8900 }, { "epoch": 2.0648974388689303, "grad_norm": 0.9168121218681335, "learning_rate": 4.602727937934398e-05, "loss": 1.5803, "mean_token_accuracy": 0.6725966766476631, "num_tokens": 143592932.0, "step": 8910 }, { "epoch": 2.067215204542821, "grad_norm": 0.8865721821784973, "learning_rate": 4.601712373404262e-05, "loss": 1.5776, "mean_token_accuracy": 0.6723967894911766, "num_tokens": 143754510.0, "step": 8920 }, { "epoch": 2.069532970216711, "grad_norm": 0.8671010732650757, "learning_rate": 4.6006956248015755e-05, "loss": 1.5733, "mean_token_accuracy": 0.6737460240721702, "num_tokens": 143916056.0, "step": 8930 }, { "epoch": 2.0718507358906013, "grad_norm": 0.8642122745513916, "learning_rate": 4.599677692699158e-05, "loss": 1.5847, "mean_token_accuracy": 0.6715142875909805, "num_tokens": 144077664.0, "step": 8940 }, { "epoch": 2.074168501564492, "grad_norm": 0.9124774932861328, "learning_rate": 4.598658577670498e-05, "loss": 1.564, "mean_token_accuracy": 0.6737497314810753, "num_tokens": 144237765.0, "step": 8950 }, { "epoch": 2.076486267238382, "grad_norm": 0.8798601627349854, "learning_rate": 4.5976382802897475e-05, "loss": 1.5687, "mean_token_accuracy": 0.6737477242946625, "num_tokens": 144399762.0, "step": 8960 }, { "epoch": 2.078804032912273, "grad_norm": 0.8837910294532776, "learning_rate": 4.596616801131728e-05, "loss": 1.578, "mean_token_accuracy": 0.6744363710284234, "num_tokens": 144560271.0, "step": 8970 }, { "epoch": 2.081121798586163, "grad_norm": 0.8448284268379211, "learning_rate": 4.595594140771925e-05, "loss": 1.5897, "mean_token_accuracy": 0.6732521757483483, "num_tokens": 144720617.0, "step": 8980 }, { "epoch": 2.083439564260053, "grad_norm": 0.8917294144630432, "learning_rate": 4.594570299786489e-05, "loss": 1.5772, "mean_token_accuracy": 0.6735452711582184, "num_tokens": 144881993.0, "step": 8990 }, { "epoch": 2.085757329933944, "grad_norm": 0.9759949445724487, "learning_rate": 4.5935452787522374e-05, "loss": 1.5708, "mean_token_accuracy": 0.6737877264618873, "num_tokens": 145043683.0, "step": 9000 }, { "epoch": 2.088075095607834, "grad_norm": 0.8930864930152893, "learning_rate": 4.59251907824665e-05, "loss": 1.5845, "mean_token_accuracy": 0.6729452595114708, "num_tokens": 145204974.0, "step": 9010 }, { "epoch": 2.0903928612817246, "grad_norm": 0.9045622944831848, "learning_rate": 4.591491698847873e-05, "loss": 1.5826, "mean_token_accuracy": 0.6712097480893136, "num_tokens": 145366696.0, "step": 9020 }, { "epoch": 2.092710626955615, "grad_norm": 0.8423172235488892, "learning_rate": 4.5904631411347164e-05, "loss": 1.599, "mean_token_accuracy": 0.6702449530363083, "num_tokens": 145527861.0, "step": 9030 }, { "epoch": 2.095028392629505, "grad_norm": 0.8865270614624023, "learning_rate": 4.589433405686654e-05, "loss": 1.5808, "mean_token_accuracy": 0.6732192158699035, "num_tokens": 145688859.0, "step": 9040 }, { "epoch": 2.0973461583033957, "grad_norm": 0.9251270294189453, "learning_rate": 4.588402493083823e-05, "loss": 1.5843, "mean_token_accuracy": 0.6743304803967476, "num_tokens": 145849902.0, "step": 9050 }, { "epoch": 2.099663923977286, "grad_norm": 0.895305335521698, "learning_rate": 4.5873704039070233e-05, "loss": 1.5754, "mean_token_accuracy": 0.6721954733133316, "num_tokens": 146011515.0, "step": 9060 }, { "epoch": 2.1019816896511765, "grad_norm": 0.9454572200775146, "learning_rate": 4.586337138737718e-05, "loss": 1.5888, "mean_token_accuracy": 0.6725662380456925, "num_tokens": 146173181.0, "step": 9070 }, { "epoch": 2.1042994553250667, "grad_norm": 0.9289234280586243, "learning_rate": 4.585302698158034e-05, "loss": 1.5833, "mean_token_accuracy": 0.6718607440590858, "num_tokens": 146334999.0, "step": 9080 }, { "epoch": 2.106617220998957, "grad_norm": 0.9493394494056702, "learning_rate": 4.5842670827507576e-05, "loss": 1.5884, "mean_token_accuracy": 0.6737530693411827, "num_tokens": 146495379.0, "step": 9090 }, { "epoch": 2.1089349866728475, "grad_norm": 0.9151946902275085, "learning_rate": 4.5832302930993405e-05, "loss": 1.5917, "mean_token_accuracy": 0.6697140067815781, "num_tokens": 146656087.0, "step": 9100 }, { "epoch": 2.1112527523467377, "grad_norm": 0.9238967895507812, "learning_rate": 4.582192329787892e-05, "loss": 1.5803, "mean_token_accuracy": 0.6713522642850875, "num_tokens": 146816696.0, "step": 9110 }, { "epoch": 2.1135705180206283, "grad_norm": 0.885997474193573, "learning_rate": 4.581153193401187e-05, "loss": 1.6013, "mean_token_accuracy": 0.6692215755581856, "num_tokens": 146978684.0, "step": 9120 }, { "epoch": 2.1158882836945185, "grad_norm": 0.9531799554824829, "learning_rate": 4.5801128845246576e-05, "loss": 1.5923, "mean_token_accuracy": 0.6719746261835098, "num_tokens": 147139959.0, "step": 9130 }, { "epoch": 2.1182060493684087, "grad_norm": 0.8818504214286804, "learning_rate": 4.579071403744398e-05, "loss": 1.5921, "mean_token_accuracy": 0.6722407460212707, "num_tokens": 147301837.0, "step": 9140 }, { "epoch": 2.1205238150422994, "grad_norm": 0.8939410448074341, "learning_rate": 4.578028751647162e-05, "loss": 1.5644, "mean_token_accuracy": 0.6743267178535461, "num_tokens": 147463272.0, "step": 9150 }, { "epoch": 2.1228415807161896, "grad_norm": 0.9548613429069519, "learning_rate": 4.5769849288203656e-05, "loss": 1.5649, "mean_token_accuracy": 0.6747888177633286, "num_tokens": 147623784.0, "step": 9160 }, { "epoch": 2.12515934639008, "grad_norm": 0.8619328737258911, "learning_rate": 4.57593993585208e-05, "loss": 1.5935, "mean_token_accuracy": 0.6715738654136658, "num_tokens": 147785142.0, "step": 9170 }, { "epoch": 2.1274771120639704, "grad_norm": 0.8496408462524414, "learning_rate": 4.574893773331042e-05, "loss": 1.5878, "mean_token_accuracy": 0.6721489787101745, "num_tokens": 147946058.0, "step": 9180 }, { "epoch": 2.1297948777378606, "grad_norm": 0.9623493552207947, "learning_rate": 4.57384644184664e-05, "loss": 1.5723, "mean_token_accuracy": 0.675454980134964, "num_tokens": 148107810.0, "step": 9190 }, { "epoch": 2.132112643411751, "grad_norm": 1.0103284120559692, "learning_rate": 4.572797941988926e-05, "loss": 1.5784, "mean_token_accuracy": 0.6727899849414826, "num_tokens": 148269145.0, "step": 9200 }, { "epoch": 2.1344304090856414, "grad_norm": 0.8947829604148865, "learning_rate": 4.571748274348608e-05, "loss": 1.5852, "mean_token_accuracy": 0.6723432153463363, "num_tokens": 148430689.0, "step": 9210 }, { "epoch": 2.1367481747595316, "grad_norm": 0.8428583145141602, "learning_rate": 4.570697439517053e-05, "loss": 1.5847, "mean_token_accuracy": 0.6714957773685455, "num_tokens": 148591712.0, "step": 9220 }, { "epoch": 2.1390659404334222, "grad_norm": 0.8883106708526611, "learning_rate": 4.569645438086286e-05, "loss": 1.5797, "mean_token_accuracy": 0.6740030899643898, "num_tokens": 148751901.0, "step": 9230 }, { "epoch": 2.1413837061073124, "grad_norm": 0.9475587606430054, "learning_rate": 4.568592270648988e-05, "loss": 1.575, "mean_token_accuracy": 0.6727751210331917, "num_tokens": 148913603.0, "step": 9240 }, { "epoch": 2.143701471781203, "grad_norm": 0.8633056879043579, "learning_rate": 4.567537937798498e-05, "loss": 1.5631, "mean_token_accuracy": 0.6752275958657264, "num_tokens": 149073790.0, "step": 9250 }, { "epoch": 2.1460192374550933, "grad_norm": 0.9341509938240051, "learning_rate": 4.566482440128809e-05, "loss": 1.5863, "mean_token_accuracy": 0.6717932716012001, "num_tokens": 149235546.0, "step": 9260 }, { "epoch": 2.1483370031289835, "grad_norm": 0.9715346693992615, "learning_rate": 4.565425778234574e-05, "loss": 1.5762, "mean_token_accuracy": 0.6740792483091355, "num_tokens": 149396080.0, "step": 9270 }, { "epoch": 2.150654768802874, "grad_norm": 0.8940684795379639, "learning_rate": 4.5643679527111e-05, "loss": 1.5898, "mean_token_accuracy": 0.6720863699913024, "num_tokens": 149556970.0, "step": 9280 }, { "epoch": 2.1529725344767643, "grad_norm": 0.88571697473526, "learning_rate": 4.563308964154349e-05, "loss": 1.5859, "mean_token_accuracy": 0.672130611538887, "num_tokens": 149717871.0, "step": 9290 }, { "epoch": 2.155290300150655, "grad_norm": 0.8883787393569946, "learning_rate": 4.56224881316094e-05, "loss": 1.5948, "mean_token_accuracy": 0.6713257566094398, "num_tokens": 149878284.0, "step": 9300 }, { "epoch": 2.157608065824545, "grad_norm": 0.8508368134498596, "learning_rate": 4.561187500328144e-05, "loss": 1.5761, "mean_token_accuracy": 0.6739462524652481, "num_tokens": 150039299.0, "step": 9310 }, { "epoch": 2.1599258314984353, "grad_norm": 0.8786492943763733, "learning_rate": 4.5601250262538895e-05, "loss": 1.5615, "mean_token_accuracy": 0.6756158351898194, "num_tokens": 150200942.0, "step": 9320 }, { "epoch": 2.162243597172326, "grad_norm": 0.8908810019493103, "learning_rate": 4.559061391536759e-05, "loss": 1.5694, "mean_token_accuracy": 0.6742755010724067, "num_tokens": 150361949.0, "step": 9330 }, { "epoch": 2.164561362846216, "grad_norm": 0.9175237417221069, "learning_rate": 4.557996596775986e-05, "loss": 1.5813, "mean_token_accuracy": 0.6715558230876922, "num_tokens": 150523039.0, "step": 9340 }, { "epoch": 2.166879128520107, "grad_norm": 0.8975151181221008, "learning_rate": 4.556930642571462e-05, "loss": 1.5928, "mean_token_accuracy": 0.6710900962352753, "num_tokens": 150684111.0, "step": 9350 }, { "epoch": 2.169196894193997, "grad_norm": 0.9368129968643188, "learning_rate": 4.5558635295237274e-05, "loss": 1.577, "mean_token_accuracy": 0.6735766857862473, "num_tokens": 150845194.0, "step": 9360 }, { "epoch": 2.171514659867887, "grad_norm": 0.8532882928848267, "learning_rate": 4.554795258233977e-05, "loss": 1.578, "mean_token_accuracy": 0.6720626905560494, "num_tokens": 151006354.0, "step": 9370 }, { "epoch": 2.173832425541778, "grad_norm": 0.8494508266448975, "learning_rate": 4.55372582930406e-05, "loss": 1.5702, "mean_token_accuracy": 0.6737558171153069, "num_tokens": 151166690.0, "step": 9380 }, { "epoch": 2.176150191215668, "grad_norm": 0.8752051591873169, "learning_rate": 4.552655243336476e-05, "loss": 1.553, "mean_token_accuracy": 0.6773367911577225, "num_tokens": 151327950.0, "step": 9390 }, { "epoch": 2.1784679568895586, "grad_norm": 0.8807527422904968, "learning_rate": 4.5515835009343754e-05, "loss": 1.5723, "mean_token_accuracy": 0.674119770526886, "num_tokens": 151489567.0, "step": 9400 }, { "epoch": 2.180785722563449, "grad_norm": 0.9524779915809631, "learning_rate": 4.550510602701563e-05, "loss": 1.5971, "mean_token_accuracy": 0.6713982865214347, "num_tokens": 151650114.0, "step": 9410 }, { "epoch": 2.183103488237339, "grad_norm": 0.9563729166984558, "learning_rate": 4.549436549242493e-05, "loss": 1.5956, "mean_token_accuracy": 0.6719032123684883, "num_tokens": 151811567.0, "step": 9420 }, { "epoch": 2.1854212539112297, "grad_norm": 0.850095808506012, "learning_rate": 4.5483613411622696e-05, "loss": 1.5902, "mean_token_accuracy": 0.6715853378176689, "num_tokens": 151973089.0, "step": 9430 }, { "epoch": 2.18773901958512, "grad_norm": 0.9428567886352539, "learning_rate": 4.547284979066649e-05, "loss": 1.5765, "mean_token_accuracy": 0.6729090884327888, "num_tokens": 152133221.0, "step": 9440 }, { "epoch": 2.1900567852590105, "grad_norm": 0.9284944534301758, "learning_rate": 4.546207463562039e-05, "loss": 1.5711, "mean_token_accuracy": 0.6735555663704872, "num_tokens": 152294800.0, "step": 9450 }, { "epoch": 2.1923745509329007, "grad_norm": 0.9356278777122498, "learning_rate": 4.5451287952554923e-05, "loss": 1.5634, "mean_token_accuracy": 0.6743265256285668, "num_tokens": 152455610.0, "step": 9460 }, { "epoch": 2.194692316606791, "grad_norm": 0.8428475260734558, "learning_rate": 4.5440489747547155e-05, "loss": 1.564, "mean_token_accuracy": 0.6750160589814186, "num_tokens": 152617456.0, "step": 9470 }, { "epoch": 2.1970100822806815, "grad_norm": 0.8194428086280823, "learning_rate": 4.542968002668063e-05, "loss": 1.5868, "mean_token_accuracy": 0.6729116111993789, "num_tokens": 152777505.0, "step": 9480 }, { "epoch": 2.1993278479545717, "grad_norm": 0.8947421312332153, "learning_rate": 4.5418858796045384e-05, "loss": 1.5742, "mean_token_accuracy": 0.6741680085659028, "num_tokens": 152937405.0, "step": 9490 }, { "epoch": 2.2016456136284623, "grad_norm": 0.9525731205940247, "learning_rate": 4.540802606173792e-05, "loss": 1.5829, "mean_token_accuracy": 0.6728540852665901, "num_tokens": 153098320.0, "step": 9500 }, { "epoch": 2.2039633793023525, "grad_norm": 0.8158891201019287, "learning_rate": 4.5397181829861246e-05, "loss": 1.5698, "mean_token_accuracy": 0.6749908968806266, "num_tokens": 153259398.0, "step": 9510 }, { "epoch": 2.2062811449762427, "grad_norm": 0.8337882161140442, "learning_rate": 4.538632610652483e-05, "loss": 1.5943, "mean_token_accuracy": 0.6709825739264488, "num_tokens": 153420935.0, "step": 9520 }, { "epoch": 2.2085989106501334, "grad_norm": 0.9180800318717957, "learning_rate": 4.537545889784462e-05, "loss": 1.5677, "mean_token_accuracy": 0.6754168331623077, "num_tokens": 153579483.0, "step": 9530 }, { "epoch": 2.2109166763240236, "grad_norm": 0.9178804755210876, "learning_rate": 4.536458020994304e-05, "loss": 1.5835, "mean_token_accuracy": 0.6714311525225639, "num_tokens": 153739835.0, "step": 9540 }, { "epoch": 2.213234441997914, "grad_norm": 0.8946747183799744, "learning_rate": 4.535369004894897e-05, "loss": 1.5789, "mean_token_accuracy": 0.673431396484375, "num_tokens": 153901931.0, "step": 9550 }, { "epoch": 2.2155522076718044, "grad_norm": 0.9260879755020142, "learning_rate": 4.534278842099776e-05, "loss": 1.5657, "mean_token_accuracy": 0.6741075202822685, "num_tokens": 154063166.0, "step": 9560 }, { "epoch": 2.2178699733456946, "grad_norm": 0.9318385720252991, "learning_rate": 4.5331875332231235e-05, "loss": 1.5982, "mean_token_accuracy": 0.6708470702171325, "num_tokens": 154224023.0, "step": 9570 }, { "epoch": 2.220187739019585, "grad_norm": 0.8664634227752686, "learning_rate": 4.5320950788797635e-05, "loss": 1.577, "mean_token_accuracy": 0.6734039708971977, "num_tokens": 154385483.0, "step": 9580 }, { "epoch": 2.2225055046934754, "grad_norm": 0.8902740478515625, "learning_rate": 4.5310014796851696e-05, "loss": 1.5799, "mean_token_accuracy": 0.6730379238724709, "num_tokens": 154547100.0, "step": 9590 }, { "epoch": 2.224823270367366, "grad_norm": 0.90468829870224, "learning_rate": 4.5299067362554595e-05, "loss": 1.5797, "mean_token_accuracy": 0.6732700482010842, "num_tokens": 154708183.0, "step": 9600 }, { "epoch": 2.2271410360412562, "grad_norm": 0.8902176022529602, "learning_rate": 4.528810849207393e-05, "loss": 1.5863, "mean_token_accuracy": 0.6722696855664253, "num_tokens": 154869822.0, "step": 9610 }, { "epoch": 2.2294588017151464, "grad_norm": 0.8718687295913696, "learning_rate": 4.527713819158379e-05, "loss": 1.5643, "mean_token_accuracy": 0.6724706426262855, "num_tokens": 155030366.0, "step": 9620 }, { "epoch": 2.231776567389037, "grad_norm": 0.9286001920700073, "learning_rate": 4.5266156467264645e-05, "loss": 1.5808, "mean_token_accuracy": 0.6717207640409469, "num_tokens": 155192112.0, "step": 9630 }, { "epoch": 2.2340943330629273, "grad_norm": 0.9258651733398438, "learning_rate": 4.525516332530344e-05, "loss": 1.575, "mean_token_accuracy": 0.673164826631546, "num_tokens": 155353269.0, "step": 9640 }, { "epoch": 2.236412098736818, "grad_norm": 0.9155446887016296, "learning_rate": 4.5244158771893544e-05, "loss": 1.5944, "mean_token_accuracy": 0.6702372252941131, "num_tokens": 155515419.0, "step": 9650 }, { "epoch": 2.238729864410708, "grad_norm": 0.8746881484985352, "learning_rate": 4.523314281323476e-05, "loss": 1.5799, "mean_token_accuracy": 0.6726961344480514, "num_tokens": 155676807.0, "step": 9660 }, { "epoch": 2.2410476300845983, "grad_norm": 0.9335594177246094, "learning_rate": 4.52221154555333e-05, "loss": 1.5778, "mean_token_accuracy": 0.673190937936306, "num_tokens": 155837390.0, "step": 9670 }, { "epoch": 2.243365395758489, "grad_norm": 0.8622233867645264, "learning_rate": 4.521107670500181e-05, "loss": 1.5753, "mean_token_accuracy": 0.6731279522180558, "num_tokens": 155998163.0, "step": 9680 }, { "epoch": 2.245683161432379, "grad_norm": 0.8907208442687988, "learning_rate": 4.520002656785936e-05, "loss": 1.5705, "mean_token_accuracy": 0.6731559544801712, "num_tokens": 156159456.0, "step": 9690 }, { "epoch": 2.2480009271062698, "grad_norm": 0.8516606688499451, "learning_rate": 4.5188965050331424e-05, "loss": 1.5639, "mean_token_accuracy": 0.6747384935617446, "num_tokens": 156319258.0, "step": 9700 }, { "epoch": 2.25031869278016, "grad_norm": 0.9295317530632019, "learning_rate": 4.517789215864988e-05, "loss": 1.5672, "mean_token_accuracy": 0.6732812479138375, "num_tokens": 156479611.0, "step": 9710 }, { "epoch": 2.25263645845405, "grad_norm": 0.9673186540603638, "learning_rate": 4.516680789905305e-05, "loss": 1.565, "mean_token_accuracy": 0.6755199372768402, "num_tokens": 156640436.0, "step": 9720 }, { "epoch": 2.2549542241279408, "grad_norm": 0.8862589001655579, "learning_rate": 4.515571227778562e-05, "loss": 1.5829, "mean_token_accuracy": 0.6728232428431511, "num_tokens": 156802315.0, "step": 9730 }, { "epoch": 2.257271989801831, "grad_norm": 0.9365081191062927, "learning_rate": 4.514460530109869e-05, "loss": 1.5701, "mean_token_accuracy": 0.6747015386819839, "num_tokens": 156964383.0, "step": 9740 }, { "epoch": 2.259589755475721, "grad_norm": 0.9397696852684021, "learning_rate": 4.5133486975249774e-05, "loss": 1.5748, "mean_token_accuracy": 0.6722081080079079, "num_tokens": 157126341.0, "step": 9750 }, { "epoch": 2.261907521149612, "grad_norm": 0.8741840720176697, "learning_rate": 4.512235730650275e-05, "loss": 1.5733, "mean_token_accuracy": 0.6726938903331756, "num_tokens": 157287594.0, "step": 9760 }, { "epoch": 2.264225286823502, "grad_norm": 0.8961966037750244, "learning_rate": 4.511121630112791e-05, "loss": 1.5643, "mean_token_accuracy": 0.6761046707630157, "num_tokens": 157448318.0, "step": 9770 }, { "epoch": 2.2665430524973926, "grad_norm": 0.8676405549049377, "learning_rate": 4.510006396540194e-05, "loss": 1.5775, "mean_token_accuracy": 0.6730629429221153, "num_tokens": 157610248.0, "step": 9780 }, { "epoch": 2.268860818171283, "grad_norm": 0.9204206466674805, "learning_rate": 4.5088900305607876e-05, "loss": 1.5566, "mean_token_accuracy": 0.6759509548544884, "num_tokens": 157771242.0, "step": 9790 }, { "epoch": 2.271178583845173, "grad_norm": 0.9258038401603699, "learning_rate": 4.507772532803515e-05, "loss": 1.5716, "mean_token_accuracy": 0.673771096765995, "num_tokens": 157933025.0, "step": 9800 }, { "epoch": 2.2734963495190637, "grad_norm": 0.907778263092041, "learning_rate": 4.5066539038979595e-05, "loss": 1.5726, "mean_token_accuracy": 0.6737805664539337, "num_tokens": 158094152.0, "step": 9810 }, { "epoch": 2.275814115192954, "grad_norm": 0.8143916130065918, "learning_rate": 4.505534144474337e-05, "loss": 1.5718, "mean_token_accuracy": 0.6730205863714218, "num_tokens": 158256079.0, "step": 9820 }, { "epoch": 2.2781318808668445, "grad_norm": 0.8712210059165955, "learning_rate": 4.504413255163506e-05, "loss": 1.5876, "mean_token_accuracy": 0.671580645442009, "num_tokens": 158417700.0, "step": 9830 }, { "epoch": 2.2804496465407347, "grad_norm": 0.9327561259269714, "learning_rate": 4.503291236596954e-05, "loss": 1.5579, "mean_token_accuracy": 0.6767462521791459, "num_tokens": 158579443.0, "step": 9840 }, { "epoch": 2.282767412214625, "grad_norm": 0.9265860319137573, "learning_rate": 4.502168089406813e-05, "loss": 1.5915, "mean_token_accuracy": 0.6718390002846718, "num_tokens": 158739229.0, "step": 9850 }, { "epoch": 2.2850851778885155, "grad_norm": 0.9753409028053284, "learning_rate": 4.501043814225845e-05, "loss": 1.5609, "mean_token_accuracy": 0.6746455729007721, "num_tokens": 158899709.0, "step": 9860 }, { "epoch": 2.2874029435624057, "grad_norm": 0.893292248249054, "learning_rate": 4.499918411687449e-05, "loss": 1.594, "mean_token_accuracy": 0.6717567875981331, "num_tokens": 159059974.0, "step": 9870 }, { "epoch": 2.2897207092362963, "grad_norm": 0.9838335514068604, "learning_rate": 4.498791882425662e-05, "loss": 1.5803, "mean_token_accuracy": 0.6721288189291954, "num_tokens": 159220883.0, "step": 9880 }, { "epoch": 2.2920384749101865, "grad_norm": 0.9349499940872192, "learning_rate": 4.49766422707515e-05, "loss": 1.5662, "mean_token_accuracy": 0.6740754663944244, "num_tokens": 159382532.0, "step": 9890 }, { "epoch": 2.2943562405840767, "grad_norm": 0.8968580365180969, "learning_rate": 4.496535446271219e-05, "loss": 1.5762, "mean_token_accuracy": 0.6745421275496483, "num_tokens": 159544024.0, "step": 9900 }, { "epoch": 2.2966740062579674, "grad_norm": 0.8876572251319885, "learning_rate": 4.4954055406498065e-05, "loss": 1.5661, "mean_token_accuracy": 0.674108250439167, "num_tokens": 159705524.0, "step": 9910 }, { "epoch": 2.2989917719318576, "grad_norm": 0.9080151915550232, "learning_rate": 4.494274510847483e-05, "loss": 1.592, "mean_token_accuracy": 0.6712303683161736, "num_tokens": 159866618.0, "step": 9920 }, { "epoch": 2.301309537605748, "grad_norm": 0.8950001001358032, "learning_rate": 4.493142357501455e-05, "loss": 1.5616, "mean_token_accuracy": 0.6751975923776626, "num_tokens": 160027273.0, "step": 9930 }, { "epoch": 2.3036273032796384, "grad_norm": 0.9265996813774109, "learning_rate": 4.492009081249559e-05, "loss": 1.5582, "mean_token_accuracy": 0.6759831845760346, "num_tokens": 160188310.0, "step": 9940 }, { "epoch": 2.3059450689535286, "grad_norm": 0.920953631401062, "learning_rate": 4.4908746827302655e-05, "loss": 1.5558, "mean_token_accuracy": 0.6747240975499154, "num_tokens": 160348893.0, "step": 9950 }, { "epoch": 2.308262834627419, "grad_norm": 0.865856409072876, "learning_rate": 4.489739162582678e-05, "loss": 1.5772, "mean_token_accuracy": 0.6719771265983582, "num_tokens": 160509661.0, "step": 9960 }, { "epoch": 2.3105806003013094, "grad_norm": 0.9186729192733765, "learning_rate": 4.488602521446531e-05, "loss": 1.5762, "mean_token_accuracy": 0.6719639971852303, "num_tokens": 160671750.0, "step": 9970 }, { "epoch": 2.3128983659752, "grad_norm": 0.8810170292854309, "learning_rate": 4.48746475996219e-05, "loss": 1.5963, "mean_token_accuracy": 0.67125453799963, "num_tokens": 160831365.0, "step": 9980 }, { "epoch": 2.3152161316490902, "grad_norm": 0.8860878348350525, "learning_rate": 4.486325878770654e-05, "loss": 1.5701, "mean_token_accuracy": 0.6737497344613075, "num_tokens": 160992671.0, "step": 9990 }, { "epoch": 2.3175338973229804, "grad_norm": 0.9392087459564209, "learning_rate": 4.4851858785135495e-05, "loss": 1.5815, "mean_token_accuracy": 0.6719829276204109, "num_tokens": 161154422.0, "step": 10000 }, { "epoch": 2.319851662996871, "grad_norm": 0.879016637802124, "learning_rate": 4.484044759833136e-05, "loss": 1.567, "mean_token_accuracy": 0.6742028892040253, "num_tokens": 161315725.0, "step": 10010 }, { "epoch": 2.3221694286707613, "grad_norm": 0.8957821130752563, "learning_rate": 4.4829025233723016e-05, "loss": 1.5683, "mean_token_accuracy": 0.6743255004286766, "num_tokens": 161477507.0, "step": 10020 }, { "epoch": 2.324487194344652, "grad_norm": 0.9163082242012024, "learning_rate": 4.481759169774565e-05, "loss": 1.5699, "mean_token_accuracy": 0.6742016896605492, "num_tokens": 161639228.0, "step": 10030 }, { "epoch": 2.326804960018542, "grad_norm": 0.8460644483566284, "learning_rate": 4.4806146996840757e-05, "loss": 1.5884, "mean_token_accuracy": 0.6724669024348259, "num_tokens": 161800459.0, "step": 10040 }, { "epoch": 2.3291227256924323, "grad_norm": 0.8595979809761047, "learning_rate": 4.479469113745608e-05, "loss": 1.5881, "mean_token_accuracy": 0.6714617758989334, "num_tokens": 161959961.0, "step": 10050 }, { "epoch": 2.331440491366323, "grad_norm": 0.8601078987121582, "learning_rate": 4.47832241260457e-05, "loss": 1.5708, "mean_token_accuracy": 0.674575024843216, "num_tokens": 162121342.0, "step": 10060 }, { "epoch": 2.333758257040213, "grad_norm": 0.93760746717453, "learning_rate": 4.477174596906993e-05, "loss": 1.5753, "mean_token_accuracy": 0.6721722826361656, "num_tokens": 162283244.0, "step": 10070 }, { "epoch": 2.3360760227141038, "grad_norm": 0.8699531555175781, "learning_rate": 4.476025667299542e-05, "loss": 1.5645, "mean_token_accuracy": 0.6756922021508217, "num_tokens": 162444660.0, "step": 10080 }, { "epoch": 2.338393788387994, "grad_norm": 0.9413578510284424, "learning_rate": 4.474875624429502e-05, "loss": 1.5851, "mean_token_accuracy": 0.6721457824110985, "num_tokens": 162605851.0, "step": 10090 }, { "epoch": 2.340711554061884, "grad_norm": 0.9430469870567322, "learning_rate": 4.473724468944794e-05, "loss": 1.5865, "mean_token_accuracy": 0.6710149556398392, "num_tokens": 162767151.0, "step": 10100 }, { "epoch": 2.3430293197357748, "grad_norm": 0.8675362467765808, "learning_rate": 4.4725722014939586e-05, "loss": 1.5665, "mean_token_accuracy": 0.6746358305215836, "num_tokens": 162928487.0, "step": 10110 }, { "epoch": 2.345347085409665, "grad_norm": 0.9172285199165344, "learning_rate": 4.4714188227261664e-05, "loss": 1.5706, "mean_token_accuracy": 0.6737615823745727, "num_tokens": 163089742.0, "step": 10120 }, { "epoch": 2.3476648510835556, "grad_norm": 0.8697797656059265, "learning_rate": 4.470264333291213e-05, "loss": 1.5779, "mean_token_accuracy": 0.6724143102765083, "num_tokens": 163251556.0, "step": 10130 }, { "epoch": 2.349982616757446, "grad_norm": 0.8002917766571045, "learning_rate": 4.469108733839521e-05, "loss": 1.5646, "mean_token_accuracy": 0.6743959918618202, "num_tokens": 163412972.0, "step": 10140 }, { "epoch": 2.352300382431336, "grad_norm": 0.8921197056770325, "learning_rate": 4.4679520250221365e-05, "loss": 1.5926, "mean_token_accuracy": 0.6706374496221542, "num_tokens": 163572631.0, "step": 10150 }, { "epoch": 2.3546181481052266, "grad_norm": 0.9002388715744019, "learning_rate": 4.466794207490731e-05, "loss": 1.5501, "mean_token_accuracy": 0.6762612447142601, "num_tokens": 163733155.0, "step": 10160 }, { "epoch": 2.356935913779117, "grad_norm": 0.93417888879776, "learning_rate": 4.4656352818976035e-05, "loss": 1.5802, "mean_token_accuracy": 0.6730960592627525, "num_tokens": 163895256.0, "step": 10170 }, { "epoch": 2.3592536794530075, "grad_norm": 0.879782497882843, "learning_rate": 4.4644752488956727e-05, "loss": 1.5746, "mean_token_accuracy": 0.673312596976757, "num_tokens": 164057042.0, "step": 10180 }, { "epoch": 2.3615714451268977, "grad_norm": 0.8817622065544128, "learning_rate": 4.4633141091384836e-05, "loss": 1.5712, "mean_token_accuracy": 0.674054303765297, "num_tokens": 164217471.0, "step": 10190 }, { "epoch": 2.363889210800788, "grad_norm": 0.8836377859115601, "learning_rate": 4.462151863280206e-05, "loss": 1.5934, "mean_token_accuracy": 0.670887702703476, "num_tokens": 164378651.0, "step": 10200 }, { "epoch": 2.3662069764746785, "grad_norm": 0.8807094097137451, "learning_rate": 4.46098851197563e-05, "loss": 1.5778, "mean_token_accuracy": 0.6747184365987777, "num_tokens": 164539511.0, "step": 10210 }, { "epoch": 2.3685247421485687, "grad_norm": 0.8710712790489197, "learning_rate": 4.459824055880171e-05, "loss": 1.5489, "mean_token_accuracy": 0.6760299310088158, "num_tokens": 164700683.0, "step": 10220 }, { "epoch": 2.3708425078224593, "grad_norm": 0.8572421073913574, "learning_rate": 4.458658495649865e-05, "loss": 1.5599, "mean_token_accuracy": 0.6758700147271156, "num_tokens": 164861438.0, "step": 10230 }, { "epoch": 2.3731602734963495, "grad_norm": 0.9302752017974854, "learning_rate": 4.4574918319413693e-05, "loss": 1.5702, "mean_token_accuracy": 0.6735012009739876, "num_tokens": 165022853.0, "step": 10240 }, { "epoch": 2.3754780391702397, "grad_norm": 0.9034478068351746, "learning_rate": 4.456324065411968e-05, "loss": 1.5681, "mean_token_accuracy": 0.6741184189915657, "num_tokens": 165183405.0, "step": 10250 }, { "epoch": 2.3777958048441303, "grad_norm": 0.9006752967834473, "learning_rate": 4.455155196719559e-05, "loss": 1.5861, "mean_token_accuracy": 0.6729900449514389, "num_tokens": 165343854.0, "step": 10260 }, { "epoch": 2.3801135705180205, "grad_norm": 0.9423299431800842, "learning_rate": 4.4539852265226676e-05, "loss": 1.5597, "mean_token_accuracy": 0.6745796889066696, "num_tokens": 165504189.0, "step": 10270 }, { "epoch": 2.382431336191911, "grad_norm": 0.9117509722709656, "learning_rate": 4.452814155480437e-05, "loss": 1.5875, "mean_token_accuracy": 0.6708626434206962, "num_tokens": 165664770.0, "step": 10280 }, { "epoch": 2.3847491018658014, "grad_norm": 0.9179659485816956, "learning_rate": 4.451641984252629e-05, "loss": 1.593, "mean_token_accuracy": 0.670914213359356, "num_tokens": 165826066.0, "step": 10290 }, { "epoch": 2.3870668675396916, "grad_norm": 0.8708449006080627, "learning_rate": 4.450468713499628e-05, "loss": 1.5494, "mean_token_accuracy": 0.6753816589713096, "num_tokens": 165987159.0, "step": 10300 }, { "epoch": 2.389384633213582, "grad_norm": 0.9212692379951477, "learning_rate": 4.449294343882438e-05, "loss": 1.5676, "mean_token_accuracy": 0.6743277803063392, "num_tokens": 166148384.0, "step": 10310 }, { "epoch": 2.3917023988874724, "grad_norm": 0.9224367737770081, "learning_rate": 4.448118876062678e-05, "loss": 1.5803, "mean_token_accuracy": 0.6713237255811692, "num_tokens": 166310267.0, "step": 10320 }, { "epoch": 2.394020164561363, "grad_norm": 0.9641690254211426, "learning_rate": 4.446942310702591e-05, "loss": 1.5683, "mean_token_accuracy": 0.6746211811900139, "num_tokens": 166471316.0, "step": 10330 }, { "epoch": 2.396337930235253, "grad_norm": 0.9513624310493469, "learning_rate": 4.445764648465034e-05, "loss": 1.5549, "mean_token_accuracy": 0.6745530933141708, "num_tokens": 166632835.0, "step": 10340 }, { "epoch": 2.3986556959091434, "grad_norm": 0.8919451832771301, "learning_rate": 4.4445858900134865e-05, "loss": 1.5645, "mean_token_accuracy": 0.6734794393181801, "num_tokens": 166794579.0, "step": 10350 }, { "epoch": 2.400973461583034, "grad_norm": 0.9417605400085449, "learning_rate": 4.44340603601204e-05, "loss": 1.5739, "mean_token_accuracy": 0.6735389262437821, "num_tokens": 166956876.0, "step": 10360 }, { "epoch": 2.4032912272569242, "grad_norm": 0.9737564325332642, "learning_rate": 4.442225087125407e-05, "loss": 1.5675, "mean_token_accuracy": 0.6757376685738563, "num_tokens": 167118806.0, "step": 10370 }, { "epoch": 2.405608992930815, "grad_norm": 0.9006284475326538, "learning_rate": 4.441043044018918e-05, "loss": 1.5791, "mean_token_accuracy": 0.6716359093785286, "num_tokens": 167280581.0, "step": 10380 }, { "epoch": 2.407926758604705, "grad_norm": 0.9481953382492065, "learning_rate": 4.4398599073585157e-05, "loss": 1.5663, "mean_token_accuracy": 0.6720936834812165, "num_tokens": 167442566.0, "step": 10390 }, { "epoch": 2.4102445242785953, "grad_norm": 0.8653088212013245, "learning_rate": 4.438675677810762e-05, "loss": 1.57, "mean_token_accuracy": 0.6729087710380555, "num_tokens": 167603635.0, "step": 10400 }, { "epoch": 2.412562289952486, "grad_norm": 0.8947418928146362, "learning_rate": 4.437490356042833e-05, "loss": 1.5791, "mean_token_accuracy": 0.6734666869044303, "num_tokens": 167764876.0, "step": 10410 }, { "epoch": 2.414880055626376, "grad_norm": 0.941710352897644, "learning_rate": 4.436303942722522e-05, "loss": 1.5738, "mean_token_accuracy": 0.6745247647166253, "num_tokens": 167925474.0, "step": 10420 }, { "epoch": 2.4171978213002667, "grad_norm": 0.8807420134544373, "learning_rate": 4.4351164385182356e-05, "loss": 1.5506, "mean_token_accuracy": 0.6752458736300468, "num_tokens": 168087072.0, "step": 10430 }, { "epoch": 2.419515586974157, "grad_norm": 0.8931625485420227, "learning_rate": 4.4339278440989954e-05, "loss": 1.586, "mean_token_accuracy": 0.6704590767621994, "num_tokens": 168247037.0, "step": 10440 }, { "epoch": 2.421833352648047, "grad_norm": 0.8356790542602539, "learning_rate": 4.4327381601344366e-05, "loss": 1.5619, "mean_token_accuracy": 0.6755863070487976, "num_tokens": 168408084.0, "step": 10450 }, { "epoch": 2.4241511183219377, "grad_norm": 0.9374634027481079, "learning_rate": 4.43154738729481e-05, "loss": 1.5821, "mean_token_accuracy": 0.6720118597149849, "num_tokens": 168568946.0, "step": 10460 }, { "epoch": 2.426468883995828, "grad_norm": 0.9714487195014954, "learning_rate": 4.4303555262509785e-05, "loss": 1.5777, "mean_token_accuracy": 0.67247574031353, "num_tokens": 168730719.0, "step": 10470 }, { "epoch": 2.4287866496697186, "grad_norm": 0.9927068948745728, "learning_rate": 4.4291625776744184e-05, "loss": 1.5576, "mean_token_accuracy": 0.6758719399571419, "num_tokens": 168890793.0, "step": 10480 }, { "epoch": 2.4311044153436088, "grad_norm": 0.8957602977752686, "learning_rate": 4.427968542237218e-05, "loss": 1.5741, "mean_token_accuracy": 0.6723880991339684, "num_tokens": 169052238.0, "step": 10490 }, { "epoch": 2.433422181017499, "grad_norm": 0.9023804664611816, "learning_rate": 4.4267734206120795e-05, "loss": 1.5701, "mean_token_accuracy": 0.6731338292360306, "num_tokens": 169213835.0, "step": 10500 }, { "epoch": 2.4357399466913896, "grad_norm": 0.9730679392814636, "learning_rate": 4.425577213472315e-05, "loss": 1.5705, "mean_token_accuracy": 0.6729345992207527, "num_tokens": 169374740.0, "step": 10510 }, { "epoch": 2.43805771236528, "grad_norm": 0.8819413185119629, "learning_rate": 4.4243799214918505e-05, "loss": 1.5616, "mean_token_accuracy": 0.673907695710659, "num_tokens": 169535677.0, "step": 10520 }, { "epoch": 2.4403754780391704, "grad_norm": 0.9637852311134338, "learning_rate": 4.423181545345221e-05, "loss": 1.5593, "mean_token_accuracy": 0.6754495143890381, "num_tokens": 169697029.0, "step": 10530 }, { "epoch": 2.4426932437130606, "grad_norm": 0.9592570662498474, "learning_rate": 4.421982085707574e-05, "loss": 1.5823, "mean_token_accuracy": 0.6709102541208267, "num_tokens": 169857069.0, "step": 10540 }, { "epoch": 2.445011009386951, "grad_norm": 0.9082629084587097, "learning_rate": 4.420781543254666e-05, "loss": 1.5727, "mean_token_accuracy": 0.6739875555038453, "num_tokens": 170019112.0, "step": 10550 }, { "epoch": 2.4473287750608415, "grad_norm": 0.9101300835609436, "learning_rate": 4.419579918662865e-05, "loss": 1.5856, "mean_token_accuracy": 0.6708488494157792, "num_tokens": 170180950.0, "step": 10560 }, { "epoch": 2.4496465407347316, "grad_norm": 0.9411409497261047, "learning_rate": 4.418377212609147e-05, "loss": 1.5815, "mean_token_accuracy": 0.6725569725036621, "num_tokens": 170342945.0, "step": 10570 }, { "epoch": 2.4519643064086223, "grad_norm": 0.88466477394104, "learning_rate": 4.417173425771099e-05, "loss": 1.5662, "mean_token_accuracy": 0.6740006327629089, "num_tokens": 170504305.0, "step": 10580 }, { "epoch": 2.4542820720825125, "grad_norm": 0.9148867726325989, "learning_rate": 4.4159685588269155e-05, "loss": 1.5813, "mean_token_accuracy": 0.6721958220005035, "num_tokens": 170664342.0, "step": 10590 }, { "epoch": 2.4565998377564027, "grad_norm": 0.9350669384002686, "learning_rate": 4.4147626124554e-05, "loss": 1.5856, "mean_token_accuracy": 0.6703262254595757, "num_tokens": 170826279.0, "step": 10600 }, { "epoch": 2.4589176034302933, "grad_norm": 0.8927809596061707, "learning_rate": 4.413555587335965e-05, "loss": 1.585, "mean_token_accuracy": 0.6736860975623131, "num_tokens": 170987500.0, "step": 10610 }, { "epoch": 2.4612353691041835, "grad_norm": 0.8849876523017883, "learning_rate": 4.412347484148629e-05, "loss": 1.5769, "mean_token_accuracy": 0.6735613837838172, "num_tokens": 171149395.0, "step": 10620 }, { "epoch": 2.463553134778074, "grad_norm": 0.8464459180831909, "learning_rate": 4.4111383035740195e-05, "loss": 1.5648, "mean_token_accuracy": 0.6729621708393096, "num_tokens": 171311027.0, "step": 10630 }, { "epoch": 2.4658709004519643, "grad_norm": 0.9017727375030518, "learning_rate": 4.40992804629337e-05, "loss": 1.5555, "mean_token_accuracy": 0.6740154176950455, "num_tokens": 171472977.0, "step": 10640 }, { "epoch": 2.4681886661258545, "grad_norm": 0.954342782497406, "learning_rate": 4.408716712988521e-05, "loss": 1.5815, "mean_token_accuracy": 0.6719877630472183, "num_tokens": 171634245.0, "step": 10650 }, { "epoch": 2.470506431799745, "grad_norm": 0.8446694016456604, "learning_rate": 4.4075043043419185e-05, "loss": 1.56, "mean_token_accuracy": 0.6749733552336693, "num_tokens": 171795807.0, "step": 10660 }, { "epoch": 2.4728241974736354, "grad_norm": 0.8895803689956665, "learning_rate": 4.406290821036616e-05, "loss": 1.5621, "mean_token_accuracy": 0.6744669511914253, "num_tokens": 171957155.0, "step": 10670 }, { "epoch": 2.475141963147526, "grad_norm": 0.9664355516433716, "learning_rate": 4.405076263756271e-05, "loss": 1.5697, "mean_token_accuracy": 0.6735344275832176, "num_tokens": 172117364.0, "step": 10680 }, { "epoch": 2.477459728821416, "grad_norm": 0.8307134509086609, "learning_rate": 4.403860633185146e-05, "loss": 1.5654, "mean_token_accuracy": 0.6753974452614784, "num_tokens": 172278473.0, "step": 10690 }, { "epoch": 2.4797774944953064, "grad_norm": 0.9777670502662659, "learning_rate": 4.4026439300081096e-05, "loss": 1.5832, "mean_token_accuracy": 0.6717260494828224, "num_tokens": 172439904.0, "step": 10700 }, { "epoch": 2.482095260169197, "grad_norm": 0.889365017414093, "learning_rate": 4.401426154910633e-05, "loss": 1.5843, "mean_token_accuracy": 0.6722819805145264, "num_tokens": 172599812.0, "step": 10710 }, { "epoch": 2.484413025843087, "grad_norm": 0.9120805263519287, "learning_rate": 4.400207308578792e-05, "loss": 1.5556, "mean_token_accuracy": 0.6754286393523217, "num_tokens": 172761648.0, "step": 10720 }, { "epoch": 2.486730791516978, "grad_norm": 0.9972118139266968, "learning_rate": 4.398987391699266e-05, "loss": 1.5599, "mean_token_accuracy": 0.6749235421419144, "num_tokens": 172922200.0, "step": 10730 }, { "epoch": 2.489048557190868, "grad_norm": 0.8839682340621948, "learning_rate": 4.397766404959338e-05, "loss": 1.5822, "mean_token_accuracy": 0.6725971043109894, "num_tokens": 173083090.0, "step": 10740 }, { "epoch": 2.4913663228647582, "grad_norm": 0.9008544087409973, "learning_rate": 4.396544349046894e-05, "loss": 1.5644, "mean_token_accuracy": 0.6726410865783692, "num_tokens": 173244248.0, "step": 10750 }, { "epoch": 2.493684088538649, "grad_norm": 0.8914052248001099, "learning_rate": 4.395321224650418e-05, "loss": 1.5513, "mean_token_accuracy": 0.6758178904652595, "num_tokens": 173406249.0, "step": 10760 }, { "epoch": 2.496001854212539, "grad_norm": 0.87981778383255, "learning_rate": 4.3940970324590035e-05, "loss": 1.5464, "mean_token_accuracy": 0.6773307830095291, "num_tokens": 173568248.0, "step": 10770 }, { "epoch": 2.4983196198864297, "grad_norm": 0.9518803358078003, "learning_rate": 4.3928717731623405e-05, "loss": 1.5798, "mean_token_accuracy": 0.6725856885313988, "num_tokens": 173729214.0, "step": 10780 }, { "epoch": 2.50063738556032, "grad_norm": 0.8708488941192627, "learning_rate": 4.39164544745072e-05, "loss": 1.5602, "mean_token_accuracy": 0.6757431715726853, "num_tokens": 173889846.0, "step": 10790 }, { "epoch": 2.50295515123421, "grad_norm": 0.873436450958252, "learning_rate": 4.3904180560150366e-05, "loss": 1.5653, "mean_token_accuracy": 0.6756262198090554, "num_tokens": 174051410.0, "step": 10800 }, { "epoch": 2.5052729169081007, "grad_norm": 0.8987941741943359, "learning_rate": 4.3891895995467826e-05, "loss": 1.568, "mean_token_accuracy": 0.6733195230364799, "num_tokens": 174211228.0, "step": 10810 }, { "epoch": 2.507590682581991, "grad_norm": 0.8480157256126404, "learning_rate": 4.387960078738053e-05, "loss": 1.5658, "mean_token_accuracy": 0.6736076414585114, "num_tokens": 174372508.0, "step": 10820 }, { "epoch": 2.5099084482558816, "grad_norm": 0.905218780040741, "learning_rate": 4.3867294942815395e-05, "loss": 1.5588, "mean_token_accuracy": 0.675604210793972, "num_tokens": 174533965.0, "step": 10830 }, { "epoch": 2.5122262139297717, "grad_norm": 0.8502569198608398, "learning_rate": 4.3854978468705355e-05, "loss": 1.564, "mean_token_accuracy": 0.6733383774757385, "num_tokens": 174695041.0, "step": 10840 }, { "epoch": 2.514543979603662, "grad_norm": 0.9308993220329285, "learning_rate": 4.3842651371989324e-05, "loss": 1.5638, "mean_token_accuracy": 0.6746580719947814, "num_tokens": 174856913.0, "step": 10850 }, { "epoch": 2.5168617452775526, "grad_norm": 0.8883436918258667, "learning_rate": 4.3830313659612185e-05, "loss": 1.5755, "mean_token_accuracy": 0.6738904371857644, "num_tokens": 175018635.0, "step": 10860 }, { "epoch": 2.5191795109514428, "grad_norm": 0.8833568096160889, "learning_rate": 4.381796533852484e-05, "loss": 1.5581, "mean_token_accuracy": 0.674318614602089, "num_tokens": 175180049.0, "step": 10870 }, { "epoch": 2.5214972766253334, "grad_norm": 0.826974630355835, "learning_rate": 4.3805606415684125e-05, "loss": 1.5685, "mean_token_accuracy": 0.67418182939291, "num_tokens": 175340841.0, "step": 10880 }, { "epoch": 2.5238150422992236, "grad_norm": 0.8228015303611755, "learning_rate": 4.379323689805288e-05, "loss": 1.5909, "mean_token_accuracy": 0.6711256340146065, "num_tokens": 175501643.0, "step": 10890 }, { "epoch": 2.526132807973114, "grad_norm": 0.8890436291694641, "learning_rate": 4.3780856792599885e-05, "loss": 1.5889, "mean_token_accuracy": 0.672724986076355, "num_tokens": 175663535.0, "step": 10900 }, { "epoch": 2.5284505736470044, "grad_norm": 0.9041359424591064, "learning_rate": 4.376846610629992e-05, "loss": 1.5788, "mean_token_accuracy": 0.6735109061002731, "num_tokens": 175824597.0, "step": 10910 }, { "epoch": 2.5307683393208946, "grad_norm": 0.8673981428146362, "learning_rate": 4.375606484613369e-05, "loss": 1.5839, "mean_token_accuracy": 0.6721540942788125, "num_tokens": 175986183.0, "step": 10920 }, { "epoch": 2.5330861049947853, "grad_norm": 0.8522638082504272, "learning_rate": 4.374365301908789e-05, "loss": 1.5513, "mean_token_accuracy": 0.6747685641050338, "num_tokens": 176147270.0, "step": 10930 }, { "epoch": 2.5354038706686755, "grad_norm": 0.886622428894043, "learning_rate": 4.373123063215515e-05, "loss": 1.5701, "mean_token_accuracy": 0.6743329510092735, "num_tokens": 176307222.0, "step": 10940 }, { "epoch": 2.5377216363425656, "grad_norm": 0.8299809694290161, "learning_rate": 4.3718797692334046e-05, "loss": 1.5617, "mean_token_accuracy": 0.6750142782926559, "num_tokens": 176468505.0, "step": 10950 }, { "epoch": 2.5400394020164563, "grad_norm": 0.9415794610977173, "learning_rate": 4.370635420662911e-05, "loss": 1.5685, "mean_token_accuracy": 0.6738820016384125, "num_tokens": 176630341.0, "step": 10960 }, { "epoch": 2.5423571676903465, "grad_norm": 0.8775012493133545, "learning_rate": 4.369390018205082e-05, "loss": 1.5822, "mean_token_accuracy": 0.6727201357483864, "num_tokens": 176791444.0, "step": 10970 }, { "epoch": 2.544674933364237, "grad_norm": 0.855474054813385, "learning_rate": 4.368143562561556e-05, "loss": 1.5825, "mean_token_accuracy": 0.6710058808326721, "num_tokens": 176953090.0, "step": 10980 }, { "epoch": 2.5469926990381273, "grad_norm": 0.9350031614303589, "learning_rate": 4.36689605443457e-05, "loss": 1.5857, "mean_token_accuracy": 0.6709888488054275, "num_tokens": 177114832.0, "step": 10990 }, { "epoch": 2.5493104647120175, "grad_norm": 0.8906374573707581, "learning_rate": 4.365647494526949e-05, "loss": 1.5758, "mean_token_accuracy": 0.6728141903877258, "num_tokens": 177276135.0, "step": 11000 }, { "epoch": 2.551628230385908, "grad_norm": 0.8895601034164429, "learning_rate": 4.364397883542112e-05, "loss": 1.5865, "mean_token_accuracy": 0.6723509863018989, "num_tokens": 177437812.0, "step": 11010 }, { "epoch": 2.5539459960597983, "grad_norm": 0.8723879456520081, "learning_rate": 4.363147222184072e-05, "loss": 1.5561, "mean_token_accuracy": 0.6752788960933686, "num_tokens": 177599102.0, "step": 11020 }, { "epoch": 2.556263761733689, "grad_norm": 0.9129688143730164, "learning_rate": 4.3618955111574325e-05, "loss": 1.5582, "mean_token_accuracy": 0.6758482456207275, "num_tokens": 177759845.0, "step": 11030 }, { "epoch": 2.558581527407579, "grad_norm": 0.8541902899742126, "learning_rate": 4.360642751167388e-05, "loss": 1.5503, "mean_token_accuracy": 0.6749211117625237, "num_tokens": 177921631.0, "step": 11040 }, { "epoch": 2.5608992930814694, "grad_norm": 0.8773646354675293, "learning_rate": 4.359388942919723e-05, "loss": 1.5685, "mean_token_accuracy": 0.6742424249649048, "num_tokens": 178082750.0, "step": 11050 }, { "epoch": 2.56321705875536, "grad_norm": 0.8767688274383545, "learning_rate": 4.358134087120817e-05, "loss": 1.5553, "mean_token_accuracy": 0.675481478869915, "num_tokens": 178243832.0, "step": 11060 }, { "epoch": 2.56553482442925, "grad_norm": 0.8202732801437378, "learning_rate": 4.3568781844776326e-05, "loss": 1.5606, "mean_token_accuracy": 0.6754263430833817, "num_tokens": 178405015.0, "step": 11070 }, { "epoch": 2.567852590103141, "grad_norm": 0.8970996141433716, "learning_rate": 4.355621235697728e-05, "loss": 1.566, "mean_token_accuracy": 0.6731954246759415, "num_tokens": 178566490.0, "step": 11080 }, { "epoch": 2.570170355777031, "grad_norm": 0.8919036388397217, "learning_rate": 4.35436324148925e-05, "loss": 1.5776, "mean_token_accuracy": 0.6737454801797866, "num_tokens": 178727530.0, "step": 11090 }, { "epoch": 2.572488121450921, "grad_norm": 0.8909868001937866, "learning_rate": 4.353104202560931e-05, "loss": 1.5668, "mean_token_accuracy": 0.674419891834259, "num_tokens": 178889012.0, "step": 11100 }, { "epoch": 2.574805887124812, "grad_norm": 0.8636408448219299, "learning_rate": 4.3518441196220975e-05, "loss": 1.5808, "mean_token_accuracy": 0.6719794899225235, "num_tokens": 179050215.0, "step": 11110 }, { "epoch": 2.577123652798702, "grad_norm": 1.0039021968841553, "learning_rate": 4.350582993382657e-05, "loss": 1.5717, "mean_token_accuracy": 0.67432641685009, "num_tokens": 179211381.0, "step": 11120 }, { "epoch": 2.5794414184725927, "grad_norm": 0.8170202374458313, "learning_rate": 4.349320824553111e-05, "loss": 1.5594, "mean_token_accuracy": 0.6762270256876945, "num_tokens": 179372443.0, "step": 11130 }, { "epoch": 2.581759184146483, "grad_norm": 0.8563057780265808, "learning_rate": 4.3480576138445454e-05, "loss": 1.5526, "mean_token_accuracy": 0.675521020591259, "num_tokens": 179532264.0, "step": 11140 }, { "epoch": 2.584076949820373, "grad_norm": 0.8967027068138123, "learning_rate": 4.3467933619686344e-05, "loss": 1.5659, "mean_token_accuracy": 0.6744699820876121, "num_tokens": 179692565.0, "step": 11150 }, { "epoch": 2.5863947154942637, "grad_norm": 0.8677548170089722, "learning_rate": 4.3455280696376376e-05, "loss": 1.5609, "mean_token_accuracy": 0.6752002596855163, "num_tokens": 179853379.0, "step": 11160 }, { "epoch": 2.588712481168154, "grad_norm": 0.8463841676712036, "learning_rate": 4.344261737564402e-05, "loss": 1.578, "mean_token_accuracy": 0.6717696934938431, "num_tokens": 180014948.0, "step": 11170 }, { "epoch": 2.5910302468420445, "grad_norm": 0.9409759044647217, "learning_rate": 4.342994366462358e-05, "loss": 1.5577, "mean_token_accuracy": 0.6755247414112091, "num_tokens": 180176032.0, "step": 11180 }, { "epoch": 2.5933480125159347, "grad_norm": 0.9245473742485046, "learning_rate": 4.341725957045525e-05, "loss": 1.5489, "mean_token_accuracy": 0.6752829551696777, "num_tokens": 180337756.0, "step": 11190 }, { "epoch": 2.595665778189825, "grad_norm": 0.9055729508399963, "learning_rate": 4.340456510028505e-05, "loss": 1.5603, "mean_token_accuracy": 0.6738794535398483, "num_tokens": 180499230.0, "step": 11200 }, { "epoch": 2.597983543863715, "grad_norm": 0.8280284404754639, "learning_rate": 4.339186026126484e-05, "loss": 1.5468, "mean_token_accuracy": 0.6773240149021149, "num_tokens": 180660905.0, "step": 11210 }, { "epoch": 2.6003013095376057, "grad_norm": 0.890791654586792, "learning_rate": 4.337914506055234e-05, "loss": 1.5857, "mean_token_accuracy": 0.6720955550670624, "num_tokens": 180821572.0, "step": 11220 }, { "epoch": 2.6026190752114964, "grad_norm": 0.897883415222168, "learning_rate": 4.3366419505311095e-05, "loss": 1.5644, "mean_token_accuracy": 0.6729147583246231, "num_tokens": 180982715.0, "step": 11230 }, { "epoch": 2.6049368408853866, "grad_norm": 0.8671573400497437, "learning_rate": 4.3353683602710486e-05, "loss": 1.5777, "mean_token_accuracy": 0.6725055798888206, "num_tokens": 181143819.0, "step": 11240 }, { "epoch": 2.6072546065592768, "grad_norm": 0.9871276021003723, "learning_rate": 4.334093735992574e-05, "loss": 1.567, "mean_token_accuracy": 0.6740219831466675, "num_tokens": 181305359.0, "step": 11250 }, { "epoch": 2.609572372233167, "grad_norm": 0.9208345413208008, "learning_rate": 4.332818078413788e-05, "loss": 1.5735, "mean_token_accuracy": 0.6739403799176216, "num_tokens": 181466023.0, "step": 11260 }, { "epoch": 2.6118901379070576, "grad_norm": 0.9958164095878601, "learning_rate": 4.331541388253376e-05, "loss": 1.566, "mean_token_accuracy": 0.6741639629006386, "num_tokens": 181626161.0, "step": 11270 }, { "epoch": 2.6142079035809482, "grad_norm": 0.9709334969520569, "learning_rate": 4.3302636662306086e-05, "loss": 1.5655, "mean_token_accuracy": 0.6752532333135605, "num_tokens": 181787569.0, "step": 11280 }, { "epoch": 2.6165256692548384, "grad_norm": 0.9177526235580444, "learning_rate": 4.3289849130653315e-05, "loss": 1.5672, "mean_token_accuracy": 0.6744115933775902, "num_tokens": 181948718.0, "step": 11290 }, { "epoch": 2.6188434349287286, "grad_norm": 0.9137944579124451, "learning_rate": 4.3277051294779756e-05, "loss": 1.5646, "mean_token_accuracy": 0.6743355393409729, "num_tokens": 182110036.0, "step": 11300 }, { "epoch": 2.621161200602619, "grad_norm": 0.9123247265815735, "learning_rate": 4.326424316189553e-05, "loss": 1.5848, "mean_token_accuracy": 0.6720136150717735, "num_tokens": 182271561.0, "step": 11310 }, { "epoch": 2.6234789662765094, "grad_norm": 0.8899207711219788, "learning_rate": 4.3251424739216525e-05, "loss": 1.5666, "mean_token_accuracy": 0.6729052364826202, "num_tokens": 182433575.0, "step": 11320 }, { "epoch": 2.6257967319504, "grad_norm": 0.9065485000610352, "learning_rate": 4.3238596033964455e-05, "loss": 1.5746, "mean_token_accuracy": 0.6733144521713257, "num_tokens": 182594821.0, "step": 11330 }, { "epoch": 2.6281144976242903, "grad_norm": 0.9548599123954773, "learning_rate": 4.32257570533668e-05, "loss": 1.5566, "mean_token_accuracy": 0.6754528194665909, "num_tokens": 182756335.0, "step": 11340 }, { "epoch": 2.6304322632981805, "grad_norm": 0.9275557398796082, "learning_rate": 4.321290780465686e-05, "loss": 1.5599, "mean_token_accuracy": 0.6736985489726066, "num_tokens": 182917035.0, "step": 11350 }, { "epoch": 2.6327500289720707, "grad_norm": 0.9197682738304138, "learning_rate": 4.32000482950737e-05, "loss": 1.5596, "mean_token_accuracy": 0.6745417237281799, "num_tokens": 183078689.0, "step": 11360 }, { "epoch": 2.6350677946459613, "grad_norm": 0.9058223962783813, "learning_rate": 4.318717853186217e-05, "loss": 1.5568, "mean_token_accuracy": 0.6748128354549408, "num_tokens": 183239901.0, "step": 11370 }, { "epoch": 2.637385560319852, "grad_norm": 0.9426844120025635, "learning_rate": 4.31742985222729e-05, "loss": 1.5813, "mean_token_accuracy": 0.6719145327806473, "num_tokens": 183401169.0, "step": 11380 }, { "epoch": 2.639703325993742, "grad_norm": 0.9459543228149414, "learning_rate": 4.3161408273562286e-05, "loss": 1.5635, "mean_token_accuracy": 0.6737226083874702, "num_tokens": 183562516.0, "step": 11390 }, { "epoch": 2.6420210916676323, "grad_norm": 0.9817150235176086, "learning_rate": 4.3148507792992504e-05, "loss": 1.5705, "mean_token_accuracy": 0.6741032570600509, "num_tokens": 183721154.0, "step": 11400 }, { "epoch": 2.6443388573415225, "grad_norm": 0.9357630014419556, "learning_rate": 4.313559708783148e-05, "loss": 1.5602, "mean_token_accuracy": 0.6751904353499413, "num_tokens": 183882084.0, "step": 11410 }, { "epoch": 2.646656623015413, "grad_norm": 0.9003283381462097, "learning_rate": 4.312267616535291e-05, "loss": 1.5697, "mean_token_accuracy": 0.6747008800506592, "num_tokens": 184043875.0, "step": 11420 }, { "epoch": 2.6489743886893033, "grad_norm": 0.9971180558204651, "learning_rate": 4.310974503283624e-05, "loss": 1.5708, "mean_token_accuracy": 0.6744449958205223, "num_tokens": 184205945.0, "step": 11430 }, { "epoch": 2.651292154363194, "grad_norm": 0.8551807999610901, "learning_rate": 4.309680369756668e-05, "loss": 1.5728, "mean_token_accuracy": 0.6739504963159562, "num_tokens": 184367420.0, "step": 11440 }, { "epoch": 2.653609920037084, "grad_norm": 0.8934165239334106, "learning_rate": 4.308385216683517e-05, "loss": 1.5804, "mean_token_accuracy": 0.6719068944454193, "num_tokens": 184528675.0, "step": 11450 }, { "epoch": 2.6559276857109744, "grad_norm": 0.9239146709442139, "learning_rate": 4.307089044793842e-05, "loss": 1.5444, "mean_token_accuracy": 0.6774209022521973, "num_tokens": 184690052.0, "step": 11460 }, { "epoch": 2.658245451384865, "grad_norm": 0.8739234805107117, "learning_rate": 4.305791854817884e-05, "loss": 1.5613, "mean_token_accuracy": 0.6741311132907868, "num_tokens": 184851744.0, "step": 11470 }, { "epoch": 2.660563217058755, "grad_norm": 0.8649729490280151, "learning_rate": 4.3044936474864624e-05, "loss": 1.5689, "mean_token_accuracy": 0.6734780266880989, "num_tokens": 185013332.0, "step": 11480 }, { "epoch": 2.662880982732646, "grad_norm": 0.8884968757629395, "learning_rate": 4.303194423530966e-05, "loss": 1.5727, "mean_token_accuracy": 0.6747723415493965, "num_tokens": 185174473.0, "step": 11490 }, { "epoch": 2.665198748406536, "grad_norm": 0.8753563761711121, "learning_rate": 4.301894183683357e-05, "loss": 1.5758, "mean_token_accuracy": 0.6728445917367936, "num_tokens": 185336139.0, "step": 11500 }, { "epoch": 2.6675165140804262, "grad_norm": 0.8769884705543518, "learning_rate": 4.300592928676172e-05, "loss": 1.5645, "mean_token_accuracy": 0.6738799884915352, "num_tokens": 185497393.0, "step": 11510 }, { "epoch": 2.669834279754317, "grad_norm": 0.8846433162689209, "learning_rate": 4.299290659242517e-05, "loss": 1.5713, "mean_token_accuracy": 0.6725337535142899, "num_tokens": 185657885.0, "step": 11520 }, { "epoch": 2.672152045428207, "grad_norm": 0.8534242510795593, "learning_rate": 4.2979873761160724e-05, "loss": 1.5776, "mean_token_accuracy": 0.6735530570149422, "num_tokens": 185818895.0, "step": 11530 }, { "epoch": 2.6744698111020977, "grad_norm": 0.8382574915885925, "learning_rate": 4.2966830800310866e-05, "loss": 1.5733, "mean_token_accuracy": 0.6729929804801941, "num_tokens": 185980358.0, "step": 11540 }, { "epoch": 2.676787576775988, "grad_norm": 0.8923628330230713, "learning_rate": 4.29537777172238e-05, "loss": 1.576, "mean_token_accuracy": 0.6733607828617096, "num_tokens": 186142005.0, "step": 11550 }, { "epoch": 2.679105342449878, "grad_norm": 0.84076988697052, "learning_rate": 4.2940714519253435e-05, "loss": 1.5383, "mean_token_accuracy": 0.676521846652031, "num_tokens": 186303098.0, "step": 11560 }, { "epoch": 2.6814231081237687, "grad_norm": 0.8393909931182861, "learning_rate": 4.292764121375938e-05, "loss": 1.557, "mean_token_accuracy": 0.6746822193264961, "num_tokens": 186462358.0, "step": 11570 }, { "epoch": 2.683740873797659, "grad_norm": 0.8806865215301514, "learning_rate": 4.2914557808106925e-05, "loss": 1.5646, "mean_token_accuracy": 0.6740328907966614, "num_tokens": 186621964.0, "step": 11580 }, { "epoch": 2.6860586394715495, "grad_norm": 0.9114323854446411, "learning_rate": 4.290146430966707e-05, "loss": 1.5484, "mean_token_accuracy": 0.6769006386399269, "num_tokens": 186784054.0, "step": 11590 }, { "epoch": 2.6883764051454397, "grad_norm": 0.9245296716690063, "learning_rate": 4.288836072581649e-05, "loss": 1.5704, "mean_token_accuracy": 0.6739474356174469, "num_tokens": 186945519.0, "step": 11600 }, { "epoch": 2.69069417081933, "grad_norm": 0.8723097443580627, "learning_rate": 4.287524706393755e-05, "loss": 1.5444, "mean_token_accuracy": 0.6769179582595826, "num_tokens": 187104639.0, "step": 11610 }, { "epoch": 2.6930119364932206, "grad_norm": 0.8546950221061707, "learning_rate": 4.286212333141827e-05, "loss": 1.5647, "mean_token_accuracy": 0.6754549443721771, "num_tokens": 187265908.0, "step": 11620 }, { "epoch": 2.6953297021671108, "grad_norm": 0.9270439743995667, "learning_rate": 4.2848989535652366e-05, "loss": 1.5676, "mean_token_accuracy": 0.6722413212060928, "num_tokens": 187427048.0, "step": 11630 }, { "epoch": 2.6976474678410014, "grad_norm": 0.9118633270263672, "learning_rate": 4.283584568403922e-05, "loss": 1.5603, "mean_token_accuracy": 0.6742161229252815, "num_tokens": 187588962.0, "step": 11640 }, { "epoch": 2.6999652335148916, "grad_norm": 1.0224332809448242, "learning_rate": 4.282269178398388e-05, "loss": 1.5621, "mean_token_accuracy": 0.6737453579902649, "num_tokens": 187749812.0, "step": 11650 }, { "epoch": 2.702282999188782, "grad_norm": 0.8997107744216919, "learning_rate": 4.2809527842897046e-05, "loss": 1.5619, "mean_token_accuracy": 0.6730934292078018, "num_tokens": 187910449.0, "step": 11660 }, { "epoch": 2.7046007648626724, "grad_norm": 0.8585659861564636, "learning_rate": 4.279635386819509e-05, "loss": 1.5656, "mean_token_accuracy": 0.6739537209272385, "num_tokens": 188071050.0, "step": 11670 }, { "epoch": 2.7069185305365626, "grad_norm": 0.8320568799972534, "learning_rate": 4.278316986730001e-05, "loss": 1.5806, "mean_token_accuracy": 0.6716464504599571, "num_tokens": 188231519.0, "step": 11680 }, { "epoch": 2.7092362962104533, "grad_norm": 0.9187573790550232, "learning_rate": 4.276997584763949e-05, "loss": 1.5714, "mean_token_accuracy": 0.6729305416345597, "num_tokens": 188392771.0, "step": 11690 }, { "epoch": 2.7115540618843434, "grad_norm": 0.8503731489181519, "learning_rate": 4.275677181664682e-05, "loss": 1.5551, "mean_token_accuracy": 0.6742112144827843, "num_tokens": 188554513.0, "step": 11700 }, { "epoch": 2.7138718275582336, "grad_norm": 0.894567608833313, "learning_rate": 4.274355778176096e-05, "loss": 1.5515, "mean_token_accuracy": 0.6770034074783325, "num_tokens": 188715348.0, "step": 11710 }, { "epoch": 2.7161895932321243, "grad_norm": 0.8807804584503174, "learning_rate": 4.2730333750426485e-05, "loss": 1.5653, "mean_token_accuracy": 0.6737002477049827, "num_tokens": 188875781.0, "step": 11720 }, { "epoch": 2.7185073589060145, "grad_norm": 0.8920128345489502, "learning_rate": 4.2717099730093634e-05, "loss": 1.5829, "mean_token_accuracy": 0.6724794462323189, "num_tokens": 189037127.0, "step": 11730 }, { "epoch": 2.720825124579905, "grad_norm": 0.9108846187591553, "learning_rate": 4.2703855728218226e-05, "loss": 1.5697, "mean_token_accuracy": 0.6734240531921387, "num_tokens": 189199098.0, "step": 11740 }, { "epoch": 2.7231428902537953, "grad_norm": 0.9042876362800598, "learning_rate": 4.269060175226174e-05, "loss": 1.5788, "mean_token_accuracy": 0.671507041156292, "num_tokens": 189359673.0, "step": 11750 }, { "epoch": 2.7254606559276855, "grad_norm": 0.9487659931182861, "learning_rate": 4.267733780969126e-05, "loss": 1.5675, "mean_token_accuracy": 0.6741176128387452, "num_tokens": 189519411.0, "step": 11760 }, { "epoch": 2.727778421601576, "grad_norm": 0.8899070024490356, "learning_rate": 4.2664063907979505e-05, "loss": 1.5676, "mean_token_accuracy": 0.6737745374441146, "num_tokens": 189681125.0, "step": 11770 }, { "epoch": 2.7300961872754663, "grad_norm": 0.8399838209152222, "learning_rate": 4.265078005460476e-05, "loss": 1.5814, "mean_token_accuracy": 0.6717970848083497, "num_tokens": 189842602.0, "step": 11780 }, { "epoch": 2.732413952949357, "grad_norm": 0.8886008858680725, "learning_rate": 4.263748625705096e-05, "loss": 1.5784, "mean_token_accuracy": 0.6732654422521591, "num_tokens": 190004775.0, "step": 11790 }, { "epoch": 2.734731718623247, "grad_norm": 0.9198834300041199, "learning_rate": 4.262418252280762e-05, "loss": 1.5661, "mean_token_accuracy": 0.674604544043541, "num_tokens": 190166741.0, "step": 11800 }, { "epoch": 2.7370494842971373, "grad_norm": 0.8949372172355652, "learning_rate": 4.261086885936987e-05, "loss": 1.5576, "mean_token_accuracy": 0.6744737982749939, "num_tokens": 190328434.0, "step": 11810 }, { "epoch": 2.739367249971028, "grad_norm": 0.8204642534255981, "learning_rate": 4.259754527423841e-05, "loss": 1.5715, "mean_token_accuracy": 0.673452478647232, "num_tokens": 190490303.0, "step": 11820 }, { "epoch": 2.741685015644918, "grad_norm": 0.922583818435669, "learning_rate": 4.258421177491956e-05, "loss": 1.5697, "mean_token_accuracy": 0.6740804925560951, "num_tokens": 190651763.0, "step": 11830 }, { "epoch": 2.744002781318809, "grad_norm": 0.9431923627853394, "learning_rate": 4.25708683689252e-05, "loss": 1.5768, "mean_token_accuracy": 0.6729589581489563, "num_tokens": 190813364.0, "step": 11840 }, { "epoch": 2.746320546992699, "grad_norm": 0.8981934785842896, "learning_rate": 4.25575150637728e-05, "loss": 1.5481, "mean_token_accuracy": 0.6768338412046433, "num_tokens": 190974266.0, "step": 11850 }, { "epoch": 2.748638312666589, "grad_norm": 0.8502449989318848, "learning_rate": 4.254415186698541e-05, "loss": 1.5485, "mean_token_accuracy": 0.6756613209843636, "num_tokens": 191134479.0, "step": 11860 }, { "epoch": 2.75095607834048, "grad_norm": 0.8128703236579895, "learning_rate": 4.2530778786091653e-05, "loss": 1.5745, "mean_token_accuracy": 0.6729095712304115, "num_tokens": 191295640.0, "step": 11870 }, { "epoch": 2.75327384401437, "grad_norm": 0.8941384553909302, "learning_rate": 4.251739582862571e-05, "loss": 1.5702, "mean_token_accuracy": 0.6733563661575317, "num_tokens": 191457457.0, "step": 11880 }, { "epoch": 2.7555916096882607, "grad_norm": 0.8608803749084473, "learning_rate": 4.250400300212734e-05, "loss": 1.5701, "mean_token_accuracy": 0.6731189489364624, "num_tokens": 191618951.0, "step": 11890 }, { "epoch": 2.757909375362151, "grad_norm": 0.9867350459098816, "learning_rate": 4.2490600314141856e-05, "loss": 1.5829, "mean_token_accuracy": 0.6720508396625519, "num_tokens": 191779970.0, "step": 11900 }, { "epoch": 2.760227141036041, "grad_norm": 0.9466263651847839, "learning_rate": 4.247718777222012e-05, "loss": 1.5845, "mean_token_accuracy": 0.6722267687320709, "num_tokens": 191940117.0, "step": 11910 }, { "epoch": 2.7625449067099317, "grad_norm": 0.8910266757011414, "learning_rate": 4.246376538391857e-05, "loss": 1.5683, "mean_token_accuracy": 0.674344827234745, "num_tokens": 192101785.0, "step": 11920 }, { "epoch": 2.764862672383822, "grad_norm": 0.8677080273628235, "learning_rate": 4.2450333156799154e-05, "loss": 1.562, "mean_token_accuracy": 0.6745221495628357, "num_tokens": 192262567.0, "step": 11930 }, { "epoch": 2.7671804380577125, "grad_norm": 0.8454301357269287, "learning_rate": 4.243689109842939e-05, "loss": 1.5472, "mean_token_accuracy": 0.6764202535152435, "num_tokens": 192424623.0, "step": 11940 }, { "epoch": 2.7694982037316027, "grad_norm": 0.8997695446014404, "learning_rate": 4.242343921638234e-05, "loss": 1.5602, "mean_token_accuracy": 0.6751050487160682, "num_tokens": 192586370.0, "step": 11950 }, { "epoch": 2.771815969405493, "grad_norm": 0.9458882212638855, "learning_rate": 4.240997751823657e-05, "loss": 1.5608, "mean_token_accuracy": 0.6745018422603607, "num_tokens": 192747407.0, "step": 11960 }, { "epoch": 2.7741337350793835, "grad_norm": 0.8895732164382935, "learning_rate": 4.239650601157621e-05, "loss": 1.5646, "mean_token_accuracy": 0.6750097766518592, "num_tokens": 192908341.0, "step": 11970 }, { "epoch": 2.7764515007532737, "grad_norm": 0.9410310983657837, "learning_rate": 4.238302470399088e-05, "loss": 1.5621, "mean_token_accuracy": 0.6755830854177475, "num_tokens": 193069718.0, "step": 11980 }, { "epoch": 2.7787692664271644, "grad_norm": 0.8982763290405273, "learning_rate": 4.236953360307576e-05, "loss": 1.5538, "mean_token_accuracy": 0.6756154477596283, "num_tokens": 193229865.0, "step": 11990 }, { "epoch": 2.7810870321010546, "grad_norm": 0.8440506458282471, "learning_rate": 4.2356032716431526e-05, "loss": 1.5653, "mean_token_accuracy": 0.6733071342110634, "num_tokens": 193391745.0, "step": 12000 }, { "epoch": 2.7834047977749448, "grad_norm": 0.8488901853561401, "learning_rate": 4.234252205166437e-05, "loss": 1.5538, "mean_token_accuracy": 0.6749999463558197, "num_tokens": 193553172.0, "step": 12010 }, { "epoch": 2.7857225634488354, "grad_norm": 0.8835523128509521, "learning_rate": 4.232900161638599e-05, "loss": 1.571, "mean_token_accuracy": 0.6729784458875656, "num_tokens": 193714064.0, "step": 12020 }, { "epoch": 2.7880403291227256, "grad_norm": 0.8581920862197876, "learning_rate": 4.231547141821359e-05, "loss": 1.5659, "mean_token_accuracy": 0.6753469064831734, "num_tokens": 193874927.0, "step": 12030 }, { "epoch": 2.7903580947966162, "grad_norm": 0.889579713344574, "learning_rate": 4.2301931464769886e-05, "loss": 1.5704, "mean_token_accuracy": 0.6733390867710114, "num_tokens": 194034885.0, "step": 12040 }, { "epoch": 2.7926758604705064, "grad_norm": 0.8801897168159485, "learning_rate": 4.228838176368307e-05, "loss": 1.5729, "mean_token_accuracy": 0.6732494577765464, "num_tokens": 194196668.0, "step": 12050 }, { "epoch": 2.7949936261443966, "grad_norm": 0.8805829286575317, "learning_rate": 4.227482232258685e-05, "loss": 1.5615, "mean_token_accuracy": 0.6757313415408135, "num_tokens": 194358473.0, "step": 12060 }, { "epoch": 2.7973113918182873, "grad_norm": 0.8420342803001404, "learning_rate": 4.2261253149120394e-05, "loss": 1.5719, "mean_token_accuracy": 0.6737048774957657, "num_tokens": 194520052.0, "step": 12070 }, { "epoch": 2.7996291574921774, "grad_norm": 0.9021820425987244, "learning_rate": 4.224767425092837e-05, "loss": 1.5488, "mean_token_accuracy": 0.6761628642678261, "num_tokens": 194680240.0, "step": 12080 }, { "epoch": 2.801946923166068, "grad_norm": 0.9048514366149902, "learning_rate": 4.2234085635660925e-05, "loss": 1.5619, "mean_token_accuracy": 0.6737291485071182, "num_tokens": 194841925.0, "step": 12090 }, { "epoch": 2.8042646888399583, "grad_norm": 0.8300871253013611, "learning_rate": 4.222048731097367e-05, "loss": 1.5565, "mean_token_accuracy": 0.6741579785943032, "num_tokens": 195003342.0, "step": 12100 }, { "epoch": 2.8065824545138485, "grad_norm": 0.8433386087417603, "learning_rate": 4.22068792845277e-05, "loss": 1.5505, "mean_token_accuracy": 0.6757718920707703, "num_tokens": 195164637.0, "step": 12110 }, { "epoch": 2.808900220187739, "grad_norm": 0.919049084186554, "learning_rate": 4.219326156398957e-05, "loss": 1.5519, "mean_token_accuracy": 0.6772834420204162, "num_tokens": 195326460.0, "step": 12120 }, { "epoch": 2.8112179858616293, "grad_norm": 0.866165816783905, "learning_rate": 4.217963415703128e-05, "loss": 1.5523, "mean_token_accuracy": 0.6755140542984008, "num_tokens": 195486691.0, "step": 12130 }, { "epoch": 2.81353575153552, "grad_norm": 0.9088456034660339, "learning_rate": 4.216599707133032e-05, "loss": 1.5577, "mean_token_accuracy": 0.6747608780860901, "num_tokens": 195648042.0, "step": 12140 }, { "epoch": 2.81585351720941, "grad_norm": 0.8617961406707764, "learning_rate": 4.215235031456961e-05, "loss": 1.5598, "mean_token_accuracy": 0.6732538625597954, "num_tokens": 195808947.0, "step": 12150 }, { "epoch": 2.8181712828833003, "grad_norm": 0.8971878290176392, "learning_rate": 4.2138693894437517e-05, "loss": 1.5696, "mean_token_accuracy": 0.6741226762533188, "num_tokens": 195970335.0, "step": 12160 }, { "epoch": 2.820489048557191, "grad_norm": 0.8796852827072144, "learning_rate": 4.212502781862786e-05, "loss": 1.5688, "mean_token_accuracy": 0.673075620830059, "num_tokens": 196131456.0, "step": 12170 }, { "epoch": 2.822806814231081, "grad_norm": 0.8547409176826477, "learning_rate": 4.211135209483991e-05, "loss": 1.5583, "mean_token_accuracy": 0.6746181815862655, "num_tokens": 196291194.0, "step": 12180 }, { "epoch": 2.825124579904972, "grad_norm": 0.9274863004684448, "learning_rate": 4.209766673077834e-05, "loss": 1.5391, "mean_token_accuracy": 0.6770692944526673, "num_tokens": 196452548.0, "step": 12190 }, { "epoch": 2.827442345578862, "grad_norm": 0.862734854221344, "learning_rate": 4.2083971734153284e-05, "loss": 1.5562, "mean_token_accuracy": 0.6752054691314697, "num_tokens": 196614341.0, "step": 12200 }, { "epoch": 2.829760111252752, "grad_norm": 0.8328682780265808, "learning_rate": 4.20702671126803e-05, "loss": 1.5498, "mean_token_accuracy": 0.6762443408370018, "num_tokens": 196775595.0, "step": 12210 }, { "epoch": 2.832077876926643, "grad_norm": 0.8409584164619446, "learning_rate": 4.205655287408036e-05, "loss": 1.5621, "mean_token_accuracy": 0.6737825304269791, "num_tokens": 196937096.0, "step": 12220 }, { "epoch": 2.834395642600533, "grad_norm": 0.8545731902122498, "learning_rate": 4.204282902607984e-05, "loss": 1.5661, "mean_token_accuracy": 0.6745387986302376, "num_tokens": 197098969.0, "step": 12230 }, { "epoch": 2.8367134082744236, "grad_norm": 0.8358912467956543, "learning_rate": 4.202909557641057e-05, "loss": 1.5654, "mean_token_accuracy": 0.6744544863700866, "num_tokens": 197260307.0, "step": 12240 }, { "epoch": 2.839031173948314, "grad_norm": 0.8719840049743652, "learning_rate": 4.201535253280975e-05, "loss": 1.5629, "mean_token_accuracy": 0.674602136015892, "num_tokens": 197421512.0, "step": 12250 }, { "epoch": 2.841348939622204, "grad_norm": 0.8983206152915955, "learning_rate": 4.2001599903019994e-05, "loss": 1.57, "mean_token_accuracy": 0.6734285235404969, "num_tokens": 197582981.0, "step": 12260 }, { "epoch": 2.8436667052960947, "grad_norm": 0.8839084506034851, "learning_rate": 4.1987837694789345e-05, "loss": 1.5554, "mean_token_accuracy": 0.6750908702611923, "num_tokens": 197744437.0, "step": 12270 }, { "epoch": 2.845984470969985, "grad_norm": 0.8871994018554688, "learning_rate": 4.19740659158712e-05, "loss": 1.5515, "mean_token_accuracy": 0.6745428651571274, "num_tokens": 197906389.0, "step": 12280 }, { "epoch": 2.8483022366438755, "grad_norm": 0.8355122804641724, "learning_rate": 4.196028457402437e-05, "loss": 1.564, "mean_token_accuracy": 0.6759405359625816, "num_tokens": 198067896.0, "step": 12290 }, { "epoch": 2.8506200023177657, "grad_norm": 0.8434383869171143, "learning_rate": 4.194649367701307e-05, "loss": 1.5614, "mean_token_accuracy": 0.6761018916964531, "num_tokens": 198229144.0, "step": 12300 }, { "epoch": 2.852937767991656, "grad_norm": 0.8985278606414795, "learning_rate": 4.193269323260685e-05, "loss": 1.5459, "mean_token_accuracy": 0.6752711206674575, "num_tokens": 198389325.0, "step": 12310 }, { "epoch": 2.8552555336655465, "grad_norm": 0.8858087658882141, "learning_rate": 4.19188832485807e-05, "loss": 1.5589, "mean_token_accuracy": 0.6758595526218414, "num_tokens": 198551060.0, "step": 12320 }, { "epoch": 2.8575732993394367, "grad_norm": 0.8943134546279907, "learning_rate": 4.190506373271493e-05, "loss": 1.567, "mean_token_accuracy": 0.6736319020390511, "num_tokens": 198713340.0, "step": 12330 }, { "epoch": 2.8598910650133273, "grad_norm": 0.8552795052528381, "learning_rate": 4.1891234692795264e-05, "loss": 1.5482, "mean_token_accuracy": 0.67644362449646, "num_tokens": 198875445.0, "step": 12340 }, { "epoch": 2.8622088306872175, "grad_norm": 0.9147897362709045, "learning_rate": 4.187739613661275e-05, "loss": 1.5573, "mean_token_accuracy": 0.6769810304045677, "num_tokens": 199035291.0, "step": 12350 }, { "epoch": 2.8645265963611077, "grad_norm": 0.8299669027328491, "learning_rate": 4.186354807196384e-05, "loss": 1.547, "mean_token_accuracy": 0.675946268439293, "num_tokens": 199197108.0, "step": 12360 }, { "epoch": 2.8668443620349984, "grad_norm": 0.8724255561828613, "learning_rate": 4.1849690506650304e-05, "loss": 1.5663, "mean_token_accuracy": 0.6734787464141846, "num_tokens": 199358998.0, "step": 12370 }, { "epoch": 2.8691621277088886, "grad_norm": 0.8468236327171326, "learning_rate": 4.1835823448479286e-05, "loss": 1.5448, "mean_token_accuracy": 0.6760605558753013, "num_tokens": 199519982.0, "step": 12380 }, { "epoch": 2.871479893382779, "grad_norm": 0.8902110457420349, "learning_rate": 4.182194690526328e-05, "loss": 1.5597, "mean_token_accuracy": 0.6751898244023323, "num_tokens": 199681759.0, "step": 12390 }, { "epoch": 2.8737976590566694, "grad_norm": 0.9925691485404968, "learning_rate": 4.180806088482011e-05, "loss": 1.5566, "mean_token_accuracy": 0.675204236805439, "num_tokens": 199842457.0, "step": 12400 }, { "epoch": 2.8761154247305596, "grad_norm": 0.8955065608024597, "learning_rate": 4.179416539497295e-05, "loss": 1.5612, "mean_token_accuracy": 0.6736524894833564, "num_tokens": 200004264.0, "step": 12410 }, { "epoch": 2.8784331904044502, "grad_norm": 0.8608587384223938, "learning_rate": 4.1780260443550315e-05, "loss": 1.5494, "mean_token_accuracy": 0.675679224729538, "num_tokens": 200165995.0, "step": 12420 }, { "epoch": 2.8807509560783404, "grad_norm": 0.875451385974884, "learning_rate": 4.176634603838603e-05, "loss": 1.5675, "mean_token_accuracy": 0.6735008269548416, "num_tokens": 200327117.0, "step": 12430 }, { "epoch": 2.883068721752231, "grad_norm": 0.8969104290008545, "learning_rate": 4.175242218731926e-05, "loss": 1.5602, "mean_token_accuracy": 0.6748377084732056, "num_tokens": 200488592.0, "step": 12440 }, { "epoch": 2.8853864874261212, "grad_norm": 0.8477093577384949, "learning_rate": 4.17384888981945e-05, "loss": 1.5677, "mean_token_accuracy": 0.6745888903737068, "num_tokens": 200650229.0, "step": 12450 }, { "epoch": 2.8877042531000114, "grad_norm": 0.9411566257476807, "learning_rate": 4.172454617886153e-05, "loss": 1.5553, "mean_token_accuracy": 0.6755340591073036, "num_tokens": 200811396.0, "step": 12460 }, { "epoch": 2.890022018773902, "grad_norm": 0.8868020176887512, "learning_rate": 4.171059403717549e-05, "loss": 1.5515, "mean_token_accuracy": 0.6740733176469803, "num_tokens": 200972981.0, "step": 12470 }, { "epoch": 2.8923397844477923, "grad_norm": 0.8923588991165161, "learning_rate": 4.1696632480996784e-05, "loss": 1.5591, "mean_token_accuracy": 0.6727210462093354, "num_tokens": 201134334.0, "step": 12480 }, { "epoch": 2.894657550121683, "grad_norm": 0.8607123494148254, "learning_rate": 4.168266151819115e-05, "loss": 1.553, "mean_token_accuracy": 0.6754673823714257, "num_tokens": 201296113.0, "step": 12490 }, { "epoch": 2.896975315795573, "grad_norm": 0.8676448464393616, "learning_rate": 4.1668681156629597e-05, "loss": 1.5509, "mean_token_accuracy": 0.6759845599532127, "num_tokens": 201457454.0, "step": 12500 }, { "epoch": 2.8992930814694633, "grad_norm": 0.8919562101364136, "learning_rate": 4.165469140418847e-05, "loss": 1.5474, "mean_token_accuracy": 0.6765233129262924, "num_tokens": 201618497.0, "step": 12510 }, { "epoch": 2.901610847143354, "grad_norm": 0.8883578181266785, "learning_rate": 4.164069226874937e-05, "loss": 1.5597, "mean_token_accuracy": 0.6760734543204308, "num_tokens": 201779517.0, "step": 12520 }, { "epoch": 2.903928612817244, "grad_norm": 0.8537785410881042, "learning_rate": 4.1626683758199194e-05, "loss": 1.5615, "mean_token_accuracy": 0.6750468447804451, "num_tokens": 201941046.0, "step": 12530 }, { "epoch": 2.9062463784911348, "grad_norm": 0.8594660758972168, "learning_rate": 4.161266588043012e-05, "loss": 1.5646, "mean_token_accuracy": 0.6734835341572761, "num_tokens": 202102581.0, "step": 12540 }, { "epoch": 2.908564144165025, "grad_norm": 0.9539382457733154, "learning_rate": 4.159863864333961e-05, "loss": 1.5468, "mean_token_accuracy": 0.6754546493291855, "num_tokens": 202263532.0, "step": 12550 }, { "epoch": 2.910881909838915, "grad_norm": 0.8858863711357117, "learning_rate": 4.1584602054830395e-05, "loss": 1.5552, "mean_token_accuracy": 0.6747140660881996, "num_tokens": 202424528.0, "step": 12560 }, { "epoch": 2.913199675512806, "grad_norm": 0.8770908117294312, "learning_rate": 4.157055612281047e-05, "loss": 1.5279, "mean_token_accuracy": 0.6783531472086907, "num_tokens": 202585872.0, "step": 12570 }, { "epoch": 2.915517441186696, "grad_norm": 0.9087682366371155, "learning_rate": 4.15565008551931e-05, "loss": 1.5679, "mean_token_accuracy": 0.6737754568457603, "num_tokens": 202747606.0, "step": 12580 }, { "epoch": 2.9178352068605866, "grad_norm": 0.9272000193595886, "learning_rate": 4.154243625989682e-05, "loss": 1.5412, "mean_token_accuracy": 0.6760748401284218, "num_tokens": 202908850.0, "step": 12590 }, { "epoch": 2.920152972534477, "grad_norm": 0.8989066481590271, "learning_rate": 4.1528362344845385e-05, "loss": 1.5749, "mean_token_accuracy": 0.6709316954016685, "num_tokens": 203070492.0, "step": 12600 }, { "epoch": 2.922470738208367, "grad_norm": 0.8724773526191711, "learning_rate": 4.1514279117967836e-05, "loss": 1.5711, "mean_token_accuracy": 0.6723182037472725, "num_tokens": 203231223.0, "step": 12610 }, { "epoch": 2.9247885038822576, "grad_norm": 0.8490930795669556, "learning_rate": 4.150018658719844e-05, "loss": 1.5632, "mean_token_accuracy": 0.6738441199064255, "num_tokens": 203392686.0, "step": 12620 }, { "epoch": 2.927106269556148, "grad_norm": 0.9122727513313293, "learning_rate": 4.148608476047673e-05, "loss": 1.5372, "mean_token_accuracy": 0.6785076558589935, "num_tokens": 203553675.0, "step": 12630 }, { "epoch": 2.9294240352300385, "grad_norm": 0.8202568292617798, "learning_rate": 4.147197364574744e-05, "loss": 1.5489, "mean_token_accuracy": 0.6768045663833618, "num_tokens": 203715038.0, "step": 12640 }, { "epoch": 2.9317418009039287, "grad_norm": 0.8618006110191345, "learning_rate": 4.1457853250960554e-05, "loss": 1.5562, "mean_token_accuracy": 0.6756436616182327, "num_tokens": 203874724.0, "step": 12650 }, { "epoch": 2.934059566577819, "grad_norm": 0.8893322944641113, "learning_rate": 4.1443723584071305e-05, "loss": 1.5421, "mean_token_accuracy": 0.6766565203666687, "num_tokens": 204035943.0, "step": 12660 }, { "epoch": 2.9363773322517095, "grad_norm": 0.9173042178153992, "learning_rate": 4.142958465304012e-05, "loss": 1.5556, "mean_token_accuracy": 0.6738110348582268, "num_tokens": 204197136.0, "step": 12670 }, { "epoch": 2.9386950979255997, "grad_norm": 0.8515880703926086, "learning_rate": 4.1415436465832656e-05, "loss": 1.5708, "mean_token_accuracy": 0.6735527709126472, "num_tokens": 204358180.0, "step": 12680 }, { "epoch": 2.9410128635994903, "grad_norm": 0.8436678647994995, "learning_rate": 4.140127903041978e-05, "loss": 1.5356, "mean_token_accuracy": 0.6773023962974548, "num_tokens": 204518351.0, "step": 12690 }, { "epoch": 2.9433306292733805, "grad_norm": 0.8405704498291016, "learning_rate": 4.138711235477759e-05, "loss": 1.5569, "mean_token_accuracy": 0.6765649929642678, "num_tokens": 204679518.0, "step": 12700 }, { "epoch": 2.9456483949472707, "grad_norm": 0.9222233891487122, "learning_rate": 4.1372936446887356e-05, "loss": 1.5715, "mean_token_accuracy": 0.6723070099949837, "num_tokens": 204840606.0, "step": 12710 }, { "epoch": 2.9479661606211613, "grad_norm": 0.8784457445144653, "learning_rate": 4.135875131473558e-05, "loss": 1.5385, "mean_token_accuracy": 0.6778692290186882, "num_tokens": 205001897.0, "step": 12720 }, { "epoch": 2.9502839262950515, "grad_norm": 0.8612165451049805, "learning_rate": 4.134455696631394e-05, "loss": 1.5416, "mean_token_accuracy": 0.6756368711590767, "num_tokens": 205162341.0, "step": 12730 }, { "epoch": 2.952601691968942, "grad_norm": 0.862446129322052, "learning_rate": 4.1330353409619324e-05, "loss": 1.5625, "mean_token_accuracy": 0.6740820273756981, "num_tokens": 205323958.0, "step": 12740 }, { "epoch": 2.9549194576428324, "grad_norm": 0.8991357088088989, "learning_rate": 4.131614065265379e-05, "loss": 1.5691, "mean_token_accuracy": 0.6740472033619881, "num_tokens": 205483797.0, "step": 12750 }, { "epoch": 2.9572372233167226, "grad_norm": 0.8877214789390564, "learning_rate": 4.1301918703424584e-05, "loss": 1.5594, "mean_token_accuracy": 0.6750587821006775, "num_tokens": 205645398.0, "step": 12760 }, { "epoch": 2.959554988990613, "grad_norm": 0.9183115363121033, "learning_rate": 4.128768756994414e-05, "loss": 1.5748, "mean_token_accuracy": 0.6722274586558342, "num_tokens": 205806665.0, "step": 12770 }, { "epoch": 2.9618727546645034, "grad_norm": 0.8543571829795837, "learning_rate": 4.127344726023007e-05, "loss": 1.5638, "mean_token_accuracy": 0.673193795979023, "num_tokens": 205968609.0, "step": 12780 }, { "epoch": 2.964190520338394, "grad_norm": 0.8850280046463013, "learning_rate": 4.1259197782305134e-05, "loss": 1.5484, "mean_token_accuracy": 0.6749456033110619, "num_tokens": 206129292.0, "step": 12790 }, { "epoch": 2.9665082860122842, "grad_norm": 0.8941269516944885, "learning_rate": 4.124493914419727e-05, "loss": 1.5535, "mean_token_accuracy": 0.6756735816597939, "num_tokens": 206290425.0, "step": 12800 }, { "epoch": 2.9688260516861744, "grad_norm": 0.8693913221359253, "learning_rate": 4.123067135393957e-05, "loss": 1.5578, "mean_token_accuracy": 0.6754634499549865, "num_tokens": 206451877.0, "step": 12810 }, { "epoch": 2.971143817360065, "grad_norm": 0.9292336702346802, "learning_rate": 4.121639441957031e-05, "loss": 1.5605, "mean_token_accuracy": 0.6746679529547691, "num_tokens": 206612524.0, "step": 12820 }, { "epoch": 2.9734615830339552, "grad_norm": 0.8771016001701355, "learning_rate": 4.120210834913288e-05, "loss": 1.5624, "mean_token_accuracy": 0.6748718023300171, "num_tokens": 206774169.0, "step": 12830 }, { "epoch": 2.975779348707846, "grad_norm": 0.8727928400039673, "learning_rate": 4.118781315067583e-05, "loss": 1.5572, "mean_token_accuracy": 0.6754670396447182, "num_tokens": 206935512.0, "step": 12840 }, { "epoch": 2.978097114381736, "grad_norm": 0.8795922994613647, "learning_rate": 4.117350883225287e-05, "loss": 1.5655, "mean_token_accuracy": 0.6758599534630776, "num_tokens": 207096263.0, "step": 12850 }, { "epoch": 2.9804148800556263, "grad_norm": 0.8280555605888367, "learning_rate": 4.115919540192282e-05, "loss": 1.5643, "mean_token_accuracy": 0.6743537291884423, "num_tokens": 207256974.0, "step": 12860 }, { "epoch": 2.982732645729517, "grad_norm": 0.9037829041481018, "learning_rate": 4.1144872867749655e-05, "loss": 1.5676, "mean_token_accuracy": 0.6730071187019349, "num_tokens": 207418690.0, "step": 12870 }, { "epoch": 2.985050411403407, "grad_norm": 0.8434913158416748, "learning_rate": 4.1130541237802475e-05, "loss": 1.5646, "mean_token_accuracy": 0.6742421343922615, "num_tokens": 207580623.0, "step": 12880 }, { "epoch": 2.9873681770772977, "grad_norm": 0.9074137806892395, "learning_rate": 4.111620052015549e-05, "loss": 1.5438, "mean_token_accuracy": 0.6771314680576325, "num_tokens": 207741119.0, "step": 12890 }, { "epoch": 2.989685942751188, "grad_norm": 0.8274183869361877, "learning_rate": 4.110185072288805e-05, "loss": 1.5519, "mean_token_accuracy": 0.676436747610569, "num_tokens": 207902726.0, "step": 12900 }, { "epoch": 2.992003708425078, "grad_norm": 0.8213136196136475, "learning_rate": 4.1087491854084596e-05, "loss": 1.5724, "mean_token_accuracy": 0.6729806408286094, "num_tokens": 208063961.0, "step": 12910 }, { "epoch": 2.9943214740989688, "grad_norm": 0.9385107159614563, "learning_rate": 4.107312392183471e-05, "loss": 1.5489, "mean_token_accuracy": 0.6770511597394944, "num_tokens": 208225147.0, "step": 12920 }, { "epoch": 2.996639239772859, "grad_norm": 0.8723744750022888, "learning_rate": 4.105874693423307e-05, "loss": 1.5417, "mean_token_accuracy": 0.6750986352562904, "num_tokens": 208385914.0, "step": 12930 }, { "epoch": 2.9989570054467496, "grad_norm": 0.8340260982513428, "learning_rate": 4.104436089937943e-05, "loss": 1.545, "mean_token_accuracy": 0.6755188256502151, "num_tokens": 208547582.0, "step": 12940 }, { "epoch": 3.001158882836945, "grad_norm": 0.9898903965950012, "learning_rate": 4.102996582537868e-05, "loss": 1.5311, "mean_token_accuracy": 0.6779931783676147, "num_tokens": 208699864.0, "step": 12950 }, { "epoch": 3.0034766485108357, "grad_norm": 0.8521137237548828, "learning_rate": 4.101556172034078e-05, "loss": 1.5263, "mean_token_accuracy": 0.679122531414032, "num_tokens": 208861882.0, "step": 12960 }, { "epoch": 3.005794414184726, "grad_norm": 0.9282759428024292, "learning_rate": 4.1001148592380766e-05, "loss": 1.5212, "mean_token_accuracy": 0.6796492204070091, "num_tokens": 209022046.0, "step": 12970 }, { "epoch": 3.008112179858616, "grad_norm": 0.84343421459198, "learning_rate": 4.098672644961879e-05, "loss": 1.5408, "mean_token_accuracy": 0.6786701172590256, "num_tokens": 209182061.0, "step": 12980 }, { "epoch": 3.0104299455325068, "grad_norm": 0.895150899887085, "learning_rate": 4.0972295300180074e-05, "loss": 1.5409, "mean_token_accuracy": 0.6766049861907959, "num_tokens": 209342761.0, "step": 12990 }, { "epoch": 3.012747711206397, "grad_norm": 0.8706933259963989, "learning_rate": 4.0957855152194893e-05, "loss": 1.5549, "mean_token_accuracy": 0.6750305011868477, "num_tokens": 209504838.0, "step": 13000 }, { "epoch": 3.0150654768802876, "grad_norm": 0.8789107203483582, "learning_rate": 4.0943406013798595e-05, "loss": 1.5494, "mean_token_accuracy": 0.6747122541069984, "num_tokens": 209667116.0, "step": 13010 }, { "epoch": 3.017383242554178, "grad_norm": 0.920540988445282, "learning_rate": 4.092894789313163e-05, "loss": 1.5169, "mean_token_accuracy": 0.6801815405488014, "num_tokens": 209827936.0, "step": 13020 }, { "epoch": 3.019701008228068, "grad_norm": 0.8810492753982544, "learning_rate": 4.091448079833946e-05, "loss": 1.529, "mean_token_accuracy": 0.6790079221129417, "num_tokens": 209988525.0, "step": 13030 }, { "epoch": 3.0220187739019586, "grad_norm": 0.8918201327323914, "learning_rate": 4.090000473757263e-05, "loss": 1.5365, "mean_token_accuracy": 0.6776887163519859, "num_tokens": 210149320.0, "step": 13040 }, { "epoch": 3.024336539575849, "grad_norm": 0.8779435753822327, "learning_rate": 4.088551971898673e-05, "loss": 1.5427, "mean_token_accuracy": 0.6768584251403809, "num_tokens": 210310686.0, "step": 13050 }, { "epoch": 3.0266543052497394, "grad_norm": 0.907116711139679, "learning_rate": 4.08710257507424e-05, "loss": 1.5677, "mean_token_accuracy": 0.6732585370540619, "num_tokens": 210471978.0, "step": 13060 }, { "epoch": 3.0289720709236296, "grad_norm": 0.9099695682525635, "learning_rate": 4.085652284100533e-05, "loss": 1.5364, "mean_token_accuracy": 0.6776662796735764, "num_tokens": 210633184.0, "step": 13070 }, { "epoch": 3.03128983659752, "grad_norm": 0.9131120443344116, "learning_rate": 4.0842010997946226e-05, "loss": 1.5152, "mean_token_accuracy": 0.6809051141142846, "num_tokens": 210793709.0, "step": 13080 }, { "epoch": 3.0336076022714105, "grad_norm": 0.9147993922233582, "learning_rate": 4.082749022974083e-05, "loss": 1.5361, "mean_token_accuracy": 0.6765268385410309, "num_tokens": 210955237.0, "step": 13090 }, { "epoch": 3.0359253679453007, "grad_norm": 0.8829778432846069, "learning_rate": 4.081296054456993e-05, "loss": 1.5454, "mean_token_accuracy": 0.6761863738298416, "num_tokens": 211116790.0, "step": 13100 }, { "epoch": 3.0382431336191913, "grad_norm": 0.8763731122016907, "learning_rate": 4.0798421950619333e-05, "loss": 1.5379, "mean_token_accuracy": 0.6773570597171783, "num_tokens": 211278232.0, "step": 13110 }, { "epoch": 3.0405608992930815, "grad_norm": 0.8765101432800293, "learning_rate": 4.078387445607985e-05, "loss": 1.5253, "mean_token_accuracy": 0.6791025012731552, "num_tokens": 211438941.0, "step": 13120 }, { "epoch": 3.0428786649669717, "grad_norm": 0.8773775696754456, "learning_rate": 4.076931806914733e-05, "loss": 1.5536, "mean_token_accuracy": 0.6745205327868462, "num_tokens": 211600121.0, "step": 13130 }, { "epoch": 3.0451964306408623, "grad_norm": 0.8945023417472839, "learning_rate": 4.07547527980226e-05, "loss": 1.5329, "mean_token_accuracy": 0.6774380892515183, "num_tokens": 211761908.0, "step": 13140 }, { "epoch": 3.0475141963147525, "grad_norm": 0.8786992430686951, "learning_rate": 4.074017865091152e-05, "loss": 1.5518, "mean_token_accuracy": 0.6750138640403748, "num_tokens": 211922714.0, "step": 13150 }, { "epoch": 3.049831961988643, "grad_norm": 0.8768795728683472, "learning_rate": 4.0725595636024946e-05, "loss": 1.5167, "mean_token_accuracy": 0.6799836203455925, "num_tokens": 212083656.0, "step": 13160 }, { "epoch": 3.0521497276625333, "grad_norm": 0.9053186774253845, "learning_rate": 4.071100376157872e-05, "loss": 1.5363, "mean_token_accuracy": 0.6769199699163437, "num_tokens": 212243660.0, "step": 13170 }, { "epoch": 3.0544674933364235, "grad_norm": 0.8824218511581421, "learning_rate": 4.069640303579368e-05, "loss": 1.5353, "mean_token_accuracy": 0.6783929646015168, "num_tokens": 212404661.0, "step": 13180 }, { "epoch": 3.056785259010314, "grad_norm": 0.872017502784729, "learning_rate": 4.068179346689565e-05, "loss": 1.5265, "mean_token_accuracy": 0.6787965163588524, "num_tokens": 212565860.0, "step": 13190 }, { "epoch": 3.0591030246842044, "grad_norm": 0.9418271780014038, "learning_rate": 4.0667175063115434e-05, "loss": 1.5365, "mean_token_accuracy": 0.6770075112581253, "num_tokens": 212727703.0, "step": 13200 }, { "epoch": 3.061420790358095, "grad_norm": 0.9069328308105469, "learning_rate": 4.065254783268884e-05, "loss": 1.5379, "mean_token_accuracy": 0.6762085676193237, "num_tokens": 212889022.0, "step": 13210 }, { "epoch": 3.063738556031985, "grad_norm": 0.9527013897895813, "learning_rate": 4.063791178385659e-05, "loss": 1.5217, "mean_token_accuracy": 0.6793609768152237, "num_tokens": 213050630.0, "step": 13220 }, { "epoch": 3.0660563217058754, "grad_norm": 0.9164714217185974, "learning_rate": 4.0623266924864435e-05, "loss": 1.5294, "mean_token_accuracy": 0.6778235405683517, "num_tokens": 213212012.0, "step": 13230 }, { "epoch": 3.068374087379766, "grad_norm": 0.95061194896698, "learning_rate": 4.060861326396305e-05, "loss": 1.5398, "mean_token_accuracy": 0.6760078832507134, "num_tokens": 213372409.0, "step": 13240 }, { "epoch": 3.070691853053656, "grad_norm": 0.9117826819419861, "learning_rate": 4.05939508094081e-05, "loss": 1.5315, "mean_token_accuracy": 0.6779412791132927, "num_tokens": 213534101.0, "step": 13250 }, { "epoch": 3.073009618727547, "grad_norm": 0.892888605594635, "learning_rate": 4.057927956946017e-05, "loss": 1.537, "mean_token_accuracy": 0.6757266014814377, "num_tokens": 213695666.0, "step": 13260 }, { "epoch": 3.075327384401437, "grad_norm": 0.9050571918487549, "learning_rate": 4.056459955238482e-05, "loss": 1.5374, "mean_token_accuracy": 0.6770509541034698, "num_tokens": 213856827.0, "step": 13270 }, { "epoch": 3.0776451500753272, "grad_norm": 0.8808977603912354, "learning_rate": 4.0549910766452545e-05, "loss": 1.5434, "mean_token_accuracy": 0.6769178494811058, "num_tokens": 214018781.0, "step": 13280 }, { "epoch": 3.079962915749218, "grad_norm": 0.887598991394043, "learning_rate": 4.053521321993878e-05, "loss": 1.5325, "mean_token_accuracy": 0.6770251408219338, "num_tokens": 214180012.0, "step": 13290 }, { "epoch": 3.082280681423108, "grad_norm": 0.9203541278839111, "learning_rate": 4.0520506921123904e-05, "loss": 1.5405, "mean_token_accuracy": 0.678459033370018, "num_tokens": 214341840.0, "step": 13300 }, { "epoch": 3.0845984470969987, "grad_norm": 0.8429187536239624, "learning_rate": 4.050579187829321e-05, "loss": 1.5421, "mean_token_accuracy": 0.6753648698329926, "num_tokens": 214502587.0, "step": 13310 }, { "epoch": 3.086916212770889, "grad_norm": 0.9325313568115234, "learning_rate": 4.049106809973694e-05, "loss": 1.5471, "mean_token_accuracy": 0.6759650766849518, "num_tokens": 214664185.0, "step": 13320 }, { "epoch": 3.089233978444779, "grad_norm": 0.9186757206916809, "learning_rate": 4.0476335593750235e-05, "loss": 1.5295, "mean_token_accuracy": 0.6782306224107743, "num_tokens": 214825302.0, "step": 13330 }, { "epoch": 3.0915517441186697, "grad_norm": 0.8870776891708374, "learning_rate": 4.0461594368633166e-05, "loss": 1.5362, "mean_token_accuracy": 0.6768574059009552, "num_tokens": 214985408.0, "step": 13340 }, { "epoch": 3.09386950979256, "grad_norm": 0.9297446012496948, "learning_rate": 4.044684443269072e-05, "loss": 1.5216, "mean_token_accuracy": 0.6788749650120736, "num_tokens": 215146078.0, "step": 13350 }, { "epoch": 3.09618727546645, "grad_norm": 0.9070261120796204, "learning_rate": 4.043208579423278e-05, "loss": 1.5294, "mean_token_accuracy": 0.6777179032564163, "num_tokens": 215307639.0, "step": 13360 }, { "epoch": 3.0985050411403408, "grad_norm": 0.8998178839683533, "learning_rate": 4.041731846157413e-05, "loss": 1.5197, "mean_token_accuracy": 0.6777938455343246, "num_tokens": 215469675.0, "step": 13370 }, { "epoch": 3.100822806814231, "grad_norm": 0.985151469707489, "learning_rate": 4.040254244303448e-05, "loss": 1.5381, "mean_token_accuracy": 0.6779361516237259, "num_tokens": 215630603.0, "step": 13380 }, { "epoch": 3.1031405724881216, "grad_norm": 0.859347939491272, "learning_rate": 4.038775774693838e-05, "loss": 1.5502, "mean_token_accuracy": 0.6753544241189957, "num_tokens": 215792544.0, "step": 13390 }, { "epoch": 3.1054583381620118, "grad_norm": 0.9172458648681641, "learning_rate": 4.037296438161532e-05, "loss": 1.5442, "mean_token_accuracy": 0.676779355108738, "num_tokens": 215953960.0, "step": 13400 }, { "epoch": 3.107776103835902, "grad_norm": 0.8243050575256348, "learning_rate": 4.035816235539966e-05, "loss": 1.5282, "mean_token_accuracy": 0.6777375280857086, "num_tokens": 216115048.0, "step": 13410 }, { "epoch": 3.1100938695097926, "grad_norm": 0.8931747674942017, "learning_rate": 4.034335167663064e-05, "loss": 1.5462, "mean_token_accuracy": 0.6759169548749924, "num_tokens": 216275742.0, "step": 13420 }, { "epoch": 3.112411635183683, "grad_norm": 0.889395534992218, "learning_rate": 4.0328532353652335e-05, "loss": 1.5334, "mean_token_accuracy": 0.6770699635148049, "num_tokens": 216437043.0, "step": 13430 }, { "epoch": 3.1147294008575734, "grad_norm": 0.8451533913612366, "learning_rate": 4.031370439481377e-05, "loss": 1.5328, "mean_token_accuracy": 0.6762100592255592, "num_tokens": 216598397.0, "step": 13440 }, { "epoch": 3.1170471665314636, "grad_norm": 0.8751204609870911, "learning_rate": 4.0298867808468755e-05, "loss": 1.5402, "mean_token_accuracy": 0.6762215167284011, "num_tokens": 216760339.0, "step": 13450 }, { "epoch": 3.119364932205354, "grad_norm": 0.9695375561714172, "learning_rate": 4.028402260297602e-05, "loss": 1.5448, "mean_token_accuracy": 0.6754145592451095, "num_tokens": 216922282.0, "step": 13460 }, { "epoch": 3.1216826978792445, "grad_norm": 0.9467867612838745, "learning_rate": 4.0269168786699105e-05, "loss": 1.5479, "mean_token_accuracy": 0.677077679336071, "num_tokens": 217083765.0, "step": 13470 }, { "epoch": 3.1240004635531347, "grad_norm": 0.9315450191497803, "learning_rate": 4.0254306368006435e-05, "loss": 1.5417, "mean_token_accuracy": 0.6769771486520767, "num_tokens": 217245054.0, "step": 13480 }, { "epoch": 3.1263182292270253, "grad_norm": 0.889886736869812, "learning_rate": 4.023943535527127e-05, "loss": 1.5298, "mean_token_accuracy": 0.6791677191853523, "num_tokens": 217406152.0, "step": 13490 }, { "epoch": 3.1286359949009155, "grad_norm": 0.8421936631202698, "learning_rate": 4.02245557568717e-05, "loss": 1.5668, "mean_token_accuracy": 0.6737742453813553, "num_tokens": 217567296.0, "step": 13500 }, { "epoch": 3.1309537605748057, "grad_norm": 0.8955795168876648, "learning_rate": 4.020966758119067e-05, "loss": 1.5418, "mean_token_accuracy": 0.6774817898869514, "num_tokens": 217728386.0, "step": 13510 }, { "epoch": 3.1332715262486963, "grad_norm": 0.8778653144836426, "learning_rate": 4.019477083661596e-05, "loss": 1.5368, "mean_token_accuracy": 0.6771318227052688, "num_tokens": 217889188.0, "step": 13520 }, { "epoch": 3.1355892919225865, "grad_norm": 0.8689829707145691, "learning_rate": 4.017986553154015e-05, "loss": 1.5249, "mean_token_accuracy": 0.6793583527207374, "num_tokens": 218050087.0, "step": 13530 }, { "epoch": 3.137907057596477, "grad_norm": 0.9203545451164246, "learning_rate": 4.0164951674360665e-05, "loss": 1.5353, "mean_token_accuracy": 0.6762337788939476, "num_tokens": 218211535.0, "step": 13540 }, { "epoch": 3.1402248232703673, "grad_norm": 0.8537470698356628, "learning_rate": 4.015002927347975e-05, "loss": 1.5281, "mean_token_accuracy": 0.6781362310051918, "num_tokens": 218373043.0, "step": 13550 }, { "epoch": 3.1425425889442575, "grad_norm": 0.9084147214889526, "learning_rate": 4.013509833730445e-05, "loss": 1.5498, "mean_token_accuracy": 0.6748320996761322, "num_tokens": 218534945.0, "step": 13560 }, { "epoch": 3.144860354618148, "grad_norm": 0.9044816493988037, "learning_rate": 4.0120158874246646e-05, "loss": 1.5441, "mean_token_accuracy": 0.6752215877175332, "num_tokens": 218696409.0, "step": 13570 }, { "epoch": 3.1471781202920384, "grad_norm": 0.8924543261528015, "learning_rate": 4.010521089272297e-05, "loss": 1.5441, "mean_token_accuracy": 0.6757982894778252, "num_tokens": 218858441.0, "step": 13580 }, { "epoch": 3.149495885965929, "grad_norm": 0.9625018835067749, "learning_rate": 4.009025440115491e-05, "loss": 1.5266, "mean_token_accuracy": 0.6778874009847641, "num_tokens": 219020320.0, "step": 13590 }, { "epoch": 3.151813651639819, "grad_norm": 0.9527880549430847, "learning_rate": 4.0075289407968705e-05, "loss": 1.5412, "mean_token_accuracy": 0.677063237130642, "num_tokens": 219180733.0, "step": 13600 }, { "epoch": 3.1541314173137094, "grad_norm": 0.9073135256767273, "learning_rate": 4.006031592159543e-05, "loss": 1.5413, "mean_token_accuracy": 0.6766751378774643, "num_tokens": 219341850.0, "step": 13610 }, { "epoch": 3.1564491829876, "grad_norm": 0.8872031569480896, "learning_rate": 4.0045333950470894e-05, "loss": 1.5306, "mean_token_accuracy": 0.6772466495633125, "num_tokens": 219502399.0, "step": 13620 }, { "epoch": 3.15876694866149, "grad_norm": 0.8806135654449463, "learning_rate": 4.0030343503035716e-05, "loss": 1.5467, "mean_token_accuracy": 0.675444082915783, "num_tokens": 219663667.0, "step": 13630 }, { "epoch": 3.161084714335381, "grad_norm": 0.8854542374610901, "learning_rate": 4.001534458773529e-05, "loss": 1.5153, "mean_token_accuracy": 0.6802591264247895, "num_tokens": 219823860.0, "step": 13640 }, { "epoch": 3.163402480009271, "grad_norm": 0.9185445308685303, "learning_rate": 4.0000337213019765e-05, "loss": 1.5292, "mean_token_accuracy": 0.6776044890284538, "num_tokens": 219985283.0, "step": 13650 }, { "epoch": 3.1657202456831612, "grad_norm": 0.8786717057228088, "learning_rate": 3.9985321387344076e-05, "loss": 1.5594, "mean_token_accuracy": 0.6747877091169358, "num_tokens": 220146337.0, "step": 13660 }, { "epoch": 3.168038011357052, "grad_norm": 0.8803130984306335, "learning_rate": 3.9970297119167904e-05, "loss": 1.5386, "mean_token_accuracy": 0.6775888353586197, "num_tokens": 220307046.0, "step": 13670 }, { "epoch": 3.170355777030942, "grad_norm": 0.9052287340164185, "learning_rate": 3.99552644169557e-05, "loss": 1.5321, "mean_token_accuracy": 0.677568556368351, "num_tokens": 220468114.0, "step": 13680 }, { "epoch": 3.1726735427048327, "grad_norm": 0.9237887859344482, "learning_rate": 3.9940223289176645e-05, "loss": 1.5313, "mean_token_accuracy": 0.6765571489930153, "num_tokens": 220629509.0, "step": 13690 }, { "epoch": 3.174991308378723, "grad_norm": 0.8994746208190918, "learning_rate": 3.9925173744304684e-05, "loss": 1.5344, "mean_token_accuracy": 0.6773893177509308, "num_tokens": 220791124.0, "step": 13700 }, { "epoch": 3.177309074052613, "grad_norm": 0.8738769292831421, "learning_rate": 3.9910115790818505e-05, "loss": 1.5318, "mean_token_accuracy": 0.6786138623952865, "num_tokens": 220952937.0, "step": 13710 }, { "epoch": 3.1796268397265037, "grad_norm": 0.881010115146637, "learning_rate": 3.989504943720153e-05, "loss": 1.5293, "mean_token_accuracy": 0.6778526052832603, "num_tokens": 221113574.0, "step": 13720 }, { "epoch": 3.181944605400394, "grad_norm": 0.9005452394485474, "learning_rate": 3.987997469194189e-05, "loss": 1.5343, "mean_token_accuracy": 0.6774219274520874, "num_tokens": 221273579.0, "step": 13730 }, { "epoch": 3.1842623710742846, "grad_norm": 0.927162230014801, "learning_rate": 3.98648915635325e-05, "loss": 1.5455, "mean_token_accuracy": 0.6763889774680137, "num_tokens": 221433749.0, "step": 13740 }, { "epoch": 3.1865801367481748, "grad_norm": 0.8905224800109863, "learning_rate": 3.9849800060470935e-05, "loss": 1.5339, "mean_token_accuracy": 0.678204645216465, "num_tokens": 221594330.0, "step": 13750 }, { "epoch": 3.188897902422065, "grad_norm": 0.8887333273887634, "learning_rate": 3.983470019125953e-05, "loss": 1.5553, "mean_token_accuracy": 0.675048454105854, "num_tokens": 221755770.0, "step": 13760 }, { "epoch": 3.1912156680959556, "grad_norm": 0.9145733714103699, "learning_rate": 3.981959196440532e-05, "loss": 1.5533, "mean_token_accuracy": 0.6751575261354447, "num_tokens": 221917682.0, "step": 13770 }, { "epoch": 3.1935334337698458, "grad_norm": 0.8843676447868347, "learning_rate": 3.980447538842005e-05, "loss": 1.5466, "mean_token_accuracy": 0.6760154739022255, "num_tokens": 222079342.0, "step": 13780 }, { "epoch": 3.1958511994437364, "grad_norm": 0.924342155456543, "learning_rate": 3.9789350471820166e-05, "loss": 1.537, "mean_token_accuracy": 0.6780294388532638, "num_tokens": 222240341.0, "step": 13790 }, { "epoch": 3.1981689651176266, "grad_norm": 0.9876214265823364, "learning_rate": 3.97742172231268e-05, "loss": 1.5716, "mean_token_accuracy": 0.674208976328373, "num_tokens": 222400158.0, "step": 13800 }, { "epoch": 3.200486730791517, "grad_norm": 0.8876433968544006, "learning_rate": 3.9759075650865804e-05, "loss": 1.5388, "mean_token_accuracy": 0.6763280868530274, "num_tokens": 222561108.0, "step": 13810 }, { "epoch": 3.2028044964654074, "grad_norm": 0.917665958404541, "learning_rate": 3.9743925763567716e-05, "loss": 1.5401, "mean_token_accuracy": 0.6772088542580604, "num_tokens": 222721992.0, "step": 13820 }, { "epoch": 3.2051222621392976, "grad_norm": 0.8850867748260498, "learning_rate": 3.9728767569767745e-05, "loss": 1.5213, "mean_token_accuracy": 0.6782873675227166, "num_tokens": 222882721.0, "step": 13830 }, { "epoch": 3.2074400278131883, "grad_norm": 0.8406335711479187, "learning_rate": 3.971360107800578e-05, "loss": 1.5483, "mean_token_accuracy": 0.6758399337530137, "num_tokens": 223044552.0, "step": 13840 }, { "epoch": 3.2097577934870785, "grad_norm": 0.9051040410995483, "learning_rate": 3.969842629682638e-05, "loss": 1.5361, "mean_token_accuracy": 0.6780738040804863, "num_tokens": 223205844.0, "step": 13850 }, { "epoch": 3.2120755591609687, "grad_norm": 0.9074164628982544, "learning_rate": 3.9683243234778814e-05, "loss": 1.5409, "mean_token_accuracy": 0.6778242066502571, "num_tokens": 223367120.0, "step": 13860 }, { "epoch": 3.2143933248348593, "grad_norm": 0.8558695912361145, "learning_rate": 3.966805190041697e-05, "loss": 1.5274, "mean_token_accuracy": 0.6801856696605683, "num_tokens": 223528849.0, "step": 13870 }, { "epoch": 3.2167110905087495, "grad_norm": 0.9385780096054077, "learning_rate": 3.96528523022994e-05, "loss": 1.5497, "mean_token_accuracy": 0.6760586321353912, "num_tokens": 223689457.0, "step": 13880 }, { "epoch": 3.21902885618264, "grad_norm": 0.9466908574104309, "learning_rate": 3.9637644448989345e-05, "loss": 1.5371, "mean_token_accuracy": 0.6781332358717919, "num_tokens": 223850810.0, "step": 13890 }, { "epoch": 3.2213466218565303, "grad_norm": 0.8681911826133728, "learning_rate": 3.9622428349054666e-05, "loss": 1.5299, "mean_token_accuracy": 0.6774311691522599, "num_tokens": 224012408.0, "step": 13900 }, { "epoch": 3.2236643875304205, "grad_norm": 0.9087786674499512, "learning_rate": 3.960720401106788e-05, "loss": 1.553, "mean_token_accuracy": 0.6757786095142364, "num_tokens": 224174061.0, "step": 13910 }, { "epoch": 3.225982153204311, "grad_norm": 0.9682396054267883, "learning_rate": 3.9591971443606154e-05, "loss": 1.5417, "mean_token_accuracy": 0.6770755603909493, "num_tokens": 224334081.0, "step": 13920 }, { "epoch": 3.2282999188782013, "grad_norm": 0.9706996083259583, "learning_rate": 3.957673065525128e-05, "loss": 1.5344, "mean_token_accuracy": 0.6774613350629807, "num_tokens": 224495997.0, "step": 13930 }, { "epoch": 3.230617684552092, "grad_norm": 0.8703180551528931, "learning_rate": 3.956148165458968e-05, "loss": 1.5368, "mean_token_accuracy": 0.6776119440793991, "num_tokens": 224657657.0, "step": 13940 }, { "epoch": 3.232935450225982, "grad_norm": 0.9267169833183289, "learning_rate": 3.954622445021241e-05, "loss": 1.5284, "mean_token_accuracy": 0.6785599246621132, "num_tokens": 224817466.0, "step": 13950 }, { "epoch": 3.2352532158998724, "grad_norm": 0.8812583684921265, "learning_rate": 3.953095905071514e-05, "loss": 1.5384, "mean_token_accuracy": 0.6768310248851777, "num_tokens": 224979138.0, "step": 13960 }, { "epoch": 3.237570981573763, "grad_norm": 0.9971987009048462, "learning_rate": 3.951568546469818e-05, "loss": 1.5211, "mean_token_accuracy": 0.6789312779903411, "num_tokens": 225140441.0, "step": 13970 }, { "epoch": 3.239888747247653, "grad_norm": 0.8798522353172302, "learning_rate": 3.950040370076642e-05, "loss": 1.5217, "mean_token_accuracy": 0.6800252333283424, "num_tokens": 225301848.0, "step": 13980 }, { "epoch": 3.242206512921544, "grad_norm": 0.9634197950363159, "learning_rate": 3.9485113767529366e-05, "loss": 1.529, "mean_token_accuracy": 0.6764360785484314, "num_tokens": 225462064.0, "step": 13990 }, { "epoch": 3.244524278595434, "grad_norm": 0.9349685311317444, "learning_rate": 3.946981567360114e-05, "loss": 1.5441, "mean_token_accuracy": 0.677720420062542, "num_tokens": 225623967.0, "step": 14000 }, { "epoch": 3.246842044269324, "grad_norm": 0.9143983125686646, "learning_rate": 3.945450942760046e-05, "loss": 1.5246, "mean_token_accuracy": 0.6780525803565979, "num_tokens": 225785742.0, "step": 14010 }, { "epoch": 3.249159809943215, "grad_norm": 0.9089481234550476, "learning_rate": 3.9439195038150614e-05, "loss": 1.5201, "mean_token_accuracy": 0.6797397002577782, "num_tokens": 225947249.0, "step": 14020 }, { "epoch": 3.251477575617105, "grad_norm": 0.8637068271636963, "learning_rate": 3.942387251387951e-05, "loss": 1.5439, "mean_token_accuracy": 0.6768744394183159, "num_tokens": 226108810.0, "step": 14030 }, { "epoch": 3.2537953412909957, "grad_norm": 0.8705196976661682, "learning_rate": 3.9408541863419626e-05, "loss": 1.5572, "mean_token_accuracy": 0.6742212295532226, "num_tokens": 226270127.0, "step": 14040 }, { "epoch": 3.256113106964886, "grad_norm": 0.8832241296768188, "learning_rate": 3.939320309540799e-05, "loss": 1.5386, "mean_token_accuracy": 0.6764160484075546, "num_tokens": 226431399.0, "step": 14050 }, { "epoch": 3.258430872638776, "grad_norm": 0.8485342860221863, "learning_rate": 3.937785621848626e-05, "loss": 1.5289, "mean_token_accuracy": 0.6777649700641633, "num_tokens": 226591756.0, "step": 14060 }, { "epoch": 3.2607486383126667, "grad_norm": 0.9124982953071594, "learning_rate": 3.9362501241300616e-05, "loss": 1.524, "mean_token_accuracy": 0.6802674844861031, "num_tokens": 226752818.0, "step": 14070 }, { "epoch": 3.263066403986557, "grad_norm": 0.9345813989639282, "learning_rate": 3.9347138172501816e-05, "loss": 1.5335, "mean_token_accuracy": 0.6775416851043701, "num_tokens": 226913560.0, "step": 14080 }, { "epoch": 3.2653841696604475, "grad_norm": 0.9313040375709534, "learning_rate": 3.9331767020745183e-05, "loss": 1.5217, "mean_token_accuracy": 0.6798884913325309, "num_tokens": 227075191.0, "step": 14090 }, { "epoch": 3.2677019353343377, "grad_norm": 0.8754037618637085, "learning_rate": 3.931638779469058e-05, "loss": 1.5381, "mean_token_accuracy": 0.6767647698521614, "num_tokens": 227236549.0, "step": 14100 }, { "epoch": 3.270019701008228, "grad_norm": 0.8394140005111694, "learning_rate": 3.930100050300244e-05, "loss": 1.5327, "mean_token_accuracy": 0.6774972587823868, "num_tokens": 227397655.0, "step": 14110 }, { "epoch": 3.2723374666821186, "grad_norm": 0.9540085196495056, "learning_rate": 3.9285605154349714e-05, "loss": 1.5302, "mean_token_accuracy": 0.6775089129805565, "num_tokens": 227558982.0, "step": 14120 }, { "epoch": 3.2746552323560087, "grad_norm": 0.8762516379356384, "learning_rate": 3.927020175740591e-05, "loss": 1.5373, "mean_token_accuracy": 0.6754623383283616, "num_tokens": 227719806.0, "step": 14130 }, { "epoch": 3.2769729980298994, "grad_norm": 0.877583920955658, "learning_rate": 3.9254790320849075e-05, "loss": 1.5245, "mean_token_accuracy": 0.6786957010626793, "num_tokens": 227881640.0, "step": 14140 }, { "epoch": 3.2792907637037896, "grad_norm": 0.9005013108253479, "learning_rate": 3.923937085336177e-05, "loss": 1.5599, "mean_token_accuracy": 0.673600797355175, "num_tokens": 228043452.0, "step": 14150 }, { "epoch": 3.2816085293776798, "grad_norm": 0.8572436571121216, "learning_rate": 3.9223943363631075e-05, "loss": 1.5225, "mean_token_accuracy": 0.6800180926918984, "num_tokens": 228204474.0, "step": 14160 }, { "epoch": 3.2839262950515704, "grad_norm": 0.9303814768791199, "learning_rate": 3.920850786034862e-05, "loss": 1.5198, "mean_token_accuracy": 0.6791574105620384, "num_tokens": 228366247.0, "step": 14170 }, { "epoch": 3.2862440607254606, "grad_norm": 0.8583407998085022, "learning_rate": 3.919306435221052e-05, "loss": 1.5327, "mean_token_accuracy": 0.6782120481133461, "num_tokens": 228527878.0, "step": 14180 }, { "epoch": 3.2885618263993512, "grad_norm": 0.8655160069465637, "learning_rate": 3.917761284791742e-05, "loss": 1.5512, "mean_token_accuracy": 0.673947973549366, "num_tokens": 228689411.0, "step": 14190 }, { "epoch": 3.2908795920732414, "grad_norm": 0.9034389853477478, "learning_rate": 3.916215335617445e-05, "loss": 1.5305, "mean_token_accuracy": 0.677202382683754, "num_tokens": 228851030.0, "step": 14200 }, { "epoch": 3.2931973577471316, "grad_norm": 0.9479517936706543, "learning_rate": 3.9146685885691255e-05, "loss": 1.539, "mean_token_accuracy": 0.6763410806655884, "num_tokens": 229012116.0, "step": 14210 }, { "epoch": 3.2955151234210223, "grad_norm": 0.8738720417022705, "learning_rate": 3.9131210445181974e-05, "loss": 1.5342, "mean_token_accuracy": 0.6775947630405426, "num_tokens": 229173335.0, "step": 14220 }, { "epoch": 3.2978328890949125, "grad_norm": 0.8938373327255249, "learning_rate": 3.911572704336522e-05, "loss": 1.5387, "mean_token_accuracy": 0.6759917557239532, "num_tokens": 229333539.0, "step": 14230 }, { "epoch": 3.300150654768803, "grad_norm": 0.8650654554367065, "learning_rate": 3.910023568896413e-05, "loss": 1.5396, "mean_token_accuracy": 0.6753038972616195, "num_tokens": 229494194.0, "step": 14240 }, { "epoch": 3.3024684204426933, "grad_norm": 0.875406801700592, "learning_rate": 3.908473639070627e-05, "loss": 1.5294, "mean_token_accuracy": 0.676440280675888, "num_tokens": 229655629.0, "step": 14250 }, { "epoch": 3.3047861861165835, "grad_norm": 0.881237804889679, "learning_rate": 3.906922915732372e-05, "loss": 1.5213, "mean_token_accuracy": 0.6778844207525253, "num_tokens": 229816531.0, "step": 14260 }, { "epoch": 3.307103951790474, "grad_norm": 0.9698696136474609, "learning_rate": 3.9053713997553024e-05, "loss": 1.5152, "mean_token_accuracy": 0.6791907891631126, "num_tokens": 229977216.0, "step": 14270 }, { "epoch": 3.3094217174643643, "grad_norm": 0.925520658493042, "learning_rate": 3.9038190920135165e-05, "loss": 1.539, "mean_token_accuracy": 0.6763923943042756, "num_tokens": 230138263.0, "step": 14280 }, { "epoch": 3.311739483138255, "grad_norm": 0.9096680879592896, "learning_rate": 3.902265993381562e-05, "loss": 1.5371, "mean_token_accuracy": 0.67778839468956, "num_tokens": 230299736.0, "step": 14290 }, { "epoch": 3.314057248812145, "grad_norm": 0.9082056879997253, "learning_rate": 3.9007121047344316e-05, "loss": 1.5488, "mean_token_accuracy": 0.6747285574674606, "num_tokens": 230461020.0, "step": 14300 }, { "epoch": 3.3163750144860353, "grad_norm": 0.9368436336517334, "learning_rate": 3.899157426947561e-05, "loss": 1.5559, "mean_token_accuracy": 0.6743142381310463, "num_tokens": 230621706.0, "step": 14310 }, { "epoch": 3.318692780159926, "grad_norm": 0.8633617758750916, "learning_rate": 3.897601960896832e-05, "loss": 1.5384, "mean_token_accuracy": 0.6769204050302505, "num_tokens": 230783022.0, "step": 14320 }, { "epoch": 3.321010545833816, "grad_norm": 0.865004301071167, "learning_rate": 3.896045707458571e-05, "loss": 1.5426, "mean_token_accuracy": 0.6776993036270141, "num_tokens": 230944150.0, "step": 14330 }, { "epoch": 3.323328311507707, "grad_norm": 0.8861401081085205, "learning_rate": 3.894488667509547e-05, "loss": 1.5119, "mean_token_accuracy": 0.6824582710862159, "num_tokens": 231105427.0, "step": 14340 }, { "epoch": 3.325646077181597, "grad_norm": 0.8693128228187561, "learning_rate": 3.892930841926973e-05, "loss": 1.533, "mean_token_accuracy": 0.6762063637375831, "num_tokens": 231267090.0, "step": 14350 }, { "epoch": 3.327963842855487, "grad_norm": 0.95596843957901, "learning_rate": 3.8913722315885034e-05, "loss": 1.5282, "mean_token_accuracy": 0.6784743130207062, "num_tokens": 231428657.0, "step": 14360 }, { "epoch": 3.330281608529378, "grad_norm": 0.8698582649230957, "learning_rate": 3.889812837372235e-05, "loss": 1.5476, "mean_token_accuracy": 0.6761126458644867, "num_tokens": 231589637.0, "step": 14370 }, { "epoch": 3.332599374203268, "grad_norm": 0.9012112021446228, "learning_rate": 3.888252660156707e-05, "loss": 1.5505, "mean_token_accuracy": 0.6748659044504166, "num_tokens": 231750866.0, "step": 14380 }, { "epoch": 3.3349171398771587, "grad_norm": 0.8897801637649536, "learning_rate": 3.8866917008209e-05, "loss": 1.5158, "mean_token_accuracy": 0.6811315059661865, "num_tokens": 231912466.0, "step": 14390 }, { "epoch": 3.337234905551049, "grad_norm": 0.8750767707824707, "learning_rate": 3.885129960244234e-05, "loss": 1.5351, "mean_token_accuracy": 0.6777633905410767, "num_tokens": 232073915.0, "step": 14400 }, { "epoch": 3.339552671224939, "grad_norm": 0.8803972601890564, "learning_rate": 3.88356743930657e-05, "loss": 1.5387, "mean_token_accuracy": 0.6757565036416053, "num_tokens": 232235411.0, "step": 14410 }, { "epoch": 3.3418704368988297, "grad_norm": 0.8654121160507202, "learning_rate": 3.882004138888208e-05, "loss": 1.5546, "mean_token_accuracy": 0.6748257607221604, "num_tokens": 232396246.0, "step": 14420 }, { "epoch": 3.34418820257272, "grad_norm": 0.8818709254264832, "learning_rate": 3.8804400598698876e-05, "loss": 1.5357, "mean_token_accuracy": 0.6784042775630951, "num_tokens": 232556697.0, "step": 14430 }, { "epoch": 3.3465059682466105, "grad_norm": 0.8536380529403687, "learning_rate": 3.878875203132787e-05, "loss": 1.5302, "mean_token_accuracy": 0.6782801285386085, "num_tokens": 232718261.0, "step": 14440 }, { "epoch": 3.3488237339205007, "grad_norm": 0.8500943779945374, "learning_rate": 3.877309569558523e-05, "loss": 1.5274, "mean_token_accuracy": 0.6790195643901825, "num_tokens": 232878563.0, "step": 14450 }, { "epoch": 3.351141499594391, "grad_norm": 0.9324572682380676, "learning_rate": 3.875743160029148e-05, "loss": 1.5535, "mean_token_accuracy": 0.675471892952919, "num_tokens": 233040375.0, "step": 14460 }, { "epoch": 3.3534592652682815, "grad_norm": 0.9069680571556091, "learning_rate": 3.874175975427155e-05, "loss": 1.5387, "mean_token_accuracy": 0.6777769297361373, "num_tokens": 233201549.0, "step": 14470 }, { "epoch": 3.3557770309421717, "grad_norm": 0.8965266346931458, "learning_rate": 3.87260801663547e-05, "loss": 1.5294, "mean_token_accuracy": 0.6780668586492539, "num_tokens": 233361921.0, "step": 14480 }, { "epoch": 3.3580947966160624, "grad_norm": 0.9445978999137878, "learning_rate": 3.871039284537459e-05, "loss": 1.5156, "mean_token_accuracy": 0.6798426955938339, "num_tokens": 233522973.0, "step": 14490 }, { "epoch": 3.3604125622899526, "grad_norm": 0.9056114554405212, "learning_rate": 3.869469780016921e-05, "loss": 1.5428, "mean_token_accuracy": 0.6762098133563995, "num_tokens": 233684578.0, "step": 14500 }, { "epoch": 3.3627303279638427, "grad_norm": 0.911280632019043, "learning_rate": 3.8678995039580894e-05, "loss": 1.5435, "mean_token_accuracy": 0.6773236081004143, "num_tokens": 233846215.0, "step": 14510 }, { "epoch": 3.3650480936377334, "grad_norm": 0.8559538125991821, "learning_rate": 3.866328457245635e-05, "loss": 1.5181, "mean_token_accuracy": 0.6797660663723946, "num_tokens": 234006970.0, "step": 14520 }, { "epoch": 3.3673658593116236, "grad_norm": 0.9087750315666199, "learning_rate": 3.864756640764661e-05, "loss": 1.5602, "mean_token_accuracy": 0.6736096709966659, "num_tokens": 234169001.0, "step": 14530 }, { "epoch": 3.369683624985514, "grad_norm": 0.9041507840156555, "learning_rate": 3.863184055400705e-05, "loss": 1.5455, "mean_token_accuracy": 0.674496577680111, "num_tokens": 234330323.0, "step": 14540 }, { "epoch": 3.3720013906594044, "grad_norm": 0.9210264682769775, "learning_rate": 3.861610702039738e-05, "loss": 1.5236, "mean_token_accuracy": 0.679890152812004, "num_tokens": 234490521.0, "step": 14550 }, { "epoch": 3.3743191563332946, "grad_norm": 0.8918014168739319, "learning_rate": 3.860036581568162e-05, "loss": 1.5233, "mean_token_accuracy": 0.6786107435822487, "num_tokens": 234652250.0, "step": 14560 }, { "epoch": 3.3766369220071852, "grad_norm": 0.9028334021568298, "learning_rate": 3.8584616948728116e-05, "loss": 1.5373, "mean_token_accuracy": 0.6759393662214279, "num_tokens": 234813651.0, "step": 14570 }, { "epoch": 3.3789546876810754, "grad_norm": 0.8604713678359985, "learning_rate": 3.8568860428409566e-05, "loss": 1.5508, "mean_token_accuracy": 0.6755244091153145, "num_tokens": 234975267.0, "step": 14580 }, { "epoch": 3.3812724533549656, "grad_norm": 0.8825548887252808, "learning_rate": 3.855309626360293e-05, "loss": 1.5396, "mean_token_accuracy": 0.6758912846446037, "num_tokens": 235137056.0, "step": 14590 }, { "epoch": 3.3835902190288563, "grad_norm": 0.8981462121009827, "learning_rate": 3.8537324463189505e-05, "loss": 1.5307, "mean_token_accuracy": 0.680045111477375, "num_tokens": 235296731.0, "step": 14600 }, { "epoch": 3.3859079847027465, "grad_norm": 0.9186213612556458, "learning_rate": 3.852154503605488e-05, "loss": 1.518, "mean_token_accuracy": 0.6807091206312179, "num_tokens": 235458511.0, "step": 14610 }, { "epoch": 3.388225750376637, "grad_norm": 0.8661028742790222, "learning_rate": 3.8505757991088954e-05, "loss": 1.5295, "mean_token_accuracy": 0.6785890907049179, "num_tokens": 235619951.0, "step": 14620 }, { "epoch": 3.3905435160505273, "grad_norm": 0.9814384579658508, "learning_rate": 3.8489963337185884e-05, "loss": 1.5396, "mean_token_accuracy": 0.6764431670308113, "num_tokens": 235780550.0, "step": 14630 }, { "epoch": 3.3928612817244175, "grad_norm": 0.9254258871078491, "learning_rate": 3.8474161083244153e-05, "loss": 1.5387, "mean_token_accuracy": 0.676325598359108, "num_tokens": 235942355.0, "step": 14640 }, { "epoch": 3.395179047398308, "grad_norm": 0.8793222308158875, "learning_rate": 3.84583512381665e-05, "loss": 1.5291, "mean_token_accuracy": 0.6776403114199638, "num_tokens": 236103927.0, "step": 14650 }, { "epoch": 3.3974968130721983, "grad_norm": 0.8891114592552185, "learning_rate": 3.844253381085996e-05, "loss": 1.5411, "mean_token_accuracy": 0.6768928229808807, "num_tokens": 236264197.0, "step": 14660 }, { "epoch": 3.399814578746089, "grad_norm": 0.9233812689781189, "learning_rate": 3.8426708810235814e-05, "loss": 1.5239, "mean_token_accuracy": 0.6805250018835067, "num_tokens": 236425350.0, "step": 14670 }, { "epoch": 3.402132344419979, "grad_norm": 0.8840826749801636, "learning_rate": 3.841087624520964e-05, "loss": 1.5429, "mean_token_accuracy": 0.67679583132267, "num_tokens": 236587139.0, "step": 14680 }, { "epoch": 3.4044501100938693, "grad_norm": 0.8932804465293884, "learning_rate": 3.8395036124701254e-05, "loss": 1.5381, "mean_token_accuracy": 0.6782266959547997, "num_tokens": 236746982.0, "step": 14690 }, { "epoch": 3.40676787576776, "grad_norm": 0.8973506689071655, "learning_rate": 3.837918845763474e-05, "loss": 1.538, "mean_token_accuracy": 0.6776883035898209, "num_tokens": 236908759.0, "step": 14700 }, { "epoch": 3.40908564144165, "grad_norm": 0.8848878145217896, "learning_rate": 3.836333325293842e-05, "loss": 1.5302, "mean_token_accuracy": 0.6785957485437393, "num_tokens": 237070008.0, "step": 14710 }, { "epoch": 3.411403407115541, "grad_norm": 0.9052397012710571, "learning_rate": 3.834747051954489e-05, "loss": 1.5454, "mean_token_accuracy": 0.6772289827466011, "num_tokens": 237231548.0, "step": 14720 }, { "epoch": 3.413721172789431, "grad_norm": 0.8753265738487244, "learning_rate": 3.8331600266390965e-05, "loss": 1.5509, "mean_token_accuracy": 0.6750221461057663, "num_tokens": 237391547.0, "step": 14730 }, { "epoch": 3.416038938463321, "grad_norm": 0.876514196395874, "learning_rate": 3.83157225024177e-05, "loss": 1.5244, "mean_token_accuracy": 0.6772476300597191, "num_tokens": 237552900.0, "step": 14740 }, { "epoch": 3.418356704137212, "grad_norm": 0.9478664994239807, "learning_rate": 3.829983723657038e-05, "loss": 1.5399, "mean_token_accuracy": 0.6768420308828353, "num_tokens": 237714623.0, "step": 14750 }, { "epoch": 3.420674469811102, "grad_norm": 0.9031175374984741, "learning_rate": 3.828394447779854e-05, "loss": 1.5281, "mean_token_accuracy": 0.6785521358251572, "num_tokens": 237876087.0, "step": 14760 }, { "epoch": 3.4229922354849927, "grad_norm": 0.8544631600379944, "learning_rate": 3.826804423505589e-05, "loss": 1.5294, "mean_token_accuracy": 0.6788626194000245, "num_tokens": 238037946.0, "step": 14770 }, { "epoch": 3.425310001158883, "grad_norm": 0.8793312311172485, "learning_rate": 3.8252136517300406e-05, "loss": 1.5361, "mean_token_accuracy": 0.6764030411839486, "num_tokens": 238199071.0, "step": 14780 }, { "epoch": 3.427627766832773, "grad_norm": 0.8814725875854492, "learning_rate": 3.8236221333494235e-05, "loss": 1.5253, "mean_token_accuracy": 0.6784739106893539, "num_tokens": 238360691.0, "step": 14790 }, { "epoch": 3.4299455325066637, "grad_norm": 0.8743959665298462, "learning_rate": 3.8220298692603755e-05, "loss": 1.5504, "mean_token_accuracy": 0.6753043502569198, "num_tokens": 238521914.0, "step": 14800 }, { "epoch": 3.432263298180554, "grad_norm": 0.9114444255828857, "learning_rate": 3.8204368603599536e-05, "loss": 1.5351, "mean_token_accuracy": 0.6779654696583748, "num_tokens": 238681867.0, "step": 14810 }, { "epoch": 3.4345810638544445, "grad_norm": 0.8626413345336914, "learning_rate": 3.818843107545635e-05, "loss": 1.5154, "mean_token_accuracy": 0.6801468908786774, "num_tokens": 238842592.0, "step": 14820 }, { "epoch": 3.4368988295283347, "grad_norm": 0.8905646204948425, "learning_rate": 3.8172486117153163e-05, "loss": 1.5333, "mean_token_accuracy": 0.677650262415409, "num_tokens": 239003807.0, "step": 14830 }, { "epoch": 3.439216595202225, "grad_norm": 0.8482327461242676, "learning_rate": 3.81565337376731e-05, "loss": 1.5442, "mean_token_accuracy": 0.6750706031918525, "num_tokens": 239165305.0, "step": 14840 }, { "epoch": 3.4415343608761155, "grad_norm": 0.8558962345123291, "learning_rate": 3.814057394600351e-05, "loss": 1.5268, "mean_token_accuracy": 0.6775357156991959, "num_tokens": 239326849.0, "step": 14850 }, { "epoch": 3.4438521265500057, "grad_norm": 0.9457498788833618, "learning_rate": 3.812460675113587e-05, "loss": 1.5533, "mean_token_accuracy": 0.6731404006481171, "num_tokens": 239488556.0, "step": 14860 }, { "epoch": 3.4461698922238964, "grad_norm": 0.8750383853912354, "learning_rate": 3.8108632162065874e-05, "loss": 1.5441, "mean_token_accuracy": 0.6762781143188477, "num_tokens": 239650210.0, "step": 14870 }, { "epoch": 3.4484876578977866, "grad_norm": 0.9037257432937622, "learning_rate": 3.8092650187793356e-05, "loss": 1.535, "mean_token_accuracy": 0.6775509417057037, "num_tokens": 239811586.0, "step": 14880 }, { "epoch": 3.4508054235716767, "grad_norm": 0.8795648217201233, "learning_rate": 3.807666083732231e-05, "loss": 1.529, "mean_token_accuracy": 0.6787527829408646, "num_tokens": 239971625.0, "step": 14890 }, { "epoch": 3.4531231892455674, "grad_norm": 0.8506757616996765, "learning_rate": 3.8060664119660895e-05, "loss": 1.5206, "mean_token_accuracy": 0.6786843240261078, "num_tokens": 240133023.0, "step": 14900 }, { "epoch": 3.4554409549194576, "grad_norm": 0.8843573331832886, "learning_rate": 3.8044660043821415e-05, "loss": 1.5333, "mean_token_accuracy": 0.6778418913483619, "num_tokens": 240294563.0, "step": 14910 }, { "epoch": 3.4577587205933478, "grad_norm": 0.9069731831550598, "learning_rate": 3.802864861882032e-05, "loss": 1.5339, "mean_token_accuracy": 0.6771584898233414, "num_tokens": 240455439.0, "step": 14920 }, { "epoch": 3.4600764862672384, "grad_norm": 0.9343395233154297, "learning_rate": 3.80126298536782e-05, "loss": 1.5234, "mean_token_accuracy": 0.6793804198503495, "num_tokens": 240616752.0, "step": 14930 }, { "epoch": 3.4623942519411286, "grad_norm": 0.8821207880973816, "learning_rate": 3.799660375741979e-05, "loss": 1.5344, "mean_token_accuracy": 0.6778024315834046, "num_tokens": 240777058.0, "step": 14940 }, { "epoch": 3.4647120176150192, "grad_norm": 0.8730626702308655, "learning_rate": 3.798057033907394e-05, "loss": 1.528, "mean_token_accuracy": 0.6771850526332855, "num_tokens": 240938296.0, "step": 14950 }, { "epoch": 3.4670297832889094, "grad_norm": 0.9141484498977661, "learning_rate": 3.796452960767364e-05, "loss": 1.5448, "mean_token_accuracy": 0.6763568341732025, "num_tokens": 241099832.0, "step": 14960 }, { "epoch": 3.4693475489627996, "grad_norm": 0.9019760489463806, "learning_rate": 3.794848157225598e-05, "loss": 1.5487, "mean_token_accuracy": 0.6764576718211174, "num_tokens": 241259311.0, "step": 14970 }, { "epoch": 3.4716653146366903, "grad_norm": 0.8390543460845947, "learning_rate": 3.793242624186218e-05, "loss": 1.5431, "mean_token_accuracy": 0.6758200168609619, "num_tokens": 241420620.0, "step": 14980 }, { "epoch": 3.4739830803105805, "grad_norm": 0.8407006859779358, "learning_rate": 3.791636362553758e-05, "loss": 1.5279, "mean_token_accuracy": 0.6794470012187958, "num_tokens": 241582014.0, "step": 14990 }, { "epoch": 3.476300845984471, "grad_norm": 0.8513484001159668, "learning_rate": 3.79002937323316e-05, "loss": 1.5343, "mean_token_accuracy": 0.6777217432856559, "num_tokens": 241742696.0, "step": 15000 }, { "epoch": 3.4786186116583613, "grad_norm": 0.8793767094612122, "learning_rate": 3.788421657129778e-05, "loss": 1.5195, "mean_token_accuracy": 0.6792784750461578, "num_tokens": 241904467.0, "step": 15010 }, { "epoch": 3.4809363773322515, "grad_norm": 0.8772327899932861, "learning_rate": 3.786813215149373e-05, "loss": 1.525, "mean_token_accuracy": 0.6783248469233513, "num_tokens": 242064282.0, "step": 15020 }, { "epoch": 3.483254143006142, "grad_norm": 0.8928138613700867, "learning_rate": 3.785204048198119e-05, "loss": 1.5119, "mean_token_accuracy": 0.6800108656287194, "num_tokens": 242225997.0, "step": 15030 }, { "epoch": 3.4855719086800323, "grad_norm": 0.9545791149139404, "learning_rate": 3.7835941571825946e-05, "loss": 1.5367, "mean_token_accuracy": 0.6774231612682342, "num_tokens": 242386238.0, "step": 15040 }, { "epoch": 3.487889674353923, "grad_norm": 0.8801494836807251, "learning_rate": 3.7819835430097875e-05, "loss": 1.5285, "mean_token_accuracy": 0.6790339708328247, "num_tokens": 242547780.0, "step": 15050 }, { "epoch": 3.490207440027813, "grad_norm": 0.8838332295417786, "learning_rate": 3.7803722065870925e-05, "loss": 1.5246, "mean_token_accuracy": 0.677509255707264, "num_tokens": 242708041.0, "step": 15060 }, { "epoch": 3.4925252057017033, "grad_norm": 0.9042676687240601, "learning_rate": 3.7787601488223136e-05, "loss": 1.5298, "mean_token_accuracy": 0.6775943547487259, "num_tokens": 242869078.0, "step": 15070 }, { "epoch": 3.494842971375594, "grad_norm": 0.8923559784889221, "learning_rate": 3.7771473706236585e-05, "loss": 1.5204, "mean_token_accuracy": 0.6793642178177833, "num_tokens": 243030680.0, "step": 15080 }, { "epoch": 3.497160737049484, "grad_norm": 0.8511435985565186, "learning_rate": 3.775533872899741e-05, "loss": 1.5214, "mean_token_accuracy": 0.6782767593860626, "num_tokens": 243191439.0, "step": 15090 }, { "epoch": 3.499478502723375, "grad_norm": 0.966599702835083, "learning_rate": 3.7739196565595824e-05, "loss": 1.5374, "mean_token_accuracy": 0.6757339403033257, "num_tokens": 243352147.0, "step": 15100 }, { "epoch": 3.501796268397265, "grad_norm": 0.9077842831611633, "learning_rate": 3.772304722512607e-05, "loss": 1.5287, "mean_token_accuracy": 0.6780707180500031, "num_tokens": 243513804.0, "step": 15110 }, { "epoch": 3.504114034071155, "grad_norm": 0.8989670872688293, "learning_rate": 3.770689071668643e-05, "loss": 1.5247, "mean_token_accuracy": 0.6778042942285538, "num_tokens": 243674637.0, "step": 15120 }, { "epoch": 3.506431799745046, "grad_norm": 0.8553876876831055, "learning_rate": 3.769072704937925e-05, "loss": 1.5252, "mean_token_accuracy": 0.6778672322630882, "num_tokens": 243835152.0, "step": 15130 }, { "epoch": 3.508749565418936, "grad_norm": 0.9879617094993591, "learning_rate": 3.7674556232310874e-05, "loss": 1.5281, "mean_token_accuracy": 0.6789572104811669, "num_tokens": 243995586.0, "step": 15140 }, { "epoch": 3.5110673310928266, "grad_norm": 0.8959920406341553, "learning_rate": 3.765837827459171e-05, "loss": 1.5254, "mean_token_accuracy": 0.6759606257081032, "num_tokens": 244156154.0, "step": 15150 }, { "epoch": 3.513385096766717, "grad_norm": 0.8373178243637085, "learning_rate": 3.764219318533616e-05, "loss": 1.5219, "mean_token_accuracy": 0.678036080300808, "num_tokens": 244317418.0, "step": 15160 }, { "epoch": 3.515702862440607, "grad_norm": 0.8529130816459656, "learning_rate": 3.7626000973662654e-05, "loss": 1.5361, "mean_token_accuracy": 0.6762443870306015, "num_tokens": 244478801.0, "step": 15170 }, { "epoch": 3.5180206281144977, "grad_norm": 0.8770533204078674, "learning_rate": 3.7609801648693653e-05, "loss": 1.5184, "mean_token_accuracy": 0.6787195965647698, "num_tokens": 244640291.0, "step": 15180 }, { "epoch": 3.520338393788388, "grad_norm": 0.9744563102722168, "learning_rate": 3.759359521955559e-05, "loss": 1.5313, "mean_token_accuracy": 0.677562738955021, "num_tokens": 244801337.0, "step": 15190 }, { "epoch": 3.5226561594622785, "grad_norm": 0.9165347218513489, "learning_rate": 3.757738169537892e-05, "loss": 1.5347, "mean_token_accuracy": 0.677488024532795, "num_tokens": 244962997.0, "step": 15200 }, { "epoch": 3.5249739251361687, "grad_norm": 0.9247071743011475, "learning_rate": 3.7561161085298104e-05, "loss": 1.5258, "mean_token_accuracy": 0.6796065881848335, "num_tokens": 245124460.0, "step": 15210 }, { "epoch": 3.527291690810059, "grad_norm": 0.8976757526397705, "learning_rate": 3.754493339845157e-05, "loss": 1.528, "mean_token_accuracy": 0.6785601168870926, "num_tokens": 245285957.0, "step": 15220 }, { "epoch": 3.5296094564839495, "grad_norm": 0.9046645164489746, "learning_rate": 3.7528698643981775e-05, "loss": 1.5258, "mean_token_accuracy": 0.6777452185750008, "num_tokens": 245447639.0, "step": 15230 }, { "epoch": 3.5319272221578397, "grad_norm": 0.8682762980461121, "learning_rate": 3.751245683103511e-05, "loss": 1.5253, "mean_token_accuracy": 0.6778632581233979, "num_tokens": 245609563.0, "step": 15240 }, { "epoch": 3.5342449878317304, "grad_norm": 0.9271794557571411, "learning_rate": 3.7496207968761964e-05, "loss": 1.5142, "mean_token_accuracy": 0.6790249183773994, "num_tokens": 245771212.0, "step": 15250 }, { "epoch": 3.5365627535056205, "grad_norm": 0.9346029758453369, "learning_rate": 3.7479952066316705e-05, "loss": 1.5382, "mean_token_accuracy": 0.6769208118319512, "num_tokens": 245932941.0, "step": 15260 }, { "epoch": 3.5388805191795107, "grad_norm": 0.9675098061561584, "learning_rate": 3.7463689132857655e-05, "loss": 1.5468, "mean_token_accuracy": 0.6754459828138352, "num_tokens": 246094441.0, "step": 15270 }, { "epoch": 3.5411982848534014, "grad_norm": 0.8927951455116272, "learning_rate": 3.74474191775471e-05, "loss": 1.5422, "mean_token_accuracy": 0.6767211988568306, "num_tokens": 246255992.0, "step": 15280 }, { "epoch": 3.5435160505272916, "grad_norm": 0.9014127254486084, "learning_rate": 3.7431142209551285e-05, "loss": 1.5226, "mean_token_accuracy": 0.6792801454663276, "num_tokens": 246417165.0, "step": 15290 }, { "epoch": 3.545833816201182, "grad_norm": 0.9159870743751526, "learning_rate": 3.741485823804041e-05, "loss": 1.5325, "mean_token_accuracy": 0.6791370645165443, "num_tokens": 246577566.0, "step": 15300 }, { "epoch": 3.5481515818750724, "grad_norm": 0.8576531410217285, "learning_rate": 3.73985672721886e-05, "loss": 1.5278, "mean_token_accuracy": 0.6797645226120949, "num_tokens": 246739513.0, "step": 15310 }, { "epoch": 3.5504693475489626, "grad_norm": 0.9134300351142883, "learning_rate": 3.738226932117395e-05, "loss": 1.5324, "mean_token_accuracy": 0.6787910327315331, "num_tokens": 246901346.0, "step": 15320 }, { "epoch": 3.5527871132228532, "grad_norm": 0.9311147332191467, "learning_rate": 3.7365964394178474e-05, "loss": 1.5263, "mean_token_accuracy": 0.6784775465726852, "num_tokens": 247062933.0, "step": 15330 }, { "epoch": 3.5551048788967434, "grad_norm": 0.8677538633346558, "learning_rate": 3.7349652500388117e-05, "loss": 1.5197, "mean_token_accuracy": 0.6779378175735473, "num_tokens": 247223325.0, "step": 15340 }, { "epoch": 3.557422644570634, "grad_norm": 0.9218688607215881, "learning_rate": 3.733333364899275e-05, "loss": 1.5216, "mean_token_accuracy": 0.6789447337388992, "num_tokens": 247383987.0, "step": 15350 }, { "epoch": 3.5597404102445243, "grad_norm": 0.8670927882194519, "learning_rate": 3.7317007849186154e-05, "loss": 1.5227, "mean_token_accuracy": 0.6790286540985108, "num_tokens": 247544395.0, "step": 15360 }, { "epoch": 3.5620581759184144, "grad_norm": 0.9067287445068359, "learning_rate": 3.7300675110166045e-05, "loss": 1.5402, "mean_token_accuracy": 0.6773300781846047, "num_tokens": 247705914.0, "step": 15370 }, { "epoch": 3.564375941592305, "grad_norm": 0.8735440969467163, "learning_rate": 3.728433544113404e-05, "loss": 1.5248, "mean_token_accuracy": 0.6802247270941735, "num_tokens": 247867590.0, "step": 15380 }, { "epoch": 3.5666937072661953, "grad_norm": 0.856557309627533, "learning_rate": 3.726798885129565e-05, "loss": 1.5266, "mean_token_accuracy": 0.678027693927288, "num_tokens": 248029510.0, "step": 15390 }, { "epoch": 3.569011472940086, "grad_norm": 0.955251157283783, "learning_rate": 3.7251635349860294e-05, "loss": 1.5245, "mean_token_accuracy": 0.6798216253519058, "num_tokens": 248190480.0, "step": 15400 }, { "epoch": 3.571329238613976, "grad_norm": 0.8681230545043945, "learning_rate": 3.72352749460413e-05, "loss": 1.546, "mean_token_accuracy": 0.6752035647630692, "num_tokens": 248352572.0, "step": 15410 }, { "epoch": 3.5736470042878663, "grad_norm": 0.9068554639816284, "learning_rate": 3.7218907649055845e-05, "loss": 1.5332, "mean_token_accuracy": 0.6773290917277336, "num_tokens": 248513412.0, "step": 15420 }, { "epoch": 3.575964769961757, "grad_norm": 0.8945737481117249, "learning_rate": 3.720253346812503e-05, "loss": 1.554, "mean_token_accuracy": 0.6748412474989891, "num_tokens": 248672929.0, "step": 15430 }, { "epoch": 3.578282535635647, "grad_norm": 0.9250012636184692, "learning_rate": 3.718615241247381e-05, "loss": 1.5404, "mean_token_accuracy": 0.6777699813246727, "num_tokens": 248834911.0, "step": 15440 }, { "epoch": 3.5806003013095378, "grad_norm": 0.8869150280952454, "learning_rate": 3.716976449133103e-05, "loss": 1.5434, "mean_token_accuracy": 0.6756322309374809, "num_tokens": 248996302.0, "step": 15450 }, { "epoch": 3.582918066983428, "grad_norm": 0.9311455488204956, "learning_rate": 3.7153369713929374e-05, "loss": 1.5417, "mean_token_accuracy": 0.6753456503152847, "num_tokens": 249157886.0, "step": 15460 }, { "epoch": 3.585235832657318, "grad_norm": 0.8517132997512817, "learning_rate": 3.713696808950543e-05, "loss": 1.5344, "mean_token_accuracy": 0.6780963435769081, "num_tokens": 249319853.0, "step": 15470 }, { "epoch": 3.587553598331209, "grad_norm": 0.8675822019577026, "learning_rate": 3.712055962729961e-05, "loss": 1.5315, "mean_token_accuracy": 0.6782755613327026, "num_tokens": 249480825.0, "step": 15480 }, { "epoch": 3.589871364005099, "grad_norm": 0.8771219849586487, "learning_rate": 3.71041443365562e-05, "loss": 1.5283, "mean_token_accuracy": 0.6777212813496589, "num_tokens": 249642768.0, "step": 15490 }, { "epoch": 3.5921891296789896, "grad_norm": 0.8701884746551514, "learning_rate": 3.70877222265233e-05, "loss": 1.5327, "mean_token_accuracy": 0.6754121780395508, "num_tokens": 249803097.0, "step": 15500 }, { "epoch": 3.59450689535288, "grad_norm": 0.906157910823822, "learning_rate": 3.707129330645291e-05, "loss": 1.5359, "mean_token_accuracy": 0.677001841366291, "num_tokens": 249963909.0, "step": 15510 }, { "epoch": 3.59682466102677, "grad_norm": 0.9087231159210205, "learning_rate": 3.705485758560081e-05, "loss": 1.5499, "mean_token_accuracy": 0.6756796538829803, "num_tokens": 250125286.0, "step": 15520 }, { "epoch": 3.5991424267006606, "grad_norm": 0.937558114528656, "learning_rate": 3.703841507322663e-05, "loss": 1.5201, "mean_token_accuracy": 0.6779063567519188, "num_tokens": 250286605.0, "step": 15530 }, { "epoch": 3.601460192374551, "grad_norm": 0.913905143737793, "learning_rate": 3.702196577859384e-05, "loss": 1.5477, "mean_token_accuracy": 0.6742450937628746, "num_tokens": 250447987.0, "step": 15540 }, { "epoch": 3.6037779580484415, "grad_norm": 0.8811556100845337, "learning_rate": 3.700550971096972e-05, "loss": 1.5249, "mean_token_accuracy": 0.6797165662050247, "num_tokens": 250609613.0, "step": 15550 }, { "epoch": 3.6060957237223317, "grad_norm": 0.9480159878730774, "learning_rate": 3.698904687962536e-05, "loss": 1.5421, "mean_token_accuracy": 0.6776830509305001, "num_tokens": 250770786.0, "step": 15560 }, { "epoch": 3.608413489396222, "grad_norm": 0.8993578553199768, "learning_rate": 3.697257729383568e-05, "loss": 1.5193, "mean_token_accuracy": 0.6794822916388512, "num_tokens": 250932049.0, "step": 15570 }, { "epoch": 3.6107312550701125, "grad_norm": 0.8784548044204712, "learning_rate": 3.695610096287937e-05, "loss": 1.5306, "mean_token_accuracy": 0.6781141236424446, "num_tokens": 251094100.0, "step": 15580 }, { "epoch": 3.6130490207440027, "grad_norm": 0.9169755578041077, "learning_rate": 3.693961789603896e-05, "loss": 1.5341, "mean_token_accuracy": 0.6767927125096321, "num_tokens": 251255138.0, "step": 15590 }, { "epoch": 3.6153667864178933, "grad_norm": 0.8943397998809814, "learning_rate": 3.692312810260076e-05, "loss": 1.525, "mean_token_accuracy": 0.6768205657601356, "num_tokens": 251416606.0, "step": 15600 }, { "epoch": 3.6176845520917835, "grad_norm": 0.8976518511772156, "learning_rate": 3.690663159185485e-05, "loss": 1.5455, "mean_token_accuracy": 0.676387295126915, "num_tokens": 251578139.0, "step": 15610 }, { "epoch": 3.6200023177656737, "grad_norm": 0.8932918906211853, "learning_rate": 3.689012837309512e-05, "loss": 1.5395, "mean_token_accuracy": 0.6769028842449188, "num_tokens": 251739688.0, "step": 15620 }, { "epoch": 3.6223200834395644, "grad_norm": 0.8725816607475281, "learning_rate": 3.6873618455619234e-05, "loss": 1.5439, "mean_token_accuracy": 0.6780269652605057, "num_tokens": 251901153.0, "step": 15630 }, { "epoch": 3.6246378491134545, "grad_norm": 0.9134610295295715, "learning_rate": 3.6857101848728625e-05, "loss": 1.5182, "mean_token_accuracy": 0.678762449324131, "num_tokens": 252061093.0, "step": 15640 }, { "epoch": 3.626955614787345, "grad_norm": 0.9085738658905029, "learning_rate": 3.68405785617285e-05, "loss": 1.544, "mean_token_accuracy": 0.6754832804203034, "num_tokens": 252222789.0, "step": 15650 }, { "epoch": 3.6292733804612354, "grad_norm": 0.8912723660469055, "learning_rate": 3.682404860392781e-05, "loss": 1.5409, "mean_token_accuracy": 0.6781506180763245, "num_tokens": 252384094.0, "step": 15660 }, { "epoch": 3.6315911461351256, "grad_norm": 0.9199246168136597, "learning_rate": 3.6807511984639304e-05, "loss": 1.5275, "mean_token_accuracy": 0.6773539781570435, "num_tokens": 252545708.0, "step": 15670 }, { "epoch": 3.633908911809016, "grad_norm": 0.9277712106704712, "learning_rate": 3.679096871317944e-05, "loss": 1.5338, "mean_token_accuracy": 0.6768829673528671, "num_tokens": 252707383.0, "step": 15680 }, { "epoch": 3.6362266774829064, "grad_norm": 0.867670476436615, "learning_rate": 3.6774418798868446e-05, "loss": 1.5341, "mean_token_accuracy": 0.6771916747093201, "num_tokens": 252868907.0, "step": 15690 }, { "epoch": 3.638544443156797, "grad_norm": 0.8562825322151184, "learning_rate": 3.675786225103031e-05, "loss": 1.5355, "mean_token_accuracy": 0.6769492015242576, "num_tokens": 253030718.0, "step": 15700 }, { "epoch": 3.6408622088306872, "grad_norm": 0.8811575174331665, "learning_rate": 3.674129907899271e-05, "loss": 1.5224, "mean_token_accuracy": 0.6767938748002053, "num_tokens": 253191828.0, "step": 15710 }, { "epoch": 3.6431799745045774, "grad_norm": 0.897420346736908, "learning_rate": 3.67247292920871e-05, "loss": 1.5358, "mean_token_accuracy": 0.6771514773368835, "num_tokens": 253352637.0, "step": 15720 }, { "epoch": 3.645497740178468, "grad_norm": 0.8783614635467529, "learning_rate": 3.670815289964864e-05, "loss": 1.5294, "mean_token_accuracy": 0.6783826902508736, "num_tokens": 253514179.0, "step": 15730 }, { "epoch": 3.6478155058523583, "grad_norm": 0.8897547721862793, "learning_rate": 3.669156991101622e-05, "loss": 1.5235, "mean_token_accuracy": 0.6791643694043159, "num_tokens": 253676248.0, "step": 15740 }, { "epoch": 3.650133271526249, "grad_norm": 0.8457329273223877, "learning_rate": 3.6674980335532426e-05, "loss": 1.5389, "mean_token_accuracy": 0.6764813274145126, "num_tokens": 253837758.0, "step": 15750 }, { "epoch": 3.652451037200139, "grad_norm": 0.9227481484413147, "learning_rate": 3.665838418254359e-05, "loss": 1.5589, "mean_token_accuracy": 0.6746363744139672, "num_tokens": 253999101.0, "step": 15760 }, { "epoch": 3.6547688028740293, "grad_norm": 0.8859313130378723, "learning_rate": 3.664178146139972e-05, "loss": 1.5158, "mean_token_accuracy": 0.6794497773051262, "num_tokens": 254160487.0, "step": 15770 }, { "epoch": 3.65708656854792, "grad_norm": 0.9436237812042236, "learning_rate": 3.662517218145453e-05, "loss": 1.5319, "mean_token_accuracy": 0.6771997794508934, "num_tokens": 254320987.0, "step": 15780 }, { "epoch": 3.65940433422181, "grad_norm": 0.9074249863624573, "learning_rate": 3.660855635206545e-05, "loss": 1.543, "mean_token_accuracy": 0.6769874349236489, "num_tokens": 254482205.0, "step": 15790 }, { "epoch": 3.6617220998957007, "grad_norm": 0.8497018218040466, "learning_rate": 3.6591933982593565e-05, "loss": 1.5259, "mean_token_accuracy": 0.6770303934812546, "num_tokens": 254642887.0, "step": 15800 }, { "epoch": 3.664039865569591, "grad_norm": 0.9408425688743591, "learning_rate": 3.657530508240368e-05, "loss": 1.5391, "mean_token_accuracy": 0.676963672041893, "num_tokens": 254805117.0, "step": 15810 }, { "epoch": 3.666357631243481, "grad_norm": 0.9087072014808655, "learning_rate": 3.6558669660864255e-05, "loss": 1.5301, "mean_token_accuracy": 0.6774387121200561, "num_tokens": 254965678.0, "step": 15820 }, { "epoch": 3.6686753969173718, "grad_norm": 0.8577914834022522, "learning_rate": 3.6542027727347426e-05, "loss": 1.5255, "mean_token_accuracy": 0.6764340043067932, "num_tokens": 255127843.0, "step": 15830 }, { "epoch": 3.670993162591262, "grad_norm": 0.9598100781440735, "learning_rate": 3.652537929122901e-05, "loss": 1.5278, "mean_token_accuracy": 0.6771804809570312, "num_tokens": 255288960.0, "step": 15840 }, { "epoch": 3.6733109282651526, "grad_norm": 0.8690840601921082, "learning_rate": 3.650872436188849e-05, "loss": 1.5368, "mean_token_accuracy": 0.6772324666380882, "num_tokens": 255450207.0, "step": 15850 }, { "epoch": 3.675628693939043, "grad_norm": 0.9120047092437744, "learning_rate": 3.649206294870898e-05, "loss": 1.5359, "mean_token_accuracy": 0.6762813940644264, "num_tokens": 255611090.0, "step": 15860 }, { "epoch": 3.677946459612933, "grad_norm": 0.9453985095024109, "learning_rate": 3.6475395061077275e-05, "loss": 1.5314, "mean_token_accuracy": 0.6784786850214004, "num_tokens": 255772513.0, "step": 15870 }, { "epoch": 3.6802642252868236, "grad_norm": 0.9083001613616943, "learning_rate": 3.645872070838381e-05, "loss": 1.5231, "mean_token_accuracy": 0.6779908418655396, "num_tokens": 255932420.0, "step": 15880 }, { "epoch": 3.682581990960714, "grad_norm": 0.8950644731521606, "learning_rate": 3.644203990002265e-05, "loss": 1.5449, "mean_token_accuracy": 0.6766832202672959, "num_tokens": 256091788.0, "step": 15890 }, { "epoch": 3.6848997566346045, "grad_norm": 0.9380818605422974, "learning_rate": 3.642535264539151e-05, "loss": 1.5193, "mean_token_accuracy": 0.6770332530140877, "num_tokens": 256252345.0, "step": 15900 }, { "epoch": 3.6872175223084946, "grad_norm": 0.85646653175354, "learning_rate": 3.640865895389174e-05, "loss": 1.5266, "mean_token_accuracy": 0.6787269979715347, "num_tokens": 256414607.0, "step": 15910 }, { "epoch": 3.689535287982385, "grad_norm": 0.8295663595199585, "learning_rate": 3.63919588349283e-05, "loss": 1.5384, "mean_token_accuracy": 0.6767864301800728, "num_tokens": 256575957.0, "step": 15920 }, { "epoch": 3.6918530536562755, "grad_norm": 0.9128089547157288, "learning_rate": 3.6375252297909794e-05, "loss": 1.5284, "mean_token_accuracy": 0.6784809842705727, "num_tokens": 256737149.0, "step": 15930 }, { "epoch": 3.6941708193301657, "grad_norm": 0.8227361440658569, "learning_rate": 3.6358539352248423e-05, "loss": 1.5176, "mean_token_accuracy": 0.6788403749465942, "num_tokens": 256897813.0, "step": 15940 }, { "epoch": 3.6964885850040563, "grad_norm": 0.9314983487129211, "learning_rate": 3.634182000736e-05, "loss": 1.5219, "mean_token_accuracy": 0.6788966163992882, "num_tokens": 257059471.0, "step": 15950 }, { "epoch": 3.6988063506779465, "grad_norm": 0.9128071665763855, "learning_rate": 3.6325094272663943e-05, "loss": 1.5145, "mean_token_accuracy": 0.6791878700256347, "num_tokens": 257221270.0, "step": 15960 }, { "epoch": 3.7011241163518367, "grad_norm": 0.8677741885185242, "learning_rate": 3.63083621575833e-05, "loss": 1.5442, "mean_token_accuracy": 0.6765869691967964, "num_tokens": 257380481.0, "step": 15970 }, { "epoch": 3.7034418820257273, "grad_norm": 0.8721223473548889, "learning_rate": 3.6291623671544654e-05, "loss": 1.5242, "mean_token_accuracy": 0.6802246421575546, "num_tokens": 257542374.0, "step": 15980 }, { "epoch": 3.7057596476996175, "grad_norm": 0.8759878873825073, "learning_rate": 3.627487882397824e-05, "loss": 1.5392, "mean_token_accuracy": 0.6759495154023171, "num_tokens": 257703408.0, "step": 15990 }, { "epoch": 3.708077413373508, "grad_norm": 0.8918128609657288, "learning_rate": 3.6258127624317836e-05, "loss": 1.5391, "mean_token_accuracy": 0.6760935634374619, "num_tokens": 257865004.0, "step": 16000 }, { "epoch": 3.7103951790473984, "grad_norm": 0.8654720783233643, "learning_rate": 3.6241370082000805e-05, "loss": 1.5335, "mean_token_accuracy": 0.6771925151348114, "num_tokens": 258026748.0, "step": 16010 }, { "epoch": 3.7127129447212885, "grad_norm": 0.8913873434066772, "learning_rate": 3.622460620646811e-05, "loss": 1.5346, "mean_token_accuracy": 0.6766824901103974, "num_tokens": 258187452.0, "step": 16020 }, { "epoch": 3.715030710395179, "grad_norm": 0.9221161603927612, "learning_rate": 3.620783600716424e-05, "loss": 1.5398, "mean_token_accuracy": 0.6768781021237373, "num_tokens": 258349323.0, "step": 16030 }, { "epoch": 3.7173484760690694, "grad_norm": 0.9372894167900085, "learning_rate": 3.619105949353728e-05, "loss": 1.5155, "mean_token_accuracy": 0.6795677483081818, "num_tokens": 258511560.0, "step": 16040 }, { "epoch": 3.71966624174296, "grad_norm": 0.8467947244644165, "learning_rate": 3.617427667503885e-05, "loss": 1.5323, "mean_token_accuracy": 0.6768798738718033, "num_tokens": 258673410.0, "step": 16050 }, { "epoch": 3.72198400741685, "grad_norm": 0.9916259050369263, "learning_rate": 3.615748756112415e-05, "loss": 1.5336, "mean_token_accuracy": 0.6774866297841072, "num_tokens": 258834441.0, "step": 16060 }, { "epoch": 3.7243017730907404, "grad_norm": 0.8685785531997681, "learning_rate": 3.614069216125191e-05, "loss": 1.5457, "mean_token_accuracy": 0.6749328702688218, "num_tokens": 258995840.0, "step": 16070 }, { "epoch": 3.726619538764631, "grad_norm": 0.8652294278144836, "learning_rate": 3.612389048488438e-05, "loss": 1.5147, "mean_token_accuracy": 0.6801698461174965, "num_tokens": 259156348.0, "step": 16080 }, { "epoch": 3.7289373044385212, "grad_norm": 0.9095605611801147, "learning_rate": 3.6107082541487376e-05, "loss": 1.5096, "mean_token_accuracy": 0.681042055785656, "num_tokens": 259317836.0, "step": 16090 }, { "epoch": 3.731255070112412, "grad_norm": 0.8493523001670837, "learning_rate": 3.609026834053024e-05, "loss": 1.5281, "mean_token_accuracy": 0.6771048039197922, "num_tokens": 259478790.0, "step": 16100 }, { "epoch": 3.733572835786302, "grad_norm": 0.8963705897331238, "learning_rate": 3.6073447891485834e-05, "loss": 1.5223, "mean_token_accuracy": 0.6781864881515502, "num_tokens": 259640687.0, "step": 16110 }, { "epoch": 3.7358906014601923, "grad_norm": 0.9104617834091187, "learning_rate": 3.605662120383054e-05, "loss": 1.5349, "mean_token_accuracy": 0.6757396519184112, "num_tokens": 259801247.0, "step": 16120 }, { "epoch": 3.738208367134083, "grad_norm": 0.8811559677124023, "learning_rate": 3.603978828704426e-05, "loss": 1.5164, "mean_token_accuracy": 0.6801121607422829, "num_tokens": 259961587.0, "step": 16130 }, { "epoch": 3.740526132807973, "grad_norm": 0.8729875683784485, "learning_rate": 3.60229491506104e-05, "loss": 1.5209, "mean_token_accuracy": 0.6782788157463073, "num_tokens": 260123491.0, "step": 16140 }, { "epoch": 3.7428438984818637, "grad_norm": 0.8882964253425598, "learning_rate": 3.600610380401586e-05, "loss": 1.5307, "mean_token_accuracy": 0.6777479380369187, "num_tokens": 260285257.0, "step": 16150 }, { "epoch": 3.745161664155754, "grad_norm": 0.8511691093444824, "learning_rate": 3.598925225675107e-05, "loss": 1.5414, "mean_token_accuracy": 0.6774490073323249, "num_tokens": 260446864.0, "step": 16160 }, { "epoch": 3.747479429829644, "grad_norm": 0.9116806983947754, "learning_rate": 3.5972394518309914e-05, "loss": 1.5202, "mean_token_accuracy": 0.6792880341410636, "num_tokens": 260608824.0, "step": 16170 }, { "epoch": 3.7497971955035347, "grad_norm": 0.8434758186340332, "learning_rate": 3.59555305981898e-05, "loss": 1.5461, "mean_token_accuracy": 0.6760229066014289, "num_tokens": 260769645.0, "step": 16180 }, { "epoch": 3.752114961177425, "grad_norm": 0.829305112361908, "learning_rate": 3.593866050589159e-05, "loss": 1.5359, "mean_token_accuracy": 0.6771639958024025, "num_tokens": 260930582.0, "step": 16190 }, { "epoch": 3.7544327268513156, "grad_norm": 0.9255868196487427, "learning_rate": 3.592178425091965e-05, "loss": 1.5152, "mean_token_accuracy": 0.6785017848014832, "num_tokens": 261092442.0, "step": 16200 }, { "epoch": 3.7567504925252058, "grad_norm": 0.9317522048950195, "learning_rate": 3.590490184278178e-05, "loss": 1.5251, "mean_token_accuracy": 0.6763668730854988, "num_tokens": 261254243.0, "step": 16210 }, { "epoch": 3.759068258199096, "grad_norm": 0.8775080442428589, "learning_rate": 3.58880132909893e-05, "loss": 1.5362, "mean_token_accuracy": 0.6763616740703583, "num_tokens": 261415226.0, "step": 16220 }, { "epoch": 3.7613860238729866, "grad_norm": 0.9196127653121948, "learning_rate": 3.587111860505694e-05, "loss": 1.522, "mean_token_accuracy": 0.6786886408925057, "num_tokens": 261576976.0, "step": 16230 }, { "epoch": 3.763703789546877, "grad_norm": 0.8630285859107971, "learning_rate": 3.585421779450291e-05, "loss": 1.5417, "mean_token_accuracy": 0.675284706056118, "num_tokens": 261737208.0, "step": 16240 }, { "epoch": 3.7660215552207674, "grad_norm": 0.8420131802558899, "learning_rate": 3.583731086884887e-05, "loss": 1.4997, "mean_token_accuracy": 0.6818173870444297, "num_tokens": 261898501.0, "step": 16250 }, { "epoch": 3.7683393208946576, "grad_norm": 0.8903666138648987, "learning_rate": 3.582039783761992e-05, "loss": 1.5247, "mean_token_accuracy": 0.6786733046174049, "num_tokens": 262060140.0, "step": 16260 }, { "epoch": 3.770657086568548, "grad_norm": 0.9621241092681885, "learning_rate": 3.580347871034461e-05, "loss": 1.5218, "mean_token_accuracy": 0.6797304913401604, "num_tokens": 262221221.0, "step": 16270 }, { "epoch": 3.7729748522424384, "grad_norm": 0.8788756132125854, "learning_rate": 3.578655349655492e-05, "loss": 1.525, "mean_token_accuracy": 0.6796381428837777, "num_tokens": 262381624.0, "step": 16280 }, { "epoch": 3.7752926179163286, "grad_norm": 0.8371780514717102, "learning_rate": 3.576962220578623e-05, "loss": 1.5156, "mean_token_accuracy": 0.6802115768194199, "num_tokens": 262542869.0, "step": 16290 }, { "epoch": 3.7776103835902193, "grad_norm": 0.841568648815155, "learning_rate": 3.575268484757741e-05, "loss": 1.5231, "mean_token_accuracy": 0.6784732297062874, "num_tokens": 262703012.0, "step": 16300 }, { "epoch": 3.7799281492641095, "grad_norm": 0.8574280738830566, "learning_rate": 3.573574143147068e-05, "loss": 1.5229, "mean_token_accuracy": 0.6789977937936783, "num_tokens": 262864456.0, "step": 16310 }, { "epoch": 3.7822459149379997, "grad_norm": 0.9476231336593628, "learning_rate": 3.5718791967011705e-05, "loss": 1.5199, "mean_token_accuracy": 0.6795130297541618, "num_tokens": 263024565.0, "step": 16320 }, { "epoch": 3.7845636806118903, "grad_norm": 0.8961413502693176, "learning_rate": 3.570183646374956e-05, "loss": 1.5237, "mean_token_accuracy": 0.6797923788428306, "num_tokens": 263185242.0, "step": 16330 }, { "epoch": 3.7868814462857805, "grad_norm": 0.857909619808197, "learning_rate": 3.568487493123671e-05, "loss": 1.5361, "mean_token_accuracy": 0.6770828932523727, "num_tokens": 263346332.0, "step": 16340 }, { "epoch": 3.789199211959671, "grad_norm": 0.8984623551368713, "learning_rate": 3.5667907379029026e-05, "loss": 1.5243, "mean_token_accuracy": 0.6780283808708191, "num_tokens": 263507811.0, "step": 16350 }, { "epoch": 3.7915169776335613, "grad_norm": 0.8616106510162354, "learning_rate": 3.565093381668577e-05, "loss": 1.5251, "mean_token_accuracy": 0.6799539610743522, "num_tokens": 263669862.0, "step": 16360 }, { "epoch": 3.7938347433074515, "grad_norm": 0.9382621645927429, "learning_rate": 3.563395425376958e-05, "loss": 1.512, "mean_token_accuracy": 0.6790529504418373, "num_tokens": 263831509.0, "step": 16370 }, { "epoch": 3.7961525089813417, "grad_norm": 0.9182610511779785, "learning_rate": 3.5616968699846485e-05, "loss": 1.5176, "mean_token_accuracy": 0.6799261942505836, "num_tokens": 263990908.0, "step": 16380 }, { "epoch": 3.7984702746552323, "grad_norm": 0.9273796081542969, "learning_rate": 3.5599977164485885e-05, "loss": 1.5259, "mean_token_accuracy": 0.6781197383999824, "num_tokens": 264152170.0, "step": 16390 }, { "epoch": 3.800788040329123, "grad_norm": 0.8941771388053894, "learning_rate": 3.558297965726055e-05, "loss": 1.5383, "mean_token_accuracy": 0.6767831414937973, "num_tokens": 264313902.0, "step": 16400 }, { "epoch": 3.803105806003013, "grad_norm": 0.8377234935760498, "learning_rate": 3.556597618774662e-05, "loss": 1.5254, "mean_token_accuracy": 0.6789403542876243, "num_tokens": 264475349.0, "step": 16410 }, { "epoch": 3.8054235716769034, "grad_norm": 0.8979917168617249, "learning_rate": 3.5548966765523585e-05, "loss": 1.5212, "mean_token_accuracy": 0.6798860892653465, "num_tokens": 264634976.0, "step": 16420 }, { "epoch": 3.8077413373507936, "grad_norm": 0.8318324089050293, "learning_rate": 3.553195140017429e-05, "loss": 1.5472, "mean_token_accuracy": 0.6761844754219055, "num_tokens": 264795959.0, "step": 16430 }, { "epoch": 3.810059103024684, "grad_norm": 0.8608783483505249, "learning_rate": 3.5514930101284944e-05, "loss": 1.5038, "mean_token_accuracy": 0.682116511464119, "num_tokens": 264956267.0, "step": 16440 }, { "epoch": 3.812376868698575, "grad_norm": 0.955913782119751, "learning_rate": 3.549790287844507e-05, "loss": 1.5371, "mean_token_accuracy": 0.6765434861183166, "num_tokens": 265116432.0, "step": 16450 }, { "epoch": 3.814694634372465, "grad_norm": 0.8825370669364929, "learning_rate": 3.5480869741247554e-05, "loss": 1.5312, "mean_token_accuracy": 0.6786104917526246, "num_tokens": 265278363.0, "step": 16460 }, { "epoch": 3.8170124000463552, "grad_norm": 0.8253583312034607, "learning_rate": 3.5463830699288595e-05, "loss": 1.5388, "mean_token_accuracy": 0.6759225234389306, "num_tokens": 265440184.0, "step": 16470 }, { "epoch": 3.8193301657202454, "grad_norm": 0.9082375764846802, "learning_rate": 3.544678576216775e-05, "loss": 1.5404, "mean_token_accuracy": 0.6773260191082955, "num_tokens": 265601695.0, "step": 16480 }, { "epoch": 3.821647931394136, "grad_norm": 0.899709165096283, "learning_rate": 3.5429734939487844e-05, "loss": 1.5161, "mean_token_accuracy": 0.6798621192574501, "num_tokens": 265762280.0, "step": 16490 }, { "epoch": 3.8239656970680267, "grad_norm": 0.8438701629638672, "learning_rate": 3.5412678240855066e-05, "loss": 1.5274, "mean_token_accuracy": 0.6783011496067047, "num_tokens": 265923125.0, "step": 16500 }, { "epoch": 3.826283462741917, "grad_norm": 0.9777643084526062, "learning_rate": 3.5395615675878907e-05, "loss": 1.5295, "mean_token_accuracy": 0.6777545392513276, "num_tokens": 266084883.0, "step": 16510 }, { "epoch": 3.828601228415807, "grad_norm": 0.921116828918457, "learning_rate": 3.537854725417214e-05, "loss": 1.5366, "mean_token_accuracy": 0.6760732963681221, "num_tokens": 266246740.0, "step": 16520 }, { "epoch": 3.8309189940896973, "grad_norm": 0.9120193123817444, "learning_rate": 3.5361472985350866e-05, "loss": 1.5165, "mean_token_accuracy": 0.6795364305377006, "num_tokens": 266408792.0, "step": 16530 }, { "epoch": 3.833236759763588, "grad_norm": 0.9157149791717529, "learning_rate": 3.5344392879034446e-05, "loss": 1.5302, "mean_token_accuracy": 0.6776025503873825, "num_tokens": 266569869.0, "step": 16540 }, { "epoch": 3.8355545254374785, "grad_norm": 0.8907639980316162, "learning_rate": 3.5327306944845574e-05, "loss": 1.5321, "mean_token_accuracy": 0.6769652456045151, "num_tokens": 266731905.0, "step": 16550 }, { "epoch": 3.8378722911113687, "grad_norm": 0.900280773639679, "learning_rate": 3.531021519241019e-05, "loss": 1.5325, "mean_token_accuracy": 0.676838581264019, "num_tokens": 266893286.0, "step": 16560 }, { "epoch": 3.840190056785259, "grad_norm": 0.8844984173774719, "learning_rate": 3.5293117631357524e-05, "loss": 1.5298, "mean_token_accuracy": 0.6774196833372116, "num_tokens": 267054940.0, "step": 16570 }, { "epoch": 3.842507822459149, "grad_norm": 0.9064851403236389, "learning_rate": 3.527601427132009e-05, "loss": 1.5535, "mean_token_accuracy": 0.6750416859984398, "num_tokens": 267215964.0, "step": 16580 }, { "epoch": 3.8448255881330398, "grad_norm": 0.859795331954956, "learning_rate": 3.525890512193365e-05, "loss": 1.529, "mean_token_accuracy": 0.6772121593356133, "num_tokens": 267377411.0, "step": 16590 }, { "epoch": 3.84714335380693, "grad_norm": 0.8803802132606506, "learning_rate": 3.524179019283724e-05, "loss": 1.526, "mean_token_accuracy": 0.6779163673520088, "num_tokens": 267538789.0, "step": 16600 }, { "epoch": 3.8494611194808206, "grad_norm": 0.8834716081619263, "learning_rate": 3.522466949367315e-05, "loss": 1.5275, "mean_token_accuracy": 0.6784276992082596, "num_tokens": 267699468.0, "step": 16610 }, { "epoch": 3.851778885154711, "grad_norm": 0.9064461588859558, "learning_rate": 3.520754303408691e-05, "loss": 1.5218, "mean_token_accuracy": 0.6785647943615913, "num_tokens": 267860484.0, "step": 16620 }, { "epoch": 3.854096650828601, "grad_norm": 0.8897060751914978, "learning_rate": 3.519041082372732e-05, "loss": 1.5271, "mean_token_accuracy": 0.6776403650641442, "num_tokens": 268021026.0, "step": 16630 }, { "epoch": 3.8564144165024916, "grad_norm": 0.8895317912101746, "learning_rate": 3.51732728722464e-05, "loss": 1.5189, "mean_token_accuracy": 0.6794447034597397, "num_tokens": 268181800.0, "step": 16640 }, { "epoch": 3.858732182176382, "grad_norm": 0.8498573303222656, "learning_rate": 3.51561291892994e-05, "loss": 1.5165, "mean_token_accuracy": 0.6787994354963303, "num_tokens": 268343026.0, "step": 16650 }, { "epoch": 3.8610499478502724, "grad_norm": 0.9690536260604858, "learning_rate": 3.5138979784544826e-05, "loss": 1.5273, "mean_token_accuracy": 0.6767397373914719, "num_tokens": 268503433.0, "step": 16660 }, { "epoch": 3.8633677135241626, "grad_norm": 0.8727548122406006, "learning_rate": 3.5121824667644365e-05, "loss": 1.5173, "mean_token_accuracy": 0.6789012357592583, "num_tokens": 268665319.0, "step": 16670 }, { "epoch": 3.865685479198053, "grad_norm": 0.9157293438911438, "learning_rate": 3.510466384826297e-05, "loss": 1.5434, "mean_token_accuracy": 0.6752866834402085, "num_tokens": 268826782.0, "step": 16680 }, { "epoch": 3.8680032448719435, "grad_norm": 0.9089074730873108, "learning_rate": 3.5087497336068776e-05, "loss": 1.5293, "mean_token_accuracy": 0.6796619191765785, "num_tokens": 268986434.0, "step": 16690 }, { "epoch": 3.8703210105458337, "grad_norm": 0.9031400680541992, "learning_rate": 3.507032514073313e-05, "loss": 1.5211, "mean_token_accuracy": 0.6790997579693794, "num_tokens": 269146864.0, "step": 16700 }, { "epoch": 3.8726387762197243, "grad_norm": 0.8859734535217285, "learning_rate": 3.505314727193058e-05, "loss": 1.5306, "mean_token_accuracy": 0.6764767825603485, "num_tokens": 269308726.0, "step": 16710 }, { "epoch": 3.8749565418936145, "grad_norm": 0.9532806873321533, "learning_rate": 3.503596373933889e-05, "loss": 1.5397, "mean_token_accuracy": 0.6754997044801712, "num_tokens": 269469795.0, "step": 16720 }, { "epoch": 3.8772743075675047, "grad_norm": 0.8920911550521851, "learning_rate": 3.501877455263898e-05, "loss": 1.5171, "mean_token_accuracy": 0.680316860973835, "num_tokens": 269630794.0, "step": 16730 }, { "epoch": 3.8795920732413953, "grad_norm": 0.9083481431007385, "learning_rate": 3.5001579721514994e-05, "loss": 1.5228, "mean_token_accuracy": 0.6783621788024903, "num_tokens": 269792393.0, "step": 16740 }, { "epoch": 3.8819098389152855, "grad_norm": 0.9166132807731628, "learning_rate": 3.498437925565422e-05, "loss": 1.5208, "mean_token_accuracy": 0.6792530655860901, "num_tokens": 269953400.0, "step": 16750 }, { "epoch": 3.884227604589176, "grad_norm": 0.9003151059150696, "learning_rate": 3.4967173164747135e-05, "loss": 1.5177, "mean_token_accuracy": 0.6775994911789894, "num_tokens": 270114755.0, "step": 16760 }, { "epoch": 3.8865453702630663, "grad_norm": 0.9054169654846191, "learning_rate": 3.4949961458487404e-05, "loss": 1.5251, "mean_token_accuracy": 0.6780624508857727, "num_tokens": 270276652.0, "step": 16770 }, { "epoch": 3.8888631359369565, "grad_norm": 0.8495367169380188, "learning_rate": 3.493274414657183e-05, "loss": 1.5302, "mean_token_accuracy": 0.6761063724756241, "num_tokens": 270438180.0, "step": 16780 }, { "epoch": 3.891180901610847, "grad_norm": 0.877216637134552, "learning_rate": 3.4915521238700375e-05, "loss": 1.5392, "mean_token_accuracy": 0.6770023241639137, "num_tokens": 270597982.0, "step": 16790 }, { "epoch": 3.8934986672847374, "grad_norm": 0.8178584575653076, "learning_rate": 3.489829274457617e-05, "loss": 1.5128, "mean_token_accuracy": 0.6796366363763809, "num_tokens": 270759360.0, "step": 16800 }, { "epoch": 3.895816432958628, "grad_norm": 0.8586347103118896, "learning_rate": 3.488105867390549e-05, "loss": 1.5424, "mean_token_accuracy": 0.6763916730880737, "num_tokens": 270920644.0, "step": 16810 }, { "epoch": 3.898134198632518, "grad_norm": 0.875677227973938, "learning_rate": 3.486381903639773e-05, "loss": 1.5256, "mean_token_accuracy": 0.677706615626812, "num_tokens": 271080007.0, "step": 16820 }, { "epoch": 3.9004519643064084, "grad_norm": 0.8629107475280762, "learning_rate": 3.484657384176545e-05, "loss": 1.5234, "mean_token_accuracy": 0.6785922050476074, "num_tokens": 271240365.0, "step": 16830 }, { "epoch": 3.902769729980299, "grad_norm": 0.8704508543014526, "learning_rate": 3.482932309972433e-05, "loss": 1.5236, "mean_token_accuracy": 0.6788683280348777, "num_tokens": 271401773.0, "step": 16840 }, { "epoch": 3.9050874956541892, "grad_norm": 0.907910168170929, "learning_rate": 3.481206681999317e-05, "loss": 1.5278, "mean_token_accuracy": 0.6775513157248497, "num_tokens": 271562959.0, "step": 16850 }, { "epoch": 3.90740526132808, "grad_norm": 0.910001814365387, "learning_rate": 3.4794805012293885e-05, "loss": 1.5138, "mean_token_accuracy": 0.6789161041378975, "num_tokens": 271723694.0, "step": 16860 }, { "epoch": 3.90972302700197, "grad_norm": 0.8179206848144531, "learning_rate": 3.477753768635153e-05, "loss": 1.5084, "mean_token_accuracy": 0.6800903975963593, "num_tokens": 271884402.0, "step": 16870 }, { "epoch": 3.9120407926758602, "grad_norm": 0.9281895756721497, "learning_rate": 3.4760264851894233e-05, "loss": 1.5355, "mean_token_accuracy": 0.6779312923550606, "num_tokens": 272044642.0, "step": 16880 }, { "epoch": 3.914358558349751, "grad_norm": 0.862247109413147, "learning_rate": 3.474298651865326e-05, "loss": 1.5254, "mean_token_accuracy": 0.6788492351770401, "num_tokens": 272205822.0, "step": 16890 }, { "epoch": 3.916676324023641, "grad_norm": 0.9279242157936096, "learning_rate": 3.472570269636295e-05, "loss": 1.5378, "mean_token_accuracy": 0.6744638100266457, "num_tokens": 272366877.0, "step": 16900 }, { "epoch": 3.9189940896975317, "grad_norm": 0.9347245693206787, "learning_rate": 3.470841339476075e-05, "loss": 1.5169, "mean_token_accuracy": 0.6798953831195831, "num_tokens": 272528837.0, "step": 16910 }, { "epoch": 3.921311855371422, "grad_norm": 0.8102670311927795, "learning_rate": 3.469111862358719e-05, "loss": 1.5267, "mean_token_accuracy": 0.6780772298574448, "num_tokens": 272689725.0, "step": 16920 }, { "epoch": 3.923629621045312, "grad_norm": 0.8954399824142456, "learning_rate": 3.467381839258587e-05, "loss": 1.5354, "mean_token_accuracy": 0.6765269801020622, "num_tokens": 272851053.0, "step": 16930 }, { "epoch": 3.9259473867192027, "grad_norm": 0.9394648671150208, "learning_rate": 3.465651271150347e-05, "loss": 1.517, "mean_token_accuracy": 0.6801260516047478, "num_tokens": 273012425.0, "step": 16940 }, { "epoch": 3.928265152393093, "grad_norm": 0.9181943535804749, "learning_rate": 3.463920159008976e-05, "loss": 1.5114, "mean_token_accuracy": 0.6806906923651695, "num_tokens": 273173876.0, "step": 16950 }, { "epoch": 3.9305829180669836, "grad_norm": 0.9198084473609924, "learning_rate": 3.462188503809756e-05, "loss": 1.5322, "mean_token_accuracy": 0.6767918840050697, "num_tokens": 273334793.0, "step": 16960 }, { "epoch": 3.9329006837408738, "grad_norm": 0.9581477046012878, "learning_rate": 3.4604563065282736e-05, "loss": 1.5171, "mean_token_accuracy": 0.6799351617693901, "num_tokens": 273495232.0, "step": 16970 }, { "epoch": 3.935218449414764, "grad_norm": 0.9643831253051758, "learning_rate": 3.458723568140424e-05, "loss": 1.533, "mean_token_accuracy": 0.6770270958542823, "num_tokens": 273656350.0, "step": 16980 }, { "epoch": 3.9375362150886546, "grad_norm": 0.8685271143913269, "learning_rate": 3.4569902896224035e-05, "loss": 1.5335, "mean_token_accuracy": 0.6772306680679321, "num_tokens": 273817221.0, "step": 16990 }, { "epoch": 3.939853980762545, "grad_norm": 0.8675970435142517, "learning_rate": 3.455256471950718e-05, "loss": 1.5181, "mean_token_accuracy": 0.6793868660926818, "num_tokens": 273978505.0, "step": 17000 }, { "epoch": 3.9421717464364354, "grad_norm": 0.9116122126579285, "learning_rate": 3.45352211610217e-05, "loss": 1.5137, "mean_token_accuracy": 0.6785152509808541, "num_tokens": 274140392.0, "step": 17010 }, { "epoch": 3.9444895121103256, "grad_norm": 0.891683042049408, "learning_rate": 3.45178722305387e-05, "loss": 1.5281, "mean_token_accuracy": 0.6776726350188256, "num_tokens": 274302247.0, "step": 17020 }, { "epoch": 3.946807277784216, "grad_norm": 0.8353005051612854, "learning_rate": 3.450051793783232e-05, "loss": 1.5197, "mean_token_accuracy": 0.6785433158278465, "num_tokens": 274463544.0, "step": 17030 }, { "epoch": 3.9491250434581064, "grad_norm": 0.9048455953598022, "learning_rate": 3.448315829267968e-05, "loss": 1.5205, "mean_token_accuracy": 0.6805326193571091, "num_tokens": 274624637.0, "step": 17040 }, { "epoch": 3.9514428091319966, "grad_norm": 0.9010435938835144, "learning_rate": 3.4465793304860946e-05, "loss": 1.5336, "mean_token_accuracy": 0.6785528510808945, "num_tokens": 274786513.0, "step": 17050 }, { "epoch": 3.9537605748058873, "grad_norm": 0.9008451104164124, "learning_rate": 3.4448422984159295e-05, "loss": 1.5139, "mean_token_accuracy": 0.6800987377762795, "num_tokens": 274947791.0, "step": 17060 }, { "epoch": 3.9560783404797775, "grad_norm": 0.8774337768554688, "learning_rate": 3.44310473403609e-05, "loss": 1.5393, "mean_token_accuracy": 0.6757575333118438, "num_tokens": 275109385.0, "step": 17070 }, { "epoch": 3.9583961061536677, "grad_norm": 0.8892171382904053, "learning_rate": 3.4413666383254917e-05, "loss": 1.5358, "mean_token_accuracy": 0.6763322100043296, "num_tokens": 275270425.0, "step": 17080 }, { "epoch": 3.9607138718275583, "grad_norm": 0.8813824653625488, "learning_rate": 3.439628012263352e-05, "loss": 1.5172, "mean_token_accuracy": 0.678817230463028, "num_tokens": 275432100.0, "step": 17090 }, { "epoch": 3.9630316375014485, "grad_norm": 0.8738539814949036, "learning_rate": 3.437888856829186e-05, "loss": 1.5233, "mean_token_accuracy": 0.6784338906407357, "num_tokens": 275592188.0, "step": 17100 }, { "epoch": 3.965349403175339, "grad_norm": 0.8690264821052551, "learning_rate": 3.4361491730028066e-05, "loss": 1.523, "mean_token_accuracy": 0.6793935060501098, "num_tokens": 275753672.0, "step": 17110 }, { "epoch": 3.9676671688492293, "grad_norm": 0.8732134699821472, "learning_rate": 3.4344089617643264e-05, "loss": 1.5283, "mean_token_accuracy": 0.6793669909238815, "num_tokens": 275914435.0, "step": 17120 }, { "epoch": 3.9699849345231195, "grad_norm": 0.8943775296211243, "learning_rate": 3.432668224094153e-05, "loss": 1.5362, "mean_token_accuracy": 0.6774834275245667, "num_tokens": 276075076.0, "step": 17130 }, { "epoch": 3.97230270019701, "grad_norm": 0.8630611896514893, "learning_rate": 3.430926960972991e-05, "loss": 1.5304, "mean_token_accuracy": 0.6775911048054695, "num_tokens": 276236870.0, "step": 17140 }, { "epoch": 3.9746204658709003, "grad_norm": 0.7839426398277283, "learning_rate": 3.429185173381843e-05, "loss": 1.5271, "mean_token_accuracy": 0.6776840999722481, "num_tokens": 276398106.0, "step": 17150 }, { "epoch": 3.976938231544791, "grad_norm": 0.9176467061042786, "learning_rate": 3.4274428623020036e-05, "loss": 1.5153, "mean_token_accuracy": 0.6789790093898773, "num_tokens": 276559926.0, "step": 17160 }, { "epoch": 3.979255997218681, "grad_norm": 0.8856660723686218, "learning_rate": 3.425700028715065e-05, "loss": 1.5177, "mean_token_accuracy": 0.6805971220135689, "num_tokens": 276720850.0, "step": 17170 }, { "epoch": 3.9815737628925714, "grad_norm": 0.9414939284324646, "learning_rate": 3.423956673602912e-05, "loss": 1.5196, "mean_token_accuracy": 0.6787698283791542, "num_tokens": 276881745.0, "step": 17180 }, { "epoch": 3.983891528566462, "grad_norm": 0.9458112716674805, "learning_rate": 3.422212797947726e-05, "loss": 1.5379, "mean_token_accuracy": 0.6759011149406433, "num_tokens": 277043028.0, "step": 17190 }, { "epoch": 3.986209294240352, "grad_norm": 0.904479444026947, "learning_rate": 3.420468402731978e-05, "loss": 1.5044, "mean_token_accuracy": 0.6799262523651123, "num_tokens": 277202950.0, "step": 17200 }, { "epoch": 3.988527059914243, "grad_norm": 0.9638469815254211, "learning_rate": 3.418723488938434e-05, "loss": 1.5267, "mean_token_accuracy": 0.6803351417183876, "num_tokens": 277363712.0, "step": 17210 }, { "epoch": 3.990844825588133, "grad_norm": 0.8499724268913269, "learning_rate": 3.4169780575501525e-05, "loss": 1.5298, "mean_token_accuracy": 0.6776153832674027, "num_tokens": 277525259.0, "step": 17220 }, { "epoch": 3.993162591262023, "grad_norm": 0.941257894039154, "learning_rate": 3.415232109550482e-05, "loss": 1.5289, "mean_token_accuracy": 0.6777648225426673, "num_tokens": 277685404.0, "step": 17230 }, { "epoch": 3.995480356935914, "grad_norm": 0.9073279500007629, "learning_rate": 3.413485645923063e-05, "loss": 1.5249, "mean_token_accuracy": 0.6796072244644165, "num_tokens": 277846237.0, "step": 17240 }, { "epoch": 3.997798122609804, "grad_norm": 0.9025278687477112, "learning_rate": 3.411738667651827e-05, "loss": 1.5214, "mean_token_accuracy": 0.6801501229405403, "num_tokens": 278007916.0, "step": 17250 }, { "epoch": 4.0, "grad_norm": 1.2977581024169922, "learning_rate": 3.409991175720994e-05, "loss": 1.5073, "mean_token_accuracy": 0.6804688698367068, "num_tokens": 278159016.0, "step": 17260 }, { "epoch": 4.00231776567389, "grad_norm": 0.895316481590271, "learning_rate": 3.4082431711150745e-05, "loss": 1.4887, "mean_token_accuracy": 0.6829938679933548, "num_tokens": 278319695.0, "step": 17270 }, { "epoch": 4.00463553134778, "grad_norm": 0.9156033396720886, "learning_rate": 3.4064946548188684e-05, "loss": 1.5039, "mean_token_accuracy": 0.681817515194416, "num_tokens": 278480400.0, "step": 17280 }, { "epoch": 4.0069532970216715, "grad_norm": 0.9405618906021118, "learning_rate": 3.404745627817461e-05, "loss": 1.504, "mean_token_accuracy": 0.6808154538273812, "num_tokens": 278641819.0, "step": 17290 }, { "epoch": 4.009271062695562, "grad_norm": 0.9241898655891418, "learning_rate": 3.402996091096228e-05, "loss": 1.5004, "mean_token_accuracy": 0.6807022884488105, "num_tokens": 278802933.0, "step": 17300 }, { "epoch": 4.011588828369452, "grad_norm": 0.8941563963890076, "learning_rate": 3.4012460456408326e-05, "loss": 1.5226, "mean_token_accuracy": 0.6802664697170258, "num_tokens": 278964445.0, "step": 17310 }, { "epoch": 4.013906594043342, "grad_norm": 0.9095544815063477, "learning_rate": 3.399495492437222e-05, "loss": 1.5135, "mean_token_accuracy": 0.6803801536560059, "num_tokens": 279125860.0, "step": 17320 }, { "epoch": 4.016224359717232, "grad_norm": 0.8617888689041138, "learning_rate": 3.397744432471633e-05, "loss": 1.5019, "mean_token_accuracy": 0.6823529690504074, "num_tokens": 279286979.0, "step": 17330 }, { "epoch": 4.018542125391123, "grad_norm": 0.8776959180831909, "learning_rate": 3.3959928667305854e-05, "loss": 1.5228, "mean_token_accuracy": 0.6783899873495102, "num_tokens": 279448430.0, "step": 17340 }, { "epoch": 4.0208598910650135, "grad_norm": 0.9249606728553772, "learning_rate": 3.394240796200883e-05, "loss": 1.5084, "mean_token_accuracy": 0.6806844666600227, "num_tokens": 279610025.0, "step": 17350 }, { "epoch": 4.023177656738904, "grad_norm": 0.9863234162330627, "learning_rate": 3.392488221869617e-05, "loss": 1.5092, "mean_token_accuracy": 0.6816634491086007, "num_tokens": 279771458.0, "step": 17360 }, { "epoch": 4.025495422412794, "grad_norm": 0.932774007320404, "learning_rate": 3.3907351447241595e-05, "loss": 1.5093, "mean_token_accuracy": 0.6811798080801964, "num_tokens": 279931446.0, "step": 17370 }, { "epoch": 4.027813188086684, "grad_norm": 0.9582553505897522, "learning_rate": 3.3889815657521695e-05, "loss": 1.5092, "mean_token_accuracy": 0.6797087490558624, "num_tokens": 280092650.0, "step": 17380 }, { "epoch": 4.030130953760575, "grad_norm": 0.8730135560035706, "learning_rate": 3.387227485941585e-05, "loss": 1.5239, "mean_token_accuracy": 0.6786570370197296, "num_tokens": 280253384.0, "step": 17390 }, { "epoch": 4.032448719434465, "grad_norm": 0.9316835999488831, "learning_rate": 3.3854729062806286e-05, "loss": 1.4922, "mean_token_accuracy": 0.6815114259719849, "num_tokens": 280413734.0, "step": 17400 }, { "epoch": 4.034766485108356, "grad_norm": 0.8925336003303528, "learning_rate": 3.383717827757802e-05, "loss": 1.4858, "mean_token_accuracy": 0.6839555636048317, "num_tokens": 280575384.0, "step": 17410 }, { "epoch": 4.037084250782246, "grad_norm": 0.8553624749183655, "learning_rate": 3.381962251361891e-05, "loss": 1.5044, "mean_token_accuracy": 0.6817428082227707, "num_tokens": 280736765.0, "step": 17420 }, { "epoch": 4.039402016456136, "grad_norm": 0.8456765413284302, "learning_rate": 3.38020617808196e-05, "loss": 1.5112, "mean_token_accuracy": 0.6794286131858825, "num_tokens": 280898379.0, "step": 17430 }, { "epoch": 4.041719782130027, "grad_norm": 0.8878294825553894, "learning_rate": 3.3784496089073535e-05, "loss": 1.4833, "mean_token_accuracy": 0.6838394850492477, "num_tokens": 281060225.0, "step": 17440 }, { "epoch": 4.044037547803917, "grad_norm": 0.8854380249977112, "learning_rate": 3.3766925448276966e-05, "loss": 1.5142, "mean_token_accuracy": 0.6799571007490158, "num_tokens": 281220998.0, "step": 17450 }, { "epoch": 4.046355313477807, "grad_norm": 0.9323158860206604, "learning_rate": 3.3749349868328914e-05, "loss": 1.5121, "mean_token_accuracy": 0.6801918670535088, "num_tokens": 281382858.0, "step": 17460 }, { "epoch": 4.048673079151698, "grad_norm": 0.9422255158424377, "learning_rate": 3.373176935913119e-05, "loss": 1.508, "mean_token_accuracy": 0.6791133746504784, "num_tokens": 281544110.0, "step": 17470 }, { "epoch": 4.050990844825588, "grad_norm": 0.8785816431045532, "learning_rate": 3.3714183930588394e-05, "loss": 1.5126, "mean_token_accuracy": 0.6798670917749405, "num_tokens": 281706012.0, "step": 17480 }, { "epoch": 4.053308610499479, "grad_norm": 0.9845339059829712, "learning_rate": 3.369659359260788e-05, "loss": 1.4893, "mean_token_accuracy": 0.6836710691452026, "num_tokens": 281867174.0, "step": 17490 }, { "epoch": 4.055626376173369, "grad_norm": 0.9200783967971802, "learning_rate": 3.367899835509977e-05, "loss": 1.497, "mean_token_accuracy": 0.6809270039200783, "num_tokens": 282028655.0, "step": 17500 }, { "epoch": 4.057944141847259, "grad_norm": 0.9385764002799988, "learning_rate": 3.3661398227976957e-05, "loss": 1.5228, "mean_token_accuracy": 0.677911302447319, "num_tokens": 282189348.0, "step": 17510 }, { "epoch": 4.0602619075211495, "grad_norm": 0.8988301157951355, "learning_rate": 3.3643793221155084e-05, "loss": 1.499, "mean_token_accuracy": 0.6816903650760651, "num_tokens": 282350202.0, "step": 17520 }, { "epoch": 4.06257967319504, "grad_norm": 0.9137440919876099, "learning_rate": 3.362618334455254e-05, "loss": 1.4991, "mean_token_accuracy": 0.6807905450463295, "num_tokens": 282511840.0, "step": 17530 }, { "epoch": 4.064897438868931, "grad_norm": 0.9156241416931152, "learning_rate": 3.3608568608090456e-05, "loss": 1.5101, "mean_token_accuracy": 0.6795399576425553, "num_tokens": 282671479.0, "step": 17540 }, { "epoch": 4.067215204542821, "grad_norm": 0.9593009948730469, "learning_rate": 3.3590949021692715e-05, "loss": 1.5133, "mean_token_accuracy": 0.6810472637414933, "num_tokens": 282832820.0, "step": 17550 }, { "epoch": 4.069532970216711, "grad_norm": 0.9052536487579346, "learning_rate": 3.357332459528592e-05, "loss": 1.5054, "mean_token_accuracy": 0.6811117187142373, "num_tokens": 282994323.0, "step": 17560 }, { "epoch": 4.071850735890601, "grad_norm": 0.9172275066375732, "learning_rate": 3.355569533879939e-05, "loss": 1.5046, "mean_token_accuracy": 0.6797961816191673, "num_tokens": 283156203.0, "step": 17570 }, { "epoch": 4.0741685015644915, "grad_norm": 0.9353176951408386, "learning_rate": 3.353806126216519e-05, "loss": 1.5109, "mean_token_accuracy": 0.6809700295329094, "num_tokens": 283317843.0, "step": 17580 }, { "epoch": 4.076486267238383, "grad_norm": 0.9744966626167297, "learning_rate": 3.352042237531808e-05, "loss": 1.5059, "mean_token_accuracy": 0.6811608627438546, "num_tokens": 283479171.0, "step": 17590 }, { "epoch": 4.078804032912273, "grad_norm": 0.8509780764579773, "learning_rate": 3.3502778688195554e-05, "loss": 1.4945, "mean_token_accuracy": 0.6829553291201591, "num_tokens": 283639872.0, "step": 17600 }, { "epoch": 4.081121798586163, "grad_norm": 0.9047322273254395, "learning_rate": 3.3485130210737783e-05, "loss": 1.4878, "mean_token_accuracy": 0.6827815636992455, "num_tokens": 283801647.0, "step": 17610 }, { "epoch": 4.083439564260053, "grad_norm": 0.9292351007461548, "learning_rate": 3.346747695288764e-05, "loss": 1.5065, "mean_token_accuracy": 0.679702989757061, "num_tokens": 283963346.0, "step": 17620 }, { "epoch": 4.085757329933943, "grad_norm": 0.8997600078582764, "learning_rate": 3.344981892459072e-05, "loss": 1.5115, "mean_token_accuracy": 0.6815939962863922, "num_tokens": 284123619.0, "step": 17630 }, { "epoch": 4.088075095607834, "grad_norm": 0.8691164255142212, "learning_rate": 3.343215613579527e-05, "loss": 1.4979, "mean_token_accuracy": 0.6816432654857636, "num_tokens": 284284288.0, "step": 17640 }, { "epoch": 4.090392861281725, "grad_norm": 0.9277439117431641, "learning_rate": 3.341448859645225e-05, "loss": 1.5145, "mean_token_accuracy": 0.6801040068268775, "num_tokens": 284445707.0, "step": 17650 }, { "epoch": 4.092710626955615, "grad_norm": 0.9171796441078186, "learning_rate": 3.339681631651527e-05, "loss": 1.508, "mean_token_accuracy": 0.6804759442806244, "num_tokens": 284607573.0, "step": 17660 }, { "epoch": 4.095028392629505, "grad_norm": 0.8639628291130066, "learning_rate": 3.337913930594062e-05, "loss": 1.5296, "mean_token_accuracy": 0.6768782913684845, "num_tokens": 284768333.0, "step": 17670 }, { "epoch": 4.097346158303395, "grad_norm": 0.9251660108566284, "learning_rate": 3.336145757468726e-05, "loss": 1.517, "mean_token_accuracy": 0.6812085568904876, "num_tokens": 284930418.0, "step": 17680 }, { "epoch": 4.099663923977286, "grad_norm": 0.9557574987411499, "learning_rate": 3.3343771132716807e-05, "loss": 1.5168, "mean_token_accuracy": 0.6785392671823501, "num_tokens": 285090206.0, "step": 17690 }, { "epoch": 4.1019816896511765, "grad_norm": 0.9405009150505066, "learning_rate": 3.332607998999354e-05, "loss": 1.4837, "mean_token_accuracy": 0.6828371688723565, "num_tokens": 285251446.0, "step": 17700 }, { "epoch": 4.104299455325067, "grad_norm": 0.9019715785980225, "learning_rate": 3.3308384156484354e-05, "loss": 1.516, "mean_token_accuracy": 0.6786905199289321, "num_tokens": 285413343.0, "step": 17710 }, { "epoch": 4.106617220998957, "grad_norm": 0.88797926902771, "learning_rate": 3.3290683642158835e-05, "loss": 1.5054, "mean_token_accuracy": 0.6802316248416901, "num_tokens": 285574089.0, "step": 17720 }, { "epoch": 4.108934986672847, "grad_norm": 0.9312342405319214, "learning_rate": 3.3272978456989174e-05, "loss": 1.4978, "mean_token_accuracy": 0.6801338240504264, "num_tokens": 285734818.0, "step": 17730 }, { "epoch": 4.111252752346738, "grad_norm": 0.9220024943351746, "learning_rate": 3.325526861095019e-05, "loss": 1.4974, "mean_token_accuracy": 0.6820776671171188, "num_tokens": 285894240.0, "step": 17740 }, { "epoch": 4.113570518020628, "grad_norm": 0.9310296773910522, "learning_rate": 3.3237554114019365e-05, "loss": 1.4998, "mean_token_accuracy": 0.6821406930685043, "num_tokens": 286056246.0, "step": 17750 }, { "epoch": 4.1158882836945185, "grad_norm": 0.9194704294204712, "learning_rate": 3.321983497617675e-05, "loss": 1.5212, "mean_token_accuracy": 0.6786731570959091, "num_tokens": 286216852.0, "step": 17760 }, { "epoch": 4.118206049368409, "grad_norm": 0.950374960899353, "learning_rate": 3.320211120740504e-05, "loss": 1.4901, "mean_token_accuracy": 0.6828594177961349, "num_tokens": 286378598.0, "step": 17770 }, { "epoch": 4.120523815042299, "grad_norm": 0.9402995109558105, "learning_rate": 3.318438281768955e-05, "loss": 1.5015, "mean_token_accuracy": 0.6816441118717194, "num_tokens": 286540238.0, "step": 17780 }, { "epoch": 4.12284158071619, "grad_norm": 0.9054789543151855, "learning_rate": 3.316664981701816e-05, "loss": 1.5174, "mean_token_accuracy": 0.6782744318246842, "num_tokens": 286699270.0, "step": 17790 }, { "epoch": 4.12515934639008, "grad_norm": 0.8809850811958313, "learning_rate": 3.3148912215381386e-05, "loss": 1.5102, "mean_token_accuracy": 0.6793925032019615, "num_tokens": 286860143.0, "step": 17800 }, { "epoch": 4.12747711206397, "grad_norm": 0.9413992762565613, "learning_rate": 3.313117002277232e-05, "loss": 1.5236, "mean_token_accuracy": 0.6790948584675789, "num_tokens": 287020901.0, "step": 17810 }, { "epoch": 4.129794877737861, "grad_norm": 0.8953555822372437, "learning_rate": 3.311342324918664e-05, "loss": 1.5198, "mean_token_accuracy": 0.6792201384902, "num_tokens": 287182248.0, "step": 17820 }, { "epoch": 4.132112643411751, "grad_norm": 0.9200738072395325, "learning_rate": 3.309567190462261e-05, "loss": 1.4972, "mean_token_accuracy": 0.6828036576509475, "num_tokens": 287342977.0, "step": 17830 }, { "epoch": 4.134430409085642, "grad_norm": 0.9877822995185852, "learning_rate": 3.307791599908106e-05, "loss": 1.5156, "mean_token_accuracy": 0.6805880084633827, "num_tokens": 287502867.0, "step": 17840 }, { "epoch": 4.136748174759532, "grad_norm": 0.923491895198822, "learning_rate": 3.30601555425654e-05, "loss": 1.5205, "mean_token_accuracy": 0.6789566293358803, "num_tokens": 287664983.0, "step": 17850 }, { "epoch": 4.139065940433422, "grad_norm": 0.9112950563430786, "learning_rate": 3.304239054508159e-05, "loss": 1.495, "mean_token_accuracy": 0.6809851408004761, "num_tokens": 287825646.0, "step": 17860 }, { "epoch": 4.141383706107312, "grad_norm": 0.879970371723175, "learning_rate": 3.3024621016638166e-05, "loss": 1.4971, "mean_token_accuracy": 0.6833518251776696, "num_tokens": 287986039.0, "step": 17870 }, { "epoch": 4.143701471781203, "grad_norm": 0.9784168601036072, "learning_rate": 3.30068469672462e-05, "loss": 1.5111, "mean_token_accuracy": 0.6800521090626717, "num_tokens": 288146103.0, "step": 17880 }, { "epoch": 4.146019237455094, "grad_norm": 0.895348846912384, "learning_rate": 3.298906840691933e-05, "loss": 1.5161, "mean_token_accuracy": 0.6805493503808975, "num_tokens": 288308067.0, "step": 17890 }, { "epoch": 4.148337003128984, "grad_norm": 0.9180073738098145, "learning_rate": 3.297128534567371e-05, "loss": 1.4953, "mean_token_accuracy": 0.6807682037353515, "num_tokens": 288468796.0, "step": 17900 }, { "epoch": 4.150654768802874, "grad_norm": 0.9174934029579163, "learning_rate": 3.2953497793528065e-05, "loss": 1.5206, "mean_token_accuracy": 0.677942767739296, "num_tokens": 288630301.0, "step": 17910 }, { "epoch": 4.152972534476764, "grad_norm": 0.8529210686683655, "learning_rate": 3.293570576050361e-05, "loss": 1.4761, "mean_token_accuracy": 0.6841894865036011, "num_tokens": 288790884.0, "step": 17920 }, { "epoch": 4.1552903001506545, "grad_norm": 0.8684077858924866, "learning_rate": 3.291790925662412e-05, "loss": 1.508, "mean_token_accuracy": 0.6788268148899078, "num_tokens": 288952362.0, "step": 17930 }, { "epoch": 4.157608065824546, "grad_norm": 0.9733067750930786, "learning_rate": 3.2900108291915854e-05, "loss": 1.5035, "mean_token_accuracy": 0.6806637242436409, "num_tokens": 289113836.0, "step": 17940 }, { "epoch": 4.159925831498436, "grad_norm": 0.9270621538162231, "learning_rate": 3.2882302876407624e-05, "loss": 1.5171, "mean_token_accuracy": 0.6793353453278541, "num_tokens": 289274789.0, "step": 17950 }, { "epoch": 4.162243597172326, "grad_norm": 0.8993983268737793, "learning_rate": 3.2864493020130705e-05, "loss": 1.512, "mean_token_accuracy": 0.679371963441372, "num_tokens": 289436738.0, "step": 17960 }, { "epoch": 4.164561362846216, "grad_norm": 0.8783894181251526, "learning_rate": 3.284667873311891e-05, "loss": 1.5066, "mean_token_accuracy": 0.6780409097671509, "num_tokens": 289597565.0, "step": 17970 }, { "epoch": 4.166879128520106, "grad_norm": 0.9728587865829468, "learning_rate": 3.282886002540854e-05, "loss": 1.5191, "mean_token_accuracy": 0.6789194822311402, "num_tokens": 289759322.0, "step": 17980 }, { "epoch": 4.169196894193997, "grad_norm": 0.925588846206665, "learning_rate": 3.281103690703838e-05, "loss": 1.5019, "mean_token_accuracy": 0.6794223204255104, "num_tokens": 289921302.0, "step": 17990 }, { "epoch": 4.171514659867888, "grad_norm": 0.9237321019172668, "learning_rate": 3.279320938804969e-05, "loss": 1.5004, "mean_token_accuracy": 0.6799224451184273, "num_tokens": 290082355.0, "step": 18000 }, { "epoch": 4.173832425541778, "grad_norm": 0.888080894947052, "learning_rate": 3.277537747848624e-05, "loss": 1.5222, "mean_token_accuracy": 0.6770095616579056, "num_tokens": 290244157.0, "step": 18010 }, { "epoch": 4.176150191215668, "grad_norm": 0.9250674247741699, "learning_rate": 3.2757541188394234e-05, "loss": 1.5066, "mean_token_accuracy": 0.6814251810312271, "num_tokens": 290405662.0, "step": 18020 }, { "epoch": 4.178467956889558, "grad_norm": 0.9231549501419067, "learning_rate": 3.2739700527822375e-05, "loss": 1.5007, "mean_token_accuracy": 0.6810980767011643, "num_tokens": 290566381.0, "step": 18030 }, { "epoch": 4.180785722563449, "grad_norm": 0.8801710605621338, "learning_rate": 3.272185550682182e-05, "loss": 1.4999, "mean_token_accuracy": 0.6817323833703994, "num_tokens": 290724219.0, "step": 18040 }, { "epoch": 4.1831034882373395, "grad_norm": 0.9433039426803589, "learning_rate": 3.270400613544617e-05, "loss": 1.5173, "mean_token_accuracy": 0.6783183336257934, "num_tokens": 290885134.0, "step": 18050 }, { "epoch": 4.18542125391123, "grad_norm": 0.9793021082878113, "learning_rate": 3.268615242375151e-05, "loss": 1.5147, "mean_token_accuracy": 0.6806310534477233, "num_tokens": 291046975.0, "step": 18060 }, { "epoch": 4.18773901958512, "grad_norm": 0.9792535901069641, "learning_rate": 3.266829438179633e-05, "loss": 1.5235, "mean_token_accuracy": 0.67764702886343, "num_tokens": 291208171.0, "step": 18070 }, { "epoch": 4.19005678525901, "grad_norm": 0.9563701748847961, "learning_rate": 3.265043201964157e-05, "loss": 1.5157, "mean_token_accuracy": 0.6782612264156341, "num_tokens": 291369901.0, "step": 18080 }, { "epoch": 4.1923745509329, "grad_norm": 0.8917310237884521, "learning_rate": 3.263256534735063e-05, "loss": 1.5216, "mean_token_accuracy": 0.6777584329247475, "num_tokens": 291531228.0, "step": 18090 }, { "epoch": 4.194692316606791, "grad_norm": 0.8829416036605835, "learning_rate": 3.261469437498931e-05, "loss": 1.5158, "mean_token_accuracy": 0.6800570115447044, "num_tokens": 291690820.0, "step": 18100 }, { "epoch": 4.1970100822806815, "grad_norm": 0.9026559591293335, "learning_rate": 3.259681911262584e-05, "loss": 1.4766, "mean_token_accuracy": 0.6843177244067192, "num_tokens": 291851946.0, "step": 18110 }, { "epoch": 4.199327847954572, "grad_norm": 0.9292559027671814, "learning_rate": 3.257893957033087e-05, "loss": 1.5149, "mean_token_accuracy": 0.6814153820276261, "num_tokens": 292011293.0, "step": 18120 }, { "epoch": 4.201645613628462, "grad_norm": 1.0055376291275024, "learning_rate": 3.256105575817747e-05, "loss": 1.5111, "mean_token_accuracy": 0.6795411348342896, "num_tokens": 292172943.0, "step": 18130 }, { "epoch": 4.203963379302353, "grad_norm": 0.9109935164451599, "learning_rate": 3.2543167686241096e-05, "loss": 1.5015, "mean_token_accuracy": 0.6796091392636299, "num_tokens": 292334536.0, "step": 18140 }, { "epoch": 4.206281144976243, "grad_norm": 0.9161977767944336, "learning_rate": 3.2525275364599616e-05, "loss": 1.5189, "mean_token_accuracy": 0.6794295161962509, "num_tokens": 292495872.0, "step": 18150 }, { "epoch": 4.208598910650133, "grad_norm": 0.9497368335723877, "learning_rate": 3.2507378803333284e-05, "loss": 1.4943, "mean_token_accuracy": 0.6816663831472397, "num_tokens": 292656317.0, "step": 18160 }, { "epoch": 4.2109166763240236, "grad_norm": 0.8854226469993591, "learning_rate": 3.248947801252475e-05, "loss": 1.5056, "mean_token_accuracy": 0.6810885205864906, "num_tokens": 292818171.0, "step": 18170 }, { "epoch": 4.213234441997914, "grad_norm": 0.8619081377983093, "learning_rate": 3.247157300225906e-05, "loss": 1.5122, "mean_token_accuracy": 0.6802055463194847, "num_tokens": 292978985.0, "step": 18180 }, { "epoch": 4.215552207671804, "grad_norm": 0.9473296403884888, "learning_rate": 3.24536637826236e-05, "loss": 1.5074, "mean_token_accuracy": 0.6815511792898178, "num_tokens": 293140173.0, "step": 18190 }, { "epoch": 4.217869973345695, "grad_norm": 0.8860408067703247, "learning_rate": 3.243575036370817e-05, "loss": 1.512, "mean_token_accuracy": 0.680044473707676, "num_tokens": 293301627.0, "step": 18200 }, { "epoch": 4.220187739019585, "grad_norm": 0.9188624620437622, "learning_rate": 3.24178327556049e-05, "loss": 1.4878, "mean_token_accuracy": 0.6835634753108024, "num_tokens": 293463488.0, "step": 18210 }, { "epoch": 4.222505504693475, "grad_norm": 0.8855107426643372, "learning_rate": 3.239991096840831e-05, "loss": 1.4949, "mean_token_accuracy": 0.6823010966181755, "num_tokens": 293625283.0, "step": 18220 }, { "epoch": 4.224823270367366, "grad_norm": 0.90816330909729, "learning_rate": 3.2381985012215254e-05, "loss": 1.5125, "mean_token_accuracy": 0.6804756134748459, "num_tokens": 293786697.0, "step": 18230 }, { "epoch": 4.227141036041257, "grad_norm": 0.941677451133728, "learning_rate": 3.2364054897124936e-05, "loss": 1.5092, "mean_token_accuracy": 0.6793545395135879, "num_tokens": 293948087.0, "step": 18240 }, { "epoch": 4.229458801715147, "grad_norm": 0.9579547047615051, "learning_rate": 3.234612063323891e-05, "loss": 1.5147, "mean_token_accuracy": 0.6786521971225739, "num_tokens": 294110524.0, "step": 18250 }, { "epoch": 4.231776567389037, "grad_norm": 0.913398802280426, "learning_rate": 3.2328182230661075e-05, "loss": 1.5022, "mean_token_accuracy": 0.6811669945716858, "num_tokens": 294271285.0, "step": 18260 }, { "epoch": 4.234094333062927, "grad_norm": 0.9712862372398376, "learning_rate": 3.2310239699497644e-05, "loss": 1.5071, "mean_token_accuracy": 0.6815074130892753, "num_tokens": 294431910.0, "step": 18270 }, { "epoch": 4.2364120987368175, "grad_norm": 0.9134795665740967, "learning_rate": 3.229229304985716e-05, "loss": 1.4898, "mean_token_accuracy": 0.6816569343209267, "num_tokens": 294593390.0, "step": 18280 }, { "epoch": 4.238729864410708, "grad_norm": 0.9694876670837402, "learning_rate": 3.22743422918505e-05, "loss": 1.4938, "mean_token_accuracy": 0.680934140086174, "num_tokens": 294754172.0, "step": 18290 }, { "epoch": 4.241047630084599, "grad_norm": 0.8977597951889038, "learning_rate": 3.225638743559084e-05, "loss": 1.5005, "mean_token_accuracy": 0.6809873074293137, "num_tokens": 294915885.0, "step": 18300 }, { "epoch": 4.243365395758489, "grad_norm": 0.9071974754333496, "learning_rate": 3.223842849119368e-05, "loss": 1.5155, "mean_token_accuracy": 0.6807349935173989, "num_tokens": 295076773.0, "step": 18310 }, { "epoch": 4.245683161432379, "grad_norm": 0.9016053080558777, "learning_rate": 3.2220465468776804e-05, "loss": 1.4942, "mean_token_accuracy": 0.6817868605256081, "num_tokens": 295237792.0, "step": 18320 }, { "epoch": 4.248000927106269, "grad_norm": 0.8672078847885132, "learning_rate": 3.220249837846031e-05, "loss": 1.5078, "mean_token_accuracy": 0.6805759146809578, "num_tokens": 295399459.0, "step": 18330 }, { "epoch": 4.25031869278016, "grad_norm": 0.8651717901229858, "learning_rate": 3.2184527230366565e-05, "loss": 1.519, "mean_token_accuracy": 0.6774242863059043, "num_tokens": 295561359.0, "step": 18340 }, { "epoch": 4.252636458454051, "grad_norm": 0.9682283997535706, "learning_rate": 3.216655203462026e-05, "loss": 1.5214, "mean_token_accuracy": 0.679014852643013, "num_tokens": 295721570.0, "step": 18350 }, { "epoch": 4.254954224127941, "grad_norm": 0.8874722123146057, "learning_rate": 3.2148572801348336e-05, "loss": 1.5074, "mean_token_accuracy": 0.6788532555103302, "num_tokens": 295882462.0, "step": 18360 }, { "epoch": 4.257271989801831, "grad_norm": 0.9879566431045532, "learning_rate": 3.213058954068002e-05, "loss": 1.5188, "mean_token_accuracy": 0.6786593198776245, "num_tokens": 296043295.0, "step": 18370 }, { "epoch": 4.259589755475721, "grad_norm": 0.9049651026725769, "learning_rate": 3.21126022627468e-05, "loss": 1.5062, "mean_token_accuracy": 0.6797067284584045, "num_tokens": 296204578.0, "step": 18380 }, { "epoch": 4.261907521149611, "grad_norm": 0.9558441042900085, "learning_rate": 3.2094610977682436e-05, "loss": 1.5233, "mean_token_accuracy": 0.6771765783429146, "num_tokens": 296365552.0, "step": 18390 }, { "epoch": 4.264225286823502, "grad_norm": 0.9032666087150574, "learning_rate": 3.2076615695622947e-05, "loss": 1.5089, "mean_token_accuracy": 0.6793503001332283, "num_tokens": 296526733.0, "step": 18400 }, { "epoch": 4.266543052497393, "grad_norm": 0.8725643754005432, "learning_rate": 3.205861642670659e-05, "loss": 1.5068, "mean_token_accuracy": 0.680477574467659, "num_tokens": 296688243.0, "step": 18410 }, { "epoch": 4.268860818171283, "grad_norm": 0.8968142867088318, "learning_rate": 3.204061318107389e-05, "loss": 1.5126, "mean_token_accuracy": 0.6798191696405411, "num_tokens": 296849659.0, "step": 18420 }, { "epoch": 4.271178583845173, "grad_norm": 0.918083906173706, "learning_rate": 3.202260596886758e-05, "loss": 1.4956, "mean_token_accuracy": 0.6815510928630829, "num_tokens": 297011539.0, "step": 18430 }, { "epoch": 4.273496349519063, "grad_norm": 0.9525334239006042, "learning_rate": 3.200459480023267e-05, "loss": 1.5157, "mean_token_accuracy": 0.6789452940225601, "num_tokens": 297172187.0, "step": 18440 }, { "epoch": 4.275814115192954, "grad_norm": 0.9031352400779724, "learning_rate": 3.198657968531636e-05, "loss": 1.5009, "mean_token_accuracy": 0.6819437354803085, "num_tokens": 297333522.0, "step": 18450 }, { "epoch": 4.2781318808668445, "grad_norm": 0.9675595164299011, "learning_rate": 3.19685606342681e-05, "loss": 1.5061, "mean_token_accuracy": 0.6809906989336014, "num_tokens": 297494474.0, "step": 18460 }, { "epoch": 4.280449646540735, "grad_norm": 0.9063284397125244, "learning_rate": 3.1950537657239535e-05, "loss": 1.5161, "mean_token_accuracy": 0.6774724379181862, "num_tokens": 297654926.0, "step": 18470 }, { "epoch": 4.282767412214625, "grad_norm": 0.8982083201408386, "learning_rate": 3.193251076438455e-05, "loss": 1.5134, "mean_token_accuracy": 0.6792122334241867, "num_tokens": 297816114.0, "step": 18480 }, { "epoch": 4.285085177888515, "grad_norm": 0.921422004699707, "learning_rate": 3.191447996585921e-05, "loss": 1.4854, "mean_token_accuracy": 0.6822737038135529, "num_tokens": 297977824.0, "step": 18490 }, { "epoch": 4.287402943562406, "grad_norm": 0.8824591636657715, "learning_rate": 3.1896445271821795e-05, "loss": 1.4897, "mean_token_accuracy": 0.6834971696138382, "num_tokens": 298138013.0, "step": 18500 }, { "epoch": 4.289720709236296, "grad_norm": 0.9498914480209351, "learning_rate": 3.187840669243277e-05, "loss": 1.5155, "mean_token_accuracy": 0.6793612107634545, "num_tokens": 298299096.0, "step": 18510 }, { "epoch": 4.2920384749101865, "grad_norm": 0.9126007556915283, "learning_rate": 3.18603642378548e-05, "loss": 1.5176, "mean_token_accuracy": 0.6792235001921654, "num_tokens": 298460739.0, "step": 18520 }, { "epoch": 4.294356240584077, "grad_norm": 0.8802101612091064, "learning_rate": 3.184231791825272e-05, "loss": 1.5011, "mean_token_accuracy": 0.6804956510663033, "num_tokens": 298622046.0, "step": 18530 }, { "epoch": 4.296674006257967, "grad_norm": 0.9342573285102844, "learning_rate": 3.182426774379356e-05, "loss": 1.5111, "mean_token_accuracy": 0.6797511160373688, "num_tokens": 298783190.0, "step": 18540 }, { "epoch": 4.298991771931858, "grad_norm": 0.918077290058136, "learning_rate": 3.180621372464651e-05, "loss": 1.5053, "mean_token_accuracy": 0.6802830144762992, "num_tokens": 298944289.0, "step": 18550 }, { "epoch": 4.301309537605748, "grad_norm": 0.906198263168335, "learning_rate": 3.178815587098292e-05, "loss": 1.5012, "mean_token_accuracy": 0.6796340882778168, "num_tokens": 299104531.0, "step": 18560 }, { "epoch": 4.303627303279638, "grad_norm": 1.0042996406555176, "learning_rate": 3.177009419297631e-05, "loss": 1.5205, "mean_token_accuracy": 0.6777180522680283, "num_tokens": 299265604.0, "step": 18570 }, { "epoch": 4.305945068953529, "grad_norm": 0.8785635232925415, "learning_rate": 3.175202870080236e-05, "loss": 1.4931, "mean_token_accuracy": 0.6826112255454063, "num_tokens": 299427069.0, "step": 18580 }, { "epoch": 4.308262834627419, "grad_norm": 0.931093692779541, "learning_rate": 3.173395940463888e-05, "loss": 1.5109, "mean_token_accuracy": 0.6805750876665115, "num_tokens": 299588877.0, "step": 18590 }, { "epoch": 4.31058060030131, "grad_norm": 0.9486585259437561, "learning_rate": 3.171588631466585e-05, "loss": 1.501, "mean_token_accuracy": 0.6827023729681969, "num_tokens": 299750792.0, "step": 18600 }, { "epoch": 4.3128983659752, "grad_norm": 0.9088582396507263, "learning_rate": 3.169780944106536e-05, "loss": 1.5134, "mean_token_accuracy": 0.6790394097566604, "num_tokens": 299912347.0, "step": 18610 }, { "epoch": 4.31521613164909, "grad_norm": 0.9027214050292969, "learning_rate": 3.1679728794021645e-05, "loss": 1.5229, "mean_token_accuracy": 0.67853973954916, "num_tokens": 300073951.0, "step": 18620 }, { "epoch": 4.31753389732298, "grad_norm": 0.9967194199562073, "learning_rate": 3.1661644383721066e-05, "loss": 1.5172, "mean_token_accuracy": 0.6791102215647697, "num_tokens": 300235518.0, "step": 18630 }, { "epoch": 4.319851662996871, "grad_norm": 0.9068676829338074, "learning_rate": 3.1643556220352114e-05, "loss": 1.487, "mean_token_accuracy": 0.6826151669025421, "num_tokens": 300397371.0, "step": 18640 }, { "epoch": 4.322169428670762, "grad_norm": 0.9158416390419006, "learning_rate": 3.162546431410536e-05, "loss": 1.4898, "mean_token_accuracy": 0.6826768517494202, "num_tokens": 300558236.0, "step": 18650 }, { "epoch": 4.324487194344652, "grad_norm": 0.9351882934570312, "learning_rate": 3.1607368675173524e-05, "loss": 1.5175, "mean_token_accuracy": 0.6799864739179611, "num_tokens": 300719391.0, "step": 18660 }, { "epoch": 4.326804960018542, "grad_norm": 0.8825076818466187, "learning_rate": 3.1589269313751406e-05, "loss": 1.5104, "mean_token_accuracy": 0.679955518245697, "num_tokens": 300880870.0, "step": 18670 }, { "epoch": 4.329122725692432, "grad_norm": 0.891589343547821, "learning_rate": 3.157116624003591e-05, "loss": 1.5244, "mean_token_accuracy": 0.678430525958538, "num_tokens": 301041851.0, "step": 18680 }, { "epoch": 4.3314404913663225, "grad_norm": 0.9200131893157959, "learning_rate": 3.1553059464226045e-05, "loss": 1.4784, "mean_token_accuracy": 0.6834046736359596, "num_tokens": 301202409.0, "step": 18690 }, { "epoch": 4.333758257040214, "grad_norm": 0.9153326153755188, "learning_rate": 3.153494899652286e-05, "loss": 1.5071, "mean_token_accuracy": 0.6796080157160759, "num_tokens": 301363575.0, "step": 18700 }, { "epoch": 4.336076022714104, "grad_norm": 0.9672975540161133, "learning_rate": 3.1516834847129525e-05, "loss": 1.5138, "mean_token_accuracy": 0.6790976420044899, "num_tokens": 301525358.0, "step": 18710 }, { "epoch": 4.338393788387994, "grad_norm": 0.9029175639152527, "learning_rate": 3.1498717026251296e-05, "loss": 1.5106, "mean_token_accuracy": 0.679581318795681, "num_tokens": 301687268.0, "step": 18720 }, { "epoch": 4.340711554061884, "grad_norm": 0.9274295568466187, "learning_rate": 3.1480595544095454e-05, "loss": 1.5183, "mean_token_accuracy": 0.6802900731563568, "num_tokens": 301848915.0, "step": 18730 }, { "epoch": 4.343029319735774, "grad_norm": 0.9163139462471008, "learning_rate": 3.1462470410871366e-05, "loss": 1.5039, "mean_token_accuracy": 0.6802289485931396, "num_tokens": 302010350.0, "step": 18740 }, { "epoch": 4.345347085409665, "grad_norm": 0.9500520825386047, "learning_rate": 3.1444341636790464e-05, "loss": 1.5101, "mean_token_accuracy": 0.6800873517990113, "num_tokens": 302171819.0, "step": 18750 }, { "epoch": 4.347664851083556, "grad_norm": 0.938123881816864, "learning_rate": 3.1426209232066216e-05, "loss": 1.508, "mean_token_accuracy": 0.678796510398388, "num_tokens": 302332055.0, "step": 18760 }, { "epoch": 4.349982616757446, "grad_norm": 0.8994945883750916, "learning_rate": 3.1408073206914145e-05, "loss": 1.5253, "mean_token_accuracy": 0.6781068354845047, "num_tokens": 302493356.0, "step": 18770 }, { "epoch": 4.352300382431336, "grad_norm": 0.8837369084358215, "learning_rate": 3.1389933571551805e-05, "loss": 1.504, "mean_token_accuracy": 0.681763057410717, "num_tokens": 302654278.0, "step": 18780 }, { "epoch": 4.354618148105226, "grad_norm": 0.8864578008651733, "learning_rate": 3.1371790336198784e-05, "loss": 1.4936, "mean_token_accuracy": 0.6829286903142929, "num_tokens": 302815978.0, "step": 18790 }, { "epoch": 4.356935913779117, "grad_norm": 0.8789159059524536, "learning_rate": 3.1353643511076724e-05, "loss": 1.5023, "mean_token_accuracy": 0.6814733490347862, "num_tokens": 302976791.0, "step": 18800 }, { "epoch": 4.3592536794530075, "grad_norm": 0.8873025178909302, "learning_rate": 3.133549310640924e-05, "loss": 1.5095, "mean_token_accuracy": 0.6796445101499557, "num_tokens": 303136666.0, "step": 18810 }, { "epoch": 4.361571445126898, "grad_norm": 0.8740704655647278, "learning_rate": 3.1317339132422006e-05, "loss": 1.4782, "mean_token_accuracy": 0.6836270302534103, "num_tokens": 303297277.0, "step": 18820 }, { "epoch": 4.363889210800788, "grad_norm": 0.8938598036766052, "learning_rate": 3.12991815993427e-05, "loss": 1.4977, "mean_token_accuracy": 0.681120416522026, "num_tokens": 303458747.0, "step": 18830 }, { "epoch": 4.366206976474678, "grad_norm": 0.9015171527862549, "learning_rate": 3.1281020517400984e-05, "loss": 1.5233, "mean_token_accuracy": 0.6782056480646134, "num_tokens": 303619374.0, "step": 18840 }, { "epoch": 4.368524742148569, "grad_norm": 0.921261191368103, "learning_rate": 3.1262855896828536e-05, "loss": 1.502, "mean_token_accuracy": 0.6805322840809822, "num_tokens": 303780485.0, "step": 18850 }, { "epoch": 4.370842507822459, "grad_norm": 0.9557645916938782, "learning_rate": 3.124468774785903e-05, "loss": 1.5232, "mean_token_accuracy": 0.677314493060112, "num_tokens": 303941221.0, "step": 18860 }, { "epoch": 4.3731602734963495, "grad_norm": 0.9082863926887512, "learning_rate": 3.122651608072812e-05, "loss": 1.5038, "mean_token_accuracy": 0.6813188329339027, "num_tokens": 304102977.0, "step": 18870 }, { "epoch": 4.37547803917024, "grad_norm": 0.9106123447418213, "learning_rate": 3.120834090567345e-05, "loss": 1.4948, "mean_token_accuracy": 0.6815066576004029, "num_tokens": 304264340.0, "step": 18880 }, { "epoch": 4.37779580484413, "grad_norm": 1.0322130918502808, "learning_rate": 3.119016223293463e-05, "loss": 1.4971, "mean_token_accuracy": 0.6827850177884102, "num_tokens": 304424812.0, "step": 18890 }, { "epoch": 4.380113570518021, "grad_norm": 0.8554628491401672, "learning_rate": 3.1171980072753236e-05, "loss": 1.4772, "mean_token_accuracy": 0.6842404484748841, "num_tokens": 304585903.0, "step": 18900 }, { "epoch": 4.382431336191911, "grad_norm": 0.9116711616516113, "learning_rate": 3.115379443537284e-05, "loss": 1.5153, "mean_token_accuracy": 0.6788177132606507, "num_tokens": 304746731.0, "step": 18910 }, { "epoch": 4.384749101865801, "grad_norm": 0.959490180015564, "learning_rate": 3.113560533103894e-05, "loss": 1.5013, "mean_token_accuracy": 0.6807722344994545, "num_tokens": 304908461.0, "step": 18920 }, { "epoch": 4.3870668675396916, "grad_norm": 0.866607129573822, "learning_rate": 3.1117412769999e-05, "loss": 1.4859, "mean_token_accuracy": 0.6820983111858367, "num_tokens": 305068966.0, "step": 18930 }, { "epoch": 4.389384633213582, "grad_norm": 0.9659788608551025, "learning_rate": 3.1099216762502426e-05, "loss": 1.5152, "mean_token_accuracy": 0.6790996298193932, "num_tokens": 305230999.0, "step": 18940 }, { "epoch": 4.391702398887473, "grad_norm": 0.8984334468841553, "learning_rate": 3.108101731880057e-05, "loss": 1.499, "mean_token_accuracy": 0.6787572085857392, "num_tokens": 305392992.0, "step": 18950 }, { "epoch": 4.394020164561363, "grad_norm": 0.9015902280807495, "learning_rate": 3.1062814449146723e-05, "loss": 1.4934, "mean_token_accuracy": 0.682508796453476, "num_tokens": 305554417.0, "step": 18960 }, { "epoch": 4.396337930235253, "grad_norm": 0.951303243637085, "learning_rate": 3.104460816379612e-05, "loss": 1.4868, "mean_token_accuracy": 0.6814260989427566, "num_tokens": 305715509.0, "step": 18970 }, { "epoch": 4.398655695909143, "grad_norm": 0.9619463682174683, "learning_rate": 3.102639847300588e-05, "loss": 1.5067, "mean_token_accuracy": 0.6802815228700638, "num_tokens": 305875718.0, "step": 18980 }, { "epoch": 4.400973461583034, "grad_norm": 0.9399147629737854, "learning_rate": 3.1008185387035054e-05, "loss": 1.5055, "mean_token_accuracy": 0.6808161869645118, "num_tokens": 306035807.0, "step": 18990 }, { "epoch": 4.403291227256925, "grad_norm": 0.9298967719078064, "learning_rate": 3.098996891614464e-05, "loss": 1.4943, "mean_token_accuracy": 0.6823716923594475, "num_tokens": 306196424.0, "step": 19000 }, { "epoch": 4.405608992930815, "grad_norm": 0.9098662734031677, "learning_rate": 3.0971749070597514e-05, "loss": 1.5122, "mean_token_accuracy": 0.6804881110787392, "num_tokens": 306357809.0, "step": 19010 }, { "epoch": 4.407926758604705, "grad_norm": 0.8727588057518005, "learning_rate": 3.095352586065845e-05, "loss": 1.5049, "mean_token_accuracy": 0.6814926505088806, "num_tokens": 306517515.0, "step": 19020 }, { "epoch": 4.410244524278595, "grad_norm": 0.8871377110481262, "learning_rate": 3.093529929659413e-05, "loss": 1.5165, "mean_token_accuracy": 0.6801700666546822, "num_tokens": 306679361.0, "step": 19030 }, { "epoch": 4.4125622899524855, "grad_norm": 0.9493222236633301, "learning_rate": 3.091706938867312e-05, "loss": 1.5145, "mean_token_accuracy": 0.6786955684423447, "num_tokens": 306841236.0, "step": 19040 }, { "epoch": 4.4148800556263765, "grad_norm": 0.886824369430542, "learning_rate": 3.089883614716587e-05, "loss": 1.4956, "mean_token_accuracy": 0.6813245937228203, "num_tokens": 307002678.0, "step": 19050 }, { "epoch": 4.417197821300267, "grad_norm": 0.9324201345443726, "learning_rate": 3.0880599582344714e-05, "loss": 1.5162, "mean_token_accuracy": 0.6805816680192948, "num_tokens": 307164182.0, "step": 19060 }, { "epoch": 4.419515586974157, "grad_norm": 0.8672714233398438, "learning_rate": 3.086235970448385e-05, "loss": 1.5082, "mean_token_accuracy": 0.6816364973783493, "num_tokens": 307325134.0, "step": 19070 }, { "epoch": 4.421833352648047, "grad_norm": 0.9312589168548584, "learning_rate": 3.084411652385933e-05, "loss": 1.5153, "mean_token_accuracy": 0.6798796996474266, "num_tokens": 307487157.0, "step": 19080 }, { "epoch": 4.424151118321937, "grad_norm": 0.95017009973526, "learning_rate": 3.082587005074911e-05, "loss": 1.5064, "mean_token_accuracy": 0.6803607329726219, "num_tokens": 307648321.0, "step": 19090 }, { "epoch": 4.426468883995828, "grad_norm": 0.9382804036140442, "learning_rate": 3.080762029543296e-05, "loss": 1.5061, "mean_token_accuracy": 0.6796906292438507, "num_tokens": 307809935.0, "step": 19100 }, { "epoch": 4.428786649669719, "grad_norm": 0.9399010539054871, "learning_rate": 3.0789367268192515e-05, "loss": 1.5152, "mean_token_accuracy": 0.6812374040484428, "num_tokens": 307971457.0, "step": 19110 }, { "epoch": 4.431104415343609, "grad_norm": 0.946811318397522, "learning_rate": 3.0771110979311245e-05, "loss": 1.4959, "mean_token_accuracy": 0.6808669596910477, "num_tokens": 308132892.0, "step": 19120 }, { "epoch": 4.433422181017499, "grad_norm": 0.921110212802887, "learning_rate": 3.075285143907447e-05, "loss": 1.5101, "mean_token_accuracy": 0.6801500752568245, "num_tokens": 308294291.0, "step": 19130 }, { "epoch": 4.435739946691389, "grad_norm": 0.8908748626708984, "learning_rate": 3.073458865776933e-05, "loss": 1.5129, "mean_token_accuracy": 0.6818901181221009, "num_tokens": 308455973.0, "step": 19140 }, { "epoch": 4.43805771236528, "grad_norm": 0.938797116279602, "learning_rate": 3.07163226456848e-05, "loss": 1.5091, "mean_token_accuracy": 0.680858725309372, "num_tokens": 308617648.0, "step": 19150 }, { "epoch": 4.44037547803917, "grad_norm": 0.9146810173988342, "learning_rate": 3.069805341311168e-05, "loss": 1.5135, "mean_token_accuracy": 0.6785756632685661, "num_tokens": 308779079.0, "step": 19160 }, { "epoch": 4.442693243713061, "grad_norm": 0.8877745270729065, "learning_rate": 3.0679780970342556e-05, "loss": 1.5112, "mean_token_accuracy": 0.6808124646544457, "num_tokens": 308940140.0, "step": 19170 }, { "epoch": 4.445011009386951, "grad_norm": 0.9281347393989563, "learning_rate": 3.0661505327671854e-05, "loss": 1.5269, "mean_token_accuracy": 0.6773132175207138, "num_tokens": 309099949.0, "step": 19180 }, { "epoch": 4.447328775060841, "grad_norm": 0.8579099774360657, "learning_rate": 3.0643226495395794e-05, "loss": 1.4873, "mean_token_accuracy": 0.6834648862481117, "num_tokens": 309261534.0, "step": 19190 }, { "epoch": 4.449646540734732, "grad_norm": 0.9164447784423828, "learning_rate": 3.062494448381238e-05, "loss": 1.5037, "mean_token_accuracy": 0.6812910869717598, "num_tokens": 309423402.0, "step": 19200 }, { "epoch": 4.451964306408622, "grad_norm": 0.8444297313690186, "learning_rate": 3.0606659303221424e-05, "loss": 1.4993, "mean_token_accuracy": 0.6815365210175515, "num_tokens": 309585104.0, "step": 19210 }, { "epoch": 4.4542820720825125, "grad_norm": 0.9417877793312073, "learning_rate": 3.0588370963924515e-05, "loss": 1.5038, "mean_token_accuracy": 0.679962982237339, "num_tokens": 309746582.0, "step": 19220 }, { "epoch": 4.456599837756403, "grad_norm": 0.9022384881973267, "learning_rate": 3.057007947622502e-05, "loss": 1.4936, "mean_token_accuracy": 0.6830415442585945, "num_tokens": 309907295.0, "step": 19230 }, { "epoch": 4.458917603430293, "grad_norm": 0.8892304301261902, "learning_rate": 3.0551784850428097e-05, "loss": 1.4827, "mean_token_accuracy": 0.6830877423286438, "num_tokens": 310068944.0, "step": 19240 }, { "epoch": 4.461235369104184, "grad_norm": 0.9297134280204773, "learning_rate": 3.053348709684064e-05, "loss": 1.5016, "mean_token_accuracy": 0.6808180660009384, "num_tokens": 310229500.0, "step": 19250 }, { "epoch": 4.463553134778074, "grad_norm": 0.8682877421379089, "learning_rate": 3.0515186225771337e-05, "loss": 1.5015, "mean_token_accuracy": 0.6802719980478287, "num_tokens": 310391223.0, "step": 19260 }, { "epoch": 4.465870900451964, "grad_norm": 0.9723190069198608, "learning_rate": 3.0496882247530607e-05, "loss": 1.5202, "mean_token_accuracy": 0.6786849811673165, "num_tokens": 310552898.0, "step": 19270 }, { "epoch": 4.4681886661258545, "grad_norm": 0.8918301463127136, "learning_rate": 3.0478575172430634e-05, "loss": 1.5158, "mean_token_accuracy": 0.6781467258930206, "num_tokens": 310714007.0, "step": 19280 }, { "epoch": 4.470506431799745, "grad_norm": 0.9471396207809448, "learning_rate": 3.0460265010785343e-05, "loss": 1.4969, "mean_token_accuracy": 0.6811488419771194, "num_tokens": 310875320.0, "step": 19290 }, { "epoch": 4.472824197473636, "grad_norm": 0.9906548261642456, "learning_rate": 3.0441951772910405e-05, "loss": 1.4939, "mean_token_accuracy": 0.6823719307780266, "num_tokens": 311036673.0, "step": 19300 }, { "epoch": 4.475141963147526, "grad_norm": 0.8931788802146912, "learning_rate": 3.042363546912321e-05, "loss": 1.4858, "mean_token_accuracy": 0.6828942969441414, "num_tokens": 311198140.0, "step": 19310 }, { "epoch": 4.477459728821416, "grad_norm": 0.8822178244590759, "learning_rate": 3.0405316109742892e-05, "loss": 1.5108, "mean_token_accuracy": 0.6802664741873741, "num_tokens": 311359775.0, "step": 19320 }, { "epoch": 4.479777494495306, "grad_norm": 0.9716436862945557, "learning_rate": 3.038699370509029e-05, "loss": 1.5068, "mean_token_accuracy": 0.6795667752623558, "num_tokens": 311521205.0, "step": 19330 }, { "epoch": 4.482095260169197, "grad_norm": 0.8798753619194031, "learning_rate": 3.0368668265487965e-05, "loss": 1.4893, "mean_token_accuracy": 0.6823950409889221, "num_tokens": 311681913.0, "step": 19340 }, { "epoch": 4.484413025843088, "grad_norm": 0.8775781989097595, "learning_rate": 3.0350339801260198e-05, "loss": 1.4951, "mean_token_accuracy": 0.682754710316658, "num_tokens": 311844011.0, "step": 19350 }, { "epoch": 4.486730791516978, "grad_norm": 0.9571119546890259, "learning_rate": 3.0332008322732952e-05, "loss": 1.4916, "mean_token_accuracy": 0.6799537852406502, "num_tokens": 312005050.0, "step": 19360 }, { "epoch": 4.489048557190868, "grad_norm": 0.922661304473877, "learning_rate": 3.0313673840233912e-05, "loss": 1.4999, "mean_token_accuracy": 0.6818485409021378, "num_tokens": 312166284.0, "step": 19370 }, { "epoch": 4.491366322864758, "grad_norm": 0.9093092679977417, "learning_rate": 3.0295336364092443e-05, "loss": 1.4976, "mean_token_accuracy": 0.6820256873965264, "num_tokens": 312327997.0, "step": 19380 }, { "epoch": 4.493684088538648, "grad_norm": 0.9126875996589661, "learning_rate": 3.0276995904639588e-05, "loss": 1.5136, "mean_token_accuracy": 0.6786776453256607, "num_tokens": 312488685.0, "step": 19390 }, { "epoch": 4.4960018542125395, "grad_norm": 0.8947592973709106, "learning_rate": 3.0258652472208092e-05, "loss": 1.5088, "mean_token_accuracy": 0.6809905648231507, "num_tokens": 312650479.0, "step": 19400 }, { "epoch": 4.49831961988643, "grad_norm": 0.9042774438858032, "learning_rate": 3.0240306077132357e-05, "loss": 1.4979, "mean_token_accuracy": 0.6821776151657104, "num_tokens": 312811497.0, "step": 19410 }, { "epoch": 4.50063738556032, "grad_norm": 0.9128201603889465, "learning_rate": 3.022195672974847e-05, "loss": 1.493, "mean_token_accuracy": 0.6826458841562271, "num_tokens": 312973643.0, "step": 19420 }, { "epoch": 4.50295515123421, "grad_norm": 0.8611615300178528, "learning_rate": 3.020360444039416e-05, "loss": 1.5042, "mean_token_accuracy": 0.6800439596176148, "num_tokens": 313135804.0, "step": 19430 }, { "epoch": 4.5052729169081, "grad_norm": 0.9239922761917114, "learning_rate": 3.018524921940884e-05, "loss": 1.5156, "mean_token_accuracy": 0.6788843378424645, "num_tokens": 313297141.0, "step": 19440 }, { "epoch": 4.507590682581991, "grad_norm": 0.9163810014724731, "learning_rate": 3.0166891077133547e-05, "loss": 1.5084, "mean_token_accuracy": 0.6791015297174454, "num_tokens": 313458510.0, "step": 19450 }, { "epoch": 4.5099084482558816, "grad_norm": 0.8567855954170227, "learning_rate": 3.014853002391098e-05, "loss": 1.5112, "mean_token_accuracy": 0.6795439854264259, "num_tokens": 313619332.0, "step": 19460 }, { "epoch": 4.512226213929772, "grad_norm": 0.8787165284156799, "learning_rate": 3.0130166070085487e-05, "loss": 1.4847, "mean_token_accuracy": 0.6832360580563546, "num_tokens": 313780675.0, "step": 19470 }, { "epoch": 4.514543979603662, "grad_norm": 0.9212058782577515, "learning_rate": 3.011179922600302e-05, "loss": 1.506, "mean_token_accuracy": 0.6790714606642723, "num_tokens": 313942002.0, "step": 19480 }, { "epoch": 4.516861745277552, "grad_norm": 0.901494562625885, "learning_rate": 3.0093429502011188e-05, "loss": 1.5038, "mean_token_accuracy": 0.6792976677417755, "num_tokens": 314102854.0, "step": 19490 }, { "epoch": 4.519179510951442, "grad_norm": 0.8746828436851501, "learning_rate": 3.007505690845921e-05, "loss": 1.5082, "mean_token_accuracy": 0.6800541073083878, "num_tokens": 314263325.0, "step": 19500 }, { "epoch": 4.521497276625333, "grad_norm": 0.9736816883087158, "learning_rate": 3.0056681455697926e-05, "loss": 1.4954, "mean_token_accuracy": 0.6811097338795662, "num_tokens": 314424248.0, "step": 19510 }, { "epoch": 4.523815042299224, "grad_norm": 0.8818892240524292, "learning_rate": 3.0038303154079784e-05, "loss": 1.5067, "mean_token_accuracy": 0.6799106582999229, "num_tokens": 314585836.0, "step": 19520 }, { "epoch": 4.526132807973114, "grad_norm": 0.9239523410797119, "learning_rate": 3.001992201395884e-05, "loss": 1.501, "mean_token_accuracy": 0.6804905787110329, "num_tokens": 314747537.0, "step": 19530 }, { "epoch": 4.528450573647004, "grad_norm": 0.8785895109176636, "learning_rate": 3.0001538045690737e-05, "loss": 1.5202, "mean_token_accuracy": 0.6786458775401115, "num_tokens": 314909241.0, "step": 19540 }, { "epoch": 4.530768339320895, "grad_norm": 0.911293089389801, "learning_rate": 2.9983151259632726e-05, "loss": 1.5039, "mean_token_accuracy": 0.6803672730922699, "num_tokens": 315069601.0, "step": 19550 }, { "epoch": 4.533086104994785, "grad_norm": 0.8579989671707153, "learning_rate": 2.996476166614364e-05, "loss": 1.5075, "mean_token_accuracy": 0.6807014986872673, "num_tokens": 315231574.0, "step": 19560 }, { "epoch": 4.5354038706686755, "grad_norm": 0.9277522563934326, "learning_rate": 2.9946369275583898e-05, "loss": 1.5076, "mean_token_accuracy": 0.6819401323795319, "num_tokens": 315392683.0, "step": 19570 }, { "epoch": 4.537721636342566, "grad_norm": 0.936964750289917, "learning_rate": 2.9927974098315488e-05, "loss": 1.5199, "mean_token_accuracy": 0.678015461564064, "num_tokens": 315553599.0, "step": 19580 }, { "epoch": 4.540039402016456, "grad_norm": 0.9145200848579407, "learning_rate": 2.9909576144701972e-05, "loss": 1.5004, "mean_token_accuracy": 0.68082927018404, "num_tokens": 315714276.0, "step": 19590 }, { "epoch": 4.542357167690346, "grad_norm": 0.9107585549354553, "learning_rate": 2.9891175425108476e-05, "loss": 1.5088, "mean_token_accuracy": 0.6815198510885239, "num_tokens": 315874583.0, "step": 19600 }, { "epoch": 4.544674933364237, "grad_norm": 0.8838196992874146, "learning_rate": 2.987277194990168e-05, "loss": 1.5069, "mean_token_accuracy": 0.6812841087579727, "num_tokens": 316036300.0, "step": 19610 }, { "epoch": 4.546992699038127, "grad_norm": 0.9356341361999512, "learning_rate": 2.9854365729449823e-05, "loss": 1.4998, "mean_token_accuracy": 0.6794767528772354, "num_tokens": 316196609.0, "step": 19620 }, { "epoch": 4.5493104647120175, "grad_norm": 0.9209510087966919, "learning_rate": 2.9835956774122692e-05, "loss": 1.4908, "mean_token_accuracy": 0.6821051254868508, "num_tokens": 316358378.0, "step": 19630 }, { "epoch": 4.551628230385908, "grad_norm": 0.8943735361099243, "learning_rate": 2.98175450942916e-05, "loss": 1.5031, "mean_token_accuracy": 0.6809881150722503, "num_tokens": 316519987.0, "step": 19640 }, { "epoch": 4.553945996059799, "grad_norm": 0.9267044067382812, "learning_rate": 2.9799130700329415e-05, "loss": 1.506, "mean_token_accuracy": 0.6806673780083656, "num_tokens": 316681498.0, "step": 19650 }, { "epoch": 4.556263761733689, "grad_norm": 1.0273866653442383, "learning_rate": 2.978071360261052e-05, "loss": 1.4846, "mean_token_accuracy": 0.6818873375654221, "num_tokens": 316843156.0, "step": 19660 }, { "epoch": 4.558581527407579, "grad_norm": 0.9208412766456604, "learning_rate": 2.9762293811510827e-05, "loss": 1.4769, "mean_token_accuracy": 0.6841649159789085, "num_tokens": 317004555.0, "step": 19670 }, { "epoch": 4.560899293081469, "grad_norm": 0.913747251033783, "learning_rate": 2.9743871337407757e-05, "loss": 1.489, "mean_token_accuracy": 0.6821209102869034, "num_tokens": 317165322.0, "step": 19680 }, { "epoch": 4.5632170587553595, "grad_norm": 0.9451226592063904, "learning_rate": 2.9725446190680263e-05, "loss": 1.5091, "mean_token_accuracy": 0.679348747432232, "num_tokens": 317323507.0, "step": 19690 }, { "epoch": 4.56553482442925, "grad_norm": 0.9439492225646973, "learning_rate": 2.9707018381708772e-05, "loss": 1.5014, "mean_token_accuracy": 0.680987361073494, "num_tokens": 317484620.0, "step": 19700 }, { "epoch": 4.567852590103141, "grad_norm": 0.9174657464027405, "learning_rate": 2.968858792087525e-05, "loss": 1.507, "mean_token_accuracy": 0.6804089099168777, "num_tokens": 317645850.0, "step": 19710 }, { "epoch": 4.570170355777031, "grad_norm": 1.0759539604187012, "learning_rate": 2.9670154818563123e-05, "loss": 1.5073, "mean_token_accuracy": 0.6808816909790039, "num_tokens": 317807074.0, "step": 19720 }, { "epoch": 4.572488121450921, "grad_norm": 0.9670733213424683, "learning_rate": 2.9651719085157326e-05, "loss": 1.4822, "mean_token_accuracy": 0.6824246615171432, "num_tokens": 317966806.0, "step": 19730 }, { "epoch": 4.574805887124811, "grad_norm": 0.9208101630210876, "learning_rate": 2.9633280731044262e-05, "loss": 1.5177, "mean_token_accuracy": 0.679102723300457, "num_tokens": 318128519.0, "step": 19740 }, { "epoch": 4.5771236527987025, "grad_norm": 0.9145587086677551, "learning_rate": 2.9614839766611817e-05, "loss": 1.5045, "mean_token_accuracy": 0.6806765079498291, "num_tokens": 318289902.0, "step": 19750 }, { "epoch": 4.579441418472593, "grad_norm": 0.8780491352081299, "learning_rate": 2.9596396202249356e-05, "loss": 1.5105, "mean_token_accuracy": 0.6799819484353066, "num_tokens": 318449563.0, "step": 19760 }, { "epoch": 4.581759184146483, "grad_norm": 0.8958050608634949, "learning_rate": 2.9577950048347692e-05, "loss": 1.5047, "mean_token_accuracy": 0.6800057232379914, "num_tokens": 318610325.0, "step": 19770 }, { "epoch": 4.584076949820373, "grad_norm": 0.9025258421897888, "learning_rate": 2.955950131529911e-05, "loss": 1.4861, "mean_token_accuracy": 0.6831342980265618, "num_tokens": 318771182.0, "step": 19780 }, { "epoch": 4.586394715494263, "grad_norm": 0.8612815737724304, "learning_rate": 2.954105001349734e-05, "loss": 1.5106, "mean_token_accuracy": 0.6803387045860291, "num_tokens": 318932927.0, "step": 19790 }, { "epoch": 4.5887124811681534, "grad_norm": 0.914542019367218, "learning_rate": 2.9522596153337568e-05, "loss": 1.4967, "mean_token_accuracy": 0.6806890219449997, "num_tokens": 319094193.0, "step": 19800 }, { "epoch": 4.5910302468420445, "grad_norm": 0.8706080913543701, "learning_rate": 2.9504139745216413e-05, "loss": 1.4838, "mean_token_accuracy": 0.6832597211003304, "num_tokens": 319255652.0, "step": 19810 }, { "epoch": 4.593348012515935, "grad_norm": 0.9621946811676025, "learning_rate": 2.9485680799531932e-05, "loss": 1.4989, "mean_token_accuracy": 0.681278882920742, "num_tokens": 319416265.0, "step": 19820 }, { "epoch": 4.595665778189825, "grad_norm": 0.9253984689712524, "learning_rate": 2.9467219326683616e-05, "loss": 1.4904, "mean_token_accuracy": 0.6824970275163651, "num_tokens": 319576975.0, "step": 19830 }, { "epoch": 4.597983543863715, "grad_norm": 0.8818336129188538, "learning_rate": 2.944875533707237e-05, "loss": 1.5032, "mean_token_accuracy": 0.6802968069911003, "num_tokens": 319737206.0, "step": 19840 }, { "epoch": 4.600301309537606, "grad_norm": 0.8702555894851685, "learning_rate": 2.9430288841100527e-05, "loss": 1.4924, "mean_token_accuracy": 0.6821523427963256, "num_tokens": 319898947.0, "step": 19850 }, { "epoch": 4.602619075211496, "grad_norm": 0.9263268709182739, "learning_rate": 2.9411819849171825e-05, "loss": 1.5312, "mean_token_accuracy": 0.677309800684452, "num_tokens": 320059792.0, "step": 19860 }, { "epoch": 4.604936840885387, "grad_norm": 0.9291174411773682, "learning_rate": 2.9393348371691413e-05, "loss": 1.4984, "mean_token_accuracy": 0.680924904346466, "num_tokens": 320220570.0, "step": 19870 }, { "epoch": 4.607254606559277, "grad_norm": 0.962065577507019, "learning_rate": 2.937487441906584e-05, "loss": 1.5097, "mean_token_accuracy": 0.6785667851567269, "num_tokens": 320381169.0, "step": 19880 }, { "epoch": 4.609572372233167, "grad_norm": 0.9032756090164185, "learning_rate": 2.9356398001703045e-05, "loss": 1.4898, "mean_token_accuracy": 0.6824400722980499, "num_tokens": 320542525.0, "step": 19890 }, { "epoch": 4.611890137907057, "grad_norm": 0.8628743290901184, "learning_rate": 2.933791913001236e-05, "loss": 1.5207, "mean_token_accuracy": 0.6804275810718536, "num_tokens": 320704092.0, "step": 19900 }, { "epoch": 4.614207903580948, "grad_norm": 0.9222702383995056, "learning_rate": 2.93194378144045e-05, "loss": 1.503, "mean_token_accuracy": 0.6820551753044128, "num_tokens": 320863682.0, "step": 19910 }, { "epoch": 4.616525669254838, "grad_norm": 0.9404725432395935, "learning_rate": 2.9300954065291546e-05, "loss": 1.5, "mean_token_accuracy": 0.6809418067336083, "num_tokens": 321025608.0, "step": 19920 }, { "epoch": 4.618843434928729, "grad_norm": 0.9588881134986877, "learning_rate": 2.9282467893086975e-05, "loss": 1.5017, "mean_token_accuracy": 0.679395605623722, "num_tokens": 321185623.0, "step": 19930 }, { "epoch": 4.621161200602619, "grad_norm": 0.9318660497665405, "learning_rate": 2.9263979308205598e-05, "loss": 1.5142, "mean_token_accuracy": 0.6782130748033524, "num_tokens": 321347470.0, "step": 19940 }, { "epoch": 4.62347896627651, "grad_norm": 0.8834549188613892, "learning_rate": 2.9245488321063607e-05, "loss": 1.5028, "mean_token_accuracy": 0.679335305094719, "num_tokens": 321508603.0, "step": 19950 }, { "epoch": 4.6257967319504, "grad_norm": 0.9125537276268005, "learning_rate": 2.9226994942078538e-05, "loss": 1.4983, "mean_token_accuracy": 0.6817698538303375, "num_tokens": 321669858.0, "step": 19960 }, { "epoch": 4.62811449762429, "grad_norm": 0.9560539722442627, "learning_rate": 2.9208499181669286e-05, "loss": 1.5025, "mean_token_accuracy": 0.6811511605978012, "num_tokens": 321831182.0, "step": 19970 }, { "epoch": 4.6304322632981805, "grad_norm": 0.9070884585380554, "learning_rate": 2.919000105025607e-05, "loss": 1.5116, "mean_token_accuracy": 0.6800716817378998, "num_tokens": 321992877.0, "step": 19980 }, { "epoch": 4.632750028972071, "grad_norm": 0.892545759677887, "learning_rate": 2.9171500558260456e-05, "loss": 1.4949, "mean_token_accuracy": 0.682362724840641, "num_tokens": 322154509.0, "step": 19990 }, { "epoch": 4.635067794645961, "grad_norm": 0.9031998515129089, "learning_rate": 2.915299771610534e-05, "loss": 1.5032, "mean_token_accuracy": 0.6816062957048417, "num_tokens": 322316318.0, "step": 20000 }, { "epoch": 4.637385560319852, "grad_norm": 0.9305506944656372, "learning_rate": 2.913449253421494e-05, "loss": 1.4909, "mean_token_accuracy": 0.6813781231641769, "num_tokens": 322478337.0, "step": 20010 }, { "epoch": 4.639703325993742, "grad_norm": 0.9353211522102356, "learning_rate": 2.9115985023014786e-05, "loss": 1.5236, "mean_token_accuracy": 0.6777369260787964, "num_tokens": 322639861.0, "step": 20020 }, { "epoch": 4.642021091667632, "grad_norm": 0.9253190755844116, "learning_rate": 2.9097475192931732e-05, "loss": 1.5057, "mean_token_accuracy": 0.6802546486258507, "num_tokens": 322801260.0, "step": 20030 }, { "epoch": 4.6443388573415225, "grad_norm": 0.9609372019767761, "learning_rate": 2.907896305439393e-05, "loss": 1.5002, "mean_token_accuracy": 0.6809816375374794, "num_tokens": 322962206.0, "step": 20040 }, { "epoch": 4.646656623015414, "grad_norm": 0.8724604845046997, "learning_rate": 2.9060448617830838e-05, "loss": 1.4977, "mean_token_accuracy": 0.6822253078222275, "num_tokens": 323123654.0, "step": 20050 }, { "epoch": 4.648974388689304, "grad_norm": 0.9783670902252197, "learning_rate": 2.9041931893673213e-05, "loss": 1.5037, "mean_token_accuracy": 0.6803240075707435, "num_tokens": 323284206.0, "step": 20060 }, { "epoch": 4.651292154363194, "grad_norm": 0.9075254201889038, "learning_rate": 2.9023412892353076e-05, "loss": 1.4847, "mean_token_accuracy": 0.6823591828346253, "num_tokens": 323445365.0, "step": 20070 }, { "epoch": 4.653609920037084, "grad_norm": 0.9643325805664062, "learning_rate": 2.9004891624303753e-05, "loss": 1.4988, "mean_token_accuracy": 0.6807292625308037, "num_tokens": 323606568.0, "step": 20080 }, { "epoch": 4.655927685710974, "grad_norm": 1.0210999250411987, "learning_rate": 2.8986368099959856e-05, "loss": 1.5094, "mean_token_accuracy": 0.6789426207542419, "num_tokens": 323767888.0, "step": 20090 }, { "epoch": 4.658245451384865, "grad_norm": 0.9008215069770813, "learning_rate": 2.8967842329757244e-05, "loss": 1.4964, "mean_token_accuracy": 0.6815994203090667, "num_tokens": 323928460.0, "step": 20100 }, { "epoch": 4.660563217058756, "grad_norm": 0.8871430158615112, "learning_rate": 2.8949314324133047e-05, "loss": 1.5116, "mean_token_accuracy": 0.6800642341375351, "num_tokens": 324090226.0, "step": 20110 }, { "epoch": 4.662880982732646, "grad_norm": 0.9940440654754639, "learning_rate": 2.8930784093525675e-05, "loss": 1.4998, "mean_token_accuracy": 0.6815699353814125, "num_tokens": 324252162.0, "step": 20120 }, { "epoch": 4.665198748406536, "grad_norm": 0.8712096810340881, "learning_rate": 2.8912251648374773e-05, "loss": 1.4934, "mean_token_accuracy": 0.6811639428138733, "num_tokens": 324414081.0, "step": 20130 }, { "epoch": 4.667516514080426, "grad_norm": 0.8463303446769714, "learning_rate": 2.8893716999121218e-05, "loss": 1.5022, "mean_token_accuracy": 0.6796222299337387, "num_tokens": 324575570.0, "step": 20140 }, { "epoch": 4.669834279754317, "grad_norm": 0.93817538022995, "learning_rate": 2.8875180156207166e-05, "loss": 1.5107, "mean_token_accuracy": 0.6788549676537514, "num_tokens": 324736128.0, "step": 20150 }, { "epoch": 4.6721520454282075, "grad_norm": 0.8672515749931335, "learning_rate": 2.8856641130075983e-05, "loss": 1.4949, "mean_token_accuracy": 0.6813689082860946, "num_tokens": 324896857.0, "step": 20160 }, { "epoch": 4.674469811102098, "grad_norm": 0.9341351389884949, "learning_rate": 2.883809993117228e-05, "loss": 1.4916, "mean_token_accuracy": 0.6825437724590302, "num_tokens": 325058244.0, "step": 20170 }, { "epoch": 4.676787576775988, "grad_norm": 0.9328093528747559, "learning_rate": 2.8819556569941874e-05, "loss": 1.5072, "mean_token_accuracy": 0.6809431537985802, "num_tokens": 325219145.0, "step": 20180 }, { "epoch": 4.679105342449878, "grad_norm": 0.8621194958686829, "learning_rate": 2.8801011056831815e-05, "loss": 1.4989, "mean_token_accuracy": 0.680427286028862, "num_tokens": 325380929.0, "step": 20190 }, { "epoch": 4.681423108123768, "grad_norm": 0.8997635841369629, "learning_rate": 2.8782463402290366e-05, "loss": 1.5122, "mean_token_accuracy": 0.6802247568964959, "num_tokens": 325542473.0, "step": 20200 }, { "epoch": 4.683740873797659, "grad_norm": 0.8916452527046204, "learning_rate": 2.876391361676698e-05, "loss": 1.5142, "mean_token_accuracy": 0.6779758617281914, "num_tokens": 325703453.0, "step": 20210 }, { "epoch": 4.6860586394715495, "grad_norm": 0.9210074543952942, "learning_rate": 2.8745361710712333e-05, "loss": 1.4957, "mean_token_accuracy": 0.6818508237600327, "num_tokens": 325863943.0, "step": 20220 }, { "epoch": 4.68837640514544, "grad_norm": 0.912294864654541, "learning_rate": 2.872680769457828e-05, "loss": 1.5093, "mean_token_accuracy": 0.6807996436953545, "num_tokens": 326025588.0, "step": 20230 }, { "epoch": 4.69069417081933, "grad_norm": 0.9054368138313293, "learning_rate": 2.8708251578817867e-05, "loss": 1.4977, "mean_token_accuracy": 0.6810285940766334, "num_tokens": 326185962.0, "step": 20240 }, { "epoch": 4.693011936493221, "grad_norm": 0.9186562895774841, "learning_rate": 2.8689693373885335e-05, "loss": 1.4979, "mean_token_accuracy": 0.6806961387395859, "num_tokens": 326347058.0, "step": 20250 }, { "epoch": 4.695329702167111, "grad_norm": 0.8899191617965698, "learning_rate": 2.867113309023608e-05, "loss": 1.503, "mean_token_accuracy": 0.6819074496626853, "num_tokens": 326508944.0, "step": 20260 }, { "epoch": 4.697647467841001, "grad_norm": 0.9646829962730408, "learning_rate": 2.865257073832669e-05, "loss": 1.5062, "mean_token_accuracy": 0.6808078497648239, "num_tokens": 326668801.0, "step": 20270 }, { "epoch": 4.699965233514892, "grad_norm": 0.9493300318717957, "learning_rate": 2.86340063286149e-05, "loss": 1.4959, "mean_token_accuracy": 0.6819528937339783, "num_tokens": 326829793.0, "step": 20280 }, { "epoch": 4.702282999188782, "grad_norm": 0.9023949503898621, "learning_rate": 2.8615439871559624e-05, "loss": 1.5021, "mean_token_accuracy": 0.6818163126707077, "num_tokens": 326991296.0, "step": 20290 }, { "epoch": 4.704600764862672, "grad_norm": 0.9163707494735718, "learning_rate": 2.859687137762092e-05, "loss": 1.5035, "mean_token_accuracy": 0.6810843050479889, "num_tokens": 327153037.0, "step": 20300 }, { "epoch": 4.706918530536563, "grad_norm": 0.9119421243667603, "learning_rate": 2.8578300857259983e-05, "loss": 1.5, "mean_token_accuracy": 0.6806973949074745, "num_tokens": 327314688.0, "step": 20310 }, { "epoch": 4.709236296210453, "grad_norm": 0.9141783118247986, "learning_rate": 2.855972832093917e-05, "loss": 1.4995, "mean_token_accuracy": 0.6817561879754066, "num_tokens": 327475765.0, "step": 20320 }, { "epoch": 4.7115540618843434, "grad_norm": 0.89476078748703, "learning_rate": 2.854115377912196e-05, "loss": 1.5057, "mean_token_accuracy": 0.6810435056686401, "num_tokens": 327637305.0, "step": 20330 }, { "epoch": 4.713871827558234, "grad_norm": 0.8821515440940857, "learning_rate": 2.852257724227296e-05, "loss": 1.483, "mean_token_accuracy": 0.6841313496232033, "num_tokens": 327798418.0, "step": 20340 }, { "epoch": 4.716189593232125, "grad_norm": 0.8822296261787415, "learning_rate": 2.850399872085791e-05, "loss": 1.4887, "mean_token_accuracy": 0.6834469541907311, "num_tokens": 327959538.0, "step": 20350 }, { "epoch": 4.718507358906015, "grad_norm": 0.9244508743286133, "learning_rate": 2.848541822534367e-05, "loss": 1.4898, "mean_token_accuracy": 0.6841522723436355, "num_tokens": 328120561.0, "step": 20360 }, { "epoch": 4.720825124579905, "grad_norm": 0.9353927969932556, "learning_rate": 2.8466835766198203e-05, "loss": 1.5075, "mean_token_accuracy": 0.6799320265650749, "num_tokens": 328281667.0, "step": 20370 }, { "epoch": 4.723142890253795, "grad_norm": 0.8665770888328552, "learning_rate": 2.8448251353890575e-05, "loss": 1.4941, "mean_token_accuracy": 0.6832425579428673, "num_tokens": 328442788.0, "step": 20380 }, { "epoch": 4.7254606559276855, "grad_norm": 0.9072505831718445, "learning_rate": 2.842966499889097e-05, "loss": 1.5116, "mean_token_accuracy": 0.6802829578518867, "num_tokens": 328604413.0, "step": 20390 }, { "epoch": 4.727778421601576, "grad_norm": 0.8946425914764404, "learning_rate": 2.8411076711670642e-05, "loss": 1.5073, "mean_token_accuracy": 0.6787258729338645, "num_tokens": 328765839.0, "step": 20400 }, { "epoch": 4.730096187275467, "grad_norm": 0.9070533514022827, "learning_rate": 2.839248650270196e-05, "loss": 1.4967, "mean_token_accuracy": 0.6820012673735618, "num_tokens": 328927394.0, "step": 20410 }, { "epoch": 4.732413952949357, "grad_norm": 0.9123120903968811, "learning_rate": 2.8373894382458354e-05, "loss": 1.4793, "mean_token_accuracy": 0.6817825078964234, "num_tokens": 329089028.0, "step": 20420 }, { "epoch": 4.734731718623247, "grad_norm": 0.9533379673957825, "learning_rate": 2.835530036141435e-05, "loss": 1.4975, "mean_token_accuracy": 0.680748675763607, "num_tokens": 329250194.0, "step": 20430 }, { "epoch": 4.737049484297137, "grad_norm": 0.9117431044578552, "learning_rate": 2.8336704450045525e-05, "loss": 1.506, "mean_token_accuracy": 0.6804634898900985, "num_tokens": 329411461.0, "step": 20440 }, { "epoch": 4.739367249971028, "grad_norm": 0.9245266318321228, "learning_rate": 2.8318106658828537e-05, "loss": 1.4942, "mean_token_accuracy": 0.6819862097501754, "num_tokens": 329572828.0, "step": 20450 }, { "epoch": 4.741685015644919, "grad_norm": 0.8783653974533081, "learning_rate": 2.8299506998241087e-05, "loss": 1.4988, "mean_token_accuracy": 0.682164865732193, "num_tokens": 329734821.0, "step": 20460 }, { "epoch": 4.744002781318809, "grad_norm": 0.8840728998184204, "learning_rate": 2.8280905478761954e-05, "loss": 1.5069, "mean_token_accuracy": 0.680822703242302, "num_tokens": 329896507.0, "step": 20470 }, { "epoch": 4.746320546992699, "grad_norm": 0.9007934927940369, "learning_rate": 2.8262302110870935e-05, "loss": 1.4981, "mean_token_accuracy": 0.6805658489465714, "num_tokens": 330058015.0, "step": 20480 }, { "epoch": 4.748638312666589, "grad_norm": 0.8743056654930115, "learning_rate": 2.8243696905048888e-05, "loss": 1.5087, "mean_token_accuracy": 0.6809099912643433, "num_tokens": 330218184.0, "step": 20490 }, { "epoch": 4.750956078340479, "grad_norm": 0.9319774508476257, "learning_rate": 2.8225089871777705e-05, "loss": 1.4922, "mean_token_accuracy": 0.6818424463272095, "num_tokens": 330378387.0, "step": 20500 }, { "epoch": 4.7532738440143705, "grad_norm": 0.8671930432319641, "learning_rate": 2.820648102154029e-05, "loss": 1.5166, "mean_token_accuracy": 0.6781008675694465, "num_tokens": 330539263.0, "step": 20510 }, { "epoch": 4.755591609688261, "grad_norm": 0.9097120761871338, "learning_rate": 2.8187870364820614e-05, "loss": 1.4957, "mean_token_accuracy": 0.6815287232398987, "num_tokens": 330700488.0, "step": 20520 }, { "epoch": 4.757909375362151, "grad_norm": 0.8862790465354919, "learning_rate": 2.8169257912103598e-05, "loss": 1.503, "mean_token_accuracy": 0.6800638332962989, "num_tokens": 330861622.0, "step": 20530 }, { "epoch": 4.760227141036041, "grad_norm": 0.9083719253540039, "learning_rate": 2.815064367387523e-05, "loss": 1.4916, "mean_token_accuracy": 0.6813696965575218, "num_tokens": 331022879.0, "step": 20540 }, { "epoch": 4.762544906709931, "grad_norm": 0.8538328409194946, "learning_rate": 2.8132027660622495e-05, "loss": 1.4937, "mean_token_accuracy": 0.6820127457380295, "num_tokens": 331184614.0, "step": 20550 }, { "epoch": 4.764862672383822, "grad_norm": 0.8627985715866089, "learning_rate": 2.8113409882833353e-05, "loss": 1.4964, "mean_token_accuracy": 0.6828329101204872, "num_tokens": 331345671.0, "step": 20560 }, { "epoch": 4.7671804380577125, "grad_norm": 0.916907787322998, "learning_rate": 2.8094790350996792e-05, "loss": 1.5092, "mean_token_accuracy": 0.6785169139504432, "num_tokens": 331505896.0, "step": 20570 }, { "epoch": 4.769498203731603, "grad_norm": 0.9313719272613525, "learning_rate": 2.8076169075602755e-05, "loss": 1.479, "mean_token_accuracy": 0.683844593167305, "num_tokens": 331666573.0, "step": 20580 }, { "epoch": 4.771815969405493, "grad_norm": 0.878381609916687, "learning_rate": 2.8057546067142194e-05, "loss": 1.5007, "mean_token_accuracy": 0.6802169129252433, "num_tokens": 331828539.0, "step": 20590 }, { "epoch": 4.774133735079383, "grad_norm": 0.960483968257904, "learning_rate": 2.8038921336107015e-05, "loss": 1.4967, "mean_token_accuracy": 0.6799083307385445, "num_tokens": 331990502.0, "step": 20600 }, { "epoch": 4.776451500753274, "grad_norm": 0.9140462875366211, "learning_rate": 2.8020294892990123e-05, "loss": 1.491, "mean_token_accuracy": 0.6828981608152389, "num_tokens": 332151344.0, "step": 20610 }, { "epoch": 4.778769266427164, "grad_norm": 0.8831440806388855, "learning_rate": 2.800166674828536e-05, "loss": 1.4838, "mean_token_accuracy": 0.6823757618665696, "num_tokens": 332311001.0, "step": 20620 }, { "epoch": 4.781087032101055, "grad_norm": 0.9320732951164246, "learning_rate": 2.7983036912487538e-05, "loss": 1.4945, "mean_token_accuracy": 0.6829313918948173, "num_tokens": 332471696.0, "step": 20630 }, { "epoch": 4.783404797774945, "grad_norm": 0.980248212814331, "learning_rate": 2.796440539609242e-05, "loss": 1.4992, "mean_token_accuracy": 0.6809113666415214, "num_tokens": 332632514.0, "step": 20640 }, { "epoch": 4.785722563448835, "grad_norm": 0.9504123330116272, "learning_rate": 2.794577220959672e-05, "loss": 1.4903, "mean_token_accuracy": 0.6829050034284592, "num_tokens": 332793917.0, "step": 20650 }, { "epoch": 4.788040329122726, "grad_norm": 0.9494981169700623, "learning_rate": 2.79271373634981e-05, "loss": 1.5085, "mean_token_accuracy": 0.6788305073976517, "num_tokens": 332956020.0, "step": 20660 }, { "epoch": 4.790358094796616, "grad_norm": 1.0056864023208618, "learning_rate": 2.7908500868295123e-05, "loss": 1.5114, "mean_token_accuracy": 0.6799253791570663, "num_tokens": 333117222.0, "step": 20670 }, { "epoch": 4.792675860470506, "grad_norm": 0.9883501529693604, "learning_rate": 2.7889862734487322e-05, "loss": 1.4852, "mean_token_accuracy": 0.6845776975154877, "num_tokens": 333278843.0, "step": 20680 }, { "epoch": 4.794993626144397, "grad_norm": 0.8813920021057129, "learning_rate": 2.7871222972575134e-05, "loss": 1.5027, "mean_token_accuracy": 0.6811360135674477, "num_tokens": 333439965.0, "step": 20690 }, { "epoch": 4.797311391818287, "grad_norm": 0.9197227954864502, "learning_rate": 2.7852581593059906e-05, "loss": 1.5081, "mean_token_accuracy": 0.6801780343055726, "num_tokens": 333601600.0, "step": 20700 }, { "epoch": 4.799629157492178, "grad_norm": 0.879522979259491, "learning_rate": 2.7833938606443917e-05, "loss": 1.4883, "mean_token_accuracy": 0.68299850076437, "num_tokens": 333762760.0, "step": 20710 }, { "epoch": 4.801946923166068, "grad_norm": 0.8780379891395569, "learning_rate": 2.781529402323033e-05, "loss": 1.513, "mean_token_accuracy": 0.6781623959541321, "num_tokens": 333924164.0, "step": 20720 }, { "epoch": 4.804264688839958, "grad_norm": 0.9223511815071106, "learning_rate": 2.779664785392323e-05, "loss": 1.4947, "mean_token_accuracy": 0.6824640721082688, "num_tokens": 334083956.0, "step": 20730 }, { "epoch": 4.8065824545138485, "grad_norm": 0.8629807829856873, "learning_rate": 2.7778000109027574e-05, "loss": 1.5087, "mean_token_accuracy": 0.6790805578231811, "num_tokens": 334245809.0, "step": 20740 }, { "epoch": 4.808900220187739, "grad_norm": 0.8988767266273499, "learning_rate": 2.7759350799049216e-05, "loss": 1.4998, "mean_token_accuracy": 0.6832803979516029, "num_tokens": 334407033.0, "step": 20750 }, { "epoch": 4.81121798586163, "grad_norm": 0.9386240839958191, "learning_rate": 2.7740699934494897e-05, "loss": 1.5172, "mean_token_accuracy": 0.6775680005550384, "num_tokens": 334568366.0, "step": 20760 }, { "epoch": 4.81353575153552, "grad_norm": 0.8916966915130615, "learning_rate": 2.7722047525872218e-05, "loss": 1.4919, "mean_token_accuracy": 0.6836142912507057, "num_tokens": 334729501.0, "step": 20770 }, { "epoch": 4.81585351720941, "grad_norm": 0.9092524647712708, "learning_rate": 2.7703393583689673e-05, "loss": 1.5061, "mean_token_accuracy": 0.6787784725427628, "num_tokens": 334891119.0, "step": 20780 }, { "epoch": 4.8181712828833, "grad_norm": 0.9110711812973022, "learning_rate": 2.76847381184566e-05, "loss": 1.5101, "mean_token_accuracy": 0.6798563346266746, "num_tokens": 335052032.0, "step": 20790 }, { "epoch": 4.8204890485571905, "grad_norm": 0.9295179843902588, "learning_rate": 2.7666081140683205e-05, "loss": 1.5013, "mean_token_accuracy": 0.6802446559071541, "num_tokens": 335212462.0, "step": 20800 }, { "epoch": 4.822806814231082, "grad_norm": 0.9684919118881226, "learning_rate": 2.764742266088054e-05, "loss": 1.5074, "mean_token_accuracy": 0.6804559051990509, "num_tokens": 335373629.0, "step": 20810 }, { "epoch": 4.825124579904972, "grad_norm": 0.8793768882751465, "learning_rate": 2.7628762689560505e-05, "loss": 1.4972, "mean_token_accuracy": 0.6824353590607644, "num_tokens": 335534928.0, "step": 20820 }, { "epoch": 4.827442345578862, "grad_norm": 0.8553288578987122, "learning_rate": 2.7610101237235854e-05, "loss": 1.4962, "mean_token_accuracy": 0.6826825216412544, "num_tokens": 335696285.0, "step": 20830 }, { "epoch": 4.829760111252752, "grad_norm": 0.9560690522193909, "learning_rate": 2.7591438314420147e-05, "loss": 1.4993, "mean_token_accuracy": 0.6806418761610985, "num_tokens": 335858245.0, "step": 20840 }, { "epoch": 4.832077876926642, "grad_norm": 0.9002159833908081, "learning_rate": 2.7572773931627806e-05, "loss": 1.504, "mean_token_accuracy": 0.6796973839402198, "num_tokens": 336018771.0, "step": 20850 }, { "epoch": 4.8343956426005334, "grad_norm": 0.9230687022209167, "learning_rate": 2.7554108099374037e-05, "loss": 1.5108, "mean_token_accuracy": 0.6779971733689308, "num_tokens": 336179898.0, "step": 20860 }, { "epoch": 4.836713408274424, "grad_norm": 0.9046947956085205, "learning_rate": 2.7535440828174907e-05, "loss": 1.5041, "mean_token_accuracy": 0.6798162519931793, "num_tokens": 336341715.0, "step": 20870 }, { "epoch": 4.839031173948314, "grad_norm": 0.9051496386528015, "learning_rate": 2.7516772128547252e-05, "loss": 1.5237, "mean_token_accuracy": 0.678357969224453, "num_tokens": 336503564.0, "step": 20880 }, { "epoch": 4.841348939622204, "grad_norm": 0.9679556488990784, "learning_rate": 2.749810201100874e-05, "loss": 1.5015, "mean_token_accuracy": 0.6814279541373253, "num_tokens": 336665160.0, "step": 20890 }, { "epoch": 4.843666705296094, "grad_norm": 0.9134320020675659, "learning_rate": 2.7479430486077834e-05, "loss": 1.4955, "mean_token_accuracy": 0.6808779954910278, "num_tokens": 336826950.0, "step": 20900 }, { "epoch": 4.845984470969985, "grad_norm": 0.9087101221084595, "learning_rate": 2.746075756427377e-05, "loss": 1.5081, "mean_token_accuracy": 0.6806474655866623, "num_tokens": 336988057.0, "step": 20910 }, { "epoch": 4.8483022366438755, "grad_norm": 0.9239331483840942, "learning_rate": 2.7442083256116606e-05, "loss": 1.5027, "mean_token_accuracy": 0.6807838708162308, "num_tokens": 337149165.0, "step": 20920 }, { "epoch": 4.850620002317766, "grad_norm": 0.9320402145385742, "learning_rate": 2.7423407572127145e-05, "loss": 1.5093, "mean_token_accuracy": 0.6804054453969002, "num_tokens": 337310535.0, "step": 20930 }, { "epoch": 4.852937767991656, "grad_norm": 0.965937077999115, "learning_rate": 2.7404730522826988e-05, "loss": 1.4909, "mean_token_accuracy": 0.6838526532053948, "num_tokens": 337471107.0, "step": 20940 }, { "epoch": 4.855255533665546, "grad_norm": 0.9172646999359131, "learning_rate": 2.7386052118738492e-05, "loss": 1.4958, "mean_token_accuracy": 0.6811097547411918, "num_tokens": 337633039.0, "step": 20950 }, { "epoch": 4.857573299339437, "grad_norm": 0.939111590385437, "learning_rate": 2.7367372370384792e-05, "loss": 1.4962, "mean_token_accuracy": 0.682984645664692, "num_tokens": 337793905.0, "step": 20960 }, { "epoch": 4.859891065013327, "grad_norm": 0.9261757135391235, "learning_rate": 2.7348691288289768e-05, "loss": 1.5023, "mean_token_accuracy": 0.6820013001561165, "num_tokens": 337955331.0, "step": 20970 }, { "epoch": 4.8622088306872175, "grad_norm": 0.8401159644126892, "learning_rate": 2.733000888297806e-05, "loss": 1.4655, "mean_token_accuracy": 0.6857780188322067, "num_tokens": 338116471.0, "step": 20980 }, { "epoch": 4.864526596361108, "grad_norm": 0.9053835272789001, "learning_rate": 2.7311325164975044e-05, "loss": 1.4875, "mean_token_accuracy": 0.683793631196022, "num_tokens": 338277450.0, "step": 20990 }, { "epoch": 4.866844362034998, "grad_norm": 0.9933170676231384, "learning_rate": 2.729264014480684e-05, "loss": 1.5262, "mean_token_accuracy": 0.6773943722248077, "num_tokens": 338438926.0, "step": 21000 }, { "epoch": 4.869162127708889, "grad_norm": 0.9458315968513489, "learning_rate": 2.7273953833000303e-05, "loss": 1.5024, "mean_token_accuracy": 0.6793297246098519, "num_tokens": 338600738.0, "step": 21010 }, { "epoch": 4.871479893382779, "grad_norm": 0.8922477960586548, "learning_rate": 2.7255266240083015e-05, "loss": 1.5013, "mean_token_accuracy": 0.6802693739533424, "num_tokens": 338760888.0, "step": 21020 }, { "epoch": 4.873797659056669, "grad_norm": 0.9172370433807373, "learning_rate": 2.7236577376583282e-05, "loss": 1.499, "mean_token_accuracy": 0.6819846570491791, "num_tokens": 338922556.0, "step": 21030 }, { "epoch": 4.87611542473056, "grad_norm": 0.9017640352249146, "learning_rate": 2.7217887253030117e-05, "loss": 1.4958, "mean_token_accuracy": 0.6810936689376831, "num_tokens": 339084588.0, "step": 21040 }, { "epoch": 4.87843319040445, "grad_norm": 0.8698016405105591, "learning_rate": 2.7199195879953254e-05, "loss": 1.5048, "mean_token_accuracy": 0.6794813677668572, "num_tokens": 339246322.0, "step": 21050 }, { "epoch": 4.880750956078341, "grad_norm": 0.8995314836502075, "learning_rate": 2.7180503267883128e-05, "loss": 1.4968, "mean_token_accuracy": 0.6800409495830536, "num_tokens": 339407579.0, "step": 21060 }, { "epoch": 4.883068721752231, "grad_norm": 0.9016774296760559, "learning_rate": 2.716180942735087e-05, "loss": 1.4937, "mean_token_accuracy": 0.6805693507194519, "num_tokens": 339569270.0, "step": 21070 }, { "epoch": 4.885386487426121, "grad_norm": 0.9168453216552734, "learning_rate": 2.71431143688883e-05, "loss": 1.5012, "mean_token_accuracy": 0.6806706786155701, "num_tokens": 339731102.0, "step": 21080 }, { "epoch": 4.887704253100011, "grad_norm": 0.9241815209388733, "learning_rate": 2.7124418103027928e-05, "loss": 1.4886, "mean_token_accuracy": 0.6820501402020455, "num_tokens": 339892937.0, "step": 21090 }, { "epoch": 4.890022018773902, "grad_norm": 0.9412543177604675, "learning_rate": 2.7105720640302944e-05, "loss": 1.505, "mean_token_accuracy": 0.6803142979741097, "num_tokens": 340054134.0, "step": 21100 }, { "epoch": 4.892339784447793, "grad_norm": 0.9357321858406067, "learning_rate": 2.7087021991247212e-05, "loss": 1.4923, "mean_token_accuracy": 0.6820744290947914, "num_tokens": 340215850.0, "step": 21110 }, { "epoch": 4.894657550121683, "grad_norm": 0.8933391571044922, "learning_rate": 2.7068322166395272e-05, "loss": 1.4981, "mean_token_accuracy": 0.6813022255897522, "num_tokens": 340376761.0, "step": 21120 }, { "epoch": 4.896975315795573, "grad_norm": 0.9515575766563416, "learning_rate": 2.70496211762823e-05, "loss": 1.4921, "mean_token_accuracy": 0.6816424712538719, "num_tokens": 340537993.0, "step": 21130 }, { "epoch": 4.899293081469463, "grad_norm": 0.9112266898155212, "learning_rate": 2.7030919031444168e-05, "loss": 1.4908, "mean_token_accuracy": 0.681924170255661, "num_tokens": 340698611.0, "step": 21140 }, { "epoch": 4.9016108471433535, "grad_norm": 0.891154944896698, "learning_rate": 2.7012215742417368e-05, "loss": 1.5015, "mean_token_accuracy": 0.6820254668593406, "num_tokens": 340860538.0, "step": 21150 }, { "epoch": 4.903928612817245, "grad_norm": 0.8975220322608948, "learning_rate": 2.6993511319739046e-05, "loss": 1.5297, "mean_token_accuracy": 0.6780434444546699, "num_tokens": 341022099.0, "step": 21160 }, { "epoch": 4.906246378491135, "grad_norm": 0.9924400448799133, "learning_rate": 2.6974805773946992e-05, "loss": 1.5103, "mean_token_accuracy": 0.6793743699789048, "num_tokens": 341183834.0, "step": 21170 }, { "epoch": 4.908564144165025, "grad_norm": 0.8825339078903198, "learning_rate": 2.6956099115579612e-05, "loss": 1.4928, "mean_token_accuracy": 0.6836494684219361, "num_tokens": 341344905.0, "step": 21180 }, { "epoch": 4.910881909838915, "grad_norm": 0.94589763879776, "learning_rate": 2.6937391355175968e-05, "loss": 1.4932, "mean_token_accuracy": 0.6816864296793937, "num_tokens": 341506766.0, "step": 21190 }, { "epoch": 4.913199675512805, "grad_norm": 0.9898949861526489, "learning_rate": 2.6918682503275716e-05, "loss": 1.4929, "mean_token_accuracy": 0.6813726469874382, "num_tokens": 341667790.0, "step": 21200 }, { "epoch": 4.9155174411866955, "grad_norm": 0.8857076168060303, "learning_rate": 2.6899972570419136e-05, "loss": 1.4882, "mean_token_accuracy": 0.6834507688879967, "num_tokens": 341829316.0, "step": 21210 }, { "epoch": 4.917835206860587, "grad_norm": 0.9187468886375427, "learning_rate": 2.6881261567147125e-05, "loss": 1.5071, "mean_token_accuracy": 0.6810047879815102, "num_tokens": 341990896.0, "step": 21220 }, { "epoch": 4.920152972534477, "grad_norm": 0.978598415851593, "learning_rate": 2.686254950400116e-05, "loss": 1.506, "mean_token_accuracy": 0.6788967862725258, "num_tokens": 342152209.0, "step": 21230 }, { "epoch": 4.922470738208367, "grad_norm": 0.9013439416885376, "learning_rate": 2.684383639152335e-05, "loss": 1.4901, "mean_token_accuracy": 0.6814667776226997, "num_tokens": 342312925.0, "step": 21240 }, { "epoch": 4.924788503882257, "grad_norm": 0.846361517906189, "learning_rate": 2.6825122240256357e-05, "loss": 1.4957, "mean_token_accuracy": 0.6809542492032051, "num_tokens": 342474317.0, "step": 21250 }, { "epoch": 4.927106269556148, "grad_norm": 0.9342813491821289, "learning_rate": 2.680640706074346e-05, "loss": 1.5051, "mean_token_accuracy": 0.6806364014744759, "num_tokens": 342635438.0, "step": 21260 }, { "epoch": 4.9294240352300385, "grad_norm": 0.8774763345718384, "learning_rate": 2.6787690863528498e-05, "loss": 1.4948, "mean_token_accuracy": 0.6831057816743851, "num_tokens": 342796122.0, "step": 21270 }, { "epoch": 4.931741800903929, "grad_norm": 0.915189802646637, "learning_rate": 2.6768973659155893e-05, "loss": 1.5246, "mean_token_accuracy": 0.6785590916872024, "num_tokens": 342956798.0, "step": 21280 }, { "epoch": 4.934059566577819, "grad_norm": 0.902085542678833, "learning_rate": 2.6750255458170635e-05, "loss": 1.4971, "mean_token_accuracy": 0.6818168953061103, "num_tokens": 343116463.0, "step": 21290 }, { "epoch": 4.936377332251709, "grad_norm": 0.9668533205986023, "learning_rate": 2.6731536271118263e-05, "loss": 1.5173, "mean_token_accuracy": 0.6789063930511474, "num_tokens": 343277894.0, "step": 21300 }, { "epoch": 4.938695097925599, "grad_norm": 0.9853602647781372, "learning_rate": 2.6712816108544887e-05, "loss": 1.4891, "mean_token_accuracy": 0.6821427926421165, "num_tokens": 343438340.0, "step": 21310 }, { "epoch": 4.94101286359949, "grad_norm": 0.8755797147750854, "learning_rate": 2.669409498099716e-05, "loss": 1.4863, "mean_token_accuracy": 0.6823623239994049, "num_tokens": 343600035.0, "step": 21320 }, { "epoch": 4.9433306292733805, "grad_norm": 0.9306771159172058, "learning_rate": 2.6675372899022266e-05, "loss": 1.4938, "mean_token_accuracy": 0.6813113451004028, "num_tokens": 343761538.0, "step": 21330 }, { "epoch": 4.945648394947271, "grad_norm": 0.9444568753242493, "learning_rate": 2.6656649873167955e-05, "loss": 1.5126, "mean_token_accuracy": 0.6807903602719307, "num_tokens": 343923122.0, "step": 21340 }, { "epoch": 4.947966160621161, "grad_norm": 0.9092767238616943, "learning_rate": 2.6637925913982485e-05, "loss": 1.5085, "mean_token_accuracy": 0.6801023736596108, "num_tokens": 344084875.0, "step": 21350 }, { "epoch": 4.950283926295052, "grad_norm": 0.9131573438644409, "learning_rate": 2.6619201032014657e-05, "loss": 1.4926, "mean_token_accuracy": 0.6827234536409378, "num_tokens": 344242768.0, "step": 21360 }, { "epoch": 4.952601691968942, "grad_norm": 0.9534137845039368, "learning_rate": 2.6600475237813765e-05, "loss": 1.5125, "mean_token_accuracy": 0.6804347664117814, "num_tokens": 344403816.0, "step": 21370 }, { "epoch": 4.954919457642832, "grad_norm": 0.9177855253219604, "learning_rate": 2.658174854192965e-05, "loss": 1.5106, "mean_token_accuracy": 0.6790830388665199, "num_tokens": 344565136.0, "step": 21380 }, { "epoch": 4.957237223316723, "grad_norm": 0.878295361995697, "learning_rate": 2.6563020954912643e-05, "loss": 1.4849, "mean_token_accuracy": 0.6815316528081894, "num_tokens": 344725661.0, "step": 21390 }, { "epoch": 4.959554988990613, "grad_norm": 0.8974469900131226, "learning_rate": 2.6544292487313576e-05, "loss": 1.4796, "mean_token_accuracy": 0.6844438046216965, "num_tokens": 344886942.0, "step": 21400 }, { "epoch": 4.961872754664503, "grad_norm": 0.9263953566551208, "learning_rate": 2.6525563149683774e-05, "loss": 1.494, "mean_token_accuracy": 0.6811858788132668, "num_tokens": 345048626.0, "step": 21410 }, { "epoch": 4.964190520338394, "grad_norm": 0.9609734416007996, "learning_rate": 2.650683295257507e-05, "loss": 1.4993, "mean_token_accuracy": 0.6813919723033905, "num_tokens": 345210139.0, "step": 21420 }, { "epoch": 4.966508286012284, "grad_norm": 0.8944326043128967, "learning_rate": 2.648810190653977e-05, "loss": 1.5095, "mean_token_accuracy": 0.6810818046331406, "num_tokens": 345371738.0, "step": 21430 }, { "epoch": 4.968826051686174, "grad_norm": 0.9091264009475708, "learning_rate": 2.6469370022130652e-05, "loss": 1.4867, "mean_token_accuracy": 0.6831229433417321, "num_tokens": 345531241.0, "step": 21440 }, { "epoch": 4.971143817360065, "grad_norm": 0.8924766182899475, "learning_rate": 2.6450637309900978e-05, "loss": 1.4931, "mean_token_accuracy": 0.6823897302150727, "num_tokens": 345692599.0, "step": 21450 }, { "epoch": 4.973461583033956, "grad_norm": 0.9903618097305298, "learning_rate": 2.6431903780404467e-05, "loss": 1.5179, "mean_token_accuracy": 0.6775544837117196, "num_tokens": 345853748.0, "step": 21460 }, { "epoch": 4.975779348707846, "grad_norm": 0.9315907955169678, "learning_rate": 2.64131694441953e-05, "loss": 1.492, "mean_token_accuracy": 0.6816113382577896, "num_tokens": 346013823.0, "step": 21470 }, { "epoch": 4.978097114381736, "grad_norm": 0.9503138661384583, "learning_rate": 2.6394434311828124e-05, "loss": 1.5059, "mean_token_accuracy": 0.6792439520359039, "num_tokens": 346175670.0, "step": 21480 }, { "epoch": 4.980414880055626, "grad_norm": 0.9012943506240845, "learning_rate": 2.6375698393858018e-05, "loss": 1.4878, "mean_token_accuracy": 0.6830983996391297, "num_tokens": 346337200.0, "step": 21490 }, { "epoch": 4.9827326457295165, "grad_norm": 0.9150798916816711, "learning_rate": 2.635696170084052e-05, "loss": 1.4953, "mean_token_accuracy": 0.6815557822585105, "num_tokens": 346498386.0, "step": 21500 }, { "epoch": 4.985050411403407, "grad_norm": 0.9057941436767578, "learning_rate": 2.6338224243331587e-05, "loss": 1.4907, "mean_token_accuracy": 0.6823256239295006, "num_tokens": 346659003.0, "step": 21510 }, { "epoch": 4.987368177077298, "grad_norm": 0.9217302203178406, "learning_rate": 2.6319486031887615e-05, "loss": 1.5018, "mean_token_accuracy": 0.6802535191178322, "num_tokens": 346819635.0, "step": 21520 }, { "epoch": 4.989685942751188, "grad_norm": 0.8851320147514343, "learning_rate": 2.6300747077065428e-05, "loss": 1.5143, "mean_token_accuracy": 0.6786890581250191, "num_tokens": 346981037.0, "step": 21530 }, { "epoch": 4.992003708425078, "grad_norm": 0.971131443977356, "learning_rate": 2.628200738942227e-05, "loss": 1.5057, "mean_token_accuracy": 0.6805704697966576, "num_tokens": 347142324.0, "step": 21540 }, { "epoch": 4.994321474098968, "grad_norm": 0.9159948229789734, "learning_rate": 2.626326697951578e-05, "loss": 1.5086, "mean_token_accuracy": 0.6807772532105446, "num_tokens": 347304124.0, "step": 21550 }, { "epoch": 4.996639239772859, "grad_norm": 0.9096494913101196, "learning_rate": 2.624452585790404e-05, "loss": 1.5051, "mean_token_accuracy": 0.6802571281790734, "num_tokens": 347465656.0, "step": 21560 }, { "epoch": 4.99895700544675, "grad_norm": 0.9407311677932739, "learning_rate": 2.6225784035145496e-05, "loss": 1.4871, "mean_token_accuracy": 0.6834933295845985, "num_tokens": 347627037.0, "step": 21570 }, { "epoch": 5.001158882836945, "grad_norm": 0.8712770938873291, "learning_rate": 2.6207041521799e-05, "loss": 1.4672, "mean_token_accuracy": 0.6857596824043676, "num_tokens": 347779389.0, "step": 21580 }, { "epoch": 5.003476648510835, "grad_norm": 0.9607505202293396, "learning_rate": 2.618829832842381e-05, "loss": 1.4565, "mean_token_accuracy": 0.6884066298604011, "num_tokens": 347940546.0, "step": 21590 }, { "epoch": 5.0057944141847255, "grad_norm": 0.9403126835823059, "learning_rate": 2.616955446557955e-05, "loss": 1.4817, "mean_token_accuracy": 0.6824418470263481, "num_tokens": 348102260.0, "step": 21600 }, { "epoch": 5.008112179858617, "grad_norm": 0.9400493502616882, "learning_rate": 2.6150809943826216e-05, "loss": 1.4662, "mean_token_accuracy": 0.6858570143580437, "num_tokens": 348263202.0, "step": 21610 }, { "epoch": 5.010429945532507, "grad_norm": 0.9023087024688721, "learning_rate": 2.6132064773724195e-05, "loss": 1.4778, "mean_token_accuracy": 0.6844580337405205, "num_tokens": 348423848.0, "step": 21620 }, { "epoch": 5.012747711206397, "grad_norm": 0.8537144064903259, "learning_rate": 2.6113318965834223e-05, "loss": 1.4809, "mean_token_accuracy": 0.6834542691707611, "num_tokens": 348584628.0, "step": 21630 }, { "epoch": 5.015065476880287, "grad_norm": 0.9162819981575012, "learning_rate": 2.6094572530717403e-05, "loss": 1.474, "mean_token_accuracy": 0.6862503916025162, "num_tokens": 348745093.0, "step": 21640 }, { "epoch": 5.017383242554177, "grad_norm": 0.9667580723762512, "learning_rate": 2.607582547893519e-05, "loss": 1.4785, "mean_token_accuracy": 0.6846204996109009, "num_tokens": 348906932.0, "step": 21650 }, { "epoch": 5.019701008228068, "grad_norm": 0.9105300307273865, "learning_rate": 2.6057077821049387e-05, "loss": 1.4629, "mean_token_accuracy": 0.6863274514675141, "num_tokens": 349068313.0, "step": 21660 }, { "epoch": 5.022018773901959, "grad_norm": 0.9674612283706665, "learning_rate": 2.603832956762213e-05, "loss": 1.4704, "mean_token_accuracy": 0.684069998562336, "num_tokens": 349229756.0, "step": 21670 }, { "epoch": 5.024336539575849, "grad_norm": 0.9627320170402527, "learning_rate": 2.60195807292159e-05, "loss": 1.4873, "mean_token_accuracy": 0.6829053670167923, "num_tokens": 349388684.0, "step": 21680 }, { "epoch": 5.026654305249739, "grad_norm": 0.954100489616394, "learning_rate": 2.6000831316393516e-05, "loss": 1.4776, "mean_token_accuracy": 0.6828211262822151, "num_tokens": 349550284.0, "step": 21690 }, { "epoch": 5.028972070923629, "grad_norm": 1.0053691864013672, "learning_rate": 2.59820813397181e-05, "loss": 1.4785, "mean_token_accuracy": 0.683412367105484, "num_tokens": 349712151.0, "step": 21700 }, { "epoch": 5.03128983659752, "grad_norm": 0.9003102779388428, "learning_rate": 2.59633308097531e-05, "loss": 1.4816, "mean_token_accuracy": 0.6845495015382767, "num_tokens": 349873489.0, "step": 21710 }, { "epoch": 5.0336076022714105, "grad_norm": 0.9737129807472229, "learning_rate": 2.5944579737062285e-05, "loss": 1.4844, "mean_token_accuracy": 0.6832154095172882, "num_tokens": 350033602.0, "step": 21720 }, { "epoch": 5.035925367945301, "grad_norm": 0.9397016763687134, "learning_rate": 2.5925828132209712e-05, "loss": 1.4699, "mean_token_accuracy": 0.6844214573502541, "num_tokens": 350195200.0, "step": 21730 }, { "epoch": 5.038243133619191, "grad_norm": 0.8948540091514587, "learning_rate": 2.5907076005759762e-05, "loss": 1.4945, "mean_token_accuracy": 0.68172457665205, "num_tokens": 350356205.0, "step": 21740 }, { "epoch": 5.040560899293081, "grad_norm": 0.9692331552505493, "learning_rate": 2.5888323368277078e-05, "loss": 1.4839, "mean_token_accuracy": 0.6834735184907913, "num_tokens": 350515945.0, "step": 21750 }, { "epoch": 5.042878664966972, "grad_norm": 0.9973872900009155, "learning_rate": 2.5869570230326627e-05, "loss": 1.4673, "mean_token_accuracy": 0.6851476266980171, "num_tokens": 350677513.0, "step": 21760 }, { "epoch": 5.045196430640862, "grad_norm": 0.9529513716697693, "learning_rate": 2.5850816602473628e-05, "loss": 1.4651, "mean_token_accuracy": 0.6867429852485657, "num_tokens": 350838191.0, "step": 21770 }, { "epoch": 5.0475141963147525, "grad_norm": 0.9840685129165649, "learning_rate": 2.5832062495283592e-05, "loss": 1.4741, "mean_token_accuracy": 0.6846344411373139, "num_tokens": 350999115.0, "step": 21780 }, { "epoch": 5.049831961988643, "grad_norm": 0.9780298471450806, "learning_rate": 2.5813307919322295e-05, "loss": 1.4892, "mean_token_accuracy": 0.6827963098883629, "num_tokens": 351160830.0, "step": 21790 }, { "epoch": 5.052149727662533, "grad_norm": 0.9730815291404724, "learning_rate": 2.579455288515577e-05, "loss": 1.4895, "mean_token_accuracy": 0.6842002525925637, "num_tokens": 351321266.0, "step": 21800 }, { "epoch": 5.054467493336424, "grad_norm": 0.9221318364143372, "learning_rate": 2.5775797403350327e-05, "loss": 1.4698, "mean_token_accuracy": 0.6842155322432518, "num_tokens": 351481509.0, "step": 21810 }, { "epoch": 5.056785259010314, "grad_norm": 0.9607990980148315, "learning_rate": 2.575704148447251e-05, "loss": 1.4811, "mean_token_accuracy": 0.6838480457663536, "num_tokens": 351643006.0, "step": 21820 }, { "epoch": 5.059103024684204, "grad_norm": 0.9370882511138916, "learning_rate": 2.5738285139089125e-05, "loss": 1.4919, "mean_token_accuracy": 0.683330950140953, "num_tokens": 351804584.0, "step": 21830 }, { "epoch": 5.061420790358095, "grad_norm": 0.9411900639533997, "learning_rate": 2.5719528377767197e-05, "loss": 1.5014, "mean_token_accuracy": 0.6819626912474632, "num_tokens": 351966317.0, "step": 21840 }, { "epoch": 5.063738556031985, "grad_norm": 0.9327226877212524, "learning_rate": 2.570077121107401e-05, "loss": 1.477, "mean_token_accuracy": 0.6854543387889862, "num_tokens": 352127316.0, "step": 21850 }, { "epoch": 5.066056321705876, "grad_norm": 0.925276517868042, "learning_rate": 2.5682013649577063e-05, "loss": 1.4782, "mean_token_accuracy": 0.6827680572867394, "num_tokens": 352288594.0, "step": 21860 }, { "epoch": 5.068374087379766, "grad_norm": 0.8997163772583008, "learning_rate": 2.566325570384407e-05, "loss": 1.4864, "mean_token_accuracy": 0.6825277611613274, "num_tokens": 352450552.0, "step": 21870 }, { "epoch": 5.070691853053656, "grad_norm": 0.9762564897537231, "learning_rate": 2.5644497384442973e-05, "loss": 1.4966, "mean_token_accuracy": 0.6812527790665627, "num_tokens": 352611928.0, "step": 21880 }, { "epoch": 5.073009618727546, "grad_norm": 0.9634246826171875, "learning_rate": 2.562573870194192e-05, "loss": 1.465, "mean_token_accuracy": 0.6856980711221695, "num_tokens": 352773119.0, "step": 21890 }, { "epoch": 5.075327384401437, "grad_norm": 0.9280925393104553, "learning_rate": 2.5606979666909277e-05, "loss": 1.482, "mean_token_accuracy": 0.6825018554925919, "num_tokens": 352934936.0, "step": 21900 }, { "epoch": 5.077645150075328, "grad_norm": 0.9629049301147461, "learning_rate": 2.5588220289913582e-05, "loss": 1.4744, "mean_token_accuracy": 0.6849344655871391, "num_tokens": 353096722.0, "step": 21910 }, { "epoch": 5.079962915749218, "grad_norm": 0.9537055492401123, "learning_rate": 2.5569460581523596e-05, "loss": 1.4822, "mean_token_accuracy": 0.6834409594535827, "num_tokens": 353256982.0, "step": 21920 }, { "epoch": 5.082280681423108, "grad_norm": 0.9420868158340454, "learning_rate": 2.555070055230824e-05, "loss": 1.4833, "mean_token_accuracy": 0.6827144980430603, "num_tokens": 353417059.0, "step": 21930 }, { "epoch": 5.084598447096998, "grad_norm": 0.9972743391990662, "learning_rate": 2.5531940212836626e-05, "loss": 1.4764, "mean_token_accuracy": 0.6830799922347068, "num_tokens": 353578241.0, "step": 21940 }, { "epoch": 5.0869162127708885, "grad_norm": 0.9510798454284668, "learning_rate": 2.551317957367805e-05, "loss": 1.501, "mean_token_accuracy": 0.681732676923275, "num_tokens": 353740004.0, "step": 21950 }, { "epoch": 5.0892339784447795, "grad_norm": 0.9148961901664734, "learning_rate": 2.5494418645401962e-05, "loss": 1.4771, "mean_token_accuracy": 0.683317206799984, "num_tokens": 353901872.0, "step": 21960 }, { "epoch": 5.09155174411867, "grad_norm": 0.9757585525512695, "learning_rate": 2.5475657438577986e-05, "loss": 1.4811, "mean_token_accuracy": 0.6847433373332024, "num_tokens": 354061550.0, "step": 21970 }, { "epoch": 5.09386950979256, "grad_norm": 0.9347004890441895, "learning_rate": 2.5456895963775897e-05, "loss": 1.4848, "mean_token_accuracy": 0.6836804106831551, "num_tokens": 354223288.0, "step": 21980 }, { "epoch": 5.09618727546645, "grad_norm": 0.9148557186126709, "learning_rate": 2.543813423156562e-05, "loss": 1.468, "mean_token_accuracy": 0.6842194035649299, "num_tokens": 354384762.0, "step": 21990 }, { "epoch": 5.09850504114034, "grad_norm": 0.937114417552948, "learning_rate": 2.541937225251723e-05, "loss": 1.4977, "mean_token_accuracy": 0.6813980966806412, "num_tokens": 354545564.0, "step": 22000 }, { "epoch": 5.100822806814231, "grad_norm": 0.9194408655166626, "learning_rate": 2.5400610037200938e-05, "loss": 1.4913, "mean_token_accuracy": 0.6812768086791039, "num_tokens": 354707044.0, "step": 22010 }, { "epoch": 5.103140572488122, "grad_norm": 0.9055007696151733, "learning_rate": 2.538184759618709e-05, "loss": 1.4704, "mean_token_accuracy": 0.6855472177267075, "num_tokens": 354868901.0, "step": 22020 }, { "epoch": 5.105458338162012, "grad_norm": 0.9127270579338074, "learning_rate": 2.5363084940046154e-05, "loss": 1.482, "mean_token_accuracy": 0.6830404534935951, "num_tokens": 355029358.0, "step": 22030 }, { "epoch": 5.107776103835902, "grad_norm": 0.9175198078155518, "learning_rate": 2.5344322079348726e-05, "loss": 1.4834, "mean_token_accuracy": 0.6829610168933868, "num_tokens": 355190317.0, "step": 22040 }, { "epoch": 5.110093869509792, "grad_norm": 0.9687806367874146, "learning_rate": 2.532555902466552e-05, "loss": 1.4703, "mean_token_accuracy": 0.6850745037198067, "num_tokens": 355351860.0, "step": 22050 }, { "epoch": 5.112411635183683, "grad_norm": 0.916983962059021, "learning_rate": 2.5306795786567344e-05, "loss": 1.4705, "mean_token_accuracy": 0.6865906685590744, "num_tokens": 355512042.0, "step": 22060 }, { "epoch": 5.114729400857573, "grad_norm": 0.9558030962944031, "learning_rate": 2.528803237562512e-05, "loss": 1.5131, "mean_token_accuracy": 0.6798249170184135, "num_tokens": 355671703.0, "step": 22070 }, { "epoch": 5.117047166531464, "grad_norm": 0.9361735582351685, "learning_rate": 2.5269268802409874e-05, "loss": 1.479, "mean_token_accuracy": 0.6821359485387802, "num_tokens": 355833080.0, "step": 22080 }, { "epoch": 5.119364932205354, "grad_norm": 0.9263583421707153, "learning_rate": 2.5250505077492702e-05, "loss": 1.4706, "mean_token_accuracy": 0.6858884364366531, "num_tokens": 355994009.0, "step": 22090 }, { "epoch": 5.121682697879244, "grad_norm": 0.9114333391189575, "learning_rate": 2.523174121144481e-05, "loss": 1.4709, "mean_token_accuracy": 0.6848941057920456, "num_tokens": 356155459.0, "step": 22100 }, { "epoch": 5.124000463553135, "grad_norm": 0.8933506011962891, "learning_rate": 2.5212977214837474e-05, "loss": 1.4756, "mean_token_accuracy": 0.6866590216755867, "num_tokens": 356317130.0, "step": 22110 }, { "epoch": 5.126318229227025, "grad_norm": 0.9269260168075562, "learning_rate": 2.519421309824203e-05, "loss": 1.4684, "mean_token_accuracy": 0.6850204199552536, "num_tokens": 356478211.0, "step": 22120 }, { "epoch": 5.1286359949009155, "grad_norm": 0.9643650650978088, "learning_rate": 2.5175448872229905e-05, "loss": 1.4687, "mean_token_accuracy": 0.6861467435956001, "num_tokens": 356639704.0, "step": 22130 }, { "epoch": 5.130953760574806, "grad_norm": 0.9230321049690247, "learning_rate": 2.5156684547372578e-05, "loss": 1.4908, "mean_token_accuracy": 0.6820364579558372, "num_tokens": 356800966.0, "step": 22140 }, { "epoch": 5.133271526248696, "grad_norm": 0.898551344871521, "learning_rate": 2.5137920134241576e-05, "loss": 1.4805, "mean_token_accuracy": 0.6825964480638504, "num_tokens": 356962139.0, "step": 22150 }, { "epoch": 5.135589291922587, "grad_norm": 0.9550626873970032, "learning_rate": 2.5119155643408482e-05, "loss": 1.4926, "mean_token_accuracy": 0.6822183683514595, "num_tokens": 357122240.0, "step": 22160 }, { "epoch": 5.137907057596477, "grad_norm": 0.9313462376594543, "learning_rate": 2.5100391085444923e-05, "loss": 1.4773, "mean_token_accuracy": 0.6849136248230934, "num_tokens": 357283982.0, "step": 22170 }, { "epoch": 5.140224823270367, "grad_norm": 0.8963204622268677, "learning_rate": 2.5081626470922565e-05, "loss": 1.4715, "mean_token_accuracy": 0.6838357836008072, "num_tokens": 357445326.0, "step": 22180 }, { "epoch": 5.1425425889442575, "grad_norm": 0.8951752185821533, "learning_rate": 2.5062861810413108e-05, "loss": 1.4651, "mean_token_accuracy": 0.6860166862607002, "num_tokens": 357606879.0, "step": 22190 }, { "epoch": 5.144860354618148, "grad_norm": 0.9479119777679443, "learning_rate": 2.5044097114488264e-05, "loss": 1.4874, "mean_token_accuracy": 0.6831406340003013, "num_tokens": 357768030.0, "step": 22200 }, { "epoch": 5.147178120292039, "grad_norm": 0.9346294403076172, "learning_rate": 2.5025332393719786e-05, "loss": 1.4705, "mean_token_accuracy": 0.6864842981100082, "num_tokens": 357928181.0, "step": 22210 }, { "epoch": 5.149495885965929, "grad_norm": 0.896194338798523, "learning_rate": 2.500656765867942e-05, "loss": 1.4778, "mean_token_accuracy": 0.6843934744596482, "num_tokens": 358088939.0, "step": 22220 }, { "epoch": 5.151813651639819, "grad_norm": 0.9455170035362244, "learning_rate": 2.498780291993894e-05, "loss": 1.4789, "mean_token_accuracy": 0.6827389016747475, "num_tokens": 358250088.0, "step": 22230 }, { "epoch": 5.154131417313709, "grad_norm": 0.9557190537452698, "learning_rate": 2.4969038188070108e-05, "loss": 1.4845, "mean_token_accuracy": 0.6812279194593429, "num_tokens": 358411089.0, "step": 22240 }, { "epoch": 5.1564491829876, "grad_norm": 0.9896844625473022, "learning_rate": 2.4950273473644693e-05, "loss": 1.4732, "mean_token_accuracy": 0.6843216121196747, "num_tokens": 358571177.0, "step": 22250 }, { "epoch": 5.158766948661491, "grad_norm": 0.9954651594161987, "learning_rate": 2.4931508787234432e-05, "loss": 1.4887, "mean_token_accuracy": 0.6822498828172684, "num_tokens": 358732861.0, "step": 22260 }, { "epoch": 5.161084714335381, "grad_norm": 0.9026806950569153, "learning_rate": 2.4912744139411072e-05, "loss": 1.492, "mean_token_accuracy": 0.6834847837686538, "num_tokens": 358893732.0, "step": 22270 }, { "epoch": 5.163402480009271, "grad_norm": 0.9352847933769226, "learning_rate": 2.489397954074633e-05, "loss": 1.4751, "mean_token_accuracy": 0.6848849281668663, "num_tokens": 359054349.0, "step": 22280 }, { "epoch": 5.165720245683161, "grad_norm": 0.9371057152748108, "learning_rate": 2.4875215001811884e-05, "loss": 1.4753, "mean_token_accuracy": 0.6838301494717598, "num_tokens": 359216041.0, "step": 22290 }, { "epoch": 5.168038011357051, "grad_norm": 0.9358397722244263, "learning_rate": 2.48564505331794e-05, "loss": 1.4823, "mean_token_accuracy": 0.6835426032543183, "num_tokens": 359377318.0, "step": 22300 }, { "epoch": 5.1703557770309425, "grad_norm": 0.9259112477302551, "learning_rate": 2.4837686145420486e-05, "loss": 1.4795, "mean_token_accuracy": 0.6836799383163452, "num_tokens": 359538231.0, "step": 22310 }, { "epoch": 5.172673542704833, "grad_norm": 1.032861590385437, "learning_rate": 2.481892184910671e-05, "loss": 1.4806, "mean_token_accuracy": 0.6831195071339607, "num_tokens": 359698822.0, "step": 22320 }, { "epoch": 5.174991308378723, "grad_norm": 0.9264110326766968, "learning_rate": 2.480015765480959e-05, "loss": 1.4894, "mean_token_accuracy": 0.6839231625199318, "num_tokens": 359860987.0, "step": 22330 }, { "epoch": 5.177309074052613, "grad_norm": 0.9379228353500366, "learning_rate": 2.4781393573100585e-05, "loss": 1.4865, "mean_token_accuracy": 0.6832414999604225, "num_tokens": 360022726.0, "step": 22340 }, { "epoch": 5.179626839726503, "grad_norm": 0.9728876352310181, "learning_rate": 2.4762629614551092e-05, "loss": 1.4765, "mean_token_accuracy": 0.6847902268171311, "num_tokens": 360184273.0, "step": 22350 }, { "epoch": 5.181944605400394, "grad_norm": 0.9223935604095459, "learning_rate": 2.4743865789732435e-05, "loss": 1.4655, "mean_token_accuracy": 0.6865390434861183, "num_tokens": 360345113.0, "step": 22360 }, { "epoch": 5.184262371074285, "grad_norm": 0.925032913684845, "learning_rate": 2.472510210921587e-05, "loss": 1.487, "mean_token_accuracy": 0.6842687666416168, "num_tokens": 360506253.0, "step": 22370 }, { "epoch": 5.186580136748175, "grad_norm": 0.9689394235610962, "learning_rate": 2.4706338583572566e-05, "loss": 1.4778, "mean_token_accuracy": 0.6846802130341529, "num_tokens": 360667802.0, "step": 22380 }, { "epoch": 5.188897902422065, "grad_norm": 0.9185059070587158, "learning_rate": 2.4687575223373602e-05, "loss": 1.4738, "mean_token_accuracy": 0.6845926836133003, "num_tokens": 360829410.0, "step": 22390 }, { "epoch": 5.191215668095955, "grad_norm": 0.9430227875709534, "learning_rate": 2.4668812039189974e-05, "loss": 1.4866, "mean_token_accuracy": 0.6841532275080681, "num_tokens": 360991098.0, "step": 22400 }, { "epoch": 5.193533433769846, "grad_norm": 0.9766164422035217, "learning_rate": 2.4650049041592567e-05, "loss": 1.4831, "mean_token_accuracy": 0.6845285207033157, "num_tokens": 361152149.0, "step": 22410 }, { "epoch": 5.195851199443736, "grad_norm": 0.9308554530143738, "learning_rate": 2.4631286241152168e-05, "loss": 1.4884, "mean_token_accuracy": 0.6816630512475967, "num_tokens": 361313974.0, "step": 22420 }, { "epoch": 5.198168965117627, "grad_norm": 0.927574098110199, "learning_rate": 2.4612523648439454e-05, "loss": 1.491, "mean_token_accuracy": 0.6816366776823998, "num_tokens": 361475363.0, "step": 22430 }, { "epoch": 5.200486730791517, "grad_norm": 0.9572771787643433, "learning_rate": 2.4593761274024975e-05, "loss": 1.4784, "mean_token_accuracy": 0.6843810990452767, "num_tokens": 361635452.0, "step": 22440 }, { "epoch": 5.202804496465407, "grad_norm": 0.906354546546936, "learning_rate": 2.4574999128479175e-05, "loss": 1.4589, "mean_token_accuracy": 0.6881908655166626, "num_tokens": 361796988.0, "step": 22450 }, { "epoch": 5.205122262139298, "grad_norm": 0.9648762345314026, "learning_rate": 2.4556237222372354e-05, "loss": 1.4877, "mean_token_accuracy": 0.6835971340537071, "num_tokens": 361956689.0, "step": 22460 }, { "epoch": 5.207440027813188, "grad_norm": 0.9716407060623169, "learning_rate": 2.4537475566274683e-05, "loss": 1.4931, "mean_token_accuracy": 0.6819683745503425, "num_tokens": 362117650.0, "step": 22470 }, { "epoch": 5.2097577934870785, "grad_norm": 0.9244881272315979, "learning_rate": 2.4518714170756202e-05, "loss": 1.4664, "mean_token_accuracy": 0.6840852558612823, "num_tokens": 362279067.0, "step": 22480 }, { "epoch": 5.212075559160969, "grad_norm": 0.9060690402984619, "learning_rate": 2.4499953046386774e-05, "loss": 1.4944, "mean_token_accuracy": 0.6833547815680504, "num_tokens": 362439938.0, "step": 22490 }, { "epoch": 5.214393324834859, "grad_norm": 0.9446709156036377, "learning_rate": 2.4481192203736137e-05, "loss": 1.4891, "mean_token_accuracy": 0.6834095552563667, "num_tokens": 362601488.0, "step": 22500 }, { "epoch": 5.21671109050875, "grad_norm": 0.9929749369621277, "learning_rate": 2.4462431653373863e-05, "loss": 1.486, "mean_token_accuracy": 0.6820397332310677, "num_tokens": 362763400.0, "step": 22510 }, { "epoch": 5.21902885618264, "grad_norm": 0.9531674385070801, "learning_rate": 2.444367140586936e-05, "loss": 1.4918, "mean_token_accuracy": 0.6834374785423278, "num_tokens": 362925003.0, "step": 22520 }, { "epoch": 5.22134662185653, "grad_norm": 0.9512975811958313, "learning_rate": 2.4424911471791863e-05, "loss": 1.4702, "mean_token_accuracy": 0.6850329220294953, "num_tokens": 363086045.0, "step": 22530 }, { "epoch": 5.2236643875304205, "grad_norm": 0.9509850144386292, "learning_rate": 2.4406151861710435e-05, "loss": 1.4755, "mean_token_accuracy": 0.6849849209189415, "num_tokens": 363247345.0, "step": 22540 }, { "epoch": 5.225982153204311, "grad_norm": 0.9182525873184204, "learning_rate": 2.4387392586193946e-05, "loss": 1.4829, "mean_token_accuracy": 0.6844910591840744, "num_tokens": 363407627.0, "step": 22550 }, { "epoch": 5.228299918878202, "grad_norm": 0.9647533893585205, "learning_rate": 2.436863365581109e-05, "loss": 1.4734, "mean_token_accuracy": 0.68351329267025, "num_tokens": 363568799.0, "step": 22560 }, { "epoch": 5.230617684552092, "grad_norm": 0.9305232763290405, "learning_rate": 2.434987508113036e-05, "loss": 1.4792, "mean_token_accuracy": 0.6821207642555237, "num_tokens": 363730467.0, "step": 22570 }, { "epoch": 5.232935450225982, "grad_norm": 0.9876173138618469, "learning_rate": 2.433111687272005e-05, "loss": 1.4771, "mean_token_accuracy": 0.6837939888238906, "num_tokens": 363891219.0, "step": 22580 }, { "epoch": 5.235253215899872, "grad_norm": 0.9534009099006653, "learning_rate": 2.4312359041148243e-05, "loss": 1.4702, "mean_token_accuracy": 0.6838527917861938, "num_tokens": 364053313.0, "step": 22590 }, { "epoch": 5.2375709815737626, "grad_norm": 0.9400526285171509, "learning_rate": 2.4293601596982817e-05, "loss": 1.4798, "mean_token_accuracy": 0.6840135231614113, "num_tokens": 364215054.0, "step": 22600 }, { "epoch": 5.239888747247654, "grad_norm": 0.9418924450874329, "learning_rate": 2.427484455079143e-05, "loss": 1.4818, "mean_token_accuracy": 0.6821861386299133, "num_tokens": 364377005.0, "step": 22610 }, { "epoch": 5.242206512921544, "grad_norm": 0.9038235545158386, "learning_rate": 2.4256087913141515e-05, "loss": 1.4898, "mean_token_accuracy": 0.6814705848693847, "num_tokens": 364538241.0, "step": 22620 }, { "epoch": 5.244524278595434, "grad_norm": 0.8937869071960449, "learning_rate": 2.423733169460027e-05, "loss": 1.4786, "mean_token_accuracy": 0.6852758944034576, "num_tokens": 364699167.0, "step": 22630 }, { "epoch": 5.246842044269324, "grad_norm": 1.0334495306015015, "learning_rate": 2.4218575905734665e-05, "loss": 1.4823, "mean_token_accuracy": 0.6831799894571304, "num_tokens": 364858629.0, "step": 22640 }, { "epoch": 5.249159809943214, "grad_norm": 0.8597651720046997, "learning_rate": 2.4199820557111418e-05, "loss": 1.4896, "mean_token_accuracy": 0.6817499354481698, "num_tokens": 365020531.0, "step": 22650 }, { "epoch": 5.2514775756171055, "grad_norm": 0.9186803102493286, "learning_rate": 2.4181065659297014e-05, "loss": 1.512, "mean_token_accuracy": 0.6790812119841576, "num_tokens": 365182224.0, "step": 22660 }, { "epoch": 5.253795341290996, "grad_norm": 0.9163674712181091, "learning_rate": 2.416231122285767e-05, "loss": 1.4751, "mean_token_accuracy": 0.6824179023504258, "num_tokens": 365344338.0, "step": 22670 }, { "epoch": 5.256113106964886, "grad_norm": 0.9241855144500732, "learning_rate": 2.414355725835935e-05, "loss": 1.4815, "mean_token_accuracy": 0.6820998400449753, "num_tokens": 365505053.0, "step": 22680 }, { "epoch": 5.258430872638776, "grad_norm": 0.9315381646156311, "learning_rate": 2.4124803776367748e-05, "loss": 1.4943, "mean_token_accuracy": 0.6823946788907052, "num_tokens": 365666676.0, "step": 22690 }, { "epoch": 5.260748638312666, "grad_norm": 0.9078759551048279, "learning_rate": 2.410605078744829e-05, "loss": 1.4877, "mean_token_accuracy": 0.6823652729392051, "num_tokens": 365827212.0, "step": 22700 }, { "epoch": 5.263066403986557, "grad_norm": 0.9197059273719788, "learning_rate": 2.408729830216612e-05, "loss": 1.4674, "mean_token_accuracy": 0.6841082707047462, "num_tokens": 365988317.0, "step": 22710 }, { "epoch": 5.2653841696604475, "grad_norm": 0.9315102696418762, "learning_rate": 2.406854633108611e-05, "loss": 1.459, "mean_token_accuracy": 0.6860574021935463, "num_tokens": 366149775.0, "step": 22720 }, { "epoch": 5.267701935334338, "grad_norm": 0.9275806546211243, "learning_rate": 2.4049794884772824e-05, "loss": 1.48, "mean_token_accuracy": 0.6840551421046257, "num_tokens": 366311329.0, "step": 22730 }, { "epoch": 5.270019701008228, "grad_norm": 0.9861752390861511, "learning_rate": 2.403104397379054e-05, "loss": 1.4848, "mean_token_accuracy": 0.6833162620663643, "num_tokens": 366473285.0, "step": 22740 }, { "epoch": 5.272337466682118, "grad_norm": 1.0192224979400635, "learning_rate": 2.4012293608703234e-05, "loss": 1.4732, "mean_token_accuracy": 0.6857819318771362, "num_tokens": 366635285.0, "step": 22750 }, { "epoch": 5.274655232356009, "grad_norm": 0.9798491597175598, "learning_rate": 2.3993543800074577e-05, "loss": 1.4801, "mean_token_accuracy": 0.6844120517373085, "num_tokens": 366796779.0, "step": 22760 }, { "epoch": 5.276972998029899, "grad_norm": 0.9350327253341675, "learning_rate": 2.397479455846792e-05, "loss": 1.4833, "mean_token_accuracy": 0.6810864895582199, "num_tokens": 366957244.0, "step": 22770 }, { "epoch": 5.27929076370379, "grad_norm": 0.9219409227371216, "learning_rate": 2.395604589444631e-05, "loss": 1.4809, "mean_token_accuracy": 0.6821111217141151, "num_tokens": 367119065.0, "step": 22780 }, { "epoch": 5.28160852937768, "grad_norm": 0.9167717695236206, "learning_rate": 2.3937297818572444e-05, "loss": 1.4732, "mean_token_accuracy": 0.6832714691758156, "num_tokens": 367279989.0, "step": 22790 }, { "epoch": 5.28392629505157, "grad_norm": 0.9721064567565918, "learning_rate": 2.3918550341408706e-05, "loss": 1.4789, "mean_token_accuracy": 0.6834093898534774, "num_tokens": 367441607.0, "step": 22800 }, { "epoch": 5.286244060725461, "grad_norm": 0.8958656787872314, "learning_rate": 2.3899803473517136e-05, "loss": 1.4995, "mean_token_accuracy": 0.6801527976989746, "num_tokens": 367602656.0, "step": 22810 }, { "epoch": 5.288561826399351, "grad_norm": 0.9163427948951721, "learning_rate": 2.3881057225459435e-05, "loss": 1.4863, "mean_token_accuracy": 0.6831633433699608, "num_tokens": 367764125.0, "step": 22820 }, { "epoch": 5.290879592073241, "grad_norm": 0.9742670655250549, "learning_rate": 2.386231160779695e-05, "loss": 1.4783, "mean_token_accuracy": 0.6832084521651268, "num_tokens": 367924507.0, "step": 22830 }, { "epoch": 5.293197357747132, "grad_norm": 0.9691260457038879, "learning_rate": 2.3843566631090673e-05, "loss": 1.4687, "mean_token_accuracy": 0.6852199614048005, "num_tokens": 368085183.0, "step": 22840 }, { "epoch": 5.295515123421022, "grad_norm": 0.9536343216896057, "learning_rate": 2.382482230590124e-05, "loss": 1.4517, "mean_token_accuracy": 0.6879955053329467, "num_tokens": 368246890.0, "step": 22850 }, { "epoch": 5.297832889094913, "grad_norm": 0.9695982933044434, "learning_rate": 2.380607864278891e-05, "loss": 1.4841, "mean_token_accuracy": 0.6824263215065003, "num_tokens": 368407493.0, "step": 22860 }, { "epoch": 5.300150654768803, "grad_norm": 0.963365375995636, "learning_rate": 2.378733565231359e-05, "loss": 1.4813, "mean_token_accuracy": 0.6819864720106125, "num_tokens": 368568661.0, "step": 22870 }, { "epoch": 5.302468420442693, "grad_norm": 0.9767793416976929, "learning_rate": 2.3768593345034777e-05, "loss": 1.4935, "mean_token_accuracy": 0.6841294631361962, "num_tokens": 368730431.0, "step": 22880 }, { "epoch": 5.3047861861165835, "grad_norm": 0.9506517648696899, "learning_rate": 2.374985173151161e-05, "loss": 1.483, "mean_token_accuracy": 0.6829928830265999, "num_tokens": 368891551.0, "step": 22890 }, { "epoch": 5.307103951790474, "grad_norm": 0.9585841298103333, "learning_rate": 2.3731110822302825e-05, "loss": 1.472, "mean_token_accuracy": 0.6863012194633484, "num_tokens": 369053052.0, "step": 22900 }, { "epoch": 5.309421717464365, "grad_norm": 0.9051620364189148, "learning_rate": 2.371237062796676e-05, "loss": 1.4808, "mean_token_accuracy": 0.6820671871304512, "num_tokens": 369214674.0, "step": 22910 }, { "epoch": 5.311739483138255, "grad_norm": 0.982741117477417, "learning_rate": 2.3693631159061357e-05, "loss": 1.488, "mean_token_accuracy": 0.6818719029426574, "num_tokens": 369376857.0, "step": 22920 }, { "epoch": 5.314057248812145, "grad_norm": 0.9297928810119629, "learning_rate": 2.3674892426144143e-05, "loss": 1.4733, "mean_token_accuracy": 0.6837424755096435, "num_tokens": 369537037.0, "step": 22930 }, { "epoch": 5.316375014486035, "grad_norm": 0.9499163031578064, "learning_rate": 2.3656154439772235e-05, "loss": 1.4686, "mean_token_accuracy": 0.6863073572516442, "num_tokens": 369698244.0, "step": 22940 }, { "epoch": 5.3186927801599255, "grad_norm": 0.8975582718849182, "learning_rate": 2.3637417210502326e-05, "loss": 1.4596, "mean_token_accuracy": 0.6864734694361687, "num_tokens": 369859460.0, "step": 22950 }, { "epoch": 5.321010545833817, "grad_norm": 0.975035548210144, "learning_rate": 2.3618680748890685e-05, "loss": 1.4921, "mean_token_accuracy": 0.6827057853341103, "num_tokens": 370020920.0, "step": 22960 }, { "epoch": 5.323328311507707, "grad_norm": 1.0056698322296143, "learning_rate": 2.359994506549315e-05, "loss": 1.4715, "mean_token_accuracy": 0.6852115496993065, "num_tokens": 370181080.0, "step": 22970 }, { "epoch": 5.325646077181597, "grad_norm": 0.9395904541015625, "learning_rate": 2.3581210170865114e-05, "loss": 1.4659, "mean_token_accuracy": 0.6852887064218521, "num_tokens": 370342631.0, "step": 22980 }, { "epoch": 5.327963842855487, "grad_norm": 0.941054105758667, "learning_rate": 2.356247607556153e-05, "loss": 1.4761, "mean_token_accuracy": 0.6835667535662651, "num_tokens": 370504135.0, "step": 22990 }, { "epoch": 5.330281608529377, "grad_norm": 0.9334387183189392, "learning_rate": 2.35437427901369e-05, "loss": 1.4871, "mean_token_accuracy": 0.6814445316791534, "num_tokens": 370665656.0, "step": 23000 }, { "epoch": 5.3325993742032685, "grad_norm": 0.8902997970581055, "learning_rate": 2.3525010325145272e-05, "loss": 1.4855, "mean_token_accuracy": 0.683847026526928, "num_tokens": 370826234.0, "step": 23010 }, { "epoch": 5.334917139877159, "grad_norm": 0.9767879247665405, "learning_rate": 2.3506278691140225e-05, "loss": 1.4875, "mean_token_accuracy": 0.6829020857810975, "num_tokens": 370987251.0, "step": 23020 }, { "epoch": 5.337234905551049, "grad_norm": 0.9352767467498779, "learning_rate": 2.3487547898674883e-05, "loss": 1.4944, "mean_token_accuracy": 0.6823547199368477, "num_tokens": 371149005.0, "step": 23030 }, { "epoch": 5.339552671224939, "grad_norm": 0.9649722576141357, "learning_rate": 2.3468817958301893e-05, "loss": 1.4814, "mean_token_accuracy": 0.6821729347109795, "num_tokens": 371310522.0, "step": 23040 }, { "epoch": 5.341870436898829, "grad_norm": 0.9103240370750427, "learning_rate": 2.3450088880573397e-05, "loss": 1.4801, "mean_token_accuracy": 0.683924263715744, "num_tokens": 371472120.0, "step": 23050 }, { "epoch": 5.34418820257272, "grad_norm": 0.9201915264129639, "learning_rate": 2.3431360676041076e-05, "loss": 1.4789, "mean_token_accuracy": 0.6844888269901276, "num_tokens": 371632561.0, "step": 23060 }, { "epoch": 5.3465059682466105, "grad_norm": 0.98710036277771, "learning_rate": 2.3412633355256118e-05, "loss": 1.4837, "mean_token_accuracy": 0.6838723152875901, "num_tokens": 371793885.0, "step": 23070 }, { "epoch": 5.348823733920501, "grad_norm": 0.9239997863769531, "learning_rate": 2.3393906928769207e-05, "loss": 1.4869, "mean_token_accuracy": 0.6829042315483094, "num_tokens": 371955154.0, "step": 23080 }, { "epoch": 5.351141499594391, "grad_norm": 1.0433560609817505, "learning_rate": 2.3375181407130524e-05, "loss": 1.4875, "mean_token_accuracy": 0.6817549049854279, "num_tokens": 372115914.0, "step": 23090 }, { "epoch": 5.353459265268281, "grad_norm": 0.9686881899833679, "learning_rate": 2.335645680088975e-05, "loss": 1.4792, "mean_token_accuracy": 0.6833802103996277, "num_tokens": 372276920.0, "step": 23100 }, { "epoch": 5.355777030942171, "grad_norm": 0.9299511313438416, "learning_rate": 2.3337733120596027e-05, "loss": 1.4676, "mean_token_accuracy": 0.6854553326964379, "num_tokens": 372437991.0, "step": 23110 }, { "epoch": 5.358094796616062, "grad_norm": 0.9553348422050476, "learning_rate": 2.3319010376798e-05, "loss": 1.4619, "mean_token_accuracy": 0.6856243908405304, "num_tokens": 372598703.0, "step": 23120 }, { "epoch": 5.3604125622899526, "grad_norm": 0.946917712688446, "learning_rate": 2.330028858004377e-05, "loss": 1.4977, "mean_token_accuracy": 0.68162582218647, "num_tokens": 372759436.0, "step": 23130 }, { "epoch": 5.362730327963843, "grad_norm": 1.0252541303634644, "learning_rate": 2.3281567740880912e-05, "loss": 1.4758, "mean_token_accuracy": 0.6840407088398933, "num_tokens": 372920754.0, "step": 23140 }, { "epoch": 5.365048093637733, "grad_norm": 0.9404100179672241, "learning_rate": 2.3262847869856466e-05, "loss": 1.4719, "mean_token_accuracy": 0.6853617623448371, "num_tokens": 373081962.0, "step": 23150 }, { "epoch": 5.367365859311624, "grad_norm": 0.9390661716461182, "learning_rate": 2.3244128977516912e-05, "loss": 1.4938, "mean_token_accuracy": 0.6808280318975448, "num_tokens": 373243975.0, "step": 23160 }, { "epoch": 5.369683624985514, "grad_norm": 0.9222830533981323, "learning_rate": 2.322541107440819e-05, "loss": 1.498, "mean_token_accuracy": 0.681205153465271, "num_tokens": 373405937.0, "step": 23170 }, { "epoch": 5.372001390659404, "grad_norm": 1.006321668624878, "learning_rate": 2.320669417107568e-05, "loss": 1.468, "mean_token_accuracy": 0.6827315032482147, "num_tokens": 373567703.0, "step": 23180 }, { "epoch": 5.374319156333295, "grad_norm": 0.9223192930221558, "learning_rate": 2.318797827806421e-05, "loss": 1.4751, "mean_token_accuracy": 0.6855286702513694, "num_tokens": 373728457.0, "step": 23190 }, { "epoch": 5.376636922007185, "grad_norm": 0.9535536766052246, "learning_rate": 2.3169263405918006e-05, "loss": 1.4706, "mean_token_accuracy": 0.685208122432232, "num_tokens": 373890697.0, "step": 23200 }, { "epoch": 5.378954687681075, "grad_norm": 0.9192311763763428, "learning_rate": 2.315054956518075e-05, "loss": 1.4935, "mean_token_accuracy": 0.6806171670556068, "num_tokens": 374051019.0, "step": 23210 }, { "epoch": 5.381272453354966, "grad_norm": 0.9730108380317688, "learning_rate": 2.3131836766395536e-05, "loss": 1.4634, "mean_token_accuracy": 0.685106772184372, "num_tokens": 374212963.0, "step": 23220 }, { "epoch": 5.383590219028856, "grad_norm": 0.9740179181098938, "learning_rate": 2.3113125020104868e-05, "loss": 1.4789, "mean_token_accuracy": 0.6833565652370452, "num_tokens": 374374239.0, "step": 23230 }, { "epoch": 5.3859079847027465, "grad_norm": 1.0255630016326904, "learning_rate": 2.3094414336850653e-05, "loss": 1.4854, "mean_token_accuracy": 0.68144790828228, "num_tokens": 374535853.0, "step": 23240 }, { "epoch": 5.388225750376637, "grad_norm": 0.9477957487106323, "learning_rate": 2.3075704727174205e-05, "loss": 1.5022, "mean_token_accuracy": 0.6800271242856979, "num_tokens": 374696845.0, "step": 23250 }, { "epoch": 5.390543516050528, "grad_norm": 0.999024510383606, "learning_rate": 2.3056996201616232e-05, "loss": 1.4805, "mean_token_accuracy": 0.6834577843546867, "num_tokens": 374858534.0, "step": 23260 }, { "epoch": 5.392861281724418, "grad_norm": 0.9269569516181946, "learning_rate": 2.303828877071683e-05, "loss": 1.4786, "mean_token_accuracy": 0.6849720671772956, "num_tokens": 375019900.0, "step": 23270 }, { "epoch": 5.395179047398308, "grad_norm": 1.0026991367340088, "learning_rate": 2.301958244501548e-05, "loss": 1.464, "mean_token_accuracy": 0.6849672645330429, "num_tokens": 375181063.0, "step": 23280 }, { "epoch": 5.397496813072198, "grad_norm": 1.0152195692062378, "learning_rate": 2.300087723505104e-05, "loss": 1.4662, "mean_token_accuracy": 0.6865991562604904, "num_tokens": 375341703.0, "step": 23290 }, { "epoch": 5.3998145787460885, "grad_norm": 0.916052520275116, "learning_rate": 2.2982173151361736e-05, "loss": 1.468, "mean_token_accuracy": 0.6856881245970726, "num_tokens": 375502425.0, "step": 23300 }, { "epoch": 5.402132344419979, "grad_norm": 0.9049825072288513, "learning_rate": 2.2963470204485164e-05, "loss": 1.4803, "mean_token_accuracy": 0.6845880091190338, "num_tokens": 375663668.0, "step": 23310 }, { "epoch": 5.40445011009387, "grad_norm": 0.9836143851280212, "learning_rate": 2.2944768404958277e-05, "loss": 1.4966, "mean_token_accuracy": 0.6823597073554992, "num_tokens": 375824569.0, "step": 23320 }, { "epoch": 5.40676787576776, "grad_norm": 0.9215028882026672, "learning_rate": 2.292606776331738e-05, "loss": 1.4741, "mean_token_accuracy": 0.683975687623024, "num_tokens": 375986124.0, "step": 23330 }, { "epoch": 5.40908564144165, "grad_norm": 0.9268199801445007, "learning_rate": 2.2907368290098135e-05, "loss": 1.4834, "mean_token_accuracy": 0.6833422049880028, "num_tokens": 376147720.0, "step": 23340 }, { "epoch": 5.41140340711554, "grad_norm": 0.9322562217712402, "learning_rate": 2.2888669995835533e-05, "loss": 1.4664, "mean_token_accuracy": 0.6853449985384941, "num_tokens": 376309154.0, "step": 23350 }, { "epoch": 5.413721172789431, "grad_norm": 0.9723404049873352, "learning_rate": 2.2869972891063908e-05, "loss": 1.4753, "mean_token_accuracy": 0.6840288922190666, "num_tokens": 376470091.0, "step": 23360 }, { "epoch": 5.416038938463322, "grad_norm": 0.9395827651023865, "learning_rate": 2.285127698631692e-05, "loss": 1.4765, "mean_token_accuracy": 0.6822999849915504, "num_tokens": 376631298.0, "step": 23370 }, { "epoch": 5.418356704137212, "grad_norm": 0.9098713397979736, "learning_rate": 2.2832582292127562e-05, "loss": 1.4765, "mean_token_accuracy": 0.6851559713482857, "num_tokens": 376792714.0, "step": 23380 }, { "epoch": 5.420674469811102, "grad_norm": 0.9793747663497925, "learning_rate": 2.2813888819028133e-05, "loss": 1.4913, "mean_token_accuracy": 0.6813568264245987, "num_tokens": 376954133.0, "step": 23390 }, { "epoch": 5.422992235484992, "grad_norm": 0.9537090063095093, "learning_rate": 2.279519657755025e-05, "loss": 1.4784, "mean_token_accuracy": 0.6843742862343788, "num_tokens": 377115153.0, "step": 23400 }, { "epoch": 5.425310001158882, "grad_norm": 0.9118247628211975, "learning_rate": 2.2776505578224834e-05, "loss": 1.4883, "mean_token_accuracy": 0.6835865899920464, "num_tokens": 377277389.0, "step": 23410 }, { "epoch": 5.4276277668327735, "grad_norm": 0.9809777140617371, "learning_rate": 2.275781583158211e-05, "loss": 1.4733, "mean_token_accuracy": 0.6851356491446495, "num_tokens": 377438757.0, "step": 23420 }, { "epoch": 5.429945532506664, "grad_norm": 0.9741263389587402, "learning_rate": 2.2739127348151596e-05, "loss": 1.4749, "mean_token_accuracy": 0.6846576049923897, "num_tokens": 377599596.0, "step": 23430 }, { "epoch": 5.432263298180554, "grad_norm": 0.8959022164344788, "learning_rate": 2.2720440138462094e-05, "loss": 1.4945, "mean_token_accuracy": 0.6807007402181625, "num_tokens": 377760281.0, "step": 23440 }, { "epoch": 5.434581063854444, "grad_norm": 0.9478444457054138, "learning_rate": 2.2701754213041693e-05, "loss": 1.4754, "mean_token_accuracy": 0.6851659372448922, "num_tokens": 377921454.0, "step": 23450 }, { "epoch": 5.436898829528335, "grad_norm": 0.9186699986457825, "learning_rate": 2.2683069582417756e-05, "loss": 1.5079, "mean_token_accuracy": 0.6795964106917382, "num_tokens": 378083161.0, "step": 23460 }, { "epoch": 5.439216595202225, "grad_norm": 0.9297274351119995, "learning_rate": 2.266438625711692e-05, "loss": 1.4813, "mean_token_accuracy": 0.6818057477474213, "num_tokens": 378244167.0, "step": 23470 }, { "epoch": 5.4415343608761155, "grad_norm": 0.9296846389770508, "learning_rate": 2.2645704247665085e-05, "loss": 1.4892, "mean_token_accuracy": 0.6825606361031532, "num_tokens": 378405499.0, "step": 23480 }, { "epoch": 5.443852126550006, "grad_norm": 0.9547589421272278, "learning_rate": 2.2627023564587404e-05, "loss": 1.488, "mean_token_accuracy": 0.6836725443601608, "num_tokens": 378567058.0, "step": 23490 }, { "epoch": 5.446169892223896, "grad_norm": 0.977207362651825, "learning_rate": 2.2608344218408295e-05, "loss": 1.4553, "mean_token_accuracy": 0.6857222065329551, "num_tokens": 378728437.0, "step": 23500 }, { "epoch": 5.448487657897786, "grad_norm": 0.9386272430419922, "learning_rate": 2.2589666219651407e-05, "loss": 1.4662, "mean_token_accuracy": 0.6838478416204452, "num_tokens": 378889971.0, "step": 23510 }, { "epoch": 5.450805423571677, "grad_norm": 0.9394489526748657, "learning_rate": 2.257098957883965e-05, "loss": 1.4857, "mean_token_accuracy": 0.6828806564211846, "num_tokens": 379051234.0, "step": 23520 }, { "epoch": 5.453123189245567, "grad_norm": 0.9639504551887512, "learning_rate": 2.2552314306495144e-05, "loss": 1.477, "mean_token_accuracy": 0.6829421371221542, "num_tokens": 379212714.0, "step": 23530 }, { "epoch": 5.455440954919458, "grad_norm": 0.9007455706596375, "learning_rate": 2.2533640413139256e-05, "loss": 1.4903, "mean_token_accuracy": 0.6817493513226509, "num_tokens": 379374584.0, "step": 23540 }, { "epoch": 5.457758720593348, "grad_norm": 0.8851383328437805, "learning_rate": 2.2514967909292573e-05, "loss": 1.4982, "mean_token_accuracy": 0.6808187827467919, "num_tokens": 379535457.0, "step": 23550 }, { "epoch": 5.460076486267239, "grad_norm": 0.9672232270240784, "learning_rate": 2.2496296805474898e-05, "loss": 1.4701, "mean_token_accuracy": 0.6836699038743973, "num_tokens": 379694630.0, "step": 23560 }, { "epoch": 5.462394251941129, "grad_norm": 0.9479807615280151, "learning_rate": 2.2477627112205242e-05, "loss": 1.4777, "mean_token_accuracy": 0.6835611909627914, "num_tokens": 379855027.0, "step": 23570 }, { "epoch": 5.464712017615019, "grad_norm": 0.9648742079734802, "learning_rate": 2.245895884000183e-05, "loss": 1.4653, "mean_token_accuracy": 0.686250638961792, "num_tokens": 380016668.0, "step": 23580 }, { "epoch": 5.467029783288909, "grad_norm": 0.9849069714546204, "learning_rate": 2.244029199938207e-05, "loss": 1.4838, "mean_token_accuracy": 0.6836204633116723, "num_tokens": 380178102.0, "step": 23590 }, { "epoch": 5.4693475489628, "grad_norm": 0.9165015816688538, "learning_rate": 2.242162660086259e-05, "loss": 1.4963, "mean_token_accuracy": 0.6818549901247024, "num_tokens": 380337230.0, "step": 23600 }, { "epoch": 5.47166531463669, "grad_norm": 1.1234077215194702, "learning_rate": 2.2402962654959175e-05, "loss": 1.4969, "mean_token_accuracy": 0.6810921847820282, "num_tokens": 380497607.0, "step": 23610 }, { "epoch": 5.473983080310581, "grad_norm": 0.9082645773887634, "learning_rate": 2.238430017218681e-05, "loss": 1.4662, "mean_token_accuracy": 0.685595928132534, "num_tokens": 380659281.0, "step": 23620 }, { "epoch": 5.476300845984471, "grad_norm": 0.9952271580696106, "learning_rate": 2.2365639163059658e-05, "loss": 1.494, "mean_token_accuracy": 0.6816147118806839, "num_tokens": 380820100.0, "step": 23630 }, { "epoch": 5.478618611658361, "grad_norm": 0.879757285118103, "learning_rate": 2.2346979638091038e-05, "loss": 1.4876, "mean_token_accuracy": 0.6831616371870041, "num_tokens": 380981293.0, "step": 23640 }, { "epoch": 5.4809363773322515, "grad_norm": 0.9806226491928101, "learning_rate": 2.232832160779344e-05, "loss": 1.4863, "mean_token_accuracy": 0.6821774527430534, "num_tokens": 381141746.0, "step": 23650 }, { "epoch": 5.483254143006142, "grad_norm": 0.9619382619857788, "learning_rate": 2.2309665082678525e-05, "loss": 1.4984, "mean_token_accuracy": 0.6818780899047852, "num_tokens": 381303659.0, "step": 23660 }, { "epoch": 5.485571908680033, "grad_norm": 0.9386687278747559, "learning_rate": 2.2291010073257075e-05, "loss": 1.4838, "mean_token_accuracy": 0.6816540032625198, "num_tokens": 381464600.0, "step": 23670 }, { "epoch": 5.487889674353923, "grad_norm": 0.910364031791687, "learning_rate": 2.2272356590039048e-05, "loss": 1.4798, "mean_token_accuracy": 0.6825198009610176, "num_tokens": 381625258.0, "step": 23680 }, { "epoch": 5.490207440027813, "grad_norm": 0.9713786840438843, "learning_rate": 2.225370464353353e-05, "loss": 1.4913, "mean_token_accuracy": 0.68214311003685, "num_tokens": 381786798.0, "step": 23690 }, { "epoch": 5.492525205701703, "grad_norm": 0.9619480967521667, "learning_rate": 2.2235054244248738e-05, "loss": 1.48, "mean_token_accuracy": 0.6842897176742554, "num_tokens": 381947938.0, "step": 23700 }, { "epoch": 5.4948429713755935, "grad_norm": 0.955849826335907, "learning_rate": 2.2216405402692023e-05, "loss": 1.4665, "mean_token_accuracy": 0.6858689188957214, "num_tokens": 382109029.0, "step": 23710 }, { "epoch": 5.497160737049485, "grad_norm": 0.9367942214012146, "learning_rate": 2.2197758129369857e-05, "loss": 1.4901, "mean_token_accuracy": 0.681382030248642, "num_tokens": 382270626.0, "step": 23720 }, { "epoch": 5.499478502723375, "grad_norm": 0.9426813125610352, "learning_rate": 2.217911243478783e-05, "loss": 1.465, "mean_token_accuracy": 0.6863502189517021, "num_tokens": 382432022.0, "step": 23730 }, { "epoch": 5.501796268397265, "grad_norm": 0.9373536705970764, "learning_rate": 2.216046832945064e-05, "loss": 1.4822, "mean_token_accuracy": 0.6845909729599953, "num_tokens": 382593305.0, "step": 23740 }, { "epoch": 5.504114034071155, "grad_norm": 0.9530133008956909, "learning_rate": 2.2141825823862087e-05, "loss": 1.4846, "mean_token_accuracy": 0.6827423989772796, "num_tokens": 382755169.0, "step": 23750 }, { "epoch": 5.506431799745046, "grad_norm": 0.8977628946304321, "learning_rate": 2.2123184928525074e-05, "loss": 1.4606, "mean_token_accuracy": 0.6861572980880737, "num_tokens": 382916255.0, "step": 23760 }, { "epoch": 5.5087495654189365, "grad_norm": 0.9627030491828918, "learning_rate": 2.21045456539416e-05, "loss": 1.4874, "mean_token_accuracy": 0.6826333224773407, "num_tokens": 383077186.0, "step": 23770 }, { "epoch": 5.511067331092827, "grad_norm": 0.9657625555992126, "learning_rate": 2.208590801061274e-05, "loss": 1.4858, "mean_token_accuracy": 0.683456438779831, "num_tokens": 383238332.0, "step": 23780 }, { "epoch": 5.513385096766717, "grad_norm": 0.9431820511817932, "learning_rate": 2.2067272009038663e-05, "loss": 1.4749, "mean_token_accuracy": 0.6832573756575584, "num_tokens": 383399115.0, "step": 23790 }, { "epoch": 5.515702862440607, "grad_norm": 0.9714628458023071, "learning_rate": 2.2048637659718603e-05, "loss": 1.4808, "mean_token_accuracy": 0.6823980748653412, "num_tokens": 383560434.0, "step": 23800 }, { "epoch": 5.518020628114497, "grad_norm": 1.0035232305526733, "learning_rate": 2.203000497315087e-05, "loss": 1.4954, "mean_token_accuracy": 0.6814045757055283, "num_tokens": 383721941.0, "step": 23810 }, { "epoch": 5.520338393788388, "grad_norm": 0.9409409761428833, "learning_rate": 2.201137395983283e-05, "loss": 1.483, "mean_token_accuracy": 0.6825511172413826, "num_tokens": 383883330.0, "step": 23820 }, { "epoch": 5.5226561594622785, "grad_norm": 0.9546827077865601, "learning_rate": 2.1992744630260913e-05, "loss": 1.4821, "mean_token_accuracy": 0.6828737154603004, "num_tokens": 384045131.0, "step": 23830 }, { "epoch": 5.524973925136169, "grad_norm": 0.9746487140655518, "learning_rate": 2.19741169949306e-05, "loss": 1.4901, "mean_token_accuracy": 0.6809811577200889, "num_tokens": 384206328.0, "step": 23840 }, { "epoch": 5.527291690810059, "grad_norm": 0.9067673087120056, "learning_rate": 2.195549106433641e-05, "loss": 1.4839, "mean_token_accuracy": 0.6839757069945336, "num_tokens": 384368096.0, "step": 23850 }, { "epoch": 5.52960945648395, "grad_norm": 0.9511673450469971, "learning_rate": 2.193686684897191e-05, "loss": 1.4747, "mean_token_accuracy": 0.6835606575012207, "num_tokens": 384529869.0, "step": 23860 }, { "epoch": 5.53192722215784, "grad_norm": 0.9570055603981018, "learning_rate": 2.19182443593297e-05, "loss": 1.4839, "mean_token_accuracy": 0.6821305304765701, "num_tokens": 384690171.0, "step": 23870 }, { "epoch": 5.53424498783173, "grad_norm": 0.9551637172698975, "learning_rate": 2.1899623605901403e-05, "loss": 1.4586, "mean_token_accuracy": 0.6856750532984733, "num_tokens": 384851407.0, "step": 23880 }, { "epoch": 5.5365627535056205, "grad_norm": 0.980552613735199, "learning_rate": 2.1881004599177663e-05, "loss": 1.4706, "mean_token_accuracy": 0.6853152245283127, "num_tokens": 385012425.0, "step": 23890 }, { "epoch": 5.538880519179511, "grad_norm": 0.9375631809234619, "learning_rate": 2.1862387349648144e-05, "loss": 1.467, "mean_token_accuracy": 0.6844683796167373, "num_tokens": 385173571.0, "step": 23900 }, { "epoch": 5.541198284853401, "grad_norm": 0.9159774780273438, "learning_rate": 2.1843771867801527e-05, "loss": 1.4749, "mean_token_accuracy": 0.68482024371624, "num_tokens": 385334666.0, "step": 23910 }, { "epoch": 5.543516050527292, "grad_norm": 0.9381917715072632, "learning_rate": 2.1825158164125482e-05, "loss": 1.4791, "mean_token_accuracy": 0.6823354303836823, "num_tokens": 385496641.0, "step": 23920 }, { "epoch": 5.545833816201182, "grad_norm": 0.9483277201652527, "learning_rate": 2.1806546249106685e-05, "loss": 1.4919, "mean_token_accuracy": 0.6827008724212646, "num_tokens": 385657024.0, "step": 23930 }, { "epoch": 5.548151581875072, "grad_norm": 1.0054048299789429, "learning_rate": 2.1787936133230805e-05, "loss": 1.4706, "mean_token_accuracy": 0.6831655785441398, "num_tokens": 385818390.0, "step": 23940 }, { "epoch": 5.550469347548963, "grad_norm": 0.9329858422279358, "learning_rate": 2.1769327826982494e-05, "loss": 1.4843, "mean_token_accuracy": 0.6833862483501434, "num_tokens": 385979423.0, "step": 23950 }, { "epoch": 5.552787113222853, "grad_norm": 0.9763336777687073, "learning_rate": 2.1750721340845393e-05, "loss": 1.4797, "mean_token_accuracy": 0.6852932691574096, "num_tokens": 386141040.0, "step": 23960 }, { "epoch": 5.555104878896744, "grad_norm": 0.9570325613021851, "learning_rate": 2.1732116685302105e-05, "loss": 1.489, "mean_token_accuracy": 0.6817557737231255, "num_tokens": 386302729.0, "step": 23970 }, { "epoch": 5.557422644570634, "grad_norm": 0.9429967403411865, "learning_rate": 2.171351387083421e-05, "loss": 1.4592, "mean_token_accuracy": 0.6864361554384232, "num_tokens": 386463612.0, "step": 23980 }, { "epoch": 5.559740410244524, "grad_norm": 0.9176984429359436, "learning_rate": 2.169491290792225e-05, "loss": 1.4837, "mean_token_accuracy": 0.682217326760292, "num_tokens": 386625322.0, "step": 23990 }, { "epoch": 5.5620581759184144, "grad_norm": 0.8912404179573059, "learning_rate": 2.1676313807045718e-05, "loss": 1.4805, "mean_token_accuracy": 0.6836983099579811, "num_tokens": 386787159.0, "step": 24000 }, { "epoch": 5.564375941592305, "grad_norm": 0.95235675573349, "learning_rate": 2.1657716578683065e-05, "loss": 1.4872, "mean_token_accuracy": 0.6824890539050102, "num_tokens": 386948657.0, "step": 24010 }, { "epoch": 5.566693707266196, "grad_norm": 0.9631364345550537, "learning_rate": 2.1639121233311686e-05, "loss": 1.4772, "mean_token_accuracy": 0.6852870419621467, "num_tokens": 387108877.0, "step": 24020 }, { "epoch": 5.569011472940086, "grad_norm": 0.9336622357368469, "learning_rate": 2.1620527781407912e-05, "loss": 1.4766, "mean_token_accuracy": 0.6830011427402496, "num_tokens": 387270051.0, "step": 24030 }, { "epoch": 5.571329238613976, "grad_norm": 0.9417889714241028, "learning_rate": 2.1601936233447013e-05, "loss": 1.4721, "mean_token_accuracy": 0.6852163270115852, "num_tokens": 387430378.0, "step": 24040 }, { "epoch": 5.573647004287866, "grad_norm": 0.915116548538208, "learning_rate": 2.1583346599903173e-05, "loss": 1.5, "mean_token_accuracy": 0.6810022562742233, "num_tokens": 387591911.0, "step": 24050 }, { "epoch": 5.5759647699617565, "grad_norm": 0.9399242997169495, "learning_rate": 2.1564758891249516e-05, "loss": 1.4758, "mean_token_accuracy": 0.6846576109528542, "num_tokens": 387753796.0, "step": 24060 }, { "epoch": 5.578282535635648, "grad_norm": 0.9258753061294556, "learning_rate": 2.154617311795807e-05, "loss": 1.4732, "mean_token_accuracy": 0.6846010640263558, "num_tokens": 387914881.0, "step": 24070 }, { "epoch": 5.580600301309538, "grad_norm": 0.9377974271774292, "learning_rate": 2.152758929049977e-05, "loss": 1.4718, "mean_token_accuracy": 0.6839715376496315, "num_tokens": 388075700.0, "step": 24080 }, { "epoch": 5.582918066983428, "grad_norm": 0.8950276374816895, "learning_rate": 2.1509007419344466e-05, "loss": 1.4748, "mean_token_accuracy": 0.6852838814258575, "num_tokens": 388236650.0, "step": 24090 }, { "epoch": 5.585235832657318, "grad_norm": 0.9771143198013306, "learning_rate": 2.1490427514960894e-05, "loss": 1.4753, "mean_token_accuracy": 0.6844643294811249, "num_tokens": 388397865.0, "step": 24100 }, { "epoch": 5.587553598331208, "grad_norm": 0.974360466003418, "learning_rate": 2.1471849587816695e-05, "loss": 1.4825, "mean_token_accuracy": 0.6821909084916115, "num_tokens": 388560226.0, "step": 24110 }, { "epoch": 5.589871364005099, "grad_norm": 0.9296836256980896, "learning_rate": 2.145327364837838e-05, "loss": 1.4987, "mean_token_accuracy": 0.67891805768013, "num_tokens": 388721641.0, "step": 24120 }, { "epoch": 5.59218912967899, "grad_norm": 0.9279503226280212, "learning_rate": 2.143469970711135e-05, "loss": 1.4795, "mean_token_accuracy": 0.6831611841917038, "num_tokens": 388882855.0, "step": 24130 }, { "epoch": 5.59450689535288, "grad_norm": 0.9453723430633545, "learning_rate": 2.141612777447989e-05, "loss": 1.4757, "mean_token_accuracy": 0.6827290326356887, "num_tokens": 389043667.0, "step": 24140 }, { "epoch": 5.59682466102677, "grad_norm": 0.9391123652458191, "learning_rate": 2.139755786094713e-05, "loss": 1.4835, "mean_token_accuracy": 0.6836968496441841, "num_tokens": 389203944.0, "step": 24150 }, { "epoch": 5.59914242670066, "grad_norm": 0.9424486756324768, "learning_rate": 2.137898997697508e-05, "loss": 1.4844, "mean_token_accuracy": 0.6850200459361077, "num_tokens": 389365129.0, "step": 24160 }, { "epoch": 5.601460192374551, "grad_norm": 0.9057825803756714, "learning_rate": 2.1360424133024605e-05, "loss": 1.4822, "mean_token_accuracy": 0.6823964565992355, "num_tokens": 389527001.0, "step": 24170 }, { "epoch": 5.6037779580484415, "grad_norm": 1.0269972085952759, "learning_rate": 2.1341860339555407e-05, "loss": 1.4745, "mean_token_accuracy": 0.6833953633904457, "num_tokens": 389687109.0, "step": 24180 }, { "epoch": 5.606095723722332, "grad_norm": 0.9187858700752258, "learning_rate": 2.1323298607026044e-05, "loss": 1.4574, "mean_token_accuracy": 0.6863720551133156, "num_tokens": 389848431.0, "step": 24190 }, { "epoch": 5.608413489396222, "grad_norm": 0.9529664516448975, "learning_rate": 2.1304738945893917e-05, "loss": 1.4837, "mean_token_accuracy": 0.6829784736037254, "num_tokens": 390010199.0, "step": 24200 }, { "epoch": 5.610731255070112, "grad_norm": 0.9221665263175964, "learning_rate": 2.1286181366615254e-05, "loss": 1.4755, "mean_token_accuracy": 0.6837879657745362, "num_tokens": 390171933.0, "step": 24210 }, { "epoch": 5.613049020744003, "grad_norm": 0.9293248057365417, "learning_rate": 2.126762587964511e-05, "loss": 1.474, "mean_token_accuracy": 0.6852818086743355, "num_tokens": 390333133.0, "step": 24220 }, { "epoch": 5.615366786417893, "grad_norm": 0.946164071559906, "learning_rate": 2.124907249543736e-05, "loss": 1.4903, "mean_token_accuracy": 0.680969774723053, "num_tokens": 390493880.0, "step": 24230 }, { "epoch": 5.6176845520917835, "grad_norm": 0.9403670430183411, "learning_rate": 2.1230521224444703e-05, "loss": 1.4726, "mean_token_accuracy": 0.6842221900820732, "num_tokens": 390654572.0, "step": 24240 }, { "epoch": 5.620002317765674, "grad_norm": 0.9887409210205078, "learning_rate": 2.1211972077118636e-05, "loss": 1.4835, "mean_token_accuracy": 0.6828412517905236, "num_tokens": 390816159.0, "step": 24250 }, { "epoch": 5.622320083439564, "grad_norm": 0.9737712740898132, "learning_rate": 2.119342506390946e-05, "loss": 1.4796, "mean_token_accuracy": 0.6845076486468316, "num_tokens": 390976866.0, "step": 24260 }, { "epoch": 5.624637849113455, "grad_norm": 0.9812999963760376, "learning_rate": 2.117488019526629e-05, "loss": 1.4698, "mean_token_accuracy": 0.6846280440688133, "num_tokens": 391136934.0, "step": 24270 }, { "epoch": 5.626955614787345, "grad_norm": 0.9127523303031921, "learning_rate": 2.1156337481637005e-05, "loss": 1.455, "mean_token_accuracy": 0.6876298576593399, "num_tokens": 391298343.0, "step": 24280 }, { "epoch": 5.629273380461235, "grad_norm": 0.9760691523551941, "learning_rate": 2.11377969334683e-05, "loss": 1.4722, "mean_token_accuracy": 0.6862464010715484, "num_tokens": 391459591.0, "step": 24290 }, { "epoch": 5.631591146135126, "grad_norm": 0.8986432552337646, "learning_rate": 2.111925856120563e-05, "loss": 1.4738, "mean_token_accuracy": 0.6845198377966881, "num_tokens": 391621486.0, "step": 24300 }, { "epoch": 5.633908911809016, "grad_norm": 0.9657794237136841, "learning_rate": 2.1100722375293234e-05, "loss": 1.4704, "mean_token_accuracy": 0.6863161861896515, "num_tokens": 391782989.0, "step": 24310 }, { "epoch": 5.636226677482907, "grad_norm": 0.9239623546600342, "learning_rate": 2.1082188386174108e-05, "loss": 1.4721, "mean_token_accuracy": 0.6842399835586548, "num_tokens": 391943843.0, "step": 24320 }, { "epoch": 5.638544443156797, "grad_norm": 0.9466719627380371, "learning_rate": 2.1063656604290017e-05, "loss": 1.4731, "mean_token_accuracy": 0.6858338251709938, "num_tokens": 392105425.0, "step": 24330 }, { "epoch": 5.640862208830687, "grad_norm": 0.9363370537757874, "learning_rate": 2.104512704008149e-05, "loss": 1.4826, "mean_token_accuracy": 0.6849423184990883, "num_tokens": 392266625.0, "step": 24340 }, { "epoch": 5.643179974504577, "grad_norm": 0.8956242799758911, "learning_rate": 2.1026599703987802e-05, "loss": 1.4837, "mean_token_accuracy": 0.6831867814064025, "num_tokens": 392428279.0, "step": 24350 }, { "epoch": 5.645497740178468, "grad_norm": 0.8814415335655212, "learning_rate": 2.100807460644696e-05, "loss": 1.4959, "mean_token_accuracy": 0.6813264504075051, "num_tokens": 392589747.0, "step": 24360 }, { "epoch": 5.647815505852359, "grad_norm": 0.9543858170509338, "learning_rate": 2.0989551757895722e-05, "loss": 1.4935, "mean_token_accuracy": 0.6828908458352089, "num_tokens": 392751237.0, "step": 24370 }, { "epoch": 5.650133271526249, "grad_norm": 0.9166055917739868, "learning_rate": 2.0971031168769585e-05, "loss": 1.4748, "mean_token_accuracy": 0.6836152702569962, "num_tokens": 392913000.0, "step": 24380 }, { "epoch": 5.652451037200139, "grad_norm": 0.9543689489364624, "learning_rate": 2.095251284950276e-05, "loss": 1.4699, "mean_token_accuracy": 0.6846020877361297, "num_tokens": 393073348.0, "step": 24390 }, { "epoch": 5.654768802874029, "grad_norm": 0.9472091197967529, "learning_rate": 2.093399681052818e-05, "loss": 1.4835, "mean_token_accuracy": 0.682108087837696, "num_tokens": 393234902.0, "step": 24400 }, { "epoch": 5.6570865685479195, "grad_norm": 0.9744907021522522, "learning_rate": 2.0915483062277508e-05, "loss": 1.485, "mean_token_accuracy": 0.68358224183321, "num_tokens": 393396478.0, "step": 24410 }, { "epoch": 5.6594043342218106, "grad_norm": 0.9173738956451416, "learning_rate": 2.0896971615181097e-05, "loss": 1.4738, "mean_token_accuracy": 0.6838119983673095, "num_tokens": 393557311.0, "step": 24420 }, { "epoch": 5.661722099895701, "grad_norm": 0.9857180118560791, "learning_rate": 2.0878462479668015e-05, "loss": 1.4767, "mean_token_accuracy": 0.683594425022602, "num_tokens": 393718230.0, "step": 24430 }, { "epoch": 5.664039865569591, "grad_norm": 0.9286143183708191, "learning_rate": 2.085995566616603e-05, "loss": 1.4774, "mean_token_accuracy": 0.6837965652346611, "num_tokens": 393879509.0, "step": 24440 }, { "epoch": 5.666357631243481, "grad_norm": 0.9390275478363037, "learning_rate": 2.0841451185101595e-05, "loss": 1.4895, "mean_token_accuracy": 0.6826191425323487, "num_tokens": 394040515.0, "step": 24450 }, { "epoch": 5.668675396917371, "grad_norm": 0.971332848072052, "learning_rate": 2.082294904689985e-05, "loss": 1.4733, "mean_token_accuracy": 0.6829885691404343, "num_tokens": 394201942.0, "step": 24460 }, { "epoch": 5.670993162591262, "grad_norm": 0.9482378959655762, "learning_rate": 2.080444926198461e-05, "loss": 1.4792, "mean_token_accuracy": 0.6836181610822678, "num_tokens": 394363294.0, "step": 24470 }, { "epoch": 5.673310928265153, "grad_norm": 0.9462664127349854, "learning_rate": 2.078595184077838e-05, "loss": 1.4927, "mean_token_accuracy": 0.682468381524086, "num_tokens": 394524562.0, "step": 24480 }, { "epoch": 5.675628693939043, "grad_norm": 0.9037666916847229, "learning_rate": 2.076745679370232e-05, "loss": 1.4681, "mean_token_accuracy": 0.6841675773262977, "num_tokens": 394685113.0, "step": 24490 }, { "epoch": 5.677946459612933, "grad_norm": 0.9670752882957458, "learning_rate": 2.0748964131176254e-05, "loss": 1.4627, "mean_token_accuracy": 0.6857571750879288, "num_tokens": 394846565.0, "step": 24500 }, { "epoch": 5.680264225286823, "grad_norm": 0.9837561249732971, "learning_rate": 2.073047386361867e-05, "loss": 1.4685, "mean_token_accuracy": 0.683641143143177, "num_tokens": 395007022.0, "step": 24510 }, { "epoch": 5.682581990960714, "grad_norm": 0.9574834108352661, "learning_rate": 2.0711986001446694e-05, "loss": 1.4736, "mean_token_accuracy": 0.6834138065576554, "num_tokens": 395168710.0, "step": 24520 }, { "epoch": 5.6848997566346045, "grad_norm": 0.9695027470588684, "learning_rate": 2.069350055507611e-05, "loss": 1.4489, "mean_token_accuracy": 0.6873763546347618, "num_tokens": 395329930.0, "step": 24530 }, { "epoch": 5.687217522308495, "grad_norm": 0.9096373319625854, "learning_rate": 2.0675017534921338e-05, "loss": 1.4719, "mean_token_accuracy": 0.6826886534690857, "num_tokens": 395489925.0, "step": 24540 }, { "epoch": 5.689535287982385, "grad_norm": 0.9534870982170105, "learning_rate": 2.0656536951395422e-05, "loss": 1.4853, "mean_token_accuracy": 0.6825460627675056, "num_tokens": 395651130.0, "step": 24550 }, { "epoch": 5.691853053656275, "grad_norm": 0.9365695714950562, "learning_rate": 2.0638058814910043e-05, "loss": 1.4762, "mean_token_accuracy": 0.6838822439312935, "num_tokens": 395810992.0, "step": 24560 }, { "epoch": 5.694170819330166, "grad_norm": 0.9421387314796448, "learning_rate": 2.0619583135875498e-05, "loss": 1.4886, "mean_token_accuracy": 0.6820739522576332, "num_tokens": 395971993.0, "step": 24570 }, { "epoch": 5.696488585004056, "grad_norm": 0.9545032382011414, "learning_rate": 2.06011099247007e-05, "loss": 1.4775, "mean_token_accuracy": 0.6825381144881248, "num_tokens": 396133529.0, "step": 24580 }, { "epoch": 5.6988063506779465, "grad_norm": 0.9445208311080933, "learning_rate": 2.058263919179318e-05, "loss": 1.4813, "mean_token_accuracy": 0.6843006357550621, "num_tokens": 396295117.0, "step": 24590 }, { "epoch": 5.701124116351837, "grad_norm": 0.9437428712844849, "learning_rate": 2.0564170947559058e-05, "loss": 1.4875, "mean_token_accuracy": 0.6814496010541916, "num_tokens": 396456345.0, "step": 24600 }, { "epoch": 5.703441882025727, "grad_norm": 0.9117065668106079, "learning_rate": 2.0545705202403065e-05, "loss": 1.4879, "mean_token_accuracy": 0.6822070881724358, "num_tokens": 396616733.0, "step": 24610 }, { "epoch": 5.705759647699617, "grad_norm": 0.9288530945777893, "learning_rate": 2.0527241966728516e-05, "loss": 1.4753, "mean_token_accuracy": 0.6849299758672714, "num_tokens": 396777761.0, "step": 24620 }, { "epoch": 5.708077413373508, "grad_norm": 0.9839590787887573, "learning_rate": 2.050878125093732e-05, "loss": 1.4863, "mean_token_accuracy": 0.6826990723609925, "num_tokens": 396938677.0, "step": 24630 }, { "epoch": 5.710395179047398, "grad_norm": 0.9148010015487671, "learning_rate": 2.0490323065429953e-05, "loss": 1.4771, "mean_token_accuracy": 0.6824682995676994, "num_tokens": 397099597.0, "step": 24640 }, { "epoch": 5.7127129447212885, "grad_norm": 1.0311312675476074, "learning_rate": 2.0471867420605485e-05, "loss": 1.4748, "mean_token_accuracy": 0.6833994731307029, "num_tokens": 397260917.0, "step": 24650 }, { "epoch": 5.715030710395179, "grad_norm": 1.008866786956787, "learning_rate": 2.0453414326861532e-05, "loss": 1.4895, "mean_token_accuracy": 0.6818422749638557, "num_tokens": 397422365.0, "step": 24660 }, { "epoch": 5.71734847606907, "grad_norm": 0.8940554261207581, "learning_rate": 2.043496379459429e-05, "loss": 1.5047, "mean_token_accuracy": 0.6800349369645119, "num_tokens": 397583867.0, "step": 24670 }, { "epoch": 5.71966624174296, "grad_norm": 0.9438555240631104, "learning_rate": 2.041651583419851e-05, "loss": 1.4795, "mean_token_accuracy": 0.6846083581447602, "num_tokens": 397745705.0, "step": 24680 }, { "epoch": 5.72198400741685, "grad_norm": 0.8878459334373474, "learning_rate": 2.039807045606748e-05, "loss": 1.478, "mean_token_accuracy": 0.6820854336023331, "num_tokens": 397906880.0, "step": 24690 }, { "epoch": 5.72430177309074, "grad_norm": 0.9212328791618347, "learning_rate": 2.0379627670593055e-05, "loss": 1.4846, "mean_token_accuracy": 0.6824972867965698, "num_tokens": 398068610.0, "step": 24700 }, { "epoch": 5.726619538764631, "grad_norm": 1.0441721677780151, "learning_rate": 2.0361187488165608e-05, "loss": 1.481, "mean_token_accuracy": 0.6833595931529999, "num_tokens": 398230467.0, "step": 24710 }, { "epoch": 5.728937304438521, "grad_norm": 0.9156786799430847, "learning_rate": 2.034274991917406e-05, "loss": 1.4839, "mean_token_accuracy": 0.6826907485723496, "num_tokens": 398392074.0, "step": 24720 }, { "epoch": 5.731255070112412, "grad_norm": 0.9533706307411194, "learning_rate": 2.032431497400587e-05, "loss": 1.4848, "mean_token_accuracy": 0.68309555798769, "num_tokens": 398551840.0, "step": 24730 }, { "epoch": 5.733572835786302, "grad_norm": 0.9617100358009338, "learning_rate": 2.0305882663046975e-05, "loss": 1.4627, "mean_token_accuracy": 0.6853883549571037, "num_tokens": 398713490.0, "step": 24740 }, { "epoch": 5.735890601460192, "grad_norm": 0.9400156140327454, "learning_rate": 2.0287452996681867e-05, "loss": 1.4941, "mean_token_accuracy": 0.6813562154769898, "num_tokens": 398875457.0, "step": 24750 }, { "epoch": 5.738208367134082, "grad_norm": 0.9946928024291992, "learning_rate": 2.0269025985293543e-05, "loss": 1.4872, "mean_token_accuracy": 0.6819316118955612, "num_tokens": 399037248.0, "step": 24760 }, { "epoch": 5.7405261328079735, "grad_norm": 0.9844775795936584, "learning_rate": 2.0250601639263495e-05, "loss": 1.4748, "mean_token_accuracy": 0.6835900023579597, "num_tokens": 399197114.0, "step": 24770 }, { "epoch": 5.742843898481864, "grad_norm": 0.91276615858078, "learning_rate": 2.023217996897172e-05, "loss": 1.4942, "mean_token_accuracy": 0.6829621240496635, "num_tokens": 399358835.0, "step": 24780 }, { "epoch": 5.745161664155754, "grad_norm": 0.9371582269668579, "learning_rate": 2.0213760984796703e-05, "loss": 1.4721, "mean_token_accuracy": 0.6840099126100541, "num_tokens": 399520172.0, "step": 24790 }, { "epoch": 5.747479429829644, "grad_norm": 0.9580284953117371, "learning_rate": 2.0195344697115417e-05, "loss": 1.4766, "mean_token_accuracy": 0.680713340640068, "num_tokens": 399682232.0, "step": 24800 }, { "epoch": 5.749797195503534, "grad_norm": 0.9162783622741699, "learning_rate": 2.0176931116303317e-05, "loss": 1.4678, "mean_token_accuracy": 0.6865439251065254, "num_tokens": 399843399.0, "step": 24810 }, { "epoch": 5.7521149611774245, "grad_norm": 0.9594659209251404, "learning_rate": 2.0158520252734335e-05, "loss": 1.4694, "mean_token_accuracy": 0.6846700817346573, "num_tokens": 400003928.0, "step": 24820 }, { "epoch": 5.754432726851316, "grad_norm": 1.005556583404541, "learning_rate": 2.0140112116780863e-05, "loss": 1.4896, "mean_token_accuracy": 0.682821960747242, "num_tokens": 400163562.0, "step": 24830 }, { "epoch": 5.756750492525206, "grad_norm": 0.8620633482933044, "learning_rate": 2.0121706718813773e-05, "loss": 1.4915, "mean_token_accuracy": 0.6821605935692787, "num_tokens": 400323941.0, "step": 24840 }, { "epoch": 5.759068258199096, "grad_norm": 0.907159149646759, "learning_rate": 2.010330406920237e-05, "loss": 1.4715, "mean_token_accuracy": 0.6856137439608574, "num_tokens": 400486035.0, "step": 24850 }, { "epoch": 5.761386023872986, "grad_norm": 0.9153845310211182, "learning_rate": 2.0084904178314438e-05, "loss": 1.4915, "mean_token_accuracy": 0.6820729777216912, "num_tokens": 400647531.0, "step": 24860 }, { "epoch": 5.763703789546877, "grad_norm": 0.9482409358024597, "learning_rate": 2.0066507056516187e-05, "loss": 1.4917, "mean_token_accuracy": 0.6827699944376946, "num_tokens": 400808562.0, "step": 24870 }, { "epoch": 5.766021555220767, "grad_norm": 0.880073606967926, "learning_rate": 2.0048112714172273e-05, "loss": 1.4818, "mean_token_accuracy": 0.6824029430747032, "num_tokens": 400967241.0, "step": 24880 }, { "epoch": 5.768339320894658, "grad_norm": 1.0187894105911255, "learning_rate": 2.0029721161645792e-05, "loss": 1.4738, "mean_token_accuracy": 0.6835780829191208, "num_tokens": 401128267.0, "step": 24890 }, { "epoch": 5.770657086568548, "grad_norm": 0.9121785759925842, "learning_rate": 2.001133240929826e-05, "loss": 1.4801, "mean_token_accuracy": 0.6836853981018066, "num_tokens": 401289758.0, "step": 24900 }, { "epoch": 5.772974852242438, "grad_norm": 0.9978516101837158, "learning_rate": 1.9992946467489615e-05, "loss": 1.4734, "mean_token_accuracy": 0.6832943916320801, "num_tokens": 401450376.0, "step": 24910 }, { "epoch": 5.775292617916328, "grad_norm": 0.9700103402137756, "learning_rate": 1.9974563346578224e-05, "loss": 1.488, "mean_token_accuracy": 0.6803311035037041, "num_tokens": 401611599.0, "step": 24920 }, { "epoch": 5.777610383590219, "grad_norm": 0.898034393787384, "learning_rate": 1.9956183056920845e-05, "loss": 1.4799, "mean_token_accuracy": 0.6832446053624153, "num_tokens": 401772956.0, "step": 24930 }, { "epoch": 5.7799281492641095, "grad_norm": 0.9109935164451599, "learning_rate": 1.9937805608872657e-05, "loss": 1.481, "mean_token_accuracy": 0.6829732358455658, "num_tokens": 401934208.0, "step": 24940 }, { "epoch": 5.782245914938, "grad_norm": 0.9050207734107971, "learning_rate": 1.991943101278723e-05, "loss": 1.4993, "mean_token_accuracy": 0.6810339272022248, "num_tokens": 402095446.0, "step": 24950 }, { "epoch": 5.78456368061189, "grad_norm": 0.9778710603713989, "learning_rate": 1.990105927901653e-05, "loss": 1.4878, "mean_token_accuracy": 0.6817631378769875, "num_tokens": 402257023.0, "step": 24960 }, { "epoch": 5.786881446285781, "grad_norm": 0.9330336451530457, "learning_rate": 1.9882690417910916e-05, "loss": 1.4903, "mean_token_accuracy": 0.6825296700000762, "num_tokens": 402417988.0, "step": 24970 }, { "epoch": 5.789199211959671, "grad_norm": 0.9906246066093445, "learning_rate": 1.9864324439819115e-05, "loss": 1.467, "mean_token_accuracy": 0.6853843674063682, "num_tokens": 402579912.0, "step": 24980 }, { "epoch": 5.791516977633561, "grad_norm": 0.9382312297821045, "learning_rate": 1.9845961355088245e-05, "loss": 1.4632, "mean_token_accuracy": 0.6848302438855172, "num_tokens": 402742155.0, "step": 24990 }, { "epoch": 5.7938347433074515, "grad_norm": 0.8913921117782593, "learning_rate": 1.9827601174063782e-05, "loss": 1.4755, "mean_token_accuracy": 0.6837182670831681, "num_tokens": 402903904.0, "step": 25000 }, { "epoch": 5.796152508981342, "grad_norm": 0.9626984596252441, "learning_rate": 1.9809243907089573e-05, "loss": 1.4706, "mean_token_accuracy": 0.6846188187599183, "num_tokens": 403065974.0, "step": 25010 }, { "epoch": 5.798470274655232, "grad_norm": 0.9621884226799011, "learning_rate": 1.979088956450782e-05, "loss": 1.4967, "mean_token_accuracy": 0.6809614464640618, "num_tokens": 403225775.0, "step": 25020 }, { "epoch": 5.800788040329123, "grad_norm": 0.9498599171638489, "learning_rate": 1.9772538156659077e-05, "loss": 1.4814, "mean_token_accuracy": 0.6822582229971885, "num_tokens": 403387390.0, "step": 25030 }, { "epoch": 5.803105806003013, "grad_norm": 0.9311501979827881, "learning_rate": 1.9754189693882246e-05, "loss": 1.4649, "mean_token_accuracy": 0.6861807033419609, "num_tokens": 403548949.0, "step": 25040 }, { "epoch": 5.805423571676903, "grad_norm": 0.9385866522789001, "learning_rate": 1.9735844186514573e-05, "loss": 1.4723, "mean_token_accuracy": 0.6835502743721008, "num_tokens": 403710190.0, "step": 25050 }, { "epoch": 5.807741337350794, "grad_norm": 0.9183253645896912, "learning_rate": 1.9717501644891633e-05, "loss": 1.4835, "mean_token_accuracy": 0.6825462028384208, "num_tokens": 403871154.0, "step": 25060 }, { "epoch": 5.810059103024685, "grad_norm": 0.9250614047050476, "learning_rate": 1.9699162079347335e-05, "loss": 1.468, "mean_token_accuracy": 0.6857350006699562, "num_tokens": 404031045.0, "step": 25070 }, { "epoch": 5.812376868698575, "grad_norm": 0.9885476231575012, "learning_rate": 1.9680825500213906e-05, "loss": 1.49, "mean_token_accuracy": 0.682492145895958, "num_tokens": 404192112.0, "step": 25080 }, { "epoch": 5.814694634372465, "grad_norm": 0.9886691570281982, "learning_rate": 1.96624919178219e-05, "loss": 1.466, "mean_token_accuracy": 0.6847280248999595, "num_tokens": 404352895.0, "step": 25090 }, { "epoch": 5.817012400046355, "grad_norm": 0.9874179363250732, "learning_rate": 1.964416134250017e-05, "loss": 1.4685, "mean_token_accuracy": 0.684283272922039, "num_tokens": 404513737.0, "step": 25100 }, { "epoch": 5.819330165720245, "grad_norm": 0.9571714401245117, "learning_rate": 1.962583378457588e-05, "loss": 1.4807, "mean_token_accuracy": 0.6847361162304878, "num_tokens": 404675496.0, "step": 25110 }, { "epoch": 5.821647931394136, "grad_norm": 0.9234232306480408, "learning_rate": 1.9607509254374504e-05, "loss": 1.4618, "mean_token_accuracy": 0.6851789653301239, "num_tokens": 404837273.0, "step": 25120 }, { "epoch": 5.823965697068027, "grad_norm": 0.9671846628189087, "learning_rate": 1.9589187762219792e-05, "loss": 1.4805, "mean_token_accuracy": 0.6828990891575814, "num_tokens": 404999010.0, "step": 25130 }, { "epoch": 5.826283462741917, "grad_norm": 0.9294595122337341, "learning_rate": 1.95708693184338e-05, "loss": 1.4652, "mean_token_accuracy": 0.6834053784608841, "num_tokens": 405160552.0, "step": 25140 }, { "epoch": 5.828601228415807, "grad_norm": 0.9419618844985962, "learning_rate": 1.9552553933336854e-05, "loss": 1.476, "mean_token_accuracy": 0.6843108981847763, "num_tokens": 405321518.0, "step": 25150 }, { "epoch": 5.830918994089697, "grad_norm": 0.9494933485984802, "learning_rate": 1.9534241617247565e-05, "loss": 1.4667, "mean_token_accuracy": 0.6848409101366997, "num_tokens": 405482943.0, "step": 25160 }, { "epoch": 5.833236759763588, "grad_norm": 0.9306872487068176, "learning_rate": 1.9515932380482804e-05, "loss": 1.4801, "mean_token_accuracy": 0.6830917462706566, "num_tokens": 405644517.0, "step": 25170 }, { "epoch": 5.8355545254374785, "grad_norm": 0.9590463042259216, "learning_rate": 1.949762623335772e-05, "loss": 1.4739, "mean_token_accuracy": 0.6828176796436309, "num_tokens": 405805746.0, "step": 25180 }, { "epoch": 5.837872291111369, "grad_norm": 0.9684875011444092, "learning_rate": 1.947932318618572e-05, "loss": 1.4835, "mean_token_accuracy": 0.6826340079307556, "num_tokens": 405967498.0, "step": 25190 }, { "epoch": 5.840190056785259, "grad_norm": 0.9727869033813477, "learning_rate": 1.946102324927845e-05, "loss": 1.4824, "mean_token_accuracy": 0.6828630775213241, "num_tokens": 406128789.0, "step": 25200 }, { "epoch": 5.842507822459149, "grad_norm": 1.0190255641937256, "learning_rate": 1.944272643294582e-05, "loss": 1.4905, "mean_token_accuracy": 0.6806401088833809, "num_tokens": 406289628.0, "step": 25210 }, { "epoch": 5.844825588133039, "grad_norm": 0.9875733256340027, "learning_rate": 1.942443274749598e-05, "loss": 1.489, "mean_token_accuracy": 0.6819958135485649, "num_tokens": 406450070.0, "step": 25220 }, { "epoch": 5.84714335380693, "grad_norm": 0.927956223487854, "learning_rate": 1.9406142203235302e-05, "loss": 1.471, "mean_token_accuracy": 0.6848057523369789, "num_tokens": 406611475.0, "step": 25230 }, { "epoch": 5.849461119480821, "grad_norm": 0.9539511203765869, "learning_rate": 1.9387854810468407e-05, "loss": 1.4839, "mean_token_accuracy": 0.6829037889838219, "num_tokens": 406773110.0, "step": 25240 }, { "epoch": 5.851778885154711, "grad_norm": 0.9609122276306152, "learning_rate": 1.9369570579498133e-05, "loss": 1.4724, "mean_token_accuracy": 0.6849015787243843, "num_tokens": 406934888.0, "step": 25250 }, { "epoch": 5.854096650828601, "grad_norm": 0.9259504675865173, "learning_rate": 1.9351289520625527e-05, "loss": 1.4926, "mean_token_accuracy": 0.6810162127017975, "num_tokens": 407095436.0, "step": 25260 }, { "epoch": 5.856414416502492, "grad_norm": 0.9082051515579224, "learning_rate": 1.9333011644149864e-05, "loss": 1.4555, "mean_token_accuracy": 0.6858873665332794, "num_tokens": 407257185.0, "step": 25270 }, { "epoch": 5.858732182176382, "grad_norm": 0.9564042687416077, "learning_rate": 1.9314736960368614e-05, "loss": 1.4764, "mean_token_accuracy": 0.6830277547240258, "num_tokens": 407418380.0, "step": 25280 }, { "epoch": 5.8610499478502724, "grad_norm": 0.9497370719909668, "learning_rate": 1.9296465479577463e-05, "loss": 1.4716, "mean_token_accuracy": 0.6819067463278771, "num_tokens": 407579255.0, "step": 25290 }, { "epoch": 5.863367713524163, "grad_norm": 0.9269720911979675, "learning_rate": 1.9278197212070273e-05, "loss": 1.4698, "mean_token_accuracy": 0.6862071692943573, "num_tokens": 407740921.0, "step": 25300 }, { "epoch": 5.865685479198053, "grad_norm": 0.9867302179336548, "learning_rate": 1.9259932168139112e-05, "loss": 1.4778, "mean_token_accuracy": 0.6856586337089539, "num_tokens": 407901418.0, "step": 25310 }, { "epoch": 5.868003244871943, "grad_norm": 1.0003857612609863, "learning_rate": 1.9241670358074224e-05, "loss": 1.4723, "mean_token_accuracy": 0.6829286620020867, "num_tokens": 408062853.0, "step": 25320 }, { "epoch": 5.870321010545834, "grad_norm": 0.9462061524391174, "learning_rate": 1.9223411792164033e-05, "loss": 1.474, "mean_token_accuracy": 0.6850529834628105, "num_tokens": 408223456.0, "step": 25330 }, { "epoch": 5.872638776219724, "grad_norm": 0.979820728302002, "learning_rate": 1.9205156480695135e-05, "loss": 1.4805, "mean_token_accuracy": 0.6834593042731285, "num_tokens": 408385209.0, "step": 25340 }, { "epoch": 5.8749565418936145, "grad_norm": 0.9693838953971863, "learning_rate": 1.9186904433952297e-05, "loss": 1.4758, "mean_token_accuracy": 0.6823131129145622, "num_tokens": 408545217.0, "step": 25350 }, { "epoch": 5.877274307567505, "grad_norm": 0.9404817819595337, "learning_rate": 1.9168655662218437e-05, "loss": 1.4813, "mean_token_accuracy": 0.6834396541118621, "num_tokens": 408705943.0, "step": 25360 }, { "epoch": 5.879592073241396, "grad_norm": 0.9710847735404968, "learning_rate": 1.9150410175774637e-05, "loss": 1.4723, "mean_token_accuracy": 0.6848701283335685, "num_tokens": 408866971.0, "step": 25370 }, { "epoch": 5.881909838915286, "grad_norm": 0.9380342364311218, "learning_rate": 1.9132167984900122e-05, "loss": 1.474, "mean_token_accuracy": 0.6843603804707528, "num_tokens": 409027068.0, "step": 25380 }, { "epoch": 5.884227604589176, "grad_norm": 0.9035939574241638, "learning_rate": 1.911392909987227e-05, "loss": 1.4807, "mean_token_accuracy": 0.6834913194179535, "num_tokens": 409188733.0, "step": 25390 }, { "epoch": 5.886545370263066, "grad_norm": 0.9706595540046692, "learning_rate": 1.909569353096658e-05, "loss": 1.4852, "mean_token_accuracy": 0.6824498564004898, "num_tokens": 409349978.0, "step": 25400 }, { "epoch": 5.8888631359369565, "grad_norm": 0.944389283657074, "learning_rate": 1.9077461288456692e-05, "loss": 1.4846, "mean_token_accuracy": 0.6829983994364739, "num_tokens": 409511268.0, "step": 25410 }, { "epoch": 5.891180901610847, "grad_norm": 0.9260926246643066, "learning_rate": 1.9059232382614377e-05, "loss": 1.4796, "mean_token_accuracy": 0.6838613390922547, "num_tokens": 409672889.0, "step": 25420 }, { "epoch": 5.893498667284738, "grad_norm": 1.0024808645248413, "learning_rate": 1.904100682370952e-05, "loss": 1.4811, "mean_token_accuracy": 0.6828532442450523, "num_tokens": 409834221.0, "step": 25430 }, { "epoch": 5.895816432958628, "grad_norm": 0.9176005721092224, "learning_rate": 1.9022784622010123e-05, "loss": 1.4744, "mean_token_accuracy": 0.6839819803833962, "num_tokens": 409995190.0, "step": 25440 }, { "epoch": 5.898134198632518, "grad_norm": 0.918170690536499, "learning_rate": 1.900456578778229e-05, "loss": 1.4625, "mean_token_accuracy": 0.6842417597770691, "num_tokens": 410156263.0, "step": 25450 }, { "epoch": 5.900451964306408, "grad_norm": 0.9105513095855713, "learning_rate": 1.8986350331290244e-05, "loss": 1.4738, "mean_token_accuracy": 0.685439795255661, "num_tokens": 410316680.0, "step": 25460 }, { "epoch": 5.9027697299802995, "grad_norm": 0.9327322244644165, "learning_rate": 1.896813826279628e-05, "loss": 1.4698, "mean_token_accuracy": 0.6852667018771171, "num_tokens": 410478000.0, "step": 25470 }, { "epoch": 5.90508749565419, "grad_norm": 0.9018927216529846, "learning_rate": 1.894992959256081e-05, "loss": 1.4853, "mean_token_accuracy": 0.6827414900064468, "num_tokens": 410639128.0, "step": 25480 }, { "epoch": 5.90740526132808, "grad_norm": 0.9220282435417175, "learning_rate": 1.8931724330842314e-05, "loss": 1.4882, "mean_token_accuracy": 0.6822974920272827, "num_tokens": 410800576.0, "step": 25490 }, { "epoch": 5.90972302700197, "grad_norm": 0.9923434257507324, "learning_rate": 1.8913522487897355e-05, "loss": 1.4838, "mean_token_accuracy": 0.6828869044780731, "num_tokens": 410961478.0, "step": 25500 }, { "epoch": 5.91204079267586, "grad_norm": 0.9397545456886292, "learning_rate": 1.889532407398058e-05, "loss": 1.4802, "mean_token_accuracy": 0.682678735256195, "num_tokens": 411123004.0, "step": 25510 }, { "epoch": 5.91435855834975, "grad_norm": 0.9431864619255066, "learning_rate": 1.887712909934469e-05, "loss": 1.4951, "mean_token_accuracy": 0.6811388731002808, "num_tokens": 411283550.0, "step": 25520 }, { "epoch": 5.9166763240236415, "grad_norm": 0.9822903275489807, "learning_rate": 1.8858937574240465e-05, "loss": 1.4901, "mean_token_accuracy": 0.682662945985794, "num_tokens": 411445324.0, "step": 25530 }, { "epoch": 5.918994089697532, "grad_norm": 0.9714484810829163, "learning_rate": 1.8840749508916715e-05, "loss": 1.4709, "mean_token_accuracy": 0.6836304172873497, "num_tokens": 411605977.0, "step": 25540 }, { "epoch": 5.921311855371422, "grad_norm": 1.0304479598999023, "learning_rate": 1.882256491362033e-05, "loss": 1.4941, "mean_token_accuracy": 0.6823256030678749, "num_tokens": 411766584.0, "step": 25550 }, { "epoch": 5.923629621045312, "grad_norm": 0.9290578961372375, "learning_rate": 1.8804383798596235e-05, "loss": 1.4731, "mean_token_accuracy": 0.685566246509552, "num_tokens": 411927699.0, "step": 25560 }, { "epoch": 5.925947386719203, "grad_norm": 0.8718850016593933, "learning_rate": 1.8786206174087378e-05, "loss": 1.4744, "mean_token_accuracy": 0.6829119607806206, "num_tokens": 412089065.0, "step": 25570 }, { "epoch": 5.928265152393093, "grad_norm": 0.9141421914100647, "learning_rate": 1.8768032050334765e-05, "loss": 1.4677, "mean_token_accuracy": 0.6835500851273537, "num_tokens": 412250447.0, "step": 25580 }, { "epoch": 5.930582918066984, "grad_norm": 0.9651163220405579, "learning_rate": 1.8749861437577415e-05, "loss": 1.4688, "mean_token_accuracy": 0.6835002735257149, "num_tokens": 412411998.0, "step": 25590 }, { "epoch": 5.932900683740874, "grad_norm": 0.9240498542785645, "learning_rate": 1.8731694346052376e-05, "loss": 1.4752, "mean_token_accuracy": 0.6846168175339699, "num_tokens": 412573495.0, "step": 25600 }, { "epoch": 5.935218449414764, "grad_norm": 0.9529602527618408, "learning_rate": 1.8713530785994704e-05, "loss": 1.4824, "mean_token_accuracy": 0.682653796672821, "num_tokens": 412735864.0, "step": 25610 }, { "epoch": 5.937536215088654, "grad_norm": 0.9212474822998047, "learning_rate": 1.8695370767637475e-05, "loss": 1.4907, "mean_token_accuracy": 0.6827538087964058, "num_tokens": 412896825.0, "step": 25620 }, { "epoch": 5.939853980762545, "grad_norm": 0.9325900077819824, "learning_rate": 1.867721430121176e-05, "loss": 1.4941, "mean_token_accuracy": 0.6822805508971215, "num_tokens": 413058016.0, "step": 25630 }, { "epoch": 5.942171746436435, "grad_norm": 0.9387367963790894, "learning_rate": 1.8659061396946638e-05, "loss": 1.4741, "mean_token_accuracy": 0.6835706040263176, "num_tokens": 413219130.0, "step": 25640 }, { "epoch": 5.944489512110326, "grad_norm": 0.9686933159828186, "learning_rate": 1.864091206506918e-05, "loss": 1.4776, "mean_token_accuracy": 0.6854066029191017, "num_tokens": 413380807.0, "step": 25650 }, { "epoch": 5.946807277784216, "grad_norm": 0.932312548160553, "learning_rate": 1.8622766315804435e-05, "loss": 1.4774, "mean_token_accuracy": 0.6830252289772034, "num_tokens": 413542674.0, "step": 25660 }, { "epoch": 5.949125043458106, "grad_norm": 0.999439001083374, "learning_rate": 1.8604624159375443e-05, "loss": 1.4782, "mean_token_accuracy": 0.682141263782978, "num_tokens": 413703728.0, "step": 25670 }, { "epoch": 5.951442809131997, "grad_norm": 0.9317094683647156, "learning_rate": 1.8586485606003218e-05, "loss": 1.4743, "mean_token_accuracy": 0.6833127722144127, "num_tokens": 413864259.0, "step": 25680 }, { "epoch": 5.953760574805887, "grad_norm": 0.8941425085067749, "learning_rate": 1.856835066590674e-05, "loss": 1.4791, "mean_token_accuracy": 0.6826879814267158, "num_tokens": 414025461.0, "step": 25690 }, { "epoch": 5.9560783404797775, "grad_norm": 0.9428048133850098, "learning_rate": 1.855021934930296e-05, "loss": 1.4926, "mean_token_accuracy": 0.6829124420881272, "num_tokens": 414186133.0, "step": 25700 }, { "epoch": 5.958396106153668, "grad_norm": 0.9740791320800781, "learning_rate": 1.8532091666406787e-05, "loss": 1.474, "mean_token_accuracy": 0.6834058597683906, "num_tokens": 414347022.0, "step": 25710 }, { "epoch": 5.960713871827558, "grad_norm": 0.9844754934310913, "learning_rate": 1.8513967627431066e-05, "loss": 1.4796, "mean_token_accuracy": 0.6833277195692062, "num_tokens": 414508715.0, "step": 25720 }, { "epoch": 5.963031637501449, "grad_norm": 0.9652242660522461, "learning_rate": 1.8495847242586616e-05, "loss": 1.4966, "mean_token_accuracy": 0.6813862577080727, "num_tokens": 414670454.0, "step": 25730 }, { "epoch": 5.965349403175339, "grad_norm": 0.9557402729988098, "learning_rate": 1.847773052208218e-05, "loss": 1.4578, "mean_token_accuracy": 0.6871259108185768, "num_tokens": 414831611.0, "step": 25740 }, { "epoch": 5.967667168849229, "grad_norm": 0.9290654063224792, "learning_rate": 1.845961747612445e-05, "loss": 1.4698, "mean_token_accuracy": 0.6840295597910881, "num_tokens": 414992185.0, "step": 25750 }, { "epoch": 5.9699849345231195, "grad_norm": 0.9732536673545837, "learning_rate": 1.8441508114918015e-05, "loss": 1.4725, "mean_token_accuracy": 0.6849562019109726, "num_tokens": 415154225.0, "step": 25760 }, { "epoch": 5.97230270019701, "grad_norm": 0.9811781644821167, "learning_rate": 1.8423402448665433e-05, "loss": 1.4759, "mean_token_accuracy": 0.6858690246939659, "num_tokens": 415315370.0, "step": 25770 }, { "epoch": 5.974620465870901, "grad_norm": 0.9173783659934998, "learning_rate": 1.840530048756715e-05, "loss": 1.4631, "mean_token_accuracy": 0.6841573253273964, "num_tokens": 415476908.0, "step": 25780 }, { "epoch": 5.976938231544791, "grad_norm": 0.9719087481498718, "learning_rate": 1.8387202241821538e-05, "loss": 1.4578, "mean_token_accuracy": 0.6854892507195472, "num_tokens": 415638027.0, "step": 25790 }, { "epoch": 5.979255997218681, "grad_norm": 0.9413026571273804, "learning_rate": 1.8369107721624874e-05, "loss": 1.4741, "mean_token_accuracy": 0.6839399799704552, "num_tokens": 415796137.0, "step": 25800 }, { "epoch": 5.981573762892571, "grad_norm": 0.9513376355171204, "learning_rate": 1.8351016937171322e-05, "loss": 1.4763, "mean_token_accuracy": 0.6832201302051544, "num_tokens": 415958080.0, "step": 25810 }, { "epoch": 5.983891528566462, "grad_norm": 0.9522374272346497, "learning_rate": 1.833292989865296e-05, "loss": 1.4768, "mean_token_accuracy": 0.6828445613384246, "num_tokens": 416119713.0, "step": 25820 }, { "epoch": 5.986209294240353, "grad_norm": 1.0199464559555054, "learning_rate": 1.8314846616259748e-05, "loss": 1.4787, "mean_token_accuracy": 0.6831972986459732, "num_tokens": 416281751.0, "step": 25830 }, { "epoch": 5.988527059914243, "grad_norm": 0.9082645773887634, "learning_rate": 1.829676710017953e-05, "loss": 1.5174, "mean_token_accuracy": 0.6794212073087692, "num_tokens": 416442769.0, "step": 25840 }, { "epoch": 5.990844825588133, "grad_norm": 0.9791744947433472, "learning_rate": 1.827869136059804e-05, "loss": 1.4773, "mean_token_accuracy": 0.6832885712385177, "num_tokens": 416603483.0, "step": 25850 }, { "epoch": 5.993162591262023, "grad_norm": 0.965356171131134, "learning_rate": 1.8260619407698853e-05, "loss": 1.4816, "mean_token_accuracy": 0.6852660074830055, "num_tokens": 416764773.0, "step": 25860 }, { "epoch": 5.995480356935913, "grad_norm": 0.9595062732696533, "learning_rate": 1.8242551251663436e-05, "loss": 1.4852, "mean_token_accuracy": 0.6826123103499413, "num_tokens": 416925911.0, "step": 25870 }, { "epoch": 5.9977981226098045, "grad_norm": 0.9432960748672485, "learning_rate": 1.8224486902671113e-05, "loss": 1.4704, "mean_token_accuracy": 0.6844592213630676, "num_tokens": 417087091.0, "step": 25880 }, { "epoch": 6.0, "grad_norm": 1.4644317626953125, "learning_rate": 1.8206426370899065e-05, "loss": 1.4752, "mean_token_accuracy": 0.6828099724493528, "num_tokens": 417238524.0, "step": 25890 }, { "epoch": 6.00231776567389, "grad_norm": 0.9435387849807739, "learning_rate": 1.8188369666522314e-05, "loss": 1.4434, "mean_token_accuracy": 0.6872098341584205, "num_tokens": 417399403.0, "step": 25900 }, { "epoch": 6.00463553134778, "grad_norm": 0.9703451991081238, "learning_rate": 1.817031679971373e-05, "loss": 1.4699, "mean_token_accuracy": 0.6841319516301155, "num_tokens": 417561191.0, "step": 25910 }, { "epoch": 6.0069532970216715, "grad_norm": 0.9329960942268372, "learning_rate": 1.815226778064403e-05, "loss": 1.4558, "mean_token_accuracy": 0.6856221750378608, "num_tokens": 417722922.0, "step": 25920 }, { "epoch": 6.009271062695562, "grad_norm": 0.9711855053901672, "learning_rate": 1.8134222619481747e-05, "loss": 1.4598, "mean_token_accuracy": 0.6868283092975617, "num_tokens": 417884632.0, "step": 25930 }, { "epoch": 6.011588828369452, "grad_norm": 0.9471392035484314, "learning_rate": 1.811618132639325e-05, "loss": 1.4485, "mean_token_accuracy": 0.6890185594558715, "num_tokens": 418045157.0, "step": 25940 }, { "epoch": 6.013906594043342, "grad_norm": 0.9610333442687988, "learning_rate": 1.8098143911542732e-05, "loss": 1.4617, "mean_token_accuracy": 0.6876611828804016, "num_tokens": 418206645.0, "step": 25950 }, { "epoch": 6.016224359717232, "grad_norm": 0.9760961532592773, "learning_rate": 1.8080110385092187e-05, "loss": 1.4468, "mean_token_accuracy": 0.6887180984020234, "num_tokens": 418367623.0, "step": 25960 }, { "epoch": 6.018542125391123, "grad_norm": 1.027928113937378, "learning_rate": 1.8062080757201434e-05, "loss": 1.462, "mean_token_accuracy": 0.6871805265545845, "num_tokens": 418527526.0, "step": 25970 }, { "epoch": 6.0208598910650135, "grad_norm": 1.0210903882980347, "learning_rate": 1.8044055038028085e-05, "loss": 1.4465, "mean_token_accuracy": 0.6874225839972496, "num_tokens": 418689214.0, "step": 25980 }, { "epoch": 6.023177656738904, "grad_norm": 0.9869022965431213, "learning_rate": 1.802603323772756e-05, "loss": 1.4603, "mean_token_accuracy": 0.6877674922347069, "num_tokens": 418850700.0, "step": 25990 }, { "epoch": 6.025495422412794, "grad_norm": 0.9532431960105896, "learning_rate": 1.800801536645306e-05, "loss": 1.457, "mean_token_accuracy": 0.6866831541061401, "num_tokens": 419011088.0, "step": 26000 }, { "epoch": 6.027813188086684, "grad_norm": 1.054762840270996, "learning_rate": 1.799000143435558e-05, "loss": 1.4375, "mean_token_accuracy": 0.6883418098092079, "num_tokens": 419173088.0, "step": 26010 }, { "epoch": 6.030130953760575, "grad_norm": 0.9225447177886963, "learning_rate": 1.797199145158389e-05, "loss": 1.469, "mean_token_accuracy": 0.68516204059124, "num_tokens": 419333945.0, "step": 26020 }, { "epoch": 6.032448719434465, "grad_norm": 0.9211044907569885, "learning_rate": 1.7953985428284544e-05, "loss": 1.4541, "mean_token_accuracy": 0.6866428390145302, "num_tokens": 419494429.0, "step": 26030 }, { "epoch": 6.034766485108356, "grad_norm": 0.975175678730011, "learning_rate": 1.7935983374601855e-05, "loss": 1.4535, "mean_token_accuracy": 0.6880973160266877, "num_tokens": 419655425.0, "step": 26040 }, { "epoch": 6.037084250782246, "grad_norm": 0.9843259453773499, "learning_rate": 1.7917985300677908e-05, "loss": 1.4648, "mean_token_accuracy": 0.684689137339592, "num_tokens": 419815771.0, "step": 26050 }, { "epoch": 6.039402016456136, "grad_norm": 0.9596763849258423, "learning_rate": 1.789999121665254e-05, "loss": 1.4594, "mean_token_accuracy": 0.6868163228034974, "num_tokens": 419975968.0, "step": 26060 }, { "epoch": 6.041719782130027, "grad_norm": 1.0523635149002075, "learning_rate": 1.788200113266334e-05, "loss": 1.464, "mean_token_accuracy": 0.6857366010546684, "num_tokens": 420136925.0, "step": 26070 }, { "epoch": 6.044037547803917, "grad_norm": 0.9338698387145996, "learning_rate": 1.786401505884565e-05, "loss": 1.4651, "mean_token_accuracy": 0.6842764988541603, "num_tokens": 420297726.0, "step": 26080 }, { "epoch": 6.046355313477807, "grad_norm": 0.9940962791442871, "learning_rate": 1.7846033005332545e-05, "loss": 1.4583, "mean_token_accuracy": 0.6855599120259285, "num_tokens": 420458771.0, "step": 26090 }, { "epoch": 6.048673079151698, "grad_norm": 0.9262765049934387, "learning_rate": 1.7828054982254843e-05, "loss": 1.4626, "mean_token_accuracy": 0.6860551044344902, "num_tokens": 420620743.0, "step": 26100 }, { "epoch": 6.050990844825588, "grad_norm": 0.9921814799308777, "learning_rate": 1.781008099974108e-05, "loss": 1.4687, "mean_token_accuracy": 0.6867390692234039, "num_tokens": 420782550.0, "step": 26110 }, { "epoch": 6.053308610499479, "grad_norm": 0.9671886563301086, "learning_rate": 1.7792111067917526e-05, "loss": 1.458, "mean_token_accuracy": 0.6852737098932267, "num_tokens": 420943424.0, "step": 26120 }, { "epoch": 6.055626376173369, "grad_norm": 1.0251903533935547, "learning_rate": 1.7774145196908167e-05, "loss": 1.4508, "mean_token_accuracy": 0.6869918838143348, "num_tokens": 421104744.0, "step": 26130 }, { "epoch": 6.057944141847259, "grad_norm": 0.9381194710731506, "learning_rate": 1.7756183396834697e-05, "loss": 1.454, "mean_token_accuracy": 0.6874110370874404, "num_tokens": 421266503.0, "step": 26140 }, { "epoch": 6.0602619075211495, "grad_norm": 0.9582933187484741, "learning_rate": 1.773822567781652e-05, "loss": 1.4771, "mean_token_accuracy": 0.683307833969593, "num_tokens": 421427497.0, "step": 26150 }, { "epoch": 6.06257967319504, "grad_norm": 0.9711164236068726, "learning_rate": 1.772027204997074e-05, "loss": 1.4712, "mean_token_accuracy": 0.6852592006325722, "num_tokens": 421587308.0, "step": 26160 }, { "epoch": 6.064897438868931, "grad_norm": 0.9396569132804871, "learning_rate": 1.7702322523412153e-05, "loss": 1.4636, "mean_token_accuracy": 0.686156903207302, "num_tokens": 421748314.0, "step": 26170 }, { "epoch": 6.067215204542821, "grad_norm": 0.9930253624916077, "learning_rate": 1.7684377108253257e-05, "loss": 1.4682, "mean_token_accuracy": 0.683904992043972, "num_tokens": 421909572.0, "step": 26180 }, { "epoch": 6.069532970216711, "grad_norm": 0.9757665395736694, "learning_rate": 1.7666435814604213e-05, "loss": 1.4601, "mean_token_accuracy": 0.6862719401717186, "num_tokens": 422071396.0, "step": 26190 }, { "epoch": 6.071850735890601, "grad_norm": 1.024673342704773, "learning_rate": 1.764849865257287e-05, "loss": 1.46, "mean_token_accuracy": 0.6850072026252747, "num_tokens": 422232140.0, "step": 26200 }, { "epoch": 6.0741685015644915, "grad_norm": 0.9787823557853699, "learning_rate": 1.763056563226476e-05, "loss": 1.4702, "mean_token_accuracy": 0.6836473241448402, "num_tokens": 422393461.0, "step": 26210 }, { "epoch": 6.076486267238383, "grad_norm": 0.9692432284355164, "learning_rate": 1.7612636763783063e-05, "loss": 1.4665, "mean_token_accuracy": 0.6846235662698745, "num_tokens": 422555396.0, "step": 26220 }, { "epoch": 6.078804032912273, "grad_norm": 1.0032254457473755, "learning_rate": 1.7594712057228635e-05, "loss": 1.4802, "mean_token_accuracy": 0.6849156871438027, "num_tokens": 422716744.0, "step": 26230 }, { "epoch": 6.081121798586163, "grad_norm": 1.0280731916427612, "learning_rate": 1.7576791522699976e-05, "loss": 1.4477, "mean_token_accuracy": 0.6874064311385155, "num_tokens": 422878389.0, "step": 26240 }, { "epoch": 6.083439564260053, "grad_norm": 0.9527820348739624, "learning_rate": 1.7558875170293244e-05, "loss": 1.4655, "mean_token_accuracy": 0.6862843036651611, "num_tokens": 423039826.0, "step": 26250 }, { "epoch": 6.085757329933943, "grad_norm": 0.97977215051651, "learning_rate": 1.7540963010102236e-05, "loss": 1.4464, "mean_token_accuracy": 0.6886539816856384, "num_tokens": 423201290.0, "step": 26260 }, { "epoch": 6.088075095607834, "grad_norm": 0.9844222068786621, "learning_rate": 1.752305505221839e-05, "loss": 1.4643, "mean_token_accuracy": 0.6852332070469856, "num_tokens": 423363084.0, "step": 26270 }, { "epoch": 6.090392861281725, "grad_norm": 1.0256034135818481, "learning_rate": 1.7505151306730775e-05, "loss": 1.4594, "mean_token_accuracy": 0.6855567499995232, "num_tokens": 423524809.0, "step": 26280 }, { "epoch": 6.092710626955615, "grad_norm": 0.9951295852661133, "learning_rate": 1.7487251783726085e-05, "loss": 1.4513, "mean_token_accuracy": 0.6874641150236129, "num_tokens": 423685708.0, "step": 26290 }, { "epoch": 6.095028392629505, "grad_norm": 0.9458116888999939, "learning_rate": 1.746935649328864e-05, "loss": 1.4536, "mean_token_accuracy": 0.6880922317504883, "num_tokens": 423847625.0, "step": 26300 }, { "epoch": 6.097346158303395, "grad_norm": 0.9875934720039368, "learning_rate": 1.745146544550037e-05, "loss": 1.4594, "mean_token_accuracy": 0.6865083068609238, "num_tokens": 424009052.0, "step": 26310 }, { "epoch": 6.099663923977286, "grad_norm": 0.9891019463539124, "learning_rate": 1.7433578650440817e-05, "loss": 1.4505, "mean_token_accuracy": 0.6877846613526344, "num_tokens": 424170486.0, "step": 26320 }, { "epoch": 6.1019816896511765, "grad_norm": 0.9924180507659912, "learning_rate": 1.741569611818713e-05, "loss": 1.4568, "mean_token_accuracy": 0.6874511361122131, "num_tokens": 424329995.0, "step": 26330 }, { "epoch": 6.104299455325067, "grad_norm": 0.9857043027877808, "learning_rate": 1.739781785881405e-05, "loss": 1.466, "mean_token_accuracy": 0.686513352394104, "num_tokens": 424491698.0, "step": 26340 }, { "epoch": 6.106617220998957, "grad_norm": 0.9377350807189941, "learning_rate": 1.7379943882393917e-05, "loss": 1.4632, "mean_token_accuracy": 0.6854834869503975, "num_tokens": 424653057.0, "step": 26350 }, { "epoch": 6.108934986672847, "grad_norm": 0.9701479077339172, "learning_rate": 1.7362074198996652e-05, "loss": 1.4513, "mean_token_accuracy": 0.6870635092258454, "num_tokens": 424813768.0, "step": 26360 }, { "epoch": 6.111252752346738, "grad_norm": 0.944466769695282, "learning_rate": 1.734420881868977e-05, "loss": 1.4822, "mean_token_accuracy": 0.6837626576423645, "num_tokens": 424975364.0, "step": 26370 }, { "epoch": 6.113570518020628, "grad_norm": 0.9734157919883728, "learning_rate": 1.732634775153834e-05, "loss": 1.4558, "mean_token_accuracy": 0.6859775513410569, "num_tokens": 425136416.0, "step": 26380 }, { "epoch": 6.1158882836945185, "grad_norm": 1.022586703300476, "learning_rate": 1.7308491007605025e-05, "loss": 1.4673, "mean_token_accuracy": 0.6857727080583572, "num_tokens": 425298404.0, "step": 26390 }, { "epoch": 6.118206049368409, "grad_norm": 0.9624522924423218, "learning_rate": 1.7290638596950035e-05, "loss": 1.4602, "mean_token_accuracy": 0.6855768039822578, "num_tokens": 425459924.0, "step": 26400 }, { "epoch": 6.120523815042299, "grad_norm": 0.9977163672447205, "learning_rate": 1.7272790529631156e-05, "loss": 1.4547, "mean_token_accuracy": 0.68821372538805, "num_tokens": 425620643.0, "step": 26410 }, { "epoch": 6.12284158071619, "grad_norm": 0.9817999005317688, "learning_rate": 1.725494681570369e-05, "loss": 1.468, "mean_token_accuracy": 0.6853987589478493, "num_tokens": 425782188.0, "step": 26420 }, { "epoch": 6.12515934639008, "grad_norm": 0.9813991785049438, "learning_rate": 1.723710746522053e-05, "loss": 1.47, "mean_token_accuracy": 0.6849318444728851, "num_tokens": 425943887.0, "step": 26430 }, { "epoch": 6.12747711206397, "grad_norm": 0.9519615173339844, "learning_rate": 1.721927248823209e-05, "loss": 1.4445, "mean_token_accuracy": 0.687421603500843, "num_tokens": 426105744.0, "step": 26440 }, { "epoch": 6.129794877737861, "grad_norm": 1.0035909414291382, "learning_rate": 1.7201441894786323e-05, "loss": 1.4496, "mean_token_accuracy": 0.6858509257435799, "num_tokens": 426266565.0, "step": 26450 }, { "epoch": 6.132112643411751, "grad_norm": 0.9332961440086365, "learning_rate": 1.718361569492871e-05, "loss": 1.4627, "mean_token_accuracy": 0.6844363912940026, "num_tokens": 426428702.0, "step": 26460 }, { "epoch": 6.134430409085642, "grad_norm": 0.961576521396637, "learning_rate": 1.7165793898702264e-05, "loss": 1.4566, "mean_token_accuracy": 0.6856817021965981, "num_tokens": 426590263.0, "step": 26470 }, { "epoch": 6.136748174759532, "grad_norm": 0.9617564082145691, "learning_rate": 1.7147976516147506e-05, "loss": 1.4502, "mean_token_accuracy": 0.6890402987599373, "num_tokens": 426751229.0, "step": 26480 }, { "epoch": 6.139065940433422, "grad_norm": 1.0117207765579224, "learning_rate": 1.713016355730248e-05, "loss": 1.4758, "mean_token_accuracy": 0.6845542758703231, "num_tokens": 426912076.0, "step": 26490 }, { "epoch": 6.141383706107312, "grad_norm": 0.9488076567649841, "learning_rate": 1.711235503220274e-05, "loss": 1.4566, "mean_token_accuracy": 0.6856582060456275, "num_tokens": 427073385.0, "step": 26500 }, { "epoch": 6.143701471781203, "grad_norm": 0.9506925940513611, "learning_rate": 1.7094550950881318e-05, "loss": 1.4462, "mean_token_accuracy": 0.6887672305107116, "num_tokens": 427235104.0, "step": 26510 }, { "epoch": 6.146019237455094, "grad_norm": 1.0105575323104858, "learning_rate": 1.7076751323368778e-05, "loss": 1.4616, "mean_token_accuracy": 0.684637388586998, "num_tokens": 427396517.0, "step": 26520 }, { "epoch": 6.148337003128984, "grad_norm": 1.006343126296997, "learning_rate": 1.705895615969315e-05, "loss": 1.4628, "mean_token_accuracy": 0.6858861461281777, "num_tokens": 427557398.0, "step": 26530 }, { "epoch": 6.150654768802874, "grad_norm": 0.970847487449646, "learning_rate": 1.7041165469879957e-05, "loss": 1.4778, "mean_token_accuracy": 0.6849715188145638, "num_tokens": 427718573.0, "step": 26540 }, { "epoch": 6.152972534476764, "grad_norm": 1.0005520582199097, "learning_rate": 1.7023379263952203e-05, "loss": 1.4586, "mean_token_accuracy": 0.6862229242920875, "num_tokens": 427878041.0, "step": 26550 }, { "epoch": 6.1552903001506545, "grad_norm": 0.9817692041397095, "learning_rate": 1.7005597551930362e-05, "loss": 1.4615, "mean_token_accuracy": 0.6845095157623291, "num_tokens": 428038636.0, "step": 26560 }, { "epoch": 6.157608065824546, "grad_norm": 0.962030291557312, "learning_rate": 1.6987820343832383e-05, "loss": 1.4646, "mean_token_accuracy": 0.685101917386055, "num_tokens": 428200106.0, "step": 26570 }, { "epoch": 6.159925831498436, "grad_norm": 0.9701398015022278, "learning_rate": 1.6970047649673664e-05, "loss": 1.4787, "mean_token_accuracy": 0.6830510467290878, "num_tokens": 428360408.0, "step": 26580 }, { "epoch": 6.162243597172326, "grad_norm": 0.9927846789360046, "learning_rate": 1.6952279479467075e-05, "loss": 1.4558, "mean_token_accuracy": 0.6858642905950546, "num_tokens": 428522027.0, "step": 26590 }, { "epoch": 6.164561362846216, "grad_norm": 1.0257856845855713, "learning_rate": 1.693451584322293e-05, "loss": 1.4516, "mean_token_accuracy": 0.6871085643768311, "num_tokens": 428683295.0, "step": 26600 }, { "epoch": 6.166879128520106, "grad_norm": 1.0340486764907837, "learning_rate": 1.691675675094899e-05, "loss": 1.4512, "mean_token_accuracy": 0.6877652540802955, "num_tokens": 428844567.0, "step": 26610 }, { "epoch": 6.169196894193997, "grad_norm": 0.9450758099555969, "learning_rate": 1.689900221265045e-05, "loss": 1.4521, "mean_token_accuracy": 0.688511623442173, "num_tokens": 429004842.0, "step": 26620 }, { "epoch": 6.171514659867888, "grad_norm": 0.9533858895301819, "learning_rate": 1.6881252238329954e-05, "loss": 1.4594, "mean_token_accuracy": 0.6851364299654961, "num_tokens": 429166434.0, "step": 26630 }, { "epoch": 6.173832425541778, "grad_norm": 0.9578535556793213, "learning_rate": 1.686350683798756e-05, "loss": 1.4607, "mean_token_accuracy": 0.6862990573048592, "num_tokens": 429327453.0, "step": 26640 }, { "epoch": 6.176150191215668, "grad_norm": 0.9750587940216064, "learning_rate": 1.6845766021620757e-05, "loss": 1.4674, "mean_token_accuracy": 0.6859272181987762, "num_tokens": 429488327.0, "step": 26650 }, { "epoch": 6.178467956889558, "grad_norm": 0.9509193897247314, "learning_rate": 1.6828029799224444e-05, "loss": 1.4674, "mean_token_accuracy": 0.6860796764492989, "num_tokens": 429649116.0, "step": 26660 }, { "epoch": 6.180785722563449, "grad_norm": 0.988453209400177, "learning_rate": 1.681029818079094e-05, "loss": 1.4684, "mean_token_accuracy": 0.6846799075603485, "num_tokens": 429810844.0, "step": 26670 }, { "epoch": 6.1831034882373395, "grad_norm": 0.9199245572090149, "learning_rate": 1.6792571176309972e-05, "loss": 1.453, "mean_token_accuracy": 0.6871122628450393, "num_tokens": 429972103.0, "step": 26680 }, { "epoch": 6.18542125391123, "grad_norm": 0.9358761310577393, "learning_rate": 1.6774848795768655e-05, "loss": 1.4627, "mean_token_accuracy": 0.6852865397930146, "num_tokens": 430134040.0, "step": 26690 }, { "epoch": 6.18773901958512, "grad_norm": 0.9718300104141235, "learning_rate": 1.675713104915151e-05, "loss": 1.453, "mean_token_accuracy": 0.6863400980830192, "num_tokens": 430295682.0, "step": 26700 }, { "epoch": 6.19005678525901, "grad_norm": 0.9683449268341064, "learning_rate": 1.6739417946440443e-05, "loss": 1.4524, "mean_token_accuracy": 0.6866264745593071, "num_tokens": 430457551.0, "step": 26710 }, { "epoch": 6.1923745509329, "grad_norm": 0.9430807828903198, "learning_rate": 1.6721709497614747e-05, "loss": 1.4481, "mean_token_accuracy": 0.6865129947662354, "num_tokens": 430619042.0, "step": 26720 }, { "epoch": 6.194692316606791, "grad_norm": 1.0190078020095825, "learning_rate": 1.670400571265109e-05, "loss": 1.461, "mean_token_accuracy": 0.6842522755265236, "num_tokens": 430780853.0, "step": 26730 }, { "epoch": 6.1970100822806815, "grad_norm": 0.9755081534385681, "learning_rate": 1.6686306601523513e-05, "loss": 1.4493, "mean_token_accuracy": 0.687986271083355, "num_tokens": 430941768.0, "step": 26740 }, { "epoch": 6.199327847954572, "grad_norm": 0.9496902227401733, "learning_rate": 1.6668612174203423e-05, "loss": 1.4589, "mean_token_accuracy": 0.6866541564464569, "num_tokens": 431103274.0, "step": 26750 }, { "epoch": 6.201645613628462, "grad_norm": 0.9792174100875854, "learning_rate": 1.6650922440659587e-05, "loss": 1.4646, "mean_token_accuracy": 0.6852841183543206, "num_tokens": 431264485.0, "step": 26760 }, { "epoch": 6.203963379302353, "grad_norm": 0.9547586441040039, "learning_rate": 1.663323741085814e-05, "loss": 1.4679, "mean_token_accuracy": 0.6857559219002723, "num_tokens": 431426022.0, "step": 26770 }, { "epoch": 6.206281144976243, "grad_norm": 0.9900307059288025, "learning_rate": 1.6615557094762546e-05, "loss": 1.4501, "mean_token_accuracy": 0.687015350162983, "num_tokens": 431587775.0, "step": 26780 }, { "epoch": 6.208598910650133, "grad_norm": 1.053925633430481, "learning_rate": 1.6597881502333625e-05, "loss": 1.454, "mean_token_accuracy": 0.6851998582482338, "num_tokens": 431746833.0, "step": 26790 }, { "epoch": 6.2109166763240236, "grad_norm": 1.011189579963684, "learning_rate": 1.6580210643529543e-05, "loss": 1.4637, "mean_token_accuracy": 0.685746145248413, "num_tokens": 431908459.0, "step": 26800 }, { "epoch": 6.213234441997914, "grad_norm": 0.9549413919448853, "learning_rate": 1.6562544528305782e-05, "loss": 1.461, "mean_token_accuracy": 0.6871947303414345, "num_tokens": 432068843.0, "step": 26810 }, { "epoch": 6.215552207671804, "grad_norm": 0.9470545649528503, "learning_rate": 1.6544883166615167e-05, "loss": 1.4725, "mean_token_accuracy": 0.6841667518019676, "num_tokens": 432230013.0, "step": 26820 }, { "epoch": 6.217869973345695, "grad_norm": 0.9358773827552795, "learning_rate": 1.6527226568407833e-05, "loss": 1.4547, "mean_token_accuracy": 0.6876661136746407, "num_tokens": 432391725.0, "step": 26830 }, { "epoch": 6.220187739019585, "grad_norm": 0.9465756416320801, "learning_rate": 1.6509574743631245e-05, "loss": 1.4856, "mean_token_accuracy": 0.6828774824738503, "num_tokens": 432553548.0, "step": 26840 }, { "epoch": 6.222505504693475, "grad_norm": 1.0294382572174072, "learning_rate": 1.6491927702230162e-05, "loss": 1.4629, "mean_token_accuracy": 0.685822294652462, "num_tokens": 432714429.0, "step": 26850 }, { "epoch": 6.224823270367366, "grad_norm": 1.0390620231628418, "learning_rate": 1.6474285454146664e-05, "loss": 1.4529, "mean_token_accuracy": 0.6872712671756744, "num_tokens": 432875451.0, "step": 26860 }, { "epoch": 6.227141036041257, "grad_norm": 1.0154727697372437, "learning_rate": 1.6456648009320112e-05, "loss": 1.4529, "mean_token_accuracy": 0.6876735344529152, "num_tokens": 433035759.0, "step": 26870 }, { "epoch": 6.229458801715147, "grad_norm": 0.9157379269599915, "learning_rate": 1.6439015377687188e-05, "loss": 1.4613, "mean_token_accuracy": 0.6852223679423333, "num_tokens": 433197119.0, "step": 26880 }, { "epoch": 6.231776567389037, "grad_norm": 0.897366464138031, "learning_rate": 1.6421387569181837e-05, "loss": 1.4673, "mean_token_accuracy": 0.685249574482441, "num_tokens": 433358180.0, "step": 26890 }, { "epoch": 6.234094333062927, "grad_norm": 0.9921805262565613, "learning_rate": 1.64037645937353e-05, "loss": 1.4418, "mean_token_accuracy": 0.6885715246200561, "num_tokens": 433519872.0, "step": 26900 }, { "epoch": 6.2364120987368175, "grad_norm": 0.9973004460334778, "learning_rate": 1.6386146461276082e-05, "loss": 1.4705, "mean_token_accuracy": 0.6845072031021118, "num_tokens": 433681253.0, "step": 26910 }, { "epoch": 6.238729864410708, "grad_norm": 0.9834024310112, "learning_rate": 1.6368533181729985e-05, "loss": 1.4542, "mean_token_accuracy": 0.6875192478299141, "num_tokens": 433841749.0, "step": 26920 }, { "epoch": 6.241047630084599, "grad_norm": 0.9178522825241089, "learning_rate": 1.6350924765020045e-05, "loss": 1.4756, "mean_token_accuracy": 0.6838097795844078, "num_tokens": 434003277.0, "step": 26930 }, { "epoch": 6.243365395758489, "grad_norm": 0.9943789839744568, "learning_rate": 1.6333321221066582e-05, "loss": 1.4557, "mean_token_accuracy": 0.685735097527504, "num_tokens": 434164594.0, "step": 26940 }, { "epoch": 6.245683161432379, "grad_norm": 0.9427120685577393, "learning_rate": 1.6315722559787165e-05, "loss": 1.4366, "mean_token_accuracy": 0.6893441244959831, "num_tokens": 434326099.0, "step": 26950 }, { "epoch": 6.248000927106269, "grad_norm": 0.9554375410079956, "learning_rate": 1.6298128791096605e-05, "loss": 1.4624, "mean_token_accuracy": 0.6865783795714379, "num_tokens": 434486661.0, "step": 26960 }, { "epoch": 6.25031869278016, "grad_norm": 0.9476125240325928, "learning_rate": 1.6280539924906972e-05, "loss": 1.4501, "mean_token_accuracy": 0.6871706560254097, "num_tokens": 434648026.0, "step": 26970 }, { "epoch": 6.252636458454051, "grad_norm": 1.01328444480896, "learning_rate": 1.6262955971127553e-05, "loss": 1.4389, "mean_token_accuracy": 0.6883027911186218, "num_tokens": 434809218.0, "step": 26980 }, { "epoch": 6.254954224127941, "grad_norm": 0.974461555480957, "learning_rate": 1.6245376939664873e-05, "loss": 1.4582, "mean_token_accuracy": 0.6853718861937523, "num_tokens": 434970112.0, "step": 26990 }, { "epoch": 6.257271989801831, "grad_norm": 0.9867652654647827, "learning_rate": 1.62278028404227e-05, "loss": 1.47, "mean_token_accuracy": 0.6852759674191475, "num_tokens": 435131696.0, "step": 27000 }, { "epoch": 6.259589755475721, "grad_norm": 0.9679502844810486, "learning_rate": 1.6210233683302014e-05, "loss": 1.4478, "mean_token_accuracy": 0.6893231943249702, "num_tokens": 435293473.0, "step": 27010 }, { "epoch": 6.261907521149611, "grad_norm": 1.033921241760254, "learning_rate": 1.6192669478200995e-05, "loss": 1.4533, "mean_token_accuracy": 0.6857079610228538, "num_tokens": 435454411.0, "step": 27020 }, { "epoch": 6.264225286823502, "grad_norm": 0.9887164235115051, "learning_rate": 1.6175110235015062e-05, "loss": 1.4545, "mean_token_accuracy": 0.6876726225018501, "num_tokens": 435615527.0, "step": 27030 }, { "epoch": 6.266543052497393, "grad_norm": 0.9682192802429199, "learning_rate": 1.6157555963636817e-05, "loss": 1.4603, "mean_token_accuracy": 0.6875231936573982, "num_tokens": 435776828.0, "step": 27040 }, { "epoch": 6.268860818171283, "grad_norm": 1.0127763748168945, "learning_rate": 1.6140006673956063e-05, "loss": 1.4579, "mean_token_accuracy": 0.6864855766296387, "num_tokens": 435938667.0, "step": 27050 }, { "epoch": 6.271178583845173, "grad_norm": 1.0430256128311157, "learning_rate": 1.612246237585981e-05, "loss": 1.4622, "mean_token_accuracy": 0.6861644729971885, "num_tokens": 436099804.0, "step": 27060 }, { "epoch": 6.273496349519063, "grad_norm": 0.9454817175865173, "learning_rate": 1.6104923079232246e-05, "loss": 1.4721, "mean_token_accuracy": 0.6853567764163018, "num_tokens": 436261232.0, "step": 27070 }, { "epoch": 6.275814115192954, "grad_norm": 0.9907195568084717, "learning_rate": 1.6087388793954737e-05, "loss": 1.4613, "mean_token_accuracy": 0.6869114026427269, "num_tokens": 436422484.0, "step": 27080 }, { "epoch": 6.2781318808668445, "grad_norm": 0.9768335819244385, "learning_rate": 1.606985952990583e-05, "loss": 1.4753, "mean_token_accuracy": 0.685280741751194, "num_tokens": 436584411.0, "step": 27090 }, { "epoch": 6.280449646540735, "grad_norm": 0.9449771046638489, "learning_rate": 1.6052335296961248e-05, "loss": 1.465, "mean_token_accuracy": 0.6855067476630211, "num_tokens": 436746246.0, "step": 27100 }, { "epoch": 6.282767412214625, "grad_norm": 0.9693108201026917, "learning_rate": 1.6034816104993876e-05, "loss": 1.4709, "mean_token_accuracy": 0.6831365540623665, "num_tokens": 436907991.0, "step": 27110 }, { "epoch": 6.285085177888515, "grad_norm": 0.9952885508537292, "learning_rate": 1.6017301963873756e-05, "loss": 1.4501, "mean_token_accuracy": 0.687538954615593, "num_tokens": 437069635.0, "step": 27120 }, { "epoch": 6.287402943562406, "grad_norm": 0.9775577187538147, "learning_rate": 1.5999792883468083e-05, "loss": 1.4735, "mean_token_accuracy": 0.6831477120518684, "num_tokens": 437229446.0, "step": 27130 }, { "epoch": 6.289720709236296, "grad_norm": 0.9771126508712769, "learning_rate": 1.5982288873641215e-05, "loss": 1.4587, "mean_token_accuracy": 0.6847447127103805, "num_tokens": 437389913.0, "step": 27140 }, { "epoch": 6.2920384749101865, "grad_norm": 0.9213534593582153, "learning_rate": 1.5964789944254633e-05, "loss": 1.4648, "mean_token_accuracy": 0.6836266875267029, "num_tokens": 437550913.0, "step": 27150 }, { "epoch": 6.294356240584077, "grad_norm": 1.0353281497955322, "learning_rate": 1.5947296105166965e-05, "loss": 1.4653, "mean_token_accuracy": 0.6859912946820259, "num_tokens": 437712741.0, "step": 27160 }, { "epoch": 6.296674006257967, "grad_norm": 1.0299404859542847, "learning_rate": 1.5929807366233977e-05, "loss": 1.4764, "mean_token_accuracy": 0.6843461126089097, "num_tokens": 437873940.0, "step": 27170 }, { "epoch": 6.298991771931858, "grad_norm": 0.994711697101593, "learning_rate": 1.591232373730856e-05, "loss": 1.4604, "mean_token_accuracy": 0.6851348206400871, "num_tokens": 438034849.0, "step": 27180 }, { "epoch": 6.301309537605748, "grad_norm": 0.9752190709114075, "learning_rate": 1.5894845228240717e-05, "loss": 1.4657, "mean_token_accuracy": 0.685891005396843, "num_tokens": 438195771.0, "step": 27190 }, { "epoch": 6.303627303279638, "grad_norm": 0.9992647767066956, "learning_rate": 1.5877371848877574e-05, "loss": 1.466, "mean_token_accuracy": 0.6854487344622612, "num_tokens": 438357553.0, "step": 27200 }, { "epoch": 6.305945068953529, "grad_norm": 0.9908290505409241, "learning_rate": 1.5859903609063366e-05, "loss": 1.4573, "mean_token_accuracy": 0.6855462089180946, "num_tokens": 438518966.0, "step": 27210 }, { "epoch": 6.308262834627419, "grad_norm": 1.0251911878585815, "learning_rate": 1.5842440518639424e-05, "loss": 1.448, "mean_token_accuracy": 0.6871737167239189, "num_tokens": 438680762.0, "step": 27220 }, { "epoch": 6.31058060030131, "grad_norm": 1.0346390008926392, "learning_rate": 1.58249825874442e-05, "loss": 1.4602, "mean_token_accuracy": 0.6854320436716079, "num_tokens": 438841818.0, "step": 27230 }, { "epoch": 6.3128983659752, "grad_norm": 0.8997315168380737, "learning_rate": 1.580752982531321e-05, "loss": 1.4555, "mean_token_accuracy": 0.6855256468057632, "num_tokens": 439003501.0, "step": 27240 }, { "epoch": 6.31521613164909, "grad_norm": 0.9880214333534241, "learning_rate": 1.5790082242079086e-05, "loss": 1.4642, "mean_token_accuracy": 0.6857900634407997, "num_tokens": 439165316.0, "step": 27250 }, { "epoch": 6.31753389732298, "grad_norm": 0.9393448829650879, "learning_rate": 1.577263984757152e-05, "loss": 1.4524, "mean_token_accuracy": 0.6865608856081963, "num_tokens": 439326776.0, "step": 27260 }, { "epoch": 6.319851662996871, "grad_norm": 0.9550817608833313, "learning_rate": 1.575520265161729e-05, "loss": 1.456, "mean_token_accuracy": 0.6867682442069054, "num_tokens": 439486877.0, "step": 27270 }, { "epoch": 6.322169428670762, "grad_norm": 0.9395416975021362, "learning_rate": 1.5737770664040252e-05, "loss": 1.4467, "mean_token_accuracy": 0.6878602981567383, "num_tokens": 439648701.0, "step": 27280 }, { "epoch": 6.324487194344652, "grad_norm": 1.0022233724594116, "learning_rate": 1.5720343894661315e-05, "loss": 1.448, "mean_token_accuracy": 0.6882449850440026, "num_tokens": 439810284.0, "step": 27290 }, { "epoch": 6.326804960018542, "grad_norm": 0.9560152888298035, "learning_rate": 1.5702922353298457e-05, "loss": 1.4436, "mean_token_accuracy": 0.6868335485458374, "num_tokens": 439971405.0, "step": 27300 }, { "epoch": 6.329122725692432, "grad_norm": 0.9447515606880188, "learning_rate": 1.5685506049766703e-05, "loss": 1.4657, "mean_token_accuracy": 0.6859713912010192, "num_tokens": 440132854.0, "step": 27310 }, { "epoch": 6.3314404913663225, "grad_norm": 0.9689453840255737, "learning_rate": 1.5668094993878137e-05, "loss": 1.4571, "mean_token_accuracy": 0.6867858037352562, "num_tokens": 440293445.0, "step": 27320 }, { "epoch": 6.333758257040214, "grad_norm": 0.9619418978691101, "learning_rate": 1.565068919544188e-05, "loss": 1.4772, "mean_token_accuracy": 0.6837991148233413, "num_tokens": 440453435.0, "step": 27330 }, { "epoch": 6.336076022714104, "grad_norm": 0.9515949487686157, "learning_rate": 1.563328866426409e-05, "loss": 1.4678, "mean_token_accuracy": 0.684754778444767, "num_tokens": 440615281.0, "step": 27340 }, { "epoch": 6.338393788387994, "grad_norm": 0.9362770318984985, "learning_rate": 1.5615893410147957e-05, "loss": 1.4482, "mean_token_accuracy": 0.6861850798130036, "num_tokens": 440776000.0, "step": 27350 }, { "epoch": 6.340711554061884, "grad_norm": 0.9867570996284485, "learning_rate": 1.5598503442893706e-05, "loss": 1.4791, "mean_token_accuracy": 0.683813925087452, "num_tokens": 440938151.0, "step": 27360 }, { "epoch": 6.343029319735774, "grad_norm": 0.9618247151374817, "learning_rate": 1.5581118772298575e-05, "loss": 1.4496, "mean_token_accuracy": 0.6873901650309563, "num_tokens": 441099364.0, "step": 27370 }, { "epoch": 6.345347085409665, "grad_norm": 0.9773918390274048, "learning_rate": 1.5563739408156823e-05, "loss": 1.4578, "mean_token_accuracy": 0.684888345003128, "num_tokens": 441259994.0, "step": 27380 }, { "epoch": 6.347664851083556, "grad_norm": 1.003365159034729, "learning_rate": 1.554636536025972e-05, "loss": 1.468, "mean_token_accuracy": 0.6850407958030701, "num_tokens": 441421281.0, "step": 27390 }, { "epoch": 6.349982616757446, "grad_norm": 1.0154473781585693, "learning_rate": 1.552899663839553e-05, "loss": 1.4604, "mean_token_accuracy": 0.6846727401018142, "num_tokens": 441582950.0, "step": 27400 }, { "epoch": 6.352300382431336, "grad_norm": 1.00803542137146, "learning_rate": 1.5511633252349536e-05, "loss": 1.4643, "mean_token_accuracy": 0.6856443673372269, "num_tokens": 441744527.0, "step": 27410 }, { "epoch": 6.354618148105226, "grad_norm": 0.9998584985733032, "learning_rate": 1.5494275211903994e-05, "loss": 1.4708, "mean_token_accuracy": 0.6849493190646172, "num_tokens": 441906241.0, "step": 27420 }, { "epoch": 6.356935913779117, "grad_norm": 1.0219706296920776, "learning_rate": 1.547692252683816e-05, "loss": 1.4529, "mean_token_accuracy": 0.6870075210928916, "num_tokens": 442067422.0, "step": 27430 }, { "epoch": 6.3592536794530075, "grad_norm": 0.9690080881118774, "learning_rate": 1.5459575206928284e-05, "loss": 1.4664, "mean_token_accuracy": 0.684785258769989, "num_tokens": 442228101.0, "step": 27440 }, { "epoch": 6.361571445126898, "grad_norm": 0.9584500789642334, "learning_rate": 1.5442233261947558e-05, "loss": 1.471, "mean_token_accuracy": 0.6847134709358216, "num_tokens": 442388835.0, "step": 27450 }, { "epoch": 6.363889210800788, "grad_norm": 1.0216178894042969, "learning_rate": 1.542489670166618e-05, "loss": 1.4503, "mean_token_accuracy": 0.6864566370844841, "num_tokens": 442549750.0, "step": 27460 }, { "epoch": 6.366206976474678, "grad_norm": 0.9670652747154236, "learning_rate": 1.5407565535851303e-05, "loss": 1.4585, "mean_token_accuracy": 0.6857015058398247, "num_tokens": 442710601.0, "step": 27470 }, { "epoch": 6.368524742148569, "grad_norm": 0.9836032390594482, "learning_rate": 1.539023977426704e-05, "loss": 1.4488, "mean_token_accuracy": 0.6869010865688324, "num_tokens": 442872042.0, "step": 27480 }, { "epoch": 6.370842507822459, "grad_norm": 0.9935031533241272, "learning_rate": 1.5372919426674455e-05, "loss": 1.4666, "mean_token_accuracy": 0.6854722902178765, "num_tokens": 443032881.0, "step": 27490 }, { "epoch": 6.3731602734963495, "grad_norm": 0.9907321333885193, "learning_rate": 1.535560450283157e-05, "loss": 1.4568, "mean_token_accuracy": 0.6850080385804176, "num_tokens": 443193975.0, "step": 27500 }, { "epoch": 6.37547803917024, "grad_norm": 1.0043789148330688, "learning_rate": 1.533829501249335e-05, "loss": 1.4546, "mean_token_accuracy": 0.6858153685927391, "num_tokens": 443355148.0, "step": 27510 }, { "epoch": 6.37779580484413, "grad_norm": 0.9708976745605469, "learning_rate": 1.5320990965411694e-05, "loss": 1.4631, "mean_token_accuracy": 0.6856991156935692, "num_tokens": 443516747.0, "step": 27520 }, { "epoch": 6.380113570518021, "grad_norm": 0.9412824511528015, "learning_rate": 1.5303692371335438e-05, "loss": 1.4834, "mean_token_accuracy": 0.6823769330978393, "num_tokens": 443677137.0, "step": 27530 }, { "epoch": 6.382431336191911, "grad_norm": 0.9812083840370178, "learning_rate": 1.5286399240010347e-05, "loss": 1.4634, "mean_token_accuracy": 0.6846715584397316, "num_tokens": 443838622.0, "step": 27540 }, { "epoch": 6.384749101865801, "grad_norm": 0.999151349067688, "learning_rate": 1.52691115811791e-05, "loss": 1.4508, "mean_token_accuracy": 0.6876340046525001, "num_tokens": 443999222.0, "step": 27550 }, { "epoch": 6.3870668675396916, "grad_norm": 0.9828012585639954, "learning_rate": 1.52518294045813e-05, "loss": 1.4563, "mean_token_accuracy": 0.6873546883463859, "num_tokens": 444160990.0, "step": 27560 }, { "epoch": 6.389384633213582, "grad_norm": 1.0174509286880493, "learning_rate": 1.5234552719953465e-05, "loss": 1.462, "mean_token_accuracy": 0.6860176920890808, "num_tokens": 444322469.0, "step": 27570 }, { "epoch": 6.391702398887473, "grad_norm": 1.0204999446868896, "learning_rate": 1.5217281537029018e-05, "loss": 1.4608, "mean_token_accuracy": 0.6862976372241973, "num_tokens": 444483997.0, "step": 27580 }, { "epoch": 6.394020164561363, "grad_norm": 0.947860062122345, "learning_rate": 1.520001586553827e-05, "loss": 1.4834, "mean_token_accuracy": 0.682927031815052, "num_tokens": 444646010.0, "step": 27590 }, { "epoch": 6.396337930235253, "grad_norm": 0.9321897029876709, "learning_rate": 1.5182755715208435e-05, "loss": 1.4741, "mean_token_accuracy": 0.6826653644442559, "num_tokens": 444807598.0, "step": 27600 }, { "epoch": 6.398655695909143, "grad_norm": 0.9987635612487793, "learning_rate": 1.5165501095763624e-05, "loss": 1.4669, "mean_token_accuracy": 0.6857869133353234, "num_tokens": 444969051.0, "step": 27610 }, { "epoch": 6.400973461583034, "grad_norm": 0.9571878910064697, "learning_rate": 1.5148252016924822e-05, "loss": 1.4462, "mean_token_accuracy": 0.6906179532408714, "num_tokens": 445131229.0, "step": 27620 }, { "epoch": 6.403291227256925, "grad_norm": 0.9873722195625305, "learning_rate": 1.5131008488409895e-05, "loss": 1.4452, "mean_token_accuracy": 0.6877648696303368, "num_tokens": 445291366.0, "step": 27630 }, { "epoch": 6.405608992930815, "grad_norm": 0.9943732619285583, "learning_rate": 1.5113770519933584e-05, "loss": 1.4597, "mean_token_accuracy": 0.6856428697705269, "num_tokens": 445452113.0, "step": 27640 }, { "epoch": 6.407926758604705, "grad_norm": 0.9848132729530334, "learning_rate": 1.5096538121207498e-05, "loss": 1.4774, "mean_token_accuracy": 0.6848329782485962, "num_tokens": 445613134.0, "step": 27650 }, { "epoch": 6.410244524278595, "grad_norm": 0.9786550998687744, "learning_rate": 1.5079311301940108e-05, "loss": 1.4656, "mean_token_accuracy": 0.685207587480545, "num_tokens": 445774840.0, "step": 27660 }, { "epoch": 6.4125622899524855, "grad_norm": 0.9463892579078674, "learning_rate": 1.5062090071836732e-05, "loss": 1.4548, "mean_token_accuracy": 0.6847029149532318, "num_tokens": 445934869.0, "step": 27670 }, { "epoch": 6.4148800556263765, "grad_norm": 0.9685718417167664, "learning_rate": 1.5044874440599554e-05, "loss": 1.4444, "mean_token_accuracy": 0.6884435012936592, "num_tokens": 446096107.0, "step": 27680 }, { "epoch": 6.417197821300267, "grad_norm": 0.9517065286636353, "learning_rate": 1.5027664417927598e-05, "loss": 1.4591, "mean_token_accuracy": 0.687053981423378, "num_tokens": 446256832.0, "step": 27690 }, { "epoch": 6.419515586974157, "grad_norm": 1.062131404876709, "learning_rate": 1.501046001351672e-05, "loss": 1.4725, "mean_token_accuracy": 0.6842435091733933, "num_tokens": 446417147.0, "step": 27700 }, { "epoch": 6.421833352648047, "grad_norm": 0.968019962310791, "learning_rate": 1.4993261237059625e-05, "loss": 1.4575, "mean_token_accuracy": 0.6865095347166061, "num_tokens": 446578992.0, "step": 27710 }, { "epoch": 6.424151118321937, "grad_norm": 0.9966297149658203, "learning_rate": 1.4976068098245835e-05, "loss": 1.4588, "mean_token_accuracy": 0.6867791846394539, "num_tokens": 446740168.0, "step": 27720 }, { "epoch": 6.426468883995828, "grad_norm": 0.9432647824287415, "learning_rate": 1.4958880606761705e-05, "loss": 1.442, "mean_token_accuracy": 0.6876231089234353, "num_tokens": 446901699.0, "step": 27730 }, { "epoch": 6.428786649669719, "grad_norm": 0.9413987994194031, "learning_rate": 1.4941698772290402e-05, "loss": 1.4549, "mean_token_accuracy": 0.686482697725296, "num_tokens": 447062861.0, "step": 27740 }, { "epoch": 6.431104415343609, "grad_norm": 0.9264858961105347, "learning_rate": 1.4924522604511906e-05, "loss": 1.4542, "mean_token_accuracy": 0.6866124078631402, "num_tokens": 447222525.0, "step": 27750 }, { "epoch": 6.433422181017499, "grad_norm": 0.9862968325614929, "learning_rate": 1.4907352113103013e-05, "loss": 1.453, "mean_token_accuracy": 0.6885322287678719, "num_tokens": 447383198.0, "step": 27760 }, { "epoch": 6.435739946691389, "grad_norm": 1.0184249877929688, "learning_rate": 1.489018730773731e-05, "loss": 1.4656, "mean_token_accuracy": 0.6851133316755295, "num_tokens": 447544294.0, "step": 27770 }, { "epoch": 6.43805771236528, "grad_norm": 0.9886869192123413, "learning_rate": 1.4873028198085187e-05, "loss": 1.4677, "mean_token_accuracy": 0.6872124299407005, "num_tokens": 447704953.0, "step": 27780 }, { "epoch": 6.44037547803917, "grad_norm": 0.9330316781997681, "learning_rate": 1.4855874793813823e-05, "loss": 1.4434, "mean_token_accuracy": 0.690010218322277, "num_tokens": 447865583.0, "step": 27790 }, { "epoch": 6.442693243713061, "grad_norm": 1.0059503316879272, "learning_rate": 1.4838727104587186e-05, "loss": 1.4602, "mean_token_accuracy": 0.6863186880946159, "num_tokens": 448026070.0, "step": 27800 }, { "epoch": 6.445011009386951, "grad_norm": 1.0177421569824219, "learning_rate": 1.4821585140066016e-05, "loss": 1.452, "mean_token_accuracy": 0.6885066524147987, "num_tokens": 448186586.0, "step": 27810 }, { "epoch": 6.447328775060841, "grad_norm": 0.9959946274757385, "learning_rate": 1.4804448909907837e-05, "loss": 1.4695, "mean_token_accuracy": 0.6841275796294213, "num_tokens": 448348308.0, "step": 27820 }, { "epoch": 6.449646540734732, "grad_norm": 0.9851788878440857, "learning_rate": 1.4787318423766939e-05, "loss": 1.4744, "mean_token_accuracy": 0.6838018521666527, "num_tokens": 448509965.0, "step": 27830 }, { "epoch": 6.451964306408622, "grad_norm": 0.9518818855285645, "learning_rate": 1.477019369129437e-05, "loss": 1.4512, "mean_token_accuracy": 0.6868446663022041, "num_tokens": 448671136.0, "step": 27840 }, { "epoch": 6.4542820720825125, "grad_norm": 0.9804069995880127, "learning_rate": 1.475307472213795e-05, "loss": 1.4692, "mean_token_accuracy": 0.6857080176472664, "num_tokens": 448833248.0, "step": 27850 }, { "epoch": 6.456599837756403, "grad_norm": 1.0354526042938232, "learning_rate": 1.4735961525942238e-05, "loss": 1.4617, "mean_token_accuracy": 0.6855050519108772, "num_tokens": 448994718.0, "step": 27860 }, { "epoch": 6.458917603430293, "grad_norm": 1.016332745552063, "learning_rate": 1.4718854112348546e-05, "loss": 1.4562, "mean_token_accuracy": 0.6857602953910827, "num_tokens": 449156601.0, "step": 27870 }, { "epoch": 6.461235369104184, "grad_norm": 1.0020614862442017, "learning_rate": 1.4701752490994936e-05, "loss": 1.4624, "mean_token_accuracy": 0.6851693704724312, "num_tokens": 449317739.0, "step": 27880 }, { "epoch": 6.463553134778074, "grad_norm": 0.9620176553726196, "learning_rate": 1.4684656671516189e-05, "loss": 1.4564, "mean_token_accuracy": 0.6872239083051681, "num_tokens": 449477890.0, "step": 27890 }, { "epoch": 6.465870900451964, "grad_norm": 0.9884548783302307, "learning_rate": 1.4667566663543836e-05, "loss": 1.4531, "mean_token_accuracy": 0.6870823189616203, "num_tokens": 449639014.0, "step": 27900 }, { "epoch": 6.4681886661258545, "grad_norm": 1.0108156204223633, "learning_rate": 1.4650482476706128e-05, "loss": 1.4726, "mean_token_accuracy": 0.6851526468992233, "num_tokens": 449799955.0, "step": 27910 }, { "epoch": 6.470506431799745, "grad_norm": 1.0122815370559692, "learning_rate": 1.4633404120628024e-05, "loss": 1.4582, "mean_token_accuracy": 0.6871131375432015, "num_tokens": 449961176.0, "step": 27920 }, { "epoch": 6.472824197473636, "grad_norm": 1.0064773559570312, "learning_rate": 1.4616331604931216e-05, "loss": 1.447, "mean_token_accuracy": 0.6877633452415466, "num_tokens": 450122633.0, "step": 27930 }, { "epoch": 6.475141963147526, "grad_norm": 1.0273360013961792, "learning_rate": 1.4599264939234092e-05, "loss": 1.46, "mean_token_accuracy": 0.6859487414360046, "num_tokens": 450283296.0, "step": 27940 }, { "epoch": 6.477459728821416, "grad_norm": 1.006880760192871, "learning_rate": 1.4582204133151755e-05, "loss": 1.4648, "mean_token_accuracy": 0.684977674484253, "num_tokens": 450445600.0, "step": 27950 }, { "epoch": 6.479777494495306, "grad_norm": 0.9558408856391907, "learning_rate": 1.4565149196295996e-05, "loss": 1.4573, "mean_token_accuracy": 0.6857533365488052, "num_tokens": 450607734.0, "step": 27960 }, { "epoch": 6.482095260169197, "grad_norm": 0.9525658488273621, "learning_rate": 1.4548100138275312e-05, "loss": 1.4607, "mean_token_accuracy": 0.6858454093337059, "num_tokens": 450769582.0, "step": 27970 }, { "epoch": 6.484413025843088, "grad_norm": 0.9843184947967529, "learning_rate": 1.4531056968694878e-05, "loss": 1.4569, "mean_token_accuracy": 0.6863381877541542, "num_tokens": 450931530.0, "step": 27980 }, { "epoch": 6.486730791516978, "grad_norm": 0.9724676012992859, "learning_rate": 1.4514019697156556e-05, "loss": 1.4493, "mean_token_accuracy": 0.6872935339808464, "num_tokens": 451092788.0, "step": 27990 }, { "epoch": 6.489048557190868, "grad_norm": 0.9893018007278442, "learning_rate": 1.449698833325888e-05, "loss": 1.4553, "mean_token_accuracy": 0.6865999475121498, "num_tokens": 451251697.0, "step": 28000 }, { "epoch": 6.491366322864758, "grad_norm": 0.9549909830093384, "learning_rate": 1.447996288659706e-05, "loss": 1.4455, "mean_token_accuracy": 0.6884704753756523, "num_tokens": 451414066.0, "step": 28010 }, { "epoch": 6.493684088538648, "grad_norm": 0.9539801478385925, "learning_rate": 1.4462943366762977e-05, "loss": 1.4651, "mean_token_accuracy": 0.6870638638734817, "num_tokens": 451574763.0, "step": 28020 }, { "epoch": 6.4960018542125395, "grad_norm": 0.9359098672866821, "learning_rate": 1.4445929783345163e-05, "loss": 1.47, "mean_token_accuracy": 0.6852535054087638, "num_tokens": 451735910.0, "step": 28030 }, { "epoch": 6.49831961988643, "grad_norm": 1.0293490886688232, "learning_rate": 1.4428922145928813e-05, "loss": 1.4619, "mean_token_accuracy": 0.6860377505421639, "num_tokens": 451897430.0, "step": 28040 }, { "epoch": 6.50063738556032, "grad_norm": 1.0025441646575928, "learning_rate": 1.4411920464095769e-05, "loss": 1.4522, "mean_token_accuracy": 0.6865978211164474, "num_tokens": 452058576.0, "step": 28050 }, { "epoch": 6.50295515123421, "grad_norm": 0.9665971994400024, "learning_rate": 1.4394924747424513e-05, "loss": 1.4573, "mean_token_accuracy": 0.6888135194778442, "num_tokens": 452219438.0, "step": 28060 }, { "epoch": 6.5052729169081, "grad_norm": 1.0223628282546997, "learning_rate": 1.4377935005490179e-05, "loss": 1.4699, "mean_token_accuracy": 0.6855476930737495, "num_tokens": 452380103.0, "step": 28070 }, { "epoch": 6.507590682581991, "grad_norm": 0.9105995893478394, "learning_rate": 1.4360951247864518e-05, "loss": 1.4548, "mean_token_accuracy": 0.6871564328670502, "num_tokens": 452541688.0, "step": 28080 }, { "epoch": 6.5099084482558816, "grad_norm": 0.9866730570793152, "learning_rate": 1.434397348411593e-05, "loss": 1.4572, "mean_token_accuracy": 0.6867662727832794, "num_tokens": 452703389.0, "step": 28090 }, { "epoch": 6.512226213929772, "grad_norm": 0.9513915777206421, "learning_rate": 1.432700172380943e-05, "loss": 1.4582, "mean_token_accuracy": 0.6865298792719841, "num_tokens": 452864392.0, "step": 28100 }, { "epoch": 6.514543979603662, "grad_norm": 1.0084179639816284, "learning_rate": 1.431003597650663e-05, "loss": 1.4651, "mean_token_accuracy": 0.686661995947361, "num_tokens": 453025713.0, "step": 28110 }, { "epoch": 6.516861745277552, "grad_norm": 0.9797160029411316, "learning_rate": 1.429307625176578e-05, "loss": 1.4429, "mean_token_accuracy": 0.6872500494122505, "num_tokens": 453186749.0, "step": 28120 }, { "epoch": 6.519179510951442, "grad_norm": 0.9816312789916992, "learning_rate": 1.4276122559141733e-05, "loss": 1.4536, "mean_token_accuracy": 0.6860126718878746, "num_tokens": 453348332.0, "step": 28130 }, { "epoch": 6.521497276625333, "grad_norm": 1.0134718418121338, "learning_rate": 1.4259174908185935e-05, "loss": 1.4525, "mean_token_accuracy": 0.687584376335144, "num_tokens": 453509940.0, "step": 28140 }, { "epoch": 6.523815042299224, "grad_norm": 0.965495228767395, "learning_rate": 1.4242233308446434e-05, "loss": 1.4534, "mean_token_accuracy": 0.6870983392000198, "num_tokens": 453671054.0, "step": 28150 }, { "epoch": 6.526132807973114, "grad_norm": 0.9431406855583191, "learning_rate": 1.4225297769467871e-05, "loss": 1.4599, "mean_token_accuracy": 0.6867160618305206, "num_tokens": 453832174.0, "step": 28160 }, { "epoch": 6.528450573647004, "grad_norm": 1.0067087411880493, "learning_rate": 1.420836830079147e-05, "loss": 1.4615, "mean_token_accuracy": 0.686388547718525, "num_tokens": 453993023.0, "step": 28170 }, { "epoch": 6.530768339320895, "grad_norm": 1.00872004032135, "learning_rate": 1.4191444911955026e-05, "loss": 1.4595, "mean_token_accuracy": 0.685800701379776, "num_tokens": 454154186.0, "step": 28180 }, { "epoch": 6.533086104994785, "grad_norm": 0.9873982667922974, "learning_rate": 1.4174527612492921e-05, "loss": 1.4721, "mean_token_accuracy": 0.6832575812935829, "num_tokens": 454315358.0, "step": 28190 }, { "epoch": 6.5354038706686755, "grad_norm": 0.9982601404190063, "learning_rate": 1.4157616411936109e-05, "loss": 1.4599, "mean_token_accuracy": 0.6854400187730789, "num_tokens": 454476669.0, "step": 28200 }, { "epoch": 6.537721636342566, "grad_norm": 0.9653288125991821, "learning_rate": 1.4140711319812094e-05, "loss": 1.4585, "mean_token_accuracy": 0.6854562059044838, "num_tokens": 454637835.0, "step": 28210 }, { "epoch": 6.540039402016456, "grad_norm": 1.0141040086746216, "learning_rate": 1.4123812345644943e-05, "loss": 1.4763, "mean_token_accuracy": 0.6848685443401337, "num_tokens": 454799133.0, "step": 28220 }, { "epoch": 6.542357167690346, "grad_norm": 0.9847939014434814, "learning_rate": 1.410691949895529e-05, "loss": 1.4572, "mean_token_accuracy": 0.6861115992069244, "num_tokens": 454960793.0, "step": 28230 }, { "epoch": 6.544674933364237, "grad_norm": 1.0157783031463623, "learning_rate": 1.40900327892603e-05, "loss": 1.4694, "mean_token_accuracy": 0.6839999601244926, "num_tokens": 455122553.0, "step": 28240 }, { "epoch": 6.546992699038127, "grad_norm": 1.0023051500320435, "learning_rate": 1.4073152226073683e-05, "loss": 1.4478, "mean_token_accuracy": 0.6887223422527313, "num_tokens": 455283576.0, "step": 28250 }, { "epoch": 6.5493104647120175, "grad_norm": 1.0177314281463623, "learning_rate": 1.4056277818905694e-05, "loss": 1.4604, "mean_token_accuracy": 0.6851492300629616, "num_tokens": 455445136.0, "step": 28260 }, { "epoch": 6.551628230385908, "grad_norm": 1.000407338142395, "learning_rate": 1.4039409577263112e-05, "loss": 1.4591, "mean_token_accuracy": 0.6869543954730034, "num_tokens": 455606980.0, "step": 28270 }, { "epoch": 6.553945996059799, "grad_norm": 1.0068674087524414, "learning_rate": 1.4022547510649254e-05, "loss": 1.4598, "mean_token_accuracy": 0.6868215560913086, "num_tokens": 455768288.0, "step": 28280 }, { "epoch": 6.556263761733689, "grad_norm": 0.967254638671875, "learning_rate": 1.4005691628563944e-05, "loss": 1.45, "mean_token_accuracy": 0.6867759689688683, "num_tokens": 455930247.0, "step": 28290 }, { "epoch": 6.558581527407579, "grad_norm": 0.9693471193313599, "learning_rate": 1.3988841940503522e-05, "loss": 1.4617, "mean_token_accuracy": 0.6860944464802742, "num_tokens": 456091351.0, "step": 28300 }, { "epoch": 6.560899293081469, "grad_norm": 1.0125179290771484, "learning_rate": 1.3971998455960855e-05, "loss": 1.4511, "mean_token_accuracy": 0.6856093183159828, "num_tokens": 456252292.0, "step": 28310 }, { "epoch": 6.5632170587553595, "grad_norm": 0.9851871728897095, "learning_rate": 1.39551611844253e-05, "loss": 1.4452, "mean_token_accuracy": 0.6869953230023385, "num_tokens": 456412404.0, "step": 28320 }, { "epoch": 6.56553482442925, "grad_norm": 0.9554400444030762, "learning_rate": 1.3938330135382716e-05, "loss": 1.4567, "mean_token_accuracy": 0.687612310051918, "num_tokens": 456572904.0, "step": 28330 }, { "epoch": 6.567852590103141, "grad_norm": 1.0171633958816528, "learning_rate": 1.3921505318315458e-05, "loss": 1.4481, "mean_token_accuracy": 0.6861208915710449, "num_tokens": 456733683.0, "step": 28340 }, { "epoch": 6.570170355777031, "grad_norm": 0.9993101954460144, "learning_rate": 1.3904686742702375e-05, "loss": 1.4516, "mean_token_accuracy": 0.6872444421052932, "num_tokens": 456893973.0, "step": 28350 }, { "epoch": 6.572488121450921, "grad_norm": 0.9135920405387878, "learning_rate": 1.388787441801879e-05, "loss": 1.4776, "mean_token_accuracy": 0.6833023577928543, "num_tokens": 457055488.0, "step": 28360 }, { "epoch": 6.574805887124811, "grad_norm": 0.9653252363204956, "learning_rate": 1.3871068353736511e-05, "loss": 1.4645, "mean_token_accuracy": 0.6863752350211143, "num_tokens": 457216919.0, "step": 28370 }, { "epoch": 6.5771236527987025, "grad_norm": 1.0394198894500732, "learning_rate": 1.3854268559323814e-05, "loss": 1.4442, "mean_token_accuracy": 0.6887266784906387, "num_tokens": 457378017.0, "step": 28380 }, { "epoch": 6.579441418472593, "grad_norm": 0.9769636392593384, "learning_rate": 1.3837475044245451e-05, "loss": 1.4568, "mean_token_accuracy": 0.6855932220816612, "num_tokens": 457539054.0, "step": 28390 }, { "epoch": 6.581759184146483, "grad_norm": 0.9647876024246216, "learning_rate": 1.3820687817962627e-05, "loss": 1.4636, "mean_token_accuracy": 0.6853345856070518, "num_tokens": 457699074.0, "step": 28400 }, { "epoch": 6.584076949820373, "grad_norm": 0.9816511273384094, "learning_rate": 1.3803906889933011e-05, "loss": 1.4651, "mean_token_accuracy": 0.6858154863119126, "num_tokens": 457860170.0, "step": 28410 }, { "epoch": 6.586394715494263, "grad_norm": 0.9755637645721436, "learning_rate": 1.3787132269610719e-05, "loss": 1.469, "mean_token_accuracy": 0.6845251575112343, "num_tokens": 458020523.0, "step": 28420 }, { "epoch": 6.5887124811681534, "grad_norm": 0.9423686861991882, "learning_rate": 1.3770363966446315e-05, "loss": 1.4563, "mean_token_accuracy": 0.6876324102282524, "num_tokens": 458182438.0, "step": 28430 }, { "epoch": 6.5910302468420445, "grad_norm": 0.9659048318862915, "learning_rate": 1.3753601989886805e-05, "loss": 1.4777, "mean_token_accuracy": 0.6844514697790146, "num_tokens": 458344144.0, "step": 28440 }, { "epoch": 6.593348012515935, "grad_norm": 0.9081105589866638, "learning_rate": 1.3736846349375632e-05, "loss": 1.4533, "mean_token_accuracy": 0.6871145889163017, "num_tokens": 458505147.0, "step": 28450 }, { "epoch": 6.595665778189825, "grad_norm": 0.9802573323249817, "learning_rate": 1.372009705435266e-05, "loss": 1.4437, "mean_token_accuracy": 0.68784119784832, "num_tokens": 458666876.0, "step": 28460 }, { "epoch": 6.597983543863715, "grad_norm": 0.9798479676246643, "learning_rate": 1.3703354114254191e-05, "loss": 1.439, "mean_token_accuracy": 0.6890552937984467, "num_tokens": 458828739.0, "step": 28470 }, { "epoch": 6.600301309537606, "grad_norm": 0.9392927289009094, "learning_rate": 1.3686617538512941e-05, "loss": 1.4541, "mean_token_accuracy": 0.6869611948728561, "num_tokens": 458990499.0, "step": 28480 }, { "epoch": 6.602619075211496, "grad_norm": 1.0340545177459717, "learning_rate": 1.366988733655804e-05, "loss": 1.4534, "mean_token_accuracy": 0.6876740738749504, "num_tokens": 459152475.0, "step": 28490 }, { "epoch": 6.604936840885387, "grad_norm": 1.0289560556411743, "learning_rate": 1.3653163517815024e-05, "loss": 1.46, "mean_token_accuracy": 0.6879760548472404, "num_tokens": 459314223.0, "step": 28500 }, { "epoch": 6.607254606559277, "grad_norm": 0.9847128987312317, "learning_rate": 1.363644609170584e-05, "loss": 1.4707, "mean_token_accuracy": 0.6849367707967758, "num_tokens": 459475249.0, "step": 28510 }, { "epoch": 6.609572372233167, "grad_norm": 0.9804769158363342, "learning_rate": 1.3619735067648829e-05, "loss": 1.4741, "mean_token_accuracy": 0.684486235678196, "num_tokens": 459635515.0, "step": 28520 }, { "epoch": 6.611890137907057, "grad_norm": 0.9771856665611267, "learning_rate": 1.3603030455058724e-05, "loss": 1.4474, "mean_token_accuracy": 0.6878649175167084, "num_tokens": 459795259.0, "step": 28530 }, { "epoch": 6.614207903580948, "grad_norm": 1.0079662799835205, "learning_rate": 1.3586332263346656e-05, "loss": 1.4562, "mean_token_accuracy": 0.6856920495629311, "num_tokens": 459956762.0, "step": 28540 }, { "epoch": 6.616525669254838, "grad_norm": 0.9847933053970337, "learning_rate": 1.3569640501920122e-05, "loss": 1.4587, "mean_token_accuracy": 0.6858259662985802, "num_tokens": 460118677.0, "step": 28550 }, { "epoch": 6.618843434928729, "grad_norm": 0.9658843278884888, "learning_rate": 1.3552955180183002e-05, "loss": 1.4386, "mean_token_accuracy": 0.6878265798091888, "num_tokens": 460278801.0, "step": 28560 }, { "epoch": 6.621161200602619, "grad_norm": 0.9558590650558472, "learning_rate": 1.3536276307535553e-05, "loss": 1.4517, "mean_token_accuracy": 0.6877832725644112, "num_tokens": 460438977.0, "step": 28570 }, { "epoch": 6.62347896627651, "grad_norm": 0.9648163914680481, "learning_rate": 1.35196038933744e-05, "loss": 1.4456, "mean_token_accuracy": 0.6874385267496109, "num_tokens": 460599855.0, "step": 28580 }, { "epoch": 6.6257967319504, "grad_norm": 0.9983997344970703, "learning_rate": 1.3502937947092526e-05, "loss": 1.4604, "mean_token_accuracy": 0.687956164777279, "num_tokens": 460760647.0, "step": 28590 }, { "epoch": 6.62811449762429, "grad_norm": 0.929640531539917, "learning_rate": 1.3486278478079261e-05, "loss": 1.45, "mean_token_accuracy": 0.6866033107042313, "num_tokens": 460921318.0, "step": 28600 }, { "epoch": 6.6304322632981805, "grad_norm": 1.0127403736114502, "learning_rate": 1.3469625495720309e-05, "loss": 1.4587, "mean_token_accuracy": 0.6860018104314805, "num_tokens": 461082843.0, "step": 28610 }, { "epoch": 6.632750028972071, "grad_norm": 0.9707757234573364, "learning_rate": 1.3452979009397692e-05, "loss": 1.4625, "mean_token_accuracy": 0.6848604083061218, "num_tokens": 461244641.0, "step": 28620 }, { "epoch": 6.635067794645961, "grad_norm": 0.9845738410949707, "learning_rate": 1.3436339028489798e-05, "loss": 1.4589, "mean_token_accuracy": 0.6849948272109032, "num_tokens": 461406452.0, "step": 28630 }, { "epoch": 6.637385560319852, "grad_norm": 0.9856504797935486, "learning_rate": 1.3419705562371322e-05, "loss": 1.4461, "mean_token_accuracy": 0.688805191218853, "num_tokens": 461568554.0, "step": 28640 }, { "epoch": 6.639703325993742, "grad_norm": 1.0047416687011719, "learning_rate": 1.3403078620413318e-05, "loss": 1.4572, "mean_token_accuracy": 0.6866393551230431, "num_tokens": 461729209.0, "step": 28650 }, { "epoch": 6.642021091667632, "grad_norm": 1.0172102451324463, "learning_rate": 1.3386458211983139e-05, "loss": 1.4727, "mean_token_accuracy": 0.6859749257564545, "num_tokens": 461891126.0, "step": 28660 }, { "epoch": 6.6443388573415225, "grad_norm": 1.0781408548355103, "learning_rate": 1.3369844346444477e-05, "loss": 1.4696, "mean_token_accuracy": 0.6867605313658715, "num_tokens": 462051716.0, "step": 28670 }, { "epoch": 6.646656623015414, "grad_norm": 1.0066980123519897, "learning_rate": 1.3353237033157334e-05, "loss": 1.484, "mean_token_accuracy": 0.6825478032231331, "num_tokens": 462213213.0, "step": 28680 }, { "epoch": 6.648974388689304, "grad_norm": 0.9861814975738525, "learning_rate": 1.3336636281478002e-05, "loss": 1.4635, "mean_token_accuracy": 0.6864942967891693, "num_tokens": 462373577.0, "step": 28690 }, { "epoch": 6.651292154363194, "grad_norm": 0.9587047696113586, "learning_rate": 1.3320042100759095e-05, "loss": 1.4342, "mean_token_accuracy": 0.6909455880522728, "num_tokens": 462535312.0, "step": 28700 }, { "epoch": 6.653609920037084, "grad_norm": 0.9579704999923706, "learning_rate": 1.330345450034952e-05, "loss": 1.4751, "mean_token_accuracy": 0.6832230553030968, "num_tokens": 462696956.0, "step": 28710 }, { "epoch": 6.655927685710974, "grad_norm": 0.9818848371505737, "learning_rate": 1.3286873489594479e-05, "loss": 1.4791, "mean_token_accuracy": 0.682834054529667, "num_tokens": 462857866.0, "step": 28720 }, { "epoch": 6.658245451384865, "grad_norm": 0.9433848261833191, "learning_rate": 1.3270299077835458e-05, "loss": 1.4503, "mean_token_accuracy": 0.6874720633029938, "num_tokens": 463019078.0, "step": 28730 }, { "epoch": 6.660563217058756, "grad_norm": 0.9820661544799805, "learning_rate": 1.3253731274410231e-05, "loss": 1.4489, "mean_token_accuracy": 0.687898850440979, "num_tokens": 463179992.0, "step": 28740 }, { "epoch": 6.662880982732646, "grad_norm": 0.9865959882736206, "learning_rate": 1.3237170088652837e-05, "loss": 1.4683, "mean_token_accuracy": 0.6845810383558273, "num_tokens": 463342127.0, "step": 28750 }, { "epoch": 6.665198748406536, "grad_norm": 1.0506346225738525, "learning_rate": 1.3220615529893598e-05, "loss": 1.4486, "mean_token_accuracy": 0.687548914551735, "num_tokens": 463502852.0, "step": 28760 }, { "epoch": 6.667516514080426, "grad_norm": 1.0568573474884033, "learning_rate": 1.3204067607459103e-05, "loss": 1.4671, "mean_token_accuracy": 0.6841390833258629, "num_tokens": 463663603.0, "step": 28770 }, { "epoch": 6.669834279754317, "grad_norm": 0.9940068125724792, "learning_rate": 1.318752633067219e-05, "loss": 1.453, "mean_token_accuracy": 0.6887967109680175, "num_tokens": 463824496.0, "step": 28780 }, { "epoch": 6.6721520454282075, "grad_norm": 0.9971837997436523, "learning_rate": 1.3170991708851968e-05, "loss": 1.4542, "mean_token_accuracy": 0.6872444435954094, "num_tokens": 463986037.0, "step": 28790 }, { "epoch": 6.674469811102098, "grad_norm": 1.0107227563858032, "learning_rate": 1.3154463751313784e-05, "loss": 1.4432, "mean_token_accuracy": 0.6879676342010498, "num_tokens": 464147728.0, "step": 28800 }, { "epoch": 6.676787576775988, "grad_norm": 1.011099934577942, "learning_rate": 1.3137942467369241e-05, "loss": 1.4682, "mean_token_accuracy": 0.6843449041247368, "num_tokens": 464308879.0, "step": 28810 }, { "epoch": 6.679105342449878, "grad_norm": 1.0618914365768433, "learning_rate": 1.3121427866326174e-05, "loss": 1.4658, "mean_token_accuracy": 0.6857728645205498, "num_tokens": 464469597.0, "step": 28820 }, { "epoch": 6.681423108123768, "grad_norm": 0.9739341735839844, "learning_rate": 1.310491995748866e-05, "loss": 1.4503, "mean_token_accuracy": 0.6870113924145699, "num_tokens": 464631475.0, "step": 28830 }, { "epoch": 6.683740873797659, "grad_norm": 1.0104302167892456, "learning_rate": 1.3088418750156994e-05, "loss": 1.4736, "mean_token_accuracy": 0.6840099900960922, "num_tokens": 464792361.0, "step": 28840 }, { "epoch": 6.6860586394715495, "grad_norm": 0.9288576245307922, "learning_rate": 1.3071924253627709e-05, "loss": 1.4625, "mean_token_accuracy": 0.685417865216732, "num_tokens": 464953699.0, "step": 28850 }, { "epoch": 6.68837640514544, "grad_norm": 0.9874981641769409, "learning_rate": 1.3055436477193555e-05, "loss": 1.4606, "mean_token_accuracy": 0.6855022579431533, "num_tokens": 465115841.0, "step": 28860 }, { "epoch": 6.69069417081933, "grad_norm": 0.989585280418396, "learning_rate": 1.3038955430143487e-05, "loss": 1.4509, "mean_token_accuracy": 0.6858395636081696, "num_tokens": 465277095.0, "step": 28870 }, { "epoch": 6.693011936493221, "grad_norm": 1.020081639289856, "learning_rate": 1.3022481121762676e-05, "loss": 1.4575, "mean_token_accuracy": 0.6851540237665177, "num_tokens": 465438965.0, "step": 28880 }, { "epoch": 6.695329702167111, "grad_norm": 1.0119709968566895, "learning_rate": 1.3006013561332498e-05, "loss": 1.4807, "mean_token_accuracy": 0.6840224370360375, "num_tokens": 465600243.0, "step": 28890 }, { "epoch": 6.697647467841001, "grad_norm": 0.9770737290382385, "learning_rate": 1.2989552758130516e-05, "loss": 1.4571, "mean_token_accuracy": 0.6862091347575188, "num_tokens": 465761404.0, "step": 28900 }, { "epoch": 6.699965233514892, "grad_norm": 0.9216016530990601, "learning_rate": 1.2973098721430501e-05, "loss": 1.4615, "mean_token_accuracy": 0.6861558526754379, "num_tokens": 465922712.0, "step": 28910 }, { "epoch": 6.702282999188782, "grad_norm": 0.9833133816719055, "learning_rate": 1.2956651460502406e-05, "loss": 1.4598, "mean_token_accuracy": 0.6861120387911797, "num_tokens": 466083837.0, "step": 28920 }, { "epoch": 6.704600764862672, "grad_norm": 1.0009251832962036, "learning_rate": 1.2940210984612361e-05, "loss": 1.4751, "mean_token_accuracy": 0.6835392355918884, "num_tokens": 466244628.0, "step": 28930 }, { "epoch": 6.706918530536563, "grad_norm": 1.134559154510498, "learning_rate": 1.2923777303022683e-05, "loss": 1.451, "mean_token_accuracy": 0.6875072225928307, "num_tokens": 466404507.0, "step": 28940 }, { "epoch": 6.709236296210453, "grad_norm": 1.0152463912963867, "learning_rate": 1.2907350424991854e-05, "loss": 1.4453, "mean_token_accuracy": 0.6886522650718689, "num_tokens": 466565431.0, "step": 28950 }, { "epoch": 6.7115540618843434, "grad_norm": 0.9649578928947449, "learning_rate": 1.2890930359774528e-05, "loss": 1.4546, "mean_token_accuracy": 0.687452919781208, "num_tokens": 466726484.0, "step": 28960 }, { "epoch": 6.713871827558234, "grad_norm": 0.9546358585357666, "learning_rate": 1.2874517116621512e-05, "loss": 1.465, "mean_token_accuracy": 0.6850874468684196, "num_tokens": 466887629.0, "step": 28970 }, { "epoch": 6.716189593232125, "grad_norm": 0.9947759509086609, "learning_rate": 1.2858110704779783e-05, "loss": 1.4593, "mean_token_accuracy": 0.6864599823951721, "num_tokens": 467049198.0, "step": 28980 }, { "epoch": 6.718507358906015, "grad_norm": 0.9674093127250671, "learning_rate": 1.2841711133492456e-05, "loss": 1.4506, "mean_token_accuracy": 0.6860075980424881, "num_tokens": 467210279.0, "step": 28990 }, { "epoch": 6.720825124579905, "grad_norm": 0.9883754849433899, "learning_rate": 1.2825318411998802e-05, "loss": 1.4469, "mean_token_accuracy": 0.685712119936943, "num_tokens": 467371111.0, "step": 29000 }, { "epoch": 6.723142890253795, "grad_norm": 1.0045446157455444, "learning_rate": 1.2808932549534228e-05, "loss": 1.4582, "mean_token_accuracy": 0.683869868516922, "num_tokens": 467532654.0, "step": 29010 }, { "epoch": 6.7254606559276855, "grad_norm": 0.9924425482749939, "learning_rate": 1.2792553555330277e-05, "loss": 1.4739, "mean_token_accuracy": 0.6831199675798416, "num_tokens": 467694289.0, "step": 29020 }, { "epoch": 6.727778421601576, "grad_norm": 0.9866393804550171, "learning_rate": 1.2776181438614625e-05, "loss": 1.4466, "mean_token_accuracy": 0.6869394779205322, "num_tokens": 467855166.0, "step": 29030 }, { "epoch": 6.730096187275467, "grad_norm": 1.0033401250839233, "learning_rate": 1.2759816208611072e-05, "loss": 1.4623, "mean_token_accuracy": 0.6842972934246063, "num_tokens": 468016456.0, "step": 29040 }, { "epoch": 6.732413952949357, "grad_norm": 0.9753136038780212, "learning_rate": 1.2743457874539539e-05, "loss": 1.45, "mean_token_accuracy": 0.6872117936611175, "num_tokens": 468176615.0, "step": 29050 }, { "epoch": 6.734731718623247, "grad_norm": 1.003542423248291, "learning_rate": 1.2727106445616055e-05, "loss": 1.4614, "mean_token_accuracy": 0.68516054302454, "num_tokens": 468337198.0, "step": 29060 }, { "epoch": 6.737049484297137, "grad_norm": 0.957353949546814, "learning_rate": 1.2710761931052766e-05, "loss": 1.4371, "mean_token_accuracy": 0.6915230691432953, "num_tokens": 468497101.0, "step": 29070 }, { "epoch": 6.739367249971028, "grad_norm": 1.009644627571106, "learning_rate": 1.2694424340057932e-05, "loss": 1.4541, "mean_token_accuracy": 0.6873427301645278, "num_tokens": 468656734.0, "step": 29080 }, { "epoch": 6.741685015644919, "grad_norm": 0.96925288438797, "learning_rate": 1.2678093681835884e-05, "loss": 1.4503, "mean_token_accuracy": 0.6872164383530617, "num_tokens": 468818524.0, "step": 29090 }, { "epoch": 6.744002781318809, "grad_norm": 1.0276696681976318, "learning_rate": 1.2661769965587078e-05, "loss": 1.4652, "mean_token_accuracy": 0.6851012915372848, "num_tokens": 468979596.0, "step": 29100 }, { "epoch": 6.746320546992699, "grad_norm": 1.0122367143630981, "learning_rate": 1.264545320050804e-05, "loss": 1.451, "mean_token_accuracy": 0.6866640463471413, "num_tokens": 469141135.0, "step": 29110 }, { "epoch": 6.748638312666589, "grad_norm": 1.0084885358810425, "learning_rate": 1.2629143395791388e-05, "loss": 1.4589, "mean_token_accuracy": 0.6857517346739769, "num_tokens": 469303047.0, "step": 29120 }, { "epoch": 6.750956078340479, "grad_norm": 1.0403257608413696, "learning_rate": 1.261284056062581e-05, "loss": 1.4744, "mean_token_accuracy": 0.6831424385309219, "num_tokens": 469464240.0, "step": 29130 }, { "epoch": 6.7532738440143705, "grad_norm": 1.0211975574493408, "learning_rate": 1.2596544704196083e-05, "loss": 1.4737, "mean_token_accuracy": 0.685033293068409, "num_tokens": 469625951.0, "step": 29140 }, { "epoch": 6.755591609688261, "grad_norm": 0.9568000435829163, "learning_rate": 1.2580255835683035e-05, "loss": 1.4467, "mean_token_accuracy": 0.6867040440440177, "num_tokens": 469787468.0, "step": 29150 }, { "epoch": 6.757909375362151, "grad_norm": 1.006854772567749, "learning_rate": 1.256397396426357e-05, "loss": 1.4414, "mean_token_accuracy": 0.6871100172400475, "num_tokens": 469947704.0, "step": 29160 }, { "epoch": 6.760227141036041, "grad_norm": 0.9653562903404236, "learning_rate": 1.2547699099110643e-05, "loss": 1.4675, "mean_token_accuracy": 0.6836227849125862, "num_tokens": 470109268.0, "step": 29170 }, { "epoch": 6.762544906709931, "grad_norm": 1.014249563217163, "learning_rate": 1.253143124939326e-05, "loss": 1.4577, "mean_token_accuracy": 0.686081263422966, "num_tokens": 470270142.0, "step": 29180 }, { "epoch": 6.764862672383822, "grad_norm": 0.9835718274116516, "learning_rate": 1.2515170424276487e-05, "loss": 1.4431, "mean_token_accuracy": 0.6872073352336884, "num_tokens": 470431003.0, "step": 29190 }, { "epoch": 6.7671804380577125, "grad_norm": 0.9816526174545288, "learning_rate": 1.2498916632921414e-05, "loss": 1.4434, "mean_token_accuracy": 0.6876777410507202, "num_tokens": 470591748.0, "step": 29200 }, { "epoch": 6.769498203731603, "grad_norm": 0.9843991994857788, "learning_rate": 1.2482669884485187e-05, "loss": 1.4551, "mean_token_accuracy": 0.6858713194727898, "num_tokens": 470752672.0, "step": 29210 }, { "epoch": 6.771815969405493, "grad_norm": 0.9769500494003296, "learning_rate": 1.2466430188120978e-05, "loss": 1.4573, "mean_token_accuracy": 0.6855825573205948, "num_tokens": 470914699.0, "step": 29220 }, { "epoch": 6.774133735079383, "grad_norm": 0.997018575668335, "learning_rate": 1.2450197552977968e-05, "loss": 1.459, "mean_token_accuracy": 0.6875819697976112, "num_tokens": 471076713.0, "step": 29230 }, { "epoch": 6.776451500753274, "grad_norm": 0.9572548866271973, "learning_rate": 1.2433971988201382e-05, "loss": 1.4541, "mean_token_accuracy": 0.6860234916210175, "num_tokens": 471238060.0, "step": 29240 }, { "epoch": 6.778769266427164, "grad_norm": 1.045973539352417, "learning_rate": 1.2417753502932453e-05, "loss": 1.4559, "mean_token_accuracy": 0.6866987839341163, "num_tokens": 471398136.0, "step": 29250 }, { "epoch": 6.781087032101055, "grad_norm": 0.975159227848053, "learning_rate": 1.2401542106308433e-05, "loss": 1.4652, "mean_token_accuracy": 0.6851476013660431, "num_tokens": 471558959.0, "step": 29260 }, { "epoch": 6.783404797774945, "grad_norm": 1.0074782371520996, "learning_rate": 1.238533780746257e-05, "loss": 1.472, "mean_token_accuracy": 0.6831891939043999, "num_tokens": 471720804.0, "step": 29270 }, { "epoch": 6.785722563448835, "grad_norm": 0.9802244901657104, "learning_rate": 1.2369140615524119e-05, "loss": 1.4452, "mean_token_accuracy": 0.6860176965594291, "num_tokens": 471881653.0, "step": 29280 }, { "epoch": 6.788040329122726, "grad_norm": 0.923150897026062, "learning_rate": 1.2352950539618328e-05, "loss": 1.4687, "mean_token_accuracy": 0.6848419889807701, "num_tokens": 472043528.0, "step": 29290 }, { "epoch": 6.790358094796616, "grad_norm": 1.0126787424087524, "learning_rate": 1.2336767588866441e-05, "loss": 1.4526, "mean_token_accuracy": 0.6875339657068252, "num_tokens": 472204858.0, "step": 29300 }, { "epoch": 6.792675860470506, "grad_norm": 0.9476326107978821, "learning_rate": 1.232059177238568e-05, "loss": 1.4741, "mean_token_accuracy": 0.6837856203317643, "num_tokens": 472366808.0, "step": 29310 }, { "epoch": 6.794993626144397, "grad_norm": 1.0062848329544067, "learning_rate": 1.2304423099289258e-05, "loss": 1.4638, "mean_token_accuracy": 0.6852654248476029, "num_tokens": 472527432.0, "step": 29320 }, { "epoch": 6.797311391818287, "grad_norm": 1.0063546895980835, "learning_rate": 1.2288261578686356e-05, "loss": 1.4758, "mean_token_accuracy": 0.68343665599823, "num_tokens": 472688089.0, "step": 29330 }, { "epoch": 6.799629157492178, "grad_norm": 1.0331366062164307, "learning_rate": 1.2272107219682121e-05, "loss": 1.4559, "mean_token_accuracy": 0.687675715982914, "num_tokens": 472848721.0, "step": 29340 }, { "epoch": 6.801946923166068, "grad_norm": 1.0025601387023926, "learning_rate": 1.2255960031377681e-05, "loss": 1.4809, "mean_token_accuracy": 0.6835554704070091, "num_tokens": 473010137.0, "step": 29350 }, { "epoch": 6.804264688839958, "grad_norm": 1.005445957183838, "learning_rate": 1.2239820022870107e-05, "loss": 1.4482, "mean_token_accuracy": 0.6864732399582862, "num_tokens": 473171716.0, "step": 29360 }, { "epoch": 6.8065824545138485, "grad_norm": 0.9714868664741516, "learning_rate": 1.2223687203252434e-05, "loss": 1.4642, "mean_token_accuracy": 0.6850950762629509, "num_tokens": 473332494.0, "step": 29370 }, { "epoch": 6.808900220187739, "grad_norm": 1.0413898229599, "learning_rate": 1.220756158161365e-05, "loss": 1.4797, "mean_token_accuracy": 0.6829142108559608, "num_tokens": 473492990.0, "step": 29380 }, { "epoch": 6.81121798586163, "grad_norm": 0.9762955904006958, "learning_rate": 1.219144316703868e-05, "loss": 1.4602, "mean_token_accuracy": 0.6855031475424767, "num_tokens": 473655121.0, "step": 29390 }, { "epoch": 6.81353575153552, "grad_norm": 0.9814906120300293, "learning_rate": 1.2175331968608389e-05, "loss": 1.4392, "mean_token_accuracy": 0.6884114533662796, "num_tokens": 473817000.0, "step": 29400 }, { "epoch": 6.81585351720941, "grad_norm": 0.990664541721344, "learning_rate": 1.215922799539958e-05, "loss": 1.4412, "mean_token_accuracy": 0.6891962677240372, "num_tokens": 473978581.0, "step": 29410 }, { "epoch": 6.8181712828833, "grad_norm": 0.9787161350250244, "learning_rate": 1.214313125648499e-05, "loss": 1.4876, "mean_token_accuracy": 0.6825928345322609, "num_tokens": 474140322.0, "step": 29420 }, { "epoch": 6.8204890485571905, "grad_norm": 0.9254423975944519, "learning_rate": 1.2127041760933273e-05, "loss": 1.4543, "mean_token_accuracy": 0.685690526664257, "num_tokens": 474301779.0, "step": 29430 }, { "epoch": 6.822806814231082, "grad_norm": 0.9614309668540955, "learning_rate": 1.2110959517809e-05, "loss": 1.4658, "mean_token_accuracy": 0.6856334537267685, "num_tokens": 474463223.0, "step": 29440 }, { "epoch": 6.825124579904972, "grad_norm": 0.9511658549308777, "learning_rate": 1.2094884536172665e-05, "loss": 1.4649, "mean_token_accuracy": 0.6853669047355652, "num_tokens": 474624210.0, "step": 29450 }, { "epoch": 6.827442345578862, "grad_norm": 1.0206670761108398, "learning_rate": 1.2078816825080663e-05, "loss": 1.4676, "mean_token_accuracy": 0.6851143956184387, "num_tokens": 474785491.0, "step": 29460 }, { "epoch": 6.829760111252752, "grad_norm": 1.0348827838897705, "learning_rate": 1.2062756393585301e-05, "loss": 1.4384, "mean_token_accuracy": 0.6879691854119301, "num_tokens": 474946201.0, "step": 29470 }, { "epoch": 6.832077876926642, "grad_norm": 0.9576787352561951, "learning_rate": 1.2046703250734784e-05, "loss": 1.4519, "mean_token_accuracy": 0.6868717551231385, "num_tokens": 475108112.0, "step": 29480 }, { "epoch": 6.8343956426005334, "grad_norm": 1.0355250835418701, "learning_rate": 1.2030657405573193e-05, "loss": 1.4827, "mean_token_accuracy": 0.6835582867264748, "num_tokens": 475269235.0, "step": 29490 }, { "epoch": 6.836713408274424, "grad_norm": 0.9847627282142639, "learning_rate": 1.2014618867140518e-05, "loss": 1.47, "mean_token_accuracy": 0.683941662311554, "num_tokens": 475430062.0, "step": 29500 }, { "epoch": 6.839031173948314, "grad_norm": 1.0089809894561768, "learning_rate": 1.1998587644472623e-05, "loss": 1.4361, "mean_token_accuracy": 0.6889027073979378, "num_tokens": 475589654.0, "step": 29510 }, { "epoch": 6.841348939622204, "grad_norm": 0.9892637729644775, "learning_rate": 1.1982563746601259e-05, "loss": 1.4669, "mean_token_accuracy": 0.6857278853654861, "num_tokens": 475750555.0, "step": 29520 }, { "epoch": 6.843666705296094, "grad_norm": 2.8397934436798096, "learning_rate": 1.1966547182554042e-05, "loss": 1.4549, "mean_token_accuracy": 0.6882754117250443, "num_tokens": 475912194.0, "step": 29530 }, { "epoch": 6.845984470969985, "grad_norm": 0.9767532348632812, "learning_rate": 1.1950537961354455e-05, "loss": 1.4535, "mean_token_accuracy": 0.6877492010593415, "num_tokens": 476073250.0, "step": 29540 }, { "epoch": 6.8483022366438755, "grad_norm": 0.9600573182106018, "learning_rate": 1.1934536092021855e-05, "loss": 1.4563, "mean_token_accuracy": 0.686535595357418, "num_tokens": 476235320.0, "step": 29550 }, { "epoch": 6.850620002317766, "grad_norm": 1.0582014322280884, "learning_rate": 1.1918541583571447e-05, "loss": 1.452, "mean_token_accuracy": 0.687188982963562, "num_tokens": 476396261.0, "step": 29560 }, { "epoch": 6.852937767991656, "grad_norm": 1.054034948348999, "learning_rate": 1.1902554445014288e-05, "loss": 1.4613, "mean_token_accuracy": 0.6855417922139168, "num_tokens": 476556689.0, "step": 29570 }, { "epoch": 6.855255533665546, "grad_norm": 0.9702194333076477, "learning_rate": 1.1886574685357293e-05, "loss": 1.4572, "mean_token_accuracy": 0.6865079596638679, "num_tokens": 476717435.0, "step": 29580 }, { "epoch": 6.857573299339437, "grad_norm": 0.9392906427383423, "learning_rate": 1.1870602313603213e-05, "loss": 1.4466, "mean_token_accuracy": 0.6894305497407913, "num_tokens": 476878354.0, "step": 29590 }, { "epoch": 6.859891065013327, "grad_norm": 0.9284941554069519, "learning_rate": 1.1854637338750632e-05, "loss": 1.4422, "mean_token_accuracy": 0.6870123609900475, "num_tokens": 477038908.0, "step": 29600 }, { "epoch": 6.8622088306872175, "grad_norm": 0.9748296737670898, "learning_rate": 1.1838679769793978e-05, "loss": 1.4529, "mean_token_accuracy": 0.6864281326532364, "num_tokens": 477200681.0, "step": 29610 }, { "epoch": 6.864526596361108, "grad_norm": 0.9350164532661438, "learning_rate": 1.1822729615723502e-05, "loss": 1.461, "mean_token_accuracy": 0.6863307014107705, "num_tokens": 477361423.0, "step": 29620 }, { "epoch": 6.866844362034998, "grad_norm": 1.0085139274597168, "learning_rate": 1.180678688552527e-05, "loss": 1.4712, "mean_token_accuracy": 0.6855048403143883, "num_tokens": 477521637.0, "step": 29630 }, { "epoch": 6.869162127708889, "grad_norm": 0.9914702773094177, "learning_rate": 1.1790851588181176e-05, "loss": 1.4594, "mean_token_accuracy": 0.6855636179447174, "num_tokens": 477683201.0, "step": 29640 }, { "epoch": 6.871479893382779, "grad_norm": 1.0346399545669556, "learning_rate": 1.177492373266892e-05, "loss": 1.4605, "mean_token_accuracy": 0.6858128815889358, "num_tokens": 477845173.0, "step": 29650 }, { "epoch": 6.873797659056669, "grad_norm": 1.0206151008605957, "learning_rate": 1.1759003327962015e-05, "loss": 1.4548, "mean_token_accuracy": 0.6853224068880082, "num_tokens": 478006995.0, "step": 29660 }, { "epoch": 6.87611542473056, "grad_norm": 0.9900175929069519, "learning_rate": 1.1743090383029773e-05, "loss": 1.4591, "mean_token_accuracy": 0.6850492343306541, "num_tokens": 478168234.0, "step": 29670 }, { "epoch": 6.87843319040445, "grad_norm": 1.0056209564208984, "learning_rate": 1.17271849068373e-05, "loss": 1.4713, "mean_token_accuracy": 0.6830391004681587, "num_tokens": 478327448.0, "step": 29680 }, { "epoch": 6.880750956078341, "grad_norm": 0.94859379529953, "learning_rate": 1.1711286908345497e-05, "loss": 1.4614, "mean_token_accuracy": 0.6852126851677894, "num_tokens": 478489390.0, "step": 29690 }, { "epoch": 6.883068721752231, "grad_norm": 0.9435256719589233, "learning_rate": 1.169539639651106e-05, "loss": 1.4388, "mean_token_accuracy": 0.6871607914566994, "num_tokens": 478650667.0, "step": 29700 }, { "epoch": 6.885386487426121, "grad_norm": 0.9505389332771301, "learning_rate": 1.1679513380286453e-05, "loss": 1.4571, "mean_token_accuracy": 0.687000036239624, "num_tokens": 478812105.0, "step": 29710 }, { "epoch": 6.887704253100011, "grad_norm": 0.967971682548523, "learning_rate": 1.1663637868619928e-05, "loss": 1.4637, "mean_token_accuracy": 0.6854980528354645, "num_tokens": 478973788.0, "step": 29720 }, { "epoch": 6.890022018773902, "grad_norm": 0.9789060354232788, "learning_rate": 1.1647769870455503e-05, "loss": 1.4614, "mean_token_accuracy": 0.6863331437110901, "num_tokens": 479133990.0, "step": 29730 }, { "epoch": 6.892339784447793, "grad_norm": 0.9694551825523376, "learning_rate": 1.1631909394732962e-05, "loss": 1.4675, "mean_token_accuracy": 0.6855992287397384, "num_tokens": 479294506.0, "step": 29740 }, { "epoch": 6.894657550121683, "grad_norm": 1.0008679628372192, "learning_rate": 1.161605645038786e-05, "loss": 1.4544, "mean_token_accuracy": 0.6859511002898216, "num_tokens": 479456268.0, "step": 29750 }, { "epoch": 6.896975315795573, "grad_norm": 0.9935459494590759, "learning_rate": 1.16002110463515e-05, "loss": 1.4571, "mean_token_accuracy": 0.6854961663484573, "num_tokens": 479617775.0, "step": 29760 }, { "epoch": 6.899293081469463, "grad_norm": 1.0089430809020996, "learning_rate": 1.1584373191550941e-05, "loss": 1.4744, "mean_token_accuracy": 0.682920390367508, "num_tokens": 479778937.0, "step": 29770 }, { "epoch": 6.9016108471433535, "grad_norm": 0.9763585329055786, "learning_rate": 1.1568542894908995e-05, "loss": 1.47, "mean_token_accuracy": 0.6855308026075363, "num_tokens": 479940367.0, "step": 29780 }, { "epoch": 6.903928612817245, "grad_norm": 1.0168782472610474, "learning_rate": 1.155272016534419e-05, "loss": 1.4562, "mean_token_accuracy": 0.6861865192651748, "num_tokens": 480101896.0, "step": 29790 }, { "epoch": 6.906246378491135, "grad_norm": 0.9992652535438538, "learning_rate": 1.1536905011770824e-05, "loss": 1.4732, "mean_token_accuracy": 0.6841618105769157, "num_tokens": 480262641.0, "step": 29800 }, { "epoch": 6.908564144165025, "grad_norm": 0.9707855582237244, "learning_rate": 1.1521097443098905e-05, "loss": 1.4616, "mean_token_accuracy": 0.6864232778549194, "num_tokens": 480422567.0, "step": 29810 }, { "epoch": 6.910881909838915, "grad_norm": 0.9253271222114563, "learning_rate": 1.1505297468234177e-05, "loss": 1.4786, "mean_token_accuracy": 0.6828747972846031, "num_tokens": 480583703.0, "step": 29820 }, { "epoch": 6.913199675512805, "grad_norm": 0.9709325432777405, "learning_rate": 1.148950509607811e-05, "loss": 1.459, "mean_token_accuracy": 0.684778293967247, "num_tokens": 480744425.0, "step": 29830 }, { "epoch": 6.9155174411866955, "grad_norm": 0.9640081524848938, "learning_rate": 1.1473720335527873e-05, "loss": 1.4444, "mean_token_accuracy": 0.688093975186348, "num_tokens": 480904800.0, "step": 29840 }, { "epoch": 6.917835206860587, "grad_norm": 0.9829614758491516, "learning_rate": 1.145794319547637e-05, "loss": 1.4663, "mean_token_accuracy": 0.6846221044659615, "num_tokens": 481065692.0, "step": 29850 }, { "epoch": 6.920152972534477, "grad_norm": 0.9645756483078003, "learning_rate": 1.1442173684812194e-05, "loss": 1.4685, "mean_token_accuracy": 0.685664527118206, "num_tokens": 481227286.0, "step": 29860 }, { "epoch": 6.922470738208367, "grad_norm": 1.0025651454925537, "learning_rate": 1.1426411812419643e-05, "loss": 1.4412, "mean_token_accuracy": 0.6882328316569328, "num_tokens": 481388303.0, "step": 29870 }, { "epoch": 6.924788503882257, "grad_norm": 1.0281659364700317, "learning_rate": 1.1410657587178719e-05, "loss": 1.4728, "mean_token_accuracy": 0.683619712293148, "num_tokens": 481550273.0, "step": 29880 }, { "epoch": 6.927106269556148, "grad_norm": 0.9929125308990479, "learning_rate": 1.1394911017965113e-05, "loss": 1.4554, "mean_token_accuracy": 0.6869554772973061, "num_tokens": 481710189.0, "step": 29890 }, { "epoch": 6.9294240352300385, "grad_norm": 0.9991934299468994, "learning_rate": 1.1379172113650195e-05, "loss": 1.4593, "mean_token_accuracy": 0.6863330349326133, "num_tokens": 481871904.0, "step": 29900 }, { "epoch": 6.931741800903929, "grad_norm": 1.0018563270568848, "learning_rate": 1.1363440883101026e-05, "loss": 1.4538, "mean_token_accuracy": 0.6859148651361465, "num_tokens": 482033062.0, "step": 29910 }, { "epoch": 6.934059566577819, "grad_norm": 0.9936092495918274, "learning_rate": 1.1347717335180339e-05, "loss": 1.4608, "mean_token_accuracy": 0.685537925362587, "num_tokens": 482192707.0, "step": 29920 }, { "epoch": 6.936377332251709, "grad_norm": 0.9707922339439392, "learning_rate": 1.1332001478746542e-05, "loss": 1.4585, "mean_token_accuracy": 0.6859954357147217, "num_tokens": 482353783.0, "step": 29930 }, { "epoch": 6.938695097925599, "grad_norm": 0.9422765374183655, "learning_rate": 1.1316293322653706e-05, "loss": 1.4467, "mean_token_accuracy": 0.686287647485733, "num_tokens": 482515221.0, "step": 29940 }, { "epoch": 6.94101286359949, "grad_norm": 0.9901282787322998, "learning_rate": 1.1300592875751565e-05, "loss": 1.4632, "mean_token_accuracy": 0.6859884813427926, "num_tokens": 482676596.0, "step": 29950 }, { "epoch": 6.9433306292733805, "grad_norm": 0.9976504445075989, "learning_rate": 1.1284900146885512e-05, "loss": 1.4457, "mean_token_accuracy": 0.6881479874253273, "num_tokens": 482837900.0, "step": 29960 }, { "epoch": 6.945648394947271, "grad_norm": 0.9263715744018555, "learning_rate": 1.1269215144896588e-05, "loss": 1.4475, "mean_token_accuracy": 0.6866578832268715, "num_tokens": 482999701.0, "step": 29970 }, { "epoch": 6.947966160621161, "grad_norm": 0.9563489556312561, "learning_rate": 1.1253537878621484e-05, "loss": 1.4622, "mean_token_accuracy": 0.686160059273243, "num_tokens": 483161769.0, "step": 29980 }, { "epoch": 6.950283926295052, "grad_norm": 1.0061250925064087, "learning_rate": 1.1237868356892533e-05, "loss": 1.467, "mean_token_accuracy": 0.6855025187134742, "num_tokens": 483322920.0, "step": 29990 }, { "epoch": 6.952601691968942, "grad_norm": 0.9990372657775879, "learning_rate": 1.1222206588537701e-05, "loss": 1.4626, "mean_token_accuracy": 0.6865766152739525, "num_tokens": 483484905.0, "step": 30000 }, { "epoch": 6.954919457642832, "grad_norm": 0.970174252986908, "learning_rate": 1.1206552582380589e-05, "loss": 1.4632, "mean_token_accuracy": 0.6852799966931343, "num_tokens": 483645904.0, "step": 30010 }, { "epoch": 6.957237223316723, "grad_norm": 0.985427737236023, "learning_rate": 1.1190906347240426e-05, "loss": 1.4526, "mean_token_accuracy": 0.685934416949749, "num_tokens": 483805743.0, "step": 30020 }, { "epoch": 6.959554988990613, "grad_norm": 0.9783066511154175, "learning_rate": 1.1175267891932059e-05, "loss": 1.4439, "mean_token_accuracy": 0.6887204125523567, "num_tokens": 483966420.0, "step": 30030 }, { "epoch": 6.961872754664503, "grad_norm": 0.9907786846160889, "learning_rate": 1.1159637225265956e-05, "loss": 1.46, "mean_token_accuracy": 0.6848722159862518, "num_tokens": 484127836.0, "step": 30040 }, { "epoch": 6.964190520338394, "grad_norm": 0.9718541502952576, "learning_rate": 1.114401435604819e-05, "loss": 1.4849, "mean_token_accuracy": 0.6820522636175156, "num_tokens": 484290008.0, "step": 30050 }, { "epoch": 6.966508286012284, "grad_norm": 1.0100051164627075, "learning_rate": 1.1128399293080454e-05, "loss": 1.4769, "mean_token_accuracy": 0.6838291749358177, "num_tokens": 484451032.0, "step": 30060 }, { "epoch": 6.968826051686174, "grad_norm": 0.9499564170837402, "learning_rate": 1.111279204516003e-05, "loss": 1.4415, "mean_token_accuracy": 0.6883779659867286, "num_tokens": 484612531.0, "step": 30070 }, { "epoch": 6.971143817360065, "grad_norm": 1.0352647304534912, "learning_rate": 1.1097192621079805e-05, "loss": 1.4577, "mean_token_accuracy": 0.6857477471232414, "num_tokens": 484773582.0, "step": 30080 }, { "epoch": 6.973461583033956, "grad_norm": 1.079274296760559, "learning_rate": 1.1081601029628256e-05, "loss": 1.4677, "mean_token_accuracy": 0.6846997126936912, "num_tokens": 484934567.0, "step": 30090 }, { "epoch": 6.975779348707846, "grad_norm": 0.9628220200538635, "learning_rate": 1.1066017279589442e-05, "loss": 1.4542, "mean_token_accuracy": 0.6866884127259254, "num_tokens": 485096065.0, "step": 30100 }, { "epoch": 6.978097114381736, "grad_norm": 0.9146168231964111, "learning_rate": 1.1050441379743013e-05, "loss": 1.4485, "mean_token_accuracy": 0.6875680208206176, "num_tokens": 485256788.0, "step": 30110 }, { "epoch": 6.980414880055626, "grad_norm": 0.9957306981086731, "learning_rate": 1.103487333886419e-05, "loss": 1.4651, "mean_token_accuracy": 0.6847381308674813, "num_tokens": 485417233.0, "step": 30120 }, { "epoch": 6.9827326457295165, "grad_norm": 0.9413638710975647, "learning_rate": 1.1019313165723772e-05, "loss": 1.4505, "mean_token_accuracy": 0.6867574334144593, "num_tokens": 485578514.0, "step": 30130 }, { "epoch": 6.985050411403407, "grad_norm": 0.9751201868057251, "learning_rate": 1.1003760869088117e-05, "loss": 1.4474, "mean_token_accuracy": 0.6870372503995895, "num_tokens": 485740102.0, "step": 30140 }, { "epoch": 6.987368177077298, "grad_norm": 0.9708794355392456, "learning_rate": 1.0988216457719159e-05, "loss": 1.4692, "mean_token_accuracy": 0.6843780755996705, "num_tokens": 485901783.0, "step": 30150 }, { "epoch": 6.989685942751188, "grad_norm": 1.0625628232955933, "learning_rate": 1.0972679940374372e-05, "loss": 1.4365, "mean_token_accuracy": 0.6892052903771401, "num_tokens": 486062748.0, "step": 30160 }, { "epoch": 6.992003708425078, "grad_norm": 0.9565974473953247, "learning_rate": 1.0957151325806797e-05, "loss": 1.483, "mean_token_accuracy": 0.6824429273605347, "num_tokens": 486224117.0, "step": 30170 }, { "epoch": 6.994321474098968, "grad_norm": 1.0619940757751465, "learning_rate": 1.0941630622765017e-05, "loss": 1.4461, "mean_token_accuracy": 0.6880158513784409, "num_tokens": 486385245.0, "step": 30180 }, { "epoch": 6.996639239772859, "grad_norm": 0.9973335862159729, "learning_rate": 1.0926117839993158e-05, "loss": 1.464, "mean_token_accuracy": 0.6866938933730126, "num_tokens": 486545923.0, "step": 30190 }, { "epoch": 6.99895700544675, "grad_norm": 0.9932464957237244, "learning_rate": 1.0910612986230884e-05, "loss": 1.4483, "mean_token_accuracy": 0.6878403052687645, "num_tokens": 486707181.0, "step": 30200 }, { "epoch": 7.001158882836945, "grad_norm": 0.9551072716712952, "learning_rate": 1.0895116070213392e-05, "loss": 1.4441, "mean_token_accuracy": 0.68859233197413, "num_tokens": 486859295.0, "step": 30210 }, { "epoch": 7.003476648510835, "grad_norm": 1.0183480978012085, "learning_rate": 1.0879627100671408e-05, "loss": 1.462, "mean_token_accuracy": 0.6850327283143998, "num_tokens": 487020592.0, "step": 30220 }, { "epoch": 7.0057944141847255, "grad_norm": 0.9864630103111267, "learning_rate": 1.0864146086331178e-05, "loss": 1.4378, "mean_token_accuracy": 0.6898705571889877, "num_tokens": 487180773.0, "step": 30230 }, { "epoch": 7.008112179858617, "grad_norm": 0.9975780844688416, "learning_rate": 1.084867303591447e-05, "loss": 1.4304, "mean_token_accuracy": 0.6892667710781097, "num_tokens": 487342211.0, "step": 30240 }, { "epoch": 7.010429945532507, "grad_norm": 0.994164764881134, "learning_rate": 1.0833207958138561e-05, "loss": 1.4413, "mean_token_accuracy": 0.6895751729607582, "num_tokens": 487503636.0, "step": 30250 }, { "epoch": 7.012747711206397, "grad_norm": 0.995633602142334, "learning_rate": 1.0817750861716241e-05, "loss": 1.4309, "mean_token_accuracy": 0.6898093819618225, "num_tokens": 487665116.0, "step": 30260 }, { "epoch": 7.015065476880287, "grad_norm": 1.0043178796768188, "learning_rate": 1.0802301755355814e-05, "loss": 1.4422, "mean_token_accuracy": 0.6894893646240234, "num_tokens": 487826871.0, "step": 30270 }, { "epoch": 7.017383242554177, "grad_norm": 0.9798178672790527, "learning_rate": 1.0786860647761046e-05, "loss": 1.4434, "mean_token_accuracy": 0.6884770467877388, "num_tokens": 487987188.0, "step": 30280 }, { "epoch": 7.019701008228068, "grad_norm": 1.003448486328125, "learning_rate": 1.077142754763123e-05, "loss": 1.431, "mean_token_accuracy": 0.6893723636865616, "num_tokens": 488148118.0, "step": 30290 }, { "epoch": 7.022018773901959, "grad_norm": 0.9974724650382996, "learning_rate": 1.0756002463661142e-05, "loss": 1.4358, "mean_token_accuracy": 0.6876380741596222, "num_tokens": 488308993.0, "step": 30300 }, { "epoch": 7.024336539575849, "grad_norm": 1.005236029624939, "learning_rate": 1.0740585404541034e-05, "loss": 1.4327, "mean_token_accuracy": 0.6886857658624649, "num_tokens": 488470882.0, "step": 30310 }, { "epoch": 7.026654305249739, "grad_norm": 1.0185840129852295, "learning_rate": 1.0725176378956645e-05, "loss": 1.4432, "mean_token_accuracy": 0.6891646519303322, "num_tokens": 488632393.0, "step": 30320 }, { "epoch": 7.028972070923629, "grad_norm": 0.9959010481834412, "learning_rate": 1.0709775395589181e-05, "loss": 1.4314, "mean_token_accuracy": 0.690037977695465, "num_tokens": 488793681.0, "step": 30330 }, { "epoch": 7.03128983659752, "grad_norm": 0.9878491759300232, "learning_rate": 1.0694382463115318e-05, "loss": 1.4546, "mean_token_accuracy": 0.687186236679554, "num_tokens": 488954595.0, "step": 30340 }, { "epoch": 7.0336076022714105, "grad_norm": 0.9929236769676208, "learning_rate": 1.0678997590207202e-05, "loss": 1.4478, "mean_token_accuracy": 0.688758097589016, "num_tokens": 489115105.0, "step": 30350 }, { "epoch": 7.035925367945301, "grad_norm": 1.0284937620162964, "learning_rate": 1.0663620785532433e-05, "loss": 1.4618, "mean_token_accuracy": 0.6858133226633072, "num_tokens": 489276994.0, "step": 30360 }, { "epoch": 7.038243133619191, "grad_norm": 1.0299055576324463, "learning_rate": 1.0648252057754069e-05, "loss": 1.4492, "mean_token_accuracy": 0.6874721989035606, "num_tokens": 489437090.0, "step": 30370 }, { "epoch": 7.040560899293081, "grad_norm": 1.0103245973587036, "learning_rate": 1.0632891415530616e-05, "loss": 1.4382, "mean_token_accuracy": 0.6884740099310875, "num_tokens": 489598824.0, "step": 30380 }, { "epoch": 7.042878664966972, "grad_norm": 0.9937975406646729, "learning_rate": 1.0617538867516018e-05, "loss": 1.4365, "mean_token_accuracy": 0.6878159090876579, "num_tokens": 489759481.0, "step": 30390 }, { "epoch": 7.045196430640862, "grad_norm": 1.0674666166305542, "learning_rate": 1.0602194422359676e-05, "loss": 1.4452, "mean_token_accuracy": 0.6867089673876763, "num_tokens": 489920550.0, "step": 30400 }, { "epoch": 7.0475141963147525, "grad_norm": 0.991886556148529, "learning_rate": 1.0586858088706417e-05, "loss": 1.4549, "mean_token_accuracy": 0.6882152274250984, "num_tokens": 490082235.0, "step": 30410 }, { "epoch": 7.049831961988643, "grad_norm": 1.022518515586853, "learning_rate": 1.0571529875196479e-05, "loss": 1.4448, "mean_token_accuracy": 0.6873372912406921, "num_tokens": 490242644.0, "step": 30420 }, { "epoch": 7.052149727662533, "grad_norm": 1.0036334991455078, "learning_rate": 1.0556209790465551e-05, "loss": 1.4564, "mean_token_accuracy": 0.6856729671359062, "num_tokens": 490403097.0, "step": 30430 }, { "epoch": 7.054467493336424, "grad_norm": 1.023674726486206, "learning_rate": 1.054089784314474e-05, "loss": 1.4396, "mean_token_accuracy": 0.6892703488469124, "num_tokens": 490564754.0, "step": 30440 }, { "epoch": 7.056785259010314, "grad_norm": 1.0150258541107178, "learning_rate": 1.0525594041860556e-05, "loss": 1.4489, "mean_token_accuracy": 0.6872036814689636, "num_tokens": 490726383.0, "step": 30450 }, { "epoch": 7.059103024684204, "grad_norm": 1.0371928215026855, "learning_rate": 1.0510298395234927e-05, "loss": 1.453, "mean_token_accuracy": 0.6849699392914772, "num_tokens": 490887625.0, "step": 30460 }, { "epoch": 7.061420790358095, "grad_norm": 0.9809801578521729, "learning_rate": 1.0495010911885189e-05, "loss": 1.4611, "mean_token_accuracy": 0.6845606118440628, "num_tokens": 491048297.0, "step": 30470 }, { "epoch": 7.063738556031985, "grad_norm": 0.9944107532501221, "learning_rate": 1.0479731600424076e-05, "loss": 1.4438, "mean_token_accuracy": 0.6874740079045296, "num_tokens": 491210079.0, "step": 30480 }, { "epoch": 7.066056321705876, "grad_norm": 1.0969388484954834, "learning_rate": 1.0464460469459714e-05, "loss": 1.4413, "mean_token_accuracy": 0.6877382740378379, "num_tokens": 491371733.0, "step": 30490 }, { "epoch": 7.068374087379766, "grad_norm": 0.979986310005188, "learning_rate": 1.0449197527595628e-05, "loss": 1.4375, "mean_token_accuracy": 0.6886613860726356, "num_tokens": 491532502.0, "step": 30500 }, { "epoch": 7.070691853053656, "grad_norm": 0.997211217880249, "learning_rate": 1.0433942783430722e-05, "loss": 1.4503, "mean_token_accuracy": 0.6882845893502235, "num_tokens": 491692964.0, "step": 30510 }, { "epoch": 7.073009618727546, "grad_norm": 0.9985148310661316, "learning_rate": 1.041869624555929e-05, "loss": 1.4713, "mean_token_accuracy": 0.6848978862166405, "num_tokens": 491854562.0, "step": 30520 }, { "epoch": 7.075327384401437, "grad_norm": 1.002441644668579, "learning_rate": 1.0403457922570992e-05, "loss": 1.4298, "mean_token_accuracy": 0.6906353697180748, "num_tokens": 492014993.0, "step": 30530 }, { "epoch": 7.077645150075328, "grad_norm": 1.02065908908844, "learning_rate": 1.0388227823050872e-05, "loss": 1.4422, "mean_token_accuracy": 0.6899056494235992, "num_tokens": 492175993.0, "step": 30540 }, { "epoch": 7.079962915749218, "grad_norm": 1.0307271480560303, "learning_rate": 1.037300595557933e-05, "loss": 1.4453, "mean_token_accuracy": 0.6888017967343331, "num_tokens": 492337666.0, "step": 30550 }, { "epoch": 7.082280681423108, "grad_norm": 1.1349022388458252, "learning_rate": 1.0357792328732132e-05, "loss": 1.4434, "mean_token_accuracy": 0.6876017346978187, "num_tokens": 492497607.0, "step": 30560 }, { "epoch": 7.084598447096998, "grad_norm": 0.9857518076896667, "learning_rate": 1.0342586951080411e-05, "loss": 1.4432, "mean_token_accuracy": 0.6882939413189888, "num_tokens": 492658884.0, "step": 30570 }, { "epoch": 7.0869162127708885, "grad_norm": 0.9850713610649109, "learning_rate": 1.0327389831190631e-05, "loss": 1.4541, "mean_token_accuracy": 0.6865961134433747, "num_tokens": 492819661.0, "step": 30580 }, { "epoch": 7.0892339784447795, "grad_norm": 0.9888629913330078, "learning_rate": 1.0312200977624627e-05, "loss": 1.4412, "mean_token_accuracy": 0.6876433029770851, "num_tokens": 492981100.0, "step": 30590 }, { "epoch": 7.09155174411867, "grad_norm": 0.9621537923812866, "learning_rate": 1.0297020398939561e-05, "loss": 1.426, "mean_token_accuracy": 0.692603413760662, "num_tokens": 493142819.0, "step": 30600 }, { "epoch": 7.09386950979256, "grad_norm": 0.9533586502075195, "learning_rate": 1.028184810368794e-05, "loss": 1.4415, "mean_token_accuracy": 0.6880926549434662, "num_tokens": 493304137.0, "step": 30610 }, { "epoch": 7.09618727546645, "grad_norm": 0.9991888999938965, "learning_rate": 1.0266684100417603e-05, "loss": 1.433, "mean_token_accuracy": 0.6902589336037636, "num_tokens": 493465500.0, "step": 30620 }, { "epoch": 7.09850504114034, "grad_norm": 1.0222811698913574, "learning_rate": 1.0251528397671716e-05, "loss": 1.4428, "mean_token_accuracy": 0.6897612705826759, "num_tokens": 493627633.0, "step": 30630 }, { "epoch": 7.100822806814231, "grad_norm": 1.0252362489700317, "learning_rate": 1.0236381003988773e-05, "loss": 1.4257, "mean_token_accuracy": 0.6909507170319558, "num_tokens": 493789421.0, "step": 30640 }, { "epoch": 7.103140572488122, "grad_norm": 0.9800766110420227, "learning_rate": 1.0221241927902578e-05, "loss": 1.4285, "mean_token_accuracy": 0.690148264169693, "num_tokens": 493950596.0, "step": 30650 }, { "epoch": 7.105458338162012, "grad_norm": 1.0779850482940674, "learning_rate": 1.020611117794226e-05, "loss": 1.4278, "mean_token_accuracy": 0.6905058786273003, "num_tokens": 494110990.0, "step": 30660 }, { "epoch": 7.107776103835902, "grad_norm": 1.0185943841934204, "learning_rate": 1.0190988762632248e-05, "loss": 1.4392, "mean_token_accuracy": 0.6881745889782905, "num_tokens": 494271727.0, "step": 30670 }, { "epoch": 7.110093869509792, "grad_norm": 1.0523755550384521, "learning_rate": 1.017587469049228e-05, "loss": 1.4424, "mean_token_accuracy": 0.688021969795227, "num_tokens": 494433167.0, "step": 30680 }, { "epoch": 7.112411635183683, "grad_norm": 0.9280577898025513, "learning_rate": 1.0160768970037393e-05, "loss": 1.441, "mean_token_accuracy": 0.689510440826416, "num_tokens": 494594682.0, "step": 30690 }, { "epoch": 7.114729400857573, "grad_norm": 0.99925297498703, "learning_rate": 1.0145671609777918e-05, "loss": 1.4362, "mean_token_accuracy": 0.6901866853237152, "num_tokens": 494753584.0, "step": 30700 }, { "epoch": 7.117047166531464, "grad_norm": 1.0488483905792236, "learning_rate": 1.0130582618219478e-05, "loss": 1.4427, "mean_token_accuracy": 0.6894569173455238, "num_tokens": 494915652.0, "step": 30710 }, { "epoch": 7.119364932205354, "grad_norm": 1.077598214149475, "learning_rate": 1.0115502003862979e-05, "loss": 1.4451, "mean_token_accuracy": 0.6882471278309822, "num_tokens": 495077197.0, "step": 30720 }, { "epoch": 7.121682697879244, "grad_norm": 1.0019336938858032, "learning_rate": 1.010042977520461e-05, "loss": 1.4435, "mean_token_accuracy": 0.68965103328228, "num_tokens": 495238064.0, "step": 30730 }, { "epoch": 7.124000463553135, "grad_norm": 1.0261828899383545, "learning_rate": 1.0085365940735827e-05, "loss": 1.444, "mean_token_accuracy": 0.6890273153781891, "num_tokens": 495399088.0, "step": 30740 }, { "epoch": 7.126318229227025, "grad_norm": 0.9949498176574707, "learning_rate": 1.0070310508943368e-05, "loss": 1.4362, "mean_token_accuracy": 0.6886647582054138, "num_tokens": 495560377.0, "step": 30750 }, { "epoch": 7.1286359949009155, "grad_norm": 0.9846725463867188, "learning_rate": 1.0055263488309236e-05, "loss": 1.4485, "mean_token_accuracy": 0.6873080730438232, "num_tokens": 495720881.0, "step": 30760 }, { "epoch": 7.130953760574806, "grad_norm": 1.03962242603302, "learning_rate": 1.0040224887310683e-05, "loss": 1.4397, "mean_token_accuracy": 0.6894658893346787, "num_tokens": 495881746.0, "step": 30770 }, { "epoch": 7.133271526248696, "grad_norm": 1.0102128982543945, "learning_rate": 1.0025194714420238e-05, "loss": 1.4444, "mean_token_accuracy": 0.6879579722881317, "num_tokens": 496043329.0, "step": 30780 }, { "epoch": 7.135589291922587, "grad_norm": 1.0145090818405151, "learning_rate": 1.0010172978105661e-05, "loss": 1.4402, "mean_token_accuracy": 0.6898718625307083, "num_tokens": 496203478.0, "step": 30790 }, { "epoch": 7.137907057596477, "grad_norm": 1.0873581171035767, "learning_rate": 9.995159686829972e-06, "loss": 1.4324, "mean_token_accuracy": 0.6895222038030624, "num_tokens": 496365124.0, "step": 30800 }, { "epoch": 7.140224823270367, "grad_norm": 1.0019670724868774, "learning_rate": 9.980154849051426e-06, "loss": 1.4366, "mean_token_accuracy": 0.6897258207201957, "num_tokens": 496525694.0, "step": 30810 }, { "epoch": 7.1425425889442575, "grad_norm": 0.9842937588691711, "learning_rate": 9.965158473223524e-06, "loss": 1.4316, "mean_token_accuracy": 0.6907808214426041, "num_tokens": 496687310.0, "step": 30820 }, { "epoch": 7.144860354618148, "grad_norm": 1.031673550605774, "learning_rate": 9.950170567794988e-06, "loss": 1.4353, "mean_token_accuracy": 0.6891088098287582, "num_tokens": 496848199.0, "step": 30830 }, { "epoch": 7.147178120292039, "grad_norm": 1.0569835901260376, "learning_rate": 9.935191141209782e-06, "loss": 1.4274, "mean_token_accuracy": 0.6902445018291473, "num_tokens": 497009668.0, "step": 30840 }, { "epoch": 7.149495885965929, "grad_norm": 1.0038057565689087, "learning_rate": 9.92022020190708e-06, "loss": 1.4359, "mean_token_accuracy": 0.6900002107024192, "num_tokens": 497171042.0, "step": 30850 }, { "epoch": 7.151813651639819, "grad_norm": 0.9728618264198303, "learning_rate": 9.90525775832128e-06, "loss": 1.4363, "mean_token_accuracy": 0.6899436265230179, "num_tokens": 497332424.0, "step": 30860 }, { "epoch": 7.154131417313709, "grad_norm": 1.0294747352600098, "learning_rate": 9.890303818881996e-06, "loss": 1.433, "mean_token_accuracy": 0.6900658398866654, "num_tokens": 497493393.0, "step": 30870 }, { "epoch": 7.1564491829876, "grad_norm": 0.9878299236297607, "learning_rate": 9.875358392014045e-06, "loss": 1.4362, "mean_token_accuracy": 0.6892552852630616, "num_tokens": 497654244.0, "step": 30880 }, { "epoch": 7.158766948661491, "grad_norm": 0.9671548008918762, "learning_rate": 9.860421486137452e-06, "loss": 1.44, "mean_token_accuracy": 0.6895726516842842, "num_tokens": 497815705.0, "step": 30890 }, { "epoch": 7.161084714335381, "grad_norm": 1.0987701416015625, "learning_rate": 9.845493109667448e-06, "loss": 1.4426, "mean_token_accuracy": 0.6881386786699295, "num_tokens": 497977562.0, "step": 30900 }, { "epoch": 7.163402480009271, "grad_norm": 1.0015140771865845, "learning_rate": 9.830573271014451e-06, "loss": 1.4378, "mean_token_accuracy": 0.6898176014423371, "num_tokens": 498138159.0, "step": 30910 }, { "epoch": 7.165720245683161, "grad_norm": 1.0331001281738281, "learning_rate": 9.815661978584054e-06, "loss": 1.4318, "mean_token_accuracy": 0.689538586139679, "num_tokens": 498300150.0, "step": 30920 }, { "epoch": 7.168038011357051, "grad_norm": 1.0110365152359009, "learning_rate": 9.800759240777063e-06, "loss": 1.4634, "mean_token_accuracy": 0.687342070043087, "num_tokens": 498461333.0, "step": 30930 }, { "epoch": 7.1703557770309425, "grad_norm": 0.9751272797584534, "learning_rate": 9.785865065989452e-06, "loss": 1.4436, "mean_token_accuracy": 0.6871877744793892, "num_tokens": 498623366.0, "step": 30940 }, { "epoch": 7.172673542704833, "grad_norm": 0.9991422891616821, "learning_rate": 9.770979462612368e-06, "loss": 1.4557, "mean_token_accuracy": 0.6876368924975396, "num_tokens": 498784924.0, "step": 30950 }, { "epoch": 7.174991308378723, "grad_norm": 0.9871208667755127, "learning_rate": 9.75610243903213e-06, "loss": 1.4359, "mean_token_accuracy": 0.6893579810857773, "num_tokens": 498946382.0, "step": 30960 }, { "epoch": 7.177309074052613, "grad_norm": 1.0069568157196045, "learning_rate": 9.741234003630231e-06, "loss": 1.4578, "mean_token_accuracy": 0.6860309943556786, "num_tokens": 499107748.0, "step": 30970 }, { "epoch": 7.179626839726503, "grad_norm": 1.0208183526992798, "learning_rate": 9.726374164783317e-06, "loss": 1.443, "mean_token_accuracy": 0.6876033470034599, "num_tokens": 499269034.0, "step": 30980 }, { "epoch": 7.181944605400394, "grad_norm": 1.0021162033081055, "learning_rate": 9.711522930863194e-06, "loss": 1.4488, "mean_token_accuracy": 0.6872418507933616, "num_tokens": 499430165.0, "step": 30990 }, { "epoch": 7.184262371074285, "grad_norm": 1.0171480178833008, "learning_rate": 9.69668031023682e-06, "loss": 1.44, "mean_token_accuracy": 0.6888231351971627, "num_tokens": 499590283.0, "step": 31000 }, { "epoch": 7.186580136748175, "grad_norm": 1.0605463981628418, "learning_rate": 9.681846311266295e-06, "loss": 1.4303, "mean_token_accuracy": 0.690081961452961, "num_tokens": 499751886.0, "step": 31010 }, { "epoch": 7.188897902422065, "grad_norm": 1.0469043254852295, "learning_rate": 9.667020942308874e-06, "loss": 1.4437, "mean_token_accuracy": 0.6879828378558159, "num_tokens": 499912442.0, "step": 31020 }, { "epoch": 7.191215668095955, "grad_norm": 1.0026171207427979, "learning_rate": 9.652204211716939e-06, "loss": 1.4555, "mean_token_accuracy": 0.686031487584114, "num_tokens": 500073030.0, "step": 31030 }, { "epoch": 7.193533433769846, "grad_norm": 1.0509086847305298, "learning_rate": 9.637396127838008e-06, "loss": 1.4463, "mean_token_accuracy": 0.6872867673635483, "num_tokens": 500232827.0, "step": 31040 }, { "epoch": 7.195851199443736, "grad_norm": 1.0694477558135986, "learning_rate": 9.622596699014733e-06, "loss": 1.445, "mean_token_accuracy": 0.6881368279457092, "num_tokens": 500393905.0, "step": 31050 }, { "epoch": 7.198168965117627, "grad_norm": 0.9904961585998535, "learning_rate": 9.607805933584881e-06, "loss": 1.4456, "mean_token_accuracy": 0.6883071482181549, "num_tokens": 500555270.0, "step": 31060 }, { "epoch": 7.200486730791517, "grad_norm": 1.0203797817230225, "learning_rate": 9.593023839881345e-06, "loss": 1.437, "mean_token_accuracy": 0.688243442773819, "num_tokens": 500716527.0, "step": 31070 }, { "epoch": 7.202804496465407, "grad_norm": 0.9583370089530945, "learning_rate": 9.57825042623213e-06, "loss": 1.4474, "mean_token_accuracy": 0.6869601741433143, "num_tokens": 500878433.0, "step": 31080 }, { "epoch": 7.205122262139298, "grad_norm": 1.0274547338485718, "learning_rate": 9.56348570096035e-06, "loss": 1.4239, "mean_token_accuracy": 0.6900291219353676, "num_tokens": 501040197.0, "step": 31090 }, { "epoch": 7.207440027813188, "grad_norm": 1.0054259300231934, "learning_rate": 9.548729672384227e-06, "loss": 1.4344, "mean_token_accuracy": 0.6892176285386086, "num_tokens": 501202127.0, "step": 31100 }, { "epoch": 7.2097577934870785, "grad_norm": 1.0023301839828491, "learning_rate": 9.533982348817078e-06, "loss": 1.4432, "mean_token_accuracy": 0.689396733045578, "num_tokens": 501363090.0, "step": 31110 }, { "epoch": 7.212075559160969, "grad_norm": 1.0165735483169556, "learning_rate": 9.519243738567322e-06, "loss": 1.4601, "mean_token_accuracy": 0.6872800707817077, "num_tokens": 501524956.0, "step": 31120 }, { "epoch": 7.214393324834859, "grad_norm": 1.0188285112380981, "learning_rate": 9.504513849938462e-06, "loss": 1.4358, "mean_token_accuracy": 0.6884884178638458, "num_tokens": 501686437.0, "step": 31130 }, { "epoch": 7.21671109050875, "grad_norm": 0.9576101899147034, "learning_rate": 9.489792691229097e-06, "loss": 1.4625, "mean_token_accuracy": 0.6864946514368058, "num_tokens": 501847465.0, "step": 31140 }, { "epoch": 7.21902885618264, "grad_norm": 1.0364762544631958, "learning_rate": 9.4750802707329e-06, "loss": 1.4324, "mean_token_accuracy": 0.6903828456997871, "num_tokens": 502008482.0, "step": 31150 }, { "epoch": 7.22134662185653, "grad_norm": 1.0085428953170776, "learning_rate": 9.460376596738624e-06, "loss": 1.4497, "mean_token_accuracy": 0.6883071035146713, "num_tokens": 502170061.0, "step": 31160 }, { "epoch": 7.2236643875304205, "grad_norm": 1.014865517616272, "learning_rate": 9.44568167753009e-06, "loss": 1.4541, "mean_token_accuracy": 0.6871026366949081, "num_tokens": 502331621.0, "step": 31170 }, { "epoch": 7.225982153204311, "grad_norm": 1.1413544416427612, "learning_rate": 9.430995521386199e-06, "loss": 1.4544, "mean_token_accuracy": 0.6879636451601983, "num_tokens": 502493128.0, "step": 31180 }, { "epoch": 7.228299918878202, "grad_norm": 1.072137713432312, "learning_rate": 9.416318136580896e-06, "loss": 1.4395, "mean_token_accuracy": 0.6878548979759216, "num_tokens": 502654729.0, "step": 31190 }, { "epoch": 7.230617684552092, "grad_norm": 1.0635238885879517, "learning_rate": 9.401649531383211e-06, "loss": 1.442, "mean_token_accuracy": 0.6887672394514084, "num_tokens": 502815883.0, "step": 31200 }, { "epoch": 7.232935450225982, "grad_norm": 1.0289065837860107, "learning_rate": 9.386989714057187e-06, "loss": 1.4343, "mean_token_accuracy": 0.6888587504625321, "num_tokens": 502976012.0, "step": 31210 }, { "epoch": 7.235253215899872, "grad_norm": 1.0507081747055054, "learning_rate": 9.372338692861957e-06, "loss": 1.4371, "mean_token_accuracy": 0.6887548372149468, "num_tokens": 503137796.0, "step": 31220 }, { "epoch": 7.2375709815737626, "grad_norm": 1.022354006767273, "learning_rate": 9.357696476051677e-06, "loss": 1.4534, "mean_token_accuracy": 0.6860371917486191, "num_tokens": 503298645.0, "step": 31230 }, { "epoch": 7.239888747247654, "grad_norm": 1.0199449062347412, "learning_rate": 9.343063071875549e-06, "loss": 1.4239, "mean_token_accuracy": 0.6936516240239143, "num_tokens": 503458592.0, "step": 31240 }, { "epoch": 7.242206512921544, "grad_norm": 0.9879464507102966, "learning_rate": 9.328438488577809e-06, "loss": 1.4378, "mean_token_accuracy": 0.6893694147467613, "num_tokens": 503619941.0, "step": 31250 }, { "epoch": 7.244524278595434, "grad_norm": 1.0254650115966797, "learning_rate": 9.313822734397724e-06, "loss": 1.4378, "mean_token_accuracy": 0.6897098675370217, "num_tokens": 503781503.0, "step": 31260 }, { "epoch": 7.246842044269324, "grad_norm": 0.9932230114936829, "learning_rate": 9.299215817569582e-06, "loss": 1.4438, "mean_token_accuracy": 0.688132680952549, "num_tokens": 503942398.0, "step": 31270 }, { "epoch": 7.249159809943214, "grad_norm": 1.045911192893982, "learning_rate": 9.284617746322699e-06, "loss": 1.4388, "mean_token_accuracy": 0.688774311542511, "num_tokens": 504103207.0, "step": 31280 }, { "epoch": 7.2514775756171055, "grad_norm": 0.9974359273910522, "learning_rate": 9.270028528881405e-06, "loss": 1.4459, "mean_token_accuracy": 0.6872999221086502, "num_tokens": 504265048.0, "step": 31290 }, { "epoch": 7.253795341290996, "grad_norm": 0.987821102142334, "learning_rate": 9.255448173465042e-06, "loss": 1.447, "mean_token_accuracy": 0.6868783235549927, "num_tokens": 504426416.0, "step": 31300 }, { "epoch": 7.256113106964886, "grad_norm": 1.0000786781311035, "learning_rate": 9.240876688287959e-06, "loss": 1.4533, "mean_token_accuracy": 0.6874597206711769, "num_tokens": 504587734.0, "step": 31310 }, { "epoch": 7.258430872638776, "grad_norm": 1.0313451290130615, "learning_rate": 9.226314081559503e-06, "loss": 1.4425, "mean_token_accuracy": 0.6885128542780876, "num_tokens": 504749465.0, "step": 31320 }, { "epoch": 7.260748638312666, "grad_norm": 1.0305451154708862, "learning_rate": 9.211760361484032e-06, "loss": 1.4372, "mean_token_accuracy": 0.6904575049877166, "num_tokens": 504910526.0, "step": 31330 }, { "epoch": 7.263066403986557, "grad_norm": 1.0642818212509155, "learning_rate": 9.19721553626088e-06, "loss": 1.4541, "mean_token_accuracy": 0.6875181794166565, "num_tokens": 505072243.0, "step": 31340 }, { "epoch": 7.2653841696604475, "grad_norm": 0.9988014101982117, "learning_rate": 9.182679614084382e-06, "loss": 1.4566, "mean_token_accuracy": 0.68729527592659, "num_tokens": 505233072.0, "step": 31350 }, { "epoch": 7.267701935334338, "grad_norm": 1.0567981004714966, "learning_rate": 9.168152603143856e-06, "loss": 1.4438, "mean_token_accuracy": 0.6878857150673866, "num_tokens": 505394076.0, "step": 31360 }, { "epoch": 7.270019701008228, "grad_norm": 1.010799527168274, "learning_rate": 9.153634511623597e-06, "loss": 1.4461, "mean_token_accuracy": 0.6860446989536285, "num_tokens": 505555581.0, "step": 31370 }, { "epoch": 7.272337466682118, "grad_norm": 1.042605996131897, "learning_rate": 9.139125347702873e-06, "loss": 1.4632, "mean_token_accuracy": 0.6854613214731217, "num_tokens": 505716923.0, "step": 31380 }, { "epoch": 7.274655232356009, "grad_norm": 0.9750323295593262, "learning_rate": 9.124625119555926e-06, "loss": 1.4551, "mean_token_accuracy": 0.6868986517190934, "num_tokens": 505878664.0, "step": 31390 }, { "epoch": 7.276972998029899, "grad_norm": 1.0192437171936035, "learning_rate": 9.110133835351961e-06, "loss": 1.4458, "mean_token_accuracy": 0.6885544657707214, "num_tokens": 506039894.0, "step": 31400 }, { "epoch": 7.27929076370379, "grad_norm": 1.0361396074295044, "learning_rate": 9.095651503255148e-06, "loss": 1.4528, "mean_token_accuracy": 0.6851871684193611, "num_tokens": 506201580.0, "step": 31410 }, { "epoch": 7.28160852937768, "grad_norm": 1.006473183631897, "learning_rate": 9.08117813142461e-06, "loss": 1.4331, "mean_token_accuracy": 0.6885660901665688, "num_tokens": 506362828.0, "step": 31420 }, { "epoch": 7.28392629505157, "grad_norm": 1.0315940380096436, "learning_rate": 9.066713728014423e-06, "loss": 1.4309, "mean_token_accuracy": 0.6896597743034363, "num_tokens": 506524600.0, "step": 31430 }, { "epoch": 7.286244060725461, "grad_norm": 1.0231457948684692, "learning_rate": 9.052258301173611e-06, "loss": 1.4532, "mean_token_accuracy": 0.6866271898150444, "num_tokens": 506686049.0, "step": 31440 }, { "epoch": 7.288561826399351, "grad_norm": 1.0108541250228882, "learning_rate": 9.037811859046136e-06, "loss": 1.441, "mean_token_accuracy": 0.6882435590028763, "num_tokens": 506846884.0, "step": 31450 }, { "epoch": 7.290879592073241, "grad_norm": 0.9884301424026489, "learning_rate": 9.023374409770907e-06, "loss": 1.4304, "mean_token_accuracy": 0.6894416347146034, "num_tokens": 507006650.0, "step": 31460 }, { "epoch": 7.293197357747132, "grad_norm": 1.0033371448516846, "learning_rate": 9.008945961481768e-06, "loss": 1.4478, "mean_token_accuracy": 0.6871186807751656, "num_tokens": 507168522.0, "step": 31470 }, { "epoch": 7.295515123421022, "grad_norm": 1.0097182989120483, "learning_rate": 8.994526522307467e-06, "loss": 1.454, "mean_token_accuracy": 0.6875100880861282, "num_tokens": 507329219.0, "step": 31480 }, { "epoch": 7.297832889094913, "grad_norm": 0.9916151165962219, "learning_rate": 8.980116100371708e-06, "loss": 1.4429, "mean_token_accuracy": 0.6889938950538635, "num_tokens": 507490029.0, "step": 31490 }, { "epoch": 7.300150654768803, "grad_norm": 1.0222407579421997, "learning_rate": 8.965714703793099e-06, "loss": 1.4647, "mean_token_accuracy": 0.6852585703134537, "num_tokens": 507650845.0, "step": 31500 }, { "epoch": 7.302468420442693, "grad_norm": 1.0031529664993286, "learning_rate": 8.951322340685168e-06, "loss": 1.4513, "mean_token_accuracy": 0.6883315667510033, "num_tokens": 507811523.0, "step": 31510 }, { "epoch": 7.3047861861165835, "grad_norm": 0.9877418875694275, "learning_rate": 8.936939019156349e-06, "loss": 1.4432, "mean_token_accuracy": 0.689127379655838, "num_tokens": 507972993.0, "step": 31520 }, { "epoch": 7.307103951790474, "grad_norm": 1.0159478187561035, "learning_rate": 8.92256474730999e-06, "loss": 1.4485, "mean_token_accuracy": 0.6877331450581551, "num_tokens": 508134957.0, "step": 31530 }, { "epoch": 7.309421717464365, "grad_norm": 1.0844104290008545, "learning_rate": 8.908199533244326e-06, "loss": 1.441, "mean_token_accuracy": 0.6880430012941361, "num_tokens": 508295210.0, "step": 31540 }, { "epoch": 7.311739483138255, "grad_norm": 1.0369929075241089, "learning_rate": 8.893843385052511e-06, "loss": 1.4361, "mean_token_accuracy": 0.6881285801529884, "num_tokens": 508455715.0, "step": 31550 }, { "epoch": 7.314057248812145, "grad_norm": 1.0725512504577637, "learning_rate": 8.879496310822568e-06, "loss": 1.4412, "mean_token_accuracy": 0.6878151625394822, "num_tokens": 508617921.0, "step": 31560 }, { "epoch": 7.316375014486035, "grad_norm": 1.0245285034179688, "learning_rate": 8.865158318637426e-06, "loss": 1.4326, "mean_token_accuracy": 0.690050458908081, "num_tokens": 508778675.0, "step": 31570 }, { "epoch": 7.3186927801599255, "grad_norm": 1.0752452611923218, "learning_rate": 8.850829416574887e-06, "loss": 1.4382, "mean_token_accuracy": 0.689277409017086, "num_tokens": 508940388.0, "step": 31580 }, { "epoch": 7.321010545833817, "grad_norm": 1.0347281694412231, "learning_rate": 8.836509612707636e-06, "loss": 1.4428, "mean_token_accuracy": 0.6882695034146309, "num_tokens": 509101393.0, "step": 31590 }, { "epoch": 7.323328311507707, "grad_norm": 1.0360352993011475, "learning_rate": 8.82219891510323e-06, "loss": 1.4449, "mean_token_accuracy": 0.6871876314282417, "num_tokens": 509261088.0, "step": 31600 }, { "epoch": 7.325646077181597, "grad_norm": 0.9911429286003113, "learning_rate": 8.8078973318241e-06, "loss": 1.4547, "mean_token_accuracy": 0.6862717658281327, "num_tokens": 509422559.0, "step": 31610 }, { "epoch": 7.327963842855487, "grad_norm": 1.0376567840576172, "learning_rate": 8.793604870927532e-06, "loss": 1.4524, "mean_token_accuracy": 0.6869779676198959, "num_tokens": 509584142.0, "step": 31620 }, { "epoch": 7.330281608529377, "grad_norm": 1.0072051286697388, "learning_rate": 8.779321540465687e-06, "loss": 1.4361, "mean_token_accuracy": 0.6902975991368294, "num_tokens": 509745940.0, "step": 31630 }, { "epoch": 7.3325993742032685, "grad_norm": 1.0567742586135864, "learning_rate": 8.765047348485569e-06, "loss": 1.4504, "mean_token_accuracy": 0.6881549060344696, "num_tokens": 509907208.0, "step": 31640 }, { "epoch": 7.334917139877159, "grad_norm": 1.0607340335845947, "learning_rate": 8.750782303029042e-06, "loss": 1.4384, "mean_token_accuracy": 0.6895036637783051, "num_tokens": 510068209.0, "step": 31650 }, { "epoch": 7.337234905551049, "grad_norm": 0.9730541110038757, "learning_rate": 8.736526412132815e-06, "loss": 1.4427, "mean_token_accuracy": 0.6898528650403023, "num_tokens": 510228878.0, "step": 31660 }, { "epoch": 7.339552671224939, "grad_norm": 1.0047345161437988, "learning_rate": 8.722279683828438e-06, "loss": 1.4427, "mean_token_accuracy": 0.6880295276641846, "num_tokens": 510390732.0, "step": 31670 }, { "epoch": 7.341870436898829, "grad_norm": 1.1261826753616333, "learning_rate": 8.708042126142296e-06, "loss": 1.4399, "mean_token_accuracy": 0.6888741970062255, "num_tokens": 510551874.0, "step": 31680 }, { "epoch": 7.34418820257272, "grad_norm": 1.0499557256698608, "learning_rate": 8.693813747095614e-06, "loss": 1.4564, "mean_token_accuracy": 0.6855880632996559, "num_tokens": 510711302.0, "step": 31690 }, { "epoch": 7.3465059682466105, "grad_norm": 0.9766618609428406, "learning_rate": 8.67959455470444e-06, "loss": 1.4389, "mean_token_accuracy": 0.6894386291503907, "num_tokens": 510872178.0, "step": 31700 }, { "epoch": 7.348823733920501, "grad_norm": 1.0491461753845215, "learning_rate": 8.665384556979653e-06, "loss": 1.4353, "mean_token_accuracy": 0.6906443372368812, "num_tokens": 511034049.0, "step": 31710 }, { "epoch": 7.351141499594391, "grad_norm": 1.0465970039367676, "learning_rate": 8.651183761926949e-06, "loss": 1.4419, "mean_token_accuracy": 0.6903037399053573, "num_tokens": 511195832.0, "step": 31720 }, { "epoch": 7.353459265268281, "grad_norm": 1.0262311697006226, "learning_rate": 8.636992177546832e-06, "loss": 1.4492, "mean_token_accuracy": 0.6890801817178727, "num_tokens": 511357019.0, "step": 31730 }, { "epoch": 7.355777030942171, "grad_norm": 1.0086815357208252, "learning_rate": 8.622809811834626e-06, "loss": 1.4573, "mean_token_accuracy": 0.6871799737215042, "num_tokens": 511518615.0, "step": 31740 }, { "epoch": 7.358094796616062, "grad_norm": 1.0036875009536743, "learning_rate": 8.608636672780463e-06, "loss": 1.4493, "mean_token_accuracy": 0.6884276568889618, "num_tokens": 511678317.0, "step": 31750 }, { "epoch": 7.3604125622899526, "grad_norm": 1.025403380393982, "learning_rate": 8.594472768369266e-06, "loss": 1.4462, "mean_token_accuracy": 0.687311227619648, "num_tokens": 511839460.0, "step": 31760 }, { "epoch": 7.362730327963843, "grad_norm": 0.9978502988815308, "learning_rate": 8.580318106580764e-06, "loss": 1.4503, "mean_token_accuracy": 0.6870240285992623, "num_tokens": 512000663.0, "step": 31770 }, { "epoch": 7.365048093637733, "grad_norm": 1.029289960861206, "learning_rate": 8.56617269538948e-06, "loss": 1.4366, "mean_token_accuracy": 0.6890831381082535, "num_tokens": 512161611.0, "step": 31780 }, { "epoch": 7.367365859311624, "grad_norm": 1.0220807790756226, "learning_rate": 8.552036542764715e-06, "loss": 1.4472, "mean_token_accuracy": 0.6870829433202743, "num_tokens": 512322206.0, "step": 31790 }, { "epoch": 7.369683624985514, "grad_norm": 1.0592591762542725, "learning_rate": 8.537909656670571e-06, "loss": 1.4472, "mean_token_accuracy": 0.6882890820503235, "num_tokens": 512483665.0, "step": 31800 }, { "epoch": 7.372001390659404, "grad_norm": 1.030042290687561, "learning_rate": 8.523792045065906e-06, "loss": 1.4357, "mean_token_accuracy": 0.6888395875692368, "num_tokens": 512645307.0, "step": 31810 }, { "epoch": 7.374319156333295, "grad_norm": 1.0105366706848145, "learning_rate": 8.509683715904379e-06, "loss": 1.447, "mean_token_accuracy": 0.687783706188202, "num_tokens": 512806648.0, "step": 31820 }, { "epoch": 7.376636922007185, "grad_norm": 0.9877455234527588, "learning_rate": 8.495584677134396e-06, "loss": 1.4384, "mean_token_accuracy": 0.6901824966073036, "num_tokens": 512966011.0, "step": 31830 }, { "epoch": 7.378954687681075, "grad_norm": 1.0048744678497314, "learning_rate": 8.481494936699147e-06, "loss": 1.4454, "mean_token_accuracy": 0.6881531417369843, "num_tokens": 513127442.0, "step": 31840 }, { "epoch": 7.381272453354966, "grad_norm": 1.0213381052017212, "learning_rate": 8.467414502536572e-06, "loss": 1.4562, "mean_token_accuracy": 0.6884176462888718, "num_tokens": 513289157.0, "step": 31850 }, { "epoch": 7.383590219028856, "grad_norm": 1.0577824115753174, "learning_rate": 8.453343382579373e-06, "loss": 1.4505, "mean_token_accuracy": 0.6877394929528237, "num_tokens": 513450433.0, "step": 31860 }, { "epoch": 7.3859079847027465, "grad_norm": 0.9695032238960266, "learning_rate": 8.439281584755004e-06, "loss": 1.4525, "mean_token_accuracy": 0.6877537682652474, "num_tokens": 513611946.0, "step": 31870 }, { "epoch": 7.388225750376637, "grad_norm": 1.0674630403518677, "learning_rate": 8.425229116985665e-06, "loss": 1.4439, "mean_token_accuracy": 0.6885508850216866, "num_tokens": 513773421.0, "step": 31880 }, { "epoch": 7.390543516050528, "grad_norm": 1.0830594301223755, "learning_rate": 8.4111859871883e-06, "loss": 1.4448, "mean_token_accuracy": 0.6872540801763535, "num_tokens": 513934603.0, "step": 31890 }, { "epoch": 7.392861281724418, "grad_norm": 1.0326237678527832, "learning_rate": 8.397152203274597e-06, "loss": 1.4423, "mean_token_accuracy": 0.6885855361819267, "num_tokens": 514096204.0, "step": 31900 }, { "epoch": 7.395179047398308, "grad_norm": 1.001453161239624, "learning_rate": 8.383127773150973e-06, "loss": 1.4466, "mean_token_accuracy": 0.6887331917881966, "num_tokens": 514257824.0, "step": 31910 }, { "epoch": 7.397496813072198, "grad_norm": 1.0219908952713013, "learning_rate": 8.369112704718577e-06, "loss": 1.4346, "mean_token_accuracy": 0.6883081331849098, "num_tokens": 514418732.0, "step": 31920 }, { "epoch": 7.3998145787460885, "grad_norm": 1.069950819015503, "learning_rate": 8.35510700587328e-06, "loss": 1.4441, "mean_token_accuracy": 0.6876535221934319, "num_tokens": 514578966.0, "step": 31930 }, { "epoch": 7.402132344419979, "grad_norm": 1.0536203384399414, "learning_rate": 8.341110684505684e-06, "loss": 1.4527, "mean_token_accuracy": 0.6863170221447945, "num_tokens": 514740726.0, "step": 31940 }, { "epoch": 7.40445011009387, "grad_norm": 0.9827163219451904, "learning_rate": 8.327123748501103e-06, "loss": 1.4548, "mean_token_accuracy": 0.6880886167287826, "num_tokens": 514901657.0, "step": 31950 }, { "epoch": 7.40676787576776, "grad_norm": 1.0248984098434448, "learning_rate": 8.313146205739555e-06, "loss": 1.4375, "mean_token_accuracy": 0.6895182773470878, "num_tokens": 515062882.0, "step": 31960 }, { "epoch": 7.40908564144165, "grad_norm": 1.0137073993682861, "learning_rate": 8.299178064095781e-06, "loss": 1.4505, "mean_token_accuracy": 0.6875103771686554, "num_tokens": 515224248.0, "step": 31970 }, { "epoch": 7.41140340711554, "grad_norm": 0.9758316278457642, "learning_rate": 8.285219331439212e-06, "loss": 1.4692, "mean_token_accuracy": 0.6853254854679107, "num_tokens": 515384966.0, "step": 31980 }, { "epoch": 7.413721172789431, "grad_norm": 1.064527988433838, "learning_rate": 8.271270015633988e-06, "loss": 1.4339, "mean_token_accuracy": 0.6892671644687652, "num_tokens": 515546049.0, "step": 31990 }, { "epoch": 7.416038938463322, "grad_norm": 1.0322916507720947, "learning_rate": 8.257330124538948e-06, "loss": 1.4554, "mean_token_accuracy": 0.6852184012532234, "num_tokens": 515707542.0, "step": 32000 }, { "epoch": 7.418356704137212, "grad_norm": 1.0075485706329346, "learning_rate": 8.243399666007592e-06, "loss": 1.438, "mean_token_accuracy": 0.6880797356367111, "num_tokens": 515869417.0, "step": 32010 }, { "epoch": 7.420674469811102, "grad_norm": 1.013810634613037, "learning_rate": 8.229478647888142e-06, "loss": 1.426, "mean_token_accuracy": 0.6917711481451988, "num_tokens": 516030601.0, "step": 32020 }, { "epoch": 7.422992235484992, "grad_norm": 0.9471526145935059, "learning_rate": 8.21556707802348e-06, "loss": 1.4321, "mean_token_accuracy": 0.6899822786450386, "num_tokens": 516192179.0, "step": 32030 }, { "epoch": 7.425310001158882, "grad_norm": 0.9758390784263611, "learning_rate": 8.201664964251177e-06, "loss": 1.4405, "mean_token_accuracy": 0.6882810860872268, "num_tokens": 516353275.0, "step": 32040 }, { "epoch": 7.4276277668327735, "grad_norm": 1.0588722229003906, "learning_rate": 8.18777231440347e-06, "loss": 1.447, "mean_token_accuracy": 0.6874753519892692, "num_tokens": 516514193.0, "step": 32050 }, { "epoch": 7.429945532506664, "grad_norm": 1.0172712802886963, "learning_rate": 8.173889136307259e-06, "loss": 1.4475, "mean_token_accuracy": 0.6884192600846291, "num_tokens": 516676004.0, "step": 32060 }, { "epoch": 7.432263298180554, "grad_norm": 1.03225576877594, "learning_rate": 8.16001543778412e-06, "loss": 1.4453, "mean_token_accuracy": 0.6884449616074562, "num_tokens": 516837180.0, "step": 32070 }, { "epoch": 7.434581063854444, "grad_norm": 1.0302573442459106, "learning_rate": 8.146151226650278e-06, "loss": 1.4438, "mean_token_accuracy": 0.6875146090984344, "num_tokens": 516999251.0, "step": 32080 }, { "epoch": 7.436898829528335, "grad_norm": 0.9796600341796875, "learning_rate": 8.132296510716622e-06, "loss": 1.4438, "mean_token_accuracy": 0.6901736333966255, "num_tokens": 517158316.0, "step": 32090 }, { "epoch": 7.439216595202225, "grad_norm": 1.0791475772857666, "learning_rate": 8.118451297788684e-06, "loss": 1.452, "mean_token_accuracy": 0.6870232164859772, "num_tokens": 517319504.0, "step": 32100 }, { "epoch": 7.4415343608761155, "grad_norm": 1.049209475517273, "learning_rate": 8.104615595666645e-06, "loss": 1.4193, "mean_token_accuracy": 0.6923088386654854, "num_tokens": 517481598.0, "step": 32110 }, { "epoch": 7.443852126550006, "grad_norm": 1.057094693183899, "learning_rate": 8.090789412145328e-06, "loss": 1.4409, "mean_token_accuracy": 0.6880345419049263, "num_tokens": 517642148.0, "step": 32120 }, { "epoch": 7.446169892223896, "grad_norm": 1.0132508277893066, "learning_rate": 8.076972755014189e-06, "loss": 1.4396, "mean_token_accuracy": 0.6889125764369964, "num_tokens": 517802320.0, "step": 32130 }, { "epoch": 7.448487657897786, "grad_norm": 1.0120174884796143, "learning_rate": 8.063165632057337e-06, "loss": 1.4465, "mean_token_accuracy": 0.6868365779519081, "num_tokens": 517963279.0, "step": 32140 }, { "epoch": 7.450805423571677, "grad_norm": 0.9706781506538391, "learning_rate": 8.049368051053471e-06, "loss": 1.4524, "mean_token_accuracy": 0.6865990296006202, "num_tokens": 518123176.0, "step": 32150 }, { "epoch": 7.453123189245567, "grad_norm": 1.0309149026870728, "learning_rate": 8.035580019775946e-06, "loss": 1.4342, "mean_token_accuracy": 0.6890995681285859, "num_tokens": 518284631.0, "step": 32160 }, { "epoch": 7.455440954919458, "grad_norm": 1.0835545063018799, "learning_rate": 8.02180154599273e-06, "loss": 1.4618, "mean_token_accuracy": 0.6859710901975632, "num_tokens": 518446467.0, "step": 32170 }, { "epoch": 7.457758720593348, "grad_norm": 1.007627010345459, "learning_rate": 8.0080326374664e-06, "loss": 1.4369, "mean_token_accuracy": 0.6887131690979004, "num_tokens": 518607983.0, "step": 32180 }, { "epoch": 7.460076486267239, "grad_norm": 1.0832096338272095, "learning_rate": 7.994273301954152e-06, "loss": 1.4398, "mean_token_accuracy": 0.6889218464493752, "num_tokens": 518768623.0, "step": 32190 }, { "epoch": 7.462394251941129, "grad_norm": 1.0164293050765991, "learning_rate": 7.980523547207786e-06, "loss": 1.4391, "mean_token_accuracy": 0.6891108855605126, "num_tokens": 518928431.0, "step": 32200 }, { "epoch": 7.464712017615019, "grad_norm": 1.0178301334381104, "learning_rate": 7.966783380973697e-06, "loss": 1.444, "mean_token_accuracy": 0.6875931903719902, "num_tokens": 519089760.0, "step": 32210 }, { "epoch": 7.467029783288909, "grad_norm": 1.00615656375885, "learning_rate": 7.953052810992887e-06, "loss": 1.4363, "mean_token_accuracy": 0.6890184611082077, "num_tokens": 519250770.0, "step": 32220 }, { "epoch": 7.4693475489628, "grad_norm": 1.0040165185928345, "learning_rate": 7.939331845000953e-06, "loss": 1.4596, "mean_token_accuracy": 0.6858802825212479, "num_tokens": 519411823.0, "step": 32230 }, { "epoch": 7.47166531463669, "grad_norm": 1.0588769912719727, "learning_rate": 7.92562049072807e-06, "loss": 1.4501, "mean_token_accuracy": 0.6872653260827064, "num_tokens": 519573249.0, "step": 32240 }, { "epoch": 7.473983080310581, "grad_norm": 1.0357472896575928, "learning_rate": 7.91191875589901e-06, "loss": 1.4466, "mean_token_accuracy": 0.6867182850837708, "num_tokens": 519734791.0, "step": 32250 }, { "epoch": 7.476300845984471, "grad_norm": 0.978489875793457, "learning_rate": 7.898226648233118e-06, "loss": 1.4345, "mean_token_accuracy": 0.6875841945409775, "num_tokens": 519896076.0, "step": 32260 }, { "epoch": 7.478618611658361, "grad_norm": 1.0290390253067017, "learning_rate": 7.884544175444319e-06, "loss": 1.4349, "mean_token_accuracy": 0.6892690554261207, "num_tokens": 520057282.0, "step": 32270 }, { "epoch": 7.4809363773322515, "grad_norm": 1.0330085754394531, "learning_rate": 7.870871345241107e-06, "loss": 1.4431, "mean_token_accuracy": 0.68798548579216, "num_tokens": 520219180.0, "step": 32280 }, { "epoch": 7.483254143006142, "grad_norm": 1.0363632440567017, "learning_rate": 7.857208165326545e-06, "loss": 1.4593, "mean_token_accuracy": 0.6865054339170455, "num_tokens": 520381045.0, "step": 32290 }, { "epoch": 7.485571908680033, "grad_norm": 1.030999779701233, "learning_rate": 7.843554643398266e-06, "loss": 1.4229, "mean_token_accuracy": 0.6920273497700691, "num_tokens": 520541148.0, "step": 32300 }, { "epoch": 7.487889674353923, "grad_norm": 0.9769495129585266, "learning_rate": 7.829910787148445e-06, "loss": 1.4494, "mean_token_accuracy": 0.6870709925889968, "num_tokens": 520702829.0, "step": 32310 }, { "epoch": 7.490207440027813, "grad_norm": 1.0366933345794678, "learning_rate": 7.816276604263826e-06, "loss": 1.4494, "mean_token_accuracy": 0.6857015773653984, "num_tokens": 520864312.0, "step": 32320 }, { "epoch": 7.492525205701703, "grad_norm": 1.0046875476837158, "learning_rate": 7.8026521024257e-06, "loss": 1.4514, "mean_token_accuracy": 0.6867579638957977, "num_tokens": 521026448.0, "step": 32330 }, { "epoch": 7.4948429713755935, "grad_norm": 1.049505352973938, "learning_rate": 7.7890372893099e-06, "loss": 1.4364, "mean_token_accuracy": 0.6880466923117637, "num_tokens": 521187792.0, "step": 32340 }, { "epoch": 7.497160737049485, "grad_norm": 0.9804846048355103, "learning_rate": 7.775432172586804e-06, "loss": 1.4669, "mean_token_accuracy": 0.6849103614687919, "num_tokens": 521349268.0, "step": 32350 }, { "epoch": 7.499478502723375, "grad_norm": 0.9909638166427612, "learning_rate": 7.761836759921329e-06, "loss": 1.436, "mean_token_accuracy": 0.6895779848098755, "num_tokens": 521511363.0, "step": 32360 }, { "epoch": 7.501796268397265, "grad_norm": 1.0118390321731567, "learning_rate": 7.748251058972918e-06, "loss": 1.4473, "mean_token_accuracy": 0.6879427015781403, "num_tokens": 521672805.0, "step": 32370 }, { "epoch": 7.504114034071155, "grad_norm": 1.0117896795272827, "learning_rate": 7.73467507739555e-06, "loss": 1.4404, "mean_token_accuracy": 0.6877265512943268, "num_tokens": 521834092.0, "step": 32380 }, { "epoch": 7.506431799745046, "grad_norm": 1.0289005041122437, "learning_rate": 7.721108822837725e-06, "loss": 1.4369, "mean_token_accuracy": 0.6869025021791458, "num_tokens": 521994876.0, "step": 32390 }, { "epoch": 7.5087495654189365, "grad_norm": 1.1440712213516235, "learning_rate": 7.707552302942458e-06, "loss": 1.4267, "mean_token_accuracy": 0.6890601590275764, "num_tokens": 522155772.0, "step": 32400 }, { "epoch": 7.511067331092827, "grad_norm": 1.0564872026443481, "learning_rate": 7.694005525347293e-06, "loss": 1.4395, "mean_token_accuracy": 0.6887574568390846, "num_tokens": 522316102.0, "step": 32410 }, { "epoch": 7.513385096766717, "grad_norm": 1.057431697845459, "learning_rate": 7.680468497684267e-06, "loss": 1.4388, "mean_token_accuracy": 0.6882605910301208, "num_tokens": 522477828.0, "step": 32420 }, { "epoch": 7.515702862440607, "grad_norm": 0.9785516858100891, "learning_rate": 7.666941227579943e-06, "loss": 1.4342, "mean_token_accuracy": 0.6891146898269653, "num_tokens": 522638783.0, "step": 32430 }, { "epoch": 7.518020628114497, "grad_norm": 1.0437616109848022, "learning_rate": 7.653423722655373e-06, "loss": 1.4385, "mean_token_accuracy": 0.6871486201882362, "num_tokens": 522800447.0, "step": 32440 }, { "epoch": 7.520338393788388, "grad_norm": 1.0089956521987915, "learning_rate": 7.639915990526114e-06, "loss": 1.4538, "mean_token_accuracy": 0.6871182739734649, "num_tokens": 522962207.0, "step": 32450 }, { "epoch": 7.5226561594622785, "grad_norm": 1.0027110576629639, "learning_rate": 7.626418038802213e-06, "loss": 1.458, "mean_token_accuracy": 0.6866484403610229, "num_tokens": 523123652.0, "step": 32460 }, { "epoch": 7.524973925136169, "grad_norm": 1.0224926471710205, "learning_rate": 7.612929875088212e-06, "loss": 1.4502, "mean_token_accuracy": 0.6878048852086067, "num_tokens": 523285826.0, "step": 32470 }, { "epoch": 7.527291690810059, "grad_norm": 1.0440561771392822, "learning_rate": 7.599451506983132e-06, "loss": 1.4499, "mean_token_accuracy": 0.6881605923175812, "num_tokens": 523446599.0, "step": 32480 }, { "epoch": 7.52960945648395, "grad_norm": 1.0580934286117554, "learning_rate": 7.585982942080486e-06, "loss": 1.4454, "mean_token_accuracy": 0.6872997730970383, "num_tokens": 523608056.0, "step": 32490 }, { "epoch": 7.53192722215784, "grad_norm": 1.0964021682739258, "learning_rate": 7.57252418796825e-06, "loss": 1.4421, "mean_token_accuracy": 0.6875754043459892, "num_tokens": 523769847.0, "step": 32500 }, { "epoch": 7.53424498783173, "grad_norm": 1.0233668088912964, "learning_rate": 7.559075252228881e-06, "loss": 1.4442, "mean_token_accuracy": 0.688002134859562, "num_tokens": 523930941.0, "step": 32510 }, { "epoch": 7.5365627535056205, "grad_norm": 0.972683310508728, "learning_rate": 7.545636142439308e-06, "loss": 1.4408, "mean_token_accuracy": 0.6895653992891312, "num_tokens": 524091587.0, "step": 32520 }, { "epoch": 7.538880519179511, "grad_norm": 0.997475266456604, "learning_rate": 7.532206866170916e-06, "loss": 1.4447, "mean_token_accuracy": 0.68751520216465, "num_tokens": 524252534.0, "step": 32530 }, { "epoch": 7.541198284853401, "grad_norm": 1.0293915271759033, "learning_rate": 7.518787430989552e-06, "loss": 1.4369, "mean_token_accuracy": 0.6880972295999527, "num_tokens": 524414329.0, "step": 32540 }, { "epoch": 7.543516050527292, "grad_norm": 0.9829599261283875, "learning_rate": 7.5053778444555215e-06, "loss": 1.4586, "mean_token_accuracy": 0.6849916845560073, "num_tokens": 524576344.0, "step": 32550 }, { "epoch": 7.545833816201182, "grad_norm": 1.0209779739379883, "learning_rate": 7.4919781141235826e-06, "loss": 1.4247, "mean_token_accuracy": 0.6899746105074882, "num_tokens": 524737937.0, "step": 32560 }, { "epoch": 7.548151581875072, "grad_norm": 1.0575013160705566, "learning_rate": 7.478588247542936e-06, "loss": 1.4415, "mean_token_accuracy": 0.6892987862229347, "num_tokens": 524899032.0, "step": 32570 }, { "epoch": 7.550469347548963, "grad_norm": 1.0007580518722534, "learning_rate": 7.465208252257228e-06, "loss": 1.4458, "mean_token_accuracy": 0.6884526371955871, "num_tokens": 525060218.0, "step": 32580 }, { "epoch": 7.552787113222853, "grad_norm": 1.049758791923523, "learning_rate": 7.451838135804551e-06, "loss": 1.4441, "mean_token_accuracy": 0.6890849322080612, "num_tokens": 525221135.0, "step": 32590 }, { "epoch": 7.555104878896744, "grad_norm": 1.0069292783737183, "learning_rate": 7.438477905717406e-06, "loss": 1.4263, "mean_token_accuracy": 0.6920935302972794, "num_tokens": 525382773.0, "step": 32600 }, { "epoch": 7.557422644570634, "grad_norm": 1.0252472162246704, "learning_rate": 7.425127569522752e-06, "loss": 1.443, "mean_token_accuracy": 0.6871246114373207, "num_tokens": 525544189.0, "step": 32610 }, { "epoch": 7.559740410244524, "grad_norm": 0.9816136360168457, "learning_rate": 7.411787134741963e-06, "loss": 1.4433, "mean_token_accuracy": 0.6880725666880607, "num_tokens": 525705062.0, "step": 32620 }, { "epoch": 7.5620581759184144, "grad_norm": 1.0760810375213623, "learning_rate": 7.398456608890836e-06, "loss": 1.4485, "mean_token_accuracy": 0.6869195133447648, "num_tokens": 525865681.0, "step": 32630 }, { "epoch": 7.564375941592305, "grad_norm": 1.0700560808181763, "learning_rate": 7.385135999479584e-06, "loss": 1.4484, "mean_token_accuracy": 0.6887783542275429, "num_tokens": 526027154.0, "step": 32640 }, { "epoch": 7.566693707266196, "grad_norm": 1.012082815170288, "learning_rate": 7.371825314012837e-06, "loss": 1.4628, "mean_token_accuracy": 0.6850529685616493, "num_tokens": 526188206.0, "step": 32650 }, { "epoch": 7.569011472940086, "grad_norm": 1.0656970739364624, "learning_rate": 7.3585245599896265e-06, "loss": 1.456, "mean_token_accuracy": 0.6863014400005341, "num_tokens": 526348594.0, "step": 32660 }, { "epoch": 7.571329238613976, "grad_norm": 1.0284100770950317, "learning_rate": 7.345233744903399e-06, "loss": 1.4475, "mean_token_accuracy": 0.6882706061005592, "num_tokens": 526510099.0, "step": 32670 }, { "epoch": 7.573647004287866, "grad_norm": 1.0110914707183838, "learning_rate": 7.331952876241993e-06, "loss": 1.4614, "mean_token_accuracy": 0.6867785900831223, "num_tokens": 526671069.0, "step": 32680 }, { "epoch": 7.5759647699617565, "grad_norm": 1.0035020112991333, "learning_rate": 7.3186819614876445e-06, "loss": 1.438, "mean_token_accuracy": 0.6894149482250214, "num_tokens": 526832210.0, "step": 32690 }, { "epoch": 7.578282535635648, "grad_norm": 0.9898139834403992, "learning_rate": 7.305421008116989e-06, "loss": 1.4382, "mean_token_accuracy": 0.6871469855308533, "num_tokens": 526993510.0, "step": 32700 }, { "epoch": 7.580600301309538, "grad_norm": 1.0140835046768188, "learning_rate": 7.292170023601039e-06, "loss": 1.4464, "mean_token_accuracy": 0.6868055030703545, "num_tokens": 527155748.0, "step": 32710 }, { "epoch": 7.582918066983428, "grad_norm": 1.016816258430481, "learning_rate": 7.2789290154051975e-06, "loss": 1.445, "mean_token_accuracy": 0.6889985859394073, "num_tokens": 527317146.0, "step": 32720 }, { "epoch": 7.585235832657318, "grad_norm": 0.9902787208557129, "learning_rate": 7.265697990989248e-06, "loss": 1.4514, "mean_token_accuracy": 0.6858447402715683, "num_tokens": 527478956.0, "step": 32730 }, { "epoch": 7.587553598331208, "grad_norm": 1.0822395086288452, "learning_rate": 7.252476957807347e-06, "loss": 1.449, "mean_token_accuracy": 0.6897020146250725, "num_tokens": 527639903.0, "step": 32740 }, { "epoch": 7.589871364005099, "grad_norm": 1.0342154502868652, "learning_rate": 7.23926592330802e-06, "loss": 1.4465, "mean_token_accuracy": 0.6869777366518974, "num_tokens": 527799956.0, "step": 32750 }, { "epoch": 7.59218912967899, "grad_norm": 1.0060772895812988, "learning_rate": 7.226064894934162e-06, "loss": 1.4292, "mean_token_accuracy": 0.6910192564129829, "num_tokens": 527961598.0, "step": 32760 }, { "epoch": 7.59450689535288, "grad_norm": 1.0777093172073364, "learning_rate": 7.2128738801230274e-06, "loss": 1.4517, "mean_token_accuracy": 0.6878333181142807, "num_tokens": 528123011.0, "step": 32770 }, { "epoch": 7.59682466102677, "grad_norm": 1.0587408542633057, "learning_rate": 7.199692886306239e-06, "loss": 1.4442, "mean_token_accuracy": 0.6884318962693214, "num_tokens": 528282427.0, "step": 32780 }, { "epoch": 7.59914242670066, "grad_norm": 0.985977828502655, "learning_rate": 7.186521920909761e-06, "loss": 1.4339, "mean_token_accuracy": 0.6890422195196152, "num_tokens": 528443778.0, "step": 32790 }, { "epoch": 7.601460192374551, "grad_norm": 0.9973867535591125, "learning_rate": 7.173360991353914e-06, "loss": 1.4452, "mean_token_accuracy": 0.6864807114005089, "num_tokens": 528604064.0, "step": 32800 }, { "epoch": 7.6037779580484415, "grad_norm": 1.034022569656372, "learning_rate": 7.160210105053364e-06, "loss": 1.462, "mean_token_accuracy": 0.6847562104463577, "num_tokens": 528765756.0, "step": 32810 }, { "epoch": 7.606095723722332, "grad_norm": 1.0272555351257324, "learning_rate": 7.147069269417123e-06, "loss": 1.4452, "mean_token_accuracy": 0.6891944780945778, "num_tokens": 528927698.0, "step": 32820 }, { "epoch": 7.608413489396222, "grad_norm": 0.959269642829895, "learning_rate": 7.13393849184853e-06, "loss": 1.4369, "mean_token_accuracy": 0.6875798866152764, "num_tokens": 529089348.0, "step": 32830 }, { "epoch": 7.610731255070112, "grad_norm": 1.030045747756958, "learning_rate": 7.12081777974527e-06, "loss": 1.445, "mean_token_accuracy": 0.6881606787443161, "num_tokens": 529250090.0, "step": 32840 }, { "epoch": 7.613049020744003, "grad_norm": 1.0950100421905518, "learning_rate": 7.107707140499348e-06, "loss": 1.441, "mean_token_accuracy": 0.6867286249995231, "num_tokens": 529411886.0, "step": 32850 }, { "epoch": 7.615366786417893, "grad_norm": 0.9975480437278748, "learning_rate": 7.0946065814970955e-06, "loss": 1.4404, "mean_token_accuracy": 0.6883272513747215, "num_tokens": 529573356.0, "step": 32860 }, { "epoch": 7.6176845520917835, "grad_norm": 1.0264642238616943, "learning_rate": 7.081516110119171e-06, "loss": 1.4568, "mean_token_accuracy": 0.6872819289565086, "num_tokens": 529734208.0, "step": 32870 }, { "epoch": 7.620002317765674, "grad_norm": 1.0034130811691284, "learning_rate": 7.068435733740545e-06, "loss": 1.4283, "mean_token_accuracy": 0.689424441754818, "num_tokens": 529893966.0, "step": 32880 }, { "epoch": 7.622320083439564, "grad_norm": 1.0282976627349854, "learning_rate": 7.055365459730495e-06, "loss": 1.4417, "mean_token_accuracy": 0.6907221555709839, "num_tokens": 530054796.0, "step": 32890 }, { "epoch": 7.624637849113455, "grad_norm": 1.038467288017273, "learning_rate": 7.042305295452623e-06, "loss": 1.4449, "mean_token_accuracy": 0.6887781992554665, "num_tokens": 530216485.0, "step": 32900 }, { "epoch": 7.626955614787345, "grad_norm": 1.0460805892944336, "learning_rate": 7.029255248264816e-06, "loss": 1.4381, "mean_token_accuracy": 0.6875792980194092, "num_tokens": 530377130.0, "step": 32910 }, { "epoch": 7.629273380461235, "grad_norm": 1.0130562782287598, "learning_rate": 7.016215325519276e-06, "loss": 1.4379, "mean_token_accuracy": 0.6894944757223129, "num_tokens": 530537933.0, "step": 32920 }, { "epoch": 7.631591146135126, "grad_norm": 1.0300108194351196, "learning_rate": 7.003185534562504e-06, "loss": 1.442, "mean_token_accuracy": 0.6885079249739647, "num_tokens": 530699061.0, "step": 32930 }, { "epoch": 7.633908911809016, "grad_norm": 1.0456326007843018, "learning_rate": 6.990165882735264e-06, "loss": 1.4564, "mean_token_accuracy": 0.6864947721362114, "num_tokens": 530860232.0, "step": 32940 }, { "epoch": 7.636226677482907, "grad_norm": 1.0437432527542114, "learning_rate": 6.977156377372643e-06, "loss": 1.4413, "mean_token_accuracy": 0.6893554151058197, "num_tokens": 531022363.0, "step": 32950 }, { "epoch": 7.638544443156797, "grad_norm": 1.0019075870513916, "learning_rate": 6.964157025803991e-06, "loss": 1.4332, "mean_token_accuracy": 0.6901695325970649, "num_tokens": 531184139.0, "step": 32960 }, { "epoch": 7.640862208830687, "grad_norm": 1.0328218936920166, "learning_rate": 6.951167835352948e-06, "loss": 1.4375, "mean_token_accuracy": 0.6902770072221756, "num_tokens": 531345409.0, "step": 32970 }, { "epoch": 7.643179974504577, "grad_norm": 1.0006868839263916, "learning_rate": 6.9381888133374205e-06, "loss": 1.4215, "mean_token_accuracy": 0.6908038631081581, "num_tokens": 531507053.0, "step": 32980 }, { "epoch": 7.645497740178468, "grad_norm": 1.0480772256851196, "learning_rate": 6.925219967069596e-06, "loss": 1.4599, "mean_token_accuracy": 0.6853287279605865, "num_tokens": 531667544.0, "step": 32990 }, { "epoch": 7.647815505852359, "grad_norm": 0.9991621971130371, "learning_rate": 6.912261303855919e-06, "loss": 1.4501, "mean_token_accuracy": 0.6889531955122947, "num_tokens": 531828449.0, "step": 33000 }, { "epoch": 7.650133271526249, "grad_norm": 1.0497734546661377, "learning_rate": 6.899312830997104e-06, "loss": 1.447, "mean_token_accuracy": 0.687856824696064, "num_tokens": 531989462.0, "step": 33010 }, { "epoch": 7.652451037200139, "grad_norm": 1.0496587753295898, "learning_rate": 6.886374555788122e-06, "loss": 1.4547, "mean_token_accuracy": 0.6865445896983147, "num_tokens": 532150229.0, "step": 33020 }, { "epoch": 7.654768802874029, "grad_norm": 1.020783543586731, "learning_rate": 6.873446485518198e-06, "loss": 1.4503, "mean_token_accuracy": 0.6862737879157066, "num_tokens": 532310834.0, "step": 33030 }, { "epoch": 7.6570865685479195, "grad_norm": 1.0599312782287598, "learning_rate": 6.860528627470811e-06, "loss": 1.4412, "mean_token_accuracy": 0.6865986406803131, "num_tokens": 532472587.0, "step": 33040 }, { "epoch": 7.6594043342218106, "grad_norm": 1.0954513549804688, "learning_rate": 6.847620988923683e-06, "loss": 1.4385, "mean_token_accuracy": 0.6879061087965965, "num_tokens": 532634257.0, "step": 33050 }, { "epoch": 7.661722099895701, "grad_norm": 0.9940388202667236, "learning_rate": 6.83472357714878e-06, "loss": 1.4367, "mean_token_accuracy": 0.6897659987211228, "num_tokens": 532796362.0, "step": 33060 }, { "epoch": 7.664039865569591, "grad_norm": 1.112136960029602, "learning_rate": 6.821836399412304e-06, "loss": 1.4476, "mean_token_accuracy": 0.687261538207531, "num_tokens": 532957660.0, "step": 33070 }, { "epoch": 7.666357631243481, "grad_norm": 1.0113444328308105, "learning_rate": 6.808959462974699e-06, "loss": 1.4471, "mean_token_accuracy": 0.6882909640669823, "num_tokens": 533119314.0, "step": 33080 }, { "epoch": 7.668675396917371, "grad_norm": 1.060120940208435, "learning_rate": 6.796092775090626e-06, "loss": 1.4475, "mean_token_accuracy": 0.6874174878001214, "num_tokens": 533279649.0, "step": 33090 }, { "epoch": 7.670993162591262, "grad_norm": 1.046044945716858, "learning_rate": 6.783236343008986e-06, "loss": 1.4381, "mean_token_accuracy": 0.6879659995436669, "num_tokens": 533440408.0, "step": 33100 }, { "epoch": 7.673310928265153, "grad_norm": 1.0267163515090942, "learning_rate": 6.770390173972893e-06, "loss": 1.4402, "mean_token_accuracy": 0.6882911175489426, "num_tokens": 533602107.0, "step": 33110 }, { "epoch": 7.675628693939043, "grad_norm": 1.0159523487091064, "learning_rate": 6.757554275219685e-06, "loss": 1.4548, "mean_token_accuracy": 0.6878572970628738, "num_tokens": 533763681.0, "step": 33120 }, { "epoch": 7.677946459612933, "grad_norm": 1.0349351167678833, "learning_rate": 6.744728653980905e-06, "loss": 1.443, "mean_token_accuracy": 0.6881927862763405, "num_tokens": 533925285.0, "step": 33130 }, { "epoch": 7.680264225286823, "grad_norm": 1.0423095226287842, "learning_rate": 6.731913317482319e-06, "loss": 1.4373, "mean_token_accuracy": 0.6916598021984101, "num_tokens": 534085668.0, "step": 33140 }, { "epoch": 7.682581990960714, "grad_norm": 1.0381803512573242, "learning_rate": 6.719108272943883e-06, "loss": 1.4201, "mean_token_accuracy": 0.6901688039302826, "num_tokens": 534247605.0, "step": 33150 }, { "epoch": 7.6848997566346045, "grad_norm": 1.010860800743103, "learning_rate": 6.706313527579777e-06, "loss": 1.4535, "mean_token_accuracy": 0.6868166595697403, "num_tokens": 534409020.0, "step": 33160 }, { "epoch": 7.687217522308495, "grad_norm": 1.0355968475341797, "learning_rate": 6.693529088598347e-06, "loss": 1.4524, "mean_token_accuracy": 0.6873574301600456, "num_tokens": 534569497.0, "step": 33170 }, { "epoch": 7.689535287982385, "grad_norm": 0.9996838569641113, "learning_rate": 6.680754963202157e-06, "loss": 1.4343, "mean_token_accuracy": 0.6898039042949676, "num_tokens": 534730083.0, "step": 33180 }, { "epoch": 7.691853053656275, "grad_norm": 1.0260119438171387, "learning_rate": 6.6679911585879525e-06, "loss": 1.4475, "mean_token_accuracy": 0.686523386836052, "num_tokens": 534890844.0, "step": 33190 }, { "epoch": 7.694170819330166, "grad_norm": 1.1016314029693604, "learning_rate": 6.655237681946669e-06, "loss": 1.4282, "mean_token_accuracy": 0.6911327883601188, "num_tokens": 535050363.0, "step": 33200 }, { "epoch": 7.696488585004056, "grad_norm": 1.0068466663360596, "learning_rate": 6.642494540463417e-06, "loss": 1.446, "mean_token_accuracy": 0.6878154546022415, "num_tokens": 535210144.0, "step": 33210 }, { "epoch": 7.6988063506779465, "grad_norm": 1.0413726568222046, "learning_rate": 6.629761741317489e-06, "loss": 1.4578, "mean_token_accuracy": 0.6854970693588257, "num_tokens": 535371908.0, "step": 33220 }, { "epoch": 7.701124116351837, "grad_norm": 1.0426267385482788, "learning_rate": 6.6170392916823475e-06, "loss": 1.463, "mean_token_accuracy": 0.6855678781867027, "num_tokens": 535532547.0, "step": 33230 }, { "epoch": 7.703441882025727, "grad_norm": 1.028377890586853, "learning_rate": 6.604327198725627e-06, "loss": 1.4262, "mean_token_accuracy": 0.6911369413137436, "num_tokens": 535692824.0, "step": 33240 }, { "epoch": 7.705759647699617, "grad_norm": 1.0137765407562256, "learning_rate": 6.591625469609125e-06, "loss": 1.4476, "mean_token_accuracy": 0.6882524579763413, "num_tokens": 535854204.0, "step": 33250 }, { "epoch": 7.708077413373508, "grad_norm": 1.0141998529434204, "learning_rate": 6.578934111488802e-06, "loss": 1.4485, "mean_token_accuracy": 0.6869852915406227, "num_tokens": 536014781.0, "step": 33260 }, { "epoch": 7.710395179047398, "grad_norm": 1.0213395357131958, "learning_rate": 6.566253131514774e-06, "loss": 1.4446, "mean_token_accuracy": 0.6870436146855354, "num_tokens": 536173495.0, "step": 33270 }, { "epoch": 7.7127129447212885, "grad_norm": 0.9997633695602417, "learning_rate": 6.55358253683131e-06, "loss": 1.4451, "mean_token_accuracy": 0.687407548725605, "num_tokens": 536335141.0, "step": 33280 }, { "epoch": 7.715030710395179, "grad_norm": 1.0168501138687134, "learning_rate": 6.54092233457683e-06, "loss": 1.4321, "mean_token_accuracy": 0.6901274710893631, "num_tokens": 536496521.0, "step": 33290 }, { "epoch": 7.71734847606907, "grad_norm": 1.0327699184417725, "learning_rate": 6.528272531883894e-06, "loss": 1.4473, "mean_token_accuracy": 0.6868635848164558, "num_tokens": 536658548.0, "step": 33300 }, { "epoch": 7.71966624174296, "grad_norm": 1.0249278545379639, "learning_rate": 6.515633135879215e-06, "loss": 1.4272, "mean_token_accuracy": 0.6902917191386223, "num_tokens": 536819313.0, "step": 33310 }, { "epoch": 7.72198400741685, "grad_norm": 1.0512721538543701, "learning_rate": 6.503004153683631e-06, "loss": 1.4411, "mean_token_accuracy": 0.690236383676529, "num_tokens": 536980881.0, "step": 33320 }, { "epoch": 7.72430177309074, "grad_norm": 1.0262107849121094, "learning_rate": 6.490385592412115e-06, "loss": 1.4315, "mean_token_accuracy": 0.689617569744587, "num_tokens": 537140868.0, "step": 33330 }, { "epoch": 7.726619538764631, "grad_norm": 1.0175546407699585, "learning_rate": 6.477777459173775e-06, "loss": 1.4416, "mean_token_accuracy": 0.6892708271741868, "num_tokens": 537302913.0, "step": 33340 }, { "epoch": 7.728937304438521, "grad_norm": 1.0235161781311035, "learning_rate": 6.465179761071838e-06, "loss": 1.4573, "mean_token_accuracy": 0.6879903838038445, "num_tokens": 537463503.0, "step": 33350 }, { "epoch": 7.731255070112412, "grad_norm": 1.073657751083374, "learning_rate": 6.452592505203653e-06, "loss": 1.4531, "mean_token_accuracy": 0.6882041350007058, "num_tokens": 537624928.0, "step": 33360 }, { "epoch": 7.733572835786302, "grad_norm": 1.044343113899231, "learning_rate": 6.440015698660687e-06, "loss": 1.4355, "mean_token_accuracy": 0.6875660166144371, "num_tokens": 537786440.0, "step": 33370 }, { "epoch": 7.735890601460192, "grad_norm": 0.9898009300231934, "learning_rate": 6.427449348528522e-06, "loss": 1.4337, "mean_token_accuracy": 0.6873521238565445, "num_tokens": 537947946.0, "step": 33380 }, { "epoch": 7.738208367134082, "grad_norm": 0.9862487316131592, "learning_rate": 6.414893461886848e-06, "loss": 1.4286, "mean_token_accuracy": 0.6904676556587219, "num_tokens": 538108756.0, "step": 33390 }, { "epoch": 7.7405261328079735, "grad_norm": 1.0116355419158936, "learning_rate": 6.402348045809459e-06, "loss": 1.4346, "mean_token_accuracy": 0.689212080836296, "num_tokens": 538270535.0, "step": 33400 }, { "epoch": 7.742843898481864, "grad_norm": 1.0455520153045654, "learning_rate": 6.389813107364245e-06, "loss": 1.4498, "mean_token_accuracy": 0.6872003242373467, "num_tokens": 538430740.0, "step": 33410 }, { "epoch": 7.745161664155754, "grad_norm": 1.070708155632019, "learning_rate": 6.377288653613206e-06, "loss": 1.4174, "mean_token_accuracy": 0.6926845744252205, "num_tokens": 538592092.0, "step": 33420 }, { "epoch": 7.747479429829644, "grad_norm": 1.0295727252960205, "learning_rate": 6.364774691612421e-06, "loss": 1.4478, "mean_token_accuracy": 0.6872161626815796, "num_tokens": 538753795.0, "step": 33430 }, { "epoch": 7.749797195503534, "grad_norm": 0.9756242036819458, "learning_rate": 6.352271228412068e-06, "loss": 1.4368, "mean_token_accuracy": 0.6895528793334961, "num_tokens": 538915517.0, "step": 33440 }, { "epoch": 7.7521149611774245, "grad_norm": 0.9890099763870239, "learning_rate": 6.339778271056407e-06, "loss": 1.447, "mean_token_accuracy": 0.6889455273747445, "num_tokens": 539076852.0, "step": 33450 }, { "epoch": 7.754432726851316, "grad_norm": 1.0192726850509644, "learning_rate": 6.327295826583776e-06, "loss": 1.4366, "mean_token_accuracy": 0.689868001639843, "num_tokens": 539238088.0, "step": 33460 }, { "epoch": 7.756750492525206, "grad_norm": 1.052350640296936, "learning_rate": 6.314823902026596e-06, "loss": 1.4619, "mean_token_accuracy": 0.6853627994656563, "num_tokens": 539399788.0, "step": 33470 }, { "epoch": 7.759068258199096, "grad_norm": 1.0409854650497437, "learning_rate": 6.302362504411355e-06, "loss": 1.4515, "mean_token_accuracy": 0.690156315267086, "num_tokens": 539558188.0, "step": 33480 }, { "epoch": 7.761386023872986, "grad_norm": 1.0647883415222168, "learning_rate": 6.289911640758614e-06, "loss": 1.4394, "mean_token_accuracy": 0.6878943413496017, "num_tokens": 539718167.0, "step": 33490 }, { "epoch": 7.763703789546877, "grad_norm": 1.0374475717544556, "learning_rate": 6.277471318083e-06, "loss": 1.4517, "mean_token_accuracy": 0.6872158050537109, "num_tokens": 539879359.0, "step": 33500 }, { "epoch": 7.766021555220767, "grad_norm": 1.0703191757202148, "learning_rate": 6.265041543393202e-06, "loss": 1.4599, "mean_token_accuracy": 0.6863290131092071, "num_tokens": 540039622.0, "step": 33510 }, { "epoch": 7.768339320894658, "grad_norm": 1.0878018140792847, "learning_rate": 6.252622323691961e-06, "loss": 1.4465, "mean_token_accuracy": 0.6877007707953453, "num_tokens": 540200618.0, "step": 33520 }, { "epoch": 7.770657086568548, "grad_norm": 0.9884047508239746, "learning_rate": 6.24021366597608e-06, "loss": 1.4265, "mean_token_accuracy": 0.6911189645528794, "num_tokens": 540361729.0, "step": 33530 }, { "epoch": 7.772974852242438, "grad_norm": 0.9725205302238464, "learning_rate": 6.227815577236401e-06, "loss": 1.4349, "mean_token_accuracy": 0.6890500113368034, "num_tokens": 540523122.0, "step": 33540 }, { "epoch": 7.775292617916328, "grad_norm": 0.9803812503814697, "learning_rate": 6.2154280644578204e-06, "loss": 1.4384, "mean_token_accuracy": 0.6876339808106422, "num_tokens": 540684751.0, "step": 33550 }, { "epoch": 7.777610383590219, "grad_norm": 1.0458061695098877, "learning_rate": 6.2030511346192725e-06, "loss": 1.4451, "mean_token_accuracy": 0.6869767114520073, "num_tokens": 540844810.0, "step": 33560 }, { "epoch": 7.7799281492641095, "grad_norm": 1.0250868797302246, "learning_rate": 6.19068479469373e-06, "loss": 1.4319, "mean_token_accuracy": 0.6906542524695396, "num_tokens": 541005976.0, "step": 33570 }, { "epoch": 7.782245914938, "grad_norm": 1.0342904329299927, "learning_rate": 6.178329051648199e-06, "loss": 1.4431, "mean_token_accuracy": 0.6888828679919243, "num_tokens": 541167013.0, "step": 33580 }, { "epoch": 7.78456368061189, "grad_norm": 1.002210259437561, "learning_rate": 6.165983912443715e-06, "loss": 1.4367, "mean_token_accuracy": 0.6886719420552254, "num_tokens": 541327935.0, "step": 33590 }, { "epoch": 7.786881446285781, "grad_norm": 1.0117205381393433, "learning_rate": 6.153649384035343e-06, "loss": 1.4523, "mean_token_accuracy": 0.687437105178833, "num_tokens": 541489452.0, "step": 33600 }, { "epoch": 7.789199211959671, "grad_norm": 1.0161348581314087, "learning_rate": 6.141325473372167e-06, "loss": 1.449, "mean_token_accuracy": 0.6877579838037491, "num_tokens": 541650707.0, "step": 33610 }, { "epoch": 7.791516977633561, "grad_norm": 1.0038700103759766, "learning_rate": 6.129012187397284e-06, "loss": 1.4446, "mean_token_accuracy": 0.6885146558284759, "num_tokens": 541812185.0, "step": 33620 }, { "epoch": 7.7938347433074515, "grad_norm": 1.0240920782089233, "learning_rate": 6.116709533047818e-06, "loss": 1.4244, "mean_token_accuracy": 0.6905678436160088, "num_tokens": 541973154.0, "step": 33630 }, { "epoch": 7.796152508981342, "grad_norm": 0.9909100532531738, "learning_rate": 6.104417517254893e-06, "loss": 1.452, "mean_token_accuracy": 0.6887642651796341, "num_tokens": 542134988.0, "step": 33640 }, { "epoch": 7.798470274655232, "grad_norm": 1.0277833938598633, "learning_rate": 6.092136146943641e-06, "loss": 1.4473, "mean_token_accuracy": 0.6885185688734055, "num_tokens": 542296604.0, "step": 33650 }, { "epoch": 7.800788040329123, "grad_norm": 0.9978247284889221, "learning_rate": 6.079865429033199e-06, "loss": 1.4396, "mean_token_accuracy": 0.6875937357544899, "num_tokens": 542457707.0, "step": 33660 }, { "epoch": 7.803105806003013, "grad_norm": 1.0142273902893066, "learning_rate": 6.067605370436702e-06, "loss": 1.4417, "mean_token_accuracy": 0.6865286499261856, "num_tokens": 542619460.0, "step": 33670 }, { "epoch": 7.805423571676903, "grad_norm": 1.0209680795669556, "learning_rate": 6.055355978061278e-06, "loss": 1.4462, "mean_token_accuracy": 0.6889600947499275, "num_tokens": 542780620.0, "step": 33680 }, { "epoch": 7.807741337350794, "grad_norm": 0.989974319934845, "learning_rate": 6.043117258808048e-06, "loss": 1.4442, "mean_token_accuracy": 0.6879233345389366, "num_tokens": 542941068.0, "step": 33690 }, { "epoch": 7.810059103024685, "grad_norm": 1.009827971458435, "learning_rate": 6.030889219572119e-06, "loss": 1.4371, "mean_token_accuracy": 0.6878628149628639, "num_tokens": 543102697.0, "step": 33700 }, { "epoch": 7.812376868698575, "grad_norm": 0.9949890971183777, "learning_rate": 6.018671867242581e-06, "loss": 1.426, "mean_token_accuracy": 0.6904384851455688, "num_tokens": 543264710.0, "step": 33710 }, { "epoch": 7.814694634372465, "grad_norm": 1.0298362970352173, "learning_rate": 6.0064652087025105e-06, "loss": 1.4455, "mean_token_accuracy": 0.6872871145606041, "num_tokens": 543425789.0, "step": 33720 }, { "epoch": 7.817012400046355, "grad_norm": 1.0048683881759644, "learning_rate": 5.994269250828935e-06, "loss": 1.4478, "mean_token_accuracy": 0.688794755935669, "num_tokens": 543587451.0, "step": 33730 }, { "epoch": 7.819330165720245, "grad_norm": 1.0220130681991577, "learning_rate": 5.9820840004928794e-06, "loss": 1.4424, "mean_token_accuracy": 0.6880957841873169, "num_tokens": 543748025.0, "step": 33740 }, { "epoch": 7.821647931394136, "grad_norm": 1.0274059772491455, "learning_rate": 5.969909464559329e-06, "loss": 1.4466, "mean_token_accuracy": 0.6870011746883392, "num_tokens": 543909507.0, "step": 33750 }, { "epoch": 7.823965697068027, "grad_norm": 1.0301843881607056, "learning_rate": 5.957745649887228e-06, "loss": 1.4462, "mean_token_accuracy": 0.6870789915323258, "num_tokens": 544071158.0, "step": 33760 }, { "epoch": 7.826283462741917, "grad_norm": 0.9701120853424072, "learning_rate": 5.945592563329483e-06, "loss": 1.4364, "mean_token_accuracy": 0.6875440135598183, "num_tokens": 544232212.0, "step": 33770 }, { "epoch": 7.828601228415807, "grad_norm": 1.0088708400726318, "learning_rate": 5.93345021173296e-06, "loss": 1.4402, "mean_token_accuracy": 0.6882127910852432, "num_tokens": 544393350.0, "step": 33780 }, { "epoch": 7.830918994089697, "grad_norm": 0.9809911847114563, "learning_rate": 5.921318601938469e-06, "loss": 1.4485, "mean_token_accuracy": 0.688425499200821, "num_tokens": 544553675.0, "step": 33790 }, { "epoch": 7.833236759763588, "grad_norm": 1.0555365085601807, "learning_rate": 5.909197740780778e-06, "loss": 1.4515, "mean_token_accuracy": 0.6865433111786843, "num_tokens": 544715631.0, "step": 33800 }, { "epoch": 7.8355545254374785, "grad_norm": 1.0049537420272827, "learning_rate": 5.897087635088591e-06, "loss": 1.4338, "mean_token_accuracy": 0.6902094140648842, "num_tokens": 544877977.0, "step": 33810 }, { "epoch": 7.837872291111369, "grad_norm": 1.0344187021255493, "learning_rate": 5.88498829168456e-06, "loss": 1.442, "mean_token_accuracy": 0.689549820125103, "num_tokens": 545039129.0, "step": 33820 }, { "epoch": 7.840190056785259, "grad_norm": 1.0444834232330322, "learning_rate": 5.8728997173852635e-06, "loss": 1.4394, "mean_token_accuracy": 0.6879210561513901, "num_tokens": 545200661.0, "step": 33830 }, { "epoch": 7.842507822459149, "grad_norm": 0.9895975589752197, "learning_rate": 5.860821919001225e-06, "loss": 1.4667, "mean_token_accuracy": 0.684870408475399, "num_tokens": 545360843.0, "step": 33840 }, { "epoch": 7.844825588133039, "grad_norm": 0.9956210255622864, "learning_rate": 5.848754903336889e-06, "loss": 1.4431, "mean_token_accuracy": 0.6889991745352745, "num_tokens": 545522520.0, "step": 33850 }, { "epoch": 7.84714335380693, "grad_norm": 0.9874727725982666, "learning_rate": 5.8366986771906245e-06, "loss": 1.4427, "mean_token_accuracy": 0.6905592009425163, "num_tokens": 545683728.0, "step": 33860 }, { "epoch": 7.849461119480821, "grad_norm": 1.0639824867248535, "learning_rate": 5.824653247354736e-06, "loss": 1.4396, "mean_token_accuracy": 0.687761552631855, "num_tokens": 545844471.0, "step": 33870 }, { "epoch": 7.851778885154711, "grad_norm": 0.9649064540863037, "learning_rate": 5.812618620615418e-06, "loss": 1.4488, "mean_token_accuracy": 0.6883160322904587, "num_tokens": 546005996.0, "step": 33880 }, { "epoch": 7.854096650828601, "grad_norm": 1.0587027072906494, "learning_rate": 5.800594803752801e-06, "loss": 1.457, "mean_token_accuracy": 0.6861724123358727, "num_tokens": 546166575.0, "step": 33890 }, { "epoch": 7.856414416502492, "grad_norm": 0.9897339940071106, "learning_rate": 5.788581803540919e-06, "loss": 1.4341, "mean_token_accuracy": 0.6896589457988739, "num_tokens": 546328828.0, "step": 33900 }, { "epoch": 7.858732182176382, "grad_norm": 1.105193018913269, "learning_rate": 5.776579626747716e-06, "loss": 1.4556, "mean_token_accuracy": 0.6876937210559845, "num_tokens": 546489947.0, "step": 33910 }, { "epoch": 7.8610499478502724, "grad_norm": 1.0040106773376465, "learning_rate": 5.76458828013503e-06, "loss": 1.4445, "mean_token_accuracy": 0.6874695315957069, "num_tokens": 546651951.0, "step": 33920 }, { "epoch": 7.863367713524163, "grad_norm": 1.0448986291885376, "learning_rate": 5.752607770458604e-06, "loss": 1.4532, "mean_token_accuracy": 0.6870597630739212, "num_tokens": 546813907.0, "step": 33930 }, { "epoch": 7.865685479198053, "grad_norm": 1.0252423286437988, "learning_rate": 5.740638104468074e-06, "loss": 1.4461, "mean_token_accuracy": 0.6878797337412834, "num_tokens": 546975688.0, "step": 33940 }, { "epoch": 7.868003244871943, "grad_norm": 0.9894943833351135, "learning_rate": 5.728679288906968e-06, "loss": 1.4262, "mean_token_accuracy": 0.6919299602508545, "num_tokens": 547136524.0, "step": 33950 }, { "epoch": 7.870321010545834, "grad_norm": 1.05995512008667, "learning_rate": 5.716731330512695e-06, "loss": 1.4459, "mean_token_accuracy": 0.6866505742073059, "num_tokens": 547298446.0, "step": 33960 }, { "epoch": 7.872638776219724, "grad_norm": 1.040144681930542, "learning_rate": 5.704794236016556e-06, "loss": 1.4467, "mean_token_accuracy": 0.6870525285601616, "num_tokens": 547460664.0, "step": 33970 }, { "epoch": 7.8749565418936145, "grad_norm": 1.0106217861175537, "learning_rate": 5.6928680121437234e-06, "loss": 1.4474, "mean_token_accuracy": 0.6861162111163139, "num_tokens": 547621781.0, "step": 33980 }, { "epoch": 7.877274307567505, "grad_norm": 1.051658272743225, "learning_rate": 5.680952665613251e-06, "loss": 1.4238, "mean_token_accuracy": 0.6908125445246697, "num_tokens": 547783210.0, "step": 33990 }, { "epoch": 7.879592073241396, "grad_norm": 1.011995553970337, "learning_rate": 5.669048203138064e-06, "loss": 1.4324, "mean_token_accuracy": 0.69104383289814, "num_tokens": 547945009.0, "step": 34000 }, { "epoch": 7.881909838915286, "grad_norm": 0.9977952837944031, "learning_rate": 5.657154631424949e-06, "loss": 1.4237, "mean_token_accuracy": 0.6922805964946747, "num_tokens": 548105175.0, "step": 34010 }, { "epoch": 7.884227604589176, "grad_norm": 1.0663936138153076, "learning_rate": 5.645271957174569e-06, "loss": 1.4213, "mean_token_accuracy": 0.6901376500725747, "num_tokens": 548265928.0, "step": 34020 }, { "epoch": 7.886545370263066, "grad_norm": 1.0549746751785278, "learning_rate": 5.633400187081436e-06, "loss": 1.4311, "mean_token_accuracy": 0.6894985169172287, "num_tokens": 548426719.0, "step": 34030 }, { "epoch": 7.8888631359369565, "grad_norm": 1.0674878358840942, "learning_rate": 5.621539327833925e-06, "loss": 1.4386, "mean_token_accuracy": 0.6885993674397468, "num_tokens": 548588367.0, "step": 34040 }, { "epoch": 7.891180901610847, "grad_norm": 1.0180392265319824, "learning_rate": 5.6096893861142614e-06, "loss": 1.4389, "mean_token_accuracy": 0.6898083120584488, "num_tokens": 548749596.0, "step": 34050 }, { "epoch": 7.893498667284738, "grad_norm": 0.9920876026153564, "learning_rate": 5.5978503685985226e-06, "loss": 1.4377, "mean_token_accuracy": 0.6900687038898468, "num_tokens": 548910112.0, "step": 34060 }, { "epoch": 7.895816432958628, "grad_norm": 0.9768338799476624, "learning_rate": 5.5860222819566265e-06, "loss": 1.4338, "mean_token_accuracy": 0.6900445997714997, "num_tokens": 549071678.0, "step": 34070 }, { "epoch": 7.898134198632518, "grad_norm": 1.0090728998184204, "learning_rate": 5.574205132852339e-06, "loss": 1.4388, "mean_token_accuracy": 0.6884642347693444, "num_tokens": 549232929.0, "step": 34080 }, { "epoch": 7.900451964306408, "grad_norm": 1.0187740325927734, "learning_rate": 5.5623989279432605e-06, "loss": 1.4565, "mean_token_accuracy": 0.6873713433742523, "num_tokens": 549394327.0, "step": 34090 }, { "epoch": 7.9027697299802995, "grad_norm": 1.0718978643417358, "learning_rate": 5.550603673880824e-06, "loss": 1.4406, "mean_token_accuracy": 0.687862603366375, "num_tokens": 549555991.0, "step": 34100 }, { "epoch": 7.90508749565419, "grad_norm": 1.0110739469528198, "learning_rate": 5.5388193773102975e-06, "loss": 1.4445, "mean_token_accuracy": 0.6870573133230209, "num_tokens": 549717938.0, "step": 34110 }, { "epoch": 7.90740526132808, "grad_norm": 1.0297942161560059, "learning_rate": 5.5270460448707705e-06, "loss": 1.4249, "mean_token_accuracy": 0.6910751909017563, "num_tokens": 549879193.0, "step": 34120 }, { "epoch": 7.90972302700197, "grad_norm": 1.0217798948287964, "learning_rate": 5.51528368319516e-06, "loss": 1.4492, "mean_token_accuracy": 0.6885127335786819, "num_tokens": 550040106.0, "step": 34130 }, { "epoch": 7.91204079267586, "grad_norm": 1.019109845161438, "learning_rate": 5.5035322989101995e-06, "loss": 1.4316, "mean_token_accuracy": 0.689945611357689, "num_tokens": 550201674.0, "step": 34140 }, { "epoch": 7.91435855834975, "grad_norm": 1.0685582160949707, "learning_rate": 5.491791898636439e-06, "loss": 1.441, "mean_token_accuracy": 0.6890279024839401, "num_tokens": 550362501.0, "step": 34150 }, { "epoch": 7.9166763240236415, "grad_norm": 1.0208868980407715, "learning_rate": 5.4800624889882415e-06, "loss": 1.4273, "mean_token_accuracy": 0.6893036291003227, "num_tokens": 550523644.0, "step": 34160 }, { "epoch": 7.918994089697532, "grad_norm": 1.062898874282837, "learning_rate": 5.468344076573775e-06, "loss": 1.4354, "mean_token_accuracy": 0.6901079207658768, "num_tokens": 550685303.0, "step": 34170 }, { "epoch": 7.921311855371422, "grad_norm": 1.0025897026062012, "learning_rate": 5.4566366679950135e-06, "loss": 1.4351, "mean_token_accuracy": 0.6888181149959565, "num_tokens": 550847169.0, "step": 34180 }, { "epoch": 7.923629621045312, "grad_norm": 1.013969898223877, "learning_rate": 5.444940269847731e-06, "loss": 1.4383, "mean_token_accuracy": 0.6899235799908638, "num_tokens": 551008316.0, "step": 34190 }, { "epoch": 7.925947386719203, "grad_norm": 0.9529881477355957, "learning_rate": 5.433254888721501e-06, "loss": 1.4459, "mean_token_accuracy": 0.6872092321515083, "num_tokens": 551169242.0, "step": 34200 }, { "epoch": 7.928265152393093, "grad_norm": 1.035365343093872, "learning_rate": 5.421580531199688e-06, "loss": 1.4337, "mean_token_accuracy": 0.6893705412745476, "num_tokens": 551330059.0, "step": 34210 }, { "epoch": 7.930582918066984, "grad_norm": 1.043168544769287, "learning_rate": 5.409917203859444e-06, "loss": 1.4495, "mean_token_accuracy": 0.6869211912155151, "num_tokens": 551491538.0, "step": 34220 }, { "epoch": 7.932900683740874, "grad_norm": 0.9898838400840759, "learning_rate": 5.398264913271711e-06, "loss": 1.4414, "mean_token_accuracy": 0.6885412201285362, "num_tokens": 551652411.0, "step": 34230 }, { "epoch": 7.935218449414764, "grad_norm": 1.0194792747497559, "learning_rate": 5.3866236660012116e-06, "loss": 1.4367, "mean_token_accuracy": 0.6891609832644463, "num_tokens": 551814292.0, "step": 34240 }, { "epoch": 7.937536215088654, "grad_norm": 1.0773563385009766, "learning_rate": 5.374993468606446e-06, "loss": 1.4404, "mean_token_accuracy": 0.6889613181352615, "num_tokens": 551975687.0, "step": 34250 }, { "epoch": 7.939853980762545, "grad_norm": 0.9966750144958496, "learning_rate": 5.3633743276396834e-06, "loss": 1.4479, "mean_token_accuracy": 0.6877866953611373, "num_tokens": 552137269.0, "step": 34260 }, { "epoch": 7.942171746436435, "grad_norm": 1.0101598501205444, "learning_rate": 5.351766249646978e-06, "loss": 1.451, "mean_token_accuracy": 0.6871252551674842, "num_tokens": 552299052.0, "step": 34270 }, { "epoch": 7.944489512110326, "grad_norm": 1.0725735425949097, "learning_rate": 5.340169241168147e-06, "loss": 1.4412, "mean_token_accuracy": 0.6883417084813118, "num_tokens": 552460471.0, "step": 34280 }, { "epoch": 7.946807277784216, "grad_norm": 1.0635759830474854, "learning_rate": 5.328583308736751e-06, "loss": 1.4508, "mean_token_accuracy": 0.6883015990257263, "num_tokens": 552620738.0, "step": 34290 }, { "epoch": 7.949125043458106, "grad_norm": 0.9784599542617798, "learning_rate": 5.317008458880138e-06, "loss": 1.4439, "mean_token_accuracy": 0.6875757500529289, "num_tokens": 552781889.0, "step": 34300 }, { "epoch": 7.951442809131997, "grad_norm": 1.0118101835250854, "learning_rate": 5.3054446981194e-06, "loss": 1.4572, "mean_token_accuracy": 0.6862839907407761, "num_tokens": 552943051.0, "step": 34310 }, { "epoch": 7.953760574805887, "grad_norm": 1.0790315866470337, "learning_rate": 5.293892032969378e-06, "loss": 1.4331, "mean_token_accuracy": 0.6894345089793206, "num_tokens": 553104887.0, "step": 34320 }, { "epoch": 7.9560783404797775, "grad_norm": 1.0655386447906494, "learning_rate": 5.282350469938671e-06, "loss": 1.4375, "mean_token_accuracy": 0.6890939757227897, "num_tokens": 553266445.0, "step": 34330 }, { "epoch": 7.958396106153668, "grad_norm": 1.0598725080490112, "learning_rate": 5.270820015529617e-06, "loss": 1.4501, "mean_token_accuracy": 0.6875252351164818, "num_tokens": 553426740.0, "step": 34340 }, { "epoch": 7.960713871827558, "grad_norm": 1.0226889848709106, "learning_rate": 5.259300676238302e-06, "loss": 1.4505, "mean_token_accuracy": 0.6880502969026565, "num_tokens": 553587974.0, "step": 34350 }, { "epoch": 7.963031637501449, "grad_norm": 1.0150314569473267, "learning_rate": 5.2477924585545376e-06, "loss": 1.4369, "mean_token_accuracy": 0.6885888010263443, "num_tokens": 553748664.0, "step": 34360 }, { "epoch": 7.965349403175339, "grad_norm": 1.0064842700958252, "learning_rate": 5.2362953689618824e-06, "loss": 1.4413, "mean_token_accuracy": 0.689802248775959, "num_tokens": 553910119.0, "step": 34370 }, { "epoch": 7.967667168849229, "grad_norm": 1.03987717628479, "learning_rate": 5.22480941393762e-06, "loss": 1.4482, "mean_token_accuracy": 0.6890187531709671, "num_tokens": 554071239.0, "step": 34380 }, { "epoch": 7.9699849345231195, "grad_norm": 0.9984422922134399, "learning_rate": 5.2133345999527625e-06, "loss": 1.4374, "mean_token_accuracy": 0.6888560846447944, "num_tokens": 554232953.0, "step": 34390 }, { "epoch": 7.97230270019701, "grad_norm": 1.0596919059753418, "learning_rate": 5.201870933472039e-06, "loss": 1.45, "mean_token_accuracy": 0.6862125650048256, "num_tokens": 554393584.0, "step": 34400 }, { "epoch": 7.974620465870901, "grad_norm": 1.012531042098999, "learning_rate": 5.1904184209539105e-06, "loss": 1.4524, "mean_token_accuracy": 0.6863235741853714, "num_tokens": 554555138.0, "step": 34410 }, { "epoch": 7.976938231544791, "grad_norm": 1.000101089477539, "learning_rate": 5.178977068850541e-06, "loss": 1.4431, "mean_token_accuracy": 0.6872305497527122, "num_tokens": 554714476.0, "step": 34420 }, { "epoch": 7.979255997218681, "grad_norm": 1.0009567737579346, "learning_rate": 5.167546883607818e-06, "loss": 1.4373, "mean_token_accuracy": 0.687743927538395, "num_tokens": 554875957.0, "step": 34430 }, { "epoch": 7.981573762892571, "grad_norm": 1.0297393798828125, "learning_rate": 5.156127871665328e-06, "loss": 1.4364, "mean_token_accuracy": 0.688326096534729, "num_tokens": 555037547.0, "step": 34440 }, { "epoch": 7.983891528566462, "grad_norm": 0.9820913672447205, "learning_rate": 5.144720039456372e-06, "loss": 1.4646, "mean_token_accuracy": 0.6854452505707741, "num_tokens": 555199378.0, "step": 34450 }, { "epoch": 7.986209294240353, "grad_norm": 1.0278030633926392, "learning_rate": 5.133323393407946e-06, "loss": 1.4331, "mean_token_accuracy": 0.6889728352427482, "num_tokens": 555360901.0, "step": 34460 }, { "epoch": 7.988527059914243, "grad_norm": 1.055631160736084, "learning_rate": 5.121937939940743e-06, "loss": 1.4428, "mean_token_accuracy": 0.6882819175720215, "num_tokens": 555521874.0, "step": 34470 }, { "epoch": 7.990844825588133, "grad_norm": 1.0221967697143555, "learning_rate": 5.110563685469158e-06, "loss": 1.4329, "mean_token_accuracy": 0.6900606229901314, "num_tokens": 555683397.0, "step": 34480 }, { "epoch": 7.993162591262023, "grad_norm": 1.0489164590835571, "learning_rate": 5.099200636401269e-06, "loss": 1.4309, "mean_token_accuracy": 0.6898876711726188, "num_tokens": 555844797.0, "step": 34490 }, { "epoch": 7.995480356935913, "grad_norm": 1.0208367109298706, "learning_rate": 5.087848799138844e-06, "loss": 1.4349, "mean_token_accuracy": 0.6887486040592193, "num_tokens": 556005551.0, "step": 34500 }, { "epoch": 7.9977981226098045, "grad_norm": 1.0257656574249268, "learning_rate": 5.076508180077336e-06, "loss": 1.4629, "mean_token_accuracy": 0.6854806110262871, "num_tokens": 556166345.0, "step": 34510 }, { "epoch": 8.0, "grad_norm": 1.3714781999588013, "learning_rate": 5.065178785605871e-06, "loss": 1.4208, "mean_token_accuracy": 0.6911530777027732, "num_tokens": 556318032.0, "step": 34520 } ], "logging_steps": 10, "max_steps": 43150, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.503695185889616e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }