{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9895148391682957, "eval_steps": 200, "global_step": 1400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014217167229429535, "grad_norm": 0.07947567850351334, "learning_rate": 4.4999999999999996e-05, "loss": 1.3759, "mean_token_accuracy": 0.6159000303596258, "num_tokens": 84090.0, "step": 10 }, { "epoch": 0.02843433445885907, "grad_norm": 0.08515394479036331, "learning_rate": 9.5e-05, "loss": 1.3323, "mean_token_accuracy": 0.6351301483809948, "num_tokens": 170759.0, "step": 20 }, { "epoch": 0.04265150168828861, "grad_norm": 0.07723315060138702, "learning_rate": 0.000145, "loss": 1.1533, "mean_token_accuracy": 0.6630864188075065, "num_tokens": 253787.0, "step": 30 }, { "epoch": 0.05686866891771814, "grad_norm": 0.06530017405748367, "learning_rate": 0.00019500000000000002, "loss": 1.1601, "mean_token_accuracy": 0.6682917241007089, "num_tokens": 336856.0, "step": 40 }, { "epoch": 0.07108583614714768, "grad_norm": 0.07639652490615845, "learning_rate": 0.000245, "loss": 1.1466, "mean_token_accuracy": 0.66524165160954, "num_tokens": 424034.0, "step": 50 }, { "epoch": 0.08530300337657722, "grad_norm": 0.07605864852666855, "learning_rate": 0.000295, "loss": 1.0968, "mean_token_accuracy": 0.6777923263609409, "num_tokens": 506343.0, "step": 60 }, { "epoch": 0.09952017060600675, "grad_norm": 0.07540106028318405, "learning_rate": 0.000345, "loss": 1.1026, "mean_token_accuracy": 0.6769415199756622, "num_tokens": 597209.0, "step": 70 }, { "epoch": 0.11373733783543628, "grad_norm": 0.07050619274377823, "learning_rate": 0.000395, "loss": 1.1332, "mean_token_accuracy": 0.6799008328467607, "num_tokens": 681397.0, "step": 80 }, { "epoch": 0.12795450506486583, "grad_norm": 0.0690559446811676, "learning_rate": 0.00044500000000000003, "loss": 1.0939, "mean_token_accuracy": 0.6780450720340013, "num_tokens": 770078.0, "step": 90 }, { "epoch": 0.14217167229429536, "grad_norm": 0.07599011063575745, "learning_rate": 0.000495, "loss": 1.0984, "mean_token_accuracy": 0.6856418281793595, "num_tokens": 854875.0, "step": 100 }, { "epoch": 0.1563888395237249, "grad_norm": 0.07766395807266235, "learning_rate": 0.00049997524130583, "loss": 1.0643, "mean_token_accuracy": 0.6823979251086711, "num_tokens": 941388.0, "step": 110 }, { "epoch": 0.17060600675315443, "grad_norm": 0.07025604695081711, "learning_rate": 0.000499889661992257, "loss": 1.0308, "mean_token_accuracy": 0.6956785634160042, "num_tokens": 1034278.0, "step": 120 }, { "epoch": 0.18482317398258397, "grad_norm": 0.07456669211387634, "learning_rate": 0.0004997429773180627, "loss": 1.1476, "mean_token_accuracy": 0.6694141685962677, "num_tokens": 1120392.0, "step": 130 }, { "epoch": 0.1990403412120135, "grad_norm": 0.07715420424938202, "learning_rate": 0.0004995352231519573, "loss": 1.0151, "mean_token_accuracy": 0.6931566946208477, "num_tokens": 1206998.0, "step": 140 }, { "epoch": 0.21325750844144303, "grad_norm": 0.07406821101903915, "learning_rate": 0.0004992664502959351, "loss": 1.0737, "mean_token_accuracy": 0.676430806145072, "num_tokens": 1293765.0, "step": 150 }, { "epoch": 0.22747467567087257, "grad_norm": 0.06690461933612823, "learning_rate": 0.0004989367244728525, "loss": 1.0627, "mean_token_accuracy": 0.6812205474823714, "num_tokens": 1380899.0, "step": 160 }, { "epoch": 0.24169184290030213, "grad_norm": 0.07383380085229874, "learning_rate": 0.0004985461263103559, "loss": 1.0646, "mean_token_accuracy": 0.687187984585762, "num_tokens": 1465568.0, "step": 170 }, { "epoch": 0.25590901012973166, "grad_norm": 0.06884950399398804, "learning_rate": 0.0004980947513211662, "loss": 1.0251, "mean_token_accuracy": 0.694553443044424, "num_tokens": 1550271.0, "step": 180 }, { "epoch": 0.27012617735916117, "grad_norm": 0.06553196907043457, "learning_rate": 0.0004975827098797236, "loss": 1.0548, "mean_token_accuracy": 0.6926396556198597, "num_tokens": 1636861.0, "step": 190 }, { "epoch": 0.28434334458859073, "grad_norm": 0.06570354849100113, "learning_rate": 0.0004970101271951969, "loss": 1.0264, "step": 200 }, { "epoch": 0.28434334458859073, "eval_loss": 1.07483971118927, "eval_mean_token_accuracy": 0.6932999727725982, "eval_num_tokens": 1724122.0, "eval_runtime": 42.5065, "eval_samples_per_second": 23.502, "eval_steps_per_second": 5.881, "step": 200 }, { "epoch": 0.29856051181802024, "grad_norm": 0.06550676375627518, "learning_rate": 0.000496377143280867, "loss": 1.1593, "mean_token_accuracy": 0.6800552912056446, "num_tokens": 1812034.0, "step": 210 }, { "epoch": 0.3127776790474498, "grad_norm": 0.06636584550142288, "learning_rate": 0.0004956839129198892, "loss": 1.0884, "mean_token_accuracy": 0.6808783996850252, "num_tokens": 1897847.0, "step": 220 }, { "epoch": 0.32699484627687936, "grad_norm": 0.07459145039319992, "learning_rate": 0.0004949306056274443, "loss": 1.0021, "mean_token_accuracy": 0.7001412186771632, "num_tokens": 1985093.0, "step": 230 }, { "epoch": 0.34121201350630886, "grad_norm": 0.07089713215827942, "learning_rate": 0.0004941174056092868, "loss": 1.1351, "mean_token_accuracy": 0.6768312893807888, "num_tokens": 2068917.0, "step": 240 }, { "epoch": 0.3554291807357384, "grad_norm": 0.07157368212938309, "learning_rate": 0.0004932445117167016, "loss": 1.0925, "mean_token_accuracy": 0.6823811627924442, "num_tokens": 2156664.0, "step": 250 }, { "epoch": 0.36964634796516793, "grad_norm": 0.08121207356452942, "learning_rate": 0.0004923121373978788, "loss": 1.0691, "mean_token_accuracy": 0.6801868129521609, "num_tokens": 2238958.0, "step": 260 }, { "epoch": 0.3838635151945975, "grad_norm": 0.07213877141475677, "learning_rate": 0.0004913205106457196, "loss": 1.0836, "mean_token_accuracy": 0.6846270024776459, "num_tokens": 2325983.0, "step": 270 }, { "epoch": 0.398080682424027, "grad_norm": 0.06338542699813843, "learning_rate": 0.000490269873942085, "loss": 1.0642, "mean_token_accuracy": 0.6862983625382185, "num_tokens": 2413507.0, "step": 280 }, { "epoch": 0.41229784965345656, "grad_norm": 0.07177390903234482, "learning_rate": 0.0004891604841985019, "loss": 1.0845, "mean_token_accuracy": 0.6847749698907137, "num_tokens": 2501002.0, "step": 290 }, { "epoch": 0.42651501688288607, "grad_norm": 0.076690673828125, "learning_rate": 0.00048799261269334124, "loss": 1.0119, "mean_token_accuracy": 0.6916860986500979, "num_tokens": 2585987.0, "step": 300 }, { "epoch": 0.44073218411231563, "grad_norm": 0.06709789484739304, "learning_rate": 0.0004867665450054816, "loss": 0.9942, "mean_token_accuracy": 0.7062507443130016, "num_tokens": 2674750.0, "step": 310 }, { "epoch": 0.45494935134174513, "grad_norm": 0.06799201667308807, "learning_rate": 0.0004854825809444773, "loss": 1.0469, "mean_token_accuracy": 0.6844053711742163, "num_tokens": 2759329.0, "step": 320 }, { "epoch": 0.4691665185711747, "grad_norm": 0.0704847127199173, "learning_rate": 0.00048414103447724636, "loss": 1.0386, "mean_token_accuracy": 0.6981413580477238, "num_tokens": 2843743.0, "step": 330 }, { "epoch": 0.48338368580060426, "grad_norm": 0.06998146325349808, "learning_rate": 0.0004827422336512958, "loss": 1.0033, "mean_token_accuracy": 0.7021258160471916, "num_tokens": 2930981.0, "step": 340 }, { "epoch": 0.49760085303003376, "grad_norm": 0.07551276683807373, "learning_rate": 0.0004812865205145048, "loss": 1.0475, "mean_token_accuracy": 0.6880744352936745, "num_tokens": 3017518.0, "step": 350 }, { "epoch": 0.5118180202594633, "grad_norm": 0.07380267232656479, "learning_rate": 0.00047977425103148377, "loss": 0.9871, "mean_token_accuracy": 0.7036375127732754, "num_tokens": 3106242.0, "step": 360 }, { "epoch": 0.5260351874888929, "grad_norm": 0.07055483758449554, "learning_rate": 0.0004782057949965307, "loss": 1.075, "mean_token_accuracy": 0.6817354142665863, "num_tokens": 3194043.0, "step": 370 }, { "epoch": 0.5402523547183223, "grad_norm": 0.07755430787801743, "learning_rate": 0.00047658153594320535, "loss": 1.047, "mean_token_accuracy": 0.6804160382598639, "num_tokens": 3277336.0, "step": 380 }, { "epoch": 0.5544695219477519, "grad_norm": 0.07740534842014313, "learning_rate": 0.00047490187105054437, "loss": 1.0101, "mean_token_accuracy": 0.7028108246624469, "num_tokens": 3364865.0, "step": 390 }, { "epoch": 0.5686866891771815, "grad_norm": 0.07296065241098404, "learning_rate": 0.00047316721104593906, "loss": 1.0261, "step": 400 }, { "epoch": 0.5686866891771815, "eval_loss": 1.0445445775985718, "eval_mean_token_accuracy": 0.7004730106592179, "eval_num_tokens": 3453057.0, "eval_runtime": 42.4654, "eval_samples_per_second": 23.525, "eval_steps_per_second": 5.887, "step": 400 }, { "epoch": 0.582903856406611, "grad_norm": 0.07180076837539673, "learning_rate": 0.00047137798010470056, "loss": 1.0054, "mean_token_accuracy": 0.6979773432016373, "num_tokens": 3543849.0, "step": 410 }, { "epoch": 0.5971210236360405, "grad_norm": 0.07680170238018036, "learning_rate": 0.0004695346157463367, "loss": 1.0841, "mean_token_accuracy": 0.6853552751243115, "num_tokens": 3628251.0, "step": 420 }, { "epoch": 0.61133819086547, "grad_norm": 0.07107053697109222, "learning_rate": 0.00046763756872756523, "loss": 0.9948, "mean_token_accuracy": 0.6986799541860819, "num_tokens": 3717234.0, "step": 430 }, { "epoch": 0.6255553580948996, "grad_norm": 0.08255039900541306, "learning_rate": 0.00046568730293209104, "loss": 1.0413, "mean_token_accuracy": 0.6862188082188367, "num_tokens": 3801222.0, "step": 440 }, { "epoch": 0.6397725253243292, "grad_norm": 0.07656990736722946, "learning_rate": 0.00046368429525717273, "loss": 1.052, "mean_token_accuracy": 0.6892995785921812, "num_tokens": 3886081.0, "step": 450 }, { "epoch": 0.6539896925537587, "grad_norm": 0.07661142945289612, "learning_rate": 0.00046162903549700705, "loss": 1.012, "mean_token_accuracy": 0.6943552978336811, "num_tokens": 3972184.0, "step": 460 }, { "epoch": 0.6682068597831882, "grad_norm": 0.07201256603002548, "learning_rate": 0.00045952202622296013, "loss": 1.0699, "mean_token_accuracy": 0.6910859402269125, "num_tokens": 4055872.0, "step": 470 }, { "epoch": 0.6824240270126177, "grad_norm": 0.07143343985080719, "learning_rate": 0.00045736378266067414, "loss": 1.0064, "mean_token_accuracy": 0.7026031356304884, "num_tokens": 4144012.0, "step": 480 }, { "epoch": 0.6966411942420473, "grad_norm": 0.07650640606880188, "learning_rate": 0.0004551548325640789, "loss": 1.0517, "mean_token_accuracy": 0.6946574732661247, "num_tokens": 4230019.0, "step": 490 }, { "epoch": 0.7108583614714769, "grad_norm": 0.07642583549022675, "learning_rate": 0.0004528957160863412, "loss": 1.0346, "mean_token_accuracy": 0.6941149879246951, "num_tokens": 4311202.0, "step": 500 }, { "epoch": 0.7250755287009063, "grad_norm": 0.07585732638835907, "learning_rate": 0.00045058698564778106, "loss": 1.0548, "mean_token_accuracy": 0.6841745227575302, "num_tokens": 4397292.0, "step": 510 }, { "epoch": 0.7392926959303359, "grad_norm": 0.07219212502241135, "learning_rate": 0.00044822920580078887, "loss": 1.0194, "mean_token_accuracy": 0.6980217099189758, "num_tokens": 4484848.0, "step": 520 }, { "epoch": 0.7535098631597654, "grad_norm": 0.07723289728164673, "learning_rate": 0.0004458229530917759, "loss": 0.9967, "mean_token_accuracy": 0.6917125687003136, "num_tokens": 4568171.0, "step": 530 }, { "epoch": 0.767727030389195, "grad_norm": 0.07553447037935257, "learning_rate": 0.00044336881592019163, "loss": 1.0854, "mean_token_accuracy": 0.6873999379575253, "num_tokens": 4655953.0, "step": 540 }, { "epoch": 0.7819441976186244, "grad_norm": 0.08117958158254623, "learning_rate": 0.00044086739439464266, "loss": 1.0472, "mean_token_accuracy": 0.68543600179255, "num_tokens": 4741659.0, "step": 550 }, { "epoch": 0.796161364848054, "grad_norm": 0.0754791796207428, "learning_rate": 0.00043831930018614873, "loss": 1.0366, "mean_token_accuracy": 0.6899502877146005, "num_tokens": 4821791.0, "step": 560 }, { "epoch": 0.8103785320774836, "grad_norm": 0.07144685089588165, "learning_rate": 0.00043572515637857126, "loss": 0.9703, "mean_token_accuracy": 0.7105959545820951, "num_tokens": 4911804.0, "step": 570 }, { "epoch": 0.8245956993069131, "grad_norm": 0.07971132546663284, "learning_rate": 0.00043308559731625087, "loss": 1.0127, "mean_token_accuracy": 0.6948394205421209, "num_tokens": 4997188.0, "step": 580 }, { "epoch": 0.8388128665363427, "grad_norm": 0.07623772323131561, "learning_rate": 0.0004304012684488917, "loss": 0.9698, "mean_token_accuracy": 0.7132892053574323, "num_tokens": 5082069.0, "step": 590 }, { "epoch": 0.8530300337657721, "grad_norm": 0.07272877544164658, "learning_rate": 0.0004276728261737298, "loss": 0.9706, "step": 600 }, { "epoch": 0.8530300337657721, "eval_loss": 1.025159478187561, "eval_mean_token_accuracy": 0.7048066265583038, "eval_num_tokens": 5169872.0, "eval_runtime": 42.4633, "eval_samples_per_second": 23.526, "eval_steps_per_second": 5.887, "step": 600 }, { "epoch": 0.8672472009952017, "grad_norm": 0.07478692382574081, "learning_rate": 0.0004249009376750249, "loss": 1.046, "mean_token_accuracy": 0.7029114665463567, "num_tokens": 5257518.0, "step": 610 }, { "epoch": 0.8814643682246313, "grad_norm": 0.07147077471017838, "learning_rate": 0.0004220862807609143, "loss": 0.9995, "mean_token_accuracy": 0.6980900842696428, "num_tokens": 5343931.0, "step": 620 }, { "epoch": 0.8956815354540608, "grad_norm": 0.07459016889333725, "learning_rate": 0.0004192295436976688, "loss": 1.019, "mean_token_accuracy": 0.7001338206231594, "num_tokens": 5426802.0, "step": 630 }, { "epoch": 0.9098987026834903, "grad_norm": 0.07691922038793564, "learning_rate": 0.00041633142504139133, "loss": 1.0229, "mean_token_accuracy": 0.697338268905878, "num_tokens": 5511524.0, "step": 640 }, { "epoch": 0.9241158699129198, "grad_norm": 0.07407371699810028, "learning_rate": 0.0004133926334671996, "loss": 0.9848, "mean_token_accuracy": 0.7054470591247082, "num_tokens": 5600023.0, "step": 650 }, { "epoch": 0.9383330371423494, "grad_norm": 0.07771491259336472, "learning_rate": 0.000410413887595934, "loss": 1.067, "mean_token_accuracy": 0.6928857892751694, "num_tokens": 5690236.0, "step": 660 }, { "epoch": 0.952550204371779, "grad_norm": 0.07632343471050262, "learning_rate": 0.00040739591581843407, "loss": 1.037, "mean_token_accuracy": 0.696436945348978, "num_tokens": 5776575.0, "step": 670 }, { "epoch": 0.9667673716012085, "grad_norm": 0.07429352402687073, "learning_rate": 0.0004043394561174251, "loss": 0.9819, "mean_token_accuracy": 0.7112393327057361, "num_tokens": 5867257.0, "step": 680 }, { "epoch": 0.980984538830638, "grad_norm": 0.07374002784490585, "learning_rate": 0.0004012452558870602, "loss": 0.913, "mean_token_accuracy": 0.7194178324192763, "num_tokens": 5956262.0, "step": 690 }, { "epoch": 0.9952017060600675, "grad_norm": 0.07516616582870483, "learning_rate": 0.0003981140717501599, "loss": 1.0199, "mean_token_accuracy": 0.699269012734294, "num_tokens": 6042335.0, "step": 700 }, { "epoch": 1.0085303003376578, "grad_norm": 0.07739484310150146, "learning_rate": 0.00039494666937319616, "loss": 0.9484, "mean_token_accuracy": 0.7045566352208456, "num_tokens": 6119308.0, "step": 710 }, { "epoch": 1.0227474675670873, "grad_norm": 0.08529424667358398, "learning_rate": 0.0003917438232790641, "loss": 0.9295, "mean_token_accuracy": 0.7197049882262945, "num_tokens": 6203866.0, "step": 720 }, { "epoch": 1.036964634796517, "grad_norm": 0.08253555744886398, "learning_rate": 0.0003885063166576886, "loss": 0.8767, "mean_token_accuracy": 0.7232992608100176, "num_tokens": 6290259.0, "step": 730 }, { "epoch": 1.0511818020259462, "grad_norm": 0.07590445876121521, "learning_rate": 0.0003852349411745113, "loss": 0.9535, "mean_token_accuracy": 0.71869598031044, "num_tokens": 6377505.0, "step": 740 }, { "epoch": 1.0653989692553758, "grad_norm": 0.08107644319534302, "learning_rate": 0.00038193049677690493, "loss": 0.9614, "mean_token_accuracy": 0.7086535628885031, "num_tokens": 6463425.0, "step": 750 }, { "epoch": 1.0796161364848054, "grad_norm": 0.07508436590433121, "learning_rate": 0.00037859379149856334, "loss": 0.9326, "mean_token_accuracy": 0.720469256490469, "num_tokens": 6552684.0, "step": 760 }, { "epoch": 1.093833303714235, "grad_norm": 0.0790044292807579, "learning_rate": 0.00037522564126191274, "loss": 0.9778, "mean_token_accuracy": 0.7115771636366844, "num_tokens": 6636911.0, "step": 770 }, { "epoch": 1.1080504709436645, "grad_norm": 0.08326832950115204, "learning_rate": 0.000371826869678595, "loss": 1.056, "mean_token_accuracy": 0.6882848802953958, "num_tokens": 6718835.0, "step": 780 }, { "epoch": 1.122267638173094, "grad_norm": 0.08922722190618515, "learning_rate": 0.0003683983078480708, "loss": 0.9371, "mean_token_accuracy": 0.7138718143105507, "num_tokens": 6802682.0, "step": 790 }, { "epoch": 1.1364848054025236, "grad_norm": 0.07749788463115692, "learning_rate": 0.00036494079415439086, "loss": 0.9389, "step": 800 }, { "epoch": 1.1364848054025236, "eval_loss": 1.0164164304733276, "eval_mean_token_accuracy": 0.7073304796218872, "eval_num_tokens": 6890030.0, "eval_runtime": 42.5558, "eval_samples_per_second": 23.475, "eval_steps_per_second": 5.875, "step": 800 }, { "epoch": 1.1507019726319532, "grad_norm": 0.07486575841903687, "learning_rate": 0.00036145517406118674, "loss": 0.9133, "mean_token_accuracy": 0.7182627562433481, "num_tokens": 6977677.0, "step": 810 }, { "epoch": 1.1649191398613827, "grad_norm": 0.08833441883325577, "learning_rate": 0.00035794229990492987, "loss": 0.9745, "mean_token_accuracy": 0.7087426863610744, "num_tokens": 7058573.0, "step": 820 }, { "epoch": 1.179136307090812, "grad_norm": 0.08615554124116898, "learning_rate": 0.0003544030306865108, "loss": 0.8777, "mean_token_accuracy": 0.7278520867228508, "num_tokens": 7143361.0, "step": 830 }, { "epoch": 1.1933534743202416, "grad_norm": 0.07825995981693268, "learning_rate": 0.00035083823186118744, "loss": 0.9291, "mean_token_accuracy": 0.7207735061645508, "num_tokens": 7232054.0, "step": 840 }, { "epoch": 1.2075706415496712, "grad_norm": 0.08184744417667389, "learning_rate": 0.00034724877512695674, "loss": 0.8692, "mean_token_accuracy": 0.7367342457175254, "num_tokens": 7321898.0, "step": 850 }, { "epoch": 1.2217878087791008, "grad_norm": 0.08207986503839493, "learning_rate": 0.0003436355382113982, "loss": 1.0342, "mean_token_accuracy": 0.6977195214480162, "num_tokens": 7407614.0, "step": 860 }, { "epoch": 1.2360049760085303, "grad_norm": 0.08287196606397629, "learning_rate": 0.00033999940465704394, "loss": 0.8808, "mean_token_accuracy": 0.727986204251647, "num_tokens": 7495498.0, "step": 870 }, { "epoch": 1.2502221432379599, "grad_norm": 0.08513687551021576, "learning_rate": 0.0003363412636053269, "loss": 0.9547, "mean_token_accuracy": 0.7101028818637133, "num_tokens": 7584760.0, "step": 880 }, { "epoch": 1.2644393104673894, "grad_norm": 0.08163287490606308, "learning_rate": 0.00033266200957915925, "loss": 0.9349, "mean_token_accuracy": 0.7183492567390204, "num_tokens": 7672425.0, "step": 890 }, { "epoch": 1.2786564776968188, "grad_norm": 0.08243776857852936, "learning_rate": 0.00032896254226419543, "loss": 0.9292, "mean_token_accuracy": 0.7134430769830942, "num_tokens": 7757203.0, "step": 900 }, { "epoch": 1.2928736449262486, "grad_norm": 0.09458713978528976, "learning_rate": 0.00032524376628883253, "loss": 0.9744, "mean_token_accuracy": 0.7050179023295641, "num_tokens": 7846548.0, "step": 910 }, { "epoch": 1.307090812155678, "grad_norm": 0.08505059778690338, "learning_rate": 0.0003215065910030021, "loss": 0.9786, "mean_token_accuracy": 0.709782537817955, "num_tokens": 7930393.0, "step": 920 }, { "epoch": 1.3213079793851075, "grad_norm": 0.08586814999580383, "learning_rate": 0.00031775193025580773, "loss": 0.8947, "mean_token_accuracy": 0.7237144611775875, "num_tokens": 8016877.0, "step": 930 }, { "epoch": 1.335525146614537, "grad_norm": 0.0874597579240799, "learning_rate": 0.00031398070217206127, "loss": 0.8978, "mean_token_accuracy": 0.720280421897769, "num_tokens": 8103878.0, "step": 940 }, { "epoch": 1.3497423138439666, "grad_norm": 0.09211765974760056, "learning_rate": 0.0003101938289277753, "loss": 0.9288, "mean_token_accuracy": 0.7201074693351984, "num_tokens": 8190575.0, "step": 950 }, { "epoch": 1.3639594810733962, "grad_norm": 0.0884537398815155, "learning_rate": 0.00030639223652466336, "loss": 0.9295, "mean_token_accuracy": 0.713099530339241, "num_tokens": 8275179.0, "step": 960 }, { "epoch": 1.3781766483028257, "grad_norm": 0.08991611003875732, "learning_rate": 0.0003025768545637057, "loss": 0.9265, "mean_token_accuracy": 0.7252651590853929, "num_tokens": 8360478.0, "step": 970 }, { "epoch": 1.3923938155322553, "grad_norm": 0.08758469671010971, "learning_rate": 0.0002987486160178344, "loss": 0.977, "mean_token_accuracy": 0.7134061496704817, "num_tokens": 8445197.0, "step": 980 }, { "epoch": 1.4066109827616846, "grad_norm": 0.08159046620130539, "learning_rate": 0.0002949084570037939, "loss": 0.9115, "mean_token_accuracy": 0.7211619779467583, "num_tokens": 8530327.0, "step": 990 }, { "epoch": 1.4208281499911144, "grad_norm": 0.08836396783590317, "learning_rate": 0.00029105731655323344, "loss": 0.9154, "step": 1000 }, { "epoch": 1.4208281499911144, "eval_loss": 1.0047532320022583, "eval_mean_token_accuracy": 0.7103532667160034, "eval_num_tokens": 8619497.0, "eval_runtime": 42.4552, "eval_samples_per_second": 23.531, "eval_steps_per_second": 5.889, "step": 1000 }, { "epoch": 1.4350453172205437, "grad_norm": 0.09082586318254471, "learning_rate": 0.0002871961363830858, "loss": 0.9077, "mean_token_accuracy": 0.7218164600431919, "num_tokens": 8703507.0, "step": 1010 }, { "epoch": 1.4492624844499733, "grad_norm": 0.09680253267288208, "learning_rate": 0.0002833258606652901, "loss": 0.8928, "mean_token_accuracy": 0.7247406598180532, "num_tokens": 8788781.0, "step": 1020 }, { "epoch": 1.4634796516794029, "grad_norm": 0.08538492023944855, "learning_rate": 0.0002794474357959138, "loss": 0.9021, "mean_token_accuracy": 0.7273614536970854, "num_tokens": 8873572.0, "step": 1030 }, { "epoch": 1.4776968189088324, "grad_norm": 0.08687795698642731, "learning_rate": 0.00027556181016373147, "loss": 0.9266, "mean_token_accuracy": 0.7160039469599724, "num_tokens": 8960541.0, "step": 1040 }, { "epoch": 1.491913986138262, "grad_norm": 0.08219098299741745, "learning_rate": 0.00027166993391831566, "loss": 0.9344, "mean_token_accuracy": 0.7199014227837324, "num_tokens": 9046332.0, "step": 1050 }, { "epoch": 1.5061311533676913, "grad_norm": 0.08430644124746323, "learning_rate": 0.000267772758737697, "loss": 0.963, "mean_token_accuracy": 0.7126489922404289, "num_tokens": 9133194.0, "step": 1060 }, { "epoch": 1.520348320597121, "grad_norm": 0.08255460858345032, "learning_rate": 0.00026387123759565197, "loss": 0.8784, "mean_token_accuracy": 0.7328134395182133, "num_tokens": 9220168.0, "step": 1070 }, { "epoch": 1.5345654878265504, "grad_norm": 0.09743297845125198, "learning_rate": 0.00025996632452867166, "loss": 0.8675, "mean_token_accuracy": 0.7273001208901405, "num_tokens": 9306051.0, "step": 1080 }, { "epoch": 1.5487826550559802, "grad_norm": 0.08803436160087585, "learning_rate": 0.0002560589744026729, "loss": 0.8476, "mean_token_accuracy": 0.7371940363198519, "num_tokens": 9398040.0, "step": 1090 }, { "epoch": 1.5629998222854096, "grad_norm": 0.08522514998912811, "learning_rate": 0.00025215014267950463, "loss": 0.9067, "mean_token_accuracy": 0.714500817283988, "num_tokens": 9485987.0, "step": 1100 }, { "epoch": 1.5772169895148391, "grad_norm": 0.09162591397762299, "learning_rate": 0.00024824078518331013, "loss": 0.936, "mean_token_accuracy": 0.7146482899785042, "num_tokens": 9571356.0, "step": 1110 }, { "epoch": 1.5914341567442687, "grad_norm": 0.0838090255856514, "learning_rate": 0.00024433185786679955, "loss": 0.9111, "mean_token_accuracy": 0.7223079178482295, "num_tokens": 9657761.0, "step": 1120 }, { "epoch": 1.6056513239736983, "grad_norm": 0.08568980544805527, "learning_rate": 0.00024042431657749118, "loss": 0.9572, "mean_token_accuracy": 0.710682961717248, "num_tokens": 9742840.0, "step": 1130 }, { "epoch": 1.6198684912031278, "grad_norm": 0.08540449291467667, "learning_rate": 0.00023651911682397937, "loss": 0.901, "mean_token_accuracy": 0.7298479046672582, "num_tokens": 9833491.0, "step": 1140 }, { "epoch": 1.6340856584325572, "grad_norm": 0.09621024876832962, "learning_rate": 0.0002326172135422839, "loss": 0.8465, "mean_token_accuracy": 0.739329730719328, "num_tokens": 9921422.0, "step": 1150 }, { "epoch": 1.648302825661987, "grad_norm": 0.0809611827135086, "learning_rate": 0.00022871956086234062, "loss": 0.8573, "mean_token_accuracy": 0.7349758796393872, "num_tokens": 10007862.0, "step": 1160 }, { "epoch": 1.6625199928914163, "grad_norm": 0.08698837459087372, "learning_rate": 0.00022482711187468823, "loss": 0.8512, "mean_token_accuracy": 0.7366860095411539, "num_tokens": 10100577.0, "step": 1170 }, { "epoch": 1.676737160120846, "grad_norm": 0.08779753744602203, "learning_rate": 0.00022094081839741004, "loss": 0.8936, "mean_token_accuracy": 0.7239959452301263, "num_tokens": 10186346.0, "step": 1180 }, { "epoch": 1.6909543273502754, "grad_norm": 0.08322805166244507, "learning_rate": 0.0002170616307433861, "loss": 0.9271, "mean_token_accuracy": 0.7187892116606236, "num_tokens": 10270881.0, "step": 1190 }, { "epoch": 1.705171494579705, "grad_norm": 0.09833941608667374, "learning_rate": 0.00021319049748791416, "loss": 1.0209, "step": 1200 }, { "epoch": 1.705171494579705, "eval_loss": 0.9964284300804138, "eval_mean_token_accuracy": 0.7130191161632538, "eval_num_tokens": 10355263.0, "eval_runtime": 42.5075, "eval_samples_per_second": 23.502, "eval_steps_per_second": 5.881, "step": 1200 }, { "epoch": 1.7193886618091345, "grad_norm": 0.09196127951145172, "learning_rate": 0.00020932836523675493, "loss": 0.9229, "mean_token_accuracy": 0.7052626656368375, "num_tokens": 10439472.0, "step": 1210 }, { "epoch": 1.733605829038564, "grad_norm": 0.09308561682701111, "learning_rate": 0.00020547617839465924, "loss": 0.9532, "mean_token_accuracy": 0.7139888234436512, "num_tokens": 10524216.0, "step": 1220 }, { "epoch": 1.7478229962679936, "grad_norm": 0.09138765186071396, "learning_rate": 0.0002016348789344335, "loss": 0.895, "mean_token_accuracy": 0.7220640182495117, "num_tokens": 10611738.0, "step": 1230 }, { "epoch": 1.762040163497423, "grad_norm": 0.08532418310642242, "learning_rate": 0.0001978054061665993, "loss": 0.9175, "mean_token_accuracy": 0.7228887047618627, "num_tokens": 10697449.0, "step": 1240 }, { "epoch": 1.7762573307268528, "grad_norm": 0.08684728294610977, "learning_rate": 0.0001939886965097048, "loss": 0.9053, "mean_token_accuracy": 0.7226716388016939, "num_tokens": 10783296.0, "step": 1250 }, { "epoch": 1.790474497956282, "grad_norm": 0.08659351617097855, "learning_rate": 0.0001901856832613426, "loss": 0.9259, "mean_token_accuracy": 0.7152926828712225, "num_tokens": 10871421.0, "step": 1260 }, { "epoch": 1.804691665185712, "grad_norm": 0.08151692152023315, "learning_rate": 0.00018639729636993137, "loss": 0.8651, "mean_token_accuracy": 0.7316095266491175, "num_tokens": 10960209.0, "step": 1270 }, { "epoch": 1.8189088324151412, "grad_norm": 0.09470875561237335, "learning_rate": 0.00018262446220731582, "loss": 0.9005, "mean_token_accuracy": 0.7300491981208325, "num_tokens": 11046451.0, "step": 1280 }, { "epoch": 1.8331259996445708, "grad_norm": 0.08670168370008469, "learning_rate": 0.0001788681033422419, "loss": 0.8818, "mean_token_accuracy": 0.7283473834395409, "num_tokens": 11132072.0, "step": 1290 }, { "epoch": 1.8473431668740004, "grad_norm": 0.08462122082710266, "learning_rate": 0.00017512913831476136, "loss": 0.9394, "mean_token_accuracy": 0.7198392864316702, "num_tokens": 11218335.0, "step": 1300 }, { "epoch": 1.86156033410343, "grad_norm": 0.0869954526424408, "learning_rate": 0.000171408481411622, "loss": 0.8649, "mean_token_accuracy": 0.7348789256066084, "num_tokens": 11305589.0, "step": 1310 }, { "epoch": 1.8757775013328595, "grad_norm": 0.09598379582166672, "learning_rate": 0.00016770704244269735, "loss": 0.8799, "mean_token_accuracy": 0.7312331754714251, "num_tokens": 11393534.0, "step": 1320 }, { "epoch": 1.8899946685622888, "grad_norm": 0.09157571196556091, "learning_rate": 0.00016402572651851217, "loss": 0.9071, "mean_token_accuracy": 0.7278904471546411, "num_tokens": 11481006.0, "step": 1330 }, { "epoch": 1.9042118357917186, "grad_norm": 0.08688797056674957, "learning_rate": 0.0001603654338289151, "loss": 0.921, "mean_token_accuracy": 0.725471879914403, "num_tokens": 11570046.0, "step": 1340 }, { "epoch": 1.918429003021148, "grad_norm": 0.09125315397977829, "learning_rate": 0.00015672705942295734, "loss": 0.8608, "mean_token_accuracy": 0.7313944108784198, "num_tokens": 11656322.0, "step": 1350 }, { "epoch": 1.9326461702505777, "grad_norm": 0.08270374685525894, "learning_rate": 0.00015311149299002542, "loss": 0.8296, "mean_token_accuracy": 0.740207726508379, "num_tokens": 11745127.0, "step": 1360 }, { "epoch": 1.946863337480007, "grad_norm": 0.09753109514713287, "learning_rate": 0.0001495196186422872, "loss": 0.9149, "mean_token_accuracy": 0.72291075065732, "num_tokens": 11826582.0, "step": 1370 }, { "epoch": 1.9610805047094366, "grad_norm": 0.09203537553548813, "learning_rate": 0.00014595231469849963, "loss": 0.9261, "mean_token_accuracy": 0.7244169395416975, "num_tokens": 11911842.0, "step": 1380 }, { "epoch": 1.9752976719388662, "grad_norm": 0.10030084103345871, "learning_rate": 0.00014241045346923462, "loss": 0.8738, "mean_token_accuracy": 0.7312715597450733, "num_tokens": 11997173.0, "step": 1390 }, { "epoch": 1.9895148391682957, "grad_norm": 0.09157243371009827, "learning_rate": 0.00013889490104357276, "loss": 0.944, "step": 1400 }, { "epoch": 1.9895148391682957, "eval_loss": 0.9828982949256897, "eval_mean_token_accuracy": 0.7159896125793457, "eval_num_tokens": 12079318.0, "eval_runtime": 42.4967, "eval_samples_per_second": 23.508, "eval_steps_per_second": 5.883, "step": 1400 } ], "logging_steps": 10, "max_steps": 2109, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2636469427598029e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }