|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9895148391682957, |
|
"eval_steps": 200, |
|
"global_step": 1400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.014217167229429535, |
|
"grad_norm": 0.07947567850351334, |
|
"learning_rate": 4.4999999999999996e-05, |
|
"loss": 1.3759, |
|
"mean_token_accuracy": 0.6159000303596258, |
|
"num_tokens": 84090.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02843433445885907, |
|
"grad_norm": 0.08515394479036331, |
|
"learning_rate": 9.5e-05, |
|
"loss": 1.3323, |
|
"mean_token_accuracy": 0.6351301483809948, |
|
"num_tokens": 170759.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04265150168828861, |
|
"grad_norm": 0.07723315060138702, |
|
"learning_rate": 0.000145, |
|
"loss": 1.1533, |
|
"mean_token_accuracy": 0.6630864188075065, |
|
"num_tokens": 253787.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05686866891771814, |
|
"grad_norm": 0.06530017405748367, |
|
"learning_rate": 0.00019500000000000002, |
|
"loss": 1.1601, |
|
"mean_token_accuracy": 0.6682917241007089, |
|
"num_tokens": 336856.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07108583614714768, |
|
"grad_norm": 0.07639652490615845, |
|
"learning_rate": 0.000245, |
|
"loss": 1.1466, |
|
"mean_token_accuracy": 0.66524165160954, |
|
"num_tokens": 424034.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08530300337657722, |
|
"grad_norm": 0.07605864852666855, |
|
"learning_rate": 0.000295, |
|
"loss": 1.0968, |
|
"mean_token_accuracy": 0.6777923263609409, |
|
"num_tokens": 506343.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.09952017060600675, |
|
"grad_norm": 0.07540106028318405, |
|
"learning_rate": 0.000345, |
|
"loss": 1.1026, |
|
"mean_token_accuracy": 0.6769415199756622, |
|
"num_tokens": 597209.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.11373733783543628, |
|
"grad_norm": 0.07050619274377823, |
|
"learning_rate": 0.000395, |
|
"loss": 1.1332, |
|
"mean_token_accuracy": 0.6799008328467607, |
|
"num_tokens": 681397.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.12795450506486583, |
|
"grad_norm": 0.0690559446811676, |
|
"learning_rate": 0.00044500000000000003, |
|
"loss": 1.0939, |
|
"mean_token_accuracy": 0.6780450720340013, |
|
"num_tokens": 770078.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.14217167229429536, |
|
"grad_norm": 0.07599011063575745, |
|
"learning_rate": 0.000495, |
|
"loss": 1.0984, |
|
"mean_token_accuracy": 0.6856418281793595, |
|
"num_tokens": 854875.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1563888395237249, |
|
"grad_norm": 0.07766395807266235, |
|
"learning_rate": 0.00049997524130583, |
|
"loss": 1.0643, |
|
"mean_token_accuracy": 0.6823979251086711, |
|
"num_tokens": 941388.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.17060600675315443, |
|
"grad_norm": 0.07025604695081711, |
|
"learning_rate": 0.000499889661992257, |
|
"loss": 1.0308, |
|
"mean_token_accuracy": 0.6956785634160042, |
|
"num_tokens": 1034278.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.18482317398258397, |
|
"grad_norm": 0.07456669211387634, |
|
"learning_rate": 0.0004997429773180627, |
|
"loss": 1.1476, |
|
"mean_token_accuracy": 0.6694141685962677, |
|
"num_tokens": 1120392.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1990403412120135, |
|
"grad_norm": 0.07715420424938202, |
|
"learning_rate": 0.0004995352231519573, |
|
"loss": 1.0151, |
|
"mean_token_accuracy": 0.6931566946208477, |
|
"num_tokens": 1206998.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.21325750844144303, |
|
"grad_norm": 0.07406821101903915, |
|
"learning_rate": 0.0004992664502959351, |
|
"loss": 1.0737, |
|
"mean_token_accuracy": 0.676430806145072, |
|
"num_tokens": 1293765.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.22747467567087257, |
|
"grad_norm": 0.06690461933612823, |
|
"learning_rate": 0.0004989367244728525, |
|
"loss": 1.0627, |
|
"mean_token_accuracy": 0.6812205474823714, |
|
"num_tokens": 1380899.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.24169184290030213, |
|
"grad_norm": 0.07383380085229874, |
|
"learning_rate": 0.0004985461263103559, |
|
"loss": 1.0646, |
|
"mean_token_accuracy": 0.687187984585762, |
|
"num_tokens": 1465568.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.25590901012973166, |
|
"grad_norm": 0.06884950399398804, |
|
"learning_rate": 0.0004980947513211662, |
|
"loss": 1.0251, |
|
"mean_token_accuracy": 0.694553443044424, |
|
"num_tokens": 1550271.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.27012617735916117, |
|
"grad_norm": 0.06553196907043457, |
|
"learning_rate": 0.0004975827098797236, |
|
"loss": 1.0548, |
|
"mean_token_accuracy": 0.6926396556198597, |
|
"num_tokens": 1636861.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.28434334458859073, |
|
"grad_norm": 0.06570354849100113, |
|
"learning_rate": 0.0004970101271951969, |
|
"loss": 1.0264, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.28434334458859073, |
|
"eval_loss": 1.07483971118927, |
|
"eval_mean_token_accuracy": 0.6932999727725982, |
|
"eval_num_tokens": 1724122.0, |
|
"eval_runtime": 42.5065, |
|
"eval_samples_per_second": 23.502, |
|
"eval_steps_per_second": 5.881, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.29856051181802024, |
|
"grad_norm": 0.06550676375627518, |
|
"learning_rate": 0.000496377143280867, |
|
"loss": 1.1593, |
|
"mean_token_accuracy": 0.6800552912056446, |
|
"num_tokens": 1812034.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3127776790474498, |
|
"grad_norm": 0.06636584550142288, |
|
"learning_rate": 0.0004956839129198892, |
|
"loss": 1.0884, |
|
"mean_token_accuracy": 0.6808783996850252, |
|
"num_tokens": 1897847.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.32699484627687936, |
|
"grad_norm": 0.07459145039319992, |
|
"learning_rate": 0.0004949306056274443, |
|
"loss": 1.0021, |
|
"mean_token_accuracy": 0.7001412186771632, |
|
"num_tokens": 1985093.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.34121201350630886, |
|
"grad_norm": 0.07089713215827942, |
|
"learning_rate": 0.0004941174056092868, |
|
"loss": 1.1351, |
|
"mean_token_accuracy": 0.6768312893807888, |
|
"num_tokens": 2068917.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3554291807357384, |
|
"grad_norm": 0.07157368212938309, |
|
"learning_rate": 0.0004932445117167016, |
|
"loss": 1.0925, |
|
"mean_token_accuracy": 0.6823811627924442, |
|
"num_tokens": 2156664.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.36964634796516793, |
|
"grad_norm": 0.08121207356452942, |
|
"learning_rate": 0.0004923121373978788, |
|
"loss": 1.0691, |
|
"mean_token_accuracy": 0.6801868129521609, |
|
"num_tokens": 2238958.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3838635151945975, |
|
"grad_norm": 0.07213877141475677, |
|
"learning_rate": 0.0004913205106457196, |
|
"loss": 1.0836, |
|
"mean_token_accuracy": 0.6846270024776459, |
|
"num_tokens": 2325983.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.398080682424027, |
|
"grad_norm": 0.06338542699813843, |
|
"learning_rate": 0.000490269873942085, |
|
"loss": 1.0642, |
|
"mean_token_accuracy": 0.6862983625382185, |
|
"num_tokens": 2413507.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.41229784965345656, |
|
"grad_norm": 0.07177390903234482, |
|
"learning_rate": 0.0004891604841985019, |
|
"loss": 1.0845, |
|
"mean_token_accuracy": 0.6847749698907137, |
|
"num_tokens": 2501002.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.42651501688288607, |
|
"grad_norm": 0.076690673828125, |
|
"learning_rate": 0.00048799261269334124, |
|
"loss": 1.0119, |
|
"mean_token_accuracy": 0.6916860986500979, |
|
"num_tokens": 2585987.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.44073218411231563, |
|
"grad_norm": 0.06709789484739304, |
|
"learning_rate": 0.0004867665450054816, |
|
"loss": 0.9942, |
|
"mean_token_accuracy": 0.7062507443130016, |
|
"num_tokens": 2674750.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.45494935134174513, |
|
"grad_norm": 0.06799201667308807, |
|
"learning_rate": 0.0004854825809444773, |
|
"loss": 1.0469, |
|
"mean_token_accuracy": 0.6844053711742163, |
|
"num_tokens": 2759329.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4691665185711747, |
|
"grad_norm": 0.0704847127199173, |
|
"learning_rate": 0.00048414103447724636, |
|
"loss": 1.0386, |
|
"mean_token_accuracy": 0.6981413580477238, |
|
"num_tokens": 2843743.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.48338368580060426, |
|
"grad_norm": 0.06998146325349808, |
|
"learning_rate": 0.0004827422336512958, |
|
"loss": 1.0033, |
|
"mean_token_accuracy": 0.7021258160471916, |
|
"num_tokens": 2930981.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.49760085303003376, |
|
"grad_norm": 0.07551276683807373, |
|
"learning_rate": 0.0004812865205145048, |
|
"loss": 1.0475, |
|
"mean_token_accuracy": 0.6880744352936745, |
|
"num_tokens": 3017518.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5118180202594633, |
|
"grad_norm": 0.07380267232656479, |
|
"learning_rate": 0.00047977425103148377, |
|
"loss": 0.9871, |
|
"mean_token_accuracy": 0.7036375127732754, |
|
"num_tokens": 3106242.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5260351874888929, |
|
"grad_norm": 0.07055483758449554, |
|
"learning_rate": 0.0004782057949965307, |
|
"loss": 1.075, |
|
"mean_token_accuracy": 0.6817354142665863, |
|
"num_tokens": 3194043.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5402523547183223, |
|
"grad_norm": 0.07755430787801743, |
|
"learning_rate": 0.00047658153594320535, |
|
"loss": 1.047, |
|
"mean_token_accuracy": 0.6804160382598639, |
|
"num_tokens": 3277336.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5544695219477519, |
|
"grad_norm": 0.07740534842014313, |
|
"learning_rate": 0.00047490187105054437, |
|
"loss": 1.0101, |
|
"mean_token_accuracy": 0.7028108246624469, |
|
"num_tokens": 3364865.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5686866891771815, |
|
"grad_norm": 0.07296065241098404, |
|
"learning_rate": 0.00047316721104593906, |
|
"loss": 1.0261, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5686866891771815, |
|
"eval_loss": 1.0445445775985718, |
|
"eval_mean_token_accuracy": 0.7004730106592179, |
|
"eval_num_tokens": 3453057.0, |
|
"eval_runtime": 42.4654, |
|
"eval_samples_per_second": 23.525, |
|
"eval_steps_per_second": 5.887, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.582903856406611, |
|
"grad_norm": 0.07180076837539673, |
|
"learning_rate": 0.00047137798010470056, |
|
"loss": 1.0054, |
|
"mean_token_accuracy": 0.6979773432016373, |
|
"num_tokens": 3543849.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.5971210236360405, |
|
"grad_norm": 0.07680170238018036, |
|
"learning_rate": 0.0004695346157463367, |
|
"loss": 1.0841, |
|
"mean_token_accuracy": 0.6853552751243115, |
|
"num_tokens": 3628251.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.61133819086547, |
|
"grad_norm": 0.07107053697109222, |
|
"learning_rate": 0.00046763756872756523, |
|
"loss": 0.9948, |
|
"mean_token_accuracy": 0.6986799541860819, |
|
"num_tokens": 3717234.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6255553580948996, |
|
"grad_norm": 0.08255039900541306, |
|
"learning_rate": 0.00046568730293209104, |
|
"loss": 1.0413, |
|
"mean_token_accuracy": 0.6862188082188367, |
|
"num_tokens": 3801222.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6397725253243292, |
|
"grad_norm": 0.07656990736722946, |
|
"learning_rate": 0.00046368429525717273, |
|
"loss": 1.052, |
|
"mean_token_accuracy": 0.6892995785921812, |
|
"num_tokens": 3886081.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6539896925537587, |
|
"grad_norm": 0.07661142945289612, |
|
"learning_rate": 0.00046162903549700705, |
|
"loss": 1.012, |
|
"mean_token_accuracy": 0.6943552978336811, |
|
"num_tokens": 3972184.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.6682068597831882, |
|
"grad_norm": 0.07201256603002548, |
|
"learning_rate": 0.00045952202622296013, |
|
"loss": 1.0699, |
|
"mean_token_accuracy": 0.6910859402269125, |
|
"num_tokens": 4055872.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.6824240270126177, |
|
"grad_norm": 0.07143343985080719, |
|
"learning_rate": 0.00045736378266067414, |
|
"loss": 1.0064, |
|
"mean_token_accuracy": 0.7026031356304884, |
|
"num_tokens": 4144012.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6966411942420473, |
|
"grad_norm": 0.07650640606880188, |
|
"learning_rate": 0.0004551548325640789, |
|
"loss": 1.0517, |
|
"mean_token_accuracy": 0.6946574732661247, |
|
"num_tokens": 4230019.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7108583614714769, |
|
"grad_norm": 0.07642583549022675, |
|
"learning_rate": 0.0004528957160863412, |
|
"loss": 1.0346, |
|
"mean_token_accuracy": 0.6941149879246951, |
|
"num_tokens": 4311202.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7250755287009063, |
|
"grad_norm": 0.07585732638835907, |
|
"learning_rate": 0.00045058698564778106, |
|
"loss": 1.0548, |
|
"mean_token_accuracy": 0.6841745227575302, |
|
"num_tokens": 4397292.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.7392926959303359, |
|
"grad_norm": 0.07219212502241135, |
|
"learning_rate": 0.00044822920580078887, |
|
"loss": 1.0194, |
|
"mean_token_accuracy": 0.6980217099189758, |
|
"num_tokens": 4484848.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.7535098631597654, |
|
"grad_norm": 0.07723289728164673, |
|
"learning_rate": 0.0004458229530917759, |
|
"loss": 0.9967, |
|
"mean_token_accuracy": 0.6917125687003136, |
|
"num_tokens": 4568171.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.767727030389195, |
|
"grad_norm": 0.07553447037935257, |
|
"learning_rate": 0.00044336881592019163, |
|
"loss": 1.0854, |
|
"mean_token_accuracy": 0.6873999379575253, |
|
"num_tokens": 4655953.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.7819441976186244, |
|
"grad_norm": 0.08117958158254623, |
|
"learning_rate": 0.00044086739439464266, |
|
"loss": 1.0472, |
|
"mean_token_accuracy": 0.68543600179255, |
|
"num_tokens": 4741659.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.796161364848054, |
|
"grad_norm": 0.0754791796207428, |
|
"learning_rate": 0.00043831930018614873, |
|
"loss": 1.0366, |
|
"mean_token_accuracy": 0.6899502877146005, |
|
"num_tokens": 4821791.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8103785320774836, |
|
"grad_norm": 0.07144685089588165, |
|
"learning_rate": 0.00043572515637857126, |
|
"loss": 0.9703, |
|
"mean_token_accuracy": 0.7105959545820951, |
|
"num_tokens": 4911804.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.8245956993069131, |
|
"grad_norm": 0.07971132546663284, |
|
"learning_rate": 0.00043308559731625087, |
|
"loss": 1.0127, |
|
"mean_token_accuracy": 0.6948394205421209, |
|
"num_tokens": 4997188.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.8388128665363427, |
|
"grad_norm": 0.07623772323131561, |
|
"learning_rate": 0.0004304012684488917, |
|
"loss": 0.9698, |
|
"mean_token_accuracy": 0.7132892053574323, |
|
"num_tokens": 5082069.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.8530300337657721, |
|
"grad_norm": 0.07272877544164658, |
|
"learning_rate": 0.0004276728261737298, |
|
"loss": 0.9706, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8530300337657721, |
|
"eval_loss": 1.025159478187561, |
|
"eval_mean_token_accuracy": 0.7048066265583038, |
|
"eval_num_tokens": 5169872.0, |
|
"eval_runtime": 42.4633, |
|
"eval_samples_per_second": 23.526, |
|
"eval_steps_per_second": 5.887, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8672472009952017, |
|
"grad_norm": 0.07478692382574081, |
|
"learning_rate": 0.0004249009376750249, |
|
"loss": 1.046, |
|
"mean_token_accuracy": 0.7029114665463567, |
|
"num_tokens": 5257518.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.8814643682246313, |
|
"grad_norm": 0.07147077471017838, |
|
"learning_rate": 0.0004220862807609143, |
|
"loss": 0.9995, |
|
"mean_token_accuracy": 0.6980900842696428, |
|
"num_tokens": 5343931.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.8956815354540608, |
|
"grad_norm": 0.07459016889333725, |
|
"learning_rate": 0.0004192295436976688, |
|
"loss": 1.019, |
|
"mean_token_accuracy": 0.7001338206231594, |
|
"num_tokens": 5426802.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9098987026834903, |
|
"grad_norm": 0.07691922038793564, |
|
"learning_rate": 0.00041633142504139133, |
|
"loss": 1.0229, |
|
"mean_token_accuracy": 0.697338268905878, |
|
"num_tokens": 5511524.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9241158699129198, |
|
"grad_norm": 0.07407371699810028, |
|
"learning_rate": 0.0004133926334671996, |
|
"loss": 0.9848, |
|
"mean_token_accuracy": 0.7054470591247082, |
|
"num_tokens": 5600023.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9383330371423494, |
|
"grad_norm": 0.07771491259336472, |
|
"learning_rate": 0.000410413887595934, |
|
"loss": 1.067, |
|
"mean_token_accuracy": 0.6928857892751694, |
|
"num_tokens": 5690236.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.952550204371779, |
|
"grad_norm": 0.07632343471050262, |
|
"learning_rate": 0.00040739591581843407, |
|
"loss": 1.037, |
|
"mean_token_accuracy": 0.696436945348978, |
|
"num_tokens": 5776575.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.9667673716012085, |
|
"grad_norm": 0.07429352402687073, |
|
"learning_rate": 0.0004043394561174251, |
|
"loss": 0.9819, |
|
"mean_token_accuracy": 0.7112393327057361, |
|
"num_tokens": 5867257.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.980984538830638, |
|
"grad_norm": 0.07374002784490585, |
|
"learning_rate": 0.0004012452558870602, |
|
"loss": 0.913, |
|
"mean_token_accuracy": 0.7194178324192763, |
|
"num_tokens": 5956262.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.9952017060600675, |
|
"grad_norm": 0.07516616582870483, |
|
"learning_rate": 0.0003981140717501599, |
|
"loss": 1.0199, |
|
"mean_token_accuracy": 0.699269012734294, |
|
"num_tokens": 6042335.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0085303003376578, |
|
"grad_norm": 0.07739484310150146, |
|
"learning_rate": 0.00039494666937319616, |
|
"loss": 0.9484, |
|
"mean_token_accuracy": 0.7045566352208456, |
|
"num_tokens": 6119308.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.0227474675670873, |
|
"grad_norm": 0.08529424667358398, |
|
"learning_rate": 0.0003917438232790641, |
|
"loss": 0.9295, |
|
"mean_token_accuracy": 0.7197049882262945, |
|
"num_tokens": 6203866.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.036964634796517, |
|
"grad_norm": 0.08253555744886398, |
|
"learning_rate": 0.0003885063166576886, |
|
"loss": 0.8767, |
|
"mean_token_accuracy": 0.7232992608100176, |
|
"num_tokens": 6290259.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.0511818020259462, |
|
"grad_norm": 0.07590445876121521, |
|
"learning_rate": 0.0003852349411745113, |
|
"loss": 0.9535, |
|
"mean_token_accuracy": 0.71869598031044, |
|
"num_tokens": 6377505.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.0653989692553758, |
|
"grad_norm": 0.08107644319534302, |
|
"learning_rate": 0.00038193049677690493, |
|
"loss": 0.9614, |
|
"mean_token_accuracy": 0.7086535628885031, |
|
"num_tokens": 6463425.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.0796161364848054, |
|
"grad_norm": 0.07508436590433121, |
|
"learning_rate": 0.00037859379149856334, |
|
"loss": 0.9326, |
|
"mean_token_accuracy": 0.720469256490469, |
|
"num_tokens": 6552684.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.093833303714235, |
|
"grad_norm": 0.0790044292807579, |
|
"learning_rate": 0.00037522564126191274, |
|
"loss": 0.9778, |
|
"mean_token_accuracy": 0.7115771636366844, |
|
"num_tokens": 6636911.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.1080504709436645, |
|
"grad_norm": 0.08326832950115204, |
|
"learning_rate": 0.000371826869678595, |
|
"loss": 1.056, |
|
"mean_token_accuracy": 0.6882848802953958, |
|
"num_tokens": 6718835.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.122267638173094, |
|
"grad_norm": 0.08922722190618515, |
|
"learning_rate": 0.0003683983078480708, |
|
"loss": 0.9371, |
|
"mean_token_accuracy": 0.7138718143105507, |
|
"num_tokens": 6802682.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.1364848054025236, |
|
"grad_norm": 0.07749788463115692, |
|
"learning_rate": 0.00036494079415439086, |
|
"loss": 0.9389, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.1364848054025236, |
|
"eval_loss": 1.0164164304733276, |
|
"eval_mean_token_accuracy": 0.7073304796218872, |
|
"eval_num_tokens": 6890030.0, |
|
"eval_runtime": 42.5558, |
|
"eval_samples_per_second": 23.475, |
|
"eval_steps_per_second": 5.875, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.1507019726319532, |
|
"grad_norm": 0.07486575841903687, |
|
"learning_rate": 0.00036145517406118674, |
|
"loss": 0.9133, |
|
"mean_token_accuracy": 0.7182627562433481, |
|
"num_tokens": 6977677.0, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.1649191398613827, |
|
"grad_norm": 0.08833441883325577, |
|
"learning_rate": 0.00035794229990492987, |
|
"loss": 0.9745, |
|
"mean_token_accuracy": 0.7087426863610744, |
|
"num_tokens": 7058573.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.179136307090812, |
|
"grad_norm": 0.08615554124116898, |
|
"learning_rate": 0.0003544030306865108, |
|
"loss": 0.8777, |
|
"mean_token_accuracy": 0.7278520867228508, |
|
"num_tokens": 7143361.0, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.1933534743202416, |
|
"grad_norm": 0.07825995981693268, |
|
"learning_rate": 0.00035083823186118744, |
|
"loss": 0.9291, |
|
"mean_token_accuracy": 0.7207735061645508, |
|
"num_tokens": 7232054.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.2075706415496712, |
|
"grad_norm": 0.08184744417667389, |
|
"learning_rate": 0.00034724877512695674, |
|
"loss": 0.8692, |
|
"mean_token_accuracy": 0.7367342457175254, |
|
"num_tokens": 7321898.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.2217878087791008, |
|
"grad_norm": 0.08207986503839493, |
|
"learning_rate": 0.0003436355382113982, |
|
"loss": 1.0342, |
|
"mean_token_accuracy": 0.6977195214480162, |
|
"num_tokens": 7407614.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.2360049760085303, |
|
"grad_norm": 0.08287196606397629, |
|
"learning_rate": 0.00033999940465704394, |
|
"loss": 0.8808, |
|
"mean_token_accuracy": 0.727986204251647, |
|
"num_tokens": 7495498.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.2502221432379599, |
|
"grad_norm": 0.08513687551021576, |
|
"learning_rate": 0.0003363412636053269, |
|
"loss": 0.9547, |
|
"mean_token_accuracy": 0.7101028818637133, |
|
"num_tokens": 7584760.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.2644393104673894, |
|
"grad_norm": 0.08163287490606308, |
|
"learning_rate": 0.00033266200957915925, |
|
"loss": 0.9349, |
|
"mean_token_accuracy": 0.7183492567390204, |
|
"num_tokens": 7672425.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.2786564776968188, |
|
"grad_norm": 0.08243776857852936, |
|
"learning_rate": 0.00032896254226419543, |
|
"loss": 0.9292, |
|
"mean_token_accuracy": 0.7134430769830942, |
|
"num_tokens": 7757203.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.2928736449262486, |
|
"grad_norm": 0.09458713978528976, |
|
"learning_rate": 0.00032524376628883253, |
|
"loss": 0.9744, |
|
"mean_token_accuracy": 0.7050179023295641, |
|
"num_tokens": 7846548.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.307090812155678, |
|
"grad_norm": 0.08505059778690338, |
|
"learning_rate": 0.0003215065910030021, |
|
"loss": 0.9786, |
|
"mean_token_accuracy": 0.709782537817955, |
|
"num_tokens": 7930393.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.3213079793851075, |
|
"grad_norm": 0.08586814999580383, |
|
"learning_rate": 0.00031775193025580773, |
|
"loss": 0.8947, |
|
"mean_token_accuracy": 0.7237144611775875, |
|
"num_tokens": 8016877.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.335525146614537, |
|
"grad_norm": 0.0874597579240799, |
|
"learning_rate": 0.00031398070217206127, |
|
"loss": 0.8978, |
|
"mean_token_accuracy": 0.720280421897769, |
|
"num_tokens": 8103878.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.3497423138439666, |
|
"grad_norm": 0.09211765974760056, |
|
"learning_rate": 0.0003101938289277753, |
|
"loss": 0.9288, |
|
"mean_token_accuracy": 0.7201074693351984, |
|
"num_tokens": 8190575.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.3639594810733962, |
|
"grad_norm": 0.0884537398815155, |
|
"learning_rate": 0.00030639223652466336, |
|
"loss": 0.9295, |
|
"mean_token_accuracy": 0.713099530339241, |
|
"num_tokens": 8275179.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.3781766483028257, |
|
"grad_norm": 0.08991611003875732, |
|
"learning_rate": 0.0003025768545637057, |
|
"loss": 0.9265, |
|
"mean_token_accuracy": 0.7252651590853929, |
|
"num_tokens": 8360478.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.3923938155322553, |
|
"grad_norm": 0.08758469671010971, |
|
"learning_rate": 0.0002987486160178344, |
|
"loss": 0.977, |
|
"mean_token_accuracy": 0.7134061496704817, |
|
"num_tokens": 8445197.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.4066109827616846, |
|
"grad_norm": 0.08159046620130539, |
|
"learning_rate": 0.0002949084570037939, |
|
"loss": 0.9115, |
|
"mean_token_accuracy": 0.7211619779467583, |
|
"num_tokens": 8530327.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.4208281499911144, |
|
"grad_norm": 0.08836396783590317, |
|
"learning_rate": 0.00029105731655323344, |
|
"loss": 0.9154, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.4208281499911144, |
|
"eval_loss": 1.0047532320022583, |
|
"eval_mean_token_accuracy": 0.7103532667160034, |
|
"eval_num_tokens": 8619497.0, |
|
"eval_runtime": 42.4552, |
|
"eval_samples_per_second": 23.531, |
|
"eval_steps_per_second": 5.889, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.4350453172205437, |
|
"grad_norm": 0.09082586318254471, |
|
"learning_rate": 0.0002871961363830858, |
|
"loss": 0.9077, |
|
"mean_token_accuracy": 0.7218164600431919, |
|
"num_tokens": 8703507.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.4492624844499733, |
|
"grad_norm": 0.09680253267288208, |
|
"learning_rate": 0.0002833258606652901, |
|
"loss": 0.8928, |
|
"mean_token_accuracy": 0.7247406598180532, |
|
"num_tokens": 8788781.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.4634796516794029, |
|
"grad_norm": 0.08538492023944855, |
|
"learning_rate": 0.0002794474357959138, |
|
"loss": 0.9021, |
|
"mean_token_accuracy": 0.7273614536970854, |
|
"num_tokens": 8873572.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.4776968189088324, |
|
"grad_norm": 0.08687795698642731, |
|
"learning_rate": 0.00027556181016373147, |
|
"loss": 0.9266, |
|
"mean_token_accuracy": 0.7160039469599724, |
|
"num_tokens": 8960541.0, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.491913986138262, |
|
"grad_norm": 0.08219098299741745, |
|
"learning_rate": 0.00027166993391831566, |
|
"loss": 0.9344, |
|
"mean_token_accuracy": 0.7199014227837324, |
|
"num_tokens": 9046332.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.5061311533676913, |
|
"grad_norm": 0.08430644124746323, |
|
"learning_rate": 0.000267772758737697, |
|
"loss": 0.963, |
|
"mean_token_accuracy": 0.7126489922404289, |
|
"num_tokens": 9133194.0, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.520348320597121, |
|
"grad_norm": 0.08255460858345032, |
|
"learning_rate": 0.00026387123759565197, |
|
"loss": 0.8784, |
|
"mean_token_accuracy": 0.7328134395182133, |
|
"num_tokens": 9220168.0, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.5345654878265504, |
|
"grad_norm": 0.09743297845125198, |
|
"learning_rate": 0.00025996632452867166, |
|
"loss": 0.8675, |
|
"mean_token_accuracy": 0.7273001208901405, |
|
"num_tokens": 9306051.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.5487826550559802, |
|
"grad_norm": 0.08803436160087585, |
|
"learning_rate": 0.0002560589744026729, |
|
"loss": 0.8476, |
|
"mean_token_accuracy": 0.7371940363198519, |
|
"num_tokens": 9398040.0, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.5629998222854096, |
|
"grad_norm": 0.08522514998912811, |
|
"learning_rate": 0.00025215014267950463, |
|
"loss": 0.9067, |
|
"mean_token_accuracy": 0.714500817283988, |
|
"num_tokens": 9485987.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.5772169895148391, |
|
"grad_norm": 0.09162591397762299, |
|
"learning_rate": 0.00024824078518331013, |
|
"loss": 0.936, |
|
"mean_token_accuracy": 0.7146482899785042, |
|
"num_tokens": 9571356.0, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.5914341567442687, |
|
"grad_norm": 0.0838090255856514, |
|
"learning_rate": 0.00024433185786679955, |
|
"loss": 0.9111, |
|
"mean_token_accuracy": 0.7223079178482295, |
|
"num_tokens": 9657761.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.6056513239736983, |
|
"grad_norm": 0.08568980544805527, |
|
"learning_rate": 0.00024042431657749118, |
|
"loss": 0.9572, |
|
"mean_token_accuracy": 0.710682961717248, |
|
"num_tokens": 9742840.0, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.6198684912031278, |
|
"grad_norm": 0.08540449291467667, |
|
"learning_rate": 0.00023651911682397937, |
|
"loss": 0.901, |
|
"mean_token_accuracy": 0.7298479046672582, |
|
"num_tokens": 9833491.0, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.6340856584325572, |
|
"grad_norm": 0.09621024876832962, |
|
"learning_rate": 0.0002326172135422839, |
|
"loss": 0.8465, |
|
"mean_token_accuracy": 0.739329730719328, |
|
"num_tokens": 9921422.0, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.648302825661987, |
|
"grad_norm": 0.0809611827135086, |
|
"learning_rate": 0.00022871956086234062, |
|
"loss": 0.8573, |
|
"mean_token_accuracy": 0.7349758796393872, |
|
"num_tokens": 10007862.0, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.6625199928914163, |
|
"grad_norm": 0.08698837459087372, |
|
"learning_rate": 0.00022482711187468823, |
|
"loss": 0.8512, |
|
"mean_token_accuracy": 0.7366860095411539, |
|
"num_tokens": 10100577.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.676737160120846, |
|
"grad_norm": 0.08779753744602203, |
|
"learning_rate": 0.00022094081839741004, |
|
"loss": 0.8936, |
|
"mean_token_accuracy": 0.7239959452301263, |
|
"num_tokens": 10186346.0, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.6909543273502754, |
|
"grad_norm": 0.08322805166244507, |
|
"learning_rate": 0.0002170616307433861, |
|
"loss": 0.9271, |
|
"mean_token_accuracy": 0.7187892116606236, |
|
"num_tokens": 10270881.0, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.705171494579705, |
|
"grad_norm": 0.09833941608667374, |
|
"learning_rate": 0.00021319049748791416, |
|
"loss": 1.0209, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.705171494579705, |
|
"eval_loss": 0.9964284300804138, |
|
"eval_mean_token_accuracy": 0.7130191161632538, |
|
"eval_num_tokens": 10355263.0, |
|
"eval_runtime": 42.5075, |
|
"eval_samples_per_second": 23.502, |
|
"eval_steps_per_second": 5.881, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.7193886618091345, |
|
"grad_norm": 0.09196127951145172, |
|
"learning_rate": 0.00020932836523675493, |
|
"loss": 0.9229, |
|
"mean_token_accuracy": 0.7052626656368375, |
|
"num_tokens": 10439472.0, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.733605829038564, |
|
"grad_norm": 0.09308561682701111, |
|
"learning_rate": 0.00020547617839465924, |
|
"loss": 0.9532, |
|
"mean_token_accuracy": 0.7139888234436512, |
|
"num_tokens": 10524216.0, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.7478229962679936, |
|
"grad_norm": 0.09138765186071396, |
|
"learning_rate": 0.0002016348789344335, |
|
"loss": 0.895, |
|
"mean_token_accuracy": 0.7220640182495117, |
|
"num_tokens": 10611738.0, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.762040163497423, |
|
"grad_norm": 0.08532418310642242, |
|
"learning_rate": 0.0001978054061665993, |
|
"loss": 0.9175, |
|
"mean_token_accuracy": 0.7228887047618627, |
|
"num_tokens": 10697449.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.7762573307268528, |
|
"grad_norm": 0.08684728294610977, |
|
"learning_rate": 0.0001939886965097048, |
|
"loss": 0.9053, |
|
"mean_token_accuracy": 0.7226716388016939, |
|
"num_tokens": 10783296.0, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.790474497956282, |
|
"grad_norm": 0.08659351617097855, |
|
"learning_rate": 0.0001901856832613426, |
|
"loss": 0.9259, |
|
"mean_token_accuracy": 0.7152926828712225, |
|
"num_tokens": 10871421.0, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.804691665185712, |
|
"grad_norm": 0.08151692152023315, |
|
"learning_rate": 0.00018639729636993137, |
|
"loss": 0.8651, |
|
"mean_token_accuracy": 0.7316095266491175, |
|
"num_tokens": 10960209.0, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.8189088324151412, |
|
"grad_norm": 0.09470875561237335, |
|
"learning_rate": 0.00018262446220731582, |
|
"loss": 0.9005, |
|
"mean_token_accuracy": 0.7300491981208325, |
|
"num_tokens": 11046451.0, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.8331259996445708, |
|
"grad_norm": 0.08670168370008469, |
|
"learning_rate": 0.0001788681033422419, |
|
"loss": 0.8818, |
|
"mean_token_accuracy": 0.7283473834395409, |
|
"num_tokens": 11132072.0, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.8473431668740004, |
|
"grad_norm": 0.08462122082710266, |
|
"learning_rate": 0.00017512913831476136, |
|
"loss": 0.9394, |
|
"mean_token_accuracy": 0.7198392864316702, |
|
"num_tokens": 11218335.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.86156033410343, |
|
"grad_norm": 0.0869954526424408, |
|
"learning_rate": 0.000171408481411622, |
|
"loss": 0.8649, |
|
"mean_token_accuracy": 0.7348789256066084, |
|
"num_tokens": 11305589.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.8757775013328595, |
|
"grad_norm": 0.09598379582166672, |
|
"learning_rate": 0.00016770704244269735, |
|
"loss": 0.8799, |
|
"mean_token_accuracy": 0.7312331754714251, |
|
"num_tokens": 11393534.0, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.8899946685622888, |
|
"grad_norm": 0.09157571196556091, |
|
"learning_rate": 0.00016402572651851217, |
|
"loss": 0.9071, |
|
"mean_token_accuracy": 0.7278904471546411, |
|
"num_tokens": 11481006.0, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.9042118357917186, |
|
"grad_norm": 0.08688797056674957, |
|
"learning_rate": 0.0001603654338289151, |
|
"loss": 0.921, |
|
"mean_token_accuracy": 0.725471879914403, |
|
"num_tokens": 11570046.0, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.918429003021148, |
|
"grad_norm": 0.09125315397977829, |
|
"learning_rate": 0.00015672705942295734, |
|
"loss": 0.8608, |
|
"mean_token_accuracy": 0.7313944108784198, |
|
"num_tokens": 11656322.0, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.9326461702505777, |
|
"grad_norm": 0.08270374685525894, |
|
"learning_rate": 0.00015311149299002542, |
|
"loss": 0.8296, |
|
"mean_token_accuracy": 0.740207726508379, |
|
"num_tokens": 11745127.0, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.946863337480007, |
|
"grad_norm": 0.09753109514713287, |
|
"learning_rate": 0.0001495196186422872, |
|
"loss": 0.9149, |
|
"mean_token_accuracy": 0.72291075065732, |
|
"num_tokens": 11826582.0, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.9610805047094366, |
|
"grad_norm": 0.09203537553548813, |
|
"learning_rate": 0.00014595231469849963, |
|
"loss": 0.9261, |
|
"mean_token_accuracy": 0.7244169395416975, |
|
"num_tokens": 11911842.0, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.9752976719388662, |
|
"grad_norm": 0.10030084103345871, |
|
"learning_rate": 0.00014241045346923462, |
|
"loss": 0.8738, |
|
"mean_token_accuracy": 0.7312715597450733, |
|
"num_tokens": 11997173.0, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.9895148391682957, |
|
"grad_norm": 0.09157243371009827, |
|
"learning_rate": 0.00013889490104357276, |
|
"loss": 0.944, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.9895148391682957, |
|
"eval_loss": 0.9828982949256897, |
|
"eval_mean_token_accuracy": 0.7159896125793457, |
|
"eval_num_tokens": 12079318.0, |
|
"eval_runtime": 42.4967, |
|
"eval_samples_per_second": 23.508, |
|
"eval_steps_per_second": 5.883, |
|
"step": 1400 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2109, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2636469427598029e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|