|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 625, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0016, |
|
"grad_norm": 1.5717865228652954, |
|
"learning_rate": 3.1746031746031746e-06, |
|
"loss": 2.7077, |
|
"mean_token_accuracy": 0.459680899977684, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 1.202968716621399, |
|
"learning_rate": 1.5873015873015872e-05, |
|
"loss": 2.2671, |
|
"mean_token_accuracy": 0.5242175050079823, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 1.0542172193527222, |
|
"learning_rate": 3.1746031746031745e-05, |
|
"loss": 2.2698, |
|
"mean_token_accuracy": 0.5283941477537155, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 0.7223343253135681, |
|
"learning_rate": 4.761904761904762e-05, |
|
"loss": 2.1156, |
|
"mean_token_accuracy": 0.5514968127012253, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.6670629382133484, |
|
"learning_rate": 6.349206349206349e-05, |
|
"loss": 1.8644, |
|
"mean_token_accuracy": 0.5947718620300293, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.8882656693458557, |
|
"learning_rate": 7.936507936507937e-05, |
|
"loss": 1.6563, |
|
"mean_token_accuracy": 0.6210890173912048, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 1.0315715074539185, |
|
"learning_rate": 9.523809523809524e-05, |
|
"loss": 1.4437, |
|
"mean_token_accuracy": 0.654322224855423, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 1.4416899681091309, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 1.0037, |
|
"mean_token_accuracy": 0.7415849208831787, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.8182677626609802, |
|
"learning_rate": 0.00012698412698412698, |
|
"loss": 0.6626, |
|
"mean_token_accuracy": 0.853333032131195, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 0.4539111256599426, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 0.4946, |
|
"mean_token_accuracy": 0.8977023303508759, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3059958815574646, |
|
"learning_rate": 0.00015873015873015873, |
|
"loss": 0.4924, |
|
"mean_token_accuracy": 0.8881445944309234, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 0.20895916223526, |
|
"learning_rate": 0.00017460317460317462, |
|
"loss": 0.3816, |
|
"mean_token_accuracy": 0.9159973561763763, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.241449773311615, |
|
"learning_rate": 0.00019047619047619048, |
|
"loss": 0.2424, |
|
"mean_token_accuracy": 0.9461545169353485, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 0.13886623084545135, |
|
"learning_rate": 0.00019999375039475277, |
|
"loss": 0.2525, |
|
"mean_token_accuracy": 0.9451982796192169, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.15808190405368805, |
|
"learning_rate": 0.0001999234513064475, |
|
"loss": 0.3108, |
|
"mean_token_accuracy": 0.9311837553977966, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.1368260234594345, |
|
"learning_rate": 0.00019977509622105233, |
|
"loss": 0.2217, |
|
"mean_token_accuracy": 0.9483723938465118, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.14972065389156342, |
|
"learning_rate": 0.0001995488010273198, |
|
"loss": 0.1707, |
|
"mean_token_accuracy": 0.9621350407600403, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 0.2032196819782257, |
|
"learning_rate": 0.00019924474249753655, |
|
"loss": 0.3796, |
|
"mean_token_accuracy": 0.9232820510864258, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.09374168515205383, |
|
"learning_rate": 0.00019886315814943647, |
|
"loss": 0.3032, |
|
"mean_token_accuracy": 0.9280714929103852, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 0.1895170956850052, |
|
"learning_rate": 0.0001984043460606618, |
|
"loss": 0.2635, |
|
"mean_token_accuracy": 0.9419675827026367, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.17207027971744537, |
|
"learning_rate": 0.0001978686646359173, |
|
"loss": 0.2961, |
|
"mean_token_accuracy": 0.9313458263874054, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 0.1757289320230484, |
|
"learning_rate": 0.0001972565323269996, |
|
"loss": 0.2087, |
|
"mean_token_accuracy": 0.9513824045658111, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.2260744869709015, |
|
"learning_rate": 0.00019656842730592046, |
|
"loss": 0.3184, |
|
"mean_token_accuracy": 0.9232833862304688, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 0.052938517183065414, |
|
"learning_rate": 0.0001958048870913786, |
|
"loss": 0.1822, |
|
"mean_token_accuracy": 0.9538747906684876, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.1941942721605301, |
|
"learning_rate": 0.0001949665081288729, |
|
"loss": 0.2273, |
|
"mean_token_accuracy": 0.9453578472137452, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.2198331207036972, |
|
"learning_rate": 0.00019405394532478424, |
|
"loss": 0.2049, |
|
"mean_token_accuracy": 0.9542803287506103, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.24331390857696533, |
|
"learning_rate": 0.00019306791153479006, |
|
"loss": 0.2061, |
|
"mean_token_accuracy": 0.9513079404830933, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 0.2335432767868042, |
|
"learning_rate": 0.00019200917700701176, |
|
"loss": 0.1368, |
|
"mean_token_accuracy": 0.9651266932487488, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.23610533773899078, |
|
"learning_rate": 0.0001908785687803289, |
|
"loss": 0.3442, |
|
"mean_token_accuracy": 0.9188908874988556, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 0.22944550216197968, |
|
"learning_rate": 0.00018967697003833157, |
|
"loss": 0.337, |
|
"mean_token_accuracy": 0.9214253067970276, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.21961958706378937, |
|
"learning_rate": 0.0001884053194194142, |
|
"loss": 0.223, |
|
"mean_token_accuracy": 0.9437191367149353, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 0.20730167627334595, |
|
"learning_rate": 0.00018706461028355104, |
|
"loss": 0.2741, |
|
"mean_token_accuracy": 0.938022255897522, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.16260486841201782, |
|
"learning_rate": 0.00018565588993632487, |
|
"loss": 0.2648, |
|
"mean_token_accuracy": 0.9357350587844848, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 0.22049878537654877, |
|
"learning_rate": 0.0001841802588108161, |
|
"loss": 0.3412, |
|
"mean_token_accuracy": 0.9166976153850556, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 0.19371896982192993, |
|
"learning_rate": 0.00018263886960799062, |
|
"loss": 0.2456, |
|
"mean_token_accuracy": 0.945724493265152, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.22933930158615112, |
|
"learning_rate": 0.00018103292639625837, |
|
"loss": 0.3841, |
|
"mean_token_accuracy": 0.9089005470275879, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.1793544441461563, |
|
"learning_rate": 0.0001793636836709057, |
|
"loss": 0.2999, |
|
"mean_token_accuracy": 0.9297621667385101, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 0.0395968072116375, |
|
"learning_rate": 0.0001776324453741365, |
|
"loss": 0.1624, |
|
"mean_token_accuracy": 0.9584607481956482, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.2473573386669159, |
|
"learning_rate": 0.00017584056387648727, |
|
"loss": 0.3053, |
|
"mean_token_accuracy": 0.9382229685783386, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 0.04702218249440193, |
|
"learning_rate": 0.0001739894389204122, |
|
"loss": 0.172, |
|
"mean_token_accuracy": 0.9594997644424439, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.23656697571277618, |
|
"learning_rate": 0.00017208051652686335, |
|
"loss": 0.2276, |
|
"mean_token_accuracy": 0.9398577272891998, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 0.07044972479343414, |
|
"learning_rate": 0.00017011528786571969, |
|
"loss": 0.2854, |
|
"mean_token_accuracy": 0.926360809803009, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.03222399577498436, |
|
"learning_rate": 0.00016809528809094807, |
|
"loss": 0.2172, |
|
"mean_token_accuracy": 0.9438831508159637, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 0.243390753865242, |
|
"learning_rate": 0.0001660220951414055, |
|
"loss": 0.2852, |
|
"mean_token_accuracy": 0.9389818787574769, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.23130089044570923, |
|
"learning_rate": 0.00016389732850821966, |
|
"loss": 0.2899, |
|
"mean_token_accuracy": 0.9331403017044068, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.2532561123371124, |
|
"learning_rate": 0.0001617226479697105, |
|
"loss": 0.3069, |
|
"mean_token_accuracy": 0.9253335416316986, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.20991413295269012, |
|
"learning_rate": 0.00015949975229484134, |
|
"loss": 0.3178, |
|
"mean_token_accuracy": 0.9201959311962128, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"grad_norm": 0.20208591222763062, |
|
"learning_rate": 0.00015723037791621193, |
|
"loss": 0.18, |
|
"mean_token_accuracy": 0.9534633576869964, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.23704850673675537, |
|
"learning_rate": 0.00015491629757363032, |
|
"loss": 0.3864, |
|
"mean_token_accuracy": 0.9041174411773681, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"grad_norm": 0.22486881911754608, |
|
"learning_rate": 0.00015255931892932333, |
|
"loss": 0.3174, |
|
"mean_token_accuracy": 0.9177540898323059, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.2111813724040985, |
|
"learning_rate": 0.0001501612831558664, |
|
"loss": 0.3145, |
|
"mean_token_accuracy": 0.9238011240959167, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.408, |
|
"grad_norm": 0.23434004187583923, |
|
"learning_rate": 0.00014772406349793744, |
|
"loss": 0.2928, |
|
"mean_token_accuracy": 0.929435807466507, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.25533345341682434, |
|
"learning_rate": 0.0001452495638090167, |
|
"loss": 0.3171, |
|
"mean_token_accuracy": 0.927526468038559, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.424, |
|
"grad_norm": 0.1922113448381424, |
|
"learning_rate": 0.00014273971706417647, |
|
"loss": 0.2162, |
|
"mean_token_accuracy": 0.9465757727622985, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.2513563334941864, |
|
"learning_rate": 0.00014019648385012244, |
|
"loss": 0.3813, |
|
"mean_token_accuracy": 0.908132141828537, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.25622016191482544, |
|
"learning_rate": 0.00013762185083366556, |
|
"loss": 0.3005, |
|
"mean_token_accuracy": 0.9310987770557404, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.20694169402122498, |
|
"learning_rate": 0.00013501782920982184, |
|
"loss": 0.1721, |
|
"mean_token_accuracy": 0.9563108265399933, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.456, |
|
"grad_norm": 0.21768316626548767, |
|
"learning_rate": 0.00013238645313075104, |
|
"loss": 0.2444, |
|
"mean_token_accuracy": 0.9432082533836365, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.1706106811761856, |
|
"learning_rate": 0.00012972977811676287, |
|
"loss": 0.233, |
|
"mean_token_accuracy": 0.9396327018737793, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.472, |
|
"grad_norm": 0.2342345267534256, |
|
"learning_rate": 0.00012704987945063068, |
|
"loss": 0.2106, |
|
"mean_token_accuracy": 0.9517422139644622, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.2374894917011261, |
|
"learning_rate": 0.00012434885055646823, |
|
"loss": 0.2222, |
|
"mean_token_accuracy": 0.9430564045906067, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.488, |
|
"grad_norm": 0.20393089950084686, |
|
"learning_rate": 0.00012162880136443447, |
|
"loss": 0.3069, |
|
"mean_token_accuracy": 0.9208929121494294, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.2132725864648819, |
|
"learning_rate": 0.00011889185666254506, |
|
"loss": 0.2821, |
|
"mean_token_accuracy": 0.9318405151367187, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.504, |
|
"grad_norm": 0.15664634108543396, |
|
"learning_rate": 0.00011614015443687722, |
|
"loss": 0.16, |
|
"mean_token_accuracy": 0.9559766769409179, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.22459107637405396, |
|
"learning_rate": 0.0001133758442014651, |
|
"loss": 0.1543, |
|
"mean_token_accuracy": 0.9638924479484559, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.2619342505931854, |
|
"learning_rate": 0.00011060108531918971, |
|
"loss": 0.2865, |
|
"mean_token_accuracy": 0.9385590136051178, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 0.21467000246047974, |
|
"learning_rate": 0.0001078180453149754, |
|
"loss": 0.2918, |
|
"mean_token_accuracy": 0.9376808702945709, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.536, |
|
"grad_norm": 0.2496771514415741, |
|
"learning_rate": 0.00010502889818261075, |
|
"loss": 0.4184, |
|
"mean_token_accuracy": 0.8969966411590576, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.2137787938117981, |
|
"learning_rate": 0.00010223582268651586, |
|
"loss": 0.257, |
|
"mean_token_accuracy": 0.9423645496368408, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.552, |
|
"grad_norm": 0.2176610231399536, |
|
"learning_rate": 9.94410006597835e-05, |
|
"loss": 0.3322, |
|
"mean_token_accuracy": 0.9129085004329681, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.21382386982440948, |
|
"learning_rate": 9.66466152998226e-05, |
|
"loss": 0.3399, |
|
"mean_token_accuracy": 0.9135319530963898, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.568, |
|
"grad_norm": 0.22343839704990387, |
|
"learning_rate": 9.385484946293637e-05, |
|
"loss": 0.2464, |
|
"mean_token_accuracy": 0.9397193789482117, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.18257169425487518, |
|
"learning_rate": 9.106788395916678e-05, |
|
"loss": 0.1896, |
|
"mean_token_accuracy": 0.951214075088501, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.584, |
|
"grad_norm": 0.04507756605744362, |
|
"learning_rate": 8.828789584873754e-05, |
|
"loss": 0.2693, |
|
"mean_token_accuracy": 0.9345629513263702, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 0.23864690959453583, |
|
"learning_rate": 8.551705674142617e-05, |
|
"loss": 0.2743, |
|
"mean_token_accuracy": 0.9325626492500305, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.035314541310071945, |
|
"learning_rate": 8.275753110019367e-05, |
|
"loss": 0.1799, |
|
"mean_token_accuracy": 0.9588949978351593, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.19131699204444885, |
|
"learning_rate": 8.001147455039735e-05, |
|
"loss": 0.2339, |
|
"mean_token_accuracy": 0.9403933227062226, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.616, |
|
"grad_norm": 0.20205603539943695, |
|
"learning_rate": 7.728103219590681e-05, |
|
"loss": 0.2315, |
|
"mean_token_accuracy": 0.9407491445541382, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.23396767675876617, |
|
"learning_rate": 7.456833694343906e-05, |
|
"loss": 0.2248, |
|
"mean_token_accuracy": 0.9434651434421539, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.632, |
|
"grad_norm": 0.05149530991911888, |
|
"learning_rate": 7.18755078364214e-05, |
|
"loss": 0.1834, |
|
"mean_token_accuracy": 0.950703501701355, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.1885431706905365, |
|
"learning_rate": 6.920464839968405e-05, |
|
"loss": 0.2292, |
|
"mean_token_accuracy": 0.9455470621585846, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.648, |
|
"grad_norm": 0.22768881916999817, |
|
"learning_rate": 6.65578449962749e-05, |
|
"loss": 0.3276, |
|
"mean_token_accuracy": 0.9167207598686218, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.2584383189678192, |
|
"learning_rate": 6.393716519768047e-05, |
|
"loss": 0.2714, |
|
"mean_token_accuracy": 0.9378273606300354, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.664, |
|
"grad_norm": 0.27143898606300354, |
|
"learning_rate": 6.134465616872598e-05, |
|
"loss": 0.3238, |
|
"mean_token_accuracy": 0.9266472160816193, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.16849137842655182, |
|
"learning_rate": 5.878234306841637e-05, |
|
"loss": 0.1855, |
|
"mean_token_accuracy": 0.9501271903514862, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.23237672448158264, |
|
"learning_rate": 5.62522274679673e-05, |
|
"loss": 0.2252, |
|
"mean_token_accuracy": 0.9437941908836365, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.19916483759880066, |
|
"learning_rate": 5.375628578726181e-05, |
|
"loss": 0.1561, |
|
"mean_token_accuracy": 0.9600933134555817, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.696, |
|
"grad_norm": 0.22922399640083313, |
|
"learning_rate": 5.1296467750954314e-05, |
|
"loss": 0.3147, |
|
"mean_token_accuracy": 0.925593638420105, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.2341531664133072, |
|
"learning_rate": 4.8874694865427676e-05, |
|
"loss": 0.2745, |
|
"mean_token_accuracy": 0.9313132464885712, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.712, |
|
"grad_norm": 0.24038153886795044, |
|
"learning_rate": 4.649285891779327e-05, |
|
"loss": 0.2954, |
|
"mean_token_accuracy": 0.9240333676338196, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.2073904424905777, |
|
"learning_rate": 4.415282049810644e-05, |
|
"loss": 0.2418, |
|
"mean_token_accuracy": 0.93941730260849, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.728, |
|
"grad_norm": 0.232681542634964, |
|
"learning_rate": 4.1856407545951834e-05, |
|
"loss": 0.3575, |
|
"mean_token_accuracy": 0.9102162063121796, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.20729133486747742, |
|
"learning_rate": 3.9605413922533874e-05, |
|
"loss": 0.2844, |
|
"mean_token_accuracy": 0.9322370827198029, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.744, |
|
"grad_norm": 0.20540155470371246, |
|
"learning_rate": 3.740159800938784e-05, |
|
"loss": 0.1801, |
|
"mean_token_accuracy": 0.9571804940700531, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.27062955498695374, |
|
"learning_rate": 3.5246681334806175e-05, |
|
"loss": 0.1628, |
|
"mean_token_accuracy": 0.9602212190628052, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.2563348710536957, |
|
"learning_rate": 3.3142347229053015e-05, |
|
"loss": 0.2317, |
|
"mean_token_accuracy": 0.9395133197307587, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.23781529068946838, |
|
"learning_rate": 3.109023950941736e-05, |
|
"loss": 0.3099, |
|
"mean_token_accuracy": 0.9263223648071289, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.776, |
|
"grad_norm": 0.04337846860289574, |
|
"learning_rate": 2.909196119613218e-05, |
|
"loss": 0.1425, |
|
"mean_token_accuracy": 0.9617048621177673, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 0.20578506588935852, |
|
"learning_rate": 2.7149073260162416e-05, |
|
"loss": 0.2547, |
|
"mean_token_accuracy": 0.9427634000778198, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.792, |
|
"grad_norm": 0.2657724618911743, |
|
"learning_rate": 2.5263093403840142e-05, |
|
"loss": 0.2759, |
|
"mean_token_accuracy": 0.9276242256164551, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.20405757427215576, |
|
"learning_rate": 2.3435494875299314e-05, |
|
"loss": 0.1946, |
|
"mean_token_accuracy": 0.9477591276168823, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.808, |
|
"grad_norm": 0.2120545208454132, |
|
"learning_rate": 2.166770531763633e-05, |
|
"loss": 0.1492, |
|
"mean_token_accuracy": 0.9617019176483155, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.1790059208869934, |
|
"learning_rate": 1.9961105653695266e-05, |
|
"loss": 0.2097, |
|
"mean_token_accuracy": 0.9475365877151489, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.824, |
|
"grad_norm": 0.23069578409194946, |
|
"learning_rate": 1.8317029007349085e-05, |
|
"loss": 0.3262, |
|
"mean_token_accuracy": 0.9245599567890167, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.1952206939458847, |
|
"learning_rate": 1.6736759662119183e-05, |
|
"loss": 0.2593, |
|
"mean_token_accuracy": 0.9349613726139069, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.16667886078357697, |
|
"learning_rate": 1.5221532057947419e-05, |
|
"loss": 0.1832, |
|
"mean_token_accuracy": 0.9498234987258911, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.16531656682491302, |
|
"learning_rate": 1.3772529826903269e-05, |
|
"loss": 0.3003, |
|
"mean_token_accuracy": 0.927387660741806, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.856, |
|
"grad_norm": 0.03590136766433716, |
|
"learning_rate": 1.23908848685804e-05, |
|
"loss": 0.209, |
|
"mean_token_accuracy": 0.9517148792743683, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.20711013674736023, |
|
"learning_rate": 1.1077676465904208e-05, |
|
"loss": 0.229, |
|
"mean_token_accuracy": 0.9443170964717865, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.872, |
|
"grad_norm": 0.2173730880022049, |
|
"learning_rate": 9.833930442041506e-06, |
|
"loss": 0.1711, |
|
"mean_token_accuracy": 0.95446497797966, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.25314000248908997, |
|
"learning_rate": 8.660618359070604e-06, |
|
"loss": 0.2478, |
|
"mean_token_accuracy": 0.9370902240276336, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.888, |
|
"grad_norm": 0.21840238571166992, |
|
"learning_rate": 7.558656759037797e-06, |
|
"loss": 0.184, |
|
"mean_token_accuracy": 0.9521986067295074, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.21549563109874725, |
|
"learning_rate": 6.528906447993288e-06, |
|
"loss": 0.2443, |
|
"mean_token_accuracy": 0.9441956281661987, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.904, |
|
"grad_norm": 0.1734277904033661, |
|
"learning_rate": 5.572171823565797e-06, |
|
"loss": 0.22, |
|
"mean_token_accuracy": 0.9439444482326508, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 0.2154768407344818, |
|
"learning_rate": 4.689200246600867e-06, |
|
"loss": 0.1987, |
|
"mean_token_accuracy": 0.9493344485759735, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.1548367738723755, |
|
"learning_rate": 3.880681457354118e-06, |
|
"loss": 0.1942, |
|
"mean_token_accuracy": 0.9513484239578247, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.18485531210899353, |
|
"learning_rate": 3.1472470366950334e-06, |
|
"loss": 0.2624, |
|
"mean_token_accuracy": 0.9301692008972168, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.936, |
|
"grad_norm": 0.24809648096561432, |
|
"learning_rate": 2.4894699127426367e-06, |
|
"loss": 0.2501, |
|
"mean_token_accuracy": 0.9349741101264953, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 0.2469683289527893, |
|
"learning_rate": 1.907863913318153e-06, |
|
"loss": 0.1806, |
|
"mean_token_accuracy": 0.9534947514533997, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.952, |
|
"grad_norm": 0.21619068086147308, |
|
"learning_rate": 1.4028833645643113e-06, |
|
"loss": 0.1114, |
|
"mean_token_accuracy": 0.9707520604133606, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.20197683572769165, |
|
"learning_rate": 9.749227360448143e-07, |
|
"loss": 0.202, |
|
"mean_token_accuracy": 0.947997921705246, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.968, |
|
"grad_norm": 0.21867972612380981, |
|
"learning_rate": 6.243163326014267e-07, |
|
"loss": 0.2394, |
|
"mean_token_accuracy": 0.9438014030456543, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.2265222817659378, |
|
"learning_rate": 3.5133803320896994e-07, |
|
"loss": 0.2911, |
|
"mean_token_accuracy": 0.9275119185447693, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.984, |
|
"grad_norm": 0.19685231149196625, |
|
"learning_rate": 1.562010770326916e-07, |
|
"loss": 0.2725, |
|
"mean_token_accuracy": 0.9320066690444946, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.15336309373378754, |
|
"learning_rate": 3.905789685471062e-08, |
|
"loss": 0.3007, |
|
"mean_token_accuracy": 0.9265518903732299, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.23042897880077362, |
|
"learning_rate": 0.0, |
|
"loss": 0.2386, |
|
"mean_token_accuracy": 0.9401108622550964, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 625, |
|
"total_flos": 1.5047606448160768e+16, |
|
"train_loss": 0.3481092903137207, |
|
"train_runtime": 972.7733, |
|
"train_samples_per_second": 5.14, |
|
"train_steps_per_second": 0.642 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 625, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.5047606448160768e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|