|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 30.0, |
|
"eval_steps": 500, |
|
"global_step": 4170, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007194244604316547, |
|
"grad_norm": 0.3506069883321638, |
|
"learning_rate": 4.796163069544364e-07, |
|
"loss": 1.8211, |
|
"mean_token_accuracy": 0.6063699722290039, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03597122302158273, |
|
"grad_norm": 0.3397364335493786, |
|
"learning_rate": 2.3980815347721824e-06, |
|
"loss": 1.8489, |
|
"mean_token_accuracy": 0.6016613021492958, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.07194244604316546, |
|
"grad_norm": 0.36588885578293806, |
|
"learning_rate": 4.796163069544365e-06, |
|
"loss": 1.8553, |
|
"mean_token_accuracy": 0.602922260761261, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1079136690647482, |
|
"grad_norm": 0.38953277553950383, |
|
"learning_rate": 7.1942446043165465e-06, |
|
"loss": 1.854, |
|
"mean_token_accuracy": 0.6022201240062713, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.14388489208633093, |
|
"grad_norm": 0.38828154068570925, |
|
"learning_rate": 9.59232613908873e-06, |
|
"loss": 1.8273, |
|
"mean_token_accuracy": 0.6043285429477692, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.17985611510791366, |
|
"grad_norm": 0.4678851058069788, |
|
"learning_rate": 1.1990407673860912e-05, |
|
"loss": 1.797, |
|
"mean_token_accuracy": 0.6082902371883392, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2158273381294964, |
|
"grad_norm": 0.49705633435698987, |
|
"learning_rate": 1.4388489208633093e-05, |
|
"loss": 1.7648, |
|
"mean_token_accuracy": 0.6104614853858947, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2517985611510791, |
|
"grad_norm": 0.5253836453595289, |
|
"learning_rate": 1.6786570743405277e-05, |
|
"loss": 1.7535, |
|
"mean_token_accuracy": 0.6107279539108277, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.28776978417266186, |
|
"grad_norm": 0.4197047432820652, |
|
"learning_rate": 1.918465227817746e-05, |
|
"loss": 1.6591, |
|
"mean_token_accuracy": 0.6199684083461762, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.3237410071942446, |
|
"grad_norm": 0.2687351382925973, |
|
"learning_rate": 2.1582733812949642e-05, |
|
"loss": 1.6015, |
|
"mean_token_accuracy": 0.6256727695465087, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3597122302158273, |
|
"grad_norm": 0.2514281363945216, |
|
"learning_rate": 2.3980815347721824e-05, |
|
"loss": 1.5121, |
|
"mean_token_accuracy": 0.6378357112407684, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.39568345323741005, |
|
"grad_norm": 0.27620691115174834, |
|
"learning_rate": 2.637889688249401e-05, |
|
"loss": 1.4599, |
|
"mean_token_accuracy": 0.6464233458042145, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.4316546762589928, |
|
"grad_norm": 0.2747144748462002, |
|
"learning_rate": 2.8776978417266186e-05, |
|
"loss": 1.3595, |
|
"mean_token_accuracy": 0.6629432022571564, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4676258992805755, |
|
"grad_norm": 0.2803337874474452, |
|
"learning_rate": 3.117505995203837e-05, |
|
"loss": 1.2729, |
|
"mean_token_accuracy": 0.6793328762054444, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5035971223021583, |
|
"grad_norm": 0.3141630297057898, |
|
"learning_rate": 3.3573141486810554e-05, |
|
"loss": 1.1426, |
|
"mean_token_accuracy": 0.7037691950798035, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.539568345323741, |
|
"grad_norm": 0.3554897054791459, |
|
"learning_rate": 3.597122302158273e-05, |
|
"loss": 0.9772, |
|
"mean_token_accuracy": 0.7396033108234406, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5755395683453237, |
|
"grad_norm": 0.3922829203034533, |
|
"learning_rate": 3.836930455635492e-05, |
|
"loss": 0.7946, |
|
"mean_token_accuracy": 0.791416597366333, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6115107913669064, |
|
"grad_norm": 0.4415520616858967, |
|
"learning_rate": 4.0767386091127105e-05, |
|
"loss": 0.5796, |
|
"mean_token_accuracy": 0.852098262310028, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.6474820143884892, |
|
"grad_norm": 0.3221304026208011, |
|
"learning_rate": 4.3165467625899284e-05, |
|
"loss": 0.3595, |
|
"mean_token_accuracy": 0.916002345085144, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6834532374100719, |
|
"grad_norm": 0.2579065417189077, |
|
"learning_rate": 4.556354916067146e-05, |
|
"loss": 0.2257, |
|
"mean_token_accuracy": 0.9520921051502228, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7194244604316546, |
|
"grad_norm": 0.15356241858989592, |
|
"learning_rate": 4.796163069544365e-05, |
|
"loss": 0.1586, |
|
"mean_token_accuracy": 0.9685133516788482, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7553956834532374, |
|
"grad_norm": 0.12878276526429025, |
|
"learning_rate": 5.035971223021583e-05, |
|
"loss": 0.1404, |
|
"mean_token_accuracy": 0.9713728368282318, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.7913669064748201, |
|
"grad_norm": 0.10471757647129615, |
|
"learning_rate": 5.275779376498802e-05, |
|
"loss": 0.1271, |
|
"mean_token_accuracy": 0.9753898620605469, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8273381294964028, |
|
"grad_norm": 0.09680394845041788, |
|
"learning_rate": 5.515587529976019e-05, |
|
"loss": 0.1277, |
|
"mean_token_accuracy": 0.9750036299228668, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.8633093525179856, |
|
"grad_norm": 0.12123784922225729, |
|
"learning_rate": 5.755395683453237e-05, |
|
"loss": 0.1224, |
|
"mean_token_accuracy": 0.9754109263420105, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8992805755395683, |
|
"grad_norm": 0.11686026875002653, |
|
"learning_rate": 5.9952038369304564e-05, |
|
"loss": 0.1156, |
|
"mean_token_accuracy": 0.9775736808776856, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.935251798561151, |
|
"grad_norm": 0.08598616604099492, |
|
"learning_rate": 6.235011990407674e-05, |
|
"loss": 0.1399, |
|
"mean_token_accuracy": 0.9725452423095703, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9712230215827338, |
|
"grad_norm": 0.1673532970509405, |
|
"learning_rate": 6.474820143884892e-05, |
|
"loss": 0.0929, |
|
"mean_token_accuracy": 0.9821974813938141, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.12023145705461502, |
|
"eval_mean_token_accuracy": 0.9781519497434298, |
|
"eval_runtime": 20.7288, |
|
"eval_samples_per_second": 5.886, |
|
"eval_steps_per_second": 0.772, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.0071942446043165, |
|
"grad_norm": 0.08888350379847303, |
|
"learning_rate": 6.714628297362111e-05, |
|
"loss": 0.111, |
|
"mean_token_accuracy": 0.9802520871162415, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.0431654676258992, |
|
"grad_norm": 0.0879355109627538, |
|
"learning_rate": 6.954436450839329e-05, |
|
"loss": 0.1106, |
|
"mean_token_accuracy": 0.9783557474613189, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.079136690647482, |
|
"grad_norm": 0.07545083881475075, |
|
"learning_rate": 7.194244604316547e-05, |
|
"loss": 0.0989, |
|
"mean_token_accuracy": 0.9803751826286315, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.1151079136690647, |
|
"grad_norm": 0.06702405978093251, |
|
"learning_rate": 7.434052757793766e-05, |
|
"loss": 0.0984, |
|
"mean_token_accuracy": 0.980546236038208, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.1510791366906474, |
|
"grad_norm": 0.08746346415813978, |
|
"learning_rate": 7.673860911270984e-05, |
|
"loss": 0.0971, |
|
"mean_token_accuracy": 0.980619478225708, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.1870503597122302, |
|
"grad_norm": 0.07148480917132531, |
|
"learning_rate": 7.913669064748202e-05, |
|
"loss": 0.0995, |
|
"mean_token_accuracy": 0.9798974812030792, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.223021582733813, |
|
"grad_norm": 0.07231936051146864, |
|
"learning_rate": 8.153477218225421e-05, |
|
"loss": 0.1026, |
|
"mean_token_accuracy": 0.979968684911728, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.2589928057553956, |
|
"grad_norm": 0.06885790662310835, |
|
"learning_rate": 8.393285371702639e-05, |
|
"loss": 0.0943, |
|
"mean_token_accuracy": 0.9808494627475739, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.2949640287769784, |
|
"grad_norm": 0.08334798597727301, |
|
"learning_rate": 8.633093525179857e-05, |
|
"loss": 0.0925, |
|
"mean_token_accuracy": 0.9816609919071198, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.330935251798561, |
|
"grad_norm": 0.09251301084879311, |
|
"learning_rate": 8.872901678657075e-05, |
|
"loss": 0.1132, |
|
"mean_token_accuracy": 0.9775943398475647, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.3669064748201438, |
|
"grad_norm": 0.07084603124056196, |
|
"learning_rate": 9.112709832134293e-05, |
|
"loss": 0.0955, |
|
"mean_token_accuracy": 0.9806205093860626, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.4028776978417266, |
|
"grad_norm": 0.0771787796949035, |
|
"learning_rate": 9.35251798561151e-05, |
|
"loss": 0.1044, |
|
"mean_token_accuracy": 0.9783063352108001, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.4388489208633093, |
|
"grad_norm": 0.07306767327642648, |
|
"learning_rate": 9.59232613908873e-05, |
|
"loss": 0.0852, |
|
"mean_token_accuracy": 0.9823802232742309, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.474820143884892, |
|
"grad_norm": 0.08702124943881479, |
|
"learning_rate": 9.832134292565948e-05, |
|
"loss": 0.0793, |
|
"mean_token_accuracy": 0.9833337783813476, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.5107913669064748, |
|
"grad_norm": 0.09562766038385109, |
|
"learning_rate": 0.00010071942446043166, |
|
"loss": 0.0845, |
|
"mean_token_accuracy": 0.982536792755127, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.5467625899280577, |
|
"grad_norm": 0.07345574083799765, |
|
"learning_rate": 0.00010311750599520385, |
|
"loss": 0.0698, |
|
"mean_token_accuracy": 0.9853514194488525, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.5827338129496402, |
|
"grad_norm": 0.06101323873063209, |
|
"learning_rate": 0.00010551558752997604, |
|
"loss": 0.0818, |
|
"mean_token_accuracy": 0.9826856195926666, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.6187050359712232, |
|
"grad_norm": 0.06705744022149719, |
|
"learning_rate": 0.0001079136690647482, |
|
"loss": 0.0901, |
|
"mean_token_accuracy": 0.9815958976745606, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.6546762589928057, |
|
"grad_norm": 0.06132406862414683, |
|
"learning_rate": 0.00011031175059952039, |
|
"loss": 0.0855, |
|
"mean_token_accuracy": 0.9825255811214447, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.6906474820143886, |
|
"grad_norm": 0.07399014413697551, |
|
"learning_rate": 0.00011270983213429258, |
|
"loss": 0.0788, |
|
"mean_token_accuracy": 0.9834049463272094, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.7266187050359711, |
|
"grad_norm": 0.058894526105802536, |
|
"learning_rate": 0.00011510791366906474, |
|
"loss": 0.0704, |
|
"mean_token_accuracy": 0.9853868961334229, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.762589928057554, |
|
"grad_norm": 0.08305627567650643, |
|
"learning_rate": 0.00011750599520383694, |
|
"loss": 0.0856, |
|
"mean_token_accuracy": 0.9817408621311188, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.7985611510791366, |
|
"grad_norm": 0.05855661629998082, |
|
"learning_rate": 0.00011990407673860913, |
|
"loss": 0.0718, |
|
"mean_token_accuracy": 0.9844718694686889, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.8345323741007196, |
|
"grad_norm": 0.0670672867431674, |
|
"learning_rate": 0.0001223021582733813, |
|
"loss": 0.0829, |
|
"mean_token_accuracy": 0.9828297436237335, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.870503597122302, |
|
"grad_norm": 0.07172440002334786, |
|
"learning_rate": 0.00012470023980815347, |
|
"loss": 0.0712, |
|
"mean_token_accuracy": 0.9848017036914826, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.906474820143885, |
|
"grad_norm": 0.08171945353658899, |
|
"learning_rate": 0.00012709832134292568, |
|
"loss": 0.0899, |
|
"mean_token_accuracy": 0.9812785029411316, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.9424460431654675, |
|
"grad_norm": 0.09215495770516072, |
|
"learning_rate": 0.00012949640287769783, |
|
"loss": 0.0901, |
|
"mean_token_accuracy": 0.9818152070045472, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.9784172661870505, |
|
"grad_norm": 0.05819449472830757, |
|
"learning_rate": 0.00013189448441247004, |
|
"loss": 0.0855, |
|
"mean_token_accuracy": 0.9816466629505157, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.09057755023241043, |
|
"eval_mean_token_accuracy": 0.9828948188911785, |
|
"eval_runtime": 20.6375, |
|
"eval_samples_per_second": 5.912, |
|
"eval_steps_per_second": 0.775, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.014388489208633, |
|
"grad_norm": 0.0579264171607264, |
|
"learning_rate": 0.00013429256594724222, |
|
"loss": 0.0807, |
|
"mean_token_accuracy": 0.9847154915332794, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.050359712230216, |
|
"grad_norm": 0.06381845611677527, |
|
"learning_rate": 0.0001366906474820144, |
|
"loss": 0.0721, |
|
"mean_token_accuracy": 0.984616607427597, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.0863309352517985, |
|
"grad_norm": 0.07718475085953005, |
|
"learning_rate": 0.00013908872901678657, |
|
"loss": 0.0841, |
|
"mean_token_accuracy": 0.9817797482013703, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.1223021582733814, |
|
"grad_norm": 0.05892985671753617, |
|
"learning_rate": 0.00014148681055155878, |
|
"loss": 0.0751, |
|
"mean_token_accuracy": 0.9831727027893067, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.158273381294964, |
|
"grad_norm": 0.0804925115008608, |
|
"learning_rate": 0.00014388489208633093, |
|
"loss": 0.0749, |
|
"mean_token_accuracy": 0.9842367172241211, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.194244604316547, |
|
"grad_norm": 0.05121626528606145, |
|
"learning_rate": 0.0001462829736211031, |
|
"loss": 0.0773, |
|
"mean_token_accuracy": 0.9835640609264373, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.2302158273381294, |
|
"grad_norm": 0.08889974111718164, |
|
"learning_rate": 0.00014868105515587532, |
|
"loss": 0.0791, |
|
"mean_token_accuracy": 0.9834680020809173, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.2661870503597124, |
|
"grad_norm": 0.053476424317901526, |
|
"learning_rate": 0.00015107913669064747, |
|
"loss": 0.077, |
|
"mean_token_accuracy": 0.9838110446929932, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.302158273381295, |
|
"grad_norm": 0.05633921643284814, |
|
"learning_rate": 0.00015347721822541968, |
|
"loss": 0.0829, |
|
"mean_token_accuracy": 0.982527244091034, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.338129496402878, |
|
"grad_norm": 0.056650154444109466, |
|
"learning_rate": 0.00015587529976019186, |
|
"loss": 0.0796, |
|
"mean_token_accuracy": 0.9829414904117584, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.3741007194244603, |
|
"grad_norm": 0.06044924727673958, |
|
"learning_rate": 0.00015827338129496403, |
|
"loss": 0.0601, |
|
"mean_token_accuracy": 0.9872002065181732, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.4100719424460433, |
|
"grad_norm": 0.05992425734936301, |
|
"learning_rate": 0.0001606714628297362, |
|
"loss": 0.0792, |
|
"mean_token_accuracy": 0.9831002652645111, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.446043165467626, |
|
"grad_norm": 0.05470386798150016, |
|
"learning_rate": 0.00016306954436450842, |
|
"loss": 0.0623, |
|
"mean_token_accuracy": 0.987056291103363, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.4820143884892087, |
|
"grad_norm": 0.059337571166361285, |
|
"learning_rate": 0.00016546762589928057, |
|
"loss": 0.08, |
|
"mean_token_accuracy": 0.9831870436668396, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.5179856115107913, |
|
"grad_norm": 0.05942919896434834, |
|
"learning_rate": 0.00016786570743405278, |
|
"loss": 0.0853, |
|
"mean_token_accuracy": 0.981755542755127, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.553956834532374, |
|
"grad_norm": 0.04624108736295381, |
|
"learning_rate": 0.00017026378896882496, |
|
"loss": 0.066, |
|
"mean_token_accuracy": 0.9858887672424317, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.5899280575539567, |
|
"grad_norm": 0.06579321358044239, |
|
"learning_rate": 0.00017266187050359714, |
|
"loss": 0.0884, |
|
"mean_token_accuracy": 0.9812662482261658, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.6258992805755397, |
|
"grad_norm": 0.06258890069214806, |
|
"learning_rate": 0.00017505995203836931, |
|
"loss": 0.0713, |
|
"mean_token_accuracy": 0.984937310218811, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.661870503597122, |
|
"grad_norm": 0.06270259498254936, |
|
"learning_rate": 0.0001774580335731415, |
|
"loss": 0.073, |
|
"mean_token_accuracy": 0.9842502534389496, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.697841726618705, |
|
"grad_norm": 0.05589997924614264, |
|
"learning_rate": 0.00017985611510791367, |
|
"loss": 0.0768, |
|
"mean_token_accuracy": 0.983589482307434, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.7338129496402876, |
|
"grad_norm": 0.04009483221136256, |
|
"learning_rate": 0.00018225419664268585, |
|
"loss": 0.0751, |
|
"mean_token_accuracy": 0.984445083141327, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.7697841726618706, |
|
"grad_norm": 0.05881218057232397, |
|
"learning_rate": 0.00018465227817745806, |
|
"loss": 0.0707, |
|
"mean_token_accuracy": 0.9846773445606232, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.805755395683453, |
|
"grad_norm": 0.07312271736187839, |
|
"learning_rate": 0.0001870503597122302, |
|
"loss": 0.0903, |
|
"mean_token_accuracy": 0.980737829208374, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.841726618705036, |
|
"grad_norm": 0.04533772120467666, |
|
"learning_rate": 0.00018944844124700242, |
|
"loss": 0.0548, |
|
"mean_token_accuracy": 0.9884092271327972, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.8776978417266186, |
|
"grad_norm": 0.05840450449653284, |
|
"learning_rate": 0.0001918465227817746, |
|
"loss": 0.0676, |
|
"mean_token_accuracy": 0.9858544588088989, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.9136690647482015, |
|
"grad_norm": 0.06171453893995398, |
|
"learning_rate": 0.00019424460431654677, |
|
"loss": 0.0817, |
|
"mean_token_accuracy": 0.9826960742473603, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.949640287769784, |
|
"grad_norm": 0.0631522796745376, |
|
"learning_rate": 0.00019664268585131895, |
|
"loss": 0.0752, |
|
"mean_token_accuracy": 0.9839196085929871, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.985611510791367, |
|
"grad_norm": 0.05036488138002462, |
|
"learning_rate": 0.00019904076738609113, |
|
"loss": 0.0823, |
|
"mean_token_accuracy": 0.9825737118721009, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.08580321818590164, |
|
"eval_mean_token_accuracy": 0.9844951361417771, |
|
"eval_runtime": 20.7493, |
|
"eval_samples_per_second": 5.88, |
|
"eval_steps_per_second": 0.771, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 3.0215827338129495, |
|
"grad_norm": 0.0457372684064395, |
|
"learning_rate": 0.0001999996846775429, |
|
"loss": 0.0646, |
|
"mean_token_accuracy": 0.9852441847324371, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.0575539568345325, |
|
"grad_norm": 0.04793056670224028, |
|
"learning_rate": 0.000199997757714173, |
|
"loss": 0.0729, |
|
"mean_token_accuracy": 0.9836010575294495, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.093525179856115, |
|
"grad_norm": 0.06721942436030308, |
|
"learning_rate": 0.00019999407900029147, |
|
"loss": 0.0738, |
|
"mean_token_accuracy": 0.9839203715324402, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.129496402877698, |
|
"grad_norm": 0.056660744728913394, |
|
"learning_rate": 0.00019998864860034169, |
|
"loss": 0.0757, |
|
"mean_token_accuracy": 0.9841017842292785, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.1654676258992804, |
|
"grad_norm": 0.05761414694560119, |
|
"learning_rate": 0.00019998146660945277, |
|
"loss": 0.082, |
|
"mean_token_accuracy": 0.982598501443863, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.2014388489208634, |
|
"grad_norm": 0.046839229541453344, |
|
"learning_rate": 0.0001999725331534382, |
|
"loss": 0.0681, |
|
"mean_token_accuracy": 0.9851432383060456, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 3.237410071942446, |
|
"grad_norm": 0.05445851360485557, |
|
"learning_rate": 0.00019996184838879326, |
|
"loss": 0.0641, |
|
"mean_token_accuracy": 0.9865113973617554, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.273381294964029, |
|
"grad_norm": 0.048523472160407664, |
|
"learning_rate": 0.0001999494125026926, |
|
"loss": 0.0672, |
|
"mean_token_accuracy": 0.9852766156196594, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.3093525179856114, |
|
"grad_norm": 0.051936987103197454, |
|
"learning_rate": 0.00019993522571298678, |
|
"loss": 0.0654, |
|
"mean_token_accuracy": 0.985963374376297, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.3453237410071943, |
|
"grad_norm": 0.04457189008558806, |
|
"learning_rate": 0.00019991928826819857, |
|
"loss": 0.0742, |
|
"mean_token_accuracy": 0.9842129707336426, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.381294964028777, |
|
"grad_norm": 0.056266351400963775, |
|
"learning_rate": 0.0001999016004475185, |
|
"loss": 0.0755, |
|
"mean_token_accuracy": 0.983711302280426, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.41726618705036, |
|
"grad_norm": 0.5220247379709618, |
|
"learning_rate": 0.00019988216256079997, |
|
"loss": 0.0722, |
|
"mean_token_accuracy": 0.9841032028198242, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.4532374100719423, |
|
"grad_norm": 0.0729813271238147, |
|
"learning_rate": 0.0001998609749485539, |
|
"loss": 0.0916, |
|
"mean_token_accuracy": 0.9794904887676239, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.4892086330935252, |
|
"grad_norm": 0.06612977773669373, |
|
"learning_rate": 0.0001998380379819428, |
|
"loss": 0.0636, |
|
"mean_token_accuracy": 0.9862911105155945, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.5251798561151078, |
|
"grad_norm": 0.06217153246894537, |
|
"learning_rate": 0.00019981335206277397, |
|
"loss": 0.0741, |
|
"mean_token_accuracy": 0.9842127680778503, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.5611510791366907, |
|
"grad_norm": 0.07400702775391514, |
|
"learning_rate": 0.00019978691762349295, |
|
"loss": 0.0687, |
|
"mean_token_accuracy": 0.9851798236370086, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.597122302158273, |
|
"grad_norm": 0.08585874467498368, |
|
"learning_rate": 0.00019975873512717546, |
|
"loss": 0.0609, |
|
"mean_token_accuracy": 0.986882072687149, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.633093525179856, |
|
"grad_norm": 0.051816554926674696, |
|
"learning_rate": 0.00019972880506751968, |
|
"loss": 0.0701, |
|
"mean_token_accuracy": 0.9853014886379242, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 3.6690647482014387, |
|
"grad_norm": 0.05057892453950836, |
|
"learning_rate": 0.00019969712796883725, |
|
"loss": 0.0741, |
|
"mean_token_accuracy": 0.9835891008377076, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.7050359712230216, |
|
"grad_norm": 0.07153654683802517, |
|
"learning_rate": 0.0001996637043860444, |
|
"loss": 0.0688, |
|
"mean_token_accuracy": 0.9850581645965576, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 3.741007194244604, |
|
"grad_norm": 0.04708930317430444, |
|
"learning_rate": 0.00019962853490465202, |
|
"loss": 0.0661, |
|
"mean_token_accuracy": 0.985362309217453, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.776978417266187, |
|
"grad_norm": 0.055807985616846, |
|
"learning_rate": 0.00019959162014075553, |
|
"loss": 0.0821, |
|
"mean_token_accuracy": 0.9829040467739105, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.81294964028777, |
|
"grad_norm": 0.04505227199614646, |
|
"learning_rate": 0.00019955296074102393, |
|
"loss": 0.0741, |
|
"mean_token_accuracy": 0.9845075249671936, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.8489208633093526, |
|
"grad_norm": 0.05335430120004925, |
|
"learning_rate": 0.00019951255738268872, |
|
"loss": 0.0737, |
|
"mean_token_accuracy": 0.9842015564441681, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 3.884892086330935, |
|
"grad_norm": 0.05015874969380626, |
|
"learning_rate": 0.00019947041077353177, |
|
"loss": 0.0511, |
|
"mean_token_accuracy": 0.9884456872940064, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.920863309352518, |
|
"grad_norm": 0.039523803165780566, |
|
"learning_rate": 0.00019942652165187306, |
|
"loss": 0.0526, |
|
"mean_token_accuracy": 0.9887028813362122, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 3.956834532374101, |
|
"grad_norm": 0.033565888789523046, |
|
"learning_rate": 0.00019938089078655775, |
|
"loss": 0.0634, |
|
"mean_token_accuracy": 0.9865010201930999, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.9928057553956835, |
|
"grad_norm": 0.0406257264738635, |
|
"learning_rate": 0.0001993335189769427, |
|
"loss": 0.0794, |
|
"mean_token_accuracy": 0.982637244462967, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.08812480419874191, |
|
"eval_mean_token_accuracy": 0.9846961365805732, |
|
"eval_runtime": 20.6402, |
|
"eval_samples_per_second": 5.911, |
|
"eval_steps_per_second": 0.775, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 4.028776978417266, |
|
"grad_norm": 0.0543120656292955, |
|
"learning_rate": 0.0001992844070528824, |
|
"loss": 0.0608, |
|
"mean_token_accuracy": 0.9861808717250824, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.0647482014388485, |
|
"grad_norm": 0.06445221295308218, |
|
"learning_rate": 0.00019923355587471458, |
|
"loss": 0.0763, |
|
"mean_token_accuracy": 0.983160275220871, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 4.100719424460432, |
|
"grad_norm": 0.05078293574914197, |
|
"learning_rate": 0.00019918096633324492, |
|
"loss": 0.069, |
|
"mean_token_accuracy": 0.9846292018890381, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.136690647482014, |
|
"grad_norm": 0.048929071374438124, |
|
"learning_rate": 0.00019912663934973168, |
|
"loss": 0.0667, |
|
"mean_token_accuracy": 0.9851913154125214, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 4.172661870503597, |
|
"grad_norm": 0.05408191334830909, |
|
"learning_rate": 0.0001990705758758694, |
|
"loss": 0.0693, |
|
"mean_token_accuracy": 0.9847879648208618, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.2086330935251794, |
|
"grad_norm": 0.05934948421112335, |
|
"learning_rate": 0.0001990127768937723, |
|
"loss": 0.0714, |
|
"mean_token_accuracy": 0.9839065909385681, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 4.244604316546763, |
|
"grad_norm": 0.06248100052161056, |
|
"learning_rate": 0.00019895324341595707, |
|
"loss": 0.0649, |
|
"mean_token_accuracy": 0.9853267908096314, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.280575539568345, |
|
"grad_norm": 0.058374434880137584, |
|
"learning_rate": 0.00019889197648532503, |
|
"loss": 0.071, |
|
"mean_token_accuracy": 0.9845187664031982, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 4.316546762589928, |
|
"grad_norm": 0.07289571230193848, |
|
"learning_rate": 0.00019882897717514407, |
|
"loss": 0.0625, |
|
"mean_token_accuracy": 0.9861088514328002, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.35251798561151, |
|
"grad_norm": 0.05591731428953037, |
|
"learning_rate": 0.00019876424658902967, |
|
"loss": 0.0701, |
|
"mean_token_accuracy": 0.9845547020435333, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 4.388489208633094, |
|
"grad_norm": 0.05638213741724957, |
|
"learning_rate": 0.00019869778586092564, |
|
"loss": 0.0707, |
|
"mean_token_accuracy": 0.9847763419151306, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.424460431654676, |
|
"grad_norm": 0.057841809730352224, |
|
"learning_rate": 0.00019862959615508417, |
|
"loss": 0.0608, |
|
"mean_token_accuracy": 0.9867449104785919, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 4.460431654676259, |
|
"grad_norm": 0.053932576578369425, |
|
"learning_rate": 0.00019855967866604562, |
|
"loss": 0.0587, |
|
"mean_token_accuracy": 0.9870499551296235, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.496402877697841, |
|
"grad_norm": 0.05211700106675136, |
|
"learning_rate": 0.0001984880346186174, |
|
"loss": 0.0534, |
|
"mean_token_accuracy": 0.9879081964492797, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 4.532374100719425, |
|
"grad_norm": 0.05540373657902223, |
|
"learning_rate": 0.00019841466526785266, |
|
"loss": 0.0663, |
|
"mean_token_accuracy": 0.9853027820587158, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.568345323741007, |
|
"grad_norm": 0.048602335259883014, |
|
"learning_rate": 0.00019833957189902815, |
|
"loss": 0.0603, |
|
"mean_token_accuracy": 0.9864147365093231, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 4.60431654676259, |
|
"grad_norm": 0.05673454468520649, |
|
"learning_rate": 0.00019826275582762186, |
|
"loss": 0.0615, |
|
"mean_token_accuracy": 0.9861698567867279, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.640287769784173, |
|
"grad_norm": 0.05852615284556405, |
|
"learning_rate": 0.0001981842183992899, |
|
"loss": 0.0624, |
|
"mean_token_accuracy": 0.986009931564331, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 4.676258992805756, |
|
"grad_norm": 0.08431448411850327, |
|
"learning_rate": 0.00019810396098984292, |
|
"loss": 0.0572, |
|
"mean_token_accuracy": 0.9874668717384338, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.712230215827338, |
|
"grad_norm": 0.06730656620028044, |
|
"learning_rate": 0.00019802198500522197, |
|
"loss": 0.0616, |
|
"mean_token_accuracy": 0.9861456751823425, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 4.748201438848921, |
|
"grad_norm": 0.044974290832838465, |
|
"learning_rate": 0.00019793829188147406, |
|
"loss": 0.0574, |
|
"mean_token_accuracy": 0.987455677986145, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.784172661870503, |
|
"grad_norm": 0.06716196494496443, |
|
"learning_rate": 0.00019785288308472672, |
|
"loss": 0.0814, |
|
"mean_token_accuracy": 0.9825004875659943, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 4.820143884892087, |
|
"grad_norm": 0.054996115096736096, |
|
"learning_rate": 0.00019776576011116263, |
|
"loss": 0.0737, |
|
"mean_token_accuracy": 0.9838329493999481, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.856115107913669, |
|
"grad_norm": 0.033705316368331954, |
|
"learning_rate": 0.00019767692448699302, |
|
"loss": 0.0502, |
|
"mean_token_accuracy": 0.9890934944152832, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.892086330935252, |
|
"grad_norm": 0.05047378970674569, |
|
"learning_rate": 0.00019758637776843137, |
|
"loss": 0.0691, |
|
"mean_token_accuracy": 0.9849341213703156, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.928057553956835, |
|
"grad_norm": 0.04984841000823012, |
|
"learning_rate": 0.00019749412154166583, |
|
"loss": 0.0589, |
|
"mean_token_accuracy": 0.9870136559009552, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 4.9640287769784175, |
|
"grad_norm": 0.03930276013196912, |
|
"learning_rate": 0.00019740015742283155, |
|
"loss": 0.0554, |
|
"mean_token_accuracy": 0.9878572285175323, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.045628151478910806, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 0.0501, |
|
"mean_token_accuracy": 0.9887760579586029, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.09487643092870712, |
|
"eval_mean_token_accuracy": 0.9840419329702854, |
|
"eval_runtime": 20.6735, |
|
"eval_samples_per_second": 5.901, |
|
"eval_steps_per_second": 0.774, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 5.0359712230215825, |
|
"grad_norm": 0.05493054119678511, |
|
"learning_rate": 0.00019720711212306205, |
|
"loss": 0.0597, |
|
"mean_token_accuracy": 0.9867689490318299, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 5.071942446043165, |
|
"grad_norm": 0.04837069496624849, |
|
"learning_rate": 0.00019710803432387465, |
|
"loss": 0.0561, |
|
"mean_token_accuracy": 0.9872341334819794, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 5.107913669064748, |
|
"grad_norm": 0.05589419149281416, |
|
"learning_rate": 0.000197007255396055, |
|
"loss": 0.0582, |
|
"mean_token_accuracy": 0.9867084145545959, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 5.143884892086331, |
|
"grad_norm": 0.059477184547365045, |
|
"learning_rate": 0.00019690477710503809, |
|
"loss": 0.0581, |
|
"mean_token_accuracy": 0.9864130139350891, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 5.179856115107913, |
|
"grad_norm": 0.051282761432200584, |
|
"learning_rate": 0.00019680060124602808, |
|
"loss": 0.044, |
|
"mean_token_accuracy": 0.9898509323596955, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 5.215827338129497, |
|
"grad_norm": 0.08016188967120222, |
|
"learning_rate": 0.00019669472964396712, |
|
"loss": 0.053, |
|
"mean_token_accuracy": 0.9872821033000946, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 5.251798561151079, |
|
"grad_norm": 0.05229073710194996, |
|
"learning_rate": 0.0001965871641535031, |
|
"loss": 0.0528, |
|
"mean_token_accuracy": 0.9878568768501281, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 5.287769784172662, |
|
"grad_norm": 0.07418543392117145, |
|
"learning_rate": 0.0001964779066589573, |
|
"loss": 0.0532, |
|
"mean_token_accuracy": 0.9879068970680237, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 5.323741007194244, |
|
"grad_norm": 0.05647478312480804, |
|
"learning_rate": 0.00019636695907429132, |
|
"loss": 0.06, |
|
"mean_token_accuracy": 0.9861337542533875, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 5.359712230215827, |
|
"grad_norm": 0.08571837256821345, |
|
"learning_rate": 0.00019625432334307368, |
|
"loss": 0.0652, |
|
"mean_token_accuracy": 0.9846034228801728, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 5.39568345323741, |
|
"grad_norm": 0.0792782233228753, |
|
"learning_rate": 0.00019614000143844558, |
|
"loss": 0.0641, |
|
"mean_token_accuracy": 0.9854226410388947, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 5.431654676258993, |
|
"grad_norm": 0.058478799045197496, |
|
"learning_rate": 0.0001960239953630865, |
|
"loss": 0.0571, |
|
"mean_token_accuracy": 0.9870614647865296, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 5.467625899280575, |
|
"grad_norm": 0.056558458972068175, |
|
"learning_rate": 0.00019590630714917898, |
|
"loss": 0.0595, |
|
"mean_token_accuracy": 0.986426830291748, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 5.503597122302159, |
|
"grad_norm": 0.0692782763770465, |
|
"learning_rate": 0.0001957869388583732, |
|
"loss": 0.049, |
|
"mean_token_accuracy": 0.9884204208850861, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 5.539568345323741, |
|
"grad_norm": 0.049674110177074314, |
|
"learning_rate": 0.00019566589258175068, |
|
"loss": 0.0534, |
|
"mean_token_accuracy": 0.9881749093532562, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 5.575539568345324, |
|
"grad_norm": 0.04655468775322885, |
|
"learning_rate": 0.00019554317043978773, |
|
"loss": 0.0467, |
|
"mean_token_accuracy": 0.9892040431499481, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 5.611510791366906, |
|
"grad_norm": 0.06639514118497526, |
|
"learning_rate": 0.00019541877458231825, |
|
"loss": 0.0571, |
|
"mean_token_accuracy": 0.9866962909698487, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 5.647482014388489, |
|
"grad_norm": 0.07907920132092487, |
|
"learning_rate": 0.00019529270718849625, |
|
"loss": 0.0635, |
|
"mean_token_accuracy": 0.9850185811519623, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 5.683453237410072, |
|
"grad_norm": 0.06387100290060817, |
|
"learning_rate": 0.00019516497046675744, |
|
"loss": 0.0569, |
|
"mean_token_accuracy": 0.9872703731060029, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 5.719424460431655, |
|
"grad_norm": 0.07096878405082174, |
|
"learning_rate": 0.00019503556665478067, |
|
"loss": 0.0609, |
|
"mean_token_accuracy": 0.9861226320266724, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 5.755395683453237, |
|
"grad_norm": 0.07451473740931176, |
|
"learning_rate": 0.00019490449801944868, |
|
"loss": 0.0533, |
|
"mean_token_accuracy": 0.9878711819648742, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 5.7913669064748206, |
|
"grad_norm": 0.06410885313727609, |
|
"learning_rate": 0.0001947717668568085, |
|
"loss": 0.0488, |
|
"mean_token_accuracy": 0.9891544997692108, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 5.827338129496403, |
|
"grad_norm": 0.053854419589313515, |
|
"learning_rate": 0.00019463737549203105, |
|
"loss": 0.0488, |
|
"mean_token_accuracy": 0.9887990176677703, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 5.863309352517986, |
|
"grad_norm": 0.04561191580156929, |
|
"learning_rate": 0.00019450132627937055, |
|
"loss": 0.0644, |
|
"mean_token_accuracy": 0.9854602158069611, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 5.899280575539568, |
|
"grad_norm": 0.04767754908778601, |
|
"learning_rate": 0.0001943636216021232, |
|
"loss": 0.0549, |
|
"mean_token_accuracy": 0.9869880855083466, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 5.935251798561151, |
|
"grad_norm": 0.0669886262398955, |
|
"learning_rate": 0.00019422426387258551, |
|
"loss": 0.0641, |
|
"mean_token_accuracy": 0.9850812613964081, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 5.971223021582734, |
|
"grad_norm": 0.057178276885445106, |
|
"learning_rate": 0.00019408325553201192, |
|
"loss": 0.0616, |
|
"mean_token_accuracy": 0.9861096978187561, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.09532783925533295, |
|
"eval_mean_token_accuracy": 0.9849477683504423, |
|
"eval_runtime": 20.7734, |
|
"eval_samples_per_second": 5.873, |
|
"eval_steps_per_second": 0.77, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 6.0071942446043165, |
|
"grad_norm": 0.06761995605993293, |
|
"learning_rate": 0.0001939405990505722, |
|
"loss": 0.0573, |
|
"mean_token_accuracy": 0.9845331013202667, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 6.043165467625899, |
|
"grad_norm": 0.07263548817088912, |
|
"learning_rate": 0.00019379629692730798, |
|
"loss": 0.0503, |
|
"mean_token_accuracy": 0.9876116633415222, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 6.079136690647482, |
|
"grad_norm": 0.08479650809428431, |
|
"learning_rate": 0.00019365035169008915, |
|
"loss": 0.0427, |
|
"mean_token_accuracy": 0.9894964694976807, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 6.115107913669065, |
|
"grad_norm": 0.06919827278420493, |
|
"learning_rate": 0.00019350276589556948, |
|
"loss": 0.0472, |
|
"mean_token_accuracy": 0.9883952558040618, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 6.151079136690647, |
|
"grad_norm": 0.08264329920052639, |
|
"learning_rate": 0.00019335354212914187, |
|
"loss": 0.0496, |
|
"mean_token_accuracy": 0.9882358908653259, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 6.18705035971223, |
|
"grad_norm": 0.06396607395380566, |
|
"learning_rate": 0.00019320268300489297, |
|
"loss": 0.0471, |
|
"mean_token_accuracy": 0.9883708119392395, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 6.223021582733813, |
|
"grad_norm": 0.08316463171318977, |
|
"learning_rate": 0.00019305019116555754, |
|
"loss": 0.0384, |
|
"mean_token_accuracy": 0.9907682836055756, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 6.258992805755396, |
|
"grad_norm": 0.07480912129949462, |
|
"learning_rate": 0.00019289606928247208, |
|
"loss": 0.0463, |
|
"mean_token_accuracy": 0.9888597249984741, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 6.294964028776978, |
|
"grad_norm": 0.0663383121635371, |
|
"learning_rate": 0.00019274032005552798, |
|
"loss": 0.0384, |
|
"mean_token_accuracy": 0.990657901763916, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 6.330935251798561, |
|
"grad_norm": 0.07501372798585075, |
|
"learning_rate": 0.00019258294621312433, |
|
"loss": 0.0528, |
|
"mean_token_accuracy": 0.9871481537818909, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 6.366906474820144, |
|
"grad_norm": 0.07366099061163396, |
|
"learning_rate": 0.00019242395051212, |
|
"loss": 0.0499, |
|
"mean_token_accuracy": 0.9882595360279083, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 6.402877697841727, |
|
"grad_norm": 0.06804867314458733, |
|
"learning_rate": 0.00019226333573778544, |
|
"loss": 0.046, |
|
"mean_token_accuracy": 0.9889584600925445, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 6.438848920863309, |
|
"grad_norm": 0.06482541942067276, |
|
"learning_rate": 0.00019210110470375394, |
|
"loss": 0.0457, |
|
"mean_token_accuracy": 0.9892277956008911, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 6.474820143884892, |
|
"grad_norm": 0.09362083699600474, |
|
"learning_rate": 0.0001919372602519721, |
|
"loss": 0.0479, |
|
"mean_token_accuracy": 0.9887864112854003, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 6.510791366906475, |
|
"grad_norm": 0.07419422320428706, |
|
"learning_rate": 0.00019177180525265037, |
|
"loss": 0.0462, |
|
"mean_token_accuracy": 0.988640570640564, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 6.546762589928058, |
|
"grad_norm": 0.0680933466552101, |
|
"learning_rate": 0.0001916047426042125, |
|
"loss": 0.0412, |
|
"mean_token_accuracy": 0.9902673780918121, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 6.58273381294964, |
|
"grad_norm": 0.0753203749472904, |
|
"learning_rate": 0.00019143607523324497, |
|
"loss": 0.0409, |
|
"mean_token_accuracy": 0.9900835871696472, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 6.618705035971223, |
|
"grad_norm": 0.09155392976849171, |
|
"learning_rate": 0.00019126580609444549, |
|
"loss": 0.0563, |
|
"mean_token_accuracy": 0.986204868555069, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 6.654676258992806, |
|
"grad_norm": 0.08500902229953358, |
|
"learning_rate": 0.00019109393817057148, |
|
"loss": 0.0464, |
|
"mean_token_accuracy": 0.9887993991374969, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 6.690647482014389, |
|
"grad_norm": 0.06130970774026331, |
|
"learning_rate": 0.00019092047447238773, |
|
"loss": 0.0463, |
|
"mean_token_accuracy": 0.9888347625732422, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 6.726618705035971, |
|
"grad_norm": 0.08321729417279401, |
|
"learning_rate": 0.0001907454180386135, |
|
"loss": 0.0515, |
|
"mean_token_accuracy": 0.9873551964759827, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 6.762589928057554, |
|
"grad_norm": 0.0788243708046946, |
|
"learning_rate": 0.00019056877193586962, |
|
"loss": 0.0552, |
|
"mean_token_accuracy": 0.9864752233028412, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 6.798561151079137, |
|
"grad_norm": 0.09851923268411174, |
|
"learning_rate": 0.00019039053925862443, |
|
"loss": 0.0605, |
|
"mean_token_accuracy": 0.9862433850765229, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 6.83453237410072, |
|
"grad_norm": 0.04852850455052362, |
|
"learning_rate": 0.00019021072312913986, |
|
"loss": 0.0402, |
|
"mean_token_accuracy": 0.9904878795146942, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 6.870503597122302, |
|
"grad_norm": 0.07705035380290443, |
|
"learning_rate": 0.00019002932669741639, |
|
"loss": 0.0476, |
|
"mean_token_accuracy": 0.9887258052825928, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 6.906474820143885, |
|
"grad_norm": 0.06935047132024741, |
|
"learning_rate": 0.00018984635314113826, |
|
"loss": 0.0458, |
|
"mean_token_accuracy": 0.9895333528518677, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 6.942446043165468, |
|
"grad_norm": 0.07052799437742344, |
|
"learning_rate": 0.00018966180566561757, |
|
"loss": 0.0471, |
|
"mean_token_accuracy": 0.9885306537151337, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 6.9784172661870505, |
|
"grad_norm": 0.07250750893823193, |
|
"learning_rate": 0.0001894756875037381, |
|
"loss": 0.0578, |
|
"mean_token_accuracy": 0.9862792432308197, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.09820590913295746, |
|
"eval_mean_token_accuracy": 0.9843102124604312, |
|
"eval_runtime": 20.6203, |
|
"eval_samples_per_second": 5.917, |
|
"eval_steps_per_second": 0.776, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 7.014388489208633, |
|
"grad_norm": 0.08165511724370542, |
|
"learning_rate": 0.0001892880019158988, |
|
"loss": 0.0547, |
|
"mean_token_accuracy": 0.9885966777801514, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 7.0503597122302155, |
|
"grad_norm": 0.09115471075741952, |
|
"learning_rate": 0.0001890987521899567, |
|
"loss": 0.0348, |
|
"mean_token_accuracy": 0.991256856918335, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 7.086330935251799, |
|
"grad_norm": 0.18703400358025105, |
|
"learning_rate": 0.0001889079416411692, |
|
"loss": 0.0344, |
|
"mean_token_accuracy": 0.9911470890045166, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 7.122302158273381, |
|
"grad_norm": 0.07593574468723076, |
|
"learning_rate": 0.00018871557361213595, |
|
"loss": 0.04, |
|
"mean_token_accuracy": 0.9902300417423249, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 7.158273381294964, |
|
"grad_norm": 0.08163153615480963, |
|
"learning_rate": 0.00018852165147274045, |
|
"loss": 0.0344, |
|
"mean_token_accuracy": 0.9915133118629456, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 7.194244604316546, |
|
"grad_norm": 0.08162384924322541, |
|
"learning_rate": 0.00018832617862009097, |
|
"loss": 0.0339, |
|
"mean_token_accuracy": 0.9912963092327118, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 7.23021582733813, |
|
"grad_norm": 0.06754095615055344, |
|
"learning_rate": 0.00018812915847846097, |
|
"loss": 0.0334, |
|
"mean_token_accuracy": 0.9912936687469482, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 7.266187050359712, |
|
"grad_norm": 0.07992585396768462, |
|
"learning_rate": 0.0001879305944992292, |
|
"loss": 0.0383, |
|
"mean_token_accuracy": 0.990229606628418, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 7.302158273381295, |
|
"grad_norm": 0.09213616209553331, |
|
"learning_rate": 0.00018773049016081913, |
|
"loss": 0.0457, |
|
"mean_token_accuracy": 0.9886265099048615, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 7.338129496402877, |
|
"grad_norm": 0.07024023343334314, |
|
"learning_rate": 0.0001875288489686382, |
|
"loss": 0.0367, |
|
"mean_token_accuracy": 0.9905371308326721, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 7.374100719424461, |
|
"grad_norm": 0.07286451277511494, |
|
"learning_rate": 0.0001873256744550162, |
|
"loss": 0.0347, |
|
"mean_token_accuracy": 0.9913554310798645, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 7.410071942446043, |
|
"grad_norm": 0.08298535555396302, |
|
"learning_rate": 0.00018712097017914352, |
|
"loss": 0.0388, |
|
"mean_token_accuracy": 0.9905226647853851, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 7.446043165467626, |
|
"grad_norm": 0.08830074749459958, |
|
"learning_rate": 0.00018691473972700875, |
|
"loss": 0.0445, |
|
"mean_token_accuracy": 0.9889210820198059, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 7.482014388489208, |
|
"grad_norm": 0.07217666187560311, |
|
"learning_rate": 0.00018670698671133593, |
|
"loss": 0.0452, |
|
"mean_token_accuracy": 0.9885773658752441, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 7.517985611510792, |
|
"grad_norm": 0.08661908711629725, |
|
"learning_rate": 0.00018649771477152115, |
|
"loss": 0.0339, |
|
"mean_token_accuracy": 0.9911720871925354, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 7.553956834532374, |
|
"grad_norm": 0.09371311177176188, |
|
"learning_rate": 0.0001862869275735689, |
|
"loss": 0.0367, |
|
"mean_token_accuracy": 0.9905966579914093, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 7.589928057553957, |
|
"grad_norm": 0.07707240942098416, |
|
"learning_rate": 0.00018607462881002778, |
|
"loss": 0.0343, |
|
"mean_token_accuracy": 0.9915632963180542, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 7.625899280575539, |
|
"grad_norm": 0.07730587819818967, |
|
"learning_rate": 0.0001858608221999259, |
|
"loss": 0.0383, |
|
"mean_token_accuracy": 0.9904868125915527, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 7.661870503597123, |
|
"grad_norm": 0.07304839741727129, |
|
"learning_rate": 0.00018564551148870563, |
|
"loss": 0.0439, |
|
"mean_token_accuracy": 0.9891519188880921, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 7.697841726618705, |
|
"grad_norm": 0.09016682635662701, |
|
"learning_rate": 0.00018542870044815796, |
|
"loss": 0.0425, |
|
"mean_token_accuracy": 0.98941091299057, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 7.733812949640288, |
|
"grad_norm": 0.07730069908696634, |
|
"learning_rate": 0.0001852103928763566, |
|
"loss": 0.0379, |
|
"mean_token_accuracy": 0.9907430112361908, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 7.76978417266187, |
|
"grad_norm": 0.07286962203888536, |
|
"learning_rate": 0.0001849905925975914, |
|
"loss": 0.0395, |
|
"mean_token_accuracy": 0.9902792334556579, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 7.805755395683454, |
|
"grad_norm": 0.12596219085722438, |
|
"learning_rate": 0.00018476930346230107, |
|
"loss": 0.043, |
|
"mean_token_accuracy": 0.9893492221832275, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 7.841726618705036, |
|
"grad_norm": 0.0721410843397686, |
|
"learning_rate": 0.00018454652934700615, |
|
"loss": 0.0337, |
|
"mean_token_accuracy": 0.9913184523582459, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 7.877697841726619, |
|
"grad_norm": 0.08734696713463556, |
|
"learning_rate": 0.00018432227415424084, |
|
"loss": 0.041, |
|
"mean_token_accuracy": 0.9895088315010071, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 7.913669064748201, |
|
"grad_norm": 0.08034908109385859, |
|
"learning_rate": 0.00018409654181248474, |
|
"loss": 0.0446, |
|
"mean_token_accuracy": 0.988712877035141, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 7.9496402877697845, |
|
"grad_norm": 0.0697845242925141, |
|
"learning_rate": 0.00018386933627609394, |
|
"loss": 0.0359, |
|
"mean_token_accuracy": 0.9910129487514496, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 7.985611510791367, |
|
"grad_norm": 0.07078505068848803, |
|
"learning_rate": 0.00018364066152523183, |
|
"loss": 0.0408, |
|
"mean_token_accuracy": 0.9896426558494568, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.1054563969373703, |
|
"eval_mean_token_accuracy": 0.984645739197731, |
|
"eval_runtime": 20.6985, |
|
"eval_samples_per_second": 5.894, |
|
"eval_steps_per_second": 0.773, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 8.02158273381295, |
|
"grad_norm": 0.06608211950667531, |
|
"learning_rate": 0.0001834105215657994, |
|
"loss": 0.0311, |
|
"mean_token_accuracy": 0.9939679900805155, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 8.057553956834532, |
|
"grad_norm": 0.08564587725938204, |
|
"learning_rate": 0.00018317892042936487, |
|
"loss": 0.0267, |
|
"mean_token_accuracy": 0.9928701162338257, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 8.093525179856115, |
|
"grad_norm": 0.08996704309284011, |
|
"learning_rate": 0.00018294586217309342, |
|
"loss": 0.0302, |
|
"mean_token_accuracy": 0.991721647977829, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 8.129496402877697, |
|
"grad_norm": 0.10213993059199547, |
|
"learning_rate": 0.00018271135087967574, |
|
"loss": 0.0255, |
|
"mean_token_accuracy": 0.9934465944766998, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 8.16546762589928, |
|
"grad_norm": 0.10289029084415881, |
|
"learning_rate": 0.0001824753906572567, |
|
"loss": 0.0271, |
|
"mean_token_accuracy": 0.9926867604255676, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 8.201438848920864, |
|
"grad_norm": 0.07938513450083459, |
|
"learning_rate": 0.00018223798563936344, |
|
"loss": 0.0277, |
|
"mean_token_accuracy": 0.9926994025707245, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 8.237410071942445, |
|
"grad_norm": 0.0799335759541154, |
|
"learning_rate": 0.00018199913998483282, |
|
"loss": 0.0292, |
|
"mean_token_accuracy": 0.9922228872776031, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 8.273381294964029, |
|
"grad_norm": 0.07791297569908608, |
|
"learning_rate": 0.0001817588578777386, |
|
"loss": 0.0251, |
|
"mean_token_accuracy": 0.9932994604110718, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 8.309352517985612, |
|
"grad_norm": 0.10478924127717758, |
|
"learning_rate": 0.00018151714352731822, |
|
"loss": 0.0296, |
|
"mean_token_accuracy": 0.9923690974712371, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 8.345323741007194, |
|
"grad_norm": 0.05952264303244273, |
|
"learning_rate": 0.000181274001167899, |
|
"loss": 0.0259, |
|
"mean_token_accuracy": 0.9932628035545349, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 8.381294964028777, |
|
"grad_norm": 0.11638720739620267, |
|
"learning_rate": 0.00018102943505882396, |
|
"loss": 0.0311, |
|
"mean_token_accuracy": 0.9920145153999329, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 8.417266187050359, |
|
"grad_norm": 0.07862143397116596, |
|
"learning_rate": 0.00018078344948437724, |
|
"loss": 0.0233, |
|
"mean_token_accuracy": 0.9941556990146637, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 8.453237410071942, |
|
"grad_norm": 0.08087339161763747, |
|
"learning_rate": 0.00018053604875370907, |
|
"loss": 0.0265, |
|
"mean_token_accuracy": 0.9931528508663178, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 8.489208633093526, |
|
"grad_norm": 0.061976387703659395, |
|
"learning_rate": 0.0001802872372007601, |
|
"loss": 0.0281, |
|
"mean_token_accuracy": 0.9925530850887299, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 8.525179856115107, |
|
"grad_norm": 0.08968392584335196, |
|
"learning_rate": 0.0001800370191841858, |
|
"loss": 0.032, |
|
"mean_token_accuracy": 0.9915622353553772, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 8.56115107913669, |
|
"grad_norm": 0.09146240533508403, |
|
"learning_rate": 0.0001797853990872798, |
|
"loss": 0.0329, |
|
"mean_token_accuracy": 0.9913170158863067, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 8.597122302158274, |
|
"grad_norm": 0.10059791196991036, |
|
"learning_rate": 0.0001795323813178973, |
|
"loss": 0.0256, |
|
"mean_token_accuracy": 0.9930787861347199, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 8.633093525179856, |
|
"grad_norm": 0.07933964343809208, |
|
"learning_rate": 0.00017927797030837768, |
|
"loss": 0.0284, |
|
"mean_token_accuracy": 0.9926510810852051, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 8.66906474820144, |
|
"grad_norm": 0.10008206157504908, |
|
"learning_rate": 0.00017902217051546715, |
|
"loss": 0.0296, |
|
"mean_token_accuracy": 0.9919540584087372, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 8.70503597122302, |
|
"grad_norm": 0.07195996592535572, |
|
"learning_rate": 0.00017876498642024026, |
|
"loss": 0.0263, |
|
"mean_token_accuracy": 0.993087249994278, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 8.741007194244604, |
|
"grad_norm": 0.0840990736088915, |
|
"learning_rate": 0.0001785064225280218, |
|
"loss": 0.0331, |
|
"mean_token_accuracy": 0.9914765417575836, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 8.776978417266188, |
|
"grad_norm": 0.07556361151629382, |
|
"learning_rate": 0.00017824648336830763, |
|
"loss": 0.0239, |
|
"mean_token_accuracy": 0.9935317218303681, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 8.81294964028777, |
|
"grad_norm": 0.0817902776134609, |
|
"learning_rate": 0.00017798517349468539, |
|
"loss": 0.0293, |
|
"mean_token_accuracy": 0.9924435615539551, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 8.848920863309353, |
|
"grad_norm": 0.07844793746584716, |
|
"learning_rate": 0.0001777224974847548, |
|
"loss": 0.032, |
|
"mean_token_accuracy": 0.9916129529476165, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 8.884892086330936, |
|
"grad_norm": 0.09174283379497755, |
|
"learning_rate": 0.0001774584599400474, |
|
"loss": 0.0304, |
|
"mean_token_accuracy": 0.9922227621078491, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 8.920863309352518, |
|
"grad_norm": 0.08346812519931995, |
|
"learning_rate": 0.0001771930654859459, |
|
"loss": 0.0278, |
|
"mean_token_accuracy": 0.9929319977760315, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 8.956834532374101, |
|
"grad_norm": 0.09081059448512323, |
|
"learning_rate": 0.00017692631877160326, |
|
"loss": 0.0365, |
|
"mean_token_accuracy": 0.9903396785259246, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 8.992805755395683, |
|
"grad_norm": 0.0840058011137499, |
|
"learning_rate": 0.0001766582244698612, |
|
"loss": 0.0297, |
|
"mean_token_accuracy": 0.9923931121826172, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 0.10754524171352386, |
|
"eval_mean_token_accuracy": 0.9839274419678582, |
|
"eval_runtime": 20.8073, |
|
"eval_samples_per_second": 5.863, |
|
"eval_steps_per_second": 0.769, |
|
"step": 1251 |
|
}, |
|
{ |
|
"epoch": 9.028776978417266, |
|
"grad_norm": 0.06802116638648911, |
|
"learning_rate": 0.00017638878727716838, |
|
"loss": 0.0239, |
|
"mean_token_accuracy": 0.994832769036293, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 9.06474820143885, |
|
"grad_norm": 0.08131934072937834, |
|
"learning_rate": 0.00017611801191349798, |
|
"loss": 0.0177, |
|
"mean_token_accuracy": 0.9950850903987885, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 9.100719424460431, |
|
"grad_norm": 0.09962740909778638, |
|
"learning_rate": 0.0001758459031222652, |
|
"loss": 0.0169, |
|
"mean_token_accuracy": 0.9952557981014252, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 9.136690647482014, |
|
"grad_norm": 0.08910176909961409, |
|
"learning_rate": 0.00017557246567024404, |
|
"loss": 0.0193, |
|
"mean_token_accuracy": 0.9950962662696838, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 9.172661870503598, |
|
"grad_norm": 0.08896573436836375, |
|
"learning_rate": 0.0001752977043474839, |
|
"loss": 0.0185, |
|
"mean_token_accuracy": 0.9951821863651276, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 9.20863309352518, |
|
"grad_norm": 0.07069710110622436, |
|
"learning_rate": 0.00017502162396722558, |
|
"loss": 0.0182, |
|
"mean_token_accuracy": 0.9950909554958344, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 9.244604316546763, |
|
"grad_norm": 0.10794611681753156, |
|
"learning_rate": 0.00017474422936581698, |
|
"loss": 0.0204, |
|
"mean_token_accuracy": 0.9944604396820068, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 9.280575539568344, |
|
"grad_norm": 0.0964081310067874, |
|
"learning_rate": 0.00017446552540262844, |
|
"loss": 0.0193, |
|
"mean_token_accuracy": 0.9947298228740692, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 9.316546762589928, |
|
"grad_norm": 0.06694312069681227, |
|
"learning_rate": 0.0001741855169599675, |
|
"loss": 0.0182, |
|
"mean_token_accuracy": 0.9948891222476959, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 9.352517985611511, |
|
"grad_norm": 0.09194435151559001, |
|
"learning_rate": 0.0001739042089429935, |
|
"loss": 0.0211, |
|
"mean_token_accuracy": 0.9945831596851349, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 9.388489208633093, |
|
"grad_norm": 0.08485510859325882, |
|
"learning_rate": 0.0001736216062796316, |
|
"loss": 0.0178, |
|
"mean_token_accuracy": 0.9953541696071625, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 9.424460431654676, |
|
"grad_norm": 0.07658351486107501, |
|
"learning_rate": 0.0001733377139204863, |
|
"loss": 0.0176, |
|
"mean_token_accuracy": 0.9950843632221222, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 9.46043165467626, |
|
"grad_norm": 0.0851945396842124, |
|
"learning_rate": 0.0001730525368387551, |
|
"loss": 0.0176, |
|
"mean_token_accuracy": 0.995317256450653, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 9.496402877697841, |
|
"grad_norm": 0.07680564483723305, |
|
"learning_rate": 0.0001727660800301409, |
|
"loss": 0.0195, |
|
"mean_token_accuracy": 0.9947294652462005, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 9.532374100719425, |
|
"grad_norm": 0.06733986423413497, |
|
"learning_rate": 0.00017247834851276492, |
|
"loss": 0.0225, |
|
"mean_token_accuracy": 0.9939347088336945, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 9.568345323741006, |
|
"grad_norm": 0.12457969840303192, |
|
"learning_rate": 0.00017218934732707842, |
|
"loss": 0.0212, |
|
"mean_token_accuracy": 0.9943628013134003, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 9.60431654676259, |
|
"grad_norm": 0.06957276517390819, |
|
"learning_rate": 0.00017189908153577473, |
|
"loss": 0.0206, |
|
"mean_token_accuracy": 0.9946195781230927, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 9.640287769784173, |
|
"grad_norm": 0.09308663583934602, |
|
"learning_rate": 0.00017160755622370032, |
|
"loss": 0.0184, |
|
"mean_token_accuracy": 0.9952435672283173, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 9.676258992805755, |
|
"grad_norm": 0.07546127826289363, |
|
"learning_rate": 0.00017131477649776587, |
|
"loss": 0.0198, |
|
"mean_token_accuracy": 0.9945826590061188, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 9.712230215827338, |
|
"grad_norm": 0.06447487107416815, |
|
"learning_rate": 0.00017102074748685673, |
|
"loss": 0.0191, |
|
"mean_token_accuracy": 0.9948029279708862, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 9.748201438848922, |
|
"grad_norm": 0.10429555757378318, |
|
"learning_rate": 0.00017072547434174304, |
|
"loss": 0.0224, |
|
"mean_token_accuracy": 0.9938852250576019, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 9.784172661870503, |
|
"grad_norm": 0.10174525963107275, |
|
"learning_rate": 0.0001704289622349897, |
|
"loss": 0.0209, |
|
"mean_token_accuracy": 0.9941792845726013, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 9.820143884892087, |
|
"grad_norm": 0.06515111457479097, |
|
"learning_rate": 0.0001701312163608655, |
|
"loss": 0.0197, |
|
"mean_token_accuracy": 0.9947053015232086, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 9.85611510791367, |
|
"grad_norm": 0.0853360162922663, |
|
"learning_rate": 0.0001698322419352522, |
|
"loss": 0.026, |
|
"mean_token_accuracy": 0.9930291116237641, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 9.892086330935252, |
|
"grad_norm": 0.08349002733460555, |
|
"learning_rate": 0.0001695320441955534, |
|
"loss": 0.0223, |
|
"mean_token_accuracy": 0.9938614785671234, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 9.928057553956835, |
|
"grad_norm": 0.11836172608748735, |
|
"learning_rate": 0.00016923062840060234, |
|
"loss": 0.021, |
|
"mean_token_accuracy": 0.9945221424102784, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 9.964028776978417, |
|
"grad_norm": 0.0979581744574528, |
|
"learning_rate": 0.0001689279998305702, |
|
"loss": 0.0263, |
|
"mean_token_accuracy": 0.9928580164909363, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.07684774295930966, |
|
"learning_rate": 0.0001686241637868734, |
|
"loss": 0.0207, |
|
"mean_token_accuracy": 0.9940943062305451, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 0.11269818246364594, |
|
"eval_mean_token_accuracy": 0.9830840341746807, |
|
"eval_runtime": 20.6237, |
|
"eval_samples_per_second": 5.916, |
|
"eval_steps_per_second": 0.776, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 10.035971223021583, |
|
"grad_norm": 0.08599421791676856, |
|
"learning_rate": 0.00016831912559208063, |
|
"loss": 0.0121, |
|
"mean_token_accuracy": 0.9970287322998047, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 10.071942446043165, |
|
"grad_norm": 0.0904666825072302, |
|
"learning_rate": 0.00016801289058982, |
|
"loss": 0.013, |
|
"mean_token_accuracy": 0.99660022854805, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 10.107913669064748, |
|
"grad_norm": 0.107927530031339, |
|
"learning_rate": 0.00016770546414468488, |
|
"loss": 0.015, |
|
"mean_token_accuracy": 0.9960623264312745, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 10.14388489208633, |
|
"grad_norm": 0.06925828681503625, |
|
"learning_rate": 0.00016739685164214046, |
|
"loss": 0.0122, |
|
"mean_token_accuracy": 0.996869707107544, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 10.179856115107913, |
|
"grad_norm": 0.08749007468566572, |
|
"learning_rate": 0.00016708705848842898, |
|
"loss": 0.014, |
|
"mean_token_accuracy": 0.99650257229805, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 10.215827338129497, |
|
"grad_norm": 0.11345976381463416, |
|
"learning_rate": 0.00016677609011047533, |
|
"loss": 0.0131, |
|
"mean_token_accuracy": 0.9966128468513489, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 10.251798561151078, |
|
"grad_norm": 0.0850375168432864, |
|
"learning_rate": 0.00016646395195579178, |
|
"loss": 0.0148, |
|
"mean_token_accuracy": 0.9960009098052979, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 10.287769784172662, |
|
"grad_norm": 0.07294058737884025, |
|
"learning_rate": 0.00016615064949238267, |
|
"loss": 0.0132, |
|
"mean_token_accuracy": 0.9964902937412262, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 10.323741007194245, |
|
"grad_norm": 0.07943531885485305, |
|
"learning_rate": 0.00016583618820864858, |
|
"loss": 0.0135, |
|
"mean_token_accuracy": 0.9963561594486237, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 10.359712230215827, |
|
"grad_norm": 0.09060949078579321, |
|
"learning_rate": 0.0001655205736132902, |
|
"loss": 0.012, |
|
"mean_token_accuracy": 0.9970167279243469, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 10.39568345323741, |
|
"grad_norm": 0.08743228431554707, |
|
"learning_rate": 0.0001652038112352117, |
|
"loss": 0.0158, |
|
"mean_token_accuracy": 0.9957569420337677, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 10.431654676258994, |
|
"grad_norm": 0.08434742513312765, |
|
"learning_rate": 0.0001648859066234242, |
|
"loss": 0.0127, |
|
"mean_token_accuracy": 0.9967720329761505, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 10.467625899280575, |
|
"grad_norm": 0.08534299510053663, |
|
"learning_rate": 0.00016456686534694817, |
|
"loss": 0.0124, |
|
"mean_token_accuracy": 0.996967202425003, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 10.503597122302159, |
|
"grad_norm": 0.07636341426608007, |
|
"learning_rate": 0.00016424669299471614, |
|
"loss": 0.0134, |
|
"mean_token_accuracy": 0.9965148985385894, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 10.53956834532374, |
|
"grad_norm": 0.08631265171713358, |
|
"learning_rate": 0.0001639253951754747, |
|
"loss": 0.0125, |
|
"mean_token_accuracy": 0.996735155582428, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 10.575539568345324, |
|
"grad_norm": 0.08200188375124749, |
|
"learning_rate": 0.0001636029775176862, |
|
"loss": 0.0113, |
|
"mean_token_accuracy": 0.99694344997406, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 10.611510791366907, |
|
"grad_norm": 0.07881864745220842, |
|
"learning_rate": 0.00016327944566943035, |
|
"loss": 0.0119, |
|
"mean_token_accuracy": 0.9968697667121887, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 10.647482014388489, |
|
"grad_norm": 0.08190061340848842, |
|
"learning_rate": 0.00016295480529830494, |
|
"loss": 0.0156, |
|
"mean_token_accuracy": 0.9960256695747376, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 10.683453237410072, |
|
"grad_norm": 0.10685678023770241, |
|
"learning_rate": 0.00016262906209132692, |
|
"loss": 0.0144, |
|
"mean_token_accuracy": 0.9962826788425445, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 10.719424460431654, |
|
"grad_norm": 0.07320426903216805, |
|
"learning_rate": 0.0001623022217548325, |
|
"loss": 0.0148, |
|
"mean_token_accuracy": 0.9962579011917114, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 10.755395683453237, |
|
"grad_norm": 0.0935421010172905, |
|
"learning_rate": 0.00016197429001437735, |
|
"loss": 0.0165, |
|
"mean_token_accuracy": 0.995512044429779, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 10.79136690647482, |
|
"grad_norm": 0.05923694064442418, |
|
"learning_rate": 0.0001616452726146362, |
|
"loss": 0.0162, |
|
"mean_token_accuracy": 0.9955240964889527, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 10.827338129496402, |
|
"grad_norm": 0.0902823909222933, |
|
"learning_rate": 0.0001613151753193023, |
|
"loss": 0.0122, |
|
"mean_token_accuracy": 0.9967601776123047, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 10.863309352517986, |
|
"grad_norm": 0.06620900034563318, |
|
"learning_rate": 0.00016098400391098636, |
|
"loss": 0.0146, |
|
"mean_token_accuracy": 0.9960503220558167, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 10.899280575539569, |
|
"grad_norm": 0.08083785941879719, |
|
"learning_rate": 0.0001606517641911153, |
|
"loss": 0.0125, |
|
"mean_token_accuracy": 0.9967718720436096, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 10.93525179856115, |
|
"grad_norm": 0.1222321673203135, |
|
"learning_rate": 0.00016031846197983062, |
|
"loss": 0.0139, |
|
"mean_token_accuracy": 0.9963804185390472, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 10.971223021582734, |
|
"grad_norm": 0.09011217682765804, |
|
"learning_rate": 0.00015998410311588644, |
|
"loss": 0.0151, |
|
"mean_token_accuracy": 0.9960378229618072, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 0.12537601590156555, |
|
"eval_mean_token_accuracy": 0.9872708097100258, |
|
"eval_runtime": 20.7748, |
|
"eval_samples_per_second": 5.873, |
|
"eval_steps_per_second": 0.77, |
|
"step": 1529 |
|
}, |
|
{ |
|
"epoch": 11.007194244604317, |
|
"grad_norm": 0.05145660527450271, |
|
"learning_rate": 0.00015964869345654718, |
|
"loss": 0.0118, |
|
"mean_token_accuracy": 0.9978603720664978, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 11.043165467625899, |
|
"grad_norm": 0.07821203281348997, |
|
"learning_rate": 0.0001593122388774851, |
|
"loss": 0.0085, |
|
"mean_token_accuracy": 0.9977623283863067, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 11.079136690647482, |
|
"grad_norm": 0.07234857181108979, |
|
"learning_rate": 0.00015897474527267703, |
|
"loss": 0.009, |
|
"mean_token_accuracy": 0.9976400792598724, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 11.115107913669064, |
|
"grad_norm": 0.04075447553316834, |
|
"learning_rate": 0.00015863621855430159, |
|
"loss": 0.0092, |
|
"mean_token_accuracy": 0.9976687788963318, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 11.151079136690647, |
|
"grad_norm": 0.05794021578435905, |
|
"learning_rate": 0.00015829666465263525, |
|
"loss": 0.0088, |
|
"mean_token_accuracy": 0.9977623224258423, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 11.18705035971223, |
|
"grad_norm": 0.07683795817076886, |
|
"learning_rate": 0.00015795608951594859, |
|
"loss": 0.0095, |
|
"mean_token_accuracy": 0.997480845451355, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 11.223021582733812, |
|
"grad_norm": 0.07115098159372155, |
|
"learning_rate": 0.00015761449911040208, |
|
"loss": 0.0101, |
|
"mean_token_accuracy": 0.9975174725055694, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 11.258992805755396, |
|
"grad_norm": 0.03884336408006673, |
|
"learning_rate": 0.00015727189941994158, |
|
"loss": 0.0093, |
|
"mean_token_accuracy": 0.9976275801658631, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 11.29496402877698, |
|
"grad_norm": 0.06656440131240968, |
|
"learning_rate": 0.00015692829644619352, |
|
"loss": 0.0082, |
|
"mean_token_accuracy": 0.9979580223560334, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 11.33093525179856, |
|
"grad_norm": 0.06686553477037634, |
|
"learning_rate": 0.0001565836962083597, |
|
"loss": 0.0084, |
|
"mean_token_accuracy": 0.9977380752563476, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 11.366906474820144, |
|
"grad_norm": 0.051925628479388856, |
|
"learning_rate": 0.00015623810474311187, |
|
"loss": 0.0099, |
|
"mean_token_accuracy": 0.9973831713199616, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 11.402877697841726, |
|
"grad_norm": 0.07626073368161976, |
|
"learning_rate": 0.0001558915281044861, |
|
"loss": 0.0097, |
|
"mean_token_accuracy": 0.9975177109241485, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 11.43884892086331, |
|
"grad_norm": 0.09353665419288143, |
|
"learning_rate": 0.0001555439723637765, |
|
"loss": 0.0098, |
|
"mean_token_accuracy": 0.9974563598632813, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 11.474820143884893, |
|
"grad_norm": 0.06026792088715974, |
|
"learning_rate": 0.00015519544360942917, |
|
"loss": 0.0099, |
|
"mean_token_accuracy": 0.9973953664302826, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 11.510791366906474, |
|
"grad_norm": 0.0680669683566074, |
|
"learning_rate": 0.0001548459479469351, |
|
"loss": 0.011, |
|
"mean_token_accuracy": 0.9970895767211914, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 11.546762589928058, |
|
"grad_norm": 0.07661464909353981, |
|
"learning_rate": 0.00015449549149872376, |
|
"loss": 0.0094, |
|
"mean_token_accuracy": 0.9975910663604737, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 11.582733812949641, |
|
"grad_norm": 0.06540550364929187, |
|
"learning_rate": 0.00015414408040405537, |
|
"loss": 0.0089, |
|
"mean_token_accuracy": 0.9978111922740937, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 11.618705035971223, |
|
"grad_norm": 0.05130373899495586, |
|
"learning_rate": 0.0001537917208189136, |
|
"loss": 0.0091, |
|
"mean_token_accuracy": 0.9975790679454803, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 11.654676258992806, |
|
"grad_norm": 0.06949815748126974, |
|
"learning_rate": 0.00015343841891589776, |
|
"loss": 0.0108, |
|
"mean_token_accuracy": 0.9970408082008362, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 11.690647482014388, |
|
"grad_norm": 0.07039422200836666, |
|
"learning_rate": 0.00015308418088411444, |
|
"loss": 0.0103, |
|
"mean_token_accuracy": 0.997383177280426, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 11.726618705035971, |
|
"grad_norm": 0.09950022146159282, |
|
"learning_rate": 0.00015272901292906935, |
|
"loss": 0.01, |
|
"mean_token_accuracy": 0.9974565923213958, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 11.762589928057555, |
|
"grad_norm": 0.07438295375677863, |
|
"learning_rate": 0.00015237292127255852, |
|
"loss": 0.0094, |
|
"mean_token_accuracy": 0.9976524710655212, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 11.798561151079136, |
|
"grad_norm": 0.06394419085018742, |
|
"learning_rate": 0.00015201591215255916, |
|
"loss": 0.0097, |
|
"mean_token_accuracy": 0.9976644575595855, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 11.83453237410072, |
|
"grad_norm": 0.07476579460405877, |
|
"learning_rate": 0.00015165799182312062, |
|
"loss": 0.0114, |
|
"mean_token_accuracy": 0.9969670593738555, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 11.870503597122303, |
|
"grad_norm": 0.0719444710852101, |
|
"learning_rate": 0.00015129916655425468, |
|
"loss": 0.0104, |
|
"mean_token_accuracy": 0.9972853481769561, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 11.906474820143885, |
|
"grad_norm": 0.078313419391383, |
|
"learning_rate": 0.00015093944263182583, |
|
"loss": 0.0118, |
|
"mean_token_accuracy": 0.9968084037303925, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 11.942446043165468, |
|
"grad_norm": 0.04178771006797904, |
|
"learning_rate": 0.00015057882635744098, |
|
"loss": 0.0098, |
|
"mean_token_accuracy": 0.997468900680542, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 11.97841726618705, |
|
"grad_norm": 0.06783244720344332, |
|
"learning_rate": 0.0001502173240483392, |
|
"loss": 0.0115, |
|
"mean_token_accuracy": 0.9969798445701599, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 0.12743568420410156, |
|
"eval_mean_token_accuracy": 0.98657088117166, |
|
"eval_runtime": 20.6428, |
|
"eval_samples_per_second": 5.91, |
|
"eval_steps_per_second": 0.775, |
|
"step": 1668 |
|
}, |
|
{ |
|
"epoch": 12.014388489208633, |
|
"grad_norm": 0.027342999643264056, |
|
"learning_rate": 0.00014985494203728102, |
|
"loss": 0.0103, |
|
"mean_token_accuracy": 0.9981654733419418, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 12.050359712230216, |
|
"grad_norm": 0.059963744431938956, |
|
"learning_rate": 0.00014949168667243758, |
|
"loss": 0.0072, |
|
"mean_token_accuracy": 0.9981658458709717, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 12.086330935251798, |
|
"grad_norm": 0.07816781926571713, |
|
"learning_rate": 0.00014912756431727922, |
|
"loss": 0.0069, |
|
"mean_token_accuracy": 0.9983003556728363, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 12.122302158273381, |
|
"grad_norm": 0.03878434979130554, |
|
"learning_rate": 0.00014876258135046422, |
|
"loss": 0.0077, |
|
"mean_token_accuracy": 0.9979945898056031, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 12.158273381294965, |
|
"grad_norm": 0.05123502981808833, |
|
"learning_rate": 0.00014839674416572694, |
|
"loss": 0.0062, |
|
"mean_token_accuracy": 0.9983372211456298, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 12.194244604316546, |
|
"grad_norm": 0.06471897367408934, |
|
"learning_rate": 0.00014803005917176585, |
|
"loss": 0.0068, |
|
"mean_token_accuracy": 0.9983494818210602, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 12.23021582733813, |
|
"grad_norm": 0.05910329781704424, |
|
"learning_rate": 0.00014766253279213117, |
|
"loss": 0.0076, |
|
"mean_token_accuracy": 0.9981291174888611, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 12.266187050359711, |
|
"grad_norm": 0.059920297250759154, |
|
"learning_rate": 0.00014729417146511255, |
|
"loss": 0.0081, |
|
"mean_token_accuracy": 0.9980435788631439, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 12.302158273381295, |
|
"grad_norm": 0.047092720411911114, |
|
"learning_rate": 0.00014692498164362613, |
|
"loss": 0.0083, |
|
"mean_token_accuracy": 0.9978721857070922, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 12.338129496402878, |
|
"grad_norm": 0.06484446628772765, |
|
"learning_rate": 0.0001465549697951015, |
|
"loss": 0.0081, |
|
"mean_token_accuracy": 0.9979456484317779, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 12.37410071942446, |
|
"grad_norm": 0.037828218939601234, |
|
"learning_rate": 0.00014618414240136844, |
|
"loss": 0.0074, |
|
"mean_token_accuracy": 0.9981168389320374, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 12.410071942446043, |
|
"grad_norm": 0.09326182092147371, |
|
"learning_rate": 0.00014581250595854336, |
|
"loss": 0.0079, |
|
"mean_token_accuracy": 0.9980802178382874, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 12.446043165467627, |
|
"grad_norm": 0.1146676492718464, |
|
"learning_rate": 0.00014544006697691557, |
|
"loss": 0.0089, |
|
"mean_token_accuracy": 0.9978107392787934, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 12.482014388489208, |
|
"grad_norm": 0.05343799953840085, |
|
"learning_rate": 0.00014506683198083314, |
|
"loss": 0.0084, |
|
"mean_token_accuracy": 0.9978642165660858, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 12.517985611510792, |
|
"grad_norm": 0.06686229156740404, |
|
"learning_rate": 0.00014469280750858854, |
|
"loss": 0.0074, |
|
"mean_token_accuracy": 0.9980190098285675, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 12.553956834532373, |
|
"grad_norm": 0.07182393374099447, |
|
"learning_rate": 0.0001443180001123044, |
|
"loss": 0.0078, |
|
"mean_token_accuracy": 0.9979457080364227, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 12.589928057553957, |
|
"grad_norm": 0.07145188250380506, |
|
"learning_rate": 0.00014394241635781838, |
|
"loss": 0.0073, |
|
"mean_token_accuracy": 0.9980436384677887, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 12.62589928057554, |
|
"grad_norm": 0.06052636573650373, |
|
"learning_rate": 0.00014356606282456833, |
|
"loss": 0.008, |
|
"mean_token_accuracy": 0.9978723347187042, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 12.661870503597122, |
|
"grad_norm": 0.051892477022163645, |
|
"learning_rate": 0.00014318894610547707, |
|
"loss": 0.0077, |
|
"mean_token_accuracy": 0.9979701161384582, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 12.697841726618705, |
|
"grad_norm": 0.06531042040712823, |
|
"learning_rate": 0.00014281107280683677, |
|
"loss": 0.0077, |
|
"mean_token_accuracy": 0.9981413781642914, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 12.733812949640289, |
|
"grad_norm": 0.05421707276721448, |
|
"learning_rate": 0.00014243244954819328, |
|
"loss": 0.0084, |
|
"mean_token_accuracy": 0.9978357255458832, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 12.76978417266187, |
|
"grad_norm": 0.05688320731946079, |
|
"learning_rate": 0.00014205308296223024, |
|
"loss": 0.0088, |
|
"mean_token_accuracy": 0.9977129817008972, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 12.805755395683454, |
|
"grad_norm": 0.047097464345664586, |
|
"learning_rate": 0.0001416729796946527, |
|
"loss": 0.0067, |
|
"mean_token_accuracy": 0.9982883751392364, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 12.841726618705035, |
|
"grad_norm": 0.04951718994376441, |
|
"learning_rate": 0.00014129214640407102, |
|
"loss": 0.0074, |
|
"mean_token_accuracy": 0.9980681598186493, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 12.877697841726619, |
|
"grad_norm": 0.03535068544285628, |
|
"learning_rate": 0.0001409105897618838, |
|
"loss": 0.0068, |
|
"mean_token_accuracy": 0.9982638895511627, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 12.913669064748202, |
|
"grad_norm": 0.05813590209574777, |
|
"learning_rate": 0.0001405283164521614, |
|
"loss": 0.0087, |
|
"mean_token_accuracy": 0.9977501213550568, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 12.949640287769784, |
|
"grad_norm": 0.08313242752328498, |
|
"learning_rate": 0.0001401453331715286, |
|
"loss": 0.0086, |
|
"mean_token_accuracy": 0.9979334115982056, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 12.985611510791367, |
|
"grad_norm": 0.06490143326945057, |
|
"learning_rate": 0.00013976164662904745, |
|
"loss": 0.0083, |
|
"mean_token_accuracy": 0.997908991575241, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 0.1358712911605835, |
|
"eval_mean_token_accuracy": 0.9858476608991623, |
|
"eval_runtime": 20.5836, |
|
"eval_samples_per_second": 5.927, |
|
"eval_steps_per_second": 0.777, |
|
"step": 1807 |
|
}, |
|
{ |
|
"epoch": 13.02158273381295, |
|
"grad_norm": 0.03861550158961351, |
|
"learning_rate": 0.00013937726354609962, |
|
"loss": 0.0074, |
|
"mean_token_accuracy": 0.9983490506807963, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 13.057553956834532, |
|
"grad_norm": 0.03224073083223682, |
|
"learning_rate": 0.0001389921906562687, |
|
"loss": 0.0062, |
|
"mean_token_accuracy": 0.9983859360218048, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 13.093525179856115, |
|
"grad_norm": 0.032898659395212033, |
|
"learning_rate": 0.0001386064347052223, |
|
"loss": 0.0066, |
|
"mean_token_accuracy": 0.9982513129711151, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 13.129496402877697, |
|
"grad_norm": 0.03628135892190089, |
|
"learning_rate": 0.00013822000245059378, |
|
"loss": 0.0067, |
|
"mean_token_accuracy": 0.9982879996299744, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 13.16546762589928, |
|
"grad_norm": 0.03828613461344576, |
|
"learning_rate": 0.00013783290066186391, |
|
"loss": 0.0053, |
|
"mean_token_accuracy": 0.9985626757144928, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 13.201438848920864, |
|
"grad_norm": 0.03628422925570306, |
|
"learning_rate": 0.0001374451361202423, |
|
"loss": 0.0066, |
|
"mean_token_accuracy": 0.9981167852878571, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 13.237410071942445, |
|
"grad_norm": 0.044986378614690334, |
|
"learning_rate": 0.00013705671561854867, |
|
"loss": 0.0068, |
|
"mean_token_accuracy": 0.9982267796993256, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 13.273381294964029, |
|
"grad_norm": 0.05267548343245477, |
|
"learning_rate": 0.00013666764596109365, |
|
"loss": 0.0064, |
|
"mean_token_accuracy": 0.9983249008655548, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 13.309352517985612, |
|
"grad_norm": 0.03226487542606456, |
|
"learning_rate": 0.00013627793396355983, |
|
"loss": 0.0064, |
|
"mean_token_accuracy": 0.9984836876392365, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 13.345323741007194, |
|
"grad_norm": 0.05115506820484576, |
|
"learning_rate": 0.00013588758645288217, |
|
"loss": 0.0061, |
|
"mean_token_accuracy": 0.9983738422393799, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 13.381294964028777, |
|
"grad_norm": 0.03667313092197686, |
|
"learning_rate": 0.0001354966102671285, |
|
"loss": 0.0062, |
|
"mean_token_accuracy": 0.9983859896659851, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 13.417266187050359, |
|
"grad_norm": 0.05542873770045281, |
|
"learning_rate": 0.00013510501225537976, |
|
"loss": 0.0068, |
|
"mean_token_accuracy": 0.9980922400951385, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 13.453237410071942, |
|
"grad_norm": 0.049749433850759105, |
|
"learning_rate": 0.00013471279927760997, |
|
"loss": 0.0066, |
|
"mean_token_accuracy": 0.998239153623581, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 13.489208633093526, |
|
"grad_norm": 0.04722428251226616, |
|
"learning_rate": 0.00013431997820456592, |
|
"loss": 0.0068, |
|
"mean_token_accuracy": 0.9983492016792297, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 13.525179856115107, |
|
"grad_norm": 0.06972025494758348, |
|
"learning_rate": 0.00013392655591764723, |
|
"loss": 0.0067, |
|
"mean_token_accuracy": 0.9983003556728363, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 13.56115107913669, |
|
"grad_norm": 0.04149046158387444, |
|
"learning_rate": 0.00013353253930878525, |
|
"loss": 0.006, |
|
"mean_token_accuracy": 0.9984471023082733, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 13.597122302158274, |
|
"grad_norm": 0.053027022615863284, |
|
"learning_rate": 0.00013313793528032278, |
|
"loss": 0.0066, |
|
"mean_token_accuracy": 0.9981414675712585, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 13.633093525179856, |
|
"grad_norm": 0.06016753855344703, |
|
"learning_rate": 0.0001327427507448928, |
|
"loss": 0.0058, |
|
"mean_token_accuracy": 0.9983982980251312, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 13.66906474820144, |
|
"grad_norm": 0.06568374751780803, |
|
"learning_rate": 0.00013234699262529778, |
|
"loss": 0.0063, |
|
"mean_token_accuracy": 0.9984226942062377, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 13.70503597122302, |
|
"grad_norm": 0.02812931422079084, |
|
"learning_rate": 0.000131950667854388, |
|
"loss": 0.0069, |
|
"mean_token_accuracy": 0.9982512354850769, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 13.741007194244604, |
|
"grad_norm": 0.04777322152627654, |
|
"learning_rate": 0.00013155378337494035, |
|
"loss": 0.0067, |
|
"mean_token_accuracy": 0.9982635855674744, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 13.776978417266188, |
|
"grad_norm": 0.043818355689989874, |
|
"learning_rate": 0.00013115634613953663, |
|
"loss": 0.007, |
|
"mean_token_accuracy": 0.9982267916202545, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 13.81294964028777, |
|
"grad_norm": 0.03349871508194793, |
|
"learning_rate": 0.00013075836311044175, |
|
"loss": 0.0069, |
|
"mean_token_accuracy": 0.9982512712478637, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 13.848920863309353, |
|
"grad_norm": 0.0382839105195503, |
|
"learning_rate": 0.00013035984125948178, |
|
"loss": 0.0065, |
|
"mean_token_accuracy": 0.9983247220516205, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 13.884892086330936, |
|
"grad_norm": 0.03717138922436437, |
|
"learning_rate": 0.00012996078756792186, |
|
"loss": 0.0067, |
|
"mean_token_accuracy": 0.9981537342071534, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 13.920863309352518, |
|
"grad_norm": 0.04286039147486022, |
|
"learning_rate": 0.00012956120902634378, |
|
"loss": 0.0065, |
|
"mean_token_accuracy": 0.9982879340648652, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 13.956834532374101, |
|
"grad_norm": 0.05579034741194405, |
|
"learning_rate": 0.00012916111263452368, |
|
"loss": 0.007, |
|
"mean_token_accuracy": 0.9980191111564636, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 13.992805755395683, |
|
"grad_norm": 0.04290302964746944, |
|
"learning_rate": 0.00012876050540130927, |
|
"loss": 0.0071, |
|
"mean_token_accuracy": 0.998129004240036, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 0.14095589518547058, |
|
"eval_mean_token_accuracy": 0.9844649698999193, |
|
"eval_runtime": 20.7085, |
|
"eval_samples_per_second": 5.891, |
|
"eval_steps_per_second": 0.773, |
|
"step": 1946 |
|
}, |
|
{ |
|
"epoch": 14.028776978417266, |
|
"grad_norm": 0.02728336439016839, |
|
"learning_rate": 0.00012835939434449714, |
|
"loss": 0.006, |
|
"mean_token_accuracy": 0.9983949959278107, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 14.06474820143885, |
|
"grad_norm": 0.028891124116282068, |
|
"learning_rate": 0.00012795778649070993, |
|
"loss": 0.0057, |
|
"mean_token_accuracy": 0.9985325753688812, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 14.100719424460431, |
|
"grad_norm": 0.04257323636970389, |
|
"learning_rate": 0.00012755568887527297, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.9985634684562683, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 14.136690647482014, |
|
"grad_norm": 0.06375421515135235, |
|
"learning_rate": 0.00012715310854209124, |
|
"loss": 0.0059, |
|
"mean_token_accuracy": 0.9984101951122284, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 14.172661870503598, |
|
"grad_norm": 0.02540027723004083, |
|
"learning_rate": 0.00012675005254352594, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.998593783378601, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 14.20863309352518, |
|
"grad_norm": 0.023921388854413334, |
|
"learning_rate": 0.00012634652794027087, |
|
"loss": 0.0062, |
|
"mean_token_accuracy": 0.9983613193035126, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 14.244604316546763, |
|
"grad_norm": 0.039564667117860226, |
|
"learning_rate": 0.00012594254180122886, |
|
"loss": 0.006, |
|
"mean_token_accuracy": 0.9983247637748718, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 14.280575539568344, |
|
"grad_norm": 0.03305788290352457, |
|
"learning_rate": 0.00012553810120338786, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.9987037897109985, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 14.316546762589928, |
|
"grad_norm": 0.059088222287712794, |
|
"learning_rate": 0.000125133213231697, |
|
"loss": 0.0053, |
|
"mean_token_accuracy": 0.9985817670822144, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 14.352517985611511, |
|
"grad_norm": 0.023509426202254203, |
|
"learning_rate": 0.00012472788497894236, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.9986183822154999, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 14.388489208633093, |
|
"grad_norm": 0.022890239589631503, |
|
"learning_rate": 0.00012432212354562298, |
|
"loss": 0.0057, |
|
"mean_token_accuracy": 0.9984715104103088, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 14.424460431654676, |
|
"grad_norm": 0.04733476932184841, |
|
"learning_rate": 0.00012391593603982618, |
|
"loss": 0.0056, |
|
"mean_token_accuracy": 0.9984348475933075, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 14.46043165467626, |
|
"grad_norm": 0.07353913032836777, |
|
"learning_rate": 0.0001235093295771032, |
|
"loss": 0.0066, |
|
"mean_token_accuracy": 0.998398095369339, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 14.496402877697841, |
|
"grad_norm": 0.037687053524519704, |
|
"learning_rate": 0.00012310231128034464, |
|
"loss": 0.0056, |
|
"mean_token_accuracy": 0.9984593033790589, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 14.532374100719425, |
|
"grad_norm": 0.04717362898069163, |
|
"learning_rate": 0.00012269488827965536, |
|
"loss": 0.0058, |
|
"mean_token_accuracy": 0.9983981728553772, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 14.568345323741006, |
|
"grad_norm": 0.03594233576079112, |
|
"learning_rate": 0.00012228706771223, |
|
"loss": 0.0056, |
|
"mean_token_accuracy": 0.9984471380710602, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 14.60431654676259, |
|
"grad_norm": 0.03917732285509177, |
|
"learning_rate": 0.00012187885672222752, |
|
"loss": 0.006, |
|
"mean_token_accuracy": 0.9983980774879455, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 14.640287769784173, |
|
"grad_norm": 0.02633180057530005, |
|
"learning_rate": 0.00012147026246064644, |
|
"loss": 0.0065, |
|
"mean_token_accuracy": 0.9982512533664704, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 14.676258992805755, |
|
"grad_norm": 0.0404471117892732, |
|
"learning_rate": 0.00012106129208519934, |
|
"loss": 0.0056, |
|
"mean_token_accuracy": 0.9985327005386353, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 14.712230215827338, |
|
"grad_norm": 0.06192069060353017, |
|
"learning_rate": 0.00012065195276018746, |
|
"loss": 0.0058, |
|
"mean_token_accuracy": 0.9984227299690247, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 14.748201438848922, |
|
"grad_norm": 0.04930101254092268, |
|
"learning_rate": 0.00012024225165637531, |
|
"loss": 0.0062, |
|
"mean_token_accuracy": 0.9983979761600494, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 14.784172661870503, |
|
"grad_norm": 0.024850771626895557, |
|
"learning_rate": 0.00011983219595086506, |
|
"loss": 0.0061, |
|
"mean_token_accuracy": 0.998300313949585, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 14.820143884892087, |
|
"grad_norm": 0.038918118854958376, |
|
"learning_rate": 0.00011942179282697064, |
|
"loss": 0.006, |
|
"mean_token_accuracy": 0.9983371019363403, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 14.85611510791367, |
|
"grad_norm": 0.0516184338246842, |
|
"learning_rate": 0.00011901104947409212, |
|
"loss": 0.0059, |
|
"mean_token_accuracy": 0.9983981013298034, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 14.892086330935252, |
|
"grad_norm": 0.10462590843104572, |
|
"learning_rate": 0.00011859997308758959, |
|
"loss": 0.0066, |
|
"mean_token_accuracy": 0.9981902480125427, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 14.928057553956835, |
|
"grad_norm": 0.09491933759966162, |
|
"learning_rate": 0.00011818857086865725, |
|
"loss": 0.0067, |
|
"mean_token_accuracy": 0.9982022881507874, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 14.964028776978417, |
|
"grad_norm": 0.022774250628572534, |
|
"learning_rate": 0.00011777685002419717, |
|
"loss": 0.0057, |
|
"mean_token_accuracy": 0.9985937774181366, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.034773500298789874, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.006, |
|
"mean_token_accuracy": 0.9984226584434509, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 0.14427591860294342, |
|
"eval_mean_token_accuracy": 0.9827392026782036, |
|
"eval_runtime": 20.6283, |
|
"eval_samples_per_second": 5.914, |
|
"eval_steps_per_second": 0.776, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 15.035971223021583, |
|
"grad_norm": 0.035104691061661225, |
|
"learning_rate": 0.00011695248131408394, |
|
"loss": 0.0052, |
|
"mean_token_accuracy": 0.9986181318759918, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 15.071942446043165, |
|
"grad_norm": 0.03200252235283823, |
|
"learning_rate": 0.00011653984788963775, |
|
"loss": 0.0046, |
|
"mean_token_accuracy": 0.9987406134605408, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 15.107913669064748, |
|
"grad_norm": 0.15594026975405056, |
|
"learning_rate": 0.00011612692472182463, |
|
"loss": 0.0051, |
|
"mean_token_accuracy": 0.9986916542053222, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 15.14388489208633, |
|
"grad_norm": 0.055765964735062255, |
|
"learning_rate": 0.00011571371904419053, |
|
"loss": 0.0053, |
|
"mean_token_accuracy": 0.998593682050705, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 15.179856115107913, |
|
"grad_norm": 0.029662600967569834, |
|
"learning_rate": 0.0001153002380952303, |
|
"loss": 0.0051, |
|
"mean_token_accuracy": 0.9984715342521667, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 15.215827338129497, |
|
"grad_norm": 0.033094414530212134, |
|
"learning_rate": 0.00011488648911826099, |
|
"loss": 0.0056, |
|
"mean_token_accuracy": 0.9985202550888062, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 15.251798561151078, |
|
"grad_norm": 0.05314626647068158, |
|
"learning_rate": 0.00011447247936129497, |
|
"loss": 0.0059, |
|
"mean_token_accuracy": 0.9983490586280823, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 15.287769784172662, |
|
"grad_norm": 0.045699141908757346, |
|
"learning_rate": 0.00011405821607691287, |
|
"loss": 0.0061, |
|
"mean_token_accuracy": 0.9984403252601624, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 15.323741007194245, |
|
"grad_norm": 0.022330079247343013, |
|
"learning_rate": 0.00011364370652213665, |
|
"loss": 0.0059, |
|
"mean_token_accuracy": 0.9984836757183075, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 15.359712230215827, |
|
"grad_norm": 0.04482842486338068, |
|
"learning_rate": 0.00011322895795830237, |
|
"loss": 0.0061, |
|
"mean_token_accuracy": 0.9984592318534851, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 15.39568345323741, |
|
"grad_norm": 0.035455200603189345, |
|
"learning_rate": 0.00011281397765093301, |
|
"loss": 0.0056, |
|
"mean_token_accuracy": 0.9985081374645233, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 15.431654676258994, |
|
"grad_norm": 0.03415653545073392, |
|
"learning_rate": 0.00011239877286961122, |
|
"loss": 0.0059, |
|
"mean_token_accuracy": 0.9984224319458008, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 15.467625899280575, |
|
"grad_norm": 0.022878497923323426, |
|
"learning_rate": 0.000111983350887852, |
|
"loss": 0.0048, |
|
"mean_token_accuracy": 0.9985940217971802, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 15.503597122302159, |
|
"grad_norm": 0.04143356216028894, |
|
"learning_rate": 0.00011156771898297525, |
|
"loss": 0.0061, |
|
"mean_token_accuracy": 0.9983247220516205, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 15.53956834532374, |
|
"grad_norm": 0.031009283054650786, |
|
"learning_rate": 0.00011115188443597821, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.9984225571155548, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 15.575539568345324, |
|
"grad_norm": 0.022394840577598808, |
|
"learning_rate": 0.000110735854531408, |
|
"loss": 0.0049, |
|
"mean_token_accuracy": 0.9986917078495026, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 15.611510791366907, |
|
"grad_norm": 0.02091443750785051, |
|
"learning_rate": 0.00011031963655723407, |
|
"loss": 0.0055, |
|
"mean_token_accuracy": 0.9984104752540588, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 15.647482014388489, |
|
"grad_norm": 0.02380856139135341, |
|
"learning_rate": 0.00010990323780472041, |
|
"loss": 0.0052, |
|
"mean_token_accuracy": 0.9986672401428223, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 15.683453237410072, |
|
"grad_norm": 0.033810504796281546, |
|
"learning_rate": 0.00010948666556829781, |
|
"loss": 0.0053, |
|
"mean_token_accuracy": 0.9985450327396392, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 15.719424460431654, |
|
"grad_norm": 0.03600666830072387, |
|
"learning_rate": 0.0001090699271454362, |
|
"loss": 0.0051, |
|
"mean_token_accuracy": 0.9987039625644684, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 15.755395683453237, |
|
"grad_norm": 0.03847446311106137, |
|
"learning_rate": 0.00010865302983651673, |
|
"loss": 0.0058, |
|
"mean_token_accuracy": 0.9983490526676178, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 15.79136690647482, |
|
"grad_norm": 0.04082747481136435, |
|
"learning_rate": 0.00010823598094470393, |
|
"loss": 0.0065, |
|
"mean_token_accuracy": 0.9983490526676178, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 15.827338129496402, |
|
"grad_norm": 0.03422471271789895, |
|
"learning_rate": 0.00010781878777581771, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.9984959781169891, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 15.863309352517986, |
|
"grad_norm": 0.04659967607202778, |
|
"learning_rate": 0.00010740145763820532, |
|
"loss": 0.0056, |
|
"mean_token_accuracy": 0.9985326588153839, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 15.899280575539569, |
|
"grad_norm": 0.03223979544330336, |
|
"learning_rate": 0.00010698399784261366, |
|
"loss": 0.0051, |
|
"mean_token_accuracy": 0.9985695660114289, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 15.93525179856115, |
|
"grad_norm": 0.021828243243609862, |
|
"learning_rate": 0.0001065664157020607, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.998581486940384, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 15.971223021582734, |
|
"grad_norm": 0.057542338253446325, |
|
"learning_rate": 0.00010614871853170781, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.998410427570343, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 0.1483013927936554, |
|
"eval_mean_token_accuracy": 0.9880256205797195, |
|
"eval_runtime": 20.8076, |
|
"eval_samples_per_second": 5.863, |
|
"eval_steps_per_second": 0.769, |
|
"step": 2224 |
|
}, |
|
{ |
|
"epoch": 16.007194244604317, |
|
"grad_norm": 0.02766002764688597, |
|
"learning_rate": 0.00010573091364873132, |
|
"loss": 0.005, |
|
"mean_token_accuracy": 0.9988994002342224, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 16.0431654676259, |
|
"grad_norm": 0.023692963524001718, |
|
"learning_rate": 0.00010531300837219455, |
|
"loss": 0.0048, |
|
"mean_token_accuracy": 0.998691588640213, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 16.07913669064748, |
|
"grad_norm": 0.029459937106562046, |
|
"learning_rate": 0.00010489501002291952, |
|
"loss": 0.0053, |
|
"mean_token_accuracy": 0.998544842004776, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 16.115107913669064, |
|
"grad_norm": 0.035502170379668525, |
|
"learning_rate": 0.00010447692592335861, |
|
"loss": 0.0047, |
|
"mean_token_accuracy": 0.9986058592796325, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 16.151079136690647, |
|
"grad_norm": 0.03791085471996512, |
|
"learning_rate": 0.00010405876339746636, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.9988628804683686, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 16.18705035971223, |
|
"grad_norm": 0.02626409405940899, |
|
"learning_rate": 0.00010364052977057126, |
|
"loss": 0.0051, |
|
"mean_token_accuracy": 0.9985937297344207, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 16.223021582733814, |
|
"grad_norm": 0.04031732388074853, |
|
"learning_rate": 0.00010322223236924727, |
|
"loss": 0.0049, |
|
"mean_token_accuracy": 0.9987038433551788, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 16.258992805755394, |
|
"grad_norm": 0.01795687371349505, |
|
"learning_rate": 0.00010280387852118554, |
|
"loss": 0.0049, |
|
"mean_token_accuracy": 0.9986060202121735, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 16.294964028776977, |
|
"grad_norm": 0.04430468807526254, |
|
"learning_rate": 0.00010238547555506614, |
|
"loss": 0.005, |
|
"mean_token_accuracy": 0.9984959602355957, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 16.33093525179856, |
|
"grad_norm": 0.05249900301460759, |
|
"learning_rate": 0.00010196703080042946, |
|
"loss": 0.0052, |
|
"mean_token_accuracy": 0.9986181795597077, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 16.366906474820144, |
|
"grad_norm": 0.021493191230659354, |
|
"learning_rate": 0.00010154855158754805, |
|
"loss": 0.0046, |
|
"mean_token_accuracy": 0.998752897977829, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 16.402877697841728, |
|
"grad_norm": 0.0833831847193212, |
|
"learning_rate": 0.00010113004524729799, |
|
"loss": 0.0057, |
|
"mean_token_accuracy": 0.9984051644802093, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 16.43884892086331, |
|
"grad_norm": 0.030622972707148307, |
|
"learning_rate": 0.00010071151911103063, |
|
"loss": 0.0055, |
|
"mean_token_accuracy": 0.9984959185123443, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 16.47482014388489, |
|
"grad_norm": 0.030014388256583948, |
|
"learning_rate": 0.00010029298051044414, |
|
"loss": 0.0049, |
|
"mean_token_accuracy": 0.9984592616558075, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 16.510791366906474, |
|
"grad_norm": 0.023782333462141935, |
|
"learning_rate": 9.987443677745496e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.9986916482448578, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 16.546762589928058, |
|
"grad_norm": 0.0583624667571686, |
|
"learning_rate": 9.945589524406951e-05, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.9984225928783417, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 16.58273381294964, |
|
"grad_norm": 0.025457368768833585, |
|
"learning_rate": 9.90373632422556e-05, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.9985692918300628, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 16.618705035971225, |
|
"grad_norm": 0.027459291921463742, |
|
"learning_rate": 9.861884810381417e-05, |
|
"loss": 0.0047, |
|
"mean_token_accuracy": 0.9986428856849671, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 16.654676258992804, |
|
"grad_norm": 0.03495997309084576, |
|
"learning_rate": 9.820035716025068e-05, |
|
"loss": 0.005, |
|
"mean_token_accuracy": 0.9984227001667023, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 16.690647482014388, |
|
"grad_norm": 0.03329874416462551, |
|
"learning_rate": 9.77818977426467e-05, |
|
"loss": 0.0048, |
|
"mean_token_accuracy": 0.998716139793396, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 16.72661870503597, |
|
"grad_norm": 0.03740885190723497, |
|
"learning_rate": 9.73634771815317e-05, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.9985325634479523, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 16.762589928057555, |
|
"grad_norm": 0.027197902407817453, |
|
"learning_rate": 9.694510280675423e-05, |
|
"loss": 0.005, |
|
"mean_token_accuracy": 0.998703807592392, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 16.798561151079138, |
|
"grad_norm": 0.025041006989781844, |
|
"learning_rate": 9.652678194735394e-05, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.9986181437969208, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 16.834532374100718, |
|
"grad_norm": 0.024478935060570445, |
|
"learning_rate": 9.610852193143299e-05, |
|
"loss": 0.0053, |
|
"mean_token_accuracy": 0.9984714210033416, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 16.8705035971223, |
|
"grad_norm": 0.03976876347602272, |
|
"learning_rate": 9.569033008602756e-05, |
|
"loss": 0.0058, |
|
"mean_token_accuracy": 0.9983245432376862, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 16.906474820143885, |
|
"grad_norm": 0.028228360140161907, |
|
"learning_rate": 9.527221373697973e-05, |
|
"loss": 0.0049, |
|
"mean_token_accuracy": 0.9986182987689972, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 16.942446043165468, |
|
"grad_norm": 0.022048124762114728, |
|
"learning_rate": 9.485418020880907e-05, |
|
"loss": 0.0049, |
|
"mean_token_accuracy": 0.9986796140670776, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 16.97841726618705, |
|
"grad_norm": 0.026867898824077054, |
|
"learning_rate": 9.44362368245842e-05, |
|
"loss": 0.0053, |
|
"mean_token_accuracy": 0.9984837353229523, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 0.15214376151561737, |
|
"eval_mean_token_accuracy": 0.986979441209273, |
|
"eval_runtime": 20.6429, |
|
"eval_samples_per_second": 5.91, |
|
"eval_steps_per_second": 0.775, |
|
"step": 2363 |
|
}, |
|
{ |
|
"epoch": 17.014388489208635, |
|
"grad_norm": 0.022380133629246797, |
|
"learning_rate": 9.401839090579462e-05, |
|
"loss": 0.0048, |
|
"mean_token_accuracy": 0.9988689571619034, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 17.050359712230215, |
|
"grad_norm": 0.017392820309802014, |
|
"learning_rate": 9.360064977222262e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9988016843795776, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 17.086330935251798, |
|
"grad_norm": 0.028255077169732083, |
|
"learning_rate": 9.31830207418146e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9988139510154724, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 17.12230215827338, |
|
"grad_norm": 0.024878084296257166, |
|
"learning_rate": 9.276551113055337e-05, |
|
"loss": 0.0045, |
|
"mean_token_accuracy": 0.9986426115036011, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 17.158273381294965, |
|
"grad_norm": 0.020588745089007283, |
|
"learning_rate": 9.23481282523296e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.9987526893615722, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 17.194244604316548, |
|
"grad_norm": 0.03258067411700634, |
|
"learning_rate": 9.193087941881397e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9988873302936554, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 17.230215827338128, |
|
"grad_norm": 0.032706145299279836, |
|
"learning_rate": 9.151377193932903e-05, |
|
"loss": 0.0052, |
|
"mean_token_accuracy": 0.9985202550888062, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 17.26618705035971, |
|
"grad_norm": 0.023979072230274387, |
|
"learning_rate": 9.109681312072091e-05, |
|
"loss": 0.0045, |
|
"mean_token_accuracy": 0.9987037956714631, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 17.302158273381295, |
|
"grad_norm": 0.03210227027143969, |
|
"learning_rate": 9.068001026723166e-05, |
|
"loss": 0.005, |
|
"mean_token_accuracy": 0.9985203862190246, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 17.33812949640288, |
|
"grad_norm": 0.02958254695418276, |
|
"learning_rate": 9.026337068037122e-05, |
|
"loss": 0.0047, |
|
"mean_token_accuracy": 0.9986793696880341, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 17.37410071942446, |
|
"grad_norm": 0.02003759643098566, |
|
"learning_rate": 8.984690165878921e-05, |
|
"loss": 0.0048, |
|
"mean_token_accuracy": 0.9985570669174194, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 17.41007194244604, |
|
"grad_norm": 0.01976792000129784, |
|
"learning_rate": 8.943061049814752e-05, |
|
"loss": 0.0045, |
|
"mean_token_accuracy": 0.998789393901825, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 17.446043165467625, |
|
"grad_norm": 0.022841179893650605, |
|
"learning_rate": 8.901450449099214e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9988627791404724, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 17.48201438848921, |
|
"grad_norm": 0.02670252796794463, |
|
"learning_rate": 8.859859092662563e-05, |
|
"loss": 0.005, |
|
"mean_token_accuracy": 0.9986181497573853, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 17.51798561151079, |
|
"grad_norm": 0.025962870388272177, |
|
"learning_rate": 8.818287709097947e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.9987486064434051, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 17.553956834532375, |
|
"grad_norm": 0.02094749161409735, |
|
"learning_rate": 8.776737026648605e-05, |
|
"loss": 0.0047, |
|
"mean_token_accuracy": 0.9986182987689972, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 17.58992805755396, |
|
"grad_norm": 0.02161896425598796, |
|
"learning_rate": 8.735207773195156e-05, |
|
"loss": 0.0047, |
|
"mean_token_accuracy": 0.9986915528774262, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 17.62589928057554, |
|
"grad_norm": 0.018472080207272413, |
|
"learning_rate": 8.693700676242828e-05, |
|
"loss": 0.0049, |
|
"mean_token_accuracy": 0.9985081493854523, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 17.66187050359712, |
|
"grad_norm": 0.02228454767779503, |
|
"learning_rate": 8.652216462908698e-05, |
|
"loss": 0.0049, |
|
"mean_token_accuracy": 0.9986059486865997, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 17.697841726618705, |
|
"grad_norm": 0.040612353279765694, |
|
"learning_rate": 8.610755859908991e-05, |
|
"loss": 0.0051, |
|
"mean_token_accuracy": 0.9985325336456299, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 17.73381294964029, |
|
"grad_norm": 0.022409599302205964, |
|
"learning_rate": 8.569319593546309e-05, |
|
"loss": 0.0051, |
|
"mean_token_accuracy": 0.9984713613986969, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 17.769784172661872, |
|
"grad_norm": 0.022125836247721072, |
|
"learning_rate": 8.527908389696936e-05, |
|
"loss": 0.0053, |
|
"mean_token_accuracy": 0.9985570132732391, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 17.805755395683452, |
|
"grad_norm": 0.023112422737157953, |
|
"learning_rate": 8.486522973798126e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9987773120403289, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 17.841726618705035, |
|
"grad_norm": 0.019917824060406528, |
|
"learning_rate": 8.445164070835357e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.9987040340900422, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 17.87769784172662, |
|
"grad_norm": 0.01946476215758075, |
|
"learning_rate": 8.403832405329671e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.9987283408641815, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 17.913669064748202, |
|
"grad_norm": 0.0230992291308728, |
|
"learning_rate": 8.362528701324976e-05, |
|
"loss": 0.0054, |
|
"mean_token_accuracy": 0.9984836399555206, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 17.949640287769785, |
|
"grad_norm": 0.02120013351137524, |
|
"learning_rate": 8.321253682375324e-05, |
|
"loss": 0.0049, |
|
"mean_token_accuracy": 0.9986916482448578, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 17.985611510791365, |
|
"grad_norm": 0.02970778760107319, |
|
"learning_rate": 8.2800080715323e-05, |
|
"loss": 0.0048, |
|
"mean_token_accuracy": 0.9986183524131775, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 0.15446293354034424, |
|
"eval_mean_token_accuracy": 0.9858783274888993, |
|
"eval_runtime": 20.4074, |
|
"eval_samples_per_second": 5.978, |
|
"eval_steps_per_second": 0.784, |
|
"step": 2502 |
|
}, |
|
{ |
|
"epoch": 18.02158273381295, |
|
"grad_norm": 0.01959404120242462, |
|
"learning_rate": 8.238792591332299e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.999062736829122, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 18.057553956834532, |
|
"grad_norm": 0.021486196998924824, |
|
"learning_rate": 8.197607963783889e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9987650036811828, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 18.093525179856115, |
|
"grad_norm": 0.02817489227154816, |
|
"learning_rate": 8.156454910355183e-05, |
|
"loss": 0.0049, |
|
"mean_token_accuracy": 0.9985775172710418, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 18.1294964028777, |
|
"grad_norm": 0.026885737104364704, |
|
"learning_rate": 8.115334151961158e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9987282276153564, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 18.165467625899282, |
|
"grad_norm": 0.02789797766744541, |
|
"learning_rate": 8.07424640895107e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9987159430980682, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 18.201438848920862, |
|
"grad_norm": 0.025083417796611586, |
|
"learning_rate": 8.033192401095808e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9987037479877472, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 18.237410071942445, |
|
"grad_norm": 0.019413881714524635, |
|
"learning_rate": 7.99217284757528e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.9987403869628906, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 18.27338129496403, |
|
"grad_norm": 0.024314824267057528, |
|
"learning_rate": 7.951188466965848e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.9986917316913605, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 18.309352517985612, |
|
"grad_norm": 0.02065428863541667, |
|
"learning_rate": 7.910239977227708e-05, |
|
"loss": 0.0038, |
|
"mean_token_accuracy": 0.9988263070583343, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 18.345323741007196, |
|
"grad_norm": 0.022929150955455176, |
|
"learning_rate": 7.869328095692312e-05, |
|
"loss": 0.0042, |
|
"mean_token_accuracy": 0.9987528324127197, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 18.381294964028775, |
|
"grad_norm": 0.024841087612176065, |
|
"learning_rate": 7.828453539049839e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.9986672222614288, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 18.41726618705036, |
|
"grad_norm": 0.02829228325153677, |
|
"learning_rate": 7.787617023336583e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9987404704093933, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 18.453237410071942, |
|
"grad_norm": 0.019138233811495497, |
|
"learning_rate": 7.74681926392247e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.9987406373023987, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 18.489208633093526, |
|
"grad_norm": 0.02591841929384043, |
|
"learning_rate": 7.706060975498486e-05, |
|
"loss": 0.0047, |
|
"mean_token_accuracy": 0.9985324561595916, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 18.52517985611511, |
|
"grad_norm": 0.017988718910731748, |
|
"learning_rate": 7.665342872064156e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.9986671388149262, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 18.56115107913669, |
|
"grad_norm": 0.022342004155552982, |
|
"learning_rate": 7.624665666915068e-05, |
|
"loss": 0.005, |
|
"mean_token_accuracy": 0.9986057758331299, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 18.597122302158272, |
|
"grad_norm": 0.02255746365433144, |
|
"learning_rate": 7.584030072630351e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9988750219345093, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 18.633093525179856, |
|
"grad_norm": 0.02826855123526112, |
|
"learning_rate": 7.543436801060187e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.9987161874771118, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 18.66906474820144, |
|
"grad_norm": 0.027035933219826336, |
|
"learning_rate": 7.502886563313376e-05, |
|
"loss": 0.0046, |
|
"mean_token_accuracy": 0.9986913681030274, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 18.705035971223023, |
|
"grad_norm": 0.02493410112387194, |
|
"learning_rate": 7.462380069744832e-05, |
|
"loss": 0.0046, |
|
"mean_token_accuracy": 0.9986426711082459, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 18.741007194244606, |
|
"grad_norm": 0.0262740477326827, |
|
"learning_rate": 7.421918029943181e-05, |
|
"loss": 0.0053, |
|
"mean_token_accuracy": 0.9984836339950561, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 18.776978417266186, |
|
"grad_norm": 0.020297919413359653, |
|
"learning_rate": 7.381501152718308e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9986794233322144, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 18.81294964028777, |
|
"grad_norm": 0.024175582709317186, |
|
"learning_rate": 7.341130146088935e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.9986303865909576, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 18.848920863309353, |
|
"grad_norm": 0.02486759536470699, |
|
"learning_rate": 7.30080571727024e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.9987650454044342, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 18.884892086330936, |
|
"grad_norm": 0.026194580551628172, |
|
"learning_rate": 7.26052857266145e-05, |
|
"loss": 0.0042, |
|
"mean_token_accuracy": 0.9988141477108001, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 18.92086330935252, |
|
"grad_norm": 0.024554347584631382, |
|
"learning_rate": 7.220299417833472e-05, |
|
"loss": 0.0045, |
|
"mean_token_accuracy": 0.9986916840076446, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 18.9568345323741, |
|
"grad_norm": 0.02228534507545218, |
|
"learning_rate": 7.180118957516533e-05, |
|
"loss": 0.0047, |
|
"mean_token_accuracy": 0.9986916720867157, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 18.992805755395683, |
|
"grad_norm": 0.022918389634665453, |
|
"learning_rate": 7.139987895587836e-05, |
|
"loss": 0.0047, |
|
"mean_token_accuracy": 0.9986428201198578, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 0.1591736525297165, |
|
"eval_mean_token_accuracy": 0.9844231969780393, |
|
"eval_runtime": 20.6939, |
|
"eval_samples_per_second": 5.895, |
|
"eval_steps_per_second": 0.773, |
|
"step": 2641 |
|
}, |
|
{ |
|
"epoch": 19.028776978417266, |
|
"grad_norm": 0.02209360258119876, |
|
"learning_rate": 7.099906935059229e-05, |
|
"loss": 0.0042, |
|
"mean_token_accuracy": 0.998822808265686, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 19.06474820143885, |
|
"grad_norm": 0.022577796363744688, |
|
"learning_rate": 7.059876778064885e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9988506972789765, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 19.100719424460433, |
|
"grad_norm": 0.017119283944210285, |
|
"learning_rate": 7.019898125849004e-05, |
|
"loss": 0.0038, |
|
"mean_token_accuracy": 0.9987404644489288, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 19.136690647482013, |
|
"grad_norm": 0.020443101276389687, |
|
"learning_rate": 6.97997167875354e-05, |
|
"loss": 0.0042, |
|
"mean_token_accuracy": 0.9987649381160736, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 19.172661870503596, |
|
"grad_norm": 0.026169139333505773, |
|
"learning_rate": 6.940098136205917e-05, |
|
"loss": 0.0038, |
|
"mean_token_accuracy": 0.9988015532493592, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 19.20863309352518, |
|
"grad_norm": 0.030602977469022324, |
|
"learning_rate": 6.90027819670678e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9987772822380065, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 19.244604316546763, |
|
"grad_norm": 0.027716828357940895, |
|
"learning_rate": 6.860512557817767e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.998663604259491, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 19.280575539568346, |
|
"grad_norm": 0.024284021779463878, |
|
"learning_rate": 6.82080191614928e-05, |
|
"loss": 0.0042, |
|
"mean_token_accuracy": 0.9986548125743866, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 19.31654676258993, |
|
"grad_norm": 0.02384668123159404, |
|
"learning_rate": 6.781146967348284e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9988385021686554, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 19.35251798561151, |
|
"grad_norm": 0.023240608787474342, |
|
"learning_rate": 6.741548406086126e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9988384068012237, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 19.388489208633093, |
|
"grad_norm": 0.02403795582857897, |
|
"learning_rate": 6.70200692604636e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9988505661487579, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 19.424460431654676, |
|
"grad_norm": 0.021246091430060766, |
|
"learning_rate": 6.662523219912595e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.9987038612365723, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 19.46043165467626, |
|
"grad_norm": 0.026758139834932266, |
|
"learning_rate": 6.623097979356367e-05, |
|
"loss": 0.0042, |
|
"mean_token_accuracy": 0.998716127872467, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 19.496402877697843, |
|
"grad_norm": 0.023539086602775094, |
|
"learning_rate": 6.583731895025014e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9988018155097962, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 19.532374100719423, |
|
"grad_norm": 0.023275067809463437, |
|
"learning_rate": 6.544425656529582e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.9988016784191132, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 19.568345323741006, |
|
"grad_norm": 0.01761053119993567, |
|
"learning_rate": 6.505179952432748e-05, |
|
"loss": 0.0037, |
|
"mean_token_accuracy": 0.9988506674766541, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 19.60431654676259, |
|
"grad_norm": 0.01935988487682489, |
|
"learning_rate": 6.465995470236743e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9986671209335327, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 19.640287769784173, |
|
"grad_norm": 0.02366340939887518, |
|
"learning_rate": 6.426872896371331e-05, |
|
"loss": 0.0042, |
|
"mean_token_accuracy": 0.9987650036811828, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 19.676258992805757, |
|
"grad_norm": 0.02674071489195662, |
|
"learning_rate": 6.387812916181772e-05, |
|
"loss": 0.0042, |
|
"mean_token_accuracy": 0.9988261640071869, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 19.71223021582734, |
|
"grad_norm": 0.02265806365028318, |
|
"learning_rate": 6.348816213916802e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9986304640769958, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 19.74820143884892, |
|
"grad_norm": 0.017940892755793966, |
|
"learning_rate": 6.309883472716677e-05, |
|
"loss": 0.0038, |
|
"mean_token_accuracy": 0.9988262236118317, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 19.784172661870503, |
|
"grad_norm": 0.020099792386730327, |
|
"learning_rate": 6.271015374601179e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.9986548662185669, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 19.820143884892087, |
|
"grad_norm": 0.0234673815288768, |
|
"learning_rate": 6.232212600457684e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.998654842376709, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 19.85611510791367, |
|
"grad_norm": 0.02264543742102836, |
|
"learning_rate": 6.193475830029232e-05, |
|
"loss": 0.0047, |
|
"mean_token_accuracy": 0.9985569298267365, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 19.892086330935253, |
|
"grad_norm": 0.03458114838915935, |
|
"learning_rate": 6.154805741902608e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9987527251243591, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 19.928057553956833, |
|
"grad_norm": 0.0232135058890561, |
|
"learning_rate": 6.116203013496471e-05, |
|
"loss": 0.0038, |
|
"mean_token_accuracy": 0.9988996028900147, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 19.964028776978417, |
|
"grad_norm": 0.020975513905473402, |
|
"learning_rate": 6.0776683210494766e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9986794054508209, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.02375239274771892, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 0.0046, |
|
"mean_token_accuracy": 0.9986914992332458, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 0.162822425365448, |
|
"eval_mean_token_accuracy": 0.9826169647276402, |
|
"eval_runtime": 20.7014, |
|
"eval_samples_per_second": 5.893, |
|
"eval_steps_per_second": 0.773, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 20.035971223021583, |
|
"grad_norm": 0.020169789952639114, |
|
"learning_rate": 6.0008057430164755e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9987894237041474, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 20.071942446043167, |
|
"grad_norm": 0.024950995662450234, |
|
"learning_rate": 5.9624792039012634e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.9988994717597961, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 20.107913669064747, |
|
"grad_norm": 0.022289199748791136, |
|
"learning_rate": 5.9242233936631974e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.9988993704319, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 20.14388489208633, |
|
"grad_norm": 0.017031427967920152, |
|
"learning_rate": 5.886038982463658e-05, |
|
"loss": 0.0038, |
|
"mean_token_accuracy": 0.9988994240760803, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 20.179856115107913, |
|
"grad_norm": 0.02769921472621731, |
|
"learning_rate": 5.847926639213259e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.9989483714103699, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 20.215827338129497, |
|
"grad_norm": 0.020218427009491082, |
|
"learning_rate": 5.809887031560137e-05, |
|
"loss": 0.0038, |
|
"mean_token_accuracy": 0.9988261520862579, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 20.25179856115108, |
|
"grad_norm": 0.023672867362183343, |
|
"learning_rate": 5.771920825878268e-05, |
|
"loss": 0.0038, |
|
"mean_token_accuracy": 0.9987159848213196, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 20.28776978417266, |
|
"grad_norm": 0.01825857572816428, |
|
"learning_rate": 5.734028687255751e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.9987772822380065, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 20.323741007194243, |
|
"grad_norm": 0.019161583401344055, |
|
"learning_rate": 5.6962112794832144e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9988016784191132, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 20.359712230215827, |
|
"grad_norm": 0.020980808156140812, |
|
"learning_rate": 5.65846926504214e-05, |
|
"loss": 0.0042, |
|
"mean_token_accuracy": 0.9987405419349671, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 20.39568345323741, |
|
"grad_norm": 0.02213766137578922, |
|
"learning_rate": 5.620803305093282e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.998777174949646, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 20.431654676258994, |
|
"grad_norm": 0.020197374684862185, |
|
"learning_rate": 5.583214059465094e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.998789495229721, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 20.467625899280577, |
|
"grad_norm": 0.02848380661932676, |
|
"learning_rate": 5.545702186642132e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.9988626658916473, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 20.503597122302157, |
|
"grad_norm": 0.021978948352305485, |
|
"learning_rate": 5.5082683437535574e-05, |
|
"loss": 0.0037, |
|
"mean_token_accuracy": 0.998862886428833, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 20.53956834532374, |
|
"grad_norm": 0.02155398957358224, |
|
"learning_rate": 5.470913186561616e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9987405002117157, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 20.575539568345324, |
|
"grad_norm": 0.021479791644405853, |
|
"learning_rate": 5.433637369450123e-05, |
|
"loss": 0.0037, |
|
"mean_token_accuracy": 0.9988384425640107, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 20.611510791366907, |
|
"grad_norm": 0.020996011807413666, |
|
"learning_rate": 5.39644154541305e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.998924195766449, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 20.64748201438849, |
|
"grad_norm": 0.01978564420652077, |
|
"learning_rate": 5.359326366043047e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9989973843097687, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 20.68345323741007, |
|
"grad_norm": 0.02623546539682555, |
|
"learning_rate": 5.322292481520027e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9986303389072418, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 20.719424460431654, |
|
"grad_norm": 0.024843856079622802, |
|
"learning_rate": 5.285340540599808e-05, |
|
"loss": 0.0044, |
|
"mean_token_accuracy": 0.9985324263572692, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 20.755395683453237, |
|
"grad_norm": 0.024519174672566837, |
|
"learning_rate": 5.2484711906027084e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9988139152526856, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 20.79136690647482, |
|
"grad_norm": 0.027279076643368993, |
|
"learning_rate": 5.211685077402246e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9988231182098388, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 20.827338129496404, |
|
"grad_norm": 0.0261791376934865, |
|
"learning_rate": 5.1749828454137996e-05, |
|
"loss": 0.0043, |
|
"mean_token_accuracy": 0.9986548364162445, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 20.863309352517987, |
|
"grad_norm": 0.025300978285089024, |
|
"learning_rate": 5.138365137583314e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.9987772285938263, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 20.899280575539567, |
|
"grad_norm": 0.02099648760004841, |
|
"learning_rate": 5.101832595376059e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9988138735294342, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 20.93525179856115, |
|
"grad_norm": 0.018269036560369226, |
|
"learning_rate": 5.065385858765383e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.998667049407959, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 20.971223021582734, |
|
"grad_norm": 0.021112292097246896, |
|
"learning_rate": 5.0290255662214945e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.998618096113205, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_loss": 0.16446451842784882, |
|
"eval_mean_token_accuracy": 0.9880100190639496, |
|
"eval_runtime": 20.8403, |
|
"eval_samples_per_second": 5.854, |
|
"eval_steps_per_second": 0.768, |
|
"step": 2919 |
|
}, |
|
{ |
|
"epoch": 21.007194244604317, |
|
"grad_norm": 0.022743077396830767, |
|
"learning_rate": 4.992752354700292e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9993276000022888, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 21.0431654676259, |
|
"grad_norm": 0.020371511578122142, |
|
"learning_rate": 4.956566859632183e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.9987894117832183, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 21.07913669064748, |
|
"grad_norm": 0.016322090768065737, |
|
"learning_rate": 4.920469714910982e-05, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.9991074562072754, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 21.115107913669064, |
|
"grad_norm": 0.02086363099641971, |
|
"learning_rate": 4.8844615528827874e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9989607691764831, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 21.151079136690647, |
|
"grad_norm": 0.02198403181646824, |
|
"learning_rate": 4.8485430043348955e-05, |
|
"loss": 0.0037, |
|
"mean_token_accuracy": 0.9988505244255066, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 21.18705035971223, |
|
"grad_norm": 0.020254569354367712, |
|
"learning_rate": 4.812714698484784e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.99900963306427, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 21.223021582733814, |
|
"grad_norm": 0.02242033220917943, |
|
"learning_rate": 4.776977262969057e-05, |
|
"loss": 0.0037, |
|
"mean_token_accuracy": 0.99886274933815, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 21.258992805755394, |
|
"grad_norm": 0.027896489445715905, |
|
"learning_rate": 4.7413313238324556e-05, |
|
"loss": 0.0037, |
|
"mean_token_accuracy": 0.9988628089427948, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 21.294964028776977, |
|
"grad_norm": 0.026693838170829185, |
|
"learning_rate": 4.705777505516904e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.9988016128540039, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 21.33093525179856, |
|
"grad_norm": 0.02617631180403082, |
|
"learning_rate": 4.6703164308505634e-05, |
|
"loss": 0.0037, |
|
"mean_token_accuracy": 0.9987404823303223, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 21.366906474820144, |
|
"grad_norm": 0.02094580431705213, |
|
"learning_rate": 4.63494872103692e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.9988383114337921, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 21.402877697841728, |
|
"grad_norm": 0.024972295644906873, |
|
"learning_rate": 4.599674995643909e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9987526178359986, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 21.43884892086331, |
|
"grad_norm": 0.026511984691810955, |
|
"learning_rate": 4.564495872593041e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9986182391643524, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 21.47482014388489, |
|
"grad_norm": 0.02770322638498023, |
|
"learning_rate": 4.5294119681486066e-05, |
|
"loss": 0.0038, |
|
"mean_token_accuracy": 0.9988994836807251, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 21.510791366906474, |
|
"grad_norm": 0.025625116896544668, |
|
"learning_rate": 4.494423896906864e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.9989360928535461, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 21.546762589928058, |
|
"grad_norm": 0.026657201513430866, |
|
"learning_rate": 4.459532271785273e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9986671328544616, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 21.58273381294964, |
|
"grad_norm": 0.023164027774429576, |
|
"learning_rate": 4.42473770401176e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9986792922019958, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 21.618705035971225, |
|
"grad_norm": 0.01911923815995147, |
|
"learning_rate": 4.390040803114015e-05, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9990341305732727, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 21.654676258992804, |
|
"grad_norm": 0.01988333819776402, |
|
"learning_rate": 4.355442176908798e-05, |
|
"loss": 0.0045, |
|
"mean_token_accuracy": 0.9987158477306366, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 21.690647482014388, |
|
"grad_norm": 0.024407661206665206, |
|
"learning_rate": 4.3209424314913174e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9989240050315857, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 21.72661870503597, |
|
"grad_norm": 0.021530015839347865, |
|
"learning_rate": 4.286542171224589e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.9987647831439972, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 21.762589928057555, |
|
"grad_norm": 0.02141074113834186, |
|
"learning_rate": 4.252241998728861e-05, |
|
"loss": 0.0041, |
|
"mean_token_accuracy": 0.9986670732498169, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 21.798561151079138, |
|
"grad_norm": 0.02332310027280023, |
|
"learning_rate": 4.218042514871058e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.9988139569759369, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 21.834532374100718, |
|
"grad_norm": 0.02040834265816528, |
|
"learning_rate": 4.183944318754238e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.9987895727157593, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 21.8705035971223, |
|
"grad_norm": 0.025359310719483122, |
|
"learning_rate": 4.149948007707126e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.998899495601654, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 21.906474820143885, |
|
"grad_norm": 0.02406626773998849, |
|
"learning_rate": 4.116054177273627e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9986980199813843, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 21.942446043165468, |
|
"grad_norm": 0.020726891185531986, |
|
"learning_rate": 4.082263421202403e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9987283110618591, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 21.97841726618705, |
|
"grad_norm": 0.019329853559948335, |
|
"learning_rate": 4.0485763314364735e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.9988263309001922, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 0.16608409583568573, |
|
"eval_mean_token_accuracy": 0.987030259587548, |
|
"eval_runtime": 20.6843, |
|
"eval_samples_per_second": 5.898, |
|
"eval_steps_per_second": 0.774, |
|
"step": 3058 |
|
}, |
|
{ |
|
"epoch": 22.014388489208635, |
|
"grad_norm": 0.021157088143697933, |
|
"learning_rate": 4.0149934981028294e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9990525096654892, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 22.050359712230215, |
|
"grad_norm": 0.02223133996568501, |
|
"learning_rate": 3.9815155095021215e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9990095376968384, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 22.086330935251798, |
|
"grad_norm": 0.022466272840321055, |
|
"learning_rate": 3.948142952098336e-05, |
|
"loss": 0.0037, |
|
"mean_token_accuracy": 0.9988016068935395, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 22.12230215827338, |
|
"grad_norm": 0.021919057069398156, |
|
"learning_rate": 3.914876410508528e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9988504886627197, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 22.158273381294965, |
|
"grad_norm": 0.024330160771715296, |
|
"learning_rate": 3.8817164674925766e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.9988466024398803, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 22.194244604316548, |
|
"grad_norm": 0.02730664438385192, |
|
"learning_rate": 3.848663703942981e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.9988750219345093, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 22.230215827338128, |
|
"grad_norm": 0.02072812988318122, |
|
"learning_rate": 3.815718698874672e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9990951418876648, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 22.26618705035971, |
|
"grad_norm": 0.021412173412854167, |
|
"learning_rate": 3.78288202941489e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9989484190940857, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 22.302158273381295, |
|
"grad_norm": 0.017054242580422836, |
|
"learning_rate": 3.750154270793058e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9989240109920502, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 22.33812949640288, |
|
"grad_norm": 0.01980247762228368, |
|
"learning_rate": 3.717535996330711e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.998972886800766, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 22.37410071942446, |
|
"grad_norm": 0.02184699946449326, |
|
"learning_rate": 3.6850277774314544e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9989607214927674, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 22.41007194244604, |
|
"grad_norm": 0.013978977988707543, |
|
"learning_rate": 3.652630183570941e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.999009644985199, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 22.446043165467625, |
|
"grad_norm": 0.025100662338247242, |
|
"learning_rate": 3.620343782286917e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9988382995128632, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 22.48201438848921, |
|
"grad_norm": 0.018821374531494295, |
|
"learning_rate": 3.588169139169263e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9988874316215515, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 22.51798561151079, |
|
"grad_norm": 0.023763269773152667, |
|
"learning_rate": 3.5561068178500945e-05, |
|
"loss": 0.0038, |
|
"mean_token_accuracy": 0.9987893342971802, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 22.553956834532375, |
|
"grad_norm": 0.021714269090932634, |
|
"learning_rate": 3.524157379993882e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9989117383956909, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 22.58992805755396, |
|
"grad_norm": 0.034796645480556165, |
|
"learning_rate": 3.49232138528762e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9988261342048645, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 22.62589928057554, |
|
"grad_norm": 0.02326412744594279, |
|
"learning_rate": 3.460599391431008e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9989117622375489, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 22.66187050359712, |
|
"grad_norm": 0.025486689306682863, |
|
"learning_rate": 3.428991954126698e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9987648904323578, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 22.697841726618705, |
|
"grad_norm": 0.020440416523992887, |
|
"learning_rate": 3.397499627070552e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9989974737167359, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 22.73381294964029, |
|
"grad_norm": 0.02978506453625392, |
|
"learning_rate": 3.366122961941937e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9986671507358551, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 22.769784172661872, |
|
"grad_norm": 0.03112304520385107, |
|
"learning_rate": 3.3348625083940785e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.9988505899906158, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 22.805755395683452, |
|
"grad_norm": 0.02655579795920087, |
|
"learning_rate": 3.3037188140443995e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9989850640296936, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 22.841726618705035, |
|
"grad_norm": 0.022226423897356053, |
|
"learning_rate": 3.2726924244649636e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9986669540405273, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 22.87769784172662, |
|
"grad_norm": 0.01836212562974761, |
|
"learning_rate": 3.241783883172895e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.9988262414932251, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 22.913669064748202, |
|
"grad_norm": 0.025618306976980545, |
|
"learning_rate": 3.210993731620867e-05, |
|
"loss": 0.0038, |
|
"mean_token_accuracy": 0.9986548483371734, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 22.949640287769785, |
|
"grad_norm": 0.02206393950967854, |
|
"learning_rate": 3.180322509187612e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9987526178359986, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 22.985611510791365, |
|
"grad_norm": 0.025248641779480812, |
|
"learning_rate": 3.149770753168468e-05, |
|
"loss": 0.004, |
|
"mean_token_accuracy": 0.9987527012825013, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_loss": 0.16851434111595154, |
|
"eval_mean_token_accuracy": 0.9858345746994018, |
|
"eval_runtime": 20.6794, |
|
"eval_samples_per_second": 5.9, |
|
"eval_steps_per_second": 0.774, |
|
"step": 3197 |
|
}, |
|
{ |
|
"epoch": 23.02158273381295, |
|
"grad_norm": 0.019134482337190666, |
|
"learning_rate": 3.119338998765984e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9988585710525513, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 23.057553956834532, |
|
"grad_norm": 0.017875463612263744, |
|
"learning_rate": 3.089027779080522e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.998948335647583, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 23.093525179856115, |
|
"grad_norm": 0.023438769430109335, |
|
"learning_rate": 3.0588376251009386e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9990340173244476, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 23.1294964028777, |
|
"grad_norm": 0.021359479774257485, |
|
"learning_rate": 3.0287690656952673e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9988463282585144, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 23.165467625899282, |
|
"grad_norm": 0.01767366924264051, |
|
"learning_rate": 2.9988226276014664e-05, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990708291530609, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 23.201438848920862, |
|
"grad_norm": 0.024005445801463642, |
|
"learning_rate": 2.968998835418174e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9989728808403016, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 23.237410071942445, |
|
"grad_norm": 0.023067638497095064, |
|
"learning_rate": 2.9392982115955414e-05, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.9990340113639832, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 23.27338129496403, |
|
"grad_norm": 0.02597606132182271, |
|
"learning_rate": 2.909721276426064e-05, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.9989606022834778, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 23.309352517985612, |
|
"grad_norm": 0.029591792929044805, |
|
"learning_rate": 2.880268548035473e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9990462243556977, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 23.345323741007196, |
|
"grad_norm": 0.027328046647646854, |
|
"learning_rate": 2.8509405423736603e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.9987404048442841, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 23.381294964028775, |
|
"grad_norm": 0.025252117211303413, |
|
"learning_rate": 2.8217377732056304e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9988750219345093, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 23.41726618705036, |
|
"grad_norm": 0.02406714011699659, |
|
"learning_rate": 2.792660752102514e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.998985105752945, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 23.453237410071942, |
|
"grad_norm": 0.022721023494655553, |
|
"learning_rate": 2.7637099884326e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989973485469819, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 23.489208633093526, |
|
"grad_norm": 0.02301823912808615, |
|
"learning_rate": 2.7348859893524105e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9988383531570435, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 23.52517985611511, |
|
"grad_norm": 0.02705894303804802, |
|
"learning_rate": 2.7061892597978177e-05, |
|
"loss": 0.0037, |
|
"mean_token_accuracy": 0.9987648069858551, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 23.56115107913669, |
|
"grad_norm": 0.01942725691392434, |
|
"learning_rate": 2.6776203024752055e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9988993644714356, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 23.597122302158272, |
|
"grad_norm": 0.027402506121057685, |
|
"learning_rate": 2.6491796178526453e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9988141000270844, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 23.633093525179856, |
|
"grad_norm": 0.025450594562071174, |
|
"learning_rate": 2.6208677041511488e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9988994300365448, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 23.66906474820144, |
|
"grad_norm": 0.023278496141033778, |
|
"learning_rate": 2.5926850573359317e-05, |
|
"loss": 0.0038, |
|
"mean_token_accuracy": 0.9987892925739288, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 23.705035971223023, |
|
"grad_norm": 0.01949407921179753, |
|
"learning_rate": 2.5646321711077227e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989118635654449, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 23.741007194244606, |
|
"grad_norm": 0.022066203733673222, |
|
"learning_rate": 2.536709536894123e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9988872468471527, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 23.776978417266186, |
|
"grad_norm": 0.021463047376446793, |
|
"learning_rate": 2.508917643840981e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9988751173019409, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 23.81294964028777, |
|
"grad_norm": 0.025166598560956936, |
|
"learning_rate": 2.4812569788038463e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9988750696182251, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 23.848920863309353, |
|
"grad_norm": 0.017097642676170137, |
|
"learning_rate": 2.4537280263394258e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.998960679769516, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 23.884892086330936, |
|
"grad_norm": 0.028106428762610353, |
|
"learning_rate": 2.4263312686970986e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.9988138198852539, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 23.92086330935252, |
|
"grad_norm": 0.04083138822929284, |
|
"learning_rate": 2.3990671858104662e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9989116787910461, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 23.9568345323741, |
|
"grad_norm": 0.022311007545414503, |
|
"learning_rate": 2.3719362552889536e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.9988382458686829, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 23.992805755395683, |
|
"grad_norm": 0.02668608739885611, |
|
"learning_rate": 2.3449389524094266e-05, |
|
"loss": 0.0039, |
|
"mean_token_accuracy": 0.9985814273357392, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 0.16874690353870392, |
|
"eval_mean_token_accuracy": 0.9842985835340288, |
|
"eval_runtime": 20.8056, |
|
"eval_samples_per_second": 5.864, |
|
"eval_steps_per_second": 0.769, |
|
"step": 3336 |
|
}, |
|
{ |
|
"epoch": 24.028776978417266, |
|
"grad_norm": 0.018047194943347125, |
|
"learning_rate": 2.3180757501078843e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9989758655428886, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 24.06474820143885, |
|
"grad_norm": 0.023899729196053907, |
|
"learning_rate": 2.291347118971162e-05, |
|
"loss": 0.0033, |
|
"mean_token_accuracy": 0.9988993644714356, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 24.100719424460433, |
|
"grad_norm": 0.018510819166514855, |
|
"learning_rate": 2.2647535272286912e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.99908287525177, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 24.136690647482013, |
|
"grad_norm": 0.02938331730686069, |
|
"learning_rate": 2.2382954407443003e-05, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.9989606201648712, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 24.172661870503596, |
|
"grad_norm": 0.020826402133998383, |
|
"learning_rate": 2.2119733230080408e-05, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9990829288959503, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 24.20863309352518, |
|
"grad_norm": 0.0290831582379025, |
|
"learning_rate": 2.185787635128086e-05, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.9988750517368317, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 24.244604316546763, |
|
"grad_norm": 0.027772364062720812, |
|
"learning_rate": 2.15973883582265e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989484786987305, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 24.280575539568346, |
|
"grad_norm": 0.025911646510706054, |
|
"learning_rate": 2.1338273814119325e-05, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.9990463495254517, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 24.31654676258993, |
|
"grad_norm": 0.031011036083864605, |
|
"learning_rate": 2.1080537258101517e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9988994300365448, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 24.35251798561151, |
|
"grad_norm": 0.03048137643406368, |
|
"learning_rate": 2.0824183205175706e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9988261342048645, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 24.388489208633093, |
|
"grad_norm": 0.02484851509688683, |
|
"learning_rate": 2.0569216146126014e-05, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.998923909664154, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 24.424460431654676, |
|
"grad_norm": 0.019607288520278966, |
|
"learning_rate": 2.031564054743943e-05, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9990830242633819, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 24.46043165467626, |
|
"grad_norm": 0.023840339055394528, |
|
"learning_rate": 2.0063460851227345e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9991563498973847, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 24.496402877697843, |
|
"grad_norm": 0.027657508791596168, |
|
"learning_rate": 1.9812681475147942e-05, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.998960655927658, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 24.532374100719423, |
|
"grad_norm": 0.025748643261759012, |
|
"learning_rate": 1.9563306812328763e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.9988259911537171, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 24.568345323741006, |
|
"grad_norm": 0.020585045016937135, |
|
"learning_rate": 1.931534123128965e-05, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.9988994836807251, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 24.60431654676259, |
|
"grad_norm": 0.024469587326783445, |
|
"learning_rate": 1.9068789075866355e-05, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9990095853805542, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 24.640287769784173, |
|
"grad_norm": 0.021728578764992675, |
|
"learning_rate": 1.882365466513437e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9988505184650421, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 24.676258992805757, |
|
"grad_norm": 0.019942244796565842, |
|
"learning_rate": 1.8579942293333286e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9990096926689148, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 24.71223021582734, |
|
"grad_norm": 0.02821479641357632, |
|
"learning_rate": 1.8337656229791577e-05, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9990096509456634, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 24.74820143884892, |
|
"grad_norm": 0.02455993351782378, |
|
"learning_rate": 1.8096800718851705e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989973723888397, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 24.784172661870503, |
|
"grad_norm": 0.028363038053404677, |
|
"learning_rate": 1.785737997979594e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9989820778369903, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 24.820143884892087, |
|
"grad_norm": 0.02773422789629047, |
|
"learning_rate": 1.761939820677241e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.9987894833087921, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 24.85611510791367, |
|
"grad_norm": 0.026309951879648234, |
|
"learning_rate": 1.7382859568721465e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.999070692062378, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 24.892086330935253, |
|
"grad_norm": 0.022282492985843325, |
|
"learning_rate": 1.714776820930283e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.9988258957862854, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 24.928057553956833, |
|
"grad_norm": 0.024990648055085517, |
|
"learning_rate": 1.691412824682297e-05, |
|
"loss": 0.0036, |
|
"mean_token_accuracy": 0.9988260388374328, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 24.964028776978417, |
|
"grad_norm": 0.029113720763185476, |
|
"learning_rate": 1.6681943774162823e-05, |
|
"loss": 0.0034, |
|
"mean_token_accuracy": 0.9988504767417907, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 0.029421600632342313, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.9988382577896118, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_loss": 0.17007607221603394, |
|
"eval_mean_token_accuracy": 0.9825018458068371, |
|
"eval_runtime": 20.6952, |
|
"eval_samples_per_second": 5.895, |
|
"eval_steps_per_second": 0.773, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 25.035971223021583, |
|
"grad_norm": 0.02049594957895056, |
|
"learning_rate": 1.622195754226906e-05, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9991562008857727, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 25.071942446043167, |
|
"grad_norm": 0.018619055614502295, |
|
"learning_rate": 1.5994163841027266e-05, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9991196513175964, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 25.107913669064747, |
|
"grad_norm": 0.022576514305950958, |
|
"learning_rate": 1.57678417454478e-05, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9988995909690856, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 25.14388489208633, |
|
"grad_norm": 0.02334590282868171, |
|
"learning_rate": 1.554299522021796e-05, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.9989116668701172, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 25.179856115107913, |
|
"grad_norm": 0.02392645596443554, |
|
"learning_rate": 1.5319628204176307e-05, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9991563737392426, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 25.215827338129497, |
|
"grad_norm": 0.02789413194367586, |
|
"learning_rate": 1.5097744610243403e-05, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.999180793762207, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 25.25179856115108, |
|
"grad_norm": 0.018264027514364508, |
|
"learning_rate": 1.4877348325353368e-05, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.9989115953445434, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 25.28776978417266, |
|
"grad_norm": 0.026930678697230023, |
|
"learning_rate": 1.4658443210385863e-05, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9990339398384094, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 25.323741007194243, |
|
"grad_norm": 0.023975938540105726, |
|
"learning_rate": 1.44410331000983e-05, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9990951836109161, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 25.359712230215827, |
|
"grad_norm": 0.025955619076464948, |
|
"learning_rate": 1.4225121803058794e-05, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9990216612815856, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 25.39568345323741, |
|
"grad_norm": 0.02577267491308173, |
|
"learning_rate": 1.4010713101579486e-05, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990583598613739, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 25.431654676258994, |
|
"grad_norm": 0.023780752055292367, |
|
"learning_rate": 1.3797810751650032e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9988504767417907, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 25.467625899280577, |
|
"grad_norm": 0.026803889925486716, |
|
"learning_rate": 1.35864184828721e-05, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.9988995373249054, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 25.503597122302157, |
|
"grad_norm": 0.023040368859518934, |
|
"learning_rate": 1.33765399983939e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9989809155464172, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 25.53956834532374, |
|
"grad_norm": 0.025198060678819276, |
|
"learning_rate": 1.3168178974845225e-05, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990095555782318, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 25.575539568345324, |
|
"grad_norm": 0.02916409633819172, |
|
"learning_rate": 1.2961339062273314e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9990462839603425, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 25.611510791366907, |
|
"grad_norm": 0.0278874264136872, |
|
"learning_rate": 1.275602388407856e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989850282669067, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 25.64748201438849, |
|
"grad_norm": 0.027646524866529763, |
|
"learning_rate": 1.255223703695132e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.999070692062378, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 25.68345323741007, |
|
"grad_norm": 0.025857527943575487, |
|
"learning_rate": 1.2349982090808821e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9990462481975555, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 25.719424460431654, |
|
"grad_norm": 0.02286550422259087, |
|
"learning_rate": 1.214926258873247e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9988381743431092, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 25.755395683453237, |
|
"grad_norm": 0.027097388349150865, |
|
"learning_rate": 1.1950082046906086e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989361703395844, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 25.79136690647482, |
|
"grad_norm": 0.02303610795273582, |
|
"learning_rate": 1.1752443954554082e-05, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9990462839603425, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 25.827338129496404, |
|
"grad_norm": 0.02608991544129121, |
|
"learning_rate": 1.1556351773880337e-05, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.99886274933815, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 25.863309352517987, |
|
"grad_norm": 0.023156647103821718, |
|
"learning_rate": 1.1361808940007668e-05, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9989973545074463, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 25.899280575539567, |
|
"grad_norm": 0.0243464595811653, |
|
"learning_rate": 1.1168818860917574e-05, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.9989239156246186, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 25.93525179856115, |
|
"grad_norm": 0.02981427171507079, |
|
"learning_rate": 1.0977384917390576e-05, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9990585505962372, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 25.971223021582734, |
|
"grad_norm": 0.032037378944090256, |
|
"learning_rate": 1.078751046294697e-05, |
|
"loss": 0.0035, |
|
"mean_token_accuracy": 0.9986914873123169, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_loss": 0.17206676304340363, |
|
"eval_mean_token_accuracy": 0.9879742885629336, |
|
"eval_runtime": 20.7606, |
|
"eval_samples_per_second": 5.877, |
|
"eval_steps_per_second": 0.771, |
|
"step": 3614 |
|
}, |
|
{ |
|
"epoch": 26.007194244604317, |
|
"grad_norm": 0.0213461708648987, |
|
"learning_rate": 1.0599198823788025e-05, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9992053210735321, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 26.0431654676259, |
|
"grad_norm": 0.023428034716649632, |
|
"learning_rate": 1.0412453298737823e-05, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9991684496402741, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 26.07913669064748, |
|
"grad_norm": 0.023348574386097033, |
|
"learning_rate": 1.0227277159185422e-05, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9990829169750214, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 26.115107913669064, |
|
"grad_norm": 0.021739767766830953, |
|
"learning_rate": 1.0043673649027518e-05, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.999131840467453, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 26.151079136690647, |
|
"grad_norm": 0.019442840163705226, |
|
"learning_rate": 9.861645984611678e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9991685032844544, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 26.18705035971223, |
|
"grad_norm": 0.019504618622678677, |
|
"learning_rate": 9.681197354679949e-06, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9990584969520568, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 26.223021582733814, |
|
"grad_norm": 0.021803811310150526, |
|
"learning_rate": 9.502330920312974e-06, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989483237266541, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 26.258992805755394, |
|
"grad_norm": 0.0290446844890043, |
|
"learning_rate": 9.325049814874732e-06, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9990217745304107, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 26.294964028776977, |
|
"grad_norm": 0.02127288345338018, |
|
"learning_rate": 9.149357143957471e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9991196155548095, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 26.33093525179856, |
|
"grad_norm": 0.026538142153862753, |
|
"learning_rate": 8.975255985327524e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.999070692062378, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 26.366906474820144, |
|
"grad_norm": 0.023809198775522854, |
|
"learning_rate": 8.802749388871224e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990461349487305, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 26.402877697841728, |
|
"grad_norm": 0.03172209954402754, |
|
"learning_rate": 8.631840376541457e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990951240062713, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 26.43884892086331, |
|
"grad_norm": 0.025297799542257252, |
|
"learning_rate": 8.462531942304896e-06, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.999021691083908, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 26.47482014388489, |
|
"grad_norm": 0.026186042332624757, |
|
"learning_rate": 8.294827052089393e-06, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9989819586277008, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 26.510791366906474, |
|
"grad_norm": 0.02062785999993999, |
|
"learning_rate": 8.128728643732108e-06, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989850223064423, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 26.546762589928058, |
|
"grad_norm": 0.025889886568629488, |
|
"learning_rate": 7.964239626927994e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.999095219373703, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 26.58273381294964, |
|
"grad_norm": 0.022489670251659637, |
|
"learning_rate": 7.801362883178876e-06, |
|
"loss": 0.0024, |
|
"mean_token_accuracy": 0.9991442322731018, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 26.618705035971225, |
|
"grad_norm": 0.024715714144679234, |
|
"learning_rate": 7.640101265742883e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9989606618881226, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 26.654676258992804, |
|
"grad_norm": 0.026901869610419675, |
|
"learning_rate": 7.480457599584601e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9989850759506226, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 26.690647482014388, |
|
"grad_norm": 0.025055977716386093, |
|
"learning_rate": 7.3224346813254626e-06, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9990584552288055, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 26.72661870503597, |
|
"grad_norm": 0.03167579002764372, |
|
"learning_rate": 7.166035279194816e-06, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9991685152053833, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 26.762589928057555, |
|
"grad_norm": 0.02242351163674594, |
|
"learning_rate": 7.011262132981456e-06, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989973664283752, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 26.798561151079138, |
|
"grad_norm": 0.02247426874697418, |
|
"learning_rate": 6.85811795398551e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9991073191165925, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 26.834532374100718, |
|
"grad_norm": 0.03351273807743497, |
|
"learning_rate": 6.706605424971091e-06, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9990706741809845, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 26.8705035971223, |
|
"grad_norm": 0.028717775489285478, |
|
"learning_rate": 6.556727200119217e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990462124347687, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 26.906474820143885, |
|
"grad_norm": 0.030401223292522324, |
|
"learning_rate": 6.408485904981332e-06, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9988750994205475, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 26.942446043165468, |
|
"grad_norm": 0.029390890433714404, |
|
"learning_rate": 6.261884136433327e-06, |
|
"loss": 0.0032, |
|
"mean_token_accuracy": 0.9988627254962921, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 26.97841726618705, |
|
"grad_norm": 0.027232194452777886, |
|
"learning_rate": 6.116924462629992e-06, |
|
"loss": 0.0031, |
|
"mean_token_accuracy": 0.998874968290329, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_loss": 0.17284731566905975, |
|
"eval_mean_token_accuracy": 0.9869382300160148, |
|
"eval_runtime": 20.6664, |
|
"eval_samples_per_second": 5.903, |
|
"eval_steps_per_second": 0.774, |
|
"step": 3753 |
|
}, |
|
{ |
|
"epoch": 27.014388489208635, |
|
"grad_norm": 0.016100025377121675, |
|
"learning_rate": 5.973609422960103e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9992051720619202, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 27.050359712230215, |
|
"grad_norm": 0.023343201820734655, |
|
"learning_rate": 5.831941528001894e-06, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9990951895713807, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 27.086330935251798, |
|
"grad_norm": 0.0248367264595005, |
|
"learning_rate": 5.691923259479093e-06, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9990583717823028, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 27.12230215827338, |
|
"grad_norm": 0.024747413248307617, |
|
"learning_rate": 5.55355707021743e-06, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989116430282593, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 27.158273381294965, |
|
"grad_norm": 0.02630731839366515, |
|
"learning_rate": 5.416845384101699e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9991073429584503, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 27.194244604316548, |
|
"grad_norm": 0.025754165173891827, |
|
"learning_rate": 5.281790596033232e-06, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9992175042629242, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 27.230215827338128, |
|
"grad_norm": 0.026574099762379837, |
|
"learning_rate": 5.1483950718880456e-06, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.999156379699707, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 27.26618705035971, |
|
"grad_norm": 0.02248640885779222, |
|
"learning_rate": 5.016661148475299e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9989850997924805, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 27.302158273381295, |
|
"grad_norm": 0.022399236095728896, |
|
"learning_rate": 4.8865911334964094e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990584075450897, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 27.33812949640288, |
|
"grad_norm": 0.027779653089896033, |
|
"learning_rate": 4.758187305504658e-06, |
|
"loss": 0.0024, |
|
"mean_token_accuracy": 0.9991685688495636, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 27.37410071942446, |
|
"grad_norm": 0.02964208013830329, |
|
"learning_rate": 4.6314519138651594e-06, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9990707218647004, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 27.41007194244604, |
|
"grad_norm": 0.02712978542770664, |
|
"learning_rate": 4.506387178715565e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9989605605602264, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 27.446043165467625, |
|
"grad_norm": 0.03281882059446763, |
|
"learning_rate": 4.382995290927161e-06, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9991195619106292, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 27.48201438848921, |
|
"grad_norm": 0.03285754084626906, |
|
"learning_rate": 4.261278412066427e-06, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989972472190857, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 27.51798561151079, |
|
"grad_norm": 0.02770507620979567, |
|
"learning_rate": 4.141238674357217e-06, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9991528451442718, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 27.553956834532375, |
|
"grad_norm": 0.027323468101626835, |
|
"learning_rate": 4.022878180643441e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9990583479404449, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 27.58992805755396, |
|
"grad_norm": 0.026165409160412198, |
|
"learning_rate": 3.906199004352085e-06, |
|
"loss": 0.0023, |
|
"mean_token_accuracy": 0.9992541253566742, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 27.62589928057554, |
|
"grad_norm": 0.02995860073308557, |
|
"learning_rate": 3.791203189457093e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.99905846118927, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 27.66187050359712, |
|
"grad_norm": 0.025049536857449636, |
|
"learning_rate": 3.67789275044339e-06, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9990216970443726, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 27.697841726618705, |
|
"grad_norm": 0.02435227347466978, |
|
"learning_rate": 3.5662696722716936e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990707516670227, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 27.73381294964029, |
|
"grad_norm": 0.026934231649150823, |
|
"learning_rate": 3.4563359103436886e-06, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989239752292634, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 27.769784172661872, |
|
"grad_norm": 0.026039766458009594, |
|
"learning_rate": 3.348093390467788e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9991317570209504, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 27.805755395683452, |
|
"grad_norm": 0.028279821541101296, |
|
"learning_rate": 3.2415440088254033e-06, |
|
"loss": 0.0022, |
|
"mean_token_accuracy": 0.9991562783718109, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 27.841726618705035, |
|
"grad_norm": 0.02903501608211124, |
|
"learning_rate": 3.1366896319377283e-06, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9990950644016265, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 27.87769784172662, |
|
"grad_norm": 0.029246402437814508, |
|
"learning_rate": 3.0335320966330405e-06, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9991073131561279, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 27.913669064748202, |
|
"grad_norm": 0.024885738033322623, |
|
"learning_rate": 2.932073210014519e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9991195976734162, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 27.949640287769785, |
|
"grad_norm": 0.02744346407474506, |
|
"learning_rate": 2.832314749428555e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990218222141266, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 27.985611510791365, |
|
"grad_norm": 0.036648097310783105, |
|
"learning_rate": 2.734258462433692e-06, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9989727795124054, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_loss": 0.1739717721939087, |
|
"eval_mean_token_accuracy": 0.9858080625534058, |
|
"eval_runtime": 20.6535, |
|
"eval_samples_per_second": 5.907, |
|
"eval_steps_per_second": 0.775, |
|
"step": 3892 |
|
}, |
|
{ |
|
"epoch": 28.02158273381295, |
|
"grad_norm": 0.022935392530238134, |
|
"learning_rate": 2.6379060667699686e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9989197750886282, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 28.057553956834532, |
|
"grad_norm": 0.028305752750639582, |
|
"learning_rate": 2.5432592503288e-06, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9991684257984161, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 28.093525179856115, |
|
"grad_norm": 0.02473903295003507, |
|
"learning_rate": 2.4503196711234576e-06, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9991562008857727, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 28.1294964028777, |
|
"grad_norm": 0.020068405620154862, |
|
"learning_rate": 2.3590889572600138e-06, |
|
"loss": 0.0023, |
|
"mean_token_accuracy": 0.9991807460784912, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 28.165467625899282, |
|
"grad_norm": 0.023023534500934043, |
|
"learning_rate": 2.2695687069087868e-06, |
|
"loss": 0.0023, |
|
"mean_token_accuracy": 0.9991318583488464, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 28.201438848920862, |
|
"grad_norm": 0.02063534437749836, |
|
"learning_rate": 2.1817604882763854e-06, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9990828394889831, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 28.237410071942445, |
|
"grad_norm": 0.021379802805438913, |
|
"learning_rate": 2.0956658395782202e-06, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9990951299667359, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 28.27338129496403, |
|
"grad_norm": 0.021192132406956426, |
|
"learning_rate": 2.01128626901157e-06, |
|
"loss": 0.0024, |
|
"mean_token_accuracy": 0.9991563200950623, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 28.309352517985612, |
|
"grad_norm": 0.02399906239916882, |
|
"learning_rate": 1.928623254729134e-06, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9991318345069885, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 28.345323741007196, |
|
"grad_norm": 0.025361576061434472, |
|
"learning_rate": 1.8476782448131446e-06, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9992174208164215, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 28.381294964028775, |
|
"grad_norm": 0.026511459482348218, |
|
"learning_rate": 1.7684526572500416e-06, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9989605724811554, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 28.41726618705036, |
|
"grad_norm": 0.028448573855638794, |
|
"learning_rate": 1.6909478799055578e-06, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9992173910140991, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 28.453237410071942, |
|
"grad_norm": 0.021467267529428444, |
|
"learning_rate": 1.615165270500485e-06, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9991808295249939, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 28.489208633093526, |
|
"grad_norm": 0.023006055777401, |
|
"learning_rate": 1.5411061565868467e-06, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.999119633436203, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 28.52517985611511, |
|
"grad_norm": 0.02585586195710277, |
|
"learning_rate": 1.4687718355246294e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990950226783752, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 28.56115107913669, |
|
"grad_norm": 0.026300449454115953, |
|
"learning_rate": 1.3981635744590883e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990339398384094, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 28.597122302158272, |
|
"grad_norm": 0.021202713897712756, |
|
"learning_rate": 1.3292826102985212e-06, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9990707278251648, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 28.633093525179856, |
|
"grad_norm": 0.033199600046098315, |
|
"learning_rate": 1.2621301496926419e-06, |
|
"loss": 0.0024, |
|
"mean_token_accuracy": 0.9991685271263122, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 28.66906474820144, |
|
"grad_norm": 0.029822323609474746, |
|
"learning_rate": 1.196707369011396e-06, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990427136421204, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 28.705035971223023, |
|
"grad_norm": 0.021907570953874314, |
|
"learning_rate": 1.1330154143243787e-06, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9991563141345978, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 28.741007194244606, |
|
"grad_norm": 0.031723215800157904, |
|
"learning_rate": 1.0710554013807495e-06, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9990951001644135, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 28.776978417266186, |
|
"grad_norm": 0.02188892324131467, |
|
"learning_rate": 1.0108284155896819e-06, |
|
"loss": 0.0024, |
|
"mean_token_accuracy": 0.9993274867534637, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 28.81294964028777, |
|
"grad_norm": 0.027770981302162885, |
|
"learning_rate": 9.523355120013677e-07, |
|
"loss": 0.0022, |
|
"mean_token_accuracy": 0.9991930305957795, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 28.848920863309353, |
|
"grad_norm": 0.03088762830415268, |
|
"learning_rate": 8.955777152885314e-07, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9990583479404449, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 28.884892086330936, |
|
"grad_norm": 0.02070344017616713, |
|
"learning_rate": 8.405560197284557e-07, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990951836109161, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 28.92086330935252, |
|
"grad_norm": 0.025134927148283456, |
|
"learning_rate": 7.872713891855843e-07, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9990340828895569, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 28.9568345323741, |
|
"grad_norm": 0.03278076527634437, |
|
"learning_rate": 7.357247570946357e-07, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.9990216612815856, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 28.992805755395683, |
|
"grad_norm": 0.02754408645957575, |
|
"learning_rate": 6.859170264442605e-07, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9989728093147278, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_loss": 0.1744653284549713, |
|
"eval_mean_token_accuracy": 0.9842423597971598, |
|
"eval_runtime": 20.7969, |
|
"eval_samples_per_second": 5.866, |
|
"eval_steps_per_second": 0.769, |
|
"step": 4031 |
|
}, |
|
{ |
|
"epoch": 29.028776978417266, |
|
"grad_norm": 0.02324930062427535, |
|
"learning_rate": 6.378490697611761e-07, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9992052540183067, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 29.06474820143885, |
|
"grad_norm": 0.02432217426882393, |
|
"learning_rate": 5.915217290949571e-07, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9990829706192017, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 29.100719424460433, |
|
"grad_norm": 0.028049045181475652, |
|
"learning_rate": 5.469358160032356e-07, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9991195380687714, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 29.136690647482013, |
|
"grad_norm": 0.021910112528279284, |
|
"learning_rate": 5.040921115374686e-07, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9991283357143402, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 29.172661870503596, |
|
"grad_norm": 0.023718091323160405, |
|
"learning_rate": 4.6299136622929285e-07, |
|
"loss": 0.0024, |
|
"mean_token_accuracy": 0.9992173552513123, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 29.20863309352518, |
|
"grad_norm": 0.02590581189187291, |
|
"learning_rate": 4.2363430007740237e-07, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9991929352283477, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 29.244604316546763, |
|
"grad_norm": 0.030650626796343103, |
|
"learning_rate": 3.860216025348251e-07, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9990461230278015, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 29.280575539568346, |
|
"grad_norm": 0.030720762355050432, |
|
"learning_rate": 3.5015393249698824e-07, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9990951597690583, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 29.31654676258993, |
|
"grad_norm": 0.02506864688330541, |
|
"learning_rate": 3.160319182900495e-07, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9991685032844544, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 29.35251798561151, |
|
"grad_norm": 0.02852340035502043, |
|
"learning_rate": 2.836561576599839e-07, |
|
"loss": 0.0028, |
|
"mean_token_accuracy": 0.999033921957016, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 29.388489208633093, |
|
"grad_norm": 0.029973206482145683, |
|
"learning_rate": 2.530272177620585e-07, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9990583717823028, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 29.424460431654676, |
|
"grad_norm": 0.022189267720462317, |
|
"learning_rate": 2.241456351509186e-07, |
|
"loss": 0.0023, |
|
"mean_token_accuracy": 0.9991807758808136, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 29.46043165467626, |
|
"grad_norm": 0.01944218340517776, |
|
"learning_rate": 1.9701191577117252e-07, |
|
"loss": 0.0024, |
|
"mean_token_accuracy": 0.9990707099437713, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 29.496402877697843, |
|
"grad_norm": 0.01990659572163372, |
|
"learning_rate": 1.7162653494855462e-07, |
|
"loss": 0.0026, |
|
"mean_token_accuracy": 0.9991929590702057, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 29.532374100719423, |
|
"grad_norm": 0.023361693235018466, |
|
"learning_rate": 1.4798993738156518e-07, |
|
"loss": 0.0023, |
|
"mean_token_accuracy": 0.9991930842399597, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 29.568345323741006, |
|
"grad_norm": 0.03204420748570329, |
|
"learning_rate": 1.26102537133721e-07, |
|
"loss": 0.0024, |
|
"mean_token_accuracy": 0.9991929471492768, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 29.60431654676259, |
|
"grad_norm": 0.020309140360128874, |
|
"learning_rate": 1.0596471762626126e-07, |
|
"loss": 0.0029, |
|
"mean_token_accuracy": 0.9989850461483002, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 29.640287769784173, |
|
"grad_norm": 0.02215835825264975, |
|
"learning_rate": 8.757683163144182e-08, |
|
"loss": 0.0024, |
|
"mean_token_accuracy": 0.99908287525177, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 29.676258992805757, |
|
"grad_norm": 0.027609261852714354, |
|
"learning_rate": 7.093920126638454e-08, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9991684675216674, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 29.71223021582734, |
|
"grad_norm": 0.026309926413359074, |
|
"learning_rate": 5.605211798738186e-08, |
|
"loss": 0.0024, |
|
"mean_token_accuracy": 0.9993030488491058, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 29.74820143884892, |
|
"grad_norm": 0.02776420304640643, |
|
"learning_rate": 4.291584258486747e-08, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9992418825626374, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 29.784172661870503, |
|
"grad_norm": 0.03297468936183595, |
|
"learning_rate": 3.153060517874229e-08, |
|
"loss": 0.003, |
|
"mean_token_accuracy": 0.9990706086158753, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 29.820143884892087, |
|
"grad_norm": 0.021453260733662226, |
|
"learning_rate": 2.1896605214455356e-08, |
|
"loss": 0.0023, |
|
"mean_token_accuracy": 0.9992419958114624, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 29.85611510791367, |
|
"grad_norm": 0.02545565832007103, |
|
"learning_rate": 1.4014011459428933e-08, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9990583419799804, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 29.892086330935253, |
|
"grad_norm": 0.02683900491178024, |
|
"learning_rate": 7.882962000138605e-09, |
|
"loss": 0.0024, |
|
"mean_token_accuracy": 0.9991440534591675, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 29.928057553956833, |
|
"grad_norm": 0.025734296592890643, |
|
"learning_rate": 3.503564239670798e-09, |
|
"loss": 0.0027, |
|
"mean_token_accuracy": 0.9990827918052674, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 29.964028776978417, |
|
"grad_norm": 0.025706770914082706, |
|
"learning_rate": 8.75894895879803e-10, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9992418885231018, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 0.020417514095723455, |
|
"learning_rate": 0.0, |
|
"loss": 0.0025, |
|
"mean_token_accuracy": 0.9991196155548095, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_loss": 0.17444376647472382, |
|
"eval_mean_token_accuracy": 0.9824385866522789, |
|
"eval_runtime": 20.2577, |
|
"eval_samples_per_second": 6.022, |
|
"eval_steps_per_second": 0.79, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"step": 4170, |
|
"total_flos": 1.3300619754508124e+18, |
|
"train_loss": 0.05172654809675914, |
|
"train_runtime": 17966.4012, |
|
"train_samples_per_second": 1.855, |
|
"train_steps_per_second": 0.232 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 4170, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3300619754508124e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|