{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 2361,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0021177467174925877,
      "grad_norm": 83.68173840087012,
      "learning_rate": 4.219409282700422e-07,
      "loss": 2.1661,
      "mean_token_accuracy": 0.5976044356822967,
      "step": 5
    },
    {
      "epoch": 0.004235493434985175,
      "grad_norm": 71.11062161153062,
      "learning_rate": 8.438818565400844e-07,
      "loss": 2.1622,
      "mean_token_accuracy": 0.5995522707700729,
      "step": 10
    },
    {
      "epoch": 0.0063532401524777635,
      "grad_norm": 34.572128830715705,
      "learning_rate": 1.2658227848101267e-06,
      "loss": 2.0632,
      "mean_token_accuracy": 0.5944636166095734,
      "step": 15
    },
    {
      "epoch": 0.00847098686997035,
      "grad_norm": 17.011229671312382,
      "learning_rate": 1.6877637130801689e-06,
      "loss": 1.7723,
      "mean_token_accuracy": 0.6232941240072251,
      "step": 20
    },
    {
      "epoch": 0.010588733587462939,
      "grad_norm": 9.74778063232665,
      "learning_rate": 2.1097046413502114e-06,
      "loss": 1.5731,
      "mean_token_accuracy": 0.63005710542202,
      "step": 25
    },
    {
      "epoch": 0.012706480304955527,
      "grad_norm": 7.914154804733739,
      "learning_rate": 2.5316455696202535e-06,
      "loss": 1.3488,
      "mean_token_accuracy": 0.6629315137863159,
      "step": 30
    },
    {
      "epoch": 0.014824227022448115,
      "grad_norm": 7.497303646153751,
      "learning_rate": 2.9535864978902956e-06,
      "loss": 1.1306,
      "mean_token_accuracy": 0.6912301659584046,
      "step": 35
    },
    {
      "epoch": 0.0169419737399407,
      "grad_norm": 3.0571735289067807,
      "learning_rate": 3.3755274261603377e-06,
      "loss": 0.9338,
      "mean_token_accuracy": 0.7339568644762039,
      "step": 40
    },
    {
      "epoch": 0.01905972045743329,
      "grad_norm": 3.7299236330227674,
      "learning_rate": 3.7974683544303802e-06,
      "loss": 0.8366,
      "mean_token_accuracy": 0.7472610950469971,
      "step": 45
    },
    {
      "epoch": 0.021177467174925878,
      "grad_norm": 1.94550213470537,
      "learning_rate": 4.219409282700423e-06,
      "loss": 0.813,
      "mean_token_accuracy": 0.7466759532690048,
      "step": 50
    },
    {
      "epoch": 0.023295213892418468,
      "grad_norm": 1.9631003842444021,
      "learning_rate": 4.641350210970465e-06,
      "loss": 0.7409,
      "mean_token_accuracy": 0.7591830432415009,
      "step": 55
    },
    {
      "epoch": 0.025412960609911054,
      "grad_norm": 2.126411421817186,
      "learning_rate": 5.063291139240507e-06,
      "loss": 0.6999,
      "mean_token_accuracy": 0.7653463244438171,
      "step": 60
    },
    {
      "epoch": 0.027530707327403644,
      "grad_norm": 2.431633067673993,
      "learning_rate": 5.485232067510548e-06,
      "loss": 0.6803,
      "mean_token_accuracy": 0.7705583959817887,
      "step": 65
    },
    {
      "epoch": 0.02964845404489623,
      "grad_norm": 2.1470463103072706,
      "learning_rate": 5.907172995780591e-06,
      "loss": 0.6716,
      "mean_token_accuracy": 0.7747641324996948,
      "step": 70
    },
    {
      "epoch": 0.03176620076238882,
      "grad_norm": 1.8134772555606546,
      "learning_rate": 6.329113924050634e-06,
      "loss": 0.6442,
      "mean_token_accuracy": 0.7782594561576843,
      "step": 75
    },
    {
      "epoch": 0.0338839474798814,
      "grad_norm": 1.828059395701887,
      "learning_rate": 6.751054852320675e-06,
      "loss": 0.654,
      "mean_token_accuracy": 0.7730626821517944,
      "step": 80
    },
    {
      "epoch": 0.03600169419737399,
      "grad_norm": 2.144653892020441,
      "learning_rate": 7.172995780590718e-06,
      "loss": 0.5866,
      "mean_token_accuracy": 0.7909977465867997,
      "step": 85
    },
    {
      "epoch": 0.03811944091486658,
      "grad_norm": 1.8471427217270429,
      "learning_rate": 7.5949367088607605e-06,
      "loss": 0.6343,
      "mean_token_accuracy": 0.7756380766630173,
      "step": 90
    },
    {
      "epoch": 0.04023718763235917,
      "grad_norm": 2.3290876362673916,
      "learning_rate": 8.016877637130802e-06,
      "loss": 0.6006,
      "mean_token_accuracy": 0.7818532437086105,
      "step": 95
    },
    {
      "epoch": 0.042354934349851756,
      "grad_norm": 2.038115409634727,
      "learning_rate": 8.438818565400846e-06,
      "loss": 0.6123,
      "mean_token_accuracy": 0.7774519264698029,
      "step": 100
    },
    {
      "epoch": 0.044472681067344345,
      "grad_norm": 1.8512017533544443,
      "learning_rate": 8.860759493670886e-06,
      "loss": 0.5782,
      "mean_token_accuracy": 0.787897452712059,
      "step": 105
    },
    {
      "epoch": 0.046590427784836935,
      "grad_norm": 1.7617335354592987,
      "learning_rate": 9.28270042194093e-06,
      "loss": 0.5646,
      "mean_token_accuracy": 0.7916492879390716,
      "step": 110
    },
    {
      "epoch": 0.04870817450232952,
      "grad_norm": 1.7594677346726018,
      "learning_rate": 9.704641350210972e-06,
      "loss": 0.5741,
      "mean_token_accuracy": 0.7923983573913574,
      "step": 115
    },
    {
      "epoch": 0.05082592121982211,
      "grad_norm": 2.074930275455498,
      "learning_rate": 1.0126582278481014e-05,
      "loss": 0.5872,
      "mean_token_accuracy": 0.7833609640598297,
      "step": 120
    },
    {
      "epoch": 0.0529436679373147,
      "grad_norm": 1.9672826514728756,
      "learning_rate": 1.0548523206751056e-05,
      "loss": 0.5525,
      "mean_token_accuracy": 0.7987953841686248,
      "step": 125
    },
    {
      "epoch": 0.05506141465480729,
      "grad_norm": 1.8715882057544604,
      "learning_rate": 1.0970464135021096e-05,
      "loss": 0.5825,
      "mean_token_accuracy": 0.788796243071556,
      "step": 130
    },
    {
      "epoch": 0.05717916137229987,
      "grad_norm": 2.055120934997949,
      "learning_rate": 1.139240506329114e-05,
      "loss": 0.5668,
      "mean_token_accuracy": 0.794330358505249,
      "step": 135
    },
    {
      "epoch": 0.05929690808979246,
      "grad_norm": 2.1751457050987213,
      "learning_rate": 1.1814345991561182e-05,
      "loss": 0.5504,
      "mean_token_accuracy": 0.7986414194107055,
      "step": 140
    },
    {
      "epoch": 0.06141465480728505,
      "grad_norm": 2.6530737679742735,
      "learning_rate": 1.2236286919831224e-05,
      "loss": 0.5825,
      "mean_token_accuracy": 0.787852120399475,
      "step": 145
    },
    {
      "epoch": 0.06353240152477764,
      "grad_norm": 1.7772714746257652,
      "learning_rate": 1.2658227848101268e-05,
      "loss": 0.5321,
      "mean_token_accuracy": 0.7960227221250534,
      "step": 150
    },
    {
      "epoch": 0.06565014824227022,
      "grad_norm": 2.007446829510951,
      "learning_rate": 1.3080168776371309e-05,
      "loss": 0.5652,
      "mean_token_accuracy": 0.7959463059902191,
      "step": 155
    },
    {
      "epoch": 0.0677678949597628,
      "grad_norm": 1.5240825493431345,
      "learning_rate": 1.350210970464135e-05,
      "loss": 0.5471,
      "mean_token_accuracy": 0.7939142495393753,
      "step": 160
    },
    {
      "epoch": 0.0698856416772554,
      "grad_norm": 2.1050082197685573,
      "learning_rate": 1.3924050632911395e-05,
      "loss": 0.5283,
      "mean_token_accuracy": 0.8012055605649948,
      "step": 165
    },
    {
      "epoch": 0.07200338839474799,
      "grad_norm": 1.8465069166680506,
      "learning_rate": 1.4345991561181437e-05,
      "loss": 0.5525,
      "mean_token_accuracy": 0.8044249504804611,
      "step": 170
    },
    {
      "epoch": 0.07412113511224058,
      "grad_norm": 1.6175686507972196,
      "learning_rate": 1.4767932489451477e-05,
      "loss": 0.5561,
      "mean_token_accuracy": 0.8003985285758972,
      "step": 175
    },
    {
      "epoch": 0.07623888182973317,
      "grad_norm": 1.987593615824268,
      "learning_rate": 1.5189873417721521e-05,
      "loss": 0.5637,
      "mean_token_accuracy": 0.7922172099351883,
      "step": 180
    },
    {
      "epoch": 0.07835662854722575,
      "grad_norm": 1.6436094243541917,
      "learning_rate": 1.5611814345991563e-05,
      "loss": 0.5662,
      "mean_token_accuracy": 0.7916066467761993,
      "step": 185
    },
    {
      "epoch": 0.08047437526471835,
      "grad_norm": 1.6236359809975813,
      "learning_rate": 1.6033755274261603e-05,
      "loss": 0.5432,
      "mean_token_accuracy": 0.8009026974439621,
      "step": 190
    },
    {
      "epoch": 0.08259212198221093,
      "grad_norm": 1.5274031981451834,
      "learning_rate": 1.6455696202531647e-05,
      "loss": 0.5484,
      "mean_token_accuracy": 0.7982567459344864,
      "step": 195
    },
    {
      "epoch": 0.08470986869970351,
      "grad_norm": 2.03018219248001,
      "learning_rate": 1.687763713080169e-05,
      "loss": 0.5407,
      "mean_token_accuracy": 0.8011936664581298,
      "step": 200
    },
    {
      "epoch": 0.08682761541719611,
      "grad_norm": 1.8014465384883518,
      "learning_rate": 1.729957805907173e-05,
      "loss": 0.5247,
      "mean_token_accuracy": 0.8084624141454697,
      "step": 205
    },
    {
      "epoch": 0.08894536213468869,
      "grad_norm": 1.7271306151793162,
      "learning_rate": 1.7721518987341772e-05,
      "loss": 0.5708,
      "mean_token_accuracy": 0.7947988003492356,
      "step": 210
    },
    {
      "epoch": 0.09106310885218127,
      "grad_norm": 1.9191596287336676,
      "learning_rate": 1.8143459915611816e-05,
      "loss": 0.5363,
      "mean_token_accuracy": 0.8047021597623825,
      "step": 215
    },
    {
      "epoch": 0.09318085556967387,
      "grad_norm": 1.7320574921982606,
      "learning_rate": 1.856540084388186e-05,
      "loss": 0.5658,
      "mean_token_accuracy": 0.7928981482982635,
      "step": 220
    },
    {
      "epoch": 0.09529860228716645,
      "grad_norm": 1.8013279459762328,
      "learning_rate": 1.89873417721519e-05,
      "loss": 0.5283,
      "mean_token_accuracy": 0.8064373075962067,
      "step": 225
    },
    {
      "epoch": 0.09741634900465904,
      "grad_norm": 1.6963994963949376,
      "learning_rate": 1.9409282700421944e-05,
      "loss": 0.541,
      "mean_token_accuracy": 0.798399469256401,
      "step": 230
    },
    {
      "epoch": 0.09953409572215163,
      "grad_norm": 1.8211709634602606,
      "learning_rate": 1.9831223628691984e-05,
      "loss": 0.5529,
      "mean_token_accuracy": 0.7973109990358352,
      "step": 235
    },
    {
      "epoch": 0.10165184243964422,
      "grad_norm": 1.800221940229098,
      "learning_rate": 1.9999901552991966e-05,
      "loss": 0.5297,
      "mean_token_accuracy": 0.8061769932508469,
      "step": 240
    },
    {
      "epoch": 0.10376958915713681,
      "grad_norm": 3.118430462846644,
      "learning_rate": 1.9999299939406875e-05,
      "loss": 0.567,
      "mean_token_accuracy": 0.7869770050048828,
      "step": 245
    },
    {
      "epoch": 0.1058873358746294,
      "grad_norm": 6.373265842936888,
      "learning_rate": 1.9998151437882874e-05,
      "loss": 0.5194,
      "mean_token_accuracy": 0.8079254150390625,
      "step": 250
    },
    {
      "epoch": 0.10800508259212198,
      "grad_norm": 1.7643542086224073,
      "learning_rate": 1.999645611123453e-05,
      "loss": 0.5476,
      "mean_token_accuracy": 0.8036489874124527,
      "step": 255
    },
    {
      "epoch": 0.11012282930961458,
      "grad_norm": 1.7570808876197173,
      "learning_rate": 1.999421405218369e-05,
      "loss": 0.5183,
      "mean_token_accuracy": 0.8039919883012772,
      "step": 260
    },
    {
      "epoch": 0.11224057602710716,
      "grad_norm": 1.4650600928842654,
      "learning_rate": 1.9991425383354462e-05,
      "loss": 0.5575,
      "mean_token_accuracy": 0.7989150047302246,
      "step": 265
    },
    {
      "epoch": 0.11435832274459974,
      "grad_norm": 1.5715508311626518,
      "learning_rate": 1.9988090257266442e-05,
      "loss": 0.5276,
      "mean_token_accuracy": 0.8024184852838516,
      "step": 270
    },
    {
      "epoch": 0.11647606946209234,
      "grad_norm": 1.5012575844730074,
      "learning_rate": 1.9984208856326433e-05,
      "loss": 0.511,
      "mean_token_accuracy": 0.810269170999527,
      "step": 275
    },
    {
      "epoch": 0.11859381617958492,
      "grad_norm": 2.1674114266205553,
      "learning_rate": 1.9979781392818424e-05,
      "loss": 0.5069,
      "mean_token_accuracy": 0.8049084335565567,
      "step": 280
    },
    {
      "epoch": 0.1207115628970775,
      "grad_norm": 1.597566985653751,
      "learning_rate": 1.9974808108892017e-05,
      "loss": 0.5097,
      "mean_token_accuracy": 0.810433080792427,
      "step": 285
    },
    {
      "epoch": 0.1228293096145701,
      "grad_norm": 2.721798223377223,
      "learning_rate": 1.9969289276549144e-05,
      "loss": 0.526,
      "mean_token_accuracy": 0.8058519691228867,
      "step": 290
    },
    {
      "epoch": 0.12494705633206268,
      "grad_norm": 1.526771766492988,
      "learning_rate": 1.9963225197629223e-05,
      "loss": 0.5172,
      "mean_token_accuracy": 0.8079220175743103,
      "step": 295
    },
    {
      "epoch": 0.12706480304955528,
      "grad_norm": 1.3424112355487237,
      "learning_rate": 1.9956616203792636e-05,
      "loss": 0.5135,
      "mean_token_accuracy": 0.806724363565445,
      "step": 300
    },
    {
      "epoch": 0.12918254976704785,
      "grad_norm": 1.5824773036593809,
      "learning_rate": 1.9949462656502588e-05,
      "loss": 0.5383,
      "mean_token_accuracy": 0.8001780599355698,
      "step": 305
    },
    {
      "epoch": 0.13130029648454045,
      "grad_norm": 1.5157834737082827,
      "learning_rate": 1.994176494700534e-05,
      "loss": 0.5466,
      "mean_token_accuracy": 0.7970251202583313,
      "step": 310
    },
    {
      "epoch": 0.13341804320203304,
      "grad_norm": 1.8369627378901519,
      "learning_rate": 1.993352349630882e-05,
      "loss": 0.5218,
      "mean_token_accuracy": 0.8072717070579529,
      "step": 315
    },
    {
      "epoch": 0.1355357899195256,
      "grad_norm": 1.5676620169867563,
      "learning_rate": 1.9924738755159573e-05,
      "loss": 0.5116,
      "mean_token_accuracy": 0.8025958120822907,
      "step": 320
    },
    {
      "epoch": 0.1376535366370182,
      "grad_norm": 1.5442271717658778,
      "learning_rate": 1.9915411204018137e-05,
      "loss": 0.495,
      "mean_token_accuracy": 0.8155842959880829,
      "step": 325
    },
    {
      "epoch": 0.1397712833545108,
      "grad_norm": 1.9104862823035134,
      "learning_rate": 1.9905541353032744e-05,
      "loss": 0.4707,
      "mean_token_accuracy": 0.8196403324604035,
      "step": 330
    },
    {
      "epoch": 0.14188903007200337,
      "grad_norm": 1.8843041038781683,
      "learning_rate": 1.9895129742011434e-05,
      "loss": 0.5359,
      "mean_token_accuracy": 0.8036209732294083,
      "step": 335
    },
    {
      "epoch": 0.14400677678949597,
      "grad_norm": 1.2996290243783448,
      "learning_rate": 1.9884176940392522e-05,
      "loss": 0.5355,
      "mean_token_accuracy": 0.7970023989677429,
      "step": 340
    },
    {
      "epoch": 0.14612452350698857,
      "grad_norm": 1.7409691547169837,
      "learning_rate": 1.9872683547213446e-05,
      "loss": 0.5222,
      "mean_token_accuracy": 0.8015773713588714,
      "step": 345
    },
    {
      "epoch": 0.14824227022448117,
      "grad_norm": 1.3236145792783143,
      "learning_rate": 1.9860650191078033e-05,
      "loss": 0.5165,
      "mean_token_accuracy": 0.8045854181051254,
      "step": 350
    },
    {
      "epoch": 0.15036001694197373,
      "grad_norm": 1.5674402609006048,
      "learning_rate": 1.9848077530122083e-05,
      "loss": 0.5141,
      "mean_token_accuracy": 0.8047444432973861,
      "step": 355
    },
    {
      "epoch": 0.15247776365946633,
      "grad_norm": 1.4948547674340282,
      "learning_rate": 1.98349662519774e-05,
      "loss": 0.493,
      "mean_token_accuracy": 0.8128765910863877,
      "step": 360
    },
    {
      "epoch": 0.15459551037695893,
      "grad_norm": 1.57285942684427,
      "learning_rate": 1.9821317073734173e-05,
      "loss": 0.5114,
      "mean_token_accuracy": 0.8024025142192841,
      "step": 365
    },
    {
      "epoch": 0.1567132570944515,
      "grad_norm": 1.3725667498479879,
      "learning_rate": 1.9807130741901756e-05,
      "loss": 0.5552,
      "mean_token_accuracy": 0.7975639194250107,
      "step": 370
    },
    {
      "epoch": 0.1588310038119441,
      "grad_norm": 1.6323326415858614,
      "learning_rate": 1.979240803236785e-05,
      "loss": 0.5101,
      "mean_token_accuracy": 0.8058428287506103,
      "step": 375
    },
    {
      "epoch": 0.1609487505294367,
      "grad_norm": 1.293657741608038,
      "learning_rate": 1.9777149750356044e-05,
      "loss": 0.4931,
      "mean_token_accuracy": 0.8156037211418152,
      "step": 380
    },
    {
      "epoch": 0.16306649724692926,
      "grad_norm": 1.584456213127757,
      "learning_rate": 1.9761356730381806e-05,
      "loss": 0.5066,
      "mean_token_accuracy": 0.8106210082769394,
      "step": 385
    },
    {
      "epoch": 0.16518424396442186,
      "grad_norm": 1.3531024564685128,
      "learning_rate": 1.9745029836206813e-05,
      "loss": 0.4862,
      "mean_token_accuracy": 0.8180296182632446,
      "step": 390
    },
    {
      "epoch": 0.16730199068191445,
      "grad_norm": 1.5992771952291873,
      "learning_rate": 1.9728169960791736e-05,
      "loss": 0.5158,
      "mean_token_accuracy": 0.8020082831382751,
      "step": 395
    },
    {
      "epoch": 0.16941973739940702,
      "grad_norm": 1.3875752393035827,
      "learning_rate": 1.9710778026247367e-05,
      "loss": 0.5268,
      "mean_token_accuracy": 0.8021057844161987,
      "step": 400
    },
    {
      "epoch": 0.17153748411689962,
      "grad_norm": 1.4892475787998831,
      "learning_rate": 1.9692854983784235e-05,
      "loss": 0.5031,
      "mean_token_accuracy": 0.8153967589139939,
      "step": 405
    },
    {
      "epoch": 0.17365523083439222,
      "grad_norm": 1.3435721015179996,
      "learning_rate": 1.9674401813660532e-05,
      "loss": 0.5151,
      "mean_token_accuracy": 0.8066144526004791,
      "step": 410
    },
    {
      "epoch": 0.17577297755188478,
      "grad_norm": 1.4757784795296558,
      "learning_rate": 1.9655419525128528e-05,
      "loss": 0.5197,
      "mean_token_accuracy": 0.8056324630975723,
      "step": 415
    },
    {
      "epoch": 0.17789072426937738,
      "grad_norm": 1.8586890907074842,
      "learning_rate": 1.9635909156379373e-05,
      "loss": 0.4817,
      "mean_token_accuracy": 0.8227346748113632,
      "step": 420
    },
    {
      "epoch": 0.18000847098686998,
      "grad_norm": 1.3338010634125226,
      "learning_rate": 1.9615871774486293e-05,
      "loss": 0.476,
      "mean_token_accuracy": 0.8171389639377594,
      "step": 425
    },
    {
      "epoch": 0.18212621770436255,
      "grad_norm": 1.467996639944381,
      "learning_rate": 1.959530847534627e-05,
      "loss": 0.4857,
      "mean_token_accuracy": 0.8151497721672059,
      "step": 430
    },
    {
      "epoch": 0.18424396442185514,
      "grad_norm": 1.482953746737999,
      "learning_rate": 1.9574220383620054e-05,
      "loss": 0.4922,
      "mean_token_accuracy": 0.8100210309028626,
      "step": 435
    },
    {
      "epoch": 0.18636171113934774,
      "grad_norm": 5.208401516653082,
      "learning_rate": 1.95526086526707e-05,
      "loss": 0.5263,
      "mean_token_accuracy": 0.8080328673124313,
      "step": 440
    },
    {
      "epoch": 0.1884794578568403,
      "grad_norm": 1.5834873689672437,
      "learning_rate": 1.9530474464500445e-05,
      "loss": 0.514,
      "mean_token_accuracy": 0.8094299465417862,
      "step": 445
    },
    {
      "epoch": 0.1905972045743329,
      "grad_norm": 1.3405671636751928,
      "learning_rate": 1.9507819029686094e-05,
      "loss": 0.5119,
      "mean_token_accuracy": 0.8087350040674209,
      "step": 450
    },
    {
      "epoch": 0.1927149512918255,
      "grad_norm": 1.3993020572279387,
      "learning_rate": 1.94846435873128e-05,
      "loss": 0.5153,
      "mean_token_accuracy": 0.8082747459411621,
      "step": 455
    },
    {
      "epoch": 0.19483269800931807,
      "grad_norm": 1.3011551512989479,
      "learning_rate": 1.9460949404906285e-05,
      "loss": 0.5028,
      "mean_token_accuracy": 0.8120961904525756,
      "step": 460
    },
    {
      "epoch": 0.19695044472681067,
      "grad_norm": 1.6479875272294309,
      "learning_rate": 1.9436737778363526e-05,
      "loss": 0.4787,
      "mean_token_accuracy": 0.8184203952550888,
      "step": 465
    },
    {
      "epoch": 0.19906819144430327,
      "grad_norm": 1.2952323822215526,
      "learning_rate": 1.9412010031881884e-05,
      "loss": 0.4811,
      "mean_token_accuracy": 0.8196297824382782,
      "step": 470
    },
    {
      "epoch": 0.20118593816179586,
      "grad_norm": 1.2434980503550659,
      "learning_rate": 1.9386767517886666e-05,
      "loss": 0.4992,
      "mean_token_accuracy": 0.8126247316598892,
      "step": 475
    },
    {
      "epoch": 0.20330368487928843,
      "grad_norm": 1.2749730489780189,
      "learning_rate": 1.9361011616957165e-05,
      "loss": 0.5013,
      "mean_token_accuracy": 0.8094296991825104,
      "step": 480
    },
    {
      "epoch": 0.20542143159678103,
      "grad_norm": 1.2801081991950354,
      "learning_rate": 1.933474373775115e-05,
      "loss": 0.4914,
      "mean_token_accuracy": 0.8103417336940766,
      "step": 485
    },
    {
      "epoch": 0.20753917831427363,
      "grad_norm": 1.3841139586738282,
      "learning_rate": 1.930796531692783e-05,
      "loss": 0.503,
      "mean_token_accuracy": 0.8150111019611359,
      "step": 490
    },
    {
      "epoch": 0.2096569250317662,
      "grad_norm": 1.2895819374549709,
      "learning_rate": 1.9280677819069273e-05,
      "loss": 0.4938,
      "mean_token_accuracy": 0.8058139503002166,
      "step": 495
    },
    {
      "epoch": 0.2117746717492588,
      "grad_norm": 1.2705506609214867,
      "learning_rate": 1.9252882736600302e-05,
      "loss": 0.5041,
      "mean_token_accuracy": 0.8078715801239014,
      "step": 500
    },
    {
      "epoch": 0.2138924184667514,
      "grad_norm": 1.3700128773821674,
      "learning_rate": 1.922458158970688e-05,
      "loss": 0.5122,
      "mean_token_accuracy": 0.805089196562767,
      "step": 505
    },
    {
      "epoch": 0.21601016518424396,
      "grad_norm": 1.4292612681859336,
      "learning_rate": 1.9195775926252952e-05,
      "loss": 0.4799,
      "mean_token_accuracy": 0.8134547978639602,
      "step": 510
    },
    {
      "epoch": 0.21812791190173655,
      "grad_norm": 2.589810653355124,
      "learning_rate": 1.91664673216958e-05,
      "loss": 0.4686,
      "mean_token_accuracy": 0.8232874065637589,
      "step": 515
    },
    {
      "epoch": 0.22024565861922915,
      "grad_norm": 1.4425686621750156,
      "learning_rate": 1.913665737899988e-05,
      "loss": 0.4885,
      "mean_token_accuracy": 0.815599313378334,
      "step": 520
    },
    {
      "epoch": 0.22236340533672172,
      "grad_norm": 1.4823410740282665,
      "learning_rate": 1.9106347728549134e-05,
      "loss": 0.4832,
      "mean_token_accuracy": 0.8109551817178726,
      "step": 525
    },
    {
      "epoch": 0.22448115205421432,
      "grad_norm": 1.1459009249468546,
      "learning_rate": 1.9075540028057844e-05,
      "loss": 0.5156,
      "mean_token_accuracy": 0.8015700995922088,
      "step": 530
    },
    {
      "epoch": 0.2265988987717069,
      "grad_norm": 1.273350806844229,
      "learning_rate": 1.9044235962479945e-05,
      "loss": 0.4901,
      "mean_token_accuracy": 0.8163118690252305,
      "step": 535
    },
    {
      "epoch": 0.22871664548919948,
      "grad_norm": 1.2736969034780394,
      "learning_rate": 1.9012437243916895e-05,
      "loss": 0.475,
      "mean_token_accuracy": 0.8155727684497833,
      "step": 540
    },
    {
      "epoch": 0.23083439220669208,
      "grad_norm": 1.1644155017049156,
      "learning_rate": 1.8980145611523996e-05,
      "loss": 0.5041,
      "mean_token_accuracy": 0.8130400031805038,
      "step": 545
    },
    {
      "epoch": 0.23295213892418468,
      "grad_norm": 1.3543018612133357,
      "learning_rate": 1.8947362831415327e-05,
      "loss": 0.4668,
      "mean_token_accuracy": 0.8260669410228729,
      "step": 550
    },
    {
      "epoch": 0.23506988564167725,
      "grad_norm": 1.2391111005758269,
      "learning_rate": 1.8914090696567104e-05,
      "loss": 0.4809,
      "mean_token_accuracy": 0.8127309769392014,
      "step": 555
    },
    {
      "epoch": 0.23718763235916984,
      "grad_norm": 2.2015980143710583,
      "learning_rate": 1.888033102671965e-05,
      "loss": 0.4922,
      "mean_token_accuracy": 0.8155588954687119,
      "step": 560
    },
    {
      "epoch": 0.23930537907666244,
      "grad_norm": 1.2198454979455773,
      "learning_rate": 1.884608566827785e-05,
      "loss": 0.5168,
      "mean_token_accuracy": 0.8062847316265106,
      "step": 565
    },
    {
      "epoch": 0.241423125794155,
      "grad_norm": 1.184969374617232,
      "learning_rate": 1.8811356494210166e-05,
      "loss": 0.4805,
      "mean_token_accuracy": 0.8132707148790359,
      "step": 570
    },
    {
      "epoch": 0.2435408725116476,
      "grad_norm": 1.187126766493632,
      "learning_rate": 1.8776145403946226e-05,
      "loss": 0.4955,
      "mean_token_accuracy": 0.8102918237447738,
      "step": 575
    },
    {
      "epoch": 0.2456586192291402,
      "grad_norm": 1.3821096957818944,
      "learning_rate": 1.874045432327289e-05,
      "loss": 0.4985,
      "mean_token_accuracy": 0.8098550081253052,
      "step": 580
    },
    {
      "epoch": 0.24777636594663277,
      "grad_norm": 1.214604218577671,
      "learning_rate": 1.8704285204228973e-05,
      "loss": 0.4627,
      "mean_token_accuracy": 0.8165160745382309,
      "step": 585
    },
    {
      "epoch": 0.24989411266412537,
      "grad_norm": 1.4526314855211653,
      "learning_rate": 1.866764002499846e-05,
      "loss": 0.4909,
      "mean_token_accuracy": 0.8122711658477784,
      "step": 590
    },
    {
      "epoch": 0.25201185938161796,
      "grad_norm": 1.1543877428891598,
      "learning_rate": 1.8630520789802308e-05,
      "loss": 0.4782,
      "mean_token_accuracy": 0.8182896554470063,
      "step": 595
    },
    {
      "epoch": 0.25412960609911056,
      "grad_norm": 1.3086338857944744,
      "learning_rate": 1.8592929528788844e-05,
      "loss": 0.4753,
      "mean_token_accuracy": 0.8180733859539032,
      "step": 600
    },
    {
      "epoch": 0.25624735281660316,
      "grad_norm": 1.3557276365311686,
      "learning_rate": 1.8554868297922728e-05,
      "loss": 0.4708,
      "mean_token_accuracy": 0.8193376958370209,
      "step": 605
    },
    {
      "epoch": 0.2583650995340957,
      "grad_norm": 1.2996719117152657,
      "learning_rate": 1.8516339178872492e-05,
      "loss": 0.4518,
      "mean_token_accuracy": 0.8204487860202789,
      "step": 610
    },
    {
      "epoch": 0.2604828462515883,
      "grad_norm": 1.3696724777806233,
      "learning_rate": 1.8477344278896708e-05,
      "loss": 0.5072,
      "mean_token_accuracy": 0.8076569020748139,
      "step": 615
    },
    {
      "epoch": 0.2626005929690809,
      "grad_norm": 1.2308629288247015,
      "learning_rate": 1.8437885730728738e-05,
      "loss": 0.5113,
      "mean_token_accuracy": 0.8088377475738525,
      "step": 620
    },
    {
      "epoch": 0.2647183396865735,
      "grad_norm": 1.2397238918015017,
      "learning_rate": 1.839796569246006e-05,
      "loss": 0.494,
      "mean_token_accuracy": 0.8118572622537613,
      "step": 625
    },
    {
      "epoch": 0.2668360864040661,
      "grad_norm": 1.3479748389387212,
      "learning_rate": 1.8357586347422266e-05,
      "loss": 0.5081,
      "mean_token_accuracy": 0.8135558038949966,
      "step": 630
    },
    {
      "epoch": 0.2689538331215587,
      "grad_norm": 1.1063564395200467,
      "learning_rate": 1.8316749904067637e-05,
      "loss": 0.4653,
      "mean_token_accuracy": 0.8218313783407212,
      "step": 635
    },
    {
      "epoch": 0.2710715798390512,
      "grad_norm": 1.1492824512346658,
      "learning_rate": 1.8275458595848376e-05,
      "loss": 0.4817,
      "mean_token_accuracy": 0.8135390222072602,
      "step": 640
    },
    {
      "epoch": 0.2731893265565438,
      "grad_norm": 1.4159749106872088,
      "learning_rate": 1.8233714681094405e-05,
      "loss": 0.4616,
      "mean_token_accuracy": 0.8250806093215942,
      "step": 645
    },
    {
      "epoch": 0.2753070732740364,
      "grad_norm": 1.1611107224498594,
      "learning_rate": 1.819152044288992e-05,
      "loss": 0.488,
      "mean_token_accuracy": 0.8166846603155136,
      "step": 650
    },
    {
      "epoch": 0.277424819991529,
      "grad_norm": 1.3205339840836507,
      "learning_rate": 1.814887818894846e-05,
      "loss": 0.5036,
      "mean_token_accuracy": 0.810426139831543,
      "step": 655
    },
    {
      "epoch": 0.2795425667090216,
      "grad_norm": 1.2642547117014469,
      "learning_rate": 1.810579025148674e-05,
      "loss": 0.5063,
      "mean_token_accuracy": 0.8112012058496475,
      "step": 660
    },
    {
      "epoch": 0.2816603134265142,
      "grad_norm": 5.33401159048522,
      "learning_rate": 1.8062258987097062e-05,
      "loss": 0.4478,
      "mean_token_accuracy": 0.8289118260145187,
      "step": 665
    },
    {
      "epoch": 0.28377806014400675,
      "grad_norm": 1.3752087188227111,
      "learning_rate": 1.8018286776618446e-05,
      "loss": 0.4963,
      "mean_token_accuracy": 0.8137694984674454,
      "step": 670
    },
    {
      "epoch": 0.28589580686149935,
      "grad_norm": 1.176266427707403,
      "learning_rate": 1.7973876025006407e-05,
      "loss": 0.4976,
      "mean_token_accuracy": 0.8188654541969299,
      "step": 675
    },
    {
      "epoch": 0.28801355357899194,
      "grad_norm": 1.331341038204072,
      "learning_rate": 1.792902916120143e-05,
      "loss": 0.4939,
      "mean_token_accuracy": 0.8163222283124923,
      "step": 680
    },
    {
      "epoch": 0.29013130029648454,
      "grad_norm": 1.1914829607255677,
      "learning_rate": 1.7883748637996113e-05,
      "loss": 0.4881,
      "mean_token_accuracy": 0.8130565702915191,
      "step": 685
    },
    {
      "epoch": 0.29224904701397714,
      "grad_norm": 1.2277506964948814,
      "learning_rate": 1.7838036931901033e-05,
      "loss": 0.4559,
      "mean_token_accuracy": 0.824514701962471,
      "step": 690
    },
    {
      "epoch": 0.29436679373146973,
      "grad_norm": 1.0800320597549389,
      "learning_rate": 1.7791896543009282e-05,
      "loss": 0.4891,
      "mean_token_accuracy": 0.8174144089221954,
      "step": 695
    },
    {
      "epoch": 0.29648454044896233,
      "grad_norm": 1.5694294317697621,
      "learning_rate": 1.7745329994859746e-05,
      "loss": 0.4914,
      "mean_token_accuracy": 0.8185641199350357,
      "step": 700
    },
    {
      "epoch": 0.29860228716645487,
      "grad_norm": 1.1923041867729132,
      "learning_rate": 1.7698339834299064e-05,
      "loss": 0.5008,
      "mean_token_accuracy": 0.8142161637544632,
      "step": 705
    },
    {
      "epoch": 0.30072003388394747,
      "grad_norm": 1.3729946102267174,
      "learning_rate": 1.7650928631342364e-05,
      "loss": 0.4845,
      "mean_token_accuracy": 0.8133604645729064,
      "step": 710
    },
    {
      "epoch": 0.30283778060144007,
      "grad_norm": 1.174456646604131,
      "learning_rate": 1.7603098979032683e-05,
      "loss": 0.4777,
      "mean_token_accuracy": 0.813685166835785,
      "step": 715
    },
    {
      "epoch": 0.30495552731893266,
      "grad_norm": 1.158532302748484,
      "learning_rate": 1.7554853493299142e-05,
      "loss": 0.504,
      "mean_token_accuracy": 0.8088937163352966,
      "step": 720
    },
    {
      "epoch": 0.30707327403642526,
      "grad_norm": 1.2620596837516858,
      "learning_rate": 1.7506194812813896e-05,
      "loss": 0.4817,
      "mean_token_accuracy": 0.8206409096717835,
      "step": 725
    },
    {
      "epoch": 0.30919102075391786,
      "grad_norm": 1.148012521360775,
      "learning_rate": 1.74571255988478e-05,
      "loss": 0.4819,
      "mean_token_accuracy": 0.812398812174797,
      "step": 730
    },
    {
      "epoch": 0.3113087674714104,
      "grad_norm": 1.2373133691587057,
      "learning_rate": 1.740764853512485e-05,
      "loss": 0.49,
      "mean_token_accuracy": 0.8143349289894104,
      "step": 735
    },
    {
      "epoch": 0.313426514188903,
      "grad_norm": 2.100740115519466,
      "learning_rate": 1.7357766327675433e-05,
      "loss": 0.4651,
      "mean_token_accuracy": 0.8216336488723754,
      "step": 740
    },
    {
      "epoch": 0.3155442609063956,
      "grad_norm": 1.4189894877798284,
      "learning_rate": 1.73074817046883e-05,
      "loss": 0.4801,
      "mean_token_accuracy": 0.8188165038824081,
      "step": 745
    },
    {
      "epoch": 0.3176620076238882,
      "grad_norm": 1.2994480429040771,
      "learning_rate": 1.725679741636136e-05,
      "loss": 0.4614,
      "mean_token_accuracy": 0.8237657248973846,
      "step": 750
    },
    {
      "epoch": 0.3197797543413808,
      "grad_norm": 1.2308603791930401,
      "learning_rate": 1.720571623475128e-05,
      "loss": 0.492,
      "mean_token_accuracy": 0.8165101200342179,
      "step": 755
    },
    {
      "epoch": 0.3218975010588734,
      "grad_norm": 1.3843077010151197,
      "learning_rate": 1.7154240953621844e-05,
      "loss": 0.4564,
      "mean_token_accuracy": 0.825025874376297,
      "step": 760
    },
    {
      "epoch": 0.3240152477763659,
      "grad_norm": 1.1848129565884666,
      "learning_rate": 1.7102374388291182e-05,
      "loss": 0.4575,
      "mean_token_accuracy": 0.8252220988273621,
      "step": 765
    },
    {
      "epoch": 0.3261329944938585,
      "grad_norm": 1.3217187216198285,
      "learning_rate": 1.705011937547779e-05,
      "loss": 0.4629,
      "mean_token_accuracy": 0.8198304086923599,
      "step": 770
    },
    {
      "epoch": 0.3282507412113511,
      "grad_norm": 1.3851637896221318,
      "learning_rate": 1.6997478773145363e-05,
      "loss": 0.4337,
      "mean_token_accuracy": 0.8338131695985794,
      "step": 775
    },
    {
      "epoch": 0.3303684879288437,
      "grad_norm": 1.423775789920787,
      "learning_rate": 1.6944455460346503e-05,
      "loss": 0.4807,
      "mean_token_accuracy": 0.8188902169466019,
      "step": 780
    },
    {
      "epoch": 0.3324862346463363,
      "grad_norm": 1.3680154210297841,
      "learning_rate": 1.6891052337065256e-05,
      "loss": 0.4841,
      "mean_token_accuracy": 0.8188378721475601,
      "step": 785
    },
    {
      "epoch": 0.3346039813638289,
      "grad_norm": 1.1670007538420892,
      "learning_rate": 1.6837272324058487e-05,
      "loss": 0.4209,
      "mean_token_accuracy": 0.8359328061342239,
      "step": 790
    },
    {
      "epoch": 0.33672172808132145,
      "grad_norm": 1.2238185684348435,
      "learning_rate": 1.6783118362696162e-05,
      "loss": 0.4687,
      "mean_token_accuracy": 0.8194981902837754,
      "step": 795
    },
    {
      "epoch": 0.33883947479881404,
      "grad_norm": 1.3104844364549155,
      "learning_rate": 1.672859341480046e-05,
      "loss": 0.4605,
      "mean_token_accuracy": 0.8169092148542404,
      "step": 800
    },
    {
      "epoch": 0.34095722151630664,
      "grad_norm": 1.1074420443801423,
      "learning_rate": 1.6673700462483776e-05,
      "loss": 0.4424,
      "mean_token_accuracy": 0.8315922617912292,
      "step": 805
    },
    {
      "epoch": 0.34307496823379924,
      "grad_norm": 1.2002465546594834,
      "learning_rate": 1.661844250798565e-05,
      "loss": 0.4773,
      "mean_token_accuracy": 0.8234172344207764,
      "step": 810
    },
    {
      "epoch": 0.34519271495129183,
      "grad_norm": 1.3643314568341807,
      "learning_rate": 1.6562822573508533e-05,
      "loss": 0.4803,
      "mean_token_accuracy": 0.8155502796173095,
      "step": 815
    },
    {
      "epoch": 0.34731046166878443,
      "grad_norm": 1.1653511889703811,
      "learning_rate": 1.650684370105252e-05,
      "loss": 0.4907,
      "mean_token_accuracy": 0.8095988690853119,
      "step": 820
    },
    {
      "epoch": 0.34942820838627703,
      "grad_norm": 1.2052540958169133,
      "learning_rate": 1.6450508952248957e-05,
      "loss": 0.4664,
      "mean_token_accuracy": 0.8265933513641357,
      "step": 825
    },
    {
      "epoch": 0.35154595510376957,
      "grad_norm": 1.5477552328113091,
      "learning_rate": 1.6393821408193007e-05,
      "loss": 0.4783,
      "mean_token_accuracy": 0.8169477820396424,
      "step": 830
    },
    {
      "epoch": 0.35366370182126217,
      "grad_norm": 1.8070494772139423,
      "learning_rate": 1.6336784169275132e-05,
      "loss": 0.454,
      "mean_token_accuracy": 0.8248355984687805,
      "step": 835
    },
    {
      "epoch": 0.35578144853875476,
      "grad_norm": 1.2257376390653825,
      "learning_rate": 1.627940035501152e-05,
      "loss": 0.4506,
      "mean_token_accuracy": 0.8257219165563583,
      "step": 840
    },
    {
      "epoch": 0.35789919525624736,
      "grad_norm": 1.3198794046839721,
      "learning_rate": 1.6221673103873474e-05,
      "loss": 0.4427,
      "mean_token_accuracy": 0.8296634495258332,
      "step": 845
    },
    {
      "epoch": 0.36001694197373996,
      "grad_norm": 2.109231295857473,
      "learning_rate": 1.616360557311575e-05,
      "loss": 0.489,
      "mean_token_accuracy": 0.8102859228849411,
      "step": 850
    },
    {
      "epoch": 0.36213468869123255,
      "grad_norm": 1.1872292152679083,
      "learning_rate": 1.6105200938603917e-05,
      "loss": 0.4681,
      "mean_token_accuracy": 0.8261395335197449,
      "step": 855
    },
    {
      "epoch": 0.3642524354087251,
      "grad_norm": 1.214005452933459,
      "learning_rate": 1.60464623946406e-05,
      "loss": 0.4852,
      "mean_token_accuracy": 0.8179385870695114,
      "step": 860
    },
    {
      "epoch": 0.3663701821262177,
      "grad_norm": 1.0907256335398452,
      "learning_rate": 1.5987393153790832e-05,
      "loss": 0.4623,
      "mean_token_accuracy": 0.8248693764209747,
      "step": 865
    },
    {
      "epoch": 0.3684879288437103,
      "grad_norm": 1.061691508146564,
      "learning_rate": 1.5927996446706308e-05,
      "loss": 0.4803,
      "mean_token_accuracy": 0.8169174045324326,
      "step": 870
    },
    {
      "epoch": 0.3706056755612029,
      "grad_norm": 1.1759352091149649,
      "learning_rate": 1.5868275521948726e-05,
      "loss": 0.4563,
      "mean_token_accuracy": 0.8279780805110931,
      "step": 875
    },
    {
      "epoch": 0.3727234222786955,
      "grad_norm": 1.2135030886876705,
      "learning_rate": 1.5808233645812087e-05,
      "loss": 0.4418,
      "mean_token_accuracy": 0.8301020473241806,
      "step": 880
    },
    {
      "epoch": 0.3748411689961881,
      "grad_norm": 1.1266881444254488,
      "learning_rate": 1.5747874102144073e-05,
      "loss": 0.4626,
      "mean_token_accuracy": 0.8214969336986542,
      "step": 885
    },
    {
      "epoch": 0.3769589157136806,
      "grad_norm": 1.0911244736489776,
      "learning_rate": 1.5687200192166424e-05,
      "loss": 0.4635,
      "mean_token_accuracy": 0.8221491903066636,
      "step": 890
    },
    {
      "epoch": 0.3790766624311732,
      "grad_norm": 1.0852849507203284,
      "learning_rate": 1.5626215234294416e-05,
      "loss": 0.451,
      "mean_token_accuracy": 0.8251518607139587,
      "step": 895
    },
    {
      "epoch": 0.3811944091486658,
      "grad_norm": 1.1215853338868707,
      "learning_rate": 1.5564922563955337e-05,
      "loss": 0.4608,
      "mean_token_accuracy": 0.8237892210483551,
      "step": 900
    },
    {
      "epoch": 0.3833121558661584,
      "grad_norm": 0.9235255903522734,
      "learning_rate": 1.5503325533406076e-05,
      "loss": 0.4676,
      "mean_token_accuracy": 0.8222286731004715,
      "step": 905
    },
    {
      "epoch": 0.385429902583651,
      "grad_norm": 1.0494173037764836,
      "learning_rate": 1.5441427511549795e-05,
      "loss": 0.4652,
      "mean_token_accuracy": 0.8235789179801941,
      "step": 910
    },
    {
      "epoch": 0.3875476493011436,
      "grad_norm": 1.2934333868332708,
      "learning_rate": 1.537923188375164e-05,
      "loss": 0.459,
      "mean_token_accuracy": 0.8253506690263748,
      "step": 915
    },
    {
      "epoch": 0.38966539601863615,
      "grad_norm": 1.045643086378396,
      "learning_rate": 1.5316742051653624e-05,
      "loss": 0.4487,
      "mean_token_accuracy": 0.8300421804189682,
      "step": 920
    },
    {
      "epoch": 0.39178314273612874,
      "grad_norm": 1.0549731687620314,
      "learning_rate": 1.5253961432988548e-05,
      "loss": 0.4756,
      "mean_token_accuracy": 0.8141780078411103,
      "step": 925
    },
    {
      "epoch": 0.39390088945362134,
      "grad_norm": 1.1263426428393677,
      "learning_rate": 1.5190893461393108e-05,
      "loss": 0.4698,
      "mean_token_accuracy": 0.8173887878656387,
      "step": 930
    },
    {
      "epoch": 0.39601863617111394,
      "grad_norm": 1.1982411204873675,
      "learning_rate": 1.5127541586220077e-05,
      "loss": 0.4595,
      "mean_token_accuracy": 0.8246693462133408,
      "step": 935
    },
    {
      "epoch": 0.39813638288860653,
      "grad_norm": 1.331125977750805,
      "learning_rate": 1.5063909272349664e-05,
      "loss": 0.466,
      "mean_token_accuracy": 0.8266402333974838,
      "step": 940
    },
    {
      "epoch": 0.40025412960609913,
      "grad_norm": 1.165754254305497,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.435,
      "mean_token_accuracy": 0.8271202713251113,
      "step": 945
    },
    {
      "epoch": 0.4023718763235917,
      "grad_norm": 1.1585938088360928,
      "learning_rate": 1.4935817264536809e-05,
      "loss": 0.4386,
      "mean_token_accuracy": 0.8255492657423019,
      "step": 950
    },
    {
      "epoch": 0.40448962304108427,
      "grad_norm": 1.1542702135186313,
      "learning_rate": 1.4871364576282223e-05,
      "loss": 0.4769,
      "mean_token_accuracy": 0.8163278847932816,
      "step": 955
    },
    {
      "epoch": 0.40660736975857686,
      "grad_norm": 1.1855267108232739,
      "learning_rate": 1.4806645460322804e-05,
      "loss": 0.4938,
      "mean_token_accuracy": 0.8140994518995285,
      "step": 960
    },
    {
      "epoch": 0.40872511647606946,
      "grad_norm": 1.0583179034757253,
      "learning_rate": 1.4741663456316742e-05,
      "loss": 0.4694,
      "mean_token_accuracy": 0.8194496780633926,
      "step": 965
    },
    {
      "epoch": 0.41084286319356206,
      "grad_norm": 1.2166297794886325,
      "learning_rate": 1.4676422118300266e-05,
      "loss": 0.4583,
      "mean_token_accuracy": 0.8240072697401046,
      "step": 970
    },
    {
      "epoch": 0.41296060991105465,
      "grad_norm": 1.2077033819076497,
      "learning_rate": 1.461092501449326e-05,
      "loss": 0.4683,
      "mean_token_accuracy": 0.8127462983131408,
      "step": 975
    },
    {
      "epoch": 0.41507835662854725,
      "grad_norm": 1.2024839451726628,
      "learning_rate": 1.4545175727104113e-05,
      "loss": 0.4746,
      "mean_token_accuracy": 0.817327806353569,
      "step": 980
    },
    {
      "epoch": 0.4171961033460398,
      "grad_norm": 43.895890122529586,
      "learning_rate": 1.4479177852133787e-05,
      "loss": 0.4339,
      "mean_token_accuracy": 0.83043053150177,
      "step": 985
    },
    {
      "epoch": 0.4193138500635324,
      "grad_norm": 1.3761452239892333,
      "learning_rate": 1.4412934999179169e-05,
      "loss": 0.4682,
      "mean_token_accuracy": 0.82216075360775,
      "step": 990
    },
    {
      "epoch": 0.421431596781025,
      "grad_norm": 9.553572882081992,
      "learning_rate": 1.4346450791235611e-05,
      "loss": 0.425,
      "mean_token_accuracy": 0.8346862554550171,
      "step": 995
    },
    {
      "epoch": 0.4235493434985176,
      "grad_norm": 1.19535922636142,
      "learning_rate": 1.427972886449882e-05,
      "loss": 0.4916,
      "mean_token_accuracy": 0.8201052099466324,
      "step": 1000
    },
    {
      "epoch": 0.4256670902160102,
      "grad_norm": 1.487407000401354,
      "learning_rate": 1.4212772868165957e-05,
      "loss": 0.4759,
      "mean_token_accuracy": 0.8201690822839737,
      "step": 1005
    },
    {
      "epoch": 0.4277848369335028,
      "grad_norm": 1.2209557581398112,
      "learning_rate": 1.4145586464236074e-05,
      "loss": 0.4776,
      "mean_token_accuracy": 0.8144995361566544,
      "step": 1010
    },
    {
      "epoch": 0.4299025836509953,
      "grad_norm": 1.4175123984588354,
      "learning_rate": 1.4078173327309807e-05,
      "loss": 0.4697,
      "mean_token_accuracy": 0.820775744318962,
      "step": 1015
    },
    {
      "epoch": 0.4320203303684879,
      "grad_norm": 1.2129818965934513,
      "learning_rate": 1.4010537144388416e-05,
      "loss": 0.463,
      "mean_token_accuracy": 0.8259893089532853,
      "step": 1020
    },
    {
      "epoch": 0.4341380770859805,
      "grad_norm": 1.12010970838833,
      "learning_rate": 1.3942681614672144e-05,
      "loss": 0.4629,
      "mean_token_accuracy": 0.8218669801950454,
      "step": 1025
    },
    {
      "epoch": 0.4362558238034731,
      "grad_norm": 1.1464961804103622,
      "learning_rate": 1.3874610449357873e-05,
      "loss": 0.4238,
      "mean_token_accuracy": 0.8335713416337966,
      "step": 1030
    },
    {
      "epoch": 0.4383735705209657,
      "grad_norm": 1.1351310993680606,
      "learning_rate": 1.3806327371436159e-05,
      "loss": 0.4394,
      "mean_token_accuracy": 0.8307629436254501,
      "step": 1035
    },
    {
      "epoch": 0.4404913172384583,
      "grad_norm": 1.1188266853744508,
      "learning_rate": 1.3737836115487624e-05,
      "loss": 0.4663,
      "mean_token_accuracy": 0.8193978488445282,
      "step": 1040
    },
    {
      "epoch": 0.44260906395595084,
      "grad_norm": 1.1620199858915772,
      "learning_rate": 1.3669140427478693e-05,
      "loss": 0.4705,
      "mean_token_accuracy": 0.8229668527841568,
      "step": 1045
    },
    {
      "epoch": 0.44472681067344344,
      "grad_norm": 1.110101616240863,
      "learning_rate": 1.3600244064556702e-05,
      "loss": 0.4747,
      "mean_token_accuracy": 0.8179006308317185,
      "step": 1050
    },
    {
      "epoch": 0.44684455739093604,
      "grad_norm": 1.2783615446297392,
      "learning_rate": 1.353115079484444e-05,
      "loss": 0.4458,
      "mean_token_accuracy": 0.8308207571506501,
      "step": 1055
    },
    {
      "epoch": 0.44896230410842863,
      "grad_norm": 1.1007302332610067,
      "learning_rate": 1.3461864397234041e-05,
      "loss": 0.4598,
      "mean_token_accuracy": 0.8242943733930588,
      "step": 1060
    },
    {
      "epoch": 0.45108005082592123,
      "grad_norm": 1.2199483732995027,
      "learning_rate": 1.3392388661180303e-05,
      "loss": 0.445,
      "mean_token_accuracy": 0.824502220749855,
      "step": 1065
    },
    {
      "epoch": 0.4531977975434138,
      "grad_norm": 1.0010509955815885,
      "learning_rate": 1.332272738649345e-05,
      "loss": 0.4583,
      "mean_token_accuracy": 0.8303744524717331,
      "step": 1070
    },
    {
      "epoch": 0.45531554426090637,
      "grad_norm": 1.918284418636839,
      "learning_rate": 1.325288438313129e-05,
      "loss": 0.4269,
      "mean_token_accuracy": 0.8296439230442048,
      "step": 1075
    },
    {
      "epoch": 0.45743329097839897,
      "grad_norm": 1.1887902164021535,
      "learning_rate": 1.318286347099086e-05,
      "loss": 0.4625,
      "mean_token_accuracy": 0.8217881232500076,
      "step": 1080
    },
    {
      "epoch": 0.45955103769589156,
      "grad_norm": 1.1360766453253965,
      "learning_rate": 1.3112668479699486e-05,
      "loss": 0.4589,
      "mean_token_accuracy": 0.8269425123929978,
      "step": 1085
    },
    {
      "epoch": 0.46166878441338416,
      "grad_norm": 1.2399254503178083,
      "learning_rate": 1.3042303248405346e-05,
      "loss": 0.4555,
      "mean_token_accuracy": 0.8309968203306198,
      "step": 1090
    },
    {
      "epoch": 0.46378653113087676,
      "grad_norm": 1.0508779611719044,
      "learning_rate": 1.297177162556748e-05,
      "loss": 0.4545,
      "mean_token_accuracy": 0.824161484837532,
      "step": 1095
    },
    {
      "epoch": 0.46590427784836935,
      "grad_norm": 1.0822262810815348,
      "learning_rate": 1.2901077468745329e-05,
      "loss": 0.4571,
      "mean_token_accuracy": 0.8281063556671142,
      "step": 1100
    },
    {
      "epoch": 0.46802202456586195,
      "grad_norm": 1.0744745429140576,
      "learning_rate": 1.2830224644387742e-05,
      "loss": 0.471,
      "mean_token_accuracy": 0.8183866649866104,
      "step": 1105
    },
    {
      "epoch": 0.4701397712833545,
      "grad_norm": 1.2108459211634035,
      "learning_rate": 1.2759217027621507e-05,
      "loss": 0.4445,
      "mean_token_accuracy": 0.8313823521137238,
      "step": 1110
    },
    {
      "epoch": 0.4722575180008471,
      "grad_norm": 1.1385271166035913,
      "learning_rate": 1.2688058502039416e-05,
      "loss": 0.4724,
      "mean_token_accuracy": 0.8208224922418594,
      "step": 1115
    },
    {
      "epoch": 0.4743752647183397,
      "grad_norm": 1.1608922255857643,
      "learning_rate": 1.261675295948786e-05,
      "loss": 0.4402,
      "mean_token_accuracy": 0.8260656505823135,
      "step": 1120
    },
    {
      "epoch": 0.4764930114358323,
      "grad_norm": 1.2001870807148136,
      "learning_rate": 1.2545304299853977e-05,
      "loss": 0.4676,
      "mean_token_accuracy": 0.8217555999755859,
      "step": 1125
    },
    {
      "epoch": 0.4786107581533249,
      "grad_norm": 1.099496727008847,
      "learning_rate": 1.2473716430852353e-05,
      "loss": 0.436,
      "mean_token_accuracy": 0.8312188684940338,
      "step": 1130
    },
    {
      "epoch": 0.4807285048708175,
      "grad_norm": 2.032998570634967,
      "learning_rate": 1.2401993267811293e-05,
      "loss": 0.4317,
      "mean_token_accuracy": 0.8295620054006576,
      "step": 1135
    },
    {
      "epoch": 0.48284625158831,
      "grad_norm": 1.1812725212971202,
      "learning_rate": 1.2330138733458693e-05,
      "loss": 0.4156,
      "mean_token_accuracy": 0.8353513538837433,
      "step": 1140
    },
    {
      "epoch": 0.4849639983058026,
      "grad_norm": 1.138821301405385,
      "learning_rate": 1.2258156757707496e-05,
      "loss": 0.4506,
      "mean_token_accuracy": 0.8284595161676407,
      "step": 1145
    },
    {
      "epoch": 0.4870817450232952,
      "grad_norm": 1.039456646381961,
      "learning_rate": 1.2186051277440739e-05,
      "loss": 0.4281,
      "mean_token_accuracy": 0.8340547412633896,
      "step": 1150
    },
    {
      "epoch": 0.4891994917407878,
      "grad_norm": 1.0935441587184827,
      "learning_rate": 1.2113826236296245e-05,
      "loss": 0.4368,
      "mean_token_accuracy": 0.8294982463121414,
      "step": 1155
    },
    {
      "epoch": 0.4913172384582804,
      "grad_norm": 1.0601849025025707,
      "learning_rate": 1.2041485584450945e-05,
      "loss": 0.4496,
      "mean_token_accuracy": 0.8288684636354446,
      "step": 1160
    },
    {
      "epoch": 0.493434985175773,
      "grad_norm": 1.1432826242197904,
      "learning_rate": 1.1969033278404816e-05,
      "loss": 0.472,
      "mean_token_accuracy": 0.8184500396251678,
      "step": 1165
    },
    {
      "epoch": 0.49555273189326554,
      "grad_norm": 1.178255399480397,
      "learning_rate": 1.1896473280764498e-05,
      "loss": 0.453,
      "mean_token_accuracy": 0.82464899122715,
      "step": 1170
    },
    {
      "epoch": 0.49767047861075814,
      "grad_norm": 1.2123556499205794,
      "learning_rate": 1.1823809560026558e-05,
      "loss": 0.442,
      "mean_token_accuracy": 0.8262520909309388,
      "step": 1175
    },
    {
      "epoch": 0.49978822532825073,
      "grad_norm": 1.490671459887953,
      "learning_rate": 1.175104609036047e-05,
      "loss": 0.4493,
      "mean_token_accuracy": 0.8295370072126389,
      "step": 1180
    },
    {
      "epoch": 0.5019059720457433,
      "grad_norm": 3.5058816478434993,
      "learning_rate": 1.1678186851391218e-05,
      "loss": 0.4593,
      "mean_token_accuracy": 0.8269213020801545,
      "step": 1185
    },
    {
      "epoch": 0.5040237187632359,
      "grad_norm": 1.1384716073513477,
      "learning_rate": 1.1605235827981673e-05,
      "loss": 0.4463,
      "mean_token_accuracy": 0.8314786165952682,
      "step": 1190
    },
    {
      "epoch": 0.5061414654807285,
      "grad_norm": 1.1752572701433124,
      "learning_rate": 1.1532197010014636e-05,
      "loss": 0.4453,
      "mean_token_accuracy": 0.8288865953683853,
      "step": 1195
    },
    {
      "epoch": 0.5082592121982211,
      "grad_norm": 1.0006379736398943,
      "learning_rate": 1.1459074392174619e-05,
      "loss": 0.4293,
      "mean_token_accuracy": 0.8350226402282714,
      "step": 1200
    },
    {
      "epoch": 0.5103769589157137,
      "grad_norm": 1.1784455736187447,
      "learning_rate": 1.138587197372937e-05,
      "loss": 0.4612,
      "mean_token_accuracy": 0.8215854614973068,
      "step": 1205
    },
    {
      "epoch": 0.5124947056332063,
      "grad_norm": 1.1048766566547503,
      "learning_rate": 1.1312593758311143e-05,
      "loss": 0.4279,
      "mean_token_accuracy": 0.8407860666513443,
      "step": 1210
    },
    {
      "epoch": 0.5146124523506989,
      "grad_norm": 1.0718700385713946,
      "learning_rate": 1.1239243753697728e-05,
      "loss": 0.4288,
      "mean_token_accuracy": 0.8378984898328781,
      "step": 1215
    },
    {
      "epoch": 0.5167301990681914,
      "grad_norm": 1.558568433227081,
      "learning_rate": 1.1165825971593251e-05,
      "loss": 0.4678,
      "mean_token_accuracy": 0.825000548362732,
      "step": 1220
    },
    {
      "epoch": 0.518847945785684,
      "grad_norm": 1.082392246698731,
      "learning_rate": 1.1092344427408767e-05,
      "loss": 0.4276,
      "mean_token_accuracy": 0.8359992414712906,
      "step": 1225
    },
    {
      "epoch": 0.5209656925031766,
      "grad_norm": 1.256334909576375,
      "learning_rate": 1.1018803140042651e-05,
      "loss": 0.4633,
      "mean_token_accuracy": 0.8229638338088989,
      "step": 1230
    },
    {
      "epoch": 0.5230834392206692,
      "grad_norm": 1.303814596864245,
      "learning_rate": 1.0945206131660787e-05,
      "loss": 0.469,
      "mean_token_accuracy": 0.8193328499794006,
      "step": 1235
    },
    {
      "epoch": 0.5252011859381618,
      "grad_norm": 1.0507039996160834,
      "learning_rate": 1.0871557427476585e-05,
      "loss": 0.4414,
      "mean_token_accuracy": 0.8317544460296631,
      "step": 1240
    },
    {
      "epoch": 0.5273189326556544,
      "grad_norm": 1.015866344156703,
      "learning_rate": 1.0797861055530832e-05,
      "loss": 0.428,
      "mean_token_accuracy": 0.8305379122495651,
      "step": 1245
    },
    {
      "epoch": 0.529436679373147,
      "grad_norm": 1.1624992956977676,
      "learning_rate": 1.07241210464714e-05,
      "loss": 0.467,
      "mean_token_accuracy": 0.820591053366661,
      "step": 1250
    },
    {
      "epoch": 0.5315544260906395,
      "grad_norm": 1.2782647412686758,
      "learning_rate": 1.0650341433332778e-05,
      "loss": 0.4689,
      "mean_token_accuracy": 0.8219984292984008,
      "step": 1255
    },
    {
      "epoch": 0.5336721728081322,
      "grad_norm": 1.1784870838731618,
      "learning_rate": 1.0576526251315515e-05,
      "loss": 0.4596,
      "mean_token_accuracy": 0.8260756641626358,
      "step": 1260
    },
    {
      "epoch": 0.5357899195256247,
      "grad_norm": 1.1204805080469906,
      "learning_rate": 1.0502679537565507e-05,
      "loss": 0.442,
      "mean_token_accuracy": 0.8296466141939163,
      "step": 1265
    },
    {
      "epoch": 0.5379076662431174,
      "grad_norm": 1.0718296420595828,
      "learning_rate": 1.0428805330953209e-05,
      "loss": 0.4215,
      "mean_token_accuracy": 0.8308669030666351,
      "step": 1270
    },
    {
      "epoch": 0.5400254129606099,
      "grad_norm": 1.1125024136410944,
      "learning_rate": 1.0354907671852733e-05,
      "loss": 0.4363,
      "mean_token_accuracy": 0.8332655102014541,
      "step": 1275
    },
    {
      "epoch": 0.5421431596781024,
      "grad_norm": 1.090167844275342,
      "learning_rate": 1.0280990601920863e-05,
      "loss": 0.4435,
      "mean_token_accuracy": 0.8282716870307922,
      "step": 1280
    },
    {
      "epoch": 0.5442609063955951,
      "grad_norm": 1.0290238619990948,
      "learning_rate": 1.0207058163876021e-05,
      "loss": 0.4413,
      "mean_token_accuracy": 0.8311887979507446,
      "step": 1285
    },
    {
      "epoch": 0.5463786531130876,
      "grad_norm": 1.0778232888370207,
      "learning_rate": 1.013311440127714e-05,
      "loss": 0.4386,
      "mean_token_accuracy": 0.8266764581203461,
      "step": 1290
    },
    {
      "epoch": 0.5484963998305803,
      "grad_norm": 1.1219731141973122,
      "learning_rate": 1.0059163358302537e-05,
      "loss": 0.4103,
      "mean_token_accuracy": 0.8391000181436539,
      "step": 1295
    },
    {
      "epoch": 0.5506141465480728,
      "grad_norm": 1.1468466517999107,
      "learning_rate": 9.9852090795287e-06,
      "loss": 0.4391,
      "mean_token_accuracy": 0.8361193478107453,
      "step": 1300
    },
    {
      "epoch": 0.5527318932655655,
      "grad_norm": 1.0284132663014267,
      "learning_rate": 9.911255609709089e-06,
      "loss": 0.4409,
      "mean_token_accuracy": 0.8269284754991532,
      "step": 1305
    },
    {
      "epoch": 0.554849639983058,
      "grad_norm": 1.0310999165822667,
      "learning_rate": 9.83730699355294e-06,
      "loss": 0.4071,
      "mean_token_accuracy": 0.835135304927826,
      "step": 1310
    },
    {
      "epoch": 0.5569673867005506,
      "grad_norm": 1.2728900066425748,
      "learning_rate": 9.76336727550401e-06,
      "loss": 0.4601,
      "mean_token_accuracy": 0.8267913639545441,
      "step": 1315
    },
    {
      "epoch": 0.5590851334180432,
      "grad_norm": 1.2269899407592741,
      "learning_rate": 9.689440499519395e-06,
      "loss": 0.4322,
      "mean_token_accuracy": 0.8314703017473221,
      "step": 1320
    },
    {
      "epoch": 0.5612028801355358,
      "grad_norm": 1.1418757049837882,
      "learning_rate": 9.615530708848373e-06,
      "loss": 0.4231,
      "mean_token_accuracy": 0.8340400338172913,
      "step": 1325
    },
    {
      "epoch": 0.5633206268530284,
      "grad_norm": 1.1108149486798655,
      "learning_rate": 9.541641945811233e-06,
      "loss": 0.4492,
      "mean_token_accuracy": 0.8232677519321442,
      "step": 1330
    },
    {
      "epoch": 0.565438373570521,
      "grad_norm": 1.1088127297572268,
      "learning_rate": 9.467778251578217e-06,
      "loss": 0.4549,
      "mean_token_accuracy": 0.8236530691385269,
      "step": 1335
    },
    {
      "epoch": 0.5675561202880135,
      "grad_norm": 0.9179664771961787,
      "learning_rate": 9.393943665948478e-06,
      "loss": 0.4763,
      "mean_token_accuracy": 0.8244054973125458,
      "step": 1340
    },
    {
      "epoch": 0.5696738670055062,
      "grad_norm": 1.1777867866273308,
      "learning_rate": 9.320142227129158e-06,
      "loss": 0.4348,
      "mean_token_accuracy": 0.8331925332546234,
      "step": 1345
    },
    {
      "epoch": 0.5717916137229987,
      "grad_norm": 1.0020743360016087,
      "learning_rate": 9.246377971514504e-06,
      "loss": 0.4161,
      "mean_token_accuracy": 0.8360674440860748,
      "step": 1350
    },
    {
      "epoch": 0.5739093604404913,
      "grad_norm": 1.346066080223308,
      "learning_rate": 9.172654933465114e-06,
      "loss": 0.448,
      "mean_token_accuracy": 0.8250635206699372,
      "step": 1355
    },
    {
      "epoch": 0.5760271071579839,
      "grad_norm": 1.3221207747875352,
      "learning_rate": 9.0989771450873e-06,
      "loss": 0.4228,
      "mean_token_accuracy": 0.8357968628406525,
      "step": 1360
    },
    {
      "epoch": 0.5781448538754765,
      "grad_norm": 1.1501989319658534,
      "learning_rate": 9.025348636012537e-06,
      "loss": 0.4411,
      "mean_token_accuracy": 0.8290417343378067,
      "step": 1365
    },
    {
      "epoch": 0.5802626005929691,
      "grad_norm": 1.1694331116554113,
      "learning_rate": 8.951773433177095e-06,
      "loss": 0.4343,
      "mean_token_accuracy": 0.8303040146827698,
      "step": 1370
    },
    {
      "epoch": 0.5823803473104616,
      "grad_norm": 1.2089472872967426,
      "learning_rate": 8.878255560601781e-06,
      "loss": 0.4285,
      "mean_token_accuracy": 0.8339911371469497,
      "step": 1375
    },
    {
      "epoch": 0.5844980940279543,
      "grad_norm": 1.1555334960481487,
      "learning_rate": 8.804799039171863e-06,
      "loss": 0.4225,
      "mean_token_accuracy": 0.8346673488616944,
      "step": 1380
    },
    {
      "epoch": 0.5866158407454468,
      "grad_norm": 0.9976941601020334,
      "learning_rate": 8.731407886417155e-06,
      "loss": 0.4538,
      "mean_token_accuracy": 0.8272438108921051,
      "step": 1385
    },
    {
      "epoch": 0.5887335874629395,
      "grad_norm": 1.0977726966561636,
      "learning_rate": 8.658086116292283e-06,
      "loss": 0.4297,
      "mean_token_accuracy": 0.8334219962358475,
      "step": 1390
    },
    {
      "epoch": 0.590851334180432,
      "grad_norm": 2.0194878160007987,
      "learning_rate": 8.584837738957155e-06,
      "loss": 0.4413,
      "mean_token_accuracy": 0.8283408343791961,
      "step": 1395
    },
    {
      "epoch": 0.5929690808979247,
      "grad_norm": 1.2186719145281468,
      "learning_rate": 8.511666760557638e-06,
      "loss": 0.4693,
      "mean_token_accuracy": 0.8232256740331649,
      "step": 1400
    },
    {
      "epoch": 0.5950868276154172,
      "grad_norm": 1.1198588684752515,
      "learning_rate": 8.438577183006448e-06,
      "loss": 0.4221,
      "mean_token_accuracy": 0.8324928849935531,
      "step": 1405
    },
    {
      "epoch": 0.5972045743329097,
      "grad_norm": 1.1215071963961742,
      "learning_rate": 8.36557300376427e-06,
      "loss": 0.4392,
      "mean_token_accuracy": 0.8286356210708619,
      "step": 1410
    },
    {
      "epoch": 0.5993223210504024,
      "grad_norm": 1.107475266800191,
      "learning_rate": 8.292658215621139e-06,
      "loss": 0.4344,
      "mean_token_accuracy": 0.8313880443572998,
      "step": 1415
    },
    {
      "epoch": 0.6014400677678949,
      "grad_norm": 1.1686631557802003,
      "learning_rate": 8.219836806478049e-06,
      "loss": 0.4336,
      "mean_token_accuracy": 0.8312123149633408,
      "step": 1420
    },
    {
      "epoch": 0.6035578144853876,
      "grad_norm": 1.230978585871069,
      "learning_rate": 8.147112759128859e-06,
      "loss": 0.4647,
      "mean_token_accuracy": 0.8231993585824966,
      "step": 1425
    },
    {
      "epoch": 0.6056755612028801,
      "grad_norm": 1.0717890273842352,
      "learning_rate": 8.074490051042447e-06,
|
"loss": 0.4353, |
|
"mean_token_accuracy": 0.8321529895067215, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.6077933079203727, |
|
"grad_norm": 1.085108371368418, |
|
"learning_rate": 8.001972654145194e-06, |
|
"loss": 0.4415, |
|
"mean_token_accuracy": 0.8277548223733902, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.6099110546378653, |
|
"grad_norm": 1.2119593900205077, |
|
"learning_rate": 7.929564534603722e-06, |
|
"loss": 0.4571, |
|
"mean_token_accuracy": 0.8255878984928131, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.6120288013553579, |
|
"grad_norm": 1.1055437345283827, |
|
"learning_rate": 7.857269652607995e-06, |
|
"loss": 0.4406, |
|
"mean_token_accuracy": 0.8275179982185363, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.6141465480728505, |
|
"grad_norm": 1.1275451956189597, |
|
"learning_rate": 7.78509196215472e-06, |
|
"loss": 0.4308, |
|
"mean_token_accuracy": 0.8301453530788422, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.6162642947903431, |
|
"grad_norm": 1.2886494426253579, |
|
"learning_rate": 7.713035410831086e-06, |
|
"loss": 0.4573, |
|
"mean_token_accuracy": 0.8251194447278977, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.6183820415078357, |
|
"grad_norm": 1.1109768793864798, |
|
"learning_rate": 7.64110393959887e-06, |
|
"loss": 0.4279, |
|
"mean_token_accuracy": 0.8380070447921752, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.6204997882253283, |
|
"grad_norm": 1.0182035864318235, |
|
"learning_rate": 7.569301482578885e-06, |
|
"loss": 0.4281, |
|
"mean_token_accuracy": 0.8316156834363937, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.6226175349428208, |
|
"grad_norm": 1.2074345207100396, |
|
"learning_rate": 7.497631966835828e-06, |
|
"loss": 0.4527, |
|
"mean_token_accuracy": 0.8231601238250732, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.6247352816603134, |
|
"grad_norm": 0.991329003303421, |
|
"learning_rate": 7.42609931216348e-06, |
|
"loss": 0.442, |
|
"mean_token_accuracy": 0.8327670186758042, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.626853028377806, |
|
"grad_norm": 1.38024365126256, |
|
"learning_rate": 7.354707430870332e-06, |
|
"loss": 0.4335, |
|
"mean_token_accuracy": 0.8324557185173035, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.6289707750952986, |
|
"grad_norm": 1.2263457500699402, |
|
"learning_rate": 7.283460227565614e-06, |
|
"loss": 0.4289, |
|
"mean_token_accuracy": 0.8289420217275619, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.6310885218127912, |
|
"grad_norm": 1.1601375730316865, |
|
"learning_rate": 7.2123615989457364e-06, |
|
"loss": 0.4465, |
|
"mean_token_accuracy": 0.832300814986229, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.6332062685302838, |
|
"grad_norm": 1.3029839142463893, |
|
"learning_rate": 7.141415433581169e-06, |
|
"loss": 0.4167, |
|
"mean_token_accuracy": 0.8393772184848786, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.6353240152477764, |
|
"grad_norm": 1.0421344337402514, |
|
"learning_rate": 7.070625611703762e-06, |
|
"loss": 0.4537, |
|
"mean_token_accuracy": 0.8257811456918717, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6374417619652689, |
|
"grad_norm": 1.1352186472493642, |
|
"learning_rate": 6.9999960049945406e-06, |
|
"loss": 0.4227, |
|
"mean_token_accuracy": 0.8368300348520279, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.6395595086827616, |
|
"grad_norm": 0.9884985072070904, |
|
"learning_rate": 6.929530476371935e-06, |
|
"loss": 0.4189, |
|
"mean_token_accuracy": 0.8349219173192978, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.6416772554002541, |
|
"grad_norm": 1.7766008455284357, |
|
"learning_rate": 6.859232879780515e-06, |
|
"loss": 0.4288, |
|
"mean_token_accuracy": 0.8374936401844024, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.6437950021177468, |
|
"grad_norm": 1.012934970024209, |
|
"learning_rate": 6.7891070599802045e-06, |
|
"loss": 0.4549, |
|
"mean_token_accuracy": 0.8239244252443314, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.6459127488352393, |
|
"grad_norm": 0.9859441855867837, |
|
"learning_rate": 6.719156852336015e-06, |
|
"loss": 0.4293, |
|
"mean_token_accuracy": 0.8353272944688797, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.6480304955527318, |
|
"grad_norm": 1.261329902420831, |
|
"learning_rate": 6.649386082608256e-06, |
|
"loss": 0.428, |
|
"mean_token_accuracy": 0.8329044044017792, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.6501482422702245, |
|
"grad_norm": 1.2457535519058567, |
|
"learning_rate": 6.579798566743314e-06, |
|
"loss": 0.4324, |
|
"mean_token_accuracy": 0.8307075470685958, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.652265988987717, |
|
"grad_norm": 1.213114456712863, |
|
"learning_rate": 6.510398110664939e-06, |
|
"loss": 0.4223, |
|
"mean_token_accuracy": 0.8351607590913772, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.6543837357052097, |
|
"grad_norm": 1.155264435257233, |
|
"learning_rate": 6.441188510066092e-06, |
|
"loss": 0.4207, |
|
"mean_token_accuracy": 0.8374445289373398, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.6565014824227022, |
|
"grad_norm": 1.1756119576548756, |
|
"learning_rate": 6.372173550201346e-06, |
|
"loss": 0.4119, |
|
"mean_token_accuracy": 0.8390755444765091, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.6586192291401949, |
|
"grad_norm": 1.0243897900651528, |
|
"learning_rate": 6.303357005679858e-06, |
|
"loss": 0.4478, |
|
"mean_token_accuracy": 0.8277173846960068, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.6607369758576874, |
|
"grad_norm": 1.0868676429874986, |
|
"learning_rate": 6.234742640258938e-06, |
|
"loss": 0.4552, |
|
"mean_token_accuracy": 0.827509269118309, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.66285472257518, |
|
"grad_norm": 1.1792649536698685, |
|
"learning_rate": 6.166334206638186e-06, |
|
"loss": 0.4396, |
|
"mean_token_accuracy": 0.8288001954555512, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.6649724692926726, |
|
"grad_norm": 1.171894663481444, |
|
"learning_rate": 6.0981354462542456e-06, |
|
"loss": 0.4365, |
|
"mean_token_accuracy": 0.8315492898225785, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.6670902160101652, |
|
"grad_norm": 1.1333037764256397, |
|
"learning_rate": 6.030150089076199e-06, |
|
"loss": 0.4319, |
|
"mean_token_accuracy": 0.8316318243741989, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.6692079627276578, |
|
"grad_norm": 1.1892286300854609, |
|
"learning_rate": 5.9623818534015275e-06, |
|
"loss": 0.4275, |
|
"mean_token_accuracy": 0.8352140128612519, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.6713257094451504, |
|
"grad_norm": 4.250523219515856, |
|
"learning_rate": 5.894834445652777e-06, |
|
"loss": 0.411, |
|
"mean_token_accuracy": 0.8329778879880905, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.6734434561626429, |
|
"grad_norm": 1.157008090047474, |
|
"learning_rate": 5.827511560174835e-06, |
|
"loss": 0.4242, |
|
"mean_token_accuracy": 0.832972839474678, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.6755612028801355, |
|
"grad_norm": 1.1834078816860993, |
|
"learning_rate": 5.7604168790328774e-06, |
|
"loss": 0.3931, |
|
"mean_token_accuracy": 0.8443128287792205, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.6776789495976281, |
|
"grad_norm": 1.0766345733639675, |
|
"learning_rate": 5.693554071810987e-06, |
|
"loss": 0.4478, |
|
"mean_token_accuracy": 0.8282081812620163, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.6797966963151207, |
|
"grad_norm": 1.0314594529031804, |
|
"learning_rate": 5.626926795411447e-06, |
|
"loss": 0.4246, |
|
"mean_token_accuracy": 0.8321157455444336, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.6819144430326133, |
|
"grad_norm": 1.055274137880832, |
|
"learning_rate": 5.560538693854751e-06, |
|
"loss": 0.4193, |
|
"mean_token_accuracy": 0.8316533505916596, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.6840321897501059, |
|
"grad_norm": 1.1972782090907812, |
|
"learning_rate": 5.494393398080292e-06, |
|
"loss": 0.4313, |
|
"mean_token_accuracy": 0.834712353348732, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.6861499364675985, |
|
"grad_norm": 1.0962501568970522, |
|
"learning_rate": 5.428494525747769e-06, |
|
"loss": 0.4597, |
|
"mean_token_accuracy": 0.8248083680868149, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.688267683185091, |
|
"grad_norm": 1.0751444988160856, |
|
"learning_rate": 5.362845681039348e-06, |
|
"loss": 0.4321, |
|
"mean_token_accuracy": 0.8374727904796601, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.6903854299025837, |
|
"grad_norm": 1.1471090324016462, |
|
"learning_rate": 5.297450454462526e-06, |
|
"loss": 0.4328, |
|
"mean_token_accuracy": 0.8296476870775222, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.6925031766200762, |
|
"grad_norm": 0.962534660265453, |
|
"learning_rate": 5.23231242265375e-06, |
|
"loss": 0.4181, |
|
"mean_token_accuracy": 0.83418510556221, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.6946209233375689, |
|
"grad_norm": 1.1168651450432128, |
|
"learning_rate": 5.167435148182824e-06, |
|
"loss": 0.4176, |
|
"mean_token_accuracy": 0.8372534781694412, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.6967386700550614, |
|
"grad_norm": 1.2186341287706137, |
|
"learning_rate": 5.102822179358037e-06, |
|
"loss": 0.4075, |
|
"mean_token_accuracy": 0.8409687280654907, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.6988564167725541, |
|
"grad_norm": 0.9820636174800459, |
|
"learning_rate": 5.0384770500321175e-06, |
|
"loss": 0.4128, |
|
"mean_token_accuracy": 0.8384972155094147, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.7009741634900466, |
|
"grad_norm": 0.943830506781205, |
|
"learning_rate": 4.97440327940895e-06, |
|
"loss": 0.4027, |
|
"mean_token_accuracy": 0.8365049093961716, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.7030919102075391, |
|
"grad_norm": 1.0574783345670844, |
|
"learning_rate": 4.910604371851091e-06, |
|
"loss": 0.4308, |
|
"mean_token_accuracy": 0.8333552926778793, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.7052096569250318, |
|
"grad_norm": 1.103380699456734, |
|
"learning_rate": 4.847083816688123e-06, |
|
"loss": 0.412, |
|
"mean_token_accuracy": 0.8425119102001191, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.7073274036425243, |
|
"grad_norm": 1.117253769501395, |
|
"learning_rate": 4.783845088025807e-06, |
|
"loss": 0.4346, |
|
"mean_token_accuracy": 0.8330845534801483, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.709445150360017, |
|
"grad_norm": 1.4108563780024128, |
|
"learning_rate": 4.7208916445560625e-06, |
|
"loss": 0.414, |
|
"mean_token_accuracy": 0.8379091322422028, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.7115628970775095, |
|
"grad_norm": 1.031565575748758, |
|
"learning_rate": 4.658226929367826e-06, |
|
"loss": 0.4598, |
|
"mean_token_accuracy": 0.8240681082010269, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.7136806437950021, |
|
"grad_norm": 1.2248996065912452, |
|
"learning_rate": 4.595854369758727e-06, |
|
"loss": 0.4299, |
|
"mean_token_accuracy": 0.8363937050104141, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.7157983905124947, |
|
"grad_norm": 1.1049025661918381, |
|
"learning_rate": 4.5337773770476245e-06, |
|
"loss": 0.4273, |
|
"mean_token_accuracy": 0.8340339243412018, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.7179161372299873, |
|
"grad_norm": 1.1244170950870136, |
|
"learning_rate": 4.4719993463880695e-06, |
|
"loss": 0.4571, |
|
"mean_token_accuracy": 0.8225684702396393, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.7200338839474799, |
|
"grad_norm": 1.1969285633316296, |
|
"learning_rate": 4.410523656582576e-06, |
|
"loss": 0.4025, |
|
"mean_token_accuracy": 0.8440569192171097, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7221516306649725, |
|
"grad_norm": 1.122866308313561, |
|
"learning_rate": 4.349353669897856e-06, |
|
"loss": 0.4208, |
|
"mean_token_accuracy": 0.837623131275177, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.7242693773824651, |
|
"grad_norm": 1.0173115464088704, |
|
"learning_rate": 4.288492731880917e-06, |
|
"loss": 0.4148, |
|
"mean_token_accuracy": 0.8388867497444152, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.7263871240999576, |
|
"grad_norm": 1.1018457774189827, |
|
"learning_rate": 4.227944171176072e-06, |
|
"loss": 0.4003, |
|
"mean_token_accuracy": 0.8392677456140518, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.7285048708174502, |
|
"grad_norm": 1.2471156860459571, |
|
"learning_rate": 4.167711299342909e-06, |
|
"loss": 0.4459, |
|
"mean_token_accuracy": 0.8256678134202957, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.7306226175349428, |
|
"grad_norm": 1.1273568017592417, |
|
"learning_rate": 4.107797410675166e-06, |
|
"loss": 0.4068, |
|
"mean_token_accuracy": 0.8386416286230087, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.7327403642524354, |
|
"grad_norm": 1.20918067568615, |
|
"learning_rate": 4.048205782020544e-06, |
|
"loss": 0.4539, |
|
"mean_token_accuracy": 0.8220532357692718, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.734858110969928, |
|
"grad_norm": 1.1573583276355073, |
|
"learning_rate": 3.988939672601509e-06, |
|
"loss": 0.395, |
|
"mean_token_accuracy": 0.844212406873703, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.7369758576874206, |
|
"grad_norm": 1.1516374922245958, |
|
"learning_rate": 3.930002323837026e-06, |
|
"loss": 0.4251, |
|
"mean_token_accuracy": 0.8371291518211365, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.7390936044049131, |
|
"grad_norm": 1.274643963255776, |
|
"learning_rate": 3.871396959165267e-06, |
|
"loss": 0.429, |
|
"mean_token_accuracy": 0.8348165363073349, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.7412113511224058, |
|
"grad_norm": 1.025583507042276, |
|
"learning_rate": 3.8131267838673336e-06, |
|
"loss": 0.4262, |
|
"mean_token_accuracy": 0.8343986541032791, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.7433290978398983, |
|
"grad_norm": 1.1299748085754966, |
|
"learning_rate": 3.755194984891943e-06, |
|
"loss": 0.4081, |
|
"mean_token_accuracy": 0.8430469453334808, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.745446844557391, |
|
"grad_norm": 1.0603027089656643, |
|
"learning_rate": 3.6976047306811115e-06, |
|
"loss": 0.4256, |
|
"mean_token_accuracy": 0.8382641762495041, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.7475645912748835, |
|
"grad_norm": 1.1281590494510496, |
|
"learning_rate": 3.6403591709968924e-06, |
|
"loss": 0.4357, |
|
"mean_token_accuracy": 0.8320927768945694, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.7496823379923762, |
|
"grad_norm": 1.0367839611389602, |
|
"learning_rate": 3.5834614367490706e-06, |
|
"loss": 0.4221, |
|
"mean_token_accuracy": 0.835366889834404, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.7518000847098687, |
|
"grad_norm": 1.0958827736818129, |
|
"learning_rate": 3.526914639823973e-06, |
|
"loss": 0.4381, |
|
"mean_token_accuracy": 0.8301591634750366, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.7539178314273612, |
|
"grad_norm": 1.0559223618431266, |
|
"learning_rate": 3.4707218729142224e-06, |
|
"loss": 0.4291, |
|
"mean_token_accuracy": 0.8316712707281113, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.7560355781448539, |
|
"grad_norm": 1.0792688197107765, |
|
"learning_rate": 3.414886209349615e-06, |
|
"loss": 0.4269, |
|
"mean_token_accuracy": 0.835688841342926, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.7581533248623464, |
|
"grad_norm": 1.1979681287726258, |
|
"learning_rate": 3.3594107029290347e-06, |
|
"loss": 0.4269, |
|
"mean_token_accuracy": 0.8371979027986527, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.7602710715798391, |
|
"grad_norm": 1.1468783022113433, |
|
"learning_rate": 3.304298387753426e-06, |
|
"loss": 0.4311, |
|
"mean_token_accuracy": 0.8341523915529251, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.7623888182973316, |
|
"grad_norm": 1.142335742385377, |
|
"learning_rate": 3.2495522780598442e-06, |
|
"loss": 0.4174, |
|
"mean_token_accuracy": 0.8298469454050064, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.7645065650148243, |
|
"grad_norm": 1.1968773332651736, |
|
"learning_rate": 3.1951753680566143e-06, |
|
"loss": 0.4383, |
|
"mean_token_accuracy": 0.8313175171613694, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.7666243117323168, |
|
"grad_norm": 1.0804618708583653, |
|
"learning_rate": 3.141170631759558e-06, |
|
"loss": 0.4086, |
|
"mean_token_accuracy": 0.8373444229364395, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.7687420584498094, |
|
"grad_norm": 1.0872538790077677, |
|
"learning_rate": 3.087541022829347e-06, |
|
"loss": 0.4221, |
|
"mean_token_accuracy": 0.8371105402708053, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.770859805167302, |
|
"grad_norm": 0.9905135006363225, |
|
"learning_rate": 3.034289474409943e-06, |
|
"loss": 0.4133, |
|
"mean_token_accuracy": 0.8365035742521286, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.7729775518847946, |
|
"grad_norm": 1.0890914888672922, |
|
"learning_rate": 2.981418898968186e-06, |
|
"loss": 0.4189, |
|
"mean_token_accuracy": 0.838862606883049, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.7750952986022872, |
|
"grad_norm": 1.1417209565486737, |
|
"learning_rate": 2.9289321881345257e-06, |
|
"loss": 0.4169, |
|
"mean_token_accuracy": 0.833648070693016, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.7772130453197797, |
|
"grad_norm": 1.1684616910176908, |
|
"learning_rate": 2.8768322125448265e-06, |
|
"loss": 0.4469, |
|
"mean_token_accuracy": 0.83056038916111, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.7793307920372723, |
|
"grad_norm": 1.1845681597767028, |
|
"learning_rate": 2.825121821683391e-06, |
|
"loss": 0.4223, |
|
"mean_token_accuracy": 0.8353413581848145, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.7814485387547649, |
|
"grad_norm": 1.1732126933903428, |
|
"learning_rate": 2.7738038437271288e-06, |
|
"loss": 0.4121, |
|
"mean_token_accuracy": 0.842677703499794, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.7835662854722575, |
|
"grad_norm": 1.0292583187860371, |
|
"learning_rate": 2.7228810853908406e-06, |
|
"loss": 0.3921, |
|
"mean_token_accuracy": 0.8447476714849472, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.7856840321897501, |
|
"grad_norm": 0.9892030702997285, |
|
"learning_rate": 2.67235633177373e-06, |
|
"loss": 0.4387, |
|
"mean_token_accuracy": 0.8288900941610337, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.7878017789072427, |
|
"grad_norm": 1.0050687986582967, |
|
"learning_rate": 2.6222323462070897e-06, |
|
"loss": 0.4356, |
|
"mean_token_accuracy": 0.828187745809555, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.7899195256247353, |
|
"grad_norm": 1.1304197153376732, |
|
"learning_rate": 2.572511870103149e-06, |
|
"loss": 0.4125, |
|
"mean_token_accuracy": 0.8425087302923202, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.7920372723422279, |
|
"grad_norm": 1.0444576639344187, |
|
"learning_rate": 2.5231976228051526e-06, |
|
"loss": 0.4318, |
|
"mean_token_accuracy": 0.8337043792009353, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.7941550190597204, |
|
"grad_norm": 1.0875080317220023, |
|
"learning_rate": 2.4742923014386154e-06, |
|
"loss": 0.4287, |
|
"mean_token_accuracy": 0.8368022799491882, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.7962727657772131, |
|
"grad_norm": 1.1517129093084153, |
|
"learning_rate": 2.4257985807638294e-06, |
|
"loss": 0.4284, |
|
"mean_token_accuracy": 0.8356128752231597, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.7983905124947056, |
|
"grad_norm": 1.2213468844119533, |
|
"learning_rate": 2.3777191130295673e-06, |
|
"loss": 0.411, |
|
"mean_token_accuracy": 0.8373890697956086, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.8005082592121983, |
|
"grad_norm": 1.1105462272187794, |
|
"learning_rate": 2.330056527828013e-06, |
|
"loss": 0.4549, |
|
"mean_token_accuracy": 0.8282926619052887, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.8026260059296908, |
|
"grad_norm": 1.1626653178571262, |
|
"learning_rate": 2.282813431950952e-06, |
|
"loss": 0.4295, |
|
"mean_token_accuracy": 0.8333282887935638, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.8047437526471835, |
|
"grad_norm": 1.1195581942328177, |
|
"learning_rate": 2.235992409247214e-06, |
|
"loss": 0.4338, |
|
"mean_token_accuracy": 0.8319763302803039, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.806861499364676, |
|
"grad_norm": 1.026868168904022, |
|
"learning_rate": 2.1895960204813194e-06, |
|
"loss": 0.4118, |
|
"mean_token_accuracy": 0.8370046824216842, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.8089792460821685, |
|
"grad_norm": 1.0639569641896143, |
|
"learning_rate": 2.1436268031934602e-06, |
|
"loss": 0.4411, |
|
"mean_token_accuracy": 0.8297486454248428, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.8110969927996612, |
|
"grad_norm": 1.0385740186847223, |
|
"learning_rate": 2.098087271560687e-06, |
|
"loss": 0.4152, |
|
"mean_token_accuracy": 0.8370089381933212, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.8132147395171537, |
|
"grad_norm": 1.1169845772777505, |
|
"learning_rate": 2.0529799162594242e-06, |
|
"loss": 0.4094, |
|
"mean_token_accuracy": 0.839673039317131, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.8153324862346464, |
|
"grad_norm": 1.0745546751170598, |
|
"learning_rate": 2.0083072043292406e-06, |
|
"loss": 0.417, |
|
"mean_token_accuracy": 0.8379459470510483, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.8174502329521389, |
|
"grad_norm": 1.206429363415916, |
|
"learning_rate": 1.9640715790379084e-06, |
|
"loss": 0.4133, |
|
"mean_token_accuracy": 0.8345289677381516, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.8195679796696315, |
|
"grad_norm": 1.0452292636568519, |
|
"learning_rate": 1.920275459747796e-06, |
|
"loss": 0.4123, |
|
"mean_token_accuracy": 0.8368586808443069, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.8216857263871241, |
|
"grad_norm": 1.0706189800647066, |
|
"learning_rate": 1.8769212417835314e-06, |
|
"loss": 0.3773, |
|
"mean_token_accuracy": 0.8513321369886399, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.8238034731046167, |
|
"grad_norm": 1.0974535637452612, |
|
"learning_rate": 1.8340112963009993e-06, |
|
"loss": 0.4353, |
|
"mean_token_accuracy": 0.8337271898984909, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.8259212198221093, |
|
"grad_norm": 1.0867209847341632, |
|
"learning_rate": 1.7915479701576577e-06, |
|
"loss": 0.4489, |
|
"mean_token_accuracy": 0.8291646331548691, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.8280389665396019, |
|
"grad_norm": 1.1993569416921062, |
|
"learning_rate": 1.7495335857841855e-06, |
|
"loss": 0.4138, |
|
"mean_token_accuracy": 0.8385995358228684, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.8301567132570945, |
|
"grad_norm": 1.1414883228473476, |
|
"learning_rate": 1.7079704410574505e-06, |
|
"loss": 0.3859, |
|
"mean_token_accuracy": 0.8459228605031968, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.832274459974587, |
|
"grad_norm": 1.048311577922366, |
|
"learning_rate": 1.6668608091748495e-06, |
|
"loss": 0.426, |
|
"mean_token_accuracy": 0.8357879251241684, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.8343922066920796, |
|
"grad_norm": 1.0617438922962255, |
|
"learning_rate": 1.6262069385299694e-06, |
|
"loss": 0.4334, |
|
"mean_token_accuracy": 0.8343731433153152, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.8365099534095722, |
|
"grad_norm": 1.1279209328755353, |
|
"learning_rate": 1.5860110525896143e-06, |
|
"loss": 0.4197, |
|
"mean_token_accuracy": 0.835442116856575, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.8386277001270648, |
|
"grad_norm": 0.9640338154076892, |
|
"learning_rate": 1.5462753497722139e-06, |
|
"loss": 0.4228, |
|
"mean_token_accuracy": 0.8363285154104233, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.8407454468445574, |
|
"grad_norm": 1.065476222817932, |
|
"learning_rate": 1.5070020033275655e-06, |
|
"loss": 0.3954, |
|
"mean_token_accuracy": 0.8427035689353943, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.84286319356205, |
|
"grad_norm": 1.055480105683973, |
|
"learning_rate": 1.4681931612179901e-06, |
|
"loss": 0.4289, |
|
"mean_token_accuracy": 0.8340502351522445, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.8449809402795425, |
|
"grad_norm": 1.0690985831761302, |
|
"learning_rate": 1.4298509460008491e-06, |
|
"loss": 0.4072, |
|
"mean_token_accuracy": 0.8402904689311981, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.8470986869970352, |
|
"grad_norm": 1.0063164183000968, |
|
"learning_rate": 1.39197745471245e-06, |
|
"loss": 0.4231, |
|
"mean_token_accuracy": 0.8361636906862259, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.8492164337145277, |
|
"grad_norm": 1.0247616494577987, |
|
"learning_rate": 1.354574758753363e-06, |
|
"loss": 0.4189, |
|
"mean_token_accuracy": 0.8310322672128677, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.8513341804320204, |
|
"grad_norm": 1.044860588344852, |
|
"learning_rate": 1.3176449037751294e-06, |
|
"loss": 0.4404, |
|
"mean_token_accuracy": 0.8303707420825959, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.8534519271495129, |
|
"grad_norm": 2.4537559889629694, |
|
"learning_rate": 1.28118990956837e-06, |
|
"loss": 0.4104, |
|
"mean_token_accuracy": 0.835821408033371, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.8555696738670056, |
|
"grad_norm": 1.0972489520800874, |
|
"learning_rate": 1.2452117699523303e-06, |
|
"loss": 0.4027, |
|
"mean_token_accuracy": 0.8460766285657882, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.8576874205844981, |
|
"grad_norm": 1.2309045137234433, |
|
"learning_rate": 1.2097124526658277e-06, |
|
"loss": 0.419, |
|
"mean_token_accuracy": 0.8366678208112717, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.8598051673019906, |
|
"grad_norm": 1.0849048269411365, |
|
"learning_rate": 1.1746938992596257e-06, |
|
"loss": 0.4174, |
|
"mean_token_accuracy": 0.8296289384365082, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.8619229140194833, |
|
"grad_norm": 0.989974221522167, |
|
"learning_rate": 1.1401580249902566e-06, |
|
"loss": 0.4153, |
|
"mean_token_accuracy": 0.8379861056804657, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.8640406607369758, |
|
"grad_norm": 1.0066596115748891, |
|
"learning_rate": 1.1061067187152584e-06, |
|
"loss": 0.4041, |
|
"mean_token_accuracy": 0.8417060792446136, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.8661584074544685, |
|
"grad_norm": 1.0526259070425423, |
|
"learning_rate": 1.0725418427898792e-06, |
|
"loss": 0.4099, |
|
"mean_token_accuracy": 0.8398545056581497, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.868276154171961, |
|
"grad_norm": 1.134470581551777, |
|
"learning_rate": 1.0394652329652165e-06, |
|
"loss": 0.4146, |
|
"mean_token_accuracy": 0.8354752600193024, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.8703939008894537, |
|
"grad_norm": 1.130864865166622, |
|
"learning_rate": 1.0068786982878087e-06, |
|
"loss": 0.418, |
|
"mean_token_accuracy": 0.8398678600788116, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.8725116476069462, |
|
"grad_norm": 1.1500087879964977, |
|
"learning_rate": 9.747840210007021e-07, |
|
"loss": 0.4157, |
|
"mean_token_accuracy": 0.8322781622409821, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.8746293943244388, |
|
"grad_norm": 0.9770307768209092, |
|
"learning_rate": 9.43182956445976e-07, |
|
"loss": 0.3977, |
|
"mean_token_accuracy": 0.8416966944932938, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.8767471410419314, |
|
"grad_norm": 1.2583818143393242, |
|
"learning_rate": 9.120772329687278e-07, |
|
"loss": 0.4251, |
|
"mean_token_accuracy": 0.8354076951742172, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.878864887759424, |
|
"grad_norm": 1.0618576291439479, |
|
"learning_rate": 8.814685518225552e-07, |
|
"loss": 0.4291, |
|
"mean_token_accuracy": 0.8308704495429993, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.8809826344769166, |
|
"grad_norm": 1.1180367611245425, |
|
"learning_rate": 8.513585870765118e-07, |
|
"loss": 0.3907, |
|
"mean_token_accuracy": 0.8452890306711197, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.8831003811944091, |
|
"grad_norm": 1.230123212504081, |
|
"learning_rate": 8.217489855235338e-07, |
|
"loss": 0.4144, |
|
"mean_token_accuracy": 0.8392110764980316, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.8852181279119017, |
|
"grad_norm": 1.1108948475484288, |
|
"learning_rate": 7.926413665903931e-07, |
|
"loss": 0.4151, |
|
"mean_token_accuracy": 0.8380868971347809, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.8873358746293943, |
|
"grad_norm": 1.098761542271965, |
|
"learning_rate": 7.640373222491038e-07, |
|
"loss": 0.4196, |
|
"mean_token_accuracy": 0.8407029449939728, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.8894536213468869, |
|
"grad_norm": 1.0940803341605705, |
|
"learning_rate": 7.359384169298744e-07, |
|
"loss": 0.4097, |
|
"mean_token_accuracy": 0.8401619613170623, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8915713680643795, |
|
"grad_norm": 0.9066347453646844, |
|
"learning_rate": 7.083461874355335e-07, |
|
"loss": 0.4257, |
|
"mean_token_accuracy": 0.8362819194793701, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.8936891147818721, |
|
"grad_norm": 1.0448023766882066, |
|
"learning_rate": 6.81262142857475e-07, |
|
"loss": 0.3898, |
|
"mean_token_accuracy": 0.8459620922803879, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.8958068614993647, |
|
"grad_norm": 1.0611643496346475, |
|
"learning_rate": 6.546877644931315e-07, |
|
"loss": 0.4208, |
|
"mean_token_accuracy": 0.8312031596899032, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.8979246082168573, |
|
"grad_norm": 1.1224663985096108, |
|
"learning_rate": 6.286245057649542e-07, |
|
"loss": 0.3994, |
|
"mean_token_accuracy": 0.8465497404336929, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.9000423549343498, |
|
"grad_norm": 1.0832056476567533, |
|
"learning_rate": 6.030737921409169e-07, |
|
"loss": 0.3867, |
|
"mean_token_accuracy": 0.8440623044967651, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.9021601016518425, |
|
"grad_norm": 1.0523110523954844, |
|
"learning_rate": 5.7803702105656e-07, |
|
"loss": 0.4127, |
|
"mean_token_accuracy": 0.8366563141345977, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.904277848369335, |
|
"grad_norm": 1.0105232913792406, |
|
"learning_rate": 5.535155618385612e-07, |
|
"loss": 0.4195, |
|
"mean_token_accuracy": 0.8335390537977219, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.9063955950868277, |
|
"grad_norm": 1.1129917485868344, |
|
"learning_rate": 5.295107556298329e-07, |
|
"loss": 0.3928, |
|
"mean_token_accuracy": 0.8431670844554902, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.9085133418043202, |
|
"grad_norm": 1.145719574659648, |
|
"learning_rate": 5.060239153161872e-07, |
|
"loss": 0.4019, |
|
"mean_token_accuracy": 0.8419764310121536, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.9106310885218127, |
|
"grad_norm": 1.443628282269306, |
|
"learning_rate": 4.830563254545207e-07, |
|
"loss": 0.4233, |
|
"mean_token_accuracy": 0.8361739784479141, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.9127488352393054, |
|
"grad_norm": 1.1691030241329559, |
|
"learning_rate": 4.6060924220255654e-07, |
|
"loss": 0.4257, |
|
"mean_token_accuracy": 0.8305665761232376, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.9148665819567979, |
|
"grad_norm": 1.2424085660240223, |
|
"learning_rate": 4.386838932501547e-07, |
|
"loss": 0.4303, |
|
"mean_token_accuracy": 0.8358988225460052, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.9169843286742906, |
|
"grad_norm": 1.0258262640063769, |
|
"learning_rate": 4.172814777521483e-07, |
|
"loss": 0.4298, |
|
"mean_token_accuracy": 0.8366893321275711, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.9191020753917831, |
|
"grad_norm": 1.0932401792673323, |
|
"learning_rate": 3.9640316626277654e-07, |
|
"loss": 0.4172, |
|
"mean_token_accuracy": 0.836585283279419, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.9212198221092758, |
|
"grad_norm": 1.0881178279493329, |
|
"learning_rate": 3.7605010067165216e-07, |
|
"loss": 0.42, |
|
"mean_token_accuracy": 0.8352493315935134, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.9233375688267683, |
|
"grad_norm": 1.057750886441079, |
|
"learning_rate": 3.562233941413096e-07, |
|
"loss": 0.3975, |
|
"mean_token_accuracy": 0.8412194460630417, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.9254553155442609, |
|
"grad_norm": 1.1056774030421723, |
|
"learning_rate": 3.3692413104633226e-07, |
|
"loss": 0.3976, |
|
"mean_token_accuracy": 0.840697067975998, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.9275730622617535, |
|
"grad_norm": 1.163101779598673, |
|
"learning_rate": 3.1815336691403464e-07, |
|
"loss": 0.3751, |
|
"mean_token_accuracy": 0.8496327966451644, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.929690808979246, |
|
"grad_norm": 0.9755793569303719, |
|
"learning_rate": 2.999121283667339e-07, |
|
"loss": 0.4079, |
|
"mean_token_accuracy": 0.8418219208717346, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.9318085556967387, |
|
"grad_norm": 1.021358583461123, |
|
"learning_rate": 2.8220141306561034e-07, |
|
"loss": 0.4186, |
|
"mean_token_accuracy": 0.8352805793285369, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.9339263024142312, |
|
"grad_norm": 1.0396837778560488, |
|
"learning_rate": 2.6502218965613335e-07, |
|
"loss": 0.4225, |
|
"mean_token_accuracy": 0.8338442891836166, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.9360440491317239, |
|
"grad_norm": 1.1742052357618658, |
|
"learning_rate": 2.483753977150882e-07, |
|
"loss": 0.4067, |
|
"mean_token_accuracy": 0.8387827515602112, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.9381617958492164, |
|
"grad_norm": 1.0739901995137444, |
|
"learning_rate": 2.3226194769918497e-07, |
|
"loss": 0.4041, |
|
"mean_token_accuracy": 0.837730023264885, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.940279542566709, |
|
"grad_norm": 1.0246012489566791, |
|
"learning_rate": 2.1668272089526377e-07, |
|
"loss": 0.4161, |
|
"mean_token_accuracy": 0.8399739652872086, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.9423972892842016, |
|
"grad_norm": 1.0463273467785923, |
|
"learning_rate": 2.0163856937210236e-07, |
|
"loss": 0.4245, |
|
"mean_token_accuracy": 0.8379955619573594, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.9445150360016942, |
|
"grad_norm": 1.13493837929642, |
|
"learning_rate": 1.8713031593380116e-07, |
|
"loss": 0.405, |
|
"mean_token_accuracy": 0.8368137925863266, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.9466327827191868, |
|
"grad_norm": 1.1326422720007092, |
|
"learning_rate": 1.731587540747903e-07, |
|
"loss": 0.4164, |
|
"mean_token_accuracy": 0.839913833141327, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.9487505294366794, |
|
"grad_norm": 1.1288581153860058, |
|
"learning_rate": 1.597246479364345e-07, |
|
"loss": 0.4345, |
|
"mean_token_accuracy": 0.8263521671295166, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.9508682761541719, |
|
"grad_norm": 1.0618048867408285, |
|
"learning_rate": 1.4682873226523064e-07, |
|
"loss": 0.4116, |
|
"mean_token_accuracy": 0.8380947977304458, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.9529860228716646, |
|
"grad_norm": 1.0089748524009554, |
|
"learning_rate": 1.3447171237262912e-07, |
|
"loss": 0.4281, |
|
"mean_token_accuracy": 0.8311914891004563, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.9551037695891571, |
|
"grad_norm": 1.1718653607363838, |
|
"learning_rate": 1.2265426409645676e-07, |
|
"loss": 0.4205, |
|
"mean_token_accuracy": 0.8367854833602906, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.9572215163066498, |
|
"grad_norm": 1.009709808393184, |
|
"learning_rate": 1.1137703376395304e-07, |
|
"loss": 0.4307, |
|
"mean_token_accuracy": 0.8332184463739395, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.9593392630241423, |
|
"grad_norm": 1.0456672123180084, |
|
"learning_rate": 1.0064063815642178e-07, |
|
"loss": 0.4143, |
|
"mean_token_accuracy": 0.8407183200120926, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.961457009741635, |
|
"grad_norm": 1.4564232381895734, |
|
"learning_rate": 9.044566447549697e-08, |
|
"loss": 0.3935, |
|
"mean_token_accuracy": 0.843877837061882, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.9635747564591275, |
|
"grad_norm": 1.006395133879737, |
|
"learning_rate": 8.079267031102844e-08, |
|
"loss": 0.4379, |
|
"mean_token_accuracy": 0.8322035163640976, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.96569250317662, |
|
"grad_norm": 1.0451381392295622, |
|
"learning_rate": 7.16821836105841e-08, |
|
"loss": 0.3998, |
|
"mean_token_accuracy": 0.8473025262355804, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.9678102498941127, |
|
"grad_norm": 1.0472971428422386, |
|
"learning_rate": 6.311470265057518e-08, |
|
"loss": 0.423, |
|
"mean_token_accuracy": 0.8354467749595642, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.9699279966116052, |
|
"grad_norm": 1.1605199240395647, |
|
"learning_rate": 5.5090696009004744e-08, |
|
"loss": 0.4257, |
|
"mean_token_accuracy": 0.8360013753175736, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.9720457433290979, |
|
"grad_norm": 0.9898182837486158, |
|
"learning_rate": 4.761060253984151e-08, |
|
"loss": 0.4204, |
|
"mean_token_accuracy": 0.8367842882871628, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.9741634900465904, |
|
"grad_norm": 1.088987157568079, |
|
"learning_rate": 4.067483134901573e-08, |
|
"loss": 0.4134, |
|
"mean_token_accuracy": 0.83856400847435, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.976281236764083, |
|
"grad_norm": 1.0295706774122013, |
|
"learning_rate": 3.4283761772042623e-08, |
|
"loss": 0.4224, |
|
"mean_token_accuracy": 0.8354990780353546, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.9783989834815756, |
|
"grad_norm": 1.0694735478921555, |
|
"learning_rate": 2.84377433532812e-08, |
|
"loss": 0.4305, |
|
"mean_token_accuracy": 0.8316824287176132, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.9805167301990682, |
|
"grad_norm": 1.0827744380795652, |
|
"learning_rate": 2.3137095826809564e-08, |
|
"loss": 0.402, |
|
"mean_token_accuracy": 0.8404913783073426, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.9826344769165608, |
|
"grad_norm": 1.0960538876783272, |
|
"learning_rate": 1.8382109098944444e-08, |
|
"loss": 0.4352, |
|
"mean_token_accuracy": 0.8338410943746567, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.9847522236340533, |
|
"grad_norm": 1.1206569874331853, |
|
"learning_rate": 1.4173043232380557e-08, |
|
"loss": 0.4076, |
|
"mean_token_accuracy": 0.8435803085565567, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.986869970351546, |
|
"grad_norm": 1.0708342349134583, |
|
"learning_rate": 1.0510128431968635e-08, |
|
"loss": 0.4041, |
|
"mean_token_accuracy": 0.8435177773237228, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.9889877170690385, |
|
"grad_norm": 1.0195754299190762, |
|
"learning_rate": 7.3935650321255156e-09, |
|
"loss": 0.4017, |
|
"mean_token_accuracy": 0.8434190511703491, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.9911054637865311, |
|
"grad_norm": 1.043562362927536, |
|
"learning_rate": 4.823523485879556e-09, |
|
"loss": 0.4441, |
|
"mean_token_accuracy": 0.8331767469644547, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.9932232105040237, |
|
"grad_norm": 0.9692528305370003, |
|
"learning_rate": 2.800144355540324e-09, |
|
"loss": 0.4112, |
|
"mean_token_accuracy": 0.836205193400383, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.9953409572215163, |
|
"grad_norm": 0.9767634882260735, |
|
"learning_rate": 1.32353830502141e-09, |
|
"loss": 0.4233, |
|
"mean_token_accuracy": 0.8327444672584534, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.9974587039390089, |
|
"grad_norm": 1.1074445733229596, |
|
"learning_rate": 3.9378609377971335e-10, |
|
"loss": 0.3959, |
|
"mean_token_accuracy": 0.8446923106908798, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.9995764506565015, |
|
"grad_norm": 1.0245959330198788, |
|
"learning_rate": 1.0938572402308111e-11, |
|
"loss": 0.4106, |
|
"mean_token_accuracy": 0.8339618355035782, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"mean_token_accuracy": 0.890313521027565, |
|
"step": 2361, |
|
"total_flos": 451385831948288.0, |
|
"train_loss": 0.48239256052181206, |
|
"train_runtime": 37146.7848, |
|
"train_samples_per_second": 1.017, |
|
"train_steps_per_second": 0.064 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2361, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 451385831948288.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|