|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 200, |
|
"global_step": 3252, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0009225092250922509, |
|
"grad_norm": 9.216251979924113, |
|
"learning_rate": 0.0, |
|
"loss": 1.1386, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004612546125461255, |
|
"grad_norm": 9.265663963684933, |
|
"learning_rate": 2.45398773006135e-07, |
|
"loss": 1.1356, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00922509225092251, |
|
"grad_norm": 5.379773922138807, |
|
"learning_rate": 5.521472392638038e-07, |
|
"loss": 1.1078, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.013837638376383764, |
|
"grad_norm": 3.1833820798612313, |
|
"learning_rate": 8.588957055214725e-07, |
|
"loss": 1.0446, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01845018450184502, |
|
"grad_norm": 3.7414115093127966, |
|
"learning_rate": 1.165644171779141e-06, |
|
"loss": 1.027, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.023062730627306273, |
|
"grad_norm": 2.077966946821524, |
|
"learning_rate": 1.47239263803681e-06, |
|
"loss": 1.0126, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.027675276752767528, |
|
"grad_norm": 1.976442561168195, |
|
"learning_rate": 1.7791411042944787e-06, |
|
"loss": 0.9766, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03228782287822878, |
|
"grad_norm": 2.0558473198791685, |
|
"learning_rate": 2.085889570552147e-06, |
|
"loss": 0.9988, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.03690036900369004, |
|
"grad_norm": 2.0823124494571843, |
|
"learning_rate": 2.392638036809816e-06, |
|
"loss": 1.0026, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04151291512915129, |
|
"grad_norm": 1.8983417897768358, |
|
"learning_rate": 2.699386503067485e-06, |
|
"loss": 0.9786, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.046125461254612546, |
|
"grad_norm": 2.157480675022155, |
|
"learning_rate": 3.0061349693251535e-06, |
|
"loss": 0.9712, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0507380073800738, |
|
"grad_norm": 1.770274327543251, |
|
"learning_rate": 3.312883435582822e-06, |
|
"loss": 0.9703, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.055350553505535055, |
|
"grad_norm": 1.7607024339948523, |
|
"learning_rate": 3.6196319018404913e-06, |
|
"loss": 0.9819, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05996309963099631, |
|
"grad_norm": 1.7629489637063536, |
|
"learning_rate": 3.92638036809816e-06, |
|
"loss": 0.9814, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.06457564575645756, |
|
"grad_norm": 1.9708586699192496, |
|
"learning_rate": 4.233128834355829e-06, |
|
"loss": 0.9581, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.06918819188191883, |
|
"grad_norm": 1.8360638750389535, |
|
"learning_rate": 4.539877300613497e-06, |
|
"loss": 0.9631, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.07380073800738007, |
|
"grad_norm": 2.267503391793938, |
|
"learning_rate": 4.846625766871166e-06, |
|
"loss": 0.9544, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.07841328413284133, |
|
"grad_norm": 1.884337413377388, |
|
"learning_rate": 5.153374233128835e-06, |
|
"loss": 0.972, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.08302583025830258, |
|
"grad_norm": 1.8743154625885863, |
|
"learning_rate": 5.460122699386503e-06, |
|
"loss": 0.9697, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08763837638376384, |
|
"grad_norm": 2.1189085616099, |
|
"learning_rate": 5.766871165644172e-06, |
|
"loss": 0.9741, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.09225092250922509, |
|
"grad_norm": 2.212060434062884, |
|
"learning_rate": 6.073619631901841e-06, |
|
"loss": 0.9798, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09686346863468635, |
|
"grad_norm": 1.929193262402203, |
|
"learning_rate": 6.38036809815951e-06, |
|
"loss": 1.0036, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.1014760147601476, |
|
"grad_norm": 2.003660126750222, |
|
"learning_rate": 6.687116564417178e-06, |
|
"loss": 0.9857, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.10608856088560886, |
|
"grad_norm": 1.8476735746280122, |
|
"learning_rate": 6.993865030674847e-06, |
|
"loss": 0.991, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.11070110701107011, |
|
"grad_norm": 1.8581099924431381, |
|
"learning_rate": 7.300613496932516e-06, |
|
"loss": 0.971, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.11531365313653137, |
|
"grad_norm": 1.9178224374591495, |
|
"learning_rate": 7.6073619631901856e-06, |
|
"loss": 0.9944, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.11992619926199262, |
|
"grad_norm": 1.942003358498506, |
|
"learning_rate": 7.914110429447854e-06, |
|
"loss": 0.9674, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.12453874538745388, |
|
"grad_norm": 2.1197747747438154, |
|
"learning_rate": 8.220858895705522e-06, |
|
"loss": 0.9963, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.12915129151291513, |
|
"grad_norm": 1.8664824635600694, |
|
"learning_rate": 8.527607361963191e-06, |
|
"loss": 0.9926, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.13376383763837638, |
|
"grad_norm": 2.0058004450691347, |
|
"learning_rate": 8.83435582822086e-06, |
|
"loss": 0.9712, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.13837638376383765, |
|
"grad_norm": 1.7687908496159355, |
|
"learning_rate": 9.14110429447853e-06, |
|
"loss": 0.9593, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1429889298892989, |
|
"grad_norm": 2.245349299711709, |
|
"learning_rate": 9.447852760736197e-06, |
|
"loss": 1.0105, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.14760147601476015, |
|
"grad_norm": 2.3027490792265186, |
|
"learning_rate": 9.754601226993867e-06, |
|
"loss": 0.9674, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1522140221402214, |
|
"grad_norm": 1.7846972073040872, |
|
"learning_rate": 1.0061349693251534e-05, |
|
"loss": 0.9759, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.15682656826568267, |
|
"grad_norm": 2.3489880483587093, |
|
"learning_rate": 1.0368098159509204e-05, |
|
"loss": 0.9795, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.16143911439114392, |
|
"grad_norm": 1.8342564944945305, |
|
"learning_rate": 1.0674846625766873e-05, |
|
"loss": 0.9652, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.16605166051660517, |
|
"grad_norm": 2.0880839713049935, |
|
"learning_rate": 1.0981595092024542e-05, |
|
"loss": 0.9802, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1706642066420664, |
|
"grad_norm": 2.0564885394915584, |
|
"learning_rate": 1.1288343558282208e-05, |
|
"loss": 0.9879, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.1752767527675277, |
|
"grad_norm": 1.8924944592502544, |
|
"learning_rate": 1.1595092024539878e-05, |
|
"loss": 0.9721, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.17988929889298894, |
|
"grad_norm": 1.826625526770449, |
|
"learning_rate": 1.1901840490797547e-05, |
|
"loss": 0.9796, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.18450184501845018, |
|
"grad_norm": 2.204915779056281, |
|
"learning_rate": 1.2208588957055216e-05, |
|
"loss": 0.9838, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.18450184501845018, |
|
"eval_loss": 0.99369215965271, |
|
"eval_runtime": 539.2617, |
|
"eval_samples_per_second": 28.465, |
|
"eval_steps_per_second": 0.111, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.18911439114391143, |
|
"grad_norm": 1.9931165010626688, |
|
"learning_rate": 1.2515337423312886e-05, |
|
"loss": 0.9757, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.1937269372693727, |
|
"grad_norm": 1.9621352935552971, |
|
"learning_rate": 1.2822085889570552e-05, |
|
"loss": 0.9591, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.19833948339483395, |
|
"grad_norm": 2.2039217203491632, |
|
"learning_rate": 1.3128834355828221e-05, |
|
"loss": 0.9738, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.2029520295202952, |
|
"grad_norm": 1.8509141089097243, |
|
"learning_rate": 1.343558282208589e-05, |
|
"loss": 0.9966, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.20756457564575645, |
|
"grad_norm": 1.8414622043228284, |
|
"learning_rate": 1.374233128834356e-05, |
|
"loss": 1.0021, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.21217712177121772, |
|
"grad_norm": 1.80163089612214, |
|
"learning_rate": 1.4049079754601229e-05, |
|
"loss": 0.9831, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.21678966789667897, |
|
"grad_norm": 2.0579916481946983, |
|
"learning_rate": 1.4355828220858897e-05, |
|
"loss": 1.003, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.22140221402214022, |
|
"grad_norm": 2.17279455840179, |
|
"learning_rate": 1.4662576687116566e-05, |
|
"loss": 0.9951, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.22601476014760147, |
|
"grad_norm": 1.9598270944099394, |
|
"learning_rate": 1.4969325153374235e-05, |
|
"loss": 0.9963, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.23062730627306274, |
|
"grad_norm": 1.7398200214510018, |
|
"learning_rate": 1.5276073619631903e-05, |
|
"loss": 0.995, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.235239852398524, |
|
"grad_norm": 1.8787859113099807, |
|
"learning_rate": 1.5582822085889574e-05, |
|
"loss": 1.0017, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.23985239852398524, |
|
"grad_norm": 1.68690066175367, |
|
"learning_rate": 1.5889570552147238e-05, |
|
"loss": 1.0063, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2444649446494465, |
|
"grad_norm": 1.8827477836410789, |
|
"learning_rate": 1.619631901840491e-05, |
|
"loss": 1.0143, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.24907749077490776, |
|
"grad_norm": 1.7349433277324904, |
|
"learning_rate": 1.6503067484662577e-05, |
|
"loss": 1.0152, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.253690036900369, |
|
"grad_norm": 1.8180747657522185, |
|
"learning_rate": 1.6809815950920248e-05, |
|
"loss": 1.0022, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.25830258302583026, |
|
"grad_norm": 2.2464010609362735, |
|
"learning_rate": 1.7116564417177916e-05, |
|
"loss": 1.0131, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2629151291512915, |
|
"grad_norm": 1.659157693268852, |
|
"learning_rate": 1.7423312883435583e-05, |
|
"loss": 1.0029, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.26752767527675275, |
|
"grad_norm": 1.8259012389516431, |
|
"learning_rate": 1.7730061349693254e-05, |
|
"loss": 1.0149, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.272140221402214, |
|
"grad_norm": 1.7620006504062913, |
|
"learning_rate": 1.8036809815950922e-05, |
|
"loss": 1.0058, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.2767527675276753, |
|
"grad_norm": 1.8231421170705873, |
|
"learning_rate": 1.834355828220859e-05, |
|
"loss": 1.0157, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.28136531365313655, |
|
"grad_norm": 1.7640621041852131, |
|
"learning_rate": 1.8650306748466257e-05, |
|
"loss": 0.9917, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.2859778597785978, |
|
"grad_norm": 1.6216889282252938, |
|
"learning_rate": 1.8957055214723928e-05, |
|
"loss": 1.0176, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.29059040590405905, |
|
"grad_norm": 1.8784106605665698, |
|
"learning_rate": 1.9263803680981596e-05, |
|
"loss": 1.0092, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.2952029520295203, |
|
"grad_norm": 1.7812979855381206, |
|
"learning_rate": 1.9570552147239267e-05, |
|
"loss": 1.0122, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.29981549815498154, |
|
"grad_norm": 2.7924898972312375, |
|
"learning_rate": 1.9877300613496935e-05, |
|
"loss": 1.0214, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.3044280442804428, |
|
"grad_norm": 1.9196461092563228, |
|
"learning_rate": 1.999994812438719e-05, |
|
"loss": 1.037, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.30904059040590404, |
|
"grad_norm": 1.9675689878341092, |
|
"learning_rate": 1.9999631108702447e-05, |
|
"loss": 1.0322, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.31365313653136534, |
|
"grad_norm": 1.965168599936586, |
|
"learning_rate": 1.999902590624309e-05, |
|
"loss": 1.0217, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3182656826568266, |
|
"grad_norm": 2.0547134862700225, |
|
"learning_rate": 1.9998132534450893e-05, |
|
"loss": 1.0312, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.32287822878228783, |
|
"grad_norm": 1.8052849563747033, |
|
"learning_rate": 1.9996951019072605e-05, |
|
"loss": 1.0062, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3274907749077491, |
|
"grad_norm": 1.7264405471248967, |
|
"learning_rate": 1.999548139415919e-05, |
|
"loss": 1.0176, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.33210332103321033, |
|
"grad_norm": 3.9478371721075995, |
|
"learning_rate": 1.9993723702064852e-05, |
|
"loss": 1.0241, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3367158671586716, |
|
"grad_norm": 1.8206235491040932, |
|
"learning_rate": 1.9991677993445832e-05, |
|
"loss": 1.0172, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.3413284132841328, |
|
"grad_norm": 1.936656750339234, |
|
"learning_rate": 1.998934432725891e-05, |
|
"loss": 1.0395, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.3459409594095941, |
|
"grad_norm": 1.719646103111977, |
|
"learning_rate": 1.998672277075975e-05, |
|
"loss": 1.0242, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.3505535055350554, |
|
"grad_norm": 1.64985869025976, |
|
"learning_rate": 1.998381339950093e-05, |
|
"loss": 1.0168, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.3551660516605166, |
|
"grad_norm": 1.5355798843409723, |
|
"learning_rate": 1.9980616297329764e-05, |
|
"loss": 1.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.35977859778597787, |
|
"grad_norm": 1.530082699368783, |
|
"learning_rate": 1.997713155638592e-05, |
|
"loss": 1.0086, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3643911439114391, |
|
"grad_norm": 1.803426492693465, |
|
"learning_rate": 1.997335927709872e-05, |
|
"loss": 1.0318, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.36900369003690037, |
|
"grad_norm": 1.7841711948275436, |
|
"learning_rate": 1.9969299568184276e-05, |
|
"loss": 1.0162, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.36900369003690037, |
|
"eval_loss": 1.032899022102356, |
|
"eval_runtime": 476.7218, |
|
"eval_samples_per_second": 32.199, |
|
"eval_steps_per_second": 0.126, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3736162361623616, |
|
"grad_norm": 1.750763339163792, |
|
"learning_rate": 1.996495254664235e-05, |
|
"loss": 1.0238, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.37822878228782286, |
|
"grad_norm": 2.084259407603671, |
|
"learning_rate": 1.996031833775297e-05, |
|
"loss": 1.0144, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3828413284132841, |
|
"grad_norm": 1.581188361061769, |
|
"learning_rate": 1.995539707507284e-05, |
|
"loss": 1.0034, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.3874538745387454, |
|
"grad_norm": 1.6642375432029033, |
|
"learning_rate": 1.9950188900431464e-05, |
|
"loss": 1.0452, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.39206642066420666, |
|
"grad_norm": 1.90454885480689, |
|
"learning_rate": 1.9944693963927092e-05, |
|
"loss": 1.0156, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.3966789667896679, |
|
"grad_norm": 1.9076933400032088, |
|
"learning_rate": 1.9938912423922368e-05, |
|
"loss": 1.0243, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.40129151291512916, |
|
"grad_norm": 1.5419487099448481, |
|
"learning_rate": 1.9932844447039775e-05, |
|
"loss": 1.0036, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.4059040590405904, |
|
"grad_norm": 1.5469597080761535, |
|
"learning_rate": 1.992649020815683e-05, |
|
"loss": 1.0216, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.41051660516605165, |
|
"grad_norm": 1.727534201068121, |
|
"learning_rate": 1.991984989040105e-05, |
|
"loss": 1.023, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.4151291512915129, |
|
"grad_norm": 1.5601876251457003, |
|
"learning_rate": 1.9912923685144673e-05, |
|
"loss": 1.0309, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.41974169741697415, |
|
"grad_norm": 1.5739530073618044, |
|
"learning_rate": 1.9905711791999135e-05, |
|
"loss": 1.009, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.42435424354243545, |
|
"grad_norm": 1.676935183830041, |
|
"learning_rate": 1.989821441880933e-05, |
|
"loss": 1.01, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.4289667896678967, |
|
"grad_norm": 1.6489937405447432, |
|
"learning_rate": 1.98904317816476e-05, |
|
"loss": 1.023, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.43357933579335795, |
|
"grad_norm": 1.5735733105955712, |
|
"learning_rate": 1.9882364104807536e-05, |
|
"loss": 1.0348, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.4381918819188192, |
|
"grad_norm": 1.549624253561804, |
|
"learning_rate": 1.9874011620797494e-05, |
|
"loss": 1.0302, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.44280442804428044, |
|
"grad_norm": 1.5401331906305855, |
|
"learning_rate": 1.9865374570333887e-05, |
|
"loss": 1.0217, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.4474169741697417, |
|
"grad_norm": 1.5202857812071402, |
|
"learning_rate": 1.9856453202334277e-05, |
|
"loss": 1.0388, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.45202952029520294, |
|
"grad_norm": 1.7005961483689318, |
|
"learning_rate": 1.9847247773910176e-05, |
|
"loss": 1.0167, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4566420664206642, |
|
"grad_norm": 1.7121845874238086, |
|
"learning_rate": 1.9837758550359637e-05, |
|
"loss": 1.0041, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.4612546125461255, |
|
"grad_norm": 1.6923734480662047, |
|
"learning_rate": 1.9827985805159626e-05, |
|
"loss": 1.0378, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.46586715867158673, |
|
"grad_norm": 1.756497601027887, |
|
"learning_rate": 1.981792981995812e-05, |
|
"loss": 1.0148, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.470479704797048, |
|
"grad_norm": 1.6043365874775524, |
|
"learning_rate": 1.980759088456601e-05, |
|
"loss": 1.0306, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.47509225092250923, |
|
"grad_norm": 1.5593042352189914, |
|
"learning_rate": 1.9796969296948723e-05, |
|
"loss": 1.0384, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.4797047970479705, |
|
"grad_norm": 1.8356964322880667, |
|
"learning_rate": 1.978606536321767e-05, |
|
"loss": 1.0277, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.4843173431734317, |
|
"grad_norm": 1.5388627493896347, |
|
"learning_rate": 1.9774879397621387e-05, |
|
"loss": 1.0089, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.488929889298893, |
|
"grad_norm": 1.5717047981647194, |
|
"learning_rate": 1.9763411722536503e-05, |
|
"loss": 1.0206, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.4935424354243542, |
|
"grad_norm": 1.673580231538279, |
|
"learning_rate": 1.9751662668458434e-05, |
|
"loss": 1.0071, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.4981549815498155, |
|
"grad_norm": 1.977373456991288, |
|
"learning_rate": 1.9739632573991877e-05, |
|
"loss": 1.0223, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5027675276752768, |
|
"grad_norm": 1.6746529870758962, |
|
"learning_rate": 1.9727321785841028e-05, |
|
"loss": 1.0105, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.507380073800738, |
|
"grad_norm": 1.515127525498714, |
|
"learning_rate": 1.9714730658799616e-05, |
|
"loss": 1.0159, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5119926199261993, |
|
"grad_norm": 1.563050922669026, |
|
"learning_rate": 1.9701859555740647e-05, |
|
"loss": 1.026, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.5166051660516605, |
|
"grad_norm": 1.635916190051428, |
|
"learning_rate": 1.9688708847605977e-05, |
|
"loss": 1.0148, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5212177121771218, |
|
"grad_norm": 1.6077492732765468, |
|
"learning_rate": 1.9675278913395605e-05, |
|
"loss": 1.0126, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.525830258302583, |
|
"grad_norm": 1.6108748420566017, |
|
"learning_rate": 1.9661570140156746e-05, |
|
"loss": 1.0116, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5304428044280443, |
|
"grad_norm": 1.433325441089538, |
|
"learning_rate": 1.9647582922972696e-05, |
|
"loss": 1.012, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.5350553505535055, |
|
"grad_norm": 1.5420337235660757, |
|
"learning_rate": 1.9633317664951418e-05, |
|
"loss": 1.0122, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5396678966789668, |
|
"grad_norm": 1.5924684100513768, |
|
"learning_rate": 1.9618774777213954e-05, |
|
"loss": 1.0109, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.544280442804428, |
|
"grad_norm": 1.5313594203853436, |
|
"learning_rate": 1.960395467888255e-05, |
|
"loss": 1.0031, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5488929889298892, |
|
"grad_norm": 1.4655266545088188, |
|
"learning_rate": 1.9588857797068602e-05, |
|
"loss": 1.0315, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.5535055350553506, |
|
"grad_norm": 1.5909950744220547, |
|
"learning_rate": 1.957348456686032e-05, |
|
"loss": 1.0095, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5535055350553506, |
|
"eval_loss": 1.0302140712738037, |
|
"eval_runtime": 620.3005, |
|
"eval_samples_per_second": 24.746, |
|
"eval_steps_per_second": 0.097, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5581180811808119, |
|
"grad_norm": 1.7854466441111572, |
|
"learning_rate": 1.955783543131022e-05, |
|
"loss": 1.0181, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.5627306273062731, |
|
"grad_norm": 1.5564929489350854, |
|
"learning_rate": 1.9541910841422324e-05, |
|
"loss": 1.0259, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5673431734317343, |
|
"grad_norm": 1.4897983626795097, |
|
"learning_rate": 1.952571125613918e-05, |
|
"loss": 1.0108, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.5719557195571956, |
|
"grad_norm": 1.6487054053969497, |
|
"learning_rate": 1.9509237142328638e-05, |
|
"loss": 1.0217, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5765682656826568, |
|
"grad_norm": 1.55798491461706, |
|
"learning_rate": 1.949248897477038e-05, |
|
"loss": 1.0095, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.5811808118081181, |
|
"grad_norm": 1.5666282016035418, |
|
"learning_rate": 1.9475467236142252e-05, |
|
"loss": 1.0197, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.5857933579335793, |
|
"grad_norm": 1.5475535188970362, |
|
"learning_rate": 1.9458172417006347e-05, |
|
"loss": 1.029, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.5904059040590406, |
|
"grad_norm": 1.6315436927424156, |
|
"learning_rate": 1.944060501579487e-05, |
|
"loss": 1.0298, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.5950184501845018, |
|
"grad_norm": 1.468711297047145, |
|
"learning_rate": 1.9422765538795758e-05, |
|
"loss": 1.0018, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.5996309963099631, |
|
"grad_norm": 1.518389031402169, |
|
"learning_rate": 1.9404654500138117e-05, |
|
"loss": 1.0226, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6042435424354243, |
|
"grad_norm": 1.4592808647120747, |
|
"learning_rate": 1.938627242177738e-05, |
|
"loss": 1.0174, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.6088560885608856, |
|
"grad_norm": 1.480051570474581, |
|
"learning_rate": 1.936761983348028e-05, |
|
"loss": 1.0063, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.6134686346863468, |
|
"grad_norm": 1.5455098945055998, |
|
"learning_rate": 1.9348697272809568e-05, |
|
"loss": 1.0186, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.6180811808118081, |
|
"grad_norm": 1.467795849616486, |
|
"learning_rate": 1.9329505285108544e-05, |
|
"loss": 1.0223, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.6226937269372693, |
|
"grad_norm": 1.4583083150731897, |
|
"learning_rate": 1.9310044423485303e-05, |
|
"loss": 1.0188, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.6273062730627307, |
|
"grad_norm": 1.4520145261143442, |
|
"learning_rate": 1.9290315248796834e-05, |
|
"loss": 1.0148, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.6319188191881919, |
|
"grad_norm": 1.545625151246913, |
|
"learning_rate": 1.9270318329632833e-05, |
|
"loss": 1.0124, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.6365313653136532, |
|
"grad_norm": 1.5081286119716983, |
|
"learning_rate": 1.925005424229933e-05, |
|
"loss": 1.0122, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6411439114391144, |
|
"grad_norm": 1.5771614507281495, |
|
"learning_rate": 1.922952357080205e-05, |
|
"loss": 1.0304, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.6457564575645757, |
|
"grad_norm": 1.3712078687992697, |
|
"learning_rate": 1.9208726906829637e-05, |
|
"loss": 0.9935, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6503690036900369, |
|
"grad_norm": 1.514260367509165, |
|
"learning_rate": 1.9187664849736542e-05, |
|
"loss": 0.9928, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.6549815498154982, |
|
"grad_norm": 1.4337714275045377, |
|
"learning_rate": 1.9166338006525786e-05, |
|
"loss": 0.9999, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.6595940959409594, |
|
"grad_norm": 1.5474123891067624, |
|
"learning_rate": 1.9144746991831463e-05, |
|
"loss": 1.0136, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.6642066420664207, |
|
"grad_norm": 1.6417540187004078, |
|
"learning_rate": 1.9122892427901015e-05, |
|
"loss": 1.0148, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.6688191881918819, |
|
"grad_norm": 1.4429113958532114, |
|
"learning_rate": 1.9100774944577303e-05, |
|
"loss": 1.0054, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.6734317343173432, |
|
"grad_norm": 1.5435502423447707, |
|
"learning_rate": 1.907839517928046e-05, |
|
"loss": 1.0042, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.6780442804428044, |
|
"grad_norm": 1.6291068946061023, |
|
"learning_rate": 1.9055753776989516e-05, |
|
"loss": 1.0095, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.6826568265682657, |
|
"grad_norm": 1.5269546110275527, |
|
"learning_rate": 1.903285139022381e-05, |
|
"loss": 1.0091, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.6872693726937269, |
|
"grad_norm": 1.6910985232351008, |
|
"learning_rate": 1.900968867902419e-05, |
|
"loss": 1.0105, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.6918819188191881, |
|
"grad_norm": 1.473141009294655, |
|
"learning_rate": 1.898626631093399e-05, |
|
"loss": 1.0016, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.6964944649446494, |
|
"grad_norm": 1.6512202388953925, |
|
"learning_rate": 1.896258496097977e-05, |
|
"loss": 1.0119, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.7011070110701108, |
|
"grad_norm": 1.4887026369918788, |
|
"learning_rate": 1.8938645311651904e-05, |
|
"loss": 1.0087, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.705719557195572, |
|
"grad_norm": 1.3843944368722685, |
|
"learning_rate": 1.891444805288487e-05, |
|
"loss": 1.0091, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.7103321033210332, |
|
"grad_norm": 1.428126710831411, |
|
"learning_rate": 1.888999388203739e-05, |
|
"loss": 1.0059, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.7149446494464945, |
|
"grad_norm": 1.4390802486166878, |
|
"learning_rate": 1.8865283503872325e-05, |
|
"loss": 0.9994, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.7195571955719557, |
|
"grad_norm": 1.5690316004271283, |
|
"learning_rate": 1.884031763053636e-05, |
|
"loss": 0.9996, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.724169741697417, |
|
"grad_norm": 1.5021545547633912, |
|
"learning_rate": 1.8815096981539494e-05, |
|
"loss": 0.9991, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.7287822878228782, |
|
"grad_norm": 1.4900671252584468, |
|
"learning_rate": 1.8789622283734283e-05, |
|
"loss": 1.0101, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.7333948339483395, |
|
"grad_norm": 1.630926420050374, |
|
"learning_rate": 1.8763894271294914e-05, |
|
"loss": 0.9929, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.7380073800738007, |
|
"grad_norm": 1.3834521402990088, |
|
"learning_rate": 1.873791368569603e-05, |
|
"loss": 0.9857, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.7380073800738007, |
|
"eval_loss": 1.0203672647476196, |
|
"eval_runtime": 417.6477, |
|
"eval_samples_per_second": 36.753, |
|
"eval_steps_per_second": 0.144, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.742619926199262, |
|
"grad_norm": 1.4145701317152546, |
|
"learning_rate": 1.8711681275691366e-05, |
|
"loss": 1.0197, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.7472324723247232, |
|
"grad_norm": 1.6689498503071274, |
|
"learning_rate": 1.868519779729218e-05, |
|
"loss": 1.0399, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.7518450184501845, |
|
"grad_norm": 1.5473797400416536, |
|
"learning_rate": 1.8658464013745443e-05, |
|
"loss": 1.0189, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.7564575645756457, |
|
"grad_norm": 1.5062087120542231, |
|
"learning_rate": 1.8631480695511866e-05, |
|
"loss": 1.0154, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.761070110701107, |
|
"grad_norm": 1.4715819927221123, |
|
"learning_rate": 1.8604248620243682e-05, |
|
"loss": 0.9923, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.7656826568265682, |
|
"grad_norm": 1.5582751445765881, |
|
"learning_rate": 1.8576768572762233e-05, |
|
"loss": 1.0035, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.7702952029520295, |
|
"grad_norm": 1.5849009070973952, |
|
"learning_rate": 1.8549041345035354e-05, |
|
"loss": 1.013, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.7749077490774908, |
|
"grad_norm": 1.53624381630094, |
|
"learning_rate": 1.8521067736154567e-05, |
|
"loss": 1.0212, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.7795202952029521, |
|
"grad_norm": 1.5482989884986487, |
|
"learning_rate": 1.8492848552312016e-05, |
|
"loss": 0.9879, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.7841328413284133, |
|
"grad_norm": 1.4110912591448512, |
|
"learning_rate": 1.8464384606777258e-05, |
|
"loss": 0.9973, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.7887453874538746, |
|
"grad_norm": 1.4605506515813482, |
|
"learning_rate": 1.8435676719873828e-05, |
|
"loss": 1.0007, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.7933579335793358, |
|
"grad_norm": 1.4855314358619898, |
|
"learning_rate": 1.8406725718955575e-05, |
|
"loss": 0.9921, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.7979704797047971, |
|
"grad_norm": 1.4175285986471877, |
|
"learning_rate": 1.837753243838283e-05, |
|
"loss": 0.9947, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.8025830258302583, |
|
"grad_norm": 1.5124089244421277, |
|
"learning_rate": 1.834809771949837e-05, |
|
"loss": 1.0007, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.8071955719557196, |
|
"grad_norm": 1.6185245778463926, |
|
"learning_rate": 1.8318422410603162e-05, |
|
"loss": 1.0005, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.8118081180811808, |
|
"grad_norm": 1.74728705302942, |
|
"learning_rate": 1.8288507366931907e-05, |
|
"loss": 0.9977, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.816420664206642, |
|
"grad_norm": 1.3700788841960891, |
|
"learning_rate": 1.8258353450628402e-05, |
|
"loss": 0.9953, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.8210332103321033, |
|
"grad_norm": 1.448030688939384, |
|
"learning_rate": 1.8227961530720696e-05, |
|
"loss": 0.9927, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.8256457564575646, |
|
"grad_norm": 1.3860802683248008, |
|
"learning_rate": 1.819733248309604e-05, |
|
"loss": 1.0137, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.8302583025830258, |
|
"grad_norm": 1.3807867910302518, |
|
"learning_rate": 1.816646719047563e-05, |
|
"loss": 0.9985, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.834870848708487, |
|
"grad_norm": 1.5246405968433066, |
|
"learning_rate": 1.8135366542389202e-05, |
|
"loss": 0.9965, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.8394833948339483, |
|
"grad_norm": 1.5298770681063796, |
|
"learning_rate": 1.8104031435149366e-05, |
|
"loss": 0.9895, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.8440959409594095, |
|
"grad_norm": 1.5737628456798298, |
|
"learning_rate": 1.807246277182578e-05, |
|
"loss": 1.0016, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.8487084870848709, |
|
"grad_norm": 1.3867737980631065, |
|
"learning_rate": 1.8040661462219135e-05, |
|
"loss": 0.9905, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.8533210332103321, |
|
"grad_norm": 1.4453648478128216, |
|
"learning_rate": 1.8008628422834923e-05, |
|
"loss": 1.0005, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.8579335793357934, |
|
"grad_norm": 1.612991010163291, |
|
"learning_rate": 1.797636457685703e-05, |
|
"loss": 0.9915, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.8625461254612546, |
|
"grad_norm": 1.4214188542336526, |
|
"learning_rate": 1.7943870854121126e-05, |
|
"loss": 0.9822, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.8671586715867159, |
|
"grad_norm": 1.5619301079989365, |
|
"learning_rate": 1.791114819108788e-05, |
|
"loss": 0.9781, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.8717712177121771, |
|
"grad_norm": 1.575000171763216, |
|
"learning_rate": 1.787819753081594e-05, |
|
"loss": 1.0021, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.8763837638376384, |
|
"grad_norm": 1.5295894907745344, |
|
"learning_rate": 1.784501982293479e-05, |
|
"loss": 1.0077, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.8809963099630996, |
|
"grad_norm": 1.455437508846778, |
|
"learning_rate": 1.781161602361737e-05, |
|
"loss": 0.9757, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.8856088560885609, |
|
"grad_norm": 1.4641943856542534, |
|
"learning_rate": 1.7777987095552512e-05, |
|
"loss": 0.9918, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.8902214022140221, |
|
"grad_norm": 1.56760438004586, |
|
"learning_rate": 1.7744134007917195e-05, |
|
"loss": 0.9952, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.8948339483394834, |
|
"grad_norm": 1.4881030883207977, |
|
"learning_rate": 1.7710057736348622e-05, |
|
"loss": 0.9995, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.8994464944649446, |
|
"grad_norm": 1.4138210092484749, |
|
"learning_rate": 1.7675759262916105e-05, |
|
"loss": 0.9814, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.9040590405904059, |
|
"grad_norm": 1.5269872502666184, |
|
"learning_rate": 1.764123957609275e-05, |
|
"loss": 0.9969, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.9086715867158671, |
|
"grad_norm": 1.4777357727162057, |
|
"learning_rate": 1.7606499670726972e-05, |
|
"loss": 0.9922, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.9132841328413284, |
|
"grad_norm": 1.5422388516772692, |
|
"learning_rate": 1.7571540548013836e-05, |
|
"loss": 0.9946, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.9178966789667896, |
|
"grad_norm": 1.519307824816888, |
|
"learning_rate": 1.753636321546619e-05, |
|
"loss": 0.9966, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.922509225092251, |
|
"grad_norm": 1.4264538885727793, |
|
"learning_rate": 1.7500968686885634e-05, |
|
"loss": 0.9803, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.922509225092251, |
|
"eval_loss": 1.0050979852676392, |
|
"eval_runtime": 475.0904, |
|
"eval_samples_per_second": 32.31, |
|
"eval_steps_per_second": 0.126, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.9271217712177122, |
|
"grad_norm": 1.5178129589503198, |
|
"learning_rate": 1.7465357982333294e-05, |
|
"loss": 0.9965, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.9317343173431735, |
|
"grad_norm": 1.4749449313002256, |
|
"learning_rate": 1.742953212810045e-05, |
|
"loss": 0.998, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.9363468634686347, |
|
"grad_norm": 1.4826220358510274, |
|
"learning_rate": 1.739349215667891e-05, |
|
"loss": 0.9829, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.940959409594096, |
|
"grad_norm": 1.4366615526126456, |
|
"learning_rate": 1.735723910673132e-05, |
|
"loss": 0.9847, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.9455719557195572, |
|
"grad_norm": 1.4260729595159904, |
|
"learning_rate": 1.732077402306116e-05, |
|
"loss": 0.986, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.9501845018450185, |
|
"grad_norm": 1.5039785990003167, |
|
"learning_rate": 1.7284097956582694e-05, |
|
"loss": 0.9745, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.9547970479704797, |
|
"grad_norm": 1.5082306367621992, |
|
"learning_rate": 1.7247211964290635e-05, |
|
"loss": 0.9966, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.959409594095941, |
|
"grad_norm": 1.4449103838652617, |
|
"learning_rate": 1.721011710922972e-05, |
|
"loss": 0.969, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.9640221402214022, |
|
"grad_norm": 1.493303736853594, |
|
"learning_rate": 1.717281446046404e-05, |
|
"loss": 0.9861, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.9686346863468634, |
|
"grad_norm": 1.47859930222641, |
|
"learning_rate": 1.713530509304627e-05, |
|
"loss": 0.9962, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.9732472324723247, |
|
"grad_norm": 1.4442728791265258, |
|
"learning_rate": 1.709759008798663e-05, |
|
"loss": 0.9902, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.977859778597786, |
|
"grad_norm": 1.4376492964532295, |
|
"learning_rate": 1.7059670532221802e-05, |
|
"loss": 0.9831, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.9824723247232472, |
|
"grad_norm": 2.484316021155763, |
|
"learning_rate": 1.7021547518583536e-05, |
|
"loss": 0.9813, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.9870848708487084, |
|
"grad_norm": 1.4464291734707186, |
|
"learning_rate": 1.6983222145767198e-05, |
|
"loss": 0.9902, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.9916974169741697, |
|
"grad_norm": 1.5281863553882298, |
|
"learning_rate": 1.6944695518300087e-05, |
|
"loss": 0.9807, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.996309963099631, |
|
"grad_norm": 1.4201117731708275, |
|
"learning_rate": 1.6905968746509618e-05, |
|
"loss": 0.9746, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.0009225092250922, |
|
"grad_norm": 2.838028209786232, |
|
"learning_rate": 1.6867042946491306e-05, |
|
"loss": 0.9546, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 1.0055350553505535, |
|
"grad_norm": 2.0637438515793582, |
|
"learning_rate": 1.6827919240076612e-05, |
|
"loss": 0.7562, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.0101476014760147, |
|
"grad_norm": 2.03888870440938, |
|
"learning_rate": 1.6788598754800602e-05, |
|
"loss": 0.7325, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 1.014760147601476, |
|
"grad_norm": 1.5943747459674837, |
|
"learning_rate": 1.6749082623869465e-05, |
|
"loss": 0.7403, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.0193726937269372, |
|
"grad_norm": 1.705271980777924, |
|
"learning_rate": 1.6709371986127846e-05, |
|
"loss": 0.749, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 1.0239852398523985, |
|
"grad_norm": 1.7629595253417687, |
|
"learning_rate": 1.6669467986026012e-05, |
|
"loss": 0.7087, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.0285977859778597, |
|
"grad_norm": 1.6633137977760193, |
|
"learning_rate": 1.662937177358691e-05, |
|
"loss": 0.7394, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 1.033210332103321, |
|
"grad_norm": 1.5074327799371463, |
|
"learning_rate": 1.6589084504372975e-05, |
|
"loss": 0.7164, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.0378228782287824, |
|
"grad_norm": 1.6191564800390028, |
|
"learning_rate": 1.6548607339452853e-05, |
|
"loss": 0.7251, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.0424354243542435, |
|
"grad_norm": 1.6118486061526904, |
|
"learning_rate": 1.6507941445367935e-05, |
|
"loss": 0.7317, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.0470479704797049, |
|
"grad_norm": 1.6655447135885555, |
|
"learning_rate": 1.6467087994098753e-05, |
|
"loss": 0.7439, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 1.051660516605166, |
|
"grad_norm": 1.8659877314188116, |
|
"learning_rate": 1.6426048163031155e-05, |
|
"loss": 0.7311, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.0562730627306274, |
|
"grad_norm": 1.6691346554473123, |
|
"learning_rate": 1.6384823134922444e-05, |
|
"loss": 0.7304, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 1.0608856088560885, |
|
"grad_norm": 1.6067875332998411, |
|
"learning_rate": 1.634341409786723e-05, |
|
"loss": 0.7239, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.0654981549815499, |
|
"grad_norm": 1.7410755263145463, |
|
"learning_rate": 1.6301822245263212e-05, |
|
"loss": 0.7339, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 1.070110701107011, |
|
"grad_norm": 1.5078513231888042, |
|
"learning_rate": 1.6260048775776804e-05, |
|
"loss": 0.7344, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.0747232472324724, |
|
"grad_norm": 1.6250331925877979, |
|
"learning_rate": 1.6218094893308553e-05, |
|
"loss": 0.7418, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 1.0793357933579335, |
|
"grad_norm": 1.6654181563693542, |
|
"learning_rate": 1.6175961806958476e-05, |
|
"loss": 0.7265, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.0839483394833949, |
|
"grad_norm": 1.7420998771046359, |
|
"learning_rate": 1.6133650730991183e-05, |
|
"loss": 0.723, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.088560885608856, |
|
"grad_norm": 1.6483689819198597, |
|
"learning_rate": 1.609116288480092e-05, |
|
"loss": 0.7316, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.0931734317343174, |
|
"grad_norm": 1.593740794425406, |
|
"learning_rate": 1.6048499492876378e-05, |
|
"loss": 0.7374, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 1.0977859778597785, |
|
"grad_norm": 1.5370001403119866, |
|
"learning_rate": 1.6005661784765453e-05, |
|
"loss": 0.7457, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.1023985239852399, |
|
"grad_norm": 1.4718068363327683, |
|
"learning_rate": 1.5962650995039783e-05, |
|
"loss": 0.7328, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 1.1070110701107012, |
|
"grad_norm": 1.5301401714407428, |
|
"learning_rate": 1.5919468363259164e-05, |
|
"loss": 0.736, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.1070110701107012, |
|
"eval_loss": 1.0061343908309937, |
|
"eval_runtime": 439.8508, |
|
"eval_samples_per_second": 34.898, |
|
"eval_steps_per_second": 0.136, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.1116236162361623, |
|
"grad_norm": 1.7064428647610237, |
|
"learning_rate": 1.587611513393585e-05, |
|
"loss": 0.7297, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 1.1162361623616237, |
|
"grad_norm": 1.6208328161309395, |
|
"learning_rate": 1.5832592556498657e-05, |
|
"loss": 0.7346, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.1208487084870848, |
|
"grad_norm": 1.802694495701501, |
|
"learning_rate": 1.5788901885256983e-05, |
|
"loss": 0.7365, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 1.1254612546125462, |
|
"grad_norm": 1.5370188196224415, |
|
"learning_rate": 1.5745044379364637e-05, |
|
"loss": 0.7305, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.1300738007380073, |
|
"grad_norm": 1.4889078253244556, |
|
"learning_rate": 1.5701021302783557e-05, |
|
"loss": 0.732, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.1346863468634687, |
|
"grad_norm": 1.6294276521184954, |
|
"learning_rate": 1.56568339242474e-05, |
|
"loss": 0.7276, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.1392988929889298, |
|
"grad_norm": 1.5397859025285652, |
|
"learning_rate": 1.5612483517224942e-05, |
|
"loss": 0.7354, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 1.1439114391143912, |
|
"grad_norm": 1.4953485038562, |
|
"learning_rate": 1.556797135988342e-05, |
|
"loss": 0.7173, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.1485239852398523, |
|
"grad_norm": 1.853185392904802, |
|
"learning_rate": 1.5523298735051657e-05, |
|
"loss": 0.7489, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 1.1531365313653137, |
|
"grad_norm": 1.5704541475489389, |
|
"learning_rate": 1.5478466930183107e-05, |
|
"loss": 0.7191, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.1577490774907748, |
|
"grad_norm": 1.5415559777438193, |
|
"learning_rate": 1.5433477237318765e-05, |
|
"loss": 0.7327, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 1.1623616236162362, |
|
"grad_norm": 1.6666092802375732, |
|
"learning_rate": 1.5388330953049907e-05, |
|
"loss": 0.7473, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.1669741697416973, |
|
"grad_norm": 1.8791358127374613, |
|
"learning_rate": 1.5343029378480733e-05, |
|
"loss": 0.7312, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 1.1715867158671587, |
|
"grad_norm": 1.5841817509277247, |
|
"learning_rate": 1.5297573819190873e-05, |
|
"loss": 0.7416, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.17619926199262, |
|
"grad_norm": 1.5715763468516226, |
|
"learning_rate": 1.5251965585197748e-05, |
|
"loss": 0.7307, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 1.1808118081180812, |
|
"grad_norm": 1.5391235781858166, |
|
"learning_rate": 1.5206205990918836e-05, |
|
"loss": 0.7212, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.1854243542435423, |
|
"grad_norm": 1.5629140884896369, |
|
"learning_rate": 1.5160296355133773e-05, |
|
"loss": 0.7312, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 1.1900369003690037, |
|
"grad_norm": 5.396412006514146, |
|
"learning_rate": 1.5114238000946353e-05, |
|
"loss": 0.7141, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.194649446494465, |
|
"grad_norm": 1.5426421143956044, |
|
"learning_rate": 1.50680322557464e-05, |
|
"loss": 0.7308, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 1.1992619926199262, |
|
"grad_norm": 1.6701861441853627, |
|
"learning_rate": 1.5021680451171499e-05, |
|
"loss": 0.7415, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.2038745387453875, |
|
"grad_norm": 1.5779190034035813, |
|
"learning_rate": 1.4975183923068637e-05, |
|
"loss": 0.7302, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 1.2084870848708487, |
|
"grad_norm": 1.619587352430737, |
|
"learning_rate": 1.492854401145569e-05, |
|
"loss": 0.7318, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.21309963099631, |
|
"grad_norm": 1.619976635054261, |
|
"learning_rate": 1.4881762060482814e-05, |
|
"loss": 0.7254, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 1.2177121771217712, |
|
"grad_norm": 1.5945364409345257, |
|
"learning_rate": 1.48348394183937e-05, |
|
"loss": 0.7402, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.2223247232472325, |
|
"grad_norm": 1.5229514773361725, |
|
"learning_rate": 1.4787777437486723e-05, |
|
"loss": 0.7367, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 1.2269372693726937, |
|
"grad_norm": 2.0971835619796932, |
|
"learning_rate": 1.4740577474075963e-05, |
|
"loss": 0.7416, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.231549815498155, |
|
"grad_norm": 1.5602315222575482, |
|
"learning_rate": 1.4693240888452121e-05, |
|
"loss": 0.7375, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 1.2361623616236161, |
|
"grad_norm": 2.090838147725305, |
|
"learning_rate": 1.4645769044843318e-05, |
|
"loss": 0.7375, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.2407749077490775, |
|
"grad_norm": 1.631256144537058, |
|
"learning_rate": 1.459816331137577e-05, |
|
"loss": 0.7463, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 1.2453874538745389, |
|
"grad_norm": 1.6259793781417131, |
|
"learning_rate": 1.4550425060034367e-05, |
|
"loss": 0.7237, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.5839497005284708, |
|
"learning_rate": 1.450255566662313e-05, |
|
"loss": 0.7267, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 1.2546125461254611, |
|
"grad_norm": 1.4817515646858677, |
|
"learning_rate": 1.4454556510725556e-05, |
|
"loss": 0.7384, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.2592250922509225, |
|
"grad_norm": 1.619889890472081, |
|
"learning_rate": 1.4406428975664875e-05, |
|
"loss": 0.7445, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 1.2638376383763839, |
|
"grad_norm": 1.5742411445585174, |
|
"learning_rate": 1.4358174448464155e-05, |
|
"loss": 0.731, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.268450184501845, |
|
"grad_norm": 1.5691396578757213, |
|
"learning_rate": 1.4309794319806356e-05, |
|
"loss": 0.7445, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.2730627306273063, |
|
"grad_norm": 1.5149875801425627, |
|
"learning_rate": 1.4261289983994236e-05, |
|
"loss": 0.7265, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.2776752767527675, |
|
"grad_norm": 1.5087047437199383, |
|
"learning_rate": 1.421266283891017e-05, |
|
"loss": 0.7456, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 1.2822878228782288, |
|
"grad_norm": 1.5718051035987606, |
|
"learning_rate": 1.4163914285975863e-05, |
|
"loss": 0.7212, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.28690036900369, |
|
"grad_norm": 1.5655345894552062, |
|
"learning_rate": 1.411504573011197e-05, |
|
"loss": 0.7112, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.2915129151291513, |
|
"grad_norm": 1.6127771597600427, |
|
"learning_rate": 1.4066058579697593e-05, |
|
"loss": 0.7249, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.2915129151291513, |
|
"eval_loss": 1.000433087348938, |
|
"eval_runtime": 377.6786, |
|
"eval_samples_per_second": 40.643, |
|
"eval_steps_per_second": 0.159, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.2961254612546125, |
|
"grad_norm": 1.5789241025710268, |
|
"learning_rate": 1.4016954246529697e-05, |
|
"loss": 0.7284, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 1.3007380073800738, |
|
"grad_norm": 1.6370566010616505, |
|
"learning_rate": 1.3967734145782425e-05, |
|
"loss": 0.7233, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.305350553505535, |
|
"grad_norm": 1.6019988849536153, |
|
"learning_rate": 1.391839969596632e-05, |
|
"loss": 0.7305, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 1.3099630996309963, |
|
"grad_norm": 1.5321404428187482, |
|
"learning_rate": 1.3868952318887421e-05, |
|
"loss": 0.7161, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.3145756457564577, |
|
"grad_norm": 1.617468685697329, |
|
"learning_rate": 1.3819393439606313e-05, |
|
"loss": 0.7383, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.3191881918819188, |
|
"grad_norm": 1.4942275323967702, |
|
"learning_rate": 1.3769724486397035e-05, |
|
"loss": 0.7309, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.32380073800738, |
|
"grad_norm": 1.526992745554204, |
|
"learning_rate": 1.371994689070594e-05, |
|
"loss": 0.7241, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 1.3284132841328413, |
|
"grad_norm": 1.5223415766171715, |
|
"learning_rate": 1.3670062087110423e-05, |
|
"loss": 0.7369, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.3330258302583027, |
|
"grad_norm": 1.5349271200758632, |
|
"learning_rate": 1.362007151327758e-05, |
|
"loss": 0.7408, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 1.3376383763837638, |
|
"grad_norm": 1.5487172960940725, |
|
"learning_rate": 1.3569976609922785e-05, |
|
"loss": 0.7366, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.3422509225092252, |
|
"grad_norm": 1.4925683710432864, |
|
"learning_rate": 1.3519778820768157e-05, |
|
"loss": 0.7316, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 1.3468634686346863, |
|
"grad_norm": 1.508872342522196, |
|
"learning_rate": 1.3469479592500954e-05, |
|
"loss": 0.7282, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.3514760147601477, |
|
"grad_norm": 1.5205565120558908, |
|
"learning_rate": 1.3419080374731889e-05, |
|
"loss": 0.7361, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 1.3560885608856088, |
|
"grad_norm": 1.5925418525444446, |
|
"learning_rate": 1.3368582619953348e-05, |
|
"loss": 0.7314, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.3607011070110702, |
|
"grad_norm": 1.536022556446867, |
|
"learning_rate": 1.331798778349752e-05, |
|
"loss": 0.7297, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.3653136531365313, |
|
"grad_norm": 1.6415269938571113, |
|
"learning_rate": 1.326729732349447e-05, |
|
"loss": 0.7236, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.3699261992619927, |
|
"grad_norm": 1.5571914542665708, |
|
"learning_rate": 1.3216512700830104e-05, |
|
"loss": 0.7456, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 1.3745387453874538, |
|
"grad_norm": 1.537220055899943, |
|
"learning_rate": 1.3165635379104079e-05, |
|
"loss": 0.7283, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.3791512915129152, |
|
"grad_norm": 1.586303040039408, |
|
"learning_rate": 1.31146668245876e-05, |
|
"loss": 0.74, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 1.3837638376383765, |
|
"grad_norm": 1.599460902002627, |
|
"learning_rate": 1.3063608506181189e-05, |
|
"loss": 0.7269, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.3883763837638377, |
|
"grad_norm": 1.5845763646878845, |
|
"learning_rate": 1.3012461895372343e-05, |
|
"loss": 0.7207, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 1.3929889298892988, |
|
"grad_norm": 1.6101936272120416, |
|
"learning_rate": 1.2961228466193116e-05, |
|
"loss": 0.7491, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.3976014760147601, |
|
"grad_norm": 1.5328730912860027, |
|
"learning_rate": 1.2909909695177647e-05, |
|
"loss": 0.7428, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 1.4022140221402215, |
|
"grad_norm": 1.4700749106652151, |
|
"learning_rate": 1.28585070613196e-05, |
|
"loss": 0.7337, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.4068265682656826, |
|
"grad_norm": 1.595108354129352, |
|
"learning_rate": 1.2807022046029556e-05, |
|
"loss": 0.7476, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.4114391143911438, |
|
"grad_norm": 1.4846465873330463, |
|
"learning_rate": 1.2755456133092295e-05, |
|
"loss": 0.7471, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.4160516605166051, |
|
"grad_norm": 1.6404654691954998, |
|
"learning_rate": 1.2703810808624051e-05, |
|
"loss": 0.7338, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 1.4206642066420665, |
|
"grad_norm": 1.5924740161540634, |
|
"learning_rate": 1.2652087561029682e-05, |
|
"loss": 0.7349, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.4252767527675276, |
|
"grad_norm": 1.4942826903716253, |
|
"learning_rate": 1.2600287880959762e-05, |
|
"loss": 0.725, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 1.429889298892989, |
|
"grad_norm": 1.5398233388725502, |
|
"learning_rate": 1.254841326126764e-05, |
|
"loss": 0.7376, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.4345018450184501, |
|
"grad_norm": 1.5654070863650702, |
|
"learning_rate": 1.2496465196966393e-05, |
|
"loss": 0.7318, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 1.4391143911439115, |
|
"grad_norm": 1.5898873014454018, |
|
"learning_rate": 1.2444445185185763e-05, |
|
"loss": 0.7306, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.4437269372693726, |
|
"grad_norm": 1.5102337179405925, |
|
"learning_rate": 1.239235472512899e-05, |
|
"loss": 0.7057, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 1.448339483394834, |
|
"grad_norm": 1.5134306661913561, |
|
"learning_rate": 1.2340195318029623e-05, |
|
"loss": 0.7216, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.4529520295202953, |
|
"grad_norm": 1.4927818706889626, |
|
"learning_rate": 1.228796846710825e-05, |
|
"loss": 0.7402, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.4575645756457565, |
|
"grad_norm": 1.455448272385386, |
|
"learning_rate": 1.2235675677529158e-05, |
|
"loss": 0.7172, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.4621771217712176, |
|
"grad_norm": 1.5634881056548133, |
|
"learning_rate": 1.2183318456356984e-05, |
|
"loss": 0.7389, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 1.466789667896679, |
|
"grad_norm": 1.4898756090326564, |
|
"learning_rate": 1.2130898312513255e-05, |
|
"loss": 0.7378, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.4714022140221403, |
|
"grad_norm": 1.5009726152389429, |
|
"learning_rate": 1.2078416756732925e-05, |
|
"loss": 0.7235, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 1.4760147601476015, |
|
"grad_norm": 1.5101924554242023, |
|
"learning_rate": 1.2025875301520811e-05, |
|
"loss": 0.7355, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.4760147601476015, |
|
"eval_loss": 0.9855098724365234, |
|
"eval_runtime": 380.6287, |
|
"eval_samples_per_second": 40.328, |
|
"eval_steps_per_second": 0.158, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.4806273062730626, |
|
"grad_norm": 1.4634212318812496, |
|
"learning_rate": 1.1973275461108027e-05, |
|
"loss": 0.7252, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 1.485239852398524, |
|
"grad_norm": 1.6489180454948977, |
|
"learning_rate": 1.1920618751408328e-05, |
|
"loss": 0.7196, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.4898523985239853, |
|
"grad_norm": 1.5637692459131638, |
|
"learning_rate": 1.186790668997443e-05, |
|
"loss": 0.7292, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 1.4944649446494465, |
|
"grad_norm": 1.5196144325218592, |
|
"learning_rate": 1.1815140795954268e-05, |
|
"loss": 0.7317, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.4990774907749078, |
|
"grad_norm": 1.5589118343089332, |
|
"learning_rate": 1.176232259004722e-05, |
|
"loss": 0.7282, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.503690036900369, |
|
"grad_norm": 1.4776701136160202, |
|
"learning_rate": 1.1709453594460279e-05, |
|
"loss": 0.7142, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.5083025830258303, |
|
"grad_norm": 1.5113121821467062, |
|
"learning_rate": 1.165653533286418e-05, |
|
"loss": 0.7267, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 1.5129151291512914, |
|
"grad_norm": 1.5621038457306815, |
|
"learning_rate": 1.1603569330349502e-05, |
|
"loss": 0.7194, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.5175276752767528, |
|
"grad_norm": 1.4342503333735157, |
|
"learning_rate": 1.1550557113382697e-05, |
|
"loss": 0.732, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 1.5221402214022142, |
|
"grad_norm": 1.4925030679944211, |
|
"learning_rate": 1.1497500209762102e-05, |
|
"loss": 0.7311, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.5267527675276753, |
|
"grad_norm": 1.5884492115586761, |
|
"learning_rate": 1.1444400148573918e-05, |
|
"loss": 0.7306, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 1.5313653136531364, |
|
"grad_norm": 1.4907133011402547, |
|
"learning_rate": 1.1391258460148135e-05, |
|
"loss": 0.7291, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.5359778597785978, |
|
"grad_norm": 1.5177536049645275, |
|
"learning_rate": 1.1338076676014427e-05, |
|
"loss": 0.7243, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 1.5405904059040592, |
|
"grad_norm": 1.5089626714379156, |
|
"learning_rate": 1.1284856328858017e-05, |
|
"loss": 0.7174, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.5452029520295203, |
|
"grad_norm": 1.518407729563154, |
|
"learning_rate": 1.1231598952475504e-05, |
|
"loss": 0.7188, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 1.5498154981549814, |
|
"grad_norm": 2.2443040427872973, |
|
"learning_rate": 1.1178306081730666e-05, |
|
"loss": 0.7274, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.5544280442804428, |
|
"grad_norm": 1.557724202719455, |
|
"learning_rate": 1.1124979252510209e-05, |
|
"loss": 0.7306, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 1.5590405904059041, |
|
"grad_norm": 1.5461146750078991, |
|
"learning_rate": 1.1071620001679514e-05, |
|
"loss": 0.7265, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.5636531365313653, |
|
"grad_norm": 1.677798283188097, |
|
"learning_rate": 1.1018229867038358e-05, |
|
"loss": 0.7296, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 1.5682656826568264, |
|
"grad_norm": 1.5842640863093371, |
|
"learning_rate": 1.0964810387276561e-05, |
|
"loss": 0.7136, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.5728782287822878, |
|
"grad_norm": 1.563395389439852, |
|
"learning_rate": 1.0911363101929677e-05, |
|
"loss": 0.7244, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 1.5774907749077491, |
|
"grad_norm": 1.5223804974728257, |
|
"learning_rate": 1.085788955133461e-05, |
|
"loss": 0.7263, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.5821033210332103, |
|
"grad_norm": 1.4519511890413386, |
|
"learning_rate": 1.080439127658521e-05, |
|
"loss": 0.7125, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 1.5867158671586716, |
|
"grad_norm": 1.522533945377353, |
|
"learning_rate": 1.0750869819487884e-05, |
|
"loss": 0.7273, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.591328413284133, |
|
"grad_norm": 1.5254481413189622, |
|
"learning_rate": 1.0697326722517137e-05, |
|
"loss": 0.7278, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 1.5959409594095941, |
|
"grad_norm": 1.5231671586261868, |
|
"learning_rate": 1.0643763528771136e-05, |
|
"loss": 0.7395, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.6005535055350553, |
|
"grad_norm": 1.4650142883805686, |
|
"learning_rate": 1.0590181781927229e-05, |
|
"loss": 0.7349, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 1.6051660516605166, |
|
"grad_norm": 1.4963676540506488, |
|
"learning_rate": 1.0536583026197462e-05, |
|
"loss": 0.7227, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.609778597785978, |
|
"grad_norm": 1.568382581129352, |
|
"learning_rate": 1.0482968806284073e-05, |
|
"loss": 0.7104, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 1.6143911439114391, |
|
"grad_norm": 1.4263377420776584, |
|
"learning_rate": 1.042934066733497e-05, |
|
"loss": 0.7295, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.6190036900369003, |
|
"grad_norm": 1.557321427633267, |
|
"learning_rate": 1.0375700154899208e-05, |
|
"loss": 0.7221, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 1.6236162361623616, |
|
"grad_norm": 1.4697183106878222, |
|
"learning_rate": 1.0322048814882438e-05, |
|
"loss": 0.7137, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.628228782287823, |
|
"grad_norm": 1.5186029224528301, |
|
"learning_rate": 1.0268388193502365e-05, |
|
"loss": 0.7064, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 1.632841328413284, |
|
"grad_norm": 1.4447029146830694, |
|
"learning_rate": 1.0214719837244176e-05, |
|
"loss": 0.7288, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.6374538745387452, |
|
"grad_norm": 1.6070031997265373, |
|
"learning_rate": 1.0161045292815974e-05, |
|
"loss": 0.707, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 1.6420664206642066, |
|
"grad_norm": 1.426331730931973, |
|
"learning_rate": 1.010736610710421e-05, |
|
"loss": 0.709, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.646678966789668, |
|
"grad_norm": 1.452095892694617, |
|
"learning_rate": 1.0053683827129091e-05, |
|
"loss": 0.7121, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 1.651291512915129, |
|
"grad_norm": 1.57091551946505, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7134, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.6559040590405905, |
|
"grad_norm": 1.5750336048966571, |
|
"learning_rate": 9.946316172870909e-06, |
|
"loss": 0.7136, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 1.6605166051660518, |
|
"grad_norm": 1.4906719471502183, |
|
"learning_rate": 9.892633892895795e-06, |
|
"loss": 0.7151, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.6605166051660518, |
|
"eval_loss": 0.97125643491745, |
|
"eval_runtime": 375.0586, |
|
"eval_samples_per_second": 40.927, |
|
"eval_steps_per_second": 0.16, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.665129151291513, |
|
"grad_norm": 1.5072106225983386, |
|
"learning_rate": 9.83895470718403e-06, |
|
"loss": 0.7227, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 1.669741697416974, |
|
"grad_norm": 1.4508206944932882, |
|
"learning_rate": 9.785280162755825e-06, |
|
"loss": 0.724, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.6743542435424354, |
|
"grad_norm": 1.5498476457151749, |
|
"learning_rate": 9.731611806497637e-06, |
|
"loss": 0.7026, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 1.6789667896678968, |
|
"grad_norm": 1.5177452024002571, |
|
"learning_rate": 9.677951185117565e-06, |
|
"loss": 0.7129, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.683579335793358, |
|
"grad_norm": 1.7536225872991078, |
|
"learning_rate": 9.624299845100795e-06, |
|
"loss": 0.7157, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 1.688191881918819, |
|
"grad_norm": 1.5448154946820392, |
|
"learning_rate": 9.570659332665032e-06, |
|
"loss": 0.7029, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.6928044280442804, |
|
"grad_norm": 1.5229582975841924, |
|
"learning_rate": 9.51703119371593e-06, |
|
"loss": 0.7231, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 1.6974169741697418, |
|
"grad_norm": 1.401053250210059, |
|
"learning_rate": 9.463416973802541e-06, |
|
"loss": 0.6987, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.702029520295203, |
|
"grad_norm": 1.486158083271756, |
|
"learning_rate": 9.409818218072774e-06, |
|
"loss": 0.7187, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 1.706642066420664, |
|
"grad_norm": 1.4844923332820112, |
|
"learning_rate": 9.35623647122887e-06, |
|
"loss": 0.7038, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.7112546125461254, |
|
"grad_norm": 1.4768083176878029, |
|
"learning_rate": 9.302673277482867e-06, |
|
"loss": 0.7156, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.7158671586715868, |
|
"grad_norm": 1.4690802202313877, |
|
"learning_rate": 9.249130180512118e-06, |
|
"loss": 0.7007, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.720479704797048, |
|
"grad_norm": 1.521642869722996, |
|
"learning_rate": 9.19560872341479e-06, |
|
"loss": 0.7124, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 1.725092250922509, |
|
"grad_norm": 1.4844935676770985, |
|
"learning_rate": 9.142110448665394e-06, |
|
"loss": 0.7137, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.7297047970479706, |
|
"grad_norm": 1.4802119797626268, |
|
"learning_rate": 9.088636898070326e-06, |
|
"loss": 0.7142, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.7343173431734318, |
|
"grad_norm": 1.4925162048619054, |
|
"learning_rate": 9.035189612723444e-06, |
|
"loss": 0.7128, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.738929889298893, |
|
"grad_norm": 1.5072960944547247, |
|
"learning_rate": 8.981770132961649e-06, |
|
"loss": 0.7, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 1.7435424354243543, |
|
"grad_norm": 1.463889514395925, |
|
"learning_rate": 8.928379998320489e-06, |
|
"loss": 0.7057, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.7481549815498156, |
|
"grad_norm": 1.6039296690992704, |
|
"learning_rate": 8.875020747489795e-06, |
|
"loss": 0.7233, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 1.7527675276752768, |
|
"grad_norm": 1.5035319064665877, |
|
"learning_rate": 8.821693918269334e-06, |
|
"loss": 0.7049, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.757380073800738, |
|
"grad_norm": 1.4420897442537834, |
|
"learning_rate": 8.768401047524498e-06, |
|
"loss": 0.7097, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 1.7619926199261993, |
|
"grad_norm": 1.4863140306951776, |
|
"learning_rate": 8.715143671141985e-06, |
|
"loss": 0.7131, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.7666051660516606, |
|
"grad_norm": 1.4372944065533075, |
|
"learning_rate": 8.661923323985576e-06, |
|
"loss": 0.7066, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 1.7712177121771218, |
|
"grad_norm": 1.5118603952311795, |
|
"learning_rate": 8.60874153985187e-06, |
|
"loss": 0.711, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.775830258302583, |
|
"grad_norm": 1.4309158451109616, |
|
"learning_rate": 8.555599851426086e-06, |
|
"loss": 0.7017, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 1.7804428044280443, |
|
"grad_norm": 1.4482092213054845, |
|
"learning_rate": 8.5024997902379e-06, |
|
"loss": 0.7043, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.7850553505535056, |
|
"grad_norm": 1.4820015255772456, |
|
"learning_rate": 8.449442886617308e-06, |
|
"loss": 0.7134, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 1.7896678966789668, |
|
"grad_norm": 1.5178706892202136, |
|
"learning_rate": 8.396430669650501e-06, |
|
"loss": 0.6986, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.7942804428044279, |
|
"grad_norm": 1.5192714047507399, |
|
"learning_rate": 8.343464667135821e-06, |
|
"loss": 0.7098, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 1.7988929889298892, |
|
"grad_norm": 1.5584757005755172, |
|
"learning_rate": 8.290546405539726e-06, |
|
"loss": 0.7007, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.8035055350553506, |
|
"grad_norm": 1.4555072659251027, |
|
"learning_rate": 8.237677409952784e-06, |
|
"loss": 0.7069, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 1.8081180811808117, |
|
"grad_norm": 1.435117018138001, |
|
"learning_rate": 8.184859204045736e-06, |
|
"loss": 0.7126, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.812730627306273, |
|
"grad_norm": 1.497994099179169, |
|
"learning_rate": 8.132093310025572e-06, |
|
"loss": 0.6918, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 1.8173431734317345, |
|
"grad_norm": 1.5231919974046568, |
|
"learning_rate": 8.079381248591675e-06, |
|
"loss": 0.6999, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.8219557195571956, |
|
"grad_norm": 1.416052373581155, |
|
"learning_rate": 8.026724538891976e-06, |
|
"loss": 0.7007, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 1.8265682656826567, |
|
"grad_norm": 1.4579047643243503, |
|
"learning_rate": 7.974124698479192e-06, |
|
"loss": 0.6987, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.831180811808118, |
|
"grad_norm": 1.4768646334937987, |
|
"learning_rate": 7.921583243267079e-06, |
|
"loss": 0.721, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 1.8357933579335795, |
|
"grad_norm": 1.4193207071096974, |
|
"learning_rate": 7.869101687486748e-06, |
|
"loss": 0.6998, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.8404059040590406, |
|
"grad_norm": 1.5195143082878666, |
|
"learning_rate": 7.816681543643019e-06, |
|
"loss": 0.7035, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 1.8450184501845017, |
|
"grad_norm": 1.492818689804546, |
|
"learning_rate": 7.764324322470842e-06, |
|
"loss": 0.7023, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.8450184501845017, |
|
"eval_loss": 0.9556826949119568, |
|
"eval_runtime": 438.5581, |
|
"eval_samples_per_second": 35.001, |
|
"eval_steps_per_second": 0.137, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.849630996309963, |
|
"grad_norm": 1.4550020776516512, |
|
"learning_rate": 7.712031532891754e-06, |
|
"loss": 0.6959, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 1.8542435424354244, |
|
"grad_norm": 1.532357943989181, |
|
"learning_rate": 7.659804681970378e-06, |
|
"loss": 0.716, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.8588560885608856, |
|
"grad_norm": 1.4932493146047923, |
|
"learning_rate": 7.607645274871013e-06, |
|
"loss": 0.7103, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 1.8634686346863467, |
|
"grad_norm": 1.3849647891235997, |
|
"learning_rate": 7.555554814814243e-06, |
|
"loss": 0.7091, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.868081180811808, |
|
"grad_norm": 1.4686606572709258, |
|
"learning_rate": 7.50353480303361e-06, |
|
"loss": 0.7065, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 1.8726937269372694, |
|
"grad_norm": 1.4890927562018215, |
|
"learning_rate": 7.451586738732362e-06, |
|
"loss": 0.7045, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.8773062730627306, |
|
"grad_norm": 1.4388080292308094, |
|
"learning_rate": 7.3997121190402375e-06, |
|
"loss": 0.7062, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 1.881918819188192, |
|
"grad_norm": 1.595891571353621, |
|
"learning_rate": 7.347912438970324e-06, |
|
"loss": 0.693, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.8865313653136533, |
|
"grad_norm": 1.4244467986676035, |
|
"learning_rate": 7.296189191375953e-06, |
|
"loss": 0.6941, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 1.8911439114391144, |
|
"grad_norm": 1.4475135191897706, |
|
"learning_rate": 7.24454386690771e-06, |
|
"loss": 0.7073, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.8957564575645756, |
|
"grad_norm": 1.4673852319078244, |
|
"learning_rate": 7.192977953970448e-06, |
|
"loss": 0.7078, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 1.900369003690037, |
|
"grad_norm": 1.4719457932619668, |
|
"learning_rate": 7.141492938680401e-06, |
|
"loss": 0.691, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.9049815498154983, |
|
"grad_norm": 1.466731728037535, |
|
"learning_rate": 7.090090304822356e-06, |
|
"loss": 0.7062, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 1.9095940959409594, |
|
"grad_norm": 1.4553947793369755, |
|
"learning_rate": 7.038771533806884e-06, |
|
"loss": 0.7106, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.9142066420664205, |
|
"grad_norm": 1.478652231013823, |
|
"learning_rate": 6.9875381046276605e-06, |
|
"loss": 0.6931, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 1.918819188191882, |
|
"grad_norm": 1.4356929483984957, |
|
"learning_rate": 6.936391493818814e-06, |
|
"loss": 0.6898, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.9234317343173433, |
|
"grad_norm": 1.5536671032832632, |
|
"learning_rate": 6.885333175412406e-06, |
|
"loss": 0.6928, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 1.9280442804428044, |
|
"grad_norm": 1.4991276022393414, |
|
"learning_rate": 6.834364620895928e-06, |
|
"loss": 0.6935, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.9326568265682655, |
|
"grad_norm": 1.5046326188308679, |
|
"learning_rate": 6.783487299169897e-06, |
|
"loss": 0.6983, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 1.937269372693727, |
|
"grad_norm": 1.4351756608326394, |
|
"learning_rate": 6.732702676505531e-06, |
|
"loss": 0.7065, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.9418819188191883, |
|
"grad_norm": 1.5547998479904102, |
|
"learning_rate": 6.6820122165024845e-06, |
|
"loss": 0.6879, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 1.9464944649446494, |
|
"grad_norm": 1.49827559538925, |
|
"learning_rate": 6.631417380046656e-06, |
|
"loss": 0.7025, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.9511070110701108, |
|
"grad_norm": 1.531002649653087, |
|
"learning_rate": 6.580919625268114e-06, |
|
"loss": 0.6909, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 1.9557195571955721, |
|
"grad_norm": 1.509365230765324, |
|
"learning_rate": 6.530520407499049e-06, |
|
"loss": 0.686, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.9603321033210332, |
|
"grad_norm": 1.5743592553630588, |
|
"learning_rate": 6.480221179231849e-06, |
|
"loss": 0.7051, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 1.9649446494464944, |
|
"grad_norm": 1.6561005765337469, |
|
"learning_rate": 6.430023390077218e-06, |
|
"loss": 0.6975, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.9695571955719557, |
|
"grad_norm": 1.4695572898069678, |
|
"learning_rate": 6.379928486722421e-06, |
|
"loss": 0.703, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 1.974169741697417, |
|
"grad_norm": 1.436121247379392, |
|
"learning_rate": 6.329937912889582e-06, |
|
"loss": 0.7037, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.9787822878228782, |
|
"grad_norm": 1.4604394210363645, |
|
"learning_rate": 6.280053109294064e-06, |
|
"loss": 0.6861, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 1.9833948339483394, |
|
"grad_norm": 1.49703841483432, |
|
"learning_rate": 6.230275513602968e-06, |
|
"loss": 0.6848, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.9880073800738007, |
|
"grad_norm": 1.4776846632157035, |
|
"learning_rate": 6.180606560393694e-06, |
|
"loss": 0.6854, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 1.992619926199262, |
|
"grad_norm": 1.469102349555009, |
|
"learning_rate": 6.131047681112583e-06, |
|
"loss": 0.6901, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.9972324723247232, |
|
"grad_norm": 1.4916818504881257, |
|
"learning_rate": 6.081600304033682e-06, |
|
"loss": 0.6986, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 2.0018450184501844, |
|
"grad_norm": 3.4623791161329507, |
|
"learning_rate": 6.032265854217574e-06, |
|
"loss": 0.5805, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.006457564575646, |
|
"grad_norm": 2.5409394096245324, |
|
"learning_rate": 5.983045753470308e-06, |
|
"loss": 0.4067, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 2.011070110701107, |
|
"grad_norm": 1.963539215801178, |
|
"learning_rate": 5.933941420302412e-06, |
|
"loss": 0.41, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.015682656826568, |
|
"grad_norm": 1.9578493570683806, |
|
"learning_rate": 5.884954269888032e-06, |
|
"loss": 0.4078, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 2.0202952029520294, |
|
"grad_norm": 1.6857102876256314, |
|
"learning_rate": 5.83608571402414e-06, |
|
"loss": 0.4126, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.024907749077491, |
|
"grad_norm": 1.7969907397732796, |
|
"learning_rate": 5.787337161089836e-06, |
|
"loss": 0.4086, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 2.029520295202952, |
|
"grad_norm": 1.6058011608079648, |
|
"learning_rate": 5.738710016005766e-06, |
|
"loss": 0.3925, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.029520295202952, |
|
"eval_loss": 1.0149868726730347, |
|
"eval_runtime": 417.6368, |
|
"eval_samples_per_second": 36.754, |
|
"eval_steps_per_second": 0.144, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.034132841328413, |
|
"grad_norm": 1.6505939948003603, |
|
"learning_rate": 5.690205680193647e-06, |
|
"loss": 0.3948, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 2.0387453874538743, |
|
"grad_norm": 1.6068608464647989, |
|
"learning_rate": 5.641825551535849e-06, |
|
"loss": 0.3878, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.043357933579336, |
|
"grad_norm": 1.6707505723255622, |
|
"learning_rate": 5.593571024335126e-06, |
|
"loss": 0.3977, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 2.047970479704797, |
|
"grad_norm": 1.6484182975706831, |
|
"learning_rate": 5.545443489274444e-06, |
|
"loss": 0.4009, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.052583025830258, |
|
"grad_norm": 1.6146171821462916, |
|
"learning_rate": 5.497444333376874e-06, |
|
"loss": 0.3991, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 2.0571955719557193, |
|
"grad_norm": 1.6445160771788836, |
|
"learning_rate": 5.449574939965637e-06, |
|
"loss": 0.4019, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.061808118081181, |
|
"grad_norm": 1.6446158891460347, |
|
"learning_rate": 5.401836688624231e-06, |
|
"loss": 0.3885, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 2.066420664206642, |
|
"grad_norm": 1.5856595607293187, |
|
"learning_rate": 5.354230955156684e-06, |
|
"loss": 0.4052, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.071033210332103, |
|
"grad_norm": 1.5870199236748768, |
|
"learning_rate": 5.306759111547881e-06, |
|
"loss": 0.4029, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 2.0756457564575648, |
|
"grad_norm": 1.6193242932623593, |
|
"learning_rate": 5.259422525924037e-06, |
|
"loss": 0.3907, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.080258302583026, |
|
"grad_norm": 1.6256137352538118, |
|
"learning_rate": 5.212222562513278e-06, |
|
"loss": 0.3989, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 2.084870848708487, |
|
"grad_norm": 1.6417107171770284, |
|
"learning_rate": 5.165160581606301e-06, |
|
"loss": 0.3982, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.089483394833948, |
|
"grad_norm": 1.583054814160985, |
|
"learning_rate": 5.11823793951719e-06, |
|
"loss": 0.3857, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 2.0940959409594098, |
|
"grad_norm": 1.5988302758967783, |
|
"learning_rate": 5.0714559885443115e-06, |
|
"loss": 0.3912, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.098708487084871, |
|
"grad_norm": 1.5549504658907112, |
|
"learning_rate": 5.024816076931366e-06, |
|
"loss": 0.3964, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 2.103321033210332, |
|
"grad_norm": 1.6288272703564912, |
|
"learning_rate": 4.978319548828504e-06, |
|
"loss": 0.3979, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 2.107933579335793, |
|
"grad_norm": 1.6075015238877752, |
|
"learning_rate": 4.931967744253601e-06, |
|
"loss": 0.3859, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 2.1125461254612548, |
|
"grad_norm": 1.6429738157314036, |
|
"learning_rate": 4.885761999053647e-06, |
|
"loss": 0.3962, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 2.117158671586716, |
|
"grad_norm": 1.6796427577430482, |
|
"learning_rate": 4.839703644866228e-06, |
|
"loss": 0.4075, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 2.121771217712177, |
|
"grad_norm": 1.619217591723515, |
|
"learning_rate": 4.793794009081167e-06, |
|
"loss": 0.4085, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.126383763837638, |
|
"grad_norm": 1.6669420506553294, |
|
"learning_rate": 4.7480344148022535e-06, |
|
"loss": 0.4009, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 2.1309963099630997, |
|
"grad_norm": 1.601567836454024, |
|
"learning_rate": 4.702426180809132e-06, |
|
"loss": 0.3893, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.135608856088561, |
|
"grad_norm": 1.6330436004091688, |
|
"learning_rate": 4.65697062151927e-06, |
|
"loss": 0.3935, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 2.140221402214022, |
|
"grad_norm": 1.686740725167288, |
|
"learning_rate": 4.611669046950093e-06, |
|
"loss": 0.4062, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.1448339483394836, |
|
"grad_norm": 1.5889691549399958, |
|
"learning_rate": 4.566522762681239e-06, |
|
"loss": 0.3979, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 2.1494464944649447, |
|
"grad_norm": 1.6850634151797181, |
|
"learning_rate": 4.521533069816895e-06, |
|
"loss": 0.3999, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.154059040590406, |
|
"grad_norm": 1.5458146169307518, |
|
"learning_rate": 4.4767012649483484e-06, |
|
"loss": 0.3903, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 2.158671586715867, |
|
"grad_norm": 1.6016021907561413, |
|
"learning_rate": 4.432028640116581e-06, |
|
"loss": 0.3885, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.1632841328413286, |
|
"grad_norm": 1.6830749800846674, |
|
"learning_rate": 4.387516482775058e-06, |
|
"loss": 0.3897, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 2.1678966789667897, |
|
"grad_norm": 1.657726450794809, |
|
"learning_rate": 4.343166075752605e-06, |
|
"loss": 0.3995, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.172509225092251, |
|
"grad_norm": 1.6089499042377242, |
|
"learning_rate": 4.298978697216442e-06, |
|
"loss": 0.3906, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 2.177121771217712, |
|
"grad_norm": 1.6433325368187606, |
|
"learning_rate": 4.254955620635371e-06, |
|
"loss": 0.3836, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.1817343173431736, |
|
"grad_norm": 1.6228967594394044, |
|
"learning_rate": 4.21109811474302e-06, |
|
"loss": 0.3953, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 2.1863468634686347, |
|
"grad_norm": 1.6564397078095119, |
|
"learning_rate": 4.1674074435013445e-06, |
|
"loss": 0.3975, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.190959409594096, |
|
"grad_norm": 1.6855648962846128, |
|
"learning_rate": 4.1238848660641504e-06, |
|
"loss": 0.389, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 2.195571955719557, |
|
"grad_norm": 1.5746673175696069, |
|
"learning_rate": 4.080531636740836e-06, |
|
"loss": 0.3844, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.2001845018450186, |
|
"grad_norm": 1.6344618154669375, |
|
"learning_rate": 4.03734900496022e-06, |
|
"loss": 0.3988, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 2.2047970479704797, |
|
"grad_norm": 1.6281928689117737, |
|
"learning_rate": 3.994338215234547e-06, |
|
"loss": 0.3896, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.209409594095941, |
|
"grad_norm": 1.6257611833188321, |
|
"learning_rate": 3.9515005071236274e-06, |
|
"loss": 0.3961, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 2.2140221402214024, |
|
"grad_norm": 1.5957292745421947, |
|
"learning_rate": 3.908837115199086e-06, |
|
"loss": 0.3871, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.2140221402214024, |
|
"eval_loss": 1.0319310426712036, |
|
"eval_runtime": 393.3456, |
|
"eval_samples_per_second": 39.024, |
|
"eval_steps_per_second": 0.153, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.2186346863468636, |
|
"grad_norm": 1.5729201496024248, |
|
"learning_rate": 3.866349269008819e-06, |
|
"loss": 0.385, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 2.2232472324723247, |
|
"grad_norm": 1.6183544120950042, |
|
"learning_rate": 3.824038193041529e-06, |
|
"loss": 0.3968, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.227859778597786, |
|
"grad_norm": 1.6955717019033336, |
|
"learning_rate": 3.781905106691447e-06, |
|
"loss": 0.4004, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 2.2324723247232474, |
|
"grad_norm": 1.6580260222032042, |
|
"learning_rate": 3.7399512242231994e-06, |
|
"loss": 0.3842, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.2370848708487086, |
|
"grad_norm": 1.6490315910819098, |
|
"learning_rate": 3.698177754736787e-06, |
|
"loss": 0.3862, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 2.2416974169741697, |
|
"grad_norm": 1.6351326865393605, |
|
"learning_rate": 3.6565859021327777e-06, |
|
"loss": 0.3952, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.246309963099631, |
|
"grad_norm": 1.7016714416453813, |
|
"learning_rate": 3.6151768650775577e-06, |
|
"loss": 0.3906, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 2.2509225092250924, |
|
"grad_norm": 1.5855218888376186, |
|
"learning_rate": 3.5739518369688454e-06, |
|
"loss": 0.391, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.2555350553505535, |
|
"grad_norm": 1.5979300773582483, |
|
"learning_rate": 3.5329120059012536e-06, |
|
"loss": 0.3884, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 2.2601476014760147, |
|
"grad_norm": 1.631933769302546, |
|
"learning_rate": 3.492058554632063e-06, |
|
"loss": 0.4012, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.264760147601476, |
|
"grad_norm": 1.6784627234471698, |
|
"learning_rate": 3.4513926605471504e-06, |
|
"loss": 0.3956, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 2.2693726937269374, |
|
"grad_norm": 1.5994089046113484, |
|
"learning_rate": 3.4109154956270253e-06, |
|
"loss": 0.3919, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.2739852398523985, |
|
"grad_norm": 1.5962979043384486, |
|
"learning_rate": 3.370628226413093e-06, |
|
"loss": 0.3975, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 2.2785977859778597, |
|
"grad_norm": 1.6734564837873838, |
|
"learning_rate": 3.330532013973987e-06, |
|
"loss": 0.3887, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.2832103321033212, |
|
"grad_norm": 1.5662431409042064, |
|
"learning_rate": 3.290628013872159e-06, |
|
"loss": 0.3841, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 2.2878228782287824, |
|
"grad_norm": 1.5635156068197413, |
|
"learning_rate": 3.250917376130538e-06, |
|
"loss": 0.3951, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.2924354243542435, |
|
"grad_norm": 1.6010796160602963, |
|
"learning_rate": 3.211401245199398e-06, |
|
"loss": 0.3942, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 2.2970479704797047, |
|
"grad_norm": 1.6199565737985742, |
|
"learning_rate": 3.1720807599233903e-06, |
|
"loss": 0.3927, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.3016605166051662, |
|
"grad_norm": 1.6515237978384454, |
|
"learning_rate": 3.132957053508696e-06, |
|
"loss": 0.3978, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 2.3062730627306274, |
|
"grad_norm": 1.6541451651760768, |
|
"learning_rate": 3.0940312534903848e-06, |
|
"loss": 0.397, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.3108856088560885, |
|
"grad_norm": 1.6636499883415654, |
|
"learning_rate": 3.0553044816999133e-06, |
|
"loss": 0.3771, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 2.3154981549815496, |
|
"grad_norm": 1.5866618676441615, |
|
"learning_rate": 3.0167778542328053e-06, |
|
"loss": 0.3967, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 2.3201107011070112, |
|
"grad_norm": 1.6302246602193289, |
|
"learning_rate": 2.9784524814164673e-06, |
|
"loss": 0.4006, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 2.3247232472324724, |
|
"grad_norm": 1.6231412036835564, |
|
"learning_rate": 2.940329467778198e-06, |
|
"loss": 0.3959, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.3293357933579335, |
|
"grad_norm": 1.6840760426840755, |
|
"learning_rate": 2.9024099120133674e-06, |
|
"loss": 0.3908, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 2.3339483394833946, |
|
"grad_norm": 1.7187956725298614, |
|
"learning_rate": 2.8646949069537343e-06, |
|
"loss": 0.3908, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.338560885608856, |
|
"grad_norm": 1.56700956891004, |
|
"learning_rate": 2.8271855395359613e-06, |
|
"loss": 0.3961, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 2.3431734317343174, |
|
"grad_norm": 1.6097463271059957, |
|
"learning_rate": 2.7898828907702826e-06, |
|
"loss": 0.3894, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.3477859778597785, |
|
"grad_norm": 1.5810624302563459, |
|
"learning_rate": 2.7527880357093673e-06, |
|
"loss": 0.3853, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 2.35239852398524, |
|
"grad_norm": 1.6230445922415717, |
|
"learning_rate": 2.71590204341731e-06, |
|
"loss": 0.3904, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.357011070110701, |
|
"grad_norm": 1.631243837661556, |
|
"learning_rate": 2.6792259769388394e-06, |
|
"loss": 0.3854, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 2.3616236162361623, |
|
"grad_norm": 1.6015681704315312, |
|
"learning_rate": 2.642760893268684e-06, |
|
"loss": 0.3897, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.3662361623616235, |
|
"grad_norm": 1.629094206408994, |
|
"learning_rate": 2.6065078433210913e-06, |
|
"loss": 0.3956, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 2.3708487084870846, |
|
"grad_norm": 1.6768173921403078, |
|
"learning_rate": 2.570467871899557e-06, |
|
"loss": 0.3882, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.375461254612546, |
|
"grad_norm": 1.6096361607682703, |
|
"learning_rate": 2.5346420176667052e-06, |
|
"loss": 0.3841, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 2.3800738007380073, |
|
"grad_norm": 1.5785068874659574, |
|
"learning_rate": 2.4990313131143716e-06, |
|
"loss": 0.407, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.3846863468634685, |
|
"grad_norm": 1.639780492512362, |
|
"learning_rate": 2.463636784533813e-06, |
|
"loss": 0.3872, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 2.38929889298893, |
|
"grad_norm": 1.5634276209766216, |
|
"learning_rate": 2.4284594519861637e-06, |
|
"loss": 0.3844, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.393911439114391, |
|
"grad_norm": 1.6051539919308102, |
|
"learning_rate": 2.3935003292730295e-06, |
|
"loss": 0.3845, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 2.3985239852398523, |
|
"grad_norm": 1.5819118683660134, |
|
"learning_rate": 2.3587604239072535e-06, |
|
"loss": 0.3927, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.3985239852398523, |
|
"eval_loss": 1.0269191265106201, |
|
"eval_runtime": 441.0938, |
|
"eval_samples_per_second": 34.8, |
|
"eval_steps_per_second": 0.136, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.4031365313653135, |
|
"grad_norm": 1.625902982522089, |
|
"learning_rate": 2.324240737083897e-06, |
|
"loss": 0.3967, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 2.407749077490775, |
|
"grad_norm": 1.6115877735569477, |
|
"learning_rate": 2.2899422636513768e-06, |
|
"loss": 0.3888, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.412361623616236, |
|
"grad_norm": 1.6393950015907957, |
|
"learning_rate": 2.2558659920828095e-06, |
|
"loss": 0.3866, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 2.4169741697416973, |
|
"grad_norm": 1.6289337323279018, |
|
"learning_rate": 2.2220129044474903e-06, |
|
"loss": 0.3822, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.421586715867159, |
|
"grad_norm": 1.626399821921426, |
|
"learning_rate": 2.1883839763826285e-06, |
|
"loss": 0.3917, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 2.42619926199262, |
|
"grad_norm": 1.5750110015076921, |
|
"learning_rate": 2.15498017706521e-06, |
|
"loss": 0.3818, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.430811808118081, |
|
"grad_norm": 1.5656770730106075, |
|
"learning_rate": 2.1218024691840646e-06, |
|
"loss": 0.3949, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 2.4354243542435423, |
|
"grad_norm": 1.5878185538716267, |
|
"learning_rate": 2.088851808912126e-06, |
|
"loss": 0.39, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.4400369003690034, |
|
"grad_norm": 1.6108620558982116, |
|
"learning_rate": 2.0561291458788736e-06, |
|
"loss": 0.3968, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 2.444649446494465, |
|
"grad_norm": 1.5917905556601293, |
|
"learning_rate": 2.0236354231429743e-06, |
|
"loss": 0.3835, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.449261992619926, |
|
"grad_norm": 1.5749010329322541, |
|
"learning_rate": 1.9913715771650798e-06, |
|
"loss": 0.3878, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 2.4538745387453873, |
|
"grad_norm": 1.5853197835560284, |
|
"learning_rate": 1.959338537780868e-06, |
|
"loss": 0.3793, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.458487084870849, |
|
"grad_norm": 1.675999301164994, |
|
"learning_rate": 1.9275372281742242e-06, |
|
"loss": 0.3888, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 2.46309963099631, |
|
"grad_norm": 1.5909593196232035, |
|
"learning_rate": 1.8959685648506365e-06, |
|
"loss": 0.379, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.467712177121771, |
|
"grad_norm": 1.6580184737262997, |
|
"learning_rate": 1.8646334576107993e-06, |
|
"loss": 0.385, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 2.4723247232472323, |
|
"grad_norm": 1.694950561030231, |
|
"learning_rate": 1.83353280952437e-06, |
|
"loss": 0.4061, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.476937269372694, |
|
"grad_norm": 1.5612968845817867, |
|
"learning_rate": 1.8026675169039654e-06, |
|
"loss": 0.3717, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 2.481549815498155, |
|
"grad_norm": 1.6589816057109397, |
|
"learning_rate": 1.7720384692793036e-06, |
|
"loss": 0.3907, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.486162361623616, |
|
"grad_norm": 1.6036469226772128, |
|
"learning_rate": 1.7416465493715984e-06, |
|
"loss": 0.3777, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 2.4907749077490777, |
|
"grad_norm": 1.6327667406661128, |
|
"learning_rate": 1.7114926330680958e-06, |
|
"loss": 0.3875, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.495387453874539, |
|
"grad_norm": 1.5827244143165553, |
|
"learning_rate": 1.681577589396839e-06, |
|
"loss": 0.3859, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.6514702752041677, |
|
"learning_rate": 1.6519022805016305e-06, |
|
"loss": 0.3843, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.504612546125461, |
|
"grad_norm": 1.5570332325787979, |
|
"learning_rate": 1.6224675616171737e-06, |
|
"loss": 0.3715, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 2.5092250922509223, |
|
"grad_norm": 1.630413477205012, |
|
"learning_rate": 1.5932742810444314e-06, |
|
"loss": 0.3836, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.513837638376384, |
|
"grad_norm": 1.6521730580375191, |
|
"learning_rate": 1.5643232801261731e-06, |
|
"loss": 0.3948, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 2.518450184501845, |
|
"grad_norm": 1.6155666031902267, |
|
"learning_rate": 1.5356153932227423e-06, |
|
"loss": 0.3898, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.523062730627306, |
|
"grad_norm": 1.6555206625970396, |
|
"learning_rate": 1.5071514476879878e-06, |
|
"loss": 0.384, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 2.5276752767527677, |
|
"grad_norm": 1.6677020481929692, |
|
"learning_rate": 1.478932263845435e-06, |
|
"loss": 0.3952, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.532287822878229, |
|
"grad_norm": 1.66071104050102, |
|
"learning_rate": 1.450958654964647e-06, |
|
"loss": 0.3883, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 2.53690036900369, |
|
"grad_norm": 1.5550008356755514, |
|
"learning_rate": 1.4232314272377723e-06, |
|
"loss": 0.3867, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.541512915129151, |
|
"grad_norm": 1.6135337105923544, |
|
"learning_rate": 1.3957513797563227e-06, |
|
"loss": 0.3895, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 2.5461254612546127, |
|
"grad_norm": 1.6353151843919402, |
|
"learning_rate": 1.368519304488134e-06, |
|
"loss": 0.3868, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.550738007380074, |
|
"grad_norm": 1.5705890317457465, |
|
"learning_rate": 1.3415359862545574e-06, |
|
"loss": 0.3834, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 2.555350553505535, |
|
"grad_norm": 1.654601851442529, |
|
"learning_rate": 1.3148022027078223e-06, |
|
"loss": 0.3832, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.5599630996309966, |
|
"grad_norm": 1.5695058446628602, |
|
"learning_rate": 1.2883187243086338e-06, |
|
"loss": 0.3893, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 2.5645756457564577, |
|
"grad_norm": 1.6350847841681155, |
|
"learning_rate": 1.262086314303973e-06, |
|
"loss": 0.3898, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.569188191881919, |
|
"grad_norm": 1.608716488298126, |
|
"learning_rate": 1.2361057287050892e-06, |
|
"loss": 0.3834, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 2.57380073800738, |
|
"grad_norm": 1.6204640984588243, |
|
"learning_rate": 1.2103777162657205e-06, |
|
"loss": 0.3972, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.578413284132841, |
|
"grad_norm": 1.6231664675158106, |
|
"learning_rate": 1.1849030184605092e-06, |
|
"loss": 0.3831, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 2.5830258302583027, |
|
"grad_norm": 1.6455527588787915, |
|
"learning_rate": 1.1596823694636427e-06, |
|
"loss": 0.3872, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.5830258302583027, |
|
"eval_loss": 1.0266528129577637, |
|
"eval_runtime": 403.4694, |
|
"eval_samples_per_second": 38.045, |
|
"eval_steps_per_second": 0.149, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.587638376383764, |
|
"grad_norm": 1.6501434655468525, |
|
"learning_rate": 1.134716496127679e-06, |
|
"loss": 0.3866, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 2.592250922509225, |
|
"grad_norm": 1.5937430607804544, |
|
"learning_rate": 1.110006117962612e-06, |
|
"loss": 0.3746, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.5968634686346865, |
|
"grad_norm": 1.6042558353979863, |
|
"learning_rate": 1.085551947115131e-06, |
|
"loss": 0.3813, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 2.6014760147601477, |
|
"grad_norm": 1.5880288341663498, |
|
"learning_rate": 1.0613546883480974e-06, |
|
"loss": 0.3879, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.606088560885609, |
|
"grad_norm": 1.6687358545150779, |
|
"learning_rate": 1.0374150390202308e-06, |
|
"loss": 0.3764, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 2.61070110701107, |
|
"grad_norm": 1.6305478991844253, |
|
"learning_rate": 1.013733689066012e-06, |
|
"loss": 0.3936, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.6153136531365315, |
|
"grad_norm": 1.5959262607963571, |
|
"learning_rate": 9.903113209758098e-07, |
|
"loss": 0.3809, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 2.6199261992619927, |
|
"grad_norm": 1.6326891680063345, |
|
"learning_rate": 9.671486097761918e-07, |
|
"loss": 0.3851, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.624538745387454, |
|
"grad_norm": 1.5671079064226814, |
|
"learning_rate": 9.442462230104876e-07, |
|
"loss": 0.3813, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 2.6291512915129154, |
|
"grad_norm": 1.6290077759272117, |
|
"learning_rate": 9.216048207195438e-07, |
|
"loss": 0.3815, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.6337638376383765, |
|
"grad_norm": 1.6273985534855235, |
|
"learning_rate": 8.992250554227011e-07, |
|
"loss": 0.4061, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 2.6383763837638377, |
|
"grad_norm": 1.6210509601819985, |
|
"learning_rate": 8.771075720989886e-07, |
|
"loss": 0.3752, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.642988929889299, |
|
"grad_norm": 1.555467095004222, |
|
"learning_rate": 8.552530081685384e-07, |
|
"loss": 0.3875, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 2.64760147601476, |
|
"grad_norm": 1.5802373017514193, |
|
"learning_rate": 8.336619934742151e-07, |
|
"loss": 0.3819, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.6522140221402215, |
|
"grad_norm": 1.6154443890108339, |
|
"learning_rate": 8.123351502634625e-07, |
|
"loss": 0.3888, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 2.6568265682656826, |
|
"grad_norm": 1.6154395851668601, |
|
"learning_rate": 7.91273093170365e-07, |
|
"loss": 0.3808, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.661439114391144, |
|
"grad_norm": 1.587280501111623, |
|
"learning_rate": 7.704764291979516e-07, |
|
"loss": 0.3774, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 2.6660516605166054, |
|
"grad_norm": 1.6071595650095456, |
|
"learning_rate": 7.499457577006753e-07, |
|
"loss": 0.3819, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.6706642066420665, |
|
"grad_norm": 1.648848686369746, |
|
"learning_rate": 7.296816703671683e-07, |
|
"loss": 0.3855, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 2.6752767527675276, |
|
"grad_norm": 1.5617730378995032, |
|
"learning_rate": 7.09684751203168e-07, |
|
"loss": 0.3909, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.6798892988929888, |
|
"grad_norm": 1.6183910290801358, |
|
"learning_rate": 6.899555765147004e-07, |
|
"loss": 0.3826, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 2.6845018450184504, |
|
"grad_norm": 1.674038262833668, |
|
"learning_rate": 6.704947148914608e-07, |
|
"loss": 0.382, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.6891143911439115, |
|
"grad_norm": 1.6448057348626846, |
|
"learning_rate": 6.513027271904315e-07, |
|
"loss": 0.3854, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 2.6937269372693726, |
|
"grad_norm": 1.6374354465922731, |
|
"learning_rate": 6.323801665197238e-07, |
|
"loss": 0.3851, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.698339483394834, |
|
"grad_norm": 1.5834812445833784, |
|
"learning_rate": 6.137275782226216e-07, |
|
"loss": 0.3819, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 2.7029520295202953, |
|
"grad_norm": 1.623319595480127, |
|
"learning_rate": 5.953454998618857e-07, |
|
"loss": 0.3856, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.7075645756457565, |
|
"grad_norm": 1.5967961053246091, |
|
"learning_rate": 5.772344612042435e-07, |
|
"loss": 0.3862, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 2.7121771217712176, |
|
"grad_norm": 1.5710116912601946, |
|
"learning_rate": 5.593949842051338e-07, |
|
"loss": 0.3842, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.7167896678966788, |
|
"grad_norm": 1.57362891686515, |
|
"learning_rate": 5.418275829936537e-07, |
|
"loss": 0.3711, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 2.7214022140221403, |
|
"grad_norm": 1.580310931596939, |
|
"learning_rate": 5.24532763857749e-07, |
|
"loss": 0.3835, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.7260147601476015, |
|
"grad_norm": 1.6145634716470691, |
|
"learning_rate": 5.075110252296245e-07, |
|
"loss": 0.3882, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 2.7306273062730626, |
|
"grad_norm": 1.6743712939756843, |
|
"learning_rate": 4.907628576713663e-07, |
|
"loss": 0.3838, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.735239852398524, |
|
"grad_norm": 1.5631707817004297, |
|
"learning_rate": 4.742887438608235e-07, |
|
"loss": 0.387, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 2.7398523985239853, |
|
"grad_norm": 1.6160350852160132, |
|
"learning_rate": 4.5808915857768035e-07, |
|
"loss": 0.3733, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.7444649446494465, |
|
"grad_norm": 1.6391404550180673, |
|
"learning_rate": 4.4216456868978243e-07, |
|
"loss": 0.3863, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 2.7490774907749076, |
|
"grad_norm": 1.5869883487994245, |
|
"learning_rate": 4.265154331396815e-07, |
|
"loss": 0.3803, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.7536900369003687, |
|
"grad_norm": 1.6338303616426142, |
|
"learning_rate": 4.111422029314016e-07, |
|
"loss": 0.367, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 2.7583025830258303, |
|
"grad_norm": 1.6222090885217113, |
|
"learning_rate": 3.960453211174531e-07, |
|
"loss": 0.3913, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 2.7629151291512914, |
|
"grad_norm": 1.5453198744376195, |
|
"learning_rate": 3.8122522278605024e-07, |
|
"loss": 0.3884, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 2.767527675276753, |
|
"grad_norm": 1.6194002178218827, |
|
"learning_rate": 3.6668233504858486e-07, |
|
"loss": 0.3918, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.767527675276753, |
|
"eval_loss": 1.0241528749465942, |
|
"eval_runtime": 583.1671, |
|
"eval_samples_per_second": 26.322, |
|
"eval_steps_per_second": 0.103, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.772140221402214, |
|
"grad_norm": 1.5792021657673334, |
|
"learning_rate": 3.524170770273072e-07, |
|
"loss": 0.3836, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 2.7767527675276753, |
|
"grad_norm": 1.5855513913689898, |
|
"learning_rate": 3.384298598432545e-07, |
|
"loss": 0.3836, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.7813653136531364, |
|
"grad_norm": 1.5563076811229497, |
|
"learning_rate": 3.2472108660439706e-07, |
|
"loss": 0.3802, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 2.7859778597785976, |
|
"grad_norm": 1.5823633538445738, |
|
"learning_rate": 3.112911523940232e-07, |
|
"loss": 0.383, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.790590405904059, |
|
"grad_norm": 1.629806513708018, |
|
"learning_rate": 2.9814044425935605e-07, |
|
"loss": 0.3821, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 2.7952029520295203, |
|
"grad_norm": 1.6493594318817755, |
|
"learning_rate": 2.852693412003882e-07, |
|
"loss": 0.3832, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.7998154981549814, |
|
"grad_norm": 1.5938360437660848, |
|
"learning_rate": 2.7267821415897343e-07, |
|
"loss": 0.3739, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 2.804428044280443, |
|
"grad_norm": 1.6409833373595797, |
|
"learning_rate": 2.6036742600812683e-07, |
|
"loss": 0.3824, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.809040590405904, |
|
"grad_norm": 1.6120109521550734, |
|
"learning_rate": 2.4833733154156716e-07, |
|
"loss": 0.3791, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 2.8136531365313653, |
|
"grad_norm": 1.6148694677775197, |
|
"learning_rate": 2.3658827746349976e-07, |
|
"loss": 0.3716, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.8182656826568264, |
|
"grad_norm": 1.6335439307552395, |
|
"learning_rate": 2.2512060237861455e-07, |
|
"loss": 0.377, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 2.8228782287822876, |
|
"grad_norm": 1.6687024271570985, |
|
"learning_rate": 2.139346367823314e-07, |
|
"loss": 0.3824, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.827490774907749, |
|
"grad_norm": 1.5961399559857916, |
|
"learning_rate": 2.030307030512768e-07, |
|
"loss": 0.38, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 2.8321033210332103, |
|
"grad_norm": 1.628301167538455, |
|
"learning_rate": 1.9240911543399465e-07, |
|
"loss": 0.3861, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 2.836715867158672, |
|
"grad_norm": 1.5876724378264213, |
|
"learning_rate": 1.8207018004188338e-07, |
|
"loss": 0.375, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 2.841328413284133, |
|
"grad_norm": 1.6533761523786383, |
|
"learning_rate": 1.7201419484037861e-07, |
|
"loss": 0.3847, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 2.845940959409594, |
|
"grad_norm": 1.65860087801836, |
|
"learning_rate": 1.622414496403668e-07, |
|
"loss": 0.4014, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 2.8505535055350553, |
|
"grad_norm": 1.5158450003834456, |
|
"learning_rate": 1.527522260898273e-07, |
|
"loss": 0.3743, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.8551660516605164, |
|
"grad_norm": 1.543441072095729, |
|
"learning_rate": 1.4354679766572344e-07, |
|
"loss": 0.3867, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 2.859778597785978, |
|
"grad_norm": 1.6190829129675481, |
|
"learning_rate": 1.3462542966611314e-07, |
|
"loss": 0.3697, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.864391143911439, |
|
"grad_norm": 1.6865745421103189, |
|
"learning_rate": 1.259883792025085e-07, |
|
"loss": 0.3744, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 2.8690036900369003, |
|
"grad_norm": 1.6329028917573583, |
|
"learning_rate": 1.1763589519246388e-07, |
|
"loss": 0.3722, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 2.873616236162362, |
|
"grad_norm": 1.5873886153372632, |
|
"learning_rate": 1.095682183524005e-07, |
|
"loss": 0.3797, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 2.878228782287823, |
|
"grad_norm": 1.580205694330548, |
|
"learning_rate": 1.0178558119067316e-07, |
|
"loss": 0.3705, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 2.882841328413284, |
|
"grad_norm": 1.5748354606004111, |
|
"learning_rate": 9.428820800086558e-08, |
|
"loss": 0.3832, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 2.8874538745387452, |
|
"grad_norm": 1.6471024830756282, |
|
"learning_rate": 8.707631485532775e-08, |
|
"loss": 0.3886, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 2.8920664206642064, |
|
"grad_norm": 1.6632577517398965, |
|
"learning_rate": 8.015010959894986e-08, |
|
"loss": 0.384, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 2.896678966789668, |
|
"grad_norm": 1.6208970211899278, |
|
"learning_rate": 7.350979184317153e-08, |
|
"loss": 0.3861, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 2.901291512915129, |
|
"grad_norm": 1.64161859212595, |
|
"learning_rate": 6.715555296022746e-08, |
|
"loss": 0.3767, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 2.9059040590405907, |
|
"grad_norm": 1.6172525804643438, |
|
"learning_rate": 6.108757607763305e-08, |
|
"loss": 0.3857, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.910516605166052, |
|
"grad_norm": 1.580351811354319, |
|
"learning_rate": 5.530603607290852e-08, |
|
"loss": 0.3771, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 2.915129151291513, |
|
"grad_norm": 1.572483075790589, |
|
"learning_rate": 4.981109956853747e-08, |
|
"loss": 0.3749, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 2.919741697416974, |
|
"grad_norm": 1.64962807846865, |
|
"learning_rate": 4.460292492716512e-08, |
|
"loss": 0.3795, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 2.9243542435424352, |
|
"grad_norm": 1.605735844681554, |
|
"learning_rate": 3.968166224703085e-08, |
|
"loss": 0.3795, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 2.928966789667897, |
|
"grad_norm": 1.5747261446081762, |
|
"learning_rate": 3.504745335765169e-08, |
|
"loss": 0.3793, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 2.933579335793358, |
|
"grad_norm": 1.653000934575167, |
|
"learning_rate": 3.0700431815724464e-08, |
|
"loss": 0.3903, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 2.938191881918819, |
|
"grad_norm": 1.6378961953845004, |
|
"learning_rate": 2.664072290128217e-08, |
|
"loss": 0.3889, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 2.9428044280442807, |
|
"grad_norm": 1.584042922862379, |
|
"learning_rate": 2.2868443614082468e-08, |
|
"loss": 0.3878, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 2.947416974169742, |
|
"grad_norm": 1.5647903615149348, |
|
"learning_rate": 1.9383702670235927e-08, |
|
"loss": 0.382, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 2.952029520295203, |
|
"grad_norm": 1.580064728520442, |
|
"learning_rate": 1.6186600499074055e-08, |
|
"loss": 0.3764, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.952029520295203, |
|
"eval_loss": 1.024267315864563, |
|
"eval_runtime": 436.1706, |
|
"eval_samples_per_second": 35.193, |
|
"eval_steps_per_second": 0.138, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.956642066420664, |
|
"grad_norm": 1.6540116549017054, |
|
"learning_rate": 1.3277229240249435e-08, |
|
"loss": 0.3945, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 2.961254612546125, |
|
"grad_norm": 1.6470444524138421, |
|
"learning_rate": 1.0655672741090028e-08, |
|
"loss": 0.3806, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.965867158671587, |
|
"grad_norm": 1.5413563389524112, |
|
"learning_rate": 8.322006554171147e-09, |
|
"loss": 0.3818, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 2.970479704797048, |
|
"grad_norm": 1.6164344748774018, |
|
"learning_rate": 6.276297935149389e-09, |
|
"loss": 0.3847, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 2.975092250922509, |
|
"grad_norm": 1.6066275827671024, |
|
"learning_rate": 4.5186058408153156e-09, |
|
"loss": 0.3823, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 2.9797047970479706, |
|
"grad_norm": 1.6333269508170274, |
|
"learning_rate": 3.0489809273981375e-09, |
|
"loss": 0.3801, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.984317343173432, |
|
"grad_norm": 1.6103531418344368, |
|
"learning_rate": 1.8674655491091043e-09, |
|
"loss": 0.3932, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 2.988929889298893, |
|
"grad_norm": 1.6234448383351934, |
|
"learning_rate": 9.740937569135967e-10, |
|
"loss": 0.3832, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.993542435424354, |
|
"grad_norm": 1.6318211007601922, |
|
"learning_rate": 3.6889129755413033e-10, |
|
"loss": 0.3871, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 2.9981549815498156, |
|
"grad_norm": 1.609675292027893, |
|
"learning_rate": 5.187561280983744e-11, |
|
"loss": 0.3788, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 3252, |
|
"total_flos": 1361805280542720.0, |
|
"train_loss": 0.7043063408920832, |
|
"train_runtime": 81819.3933, |
|
"train_samples_per_second": 5.085, |
|
"train_steps_per_second": 0.04 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 3252, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1361805280542720.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |