|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 530, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0037735849056603774, |
|
"grad_norm": 15.33187898922165, |
|
"learning_rate": 3.773584905660378e-07, |
|
"loss": 1.3947, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.018867924528301886, |
|
"grad_norm": 14.587979298665214, |
|
"learning_rate": 1.8867924528301889e-06, |
|
"loss": 1.2879, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03773584905660377, |
|
"grad_norm": 5.156574643577073, |
|
"learning_rate": 3.7735849056603777e-06, |
|
"loss": 1.1356, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05660377358490566, |
|
"grad_norm": 2.4413584926483494, |
|
"learning_rate": 5.660377358490566e-06, |
|
"loss": 1.056, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.07547169811320754, |
|
"grad_norm": 1.939557281184319, |
|
"learning_rate": 7.5471698113207555e-06, |
|
"loss": 1.0035, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09433962264150944, |
|
"grad_norm": 1.445408046384172, |
|
"learning_rate": 9.433962264150944e-06, |
|
"loss": 0.9666, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.11320754716981132, |
|
"grad_norm": 1.5429900501633669, |
|
"learning_rate": 1.1320754716981132e-05, |
|
"loss": 0.9673, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1320754716981132, |
|
"grad_norm": 1.2999242817660164, |
|
"learning_rate": 1.320754716981132e-05, |
|
"loss": 0.9161, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1509433962264151, |
|
"grad_norm": 1.1690976259696864, |
|
"learning_rate": 1.5094339622641511e-05, |
|
"loss": 0.9029, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.16981132075471697, |
|
"grad_norm": 1.2404008101715356, |
|
"learning_rate": 1.69811320754717e-05, |
|
"loss": 0.8976, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.18867924528301888, |
|
"grad_norm": 1.2490209366453322, |
|
"learning_rate": 1.8867924528301888e-05, |
|
"loss": 0.9202, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.20754716981132076, |
|
"grad_norm": 1.2687062081137546, |
|
"learning_rate": 1.9999132465602526e-05, |
|
"loss": 0.9125, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.22641509433962265, |
|
"grad_norm": 1.2693770693981177, |
|
"learning_rate": 1.998937443221316e-05, |
|
"loss": 0.918, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.24528301886792453, |
|
"grad_norm": 1.17511999372123, |
|
"learning_rate": 1.9968784563700586e-05, |
|
"loss": 0.8731, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2641509433962264, |
|
"grad_norm": 1.1148641046803303, |
|
"learning_rate": 1.9937385186393888e-05, |
|
"loss": 0.9063, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2830188679245283, |
|
"grad_norm": 1.2209470360517312, |
|
"learning_rate": 1.9895210347758233e-05, |
|
"loss": 0.8973, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.3018867924528302, |
|
"grad_norm": 1.2489264571555163, |
|
"learning_rate": 1.984230577947597e-05, |
|
"loss": 0.9086, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.32075471698113206, |
|
"grad_norm": 1.7088824250739199, |
|
"learning_rate": 1.977872884785815e-05, |
|
"loss": 0.891, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.33962264150943394, |
|
"grad_norm": 1.1609543047434623, |
|
"learning_rate": 1.9704548491640195e-05, |
|
"loss": 0.9064, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3584905660377358, |
|
"grad_norm": 1.1548579761616171, |
|
"learning_rate": 1.961984514722914e-05, |
|
"loss": 0.886, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.37735849056603776, |
|
"grad_norm": 1.2072363172467877, |
|
"learning_rate": 1.9524710661483594e-05, |
|
"loss": 0.914, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.39622641509433965, |
|
"grad_norm": 1.2486639912473576, |
|
"learning_rate": 1.94192481921209e-05, |
|
"loss": 0.8858, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.41509433962264153, |
|
"grad_norm": 1.1907067530146591, |
|
"learning_rate": 1.9303572095859545e-05, |
|
"loss": 0.883, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.4339622641509434, |
|
"grad_norm": 1.205701711118422, |
|
"learning_rate": 1.91778078044181e-05, |
|
"loss": 0.903, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.4528301886792453, |
|
"grad_norm": 1.4297409544875106, |
|
"learning_rate": 1.9042091688505104e-05, |
|
"loss": 0.8903, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4716981132075472, |
|
"grad_norm": 1.3015347726404307, |
|
"learning_rate": 1.8896570909947477e-05, |
|
"loss": 0.8975, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.49056603773584906, |
|
"grad_norm": 1.1056494521961182, |
|
"learning_rate": 1.874140326211766e-05, |
|
"loss": 0.8973, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5094339622641509, |
|
"grad_norm": 1.3117079939596017, |
|
"learning_rate": 1.8576756998832667e-05, |
|
"loss": 0.8665, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5283018867924528, |
|
"grad_norm": 1.0536496719085662, |
|
"learning_rate": 1.8402810651910444e-05, |
|
"loss": 0.8638, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5471698113207547, |
|
"grad_norm": 2.702397102319974, |
|
"learning_rate": 1.8219752837581466e-05, |
|
"loss": 0.882, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5660377358490566, |
|
"grad_norm": 1.0422440813400673, |
|
"learning_rate": 1.8027782051965408e-05, |
|
"loss": 0.8657, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5849056603773585, |
|
"grad_norm": 1.3024995956700887, |
|
"learning_rate": 1.782710645583473e-05, |
|
"loss": 0.8599, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.6037735849056604, |
|
"grad_norm": 1.1482394390531847, |
|
"learning_rate": 1.761794364889855e-05, |
|
"loss": 0.8926, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6226415094339622, |
|
"grad_norm": 1.079599574770899, |
|
"learning_rate": 1.7400520433851457e-05, |
|
"loss": 0.8428, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6415094339622641, |
|
"grad_norm": 1.0901592446951243, |
|
"learning_rate": 1.717507257044331e-05, |
|
"loss": 0.8496, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.660377358490566, |
|
"grad_norm": 1.1077239121822327, |
|
"learning_rate": 1.694184451983651e-05, |
|
"loss": 0.9012, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6792452830188679, |
|
"grad_norm": 1.0853595621923493, |
|
"learning_rate": 1.6701089179528032e-05, |
|
"loss": 0.8681, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6981132075471698, |
|
"grad_norm": 1.0734841287772214, |
|
"learning_rate": 1.6453067609123656e-05, |
|
"loss": 0.856, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.7169811320754716, |
|
"grad_norm": 1.088487035960174, |
|
"learning_rate": 1.619804874726171e-05, |
|
"loss": 0.8652, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.7358490566037735, |
|
"grad_norm": 1.0825029747031027, |
|
"learning_rate": 1.5936309119993333e-05, |
|
"loss": 0.8565, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"grad_norm": 1.200978191088004, |
|
"learning_rate": 1.566813254093538e-05, |
|
"loss": 0.8438, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7735849056603774, |
|
"grad_norm": 1.0577639466302755, |
|
"learning_rate": 1.5393809803521213e-05, |
|
"loss": 0.8681, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.7924528301886793, |
|
"grad_norm": 1.1116902648533118, |
|
"learning_rate": 1.5113638365682996e-05, |
|
"loss": 0.8534, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.8113207547169812, |
|
"grad_norm": 1.0957222267960907, |
|
"learning_rate": 1.482792202730745e-05, |
|
"loss": 0.8598, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.8301886792452831, |
|
"grad_norm": 1.0870428672337984, |
|
"learning_rate": 1.4536970600814789e-05, |
|
"loss": 0.8854, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.8490566037735849, |
|
"grad_norm": 1.0340161162672956, |
|
"learning_rate": 1.424109957521806e-05, |
|
"loss": 0.865, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.8679245283018868, |
|
"grad_norm": 1.062102724438973, |
|
"learning_rate": 1.394062977402717e-05, |
|
"loss": 0.8664, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.8867924528301887, |
|
"grad_norm": 1.0590868294575073, |
|
"learning_rate": 1.3635887007368467e-05, |
|
"loss": 0.8419, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.9056603773584906, |
|
"grad_norm": 1.0864493134204916, |
|
"learning_rate": 1.3327201718697232e-05, |
|
"loss": 0.8519, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.9245283018867925, |
|
"grad_norm": 1.0168196512793692, |
|
"learning_rate": 1.3014908626486032e-05, |
|
"loss": 0.8629, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.9433962264150944, |
|
"grad_norm": 1.0404726364027557, |
|
"learning_rate": 1.2699346361277538e-05, |
|
"loss": 0.8337, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.9622641509433962, |
|
"grad_norm": 1.018131650974526, |
|
"learning_rate": 1.2380857098495355e-05, |
|
"loss": 0.9745, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.9811320754716981, |
|
"grad_norm": 1.0037862694896684, |
|
"learning_rate": 1.2059786187410984e-05, |
|
"loss": 0.8712, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.989425132088932, |
|
"learning_rate": 1.1736481776669307e-05, |
|
"loss": 0.8495, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.8464146852493286, |
|
"eval_runtime": 4.2653, |
|
"eval_samples_per_second": 40.091, |
|
"eval_steps_per_second": 0.703, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.0188679245283019, |
|
"grad_norm": 3.0712777016007875, |
|
"learning_rate": 1.1411294436778562e-05, |
|
"loss": 0.5752, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.0377358490566038, |
|
"grad_norm": 1.367908639519453, |
|
"learning_rate": 1.1084576779974257e-05, |
|
"loss": 0.5572, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.0566037735849056, |
|
"grad_norm": 1.1027538267727532, |
|
"learning_rate": 1.0756683077869133e-05, |
|
"loss": 0.5489, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.0754716981132075, |
|
"grad_norm": 1.1284473887078348, |
|
"learning_rate": 1.0427968877303809e-05, |
|
"loss": 0.549, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.0943396226415094, |
|
"grad_norm": 1.0579765238733445, |
|
"learning_rate": 1.0098790614814658e-05, |
|
"loss": 0.5584, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.1132075471698113, |
|
"grad_norm": 1.0510922170571144, |
|
"learning_rate": 9.769505230136962e-06, |
|
"loss": 0.5326, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.1320754716981132, |
|
"grad_norm": 1.0257584558372113, |
|
"learning_rate": 9.440469779162407e-06, |
|
"loss": 0.5457, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.150943396226415, |
|
"grad_norm": 1.0531732710979618, |
|
"learning_rate": 9.112041046770653e-06, |
|
"loss": 0.5378, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.169811320754717, |
|
"grad_norm": 1.0708673795759969, |
|
"learning_rate": 8.784575159954748e-06, |
|
"loss": 0.5425, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.1886792452830188, |
|
"grad_norm": 1.0132141363080573, |
|
"learning_rate": 8.458427201659926e-06, |
|
"loss": 0.5391, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.2075471698113207, |
|
"grad_norm": 1.0680470294860878, |
|
"learning_rate": 8.133950825754511e-06, |
|
"loss": 0.5447, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.2264150943396226, |
|
"grad_norm": 0.9666850547234715, |
|
"learning_rate": 7.81149787355039e-06, |
|
"loss": 0.5379, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.2452830188679245, |
|
"grad_norm": 0.9710527007406873, |
|
"learning_rate": 7.491417992288927e-06, |
|
"loss": 0.5326, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.2641509433962264, |
|
"grad_norm": 1.1620230853698044, |
|
"learning_rate": 7.174058256006012e-06, |
|
"loss": 0.5458, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.2830188679245282, |
|
"grad_norm": 1.0264570630272518, |
|
"learning_rate": 6.859762789187259e-06, |
|
"loss": 0.5521, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.3018867924528301, |
|
"grad_norm": 1.0303843487410083, |
|
"learning_rate": 6.548872393621578e-06, |
|
"loss": 0.5465, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.320754716981132, |
|
"grad_norm": 0.9494639965897911, |
|
"learning_rate": 6.241724178857621e-06, |
|
"loss": 0.5531, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.3396226415094339, |
|
"grad_norm": 0.998464274247554, |
|
"learning_rate": 5.938651196663865e-06, |
|
"loss": 0.5528, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.3584905660377358, |
|
"grad_norm": 0.9719381109648194, |
|
"learning_rate": 5.6399820798887266e-06, |
|
"loss": 0.5351, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.3773584905660377, |
|
"grad_norm": 0.9816121465159701, |
|
"learning_rate": 5.346040686112189e-06, |
|
"loss": 0.551, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.3962264150943398, |
|
"grad_norm": 0.9743057773598173, |
|
"learning_rate": 5.0571457464755226e-06, |
|
"loss": 0.5345, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.4150943396226414, |
|
"grad_norm": 0.9293281745891194, |
|
"learning_rate": 4.773610520069706e-06, |
|
"loss": 0.5455, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.4339622641509435, |
|
"grad_norm": 0.9573637805384148, |
|
"learning_rate": 4.495742454257418e-06, |
|
"loss": 0.546, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.4528301886792452, |
|
"grad_norm": 0.9286559221860932, |
|
"learning_rate": 4.223842851296907e-06, |
|
"loss": 0.5179, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.4716981132075473, |
|
"grad_norm": 0.9387936040725331, |
|
"learning_rate": 3.9582065416291926e-06, |
|
"loss": 0.5561, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.490566037735849, |
|
"grad_norm": 0.9706784805714281, |
|
"learning_rate": 3.6991215641828903e-06, |
|
"loss": 0.5471, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.509433962264151, |
|
"grad_norm": 0.998331330776895, |
|
"learning_rate": 3.4468688540433425e-06, |
|
"loss": 0.5462, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.5283018867924527, |
|
"grad_norm": 0.9387337728890874, |
|
"learning_rate": 3.2017219378246734e-06, |
|
"loss": 0.5658, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.5471698113207548, |
|
"grad_norm": 0.9419101032950977, |
|
"learning_rate": 2.963946637075107e-06, |
|
"loss": 0.5416, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.5660377358490565, |
|
"grad_norm": 0.9575101628921765, |
|
"learning_rate": 2.7338007800372024e-06, |
|
"loss": 0.5323, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.5849056603773586, |
|
"grad_norm": 1.0034727804533015, |
|
"learning_rate": 2.5115339220754796e-06, |
|
"loss": 0.5402, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.6037735849056602, |
|
"grad_norm": 7.5029005576343195, |
|
"learning_rate": 2.2973870750746253e-06, |
|
"loss": 0.5655, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.6226415094339623, |
|
"grad_norm": 1.0027923672255161, |
|
"learning_rate": 2.09159244610172e-06, |
|
"loss": 0.5542, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.641509433962264, |
|
"grad_norm": 0.9357231473507377, |
|
"learning_rate": 1.8943731856158299e-06, |
|
"loss": 0.6383, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.6603773584905661, |
|
"grad_norm": 0.9686189698665412, |
|
"learning_rate": 1.7059431454979825e-06, |
|
"loss": 0.5562, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.6792452830188678, |
|
"grad_norm": 0.9045883081564006, |
|
"learning_rate": 1.5265066471639701e-06, |
|
"loss": 0.5348, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.6981132075471699, |
|
"grad_norm": 1.0056737797057134, |
|
"learning_rate": 1.3562582600113295e-06, |
|
"loss": 0.5352, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.7169811320754715, |
|
"grad_norm": 0.9512535781198771, |
|
"learning_rate": 1.1953825904408033e-06, |
|
"loss": 0.5464, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.7358490566037736, |
|
"grad_norm": 0.9930921183835165, |
|
"learning_rate": 1.0440540816810395e-06, |
|
"loss": 0.561, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.7547169811320755, |
|
"grad_norm": 0.9410700940787784, |
|
"learning_rate": 9.024368246335735e-07, |
|
"loss": 0.5456, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.7735849056603774, |
|
"grad_norm": 0.9323535925403078, |
|
"learning_rate": 7.706843799431985e-07, |
|
"loss": 0.5243, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.7924528301886793, |
|
"grad_norm": 0.9689906420605915, |
|
"learning_rate": 6.489396114866942e-07, |
|
"loss": 0.5296, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.8113207547169812, |
|
"grad_norm": 1.2229314873705728, |
|
"learning_rate": 5.373345314604206e-07, |
|
"loss": 0.53, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.830188679245283, |
|
"grad_norm": 0.9545280104598107, |
|
"learning_rate": 4.359901572347758e-07, |
|
"loss": 0.5031, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.849056603773585, |
|
"grad_norm": 0.9636644642636012, |
|
"learning_rate": 3.450163801307582e-07, |
|
"loss": 0.5159, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.8679245283018868, |
|
"grad_norm": 0.9314616621215778, |
|
"learning_rate": 2.6451184626087646e-07, |
|
"loss": 0.5212, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.8867924528301887, |
|
"grad_norm": 0.9019228681615893, |
|
"learning_rate": 1.9456384956365149e-07, |
|
"loss": 0.539, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.9056603773584906, |
|
"grad_norm": 0.8782721096424342, |
|
"learning_rate": 1.3524823714768375e-07, |
|
"loss": 0.5462, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.9245283018867925, |
|
"grad_norm": 0.9377029041620999, |
|
"learning_rate": 8.662932704792793e-08, |
|
"loss": 0.5352, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.9433962264150944, |
|
"grad_norm": 0.9313929690259365, |
|
"learning_rate": 4.8759838483358745e-08, |
|
"loss": 0.5129, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.9622641509433962, |
|
"grad_norm": 0.9729472297721098, |
|
"learning_rate": 2.1680834691628627e-08, |
|
"loss": 0.5358, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.9811320754716981, |
|
"grad_norm": 0.9341412228490686, |
|
"learning_rate": 5.421678402741659e-09, |
|
"loss": 0.5717, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.945793151587699, |
|
"learning_rate": 0.0, |
|
"loss": 0.5006, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.852841854095459, |
|
"eval_runtime": 6.8692, |
|
"eval_samples_per_second": 24.894, |
|
"eval_steps_per_second": 0.437, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 530, |
|
"total_flos": 110971217510400.0, |
|
"train_loss": 0.7231714307137256, |
|
"train_runtime": 3756.5168, |
|
"train_samples_per_second": 9.007, |
|
"train_steps_per_second": 0.141 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 530, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 110971217510400.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|