{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.99626400996264,
  "eval_steps": 500,
  "global_step": 1203,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.024906600249066,
      "grad_norm": 44.361316887243134,
      "learning_rate": 5e-06,
      "loss": 1.0469,
      "step": 10
    },
    {
      "epoch": 0.049813200498132,
      "grad_norm": 1.9340361918852786,
      "learning_rate": 5e-06,
      "loss": 0.9485,
      "step": 20
    },
    {
      "epoch": 0.074719800747198,
      "grad_norm": 1.5810624474405115,
      "learning_rate": 5e-06,
      "loss": 0.9118,
      "step": 30
    },
    {
      "epoch": 0.099626400996264,
      "grad_norm": 1.0226804734735437,
      "learning_rate": 5e-06,
      "loss": 0.896,
      "step": 40
    },
    {
      "epoch": 0.12453300124533001,
      "grad_norm": 0.8059494431972822,
      "learning_rate": 5e-06,
      "loss": 0.885,
      "step": 50
    },
    {
      "epoch": 0.149439601494396,
      "grad_norm": 1.0759666179613023,
      "learning_rate": 5e-06,
      "loss": 0.8743,
      "step": 60
    },
    {
      "epoch": 0.17434620174346202,
      "grad_norm": 0.6070805032397537,
      "learning_rate": 5e-06,
      "loss": 0.865,
      "step": 70
    },
    {
      "epoch": 0.199252801992528,
      "grad_norm": 0.8206282215157027,
      "learning_rate": 5e-06,
      "loss": 0.8637,
      "step": 80
    },
    {
      "epoch": 0.22415940224159403,
      "grad_norm": 0.8375312112666434,
      "learning_rate": 5e-06,
      "loss": 0.8538,
      "step": 90
    },
    {
      "epoch": 0.24906600249066002,
      "grad_norm": 0.8745465775067757,
      "learning_rate": 5e-06,
      "loss": 0.8462,
      "step": 100
    },
    {
      "epoch": 0.273972602739726,
      "grad_norm": 0.7671374593440573,
      "learning_rate": 5e-06,
      "loss": 0.8516,
      "step": 110
    },
    {
      "epoch": 0.298879202988792,
      "grad_norm": 0.6357492120016651,
      "learning_rate": 5e-06,
      "loss": 0.8424,
      "step": 120
    },
    {
      "epoch": 0.32378580323785805,
      "grad_norm": 0.7985944551248969,
      "learning_rate": 5e-06,
      "loss": 0.838,
      "step": 130
    },
    {
      "epoch": 0.34869240348692404,
      "grad_norm": 0.6906428387839867,
      "learning_rate": 5e-06,
      "loss": 0.8425,
      "step": 140
    },
    {
      "epoch": 0.37359900373599003,
      "grad_norm": 0.63382119448828,
      "learning_rate": 5e-06,
      "loss": 0.8394,
      "step": 150
    },
    {
      "epoch": 0.398505603985056,
      "grad_norm": 0.7372274527250459,
      "learning_rate": 5e-06,
      "loss": 0.8331,
      "step": 160
    },
    {
      "epoch": 0.42341220423412207,
      "grad_norm": 0.6083911703654408,
      "learning_rate": 5e-06,
      "loss": 0.8326,
      "step": 170
    },
    {
      "epoch": 0.44831880448318806,
      "grad_norm": 1.1261618495477859,
      "learning_rate": 5e-06,
      "loss": 0.8279,
      "step": 180
    },
    {
      "epoch": 0.47322540473225405,
      "grad_norm": 0.7015056078596724,
      "learning_rate": 5e-06,
      "loss": 0.8289,
      "step": 190
    },
    {
      "epoch": 0.49813200498132004,
      "grad_norm": 0.7547586360167027,
      "learning_rate": 5e-06,
      "loss": 0.8269,
      "step": 200
    },
    {
      "epoch": 0.523038605230386,
      "grad_norm": 0.729606315703109,
      "learning_rate": 5e-06,
      "loss": 0.827,
      "step": 210
    },
    {
      "epoch": 0.547945205479452,
      "grad_norm": 0.576263809426071,
      "learning_rate": 5e-06,
      "loss": 0.8278,
      "step": 220
    },
    {
      "epoch": 0.572851805728518,
      "grad_norm": 0.608656960705352,
      "learning_rate": 5e-06,
      "loss": 0.8255,
      "step": 230
    },
    {
      "epoch": 0.597758405977584,
      "grad_norm": 0.6942678291488105,
      "learning_rate": 5e-06,
      "loss": 0.8247,
      "step": 240
    },
    {
      "epoch": 0.6226650062266501,
      "grad_norm": 0.8474112920696388,
      "learning_rate": 5e-06,
      "loss": 0.8211,
      "step": 250
    },
    {
      "epoch": 0.6475716064757161,
      "grad_norm": 0.7436541561484579,
      "learning_rate": 5e-06,
      "loss": 0.8205,
      "step": 260
    },
    {
      "epoch": 0.6724782067247821,
      "grad_norm": 0.5723501381180358,
      "learning_rate": 5e-06,
      "loss": 0.8175,
      "step": 270
    },
    {
      "epoch": 0.6973848069738481,
      "grad_norm": 0.7393336592407068,
      "learning_rate": 5e-06,
      "loss": 0.8266,
      "step": 280
    },
    {
      "epoch": 0.7222914072229141,
      "grad_norm": 0.8642437433242355,
      "learning_rate": 5e-06,
      "loss": 0.8213,
      "step": 290
    },
    {
      "epoch": 0.7471980074719801,
      "grad_norm": 0.6599736959065436,
      "learning_rate": 5e-06,
      "loss": 0.8197,
      "step": 300
    },
    {
      "epoch": 0.772104607721046,
      "grad_norm": 0.589894020890247,
      "learning_rate": 5e-06,
      "loss": 0.8171,
      "step": 310
    },
    {
      "epoch": 0.797011207970112,
      "grad_norm": 0.6770015328448542,
      "learning_rate": 5e-06,
      "loss": 0.8159,
      "step": 320
    },
    {
      "epoch": 0.821917808219178,
      "grad_norm": 0.6953955951335576,
      "learning_rate": 5e-06,
      "loss": 0.8156,
      "step": 330
    },
    {
      "epoch": 0.8468244084682441,
      "grad_norm": 0.8401761226093455,
      "learning_rate": 5e-06,
      "loss": 0.8136,
      "step": 340
    },
    {
      "epoch": 0.8717310087173101,
      "grad_norm": 0.6479655559695816,
      "learning_rate": 5e-06,
      "loss": 0.8091,
      "step": 350
    },
    {
      "epoch": 0.8966376089663761,
      "grad_norm": 0.7636033751591921,
      "learning_rate": 5e-06,
      "loss": 0.8127,
      "step": 360
    },
    {
      "epoch": 0.9215442092154421,
      "grad_norm": 0.5680882933927079,
      "learning_rate": 5e-06,
      "loss": 0.8139,
      "step": 370
    },
    {
      "epoch": 0.9464508094645081,
      "grad_norm": 0.5317095758960971,
      "learning_rate": 5e-06,
      "loss": 0.8148,
      "step": 380
    },
    {
      "epoch": 0.9713574097135741,
      "grad_norm": 0.5355215121901621,
      "learning_rate": 5e-06,
      "loss": 0.8133,
      "step": 390
    },
    {
      "epoch": 0.9962640099626401,
      "grad_norm": 0.5034767977871308,
      "learning_rate": 5e-06,
      "loss": 0.8102,
      "step": 400
    },
    {
      "epoch": 0.9987546699875467,
      "eval_loss": 0.8109647631645203,
      "eval_runtime": 429.8587,
      "eval_samples_per_second": 25.166,
      "eval_steps_per_second": 0.395,
      "step": 401
    },
    {
      "epoch": 1.0211706102117062,
      "grad_norm": 0.7772306063471781,
      "learning_rate": 5e-06,
      "loss": 0.8079,
      "step": 410
    },
    {
      "epoch": 1.046077210460772,
      "grad_norm": 0.547172758232467,
      "learning_rate": 5e-06,
      "loss": 0.7693,
      "step": 420
    },
    {
      "epoch": 1.0709838107098382,
      "grad_norm": 0.6602663338851202,
      "learning_rate": 5e-06,
      "loss": 0.7659,
      "step": 430
    },
    {
      "epoch": 1.095890410958904,
      "grad_norm": 0.5563021526867751,
      "learning_rate": 5e-06,
      "loss": 0.7669,
      "step": 440
    },
    {
      "epoch": 1.1207970112079702,
      "grad_norm": 0.5764004393923637,
      "learning_rate": 5e-06,
      "loss": 0.7676,
      "step": 450
    },
    {
      "epoch": 1.145703611457036,
      "grad_norm": 0.6104368430877777,
      "learning_rate": 5e-06,
      "loss": 0.7668,
      "step": 460
    },
    {
      "epoch": 1.1706102117061021,
      "grad_norm": 0.5856299384522291,
      "learning_rate": 5e-06,
      "loss": 0.7653,
      "step": 470
    },
    {
      "epoch": 1.195516811955168,
      "grad_norm": 0.5968500894238352,
      "learning_rate": 5e-06,
      "loss": 0.7691,
      "step": 480
    },
    {
      "epoch": 1.2204234122042341,
      "grad_norm": 0.6425311166512483,
      "learning_rate": 5e-06,
      "loss": 0.7692,
      "step": 490
    },
    {
      "epoch": 1.2453300124533002,
      "grad_norm": 0.5800761501783642,
      "learning_rate": 5e-06,
      "loss": 0.77,
      "step": 500
    },
    {
      "epoch": 1.270236612702366,
      "grad_norm": 0.5217881601799819,
      "learning_rate": 5e-06,
      "loss": 0.7649,
      "step": 510
    },
    {
      "epoch": 1.2951432129514322,
      "grad_norm": 0.7577819320627684,
      "learning_rate": 5e-06,
      "loss": 0.7682,
      "step": 520
    },
    {
      "epoch": 1.320049813200498,
      "grad_norm": 0.7467713696988785,
      "learning_rate": 5e-06,
      "loss": 0.7712,
      "step": 530
    },
    {
      "epoch": 1.3449564134495642,
      "grad_norm": 0.6010822997576867,
      "learning_rate": 5e-06,
      "loss": 0.7664,
      "step": 540
    },
    {
      "epoch": 1.36986301369863,
      "grad_norm": 0.689181454056687,
      "learning_rate": 5e-06,
      "loss": 0.7679,
      "step": 550
    },
    {
      "epoch": 1.3947696139476962,
      "grad_norm": 0.4747660184884877,
      "learning_rate": 5e-06,
      "loss": 0.7639,
      "step": 560
    },
    {
      "epoch": 1.419676214196762,
      "grad_norm": 0.5116161932838977,
      "learning_rate": 5e-06,
      "loss": 0.7677,
      "step": 570
    },
    {
      "epoch": 1.4445828144458281,
      "grad_norm": 0.6433790988167347,
      "learning_rate": 5e-06,
      "loss": 0.7626,
      "step": 580
    },
    {
      "epoch": 1.4694894146948942,
      "grad_norm": 0.6145972003931011,
      "learning_rate": 5e-06,
      "loss": 0.7639,
      "step": 590
    },
    {
      "epoch": 1.4943960149439601,
      "grad_norm": 0.5887457741602182,
      "learning_rate": 5e-06,
      "loss": 0.7612,
      "step": 600
    },
    {
      "epoch": 1.519302615193026,
      "grad_norm": 0.5628593594779383,
      "learning_rate": 5e-06,
      "loss": 0.7685,
      "step": 610
    },
    {
      "epoch": 1.544209215442092,
      "grad_norm": 0.49978624448408865,
      "learning_rate": 5e-06,
      "loss": 0.7655,
      "step": 620
    },
    {
      "epoch": 1.5691158156911582,
      "grad_norm": 0.5121970961880906,
      "learning_rate": 5e-06,
      "loss": 0.7646,
      "step": 630
    },
    {
      "epoch": 1.5940224159402243,
      "grad_norm": 0.5120901081120943,
      "learning_rate": 5e-06,
      "loss": 0.7633,
      "step": 640
    },
    {
      "epoch": 1.6189290161892902,
      "grad_norm": 0.5708046084852306,
      "learning_rate": 5e-06,
      "loss": 0.7701,
      "step": 650
    },
    {
      "epoch": 1.643835616438356,
      "grad_norm": 0.559772892922969,
      "learning_rate": 5e-06,
      "loss": 0.7671,
      "step": 660
    },
    {
      "epoch": 1.6687422166874222,
      "grad_norm": 0.508876685275154,
      "learning_rate": 5e-06,
      "loss": 0.7627,
      "step": 670
    },
    {
      "epoch": 1.6936488169364883,
      "grad_norm": 0.5547904679119214,
      "learning_rate": 5e-06,
      "loss": 0.7665,
      "step": 680
    },
    {
      "epoch": 1.7185554171855542,
      "grad_norm": 0.5327048566040764,
      "learning_rate": 5e-06,
      "loss": 0.7612,
      "step": 690
    },
    {
      "epoch": 1.74346201743462,
      "grad_norm": 0.5681641331800833,
      "learning_rate": 5e-06,
      "loss": 0.7625,
      "step": 700
    },
    {
      "epoch": 1.7683686176836861,
      "grad_norm": 0.5583754277477581,
      "learning_rate": 5e-06,
      "loss": 0.7678,
      "step": 710
    },
    {
      "epoch": 1.7932752179327522,
      "grad_norm": 0.5821109954208563,
      "learning_rate": 5e-06,
      "loss": 0.7641,
      "step": 720
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.6033561880814401,
      "learning_rate": 5e-06,
      "loss": 0.7671,
      "step": 730
    },
    {
      "epoch": 1.8430884184308842,
      "grad_norm": 0.6575859282775093,
      "learning_rate": 5e-06,
      "loss": 0.7569,
      "step": 740
    },
    {
      "epoch": 1.86799501867995,
      "grad_norm": 0.5332781614516378,
      "learning_rate": 5e-06,
      "loss": 0.7617,
      "step": 750
    },
    {
      "epoch": 1.8929016189290162,
      "grad_norm": 0.6171829234250781,
      "learning_rate": 5e-06,
      "loss": 0.7628,
      "step": 760
    },
    {
      "epoch": 1.9178082191780823,
      "grad_norm": 0.553381597192015,
      "learning_rate": 5e-06,
      "loss": 0.7623,
      "step": 770
    },
    {
      "epoch": 1.9427148194271482,
      "grad_norm": 0.5971496735780886,
      "learning_rate": 5e-06,
      "loss": 0.7595,
      "step": 780
    },
    {
      "epoch": 1.967621419676214,
      "grad_norm": 0.566450928468519,
      "learning_rate": 5e-06,
      "loss": 0.7613,
      "step": 790
    },
    {
      "epoch": 1.9925280199252802,
      "grad_norm": 0.6533740130175245,
      "learning_rate": 5e-06,
      "loss": 0.7613,
      "step": 800
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.798328697681427,
      "eval_runtime": 427.8716,
      "eval_samples_per_second": 25.283,
      "eval_steps_per_second": 0.397,
      "step": 803
    },
    {
      "epoch": 2.0174346201743463,
      "grad_norm": 0.9428578537809662,
      "learning_rate": 5e-06,
      "loss": 0.764,
      "step": 810
    },
    {
      "epoch": 2.0423412204234124,
      "grad_norm": 0.682790769401012,
      "learning_rate": 5e-06,
      "loss": 0.7174,
      "step": 820
    },
    {
      "epoch": 2.067247820672478,
      "grad_norm": 0.732474650025201,
      "learning_rate": 5e-06,
      "loss": 0.7136,
      "step": 830
    },
    {
      "epoch": 2.092154420921544,
      "grad_norm": 0.5517924405803882,
      "learning_rate": 5e-06,
      "loss": 0.7141,
      "step": 840
    },
    {
      "epoch": 2.1170610211706102,
      "grad_norm": 0.527980258175362,
      "learning_rate": 5e-06,
      "loss": 0.7205,
      "step": 850
    },
    {
      "epoch": 2.1419676214196763,
      "grad_norm": 0.5578355324627287,
      "learning_rate": 5e-06,
      "loss": 0.7182,
      "step": 860
    },
    {
      "epoch": 2.166874221668742,
      "grad_norm": 0.5384061514408854,
      "learning_rate": 5e-06,
      "loss": 0.7174,
      "step": 870
    },
    {
      "epoch": 2.191780821917808,
      "grad_norm": 0.5433323621552549,
      "learning_rate": 5e-06,
      "loss": 0.7227,
      "step": 880
    },
    {
      "epoch": 2.216687422166874,
      "grad_norm": 0.5934434020270568,
      "learning_rate": 5e-06,
      "loss": 0.7154,
      "step": 890
    },
    {
      "epoch": 2.2415940224159403,
      "grad_norm": 0.5610116690136854,
      "learning_rate": 5e-06,
      "loss": 0.7181,
      "step": 900
    },
    {
      "epoch": 2.2665006226650064,
      "grad_norm": 0.5956518936383002,
      "learning_rate": 5e-06,
      "loss": 0.7188,
      "step": 910
    },
    {
      "epoch": 2.291407222914072,
      "grad_norm": 0.5700434018521554,
      "learning_rate": 5e-06,
      "loss": 0.7189,
      "step": 920
    },
    {
      "epoch": 2.316313823163138,
      "grad_norm": 0.6159365804430498,
      "learning_rate": 5e-06,
      "loss": 0.7208,
      "step": 930
    },
    {
      "epoch": 2.3412204234122043,
      "grad_norm": 0.5739840813262334,
      "learning_rate": 5e-06,
      "loss": 0.7191,
      "step": 940
    },
    {
      "epoch": 2.3661270236612704,
      "grad_norm": 0.517298472276118,
      "learning_rate": 5e-06,
      "loss": 0.7254,
      "step": 950
    },
    {
      "epoch": 2.391033623910336,
      "grad_norm": 0.5309037963536546,
      "learning_rate": 5e-06,
      "loss": 0.7177,
      "step": 960
    },
    {
      "epoch": 2.415940224159402,
      "grad_norm": 0.5464343303315381,
      "learning_rate": 5e-06,
      "loss": 0.7162,
      "step": 970
    },
    {
      "epoch": 2.4408468244084682,
      "grad_norm": 0.5884939657248605,
      "learning_rate": 5e-06,
      "loss": 0.7238,
      "step": 980
    },
    {
      "epoch": 2.4657534246575343,
      "grad_norm": 0.6058278477423068,
      "learning_rate": 5e-06,
      "loss": 0.7217,
      "step": 990
    },
    {
      "epoch": 2.4906600249066004,
      "grad_norm": 0.575706246130651,
      "learning_rate": 5e-06,
      "loss": 0.7211,
      "step": 1000
    },
    {
      "epoch": 2.515566625155666,
      "grad_norm": 0.6210182727077225,
      "learning_rate": 5e-06,
      "loss": 0.722,
      "step": 1010
    },
    {
      "epoch": 2.540473225404732,
      "grad_norm": 0.6248334338554098,
      "learning_rate": 5e-06,
      "loss": 0.7226,
      "step": 1020
    },
    {
      "epoch": 2.5653798256537983,
      "grad_norm": 0.6075603863013977,
      "learning_rate": 5e-06,
      "loss": 0.7201,
      "step": 1030
    },
    {
      "epoch": 2.5902864259028644,
      "grad_norm": 0.6125989005343908,
      "learning_rate": 5e-06,
      "loss": 0.7225,
      "step": 1040
    },
    {
      "epoch": 2.61519302615193,
      "grad_norm": 0.5723698102141317,
      "learning_rate": 5e-06,
      "loss": 0.7184,
      "step": 1050
    },
    {
      "epoch": 2.640099626400996,
      "grad_norm": 0.5988876404053375,
      "learning_rate": 5e-06,
      "loss": 0.7228,
      "step": 1060
    },
    {
      "epoch": 2.6650062266500623,
      "grad_norm": 0.5535541669685047,
      "learning_rate": 5e-06,
      "loss": 0.7219,
      "step": 1070
    },
    {
      "epoch": 2.6899128268991284,
      "grad_norm": 0.6560134586878092,
      "learning_rate": 5e-06,
      "loss": 0.7231,
      "step": 1080
    },
    {
      "epoch": 2.7148194271481945,
      "grad_norm": 0.570154529031656,
      "learning_rate": 5e-06,
      "loss": 0.7207,
      "step": 1090
    },
    {
      "epoch": 2.73972602739726,
      "grad_norm": 0.6903793080558596,
      "learning_rate": 5e-06,
      "loss": 0.7225,
      "step": 1100
    },
    {
      "epoch": 2.7646326276463262,
      "grad_norm": 0.609309475396782,
      "learning_rate": 5e-06,
      "loss": 0.7199,
      "step": 1110
    },
    {
      "epoch": 2.7895392278953923,
      "grad_norm": 0.4982077265492007,
      "learning_rate": 5e-06,
      "loss": 0.723,
      "step": 1120
    },
    {
      "epoch": 2.8144458281444584,
      "grad_norm": 0.5520401600798728,
      "learning_rate": 5e-06,
      "loss": 0.7195,
      "step": 1130
    },
    {
      "epoch": 2.839352428393524,
      "grad_norm": 0.5678772706098874,
      "learning_rate": 5e-06,
      "loss": 0.7241,
      "step": 1140
    },
    {
      "epoch": 2.86425902864259,
      "grad_norm": 0.6919987752510048,
      "learning_rate": 5e-06,
      "loss": 0.7218,
      "step": 1150
    },
    {
      "epoch": 2.8891656288916563,
      "grad_norm": 0.5523800519721218,
      "learning_rate": 5e-06,
      "loss": 0.7223,
      "step": 1160
    },
    {
      "epoch": 2.9140722291407224,
      "grad_norm": 0.5786175424826561,
      "learning_rate": 5e-06,
      "loss": 0.7248,
      "step": 1170
    },
    {
      "epoch": 2.9389788293897885,
      "grad_norm": 0.5805260846296417,
      "learning_rate": 5e-06,
      "loss": 0.7186,
      "step": 1180
    },
    {
      "epoch": 2.963885429638854,
      "grad_norm": 0.6087027130014465,
      "learning_rate": 5e-06,
      "loss": 0.7225,
      "step": 1190
    },
    {
      "epoch": 2.9887920298879203,
      "grad_norm": 0.6138969910749299,
      "learning_rate": 5e-06,
      "loss": 0.7228,
      "step": 1200
    },
    {
      "epoch": 2.99626400996264,
      "eval_loss": 0.7962795495986938,
      "eval_runtime": 431.1284,
      "eval_samples_per_second": 25.092,
      "eval_steps_per_second": 0.394,
      "step": 1203
    },
    {
      "epoch": 2.99626400996264,
      "step": 1203,
      "total_flos": 2014860426608640.0,
      "train_loss": 0.7768312251676843,
      "train_runtime": 70934.0832,
      "train_samples_per_second": 8.693,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 1203,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2014860426608640.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}