{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99626400996264, "eval_steps": 500, "global_step": 1203, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024906600249066, "grad_norm": 44.361316887243134, "learning_rate": 5e-06, "loss": 1.0469, "step": 10 }, { "epoch": 0.049813200498132, "grad_norm": 1.9340361918852786, "learning_rate": 5e-06, "loss": 0.9485, "step": 20 }, { "epoch": 0.074719800747198, "grad_norm": 1.5810624474405115, "learning_rate": 5e-06, "loss": 0.9118, "step": 30 }, { "epoch": 0.099626400996264, "grad_norm": 1.0226804734735437, "learning_rate": 5e-06, "loss": 0.896, "step": 40 }, { "epoch": 0.12453300124533001, "grad_norm": 0.8059494431972822, "learning_rate": 5e-06, "loss": 0.885, "step": 50 }, { "epoch": 0.149439601494396, "grad_norm": 1.0759666179613023, "learning_rate": 5e-06, "loss": 0.8743, "step": 60 }, { "epoch": 0.17434620174346202, "grad_norm": 0.6070805032397537, "learning_rate": 5e-06, "loss": 0.865, "step": 70 }, { "epoch": 0.199252801992528, "grad_norm": 0.8206282215157027, "learning_rate": 5e-06, "loss": 0.8637, "step": 80 }, { "epoch": 0.22415940224159403, "grad_norm": 0.8375312112666434, "learning_rate": 5e-06, "loss": 0.8538, "step": 90 }, { "epoch": 0.24906600249066002, "grad_norm": 0.8745465775067757, "learning_rate": 5e-06, "loss": 0.8462, "step": 100 }, { "epoch": 0.273972602739726, "grad_norm": 0.7671374593440573, "learning_rate": 5e-06, "loss": 0.8516, "step": 110 }, { "epoch": 0.298879202988792, "grad_norm": 0.6357492120016651, "learning_rate": 5e-06, "loss": 0.8424, "step": 120 }, { "epoch": 0.32378580323785805, "grad_norm": 0.7985944551248969, "learning_rate": 5e-06, "loss": 0.838, "step": 130 }, { "epoch": 0.34869240348692404, "grad_norm": 0.6906428387839867, "learning_rate": 5e-06, "loss": 0.8425, "step": 140 }, { "epoch": 0.37359900373599003, "grad_norm": 0.63382119448828, "learning_rate": 5e-06, "loss": 0.8394, "step": 150 }, { "epoch": 0.398505603985056, "grad_norm": 0.7372274527250459, "learning_rate": 5e-06, "loss": 0.8331, "step": 160 }, { "epoch": 0.42341220423412207, "grad_norm": 0.6083911703654408, "learning_rate": 5e-06, "loss": 0.8326, "step": 170 }, { "epoch": 0.44831880448318806, "grad_norm": 1.1261618495477859, "learning_rate": 5e-06, "loss": 0.8279, "step": 180 }, { "epoch": 0.47322540473225405, "grad_norm": 0.7015056078596724, "learning_rate": 5e-06, "loss": 0.8289, "step": 190 }, { "epoch": 0.49813200498132004, "grad_norm": 0.7547586360167027, "learning_rate": 5e-06, "loss": 0.8269, "step": 200 }, { "epoch": 0.523038605230386, "grad_norm": 0.729606315703109, "learning_rate": 5e-06, "loss": 0.827, "step": 210 }, { "epoch": 0.547945205479452, "grad_norm": 0.576263809426071, "learning_rate": 5e-06, "loss": 0.8278, "step": 220 }, { "epoch": 0.572851805728518, "grad_norm": 0.608656960705352, "learning_rate": 5e-06, "loss": 0.8255, "step": 230 }, { "epoch": 0.597758405977584, "grad_norm": 0.6942678291488105, "learning_rate": 5e-06, "loss": 0.8247, "step": 240 }, { "epoch": 0.6226650062266501, "grad_norm": 0.8474112920696388, "learning_rate": 5e-06, "loss": 0.8211, "step": 250 }, { "epoch": 0.6475716064757161, "grad_norm": 0.7436541561484579, "learning_rate": 5e-06, "loss": 0.8205, "step": 260 }, { "epoch": 0.6724782067247821, "grad_norm": 0.5723501381180358, "learning_rate": 5e-06, "loss": 0.8175, "step": 270 }, { "epoch": 0.6973848069738481, "grad_norm": 0.7393336592407068, "learning_rate": 5e-06, "loss": 0.8266, "step": 280 }, { "epoch": 0.7222914072229141, "grad_norm": 0.8642437433242355, "learning_rate": 5e-06, "loss": 0.8213, "step": 290 }, { "epoch": 0.7471980074719801, "grad_norm": 0.6599736959065436, "learning_rate": 5e-06, "loss": 0.8197, "step": 300 }, { "epoch": 0.772104607721046, "grad_norm": 0.589894020890247, "learning_rate": 5e-06, "loss": 0.8171, "step": 310 }, { "epoch": 0.797011207970112, "grad_norm": 0.6770015328448542, "learning_rate": 5e-06, "loss": 0.8159, "step": 320 }, { "epoch": 0.821917808219178, "grad_norm": 0.6953955951335576, "learning_rate": 5e-06, "loss": 0.8156, "step": 330 }, { "epoch": 0.8468244084682441, "grad_norm": 0.8401761226093455, "learning_rate": 5e-06, "loss": 0.8136, "step": 340 }, { "epoch": 0.8717310087173101, "grad_norm": 0.6479655559695816, "learning_rate": 5e-06, "loss": 0.8091, "step": 350 }, { "epoch": 0.8966376089663761, "grad_norm": 0.7636033751591921, "learning_rate": 5e-06, "loss": 0.8127, "step": 360 }, { "epoch": 0.9215442092154421, "grad_norm": 0.5680882933927079, "learning_rate": 5e-06, "loss": 0.8139, "step": 370 }, { "epoch": 0.9464508094645081, "grad_norm": 0.5317095758960971, "learning_rate": 5e-06, "loss": 0.8148, "step": 380 }, { "epoch": 0.9713574097135741, "grad_norm": 0.5355215121901621, "learning_rate": 5e-06, "loss": 0.8133, "step": 390 }, { "epoch": 0.9962640099626401, "grad_norm": 0.5034767977871308, "learning_rate": 5e-06, "loss": 0.8102, "step": 400 }, { "epoch": 0.9987546699875467, "eval_loss": 0.8109647631645203, "eval_runtime": 429.8587, "eval_samples_per_second": 25.166, "eval_steps_per_second": 0.395, "step": 401 }, { "epoch": 1.0211706102117062, "grad_norm": 0.7772306063471781, "learning_rate": 5e-06, "loss": 0.8079, "step": 410 }, { "epoch": 1.046077210460772, "grad_norm": 0.547172758232467, "learning_rate": 5e-06, "loss": 0.7693, "step": 420 }, { "epoch": 1.0709838107098382, "grad_norm": 0.6602663338851202, "learning_rate": 5e-06, "loss": 0.7659, "step": 430 }, { "epoch": 1.095890410958904, "grad_norm": 0.5563021526867751, "learning_rate": 5e-06, "loss": 0.7669, "step": 440 }, { "epoch": 1.1207970112079702, "grad_norm": 0.5764004393923637, "learning_rate": 5e-06, "loss": 0.7676, "step": 450 }, { "epoch": 1.145703611457036, "grad_norm": 0.6104368430877777, "learning_rate": 5e-06, "loss": 0.7668, "step": 460 }, { "epoch": 1.1706102117061021, "grad_norm": 0.5856299384522291, "learning_rate": 5e-06, "loss": 0.7653, "step": 470 }, { "epoch": 1.195516811955168, "grad_norm": 0.5968500894238352, "learning_rate": 5e-06, "loss": 0.7691, "step": 480 }, { "epoch": 1.2204234122042341, "grad_norm": 0.6425311166512483, "learning_rate": 5e-06, "loss": 0.7692, "step": 490 }, { "epoch": 1.2453300124533002, "grad_norm": 0.5800761501783642, "learning_rate": 5e-06, "loss": 0.77, "step": 500 }, { "epoch": 1.270236612702366, "grad_norm": 0.5217881601799819, "learning_rate": 5e-06, "loss": 0.7649, "step": 510 }, { "epoch": 1.2951432129514322, "grad_norm": 0.7577819320627684, "learning_rate": 5e-06, "loss": 0.7682, "step": 520 }, { "epoch": 1.320049813200498, "grad_norm": 0.7467713696988785, "learning_rate": 5e-06, "loss": 0.7712, "step": 530 }, { "epoch": 1.3449564134495642, "grad_norm": 0.6010822997576867, "learning_rate": 5e-06, "loss": 0.7664, "step": 540 }, { "epoch": 1.36986301369863, "grad_norm": 0.689181454056687, "learning_rate": 5e-06, "loss": 0.7679, "step": 550 }, { "epoch": 1.3947696139476962, "grad_norm": 0.4747660184884877, "learning_rate": 5e-06, "loss": 0.7639, "step": 560 }, { "epoch": 1.419676214196762, "grad_norm": 0.5116161932838977, "learning_rate": 5e-06, "loss": 0.7677, "step": 570 }, { "epoch": 1.4445828144458281, "grad_norm": 0.6433790988167347, "learning_rate": 5e-06, "loss": 0.7626, "step": 580 }, { "epoch": 1.4694894146948942, "grad_norm": 0.6145972003931011, "learning_rate": 5e-06, "loss": 0.7639, "step": 590 }, { "epoch": 1.4943960149439601, "grad_norm": 0.5887457741602182, "learning_rate": 5e-06, "loss": 0.7612, "step": 600 }, { "epoch": 1.519302615193026, "grad_norm": 0.5628593594779383, "learning_rate": 5e-06, "loss": 0.7685, "step": 610 }, { "epoch": 1.544209215442092, "grad_norm": 0.49978624448408865, "learning_rate": 5e-06, "loss": 0.7655, "step": 620 }, { "epoch": 1.5691158156911582, "grad_norm": 0.5121970961880906, "learning_rate": 5e-06, "loss": 0.7646, "step": 630 }, { "epoch": 1.5940224159402243, "grad_norm": 0.5120901081120943, "learning_rate": 5e-06, "loss": 0.7633, "step": 640 }, { "epoch": 1.6189290161892902, "grad_norm": 0.5708046084852306, "learning_rate": 5e-06, "loss": 0.7701, "step": 650 }, { "epoch": 1.643835616438356, "grad_norm": 0.559772892922969, "learning_rate": 5e-06, "loss": 0.7671, "step": 660 }, { "epoch": 1.6687422166874222, "grad_norm": 0.508876685275154, "learning_rate": 5e-06, "loss": 0.7627, "step": 670 }, { "epoch": 1.6936488169364883, "grad_norm": 0.5547904679119214, "learning_rate": 5e-06, "loss": 0.7665, "step": 680 }, { "epoch": 1.7185554171855542, "grad_norm": 0.5327048566040764, "learning_rate": 5e-06, "loss": 0.7612, "step": 690 }, { "epoch": 1.74346201743462, "grad_norm": 0.5681641331800833, "learning_rate": 5e-06, "loss": 0.7625, "step": 700 }, { "epoch": 1.7683686176836861, "grad_norm": 0.5583754277477581, "learning_rate": 5e-06, "loss": 0.7678, "step": 710 }, { "epoch": 1.7932752179327522, "grad_norm": 0.5821109954208563, "learning_rate": 5e-06, "loss": 0.7641, "step": 720 }, { "epoch": 1.8181818181818183, "grad_norm": 0.6033561880814401, "learning_rate": 5e-06, "loss": 0.7671, "step": 730 }, { "epoch": 1.8430884184308842, "grad_norm": 0.6575859282775093, "learning_rate": 5e-06, "loss": 0.7569, "step": 740 }, { "epoch": 1.86799501867995, "grad_norm": 0.5332781614516378, "learning_rate": 5e-06, "loss": 0.7617, "step": 750 }, { "epoch": 1.8929016189290162, "grad_norm": 0.6171829234250781, "learning_rate": 5e-06, "loss": 0.7628, "step": 760 }, { "epoch": 1.9178082191780823, "grad_norm": 0.553381597192015, "learning_rate": 5e-06, "loss": 0.7623, "step": 770 }, { "epoch": 1.9427148194271482, "grad_norm": 0.5971496735780886, "learning_rate": 5e-06, "loss": 0.7595, "step": 780 }, { "epoch": 1.967621419676214, "grad_norm": 0.566450928468519, "learning_rate": 5e-06, "loss": 0.7613, "step": 790 }, { "epoch": 1.9925280199252802, "grad_norm": 0.6533740130175245, "learning_rate": 5e-06, "loss": 0.7613, "step": 800 }, { "epoch": 2.0, "eval_loss": 0.798328697681427, "eval_runtime": 427.8716, "eval_samples_per_second": 25.283, "eval_steps_per_second": 0.397, "step": 803 }, { "epoch": 2.0174346201743463, "grad_norm": 0.9428578537809662, "learning_rate": 5e-06, "loss": 0.764, "step": 810 }, { "epoch": 2.0423412204234124, "grad_norm": 0.682790769401012, "learning_rate": 5e-06, "loss": 0.7174, "step": 820 }, { "epoch": 2.067247820672478, "grad_norm": 0.732474650025201, "learning_rate": 5e-06, "loss": 0.7136, "step": 830 }, { "epoch": 2.092154420921544, "grad_norm": 0.5517924405803882, "learning_rate": 5e-06, "loss": 0.7141, "step": 840 }, { "epoch": 2.1170610211706102, "grad_norm": 0.527980258175362, "learning_rate": 5e-06, "loss": 0.7205, "step": 850 }, { "epoch": 2.1419676214196763, "grad_norm": 0.5578355324627287, "learning_rate": 5e-06, "loss": 0.7182, "step": 860 }, { "epoch": 2.166874221668742, "grad_norm": 0.5384061514408854, "learning_rate": 5e-06, "loss": 0.7174, "step": 870 }, { "epoch": 2.191780821917808, "grad_norm": 0.5433323621552549, "learning_rate": 5e-06, "loss": 0.7227, "step": 880 }, { "epoch": 2.216687422166874, "grad_norm": 0.5934434020270568, "learning_rate": 5e-06, "loss": 0.7154, "step": 890 }, { "epoch": 2.2415940224159403, "grad_norm": 0.5610116690136854, "learning_rate": 5e-06, "loss": 0.7181, "step": 900 }, { "epoch": 2.2665006226650064, "grad_norm": 0.5956518936383002, "learning_rate": 5e-06, "loss": 0.7188, "step": 910 }, { "epoch": 2.291407222914072, "grad_norm": 0.5700434018521554, "learning_rate": 5e-06, "loss": 0.7189, "step": 920 }, { "epoch": 2.316313823163138, "grad_norm": 0.6159365804430498, "learning_rate": 5e-06, "loss": 0.7208, "step": 930 }, { "epoch": 2.3412204234122043, "grad_norm": 0.5739840813262334, "learning_rate": 5e-06, "loss": 0.7191, "step": 940 }, { "epoch": 2.3661270236612704, "grad_norm": 0.517298472276118, "learning_rate": 5e-06, "loss": 0.7254, "step": 950 }, { "epoch": 2.391033623910336, "grad_norm": 0.5309037963536546, "learning_rate": 5e-06, "loss": 0.7177, "step": 960 }, { "epoch": 2.415940224159402, "grad_norm": 0.5464343303315381, "learning_rate": 5e-06, "loss": 0.7162, "step": 970 }, { "epoch": 2.4408468244084682, "grad_norm": 0.5884939657248605, "learning_rate": 5e-06, "loss": 0.7238, "step": 980 }, { "epoch": 2.4657534246575343, "grad_norm": 0.6058278477423068, "learning_rate": 5e-06, "loss": 0.7217, "step": 990 }, { "epoch": 2.4906600249066004, "grad_norm": 0.575706246130651, "learning_rate": 5e-06, "loss": 0.7211, "step": 1000 }, { "epoch": 2.515566625155666, "grad_norm": 0.6210182727077225, "learning_rate": 5e-06, "loss": 0.722, "step": 1010 }, { "epoch": 2.540473225404732, "grad_norm": 0.6248334338554098, "learning_rate": 5e-06, "loss": 0.7226, "step": 1020 }, { "epoch": 2.5653798256537983, "grad_norm": 0.6075603863013977, "learning_rate": 5e-06, "loss": 0.7201, "step": 1030 }, { "epoch": 2.5902864259028644, "grad_norm": 0.6125989005343908, "learning_rate": 5e-06, "loss": 0.7225, "step": 1040 }, { "epoch": 2.61519302615193, "grad_norm": 0.5723698102141317, "learning_rate": 5e-06, "loss": 0.7184, "step": 1050 }, { "epoch": 2.640099626400996, "grad_norm": 0.5988876404053375, "learning_rate": 5e-06, "loss": 0.7228, "step": 1060 }, { "epoch": 2.6650062266500623, "grad_norm": 0.5535541669685047, "learning_rate": 5e-06, "loss": 0.7219, "step": 1070 }, { "epoch": 2.6899128268991284, "grad_norm": 0.6560134586878092, "learning_rate": 5e-06, "loss": 0.7231, "step": 1080 }, { "epoch": 2.7148194271481945, "grad_norm": 0.570154529031656, "learning_rate": 5e-06, "loss": 0.7207, "step": 1090 }, { "epoch": 2.73972602739726, "grad_norm": 0.6903793080558596, "learning_rate": 5e-06, "loss": 0.7225, "step": 1100 }, { "epoch": 2.7646326276463262, "grad_norm": 0.609309475396782, "learning_rate": 5e-06, "loss": 0.7199, "step": 1110 }, { "epoch": 2.7895392278953923, "grad_norm": 0.4982077265492007, "learning_rate": 5e-06, "loss": 0.723, "step": 1120 }, { "epoch": 2.8144458281444584, "grad_norm": 0.5520401600798728, "learning_rate": 5e-06, "loss": 0.7195, "step": 1130 }, { "epoch": 2.839352428393524, "grad_norm": 0.5678772706098874, "learning_rate": 5e-06, "loss": 0.7241, "step": 1140 }, { "epoch": 2.86425902864259, "grad_norm": 0.6919987752510048, "learning_rate": 5e-06, "loss": 0.7218, "step": 1150 }, { "epoch": 2.8891656288916563, "grad_norm": 0.5523800519721218, "learning_rate": 5e-06, "loss": 0.7223, "step": 1160 }, { "epoch": 2.9140722291407224, "grad_norm": 0.5786175424826561, "learning_rate": 5e-06, "loss": 0.7248, "step": 1170 }, { "epoch": 2.9389788293897885, "grad_norm": 0.5805260846296417, "learning_rate": 5e-06, "loss": 0.7186, "step": 1180 }, { "epoch": 2.963885429638854, "grad_norm": 0.6087027130014465, "learning_rate": 5e-06, "loss": 0.7225, "step": 1190 }, { "epoch": 2.9887920298879203, "grad_norm": 0.6138969910749299, "learning_rate": 5e-06, "loss": 0.7228, "step": 1200 }, { "epoch": 2.99626400996264, "eval_loss": 0.7962795495986938, "eval_runtime": 431.1284, "eval_samples_per_second": 25.092, "eval_steps_per_second": 0.394, "step": 1203 }, { "epoch": 2.99626400996264, "step": 1203, "total_flos": 2014860426608640.0, "train_loss": 0.7768312251676843, "train_runtime": 70934.0832, "train_samples_per_second": 8.693, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 1203, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2014860426608640.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }