{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 3.1039228439331055, "learning_rate": 1.3333333333333334e-06, "loss": 1.2535, "step": 10 }, { "epoch": 0.16, "grad_norm": 1.9736298322677612, "learning_rate": 2.666666666666667e-06, "loss": 1.1389, "step": 20 }, { "epoch": 0.24, "grad_norm": 0.9868311285972595, "learning_rate": 4.000000000000001e-06, "loss": 1.1137, "step": 30 }, { "epoch": 0.32, "grad_norm": 0.7625114321708679, "learning_rate": 5.333333333333334e-06, "loss": 1.0243, "step": 40 }, { "epoch": 0.4, "grad_norm": 0.8334870338439941, "learning_rate": 6.666666666666667e-06, "loss": 1.0505, "step": 50 }, { "epoch": 0.48, "grad_norm": 0.7253231406211853, "learning_rate": 8.000000000000001e-06, "loss": 0.9779, "step": 60 }, { "epoch": 0.56, "grad_norm": 0.6524099707603455, "learning_rate": 9.333333333333334e-06, "loss": 0.9797, "step": 70 }, { "epoch": 0.64, "grad_norm": 0.68406742811203, "learning_rate": 9.99864620589731e-06, "loss": 0.9878, "step": 80 }, { "epoch": 0.72, "grad_norm": 0.6620015501976013, "learning_rate": 9.987820251299121e-06, "loss": 0.9352, "step": 90 }, { "epoch": 0.8, "grad_norm": 0.7781227231025696, "learning_rate": 9.966191788709716e-06, "loss": 0.9205, "step": 100 }, { "epoch": 0.88, "grad_norm": 0.6981615424156189, "learning_rate": 9.933807660562898e-06, "loss": 0.9168, "step": 110 }, { "epoch": 0.96, "grad_norm": 0.5809632539749146, "learning_rate": 9.890738003669029e-06, "loss": 0.9345, "step": 120 }, { "epoch": 1.04, "grad_norm": 0.6306660771369934, "learning_rate": 9.83707609731432e-06, "loss": 0.8842, "step": 130 }, { "epoch": 1.12, "grad_norm": 0.645531952381134, "learning_rate": 9.77293816123866e-06, "loss": 0.8496, "step": 140 }, { "epoch": 1.2, "grad_norm": 0.7515069842338562, "learning_rate": 9.698463103929542e-06, "loss": 0.8134, "step": 150 }, { "epoch": 1.28, "grad_norm": 0.8647974133491516, "learning_rate": 9.613812221777212e-06, "loss": 0.8645, "step": 160 }, { "epoch": 1.3599999999999999, "grad_norm": 0.5082105398178101, "learning_rate": 9.519168849742603e-06, "loss": 0.8339, "step": 170 }, { "epoch": 1.44, "grad_norm": 0.6905492544174194, "learning_rate": 9.414737964294636e-06, "loss": 0.8524, "step": 180 }, { "epoch": 1.52, "grad_norm": 0.7012932896614075, "learning_rate": 9.30074573947683e-06, "loss": 0.8566, "step": 190 }, { "epoch": 1.6, "grad_norm": 0.6087114810943604, "learning_rate": 9.177439057064684e-06, "loss": 0.8107, "step": 200 }, { "epoch": 1.6800000000000002, "grad_norm": 0.7151578664779663, "learning_rate": 9.045084971874738e-06, "loss": 0.9001, "step": 210 }, { "epoch": 1.76, "grad_norm": 0.746478796005249, "learning_rate": 8.903970133383297e-06, "loss": 0.8522, "step": 220 }, { "epoch": 1.8399999999999999, "grad_norm": 0.5843214392662048, "learning_rate": 8.754400164907496e-06, "loss": 0.8164, "step": 230 }, { "epoch": 1.92, "grad_norm": 0.6262736916542053, "learning_rate": 8.596699001693257e-06, "loss": 0.8739, "step": 240 }, { "epoch": 2.0, "grad_norm": 0.6539023518562317, "learning_rate": 8.43120818934367e-06, "loss": 0.8353, "step": 250 }, { "epoch": 2.08, "grad_norm": 0.7221904397010803, "learning_rate": 8.258286144107277e-06, "loss": 0.7376, "step": 260 }, { "epoch": 2.16, "grad_norm": 0.6120831966400146, "learning_rate": 8.078307376628292e-06, "loss": 0.7247, "step": 270 }, { "epoch": 2.24, "grad_norm": 0.5770366787910461, "learning_rate": 7.891661680839932e-06, "loss": 0.6993, "step": 280 }, { "epoch": 2.32, "grad_norm": 0.4881764352321625, "learning_rate": 7.698753289757565e-06, "loss": 0.6871, "step": 290 }, { "epoch": 2.4, "grad_norm": 0.6157920360565186, "learning_rate": 7.500000000000001e-06, "loss": 0.7247, "step": 300 }, { "epoch": 2.48, "grad_norm": 0.5322688817977905, "learning_rate": 7.295832266935059e-06, "loss": 0.7549, "step": 310 }, { "epoch": 2.56, "grad_norm": 0.7124238014221191, "learning_rate": 7.08669227240909e-06, "loss": 0.6858, "step": 320 }, { "epoch": 2.64, "grad_norm": 0.5642361044883728, "learning_rate": 6.873032967079562e-06, "loss": 0.6737, "step": 330 }, { "epoch": 2.7199999999999998, "grad_norm": 0.7349158525466919, "learning_rate": 6.655317089424791e-06, "loss": 0.7092, "step": 340 }, { "epoch": 2.8, "grad_norm": 0.6204760670661926, "learning_rate": 6.434016163555452e-06, "loss": 0.6878, "step": 350 }, { "epoch": 2.88, "grad_norm": 0.4934922754764557, "learning_rate": 6.209609477998339e-06, "loss": 0.6883, "step": 360 }, { "epoch": 2.96, "grad_norm": 0.5973473787307739, "learning_rate": 5.982583047664151e-06, "loss": 0.7312, "step": 370 }, { "epoch": 3.04, "grad_norm": 0.9578109979629517, "learning_rate": 5.753428561247416e-06, "loss": 0.6466, "step": 380 }, { "epoch": 3.12, "grad_norm": 0.6420013904571533, "learning_rate": 5.522642316338268e-06, "loss": 0.5745, "step": 390 }, { "epoch": 3.2, "grad_norm": 0.6083827614784241, "learning_rate": 5.290724144552379e-06, "loss": 0.6074, "step": 400 }, { "epoch": 3.2800000000000002, "grad_norm": 0.5411021709442139, "learning_rate": 5.0581763290069865e-06, "loss": 0.5958, "step": 410 }, { "epoch": 3.36, "grad_norm": 0.542936384677887, "learning_rate": 4.825502516487497e-06, "loss": 0.517, "step": 420 }, { "epoch": 3.44, "grad_norm": 0.643763542175293, "learning_rate": 4.59320662666071e-06, "loss": 0.6125, "step": 430 }, { "epoch": 3.52, "grad_norm": 0.5902148485183716, "learning_rate": 4.361791760697027e-06, "loss": 0.5574, "step": 440 }, { "epoch": 3.6, "grad_norm": 0.5409561991691589, "learning_rate": 4.131759111665349e-06, "loss": 0.5619, "step": 450 }, { "epoch": 3.68, "grad_norm": 0.5303182601928711, "learning_rate": 3.903606879060483e-06, "loss": 0.5723, "step": 460 }, { "epoch": 3.76, "grad_norm": 0.5597081184387207, "learning_rate": 3.6778291898139907e-06, "loss": 0.57, "step": 470 }, { "epoch": 3.84, "grad_norm": 0.557049572467804, "learning_rate": 3.4549150281252635e-06, "loss": 0.558, "step": 480 }, { "epoch": 3.92, "grad_norm": 0.5532881021499634, "learning_rate": 3.2353471764306567e-06, "loss": 0.5642, "step": 490 }, { "epoch": 4.0, "grad_norm": 0.6130328178405762, "learning_rate": 3.019601169804216e-06, "loss": 0.582, "step": 500 }, { "epoch": 4.08, "grad_norm": 0.6685742139816284, "learning_rate": 2.8081442660546126e-06, "loss": 0.4826, "step": 510 }, { "epoch": 4.16, "grad_norm": 0.7372143268585205, "learning_rate": 2.601434433748771e-06, "loss": 0.4799, "step": 520 }, { "epoch": 4.24, "grad_norm": 0.5380725860595703, "learning_rate": 2.3999193603539234e-06, "loss": 0.4595, "step": 530 }, { "epoch": 4.32, "grad_norm": 0.611395537853241, "learning_rate": 2.204035482646267e-06, "loss": 0.4414, "step": 540 }, { "epoch": 4.4, "grad_norm": 0.6131682991981506, "learning_rate": 2.0142070414860704e-06, "loss": 0.42, "step": 550 }, { "epoch": 4.48, "grad_norm": 0.5948655605316162, "learning_rate": 1.8308451630064484e-06, "loss": 0.4228, "step": 560 }, { "epoch": 4.5600000000000005, "grad_norm": 0.5260858535766602, "learning_rate": 1.6543469682057105e-06, "loss": 0.4486, "step": 570 }, { "epoch": 4.64, "grad_norm": 0.6490086913108826, "learning_rate": 1.4850947128716914e-06, "loss": 0.4064, "step": 580 }, { "epoch": 4.72, "grad_norm": 0.512779951095581, "learning_rate": 1.3234549597008572e-06, "loss": 0.4806, "step": 590 }, { "epoch": 4.8, "grad_norm": 0.6591514945030212, "learning_rate": 1.1697777844051105e-06, "loss": 0.4537, "step": 600 }, { "epoch": 4.88, "grad_norm": 0.5841071605682373, "learning_rate": 1.0243960175257605e-06, "loss": 0.425, "step": 610 }, { "epoch": 4.96, "grad_norm": 0.5767258405685425, "learning_rate": 8.876245235966884e-07, "loss": 0.4903, "step": 620 }, { "epoch": 5.04, "grad_norm": 0.681662380695343, "learning_rate": 7.597595192178702e-07, "loss": 0.4259, "step": 630 }, { "epoch": 5.12, "grad_norm": 0.6221340894699097, "learning_rate": 6.410779315161885e-07, "loss": 0.4243, "step": 640 }, { "epoch": 5.2, "grad_norm": 0.5294339656829834, "learning_rate": 5.318367983829393e-07, "loss": 0.3387, "step": 650 }, { "epoch": 5.28, "grad_norm": 0.5295357704162598, "learning_rate": 4.322727117869951e-07, "loss": 0.3378, "step": 660 }, { "epoch": 5.36, "grad_norm": 0.5497756004333496, "learning_rate": 3.426013053692878e-07, "loss": 0.3857, "step": 670 }, { "epoch": 5.44, "grad_norm": 0.5334697961807251, "learning_rate": 2.63016787428354e-07, "loss": 0.3996, "step": 680 }, { "epoch": 5.52, "grad_norm": 0.48104792833328247, "learning_rate": 1.9369152030840553e-07, "loss": 0.3743, "step": 690 }, { "epoch": 5.6, "grad_norm": 0.53028404712677, "learning_rate": 1.3477564710088097e-07, "loss": 0.3542, "step": 700 }, { "epoch": 5.68, "grad_norm": 0.5596076846122742, "learning_rate": 8.639676646793382e-08, "loss": 0.3587, "step": 710 }, { "epoch": 5.76, "grad_norm": 0.5660611391067505, "learning_rate": 4.865965629214819e-08, "loss": 0.414, "step": 720 }, { "epoch": 5.84, "grad_norm": 0.4768368899822235, "learning_rate": 2.1646046750978255e-08, "loss": 0.3887, "step": 730 }, { "epoch": 5.92, "grad_norm": 0.5675653219223022, "learning_rate": 5.414443307377171e-09, "loss": 0.3811, "step": 740 }, { "epoch": 6.0, "grad_norm": 0.5585565567016602, "learning_rate": 0.0, "loss": 0.3763, "step": 750 }, { "epoch": 6.0, "step": 750, "total_flos": 118427603697664.0, "train_loss": 0.662273271560669, "train_runtime": 26488.1763, "train_samples_per_second": 0.227, "train_steps_per_second": 0.028 } ], "logging_steps": 10, "max_steps": 750, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 100000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 118427603697664.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }