{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 260, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0038461538461538464, "grad_norm": 0.09405899047851562, "learning_rate": 7.692307692307694e-06, "loss": 2.0595, "mean_token_accuracy": 0.5536306202411652, "step": 1 }, { "epoch": 0.019230769230769232, "grad_norm": 0.08910957723855972, "learning_rate": 3.846153846153846e-05, "loss": 1.8585, "mean_token_accuracy": 0.5810351483523846, "step": 5 }, { "epoch": 0.038461538461538464, "grad_norm": 0.08904706686735153, "learning_rate": 7.692307692307693e-05, "loss": 1.9022, "mean_token_accuracy": 0.5676299393177032, "step": 10 }, { "epoch": 0.057692307692307696, "grad_norm": 0.08470770716667175, "learning_rate": 0.00011538461538461538, "loss": 1.7707, "mean_token_accuracy": 0.5904860138893128, "step": 15 }, { "epoch": 0.07692307692307693, "grad_norm": 0.08369024842977524, "learning_rate": 0.00015384615384615385, "loss": 1.8988, "mean_token_accuracy": 0.5750770568847656, "step": 20 }, { "epoch": 0.09615384615384616, "grad_norm": 0.09705021977424622, "learning_rate": 0.00019230769230769233, "loss": 1.7922, "mean_token_accuracy": 0.5889591336250305, "step": 25 }, { "epoch": 0.11538461538461539, "grad_norm": 0.09560074657201767, "learning_rate": 0.00019985583705641418, "loss": 1.8499, "mean_token_accuracy": 0.580015218257904, "step": 30 }, { "epoch": 0.1346153846153846, "grad_norm": 0.10197644680738449, "learning_rate": 0.0001992708874098054, "loss": 1.7104, "mean_token_accuracy": 0.6041951894760131, "step": 35 }, { "epoch": 0.15384615384615385, "grad_norm": 0.13560408353805542, "learning_rate": 0.00019823877374156647, "loss": 1.6712, "mean_token_accuracy": 0.6099030375480652, "step": 40 }, { "epoch": 0.17307692307692307, "grad_norm": 0.08910802006721497, "learning_rate": 0.00019676414519013781, "loss": 1.7598, "mean_token_accuracy": 0.6009629964828491, "step": 45 }, { "epoch": 0.19230769230769232, "grad_norm": 0.07992758601903915, "learning_rate": 0.00019485364419471454, "loss": 1.7385, "mean_token_accuracy": 0.6025008857250214, "step": 50 }, { "epoch": 0.21153846153846154, "grad_norm": 0.10774888843297958, "learning_rate": 0.00019251587657449236, "loss": 1.7556, "mean_token_accuracy": 0.5930293738842011, "step": 55 }, { "epoch": 0.23076923076923078, "grad_norm": 0.09247864037752151, "learning_rate": 0.0001897613727639014, "loss": 1.8556, "mean_token_accuracy": 0.5815793335437774, "step": 60 }, { "epoch": 0.25, "grad_norm": 0.09036409109830856, "learning_rate": 0.00018660254037844388, "loss": 1.7937, "mean_token_accuracy": 0.5908996999263764, "step": 65 }, { "epoch": 0.2692307692307692, "grad_norm": 0.08717014640569687, "learning_rate": 0.00018305360832480117, "loss": 1.7211, "mean_token_accuracy": 0.5962238609790802, "step": 70 }, { "epoch": 0.28846153846153844, "grad_norm": 0.0832277238368988, "learning_rate": 0.0001791305627069662, "loss": 1.7616, "mean_token_accuracy": 0.5980025470256806, "step": 75 }, { "epoch": 0.3076923076923077, "grad_norm": 0.09641429781913757, "learning_rate": 0.00017485107481711012, "loss": 1.9454, "mean_token_accuracy": 0.5685357481241227, "step": 80 }, { "epoch": 0.3269230769230769, "grad_norm": 0.0839536264538765, "learning_rate": 0.00017023442153554777, "loss": 1.7512, "mean_token_accuracy": 0.601141732931137, "step": 85 }, { "epoch": 0.34615384615384615, "grad_norm": 0.09633956849575043, "learning_rate": 0.0001653013984983585, "loss": 1.7946, "mean_token_accuracy": 0.5880300402641296, "step": 90 }, { "epoch": 0.36538461538461536, "grad_norm": 0.1043943241238594, "learning_rate": 0.0001600742264237979, "loss": 1.8304, "mean_token_accuracy": 0.5875581204891205, "step": 95 }, { "epoch": 0.38461538461538464, "grad_norm": 0.0865144208073616, "learning_rate": 0.00015457645101945046, "loss": 1.6875, "mean_token_accuracy": 0.6015634536743164, "step": 100 }, { "epoch": 0.40384615384615385, "grad_norm": 0.09035369008779526, "learning_rate": 0.00014883283692099112, "loss": 1.8378, "mean_token_accuracy": 0.5831819474697113, "step": 105 }, { "epoch": 0.4230769230769231, "grad_norm": 0.08860991895198822, "learning_rate": 0.00014286925614030542, "loss": 1.798, "mean_token_accuracy": 0.5853480577468873, "step": 110 }, { "epoch": 0.4423076923076923, "grad_norm": 0.08836635202169418, "learning_rate": 0.00013671257152545277, "loss": 1.801, "mean_token_accuracy": 0.5812891662120819, "step": 115 }, { "epoch": 0.46153846153846156, "grad_norm": 0.09542136639356613, "learning_rate": 0.0001303905157574247, "loss": 1.69, "mean_token_accuracy": 0.6061259090900422, "step": 120 }, { "epoch": 0.4807692307692308, "grad_norm": 0.08438368141651154, "learning_rate": 0.0001239315664287558, "loss": 1.7784, "mean_token_accuracy": 0.5900569677352905, "step": 125 }, { "epoch": 0.5, "grad_norm": 0.10227668285369873, "learning_rate": 0.00011736481776669306, "loss": 1.7175, "mean_token_accuracy": 0.5924076437950134, "step": 130 }, { "epoch": 0.5192307692307693, "grad_norm": 0.0860576331615448, "learning_rate": 0.00011071984957874479, "loss": 1.6981, "mean_token_accuracy": 0.6106000781059265, "step": 135 }, { "epoch": 0.5384615384615384, "grad_norm": 0.09762172400951385, "learning_rate": 0.00010402659401094152, "loss": 1.762, "mean_token_accuracy": 0.5936734616756439, "step": 140 }, { "epoch": 0.5576923076923077, "grad_norm": 0.08866383880376816, "learning_rate": 9.73152007189939e-05, "loss": 1.6937, "mean_token_accuracy": 0.6022425711154937, "step": 145 }, { "epoch": 0.5769230769230769, "grad_norm": 0.09346310794353485, "learning_rate": 9.061590105968208e-05, "loss": 1.7799, "mean_token_accuracy": 0.5884160101413727, "step": 150 }, { "epoch": 0.5961538461538461, "grad_norm": 0.10388106107711792, "learning_rate": 8.395887191422397e-05, "loss": 1.7951, "mean_token_accuracy": 0.5963736653327942, "step": 155 }, { "epoch": 0.6153846153846154, "grad_norm": 0.09293079376220703, "learning_rate": 7.73740997570278e-05, "loss": 1.9635, "mean_token_accuracy": 0.5650549530982971, "step": 160 }, { "epoch": 0.6346153846153846, "grad_norm": 0.09150257706642151, "learning_rate": 7.089124558212871e-05, "loss": 1.7436, "mean_token_accuracy": 0.5971101462841034, "step": 165 }, { "epoch": 0.6538461538461539, "grad_norm": 0.09019029885530472, "learning_rate": 6.453951129574644e-05, "loss": 1.7784, "mean_token_accuracy": 0.5899733185768128, "step": 170 }, { "epoch": 0.6730769230769231, "grad_norm": 0.09248427301645279, "learning_rate": 5.834750817679606e-05, "loss": 1.6719, "mean_token_accuracy": 0.6091178715229034, "step": 175 }, { "epoch": 0.6923076923076923, "grad_norm": 0.11273639649152756, "learning_rate": 5.234312799786921e-05, "loss": 1.7856, "mean_token_accuracy": 0.5923596352338791, "step": 180 }, { "epoch": 0.7115384615384616, "grad_norm": 0.09874605387449265, "learning_rate": 4.6553417387219886e-05, "loss": 1.7215, "mean_token_accuracy": 0.6017483115196228, "step": 185 }, { "epoch": 0.7307692307692307, "grad_norm": 0.09074950963258743, "learning_rate": 4.100445599768774e-05, "loss": 1.9318, "mean_token_accuracy": 0.5736395329236984, "step": 190 }, { "epoch": 0.75, "grad_norm": 0.12326296418905258, "learning_rate": 3.5721239031346066e-05, "loss": 1.7836, "mean_token_accuracy": 0.5871184587478637, "step": 195 }, { "epoch": 0.7692307692307693, "grad_norm": 0.10748053342103958, "learning_rate": 3.072756464904006e-05, "loss": 1.6282, "mean_token_accuracy": 0.6153640151023865, "step": 200 }, { "epoch": 0.7884615384615384, "grad_norm": 0.093140609562397, "learning_rate": 2.6045926771976303e-05, "loss": 1.6998, "mean_token_accuracy": 0.5974012076854706, "step": 205 }, { "epoch": 0.8076923076923077, "grad_norm": 0.10797327011823654, "learning_rate": 2.1697413758237784e-05, "loss": 1.6651, "mean_token_accuracy": 0.6072666823863984, "step": 210 }, { "epoch": 0.8269230769230769, "grad_norm": 0.0944468230009079, "learning_rate": 1.7701613410634365e-05, "loss": 1.6446, "mean_token_accuracy": 0.6169975221157074, "step": 215 }, { "epoch": 0.8461538461538461, "grad_norm": 0.1349770575761795, "learning_rate": 1.4076524743778319e-05, "loss": 1.6874, "mean_token_accuracy": 0.610532957315445, "step": 220 }, { "epoch": 0.8653846153846154, "grad_norm": 0.09194570034742355, "learning_rate": 1.083847690782972e-05, "loss": 1.8036, "mean_token_accuracy": 0.5944990277290344, "step": 225 }, { "epoch": 0.8846153846153846, "grad_norm": 0.09132257848978043, "learning_rate": 8.002055634117578e-06, "loss": 1.6791, "mean_token_accuracy": 0.6079061985015869, "step": 230 }, { "epoch": 0.9038461538461539, "grad_norm": 0.08958450704813004, "learning_rate": 5.580037533961546e-06, "loss": 1.7627, "mean_token_accuracy": 0.6024388492107391, "step": 235 }, { "epoch": 0.9230769230769231, "grad_norm": 0.08657591044902802, "learning_rate": 3.5833325466437694e-06, "loss": 1.6514, "mean_token_accuracy": 0.6082013726234436, "step": 240 }, { "epoch": 0.9423076923076923, "grad_norm": 0.09413374960422516, "learning_rate": 2.0209347957732328e-06, "loss": 1.6913, "mean_token_accuracy": 0.6105992496013641, "step": 245 }, { "epoch": 0.9615384615384616, "grad_norm": 0.09713339805603027, "learning_rate": 8.998820754091531e-07, "loss": 1.7552, "mean_token_accuracy": 0.5954502403736115, "step": 250 }, { "epoch": 0.9807692307692307, "grad_norm": 0.10334648191928864, "learning_rate": 2.2522414843748618e-07, "loss": 1.6602, "mean_token_accuracy": 0.6200077295303345, "step": 255 }, { "epoch": 1.0, "grad_norm": 0.1041092649102211, "learning_rate": 0.0, "loss": 1.7604, "mean_token_accuracy": 0.5999625205993653, "step": 260 }, { "epoch": 1.0, "step": 260, "total_flos": 2381129238708224.0, "train_loss": 0.0, "train_runtime": 0.6875, "train_samples_per_second": 3024.065, "train_steps_per_second": 378.19 } ], "logging_steps": 5, "max_steps": 260, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2381129238708224.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }