{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 4.338859558105469, "learning_rate": 2.9528747416929465e-06, "loss": 0.7159, "step": 50 }, { "epoch": 0.08, "eval_loss": 0.6576318144798279, "eval_runtime": 12.5345, "eval_samples_per_second": 159.56, "eval_steps_per_second": 9.972, "step": 50 }, { "epoch": 0.16, "grad_norm": 4.263615131378174, "learning_rate": 2.814460020065795e-06, "loss": 0.6551, "step": 100 }, { "epoch": 0.16, "eval_loss": 0.6588318943977356, "eval_runtime": 12.1193, "eval_samples_per_second": 165.026, "eval_steps_per_second": 10.314, "step": 100 }, { "epoch": 0.24, "grad_norm": 4.684665679931641, "learning_rate": 2.5934529411321173e-06, "loss": 0.6519, "step": 150 }, { "epoch": 0.24, "eval_loss": 0.6581148505210876, "eval_runtime": 12.4709, "eval_samples_per_second": 160.373, "eval_steps_per_second": 10.023, "step": 150 }, { "epoch": 0.32, "grad_norm": 4.372674942016602, "learning_rate": 2.303740192468495e-06, "loss": 0.6278, "step": 200 }, { "epoch": 0.32, "eval_loss": 0.656774640083313, "eval_runtime": 12.0295, "eval_samples_per_second": 166.258, "eval_steps_per_second": 10.391, "step": 200 }, { "epoch": 0.4, "grad_norm": 3.940370798110962, "learning_rate": 1.963525491562421e-06, "loss": 0.6394, "step": 250 }, { "epoch": 0.4, "eval_loss": 0.6532722115516663, "eval_runtime": 12.0522, "eval_samples_per_second": 165.945, "eval_steps_per_second": 10.372, "step": 250 }, { "epoch": 0.48, "grad_norm": 4.178117752075195, "learning_rate": 1.5941857792939703e-06, "loss": 0.6528, "step": 300 }, { "epoch": 0.48, "eval_loss": 0.6502550840377808, "eval_runtime": 12.0395, "eval_samples_per_second": 166.12, "eval_steps_per_second": 10.383, "step": 300 }, { "epoch": 0.56, "grad_norm": 3.7875773906707764, "learning_rate": 1.2189280281214128e-06, "loss": 0.6382, "step": 350 }, { "epoch": 0.56, "eval_loss": 0.6453887820243835, "eval_runtime": 12.102, "eval_samples_per_second": 165.261, "eval_steps_per_second": 10.329, "step": 350 }, { "epoch": 0.64, "grad_norm": 3.9947192668914795, "learning_rate": 8.613310626523911e-07, "loss": 0.638, "step": 400 }, { "epoch": 0.64, "eval_loss": 0.6425909399986267, "eval_runtime": 12.0649, "eval_samples_per_second": 165.77, "eval_steps_per_second": 10.361, "step": 400 }, { "epoch": 0.72, "grad_norm": 3.9734668731689453, "learning_rate": 5.438640153769653e-07, "loss": 0.618, "step": 450 }, { "epoch": 0.72, "eval_loss": 0.6400230526924133, "eval_runtime": 12.0503, "eval_samples_per_second": 165.97, "eval_steps_per_second": 10.373, "step": 450 }, { "epoch": 0.8, "grad_norm": 3.733959436416626, "learning_rate": 2.86474508437579e-07, "loss": 0.6378, "step": 500 }, { "epoch": 0.8, "eval_loss": 0.6379128694534302, "eval_runtime": 12.1078, "eval_samples_per_second": 165.183, "eval_steps_per_second": 10.324, "step": 500 }, { "epoch": 0.88, "grad_norm": 3.8779754638671875, "learning_rate": 1.0533527116762298e-07, "loss": 0.6338, "step": 550 }, { "epoch": 0.88, "eval_loss": 0.636811375617981, "eval_runtime": 13.2199, "eval_samples_per_second": 151.287, "eval_steps_per_second": 9.455, "step": 550 }, { "epoch": 0.96, "grad_norm": 3.924581527709961, "learning_rate": 1.1827948028283353e-08, "loss": 0.6284, "step": 600 }, { "epoch": 0.96, "eval_loss": 0.6364374160766602, "eval_runtime": 14.3025, "eval_samples_per_second": 139.836, "eval_steps_per_second": 8.74, "step": 600 }, { "epoch": 1.0, "step": 625, "total_flos": 9.391098276138189e+16, "train_loss": 0.6481949188232422, "train_runtime": 2478.7348, "train_samples_per_second": 4.034, "train_steps_per_second": 0.252 } ], "logging_steps": 50, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.391098276138189e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }