{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 3000, "global_step": 44343, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.76544212164265e-05, "grad_norm": 11.8125, "learning_rate": 0.0002999932345578783, "loss": 4.9688, "step": 1 }, { "epoch": 0.20296326364927947, "grad_norm": 0.99609375, "learning_rate": 0.000279703673635072, "loss": 3.1317, "step": 3000 }, { "epoch": 0.20296326364927947, "eval_loss": 2.590404748916626, "eval_runtime": 54.625, "eval_samples_per_second": 1728.238, "eval_steps_per_second": 6.755, "step": 3000 }, { "epoch": 0.40592652729855894, "grad_norm": 0.91015625, "learning_rate": 0.0002594073472701441, "loss": 3.0056, "step": 6000 }, { "epoch": 0.40592652729855894, "eval_loss": 2.559239387512207, "eval_runtime": 54.5006, "eval_samples_per_second": 1732.184, "eval_steps_per_second": 6.771, "step": 6000 }, { "epoch": 0.6088897909478385, "grad_norm": 0.94921875, "learning_rate": 0.0002391110209052161, "loss": 2.9796, "step": 9000 }, { "epoch": 0.6088897909478385, "eval_loss": 2.547764301300049, "eval_runtime": 54.3734, "eval_samples_per_second": 1736.236, "eval_steps_per_second": 6.786, "step": 9000 }, { "epoch": 0.8118530545971179, "grad_norm": 1.109375, "learning_rate": 0.0002188146945402882, "loss": 2.9712, "step": 12000 }, { "epoch": 0.8118530545971179, "eval_loss": 2.53904128074646, "eval_runtime": 54.3708, "eval_samples_per_second": 1736.317, "eval_steps_per_second": 6.787, "step": 12000 }, { "epoch": 1.0148163182463974, "grad_norm": 0.921875, "learning_rate": 0.00019851836817536025, "loss": 2.9638, "step": 15000 }, { "epoch": 1.0148163182463974, "eval_loss": 2.5364582538604736, "eval_runtime": 54.5162, "eval_samples_per_second": 1731.687, "eval_steps_per_second": 6.769, "step": 15000 }, { "epoch": 1.217779581895677, "grad_norm": 1.15625, "learning_rate": 0.0001782220418104323, "loss": 2.9598, "step": 18000 }, { "epoch": 1.217779581895677, "eval_loss": 2.5331978797912598, "eval_runtime": 54.5086, "eval_samples_per_second": 1731.927, "eval_steps_per_second": 6.77, "step": 18000 }, { "epoch": 1.4207428455449564, "grad_norm": 1.375, "learning_rate": 0.00015792571544550436, "loss": 2.958, "step": 21000 }, { "epoch": 1.4207428455449564, "eval_loss": 2.5318005084991455, "eval_runtime": 54.5162, "eval_samples_per_second": 1731.687, "eval_steps_per_second": 6.769, "step": 21000 }, { "epoch": 1.6237061091942357, "grad_norm": 1.1640625, "learning_rate": 0.0001376293890805764, "loss": 2.9557, "step": 24000 }, { "epoch": 1.6237061091942357, "eval_loss": 2.53218150138855, "eval_runtime": 54.6541, "eval_samples_per_second": 1727.317, "eval_steps_per_second": 6.752, "step": 24000 }, { "epoch": 1.8266693728435153, "grad_norm": 1.1328125, "learning_rate": 0.00011733306271564845, "loss": 2.9542, "step": 27000 }, { "epoch": 1.8266693728435153, "eval_loss": 2.5293021202087402, "eval_runtime": 54.5038, "eval_samples_per_second": 1732.08, "eval_steps_per_second": 6.77, "step": 27000 }, { "epoch": 2.029632636492795, "grad_norm": 1.0859375, "learning_rate": 9.703673635072052e-05, "loss": 2.9541, "step": 30000 }, { "epoch": 2.029632636492795, "eval_loss": 2.5292599201202393, "eval_runtime": 54.4969, "eval_samples_per_second": 1732.3, "eval_steps_per_second": 6.771, "step": 30000 }, { "epoch": 2.232595900142074, "grad_norm": 1.2578125, "learning_rate": 7.674040998579256e-05, "loss": 2.9518, "step": 33000 }, { "epoch": 2.232595900142074, "eval_loss": 2.5298101902008057, "eval_runtime": 54.3826, "eval_samples_per_second": 1735.942, "eval_steps_per_second": 6.785, "step": 33000 }, { "epoch": 2.435559163791354, "grad_norm": 1.546875, "learning_rate": 5.644408362086462e-05, "loss": 2.955, "step": 36000 }, { "epoch": 2.435559163791354, "eval_loss": 2.529937267303467, "eval_runtime": 54.3621, "eval_samples_per_second": 1736.595, "eval_steps_per_second": 6.788, "step": 36000 }, { "epoch": 2.638522427440633, "grad_norm": 0.93359375, "learning_rate": 3.614775725593667e-05, "loss": 2.9542, "step": 39000 }, { "epoch": 2.638522427440633, "eval_loss": 2.5293867588043213, "eval_runtime": 54.3643, "eval_samples_per_second": 1736.525, "eval_steps_per_second": 6.788, "step": 39000 }, { "epoch": 2.841485691089913, "grad_norm": 1.2265625, "learning_rate": 1.5851430891008727e-05, "loss": 2.9536, "step": 42000 }, { "epoch": 2.841485691089913, "eval_loss": 2.5295138359069824, "eval_runtime": 54.3667, "eval_samples_per_second": 1736.449, "eval_steps_per_second": 6.787, "step": 42000 }, { "epoch": 3.0, "step": 44343, "total_flos": 3.8956704097296384e+17, "train_loss": 2.9737832070450803, "train_runtime": 20072.1702, "train_samples_per_second": 565.532, "train_steps_per_second": 2.209 } ], "logging_steps": 3000, "max_steps": 44343, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.8956704097296384e+17, "train_batch_size": 256, "trial_name": null, "trial_params": null }