{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 3000, "global_step": 44343, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.76544212164265e-05, "grad_norm": 37.0, "learning_rate": 0.0002999932345578783, "loss": 9.25, "step": 1 }, { "epoch": 0.20296326364927947, "grad_norm": 0.859375, "learning_rate": 0.000279703673635072, "loss": 4.2275, "step": 3000 }, { "epoch": 0.20296326364927947, "eval_loss": 3.5942580699920654, "eval_runtime": 50.1044, "eval_samples_per_second": 1884.165, "eval_steps_per_second": 7.365, "step": 3000 }, { "epoch": 0.40592652729855894, "grad_norm": 0.83203125, "learning_rate": 0.0002594073472701441, "loss": 4.0104, "step": 6000 }, { "epoch": 0.40592652729855894, "eval_loss": 3.5292599201202393, "eval_runtime": 49.9718, "eval_samples_per_second": 1889.165, "eval_steps_per_second": 7.384, "step": 6000 }, { "epoch": 0.6088897909478385, "grad_norm": 0.9609375, "learning_rate": 0.0002391110209052161, "loss": 3.9774, "step": 9000 }, { "epoch": 0.6088897909478385, "eval_loss": 3.5074524879455566, "eval_runtime": 49.8418, "eval_samples_per_second": 1894.092, "eval_steps_per_second": 7.403, "step": 9000 }, { "epoch": 0.8118530545971179, "grad_norm": 0.96484375, "learning_rate": 0.0002188146945402882, "loss": 3.9649, "step": 12000 }, { "epoch": 0.8118530545971179, "eval_loss": 3.4984755516052246, "eval_runtime": 49.8451, "eval_samples_per_second": 1893.969, "eval_steps_per_second": 7.403, "step": 12000 }, { "epoch": 1.0148163182463974, "grad_norm": 0.83984375, "learning_rate": 0.00019851836817536025, "loss": 3.9601, "step": 15000 }, { "epoch": 1.0148163182463974, "eval_loss": 3.492377996444702, "eval_runtime": 49.9766, "eval_samples_per_second": 1888.985, "eval_steps_per_second": 7.383, "step": 15000 }, { "epoch": 1.217779581895677, "grad_norm": 0.84375, "learning_rate": 0.0001782220418104323, "loss": 3.9568, "step": 18000 }, { "epoch": 1.217779581895677, "eval_loss": 3.489668130874634, "eval_runtime": 49.8577, "eval_samples_per_second": 1893.491, "eval_steps_per_second": 7.401, "step": 18000 }, { "epoch": 1.4207428455449564, "grad_norm": 1.03125, "learning_rate": 0.00015792571544550436, "loss": 3.9542, "step": 21000 }, { "epoch": 1.4207428455449564, "eval_loss": 3.488651752471924, "eval_runtime": 49.854, "eval_samples_per_second": 1893.63, "eval_steps_per_second": 7.402, "step": 21000 }, { "epoch": 1.6237061091942357, "grad_norm": 0.86328125, "learning_rate": 0.0001376293890805764, "loss": 3.9522, "step": 24000 }, { "epoch": 1.6237061091942357, "eval_loss": 3.4861111640930176, "eval_runtime": 49.8638, "eval_samples_per_second": 1893.256, "eval_steps_per_second": 7.4, "step": 24000 }, { "epoch": 1.8266693728435153, "grad_norm": 0.80078125, "learning_rate": 0.00011733306271564845, "loss": 3.9512, "step": 27000 }, { "epoch": 1.8266693728435153, "eval_loss": 3.4858570098876953, "eval_runtime": 49.8699, "eval_samples_per_second": 1893.027, "eval_steps_per_second": 7.399, "step": 27000 }, { "epoch": 2.029632636492795, "grad_norm": 0.8671875, "learning_rate": 9.703673635072052e-05, "loss": 3.9523, "step": 30000 }, { "epoch": 2.029632636492795, "eval_loss": 3.485603094100952, "eval_runtime": 49.9876, "eval_samples_per_second": 1888.568, "eval_steps_per_second": 7.382, "step": 30000 }, { "epoch": 2.232595900142074, "grad_norm": 0.8359375, "learning_rate": 7.674040998579256e-05, "loss": 3.95, "step": 33000 }, { "epoch": 2.232595900142074, "eval_loss": 3.485264301300049, "eval_runtime": 49.8575, "eval_samples_per_second": 1893.498, "eval_steps_per_second": 7.401, "step": 33000 }, { "epoch": 2.435559163791354, "grad_norm": 0.84375, "learning_rate": 5.644408362086462e-05, "loss": 3.9525, "step": 36000 }, { "epoch": 2.435559163791354, "eval_loss": 3.4850525856018066, "eval_runtime": 49.8563, "eval_samples_per_second": 1893.542, "eval_steps_per_second": 7.401, "step": 36000 }, { "epoch": 2.638522427440633, "grad_norm": 0.9296875, "learning_rate": 3.614775725593667e-05, "loss": 3.9509, "step": 39000 }, { "epoch": 2.638522427440633, "eval_loss": 3.4850101470947266, "eval_runtime": 49.841, "eval_samples_per_second": 1894.125, "eval_steps_per_second": 7.404, "step": 39000 }, { "epoch": 2.841485691089913, "grad_norm": 0.87890625, "learning_rate": 1.5851430891008727e-05, "loss": 3.9511, "step": 42000 }, { "epoch": 2.841485691089913, "eval_loss": 3.4849255084991455, "eval_runtime": 49.8566, "eval_samples_per_second": 1893.53, "eval_steps_per_second": 7.401, "step": 42000 }, { "epoch": 3.0, "step": 44343, "total_flos": 3.41551228634284e+17, "train_loss": 3.977961572288749, "train_runtime": 18296.5341, "train_samples_per_second": 620.416, "train_steps_per_second": 2.424 } ], "logging_steps": 3000, "max_steps": 44343, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.41551228634284e+17, "train_batch_size": 256, "trial_name": null, "trial_params": null }