{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.891891891891891, "eval_steps": 500, "global_step": 366, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2702702702702703, "grad_norm": 4.698714733123779, "learning_rate": 0.00010526315789473685, "loss": 0.9004, "step": 10 }, { "epoch": 0.5405405405405406, "grad_norm": 1.5324004888534546, "learning_rate": 0.00019999590166142655, "loss": 0.2857, "step": 20 }, { "epoch": 0.8108108108108109, "grad_norm": 1.7758814096450806, "learning_rate": 0.00019950450737506824, "loss": 0.1811, "step": 30 }, { "epoch": 1.0810810810810811, "grad_norm": 0.9645628929138184, "learning_rate": 0.00019819805815653768, "loss": 0.1416, "step": 40 }, { "epoch": 1.3513513513513513, "grad_norm": 0.9237997531890869, "learning_rate": 0.00019608725530879375, "loss": 0.1186, "step": 50 }, { "epoch": 1.6216216216216215, "grad_norm": 0.8899639844894409, "learning_rate": 0.00019318938870459984, "loss": 0.1076, "step": 60 }, { "epoch": 1.8918918918918919, "grad_norm": 0.8379050493240356, "learning_rate": 0.0001895281951628281, "loss": 0.1, "step": 70 }, { "epoch": 2.1621621621621623, "grad_norm": 0.49454841017723083, "learning_rate": 0.00018513366401695276, "loss": 0.0912, "step": 80 }, { "epoch": 2.4324324324324325, "grad_norm": 0.5223703384399414, "learning_rate": 0.0001800417914683471, "loss": 0.079, "step": 90 }, { "epoch": 2.7027027027027026, "grad_norm": 0.26653391122817993, "learning_rate": 0.00017429428573651024, "loss": 0.077, "step": 100 }, { "epoch": 2.972972972972973, "grad_norm": 0.6166922450065613, "learning_rate": 0.0001679382254213768, "loss": 0.0717, "step": 110 }, { "epoch": 3.2432432432432434, "grad_norm": 0.5702475309371948, "learning_rate": 0.0001610256738761125, "loss": 0.0612, "step": 120 }, { "epoch": 3.5135135135135136, "grad_norm": 0.4581798017024994, "learning_rate": 0.00015361325274911779, "loss": 0.0544, "step": 130 }, { "epoch": 3.7837837837837838, "grad_norm": 0.44726666808128357, "learning_rate": 0.0001457616781884173, "loss": 0.0628, "step": 140 }, { "epoch": 4.054054054054054, "grad_norm": 0.3319581151008606, "learning_rate": 0.0001375352635074461, "loss": 0.0648, "step": 150 }, { "epoch": 4.324324324324325, "grad_norm": 0.5032352805137634, "learning_rate": 0.00012900139238596598, "loss": 0.058, "step": 160 }, { "epoch": 4.594594594594595, "grad_norm": 0.36259725689888, "learning_rate": 0.00012022996692119424, "loss": 0.0497, "step": 170 }, { "epoch": 4.864864864864865, "grad_norm": 0.17420651018619537, "learning_rate": 0.00011129283505023274, "loss": 0.053, "step": 180 }, { "epoch": 5.135135135135135, "grad_norm": 0.32951629161834717, "learning_rate": 0.00010226320203385878, "loss": 0.0518, "step": 190 }, { "epoch": 5.405405405405405, "grad_norm": 0.27635061740875244, "learning_rate": 9.321503082229282e-05, "loss": 0.048, "step": 200 }, { "epoch": 5.675675675675675, "grad_norm": 0.3280782699584961, "learning_rate": 8.422243621462969e-05, "loss": 0.0516, "step": 210 }, { "epoch": 5.945945945945946, "grad_norm": 0.29008397459983826, "learning_rate": 7.535907777445449e-05, "loss": 0.0465, "step": 220 }, { "epoch": 6.216216216216216, "grad_norm": 0.24701108038425446, "learning_rate": 6.669755647435474e-05, "loss": 0.0436, "step": 230 }, { "epoch": 6.486486486486487, "grad_norm": 0.34741994738578796, "learning_rate": 5.830882001149517e-05, "loss": 0.0499, "step": 240 }, { "epoch": 6.756756756756757, "grad_norm": 0.2663642168045044, "learning_rate": 5.0261581665395475e-05, "loss": 0.0439, "step": 250 }, { "epoch": 7.027027027027027, "grad_norm": 0.34826692938804626, "learning_rate": 4.2621757458127285e-05, "loss": 0.0465, "step": 260 }, { "epoch": 7.297297297297297, "grad_norm": 0.2025139331817627, "learning_rate": 3.5451926227225997e-05, "loss": 0.041, "step": 270 }, { "epoch": 7.5675675675675675, "grad_norm": 0.2484084963798523, "learning_rate": 2.8810817033934656e-05, "loss": 0.0384, "step": 280 }, { "epoch": 7.837837837837838, "grad_norm": 0.19896186888217926, "learning_rate": 2.275282810548811e-05, "loss": 0.0368, "step": 290 }, { "epoch": 8.108108108108109, "grad_norm": 0.24585744738578796, "learning_rate": 1.73275812518469e-05, "loss": 0.0382, "step": 300 }, { "epoch": 8.378378378378379, "grad_norm": 0.2137955129146576, "learning_rate": 1.2579515406713193e-05, "loss": 0.0417, "step": 310 }, { "epoch": 8.64864864864865, "grad_norm": 0.29916706681251526, "learning_rate": 8.547522622190385e-06, "loss": 0.0349, "step": 320 }, { "epoch": 8.91891891891892, "grad_norm": 0.21314117312431335, "learning_rate": 5.264629498702967e-06, "loss": 0.038, "step": 330 }, { "epoch": 9.18918918918919, "grad_norm": 0.2648143470287323, "learning_rate": 2.7577266596274576e-06, "loss": 0.0386, "step": 340 }, { "epoch": 9.45945945945946, "grad_norm": 0.21586932241916656, "learning_rate": 1.0473484865448525e-06, "loss": 0.0356, "step": 350 }, { "epoch": 9.72972972972973, "grad_norm": 0.24431149661540985, "learning_rate": 1.4750491933247512e-07, "loss": 0.0428, "step": 360 }, { "epoch": 9.891891891891891, "step": 366, "total_flos": 4.991850946034842e+16, "train_loss": 0.09146885374367562, "train_runtime": 453.0584, "train_samples_per_second": 51.702, "train_steps_per_second": 0.808 } ], "logging_steps": 10, "max_steps": 366, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.991850946034842e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }