{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9714285714285715, "eval_steps": 500, "global_step": 129, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06857142857142857, "grad_norm": 0.9437829852104187, "learning_rate": 0.0002, "loss": 4.167, "step": 3 }, { "epoch": 0.13714285714285715, "grad_norm": 0.706287682056427, "learning_rate": 0.0002, "loss": 4.0218, "step": 6 }, { "epoch": 0.2057142857142857, "grad_norm": 0.9851099848747253, "learning_rate": 0.0002, "loss": 3.989, "step": 9 }, { "epoch": 0.2742857142857143, "grad_norm": 0.6341831088066101, "learning_rate": 0.0002, "loss": 3.6726, "step": 12 }, { "epoch": 0.34285714285714286, "grad_norm": 0.4039613902568817, "learning_rate": 0.0002, "loss": 3.4053, "step": 15 }, { "epoch": 0.4114285714285714, "grad_norm": 0.44874435663223267, "learning_rate": 0.0002, "loss": 3.4123, "step": 18 }, { "epoch": 0.48, "grad_norm": 0.4203323721885681, "learning_rate": 0.0002, "loss": 3.3871, "step": 21 }, { "epoch": 0.5485714285714286, "grad_norm": 0.4040702283382416, "learning_rate": 0.0002, "loss": 3.2221, "step": 24 }, { "epoch": 0.6171428571428571, "grad_norm": 0.40100640058517456, "learning_rate": 0.0002, "loss": 3.2005, "step": 27 }, { "epoch": 0.6857142857142857, "grad_norm": 0.6720607280731201, "learning_rate": 0.0002, "loss": 3.1844, "step": 30 }, { "epoch": 0.7542857142857143, "grad_norm": 0.3040887415409088, "learning_rate": 0.0002, "loss": 3.2236, "step": 33 }, { "epoch": 0.8228571428571428, "grad_norm": 0.3039924204349518, "learning_rate": 0.0002, "loss": 3.1461, "step": 36 }, { "epoch": 0.8914285714285715, "grad_norm": 0.5960268974304199, "learning_rate": 0.0002, "loss": 3.1462, "step": 39 }, { "epoch": 0.96, "grad_norm": 0.22920484840869904, "learning_rate": 0.0002, "loss": 3.1491, "step": 42 }, { "epoch": 1.04, "grad_norm": 0.21868771314620972, "learning_rate": 0.0002, "loss": 4.0082, "step": 45 }, { "epoch": 1.1085714285714285, "grad_norm": 0.2646636962890625, "learning_rate": 0.0002, "loss": 3.0884, "step": 48 }, { "epoch": 1.177142857142857, "grad_norm": 0.30294740200042725, "learning_rate": 0.0002, "loss": 3.0319, "step": 51 }, { "epoch": 1.2457142857142858, "grad_norm": 0.6299964785575867, "learning_rate": 0.0002, "loss": 3.0376, "step": 54 }, { "epoch": 1.3142857142857143, "grad_norm": 0.4104610085487366, "learning_rate": 0.0002, "loss": 3.1366, "step": 57 }, { "epoch": 1.3828571428571428, "grad_norm": 0.4743083715438843, "learning_rate": 0.0002, "loss": 3.0527, "step": 60 }, { "epoch": 1.4514285714285715, "grad_norm": 0.6701816916465759, "learning_rate": 0.0002, "loss": 3.069, "step": 63 }, { "epoch": 1.52, "grad_norm": 0.321772962808609, "learning_rate": 0.0002, "loss": 3.0445, "step": 66 }, { "epoch": 1.5885714285714285, "grad_norm": 0.2866778075695038, "learning_rate": 0.0002, "loss": 2.9932, "step": 69 }, { "epoch": 1.657142857142857, "grad_norm": 0.45329466462135315, "learning_rate": 0.0002, "loss": 3.0257, "step": 72 }, { "epoch": 1.7257142857142858, "grad_norm": 0.38146278262138367, "learning_rate": 0.0002, "loss": 2.9753, "step": 75 }, { "epoch": 1.7942857142857143, "grad_norm": 0.25521424412727356, "learning_rate": 0.0002, "loss": 3.0399, "step": 78 }, { "epoch": 1.862857142857143, "grad_norm": 0.40487968921661377, "learning_rate": 0.0002, "loss": 3.0027, "step": 81 }, { "epoch": 1.9314285714285715, "grad_norm": 0.6657207012176514, "learning_rate": 0.0002, "loss": 3.0455, "step": 84 }, { "epoch": 2.0114285714285716, "grad_norm": 
0.837748646736145, "learning_rate": 0.0002, "loss": 3.7381, "step": 87 }, { "epoch": 2.08, "grad_norm": 0.4829089045524597, "learning_rate": 0.0002, "loss": 3.0417, "step": 90 }, { "epoch": 2.1485714285714286, "grad_norm": 0.30258846282958984, "learning_rate": 0.0002, "loss": 2.9999, "step": 93 }, { "epoch": 2.217142857142857, "grad_norm": 0.8457381129264832, "learning_rate": 0.0002, "loss": 2.9085, "step": 96 }, { "epoch": 2.2857142857142856, "grad_norm": 0.5655274391174316, "learning_rate": 0.0002, "loss": 2.9864, "step": 99 }, { "epoch": 2.354285714285714, "grad_norm": 0.34071195125579834, "learning_rate": 0.0002, "loss": 2.9532, "step": 102 }, { "epoch": 2.422857142857143, "grad_norm": 0.5198696255683899, "learning_rate": 0.0002, "loss": 2.9002, "step": 105 }, { "epoch": 2.4914285714285715, "grad_norm": 0.48464879393577576, "learning_rate": 0.0002, "loss": 2.9063, "step": 108 }, { "epoch": 2.56, "grad_norm": 0.4100594222545624, "learning_rate": 0.0002, "loss": 3.0438, "step": 111 }, { "epoch": 2.6285714285714286, "grad_norm": 0.5050905346870422, "learning_rate": 0.0002, "loss": 2.9542, "step": 114 }, { "epoch": 2.697142857142857, "grad_norm": 0.5607490539550781, "learning_rate": 0.0002, "loss": 2.8878, "step": 117 }, { "epoch": 2.7657142857142856, "grad_norm": 0.31164994835853577, "learning_rate": 0.0002, "loss": 3.1095, "step": 120 }, { "epoch": 2.8342857142857145, "grad_norm": 0.37856680154800415, "learning_rate": 0.0002, "loss": 2.9569, "step": 123 }, { "epoch": 2.902857142857143, "grad_norm": 0.642971932888031, "learning_rate": 0.0002, "loss": 2.9127, "step": 126 }, { "epoch": 2.9714285714285715, "grad_norm": 0.6274252533912659, "learning_rate": 0.0002, "loss": 2.9586, "step": 129 } ], "logging_steps": 3, "max_steps": 129, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.395590565882757e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }