{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04782608695652174, "eval_steps": 27, "global_step": 22, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002173913043478261, "grad_norm": 0.31070002913475037, "learning_rate": 0.0, "loss": 1.0674, "step": 1 }, { "epoch": 0.002173913043478261, "eval_loss": 4.08291482925415, "eval_runtime": 14.2183, "eval_samples_per_second": 27.289, "eval_steps_per_second": 1.758, "step": 1 }, { "epoch": 0.004347826086956522, "grad_norm": NaN, "learning_rate": 2e-05, "loss": 2.6769, "step": 2 }, { "epoch": 0.006521739130434782, "grad_norm": 0.6840159893035889, "learning_rate": 2e-05, "loss": 3.095, "step": 3 }, { "epoch": 0.008695652173913044, "grad_norm": 0.9547252655029297, "learning_rate": 4e-05, "loss": 3.3761, "step": 4 }, { "epoch": 0.010869565217391304, "grad_norm": NaN, "learning_rate": 6e-05, "loss": 6.3563, "step": 5 }, { "epoch": 0.013043478260869565, "grad_norm": NaN, "learning_rate": 6e-05, "loss": 6.813, "step": 6 }, { "epoch": 0.015217391304347827, "grad_norm": 1.6616406440734863, "learning_rate": 6e-05, "loss": 2.632, "step": 7 }, { "epoch": 0.017391304347826087, "grad_norm": 0.5449294447898865, "learning_rate": 8e-05, "loss": 1.8506, "step": 8 }, { "epoch": 0.01956521739130435, "grad_norm": 1.0437792539596558, "learning_rate": 0.0001, "loss": 2.8176, "step": 9 }, { "epoch": 0.021739130434782608, "grad_norm": 1.3472729921340942, "learning_rate": 0.00012, "loss": 3.2039, "step": 10 }, { "epoch": 0.02391304347826087, "grad_norm": 3.4967548847198486, "learning_rate": 0.00014, "loss": 4.6888, "step": 11 }, { "epoch": 0.02608695652173913, "grad_norm": 8.567302703857422, "learning_rate": 0.00016, "loss": 6.068, "step": 12 }, { "epoch": 0.02826086956521739, "grad_norm": 6.242945194244385, "learning_rate": 0.00018, "loss": 3.8248, "step": 13 }, { "epoch": 0.030434782608695653, "grad_norm": 1.132996678352356, "learning_rate": 0.0002, "loss": 1.7484, "step": 14 }, { "epoch": 0.03260869565217391, "grad_norm": 0.8223450779914856, "learning_rate": 0.0001999486216200688, "loss": 3.1641, "step": 15 }, { "epoch": 0.034782608695652174, "grad_norm": 2.4242587089538574, "learning_rate": 0.00019979453927503364, "loss": 3.1671, "step": 16 }, { "epoch": 0.03695652173913044, "grad_norm": 2.9778759479522705, "learning_rate": 0.00019953791129491983, "loss": 4.0796, "step": 17 }, { "epoch": 0.0391304347826087, "grad_norm": 5.1453986167907715, "learning_rate": 0.0001991790013823246, "loss": 4.1276, "step": 18 }, { "epoch": 0.041304347826086954, "grad_norm": 5.806064605712891, "learning_rate": 0.00019871817834144504, "loss": 4.0408, "step": 19 }, { "epoch": 0.043478260869565216, "grad_norm": 2.1800103187561035, "learning_rate": 0.00019815591569910654, "loss": 1.1865, "step": 20 }, { "epoch": 0.04565217391304348, "grad_norm": 5.802156925201416, "learning_rate": 0.00019749279121818235, "loss": 2.7308, "step": 21 }, { "epoch": 0.04782608695652174, "grad_norm": 7.91619348526001, "learning_rate": 0.00019672948630390294, "loss": 3.4633, "step": 22 } ], "logging_steps": 1, "max_steps": 108, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 22, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8207648948224000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }