{ "best_global_step": 30, "best_metric": 0.009350189939141273, "best_model_checkpoint": "outputs/microsoft/Phi-3.5-mini-instruct/countdown_n100_mcl_256_pretrained/checkpoint-30", "epoch": 3.0, "eval_steps": 5, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2, "grad_norm": 0.04710305854678154, "learning_rate": 9.466666666666666e-07, "loss": 0.0581, "step": 5 }, { "epoch": 0.2, "eval_clip_ratio": 0.0, "eval_completion_length": 249.7375, "eval_kl": 2.7620792388916014e-05, "eval_loss": 0.011466369964182377, "eval_num_tokens": 30743.0, "eval_reward": 0.0875, "eval_reward_std": 0.14787135720252992, "eval_rewards/equation_reward_func": 0.0375, "eval_rewards/format_reward_func": 0.05, "eval_runtime": 283.5242, "eval_samples_per_second": 0.071, "eval_steps_per_second": 0.018, "step": 5 }, { "epoch": 0.4, "grad_norm": 0.0248898696154356, "learning_rate": 8.799999999999999e-07, "loss": 0.0305, "step": 10 }, { "epoch": 0.4, "eval_clip_ratio": 0.0, "eval_completion_length": 248.3125, "eval_kl": 3.051459789276123e-05, "eval_loss": 0.010714234784245491, "eval_num_tokens": 61654.0, "eval_reward": 0.1, "eval_reward_std": 0.11969234347343445, "eval_rewards/equation_reward_func": 0.075, "eval_rewards/format_reward_func": 0.025, "eval_runtime": 279.7789, "eval_samples_per_second": 0.071, "eval_steps_per_second": 0.018, "step": 10 }, { "epoch": 0.6, "grad_norm": 0.03560088202357292, "learning_rate": 8.133333333333333e-07, "loss": 0.0333, "step": 15 }, { "epoch": 0.6, "eval_clip_ratio": 0.0, "eval_completion_length": 248.8125, "eval_kl": 3.6323070526123046e-05, "eval_loss": 0.016332309693098068, "eval_num_tokens": 92513.0, "eval_reward": 0.1, "eval_reward_std": 0.15773502588272095, "eval_rewards/equation_reward_func": 0.0375, "eval_rewards/format_reward_func": 0.0625, "eval_runtime": 282.7334, "eval_samples_per_second": 0.071, "eval_steps_per_second": 0.018, "step": 15 }, { "epoch": 0.8, "grad_norm": 0.02903689257800579, "learning_rate": 7.466666666666667e-07, "loss": 0.0363, "step": 20 }, { "epoch": 0.8, "eval_clip_ratio": 0.0, "eval_completion_length": 241.4375, "eval_kl": 3.466010093688965e-05, "eval_loss": 0.03578554838895798, "eval_num_tokens": 123451.0, "eval_reward": 0.1125, "eval_reward_std": 0.1978713572025299, "eval_rewards/equation_reward_func": 0.0625, "eval_rewards/format_reward_func": 0.05, "eval_runtime": 282.549, "eval_samples_per_second": 0.071, "eval_steps_per_second": 0.018, "step": 20 }, { "epoch": 1.0, "grad_norm": 5.04811282553419e-07, "learning_rate": 6.800000000000001e-07, "loss": 0.0011, "step": 25 }, { "epoch": 1.0, "eval_clip_ratio": 0.0, "eval_completion_length": 247.55, "eval_kl": 3.0243396759033202e-05, "eval_loss": 0.023990554735064507, "eval_num_tokens": 154568.0, "eval_reward": 0.1, "eval_reward_std": 0.17886751294136047, "eval_rewards/equation_reward_func": 0.05, "eval_rewards/format_reward_func": 0.05, "eval_runtime": 282.4574, "eval_samples_per_second": 0.071, "eval_steps_per_second": 0.018, "step": 25 }, { "epoch": 1.2, "grad_norm": 0.0, "learning_rate": 6.133333333333332e-07, "loss": 0.0337, "step": 30 }, { "epoch": 1.2, "eval_clip_ratio": 0.0, "eval_completion_length": 249.3875, "eval_kl": 2.872645854949951e-05, "eval_loss": 0.009350189939141273, "eval_num_tokens": 185750.0, "eval_reward": 0.1, "eval_reward_std": 0.15, "eval_rewards/equation_reward_func": 0.0625, "eval_rewards/format_reward_func": 0.0375, "eval_runtime": 283.2443, "eval_samples_per_second": 0.071, "eval_steps_per_second": 0.018, "step": 30 }, { "epoch": 1.4, "grad_norm": 0.025708282366394997, "learning_rate": 5.466666666666666e-07, "loss": 0.0273, "step": 35 }, { "epoch": 1.4, "eval_clip_ratio": 0.0, "eval_completion_length": 242.05, "eval_kl": 3.2845139503479e-05, "eval_loss": 0.038312580436468124, "eval_num_tokens": 217087.0, "eval_reward": 0.1375, "eval_reward_std": 0.1978713572025299, "eval_rewards/equation_reward_func": 0.0625, "eval_rewards/format_reward_func": 0.075, "eval_runtime": 283.9676, "eval_samples_per_second": 0.07, "eval_steps_per_second": 0.018, "step": 35 }, { "epoch": 1.6, "grad_norm": 0.04088641330599785, "learning_rate": 4.8e-07, "loss": 0.0318, "step": 40 }, { "epoch": 1.6, "eval_clip_ratio": 0.0, "eval_completion_length": 247.9375, "eval_kl": 3.358125686645508e-05, "eval_loss": 0.028362590819597244, "eval_num_tokens": 247949.0, "eval_reward": 0.1375, "eval_reward_std": 0.1886961877346039, "eval_rewards/equation_reward_func": 0.0875, "eval_rewards/format_reward_func": 0.05, "eval_runtime": 285.356, "eval_samples_per_second": 0.07, "eval_steps_per_second": 0.018, "step": 40 }, { "epoch": 1.8, "grad_norm": 0.024730732664465904, "learning_rate": 4.1333333333333333e-07, "loss": 0.0578, "step": 45 }, { "epoch": 1.8, "eval_clip_ratio": 0.0, "eval_completion_length": 246.9625, "eval_kl": 2.7140974998474122e-05, "eval_loss": 0.043169133365154266, "eval_num_tokens": 278710.0, "eval_reward": 0.125, "eval_reward_std": 0.2, "eval_rewards/equation_reward_func": 0.075, "eval_rewards/format_reward_func": 0.05, "eval_runtime": 285.2657, "eval_samples_per_second": 0.07, "eval_steps_per_second": 0.018, "step": 45 }, { "epoch": 2.0, "grad_norm": 0.03765915334224701, "learning_rate": 3.4666666666666665e-07, "loss": 0.0234, "step": 50 }, { "epoch": 2.0, "eval_clip_ratio": 0.0, "eval_completion_length": 247.05, "eval_kl": 3.2660365104675296e-05, "eval_loss": 0.026807209476828575, "eval_num_tokens": 309954.0, "eval_reward": 0.125, "eval_reward_std": 0.20773502588272094, "eval_rewards/equation_reward_func": 0.0875, "eval_rewards/format_reward_func": 0.0375, "eval_runtime": 284.404, "eval_samples_per_second": 0.07, "eval_steps_per_second": 0.018, "step": 50 }, { "epoch": 2.2, "grad_norm": 0.01991177722811699, "learning_rate": 2.8e-07, "loss": 0.0691, "step": 55 }, { "epoch": 2.2, "eval_clip_ratio": 0.0, "eval_completion_length": 242.6375, "eval_kl": 3.68952751159668e-05, "eval_loss": 0.04483898729085922, "eval_num_tokens": 340605.0, "eval_reward": 0.1625, "eval_reward_std": 0.25560638308525085, "eval_rewards/equation_reward_func": 0.1, "eval_rewards/format_reward_func": 0.0625, "eval_runtime": 283.6459, "eval_samples_per_second": 0.071, "eval_steps_per_second": 0.018, "step": 55 }, { "epoch": 2.4, "grad_norm": 0.0200728178024292, "learning_rate": 2.1333333333333334e-07, "loss": 0.0518, "step": 60 }, { "epoch": 2.4, "eval_clip_ratio": 0.0, "eval_completion_length": 250.45, "eval_kl": 3.1629204750061034e-05, "eval_loss": 0.009741068817675114, "eval_num_tokens": 371457.0, "eval_reward": 0.075, "eval_reward_std": 0.12886751294136048, "eval_rewards/equation_reward_func": 0.05, "eval_rewards/format_reward_func": 0.025, "eval_runtime": 283.7186, "eval_samples_per_second": 0.07, "eval_steps_per_second": 0.018, "step": 60 }, { "epoch": 2.6, "grad_norm": 0.021919438615441322, "learning_rate": 1.4666666666666666e-07, "loss": 0.0167, "step": 65 }, { "epoch": 2.6, "eval_clip_ratio": 0.0, "eval_completion_length": 250.35, "eval_kl": 2.8392672538757326e-05, "eval_loss": 0.016574550420045853, "eval_num_tokens": 402684.0, "eval_reward": 0.1125, "eval_reward_std": 0.14469234347343446, "eval_rewards/equation_reward_func": 0.075, "eval_rewards/format_reward_func": 0.0375, "eval_runtime": 283.4564, "eval_samples_per_second": 0.071, "eval_steps_per_second": 0.018, "step": 65 }, { "epoch": 2.8, "grad_norm": 0.03317731246352196, "learning_rate": 8e-08, "loss": 0.0358, "step": 70 }, { "epoch": 2.8, "eval_clip_ratio": 0.0, "eval_completion_length": 246.825, "eval_kl": 3.1587481498718264e-05, "eval_loss": 0.04395188018679619, "eval_num_tokens": 433755.0, "eval_reward": 0.2, "eval_reward_std": 0.3154700517654419, "eval_rewards/equation_reward_func": 0.0625, "eval_rewards/format_reward_func": 0.1375, "eval_runtime": 284.1288, "eval_samples_per_second": 0.07, "eval_steps_per_second": 0.018, "step": 70 }, { "epoch": 3.0, "grad_norm": 0.04194802790880203, "learning_rate": 1.3333333333333334e-08, "loss": 0.0372, "step": 75 }, { "epoch": 3.0, "eval_clip_ratio": 0.0, "eval_completion_length": 247.825, "eval_kl": 3.3229589462280273e-05, "eval_loss": 0.034269753843545914, "eval_num_tokens": 464648.0, "eval_reward": 0.1, "eval_reward_std": 0.15, "eval_rewards/equation_reward_func": 0.0625, "eval_rewards/format_reward_func": 0.0375, "eval_runtime": 283.7321, "eval_samples_per_second": 0.07, "eval_steps_per_second": 0.018, "step": 75 } ], "logging_steps": 5, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }