{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9884169884169884, "eval_steps": 100, "global_step": 32, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 287.9553693771362, "epoch": 0.15444015444015444, "grad_norm": 712.992919921875, "kl": 3.120312976837158, "learning_rate": 1.9937122098932428e-05, "loss": 0.1248, "reward": 0.6392857432365417, "reward_std": 0.24748737085610628, "rewards/accuracy_reward": 0.11250000586733222, "rewards/format_reward": 0.5267857377417385, "step": 5 }, { "completion_length": 76.98839626312255, "epoch": 0.3088803088803089, "grad_norm": 0.6054267287254333, "kl": 0.269256591796875, "learning_rate": 1.78183148246803e-05, "loss": 0.0108, "reward": 1.0053571820259095, "reward_std": 0.13131982944905757, "rewards/accuracy_reward": 0.058928574342280625, "rewards/format_reward": 0.946428595483303, "step": 10 }, { "completion_length": 53.2321453332901, "epoch": 0.46332046332046334, "grad_norm": 0.6564269661903381, "kl": 0.46717529296875, "learning_rate": 1.3302790619551673e-05, "loss": 0.0187, "reward": 1.0392857573926448, "reward_std": 0.13637059200555085, "rewards/accuracy_reward": 0.09196429075673222, "rewards/format_reward": 0.9473214484751225, "step": 15 }, { "completion_length": 94.31786141395568, "epoch": 0.6177606177606177, "grad_norm": 0.3075297772884369, "kl": 0.252783203125, "learning_rate": 7.774790660436857e-06, "loss": 0.0101, "reward": 1.0741071864962577, "reward_std": 0.13005714006721975, "rewards/accuracy_reward": 0.10446429094299674, "rewards/format_reward": 0.9696428716182709, "step": 20 }, { "completion_length": 132.22054238319396, "epoch": 0.7722007722007722, "grad_norm": 0.4520126283168793, "kl": 0.225750732421875, "learning_rate": 2.9289321881345257e-06, "loss": 0.009, "reward": 1.0294643245637416, "reward_std": 0.17046323977410793, "rewards/accuracy_reward": 0.09375000428408384, "rewards/format_reward": 0.935714315623045, "step": 25 }, { "completion_length": 143.1794707775116, "epoch": 0.9266409266409267, "grad_norm": 0.4325202405452728, "kl": 0.220745849609375, "learning_rate": 2.507208781817638e-07, "loss": 0.0088, "reward": 1.0116071835160256, "reward_std": 0.16036171615123748, "rewards/accuracy_reward": 0.08125000409781932, "rewards/format_reward": 0.9303571656346321, "step": 30 }, { "completion_length": 134.81697022914886, "epoch": 0.9884169884169884, "kl": 0.248382568359375, "reward": 1.020089328289032, "reward_std": 0.12311234045773745, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.944196455180645, "step": 32, "total_flos": 0.0, "train_loss": 0.029145897831767797, "train_runtime": 1931.8778, "train_samples_per_second": 1.875, "train_steps_per_second": 0.017 } ], "logging_steps": 5, "max_steps": 32, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }