{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15458937198067632, "eval_steps": 2, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 653.5100730895996, "epoch": 0.015458937198067632, "grad_norm": 13904.753479263767, "kl": 9.926961135864257, "learning_rate": 2e-05, "loss": 0.3968, "reward": 0.6000000268220902, "reward_std": 0.33480377998203037, "rewards/accuracy_reward": 0.5995535984635353, "rewards/format_reward": 0.00044642859138548373, "step": 5 }, { "completion_length": 660.7125312805176, "epoch": 0.030917874396135265, "grad_norm": 197221.6943728757, "kl": 1280566.1007873535, "learning_rate": 1.9396926207859085e-05, "loss": 51005.4313, "reward": 0.6585937816649675, "reward_std": 0.2747358553111553, "rewards/accuracy_reward": 0.6585937816649675, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 658.5587333679199, "epoch": 0.0463768115942029, "grad_norm": 5.735340194391574, "kl": 16.464306640625, "learning_rate": 1.766044443118978e-05, "loss": 0.6587, "reward": 0.6368303872644901, "reward_std": 0.2583037616685033, "rewards/accuracy_reward": 0.6368303872644901, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 665.5614128112793, "epoch": 0.06183574879227053, "grad_norm": 0.11683844185999785, "kl": 0.7914215087890625, "learning_rate": 1.5000000000000002e-05, "loss": 0.0317, "reward": 0.6370535973459482, "reward_std": 0.25866390075534584, "rewards/accuracy_reward": 0.6370535973459482, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 638.1945602416993, "epoch": 0.07729468599033816, "grad_norm": 0.07460429310613971, "kl": 0.0471527099609375, "learning_rate": 1.1736481776669307e-05, "loss": 0.0019, "reward": 0.6760044947266579, "reward_std": 0.2249570596963167, "rewards/accuracy_reward": 0.6760044947266579, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 623.0960060119629, "epoch": 0.0927536231884058, "grad_norm": 0.07393904563968409, "kl": 0.043280029296875, "learning_rate": 8.263518223330698e-06, "loss": 0.0017, "reward": 0.693080386891961, "reward_std": 0.219588840380311, "rewards/accuracy_reward": 0.693080386891961, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 607.3904300689697, "epoch": 0.10821256038647344, "grad_norm": 0.07179972285540803, "kl": 0.0819915771484375, "learning_rate": 5.000000000000003e-06, "loss": 0.0033, "reward": 0.7189732462167739, "reward_std": 0.2072824534960091, "rewards/accuracy_reward": 0.7189732462167739, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 601.369002532959, "epoch": 0.12367149758454106, "grad_norm": 0.0620520373411285, "kl": 0.03866424560546875, "learning_rate": 2.339555568810221e-06, "loss": 0.0015, "reward": 0.7085937805473804, "reward_std": 0.209117239061743, "rewards/accuracy_reward": 0.7085937805473804, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 607.5365215301514, "epoch": 0.1391304347826087, "grad_norm": 0.833460229793663, "kl": 0.03985748291015625, "learning_rate": 6.030737921409169e-07, "loss": 0.0016, "reward": 0.700892886519432, "reward_std": 0.2034242927096784, "rewards/accuracy_reward": 0.700892886519432, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 611.2590663909912, "epoch": 0.15458937198067632, "grad_norm": 0.06368324518689576, "kl": 0.03762359619140625, "learning_rate": 0.0, "loss": 0.0015, "reward": 0.7181919991970063, "reward_std": 0.21190554238855838, "rewards/accuracy_reward": 0.7181919991970063, "rewards/format_reward": 0.0, "step": 50 }, { "epoch": 0.15458937198067632, "step": 50, "total_flos": 0.0, "train_loss": 5100.652998053767, "train_runtime": 11710.1509, "train_samples_per_second": 0.956, "train_steps_per_second": 0.004 } ], "logging_steps": 5, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }