{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9893390191897654, "eval_steps": 100, "global_step": 58, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 607.8761520385742, "epoch": 0.017057569296375266, "grad_norm": 0.21163204312324524, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0347, "reward": 0.7064732536673546, "reward_std": 0.2907280754297972, "rewards/accuracy_reward": 0.6964286044239998, "rewards/format_reward": 0.010044643422588706, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 598.3354082107544, "epoch": 0.08528784648187633, "grad_norm": 0.22014343738555908, "kl": 0.00016763806343078613, "learning_rate": 2.5e-06, "loss": 0.0443, "reward": 0.6668527107685804, "reward_std": 0.30428835609927773, "rewards/accuracy_reward": 0.6609933376312256, "rewards/format_reward": 0.005859375291038305, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 602.3207862854003, "epoch": 0.17057569296375266, "grad_norm": 0.1867409348487854, "kl": 0.006086993217468262, "learning_rate": 2.956412726139078e-06, "loss": 0.0561, "reward": 0.6991071760654449, "reward_std": 0.28411166220903394, "rewards/accuracy_reward": 0.6957589611411095, "rewards/format_reward": 0.0033482144586741923, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 591.975025177002, "epoch": 0.255863539445629, "grad_norm": 0.272446870803833, "kl": 0.006195259094238281, "learning_rate": 2.7836719084521715e-06, "loss": 0.0687, "reward": 0.7625000327825546, "reward_std": 0.21445324290543794, "rewards/accuracy_reward": 0.7611607477068901, "rewards/format_reward": 0.0013392857741564511, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 580.5134178161621, "epoch": 0.3411513859275053, "grad_norm": 0.4638945162296295, "kl": 0.0028181076049804688, "learning_rate": 2.4946839873611927e-06, "loss": 0.0489, "reward": 0.7725446745753288, "reward_std": 0.1972122782841325, "rewards/accuracy_reward": 0.7714286059141159, "rewards/format_reward": 0.0011160714784637094, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 586.8551612854004, "epoch": 0.42643923240938164, "grad_norm": 0.13729101419448853, "kl": 0.0033502578735351562, "learning_rate": 2.1156192081791355e-06, "loss": 0.0425, "reward": 0.7703125372529029, "reward_std": 0.19164941012859343, "rewards/accuracy_reward": 0.769866107404232, "rewards/format_reward": 0.00044642859138548373, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 589.1917663574219, "epoch": 0.511727078891258, "grad_norm": 0.10644034296274185, "kl": 0.004234695434570312, "learning_rate": 1.6808050203829845e-06, "loss": 0.034, "reward": 0.7531250387430191, "reward_std": 0.18948373831808568, "rewards/accuracy_reward": 0.7511161044239998, "rewards/format_reward": 0.002008928661234677, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 578.0815017700195, "epoch": 0.5970149253731343, "grad_norm": 0.09403481334447861, "kl": 0.004046249389648438, "learning_rate": 1.2296174432791415e-06, "loss": 0.0411, "reward": 0.7562500327825546, "reward_std": 0.18806953858584166, "rewards/accuracy_reward": 0.7495536088943482, "rewards/format_reward": 0.006696428894065321, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 564.9935516357422, "epoch": 0.6823027718550106, "grad_norm": 0.22079423069953918, "kl": 0.00465545654296875, "learning_rate": 8.029152419343472e-07, "loss": 0.0326, "reward": 0.7790178969502449, "reward_std": 0.18834841772913932, "rewards/accuracy_reward": 0.7680803954601287, "rewards/format_reward": 0.010937500628642738, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 582.787744140625, "epoch": 0.767590618336887, "grad_norm": 0.16064484417438507, "kl": 0.005054092407226563, "learning_rate": 4.3933982822017883e-07, "loss": 0.0423, "reward": 0.7665178939700127, "reward_std": 0.22603920232504607, "rewards/accuracy_reward": 0.7470982491970062, "rewards/format_reward": 0.019419643795117736, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 580.5013664245605, "epoch": 0.8528784648187633, "grad_norm": 0.18539777398109436, "kl": 0.00699310302734375, "learning_rate": 1.718159615201853e-07, "loss": 0.0301, "reward": 0.7915178954601287, "reward_std": 0.2188779940828681, "rewards/accuracy_reward": 0.755357176065445, "rewards/format_reward": 0.036160716065205635, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 580.8031463623047, "epoch": 0.9381663113006397, "grad_norm": 0.2504195272922516, "kl": 0.009212875366210937, "learning_rate": 2.4570139579284723e-08, "loss": 0.038, "reward": 0.8113839596509933, "reward_std": 0.2401964722201228, "rewards/accuracy_reward": 0.7790178954601288, "rewards/format_reward": 0.03236607303842902, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 567.5023994445801, "epoch": 0.9893390191897654, "kl": 0.011606852213541666, "reward": 0.8020833730697632, "reward_std": 0.23279494047164917, "rewards/accuracy_reward": 0.7678571765621504, "rewards/format_reward": 0.03422619208383063, "step": 58, "total_flos": 0.0, "train_loss": 0.04428564981910689, "train_runtime": 7921.619, "train_samples_per_second": 0.947, "train_steps_per_second": 0.007 } ], "logging_steps": 5, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }