{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9872340425531916, "eval_steps": 100, "global_step": 58, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 971.7639999389648, "epoch": 0.03404255319148936, "grad_norm": 0.2631084113360146, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.1238, "reward": 0.698086328804493, "reward_std": 0.13120541395619512, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.04744941322132945, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 979.8407287597656, "epoch": 0.1702127659574468, "grad_norm": 0.22684407077545118, "kl": 7.544457912445068e-05, "learning_rate": 2.5e-06, "loss": 0.1155, "reward": 0.6869497802108526, "reward_std": 0.16120319138281047, "rewards/accuracy_reward": 0.734793558716774, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.047843783744610846, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 872.467677307129, "epoch": 0.3404255319148936, "grad_norm": 0.4684158119972997, "kl": 0.00807795524597168, "learning_rate": 2.956412726139078e-06, "loss": 0.1146, "reward": 0.7163276463747025, "reward_std": 0.16980856116861104, "rewards/accuracy_reward": 0.7589286044239998, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.04260096037760377, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 751.1273788452148, "epoch": 0.5106382978723404, "grad_norm": 1.2177198858526401, "kl": 0.34739990234375, "learning_rate": 2.7836719084521715e-06, "loss": 0.1264, "reward": 0.6330783508718014, "reward_std": 0.20811637695878743, "rewards/accuracy_reward": 0.6697544969618321, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.03667614138685167, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 613.7534912109375, "epoch": 0.6808510638297872, "grad_norm": 1.3980210820254566, "kl": 0.744873046875, "learning_rate": 2.4946839873611927e-06, "loss": 0.1578, "reward": 0.5512815967202187, "reward_std": 0.23243679329752923, "rewards/accuracy_reward": 0.581250024586916, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.0299684323836118, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 582.3771484375, "epoch": 0.851063829787234, "grad_norm": 1.8890671871166043, "kl": 0.9321044921875, "learning_rate": 2.1156192081791355e-06, "loss": 0.1965, "reward": 0.56687613427639, "reward_std": 0.21850477196276188, "rewards/accuracy_reward": 0.5953125260770321, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.028436384443193675, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 557.1671844482422, "epoch": 1.0340425531914894, "grad_norm": 6.969148820152461, "kl": 0.90537109375, "learning_rate": 1.6808050203829845e-06, "loss": 0.1776, "reward": 0.609768246114254, "reward_std": 0.20966911502182484, "rewards/accuracy_reward": 0.6368303805589676, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.027062145154923202, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 529.0748001098633, "epoch": 1.2042553191489362, "grad_norm": 1.6903572527831305, "kl": 0.88388671875, "learning_rate": 1.2296174432791415e-06, "loss": 0.1976, "reward": 0.6035189718008042, "reward_std": 0.20989050157368183, "rewards/accuracy_reward": 0.6293527103960515, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.025833730399608613, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 518.8400924682617, "epoch": 1.374468085106383, "grad_norm": 1.140228131246192, "kl": 1.342578125, "learning_rate": 8.029152419343472e-07, "loss": 0.2382, "reward": 0.5803579963743687, "reward_std": 0.23556350730359554, "rewards/accuracy_reward": 0.6056919865310192, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.025333988945931196, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 512.1945533752441, "epoch": 1.5446808510638297, "grad_norm": 1.4935623924313746, "kl": 0.83896484375, "learning_rate": 4.3933982822017883e-07, "loss": 0.2043, "reward": 0.5334726713597775, "reward_std": 0.25458414256572726, "rewards/accuracy_reward": 0.5584821730852128, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.025009499955922367, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 510.34209899902345, "epoch": 1.7148936170212767, "grad_norm": 2.5529208106611776, "kl": 1.512890625, "learning_rate": 1.718159615201853e-07, "loss": 0.2564, "reward": 0.5088979430496693, "reward_std": 0.26402820013463496, "rewards/accuracy_reward": 0.5338169865310192, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.02491904767230153, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 509.2735771179199, "epoch": 1.8851063829787233, "grad_norm": 1.2580199216017853, "kl": 1.3421875, "learning_rate": 2.4570139579284723e-08, "loss": 0.2429, "reward": 0.5227894008159637, "reward_std": 0.24820317029953004, "rewards/accuracy_reward": 0.5476562783122063, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.02486687391065061, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 513.0390764872233, "epoch": 1.9872340425531916, "kl": 1.3177083333333333, "reward": 0.5240900913874308, "reward_std": 0.256128067150712, "rewards/accuracy_reward": 0.54892115543286, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.024831065131972235, "step": 58, "total_flos": 0.0, "train_loss": 0.1866766851523827, "train_runtime": 17842.1504, "train_samples_per_second": 0.841, "train_steps_per_second": 0.003 } ], "logging_steps": 5, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }