{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9872340425531916, "eval_steps": 100, "global_step": 58, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 973.0485916137695, "epoch": 0.03404255319148936, "grad_norm": 0.26080044292213134, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.1215, "reward": 0.7184314727783203, "reward_std": 0.15251445909962058, "rewards/accuracy_reward": 0.7421875298023224, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.02375606936402619, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 975.6433238983154, "epoch": 0.1702127659574468, "grad_norm": 0.2057173239852816, "kl": 7.790327072143555e-05, "learning_rate": 2.5e-06, "loss": 0.1143, "reward": 0.7099975757300854, "reward_std": 0.147755601326935, "rewards/accuracy_reward": 0.7338169943541288, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.02381941577186808, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 866.1272705078125, "epoch": 0.3404255319148936, "grad_norm": 0.4006290505834594, "kl": 0.005752372741699219, "learning_rate": 2.956412726139078e-06, "loss": 0.1039, "reward": 0.7401266694068909, "reward_std": 0.16346940137445926, "rewards/accuracy_reward": 0.7612723588943482, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.02114568497054279, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 727.8928909301758, "epoch": 0.5106382978723404, "grad_norm": 0.789066138623355, "kl": 0.174072265625, "learning_rate": 2.7836719084521715e-06, "loss": 0.1145, "reward": 0.6513140380382538, "reward_std": 0.1966270312666893, "rewards/accuracy_reward": 0.6690848499536515, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.017770822765305637, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 573.2581741333008, "epoch": 0.6808510638297872, "grad_norm": 1.576084537592535, "kl": 0.8208984375, "learning_rate": 2.4946839873611927e-06, "loss": 0.1556, "reward": 0.585111603885889, "reward_std": 0.22033293917775154, "rewards/accuracy_reward": 0.5991071686148643, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.013995561096817256, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 543.6825035095214, "epoch": 0.851063829787234, "grad_norm": 2.632876303519198, "kl": 2.15634765625, "learning_rate": 2.1156192081791355e-06, "loss": 0.3664, "reward": 0.606927415728569, "reward_std": 0.2152379907667637, "rewards/accuracy_reward": 0.6202009156346321, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.013273498858325183, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 515.0230895996094, "epoch": 1.0340425531914894, "grad_norm": 1.7973794229669702, "kl": 1.01513671875, "learning_rate": 1.6808050203829845e-06, "loss": 0.1836, "reward": 0.6228495962917805, "reward_std": 0.21852513104677201, "rewards/accuracy_reward": 0.6353794872760773, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.01252990250941366, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 475.1066070556641, "epoch": 1.2042553191489362, "grad_norm": 3.0368202315739117, "kl": 1.555859375, "learning_rate": 1.2296174432791415e-06, "loss": 0.2713, "reward": 0.559271278232336, "reward_std": 0.24244545102119447, "rewards/accuracy_reward": 0.5708705581724643, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.011599282221868634, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 469.2449981689453, "epoch": 1.374468085106383, "grad_norm": 3.5433702308800723, "kl": 1.345703125, "learning_rate": 8.029152419343472e-07, "loss": 0.2314, "reward": 0.5598608091473579, "reward_std": 0.26654755510389805, "rewards/accuracy_reward": 0.5713169902563096, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.011456176661886275, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 475.7722343444824, "epoch": 1.5446808510638297, "grad_norm": 1.1370291260510161, "kl": 1.3955078125, "learning_rate": 4.3933982822017883e-07, "loss": 0.2259, "reward": 0.5786746814846992, "reward_std": 0.2622571483254433, "rewards/accuracy_reward": 0.5902902036905289, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.011615533195436002, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 474.67189865112306, "epoch": 1.7148936170212767, "grad_norm": 1.7209049738299256, "kl": 1.1173828125, "learning_rate": 1.718159615201853e-07, "loss": 0.1973, "reward": 0.5861792124807834, "reward_std": 0.25209669955074787, "rewards/accuracy_reward": 0.5977678894996643, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.011588669382035732, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 478.20337142944334, "epoch": 1.8851063829787233, "grad_norm": 1.3684947080685528, "kl": 1.382421875, "learning_rate": 2.4570139579284723e-08, "loss": 0.2311, "reward": 0.5730349622666836, "reward_std": 0.24952978417277336, "rewards/accuracy_reward": 0.5847098492085934, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.011674887221306562, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 478.4012222290039, "epoch": 1.9872340425531916, "kl": 1.4332682291666667, "reward": 0.5787627287209034, "reward_std": 0.2581109864016374, "rewards/accuracy_reward": 0.5904018133878708, "rewards/format_reward": 0.0, "rewards/len_penalty": -0.011639068795678517, "step": 58, "total_flos": 0.0, "train_loss": 0.1985041233229226, "train_runtime": 18468.5879, "train_samples_per_second": 0.812, "train_steps_per_second": 0.003 } ], "logging_steps": 5, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }