{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 250,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 110.15,
      "epoch": 0.1,
      "grad_norm": 0.0,
      "kl": 0.001616668701171875,
      "learning_rate": 9.97e-07,
      "loss": -0.0,
      "reward": 0.0008232486434280872,
      "reward_std": 0.0021940818056464194,
      "rewards/DCR_reward": 0.0008232486434280872,
      "step": 10
    },
    {
      "completion_length": 98.4125,
      "epoch": 0.2,
      "grad_norm": 0.0,
      "kl": 0.0034271240234375,
      "learning_rate": 9.936666666666667e-07,
      "loss": 0.0,
      "reward": 0.00022143989917822182,
      "reward_std": 0.000626326643396169,
      "rewards/DCR_reward": 0.00022143989917822182,
      "step": 20
    },
    {
      "completion_length": 84.2,
      "epoch": 0.3,
      "grad_norm": 0.0,
      "kl": 0.003081512451171875,
      "learning_rate": 9.903333333333333e-07,
      "loss": 0.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/DCR_reward": 0.0,
      "step": 30
    },
    {
      "completion_length": 105.125,
      "epoch": 0.4,
      "grad_norm": 0.0,
      "kl": 0.004283905029296875,
      "learning_rate": 9.87e-07,
      "loss": 0.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/DCR_reward": 0.0,
      "step": 40
    },
    {
      "completion_length": 81.4,
      "epoch": 0.5,
      "grad_norm": 26.5,
      "kl": 0.0041351318359375,
      "learning_rate": 9.836666666666666e-07,
      "loss": -0.0,
      "reward": 0.018291093641892076,
      "reward_std": 0.05173502548132092,
      "rewards/DCR_reward": 0.018291093641892076,
      "step": 50
    },
    {
      "completion_length": 104.425,
      "epoch": 0.6,
      "grad_norm": 0.0,
      "kl": 0.006622314453125,
      "learning_rate": 9.803333333333332e-07,
      "loss": -0.0,
      "reward": 0.009192863292992116,
      "reward_std": 0.025964342057704926,
      "rewards/DCR_reward": 0.009192863292992116,
      "step": 60
    },
    {
      "completion_length": 109.35,
      "epoch": 0.7,
      "grad_norm": 39.25,
      "kl": 0.003626251220703125,
      "learning_rate": 9.77e-07,
      "loss": 0.0,
      "reward": 0.013767135608941317,
      "reward_std": 0.03893933929502964,
      "rewards/DCR_reward": 0.013767135608941317,
      "step": 70
    },
    {
      "completion_length": 89.1,
      "epoch": 0.8,
      "grad_norm": 0.0,
      "kl": 0.013692855834960938,
      "learning_rate": 9.736666666666667e-07,
      "loss": 0.0,
      "reward": 0.011479373268957715,
      "reward_std": 0.032468570384662596,
      "rewards/DCR_reward": 0.011479373268957715,
      "step": 80
    },
    {
      "completion_length": 97.575,
      "epoch": 0.9,
      "grad_norm": 0.0,
      "kl": 0.0066986083984375,
      "learning_rate": 9.703333333333332e-07,
      "loss": -0.0,
      "reward": 0.009072506427764892,
      "reward_std": 0.025660922378301622,
      "rewards/DCR_reward": 0.009072506427764892,
      "step": 90
    },
    {
      "completion_length": 114.3625,
      "epoch": 1.0,
      "grad_norm": 0.0,
      "kl": 0.00914459228515625,
      "learning_rate": 9.67e-07,
      "loss": 0.0,
      "reward": 0.011166714504361153,
      "reward_std": 0.03158423751592636,
      "rewards/DCR_reward": 0.011166714504361153,
      "step": 100
    },
    {
      "completion_length": 88.5375,
      "epoch": 1.1,
      "grad_norm": 0.0,
      "kl": 0.011919403076171875,
      "learning_rate": 9.636666666666666e-07,
      "loss": 0.0,
      "reward": 2.8343783924356103e-05,
      "reward_std": 8.016832871362566e-05,
      "rewards/DCR_reward": 2.8343783924356103e-05,
      "step": 110
    },
    {
      "completion_length": 103.875,
      "epoch": 1.2,
      "grad_norm": 0.0,
      "kl": 0.010530471801757812,
      "learning_rate": 9.603333333333333e-07,
      "loss": 0.0,
      "reward": 0.005421069198928308,
      "reward_std": 0.015333099680719896,
      "rewards/DCR_reward": 0.005421069198928308,
      "step": 120
    },
    {
      "completion_length": 121.75,
      "epoch": 1.3,
      "grad_norm": 0.0,
      "kl": 0.00739288330078125,
      "learning_rate": 9.57e-07,
      "loss": -0.0,
      "reward": 3.4938057069666684e-05,
      "reward_std": 9.881975129246711e-05,
      "rewards/DCR_reward": 3.4938057069666684e-05,
      "step": 130
    },
    {
      "completion_length": 98.975,
      "epoch": 1.4,
      "grad_norm": 0.0,
      "kl": 0.009772491455078126,
      "learning_rate": 9.536666666666667e-07,
      "loss": 0.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/DCR_reward": 0.0,
      "step": 140
    },
    {
      "completion_length": 112.7875,
      "epoch": 1.5,
      "grad_norm": 0.0,
      "kl": 0.01072845458984375,
      "learning_rate": 9.503333333333333e-07,
      "loss": 0.0,
      "reward": 4.633456701412797e-05,
      "reward_std": 8.579618879593909e-05,
      "rewards/DCR_reward": 4.633456701412797e-05,
      "step": 150
    },
    {
      "completion_length": 116.125,
      "epoch": 1.6,
      "grad_norm": 16.25,
      "kl": 0.0097686767578125,
      "learning_rate": 9.469999999999999e-07,
      "loss": -0.0,
      "reward": 0.011431791516952217,
      "reward_std": 0.03233398855663836,
      "rewards/DCR_reward": 0.011431791516952217,
      "step": 160
    },
    {
      "completion_length": 100.25,
      "epoch": 1.7,
      "grad_norm": 25.875,
      "kl": 0.0120208740234375,
      "learning_rate": 9.436666666666667e-07,
      "loss": 0.0,
      "reward": 0.006299582323117647,
      "reward_std": 0.017817909209406936,
      "rewards/DCR_reward": 0.006299582323117647,
      "step": 170
    },
    {
      "completion_length": 106.425,
      "epoch": 1.8,
      "grad_norm": 0.0,
      "kl": 0.0339019775390625,
      "learning_rate": 9.403333333333333e-07,
      "loss": 0.0,
      "reward": 0.014636733755469322,
      "reward_std": 0.04139893501996994,
      "rewards/DCR_reward": 0.014636733755469322,
      "step": 180
    },
    {
      "completion_length": 82.725,
      "epoch": 1.9,
      "grad_norm": 33.75,
      "kl": 0.008979415893554688,
      "learning_rate": 9.37e-07,
      "loss": 0.0,
      "reward": 0.005478436907287687,
      "reward_std": 0.015495359338819981,
      "rewards/DCR_reward": 0.005478436907287687,
      "step": 190
    },
    {
      "completion_length": 71.5875,
      "epoch": 2.0,
      "grad_norm": 0.0,
      "kl": 0.01548919677734375,
      "learning_rate": 9.336666666666666e-07,
      "loss": -0.0,
      "reward": 0.025108913704752923,
      "reward_std": 0.046553592692362145,
      "rewards/DCR_reward": 0.025108913704752923,
      "step": 200
    },
    {
      "completion_length": 128.8125,
      "epoch": 2.1,
      "grad_norm": 10.25,
      "kl": 0.016112518310546876,
      "learning_rate": 9.303333333333333e-07,
      "loss": 0.0,
      "reward": 0.006037246529012918,
      "reward_std": 0.017075913585722448,
      "rewards/DCR_reward": 0.006037246529012918,
      "step": 210
    },
    {
      "completion_length": 91.65,
      "epoch": 2.2,
      "grad_norm": 0.0,
      "kl": 0.0196868896484375,
      "learning_rate": 9.27e-07,
      "loss": 0.0,
      "reward": 0.018143273887835674,
      "reward_std": 0.05131692748691421,
      "rewards/DCR_reward": 0.018143273887835674,
      "step": 220
    },
    {
      "completion_length": 102.8,
      "epoch": 2.3,
      "grad_norm": 23.5,
      "kl": 0.02627716064453125,
      "learning_rate": 9.236666666666666e-07,
      "loss": 0.0,
      "reward": 0.006400418069824809,
      "reward_std": 0.018103116168640555,
      "rewards/DCR_reward": 0.006400418069824809,
      "step": 230
    },
    {
      "completion_length": 94.325,
      "epoch": 2.4,
      "grad_norm": 0.0,
      "kl": 0.01078948974609375,
      "learning_rate": 9.203333333333333e-07,
      "loss": 0.0,
      "reward": 0.0037326388992369175,
      "reward_std": 0.010557496920228004,
      "rewards/DCR_reward": 0.0037326388992369175,
      "step": 240
    },
    {
      "completion_length": 108.1125,
      "epoch": 2.5,
      "grad_norm": 0.0,
      "kl": 0.0536102294921875,
      "learning_rate": 9.17e-07,
      "loss": 0.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/DCR_reward": 0.0,
      "step": 250
    },
    {
      "epoch": 2.5,
      "eval_completion_length": 109.77125,
      "eval_kl": 0.026533355712890627,
      "eval_loss": 1.797125004365796e-09,
      "eval_reward": 0.014695163480864722,
      "eval_reward_std": 0.03626910645980388,
      "eval_rewards/DCR_reward": 0.014695163480864722,
      "eval_runtime": 851.0362,
      "eval_samples_per_second": 0.118,
      "eval_steps_per_second": 0.015,
      "step": 250
    },
    {
      "completion_length": 112.5,
      "epoch": 2.6,
      "grad_norm": 0.0,
      "kl": 0.0502593994140625,
      "learning_rate": 9.136666666666666e-07,
      "loss": -0.0,
      "reward": 6.693824325338937e-05,
      "reward_std": 0.00018932993698399513,
      "rewards/DCR_reward": 6.693824325338937e-05,
      "step": 260
    },
    {
      "completion_length": 102.9875,
      "epoch": 2.7,
      "grad_norm": 20.875,
      "kl": 0.0537139892578125,
      "learning_rate": 9.103333333333333e-07,
      "loss": 0.0,
      "reward": 0.005209430219838396,
      "reward_std": 0.014680334192235023,
      "rewards/DCR_reward": 0.005209430219838396,
      "step": 270
    },
    {
      "completion_length": 104.2875,
      "epoch": 2.8,
      "grad_norm": 0.0,
      "kl": 0.0519927978515625,
      "learning_rate": 9.07e-07,
      "loss": 0.0,
      "reward": 3.591954009607434e-05,
      "reward_std": 0.00010159580269828439,
      "rewards/DCR_reward": 3.591954009607434e-05,
      "step": 280
    },
    {
      "completion_length": 132.2,
      "epoch": 2.9,
      "grad_norm": 0.0,
      "kl": 0.0234283447265625,
      "learning_rate": 9.036666666666666e-07,
      "loss": 0.0,
      "reward": 0.01878409832715988,
      "reward_std": 0.05312945321202278,
      "rewards/DCR_reward": 0.01878409832715988,
      "step": 290
    },
    {
      "completion_length": 97.65,
      "epoch": 3.0,
      "grad_norm": 16.25,
      "kl": 0.01286468505859375,
      "learning_rate": 9.003333333333333e-07,
      "loss": -0.0,
      "reward": 0.023834712347888854,
      "reward_std": 0.033087541203713045,
      "rewards/DCR_reward": 0.023834712347888854,
      "step": 300
    },
    {
      "completion_length": 113.275,
      "epoch": 3.1,
      "grad_norm": 15.75,
      "kl": 0.085699462890625,
      "learning_rate": 8.969999999999999e-07,
      "loss": -0.0,
      "reward": 0.054491185687948016,
      "reward_std": 0.0751757369551342,
      "rewards/DCR_reward": 0.054491185687948016,
      "step": 310
    },
    {
      "completion_length": 96.375,
      "epoch": 3.2,
      "grad_norm": 0.0,
      "kl": 0.051617431640625,
      "learning_rate": 8.936666666666667e-07,
      "loss": -0.0,
      "reward": 0.02024147715419531,
      "reward_std": 0.05725154206156731,
      "rewards/DCR_reward": 0.02024147715419531,
      "step": 320
    },
    {
      "completion_length": 118.075,
      "epoch": 3.3,
      "grad_norm": 0.0,
      "kl": 0.020208740234375,
      "learning_rate": 8.903333333333333e-07,
      "loss": -0.0,
      "reward": 0.03521969501452986,
      "reward_std": 0.08483822367852553,
      "rewards/DCR_reward": 0.03521969501452986,
      "step": 330
    },
    {
      "completion_length": 117.7625,
      "epoch": 3.4,
      "grad_norm": 0.0,
      "kl": 0.026739501953125,
      "learning_rate": 8.869999999999999e-07,
      "loss": -0.0,
      "reward": 0.021311641685315408,
      "reward_std": 0.03974629848089535,
      "rewards/DCR_reward": 0.021311641685315408,
      "step": 340
    },
    {
      "completion_length": 126.9375,
      "epoch": 3.5,
      "grad_norm": 0.0,
      "kl": 0.041534423828125,
      "learning_rate": 8.836666666666667e-07,
      "loss": 0.0,
      "reward": 0.018433896452188493,
      "reward_std": 0.04472574144601822,
      "rewards/DCR_reward": 0.018433896452188493,
      "step": 350
    },
    {
      "completion_length": 123.275,
      "epoch": 3.6,
      "grad_norm": 0.0,
      "kl": 0.029638671875,
      "learning_rate": 8.803333333333333e-07,
      "loss": 0.0,
      "reward": 0.008903130898397648,
      "reward_std": 0.023928308754693716,
      "rewards/DCR_reward": 0.008903130898397648,
      "step": 360
    },
    {
      "completion_length": 141.2875,
      "epoch": 3.7,
      "grad_norm": 0.0,
      "kl": 0.021136474609375,
      "learning_rate": 8.769999999999999e-07,
      "loss": -0.0,
      "reward": 0.0002970047564303968,
      "reward_std": 0.000840056300512515,
      "rewards/DCR_reward": 0.0002970047564303968,
      "step": 370
    },
    {
      "completion_length": 143.6125,
      "epoch": 3.8,
      "grad_norm": 28.625,
      "kl": 0.071319580078125,
      "learning_rate": 8.736666666666667e-07,
      "loss": 0.0,
      "reward": 0.025264605600386857,
      "reward_std": 0.062286792299710216,
      "rewards/DCR_reward": 0.025264605600386857,
      "step": 380
    },
    {
      "completion_length": 123.4125,
      "epoch": 3.9,
      "grad_norm": 0.0,
      "kl": 0.0508056640625,
      "learning_rate": 8.703333333333333e-07,
      "loss": 0.0,
      "reward": 0.011455658166960347,
      "reward_std": 0.032401493715588,
      "rewards/DCR_reward": 0.011455658166960347,
      "step": 390
    },
    {
      "completion_length": 99.8875,
      "epoch": 4.0,
      "grad_norm": 64.5,
      "kl": 0.0475677490234375,
      "learning_rate": 8.669999999999999e-07,
      "loss": -0.0,
      "reward": 0.020158628193894402,
      "reward_std": 0.04393248584237881,
      "rewards/DCR_reward": 0.020158628193894402,
      "step": 400
    },
    {
      "completion_length": 122.4,
      "epoch": 4.1,
      "grad_norm": 0.0,
      "kl": 0.0520751953125,
      "learning_rate": 8.636666666666667e-07,
      "loss": 0.0,
      "reward": 0.027829134710191283,
      "reward_std": 0.07858068596397061,
      "rewards/DCR_reward": 0.027829134710191283,
      "step": 410
    },
    {
      "completion_length": 161.3375,
      "epoch": 4.2,
      "grad_norm": 63.75,
      "kl": 0.0515869140625,
      "learning_rate": 8.603333333333332e-07,
      "loss": 0.0,
      "reward": 0.021562288980931044,
      "reward_std": 0.06093565103947185,
      "rewards/DCR_reward": 0.021562288980931044,
      "step": 420
    },
    {
      "completion_length": 132.5625,
      "epoch": 4.3,
      "grad_norm": 0.0,
      "kl": 0.06763916015625,
      "learning_rate": 8.569999999999999e-07,
      "loss": 0.0,
      "reward": 0.01166691112157423,
      "reward_std": 0.0329432392900344,
      "rewards/DCR_reward": 0.01166691112157423,
      "step": 430
    },
    {
      "completion_length": 116.35,
      "epoch": 4.4,
      "grad_norm": 0.0,
      "kl": 0.05010986328125,
      "learning_rate": 8.536666666666667e-07,
      "loss": -0.0,
      "reward": 0.001521215244429186,
      "reward_std": 0.004162183034350164,
      "rewards/DCR_reward": 0.001521215244429186,
      "step": 440
    },
    {
      "completion_length": 151.25,
      "epoch": 4.5,
      "grad_norm": 0.0,
      "kl": 0.087615966796875,
      "learning_rate": 8.503333333333333e-07,
      "loss": -0.0,
      "reward": 0.017801515758037567,
      "reward_std": 0.05035028904676438,
      "rewards/DCR_reward": 0.017801515758037567,
      "step": 450
    },
    {
      "completion_length": 104.8125,
      "epoch": 4.6,
      "grad_norm": 16.875,
      "kl": 0.05291748046875,
      "learning_rate": 8.469999999999999e-07,
      "loss": -0.0,
      "reward": 0.007641428161878139,
      "reward_std": 0.021491285803494974,
      "rewards/DCR_reward": 0.007641428161878139,
      "step": 460
    },
    {
      "completion_length": 126.9,
      "epoch": 4.7,
      "grad_norm": 19.75,
      "kl": 0.10247802734375,
      "learning_rate": 8.436666666666667e-07,
      "loss": 0.0,
      "reward": 0.011310203482571524,
      "reward_std": 0.03199008805677295,
      "rewards/DCR_reward": 0.011310203482571524,
      "step": 470
    },
    {
      "completion_length": 141.6625,
      "epoch": 4.8,
      "grad_norm": 0.0,
      "kl": 0.0375762939453125,
      "learning_rate": 8.403333333333333e-07,
      "loss": -0.0,
      "reward": 0.0044936020654859025,
      "reward_std": 0.012646565132308751,
      "rewards/DCR_reward": 0.0044936020654859025,
      "step": 480
    },
    {
      "completion_length": 121.0375,
      "epoch": 4.9,
      "grad_norm": 0.0,
      "kl": 0.0927490234375,
      "learning_rate": 8.369999999999999e-07,
      "loss": 0.0,
      "reward": 0.0428094768547453,
      "reward_std": 0.096446827147156,
      "rewards/DCR_reward": 0.0428094768547453,
      "step": 490
    },
    {
      "completion_length": 159.7625,
      "epoch": 5.0,
      "grad_norm": 21.5,
      "kl": 0.05220947265625,
      "learning_rate": 8.336666666666667e-07,
      "loss": -0.0,
      "reward": 0.028198828249878717,
      "reward_std": 0.05388287528476212,
      "rewards/DCR_reward": 0.028198828249878717,
      "step": 500
    },
    {
      "epoch": 5.0,
      "eval_completion_length": 119.94125,
      "eval_kl": 0.0939080810546875,
      "eval_loss": -1.4601639897193763e-09,
      "eval_reward": 0.02450084381052875,
      "eval_reward_std": 0.053787164441891945,
      "eval_rewards/DCR_reward": 0.02450084381052875,
      "eval_runtime": 1081.7831,
      "eval_samples_per_second": 0.092,
      "eval_steps_per_second": 0.012,
      "step": 500
    },
    {
      "completion_length": 114.225,
      "epoch": 5.1,
      "grad_norm": 14.875,
      "kl": 0.0663818359375,
      "learning_rate": 8.303333333333333e-07,
      "loss": 0.0,
      "reward": 0.04443554246754502,
      "reward_std": 0.07462939244578592,
      "rewards/DCR_reward": 0.04443554246754502,
      "step": 510
    },
    {
      "completion_length": 111.1875,
      "epoch": 5.2,
      "grad_norm": 0.0,
      "kl": 0.0774383544921875,
      "learning_rate": 8.269999999999999e-07,
      "loss": 0.0,
      "reward": 0.0301513435493689,
      "reward_std": 0.06074108343455009,
      "rewards/DCR_reward": 0.0301513435493689,
      "step": 520
    },
    {
      "completion_length": 118.0,
      "epoch": 5.3,
      "grad_norm": 0.0,
      "kl": 0.07442626953125,
      "learning_rate": 8.236666666666666e-07,
      "loss": -0.0,
      "reward": 0.022635633018944647,
      "reward_std": 0.060236827132757756,
      "rewards/DCR_reward": 0.022635633018944647,
      "step": 530
    },
    {
      "completion_length": 146.4625,
      "epoch": 5.4,
      "grad_norm": 25.375,
      "kl": 0.0943115234375,
      "learning_rate": 8.203333333333333e-07,
      "loss": -0.0,
      "reward": 0.027474326699302765,
      "reward_std": 0.04014584248652682,
      "rewards/DCR_reward": 0.027474326699302765,
      "step": 540
    },
    {
      "completion_length": 135.9125,
      "epoch": 5.5,
      "grad_norm": 0.0,
      "kl": 0.10826416015625,
      "learning_rate": 8.169999999999999e-07,
      "loss": -0.0,
      "reward": 0.03092897320893826,
      "reward_std": 0.06343855138984508,
      "rewards/DCR_reward": 0.03092897320893826,
      "step": 550
    },
    {
      "completion_length": 113.6125,
      "epoch": 5.6,
      "grad_norm": 60.75,
      "kl": 0.07705078125,
      "learning_rate": 8.136666666666666e-07,
      "loss": -0.0,
      "reward": 0.028095364570617676,
      "reward_std": 0.05137596220884007,
      "rewards/DCR_reward": 0.028095364570617676,
      "step": 560
    },
    {
      "completion_length": 120.625,
      "epoch": 5.7,
      "grad_norm": 0.0,
      "kl": 0.212255859375,
      "learning_rate": 8.103333333333333e-07,
      "loss": -0.0,
      "reward": 0.0055281239823671054,
      "reward_std": 0.015635895385639743,
      "rewards/DCR_reward": 0.0055281239823671054,
      "step": 570
    },
    {
      "completion_length": 112.575,
      "epoch": 5.8,
      "grad_norm": 0.0,
      "kl": 0.0977294921875,
      "learning_rate": 8.070000000000001e-07,
      "loss": -0.0,
      "reward": 0.027797411862411536,
      "reward_std": 0.05734402615926228,
      "rewards/DCR_reward": 0.027797411862411536,
      "step": 580
    },
    {
      "completion_length": 146.475,
      "epoch": 5.9,
      "grad_norm": 101.5,
      "kl": 0.1387939453125,
      "learning_rate": 8.036666666666666e-07,
      "loss": -0.0,
      "reward": 0.0692019445807091,
      "reward_std": 0.09863622895500157,
      "rewards/DCR_reward": 0.0692019445807091,
      "step": 590
    },
    {
      "completion_length": 135.4875,
      "epoch": 6.0,
      "grad_norm": 0.0,
      "kl": 0.07392578125,
      "learning_rate": 8.003333333333333e-07,
      "loss": 0.0,
      "reward": 0.042109476366749735,
      "reward_std": 0.07917119265184738,
      "rewards/DCR_reward": 0.042109476366749735,
      "step": 600
    },
    {
      "completion_length": 151.15,
      "epoch": 6.1,
      "grad_norm": 66.0,
      "kl": 0.112451171875,
      "learning_rate": 7.970000000000001e-07,
      "loss": 0.0,
      "reward": 0.022725265215558465,
      "reward_std": 0.03912785020947922,
      "rewards/DCR_reward": 0.022725265215558465,
      "step": 610
    },
    {
      "completion_length": 114.7375,
      "epoch": 6.2,
      "grad_norm": 41.0,
      "kl": 0.0866943359375,
      "learning_rate": 7.936666666666666e-07,
      "loss": -0.0,
      "reward": 0.06757222047017422,
      "reward_std": 0.10912240504694637,
      "rewards/DCR_reward": 0.06757222047017422,
      "step": 620
    },
    {
      "completion_length": 175.425,
      "epoch": 6.3,
      "grad_norm": 18.0,
      "kl": 0.0874755859375,
      "learning_rate": 7.903333333333333e-07,
      "loss": 0.0,
      "reward": 0.017062357207760215,
      "reward_std": 0.03701434804825112,
      "rewards/DCR_reward": 0.017062357207760215,
      "step": 630
    },
    {
      "completion_length": 101.1,
      "epoch": 6.4,
      "grad_norm": 32.5,
      "kl": 0.1383544921875,
      "learning_rate": 7.87e-07,
      "loss": 0.0,
      "reward": 0.031146167902625164,
      "reward_std": 0.06654989629168995,
      "rewards/DCR_reward": 0.031146167902625164,
      "step": 640
    },
    {
      "completion_length": 147.6875,
      "epoch": 6.5,
      "grad_norm": 22.375,
      "kl": 0.1053955078125,
      "learning_rate": 7.836666666666666e-07,
      "loss": 0.0,
      "reward": 0.03336541847675108,
      "reward_std": 0.0738158110238146,
      "rewards/DCR_reward": 0.03336541847675108,
      "step": 650
    },
    {
      "completion_length": 167.375,
      "epoch": 6.6,
      "grad_norm": 37.75,
      "kl": 0.09075927734375,
      "learning_rate": 7.803333333333333e-07,
      "loss": 0.0,
      "reward": 0.054513926352956335,
      "reward_std": 0.11440533803834115,
      "rewards/DCR_reward": 0.054513926352956335,
      "step": 660
    },
    {
      "completion_length": 113.5,
      "epoch": 6.7,
      "grad_norm": 24.375,
      "kl": 0.13349609375,
      "learning_rate": 7.77e-07,
      "loss": 0.0,
      "reward": 0.022631679168262052,
      "reward_std": 0.04243781621917151,
      "rewards/DCR_reward": 0.022631679168262052,
      "step": 670
    },
    {
      "completion_length": 128.1875,
      "epoch": 6.8,
      "grad_norm": 10.8125,
      "kl": 0.098681640625,
      "learning_rate": 7.736666666666666e-07,
      "loss": -0.0,
      "reward": 0.011361529098212485,
      "reward_std": 0.03208850735099986,
      "rewards/DCR_reward": 0.011361529098212485,
      "step": 680
    },
    {
      "completion_length": 122.525,
      "epoch": 6.9,
      "grad_norm": 43.75,
      "kl": 0.1169921875,
      "learning_rate": 7.703333333333333e-07,
      "loss": -0.0,
      "reward": 0.017346063705190318,
      "reward_std": 0.04881023944763001,
      "rewards/DCR_reward": 0.017346063705190318,
      "step": 690
    },
    {
      "completion_length": 150.05,
      "epoch": 7.0,
      "grad_norm": 22.875,
      "kl": 0.112109375,
      "learning_rate": 7.67e-07,
      "loss": 0.0,
      "reward": 0.04065933156089159,
      "reward_std": 0.10374580940115266,
      "rewards/DCR_reward": 0.04065933156089159,
      "step": 700
    },
    {
      "completion_length": 116.2125,
      "epoch": 7.1,
      "grad_norm": 132.0,
      "kl": 0.1211669921875,
      "learning_rate": 7.636666666666667e-07,
      "loss": -0.0,
      "reward": 0.06279535398061853,
      "reward_std": 0.11337692766683176,
      "rewards/DCR_reward": 0.06279535398061853,
      "step": 710
    },
    {
      "completion_length": 159.625,
      "epoch": 7.2,
      "grad_norm": 0.0,
      "kl": 0.1438232421875,
      "learning_rate": 7.603333333333332e-07,
      "loss": -0.0,
      "reward": 0.020437985911848956,
      "reward_std": 0.048951211775420236,
      "rewards/DCR_reward": 0.020437985911848956,
      "step": 720
    },
    {
      "completion_length": 165.2625,
      "epoch": 7.3,
      "grad_norm": 0.0,
      "kl": 0.209228515625,
      "learning_rate": 7.57e-07,
      "loss": -0.0,
      "reward": 0.02226241144235246,
      "reward_std": 0.05247671899269335,
      "rewards/DCR_reward": 0.02226241144235246,
      "step": 730
    },
    {
      "completion_length": 115.8,
      "epoch": 7.4,
      "grad_norm": 32.5,
      "kl": 0.1295166015625,
      "learning_rate": 7.536666666666667e-07,
      "loss": 0.0,
      "reward": 0.014459636958054033,
      "reward_std": 0.04082847375248093,
      "rewards/DCR_reward": 0.014459636958054033,
      "step": 740
    },
    {
      "completion_length": 158.675,
      "epoch": 7.5,
      "grad_norm": 0.0,
      "kl": 0.190478515625,
      "learning_rate": 7.503333333333332e-07,
      "loss": -0.0,
      "reward": 0.010101895526895532,
      "reward_std": 0.02844048692204524,
      "rewards/DCR_reward": 0.010101895526895532,
      "step": 750
    },
    {
      "epoch": 7.5,
      "eval_completion_length": 154.69125,
      "eval_kl": 0.22284423828125,
      "eval_loss": 3.744010435013934e-09,
      "eval_reward": 0.04045595111856528,
      "eval_reward_std": 0.0814886652509449,
      "eval_rewards/DCR_reward": 0.04045595111856528,
      "eval_runtime": 1402.6056,
      "eval_samples_per_second": 0.071,
      "eval_steps_per_second": 0.009,
      "step": 750
    },
    {
      "completion_length": 166.85,
      "epoch": 7.6,
      "grad_norm": 20.75,
      "kl": 0.117431640625,
      "learning_rate": 7.47e-07,
      "loss": 0.0,
      "reward": 0.005159654482849873,
      "reward_std": 0.01448775691096671,
      "rewards/DCR_reward": 0.005159654482849873,
      "step": 760
    },
    {
      "completion_length": 116.125,
      "epoch": 7.7,
      "grad_norm": 0.0,
      "kl": 0.1875244140625,
      "learning_rate": 7.436666666666667e-07,
      "loss": -0.0,
      "reward": 0.028679777635261416,
      "reward_std": 0.059860612216289154,
      "rewards/DCR_reward": 0.028679777635261416,
      "step": 770
    },
    {
      "completion_length": 139.5,
      "epoch": 7.8,
      "grad_norm": 30.125,
      "kl": 0.116748046875,
      "learning_rate": 7.403333333333332e-07,
      "loss": -0.0,
      "reward": 0.05339058640311123,
      "reward_std": 0.09481030252063646,
      "rewards/DCR_reward": 0.05339058640311123,
      "step": 780
    },
    {
      "completion_length": 145.7,
      "epoch": 7.9,
      "grad_norm": 26.875,
      "kl": 0.118505859375,
      "learning_rate": 7.37e-07,
      "loss": -0.0,
      "reward": 0.07039015240879962,
      "reward_std": 0.11058784131892026,
      "rewards/DCR_reward": 0.07039015240879962,
      "step": 790
    },
    {
      "completion_length": 133.6875,
      "epoch": 8.0,
      "grad_norm": 62.75,
      "kl": 0.1423095703125,
      "learning_rate": 7.336666666666667e-07,
      "loss": -0.0,
      "reward": 0.04605545370723121,
      "reward_std": 0.07735684312647209,
      "rewards/DCR_reward": 0.04605545370723121,
      "step": 800
    },
    {
      "completion_length": 149.9625,
      "epoch": 8.1,
      "grad_norm": 0.0,
      "kl": 0.10341796875,
      "learning_rate": 7.303333333333332e-07,
      "loss": -0.0,
      "reward": 0.04631754137226381,
      "reward_std": 0.0893157648271881,
      "rewards/DCR_reward": 0.04631754137226381,
      "step": 810
    },
    {
      "completion_length": 128.975,
      "epoch": 8.2,
      "grad_norm": 0.0,
      "kl": 0.152685546875,
      "learning_rate": 7.27e-07,
      "loss": 0.0,
      "reward": 0.02897152792502311,
      "reward_std": 0.058125091606052594,
      "rewards/DCR_reward": 0.02897152792502311,
      "step": 820
    },
    {
      "completion_length": 200.3,
      "epoch": 8.3,
      "grad_norm": 0.0,
      "kl": 0.18502197265625,
      "learning_rate": 7.236666666666666e-07,
      "loss": 0.0,
      "reward": 0.017227291383460398,
      "reward_std": 0.03782382531207986,
      "rewards/DCR_reward": 0.017227291383460398,
      "step": 830
    },
    {
      "completion_length": 153.6,
      "epoch": 8.4,
      "grad_norm": 44.0,
      "kl": 0.122509765625,
      "learning_rate": 7.203333333333333e-07,
      "loss": -0.0,
      "reward": 0.05631122866761871,
      "reward_std": 0.09404923066613265,
      "rewards/DCR_reward": 0.05631122866761871,
      "step": 840
    },
    {
      "completion_length": 118.475,
      "epoch": 8.5,
      "grad_norm": 37.25,
      "kl": 0.15625,
      "learning_rate": 7.17e-07,
      "loss": 0.0,
      "reward": 0.057633067340066194,
      "reward_std": 0.09432423976832069,
      "rewards/DCR_reward": 0.057633067340066194,
      "step": 850
    },
    {
      "completion_length": 144.9375,
      "epoch": 8.6,
      "grad_norm": 20.375,
      "kl": 0.2320068359375,
      "learning_rate": 7.136666666666666e-07,
      "loss": -0.0,
      "reward": 0.05644137697381666,
      "reward_std": 0.09782474825915415,
      "rewards/DCR_reward": 0.05644137697381666,
      "step": 860
    },
    {
      "completion_length": 122.225,
      "epoch": 8.7,
      "grad_norm": 17.125,
      "kl": 0.144970703125,
      "learning_rate": 7.103333333333333e-07,
      "loss": 0.0,
      "reward": 0.026275344849273095,
      "reward_std": 0.06332521875447128,
      "rewards/DCR_reward": 0.026275344849273095,
      "step": 870
    },
    {
      "completion_length": 162.525,
      "epoch": 8.8,
      "grad_norm": 0.0,
      "kl": 0.161083984375,
      "learning_rate": 7.07e-07,
      "loss": 0.0,
      "reward": 0.07339370545232668,
      "reward_std": 0.13417805570643396,
      "rewards/DCR_reward": 0.07339370545232668,
      "step": 880
    },
    {
      "completion_length": 140.4875,
      "epoch": 8.9,
      "grad_norm": 0.0,
      "kl": 0.202880859375,
      "learning_rate": 7.036666666666666e-07,
      "loss": 0.0,
      "reward": 0.03404433502000757,
      "reward_std": 0.06663689499837347,
      "rewards/DCR_reward": 0.03404433502000757,
      "step": 890
    },
    {
      "completion_length": 166.25,
      "epoch": 9.0,
      "grad_norm": 50.75,
      "kl": 0.18447265625,
      "learning_rate": 7.003333333333333e-07,
      "loss": 0.0,
      "reward": 0.07551841043459717,
      "reward_std": 0.10837250614131336,
      "rewards/DCR_reward": 0.07551841043459717,
      "step": 900
    },
    {
      "completion_length": 171.9,
      "epoch": 9.1,
      "grad_norm": 70.5,
      "kl": 0.1385986328125,
      "learning_rate": 6.97e-07,
      "loss": 0.0,
      "reward": 0.06921597410691901,
      "reward_std": 0.09217902371892706,
      "rewards/DCR_reward": 0.06921597410691901,
      "step": 910
    },
    {
      "completion_length": 134.325,
      "epoch": 9.2,
      "grad_norm": 0.0,
      "kl": 0.2784912109375,
      "learning_rate": 6.936666666666666e-07,
      "loss": 0.0,
      "reward": 0.022432816278160315,
      "reward_std": 0.04160968857759144,
      "rewards/DCR_reward": 0.022432816278160315,
      "step": 920
    },
    {
      "completion_length": 149.1875,
      "epoch": 9.3,
      "grad_norm": 0.0,
      "kl": 0.164208984375,
      "learning_rate": 6.903333333333333e-07,
      "loss": -0.0,
      "reward": 0.04467101704794914,
      "reward_std": 0.09531959185260348,
      "rewards/DCR_reward": 0.04467101704794914,
      "step": 930
    },
    {
      "completion_length": 147.125,
      "epoch": 9.4,
      "grad_norm": 13.5625,
      "kl": 0.1576171875,
      "learning_rate": 6.87e-07,
      "loss": -0.0,
      "reward": 0.053139488815213555,
      "reward_std": 0.11639446567569393,
      "rewards/DCR_reward": 0.053139488815213555,
      "step": 940
    },
    {
      "completion_length": 171.2875,
      "epoch": 9.5,
      "grad_norm": 0.0,
      "kl": 0.1521484375,
      "learning_rate": 6.836666666666666e-07,
      "loss": 0.0,
      "reward": 0.014011116385518108,
      "reward_std": 0.01976964412315283,
      "rewards/DCR_reward": 0.014011116385518108,
      "step": 950
    },
    {
      "completion_length": 130.7625,
      "epoch": 9.6,
      "grad_norm": 48.5,
      "kl": 0.184765625,
      "learning_rate": 6.803333333333333e-07,
      "loss": -0.0,
      "reward": 0.05880497039906914,
      "reward_std": 0.08855972705059685,
      "rewards/DCR_reward": 0.05880497039906914,
      "step": 960
    },
    {
      "completion_length": 139.825,
      "epoch": 9.7,
      "grad_norm": 25.0,
      "kl": 0.162353515625,
      "learning_rate": 6.77e-07,
      "loss": -0.0,
      "reward": 0.07512311937389313,
      "reward_std": 0.12237471891276072,
      "rewards/DCR_reward": 0.07512311937389313,
      "step": 970
    },
    {
      "completion_length": 151.1125,
      "epoch": 9.8,
      "grad_norm": 25.75,
      "kl": 0.1543701171875,
      "learning_rate": 6.736666666666666e-07,
      "loss": -0.0,
      "reward": 0.020507018158969003,
      "reward_std": 0.03885748497850727,
      "rewards/DCR_reward": 0.020507018158969003,
      "step": 980
    },
    {
      "completion_length": 119.05,
      "epoch": 9.9,
      "grad_norm": 68.0,
      "kl": 0.168115234375,
      "learning_rate": 6.703333333333333e-07,
      "loss": 0.0,
      "reward": 0.024185732538899173,
      "reward_std": 0.05010633007332217,
      "rewards/DCR_reward": 0.024185732538899173,
      "step": 990
    },
    {
      "completion_length": 128.65,
      "epoch": 10.0,
      "grad_norm": 0.0,
      "kl": 0.177001953125,
      "learning_rate": 6.67e-07,
      "loss": -0.0,
      "reward": 0.059286916424025546,
      "reward_std": 0.09370468626730144,
      "rewards/DCR_reward": 0.059286916424025546,
      "step": 1000
    },
    {
      "epoch": 10.0,
      "eval_completion_length": 154.835,
      "eval_kl": 0.180048828125,
      "eval_loss": 3.1449687298845674e-09,
      "eval_reward": 0.0444011578221398,
      "eval_reward_std": 0.07596371626394102,
      "eval_rewards/DCR_reward": 0.0444011578221398,
      "eval_runtime": 1449.1033,
      "eval_samples_per_second": 0.069,
      "eval_steps_per_second": 0.009,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 3000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 30,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}